
Lexical Analysis

Lexical analysis takes the raw code and splits it into tokens using a tokenizer (or lexer). Tokens are an array of tiny objects, each describing an isolated piece of the syntax. They could be numbers, labels, punctuation, operators, whatever.


  1. Read the source from left to right and break the characters into individual lexical units (tokens).
  2. Use lookahead to decide whether the current lexeme has ended and whether the next one has begun.

    Finite automata

    Following a pre-built finite automaton, the lexer transitions between states and thereby recognizes tokens; a sketch of this idea follows below.
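A minimal sketch of that state-machine idea in JavaScript. The helper name dfaTokenize and its two token-building states are my own assumptions for illustration; this is not code from the-super-tiny-compiler.

// Hypothetical DFA-style tokenizer: explicit states instead of regex classes.
function dfaTokenize(input) {
  const tokens = [];
  let state = 'start';   // current automaton state
  let value = '';        // characters collected for the token being built
  // Iterate one index past the end so the last token gets flushed by the padding space.
  for (let i = 0; i <= input.length; i++) {
    const char = i < input.length ? input[i] : ' ';
    const isDigit = char >= '0' && char <= '9';
    const isLetter = (char >= 'a' && char <= 'z') || (char >= 'A' && char <= 'Z');
    if (state === 'start') {
      if (isDigit) { state = 'number'; value = char; }
      else if (isLetter) { state = 'name'; value = char; }
      // everything else (whitespace, punctuation) is simply ignored in this sketch
    } else if (state === 'number') {
      if (isDigit) { value += char; }
      else { tokens.push({ type: 'number', value }); state = 'start'; value = ''; i--; }
    } else if (state === 'name') {
      if (isLetter) { value += char; }
      else { tokens.push({ type: 'name', value }); state = 'start'; value = ''; i--; }
    }
  }
  return tokens;
}

// dfaTokenize('add 123') => [{ type: 'name', value: 'add' }, { type: 'number', value: '123' }]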

Regular Expressions

The common lexical categories can all be described with regular expressions, as the sketch below shows.
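For instance, each token class of this tiny Lisp can be written as one regular expression and matched in a table-driven loop. The spec table and the name regexTokenizer below are my own assumptions, not part of the-super-tiny-compiler; sticky (/y) regexes make each pattern match exactly at the current position.

// Hypothetical regex-driven tokenizer sketch.
const spec = [
  { type: 'whitespace', regex: /\s+/y },       // matched but never emitted
  { type: 'paren',      regex: /[()]/y },
  { type: 'number',     regex: /[0-9]+/y },
  { type: 'string',     regex: /"[^"]*"/y },
  { type: 'name',       regex: /[a-zA-Z]+/y },
];

function regexTokenizer(input) {
  const tokens = [];
  let pos = 0;
  while (pos < input.length) {
    let matched = false;
    for (const { type, regex } of spec) {
      regex.lastIndex = pos;                   // sticky regex: the match must start at `pos`
      const m = regex.exec(input);
      if (m) {
        if (type !== 'whitespace') tokens.push({ type, value: m[0] });
        pos += m[0].length;
        matched = true;
        break;
      }
    }
    if (!matched) throw new TypeError('Unexpected character: ' + input[pos]);
  }
  return tokens;
}

// regexTokenizer('(add 2 (subtract 4 2))') yields essentially the same token list as the
// hand-written tokenizer below (strings keep their quotes in this sketch).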


// For the source `(add 2 (subtract 4 2))`, tokens might look something like this:
[
  { type: 'paren',  value: '('        },
  { type: 'name',   value: 'add'      },
  { type: 'number', value: '2'        },
  { type: 'paren',  value: '('        },
  { type: 'name',   value: 'subtract' },
  { type: 'number', value: '4'        },
  { type: 'number', value: '2'        },
  { type: 'paren',  value: ')'        },
  { type: 'paren',  value: ')'        },
]

Only after seeing this conversion did I finally understand what all those "matching parentheses" style algorithm problems are actually for.

// We start by accepting an input string of code, and we're gonna set up two
// things...
function tokenizer(input) {

  // A `current` variable for tracking our position in the code like a cursor.
  let current = 0;

  // And a `tokens` array for pushing our tokens to.
  let tokens = [];

  // We start by creating a `while` loop where we are setting up our `current`
  // variable to be incremented as much as we want `inside` the loop.
  //
  // We do this because we may want to increment `current` many times within a
  // single loop because our tokens can be any length.
  while (current < input.length) {

    // We're also going to store the `current` character in the `input`.
    let char = input[current];

    // The first thing we want to check for is an open parenthesis. This will
    // later be used for `CallExpression` but for now we only care about the
    // character.
    //
    // We check to see if we have an open parenthesis:
    if (char === '(') {

      // If we do, we push a new token with the type `paren` and set the value
      // to an open parenthesis.
      tokens.push({
        type: 'paren',
        value: '(',
      });

      // Then we increment `current`
      current++;

      // And we `continue` onto the next cycle of the loop.
      continue;
    }

    // Next we're going to check for a closing parenthesis. We do the same exact
    // thing as before: Check for a closing parenthesis, add a new token,
    // increment `current`, and `continue`.
    if (char === ')') {
      tokens.push({
        type: 'paren',
        value: ')',
      });
      current++;
      continue;
    }

    // Moving on, we're now going to check for whitespace. This is interesting
    // because we care that whitespace exists to separate characters, but it
    // isn't actually important for us to store as a token. We would only throw
    // it out later.
    //
    // So here we're just going to test for existence and if it does exist we're
    // going to just `continue` on.
    let WHITESPACE = /\s/;
    if (WHITESPACE.test(char)) {
      current++;
      continue;
    }

    // The next type of token is a number. This is different than what we have
    // seen before because a number could be any number of characters and we
    // want to capture the entire sequence of characters as one token.
    //
    //   (add 123 456)
    //        ^^^ ^^^
    //        Only two separate tokens
    //
    // So we start this off when we encounter the first number in a sequence.
    let NUMBERS = /[0-9]/;
    if (NUMBERS.test(char)) {

      // We're going to create a `value` string that we are going to push
      // characters to.
      let value = '';

      // Then we're going to loop through each character in the sequence until
      // we encounter a character that is not a number, pushing each character
      // that is a number to our `value` and incrementing `current` as we go.
      while (NUMBERS.test(char)) {
        value += char;
        char = input[++current];
      }

      // After that we push our `number` token to the `tokens` array.
      tokens.push({ type: 'number', value });

      // And we continue on.
      continue;
    }

    // We'll also add support for strings in our language which will be any
    // text surrounded by double quotes (").
    //
    //   (concat "foo" "bar")
    //            ^^^   ^^^ string tokens
    //
    // We'll start by checking for the opening quote:
    if (char === '"') {
      // Keep a `value` variable for building up our string token.
      let value = '';

      // We'll skip the opening double quote in our token.
      char = input[++current];

      // Then we'll iterate through each character until we reach another
      // double quote.
      while (char !== '"') {
        value += char;
        char = input[++current];
      }

      // Skip the closing double quote.
      char = input[++current];

      // And add our `string` token to the `tokens` array.
      tokens.push({ type: 'string', value });

      continue;
    }

    // The last type of token will be a `name` token. This is a sequence of
    // letters instead of numbers, that are the names of functions in our lisp
    // syntax.
    //
    //   (add 2 4)
    //    ^^^
    //    Name token
    //
    let LETTERS = /[a-z]/i;
    if (LETTERS.test(char)) {
      let value = '';

      // Again we're just going to loop through all the letters pushing them to
      // a value.
      while (LETTERS.test(char)) {
        value += char;
        char = input[++current];
      }

      // And pushing that value as a token with the type `name` and continuing.
      tokens.push({ type: 'name', value });

      continue;
    }

    // Finally if we have not matched a character by now, we're going to throw
    // an error and completely exit.
    throw new TypeError('I dont know what this character is: ' + char);
  }

  // Then at the end of our `tokenizer` we simply return the tokens array.
  return tokens;
}
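A quick usage check (my own example, assuming the tokenizer above is in scope) shows it producing the token list from earlier:

// Usage sketch for the tokenizer above.
const tokens = tokenizer('(add 2 (subtract 4 2))');
console.log(tokens);
// => [ { type: 'paren', value: '(' }, { type: 'name', value: 'add' }, { type: 'number', value: '2' }, ... ]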


Syntax Analysis

This stage turns the tokens produced by lexical analysis into an AST; it is here that recursion and backtracking algorithms finally earn their keep (insert doge face here).

// And an Abstract Syntax Tree (AST) might look like this:
{
  type: 'Program',
  body: [{
    type: 'CallExpression',
    name: 'add',
    params: [{
      type: 'NumberLiteral',
      value: '2',
    }, {
      type: 'CallExpression',
      name: 'subtract',
      params: [{
        type: 'NumberLiteral',
        value: '4',
      }, {
        type: 'NumberLiteral',
        value: '2',
      }]
    }]
  }]
}

// Okay, so we define a `parser` function that accepts our array of `tokens`.
function parser(tokens) {

  // Again we keep a `current` variable that we will use as a cursor.
  let current = 0;

  // But this time we're going to use recursion instead of a `while` loop. So we
  // define a `walk` function.
  function walk() {

    // Inside the walk function we start by grabbing the `current` token.
    let token = tokens[current];

    // We're going to split each type of token off into a different code path,
    // starting off with `number` tokens.
    //
    // We test to see if we have a `number` token.
    if (token.type === 'number') {

      // If we have one, we'll increment `current`.
      current++;

      // And we'll return a new AST node called `NumberLiteral` and setting its
      // value to the value of our token.
      return {
        type: 'NumberLiteral',
        value: token.value,
      };
    }

    // If we have a string we will do the same as number and create a
    // `StringLiteral` node.
    if (token.type === 'string') {
      current++;

      return {
        type: 'StringLiteral',
        value: token.value,
      };
    }

    // Next we're going to look for CallExpressions. We start this off when we
    // encounter an open parenthesis.
    if (
      token.type === 'paren' &&
      token.value === '('
    ) {

      // We'll increment `current` to skip the parenthesis since we don't care
      // about it in our AST.
      token = tokens[++current];

      // We create a base node with the type `CallExpression`, and we're going
      // to set the name as the current token's value since the next token after
      // the open parenthesis is the name of the function.
      let node = {
        type: 'CallExpression',
        name: token.value,
        params: [],
      };

      // We increment `current` *again* to skip the name token.
      token = tokens[++current];

      while (
        (token.type !== 'paren') ||
        (token.type === 'paren' && token.value !== ')')
      ) {
        // we'll call the `walk` function which will return a `node` and we'll
        // push it into our `node.params`.
        node.params.push(walk());
        token = tokens[current];
      }

      // Finally we will increment `current` one last time to skip the closing
      // parenthesis.
      current++;

      // And return the node.
      return node;
    }

    // Again, if we haven't recognized the token type by now we're going to
    // throw an error.
    throw new TypeError(token.type);
  }

  // Now, we're going to create our AST which will have a root which is a
  // `Program` node.
  let ast = {
    type: 'Program',
    body: [],
  };

  while (current < tokens.length) {
    ast.body.push(walk());
  }

  // At the end of our parser we'll return the AST.
  return ast;
}
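Chaining the two stages together (again just a usage sketch, assuming both functions above are defined) reproduces the AST shown at the start of this section:

// Usage sketch: tokenize, then parse.
const ast = parser(tokenizer('(add 2 (subtract 4 2))'));
console.log(JSON.stringify(ast, null, 2));
// => { type: 'Program', body: [ { type: 'CallExpression', name: 'add', params: [...] } ] }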

Learning Resources

  1. the-super-tiny-compiler (https://github.com/jamiebuilds/the-super-tiny-compiler)