编译器 - html-parser - 《前端飞行随笔》

src\compiler\parser\html-parser.js

// Matches one attribute. Four spellings are accepted: class="some-class",
// class='some-class', class=some-class and a bare name such as `disable`.
// Groups: 1 = name, 2 = "=", 3 = double-quoted value, 4 = single-quoted value,
// 5 = unquoted value.
const attribute = /^\s*([^\s"'<>\/=]+)(?:\s*(=)\s*(?:"([^"]*)"+|'([^']*)'+|([^\s"'=<>`]+)))?/
// Matches an attribute whose name carries a bracketed dynamic argument after a
// `v-xxx:`, `@`, `:` or `#` prefix (e.g. `:[key]`, `@[event]`). Same capture
// groups as `attribute`.
const dynamicArgAttribute = /^\s*((?:v-[\w-]+:|@|:|#)\[[^=]+?\][^\s"'<>\/=]*)(?:\s*(=)\s*(?:"([^"]*)"+|'([^']*)'+|([^\s"'=<>`]+)))?/
// Pattern source for a tag name without a namespace prefix.
// `unicodeRegExp` is defined outside this section; its source is spliced into
// the character class.
const ncname = `[a-zA-Z_][\\-\\.0-9_a-zA-Z${unicodeRegExp.source}]*`
// Pattern source for a tag name with an optional `prefix:`; the whole name is captured.
const qnameCapture = `((?:${ncname}\\:)?${ncname})`
// Opening of a start tag: `<` followed by the tag name (group 1).
const startTagOpen = new RegExp(`^<${qnameCapture}`)
// Closing of a start tag: optional whitespace, an optional `/` (group 1), then `>`.
const startTagClose = /^\s*(\/?)>/
// A complete end tag: `</name ...>`; the tag name is group 1.
const endTag = new RegExp(`^<\\/${qnameCapture}[^>]*>`)
// A DOCTYPE declaration (case-insensitive).
const doctype = /^<!DOCTYPE [^>]+>/i
// The opening of an HTML comment: `<!--`.
const comment = /^<!\--/
// The opening of a conditional comment: `<![`.
const conditionalComment = /^<!\[/
// Predicate for elements whose content is treated as raw text by parseHTML.
// `makeMap` is defined outside this section — presumably it builds a membership
// test from the comma-separated list; the meaning of the second argument should
// be confirmed against its definition.
export const isPlainTextElement = makeMap('script,style,textarea', true)
// Cache of the per-tag regexes parseHTML builds for raw-text elements,
// keyed by the lower-cased tag name.
const reCache = {}
// Maps each supported HTML entity to the character it stands for.
const decodingMap = {
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&amp;': '&',
  '&#10;': '\n',
  '&#9;': '\t',
  '&#39;': "'"
}
// Entities decoded in attribute values; the second variant additionally
// matches the newline (&#10;) and tab (&#9;) entities.
const encodedAttr = /&(?:lt|gt|quot|amp|#39);/g
const encodedAttrWithNewLines = /&(?:lt|gt|quot|amp|#39|#10|#9);/g
// Tags after which a leading newline in the content is dropped.
const isIgnoreNewlineTag = makeMap('pre,textarea', true)
// Truthy when `tag` is one of the tags above and `html` begins with "\n".
const shouldIgnoreFirstNewline = (tag, html) => tag && isIgnoreNewlineTag(tag) && html[0] === '\n'
// Turn the HTML entities inside an attribute value back into the characters
// they represent.
//   value                - raw attribute value as written in the template
//   shouldDecodeNewlines - when truthy, &#10; and &#9; are decoded as well
// Returns the decoded string; text that matches no known entity is untouched.
function decodeAttr (value, shouldDecodeNewlines) {
  let entityPattern = encodedAttr
  if (shouldDecodeNewlines) {
    entityPattern = encodedAttrWithNewLines
  }
  return value.replace(entityPattern, function (entity) {
    return decodingMap[entity]
  })
}

/**
 * Scan an HTML template string from left to right and report what is found
 * through the callbacks supplied on `options`.
 *
 * Options read by this function:
 *   - expectHTML {boolean}: enable the implicit-close rules in handleStartTag.
 *   - isUnaryTag {Function}: predicate for tags that never have content.
 *   - canBeLeftOpenTag {Function}: predicate for tags whose end tag may be omitted.
 *     (Both predicates fall back to `no`, which is defined outside this section.)
 *   - shouldKeepComment {boolean} / comment(text, start, end): comment reporting.
 *   - chars(text, start, end): text reporting. For raw-text elements and for the
 *     trailing "nothing could be parsed" case it is called with the text only.
 *   - start(tag, attrs, unary, start, end) / end(tag, start, end): tag reporting.
 *   - shouldDecodeNewlines / shouldDecodeNewlinesForHref: forwarded to decodeAttr.
 *   - outputSourceRange {boolean}: attach start/end offsets to each attribute
 *     (only when NODE_ENV is not "production").
 *   - warn(message, range): diagnostics (only when NODE_ENV is not "production").
 *
 * @param {string} html - template source to scan
 * @param {Object} options - callbacks and flags described above
 * @returns {undefined} all results are delivered through the callbacks
 */
function parseHTML (html, options) {
  // Stack of the currently open (non-unary) elements
  const stack = []
  const expectHTML = options.expectHTML
  // Predicate telling whether a tag is unary (has no content / end tag)
  const isUnaryTag = options.isUnaryTag || no
  // Predicate telling whether a tag's end tag may be left out
  const canBeLeftOpenTag = options.canBeLeftOpenTag || no
  // Offset of the first not-yet-consumed character in the original input
  let index = 0
  // last: snapshot of `html` at the start of an iteration
  // lastTag: tag name on top of `stack`
  let last, lastTag
  // Keep going until the whole input has been consumed
  while (html) {
    last = html
    // Make sure we're not in a plaintext content element like script/style
    if (!lastTag || !isPlainTextElement(lastTag)) {
      // Position of the first "<" in the remaining input
      let textEnd = html.indexOf('<')
      // The remaining input starts with "<": try each kind of markup in turn
      if (textEnd === 0) {
        // Comment:
        if (comment.test(html)) {
          // Only treat it as a comment when the terminator is present
          const commentEnd = html.indexOf('-->')
          if (commentEnd >= 0) {
            if (options.shouldKeepComment) {
              // 4 skips the leading "<!--"; 3 is the length of "-->"
              options.comment(html.substring(4, commentEnd), index, index + commentEnd + 3)
            }
            // Skip past the end of the comment
            advance(commentEnd + 3)
            continue
          }
        }
        // http://en.wikipedia.org/wiki/Conditional_comment#Downlevel-revealed_conditional_comment
        // Conditional comment: skipped without being reported
        if (conditionalComment.test(html)) {
          const conditionalEnd = html.indexOf(']>')
          if (conditionalEnd >= 0) {
            // Skip past the closing "]>"
            advance(conditionalEnd + 2)
            continue
          }
        }
        // Doctype: skipped without being reported
        const doctypeMatch = html.match(doctype)
        if (doctypeMatch) {
          advance(doctypeMatch[0].length)
          continue
        }
        // End tag:
        const endTagMatch = html.match(endTag)
        if (endTagMatch) {
          // Remember where the end tag begins
          const curIndex = index
          // Move past the end tag, then close the matching element
          advance(endTagMatch[0].length)
          parseEndTag(endTagMatch[1], curIndex, index)
          continue
        }
        // Start tag:
        const startTagMatch = parseStartTag()
        // A result means a complete start tag was consumed
        if (startTagMatch) {
          handleStartTag(startTagMatch)
          // Drop the newline that directly follows <pre> / <textarea>
          if (shouldIgnoreFirstNewline(startTagMatch.tagName, html)) {
            advance(1)
          }
          continue
        }
      }
      let text, rest, next
      // Extend the text run until a "<" that really begins markup is found
      if (textEnd >= 0) {
        rest = html.slice(textEnd)
        while (
          !endTag.test(rest) &&
          !startTagOpen.test(rest) &&
          !comment.test(rest) &&
          !conditionalComment.test(rest)
        ) {
          // < in plain text, be forgiving and treat it as text
          next = rest.indexOf('<', 1)
          if (next < 0) break
          textEnd += next
          rest = html.slice(textEnd)
        }
        text = html.substring(0, textEnd)
      }
      // No "<" at all: everything that is left is text
      if (textEnd < 0) {
        text = html
      }
      // Consume the text and report it
      if (text) {
        advance(text.length)
      }
      if (options.chars && text) {
        options.chars(text, index - text.length, index)
      }
    } else {
      // Inside a raw-text element: everything up to its end tag is text
      let endTagLength = 0
      const stackedTag = lastTag.toLowerCase()
      // Lazily built, cached regex: group 1 is the content (non-greedy),
      // group 2 is the element's end tag
      const reStackedTag = reCache[stackedTag] || (reCache[stackedTag] = new RegExp('([\\s\\S]*?)(</' + stackedTag + '[^>]*>)', 'i'))
      const rest = html.replace(reStackedTag, function (all, text, endTag) {
        endTagLength = endTag.length
        // NOTE(review): this branch is only entered when isPlainTextElement(lastTag)
        // is truthy, so this condition looks unreachable unless that predicate is
        // case-sensitive — confirm against makeMap.
        if (!isPlainTextElement(stackedTag) && stackedTag !== 'noscript') {
          text = text
            .replace(/<!\--([\s\S]*?)-->/g, '$1') // #7298
            .replace(/<!\[CDATA\[([\s\S]*?)]]>/g, '$1')
        }
        if (shouldIgnoreFirstNewline(stackedTag, text)) {
          text = text.slice(1)
        }
        if (options.chars) {
          options.chars(text)
        }
        // Remove content and end tag from the remaining input
        return ''
      })
      // Advance by however much the replace removed
      index += html.length - rest.length
      html = rest
      parseEndTag(stackedTag, index - endTagLength, index)
    }
    // Nothing was consumed in this iteration: report the remainder as text
    // and stop, otherwise the loop would never terminate
    if (html === last) {
      options.chars && options.chars(html)
      if (process.env.NODE_ENV !== 'production' && !stack.length && options.warn) {
        options.warn(`Mal-formatted tag at end of template: "${html}"`, { start: index + html.length })
      }
      break
    }
  }
  // Clean up any remaining tags
  parseEndTag()

  // Consume `n` characters from the front of the remaining input
  function advance (n) {
    index += n
    html = html.substring(n)
  }

  // Try to consume a complete start tag (name, attributes, closing ">" or "/>").
  // Returns a match record, or undefined when no start tag begins here or its
  // closing bracket is never found. Note that input is consumed in both cases
  // once the tag name has matched.
  function parseStartTag () {
    const start = html.match(startTagOpen)
    if (start) {
      const match = {
        // Tag name as written in the template
        tagName: start[1],
        // Raw regex matches, one per attribute
        attrs: [],
        // Offset of the "<"
        start: index
      }
      // Move past "<tagName"
      advance(start[0].length)
      let end, attr
      // Collect attributes until the tag closes or nothing more matches
      while (!(end = html.match(startTagClose)) && (attr = html.match(dynamicArgAttribute) || html.match(attribute))) {
        // Offset where this attribute match begins (leading whitespace included)
        attr.start = index
        advance(attr[0].length)
        // Offset just past this attribute
        attr.end = index
        match.attrs.push(attr)
      }
      // Only a tag that is actually closed counts as a start tag
      if (end) {
        // Non-empty ("/") when the tag was written as self-closing
        match.unarySlash = end[1]
        advance(end[0].length)
        // Offset just past the closing ">"
        match.end = index
        return match
      }
    }
  }

  // Turn a start-tag match into a normalized attribute list, update the stack
  // and invoke the `start` callback
  function handleStartTag (match) {
    const tagName = match.tagName
    const unarySlash = match.unarySlash
    if (expectHTML) {
      // An open <p> is closed implicitly by a tag for which isNonPhrasingTag
      // (defined outside this section) returns truthy
      if (lastTag === 'p' && isNonPhrasingTag(tagName)) {
        parseEndTag(lastTag)
      }
      // A tag whose end tag may be omitted closes the previous sibling of the
      // same name when it is opened again
      if (canBeLeftOpenTag(tagName) && lastTag === tagName) {
        parseEndTag(tagName)
      }
    }
    // Unary either by tag name or because it was written with "/>"
    const unary = isUnaryTag(tagName) || !!unarySlash
    const l = match.attrs.length
    // Pre-sized array for the normalized attributes
    const attrs = new Array(l)
    for (let i = 0; i < l; i++) {
      const args = match.attrs[i]
      // Groups 3/4/5 hold the double-quoted, single-quoted and unquoted value
      const value = args[3] || args[4] || args[5] || ''
      // href on <a> uses its own newline-decoding flag
      const shouldDecodeNewlines = tagName === 'a' && args[1] === 'href'
        ? options.shouldDecodeNewlinesForHref
        : options.shouldDecodeNewlines
      attrs[i] = {
        name: args[1],
        // Value with HTML entities decoded
        value: decodeAttr(value, shouldDecodeNewlines)
      }
      // Source ranges are only produced outside production builds
      if (process.env.NODE_ENV !== 'production' && options.outputSourceRange) {
        // Skip the whitespace in front of the attribute name. The length must be
        // taken from the matched string ([0]); the match array itself always has
        // length 1, which gave a wrong offset for anything but a single space.
        attrs[i].start = args.start + args[0].match(/^\s*/)[0].length
        attrs[i].end = args.end
      }
    }
    if (!unary) {
      // Non-unary tags stay open until their end tag is seen
      stack.push({ tag: tagName, lowerCasedTag: tagName.toLowerCase(), attrs: attrs, start: match.start, end: match.end })
      lastTag = tagName
    }
    if (options.start) {
      options.start(tagName, attrs, unary, match.start, match.end)
    }
  }

  // Close the element named `tagName` together with everything opened after it.
  // Called without arguments it closes every element still on the stack.
  // Stray </br> and </p> end tags are reported as <br> and <p></p>.
  function parseEndTag (tagName, start, end) {
    // pos: stack position of the element being closed (-1 when not found)
    let pos, lowerCasedTagName
    // Default both offsets to the current position
    if (start == null) start = index
    if (end == null) end = index
    // Find the closest opened tag of the same type
    if (tagName) {
      lowerCasedTagName = tagName.toLowerCase()
      // Search from the top of the stack downwards
      for (pos = stack.length - 1; pos >= 0; pos--) {
        if (stack[pos].lowerCasedTag === lowerCasedTagName) {
          break
        }
      }
    } else {
      // If no tag name is provided, clean shop
      pos = 0
    }
    if (pos >= 0) {
      // Close all the open elements, up the stack
      for (let i = stack.length - 1; i >= pos; i--) {
        // Every element above the matched one (or every element, when cleaning
        // up) was never closed explicitly
        if (process.env.NODE_ENV !== 'production' &&
          (i > pos || !tagName) &&
          options.warn
        ) {
          options.warn(
            `tag <${stack[i].tag}> has no matching end tag.`,
            { start: stack[i].start, end: stack[i].end }
          )
        }
        if (options.end) {
          options.end(stack[i].tag, start, end)
        }
      }
      // Remove the open elements from the stack
      stack.length = pos
      // When pos is 0 this leaves lastTag as 0, i.e. falsy
      lastTag = pos && stack[pos - 1].tag
    } else if (lowerCasedTagName === 'br') {
      // Unmatched </br>: report a unary <br>
      if (options.start) {
        options.start(tagName, [], true, start, end)
      }
    } else if (lowerCasedTagName === 'p') {
      // Unmatched </p>: report an empty <p></p>
      if (options.start) {
        options.start(tagName, [], false, start, end)
      }
      if (options.end) {
        options.end(tagName, start, end)
      }
    }
  }
}