:::info 本文更新于 2018.12.19 :::

本文以 /a/, /^a/, /a$/ 三个简单正则来分析hyperscan smwr分支的匹配算法. 注意:

  • 编译时没有加任何标志位(比如HS_FLAG_CASELESS, 不区分大小写)
  • 3个正则的id都是8

    /a/

    smallwrite_nfa.txt信息: ``` mcclellan 8 report: 8, states: 3, length: 448 astart: 1, fstart: 1 single accept: 1, has_accel: 0 accel_limit: 2, accept_limit 2

Alphabet 0: a 1: [^a]

00000 [00 - ff]->0 00001 [00 - 60]->1 61->2 [62 - ff]->1 00002 [00 - 60]->1 61->2 [62 - ff]->1

Acceleration

Queue : 0 Length : 448 bytes Num Positions : 3 Scratch State : 1 bytes Stream State : 1 bytes Flags : Max Width : inf/unknown Min Width : 0 BiAnchored Width : inf/unknown Max Offset : inf/unknown Reverse Acceleration : NONE

  1. NFA图:<br />![image.png](https://cdn.nlark.com/yuque/0/2019/png/331848/1572404360952-4ef62943-b93c-4a13-b0fa-2c162f43e56c.png#align=left&display=inline&height=197&originHeight=394&originWidth=726&size=26288&status=done&width=363)
  2. 图中, 蓝色箭头表示start_anchored位置, 红色箭头表示start_floating位置. 一共3个状态, 状态0意味着MO_DEAD, 没有画出.
  3. 0-255个可能输入被归纳成2个字母表, 即信息中的Alphabet:
  4. - 0: `a`
  5. - 1: `[^a]`
  6. start_anchored开始匹配:
  7. ```c
  8. // src/nfa/mcclellan.c
  9. static really_inline
  10. char nfaExecMcClellan8_Bi(const struct NFA *n, u64a offset, const u8 *buffer,
  11. size_t length, NfaCallback cb, void *context,
  12. char single) {
  13. assert(n->type == MCCLELLAN_NFA_8);
  14. const struct mcclellan *m = getImplNfa(n);
  15. u32 s = m->start_anchored;
  16. if (mcclellanExec8_i(m, &s, buffer, length, offset, cb, context, single,
  17. NULL, CALLBACK_OUTPUT)
  18. == MO_DEAD) {
  19. return MO_DEAD;
  20. }
  21. const struct mstate_aux *aux = get_aux(m, s);
  22. if (aux->accept_eod) {
  23. doComplexReport(cb, context, m, s, offset + length, 1, NULL, NULL);
  24. }
  25. return s ? MO_ALIVE : MO_DEAD;
  26. }

mcclellanExec8_i:

  1. // src/nfa/mcclellan.c
  2. static really_inline
  3. char mcclellanExec8_i(const struct mcclellan *m, u32 *state, const u8 *buf,
  4. size_t len, u64a offAdj, NfaCallback cb, void *ctxt,
  5. char single, const u8 **c_final, enum MatchMode mode) {
  6. if (!len) {
  7. if (mode == STOP_AT_MATCH) {
  8. *c_final = buf;
  9. }
  10. return MO_ALIVE;
  11. }
  12. u32 s = *state;
  13. const u8 *c = buf;
  14. const u8 *c_end = buf + len;
  15. const struct mstate_aux *aux
  16. = (const struct mstate_aux *)((const char *)m + m->aux_offset
  17. - sizeof(struct NFA));
  18. u32 accept_limit = m->accept_limit_8;
  19. u32 cached_accept_id = 0;
  20. u32 cached_accept_state = 0;
  21. DEBUG_PRINTF("accel %hu, accept %u\n", m->accel_limit_8, accept_limit);
  22. DEBUG_PRINTF("s: %u, len %zu\n", s, len);
  23. const u8 *min_accel_offset = c;
  24. if (!m->has_accel || len < ACCEL_MIN_LEN) {
  25. min_accel_offset = c_end;
  26. goto without_accel;
  27. }
  28. without_accel:
  29. do {
  30. assert(c < min_accel_offset);
  31. if (!s) {
  32. goto exit;
  33. }
  34. s = doNormal8(m, &c, min_accel_offset, s, 0, mode);
  35. if (mode != NO_MATCHES && s >= accept_limit) {
  36. if (mode == STOP_AT_MATCH) {
  37. DEBUG_PRINTF("match - pausing\n");
  38. *state = s;
  39. *c_final = c - 1;
  40. return MO_MATCHES_PENDING;
  41. }
  42. u64a loc = (c - 1) - buf + offAdj + 1;
  43. if (single) {
  44. DEBUG_PRINTF("reporting %u\n", m->arb_report);
  45. if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) {
  46. return MO_DEAD;
  47. }
  48. } else if (doComplexReport(cb, ctxt, m, s, loc, 0,
  49. &cached_accept_state, &cached_accept_id)
  50. == MO_HALT_MATCHING) {
  51. return MO_DEAD;
  52. }
  53. }
  54. assert(c <= min_accel_offset);
  55. } while (c < min_accel_offset);
  56. if (c == c_end) {
  57. goto exit;
  58. }
  59. exit:
  60. *state = s;
  61. if (mode == STOP_AT_MATCH) {
  62. *c_final = c_end;
  63. }
  64. return MO_ALIVE;
  65. }

doNormal8:

  1. static really_inline
  2. u32 doNormal8(const struct mcclellan *m, const u8 **c_inout, const u8 *end,
  3. u32 s, char do_accel, enum MatchMode mode) {
  4. const u8 *c = *c_inout;
  5. u32 accel_limit = m->accel_limit_8;
  6. u32 accept_limit = m->accept_limit_8;
  7. const u32 as = m->alphaShift;
  8. const u8 *succ_table = (const u8 *)((const char *)m
  9. + sizeof(struct mcclellan));
  10. while (c < end && s) {
  11. u8 cprime = m->remap[*c];
  12. DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx\n", *c,
  13. ourisprint(*c) ? *c : '?', cprime);
  14. s = succ_table[(s << as) + cprime];
  15. DEBUG_PRINTF("s: %u\n", s);
  16. c++;
  17. if (do_accel) {
  18. if (s >= accel_limit) {
  19. break;
  20. }
  21. } else {
  22. if (mode != NO_MATCHES && s >= accept_limit) {
  23. break;
  24. }
  25. }
  26. }
  27. *c_inout = c;
  28. return s;
  29. }

输入为”xax”时的匹配过程:

  1. 初始状态, state = m->start_anchored = 1
  2. 输入x, 跳转到1
  3. 输入a, 跳转到2, 2=accept_limit, 暂时跳出状态机, 调用callback上报
  4. 上报后用户回调函数没有让停止, 继续匹配
  5. 输入x, 跳到1
  6. 所有输入数据扫描完成, 退出循环
  7. 最后的状态是1, 检查它有没有eod匹配, 有则上报
  8. 结束, 因最后状态不是0, 返回MO_ALIVE

/^a/

smallwrite_nfa.txt信息:

  1. mcclellan 8
  2. report: 8, states: 3, length: 448
  3. astart: 1, fstart: 0
  4. single accept: 1, has_accel: 0
  5. accel_limit: 2, accept_limit 2
  6. Alphabet
  7. 0: a
  8. 1: [^a]
  9. 00000 [00 - ff]->0
  10. 00001 [00 - 60]->0 61->2 [62 - ff]->0
  11. 00002 [00 - ff]->0
  12. Acceleration
  13. ------------
  14. Queue : 0
  15. Length : 448 bytes
  16. Num Positions : 3
  17. Scratch State : 1 bytes
  18. Stream State : 1 bytes
  19. Flags :
  20. Max Width : inf/unknown
  21. Min Width : 0
  22. BiAnchored Width : inf/unknown
  23. Max Offset : inf/unknown
  24. Reverse Acceleration : NONE

NFA图:
image.png

此时, mcclellan->start_anchored=1, mcclellan->start_floating: 0.

输入为”ax”时的匹配过程:

  1. 初始状态, state = m->start_anchored = 1
  2. 输入a, 跳转到2, 2=accept_limit, 暂时跳出状态机, 调用callback上报
  3. 上报后用户回调函数没有让停止, 继续匹配
  4. 输入x, 跳到0
  5. 所有输入数据扫描完成, 退出循环
  6. 最后的状态是0, 检查它有没有eod匹配, 有则上报
  7. 结束, 因此最后状态是0, 返回MO_DEAD

/a$/

smallwrite_nfa.txt信息:

mcclellan 8
report: 8, states: 4, length: 480
astart: 1, fstart: 1
single accept: 0, has_accel: 0
accel_limit: 4, accept_limit 4

Alphabet
0: a
1: \n
2: [^\na]

00000 [00 - ff]->0
00001 [00 - 60]->1 61->2 [62 - ff]->1
00002 [00 - 09]->1 0a->3 [0b - 60]->1 61->2 [62 - ff]->1
00003 [00 - 60]->1 61->2 [62 - ff]->1

Acceleration
------------
Queue                : 0
Length               : 480 bytes
Num Positions        : 4
Scratch State        : 1 bytes
Stream State         : 1 bytes
Flags                : ACCEPTS_EOD
Max Width            : inf/unknown
Min Width            : 0
BiAnchored Width     : inf/unknown
Max Offset           : inf/unknown
Reverse Acceleration : NONE

NFA图:
image.png

输入为”xa”时的匹配过程:

  1. 初始状态, state = m->start_anchored = 1
  2. 输入x, 跳转到1
  3. 输入a, 跳转到2
  4. 所有输入数据扫描完成, 退出循环
  5. 最后的状态是2, 检查它有没有eod匹配, 发现有(aux->accept_eod), 上报
  6. 结束, 因此最后状态是2, 返回MO_ALIVE

hyperscan: McClellan匹配 - 图3
本作品采用知识共享署名-非商业性使用-禁止演绎 3.0 未本地化版本许可协议进行许可。