:::info 本文更新于 2018.12.19 :::
本文以 /a/, /^a/, /a$/ 三个简单正则来分析hyperscan smwr分支的匹配算法. 注意:
- 编译时没有加任何标志位(比如HS_FLAG_CASELESS, 不区分大小写)
- 3个正则的id都是8
/a/
smallwrite_nfa.txt信息: ``` mcclellan 8 report: 8, states: 3, length: 448 astart: 1, fstart: 1 single accept: 1, has_accel: 0 accel_limit: 2, accept_limit 2
Alphabet 0: a 1: [^a]
00000 [00 - ff]->0 00001 [00 - 60]->1 61->2 [62 - ff]->1 00002 [00 - 60]->1 61->2 [62 - ff]->1
Acceleration
Queue : 0 Length : 448 bytes Num Positions : 3 Scratch State : 1 bytes Stream State : 1 bytes Flags : Max Width : inf/unknown Min Width : 0 BiAnchored Width : inf/unknown Max Offset : inf/unknown Reverse Acceleration : NONE
NFA图:<br />图中, 蓝色箭头表示start_anchored位置, 红色箭头表示start_floating位置. 一共3个状态, 状态0意味着MO_DEAD, 没有画出.0-255个可能输入被归纳成2个字母表, 即信息中的Alphabet:- 0: `a`- 1: `[^a]`从start_anchored开始匹配:```c// src/nfa/mcclellan.cstatic really_inlinechar nfaExecMcClellan8_Bi(const struct NFA *n, u64a offset, const u8 *buffer,size_t length, NfaCallback cb, void *context,char single) {assert(n->type == MCCLELLAN_NFA_8);const struct mcclellan *m = getImplNfa(n);u32 s = m->start_anchored;if (mcclellanExec8_i(m, &s, buffer, length, offset, cb, context, single,NULL, CALLBACK_OUTPUT)== MO_DEAD) {return MO_DEAD;}const struct mstate_aux *aux = get_aux(m, s);if (aux->accept_eod) {doComplexReport(cb, context, m, s, offset + length, 1, NULL, NULL);}return s ? MO_ALIVE : MO_DEAD;}
mcclellanExec8_i:
// src/nfa/mcclellan.cstatic really_inlinechar mcclellanExec8_i(const struct mcclellan *m, u32 *state, const u8 *buf,size_t len, u64a offAdj, NfaCallback cb, void *ctxt,char single, const u8 **c_final, enum MatchMode mode) {if (!len) {if (mode == STOP_AT_MATCH) {*c_final = buf;}return MO_ALIVE;}u32 s = *state;const u8 *c = buf;const u8 *c_end = buf + len;const struct mstate_aux *aux= (const struct mstate_aux *)((const char *)m + m->aux_offset- sizeof(struct NFA));u32 accept_limit = m->accept_limit_8;u32 cached_accept_id = 0;u32 cached_accept_state = 0;DEBUG_PRINTF("accel %hu, accept %u\n", m->accel_limit_8, accept_limit);DEBUG_PRINTF("s: %u, len %zu\n", s, len);const u8 *min_accel_offset = c;if (!m->has_accel || len < ACCEL_MIN_LEN) {min_accel_offset = c_end;goto without_accel;}without_accel:do {assert(c < min_accel_offset);if (!s) {goto exit;}s = doNormal8(m, &c, min_accel_offset, s, 0, mode);if (mode != NO_MATCHES && s >= accept_limit) {if (mode == STOP_AT_MATCH) {DEBUG_PRINTF("match - pausing\n");*state = s;*c_final = c - 1;return MO_MATCHES_PENDING;}u64a loc = (c - 1) - buf + offAdj + 1;if (single) {DEBUG_PRINTF("reporting %u\n", m->arb_report);if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) {return MO_DEAD;}} else if (doComplexReport(cb, ctxt, m, s, loc, 0,&cached_accept_state, &cached_accept_id)== MO_HALT_MATCHING) {return MO_DEAD;}}assert(c <= min_accel_offset);} while (c < min_accel_offset);if (c == c_end) {goto exit;}exit:*state = s;if (mode == STOP_AT_MATCH) {*c_final = c_end;}return MO_ALIVE;}
doNormal8:
static really_inlineu32 doNormal8(const struct mcclellan *m, const u8 **c_inout, const u8 *end,u32 s, char do_accel, enum MatchMode mode) {const u8 *c = *c_inout;u32 accel_limit = m->accel_limit_8;u32 accept_limit = m->accept_limit_8;const u32 as = m->alphaShift;const u8 *succ_table = (const u8 *)((const char *)m+ sizeof(struct mcclellan));while (c < end && s) {u8 cprime = m->remap[*c];DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx\n", *c,ourisprint(*c) ? *c : '?', cprime);s = succ_table[(s << as) + cprime];DEBUG_PRINTF("s: %u\n", s);c++;if (do_accel) {if (s >= accel_limit) {break;}} else {if (mode != NO_MATCHES && s >= accept_limit) {break;}}}*c_inout = c;return s;}
输入为”xax”时的匹配过程:
- 初始状态, state = m->start_anchored = 1
- 输入x, 跳转到1
- 输入a, 跳转到2, 2=accept_limit, 暂时跳出状态机, 调用callback上报
- 上报后用户回调函数没有让停止, 继续匹配
- 输入x, 跳到1
- 所有输入数据扫描完成, 退出循环
- 最后的状态是1, 检查它有没有eod匹配, 有则上报
- 结束, 因最后状态不是0, 返回MO_ALIVE
/^a/
smallwrite_nfa.txt信息:
mcclellan 8report: 8, states: 3, length: 448astart: 1, fstart: 0single accept: 1, has_accel: 0accel_limit: 2, accept_limit 2Alphabet0: a1: [^a]00000 [00 - ff]->000001 [00 - 60]->0 61->2 [62 - ff]->000002 [00 - ff]->0Acceleration------------Queue : 0Length : 448 bytesNum Positions : 3Scratch State : 1 bytesStream State : 1 bytesFlags :Max Width : inf/unknownMin Width : 0BiAnchored Width : inf/unknownMax Offset : inf/unknownReverse Acceleration : NONE
NFA图:
此时, mcclellan->start_anchored=1, mcclellan->start_floating: 0.
输入为”ax”时的匹配过程:
- 初始状态, state = m->start_anchored = 1
- 输入a, 跳转到2, 2=accept_limit, 暂时跳出状态机, 调用callback上报
- 上报后用户回调函数没有让停止, 继续匹配
- 输入x, 跳到0
- 所有输入数据扫描完成, 退出循环
- 最后的状态是0, 检查它有没有eod匹配, 有则上报
- 结束, 因此最后状态是0, 返回MO_DEAD
/a$/
smallwrite_nfa.txt信息:
mcclellan 8
report: 8, states: 4, length: 480
astart: 1, fstart: 1
single accept: 0, has_accel: 0
accel_limit: 4, accept_limit 4
Alphabet
0: a
1: \n
2: [^\na]
00000 [00 - ff]->0
00001 [00 - 60]->1 61->2 [62 - ff]->1
00002 [00 - 09]->1 0a->3 [0b - 60]->1 61->2 [62 - ff]->1
00003 [00 - 60]->1 61->2 [62 - ff]->1
Acceleration
------------
Queue : 0
Length : 480 bytes
Num Positions : 4
Scratch State : 1 bytes
Stream State : 1 bytes
Flags : ACCEPTS_EOD
Max Width : inf/unknown
Min Width : 0
BiAnchored Width : inf/unknown
Max Offset : inf/unknown
Reverse Acceleration : NONE
NFA图:
输入为”xa”时的匹配过程:
- 初始状态, state = m->start_anchored = 1
- 输入x, 跳转到1
- 输入a, 跳转到2
- 所有输入数据扫描完成, 退出循环
- 最后的状态是2, 检查它有没有eod匹配, 发现有(aux->accept_eod), 上报
- 结束, 因此最后状态是2, 返回MO_ALIVE

本作品采用知识共享署名-非商业性使用-禁止演绎 3.0 未本地化版本许可协议进行许可。
