发现

一直在用正则,但是如何写一个正则,没有什么头绪,在网上搜索时发现MuJS,作者发布了一版正则实现,不过是C语言写的,里面使用了不少操作内存地址的用法,调试起来不是很舒服,我想能不能移植到JavaScript上面呢?

移植

这样调试起来会方便许多,随即开始移植,经历了两天左右初版就移植完成了,肯定还有Bug,后续再说吧,总算调试器来不是那么不舒服了。

结果

算是移植成功了吧,大多是按照C语言原写法做的处理,有的地方做了符合JavaScript的处理,但是没有完全按照JavaScript来进行处理,还是有遗憾!仓库在这里。希望对大家有帮助 !

代码

  1. /* regcomp flags */
  2. const REG_ICASE = 1
  3. const REG_NEWLINE = 2
  4. /* regexec flags */
  5. const REG_NOTBOL = 4
  6. /* limits */
  7. const REG_MAXSUB = 10
  8. const ESCAPES = "BbDdSsWw^$\\.*+?()[]{}|0123456789"
  9. let REPINF = 255
  10. let MAXSUB = REG_MAXSUB
  11. let MAXPROG = (32 << 10)
  12. L_CHAR = 256
  13. L_CCLASS = 257 /* character class */
  14. L_NCCLASS = 258 /* negative character class */
  15. L_NC = 259 /* "(?:" no capture */
  16. L_PLA = 300 /* "(?=" positive lookahead */
  17. L_NLA = 301 /* "(?!" negative lookahead */
  18. L_WORD = 302 /* "\b" word boundary */
  19. L_NWORD = 303 /* "\B" non-word boundary */
  20. L_REF = 304 /* "\1" back-reference */
  21. L_COUNT = 305 /* {M,N} */
  22. /* Parse */
  23. P_CAT = 0
  24. P_ALT = 1
  25. P_REP = 2
  26. P_BOL = 3
  27. P_EOL = 4
  28. P_WORD = 5
  29. P_NWORD = 6
  30. P_PAR = 7
  31. P_PLA = 8
  32. P_NLA = 9
  33. P_ANY = 10
  34. P_CHAR = 11
  35. P_CCLASS = 12
  36. P_NCCLASS = 13
  37. P_REF = 14
  38. /* Compile */
  39. I_END = 0;
  40. I_JUMP = 1;
  41. I_SPLIT = 2;
  42. I_PLA = 3;
  43. I_NLA = 4;
  44. I_ANYNL = 5;
  45. I_ANY = 6;
  46. I_CHAR = 7;
  47. I_CCLASS = 8;
  48. I_NCCLASS = 9;
  49. I_REF = 10;
  50. I_BOL = 11;
  51. I_EOL = 12;
  52. I_WORD = 13;
  53. I_NWORD = 14;
  54. I_LPAR = 15;
  55. I_RPAR = 16
  56. let ccclass_memory = []
  57. for (let i = 0; i < 16; i++) {
  58. ccclass_memory.push({
  59. end: 3452816845, // 参考C语言
  60. spans: Array(64).fill(3452816845)
  61. })
  62. }
  63. let g = {
  64. sub: [],
  65. prog: {
  66. cclass: ccclass_memory
  67. }
  68. }
  69. // 由于JavaScript中无法使用指针访问内存地址,在移植C语言程序时,使用数组中放置空对象模拟一片内存空间,暂时先放100个内存单元
  70. let memory = []
  71. for (let i = 0; i < 100; i++) {
  72. memory.push({})
  73. }
  74. function recomp(pattern, cflags) {
  75. console.log(pattern)
  76. let node;
  77. let split;
  78. let jump;
  79. let i = 0;
  80. let j = 0;
  81. g.pstart = null;
  82. // g.prog = {}; // 分配内存
  83. n = pattern.length * 2;
  84. if (n > 0) {
  85. // 分配内存
  86. g.pstart = g.pend = {}
  87. }
  88. g.source = pattern;
  89. g.ncclass = 0;
  90. g.nsub = 1;
  91. for (i = 1; i < MAXSUB; ++i) {
  92. g.sub[i] = 0;
  93. }
  94. g.prog.flags = cflags;
  95. next()
  96. node = parsealt();
  97. if (g.lookahead === ')')
  98. die("unmatched ')'");
  99. // if (g.lookahead != 0) // c语言最后为0
  100. if (g.lookahead != 0) // c语言和JavaScript非严格等号相同 "" == 0 为true
  101. die("syntax error");
  102. n = 6 + count(node);
  103. if (n < 0 || n > MAXPROG)
  104. die("program too large");
  105. g.prog.nsub = g.nsub;
  106. g.prog.start = g.prog.end = memory[0];
  107. split = emit(g.prog, I_SPLIT);
  108. let splitIndex = memory.indexOf(split);
  109. split.x = memory[splitIndex + 3];
  110. split.y = memory[splitIndex + 1];
  111. emit(g.prog, I_ANYNL);
  112. jump = emit(g.prog, I_JUMP);
  113. jump.x = split;
  114. emit(g.prog, I_LPAR);
  115. compile(g.prog, node);
  116. emit(g.prog, I_RPAR);
  117. emit(g.prog, I_END);
  118. // free(g.pstart);
  119. // if (errorp) *errorp = NULL;
  120. return g.prog;
  121. }
  122. function toupperrune(c) {
  123. /* TODO: Add unicode support */
  124. if (c >= 'a' && c <= 'z')
  125. return c - 'a' + 'A';
  126. return c;
  127. }
  128. function canon(c) {
  129. let u = toupperrune(c);
  130. if (c >= 128 && u < 128)
  131. return c;
  132. return u;
  133. }
  134. function compile(prog, node) {
  135. let inst, split, jump;
  136. let i;
  137. let flag = true
  138. if (!node)
  139. return;
  140. loop:
  141. while (flag) {
  142. switch (node.type) {
  143. case P_CAT:
  144. compile(prog, node.x);
  145. node = node.y;
  146. continue loop;
  147. case P_ALT:
  148. split = emit(prog, I_SPLIT);
  149. compile(prog, node.x);
  150. jump = emit(prog, I_JUMP);
  151. compile(prog, node.y);
  152. let splitIndex = memory.indexOf(split);
  153. let jumpIndex = memory.indexOf(jump);
  154. split.x = memory[splitIndex + 1];
  155. split.y = memory[jumpIndex + 1];
  156. jump.x = prog.end;
  157. flag = false;
  158. break;
  159. case P_REP:
  160. for (i = 0; i < node.m; ++i) {
  161. inst = prog.end;
  162. compile(prog, node.x);
  163. }
  164. if (node.m == node.n) {
  165. flag = false;
  166. break;
  167. }
  168. if (node.n < REPINF) {
  169. for (i = node.m; i < node.n; ++i) {
  170. split = emit(prog, I_SPLIT);
  171. compile(prog, node.x);
  172. if (node.ng) {
  173. let splitIndex = memory.indexOf(split);
  174. split.y = memory[splitIndex + 1];
  175. split.x = prog.end;
  176. } else {
  177. let splitIndex = memory.indexOf(split);
  178. split.x = memory[splitIndex + 1];
  179. split.y = prog.end;
  180. }
  181. }
  182. } else if (node.m == 0) {
  183. split = emit(prog, I_SPLIT);
  184. compile(prog, node.x);
  185. jump = emit(prog, I_JUMP);
  186. if (node.ng) {
  187. let splitIndex = memory.indexOf(split);
  188. split.y = memory[splitIndex + 1];
  189. split.x = prog.end;
  190. } else {
  191. let splitIndex = memory.indexOf(split);
  192. split.x = memory[splitIndex + 1];
  193. split.y = prog.end;
  194. }
  195. jump.x = split;
  196. } else {
  197. split = emit(prog, I_SPLIT);
  198. if (node.ng) {
  199. split.y = inst;
  200. split.x = prog.end;
  201. } else {
  202. split.x = inst;
  203. split.y = prog.end;
  204. }
  205. }
  206. flag = false;
  207. break;
  208. case P_BOL:
  209. emit(prog, I_BOL);
  210. flag = false;
  211. break;
  212. case P_EOL:
  213. emit(prog, I_EOL);
  214. flag = false;
  215. break;
  216. case P_WORD:
  217. emit(prog, I_WORD);
  218. flag = false;
  219. break;
  220. case P_NWORD:
  221. emit(prog, I_NWORD);
  222. flag = false;
  223. break;
  224. case P_PAR:
  225. inst = emit(prog, I_LPAR);
  226. inst.n = node.n;
  227. compile(prog, node.x);
  228. inst = emit(prog, I_RPAR);
  229. inst.n = node.n;
  230. flag = false;
  231. break;
  232. case P_PLA:
  233. split = emit(prog, I_PLA);
  234. compile(prog, node.x);
  235. emit(prog, I_END);
  236. splitIndex = memory.indexOf(split);
  237. split.x = memory[splitIndex + 1];
  238. split.y = prog.end;
  239. flag = false;
  240. break;
  241. case P_NLA:
  242. split = emit(prog, I_NLA);
  243. compile(prog, node.x);
  244. emit(prog, I_END);
  245. splitIndex = memory.indexOf(split);
  246. split.x = memory[splitIndex + 1];
  247. split.y = prog.end;
  248. flag = false;
  249. break;
  250. case P_ANY:
  251. emit(prog, I_ANY);
  252. flag = false;
  253. break;
  254. case P_CHAR:
  255. inst = emit(prog, I_CHAR);
  256. inst.c = (prog.flags & REG_ICASE) ? canon(node.c) : node.c;
  257. flag = false;
  258. break;
  259. case P_CCLASS:
  260. inst = emit(prog, I_CCLASS);
  261. inst.cc = node.cc;
  262. flag = false;
  263. break;
  264. case P_NCCLASS:
  265. inst = emit(prog, I_NCCLASS);
  266. inst.cc = node.cc;
  267. flag = false;
  268. break;
  269. case P_REF:
  270. inst = emit(prog, I_REF);
  271. inst.n = node.n;
  272. flag = false;
  273. break;
  274. }
  275. }
  276. }
  277. function count(node) {
  278. let min, max, n;
  279. if (!node) return 0;
  280. switch (node.type) {
  281. default:
  282. return 1;
  283. case P_CAT:
  284. return count(node.x) + count(node.y);
  285. case P_ALT:
  286. return count(node.x) + count(node.y) + 2;
  287. case P_REP:
  288. min = node.m;
  289. max = node.n;
  290. if (min == max) n = count(node.x) * min;
  291. else if (max < REPINF) n = count(node.x) * max + (max - min);
  292. else n = count(node.x) * (min + 1) + 2;
  293. if (n > MAXPROG) die("program too large");
  294. return n;
  295. case P_PAR:
  296. return count(node.x) + 2;
  297. case P_PLA:
  298. return count(node.x) + 2;
  299. case P_NLA:
  300. return count(node.x) + 2;
  301. }
  302. }
  303. function emit(prog, opcode) {
  304. // Reinst *inst = prog.end++;
  305. let inst = prog.end; // 先将地址赋值给inst
  306. let index = memory.indexOf(prog.end); // 在模拟内存中寻找索引
  307. prog.end = memory[index + 1]; // 将模拟内存中的下一个地址赋值给end
  308. inst.opcode = opcode;
  309. inst.n = 0;
  310. inst.c = 0;
  311. inst.cc = null;
  312. inst.x = inst.y = null;
  313. return inst;
  314. }
  315. function accept(t) {
  316. if (g.lookahead == t) {
  317. next();
  318. return 1;
  319. }
  320. return 0;
  321. }
  322. function newnode(type) {
  323. // let node = g.pend++;
  324. let node = {}
  325. node.type = type;
  326. node.cc = null;
  327. node.c = 0;
  328. node.ng = 0;
  329. node.m = 0;
  330. node.n = 0;
  331. node.x = node.y = null;
  332. return node;
  333. }
  334. function parseatom() {
  335. let atom;
  336. if (g.lookahead == L_CHAR) {
  337. atom = newnode(P_CHAR);
  338. atom.c = g.yychar;
  339. next();
  340. return atom;
  341. }
  342. if (g.lookahead == L_CCLASS) {
  343. atom = newnode(P_CCLASS);
  344. atom.cc = g.yycc;
  345. next();
  346. return atom;
  347. }
  348. if (g.lookahead == L_NCCLASS) {
  349. atom = newnode(P_NCCLASS);
  350. atom.cc = g.yycc;
  351. next();
  352. return atom;
  353. }
  354. if (g.lookahead == L_REF) {
  355. atom = newnode(P_REF);
  356. if (g.yychar == '' || g.yychar > g.nsub || !g.sub[g.yychar])
  357. die("invalid back-reference");
  358. atom.n = g.yychar;
  359. atom.x = g.sub[g.yychar];
  360. next();
  361. return atom;
  362. }
  363. if (accept('.'))
  364. return newnode(P_ANY);
  365. if (accept('(')) {
  366. atom = newnode(P_PAR);
  367. if (g.nsub == MAXSUB)
  368. die("too many captures");
  369. atom.n = g.nsub++;
  370. atom.x = parsealt();
  371. g.sub[atom.n] = atom;
  372. if (!accept(')'))
  373. die("unmatched '('");
  374. return atom;
  375. }
  376. if (accept(L_NC)) {
  377. atom = parsealt();
  378. if (!accept(')'))
  379. die("unmatched '('");
  380. return atom;
  381. }
  382. if (accept(L_PLA)) {
  383. atom = newnode(P_PLA);
  384. atom.x = parsealt();
  385. if (!accept(')'))
  386. die("unmatched '('");
  387. return atom;
  388. }
  389. if (accept(L_NLA)) {
  390. atom = newnode(P_NLA);
  391. atom.x = parsealt();
  392. if (!accept(')'))
  393. die("unmatched '('");
  394. return atom;
  395. }
  396. die("syntax error");
  397. return null;
  398. }
  399. function parserep() {
  400. let atom;
  401. if (accept('^')) return newnode(P_BOL);
  402. if (accept('$')) return newnode(P_EOL);
  403. if (accept(L_WORD)) return newnode(P_WORD);
  404. if (accept(L_NWORD)) return newnode(P_NWORD);
  405. atom = parseatom();
  406. if (g.lookahead == L_COUNT) {
  407. let min = g.yymin,
  408. max = g.yymax;
  409. next();
  410. if (max < min)
  411. die("invalid quantifier");
  412. return newrep(atom, accept('?'), min, max);
  413. }
  414. if (accept('*')) return newrep(atom, accept('?'), 0, REPINF);
  415. if (accept('+')) return newrep(atom, accept('?'), 1, REPINF);
  416. if (accept('?')) return newrep(atom, accept('?'), 0, 1);
  417. return atom;
  418. }
  419. function parsecat() {
  420. let cat, head, tail = {};
  421. if (g.lookahead && g.lookahead != '|' && g.lookahead != ')') {
  422. /* Build a right-leaning tree by splicing in new 'cat' at the tail. */
  423. head = parserep();
  424. tail.point = head; // ??
  425. let prev = head;
  426. while (g.lookahead && g.lookahead != '|' && g.lookahead != ')') {
  427. cat = newnode(P_CAT);
  428. cat.x = tail.point;
  429. cat.y = parserep();
  430. if (prev.x === tail.point) {
  431. tail.all.x = cat
  432. prev = cat
  433. } else if (prev.y === tail.point) {
  434. tail.all.y = cat
  435. prev = cat
  436. } else if (prev === tail.point) {
  437. head = cat
  438. prev = cat
  439. }
  440. tail.point = cat.y;
  441. tail.all = cat;
  442. }
  443. return head;
  444. }
  445. return null;
  446. }
  447. function empty(node) {
  448. if (!node) return 1;
  449. switch (node.type) {
  450. default:
  451. return 1;
  452. case P_CAT:
  453. return empty(node.x) && empty(node.y);
  454. case P_ALT:
  455. return empty(node.x) || empty(node.y);
  456. case P_REP:
  457. return empty(node.x) || node.m == 0;
  458. case P_PAR:
  459. return empty(node.x);
  460. case P_REF:
  461. return empty(node.x);
  462. case P_ANY:
  463. case P_CHAR:
  464. case P_CCLASS:
  465. case P_NCCLASS:
  466. return 0;
  467. }
  468. }
  469. function newrep(atom, ng, min, max) {
  470. let rep = newnode(P_REP);
  471. if (max == REPINF && empty(atom))
  472. die("infinite loop matching the empty string");
  473. rep.ng = ng;
  474. rep.m = min;
  475. rep.n = max;
  476. rep.x = atom;
  477. return rep;
  478. }
  479. function parsealt() {
  480. let alt, x;
  481. alt = parsecat();
  482. while (accept('|')) {
  483. x = alt;
  484. alt = newnode(P_ALT);
  485. alt.x = x;
  486. alt.y = parsecat();
  487. }
  488. return alt;
  489. }
  490. function next() {
  491. g.lookahead = lex();
  492. }
  493. function chartorune(r, s, key) {
  494. /* TODO: Add UTF-8 decoding */
  495. r[key] = s.slice(0, 1);
  496. return 1;
  497. }
  498. function incclasscanon(cc, c) {
  499. let p, r;
  500. for (p = cc.spans; p < cc.end; p += 2)
  501. for (r = p[0]; r <= p[1]; ++r)
  502. if (c == canon(r))
  503. return 1;
  504. return 0;
  505. }
  506. function incclass(cc, c) {
  507. let p;
  508. // for (p = cc.spans; p < cc.end; p += 2)
  509. for (p = 0; p < cc.end; p += 2)
  510. if (cc.spans[p] <= c && c <= cc.spans[p + 1])
  511. return 1;
  512. return 0;
  513. }
  514. function isnewline(c) {
  515. return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
  516. }
  517. function iswordchar(c) {
  518. return c == '_' ||
  519. (c >= 'a' && c <= 'z') ||
  520. (c >= 'A' && c <= 'Z') ||
  521. (c >= '0' && c <= '9');
  522. }
  523. function die(message) {
  524. g.error = message;
  525. throw new Error(message);
  526. // longjmp(g.kaboom, 1);
  527. }
  528. function hex(c) {
  529. if (c >= '0' && c <= '9') return c - '0';
  530. if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
  531. if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
  532. die("invalid escape sequence");
  533. return 0;
  534. }
  535. function nextrune() {
  536. g.source = g.source.slice(chartorune(g, g.source, "yychar"), g.source.length);
  537. if (g.yychar == '\\') {
  538. g.source = g.source.slice(chartorune(g, g.source, "yychar"), g.source.length);
  539. switch (g.yychar) {
  540. case 0:
  541. die("unterminated escape sequence");
  542. break;
  543. case 'f':
  544. g.yychar = '\f';
  545. return 0;
  546. case 'n':
  547. g.yychar = '\n';
  548. return 0;
  549. case 'r':
  550. g.yychar = '\r';
  551. return 0;
  552. case 't':
  553. g.yychar = '\t';
  554. return 0;
  555. case 'v':
  556. g.yychar = '\v';
  557. return 0;
  558. case 'c':
  559. g.yychar = (g.source++) & 31;
  560. return 0;
  561. case 'x':
  562. g.yychar = hex(g.source++) << 4;
  563. g.yychar += hex(g.source++);
  564. if (g.yychar == '') { // ??
  565. g.yychar = '0';
  566. return 1;
  567. }
  568. return 0;
  569. case 'u':
  570. g.yychar = hex(g.source++) << 12;
  571. g.yychar += hex(g.source++) << 8;
  572. g.yychar += hex(g.source++) << 4;
  573. g.yychar += hex(g.source++);
  574. if (g.yychar == '') { // ??
  575. g.yychar = '0';
  576. return 1;
  577. }
  578. return 0;
  579. }
  580. if (ESCAPES.includes(g.yychar)) {
  581. return 1;
  582. }
  583. // if (strchr(ESCAPES, g.yychar)) // strchr 该函数返回在字符串 str 中第一次出现字符 c 的位置,如果未找到该字符则返回 NULL。
  584. // return 1;
  585. if (isunicodeletter(g.yychar) || g.yychar == '_') /* check identity escape */
  586. die("invalid escape character");
  587. return 0;
  588. }
  589. return 0;
  590. }
  591. function newcclass() {
  592. // #define nelem(a) (sizeof (a) / sizeof (a)[0]) 求数组长度
  593. // if (g.ncclass >= nelem(g.prog.cclass))
  594. if (g.ncclass >= g.prog.cclass.length)
  595. die("too many character classes");
  596. // g.yycc = g.prog.cclass + g.ncclass++;
  597. g.yycc = ccclass_memory[g.ncclass++];
  598. g.yycc.end = g.yycc.spans[0]; // ??
  599. rangeIndex = 0; // 新的[]区间,索引清零
  600. }
  601. function addranges_d() {
  602. addrange('0', '9');
  603. }
  604. function addranges_D() {
  605. addrange(0, '0' - 1);
  606. addrange('9' + 1, 0xFFFF);
  607. }
  608. function addranges_W() {
  609. addrange(0, '0' - 1);
  610. addrange('9' + 1, 'A' - 1);
  611. addrange('Z' + 1, '_' - 1);
  612. addrange('_' + 1, 'a' - 1);
  613. addrange('z' + 1, 0xFFFF);
  614. }
  615. function addranges_w() {
  616. addrange('0', '9');
  617. addrange('A', 'Z');
  618. addrange('_', '_');
  619. addrange('a', 'z');
  620. }
  621. function addranges_S() {
  622. addrange(0, 0x9 - 1);
  623. addrange(0x9 + 1, 0xA - 1);
  624. addrange(0xD + 1, 0x20 - 1);
  625. addrange(0x20 + 1, 0xA0 - 1);
  626. addrange(0xA0 + 1, 0x2028 - 1);
  627. addrange(0x2029 + 1, 0xFEFF - 1);
  628. addrange(0xFEFF + 1, 0xFFFF);
  629. }
  630. function addranges_s() {
  631. addrange(0x9, 0x9);
  632. addrange(0xA, 0xD);
  633. addrange(0x20, 0x20);
  634. addrange(0xA0, 0xA0);
  635. addrange(0x2028, 0x2029);
  636. addrange(0xFEFF, 0xFEFF);
  637. }
  638. let rangeIndex = 0; // 为了避免复杂,以JavaScript的方式实现了相同功能
  639. function addrange(a, b) {
  640. if (a > b)
  641. die("invalid character class range");
  642. // if (g.yycc.end + 2 == g.yycc.spans + nelem(g.yycc.spans))
  643. // die("too many character class ranges");
  644. // g.yycc.end++ = a;
  645. // g.yycc.end++ = b;
  646. if (rangeIndex + 2 == 1 + g.yycc.spans.length) { // ??
  647. die("too many character class ranges");
  648. }
  649. // ??
  650. g.yycc.spans[rangeIndex++] = a;
  651. g.yycc.spans[rangeIndex++] = b;
  652. g.yycc.end = rangeIndex; // 无法访问指针,使用JavaScript的方式解决
  653. }
  654. function isalpharune(c) {
  655. /* TODO: Add unicode support */
  656. return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
  657. }
  658. function isunicodeletter(c) {
  659. return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c);
  660. }
  661. function lexclass() {
  662. let type = L_CCLASS;
  663. let quoted, havesave, havedash;
  664. let save = 0;
  665. newcclass();
  666. quoted = nextrune();
  667. if (!quoted && g.yychar == '^') {
  668. type = L_NCCLASS;
  669. quoted = nextrune();
  670. }
  671. havesave = havedash = 0;
  672. for (;;) {
  673. // if (g.yychar == 0)
  674. if (g.yychar == '')
  675. die("unterminated character class");
  676. if (!quoted && g.yychar == ']')
  677. break;
  678. if (!quoted && g.yychar == '-') {
  679. if (havesave) {
  680. if (havedash) {
  681. addrange(save, '-');
  682. havesave = havedash = 0;
  683. } else {
  684. havedash = 1;
  685. }
  686. } else {
  687. save = '-';
  688. havesave = 1;
  689. }
  690. } else if (quoted && "DSWdsw".includes(g.yychar)) {
  691. if (havesave) {
  692. addrange(save, save);
  693. if (havedash)
  694. addrange('-', '-');
  695. }
  696. switch (g.yychar) {
  697. case 'd':
  698. addranges_d();
  699. break;
  700. case 's':
  701. addranges_s();
  702. break;
  703. case 'w':
  704. addranges_w();
  705. break;
  706. case 'D':
  707. addranges_D();
  708. break;
  709. case 'S':
  710. addranges_S();
  711. break;
  712. case 'W':
  713. addranges_W();
  714. break;
  715. }
  716. havesave = havedash = 0;
  717. } else {
  718. if (quoted) {
  719. if (g.yychar == 'b')
  720. g.yychar = '\b';
  721. else if (g.yychar == '0')
  722. g.yychar = 0;
  723. /* else identity escape */
  724. }
  725. if (havesave) {
  726. if (havedash) {
  727. addrange(save, g.yychar);
  728. havesave = havedash = 0;
  729. } else {
  730. addrange(save, save);
  731. save = g.yychar;
  732. }
  733. } else {
  734. save = g.yychar;
  735. havesave = 1;
  736. }
  737. }
  738. quoted = nextrune();
  739. }
  740. if (havesave) {
  741. addrange(save, save);
  742. if (havedash)
  743. addrange('-', '-');
  744. }
  745. return type;
  746. }
  747. function lex() {
  748. let quoted = nextrune();
  749. if (quoted) {
  750. switch (g.yychar) {
  751. case 'b':
  752. return L_WORD;
  753. case 'B':
  754. return L_NWORD;
  755. case 'd':
  756. newcclass();
  757. addranges_d();
  758. return L_CCLASS;
  759. case 's':
  760. newcclass();
  761. addranges_s();
  762. return L_CCLASS;
  763. case 'w':
  764. newcclass();
  765. addranges_w();
  766. return L_CCLASS;
  767. case 'D':
  768. newcclass();
  769. addranges_d();
  770. return L_NCCLASS;
  771. case 'S':
  772. newcclass();
  773. addranges_s();
  774. return L_NCCLASS;
  775. case 'W':
  776. newcclass();
  777. addranges_w();
  778. return L_NCCLASS;
  779. case '0':
  780. g.yychar = 0;
  781. return L_CHAR;
  782. }
  783. if (g.yychar >= '0' && g.yychar <= '9') {
  784. g.yychar -= '0';
  785. if (g.source >= '0' && g.source <= '9')
  786. g.yychar = g.yychar * 10 + g.source++ - '0';
  787. return L_REF;
  788. }
  789. return L_CHAR;
  790. }
  791. switch (g.yychar) {
  792. case 0: // C里面最后是0
  793. case "":
  794. case '$':
  795. case ')':
  796. case '*':
  797. case '+':
  798. case '.':
  799. case '?':
  800. case '^':
  801. case '|':
  802. return g.yychar;
  803. }
  804. if (g.yychar == '{')
  805. return lexcount();
  806. if (g.yychar == '[')
  807. return lexclass();
  808. if (g.yychar == '(') {
  809. if (g.source[0] == '?') {
  810. if (g.source[1] == ':') {
  811. // g.source += 2;
  812. g.source = g.source.slice(2, g.source.length);
  813. return L_NC;
  814. }
  815. if (g.source[1] == '=') {
  816. // g.source += 2;
  817. g.source = g.source.slice(2, g.source.length);
  818. return L_PLA;
  819. }
  820. if (g.source[1] == '!') {
  821. // g.source += 2;
  822. g.source = g.source.slice(2, g.source.length);
  823. return L_NLA;
  824. }
  825. }
  826. return '(';
  827. }
  828. return L_CHAR;
  829. }
  830. function strncmpcanon(a, b, n) {
  831. let ra, rb;
  832. let c;
  833. let tempra = {
  834. ra: ra
  835. }
  836. let temprb = {
  837. rb: rb
  838. }
  839. while (n--) {
  840. if (!a) return -1;
  841. if (!b) return 1;
  842. a = a.slice(chartorune(tempra, a, "ra"), a.length);
  843. ra = tempra.ra;
  844. b = b.slice(chartorune(temprb, b, "rb"), b.length);
  845. rb = temprb.rb;
  846. c = canon(ra) - canon(rb);
  847. if (c)
  848. return c;
  849. }
  850. return 0;
  851. }
  852. function strncmp(str1, str2, n) {
  853. str1 = str1.substring(0, n);
  854. str2 = str2.substring(0, n);
  855. return ((str1 == str2) ? 0 :
  856. ((str1 > str2) ? 1 : -1));
  857. }
  858. function match(pc, sp, bol, flags, out) {
  859. let scratch;
  860. let i;
  861. let c;
  862. let tempc = {
  863. c: c
  864. }
  865. let pcIndex;
  866. for (;;) {
  867. switch (pc.opcode) {
  868. case I_END:
  869. return 1;
  870. case I_JUMP:
  871. pc = pc.x;
  872. break;
  873. case I_SPLIT:
  874. scratch = out;
  875. if (match(pc.x, sp, bol, flags, scratch)) {
  876. out = scratch;
  877. return 1;
  878. }
  879. pc = pc.y;
  880. break;
  881. case I_PLA:
  882. if (!match(pc.x, sp, bol, flags, out))
  883. return 0;
  884. pc = pc.y;
  885. break;
  886. case I_NLA:
  887. scratch = out;
  888. if (match(pc.x, sp, bol, flags, scratch))
  889. return 0;
  890. pc = pc.y;
  891. break;
  892. case I_ANYNL:
  893. sp = sp.slice(chartorune(tempc, sp, "c"), sp.length);
  894. c = tempc.c;
  895. if (c == '') // JavaScript中不使用严格等号 "" == 0 为true
  896. return 0;
  897. pcIndex = memory.indexOf(pc);
  898. pc = memory[pcIndex + 1];
  899. break;
  900. case I_ANY:
  901. sp = sp.slice(chartorune(tempc, sp, "c"), sp.length);
  902. c = tempc.c;
  903. if (c == '')
  904. return 0;
  905. if (isnewline(c))
  906. return 0;
  907. pcIndex = memory.indexOf(pc);
  908. pc = memory[pcIndex + 1];
  909. break;
  910. case I_CHAR:
  911. sp = sp.slice(chartorune(tempc, sp, "c"), sp.length);
  912. c = tempc.c;
  913. if (c == '')
  914. return 0;
  915. if (flags & REG_ICASE)
  916. c = canon(c);
  917. if (c != pc.c)
  918. return 0;
  919. pcIndex = memory.indexOf(pc);
  920. pc = memory[pcIndex + 1];
  921. break;
  922. case I_CCLASS:
  923. sp = sp.slice(chartorune(tempc, sp, "c"), sp.length);
  924. c = tempc.c;
  925. if (c == '')
  926. return 0;
  927. if (flags & REG_ICASE) {
  928. if (!incclasscanon(pc.cc, canon(c)))
  929. return 0;
  930. } else {
  931. if (!incclass(pc.cc, c))
  932. return 0;
  933. }
  934. pcIndex = memory.indexOf(pc);
  935. pc = memory[pcIndex + 1];
  936. break;
  937. case I_NCCLASS:
  938. sp = sp.slice(chartorune(tempc, sp, "c"), sp.length);
  939. c = tempc.c;
  940. if (c == '')
  941. return 0;
  942. if (flags & REG_ICASE) {
  943. if (incclasscanon(pc.cc, canon(c)))
  944. return 0;
  945. } else {
  946. if (incclass(pc.cc, c))
  947. return 0;
  948. }
  949. pcIndex = memory.indexOf(pc);
  950. pc = memory[pcIndex + 1];
  951. break;
  952. case I_REF:
  953. i = out.sub[pc.n].sp.length - out.sub[pc.n].ep.length;
  954. if (flags & REG_ICASE) {
  955. if (strncmpcanon(sp, out.sub[pc.n].sp, i))
  956. return 0;
  957. } else {
  958. if (strncmp(sp, out.sub[pc.n].sp, i))
  959. return 0;
  960. }
  961. if (i > 0)
  962. sp = sp.slice(i, sp.length);
  963. pcIndex = memory.indexOf(pc);
  964. pc = memory[pcIndex + 1];
  965. break;
  966. case I_BOL:
  967. if (sp == bol && !(flags & REG_NOTBOL)) {
  968. pcIndex = memory.indexOf(pc);
  969. pc = memory[pcIndex + 1];
  970. break;
  971. }
  972. if (flags & REG_NEWLINE) {
  973. // 如果有Bug,修复建议,sp > bol之间的比较,在C语言中,sp与bol是char*指针,即内存地址,
  974. // sp > bol若为true,则表示sp的内存地址(指针)较大,实际存储的有效字符个数(内存地址开头到\0之间的字符)是sp比bol少的
  975. // 这和JavaScript里面字符串比较很大不同
  976. // if (sp > bol && isnewline(sp[-1])) { // 原C代码
  977. if (sp.length < bol.length /*改为字符串长度比较*/ && isnewline(sp[-1])) {
  978. pcIndex = memory.indexOf(pc);
  979. pc = memory[pcIndex + 1];
  980. break;
  981. }
  982. }
  983. return 0;
  984. case I_EOL:
  985. if (sp == '') { // 在C语言和JavaScript非严格等号里面成立
  986. pcIndex = memory.indexOf(pc);
  987. pc = memory[pcIndex + 1];
  988. break;
  989. }
  990. if (flags & REG_NEWLINE) {
  991. if (isnewline(sp)) {
  992. pcIndex = memory.indexOf(pc);
  993. pc = memory[pcIndex + 1];
  994. break;
  995. }
  996. }
  997. return 0;
  998. case I_WORD:
  999. // i = sp > bol && iswordchar(sp[-1]);
  1000. if (sp.length < bol.length) {
  1001. let index = bol.indexOf(sp);
  1002. i = iswordchar(bol[index - 1]);
  1003. }
  1004. i ^= iswordchar(sp[0]);
  1005. if (!i)
  1006. return 0;
  1007. pcIndex = memory.indexOf(pc);
  1008. pc = memory[pcIndex + 1];
  1009. break;
  1010. case I_NWORD:
  1011. // i = sp > bol && iswordchar(sp[-1]);
  1012. if (sp.length < bol.length) {
  1013. let index = bol.indexOf(sp);
  1014. i = iswordchar(bol[index - 1]);
  1015. }
  1016. i ^= iswordchar(sp[0]);
  1017. if (i)
  1018. return 0;
  1019. pcIndex = memory.indexOf(pc);
  1020. pc = memory[pcIndex + 1];
  1021. break;
  1022. case I_LPAR:
  1023. out.sub[pc.n].sp = sp;
  1024. pcIndex = memory.indexOf(pc);
  1025. pc = memory[pcIndex + 1];
  1026. break;
  1027. case I_RPAR:
  1028. out.sub[pc.n].ep = sp;
  1029. pcIndex = memory.indexOf(pc);
  1030. pc = memory[pcIndex + 1];
  1031. break;
  1032. default:
  1033. return 0;
  1034. }
  1035. }
  1036. }
  1037. function regexec(prog, sp, sub, eflags) {
  1038. let scratch;
  1039. let i;
  1040. if (!sub)
  1041. sub = scratch;
  1042. sub.nsub = prog.nsub;
  1043. for (i = 0; i < MAXSUB; ++i) {
  1044. if (!sub.sub[i]) {
  1045. sub.sub[i] = {}
  1046. }
  1047. // sub.sub[i].sp = sub.sub[i].ep = null;
  1048. sub.sub[i].sp = sub.sub[i].ep = "";
  1049. }
  1050. return !match(prog.start, sp, sp, prog.flags | eflags, sub);
  1051. }
  1052. function main() {
  1053. let m = {
  1054. sub: []
  1055. }
  1056. // let p = recomp(String.raw `.+\/(.+\..+)$`, 0);
  1057. // let s = "/root/temp/hello.mp3";
  1058. // let p = recomp(String.raw `\B..`, 0);
  1059. // let s = "noonday";
  1060. let p = recomp(String.raw `^((?:[_a-zA-Z])+(?:[_a-zA-Z\d])*)[ ]*(?:\((.*)\))`, 0);
  1061. let s = "_foo0_ (x,y)";
  1062. console.log("nsub =", p.nsub)
  1063. if (!regexec(p, s, m, 0)) {
  1064. for (i = 0; i < m.nsub; ++i) {
  1065. let n = m.sub[i].sp.length - m.sub[i].ep.length;
  1066. if (n > 0)
  1067. console.log("match %d: s=%d e=%d n=%d '%s'\n", i, (s.length - m.sub[i].sp.length), (s.length - m.sub[i].ep.length), n, m.sub[i].sp.slice(0, n));
  1068. else
  1069. console.log("match %d: n=0 ''\n", i);
  1070. }
  1071. } else {
  1072. console.log("no match\n");
  1073. }
  1074. }
  1075. main()