From b2180201d596b3fe5dcf4d248c15984debe6cfca Mon Sep 17 00:00:00 2001 From: Kyryl Melekhin Date: Sat, 11 Dec 2021 15:30:56 +0000 Subject: [PATCH] apply sparse set trick for nlist exclusion --- README | 20 ++++----- pike.c | 139 ++++++++++++++++++++++++--------------------------------- 2 files changed, 68 insertions(+), 91 deletions(-) diff --git a/README b/README index b5359ad..c50bc9a 100644 --- a/README +++ b/README @@ -164,16 +164,16 @@ near the overflow, but as you may guess that does not come for free. Currently I removed all dynamic global state from the instructions -fixing any overlow issue at the cost of slight overhead of needing -to look though the nlist states, to prevent their readdition. This -solution is still fast because it affects only nlist + split run on -so most other uses of regex don't suffer big performace penalty. -This does not solve the ambiguity problem with multiple -continuous states though. Finding a fast O(1) solution for continuous -ambiguity is the last thing preventing me to call this regex engine -PERFECT and limitation free. While yet, this is to be invented it -takes a big deal of genius and creativity to make new algorithms -or find improvements in what we already know. +fixing any overflow issue utilizing a sparse set data structure trick +which abuses the uninitialized variables. This allows the redundant +states to be excluded in an O(1) operation. That said, don't run +valgrind on pikevm as it will go crazy, or find a way to suppress +errors from pikevm. 
+ +Further reading +=============== +https://research.swtch.com/sparse +https://swtch.com/~rsc/regexp/regexp1.html Author and License ================== diff --git a/pike.c b/pike.c index fe33289..26296df 100644 --- a/pike.c +++ b/pike.c @@ -132,12 +132,12 @@ void re_dumpcode(rcode *prog) pc = prog->unilen; break; case SPLIT: - printf("split %d (%d)\n", pc + code[pc] + 1, code[pc]); - pc++; + printf("split %d (%d) #%d\n", pc + code[pc] + 2, code[pc], code[pc+1]); + pc+=2; break; case RSPLIT: - printf("rsplit %d (%d)\n", pc + code[pc] + 1, code[pc]); - pc++; + printf("rsplit %d (%d) #%d\n", pc + code[pc] + 2, code[pc], code[pc+1]); + pc+=2; break; case JMP: printf("jmp %d (%d)\n", pc + code[pc] + 1, code[pc]); @@ -180,7 +180,8 @@ void re_dumpcode(rcode *prog) break; } } - printf("Unilen: %d, insts: %d, counted insts: %d\n", prog->unilen, prog->len, i); + printf("Unilen: %d, insts: %d, splits: %d, counted insts: %d\n", + prog->unilen, prog->len, prog->splits, i); } /* next todo: crack and factor out this recursion, @@ -204,7 +205,6 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode) if (re - *re_loc > 2 && re[-2] == '\\') break; EMIT(PC++, *re == '<' ? 
WBEG : WEND); - prog->len++; term = PC; break; } @@ -212,12 +212,10 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode) term = PC; EMIT(PC++, CHAR); uc_code(c, re) EMIT(PC++, c); - prog->len++; break; case '.': term = PC; EMIT(PC++, ANY); - prog->len++; break; case '[':; int cnt; @@ -230,7 +228,6 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode) } else EMIT(PC++, 1); PC++; // Skip "# of pairs" byte - prog->len++; for (cnt = 0; *re != ']'; cnt++) { if (*re == '\\') re++; if (!*re) goto syntax_error; @@ -262,7 +259,6 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode) sub = ++prog->sub; EMIT(PC++, SAVE); EMIT(PC++, sub); - prog->len++; } int res = _compilecode(&re, prog, sizecode); *re_loc = re; @@ -271,83 +267,53 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode) if (capture) { EMIT(PC++, SAVE); EMIT(PC++, sub + prog->presub + 1); - prog->len++; } break; case '{':; - int maxcnt = 0, mincnt = 0, - i = 0, icnt = 0, inf = 0, size; + int maxcnt = 0, mincnt = 0, i = 0, size = PC - term; re++; while (isdigit((unsigned char) *re)) mincnt = mincnt * 10 + *re++ - '0'; if (*re == ',') { re++; - if (*re == '}') - inf = 1; + if (*re == '}') { + EMIT(PC, RSPLIT); + EMIT(PC+1, REL(PC, PC - size - 1)); + PC += 3; + maxcnt = mincnt; + } while (isdigit((unsigned char) *re)) maxcnt = maxcnt * 10 + *re++ - '0'; } else maxcnt = mincnt; - for (size = PC - term; i < mincnt-1; i++) { + for (; i < mincnt-1; i++) { if (code) memcpy(&code[PC], &code[term], size*sizeof(int)); PC += size; } - if (inf) { - EMIT(PC, RSPLIT); - EMIT(PC+1, REL(PC, PC - size)); - PC += 2; - prog->len++; - prog->splits++; - maxcnt = mincnt; - } for (i = maxcnt-mincnt; i > 0; i--) { EMIT(PC++, SPLIT); - EMIT(PC++, REL(PC, PC+((size+2)*i))); - prog->splits++; - prog->len++; + EMIT(PC++, REL(PC-1, PC+((size+3)*i))); + PC++; if (code) memcpy(&code[PC], &code[term], size*sizeof(int)); PC += size; } - if (code) { - inf = 0; - 
for (i = 0; i < size; i++) - switch (code[term+i]) { - case CLASS: - i += code[term+i+2] * 2 + 2; - icnt++; - break; - case SPLIT: - case RSPLIT: - inf++; - case JMP: - case SAVE: - case CHAR: - i++; - case ANY: - icnt++; - } - prog->splits += (maxcnt-1) * inf; - prog->len += (maxcnt-1) * icnt; - } break; case '?': if (PC == term) goto syntax_error; - INSERT_CODE(term, 2, PC); + INSERT_CODE(term, 3, PC); if (re[1] == '?') { EMIT(term, RSPLIT); re++; } else EMIT(term, SPLIT); - EMIT(term + 1, REL(term, PC)); - prog->len++; - prog->splits++; + EMIT(term + 1, REL(term, PC - 1)); term = PC; break; case '*': if (PC == term) goto syntax_error; - INSERT_CODE(term, 2, PC); + INSERT_CODE(term, 3, PC); EMIT(PC, JMP); EMIT(PC + 1, REL(PC, term)); PC += 2; @@ -356,9 +322,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode) re++; } else EMIT(term, SPLIT); - EMIT(term + 1, REL(term, PC)); - prog->splits++; - prog->len += 2; + EMIT(term + 1, REL(term, PC - 1)); term = PC; break; case '+': @@ -368,32 +332,26 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode) re++; } else EMIT(PC, RSPLIT); - EMIT(PC + 1, REL(PC, term)); - PC += 2; - prog->splits++; - prog->len++; + EMIT(PC + 1, REL(PC - 1, term)); + PC += 3; term = PC; break; case '|': if (alt_label) alt_stack[altc++] = alt_label; - INSERT_CODE(start, 2, PC); + INSERT_CODE(start, 3, PC); EMIT(PC++, JMP); alt_label = PC++; EMIT(start, SPLIT); - EMIT(start + 1, REL(start, PC)); - prog->splits++; - prog->len += 2; + EMIT(start + 1, REL(start, PC-1)); term = PC; break; case '^': EMIT(PC++, BOL); - prog->len++; term = PC; break; case '$': EMIT(PC++, EOL); - prog->len++; term = PC; break; } @@ -402,7 +360,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode) if (code && alt_label) { EMIT(alt_label, REL(alt_label, PC) + 1); for (int alts = altc; altc; altc--) { - int at = alt_stack[alts-altc]+altc*2; + int at = alt_stack[alts-altc]+altc*3; EMIT(at, REL(at, PC) + 1); } 
} @@ -439,12 +397,29 @@ int re_comp(rcode *prog, const char *re, int nsubs) if (res < 0) return res; // If unparsed chars left if (*re) return RE_SYNTAX_ERROR; - + int icnt = 0, scnt = 0; + for (int i = 0; i < prog->unilen; i++) + switch (prog->insts[i]) { + case CLASS: + i += prog->insts[i+2] * 2 + 2; + icnt++; + break; + case SPLIT: + case RSPLIT: + prog->insts[i + 2] = scnt++; + i++; + case JMP: + case SAVE: + case CHAR: + i++; + case ANY: + icnt++; + } prog->insts[prog->unilen++] = SAVE; prog->insts[prog->unilen++] = prog->sub + 1; prog->insts[prog->unilen++] = MATCH; - prog->len += 2; - + prog->splits = scnt; + prog->len = icnt+2; return RE_SUCCESS; } @@ -465,10 +440,11 @@ if (--csub->ref == 0) { \ #define onclist(nn) #define onnlist(nn) \ -for (j = 0; j < plistidx; j++) \ - if (npc == plist[j]) \ +if (sparse[npc[2]] < sparsesz) \ + if (sdense[sparse[npc[2]]] == npc[2]) \ deccheck(nn) \ -plist[plistidx++] = npc; \ +sdense[sparsesz] = npc[2]; \ +sparse[npc[2]] = sparsesz++; \ #define fastrec(nn, list, listidx) \ nsub->ref++; \ @@ -524,8 +500,8 @@ if (spc < WBEG) { \ next##nn: \ if (spc == SPLIT) { \ on##list(nn) \ - npc += 2; \ - pcs[si] = npc + npc[-1]; \ + npc += 3; \ + pcs[si] = npc + npc[-2]; \ fastrec(nn, list, listidx) \ } else if (spc == SAVE) { \ if (nsub->ref > 1) { \ @@ -544,9 +520,9 @@ if (spc == SPLIT) { \ npc++; goto rec##nn; \ } else if (spc == RSPLIT) { \ on##list(nn) \ - npc += 2; \ + npc += 3; \ pcs[si] = npc; \ - npc += npc[-1]; \ + npc += npc[-2]; \ fastrec(nn, list, listidx) \ } else if (spc == WEND) { \ if (isword(_sp)) \ @@ -571,12 +547,13 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp) { int rsubsize = sizeof(rsub)+(sizeof(char*)*nsubp); int si, i, j, c, suboff = rsubsize, *npc, osubp = nsubp * sizeof(char*); - int clistidx = 0, nlistidx, plistidx, spc, mcont = MATCH; + int clistidx = 0, nlistidx, sparsesz, spc, mcont = MATCH; const char *sp = s, *_sp = s; int *insts = prog->insts; - int *pcs[prog->splits], 
*plist[prog->splits]; + int *pcs[prog->splits]; + unsigned int sdense[prog->splits], sparse[prog->splits]; rsub *subs[prog->splits]; - char nsubs[500000]; + char nsubs[rsubsize * (prog->len-prog->splits+3)]; rsub *nsub, *s1, *matched = NULL, *freesub = NULL; rthread _clist[prog->len+1], _nlist[prog->len+1]; _clist[0].pc = insts, _nlist[0].pc = insts; @@ -585,7 +562,7 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp) for (;; sp = _sp) { uc_len(i, sp) uc_code(c, sp) _sp = sp+i; - nlistidx = 0; plistidx = 0; + nlistidx = 0; sparsesz = 0; for (i = 0; i < clistidx; i++) { npc = clist[i].pc; nsub = clist[i].sub;