From 55a582230ce7109309cd2156ad73dff3b504ee07 Mon Sep 17 00:00:00 2001 From: Kyryl Melekhin Date: Fri, 15 Oct 2021 17:44:52 +0000 Subject: [PATCH] replace error prone gen global state with better solution --- README | 47 ++++++++++++++++++++++++++++++++ pike.c | 85 +++++++++++++++++++++++++--------------------------------- 2 files changed, 84 insertions(+), 48 deletions(-) diff --git a/README b/README index 420f220..703ec0a 100644 --- a/README +++ b/README @@ -94,6 +94,53 @@ is a must, this is the algorithm. Research has shown that it is possible to disambiguate NFA in polynomial time but it brings serious performance issues on non ambiguous inputs. +This pikevm features an improved submatch extraction +algorithm based on Russ Cox's original design. +I, Kyryl Melekhin, have found a way to properly optimize the +tracking of the 1st number in the submatch pair. Based on a simple +observation of how the NFA is constructed, I noticed that +there is no way for addthread1() to ever reach inner SAVE +instructions in the regex, so that leaves tracking 2nd pairs by +addthread1() irrelevant to the final results (except the need to +initialize the sub after allocation). This improved the overall +performance by 25%, which is massive considering that at the +time there was nothing else left that could be done to make it faster. + +What are on##list macros? +Redundant state inside nlist can happen in a couple of +ways, and it has to do with the (closure) a* (star) operations and +also +. Due to the automata machine design, split happens +to be above the next consumed instruction, and if that +state gets added onto the list we may segfault or give +a wrong submatch result. Rsplit does not have this problem +because it is generated below the consumer instruction, but +it can still add redundant states. Overall this is extremely +difficult to understand or explain, but this is just something +we have to check for. 
We checked for this using an extra int inside +the split instructions, so this left some global state inside the +machine insts. Most of the time we just added to the next +gen number and kept incrementing it forever. This leaves a small +chance of overflowing the int and getting a run on a false state +left from a previous use of the regex. Though if overflow never +happens, there is no chance of getting a false state. Overflows +like this pose a high security threat if a hacker knows +how many cycles he needs to overflow the gen variable and get +an inconsistent result. It is possible to reset the marks if we +near the overflow, but as you may guess, that does not come +for free. + +Currently I have removed all dynamic global state from the instructions, +fixing any overflow issue at the cost of a slight overhead of needing +to look through the nlist states to prevent their re-addition. This +solution is still fast because it affects only nlist + split run on, +so most other uses of regex don't suffer a big performance penalty. +This does not solve the ambiguity problem with multiple +continuous states though. Finding a fast solution for continuous +ambiguity is the last thing preventing me from calling this regex engine +PERFECT and limitation free. While this is yet to be invented, it +takes a great deal of genius and creativity to make new algorithms +or find improvements in what we already know. + Author and License ================== licensed under BSD license, just as the original re1. 
diff --git a/pike.c b/pike.c index fb49c9d..15eb9ff 100644 --- a/pike.c +++ b/pike.c @@ -58,7 +58,6 @@ struct rcode int sub; int presub; int splits; - int gen; int insts[]; }; @@ -93,6 +92,7 @@ typedef struct rsub rsub; struct rsub { int ref; + rsub *freesub; const char *sub[]; }; @@ -134,12 +134,12 @@ void re_dumpcode(rcode *prog) pc = prog->unilen; break; case SPLIT: - printf("split %d (%d)\n", pc + code[pc] + 2, code[pc]); - pc+=2; + printf("split %d (%d)\n", pc + code[pc] + 1, code[pc]); + pc++; break; case RSPLIT: - printf("rsplit %d (%d)\n", pc + code[pc] + 2, code[pc]); - pc+=2; + printf("rsplit %d (%d)\n", pc + code[pc] + 1, code[pc]); + pc++; break; case JMP: printf("jmp %d (%d)\n", pc + code[pc] + 1, code[pc]); @@ -295,17 +295,15 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode) } if (inf) { EMIT(PC, RSPLIT); - EMIT(PC+1, REL(PC, PC - size -1)); - EMIT(PC+2, 0); - PC += 3; + EMIT(PC+1, REL(PC, PC - size)); + PC += 2; prog->len++; prog->splits++; maxcnt = mincnt; } for (i = maxcnt-mincnt; i > 0; i--) { EMIT(PC++, SPLIT); - EMIT(PC++, REL(PC-1, PC+((size+3)*i))); - EMIT(PC++, 0); + EMIT(PC++, REL(PC, PC+((size+2)*i))); prog->splits++; prog->len++; if (code) @@ -331,21 +329,20 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode) break; case '?': if (PC == term) goto syntax_error; - INSERT_CODE(term, 3, PC); + INSERT_CODE(term, 2, PC); if (re[1] == '?') { EMIT(term, RSPLIT); re++; } else EMIT(term, SPLIT); - EMIT(term + 1, REL(term, PC-1)); - EMIT(term + 2, 0); + EMIT(term + 1, REL(term, PC)); prog->len++; prog->splits++; term = PC; break; case '*': if (PC == term) goto syntax_error; - INSERT_CODE(term, 3, PC); + INSERT_CODE(term, 2, PC); EMIT(PC, JMP); EMIT(PC + 1, REL(PC, term)); PC += 2; @@ -354,8 +351,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode) re++; } else EMIT(term, SPLIT); - EMIT(term + 1, REL(term, PC-1)); - EMIT(term + 2, 0); + EMIT(term + 1, REL(term, PC)); 
prog->splits++; prog->len += 2; term = PC; @@ -367,9 +363,8 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode) re++; } else EMIT(PC, RSPLIT); - EMIT(PC + 1, REL(PC-1, term)); - EMIT(PC + 2, 0); - PC += 3; + EMIT(PC + 1, REL(PC, term)); + PC += 2; prog->splits++; prog->len++; term = PC; @@ -377,12 +372,11 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode) case '|': if (alt_label) alt_stack[altc++] = alt_label; - INSERT_CODE(start, 3, PC); + INSERT_CODE(start, 2, PC); EMIT(PC++, JMP); alt_label = PC++; EMIT(start, SPLIT); - EMIT(start + 1, REL(start, PC-1)); - EMIT(start + 2, 0); + EMIT(start + 1, REL(start, PC)); prog->splits++; prog->len += 2; term = PC; @@ -403,7 +397,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode) if (code && alt_label) { EMIT(alt_label, REL(alt_label, PC) + 1); for (int alts = altc; altc; altc--) { - int at = alt_stack[alts-altc]+altc*3; + int at = alt_stack[alts-altc]+altc*2; EMIT(at, REL(at, PC) + 1); } } @@ -435,7 +429,6 @@ int re_comp(rcode *prog, const char *re, int nsubs) prog->sub = 0; prog->presub = nsubs; prog->splits = 0; - prog->gen = 1; int res = _compilecode(&re, prog, /*sizecode*/0); if (res < 0) return res; @@ -450,27 +443,32 @@ int re_comp(rcode *prog, const char *re, int nsubs) return RE_SUCCESS; } -#define _return(state) \ -{ prog->gen = gen + 1; return state; } \ - #define newsub(init, copy) \ if (freesub) \ - { s1 = freesub; freesub = (rsub*)s1->sub[0]; copy } \ + { s1 = freesub; freesub = s1->freesub; copy } \ else \ { s1 = (rsub*)&nsubs[suboff+=rsubsize]; init } \ #define decref(csub) \ if (--csub->ref == 0) { \ - csub->sub[0] = (char*)freesub; \ + csub->freesub = freesub; \ freesub = csub; \ } \ #define deccheck(nn) \ { decref(nsub) goto rec_check##nn; } \ +#define onnlist(nn, list, listidx, when, pre) \ +when for (j = 0; j < listidx; j++) \ + if (npc == list[j].pc) \ + { pre deccheck(nn) } \ + +#define onclist(nn, list, listidx, i, pre) \ + 
#define fastrec(nn, list, listidx) \ nsub->ref++; \ if (*npc < WBEG) { \ + on##list(nn, list, listidx, /*nop*/, subs[i++] = nsub;) \ list[listidx].sub = nsub; \ list[listidx++].pc = npc; \ npc = pcs[i]; \ @@ -487,18 +485,12 @@ memcpy(s1->sub, nsub->sub, osubp / 2);) \ newsub(/*nop*/, /*nop*/) \ memcpy(s1->sub, nsub->sub, osubp); \ -#define onnlist(nn) \ -if (npc[2] == gen) \ - deccheck(nn) \ -npc[2] = gen; \ - -#define onclist(nn) /* nop */ \ - #define addthread(nn, list, listidx) \ { \ int i = 0; \ rec##nn: \ if (*npc < WBEG) { \ + on##list(nn, list, listidx, if (i), /*nop*/) \ list[listidx].sub = nsub; \ list[listidx++].pc = npc; \ rec_check##nn: \ @@ -515,15 +507,13 @@ npc[2] = gen; \ npc += 2 + npc[1]; \ goto rec##nn; \ case SPLIT: \ - on##list(nn) \ - npc += 3; \ - pcs[i] = npc + npc[-2]; \ + npc += 2; \ + pcs[i] = npc + npc[-1]; \ fastrec(nn, list, listidx) \ case RSPLIT: \ - on##list(nn) \ - npc += 3; \ + npc += 2; \ pcs[i] = npc; \ - npc += npc[-2]; \ + npc += npc[-1]; \ fastrec(nn, list, listidx) \ case SAVE: \ if (nsub->ref > 1) { \ @@ -547,7 +537,7 @@ npc[2] = gen; \ case BOL: \ if (_sp != s) { \ if (!i && !listidx) \ - _return(0) \ + return 0; \ deccheck(nn) \ } \ npc++; goto rec##nn; \ @@ -561,7 +551,7 @@ npc[2] = gen; \ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp) { int rsubsize = sizeof(rsub)+(sizeof(char*)*nsubp); - int i, j, c, gen, suboff = rsubsize, *npc; + int i, j, c, suboff = rsubsize, *npc; int clistidx = 0, nlistidx = 0, osubp = nsubp * sizeof(char*); const char *sp = s, *_sp = s; int *insts = prog->insts; @@ -571,10 +561,9 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp) rsub *nsub, *s1, *matched = NULL, *freesub = NULL; rthread _clist[prog->len], _nlist[prog->len]; rthread *clist = _clist, *nlist = _nlist, *tmp; - gen = prog->gen; goto jmp_start; for (;; sp = _sp) { - gen++; uc_len(i, sp) uc_code(c, sp) + uc_len(i, sp) uc_code(c, sp) _sp = sp+i; for (i = 0; i < clistidx; i++) { npc = 
clist[i].pc; @@ -625,9 +614,9 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp) subp[i] = matched->sub[j]; subp[i+1] = matched->sub[nsubp / 2 + j]; } - _return(1) + return 1; } - _return(0) + return 0; } int main(int argc, char *argv[])