replace error prone gen global state with better solution

This commit is contained in:
Kyryl Melekhin
2021-10-15 17:44:52 +00:00
parent 364e04a0f9
commit 55a582230c
2 changed files with 84 additions and 48 deletions

47
README
View File

@@ -94,6 +94,53 @@ is a must, this is the algorithm.
Research has shown that it is possible to disambiguate NFA in polynomial time
but it brings serious performance issues on non ambiguous inputs.
This pikevm features an improved submatch extraction
algorithm based on Russ Cox's original design.
I - Kyryl Melekhin have found a way to optimize the tracking
properly of 1st number in the submatch pair. Based on simple
observation of how the NFA is constructed I noticed that
there is no way for addthread1() to ever reach inner SAVE
instructions in the regex, so that leaves tracking 2nd pairs by
addthread1() irrelevant to the final results (except the need to
initialize the sub after allocation). This improved the overall
performance by 25% which is massive considering that at the
time there was nothing else left to can be done to make it faster.
What are on##list macros?
Redundant state inside nlist can happen in couple of
ways, and has to do with the (closure) a* (star) operations and
also +. Due to the automata machine design split happens
to be above the next consumed instruction and if that
state gets added onto the list we may segfault or give
wrong submatch result. Rsplit does not have this problem
because it is generated below the consumer instruction, but
it can still add redundant states. Overall this is extremely
difficult to understand or explain, but this is just something
we have to check for. We checked for this using extra int inside
the split instructions, so this left some global state inside the
machine insts. Most of the time we just added to the next
gen number and kept incrementing it forever. This leaves a small
chance of overflowing the int and getting a run on a false state
left from previous use of the regex. Though if overflow never
happens there is no chance of getting a false state. Overflows
like this pose a high security threat, if the hacker knows
how many cycles he needs to overflow the gen varible and get
inconsistent result. It is possible to reset the marks if we
near the overflow, but as you may guess that does not come
for free.
Currently I removed all dynamic global state from the instructions
fixing any overlow issue at the cost of slight overhead of needing
to look though the nlist states, to prevent their readdition. This
solution is still fast because it affects only nlist + split run on
so most other uses of regex don't suffer big performace penalty.
This does not solve the ambiguity problem with multible
continuous states though. Finding a fast solution for continuous
ambiguity is the last thing preventing me to call this regex engine
PERFECT and limitation free. While yet, this is to be invented it
takes a big deal of genius and creativity to make new algorithms
or find improvements in what we already know.
Author and License
==================
licensed under BSD license, just as the original re1.

85
pike.c
View File

@@ -58,7 +58,6 @@ struct rcode
int sub;
int presub;
int splits;
int gen;
int insts[];
};
@@ -93,6 +92,7 @@ typedef struct rsub rsub;
struct rsub
{
int ref;
rsub *freesub;
const char *sub[];
};
@@ -134,12 +134,12 @@ void re_dumpcode(rcode *prog)
pc = prog->unilen;
break;
case SPLIT:
printf("split %d (%d)\n", pc + code[pc] + 2, code[pc]);
pc+=2;
printf("split %d (%d)\n", pc + code[pc] + 1, code[pc]);
pc++;
break;
case RSPLIT:
printf("rsplit %d (%d)\n", pc + code[pc] + 2, code[pc]);
pc+=2;
printf("rsplit %d (%d)\n", pc + code[pc] + 1, code[pc]);
pc++;
break;
case JMP:
printf("jmp %d (%d)\n", pc + code[pc] + 1, code[pc]);
@@ -295,17 +295,15 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
}
if (inf) {
EMIT(PC, RSPLIT);
EMIT(PC+1, REL(PC, PC - size -1));
EMIT(PC+2, 0);
PC += 3;
EMIT(PC+1, REL(PC, PC - size));
PC += 2;
prog->len++;
prog->splits++;
maxcnt = mincnt;
}
for (i = maxcnt-mincnt; i > 0; i--) {
EMIT(PC++, SPLIT);
EMIT(PC++, REL(PC-1, PC+((size+3)*i)));
EMIT(PC++, 0);
EMIT(PC++, REL(PC, PC+((size+2)*i)));
prog->splits++;
prog->len++;
if (code)
@@ -331,21 +329,20 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
break;
case '?':
if (PC == term) goto syntax_error;
INSERT_CODE(term, 3, PC);
INSERT_CODE(term, 2, PC);
if (re[1] == '?') {
EMIT(term, RSPLIT);
re++;
} else
EMIT(term, SPLIT);
EMIT(term + 1, REL(term, PC-1));
EMIT(term + 2, 0);
EMIT(term + 1, REL(term, PC));
prog->len++;
prog->splits++;
term = PC;
break;
case '*':
if (PC == term) goto syntax_error;
INSERT_CODE(term, 3, PC);
INSERT_CODE(term, 2, PC);
EMIT(PC, JMP);
EMIT(PC + 1, REL(PC, term));
PC += 2;
@@ -354,8 +351,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
re++;
} else
EMIT(term, SPLIT);
EMIT(term + 1, REL(term, PC-1));
EMIT(term + 2, 0);
EMIT(term + 1, REL(term, PC));
prog->splits++;
prog->len += 2;
term = PC;
@@ -367,9 +363,8 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
re++;
} else
EMIT(PC, RSPLIT);
EMIT(PC + 1, REL(PC-1, term));
EMIT(PC + 2, 0);
PC += 3;
EMIT(PC + 1, REL(PC, term));
PC += 2;
prog->splits++;
prog->len++;
term = PC;
@@ -377,12 +372,11 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
case '|':
if (alt_label)
alt_stack[altc++] = alt_label;
INSERT_CODE(start, 3, PC);
INSERT_CODE(start, 2, PC);
EMIT(PC++, JMP);
alt_label = PC++;
EMIT(start, SPLIT);
EMIT(start + 1, REL(start, PC-1));
EMIT(start + 2, 0);
EMIT(start + 1, REL(start, PC));
prog->splits++;
prog->len += 2;
term = PC;
@@ -403,7 +397,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
if (code && alt_label) {
EMIT(alt_label, REL(alt_label, PC) + 1);
for (int alts = altc; altc; altc--) {
int at = alt_stack[alts-altc]+altc*3;
int at = alt_stack[alts-altc]+altc*2;
EMIT(at, REL(at, PC) + 1);
}
}
@@ -435,7 +429,6 @@ int re_comp(rcode *prog, const char *re, int nsubs)
prog->sub = 0;
prog->presub = nsubs;
prog->splits = 0;
prog->gen = 1;
int res = _compilecode(&re, prog, /*sizecode*/0);
if (res < 0) return res;
@@ -450,27 +443,32 @@ int re_comp(rcode *prog, const char *re, int nsubs)
return RE_SUCCESS;
}
#define _return(state) \
{ prog->gen = gen + 1; return state; } \
#define newsub(init, copy) \
if (freesub) \
{ s1 = freesub; freesub = (rsub*)s1->sub[0]; copy } \
{ s1 = freesub; freesub = s1->freesub; copy } \
else \
{ s1 = (rsub*)&nsubs[suboff+=rsubsize]; init } \
#define decref(csub) \
if (--csub->ref == 0) { \
csub->sub[0] = (char*)freesub; \
csub->freesub = freesub; \
freesub = csub; \
} \
#define deccheck(nn) \
{ decref(nsub) goto rec_check##nn; } \
#define onnlist(nn, list, listidx, when, pre) \
when for (j = 0; j < listidx; j++) \
if (npc == list[j].pc) \
{ pre deccheck(nn) } \
#define onclist(nn, list, listidx, i, pre) \
#define fastrec(nn, list, listidx) \
nsub->ref++; \
if (*npc < WBEG) { \
on##list(nn, list, listidx, /*nop*/, subs[i++] = nsub;) \
list[listidx].sub = nsub; \
list[listidx++].pc = npc; \
npc = pcs[i]; \
@@ -487,18 +485,12 @@ memcpy(s1->sub, nsub->sub, osubp / 2);) \
newsub(/*nop*/, /*nop*/) \
memcpy(s1->sub, nsub->sub, osubp); \
#define onnlist(nn) \
if (npc[2] == gen) \
deccheck(nn) \
npc[2] = gen; \
#define onclist(nn) /* nop */ \
#define addthread(nn, list, listidx) \
{ \
int i = 0; \
rec##nn: \
if (*npc < WBEG) { \
on##list(nn, list, listidx, if (i), /*nop*/) \
list[listidx].sub = nsub; \
list[listidx++].pc = npc; \
rec_check##nn: \
@@ -515,15 +507,13 @@ npc[2] = gen; \
npc += 2 + npc[1]; \
goto rec##nn; \
case SPLIT: \
on##list(nn) \
npc += 3; \
pcs[i] = npc + npc[-2]; \
npc += 2; \
pcs[i] = npc + npc[-1]; \
fastrec(nn, list, listidx) \
case RSPLIT: \
on##list(nn) \
npc += 3; \
npc += 2; \
pcs[i] = npc; \
npc += npc[-2]; \
npc += npc[-1]; \
fastrec(nn, list, listidx) \
case SAVE: \
if (nsub->ref > 1) { \
@@ -547,7 +537,7 @@ npc[2] = gen; \
case BOL: \
if (_sp != s) { \
if (!i && !listidx) \
_return(0) \
return 0; \
deccheck(nn) \
} \
npc++; goto rec##nn; \
@@ -561,7 +551,7 @@ npc[2] = gen; \
int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
{
int rsubsize = sizeof(rsub)+(sizeof(char*)*nsubp);
int i, j, c, gen, suboff = rsubsize, *npc;
int i, j, c, suboff = rsubsize, *npc;
int clistidx = 0, nlistidx = 0, osubp = nsubp * sizeof(char*);
const char *sp = s, *_sp = s;
int *insts = prog->insts;
@@ -571,10 +561,9 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
rsub *nsub, *s1, *matched = NULL, *freesub = NULL;
rthread _clist[prog->len], _nlist[prog->len];
rthread *clist = _clist, *nlist = _nlist, *tmp;
gen = prog->gen;
goto jmp_start;
for (;; sp = _sp) {
gen++; uc_len(i, sp) uc_code(c, sp)
uc_len(i, sp) uc_code(c, sp)
_sp = sp+i;
for (i = 0; i < clistidx; i++) {
npc = clist[i].pc;
@@ -625,9 +614,9 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
subp[i] = matched->sub[j];
subp[i+1] = matched->sub[nsubp / 2 + j];
}
_return(1)
return 1;
}
_return(0)
return 0;
}
int main(int argc, char *argv[])