apply sparse set trick for nlist exclusion

This commit is contained in:
Kyryl Melekhin
2021-12-11 15:30:56 +00:00
parent 14fd849706
commit b2180201d5
2 changed files with 68 additions and 91 deletions

20
README
View File

@@ -164,16 +164,16 @@ near the overflow, but as you may guess that does not come
for free. for free.
Currently I removed all dynamic global state from the instructions Currently I removed all dynamic global state from the instructions
fixing any overflow issue at the cost of slight overhead of needing fixing any overflow issue utilizing a sparse set data structure trick
to look through the nlist states, to prevent their readdition. This which abuses the uninitialized variables. This allows the redundant
solution is still fast because it affects only nlist + split run on states to be excluded in O(1) operation. That said, don't run
so most other uses of regex don't suffer big performance penalty. valgrind on pikevm as it will go crazy, or find a way to suppress
This does not solve the ambiguity problem with multiple errors from pikevm.
continuous states though. Finding a fast O(1) solution for continuous
ambiguity is the last thing preventing me from calling this regex engine Further reading
PERFECT and limitation free. While yet, this is to be invented it ===============
takes a big deal of genius and creativity to make new algorithms https://research.swtch.com/sparse
or find improvements in what we already know. https://swtch.com/~rsc/regexp/regexp1.html
Author and License Author and License
================== ==================

139
pike.c
View File

@@ -132,12 +132,12 @@ void re_dumpcode(rcode *prog)
pc = prog->unilen; pc = prog->unilen;
break; break;
case SPLIT: case SPLIT:
printf("split %d (%d)\n", pc + code[pc] + 1, code[pc]); printf("split %d (%d) #%d\n", pc + code[pc] + 2, code[pc], code[pc+1]);
pc++; pc+=2;
break; break;
case RSPLIT: case RSPLIT:
printf("rsplit %d (%d)\n", pc + code[pc] + 1, code[pc]); printf("rsplit %d (%d) #%d\n", pc + code[pc] + 2, code[pc], code[pc+1]);
pc++; pc+=2;
break; break;
case JMP: case JMP:
printf("jmp %d (%d)\n", pc + code[pc] + 1, code[pc]); printf("jmp %d (%d)\n", pc + code[pc] + 1, code[pc]);
@@ -180,7 +180,8 @@ void re_dumpcode(rcode *prog)
break; break;
} }
} }
printf("Unilen: %d, insts: %d, counted insts: %d\n", prog->unilen, prog->len, i); printf("Unilen: %d, insts: %d, splits: %d, counted insts: %d\n",
prog->unilen, prog->len, prog->splits, i);
} }
/* next todo: crack and factor out this recursion, /* next todo: crack and factor out this recursion,
@@ -204,7 +205,6 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
if (re - *re_loc > 2 && re[-2] == '\\') if (re - *re_loc > 2 && re[-2] == '\\')
break; break;
EMIT(PC++, *re == '<' ? WBEG : WEND); EMIT(PC++, *re == '<' ? WBEG : WEND);
prog->len++;
term = PC; term = PC;
break; break;
} }
@@ -212,12 +212,10 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
term = PC; term = PC;
EMIT(PC++, CHAR); EMIT(PC++, CHAR);
uc_code(c, re) EMIT(PC++, c); uc_code(c, re) EMIT(PC++, c);
prog->len++;
break; break;
case '.': case '.':
term = PC; term = PC;
EMIT(PC++, ANY); EMIT(PC++, ANY);
prog->len++;
break; break;
case '[':; case '[':;
int cnt; int cnt;
@@ -230,7 +228,6 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
} else } else
EMIT(PC++, 1); EMIT(PC++, 1);
PC++; // Skip "# of pairs" byte PC++; // Skip "# of pairs" byte
prog->len++;
for (cnt = 0; *re != ']'; cnt++) { for (cnt = 0; *re != ']'; cnt++) {
if (*re == '\\') re++; if (*re == '\\') re++;
if (!*re) goto syntax_error; if (!*re) goto syntax_error;
@@ -262,7 +259,6 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
sub = ++prog->sub; sub = ++prog->sub;
EMIT(PC++, SAVE); EMIT(PC++, SAVE);
EMIT(PC++, sub); EMIT(PC++, sub);
prog->len++;
} }
int res = _compilecode(&re, prog, sizecode); int res = _compilecode(&re, prog, sizecode);
*re_loc = re; *re_loc = re;
@@ -271,83 +267,53 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
if (capture) { if (capture) {
EMIT(PC++, SAVE); EMIT(PC++, SAVE);
EMIT(PC++, sub + prog->presub + 1); EMIT(PC++, sub + prog->presub + 1);
prog->len++;
} }
break; break;
case '{':; case '{':;
int maxcnt = 0, mincnt = 0, int maxcnt = 0, mincnt = 0, i = 0, size = PC - term;
i = 0, icnt = 0, inf = 0, size;
re++; re++;
while (isdigit((unsigned char) *re)) while (isdigit((unsigned char) *re))
mincnt = mincnt * 10 + *re++ - '0'; mincnt = mincnt * 10 + *re++ - '0';
if (*re == ',') { if (*re == ',') {
re++; re++;
if (*re == '}') if (*re == '}') {
inf = 1; EMIT(PC, RSPLIT);
EMIT(PC+1, REL(PC, PC - size - 1));
PC += 3;
maxcnt = mincnt;
}
while (isdigit((unsigned char) *re)) while (isdigit((unsigned char) *re))
maxcnt = maxcnt * 10 + *re++ - '0'; maxcnt = maxcnt * 10 + *re++ - '0';
} else } else
maxcnt = mincnt; maxcnt = mincnt;
for (size = PC - term; i < mincnt-1; i++) { for (; i < mincnt-1; i++) {
if (code) if (code)
memcpy(&code[PC], &code[term], size*sizeof(int)); memcpy(&code[PC], &code[term], size*sizeof(int));
PC += size; PC += size;
} }
if (inf) {
EMIT(PC, RSPLIT);
EMIT(PC+1, REL(PC, PC - size));
PC += 2;
prog->len++;
prog->splits++;
maxcnt = mincnt;
}
for (i = maxcnt-mincnt; i > 0; i--) { for (i = maxcnt-mincnt; i > 0; i--) {
EMIT(PC++, SPLIT); EMIT(PC++, SPLIT);
EMIT(PC++, REL(PC, PC+((size+2)*i))); EMIT(PC++, REL(PC-1, PC+((size+3)*i)));
prog->splits++; PC++;
prog->len++;
if (code) if (code)
memcpy(&code[PC], &code[term], size*sizeof(int)); memcpy(&code[PC], &code[term], size*sizeof(int));
PC += size; PC += size;
} }
if (code) {
inf = 0;
for (i = 0; i < size; i++)
switch (code[term+i]) {
case CLASS:
i += code[term+i+2] * 2 + 2;
icnt++;
break;
case SPLIT:
case RSPLIT:
inf++;
case JMP:
case SAVE:
case CHAR:
i++;
case ANY:
icnt++;
}
prog->splits += (maxcnt-1) * inf;
prog->len += (maxcnt-1) * icnt;
}
break; break;
case '?': case '?':
if (PC == term) goto syntax_error; if (PC == term) goto syntax_error;
INSERT_CODE(term, 2, PC); INSERT_CODE(term, 3, PC);
if (re[1] == '?') { if (re[1] == '?') {
EMIT(term, RSPLIT); EMIT(term, RSPLIT);
re++; re++;
} else } else
EMIT(term, SPLIT); EMIT(term, SPLIT);
EMIT(term + 1, REL(term, PC)); EMIT(term + 1, REL(term, PC - 1));
prog->len++;
prog->splits++;
term = PC; term = PC;
break; break;
case '*': case '*':
if (PC == term) goto syntax_error; if (PC == term) goto syntax_error;
INSERT_CODE(term, 2, PC); INSERT_CODE(term, 3, PC);
EMIT(PC, JMP); EMIT(PC, JMP);
EMIT(PC + 1, REL(PC, term)); EMIT(PC + 1, REL(PC, term));
PC += 2; PC += 2;
@@ -356,9 +322,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
re++; re++;
} else } else
EMIT(term, SPLIT); EMIT(term, SPLIT);
EMIT(term + 1, REL(term, PC)); EMIT(term + 1, REL(term, PC - 1));
prog->splits++;
prog->len += 2;
term = PC; term = PC;
break; break;
case '+': case '+':
@@ -368,32 +332,26 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
re++; re++;
} else } else
EMIT(PC, RSPLIT); EMIT(PC, RSPLIT);
EMIT(PC + 1, REL(PC, term)); EMIT(PC + 1, REL(PC - 1, term));
PC += 2; PC += 3;
prog->splits++;
prog->len++;
term = PC; term = PC;
break; break;
case '|': case '|':
if (alt_label) if (alt_label)
alt_stack[altc++] = alt_label; alt_stack[altc++] = alt_label;
INSERT_CODE(start, 2, PC); INSERT_CODE(start, 3, PC);
EMIT(PC++, JMP); EMIT(PC++, JMP);
alt_label = PC++; alt_label = PC++;
EMIT(start, SPLIT); EMIT(start, SPLIT);
EMIT(start + 1, REL(start, PC)); EMIT(start + 1, REL(start, PC-1));
prog->splits++;
prog->len += 2;
term = PC; term = PC;
break; break;
case '^': case '^':
EMIT(PC++, BOL); EMIT(PC++, BOL);
prog->len++;
term = PC; term = PC;
break; break;
case '$': case '$':
EMIT(PC++, EOL); EMIT(PC++, EOL);
prog->len++;
term = PC; term = PC;
break; break;
} }
@@ -402,7 +360,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
if (code && alt_label) { if (code && alt_label) {
EMIT(alt_label, REL(alt_label, PC) + 1); EMIT(alt_label, REL(alt_label, PC) + 1);
for (int alts = altc; altc; altc--) { for (int alts = altc; altc; altc--) {
int at = alt_stack[alts-altc]+altc*2; int at = alt_stack[alts-altc]+altc*3;
EMIT(at, REL(at, PC) + 1); EMIT(at, REL(at, PC) + 1);
} }
} }
@@ -439,12 +397,29 @@ int re_comp(rcode *prog, const char *re, int nsubs)
if (res < 0) return res; if (res < 0) return res;
// If unparsed chars left // If unparsed chars left
if (*re) return RE_SYNTAX_ERROR; if (*re) return RE_SYNTAX_ERROR;
int icnt = 0, scnt = 0;
for (int i = 0; i < prog->unilen; i++)
switch (prog->insts[i]) {
case CLASS:
i += prog->insts[i+2] * 2 + 2;
icnt++;
break;
case SPLIT:
case RSPLIT:
prog->insts[i + 2] = scnt++;
i++;
case JMP:
case SAVE:
case CHAR:
i++;
case ANY:
icnt++;
}
prog->insts[prog->unilen++] = SAVE; prog->insts[prog->unilen++] = SAVE;
prog->insts[prog->unilen++] = prog->sub + 1; prog->insts[prog->unilen++] = prog->sub + 1;
prog->insts[prog->unilen++] = MATCH; prog->insts[prog->unilen++] = MATCH;
prog->len += 2; prog->splits = scnt;
prog->len = icnt+2;
return RE_SUCCESS; return RE_SUCCESS;
} }
@@ -465,10 +440,11 @@ if (--csub->ref == 0) { \
#define onclist(nn) #define onclist(nn)
#define onnlist(nn) \ #define onnlist(nn) \
for (j = 0; j < plistidx; j++) \ if (sparse[npc[2]] < sparsesz) \
if (npc == plist[j]) \ if (sdense[sparse[npc[2]]] == npc[2]) \
deccheck(nn) \ deccheck(nn) \
plist[plistidx++] = npc; \ sdense[sparsesz] = npc[2]; \
sparse[npc[2]] = sparsesz++; \
#define fastrec(nn, list, listidx) \ #define fastrec(nn, list, listidx) \
nsub->ref++; \ nsub->ref++; \
@@ -524,8 +500,8 @@ if (spc < WBEG) { \
next##nn: \ next##nn: \
if (spc == SPLIT) { \ if (spc == SPLIT) { \
on##list(nn) \ on##list(nn) \
npc += 2; \ npc += 3; \
pcs[si] = npc + npc[-1]; \ pcs[si] = npc + npc[-2]; \
fastrec(nn, list, listidx) \ fastrec(nn, list, listidx) \
} else if (spc == SAVE) { \ } else if (spc == SAVE) { \
if (nsub->ref > 1) { \ if (nsub->ref > 1) { \
@@ -544,9 +520,9 @@ if (spc == SPLIT) { \
npc++; goto rec##nn; \ npc++; goto rec##nn; \
} else if (spc == RSPLIT) { \ } else if (spc == RSPLIT) { \
on##list(nn) \ on##list(nn) \
npc += 2; \ npc += 3; \
pcs[si] = npc; \ pcs[si] = npc; \
npc += npc[-1]; \ npc += npc[-2]; \
fastrec(nn, list, listidx) \ fastrec(nn, list, listidx) \
} else if (spc == WEND) { \ } else if (spc == WEND) { \
if (isword(_sp)) \ if (isword(_sp)) \
@@ -571,12 +547,13 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
{ {
int rsubsize = sizeof(rsub)+(sizeof(char*)*nsubp); int rsubsize = sizeof(rsub)+(sizeof(char*)*nsubp);
int si, i, j, c, suboff = rsubsize, *npc, osubp = nsubp * sizeof(char*); int si, i, j, c, suboff = rsubsize, *npc, osubp = nsubp * sizeof(char*);
int clistidx = 0, nlistidx, plistidx, spc, mcont = MATCH; int clistidx = 0, nlistidx, sparsesz, spc, mcont = MATCH;
const char *sp = s, *_sp = s; const char *sp = s, *_sp = s;
int *insts = prog->insts; int *insts = prog->insts;
int *pcs[prog->splits], *plist[prog->splits]; int *pcs[prog->splits];
unsigned int sdense[prog->splits], sparse[prog->splits];
rsub *subs[prog->splits]; rsub *subs[prog->splits];
char nsubs[500000]; char nsubs[rsubsize * (prog->len-prog->splits+3)];
rsub *nsub, *s1, *matched = NULL, *freesub = NULL; rsub *nsub, *s1, *matched = NULL, *freesub = NULL;
rthread _clist[prog->len+1], _nlist[prog->len+1]; rthread _clist[prog->len+1], _nlist[prog->len+1];
_clist[0].pc = insts, _nlist[0].pc = insts; _clist[0].pc = insts, _nlist[0].pc = insts;
@@ -585,7 +562,7 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
for (;; sp = _sp) { for (;; sp = _sp) {
uc_len(i, sp) uc_code(c, sp) uc_len(i, sp) uc_code(c, sp)
_sp = sp+i; _sp = sp+i;
nlistidx = 0; plistidx = 0; nlistidx = 0; sparsesz = 0;
for (i = 0; i < clistidx; i++) { for (i = 0; i < clistidx; i++) {
npc = clist[i].pc; npc = clist[i].pc;
nsub = clist[i].sub; nsub = clist[i].sub;