apply sparse set trick for nlist exclusion
This commit is contained in:
20
README
20
README
@@ -164,16 +164,16 @@ near the overflow, but as you may guess that does not come
|
|||||||
for free.
|
for free.
|
||||||
|
|
||||||
Currently I removed all dynamic global state from the instructions
|
Currently I removed all dynamic global state from the instructions
|
||||||
fixing any overflow issue at the cost of a slight overhead of needing
|
fixing any overflow issue utilizing a sparse set data structure trick
|
||||||
to look through the nlist states, to prevent their re-addition. This
|
which abuses the uninitialized variables. This allows the redundant
|
||||||
solution is still fast because it affects only nlist + split run on
|
states to be excluded in O(1) operation. That said, don't run
|
||||||
so most other uses of regex don't suffer a big performance penalty.
|
valgrind on pikevm as it will go crazy, or find a way to suppress
|
||||||
This does not solve the ambiguity problem with multiple
|
errors from pikevm.
|
||||||
continuous states though. Finding a fast O(1) solution for continuous
|
|
||||||
ambiguity is the last thing preventing me from calling this regex engine
|
Further reading
|
||||||
PERFECT and limitation free. While this is yet to be invented, it
|
===============
|
||||||
takes a big deal of genius and creativity to make new algorithms
|
https://research.swtch.com/sparse
|
||||||
or find improvements in what we already know.
|
https://swtch.com/~rsc/regexp/regexp1.html
|
||||||
|
|
||||||
Author and License
|
Author and License
|
||||||
==================
|
==================
|
||||||
|
|||||||
139
pike.c
139
pike.c
@@ -132,12 +132,12 @@ void re_dumpcode(rcode *prog)
|
|||||||
pc = prog->unilen;
|
pc = prog->unilen;
|
||||||
break;
|
break;
|
||||||
case SPLIT:
|
case SPLIT:
|
||||||
printf("split %d (%d)\n", pc + code[pc] + 1, code[pc]);
|
printf("split %d (%d) #%d\n", pc + code[pc] + 2, code[pc], code[pc+1]);
|
||||||
pc++;
|
pc+=2;
|
||||||
break;
|
break;
|
||||||
case RSPLIT:
|
case RSPLIT:
|
||||||
printf("rsplit %d (%d)\n", pc + code[pc] + 1, code[pc]);
|
printf("rsplit %d (%d) #%d\n", pc + code[pc] + 2, code[pc], code[pc+1]);
|
||||||
pc++;
|
pc+=2;
|
||||||
break;
|
break;
|
||||||
case JMP:
|
case JMP:
|
||||||
printf("jmp %d (%d)\n", pc + code[pc] + 1, code[pc]);
|
printf("jmp %d (%d)\n", pc + code[pc] + 1, code[pc]);
|
||||||
@@ -180,7 +180,8 @@ void re_dumpcode(rcode *prog)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
printf("Unilen: %d, insts: %d, counted insts: %d\n", prog->unilen, prog->len, i);
|
printf("Unilen: %d, insts: %d, splits: %d, counted insts: %d\n",
|
||||||
|
prog->unilen, prog->len, prog->splits, i);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* next todo: crack and factor out this recursion,
|
/* next todo: crack and factor out this recursion,
|
||||||
@@ -204,7 +205,6 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
if (re - *re_loc > 2 && re[-2] == '\\')
|
if (re - *re_loc > 2 && re[-2] == '\\')
|
||||||
break;
|
break;
|
||||||
EMIT(PC++, *re == '<' ? WBEG : WEND);
|
EMIT(PC++, *re == '<' ? WBEG : WEND);
|
||||||
prog->len++;
|
|
||||||
term = PC;
|
term = PC;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -212,12 +212,10 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
term = PC;
|
term = PC;
|
||||||
EMIT(PC++, CHAR);
|
EMIT(PC++, CHAR);
|
||||||
uc_code(c, re) EMIT(PC++, c);
|
uc_code(c, re) EMIT(PC++, c);
|
||||||
prog->len++;
|
|
||||||
break;
|
break;
|
||||||
case '.':
|
case '.':
|
||||||
term = PC;
|
term = PC;
|
||||||
EMIT(PC++, ANY);
|
EMIT(PC++, ANY);
|
||||||
prog->len++;
|
|
||||||
break;
|
break;
|
||||||
case '[':;
|
case '[':;
|
||||||
int cnt;
|
int cnt;
|
||||||
@@ -230,7 +228,6 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
} else
|
} else
|
||||||
EMIT(PC++, 1);
|
EMIT(PC++, 1);
|
||||||
PC++; // Skip "# of pairs" byte
|
PC++; // Skip "# of pairs" byte
|
||||||
prog->len++;
|
|
||||||
for (cnt = 0; *re != ']'; cnt++) {
|
for (cnt = 0; *re != ']'; cnt++) {
|
||||||
if (*re == '\\') re++;
|
if (*re == '\\') re++;
|
||||||
if (!*re) goto syntax_error;
|
if (!*re) goto syntax_error;
|
||||||
@@ -262,7 +259,6 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
sub = ++prog->sub;
|
sub = ++prog->sub;
|
||||||
EMIT(PC++, SAVE);
|
EMIT(PC++, SAVE);
|
||||||
EMIT(PC++, sub);
|
EMIT(PC++, sub);
|
||||||
prog->len++;
|
|
||||||
}
|
}
|
||||||
int res = _compilecode(&re, prog, sizecode);
|
int res = _compilecode(&re, prog, sizecode);
|
||||||
*re_loc = re;
|
*re_loc = re;
|
||||||
@@ -271,83 +267,53 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
if (capture) {
|
if (capture) {
|
||||||
EMIT(PC++, SAVE);
|
EMIT(PC++, SAVE);
|
||||||
EMIT(PC++, sub + prog->presub + 1);
|
EMIT(PC++, sub + prog->presub + 1);
|
||||||
prog->len++;
|
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case '{':;
|
case '{':;
|
||||||
int maxcnt = 0, mincnt = 0,
|
int maxcnt = 0, mincnt = 0, i = 0, size = PC - term;
|
||||||
i = 0, icnt = 0, inf = 0, size;
|
|
||||||
re++;
|
re++;
|
||||||
while (isdigit((unsigned char) *re))
|
while (isdigit((unsigned char) *re))
|
||||||
mincnt = mincnt * 10 + *re++ - '0';
|
mincnt = mincnt * 10 + *re++ - '0';
|
||||||
if (*re == ',') {
|
if (*re == ',') {
|
||||||
re++;
|
re++;
|
||||||
if (*re == '}')
|
if (*re == '}') {
|
||||||
inf = 1;
|
EMIT(PC, RSPLIT);
|
||||||
|
EMIT(PC+1, REL(PC, PC - size - 1));
|
||||||
|
PC += 3;
|
||||||
|
maxcnt = mincnt;
|
||||||
|
}
|
||||||
while (isdigit((unsigned char) *re))
|
while (isdigit((unsigned char) *re))
|
||||||
maxcnt = maxcnt * 10 + *re++ - '0';
|
maxcnt = maxcnt * 10 + *re++ - '0';
|
||||||
} else
|
} else
|
||||||
maxcnt = mincnt;
|
maxcnt = mincnt;
|
||||||
for (size = PC - term; i < mincnt-1; i++) {
|
for (; i < mincnt-1; i++) {
|
||||||
if (code)
|
if (code)
|
||||||
memcpy(&code[PC], &code[term], size*sizeof(int));
|
memcpy(&code[PC], &code[term], size*sizeof(int));
|
||||||
PC += size;
|
PC += size;
|
||||||
}
|
}
|
||||||
if (inf) {
|
|
||||||
EMIT(PC, RSPLIT);
|
|
||||||
EMIT(PC+1, REL(PC, PC - size));
|
|
||||||
PC += 2;
|
|
||||||
prog->len++;
|
|
||||||
prog->splits++;
|
|
||||||
maxcnt = mincnt;
|
|
||||||
}
|
|
||||||
for (i = maxcnt-mincnt; i > 0; i--) {
|
for (i = maxcnt-mincnt; i > 0; i--) {
|
||||||
EMIT(PC++, SPLIT);
|
EMIT(PC++, SPLIT);
|
||||||
EMIT(PC++, REL(PC, PC+((size+2)*i)));
|
EMIT(PC++, REL(PC-1, PC+((size+3)*i)));
|
||||||
prog->splits++;
|
PC++;
|
||||||
prog->len++;
|
|
||||||
if (code)
|
if (code)
|
||||||
memcpy(&code[PC], &code[term], size*sizeof(int));
|
memcpy(&code[PC], &code[term], size*sizeof(int));
|
||||||
PC += size;
|
PC += size;
|
||||||
}
|
}
|
||||||
if (code) {
|
|
||||||
inf = 0;
|
|
||||||
for (i = 0; i < size; i++)
|
|
||||||
switch (code[term+i]) {
|
|
||||||
case CLASS:
|
|
||||||
i += code[term+i+2] * 2 + 2;
|
|
||||||
icnt++;
|
|
||||||
break;
|
|
||||||
case SPLIT:
|
|
||||||
case RSPLIT:
|
|
||||||
inf++;
|
|
||||||
case JMP:
|
|
||||||
case SAVE:
|
|
||||||
case CHAR:
|
|
||||||
i++;
|
|
||||||
case ANY:
|
|
||||||
icnt++;
|
|
||||||
}
|
|
||||||
prog->splits += (maxcnt-1) * inf;
|
|
||||||
prog->len += (maxcnt-1) * icnt;
|
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
case '?':
|
case '?':
|
||||||
if (PC == term) goto syntax_error;
|
if (PC == term) goto syntax_error;
|
||||||
INSERT_CODE(term, 2, PC);
|
INSERT_CODE(term, 3, PC);
|
||||||
if (re[1] == '?') {
|
if (re[1] == '?') {
|
||||||
EMIT(term, RSPLIT);
|
EMIT(term, RSPLIT);
|
||||||
re++;
|
re++;
|
||||||
} else
|
} else
|
||||||
EMIT(term, SPLIT);
|
EMIT(term, SPLIT);
|
||||||
EMIT(term + 1, REL(term, PC));
|
EMIT(term + 1, REL(term, PC - 1));
|
||||||
prog->len++;
|
|
||||||
prog->splits++;
|
|
||||||
term = PC;
|
term = PC;
|
||||||
break;
|
break;
|
||||||
case '*':
|
case '*':
|
||||||
if (PC == term) goto syntax_error;
|
if (PC == term) goto syntax_error;
|
||||||
INSERT_CODE(term, 2, PC);
|
INSERT_CODE(term, 3, PC);
|
||||||
EMIT(PC, JMP);
|
EMIT(PC, JMP);
|
||||||
EMIT(PC + 1, REL(PC, term));
|
EMIT(PC + 1, REL(PC, term));
|
||||||
PC += 2;
|
PC += 2;
|
||||||
@@ -356,9 +322,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
re++;
|
re++;
|
||||||
} else
|
} else
|
||||||
EMIT(term, SPLIT);
|
EMIT(term, SPLIT);
|
||||||
EMIT(term + 1, REL(term, PC));
|
EMIT(term + 1, REL(term, PC - 1));
|
||||||
prog->splits++;
|
|
||||||
prog->len += 2;
|
|
||||||
term = PC;
|
term = PC;
|
||||||
break;
|
break;
|
||||||
case '+':
|
case '+':
|
||||||
@@ -368,32 +332,26 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
re++;
|
re++;
|
||||||
} else
|
} else
|
||||||
EMIT(PC, RSPLIT);
|
EMIT(PC, RSPLIT);
|
||||||
EMIT(PC + 1, REL(PC, term));
|
EMIT(PC + 1, REL(PC - 1, term));
|
||||||
PC += 2;
|
PC += 3;
|
||||||
prog->splits++;
|
|
||||||
prog->len++;
|
|
||||||
term = PC;
|
term = PC;
|
||||||
break;
|
break;
|
||||||
case '|':
|
case '|':
|
||||||
if (alt_label)
|
if (alt_label)
|
||||||
alt_stack[altc++] = alt_label;
|
alt_stack[altc++] = alt_label;
|
||||||
INSERT_CODE(start, 2, PC);
|
INSERT_CODE(start, 3, PC);
|
||||||
EMIT(PC++, JMP);
|
EMIT(PC++, JMP);
|
||||||
alt_label = PC++;
|
alt_label = PC++;
|
||||||
EMIT(start, SPLIT);
|
EMIT(start, SPLIT);
|
||||||
EMIT(start + 1, REL(start, PC));
|
EMIT(start + 1, REL(start, PC-1));
|
||||||
prog->splits++;
|
|
||||||
prog->len += 2;
|
|
||||||
term = PC;
|
term = PC;
|
||||||
break;
|
break;
|
||||||
case '^':
|
case '^':
|
||||||
EMIT(PC++, BOL);
|
EMIT(PC++, BOL);
|
||||||
prog->len++;
|
|
||||||
term = PC;
|
term = PC;
|
||||||
break;
|
break;
|
||||||
case '$':
|
case '$':
|
||||||
EMIT(PC++, EOL);
|
EMIT(PC++, EOL);
|
||||||
prog->len++;
|
|
||||||
term = PC;
|
term = PC;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -402,7 +360,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
if (code && alt_label) {
|
if (code && alt_label) {
|
||||||
EMIT(alt_label, REL(alt_label, PC) + 1);
|
EMIT(alt_label, REL(alt_label, PC) + 1);
|
||||||
for (int alts = altc; altc; altc--) {
|
for (int alts = altc; altc; altc--) {
|
||||||
int at = alt_stack[alts-altc]+altc*2;
|
int at = alt_stack[alts-altc]+altc*3;
|
||||||
EMIT(at, REL(at, PC) + 1);
|
EMIT(at, REL(at, PC) + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -439,12 +397,29 @@ int re_comp(rcode *prog, const char *re, int nsubs)
|
|||||||
if (res < 0) return res;
|
if (res < 0) return res;
|
||||||
// If unparsed chars left
|
// If unparsed chars left
|
||||||
if (*re) return RE_SYNTAX_ERROR;
|
if (*re) return RE_SYNTAX_ERROR;
|
||||||
|
int icnt = 0, scnt = 0;
|
||||||
|
for (int i = 0; i < prog->unilen; i++)
|
||||||
|
switch (prog->insts[i]) {
|
||||||
|
case CLASS:
|
||||||
|
i += prog->insts[i+2] * 2 + 2;
|
||||||
|
icnt++;
|
||||||
|
break;
|
||||||
|
case SPLIT:
|
||||||
|
case RSPLIT:
|
||||||
|
prog->insts[i + 2] = scnt++;
|
||||||
|
i++;
|
||||||
|
case JMP:
|
||||||
|
case SAVE:
|
||||||
|
case CHAR:
|
||||||
|
i++;
|
||||||
|
case ANY:
|
||||||
|
icnt++;
|
||||||
|
}
|
||||||
prog->insts[prog->unilen++] = SAVE;
|
prog->insts[prog->unilen++] = SAVE;
|
||||||
prog->insts[prog->unilen++] = prog->sub + 1;
|
prog->insts[prog->unilen++] = prog->sub + 1;
|
||||||
prog->insts[prog->unilen++] = MATCH;
|
prog->insts[prog->unilen++] = MATCH;
|
||||||
prog->len += 2;
|
prog->splits = scnt;
|
||||||
|
prog->len = icnt+2;
|
||||||
return RE_SUCCESS;
|
return RE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -465,10 +440,11 @@ if (--csub->ref == 0) { \
|
|||||||
|
|
||||||
#define onclist(nn)
|
#define onclist(nn)
|
||||||
#define onnlist(nn) \
|
#define onnlist(nn) \
|
||||||
for (j = 0; j < plistidx; j++) \
|
if (sparse[npc[2]] < sparsesz) \
|
||||||
if (npc == plist[j]) \
|
if (sdense[sparse[npc[2]]] == npc[2]) \
|
||||||
deccheck(nn) \
|
deccheck(nn) \
|
||||||
plist[plistidx++] = npc; \
|
sdense[sparsesz] = npc[2]; \
|
||||||
|
sparse[npc[2]] = sparsesz++; \
|
||||||
|
|
||||||
#define fastrec(nn, list, listidx) \
|
#define fastrec(nn, list, listidx) \
|
||||||
nsub->ref++; \
|
nsub->ref++; \
|
||||||
@@ -524,8 +500,8 @@ if (spc < WBEG) { \
|
|||||||
next##nn: \
|
next##nn: \
|
||||||
if (spc == SPLIT) { \
|
if (spc == SPLIT) { \
|
||||||
on##list(nn) \
|
on##list(nn) \
|
||||||
npc += 2; \
|
npc += 3; \
|
||||||
pcs[si] = npc + npc[-1]; \
|
pcs[si] = npc + npc[-2]; \
|
||||||
fastrec(nn, list, listidx) \
|
fastrec(nn, list, listidx) \
|
||||||
} else if (spc == SAVE) { \
|
} else if (spc == SAVE) { \
|
||||||
if (nsub->ref > 1) { \
|
if (nsub->ref > 1) { \
|
||||||
@@ -544,9 +520,9 @@ if (spc == SPLIT) { \
|
|||||||
npc++; goto rec##nn; \
|
npc++; goto rec##nn; \
|
||||||
} else if (spc == RSPLIT) { \
|
} else if (spc == RSPLIT) { \
|
||||||
on##list(nn) \
|
on##list(nn) \
|
||||||
npc += 2; \
|
npc += 3; \
|
||||||
pcs[si] = npc; \
|
pcs[si] = npc; \
|
||||||
npc += npc[-1]; \
|
npc += npc[-2]; \
|
||||||
fastrec(nn, list, listidx) \
|
fastrec(nn, list, listidx) \
|
||||||
} else if (spc == WEND) { \
|
} else if (spc == WEND) { \
|
||||||
if (isword(_sp)) \
|
if (isword(_sp)) \
|
||||||
@@ -571,12 +547,13 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
|
|||||||
{
|
{
|
||||||
int rsubsize = sizeof(rsub)+(sizeof(char*)*nsubp);
|
int rsubsize = sizeof(rsub)+(sizeof(char*)*nsubp);
|
||||||
int si, i, j, c, suboff = rsubsize, *npc, osubp = nsubp * sizeof(char*);
|
int si, i, j, c, suboff = rsubsize, *npc, osubp = nsubp * sizeof(char*);
|
||||||
int clistidx = 0, nlistidx, plistidx, spc, mcont = MATCH;
|
int clistidx = 0, nlistidx, sparsesz, spc, mcont = MATCH;
|
||||||
const char *sp = s, *_sp = s;
|
const char *sp = s, *_sp = s;
|
||||||
int *insts = prog->insts;
|
int *insts = prog->insts;
|
||||||
int *pcs[prog->splits], *plist[prog->splits];
|
int *pcs[prog->splits];
|
||||||
|
unsigned int sdense[prog->splits], sparse[prog->splits];
|
||||||
rsub *subs[prog->splits];
|
rsub *subs[prog->splits];
|
||||||
char nsubs[500000];
|
char nsubs[rsubsize * (prog->len-prog->splits+3)];
|
||||||
rsub *nsub, *s1, *matched = NULL, *freesub = NULL;
|
rsub *nsub, *s1, *matched = NULL, *freesub = NULL;
|
||||||
rthread _clist[prog->len+1], _nlist[prog->len+1];
|
rthread _clist[prog->len+1], _nlist[prog->len+1];
|
||||||
_clist[0].pc = insts, _nlist[0].pc = insts;
|
_clist[0].pc = insts, _nlist[0].pc = insts;
|
||||||
@@ -585,7 +562,7 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
|
|||||||
for (;; sp = _sp) {
|
for (;; sp = _sp) {
|
||||||
uc_len(i, sp) uc_code(c, sp)
|
uc_len(i, sp) uc_code(c, sp)
|
||||||
_sp = sp+i;
|
_sp = sp+i;
|
||||||
nlistidx = 0; plistidx = 0;
|
nlistidx = 0; sparsesz = 0;
|
||||||
for (i = 0; i < clistidx; i++) {
|
for (i = 0; i < clistidx; i++) {
|
||||||
npc = clist[i].pc;
|
npc = clist[i].pc;
|
||||||
nsub = clist[i].sub;
|
nsub = clist[i].sub;
|
||||||
|
|||||||
Reference in New Issue
Block a user