implement more efficient search
This commit is contained in:
94
pike.c
94
pike.c
@@ -458,28 +458,13 @@ int re_sizecode(const char *re)
|
|||||||
return dummyprog.unilen;
|
return dummyprog.unilen;
|
||||||
}
|
}
|
||||||
|
|
||||||
int re_comp(rcode *prog, const char *re, int anchored)
|
int re_comp(rcode *prog, const char *re)
|
||||||
{
|
{
|
||||||
prog->len = 0;
|
prog->len = 0;
|
||||||
prog->unilen = 0;
|
prog->unilen = 0;
|
||||||
prog->sub = 0;
|
prog->sub = 0;
|
||||||
prog->splits = 0;
|
prog->splits = 0;
|
||||||
|
|
||||||
// Add code to implement non-anchored operation ("search").
|
|
||||||
// For anchored operation ("match"), this code will be just skipped.
|
|
||||||
// TODO: Implement search in much more efficient manner
|
|
||||||
if (!anchored) {
|
|
||||||
prog->insts[prog->unilen++] = RSPLIT;
|
|
||||||
prog->insts[prog->unilen++] = 3;
|
|
||||||
prog->insts[prog->unilen++] = ANY;
|
|
||||||
prog->insts[prog->unilen++] = JMP;
|
|
||||||
prog->insts[prog->unilen++] = -5;
|
|
||||||
|
|
||||||
prog->insts[prog->unilen++] = SAVE;
|
|
||||||
prog->insts[prog->unilen++] = 0;
|
|
||||||
prog->len += 4;
|
|
||||||
prog->splits++;
|
|
||||||
}
|
|
||||||
int res = _compilecode(&re, prog, /*sizecode*/0);
|
int res = _compilecode(&re, prog, /*sizecode*/0);
|
||||||
if (res < 0) return res;
|
if (res < 0) return res;
|
||||||
// If unparsed chars left
|
// If unparsed chars left
|
||||||
@@ -493,11 +478,28 @@ int re_comp(rcode *prog, const char *re, int anchored)
|
|||||||
return RE_SUCCESS;
|
return RE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define save(nn, csub) \
|
||||||
|
if (csub->ref > 1) { \
|
||||||
|
for (j = 0; j < subidx; j++) { \
|
||||||
|
if (!nsubs[j].ref) { \
|
||||||
|
s1 = &nsubs[j]; \
|
||||||
|
goto freedsub##nn; \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
s1 = &nsubs[subidx++]; \
|
||||||
|
freedsub##nn: \
|
||||||
|
for (j = 0; j < nsubp; j++) \
|
||||||
|
s1->sub[j] = csub->sub[j]; \
|
||||||
|
csub->ref--; \
|
||||||
|
csub = s1; \
|
||||||
|
csub->ref = 1; \
|
||||||
|
} \
|
||||||
|
|
||||||
#define addthread(nn, list, _pc, _sub, cont) \
|
#define addthread(nn, list, _pc, _sub, cont) \
|
||||||
{ \
|
{ \
|
||||||
int i = 0, j, *pc = _pc; \
|
int i = 0, *pc = _pc; \
|
||||||
const char *_sp = sp+l; \
|
const char *_sp = sp+l; \
|
||||||
rsub *s1, *sub = _sub; \
|
rsub *sub = _sub; \
|
||||||
rec##nn: \
|
rec##nn: \
|
||||||
if(plist[pc - prog->insts] == gen) { \
|
if(plist[pc - prog->insts] == gen) { \
|
||||||
sub->ref--; \
|
sub->ref--; \
|
||||||
@@ -532,21 +534,7 @@ int re_comp(rcode *prog, const char *re, int anchored)
|
|||||||
pc += pc[-1]; \
|
pc += pc[-1]; \
|
||||||
goto rec##nn; \
|
goto rec##nn; \
|
||||||
case SAVE: \
|
case SAVE: \
|
||||||
if (sub->ref > 1) { \
|
save(nn, sub) \
|
||||||
for (j = 0; j < subidx; j++) { \
|
|
||||||
if (!nsubs[j].ref) { \
|
|
||||||
s1 = &nsubs[j]; \
|
|
||||||
goto freedsub##nn; \
|
|
||||||
} \
|
|
||||||
} \
|
|
||||||
s1 = &nsubs[subidx++]; \
|
|
||||||
freedsub##nn: \
|
|
||||||
for (j = 0; j < nsubp; j++) \
|
|
||||||
s1->sub[j] = sub->sub[j]; \
|
|
||||||
sub->ref--; \
|
|
||||||
sub = s1; \
|
|
||||||
sub->ref = 1; \
|
|
||||||
} \
|
|
||||||
sub->sub[pc[1]] = _sp; \
|
sub->sub[pc[1]] = _sp; \
|
||||||
pc += 2; \
|
pc += 2; \
|
||||||
goto rec##nn; \
|
goto rec##nn; \
|
||||||
@@ -562,22 +550,27 @@ int re_comp(rcode *prog, const char *re, int anchored)
|
|||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
|
|
||||||
|
#define swaplist() \
|
||||||
|
tmp = clist; \
|
||||||
|
clist = nlist; \
|
||||||
|
nlist = tmp; \
|
||||||
|
nlist->n = 0; \
|
||||||
|
|
||||||
int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
|
int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
|
||||||
{
|
{
|
||||||
int i, c, l = 0, *npc, gen = 1, subidx = 1;
|
int i, j, c, l = 0, *npc, gen = 1, subidx = 1;
|
||||||
const char *sp = s;
|
const char *sp = s;
|
||||||
rsub nsubs[256];
|
rsub nsubs[256];
|
||||||
int plist[prog->unilen];
|
int plist[prog->unilen];
|
||||||
int *pcs[prog->splits];
|
int *pcs[prog->splits];
|
||||||
rsub *subs[prog->splits];
|
rsub *subs[prog->splits];
|
||||||
rsub *nsub = nsubs, *matched = NULL;
|
rsub *nsub = nsubs, *lsub = nsub, *matched = NULL, *s1;
|
||||||
rthreadlist _clist[1+prog->len];
|
rthreadlist _clist[1+prog->len];
|
||||||
rthreadlist _nlist[1+prog->len];
|
rthreadlist _nlist[1+prog->len];
|
||||||
rthreadlist *clist = _clist, *nlist = _nlist, *tmp;
|
rthreadlist *clist = _clist, *nlist = _nlist, *tmp;
|
||||||
memset(plist, 0, prog->unilen*sizeof(plist[0]));
|
memset(plist, 0, prog->unilen*sizeof(plist[0]));
|
||||||
memset(clist, 0, (1+prog->len)*sizeof(rthread));
|
memset(clist, 0, (1+prog->len)*sizeof(rthread));
|
||||||
memset(nlist, 0, (1+prog->len)*sizeof(rthread));
|
memset(nlist, 0, (1+prog->len)*sizeof(rthread));
|
||||||
nsub->ref = 1;
|
|
||||||
|
|
||||||
for(i=0; i<nsubp; i++) {
|
for(i=0; i<nsubp; i++) {
|
||||||
subp[i] = NULL;
|
subp[i] = NULL;
|
||||||
@@ -585,8 +578,10 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
|
|||||||
}
|
}
|
||||||
|
|
||||||
gen = 1;
|
gen = 1;
|
||||||
while (1)
|
nsub->ref = 2;
|
||||||
addthread(1, clist, prog->insts, nsub, break)
|
save(0, nsub);
|
||||||
|
nsub->sub[0] = sp;
|
||||||
|
goto jmp_start;
|
||||||
for(; clist->n; sp += l) {
|
for(; clist->n; sp += l) {
|
||||||
gen++; uc_len(l, sp)
|
gen++; uc_len(l, sp)
|
||||||
for(i=0; i<clist->n; i++) {
|
for(i=0; i<clist->n; i++) {
|
||||||
@@ -594,8 +589,11 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
|
|||||||
nsub = clist->t[i].sub;
|
nsub = clist->t[i].sub;
|
||||||
// If we need to match a character, but there's none left,
|
// If we need to match a character, but there's none left,
|
||||||
// it's fail (we don't schedule current thread for continuation)
|
// it's fail (we don't schedule current thread for continuation)
|
||||||
if (inst_is_consumer(*npc) && !*sp)
|
if (inst_is_consumer(*npc) && !*sp) {
|
||||||
|
if (i >= clist->n-1)
|
||||||
|
goto BreakFor;
|
||||||
continue;
|
continue;
|
||||||
|
}
|
||||||
switch(*npc++) {
|
switch(*npc++) {
|
||||||
case CHAR:
|
case CHAR:
|
||||||
uc_code(c, sp)
|
uc_code(c, sp)
|
||||||
@@ -616,11 +614,19 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
|
|||||||
}
|
}
|
||||||
nsub->ref--;
|
nsub->ref--;
|
||||||
}
|
}
|
||||||
|
if (!matched) {
|
||||||
|
nsub = lsub;
|
||||||
|
nsub->ref++;
|
||||||
|
save(3, nsub)
|
||||||
|
nsub->sub[0] = sp + l;
|
||||||
|
swaplist()
|
||||||
|
jmp_start:
|
||||||
|
while (1)
|
||||||
|
addthread(1, clist, prog->insts, nsub, break)
|
||||||
|
continue;
|
||||||
|
}
|
||||||
BreakFor:
|
BreakFor:
|
||||||
tmp = clist;
|
swaplist()
|
||||||
clist = nlist;
|
|
||||||
nlist = tmp;
|
|
||||||
nlist->n = 0;
|
|
||||||
}
|
}
|
||||||
if(matched) {
|
if(matched) {
|
||||||
for(i=0; i<nsubp; i++)
|
for(i=0; i<nsubp; i++)
|
||||||
@@ -640,7 +646,7 @@ int main(int argc, char *argv[])
|
|||||||
printf("Precalculated size: %d\n", sz);
|
printf("Precalculated size: %d\n", sz);
|
||||||
char code[sizeof(rcode)+sz];
|
char code[sizeof(rcode)+sz];
|
||||||
rcode *_code = (rcode*)code;
|
rcode *_code = (rcode*)code;
|
||||||
if (re_comp(_code, argv[1], 0))
|
if (re_comp(_code, argv[1]))
|
||||||
re_fatal("Error in re_comp");
|
re_fatal("Error in re_comp");
|
||||||
re_dumpcode(_code);
|
re_dumpcode(_code);
|
||||||
#include <time.h>
|
#include <time.h>
|
||||||
|
|||||||
27
test.sh
27
test.sh
@@ -49,9 +49,18 @@ abc+h+d+f
|
|||||||
[A-Fa-f0-9]{64}
|
[A-Fa-f0-9]{64}
|
||||||
<tag>[^<]*</tag>
|
<tag>[^<]*</tag>
|
||||||
^([a-z0-9_.-]+)@([0-9a-z.-]+)\\\\.([a-z.]{2,5})$
|
^([a-z0-9_.-]+)@([0-9a-z.-]+)\\\\.([a-z.]{2,5})$
|
||||||
|
abc\$d
|
||||||
|
abc$|cdb
|
||||||
|
abc$|c
|
||||||
|
^ac|cdb
|
||||||
|
^abc+d
|
||||||
|
^(abc|kj)
|
||||||
|
^(abc|kj)
|
||||||
\\\\babc
|
\\\\babc
|
||||||
ab\\\\bd
|
ab\\\\bd
|
||||||
\\\\b(as|js)
|
\\\\b(as|js)
|
||||||
|
([^qwe]*rty)|(asd[^fgh]*)
|
||||||
|
([^qwe]*rty+)|(asd[^fgh]*)
|
||||||
"
|
"
|
||||||
input="\
|
input="\
|
||||||
abcdef
|
abcdef
|
||||||
@@ -102,9 +111,18 @@ abcccccccccccchdf
|
|||||||
bf33d4a0dbbee85061531c9d47e5aae692c0729e5c9c1fa21c46d9bcab5f52c5
|
bf33d4a0dbbee85061531c9d47e5aae692c0729e5c9c1fa21c46d9bcab5f52c5
|
||||||
ajdas <tag> sidufisudf hsdfhshdfh sdf </tag> asjdfjs
|
ajdas <tag> sidufisudf hsdfhshdfh sdf </tag> asjdfjs
|
||||||
veloval596@godpeed.com
|
veloval596@godpeed.com
|
||||||
|
abc
|
||||||
|
abccdb
|
||||||
|
abcc
|
||||||
|
abccdb
|
||||||
|
abccdb
|
||||||
|
kj
|
||||||
|
jhdfh kj hhd
|
||||||
abc
|
abc
|
||||||
ab d
|
ab d
|
||||||
js hashasd
|
js hashasd
|
||||||
|
qweasd qqqq fff
|
||||||
|
qwehh sjsjsj rtyyyyyyyyyj sdj
|
||||||
"
|
"
|
||||||
expect="\
|
expect="\
|
||||||
(0,3)
|
(0,3)
|
||||||
@@ -155,9 +173,18 @@ expect="\
|
|||||||
(0,64)
|
(0,64)
|
||||||
(6,44)
|
(6,44)
|
||||||
(0,22)(0,10)(11,18)(19,22)
|
(0,22)(0,10)(11,18)(19,22)
|
||||||
|
-nomatch-
|
||||||
|
(3,6)
|
||||||
|
(2,3)
|
||||||
|
(3,6)
|
||||||
|
(0,5)
|
||||||
|
(0,2)(0,2)
|
||||||
|
-nomatch-
|
||||||
(7,10)
|
(7,10)
|
||||||
-nomatch-
|
-nomatch-
|
||||||
(5,7)(5,7)
|
(5,7)(5,7)
|
||||||
|
(3,16)(?,?)(3,16)
|
||||||
|
(3,25)(3,25)(?,?)
|
||||||
(0,0)
|
(0,0)
|
||||||
"
|
"
|
||||||
c=1
|
c=1
|
||||||
|
|||||||
Reference in New Issue
Block a user