implement more efficient search

2021-07-24 11:35:06 +00:00
parent 678295f25e
commit bfe7983331
2 changed files with 77 additions and 44 deletions
--- a/pike.c
+++ b/pike.c
@@ -458,28 +458,13 @@ int re_sizecode(const char *re)
 	return dummyprog.unilen;
 }
-int re_comp(rcode *prog, const char *re, int anchored)
+int re_comp(rcode *prog, const char *re)
 {
 	prog->len = 0;
 	prog->unilen = 0;
 	prog->sub = 0;
 	prog->splits = 0;
 	// Add code to implement non-anchored operation ("search").
 	// For anchored operation ("match"), this code will be just skipped.
 	// TODO: Implement search in much more efficient manner
 	if (!anchored) {
 		prog->insts[prog->unilen++] = RSPLIT;
 		prog->insts[prog->unilen++] = 3;
 		prog->insts[prog->unilen++] = ANY;
 		prog->insts[prog->unilen++] = JMP;
 		prog->insts[prog->unilen++] = -5;
 		prog->insts[prog->unilen++] = SAVE;
 		prog->insts[prog->unilen++] = 0;
 		prog->len += 4;
 		prog->splits++;
 	}
 	int res = _compilecode(&re, prog, /*sizecode*/0);
 	if (res < 0) return res;
 	// If unparsed chars left
@@ -493,11 +478,28 @@ int re_comp(rcode *prog, const char *re, int anchored)
 	return RE_SUCCESS;
 }
 #define save(nn, csub) \
 if (csub->ref > 1) { \
 	for (j = 0; j < subidx; j++) { \
 		if (!nsubs[j].ref) { \
 			s1 = &nsubs[j]; \
 			goto freedsub##nn; \
 		} \
 	} \
 	s1 = &nsubs[subidx++]; \
 	freedsub##nn: \
 	for (j = 0; j < nsubp; j++) \
 		s1->sub[j] = csub->sub[j]; \
 	csub->ref--; \
 	csub = s1; \
 	csub->ref = 1; \
 } \
 #define addthread(nn, list, _pc, _sub, cont) \
 { \
-	int i = 0, j, *pc = _pc; \
+	int i = 0, *pc = _pc; \
 	const char *_sp = sp+l; \
-	rsub *s1, *sub = _sub; \
+	rsub *sub = _sub; \
 	rec##nn: \
 	if(plist[pc - prog->insts] == gen) { \
 		sub->ref--; \
@@ -532,21 +534,7 @@ int re_comp(rcode *prog, const char *re, int anchored)
 		pc += pc[-1]; \
 		goto rec##nn; \
 	case SAVE: \
-		if (sub->ref > 1) { \
+		save(nn, sub) \
 			for (j = 0; j < subidx; j++) { \
 				if (!nsubs[j].ref) { \
 					s1 = &nsubs[j]; \
 					goto freedsub##nn; \
 				} \
 			} \
 			s1 = &nsubs[subidx++]; \
 			freedsub##nn: \
 			for (j = 0; j < nsubp; j++) \
 				s1->sub[j] = sub->sub[j]; \
 			sub->ref--; \
 			sub = s1; \
 			sub->ref = 1; \
 		} \
 		sub->sub[pc[1]] = _sp; \
 		pc += 2; \
 		goto rec##nn; \
@@ -562,22 +550,27 @@ int re_comp(rcode *prog, const char *re, int anchored)
 	} \
 } \
 #define swaplist() \
 tmp = clist; \
 clist = nlist; \
 nlist = tmp; \
 nlist->n = 0; \
 int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
 {
-	int i, c, l = 0, *npc, gen = 1, subidx = 1;
+	int i, j, c, l = 0, *npc, gen = 1, subidx = 1;
 	const char *sp = s;
 	rsub nsubs[256];
 	int plist[prog->unilen];
 	int *pcs[prog->splits];
 	rsub *subs[prog->splits];
-	rsub *nsub = nsubs, *matched = NULL;
+	rsub *nsub = nsubs, *lsub = nsub, *matched = NULL, *s1;
 	rthreadlist _clist[1+prog->len]; 
 	rthreadlist _nlist[1+prog->len]; 
 	rthreadlist *clist = _clist, *nlist = _nlist, *tmp;
 	memset(plist, 0, prog->unilen*sizeof(plist[0]));
 	memset(clist, 0, (1+prog->len)*sizeof(rthread));
 	memset(nlist, 0, (1+prog->len)*sizeof(rthread));
 	nsub->ref = 1;
 	for(i=0; i<nsubp; i++) {
 		subp[i] = NULL;
@@ -585,8 +578,10 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
 	}
 	gen = 1;
-	while (1)
+	nsub->ref = 2;
-		addthread(1, clist, prog->insts, nsub, break)
+	save(0, nsub);
 	nsub->sub[0] = sp;
 	goto jmp_start;
 	for(; clist->n; sp += l) {
 		gen++; uc_len(l, sp)
 		for(i=0; i<clist->n; i++) {
@@ -594,8 +589,11 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
 			nsub = clist->t[i].sub;
 			// If we need to match a character, but there's none left,
 			// it's fail (we don't schedule current thread for continuation)
-			if (inst_is_consumer(*npc) && !*sp)
+			if (inst_is_consumer(*npc) && !*sp) {
 				if (i >= clist->n-1)
 					goto BreakFor;
 				continue;
 			}
 			switch(*npc++) {
 			case CHAR:
 				uc_code(c, sp)
@@ -616,11 +614,19 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
 			}
 			nsub->ref--;
 		}
 		if (!matched) {
 			nsub = lsub;
 			nsub->ref++;
 			save(3, nsub)
 			nsub->sub[0] = sp + l;
 			swaplist()
 			jmp_start:
 			while (1)
 				addthread(1, clist, prog->insts, nsub, break)
 			continue;
 		}
 	BreakFor:
-		tmp = clist;
+		swaplist()
 		clist = nlist;
 		nlist = tmp;
 		nlist->n = 0;
 	}
 	if(matched) {
 		for(i=0; i<nsubp; i++)
@@ -640,7 +646,7 @@ int main(int argc, char *argv[])
 	printf("Precalculated size: %d\n", sz);
 	char code[sizeof(rcode)+sz];
 	rcode *_code = (rcode*)code;
-	if (re_comp(_code, argv[1], 0))
+	if (re_comp(_code, argv[1]))
 		re_fatal("Error in re_comp");
 	re_dumpcode(_code);
 	#include <time.h>
--- a/test.sh
+++ b/test.sh
@@ -49,9 +49,18 @@ abc+h+d+f
 [A-Fa-f0-9]{64}
 <tag>[^<]*</tag>
 ^([a-z0-9_.-]+)@([0-9a-z.-]+)\\\\.([a-z.]{2,5})$
 abc\$d
 abc$|cdb
 abc$|c
 ^ac|cdb
 ^abc+d
 ^(abc|kj)
 ^(abc|kj)
 \\\\babc
 ab\\\\bd
 \\\\b(as|js)
 ([^qwe]*rty)|(asd[^fgh]*)
 ([^qwe]*rty+)|(asd[^fgh]*)
 "
 input="\
 abcdef
@@ -102,9 +111,18 @@ abcccccccccccchdf
 bf33d4a0dbbee85061531c9d47e5aae692c0729e5c9c1fa21c46d9bcab5f52c5
 ajdas <tag> sidufisudf hsdfhshdfh sdf </tag> asjdfjs
 veloval596@godpeed.com
 abc
 abccdb
 abcc
 abccdb
 abccdb
 kj
 jhdfh kj hhd
   	   abc
 ab   d
     js hashasd
 qweasd     qqqq fff
 qwehh  sjsjsj rtyyyyyyyyyj sdj
 "
 expect="\
 (0,3)
@@ -155,9 +173,18 @@ expect="\
 (0,64)
 (6,44)
 (0,22)(0,10)(11,18)(19,22)
 -nomatch-
 (3,6)
 (2,3)
 (3,6)
 (0,5)
 (0,2)(0,2)
 -nomatch-
 (7,10)
 -nomatch-
 (5,7)(5,7)
 (3,16)(?,?)(3,16)
 (3,25)(3,25)(?,?)
 (0,0)
 "
 c=1