finally add those pesky word assertions, god

2021-08-03 18:23:09 +00:00
parent c774bef5c2
commit 11c505447c
3 changed files with 119 additions and 22 deletions
--- a/17
+++ b/17
@@ -30,18 +30,25 @@ to that.
 so that the user does not need to waste time taking strlen()
 * Support for quoted chars in regex.
 * Support for ^, $ assertions in regex.
 * Support for "match" vs "search" operations, as common in other regex APIs.
 * Support for named character classes: \d \D \s \S \w \W.
 * Support for repetition operator {n} and {n,m}.
 * Support for Unicode (UTF-8).
 * Unlike other engines, the output is byte level offset. (Which is more useful)
 * Support for wordend & wordbeg assertions
 - One limitation of word assertions involves meta chars like spaces being used
 in the expression itself: for example, "\< abc" should match " abc" exactly at
 that space word boundary, but it won't. It's possible to fix this, but it would
 require an rsplit before the word assert, and some dirty logic to check that the character
 or class is a space we want to match rather than assert at. But the code for it was too
 dirty and I scrapped it. The syntax for word assertions follows the POSIX C library, not
 the PCRE "\b", which can be used both in front or back of the word; because there is
 no distinction, it makes the implementation potentially even uglier.
 TODO
 ====
-* Support for matching flags like case-insensitive, dot matches all,
+* Support for matching flags like case-insensitive
-multiline, etc.
+* maybe add lookaround, ahead, behind
 * Support for wordend & wordbeg assertions
 Author and License
 ==================
--- a/pike.c
+++ b/pike.c
@@ -81,6 +81,8 @@ enum
 	ASSERT,
 	BOL,
 	EOL,
 	WBEG,
 	WEND,
 	// Instructions which take relative offset as arg
 	JMP,
 	SPLIT,
@@ -187,6 +189,10 @@ void re_dumpcode(rcode *prog)
 				printf("assert bol\n");
 			else if (code[pc] == EOL)
 				printf("assert eol\n");
 			else if (code[pc] == WBEG)
 				printf("assert wbeg\n");
 			else if (code[pc] == WEND)
 				printf("assert wend\n");
 			pc++;
 			break;
 		}
@@ -206,6 +212,13 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
 		case '\\':
 			re++;
 			if (!*re) goto syntax_error; // Trailing backslash
 			if (*re == '<' || *re == '>') {
 				EMIT(PC++, ASSERT);
 				EMIT(PC++, *re == '<' ? WBEG : WEND);
 				prog->len++;
 				term = PC;
 				break;
 			}
 		default:
 			term = PC;
 			EMIT(PC++, CHAR);
@@ -411,8 +424,7 @@ unsupported_escape:
 int re_sizecode(const char *re)
 {
 	rcode dummyprog;
-	// SAVE 0, SAVE 1, MATCH; more bytes for "search" (vs "match") prefix code
+	dummyprog.unilen = 3;
 	dummyprog.unilen = 10;
 	int res = _compilecode(&re, &dummyprog, /*sizecode*/1);
 	if (res < 0) return res;
@@ -469,6 +481,7 @@ if (--csub->ref == 0) { \
 	rsub *sub = _sub; \
 	rec##nn: \
 	if(plist[pc - prog->insts] == gen) { \
 		dec_check##nn: \
 		decref(sub) \
 		rec_check##nn: \
 		if (i) { \
@@ -508,21 +521,22 @@ if (--csub->ref == 0) { \
 		goto rec##nn; \
 	case ASSERT: \
 		pc++; \
-		if(*pc == BOL && _sp != s) \
+		if (*pc == BOL && _sp != s) { \
-			goto rec_check##nn; \
+			if (!i && !listidx) \
-		if(*pc == EOL && *_sp) \
+				return 0; \
-			goto rec_check##nn; \
+			goto dec_check##nn; \
 		} \
 		if (*pc == EOL && *_sp) \
 			goto dec_check##nn; \
 		if (*pc == WBEG && (!isword(_sp) || isword(sp)) \
 				&& !(sp == s && isword(sp))) \
 			goto dec_check##nn; \
 		if (*pc == WEND && isword(_sp)) \
 			goto dec_check##nn; \
 		pc++; goto rec##nn; \
 	} \
 } \
 #define swaplist() \
 tmp = clist; \
 clist = nlist; \
 nlist = tmp; \
 clistidx = nlistidx; \
 nlistidx = 0; \
 int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
 {
 	int i, j, c, l = 0, *npc, gen = 1, subidx = 1;
@@ -550,7 +564,7 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
 	newsub(nsub);
 	nsub->sub[0] = sp;
 	goto jmp_start;
-	for(; clistidx; sp += l) {
+	for(;; sp += l) {
 		gen++; uc_len(l, sp) uc_code(c, sp)
 		for(i = 0; i < clistidx; i++) {
 			npc = clist[i].pc;
@@ -577,18 +591,22 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
 		break_for:
 		if (!c)
 			break;
 		tmp = clist;
 		clist = nlist;
 		nlist = tmp;
 		clistidx = nlistidx;
 		nlistidx = 0;
 		if (!matched) {
 			nsub = lsub;
 			nsub->ref++;
 			newsub(nsub)
 			nsub->sub[0] = sp + l;
 			swaplist()
 			jmp_start:
 			while (1)
 				addthread(1, clist, clistidx, prog->insts, nsub, break)
 			continue;
-		}
+		} else if (!clistidx)
-		swaplist()
+			break;
 	}
 	if(matched) {
 		for(i=0; i<nsubp; i++)
--- a/test.sh
+++ b/test.sh
@@ -56,9 +56,33 @@ abc$|c
 ^abc+d
 ^(abc|kj)
 ^(abc|kj)
 (^abc)|(abc)
 (abc)|(^abc)
 (^abc)|(abc$)
 (^abc)|(abc$)
 (^abc)|(abc$)
 ([^qwe]*rty)|(asd[^fgh]*)
 ([^qwe]*rty+)|(asd[^fgh]*)
 ((abc))(fv)
 \\\\<abc
 \\\\<abc
 \\\\<(as|js)
 \\\\<(as|js)
 ab\\\\<d
 \\\\<d+(abc|fbc|bcd)
 \\\\<d+(abc|fbc|bcd)
 \\\\<d+(abc|fbc|bcd)
 b|\\\\<(abc|fbc|bcd)
 \\\\<abc
 \\\\<abc\\\\>
 abc\\\\>
 abc\\\\>
 \\\\<(hello|world|word|nice|try)\\\\>
 \\\\<(hello|world|word|nice|try)\\\\>
 \\\\<(hello|world|word|nice|try)\\\\>
 \\\\<(hello|world|word|nice|try)\\\\>
 \\\\<(hello|world)\\\\>|\\\\<(word|nice|try)\\\\>
 (abc+)|\\\\<[^k]*\\\\>
 "
 input="\
 abcdef
@@ -116,9 +140,33 @@ abccdb
 abccdb
 kj
 jhdfh kj hhd
 abc
 abc
 abc
 abc
 abc bc
 qweasd     qqqq fff
 qwehh  sjsjsj rtyyyyyyyyyj sdj
 abcfv
   	   abc
 hsdh  abc
     js hashasd
     gjs hashasd
 ab   d
     bcddd bddddfbc
     bcddd ddvddfbc
     bcddd ddddfbc
     bcddd fbc
 abc
   abc   
 abcccc
 abc
 world
 world 
    worldfsd
    dworld
    nice   
    nicehdhfd
 "
 expect="\
 (0,3)
@@ -176,9 +224,33 @@ expect="\
 (0,5)
 (0,2)(0,2)
 -nomatch-
 (1,4)(?,?)(1,4)
 (1,4)(1,4)(?,?)
 (1,4)(?,?)(1,4)
 (0,3)(0,3)(?,?)
 -nomatch-
 (3,16)(?,?)(3,16)
 (3,25)(3,25)(?,?)
 (0,5)(0,3)(0,3)(3,5)
 (7,10)
 (7,10)
 (5,7)(5,7)
 -nomatch-
 -nomatch-
 -nomatch-
 -nomatch-
 (11,18)(15,18)
 (5,6)(?,?)
 (0,3)
 (3,6)
 -nomatch-
 (0,3)
 (0,5)(0,5)
 (1,6)(1,6)
 -nomatch-
 -nomatch-
 (4,8)(?,?)(4,8)
 (4,13)(?,?)
 (0,0)
 "
 c=1