finally add those pesky word assertions, god

2021-08-03 18:23:09 +00:00
parent c774bef5c2
commit 11c505447c
3 changed files with 119 additions and 22 deletions
--- a/17
+++ b/17
@@ -30,18 +30,25 @@ to that.
 so that the user does not need to waste time taking strlen()
 * Support for quoted chars in regex.
 * Support for ^, $ assertions in regex.
-* Support for "match" vs "search" operations, as common in other regex APIs.
-* Support for named character classes: \d \D \s \S \w \W.
 * Support for repetition operator {n} and {n,m}.
 * Support for Unicode (UTF-8).
 * Unlike other engines, the output is byte level offset. (Which is more useful)
+* Support for wordend & wordbeg assertions
+- One limitation of word assertions involves meta chars such as spaces used in
+the expression itself: for example, "\< abc" should match " abc" exactly at
+that space word boundary, but it won't. It's possible to fix this, but it would
+require an rsplit before the word assert, plus some dirty logic to check that the
+character or class is a space we want to match rather than assert at. The code for
+it was too dirty, so I scrapped it. The syntax for word assertions follows the POSIX
+C library, not the PCRE "\b", which can be used at both the front and the back of a
+word; because "\b" makes no distinction, it would make the implementation potentially even uglier.
+

 TODO
 ====

-* Support for matching flags like case-insensitive, dot matches all,
-multiline, etc.
-* Support for wordend & wordbeg assertions
+* Support for matching flags like case-insensitive
+* Maybe add lookaround assertions (lookahead, lookbehind)

 Author and License
 ==================
--- a/pike.c
+++ b/pike.c
@@ -81,6 +81,8 @@ enum
 	ASSERT,
 	BOL,
 	EOL,
+	WBEG,
+	WEND,
 	// Instructions which take relative offset as arg
 	JMP,
 	SPLIT,
@@ -187,6 +189,10 @@ void re_dumpcode(rcode *prog)
 				printf("assert bol\n");
 			else if (code[pc] == EOL)
 				printf("assert eol\n");
+			else if (code[pc] == WBEG)
+				printf("assert wbeg\n");
+			else if (code[pc] == WEND)
+				printf("assert wend\n");
 			pc++;
 			break;
 		}
@@ -206,6 +212,13 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
 		case '\\':
 			re++;
 			if (!*re) goto syntax_error; // Trailing backslash
+			if (*re == '<' || *re == '>') {
+				EMIT(PC++, ASSERT);
+				EMIT(PC++, *re == '<' ? WBEG : WEND);
+				prog->len++;
+				term = PC;
+				break;
+			}
 		default:
 			term = PC;
 			EMIT(PC++, CHAR);
@@ -411,8 +424,7 @@ unsupported_escape:
 int re_sizecode(const char *re)
 {
 	rcode dummyprog;
-	// SAVE 0, SAVE 1, MATCH; more bytes for "search" (vs "match") prefix code
-	dummyprog.unilen = 10;
+	dummyprog.unilen = 3;

 	int res = _compilecode(&re, &dummyprog, /*sizecode*/1);
 	if (res < 0) return res;
@@ -469,6 +481,7 @@ if (--csub->ref == 0) { \
 	rsub *sub = _sub; \
 	rec##nn: \
 	if(plist[pc - prog->insts] == gen) { \
+		dec_check##nn: \
 		decref(sub) \
 		rec_check##nn: \
 		if (i) { \
@@ -508,21 +521,22 @@ if (--csub->ref == 0) { \
 		goto rec##nn; \
 	case ASSERT: \
 		pc++; \
-		if(*pc == BOL && _sp != s) \
-			goto rec_check##nn; \
+		if (*pc == BOL && _sp != s) { \
+			if (!i && !listidx) \
+				return 0; \
+			goto dec_check##nn; \
+		} \
 		if (*pc == EOL && *_sp) \
-			goto rec_check##nn; \
+			goto dec_check##nn; \
+		if (*pc == WBEG && (!isword(_sp) || isword(sp)) \
+				&& !(sp == s && isword(sp))) \
+			goto dec_check##nn; \
+		if (*pc == WEND && isword(_sp)) \
+			goto dec_check##nn; \
 		pc++; goto rec##nn; \
 	} \
 } \

-#define swaplist() \
-tmp = clist; \
-clist = nlist; \
-nlist = tmp; \
-clistidx = nlistidx; \
-nlistidx = 0; \
-
 int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
 {
 	int i, j, c, l = 0, *npc, gen = 1, subidx = 1;
@@ -550,7 +564,7 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
 	newsub(nsub);
 	nsub->sub[0] = sp;
 	goto jmp_start;
-	for(; clistidx; sp += l) {
+	for(;; sp += l) {
 		gen++; uc_len(l, sp) uc_code(c, sp)
 		for(i = 0; i < clistidx; i++) {
 			npc = clist[i].pc;
@@ -577,18 +591,22 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
 		break_for:
 		if (!c)
 			break;
+		tmp = clist;
+		clist = nlist;
+		nlist = tmp;
+		clistidx = nlistidx;
+		nlistidx = 0;
 		if (!matched) {
 			nsub = lsub;
 			nsub->ref++;
 			newsub(nsub)
 			nsub->sub[0] = sp + l;
-			swaplist()
 			jmp_start:
 			while (1)
 				addthread(1, clist, clistidx, prog->insts, nsub, break)
 			continue;
-		}
-		swaplist()
+		} else if (!clistidx)
+			break;
 	}
 	if(matched) {
 		for(i=0; i<nsubp; i++)
--- a/test.sh
+++ b/test.sh
@@ -56,9 +56,33 @@ abc$|c
 ^abc+d
 ^(abc|kj)
 ^(abc|kj)
+(^abc)|(abc)
+(abc)|(^abc)
+(^abc)|(abc$)
+(^abc)|(abc$)
+(^abc)|(abc$)
 ([^qwe]*rty)|(asd[^fgh]*)
 ([^qwe]*rty+)|(asd[^fgh]*)
 ((abc))(fv)
+\\\\<abc
+\\\\<abc
+\\\\<(as|js)
+\\\\<(as|js)
+ab\\\\<d
+\\\\<d+(abc|fbc|bcd)
+\\\\<d+(abc|fbc|bcd)
+\\\\<d+(abc|fbc|bcd)
+b|\\\\<(abc|fbc|bcd)
+\\\\<abc
+\\\\<abc\\\\>
+abc\\\\>
+abc\\\\>
+\\\\<(hello|world|word|nice|try)\\\\>
+\\\\<(hello|world|word|nice|try)\\\\>
+\\\\<(hello|world|word|nice|try)\\\\>
+\\\\<(hello|world|word|nice|try)\\\\>
+\\\\<(hello|world)\\\\>|\\\\<(word|nice|try)\\\\>
+(abc+)|\\\\<[^k]*\\\\>
 "
 input="\
 abcdef
@@ -116,9 +140,33 @@ abccdb
 abccdb
 kj
 jhdfh kj hhd
+ abc
+ abc
+ abc
+abc
+ abc bc
 qweasd     qqqq fff
 qwehh  sjsjsj rtyyyyyyyyyj sdj
 abcfv
+   	   abc
+ hsdh  abc
+     js hashasd
+     gjs hashasd
+ab   d
+     bcddd bddddfbc
+     bcddd ddvddfbc
+     bcddd ddddfbc
+     bcddd fbc
+abc
+   abc   
+abcccc
+abc
+world
+ world 
+    worldfsd
+    dworld
+    nice   
+    nicehdhfd
 "
 expect="\
 (0,3)
@@ -176,9 +224,33 @@ expect="\
 (0,5)
 (0,2)(0,2)
 -nomatch-
+(1,4)(?,?)(1,4)
+(1,4)(1,4)(?,?)
+(1,4)(?,?)(1,4)
+(0,3)(0,3)(?,?)
+-nomatch-
 (3,16)(?,?)(3,16)
 (3,25)(3,25)(?,?)
 (0,5)(0,3)(0,3)(3,5)
+(7,10)
+(7,10)
+(5,7)(5,7)
+-nomatch-
+-nomatch-
+-nomatch-
+-nomatch-
+(11,18)(15,18)
+(5,6)(?,?)
+(0,3)
+(3,6)
+-nomatch-
+(0,3)
+(0,5)(0,5)
+(1,6)(1,6)
+-nomatch-
+-nomatch-
+(4,8)(?,?)(4,8)
+(4,13)(?,?)
 (0,0)
 "
 c=1