diff --git a/README b/README index fdafa67..341baea 100644 --- a/README +++ b/README @@ -30,18 +30,25 @@ to that. so that the user does not need to waste time taking strlen() * Support for quoted chars in regex. * Support for ^, $ assertions in regex. -* Support for "match" vs "search" operations, as common in other regex APIs. -* Support for named character classes: \d \D \s \S \w \W. * Support for repetition operator {n} and {n,m}. * Support for Unicode (UTF-8). * Unlike other engines, the output is byte level offset. (Which is more useful) +* Support for wordend & wordbeg assertions +- One limitation of word assertions involves meta chars like spaces being used +in the expression itself; for example, "\< abc" should match " abc" exactly at +that space word boundary, but it won't. It's possible to fix this, but it would +require rsplit before word assert, and some dirty logic to check that the character +or class is a space we want to match, not assert at. But the code for it was too +dirty and I scrapped it. Syntax for word assertions is like the POSIX C library, not +the PCRE "\b", which can be used both in front of or behind the word; because "\b" makes +no distinction, it makes the implementation potentially even uglier. + TODO ==== -* Support for matching flags like case-insensitive, dot matches all, -multiline, etc. 
-* Support for wordend & wordbeg assertions +* Support for matching flags like case-insensitive +* maybe add lookaround, ahead, behind Author and License ================== diff --git a/pike.c b/pike.c index 27521c3..8e92afb 100644 --- a/pike.c +++ b/pike.c @@ -81,6 +81,8 @@ enum ASSERT, BOL, EOL, + WBEG, + WEND, // Instructions which take relative offset as arg JMP, SPLIT, @@ -187,6 +189,10 @@ void re_dumpcode(rcode *prog) printf("assert bol\n"); else if (code[pc] == EOL) printf("assert eol\n"); + else if (code[pc] == WBEG) + printf("assert wbeg\n"); + else if (code[pc] == WEND) + printf("assert wend\n"); pc++; break; } @@ -206,6 +212,13 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode) case '\\': re++; if (!*re) goto syntax_error; // Trailing backslash + if (*re == '<' || *re == '>') { + EMIT(PC++, ASSERT); + EMIT(PC++, *re == '<' ? WBEG : WEND); + prog->len++; + term = PC; + break; + } default: term = PC; EMIT(PC++, CHAR); @@ -411,8 +424,7 @@ unsupported_escape: int re_sizecode(const char *re) { rcode dummyprog; - // SAVE 0, SAVE 1, MATCH; more bytes for "search" (vs "match") prefix code - dummyprog.unilen = 10; + dummyprog.unilen = 3; int res = _compilecode(&re, &dummyprog, /*sizecode*/1); if (res < 0) return res; @@ -469,6 +481,7 @@ if (--csub->ref == 0) { \ rsub *sub = _sub; \ rec##nn: \ if(plist[pc - prog->insts] == gen) { \ + dec_check##nn: \ decref(sub) \ rec_check##nn: \ if (i) { \ @@ -508,21 +521,22 @@ if (--csub->ref == 0) { \ goto rec##nn; \ case ASSERT: \ pc++; \ - if(*pc == BOL && _sp != s) \ - goto rec_check##nn; \ - if(*pc == EOL && *_sp) \ - goto rec_check##nn; \ + if (*pc == BOL && _sp != s) { \ + if (!i && !listidx) \ + return 0; \ + goto dec_check##nn; \ + } \ + if (*pc == EOL && *_sp) \ + goto dec_check##nn; \ + if (*pc == WBEG && (!isword(_sp) || isword(sp)) \ + && !(sp == s && isword(sp))) \ + goto dec_check##nn; \ + if (*pc == WEND && isword(_sp)) \ + goto dec_check##nn; \ pc++; goto rec##nn; \ } \ } \ -#define 
swaplist() \ -tmp = clist; \ -clist = nlist; \ -nlist = tmp; \ -clistidx = nlistidx; \ -nlistidx = 0; \ - int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp) { int i, j, c, l = 0, *npc, gen = 1, subidx = 1; @@ -550,7 +564,7 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp) newsub(nsub); nsub->sub[0] = sp; goto jmp_start; - for(; clistidx; sp += l) { + for(;; sp += l) { gen++; uc_len(l, sp) uc_code(c, sp) for(i = 0; i < clistidx; i++) { npc = clist[i].pc; @@ -577,18 +591,22 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp) break_for: if (!c) break; + tmp = clist; + clist = nlist; + nlist = tmp; + clistidx = nlistidx; + nlistidx = 0; if (!matched) { nsub = lsub; nsub->ref++; newsub(nsub) nsub->sub[0] = sp + l; - swaplist() jmp_start: while (1) addthread(1, clist, clistidx, prog->insts, nsub, break) continue; - } - swaplist() + } else if (!clistidx) + break; } if(matched) { for(i=0; i +abc\\\\> +abc\\\\> +\\\\<(hello|world|word|nice|try)\\\\> +\\\\<(hello|world|word|nice|try)\\\\> +\\\\<(hello|world|word|nice|try)\\\\> +\\\\<(hello|world|word|nice|try)\\\\> +\\\\<(hello|world)\\\\>|\\\\<(word|nice|try)\\\\> +(abc+)|\\\\<[^k]*\\\\> " input="\ abcdef @@ -116,9 +140,33 @@ abccdb abccdb kj jhdfh kj hhd + abc + abc + abc +abc + abc bc qweasd qqqq fff qwehh sjsjsj rtyyyyyyyyyj sdj abcfv + abc + hsdh abc + js hashasd + gjs hashasd +ab d + bcddd bddddfbc + bcddd ddvddfbc + bcddd ddddfbc + bcddd fbc +abc + abc +abcccc +abc +world + world + worldfsd + dworld + nice + nicehdhfd " expect="\ (0,3) @@ -176,9 +224,33 @@ expect="\ (0,5) (0,2)(0,2) -nomatch- +(1,4)(?,?)(1,4) +(1,4)(1,4)(?,?) +(1,4)(?,?)(1,4) +(0,3)(0,3)(?,?) +-nomatch- (3,16)(?,?)(3,16) (3,25)(3,25)(?,?) (0,5)(0,3)(0,3)(3,5) +(7,10) +(7,10) +(5,7)(5,7) +-nomatch- +-nomatch- +-nomatch- +-nomatch- +(11,18)(15,18) +(5,6)(?,?) +(0,3) +(3,6) +-nomatch- +(0,3) +(0,5)(0,5) +(1,6)(1,6) +-nomatch- +-nomatch- +(4,8)(?,?)(4,8) +(4,13)(?,?) (0,0) " c=1