finally add those pesky word assertions, god

This commit is contained in:
Kyryl Melekhin
2021-08-03 18:23:09 +00:00
parent c774bef5c2
commit 11c505447c
3 changed files with 119 additions and 22 deletions

17
README
View File

@@ -30,18 +30,25 @@ to that.
so that the user does not need to waste time taking strlen()
* Support for quoted chars in regex.
* Support for ^, $ assertions in regex.
* Support for "match" vs "search" operations, as common in other regex APIs.
* Support for named character classes: \d \D \s \S \w \W.
* Support for repetition operator {n} and {n,m}.
* Support for Unicode (UTF-8).
* Unlike other engines, the output is byte level offset. (Which is more useful)
* Support for wordend & wordbeg assertions
- One limitation of word assertions involves meta chars such as spaces being used
in the expression itself. For example, "\< abc" should match " abc" exactly at
that space word boundary, but it won't. It's possible to fix this, but it would
require an rsplit before the word assert and some dirty logic to check whether the
character or class is a space we want to match rather than assert at. The code for
it was too dirty, so I scrapped it. The syntax for word assertions ("\<" and "\>")
is like that of the POSIX C library, not the PCRE "\b", which can be used at either
the front or the back of a word; because "\b" makes no distinction between the two,
it would make the implementation potentially even uglier.
TODO
====
* Support for matching flags like case-insensitive, dot matches all,
multiline, etc.
* Support for wordend & wordbeg assertions
* Support for matching flags like case-insensitive
* maybe add lookaround, ahead, behind
Author and License
==================

50
pike.c
View File

@@ -81,6 +81,8 @@ enum
ASSERT,
BOL,
EOL,
WBEG,
WEND,
// Instructions which take relative offset as arg
JMP,
SPLIT,
@@ -187,6 +189,10 @@ void re_dumpcode(rcode *prog)
printf("assert bol\n");
else if (code[pc] == EOL)
printf("assert eol\n");
else if (code[pc] == WBEG)
printf("assert wbeg\n");
else if (code[pc] == WEND)
printf("assert wend\n");
pc++;
break;
}
@@ -206,6 +212,13 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
case '\\':
re++;
if (!*re) goto syntax_error; // Trailing backslash
if (*re == '<' || *re == '>') {
EMIT(PC++, ASSERT);
EMIT(PC++, *re == '<' ? WBEG : WEND);
prog->len++;
term = PC;
break;
}
default:
term = PC;
EMIT(PC++, CHAR);
@@ -411,8 +424,7 @@ unsupported_escape:
int re_sizecode(const char *re)
{
rcode dummyprog;
// SAVE 0, SAVE 1, MATCH; more bytes for "search" (vs "match") prefix code
dummyprog.unilen = 10;
dummyprog.unilen = 3;
int res = _compilecode(&re, &dummyprog, /*sizecode*/1);
if (res < 0) return res;
@@ -469,6 +481,7 @@ if (--csub->ref == 0) { \
rsub *sub = _sub; \
rec##nn: \
if(plist[pc - prog->insts] == gen) { \
dec_check##nn: \
decref(sub) \
rec_check##nn: \
if (i) { \
@@ -508,21 +521,22 @@ if (--csub->ref == 0) { \
goto rec##nn; \
case ASSERT: \
pc++; \
if(*pc == BOL && _sp != s) \
goto rec_check##nn; \
if (*pc == BOL && _sp != s) { \
if (!i && !listidx) \
return 0; \
goto dec_check##nn; \
} \
if (*pc == EOL && *_sp) \
goto rec_check##nn; \
goto dec_check##nn; \
if (*pc == WBEG && (!isword(_sp) || isword(sp)) \
&& !(sp == s && isword(sp))) \
goto dec_check##nn; \
if (*pc == WEND && isword(_sp)) \
goto dec_check##nn; \
pc++; goto rec##nn; \
} \
} \
/* Exchange the clist and nlist pointers, carry nlistidx over into clistidx, and reset nlistidx to 0. */
/* Expands to bare statements that use the `tmp` variable from the expansion site as scratch. */
#define swaplist() \
tmp = clist; \
clist = nlist; \
nlist = tmp; \
clistidx = nlistidx; \
nlistidx = 0; \
int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
{
int i, j, c, l = 0, *npc, gen = 1, subidx = 1;
@@ -550,7 +564,7 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
newsub(nsub);
nsub->sub[0] = sp;
goto jmp_start;
for(; clistidx; sp += l) {
for(;; sp += l) {
gen++; uc_len(l, sp) uc_code(c, sp)
for(i = 0; i < clistidx; i++) {
npc = clist[i].pc;
@@ -577,18 +591,22 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
break_for:
if (!c)
break;
tmp = clist;
clist = nlist;
nlist = tmp;
clistidx = nlistidx;
nlistidx = 0;
if (!matched) {
nsub = lsub;
nsub->ref++;
newsub(nsub)
nsub->sub[0] = sp + l;
swaplist()
jmp_start:
while (1)
addthread(1, clist, clistidx, prog->insts, nsub, break)
continue;
}
swaplist()
} else if (!clistidx)
break;
}
if(matched) {
for(i=0; i<nsubp; i++)

72
test.sh
View File

@@ -56,9 +56,33 @@ abc$|c
^abc+d
^(abc|kj)
^(abc|kj)
(^abc)|(abc)
(abc)|(^abc)
(^abc)|(abc$)
(^abc)|(abc$)
(^abc)|(abc$)
([^qwe]*rty)|(asd[^fgh]*)
([^qwe]*rty+)|(asd[^fgh]*)
((abc))(fv)
\\\\<abc
\\\\<abc
\\\\<(as|js)
\\\\<(as|js)
ab\\\\<d
\\\\<d+(abc|fbc|bcd)
\\\\<d+(abc|fbc|bcd)
\\\\<d+(abc|fbc|bcd)
b|\\\\<(abc|fbc|bcd)
\\\\<abc
\\\\<abc\\\\>
abc\\\\>
abc\\\\>
\\\\<(hello|world|word|nice|try)\\\\>
\\\\<(hello|world|word|nice|try)\\\\>
\\\\<(hello|world|word|nice|try)\\\\>
\\\\<(hello|world|word|nice|try)\\\\>
\\\\<(hello|world)\\\\>|\\\\<(word|nice|try)\\\\>
(abc+)|\\\\<[^k]*\\\\>
"
input="\
abcdef
@@ -116,9 +140,33 @@ abccdb
abccdb
kj
jhdfh kj hhd
abc
abc
abc
abc
abc bc
qweasd qqqq fff
qwehh sjsjsj rtyyyyyyyyyj sdj
abcfv
abc
hsdh abc
js hashasd
gjs hashasd
ab d
bcddd bddddfbc
bcddd ddvddfbc
bcddd ddddfbc
bcddd fbc
abc
abc
abcccc
abc
world
world
worldfsd
dworld
nice
nicehdhfd
"
expect="\
(0,3)
@@ -176,9 +224,33 @@ expect="\
(0,5)
(0,2)(0,2)
-nomatch-
(1,4)(?,?)(1,4)
(1,4)(1,4)(?,?)
(1,4)(?,?)(1,4)
(0,3)(0,3)(?,?)
-nomatch-
(3,16)(?,?)(3,16)
(3,25)(3,25)(?,?)
(0,5)(0,3)(0,3)(3,5)
(7,10)
(7,10)
(5,7)(5,7)
-nomatch-
-nomatch-
-nomatch-
-nomatch-
(11,18)(15,18)
(5,6)(?,?)
(0,3)
(3,6)
-nomatch-
(0,3)
(0,5)(0,5)
(1,6)(1,6)
-nomatch-
-nomatch-
(4,8)(?,?)(4,8)
(4,13)(?,?)
(0,0)
"
c=1