finally add those pesky word assertions, god
This commit is contained in:
17
README
17
README
@@ -30,18 +30,25 @@ to that.
|
||||
so that the user does not need to waste time taking strlen()
|
||||
* Support for quoted chars in regex.
|
||||
* Support for ^, $ assertions in regex.
|
||||
* Support for "match" vs "search" operations, as common in other regex APIs.
|
||||
* Support for named character classes: \d \D \s \S \w \W.
|
||||
* Support for repetition operator {n} and {n,m}.
|
||||
* Support for Unicode (UTF-8).
|
||||
* Unlike other engines, the output is a byte-level offset (which is more useful).
|
||||
* Support for word-beginning "\<" and word-end "\>" assertions
|
||||
- One limitation of word assertions involves meta chars like spaces being used
|
||||
in the expression itself, for example "\< abc" should match " abc" exactly at
|
||||
that space word boundary but it won't. It's possible to fix this, but it would
|
||||
require rsplit before word assert, and some dirty logic to check that the character
|
||||
or class is a space we want to match, not assert at. But the code for it was too
|
||||
dirty and I scrapped it. The syntax for word assertions follows the POSIX C library, not
|
||||
the PCRE "\b", which can be used at either the front or the back of a word; because there is
|
||||
no distinction, it makes the implementation potentially even uglier.
|
||||
|
||||
|
||||
TODO
|
||||
====
|
||||
|
||||
* Support for matching flags like case-insensitive, dot matches all,
|
||||
multiline, etc.
|
||||
* Support for wordend & wordbeg assertions
|
||||
* Support for matching flags like case-insensitive
|
||||
* Maybe add lookaround assertions (lookahead, lookbehind)
|
||||
|
||||
Author and License
|
||||
==================
|
||||
|
||||
52
pike.c
52
pike.c
@@ -81,6 +81,8 @@ enum
|
||||
ASSERT,
|
||||
BOL,
|
||||
EOL,
|
||||
WBEG,
|
||||
WEND,
|
||||
// Instructions which take relative offset as arg
|
||||
JMP,
|
||||
SPLIT,
|
||||
@@ -187,6 +189,10 @@ void re_dumpcode(rcode *prog)
|
||||
printf("assert bol\n");
|
||||
else if (code[pc] == EOL)
|
||||
printf("assert eol\n");
|
||||
else if (code[pc] == WBEG)
|
||||
printf("assert wbeg\n");
|
||||
else if (code[pc] == WEND)
|
||||
printf("assert wend\n");
|
||||
pc++;
|
||||
break;
|
||||
}
|
||||
@@ -206,6 +212,13 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
||||
case '\\':
|
||||
re++;
|
||||
if (!*re) goto syntax_error; // Trailing backslash
|
||||
if (*re == '<' || *re == '>') {
|
||||
EMIT(PC++, ASSERT);
|
||||
EMIT(PC++, *re == '<' ? WBEG : WEND);
|
||||
prog->len++;
|
||||
term = PC;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
term = PC;
|
||||
EMIT(PC++, CHAR);
|
||||
@@ -411,8 +424,7 @@ unsupported_escape:
|
||||
int re_sizecode(const char *re)
|
||||
{
|
||||
rcode dummyprog;
|
||||
// SAVE 0, SAVE 1, MATCH; more bytes for "search" (vs "match") prefix code
|
||||
dummyprog.unilen = 10;
|
||||
dummyprog.unilen = 3;
|
||||
|
||||
int res = _compilecode(&re, &dummyprog, /*sizecode*/1);
|
||||
if (res < 0) return res;
|
||||
@@ -469,6 +481,7 @@ if (--csub->ref == 0) { \
|
||||
rsub *sub = _sub; \
|
||||
rec##nn: \
|
||||
if(plist[pc - prog->insts] == gen) { \
|
||||
dec_check##nn: \
|
||||
decref(sub) \
|
||||
rec_check##nn: \
|
||||
if (i) { \
|
||||
@@ -508,21 +521,22 @@ if (--csub->ref == 0) { \
|
||||
goto rec##nn; \
|
||||
case ASSERT: \
|
||||
pc++; \
|
||||
if(*pc == BOL && _sp != s) \
|
||||
goto rec_check##nn; \
|
||||
if(*pc == EOL && *_sp) \
|
||||
goto rec_check##nn; \
|
||||
if (*pc == BOL && _sp != s) { \
|
||||
if (!i && !listidx) \
|
||||
return 0; \
|
||||
goto dec_check##nn; \
|
||||
} \
|
||||
if (*pc == EOL && *_sp) \
|
||||
goto dec_check##nn; \
|
||||
if (*pc == WBEG && (!isword(_sp) || isword(sp)) \
|
||||
&& !(sp == s && isword(sp))) \
|
||||
goto dec_check##nn; \
|
||||
if (*pc == WEND && isword(_sp)) \
|
||||
goto dec_check##nn; \
|
||||
pc++; goto rec##nn; \
|
||||
} \
|
||||
} \
|
||||
|
||||
#define swaplist() \
|
||||
tmp = clist; \
|
||||
clist = nlist; \
|
||||
nlist = tmp; \
|
||||
clistidx = nlistidx; \
|
||||
nlistidx = 0; \
|
||||
|
||||
int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
|
||||
{
|
||||
int i, j, c, l = 0, *npc, gen = 1, subidx = 1;
|
||||
@@ -550,7 +564,7 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
|
||||
newsub(nsub);
|
||||
nsub->sub[0] = sp;
|
||||
goto jmp_start;
|
||||
for(; clistidx; sp += l) {
|
||||
for(;; sp += l) {
|
||||
gen++; uc_len(l, sp) uc_code(c, sp)
|
||||
for(i = 0; i < clistidx; i++) {
|
||||
npc = clist[i].pc;
|
||||
@@ -577,18 +591,22 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
|
||||
break_for:
|
||||
if (!c)
|
||||
break;
|
||||
tmp = clist;
|
||||
clist = nlist;
|
||||
nlist = tmp;
|
||||
clistidx = nlistidx;
|
||||
nlistidx = 0;
|
||||
if (!matched) {
|
||||
nsub = lsub;
|
||||
nsub->ref++;
|
||||
newsub(nsub)
|
||||
nsub->sub[0] = sp + l;
|
||||
swaplist()
|
||||
jmp_start:
|
||||
while (1)
|
||||
addthread(1, clist, clistidx, prog->insts, nsub, break)
|
||||
continue;
|
||||
}
|
||||
swaplist()
|
||||
} else if (!clistidx)
|
||||
break;
|
||||
}
|
||||
if(matched) {
|
||||
for(i=0; i<nsubp; i++)
|
||||
|
||||
72
test.sh
72
test.sh
@@ -56,9 +56,33 @@ abc$|c
|
||||
^abc+d
|
||||
^(abc|kj)
|
||||
^(abc|kj)
|
||||
(^abc)|(abc)
|
||||
(abc)|(^abc)
|
||||
(^abc)|(abc$)
|
||||
(^abc)|(abc$)
|
||||
(^abc)|(abc$)
|
||||
([^qwe]*rty)|(asd[^fgh]*)
|
||||
([^qwe]*rty+)|(asd[^fgh]*)
|
||||
((abc))(fv)
|
||||
\\\\<abc
|
||||
\\\\<abc
|
||||
\\\\<(as|js)
|
||||
\\\\<(as|js)
|
||||
ab\\\\<d
|
||||
\\\\<d+(abc|fbc|bcd)
|
||||
\\\\<d+(abc|fbc|bcd)
|
||||
\\\\<d+(abc|fbc|bcd)
|
||||
b|\\\\<(abc|fbc|bcd)
|
||||
\\\\<abc
|
||||
\\\\<abc\\\\>
|
||||
abc\\\\>
|
||||
abc\\\\>
|
||||
\\\\<(hello|world|word|nice|try)\\\\>
|
||||
\\\\<(hello|world|word|nice|try)\\\\>
|
||||
\\\\<(hello|world|word|nice|try)\\\\>
|
||||
\\\\<(hello|world|word|nice|try)\\\\>
|
||||
\\\\<(hello|world)\\\\>|\\\\<(word|nice|try)\\\\>
|
||||
(abc+)|\\\\<[^k]*\\\\>
|
||||
"
|
||||
input="\
|
||||
abcdef
|
||||
@@ -116,9 +140,33 @@ abccdb
|
||||
abccdb
|
||||
kj
|
||||
jhdfh kj hhd
|
||||
abc
|
||||
abc
|
||||
abc
|
||||
abc
|
||||
abc bc
|
||||
qweasd qqqq fff
|
||||
qwehh sjsjsj rtyyyyyyyyyj sdj
|
||||
abcfv
|
||||
abc
|
||||
hsdh abc
|
||||
js hashasd
|
||||
gjs hashasd
|
||||
ab d
|
||||
bcddd bddddfbc
|
||||
bcddd ddvddfbc
|
||||
bcddd ddddfbc
|
||||
bcddd fbc
|
||||
abc
|
||||
abc
|
||||
abcccc
|
||||
abc
|
||||
world
|
||||
world
|
||||
worldfsd
|
||||
dworld
|
||||
nice
|
||||
nicehdhfd
|
||||
"
|
||||
expect="\
|
||||
(0,3)
|
||||
@@ -176,9 +224,33 @@ expect="\
|
||||
(0,5)
|
||||
(0,2)(0,2)
|
||||
-nomatch-
|
||||
(1,4)(?,?)(1,4)
|
||||
(1,4)(1,4)(?,?)
|
||||
(1,4)(?,?)(1,4)
|
||||
(0,3)(0,3)(?,?)
|
||||
-nomatch-
|
||||
(3,16)(?,?)(3,16)
|
||||
(3,25)(3,25)(?,?)
|
||||
(0,5)(0,3)(0,3)(3,5)
|
||||
(7,10)
|
||||
(7,10)
|
||||
(5,7)(5,7)
|
||||
-nomatch-
|
||||
-nomatch-
|
||||
-nomatch-
|
||||
-nomatch-
|
||||
(11,18)(15,18)
|
||||
(5,6)(?,?)
|
||||
(0,3)
|
||||
(3,6)
|
||||
-nomatch-
|
||||
(0,3)
|
||||
(0,5)(0,5)
|
||||
(1,6)(1,6)
|
||||
-nomatch-
|
||||
-nomatch-
|
||||
(4,8)(?,?)(4,8)
|
||||
(4,13)(?,?)
|
||||
(0,0)
|
||||
"
|
||||
c=1
|
||||
|
||||
Reference in New Issue
Block a user