finally add those pesky word assertions, god
This commit is contained in:
17
README
17
README
@@ -30,18 +30,25 @@ to that.
|
|||||||
so that the user does not need to waste time taking strlen()
|
so that the user does not need to waste time taking strlen()
|
||||||
* Support for quoted chars in regex.
|
* Support for quoted chars in regex.
|
||||||
* Support for ^, $ assertions in regex.
|
* Support for ^, $ assertions in regex.
|
||||||
* Support for "match" vs "search" operations, as common in other regex APIs.
|
|
||||||
* Support for named character classes: \d \D \s \S \w \W.
|
|
||||||
* Support for repetition operator {n} and {n,m}.
|
* Support for repetition operator {n} and {n,m}.
|
||||||
* Support for Unicode (UTF-8).
|
* Support for Unicode (UTF-8).
|
||||||
* Unlike other engines, the output is byte level offset. (Which is more useful)
|
* Unlike other engines, the output is byte level offset. (Which is more useful)
|
||||||
|
* Support for wordend & wordbeg assertions
|
||||||
|
- Some limitations for word assertions are meta chars like spaces being used
|
||||||
|
in the expression itself; for example, "\< abc" should match " abc" exactly at
|
||||||
|
that space word boundary but it won't. It's possible to fix this, but it would
|
||||||
|
require rsplit before word assert, and some dirty logic to check that the character
|
||||||
|
or class is a space we want to match, not assert at. But the code for it was too
|
||||||
|
dirty and I scrapped it. The syntax for word assertions follows the POSIX C library, not
|
||||||
|
the PCRE "\b", which can be used at either the front or the back of a word; because there is
|
||||||
|
no distinction, it makes the implementation potentially even uglier.
|
||||||
|
|
||||||
|
|
||||||
TODO
|
TODO
|
||||||
====
|
====
|
||||||
|
|
||||||
* Support for matching flags like case-insensitive, dot matches all,
|
* Support for matching flags like case-insensitive
|
||||||
multiline, etc.
|
* Maybe add lookaround assertions (lookahead and lookbehind)
|
||||||
* Support for wordend & wordbeg assertions
|
|
||||||
|
|
||||||
Author and License
|
Author and License
|
||||||
==================
|
==================
|
||||||
|
|||||||
52
pike.c
52
pike.c
@@ -81,6 +81,8 @@ enum
|
|||||||
ASSERT,
|
ASSERT,
|
||||||
BOL,
|
BOL,
|
||||||
EOL,
|
EOL,
|
||||||
|
WBEG,
|
||||||
|
WEND,
|
||||||
// Instructions which take relative offset as arg
|
// Instructions which take relative offset as arg
|
||||||
JMP,
|
JMP,
|
||||||
SPLIT,
|
SPLIT,
|
||||||
@@ -187,6 +189,10 @@ void re_dumpcode(rcode *prog)
|
|||||||
printf("assert bol\n");
|
printf("assert bol\n");
|
||||||
else if (code[pc] == EOL)
|
else if (code[pc] == EOL)
|
||||||
printf("assert eol\n");
|
printf("assert eol\n");
|
||||||
|
else if (code[pc] == WBEG)
|
||||||
|
printf("assert wbeg\n");
|
||||||
|
else if (code[pc] == WEND)
|
||||||
|
printf("assert wend\n");
|
||||||
pc++;
|
pc++;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -206,6 +212,13 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
case '\\':
|
case '\\':
|
||||||
re++;
|
re++;
|
||||||
if (!*re) goto syntax_error; // Trailing backslash
|
if (!*re) goto syntax_error; // Trailing backslash
|
||||||
|
if (*re == '<' || *re == '>') {
|
||||||
|
EMIT(PC++, ASSERT);
|
||||||
|
EMIT(PC++, *re == '<' ? WBEG : WEND);
|
||||||
|
prog->len++;
|
||||||
|
term = PC;
|
||||||
|
break;
|
||||||
|
}
|
||||||
default:
|
default:
|
||||||
term = PC;
|
term = PC;
|
||||||
EMIT(PC++, CHAR);
|
EMIT(PC++, CHAR);
|
||||||
@@ -411,8 +424,7 @@ unsupported_escape:
|
|||||||
int re_sizecode(const char *re)
|
int re_sizecode(const char *re)
|
||||||
{
|
{
|
||||||
rcode dummyprog;
|
rcode dummyprog;
|
||||||
// SAVE 0, SAVE 1, MATCH; more bytes for "search" (vs "match") prefix code
|
dummyprog.unilen = 3;
|
||||||
dummyprog.unilen = 10;
|
|
||||||
|
|
||||||
int res = _compilecode(&re, &dummyprog, /*sizecode*/1);
|
int res = _compilecode(&re, &dummyprog, /*sizecode*/1);
|
||||||
if (res < 0) return res;
|
if (res < 0) return res;
|
||||||
@@ -469,6 +481,7 @@ if (--csub->ref == 0) { \
|
|||||||
rsub *sub = _sub; \
|
rsub *sub = _sub; \
|
||||||
rec##nn: \
|
rec##nn: \
|
||||||
if(plist[pc - prog->insts] == gen) { \
|
if(plist[pc - prog->insts] == gen) { \
|
||||||
|
dec_check##nn: \
|
||||||
decref(sub) \
|
decref(sub) \
|
||||||
rec_check##nn: \
|
rec_check##nn: \
|
||||||
if (i) { \
|
if (i) { \
|
||||||
@@ -508,21 +521,22 @@ if (--csub->ref == 0) { \
|
|||||||
goto rec##nn; \
|
goto rec##nn; \
|
||||||
case ASSERT: \
|
case ASSERT: \
|
||||||
pc++; \
|
pc++; \
|
||||||
if(*pc == BOL && _sp != s) \
|
if (*pc == BOL && _sp != s) { \
|
||||||
goto rec_check##nn; \
|
if (!i && !listidx) \
|
||||||
if(*pc == EOL && *_sp) \
|
return 0; \
|
||||||
goto rec_check##nn; \
|
goto dec_check##nn; \
|
||||||
|
} \
|
||||||
|
if (*pc == EOL && *_sp) \
|
||||||
|
goto dec_check##nn; \
|
||||||
|
if (*pc == WBEG && (!isword(_sp) || isword(sp)) \
|
||||||
|
&& !(sp == s && isword(sp))) \
|
||||||
|
goto dec_check##nn; \
|
||||||
|
if (*pc == WEND && isword(_sp)) \
|
||||||
|
goto dec_check##nn; \
|
||||||
pc++; goto rec##nn; \
|
pc++; goto rec##nn; \
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
|
|
||||||
#define swaplist() \
|
|
||||||
tmp = clist; \
|
|
||||||
clist = nlist; \
|
|
||||||
nlist = tmp; \
|
|
||||||
clistidx = nlistidx; \
|
|
||||||
nlistidx = 0; \
|
|
||||||
|
|
||||||
int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
|
int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
|
||||||
{
|
{
|
||||||
int i, j, c, l = 0, *npc, gen = 1, subidx = 1;
|
int i, j, c, l = 0, *npc, gen = 1, subidx = 1;
|
||||||
@@ -550,7 +564,7 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
|
|||||||
newsub(nsub);
|
newsub(nsub);
|
||||||
nsub->sub[0] = sp;
|
nsub->sub[0] = sp;
|
||||||
goto jmp_start;
|
goto jmp_start;
|
||||||
for(; clistidx; sp += l) {
|
for(;; sp += l) {
|
||||||
gen++; uc_len(l, sp) uc_code(c, sp)
|
gen++; uc_len(l, sp) uc_code(c, sp)
|
||||||
for(i = 0; i < clistidx; i++) {
|
for(i = 0; i < clistidx; i++) {
|
||||||
npc = clist[i].pc;
|
npc = clist[i].pc;
|
||||||
@@ -577,18 +591,22 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
|
|||||||
break_for:
|
break_for:
|
||||||
if (!c)
|
if (!c)
|
||||||
break;
|
break;
|
||||||
|
tmp = clist;
|
||||||
|
clist = nlist;
|
||||||
|
nlist = tmp;
|
||||||
|
clistidx = nlistidx;
|
||||||
|
nlistidx = 0;
|
||||||
if (!matched) {
|
if (!matched) {
|
||||||
nsub = lsub;
|
nsub = lsub;
|
||||||
nsub->ref++;
|
nsub->ref++;
|
||||||
newsub(nsub)
|
newsub(nsub)
|
||||||
nsub->sub[0] = sp + l;
|
nsub->sub[0] = sp + l;
|
||||||
swaplist()
|
|
||||||
jmp_start:
|
jmp_start:
|
||||||
while (1)
|
while (1)
|
||||||
addthread(1, clist, clistidx, prog->insts, nsub, break)
|
addthread(1, clist, clistidx, prog->insts, nsub, break)
|
||||||
continue;
|
continue;
|
||||||
}
|
} else if (!clistidx)
|
||||||
swaplist()
|
break;
|
||||||
}
|
}
|
||||||
if(matched) {
|
if(matched) {
|
||||||
for(i=0; i<nsubp; i++)
|
for(i=0; i<nsubp; i++)
|
||||||
|
|||||||
72
test.sh
72
test.sh
@@ -56,9 +56,33 @@ abc$|c
|
|||||||
^abc+d
|
^abc+d
|
||||||
^(abc|kj)
|
^(abc|kj)
|
||||||
^(abc|kj)
|
^(abc|kj)
|
||||||
|
(^abc)|(abc)
|
||||||
|
(abc)|(^abc)
|
||||||
|
(^abc)|(abc$)
|
||||||
|
(^abc)|(abc$)
|
||||||
|
(^abc)|(abc$)
|
||||||
([^qwe]*rty)|(asd[^fgh]*)
|
([^qwe]*rty)|(asd[^fgh]*)
|
||||||
([^qwe]*rty+)|(asd[^fgh]*)
|
([^qwe]*rty+)|(asd[^fgh]*)
|
||||||
((abc))(fv)
|
((abc))(fv)
|
||||||
|
\\\\<abc
|
||||||
|
\\\\<abc
|
||||||
|
\\\\<(as|js)
|
||||||
|
\\\\<(as|js)
|
||||||
|
ab\\\\<d
|
||||||
|
\\\\<d+(abc|fbc|bcd)
|
||||||
|
\\\\<d+(abc|fbc|bcd)
|
||||||
|
\\\\<d+(abc|fbc|bcd)
|
||||||
|
b|\\\\<(abc|fbc|bcd)
|
||||||
|
\\\\<abc
|
||||||
|
\\\\<abc\\\\>
|
||||||
|
abc\\\\>
|
||||||
|
abc\\\\>
|
||||||
|
\\\\<(hello|world|word|nice|try)\\\\>
|
||||||
|
\\\\<(hello|world|word|nice|try)\\\\>
|
||||||
|
\\\\<(hello|world|word|nice|try)\\\\>
|
||||||
|
\\\\<(hello|world|word|nice|try)\\\\>
|
||||||
|
\\\\<(hello|world)\\\\>|\\\\<(word|nice|try)\\\\>
|
||||||
|
(abc+)|\\\\<[^k]*\\\\>
|
||||||
"
|
"
|
||||||
input="\
|
input="\
|
||||||
abcdef
|
abcdef
|
||||||
@@ -116,9 +140,33 @@ abccdb
|
|||||||
abccdb
|
abccdb
|
||||||
kj
|
kj
|
||||||
jhdfh kj hhd
|
jhdfh kj hhd
|
||||||
|
abc
|
||||||
|
abc
|
||||||
|
abc
|
||||||
|
abc
|
||||||
|
abc bc
|
||||||
qweasd qqqq fff
|
qweasd qqqq fff
|
||||||
qwehh sjsjsj rtyyyyyyyyyj sdj
|
qwehh sjsjsj rtyyyyyyyyyj sdj
|
||||||
abcfv
|
abcfv
|
||||||
|
abc
|
||||||
|
hsdh abc
|
||||||
|
js hashasd
|
||||||
|
gjs hashasd
|
||||||
|
ab d
|
||||||
|
bcddd bddddfbc
|
||||||
|
bcddd ddvddfbc
|
||||||
|
bcddd ddddfbc
|
||||||
|
bcddd fbc
|
||||||
|
abc
|
||||||
|
abc
|
||||||
|
abcccc
|
||||||
|
abc
|
||||||
|
world
|
||||||
|
world
|
||||||
|
worldfsd
|
||||||
|
dworld
|
||||||
|
nice
|
||||||
|
nicehdhfd
|
||||||
"
|
"
|
||||||
expect="\
|
expect="\
|
||||||
(0,3)
|
(0,3)
|
||||||
@@ -176,9 +224,33 @@ expect="\
|
|||||||
(0,5)
|
(0,5)
|
||||||
(0,2)(0,2)
|
(0,2)(0,2)
|
||||||
-nomatch-
|
-nomatch-
|
||||||
|
(1,4)(?,?)(1,4)
|
||||||
|
(1,4)(1,4)(?,?)
|
||||||
|
(1,4)(?,?)(1,4)
|
||||||
|
(0,3)(0,3)(?,?)
|
||||||
|
-nomatch-
|
||||||
(3,16)(?,?)(3,16)
|
(3,16)(?,?)(3,16)
|
||||||
(3,25)(3,25)(?,?)
|
(3,25)(3,25)(?,?)
|
||||||
(0,5)(0,3)(0,3)(3,5)
|
(0,5)(0,3)(0,3)(3,5)
|
||||||
|
(7,10)
|
||||||
|
(7,10)
|
||||||
|
(5,7)(5,7)
|
||||||
|
-nomatch-
|
||||||
|
-nomatch-
|
||||||
|
-nomatch-
|
||||||
|
-nomatch-
|
||||||
|
(11,18)(15,18)
|
||||||
|
(5,6)(?,?)
|
||||||
|
(0,3)
|
||||||
|
(3,6)
|
||||||
|
-nomatch-
|
||||||
|
(0,3)
|
||||||
|
(0,5)(0,5)
|
||||||
|
(1,6)(1,6)
|
||||||
|
-nomatch-
|
||||||
|
-nomatch-
|
||||||
|
(4,8)(?,?)(4,8)
|
||||||
|
(4,13)(?,?)
|
||||||
(0,0)
|
(0,0)
|
||||||
"
|
"
|
||||||
c=1
|
c=1
|
||||||
|
|||||||
Reference in New Issue
Block a user