add word boundary assert

This commit is contained in:
Kyryl Melekhin
2021-07-21 15:39:11 +00:00
parent ba17d90916
commit 678295f25e
2 changed files with 39 additions and 1 deletions

31
pike.c
View File

@@ -47,6 +47,19 @@ else if (~dst & 0x08) \
else \
dst = 0; \
/*
 * Tell whether the byte at s[0] is treated as a word character.
 *
 * A byte counts as a word character when it is alphanumeric (per isalnum),
 * an underscore, or has a value above 127 (i.e. any non-ASCII byte).
 * Only s[0] is inspected. Returns 1 for a word character, 0 otherwise.
 */
static int isword(const char *s)
{
	/* Go through unsigned char so high bytes are never seen as negative. */
	unsigned char byte = (unsigned char) *s;

	if (byte > 127)
		return 1;
	if (byte == '_')
		return 1;
	return isalnum(byte) != 0;
}
/*
 * Step s backwards, never past beg, until it no longer points at a byte
 * whose top two bits are 10 (the UTF-8 continuation-byte pattern).
 *
 * beg: lower bound for the backward scan.
 * s:   starting position, expected to be at or after beg.
 *
 * Returns the adjusted pointer; s is returned unchanged when it already
 * points at a non-continuation byte or when s <= beg.
 */
static char *uc_beg(char *beg, char *s)
{
	for (; s > beg; s--) {
		unsigned char byte = (unsigned char) *s;

		/* Anything other than 10xxxxxx ends the backward walk. */
		if ((byte & 0xc0) != 0x80)
			break;
	}
	return s;
}
typedef struct rinst rinst;
struct rinst
{
@@ -84,6 +97,7 @@ enum /* rinst.opcode */
ASSERT,
BOL,
EOL,
WBND,
// Instructions which take relative offset as arg
JMP,
SPLIT,
@@ -196,7 +210,13 @@ void re_dumpcode(rcode *prog)
printf("save %d\n", code[pc++]);
break;
case ASSERT:
printf("assert %s\n", code[pc++] == BOL ? "bol" : "eol");
if (code[pc] == BOL)
printf("assert bol\n");
else if (code[pc] == EOL)
printf("assert eol\n");
else if (code[pc] == WBND)
printf("assert WBND\n");
pc++;
break;
}
}
@@ -215,6 +235,13 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
case '\\':
re++;
if (!*re) goto syntax_error; // Trailing backslash
if (*re == 'b') {
EMIT(PC++, ASSERT);
EMIT(PC++, WBND);
prog->len++;
term = PC;
break;
}
default:
term = PC;
EMIT(PC++, CHAR);
@@ -529,6 +556,8 @@ int re_comp(rcode *prog, const char *re, int anchored)
goto rec_check##nn; \
if(*pc == EOL && *_sp) \
goto rec_check##nn; \
if(*pc == WBND && isword(sp)) \
goto rec_check##nn; \
pc++; goto rec##nn; \
} \
} \

View File

@@ -49,6 +49,9 @@ abc+h+d+f
[A-Fa-f0-9]{64}
<tag>[^<]*</tag>
^([a-z0-9_.-]+)@([0-9a-z.-]+)\\\\.([a-z.]{2,5})$
\\\\babc
ab\\\\bd
\\\\b(as|js)
"
input="\
abcdef
@@ -99,6 +102,9 @@ abcccccccccccchdf
bf33d4a0dbbee85061531c9d47e5aae692c0729e5c9c1fa21c46d9bcab5f52c5
ajdas <tag> sidufisudf hsdfhshdfh sdf </tag> asjdfjs
veloval596@godpeed.com
abc
ab d
js hashasd
"
expect="\
(0,3)
@@ -149,6 +155,9 @@ expect="\
(0,64)
(6,44)
(0,22)(0,10)(11,18)(19,22)
(7,10)
-nomatch-
(5,7)(5,7)
(0,0)
"
c=1