add word boundary assert

This commit is contained in:
Kyryl Melekhin
2021-07-21 15:39:11 +00:00
parent ba17d90916
commit 678295f25e
2 changed files with 39 additions and 1 deletions

31
pike.c
View File

@@ -47,6 +47,19 @@ else if (~dst & 0x08) \
else \ else \
dst = 0; \ dst = 0; \
/*
 * Classify the byte at s[0] as a "word" byte.
 *
 * A byte counts as a word byte when it is an ASCII letter or digit,
 * an underscore, or any non-ASCII byte (value above 127).
 * Only s[0] is examined. Returns 1 for a word byte, 0 otherwise.
 */
static int isword(const char *s)
{
	/* Go through unsigned char so bytes >= 0x80 never become negative. */
	unsigned char ch = (unsigned char) *s;

	if (ch == '_' || ch > 127)
		return 1;
	return isalnum(ch) ? 1 : 0;
}
/*
 * Walk s backwards past bytes whose top two bits are 10 (the bit
 * pattern of a UTF-8 continuation byte), stopping at the first byte
 * that does not match or at beg, whichever comes first.
 *
 * beg: lower bound; the returned pointer is never below it.
 * s:   starting position, expected to be at or after beg.
 * Returns the adjusted position.
 */
static char *uc_beg(char *beg, char *s)
{
	for (; s > beg; s--) {
		/* A byte that is not of the form 10xxxxxx ends the walk. */
		if (((unsigned char) *s & 0xc0) != 0x80)
			break;
	}
	return s;
}
typedef struct rinst rinst; typedef struct rinst rinst;
struct rinst struct rinst
{ {
@@ -84,6 +97,7 @@ enum /* rinst.opcode */
ASSERT, ASSERT,
BOL, BOL,
EOL, EOL,
WBND,
// Instructions which take relative offset as arg // Instructions which take relative offset as arg
JMP, JMP,
SPLIT, SPLIT,
@@ -196,7 +210,13 @@ void re_dumpcode(rcode *prog)
printf("save %d\n", code[pc++]); printf("save %d\n", code[pc++]);
break; break;
case ASSERT: case ASSERT:
printf("assert %s\n", code[pc++] == BOL ? "bol" : "eol"); if (code[pc] == BOL)
printf("assert bol\n");
else if (code[pc] == EOL)
printf("assert eol\n");
else if (code[pc] == WBND)
printf("assert WBND\n");
pc++;
break; break;
} }
} }
@@ -215,6 +235,13 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
case '\\': case '\\':
re++; re++;
if (!*re) goto syntax_error; // Trailing backslash if (!*re) goto syntax_error; // Trailing backslash
if (*re == 'b') {
EMIT(PC++, ASSERT);
EMIT(PC++, WBND);
prog->len++;
term = PC;
break;
}
default: default:
term = PC; term = PC;
EMIT(PC++, CHAR); EMIT(PC++, CHAR);
@@ -529,6 +556,8 @@ int re_comp(rcode *prog, const char *re, int anchored)
goto rec_check##nn; \ goto rec_check##nn; \
if(*pc == EOL && *_sp) \ if(*pc == EOL && *_sp) \
goto rec_check##nn; \ goto rec_check##nn; \
if(*pc == WBND && isword(sp)) \
goto rec_check##nn; \
pc++; goto rec##nn; \ pc++; goto rec##nn; \
} \ } \
} \ } \

View File

@@ -49,6 +49,9 @@ abc+h+d+f
[A-Fa-f0-9]{64} [A-Fa-f0-9]{64}
<tag>[^<]*</tag> <tag>[^<]*</tag>
^([a-z0-9_.-]+)@([0-9a-z.-]+)\\\\.([a-z.]{2,5})$ ^([a-z0-9_.-]+)@([0-9a-z.-]+)\\\\.([a-z.]{2,5})$
\\\\babc
ab\\\\bd
\\\\b(as|js)
" "
input="\ input="\
abcdef abcdef
@@ -99,6 +102,9 @@ abcccccccccccchdf
bf33d4a0dbbee85061531c9d47e5aae692c0729e5c9c1fa21c46d9bcab5f52c5 bf33d4a0dbbee85061531c9d47e5aae692c0729e5c9c1fa21c46d9bcab5f52c5
ajdas <tag> sidufisudf hsdfhshdfh sdf </tag> asjdfjs ajdas <tag> sidufisudf hsdfhshdfh sdf </tag> asjdfjs
veloval596@godpeed.com veloval596@godpeed.com
abc
ab d
js hashasd
" "
expect="\ expect="\
(0,3) (0,3)
@@ -149,6 +155,9 @@ expect="\
(0,64) (0,64)
(6,44) (6,44)
(0,22)(0,10)(11,18)(19,22) (0,22)(0,10)(11,18)(19,22)
(7,10)
-nomatch-
(5,7)(5,7)
(0,0) (0,0)
" "
c=1 c=1