handle {n,} repetition without blowing up codesize

This commit is contained in:
Kyryl Melekhin
2021-10-08 12:04:24 +00:00
parent 3bb28cd1f8
commit 541d881b4f
3 changed files with 23 additions and 10 deletions

3
README
View File

@@ -31,7 +31,8 @@ so that the user does not need to waste time taking strlen()
* Highly optimized source code, probably 2x faster than re1.5 * Highly optimized source code, probably 2x faster than re1.5
* Support for quoted chars in regex. Escapes in brackets. * Support for quoted chars in regex. Escapes in brackets.
* Support for ^, $ assertions in regex. * Support for ^, $ assertions in regex.
* Support for repetition operator {n} and {n,m}. * Support for repetition operator {n} and {n,m} and {n,}.
- Note: repetition counts of 0 are not handled; avoid them, since such patterns can easily be rewritten without a zero count.
* Support for Unicode (UTF-8). * Support for Unicode (UTF-8).
* Unlike other engines, the output is byte level offset. (Which is more useful) * Unlike other engines, the output is byte level offset. (Which is more useful)
* Support for non capture group ?: * Support for non capture group ?:

27
pike.c
View File

@@ -125,10 +125,10 @@ static int re_classmatch(const int *pc, int c)
void re_dumpcode(rcode *prog) void re_dumpcode(rcode *prog)
{ {
int pc = 0; int pc = 0, i = 0;
int *code = prog->insts; int *code = prog->insts;
while (pc < prog->unilen) { while (pc < prog->unilen) {
printf("%4d: ", pc); printf("%4d: ", pc); i++;
switch(code[pc++]) { switch(code[pc++]) {
default: default:
pc = prog->unilen; pc = prog->unilen;
@@ -182,7 +182,7 @@ void re_dumpcode(rcode *prog)
break; break;
} }
} }
printf("Unilen: %d, insts: %d\n", prog->unilen, prog->len); printf("Unilen: %d, insts: %d, counted insts: %d\n", prog->unilen, prog->len, i);
} }
/* next todo: crack and factor out this recursion, /* next todo: crack and factor out this recursion,
@@ -276,14 +276,14 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
break; break;
case '{':; case '{':;
int maxcnt = 0, mincnt = 0, int maxcnt = 0, mincnt = 0,
i = 0, icnt = 0, size; i = 0, icnt = 0, inf = 0, size;
re++; re++;
while (isdigit((unsigned char) *re)) while (isdigit((unsigned char) *re))
mincnt = mincnt * 10 + *re++ - '0'; mincnt = mincnt * 10 + *re++ - '0';
if (*re == ',') { if (*re == ',') {
re++; re++;
if (*re == '}') if (*re == '}')
maxcnt = 256; inf = 1;
while (isdigit((unsigned char) *re)) while (isdigit((unsigned char) *re))
maxcnt = maxcnt * 10 + *re++ - '0'; maxcnt = maxcnt * 10 + *re++ - '0';
} else } else
@@ -293,12 +293,21 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
memcpy(&code[PC], &code[term], size*sizeof(int)); memcpy(&code[PC], &code[term], size*sizeof(int));
PC += size; PC += size;
} }
for (i = maxcnt-mincnt; i > 0; i--) if (inf) {
{ EMIT(PC, RSPLIT);
EMIT(PC+1, REL(PC, PC - size -1));
EMIT(PC+2, 0);
PC += 3;
prog->len++;
prog->splits++; prog->splits++;
maxcnt = mincnt;
}
for (i = maxcnt-mincnt; i > 0; i--) {
EMIT(PC++, SPLIT); EMIT(PC++, SPLIT);
EMIT(PC++, REL(PC-1, PC+((size+3)*i))); EMIT(PC++, REL(PC-1, PC+((size+3)*i)));
EMIT(PC++, 0); EMIT(PC++, 0);
prog->splits++;
prog->len++;
if (code) if (code)
memcpy(&code[PC], &code[term], size*sizeof(int)); memcpy(&code[PC], &code[term], size*sizeof(int));
PC += size; PC += size;
@@ -313,11 +322,11 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
case RSPLIT: case RSPLIT:
case SAVE: case SAVE:
case CHAR: case CHAR:
case ANY:
i++; i++;
case ANY:
icnt++; icnt++;
} }
prog->len += maxcnt * icnt; prog->len += (maxcnt-1) * icnt;
} }
break; break;
case '?': case '?':

View File

@@ -123,6 +123,7 @@ aaaaa(aa)aa(aa(a)a)?aa
(\\\\$\\\\([a-zA-Z0-9_]+\\\\))|(([A-Za-z_%.]+):) (\\\\$\\\\([a-zA-Z0-9_]+\\\\))|(([A-Za-z_%.]+):)
.{5} .{5}
.{10,15} .{10,15}
(a(abc)+){3,}
" "
input="\ input="\
abcdef abcdef
@@ -247,6 +248,7 @@ https://kyryl.tk/404
OBJ = \$(SRC:.c=.o) OBJ = \$(SRC:.c=.o)
рврыр рврыр
рврырdhsjhh рврырdhsjhh
aabcabcaabcaabc
" "
expect="\ expect="\
(0,3) (0,3)
@@ -371,6 +373,7 @@ expect="\
(8,12)(?,?)(8,12)(8,11) (8,12)(?,?)(8,12)(8,11)
(0,10) (0,10)
(0,16) (0,16)
(0,15)(11,15)(12,15)
(0,0) (0,0)
" "