From 541d881b4f79dc120477979e4cd656c1ac5146f5 Mon Sep 17 00:00:00 2001 From: Kyryl Melekhin Date: Fri, 8 Oct 2021 12:04:24 +0000 Subject: [PATCH] handle {n,} repetition without blowing up codesize --- README | 3 ++- pike.c | 27 ++++++++++++++++++--------- test.sh | 3 +++ 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/README b/README index 9b6985a..420f220 100644 --- a/README +++ b/README @@ -31,7 +31,8 @@ so that the user does not need to waste time taking strlen() * Highly optimized source code, probably 2x faster than re1.5 * Support for quoted chars in regex. Escapes in brackets. * Support for ^, $ assertions in regex. -* Support for repetition operator {n} and {n,m}. +* Support for repetition operators {n}, {n,m} and {n,}. +- Note: a repetition count of 0 (e.g. {0,m} or {0,}) is not handled; avoid such patterns, as they can easily be rewritten without a zero count. * Support for Unicode (UTF-8). * Unlike other engines, the output is byte level offset. (Which is more useful) * Support for non capture group ?: diff --git a/pike.c b/pike.c index 375cf2f..5197008 100644 --- a/pike.c +++ b/pike.c @@ -125,10 +125,10 @@ static int re_classmatch(const int *pc, int c) void re_dumpcode(rcode *prog) { - int pc = 0; + int pc = 0, i = 0; int *code = prog->insts; while (pc < prog->unilen) { - printf("%4d: ", pc); + printf("%4d: ", pc); i++; switch(code[pc++]) { default: pc = prog->unilen; @@ -182,7 +182,7 @@ void re_dumpcode(rcode *prog) break; } } - printf("Unilen: %d, insts: %d\n", prog->unilen, prog->len); + printf("Unilen: %d, insts: %d, counted insts: %d\n", prog->unilen, prog->len, i); } /* next todo: crack and factor out this recursion, @@ -276,14 +276,14 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode) break; case '{':; int maxcnt = 0, mincnt = 0, - i = 0, icnt = 0, size; + i = 0, icnt = 0, inf = 0, size; re++; while (isdigit((unsigned char) *re)) mincnt = mincnt * 10 + *re++ - '0'; if (*re == ',') { re++; if (*re == '}') - maxcnt = 256; + inf = 1; while (isdigit((unsigned char) *re)) 
maxcnt = maxcnt * 10 + *re++ - '0'; } else @@ -293,12 +293,21 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode) memcpy(&code[PC], &code[term], size*sizeof(int)); PC += size; } - for (i = maxcnt-mincnt; i > 0; i--) - { + if (inf) { + EMIT(PC, RSPLIT); + EMIT(PC+1, REL(PC, PC - size -1)); + EMIT(PC+2, 0); + PC += 3; + prog->len++; prog->splits++; + maxcnt = mincnt; + } + for (i = maxcnt-mincnt; i > 0; i--) { EMIT(PC++, SPLIT); EMIT(PC++, REL(PC-1, PC+((size+3)*i))); EMIT(PC++, 0); + prog->splits++; + prog->len++; if (code) memcpy(&code[PC], &code[term], size*sizeof(int)); PC += size; @@ -313,11 +322,11 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode) case RSPLIT: case SAVE: case CHAR: - case ANY: i++; + case ANY: icnt++; } - prog->len += maxcnt * icnt; + prog->len += (maxcnt-1) * icnt; } break; case '?': diff --git a/test.sh b/test.sh index aa6c54d..d9440cf 100755 --- a/test.sh +++ b/test.sh @@ -123,6 +123,7 @@ aaaaa(aa)aa(aa(a)a)?aa (\\\\$\\\\([a-zA-Z0-9_]+\\\\))|(([A-Za-z_%.]+):) .{5} .{10,15} +(a(abc)+){3,} " input="\ abcdef @@ -247,6 +248,7 @@ https://kyryl.tk/404 OBJ = \$(SRC:.c=.o) рврыр рврырdhsjhh +aabcabcaabcaabc " expect="\ (0,3) @@ -371,6 +373,7 @@ expect="\ (8,12)(?,?)(8,12)(8,11) (0,10) (0,16) +(0,15)(11,15)(12,15) (0,0) "