handle {n,} repetition without blowing up codesize
This commit is contained in:
3
README
3
README
@@ -31,7 +31,8 @@ so that the user does not need to waste time taking strlen()
|
|||||||
* Highly optimized source code, probably 2x faster than re1.5
|
* Highly optimized source code, probably 2x faster than re1.5
|
||||||
* Support for quoted chars in regex. Escapes in brackets.
|
* Support for quoted chars in regex. Escapes in brackets.
|
||||||
* Support for ^, $ assertions in regex.
|
* Support for ^, $ assertions in regex.
|
||||||
* Support for repetition operator {n} and {n,m}.
|
* Support for repetition operators {n}, {n,m} and {n,}.
|
||||||
|
- Note: cases with 0 are not handled; avoid them, as they can easily be replaced.
|
||||||
* Support for Unicode (UTF-8).
|
* Support for Unicode (UTF-8).
|
||||||
* Unlike other engines, the output is a byte-level offset (which is more useful).
|
* Unlike other engines, the output is a byte-level offset (which is more useful).
|
||||||
* Support for non-capturing groups (?:).
|
* Support for non-capturing groups (?:).
|
||||||
|
|||||||
27
pike.c
27
pike.c
@@ -125,10 +125,10 @@ static int re_classmatch(const int *pc, int c)
|
|||||||
|
|
||||||
void re_dumpcode(rcode *prog)
|
void re_dumpcode(rcode *prog)
|
||||||
{
|
{
|
||||||
int pc = 0;
|
int pc = 0, i = 0;
|
||||||
int *code = prog->insts;
|
int *code = prog->insts;
|
||||||
while (pc < prog->unilen) {
|
while (pc < prog->unilen) {
|
||||||
printf("%4d: ", pc);
|
printf("%4d: ", pc); i++;
|
||||||
switch(code[pc++]) {
|
switch(code[pc++]) {
|
||||||
default:
|
default:
|
||||||
pc = prog->unilen;
|
pc = prog->unilen;
|
||||||
@@ -182,7 +182,7 @@ void re_dumpcode(rcode *prog)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
printf("Unilen: %d, insts: %d\n", prog->unilen, prog->len);
|
printf("Unilen: %d, insts: %d, counted insts: %d\n", prog->unilen, prog->len, i);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* next todo: crack and factor out this recursion,
|
/* next todo: crack and factor out this recursion,
|
||||||
@@ -276,14 +276,14 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
break;
|
break;
|
||||||
case '{':;
|
case '{':;
|
||||||
int maxcnt = 0, mincnt = 0,
|
int maxcnt = 0, mincnt = 0,
|
||||||
i = 0, icnt = 0, size;
|
i = 0, icnt = 0, inf = 0, size;
|
||||||
re++;
|
re++;
|
||||||
while (isdigit((unsigned char) *re))
|
while (isdigit((unsigned char) *re))
|
||||||
mincnt = mincnt * 10 + *re++ - '0';
|
mincnt = mincnt * 10 + *re++ - '0';
|
||||||
if (*re == ',') {
|
if (*re == ',') {
|
||||||
re++;
|
re++;
|
||||||
if (*re == '}')
|
if (*re == '}')
|
||||||
maxcnt = 256;
|
inf = 1;
|
||||||
while (isdigit((unsigned char) *re))
|
while (isdigit((unsigned char) *re))
|
||||||
maxcnt = maxcnt * 10 + *re++ - '0';
|
maxcnt = maxcnt * 10 + *re++ - '0';
|
||||||
} else
|
} else
|
||||||
@@ -293,12 +293,21 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
memcpy(&code[PC], &code[term], size*sizeof(int));
|
memcpy(&code[PC], &code[term], size*sizeof(int));
|
||||||
PC += size;
|
PC += size;
|
||||||
}
|
}
|
||||||
for (i = maxcnt-mincnt; i > 0; i--)
|
if (inf) {
|
||||||
{
|
EMIT(PC, RSPLIT);
|
||||||
|
EMIT(PC+1, REL(PC, PC - size -1));
|
||||||
|
EMIT(PC+2, 0);
|
||||||
|
PC += 3;
|
||||||
|
prog->len++;
|
||||||
prog->splits++;
|
prog->splits++;
|
||||||
|
maxcnt = mincnt;
|
||||||
|
}
|
||||||
|
for (i = maxcnt-mincnt; i > 0; i--) {
|
||||||
EMIT(PC++, SPLIT);
|
EMIT(PC++, SPLIT);
|
||||||
EMIT(PC++, REL(PC-1, PC+((size+3)*i)));
|
EMIT(PC++, REL(PC-1, PC+((size+3)*i)));
|
||||||
EMIT(PC++, 0);
|
EMIT(PC++, 0);
|
||||||
|
prog->splits++;
|
||||||
|
prog->len++;
|
||||||
if (code)
|
if (code)
|
||||||
memcpy(&code[PC], &code[term], size*sizeof(int));
|
memcpy(&code[PC], &code[term], size*sizeof(int));
|
||||||
PC += size;
|
PC += size;
|
||||||
@@ -313,11 +322,11 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
case RSPLIT:
|
case RSPLIT:
|
||||||
case SAVE:
|
case SAVE:
|
||||||
case CHAR:
|
case CHAR:
|
||||||
case ANY:
|
|
||||||
i++;
|
i++;
|
||||||
|
case ANY:
|
||||||
icnt++;
|
icnt++;
|
||||||
}
|
}
|
||||||
prog->len += maxcnt * icnt;
|
prog->len += (maxcnt-1) * icnt;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case '?':
|
case '?':
|
||||||
|
|||||||
3
test.sh
3
test.sh
@@ -123,6 +123,7 @@ aaaaa(aa)aa(aa(a)a)?aa
|
|||||||
(\\\\$\\\\([a-zA-Z0-9_]+\\\\))|(([A-Za-z_%.]+):)
|
(\\\\$\\\\([a-zA-Z0-9_]+\\\\))|(([A-Za-z_%.]+):)
|
||||||
.{5}
|
.{5}
|
||||||
.{10,15}
|
.{10,15}
|
||||||
|
(a(abc)+){3,}
|
||||||
"
|
"
|
||||||
input="\
|
input="\
|
||||||
abcdef
|
abcdef
|
||||||
@@ -247,6 +248,7 @@ https://kyryl.tk/404
|
|||||||
OBJ = \$(SRC:.c=.o)
|
OBJ = \$(SRC:.c=.o)
|
||||||
рврыр
|
рврыр
|
||||||
рврырdhsjhh
|
рврырdhsjhh
|
||||||
|
aabcabcaabcaabc
|
||||||
"
|
"
|
||||||
expect="\
|
expect="\
|
||||||
(0,3)
|
(0,3)
|
||||||
@@ -371,6 +373,7 @@ expect="\
|
|||||||
(8,12)(?,?)(8,12)(8,11)
|
(8,12)(?,?)(8,12)(8,11)
|
||||||
(0,10)
|
(0,10)
|
||||||
(0,16)
|
(0,16)
|
||||||
|
(0,15)(11,15)(12,15)
|
||||||
(0,0)
|
(0,0)
|
||||||
"
|
"
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user