further improve submatch extraction

This commit is contained in:
Kyryl Melekhin
2021-09-02 16:36:19 +00:00
parent 695f6b1f56
commit 67c691f95b
2 changed files with 33 additions and 16 deletions

42
pike.c
View File

@@ -56,6 +56,7 @@ struct rcode
int unilen; int unilen;
int len; int len;
int sub; int sub;
int presub;
int splits; int splits;
int gen; int gen;
int insts[]; int insts[];
@@ -254,7 +255,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
if (capture) { if (capture) {
sub = ++prog->sub; sub = ++prog->sub;
EMIT(PC++, SAVE); EMIT(PC++, SAVE);
EMIT(PC++, 2 * sub); EMIT(PC++, sub);
prog->len++; prog->len++;
} }
int res = _compilecode(&re, prog, sizecode); int res = _compilecode(&re, prog, sizecode);
@@ -263,7 +264,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
if (*re != ')') return RE_SYNTAX_ERROR; if (*re != ')') return RE_SYNTAX_ERROR;
if (capture) { if (capture) {
EMIT(PC++, SAVE); EMIT(PC++, SAVE);
EMIT(PC++, 2 * sub + 1); EMIT(PC++, sub + prog->presub + 1);
prog->len++; prog->len++;
} }
break; break;
@@ -387,24 +388,26 @@ syntax_error:
return RE_SYNTAX_ERROR; return RE_SYNTAX_ERROR;
} }
int re_sizecode(const char *re) int re_sizecode(const char *re, int *nsub)
{ {
rcode dummyprog; rcode dummyprog;
dummyprog.unilen = 3; dummyprog.unilen = 3;
dummyprog.sub = 0;
int res = _compilecode(&re, &dummyprog, /*sizecode*/1); int res = _compilecode(&re, &dummyprog, /*sizecode*/1);
if (res < 0) return res; if (res < 0) return res;
// If unparsed chars left // If unparsed chars left
if (*re) return RE_SYNTAX_ERROR; if (*re) return RE_SYNTAX_ERROR;
*nsub = dummyprog.sub;
return dummyprog.unilen; return dummyprog.unilen;
} }
int re_comp(rcode *prog, const char *re) int re_comp(rcode *prog, const char *re, int nsubs)
{ {
prog->len = 0; prog->len = 0;
prog->unilen = 0; prog->unilen = 0;
prog->sub = 0; prog->sub = 0;
prog->presub = nsubs;
prog->splits = 0; prog->splits = 0;
prog->gen = 1; prog->gen = 1;
@@ -414,7 +417,7 @@ int re_comp(rcode *prog, const char *re)
if (*re) return RE_SYNTAX_ERROR; if (*re) return RE_SYNTAX_ERROR;
prog->insts[prog->unilen++] = SAVE; prog->insts[prog->unilen++] = SAVE;
prog->insts[prog->unilen++] = 1; prog->insts[prog->unilen++] = prog->sub + 1;
prog->insts[prog->unilen++] = MATCH; prog->insts[prog->unilen++] = MATCH;
prog->len += 2; prog->len += 2;
@@ -450,6 +453,16 @@ if (*pc < WBEG) { \
subs[i++] = sub; \ subs[i++] = sub; \
goto next##nn; \ goto next##nn; \
#define save1() \
newsub(for (j = nsubp / 2; j < nsubp; j++) s1->sub[j] = NULL;) \
for (j = 0; j < nsubp / 2; j++) \
s1->sub[j] = sub->sub[j]; \
#define save2() \
newsub(/*nop*/) \
for (j = 0; j < nsubp; j++) \
s1->sub[j] = sub->sub[j]; \
#define addthread(nn, list, listidx, _pc, _sub) \ #define addthread(nn, list, listidx, _pc, _sub) \
{ \ { \
int i = 0, *pc = _pc; \ int i = 0, *pc = _pc; \
@@ -491,9 +504,7 @@ goto next##nn; \
case SAVE: \ case SAVE: \
if (sub->ref > 1) { \ if (sub->ref > 1) { \
sub->ref--; \ sub->ref--; \
newsub(/*nop*/) \ save##nn() \
for (j = 0; j < nsubp; j++) \
s1->sub[j] = sub->sub[j]; \
sub = s1; \ sub = s1; \
sub->ref = 1; \ sub->ref = 1; \
} \ } \
@@ -583,8 +594,10 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
break; break;
} }
if (matched) { if (matched) {
for (i = 0; i < nsubp; i++) for (i = 0, j = i; i < nsubp; i+=2, j++) {
subp[i] = matched->sub[i]; subp[i] = matched->sub[j];
subp[i+1] = matched->sub[j+(nsubp/2)];
}
_return(1) _return(1)
} }
_return(0) _return(0)
@@ -596,19 +609,20 @@ int main(int argc, char *argv[])
printf("usage: <regex> <str...> <str...> ...\n"); printf("usage: <regex> <str...> <str...> ...\n");
return 0; return 0;
} }
int sz = re_sizecode(argv[1]) * sizeof(int); int sub_els;
int sz = re_sizecode(argv[1], &sub_els) * sizeof(int);
printf("Precalculated size: %d\n", sz); printf("Precalculated size: %d\n", sz);
char code[(sizeof(rcode)+sz)*2]; char code[(sizeof(rcode)+sz)*2];
memset(code+sizeof(rcode)+sz, 0, sizeof(rcode)+sz); memset(code+sizeof(rcode)+sz, 0, sizeof(rcode)+sz);
rcode *_code = (rcode*)code; rcode *_code = (rcode*)code;
if (re_comp(_code, argv[1])) { if (re_comp(_code, argv[1], sub_els)) {
printf("Error in re_comp"); printf("Error in re_comp");
return 1; return 1;
} }
re_dumpcode(_code); re_dumpcode(_code);
#include <time.h> #include <time.h>
if (argc > 2) { if (argc > 2) {
int sub_els = (_code->sub + 1) * 2; sub_els = (sub_els + 1) * 2;
const char *sub[sub_els]; const char *sub[sub_els];
for (int i = 2; i < argc; i++) { for (int i = 2; i < argc; i++) {
printf("input bytelen: %ld\n", strlen(argv[i])); printf("input bytelen: %ld\n", strlen(argv[i]));

View File

@@ -91,6 +91,7 @@ abc\\\\>
[-+]?\\\\<(0[xX][0-9a-fA-FUL]+|[0-9.]{1,}[0-9eEfFuULl]+|[0-9]+)\\\\> [-+]?\\\\<(0[xX][0-9a-fA-FUL]+|[0-9.]{1,}[0-9eEfFuULl]+|[0-9]+)\\\\>
[-+]?\\\\<(0[xX][0-9a-fA-FUL]+|[0-9.]{1,}[0-9eEfFuULl]+|[0-9]+)\\\\> [-+]?\\\\<(0[xX][0-9a-fA-FUL]+|[0-9.]{1,}[0-9eEfFuULl]+|[0-9]+)\\\\>
[-+]?\\\\<(0[xX][0-9a-fA-FUL]+|[0-9.]{1,}[0-9eEfFuULl]+|[0-9]+)\\\\> [-+]?\\\\<(0[xX][0-9a-fA-FUL]+|[0-9.]{1,}[0-9eEfFuULl]+|[0-9]+)\\\\>
(([-+]?\\\\<(0[xX][0-9a-fA-FUL]+|[0-9.]{1,}[0-9eEfFuULl]+|[0-9]+)\\\\>))
qwerty.*$ qwerty.*$
([a-zA-Z0-9_][^1]*[a-zA-Z0-9_])|(\\\\\$([^\$]+)\\\\\$) ([a-zA-Z0-9_][^1]*[a-zA-Z0-9_])|(\\\\\$([^\$]+)\\\\\$)
([a-zA-Z0-9_][^1]*[a-zA-Z0-9_])|(\\\\\$([^\$]+)\\\\\$) ([a-zA-Z0-9_][^1]*[a-zA-Z0-9_])|(\\\\\$([^\$]+)\\\\\$)
@@ -197,6 +198,7 @@ world
0x663q 0x663q
x37247 x37247
124435.7727ULL 124435.7727ULL
str + len - 1;
jjdfjk sjdjjsqwerty jdfjdfhhdhfdjjjfj jjjdf jjdfjk sjdjjsqwerty jdfjdfhhdhfdjjjfj jjjdf
$\"}, /* email */ $\"}, /* email */
$\"}, /* email */$ $\"}, /* email */$
@@ -303,6 +305,7 @@ expect="\
-nomatch- -nomatch-
-nomatch- -nomatch-
(2,16)(2,16) (2,16)(2,16)
(12,13)(12,13)(12,13)(12,13)
(14,44) (14,44)
(9,14)(9,14)(?,?)(?,?) (9,14)(9,14)(?,?)(?,?)
(0,18)(?,?)(0,18)(1,17) (0,18)(?,?)(0,18)(1,17)