factor out recursion

This commit is contained in:
Kyryl Melekhin
2022-04-21 14:13:14 +00:00
parent 8bee489e34
commit e26cb69a45
2 changed files with 51 additions and 47 deletions

95
pike.c
View File

@@ -83,13 +83,6 @@ enum
RSPLIT, RSPLIT,
}; };
/* Return codes for re_sizecode() and re_comp() */
enum {
RE_SUCCESS = 0,
RE_SYNTAX_ERROR = -2,
RE_UNSUPPORTED_SYNTAX = -3,
};
typedef struct rsub rsub; typedef struct rsub rsub;
struct rsub struct rsub
{ {
@@ -184,25 +177,22 @@ void re_dumpcode(rcode *prog)
prog->unilen, prog->len, prog->splits, i); prog->unilen, prog->len, prog->splits, i);
} }
/* next todo: crack and factor out this recursion, static int _compilecode(const char *re_loc, rcode *prog, int sizecode)
no recursion will allow to make a meta macro out
of this, such that re_sizecode() becomes efficient
difficulty: very high, probably not any time soon */
static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
{ {
const char *re = *re_loc; const char *re = re_loc;
int *code = sizecode ? NULL : prog->insts; int *code = sizecode ? NULL : prog->insts;
int start = PC, term = PC; int start = PC, term = PC;
int alt_label = 0, c; int alt_label = 0, c;
int alt_stack[5000], altc = 0; int alt_stack[4096], altc = 0;
int cap_stack[4096 * 5], capc = 0;
for (; *re && *re != ')';) { while (*re) {
switch (*re) { switch (*re) {
case '\\': case '\\':
re++; re++;
if (!*re) goto syntax_error; /* Trailing backslash */ if (!*re) return -1; /* Trailing backslash */
if (*re == '<' || *re == '>') { if (*re == '<' || *re == '>') {
if (re - *re_loc > 2 && re[-2] == '\\') if (re - re_loc > 2 && re[-2] == '\\')
break; break;
EMIT(PC++, *re == '<' ? WBEG : WEND); EMIT(PC++, *re == '<' ? WBEG : WEND);
term = PC; term = PC;
@@ -230,7 +220,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
PC++; /* Skip "# of pairs" byte */ PC++; /* Skip "# of pairs" byte */
for (cnt = 0; *re != ']'; cnt++) { for (cnt = 0; *re != ']'; cnt++) {
if (*re == '\\') re++; if (*re == '\\') re++;
if (!*re) goto syntax_error; if (!*re) return -1;
uc_code(c, re) EMIT(PC++, c); uc_code(c, re) EMIT(PC++, c);
uc_len(c, re) uc_len(c, re)
if (re[c] == '-' && re[c+1] != ']') if (re[c] == '-' && re[c+1] != ']')
@@ -244,29 +234,42 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
term = PC; term = PC;
int sub; int sub;
int capture = 1; int capture = 1;
re++; if (*(re+1) == '?') {
if (*re == '?') { re += 2;
re++; if (*re == ':')
if (*re == ':') {
capture = 0; capture = 0;
re++; else
} else { return -1;
*re_loc = re;
return RE_UNSUPPORTED_SYNTAX;
}
} }
if (capture) { if (capture) {
sub = ++prog->sub; sub = ++prog->sub;
EMIT(PC++, SAVE); EMIT(PC++, SAVE);
EMIT(PC++, sub); EMIT(PC++, sub);
} }
int res = _compilecode(&re, prog, sizecode); cap_stack[capc++] = capture;
*re_loc = re; cap_stack[capc++] = term;
if (res < 0) return res; cap_stack[capc++] = alt_label;
if (*re != ')') return RE_SYNTAX_ERROR; cap_stack[capc++] = start;
if (capture) { cap_stack[capc++] = altc;
alt_label = 0;
start = PC;
break;
case ')':
if (--capc-4 < 0) return -1;
if (code && alt_label) {
EMIT(alt_label, REL(alt_label, PC) + 1);
int _altc = cap_stack[capc];
for (int alts = altc; altc > _altc; altc--) {
int at = alt_stack[_altc+alts-altc]+(altc-_altc)*2;
EMIT(at, REL(at, PC) + 1);
}
}
start = cap_stack[--capc];
alt_label = cap_stack[--capc];
term = cap_stack[--capc];
if (cap_stack[--capc]) {
EMIT(PC++, SAVE); EMIT(PC++, SAVE);
EMIT(PC++, sub + prog->presub + 1); EMIT(PC++, code[term+1] + prog->presub + 1);
} }
break; break;
case '{':; case '{':;
@@ -300,7 +303,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
} }
break; break;
case '?': case '?':
if (PC == term) goto syntax_error; if (PC == term) return -1;
INSERT_CODE(term, 2, PC); INSERT_CODE(term, 2, PC);
if (re[1] == '?') { if (re[1] == '?') {
EMIT(term, RSPLIT); EMIT(term, RSPLIT);
@@ -311,7 +314,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
term = PC; term = PC;
break; break;
case '*': case '*':
if (PC == term) goto syntax_error; if (PC == term) return -1;
INSERT_CODE(term, 2, PC); INSERT_CODE(term, 2, PC);
EMIT(PC, JMP); EMIT(PC, JMP);
EMIT(PC + 1, REL(PC, term)); EMIT(PC + 1, REL(PC, term));
@@ -325,7 +328,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
term = PC; term = PC;
break; break;
case '+': case '+':
if (PC == term) goto syntax_error; if (PC == term) return -1;
if (re[1] == '?') { if (re[1] == '?') {
EMIT(PC, SPLIT); EMIT(PC, SPLIT);
re++; re++;
@@ -363,11 +366,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
EMIT(at, REL(at, PC) + 1); EMIT(at, REL(at, PC) + 1);
} }
} }
*re_loc = re; return capc ? -1 : 0;
return RE_SUCCESS;
syntax_error:
*re_loc = re;
return RE_SYNTAX_ERROR;
} }
int re_sizecode(const char *re, int *nsub) int re_sizecode(const char *re, int *nsub)
@@ -376,9 +375,8 @@ int re_sizecode(const char *re, int *nsub)
dummyprog.unilen = 3; dummyprog.unilen = 3;
dummyprog.sub = 0; dummyprog.sub = 0;
int res = _compilecode(&re, &dummyprog, 1); int res = _compilecode(re, &dummyprog, 1);
if (res < 0) return res; if (res < 0) return res;
if (*re) return RE_SYNTAX_ERROR;
*nsub = dummyprog.sub; *nsub = dummyprog.sub;
return dummyprog.unilen; return dummyprog.unilen;
} }
@@ -391,9 +389,8 @@ int re_comp(rcode *prog, const char *re, int nsubs)
prog->presub = nsubs; prog->presub = nsubs;
prog->splits = 0; prog->splits = 0;
int res = _compilecode(&re, prog, 0); int res = _compilecode(re, prog, 0);
if (res < 0) return res; if (res < 0) return res;
if (*re) return RE_SYNTAX_ERROR;
int icnt = 0, scnt = SPLIT; int icnt = 0, scnt = SPLIT;
for (int i = 0; i < prog->unilen; i++) for (int i = 0; i < prog->unilen; i++)
switch (prog->insts[i]) { switch (prog->insts[i]) {
@@ -424,7 +421,7 @@ int re_comp(rcode *prog, const char *re, int nsubs)
prog->presub = sizeof(rsub)+(sizeof(char*) * (nsubs + 1) * 2); prog->presub = sizeof(rsub)+(sizeof(char*) * (nsubs + 1) * 2);
prog->sub = prog->presub * (prog->len - prog->splits + 3); prog->sub = prog->presub * (prog->len - prog->splits + 3);
prog->sparsesz = scnt; prog->sparsesz = scnt;
return RE_SUCCESS; return 0;
} }
#define newsub(init, copy) \ #define newsub(init, copy) \
@@ -636,10 +633,14 @@ int main(int argc, char *argv[])
int sub_els; int sub_els;
int sz = re_sizecode(argv[1], &sub_els) * sizeof(int); int sz = re_sizecode(argv[1], &sub_els) * sizeof(int);
printf("Precalculated size: %d\n", sz); printf("Precalculated size: %d\n", sz);
if (sz < 0) {
printf("Error in re_sizecode\n");
return 1;
}
char code[sizeof(rcode)+sz]; char code[sizeof(rcode)+sz];
rcode *_code = (rcode*)code; rcode *_code = (rcode*)code;
if (re_comp(_code, argv[1], sub_els)) { if (re_comp(_code, argv[1], sub_els)) {
printf("Error in re_comp"); printf("Error in re_comp\n");
return 1; return 1;
} }
re_dumpcode(_code); re_dumpcode(_code);

View File

@@ -171,6 +171,7 @@ aaaaa(aa)aa(aa(a)a)?aa
[0-9]+.(.*) [0-9]+.(.*)
([0-9])+.(.*) ([0-9])+.(.*)
(([0-9])+)(.)(.*) (([0-9])+)(.)(.*)
(abc|sjd|qwq(hs|qw|oo)|(ty|xx|pp)we)
" "
input="\ input="\
abcdef abcdef
@@ -343,6 +344,7 @@ h:98: :3234utt;strokeliin:miter;stroke-mirlimit:10;stroke-dasharray:none;strok
650-253-000123434-45551221 650-253-000123434-45551221
650-253-000123434-45551221 650-253-000123434-45551221
650-253-000123434-455512213224hsaqer 650-253-000123434-455512213224hsaqer
ppwe
" "
expect="\ expect="\
(0,3) (0,3)
@@ -515,6 +517,7 @@ expect="\
(0,26)(4,26) (0,26)(4,26)
(0,26)(2,3)(4,26) (0,26)(2,3)(4,26)
(0,36)(0,3)(2,3)(3,4)(4,36) (0,36)(0,3)(2,3)(3,4)(4,36)
(0,4)(0,4)(?,?)(0,2)
(0,0) (0,0)
" "