factor out recursion
This commit is contained in:
95
pike.c
95
pike.c
@@ -83,13 +83,6 @@ enum
|
|||||||
RSPLIT,
|
RSPLIT,
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Return codes for re_sizecode() and re_comp() */
|
|
||||||
enum {
|
|
||||||
RE_SUCCESS = 0,
|
|
||||||
RE_SYNTAX_ERROR = -2,
|
|
||||||
RE_UNSUPPORTED_SYNTAX = -3,
|
|
||||||
};
|
|
||||||
|
|
||||||
typedef struct rsub rsub;
|
typedef struct rsub rsub;
|
||||||
struct rsub
|
struct rsub
|
||||||
{
|
{
|
||||||
@@ -184,25 +177,22 @@ void re_dumpcode(rcode *prog)
|
|||||||
prog->unilen, prog->len, prog->splits, i);
|
prog->unilen, prog->len, prog->splits, i);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* next todo: crack and factor out this recursion,
|
static int _compilecode(const char *re_loc, rcode *prog, int sizecode)
|
||||||
no recursion will allow to make a meta macro out
|
|
||||||
of this, such that re_sizecode() becomes efficient
|
|
||||||
difficulty: very high, probably not any time soon */
|
|
||||||
static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|
||||||
{
|
{
|
||||||
const char *re = *re_loc;
|
const char *re = re_loc;
|
||||||
int *code = sizecode ? NULL : prog->insts;
|
int *code = sizecode ? NULL : prog->insts;
|
||||||
int start = PC, term = PC;
|
int start = PC, term = PC;
|
||||||
int alt_label = 0, c;
|
int alt_label = 0, c;
|
||||||
int alt_stack[5000], altc = 0;
|
int alt_stack[4096], altc = 0;
|
||||||
|
int cap_stack[4096 * 5], capc = 0;
|
||||||
|
|
||||||
for (; *re && *re != ')';) {
|
while (*re) {
|
||||||
switch (*re) {
|
switch (*re) {
|
||||||
case '\\':
|
case '\\':
|
||||||
re++;
|
re++;
|
||||||
if (!*re) goto syntax_error; /* Trailing backslash */
|
if (!*re) return -1; /* Trailing backslash */
|
||||||
if (*re == '<' || *re == '>') {
|
if (*re == '<' || *re == '>') {
|
||||||
if (re - *re_loc > 2 && re[-2] == '\\')
|
if (re - re_loc > 2 && re[-2] == '\\')
|
||||||
break;
|
break;
|
||||||
EMIT(PC++, *re == '<' ? WBEG : WEND);
|
EMIT(PC++, *re == '<' ? WBEG : WEND);
|
||||||
term = PC;
|
term = PC;
|
||||||
@@ -230,7 +220,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
PC++; /* Skip "# of pairs" byte */
|
PC++; /* Skip "# of pairs" byte */
|
||||||
for (cnt = 0; *re != ']'; cnt++) {
|
for (cnt = 0; *re != ']'; cnt++) {
|
||||||
if (*re == '\\') re++;
|
if (*re == '\\') re++;
|
||||||
if (!*re) goto syntax_error;
|
if (!*re) return -1;
|
||||||
uc_code(c, re) EMIT(PC++, c);
|
uc_code(c, re) EMIT(PC++, c);
|
||||||
uc_len(c, re)
|
uc_len(c, re)
|
||||||
if (re[c] == '-' && re[c+1] != ']')
|
if (re[c] == '-' && re[c+1] != ']')
|
||||||
@@ -244,29 +234,42 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
term = PC;
|
term = PC;
|
||||||
int sub;
|
int sub;
|
||||||
int capture = 1;
|
int capture = 1;
|
||||||
re++;
|
if (*(re+1) == '?') {
|
||||||
if (*re == '?') {
|
re += 2;
|
||||||
re++;
|
if (*re == ':')
|
||||||
if (*re == ':') {
|
|
||||||
capture = 0;
|
capture = 0;
|
||||||
re++;
|
else
|
||||||
} else {
|
return -1;
|
||||||
*re_loc = re;
|
|
||||||
return RE_UNSUPPORTED_SYNTAX;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (capture) {
|
if (capture) {
|
||||||
sub = ++prog->sub;
|
sub = ++prog->sub;
|
||||||
EMIT(PC++, SAVE);
|
EMIT(PC++, SAVE);
|
||||||
EMIT(PC++, sub);
|
EMIT(PC++, sub);
|
||||||
}
|
}
|
||||||
int res = _compilecode(&re, prog, sizecode);
|
cap_stack[capc++] = capture;
|
||||||
*re_loc = re;
|
cap_stack[capc++] = term;
|
||||||
if (res < 0) return res;
|
cap_stack[capc++] = alt_label;
|
||||||
if (*re != ')') return RE_SYNTAX_ERROR;
|
cap_stack[capc++] = start;
|
||||||
if (capture) {
|
cap_stack[capc++] = altc;
|
||||||
|
alt_label = 0;
|
||||||
|
start = PC;
|
||||||
|
break;
|
||||||
|
case ')':
|
||||||
|
if (--capc-4 < 0) return -1;
|
||||||
|
if (code && alt_label) {
|
||||||
|
EMIT(alt_label, REL(alt_label, PC) + 1);
|
||||||
|
int _altc = cap_stack[capc];
|
||||||
|
for (int alts = altc; altc > _altc; altc--) {
|
||||||
|
int at = alt_stack[_altc+alts-altc]+(altc-_altc)*2;
|
||||||
|
EMIT(at, REL(at, PC) + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
start = cap_stack[--capc];
|
||||||
|
alt_label = cap_stack[--capc];
|
||||||
|
term = cap_stack[--capc];
|
||||||
|
if (cap_stack[--capc]) {
|
||||||
EMIT(PC++, SAVE);
|
EMIT(PC++, SAVE);
|
||||||
EMIT(PC++, sub + prog->presub + 1);
|
EMIT(PC++, code[term+1] + prog->presub + 1);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case '{':;
|
case '{':;
|
||||||
@@ -300,7 +303,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case '?':
|
case '?':
|
||||||
if (PC == term) goto syntax_error;
|
if (PC == term) return -1;
|
||||||
INSERT_CODE(term, 2, PC);
|
INSERT_CODE(term, 2, PC);
|
||||||
if (re[1] == '?') {
|
if (re[1] == '?') {
|
||||||
EMIT(term, RSPLIT);
|
EMIT(term, RSPLIT);
|
||||||
@@ -311,7 +314,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
term = PC;
|
term = PC;
|
||||||
break;
|
break;
|
||||||
case '*':
|
case '*':
|
||||||
if (PC == term) goto syntax_error;
|
if (PC == term) return -1;
|
||||||
INSERT_CODE(term, 2, PC);
|
INSERT_CODE(term, 2, PC);
|
||||||
EMIT(PC, JMP);
|
EMIT(PC, JMP);
|
||||||
EMIT(PC + 1, REL(PC, term));
|
EMIT(PC + 1, REL(PC, term));
|
||||||
@@ -325,7 +328,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
term = PC;
|
term = PC;
|
||||||
break;
|
break;
|
||||||
case '+':
|
case '+':
|
||||||
if (PC == term) goto syntax_error;
|
if (PC == term) return -1;
|
||||||
if (re[1] == '?') {
|
if (re[1] == '?') {
|
||||||
EMIT(PC, SPLIT);
|
EMIT(PC, SPLIT);
|
||||||
re++;
|
re++;
|
||||||
@@ -363,11 +366,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
EMIT(at, REL(at, PC) + 1);
|
EMIT(at, REL(at, PC) + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
*re_loc = re;
|
return capc ? -1 : 0;
|
||||||
return RE_SUCCESS;
|
|
||||||
syntax_error:
|
|
||||||
*re_loc = re;
|
|
||||||
return RE_SYNTAX_ERROR;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int re_sizecode(const char *re, int *nsub)
|
int re_sizecode(const char *re, int *nsub)
|
||||||
@@ -376,9 +375,8 @@ int re_sizecode(const char *re, int *nsub)
|
|||||||
dummyprog.unilen = 3;
|
dummyprog.unilen = 3;
|
||||||
dummyprog.sub = 0;
|
dummyprog.sub = 0;
|
||||||
|
|
||||||
int res = _compilecode(&re, &dummyprog, 1);
|
int res = _compilecode(re, &dummyprog, 1);
|
||||||
if (res < 0) return res;
|
if (res < 0) return res;
|
||||||
if (*re) return RE_SYNTAX_ERROR;
|
|
||||||
*nsub = dummyprog.sub;
|
*nsub = dummyprog.sub;
|
||||||
return dummyprog.unilen;
|
return dummyprog.unilen;
|
||||||
}
|
}
|
||||||
@@ -391,9 +389,8 @@ int re_comp(rcode *prog, const char *re, int nsubs)
|
|||||||
prog->presub = nsubs;
|
prog->presub = nsubs;
|
||||||
prog->splits = 0;
|
prog->splits = 0;
|
||||||
|
|
||||||
int res = _compilecode(&re, prog, 0);
|
int res = _compilecode(re, prog, 0);
|
||||||
if (res < 0) return res;
|
if (res < 0) return res;
|
||||||
if (*re) return RE_SYNTAX_ERROR;
|
|
||||||
int icnt = 0, scnt = SPLIT;
|
int icnt = 0, scnt = SPLIT;
|
||||||
for (int i = 0; i < prog->unilen; i++)
|
for (int i = 0; i < prog->unilen; i++)
|
||||||
switch (prog->insts[i]) {
|
switch (prog->insts[i]) {
|
||||||
@@ -424,7 +421,7 @@ int re_comp(rcode *prog, const char *re, int nsubs)
|
|||||||
prog->presub = sizeof(rsub)+(sizeof(char*) * (nsubs + 1) * 2);
|
prog->presub = sizeof(rsub)+(sizeof(char*) * (nsubs + 1) * 2);
|
||||||
prog->sub = prog->presub * (prog->len - prog->splits + 3);
|
prog->sub = prog->presub * (prog->len - prog->splits + 3);
|
||||||
prog->sparsesz = scnt;
|
prog->sparsesz = scnt;
|
||||||
return RE_SUCCESS;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
#define newsub(init, copy) \
|
#define newsub(init, copy) \
|
||||||
@@ -636,10 +633,14 @@ int main(int argc, char *argv[])
|
|||||||
int sub_els;
|
int sub_els;
|
||||||
int sz = re_sizecode(argv[1], &sub_els) * sizeof(int);
|
int sz = re_sizecode(argv[1], &sub_els) * sizeof(int);
|
||||||
printf("Precalculated size: %d\n", sz);
|
printf("Precalculated size: %d\n", sz);
|
||||||
|
if (sz < 0) {
|
||||||
|
printf("Error in re_sizecode\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
char code[sizeof(rcode)+sz];
|
char code[sizeof(rcode)+sz];
|
||||||
rcode *_code = (rcode*)code;
|
rcode *_code = (rcode*)code;
|
||||||
if (re_comp(_code, argv[1], sub_els)) {
|
if (re_comp(_code, argv[1], sub_els)) {
|
||||||
printf("Error in re_comp");
|
printf("Error in re_comp\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
re_dumpcode(_code);
|
re_dumpcode(_code);
|
||||||
|
|||||||
3
test.sh
3
test.sh
@@ -171,6 +171,7 @@ aaaaa(aa)aa(aa(a)a)?aa
|
|||||||
[0-9]+.(.*)
|
[0-9]+.(.*)
|
||||||
([0-9])+.(.*)
|
([0-9])+.(.*)
|
||||||
(([0-9])+)(.)(.*)
|
(([0-9])+)(.)(.*)
|
||||||
|
(abc|sjd|qwq(hs|qw|oo)|(ty|xx|pp)we)
|
||||||
"
|
"
|
||||||
input="\
|
input="\
|
||||||
abcdef
|
abcdef
|
||||||
@@ -343,6 +344,7 @@ h:98: :3234utt;strokeliin:miter;stroke-mirlimit:10;stroke-dasharray:none;strok
|
|||||||
650-253-000123434-45551221
|
650-253-000123434-45551221
|
||||||
650-253-000123434-45551221
|
650-253-000123434-45551221
|
||||||
650-253-000123434-455512213224hsaqer
|
650-253-000123434-455512213224hsaqer
|
||||||
|
ppwe
|
||||||
"
|
"
|
||||||
expect="\
|
expect="\
|
||||||
(0,3)
|
(0,3)
|
||||||
@@ -515,6 +517,7 @@ expect="\
|
|||||||
(0,26)(4,26)
|
(0,26)(4,26)
|
||||||
(0,26)(2,3)(4,26)
|
(0,26)(2,3)(4,26)
|
||||||
(0,36)(0,3)(2,3)(3,4)(4,36)
|
(0,36)(0,3)(2,3)(3,4)(4,36)
|
||||||
|
(0,4)(0,4)(?,?)(0,2)
|
||||||
(0,0)
|
(0,0)
|
||||||
"
|
"
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user