get rid of all globals, inline/optimize

This commit is contained in:
Kyryl Melekhin
2021-07-18 14:23:13 +00:00
parent 5a3bb5729b
commit c4caa646e5
2 changed files with 115 additions and 134 deletions

233
pike.c
View File

@@ -70,6 +70,7 @@ struct rcode
int unilen; int unilen;
int len; int len;
int sub; int sub;
int splits;
int insts[]; int insts[];
}; };
@@ -105,7 +106,6 @@ typedef struct rsub rsub;
struct rsub struct rsub
{ {
int ref; int ref;
int nsub;
const char *sub[128]; const char *sub[128];
}; };
@@ -137,46 +137,6 @@ void re_fatal(char *msg)
exit(2); exit(2);
} }
static rsub *freesub;
static rsub subs[10];
static int subidx;
rsub* newsub(int n)
{
rsub *s = freesub;
if(s != NULL)
freesub = (rsub*)s->sub[0];
else
s = &subs[subidx++];
s->nsub = n;
s->ref = 1;
return s;
}
rsub* update(rsub *s, int i, const char *p)
{
rsub *s1;
int j;
if(s->ref > 1) {
s1 = newsub(s->nsub);
for(j=0; j<s->nsub; j++)
s1->sub[j] = s->sub[j];
s->ref--;
s = s1;
}
s->sub[i] = p;
return s;
}
void decref(rsub *s)
{
if(--s->ref == 0) {
s->sub[0] = (char*)freesub;
freesub = s;
}
}
int re_classmatch(const int *pc, const char *sp) int re_classmatch(const int *pc, const char *sp)
{ {
// pc points to "classnot" byte after opcode // pc points to "classnot" byte after opcode
@@ -382,6 +342,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
split = *(re+1) == '[' ? RSPLIT : SPLIT; split = *(re+1) == '[' ? RSPLIT : SPLIT;
for (i = maxcnt-mincnt; i > 0; i--) for (i = maxcnt-mincnt; i > 0; i--)
{ {
prog->splits++;
EMIT(PC++, split); EMIT(PC++, split);
EMIT(PC++, REL(PC, PC+((size+2)*i))); EMIT(PC++, REL(PC, PC+((size+2)*i)));
if (code) if (code)
@@ -414,6 +375,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
} }
EMIT(term + 1, REL(term, PC)); EMIT(term + 1, REL(term, PC));
prog->len++; prog->len++;
prog->splits++;
term = PC; term = PC;
break; break;
case '*': case '*':
@@ -429,6 +391,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
EMIT(term, SPLIT); EMIT(term, SPLIT);
} }
EMIT(term + 1, REL(term, PC)); EMIT(term + 1, REL(term, PC));
prog->splits++;
prog->len += 2; prog->len += 2;
term = PC; term = PC;
break; break;
@@ -442,6 +405,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
} }
EMIT(PC + 1, REL(PC, term)); EMIT(PC + 1, REL(PC, term));
PC += 2; PC += 2;
prog->splits++;
prog->len++; prog->len++;
term = PC; term = PC;
break; break;
@@ -454,6 +418,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
alt_label = PC++; alt_label = PC++;
EMIT(start, SPLIT); EMIT(start, SPLIT);
EMIT(start + 1, REL(start, PC)); EMIT(start + 1, REL(start, PC));
prog->splits++;
prog->len += 2; prog->len += 2;
term = PC; term = PC;
break; break;
@@ -502,6 +467,7 @@ int re_comp(rcode *prog, const char *re, int anchored)
prog->len = 0; prog->len = 0;
prog->unilen = 0; prog->unilen = 0;
prog->sub = 0; prog->sub = 0;
prog->splits = 0;
// Add code to implement non-anchored operation ("search"). // Add code to implement non-anchored operation ("search").
// For anchored operation ("match"), this code will be just skipped. // For anchored operation ("match"), this code will be just skipped.
@@ -516,6 +482,7 @@ int re_comp(rcode *prog, const char *re, int anchored)
prog->insts[prog->unilen++] = SAVE; prog->insts[prog->unilen++] = SAVE;
prog->insts[prog->unilen++] = 0; prog->insts[prog->unilen++] = 0;
prog->len += 4; prog->len += 4;
prog->splits++;
} }
int res = _compilecode(&re, prog, /*sizecode*/0); int res = _compilecode(&re, prog, /*sizecode*/0);
if (res < 0) return res; if (res < 0) return res;
@@ -524,136 +491,141 @@ int re_comp(rcode *prog, const char *re, int anchored)
prog->insts[prog->unilen++] = SAVE; prog->insts[prog->unilen++] = SAVE;
prog->insts[prog->unilen++] = 1; prog->insts[prog->unilen++] = 1;
prog->insts[prog->unilen++] = MATCH; prog->insts[prog->unilen++] = MATCH;
prog->len += 2; prog->len += 2;
return RE_SUCCESS; return RE_SUCCESS;
} }
static void addthread(const int *pbeg, int *plist, int gen, rthreadlist *l, #define addthread(nn, list, _pc, _sub, _sp, cont) \
int *pc, rsub *sub, const char *beg, const char *sp) { \
{ int i = 0, j, *pc = _pc; \
int i = 0, *pcs[10]; rsub *s1, *sub = _sub; \
rsub *subs[10]; rec##nn: \
rec: if(plist[pc - prog->insts] == gen) { \
if(plist[pc - pbeg] == gen) { sub->ref--; \
decref(sub); rec_check##nn: \
rec_check: if (i) { \
if (i) { pc = pcs[--i]; \
pc = pcs[--i]; sub = subs[i]; \
sub = subs[i]; goto rec##nn; \
goto rec; } \
} cont; \
return; // already on list } \
} plist[pc - prog->insts] = gen; \
plist[pc - pbeg] = gen; switch(*pc) { \
default: \
switch(*pc) { list->t[list->n].sub = sub; \
default: list->t[list->n++].pc = pc; \
l->t[l->n].sub = sub; goto rec_check##nn; \
l->t[l->n++].pc = pc; case JMP: \
goto rec_check; pc += 2 + pc[1]; \
case JMP: goto rec##nn; \
pc += 2 + pc[1]; case SPLIT: \
goto rec; subs[i] = sub; \
case SPLIT: sub->ref++; \
subs[i] = sub; pc += 2; \
sub->ref++; pcs[i++] = pc + pc[-1]; \
pc += 2; goto rec##nn; \
pcs[i++] = pc + pc[-1]; case RSPLIT: \
goto rec; subs[i] = sub; \
case RSPLIT: sub->ref++; \
subs[i] = sub; pc += 2; \
sub->ref++; pcs[i++] = pc; \
pc += 2; pc += pc[-1]; \
pcs[i++] = pc; goto rec##nn; \
pc += pc[-1]; case SAVE: \
goto rec; if (sub->ref > 1) { \
case SAVE: for (j = 0; j < subidx; j++) { \
sub = update(sub, pc[1], sp); if (nsubs[j].ref <= 0) { \
pc += 2; s1 = &nsubs[j]; \
goto rec; goto freedsub##nn; \
case BOL: } \
if(sp != beg) } \
goto rec_check; s1 = &nsubs[subidx++]; \
pc++; goto rec; freedsub##nn: \
case EOL: for (j = 0; j < nsubp; j++) \
if(*sp) s1->sub[j] = sub->sub[j]; \
goto rec_check; sub = s1; \
pc++; goto rec; sub->ref = 1; \
} } \
} sub->sub[pc[1]] = _sp; \
pc += 2; \
goto rec##nn; \
case BOL: \
if(_sp != s) \
goto rec_check##nn; \
pc++; goto rec##nn; \
case EOL: \
if(*(_sp)) \
goto rec_check##nn; \
pc++; goto rec##nn; \
} \
} \
int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp) int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
{ {
int i, c, l, gen, *pc; int i, c, l, *npc, gen = 1, subidx = 1;
const char *sp; const char *sp;
rsub nsubs[256];
int plist[prog->unilen]; int plist[prog->unilen];
rsub *sub, *matched = NULL; int *pcs[prog->splits];
rsub *subs[prog->splits];
rsub *nsub = nsubs, *matched = NULL;
rthreadlist _clist[1+prog->len]; rthreadlist _clist[1+prog->len];
rthreadlist _nlist[1+prog->len]; rthreadlist _nlist[1+prog->len];
rthreadlist *clist = _clist, *nlist = _nlist, *tmp; rthreadlist *clist = _clist, *nlist = _nlist, *tmp;
memset(plist, 0, prog->unilen*sizeof(plist[0])); memset(plist, 0, prog->unilen*sizeof(plist[0]));
memset(clist, 0, (1+prog->len)*sizeof(rthread)); memset(clist, 0, (1+prog->len)*sizeof(rthread));
memset(nlist, 0, (1+prog->len)*sizeof(rthread)); memset(nlist, 0, (1+prog->len)*sizeof(rthread));
nsub->ref = 1;
subidx = 0; for(i=0; i<nsubp; i++) {
freesub = NULL;
for(i=0; i<nsubp; i++)
subp[i] = NULL; subp[i] = NULL;
sub = newsub(nsubp); nsub->sub[i] = NULL;
for(i=0; i<nsubp; i++) }
sub->sub[i] = NULL;
gen = 1; gen = 1;
addthread(prog->insts, plist, gen, clist, prog->insts, sub, s, s); while (1)
addthread(1, clist, prog->insts, nsub, s, break)
for(sp=s;; sp += l) { for(sp=s;; sp += l) {
if(clist->n == 0) if(clist->n == 0)
break; break;
gen++; uc_len(l, s) gen++; uc_len(l, s)
for(i=0; i<clist->n; i++) { for(i=0; i<clist->n; i++) {
pc = clist->t[i].pc; npc = clist->t[i].pc;
sub = clist->t[i].sub; nsub = clist->t[i].sub;
if (inst_is_consumer(*pc) && !*sp) { if (inst_is_consumer(*npc) && !*sp) {
// If we need to match a character, but there's none left, // If we need to match a character, but there's none left,
// it's fail (we don't schedule current thread for continuation) // it's fail (we don't schedule current thread for continuation)
decref(sub); nsub->ref--;
continue; continue;
} }
switch(*pc++) { switch(*npc++) {
case CHAR: case CHAR:
uc_code(c, sp) uc_code(c, sp)
if(c != *pc++) { if(c != *npc++)
decref(sub);
break; break;
}
case ANY: case ANY:
addthread: addthread:
addthread(prog->insts, plist, gen, nlist, pc, sub, s, sp+l); addthread(2, nlist, npc, nsub, sp+l, continue)
break;
case CLASS: case CLASS:
if (!re_classmatch(pc, sp)) { if (!re_classmatch(npc, sp))
decref(sub);
break; break;
} npc += *(npc+1) * 2 + 2;
pc += *(pc+1) * 2 + 2;
goto addthread; goto addthread;
case NAMEDCLASS: case NAMEDCLASS:
if (!re_namedclassmatch(pc, sp)) { if (!re_namedclassmatch(npc, sp))
decref(sub);
break; break;
} npc++;
pc++;
goto addthread; goto addthread;
case MATCH: case MATCH:
if(matched) matched = nsub;
decref(matched);
matched = sub;
for(i++; i < clist->n; i++) for(i++; i < clist->n; i++)
decref(clist->t[i].sub); clist->t[i].sub->ref--;
goto BreakFor; goto BreakFor;
} }
nsub->ref--;
} }
BreakFor: BreakFor:
tmp = clist; tmp = clist;
@@ -664,7 +636,6 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
if(matched) { if(matched) {
for(i=0; i<nsubp; i++) for(i=0; i<nsubp; i++)
subp[i] = matched->sub[i]; subp[i] = matched->sub[i];
decref(matched);
return 1; return 1;
} }
return 0; return 0;
@@ -679,21 +650,24 @@ int main(int argc, char *argv[])
int sz = re_sizecode(argv[1]) * sizeof(int); int sz = re_sizecode(argv[1]) * sizeof(int);
printf("Precalculated size: %d\n", sz); printf("Precalculated size: %d\n", sz);
char code[sizeof(rcode)+sz]; char code[sizeof(rcode)+sz];
rcode *_code = (rcode*)&code; rcode *_code = (rcode*)code;
if (re_comp(_code, argv[1], 0)) if (re_comp(_code, argv[1], 0))
re_fatal("Error in re_comp"); re_fatal("Error in re_comp");
re_dumpcode(_code); re_dumpcode(_code);
#include <time.h>
if (argc > 2) { if (argc > 2) {
int sub_els = (_code->sub + 1) * 2; int sub_els = (_code->sub + 1) * 2;
const char *sub[sub_els]; const char *sub[sub_els];
for (int i = 2; i < argc; i++) { for (int i = 2; i < argc; i++) {
printf("sub depth %d\n", subidx);
printf("input bytelen: %d\n", strlen(argv[i])); printf("input bytelen: %d\n", strlen(argv[i]));
clock_t start_time = clock();
if(!re_pikevm(_code, argv[i], sub, sub_els)) if(!re_pikevm(_code, argv[i], sub, sub_els))
{ printf("-nomatch-\n"); continue; } { printf("-nomatch-\n"); continue; }
for(int k=sub_els; k>0; k--) for(int k=sub_els; k>0; k--)
if(sub[k-1]) if(sub[k-1])
break; break;
double elapsed_time = (double)(clock() - start_time) / CLOCKS_PER_SEC;
printf("Done in %f seconds\n", elapsed_time);
for(int l=0; l<sub_els; l+=2) { for(int l=0; l<sub_els; l+=2) {
printf("("); printf("(");
if(sub[l] == NULL) if(sub[l] == NULL)
@@ -709,7 +683,6 @@ int main(int argc, char *argv[])
} }
printf("\n"); printf("\n");
} }
} }
return 0; return 0;
} }

16
test.sh
View File

@@ -46,6 +46,8 @@ b[^c]*
([^abc])|(a+) ([^abc])|(a+)
[a-g]+ [a-g]+
[а-г]+ [а-г]+
called|chief|dust|familiar|forth|waif|campaign|divers|smile|notice|kill|human|stands|nightshade|dollar|doughty|gloaming|twist|July|officers|wrest|coop|one|ability|welcome|significance|writer|spring|it's|helped|set|Paris|from|coomb|stay|hummock|taken|anon|makes|boat|nearly|am|justice|further|expression|contemporary|sooth|order|about|question|lived|apply|educational|of|night|satisfy|opened|never|success|until|visit|promise|parts|beneath|matter|typical|bade|apartment|rapidly|primary|bring|throat|hold|laws|understand|trade|desire|material|evidence|another|often|plash|model|someone|bond|hell|relationship|probably|exercise|performance|wants|known|countries|gammer|leeward|took|itself|representative|objection|aircraft
abc+h+d+f
" "
input="\ input="\
abcdef abcdef
@@ -93,6 +95,8 @@ abc
aaaa aaaa
aaaabcdefghij aaaabcdefghij
ааааабвг... ааааабвг...
hhfd h23 performance
abcccccccccccchdf
" "
expect="\ expect="\
(0,3) (0,3)
@@ -140,18 +144,22 @@ expect="\
(0,4)(?,?)(0,4) (0,4)(?,?)(0,4)
(0,10) (0,10)
(0,16) (0,16)
(10,21)
(0,17)
(0,0) (0,0)
" "
c=1 c=1
echo "$regex" | tr '\n' | while read re; do echo "$regex" | tr '\n' | while read re; do
inp=$(echo "$input" | awk -v c=$c 'BEGIN{ RS = "" ; FS = "\n" }{print $c}') inp=$(echo "$input" | awk -v c=$c 'BEGIN{ RS = "" ; FS = "\n" }{print $c}')
exp=$(echo "$expect" | awk -v c=$c 'BEGIN{ RS = "" ; FS = "\n" }{print $c}') exp=$(echo "$expect" | awk -v c=$c 'BEGIN{ RS = "" ; FS = "\n" }{print $c}')
var=$(echo $(./a.out "$re" "$inp" | awk 'END{print}')) var=$(./a.out "$re" "$inp")
if [ ! "$exp" = "$var" ]; then var1=$(echo "$var" | tail -1)
echo "fail test$c regex:$re input:$inp expect:$exp output:$var" if [ ! "$exp" = "$var1" ]; then
echo "fail test$c regex:$re input:$inp expect:$exp output:$var1"
exit 1 exit 1
fi fi
echo "pass test$c regex:$re input:$inp expect:$exp output:$var" time=$(echo "$var" | tail -2 | head -n1)
echo "pass test$c regex:$re input:$inp expect:$exp output:$var1 $time"
c=$((c+1)) c=$((c+1))
done done