commit ad347d54b91a30eb9c883da66fe73ae0b310cfb6 Author: Kyryl Melekhin Date: Wed Jun 30 20:03:21 2021 +0000 push diff --git a/README b/README new file mode 100644 index 0000000..467a3a3 --- /dev/null +++ b/README @@ -0,0 +1,52 @@ +What is pikevm? +============== + +re1 (http://code.google.com/p/re1/) is "toy regular expression implementation" +by Russel Cox, featuring simplicity and minimal code size unheard of in other +implementations. re2 (http://code.google.com/p/re2/) is "an efficient, +principled regular expression library" by the same author. It is robust, +full-featured, and ... bloated, comparing to re1. + +This is implementation of pikevm based on re1.5 which adds features required for +minimalistic real-world use, while sticking to the minimal code size and +memory use. +https://github.com/pfalcon/re1.5 + +Why? +==== +Pikevm guarantees that any input regex will scale O(n) with the size of the +string, thus making it the fastest regex implementation. There is no backtracking +that usually expodes to O(n^2). My goals were to explore this code and try +to use in my text editor, but after closer analysis pike performs roughly +3 times slower on small strings than traditional well optimized backtrack +engine. The cost of addthread is not exactly O(1) so it results in many +extra operations since every character is processed in lockstep. There is +also a problem of submatch tracking that grows memory usage. + +Features +======== + +* UnLike re1.5, here is only pikevm, one file easy to use. +* Unlike re1.5, regexes is compiled to type sized code rather than bytecode, +eliviating the problem of byte overflow in splits/jmps on large regexes. +Currently the type used is int, and every atom in compiled code is aligned +to that. +* Matcher does not take size of string as param, it checks for '\0' instead, +so that the user does not need to waste time taking strlen() +* Support for quoted chars in regex. +* Support for ^, $ assertions in regex. +* Support for "match" vs "search" operations, as common in other regex APIs. +* Support for named character classes: \d \D \s \S \w \W. +* Support for repetition operator {n} and {n,m}. + +TODO +==== + +* Support for Unicode (UTF-8). (trivial to do, because of int type sized code) +* Support for matching flags like case-insensitive, dot matches all, +multiline, etc. +* Support for more assertions like \A, \Z. + +Author and License +================== +licensed under BSD license, just as the original re1. diff --git a/pike.c b/pike.c new file mode 100644 index 0000000..5cd2598 --- /dev/null +++ b/pike.c @@ -0,0 +1,701 @@ +// Copyright 2007-2009 Russ Cox. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include +#include +#include +#include + +#define nelem(x) (sizeof(x)/sizeof((x)[0])) + +typedef struct rinst rinst; +struct rinst +{ + int opcode; + int c; + int n; + rinst *x; + rinst *y; +}; + +typedef struct rprog rprog; +struct rprog +{ + rinst *start; + int len; +}; + +typedef struct rcode rcode; +struct rcode +{ + int unilen; + int len; + int sub; + int insts[]; +}; + +enum /* rinst.opcode */ +{ + // Instructions which consume input bytes (and thus fail if none left) + CHAR = 1, + ANY, + CLASS, + CLASSNOT, + NAMEDCLASS, + // Assert position + BOL = 0x50, + EOL, + // Instructions which take relative offset as arg + JMP = 0x60, + SPLIT, + RSPLIT, + // Other (special) instructions + SAVE = 0x7e, + MATCH = 0x7f, +}; + +// Return codes for re_sizecode() and re_comp() +enum { + RE_SUCCESS = 0, + RE_SYNTAX_ERROR = -2, + RE_UNSUPPORTED_ESCAPE = -3, + RE_UNSUPPORTED_SYNTAX = -4, +}; + +#define inst_is_consumer(inst) ((inst) < BOL) +typedef struct rsub rsub; +struct rsub +{ + int ref; + int nsub; + const char *sub[128]; +}; + +typedef struct rthread rthread; +struct rthread +{ + int *pc; + rsub *sub; +}; + +typedef struct rthreadlist rthreadlist; +struct rthreadlist +{ + int n; + rthread t[1]; +}; + +#define INSERT_CODE(at, num, pc) \ +if (code) \ + memmove(code + at + num, code + at, (pc - at)*sizeof(int)); \ +pc += num; +#define REL(at, to) (to - at - 2) +#define EMIT(at, byte) (code ? (code[at] = byte) : at) +#define PC (prog->unilen) + +void re_fatal(char *msg) +{ + fprintf(stderr, "fatal error: %s\n", msg); + exit(2); +} + +static rsub *freesub; +static rsub subs[10]; +static int subidx; + +rsub* newsub(int n) +{ + rsub *s = freesub; + if(s != NULL) + freesub = (rsub*)s->sub[0]; + else + s = &subs[subidx++]; + s->nsub = n; + s->ref = 1; + return s; +} + +rsub* incref(rsub *s) +{ + s->ref++; + return s; +} + +rsub* update(rsub *s, int i, const char *p) +{ + rsub *s1; + int j; + + if(s->ref > 1) { + s1 = newsub(s->nsub); + for(j=0; jnsub; j++) + s1->sub[j] = s->sub[j]; + s->ref--; + s = s1; + } + s->sub[i] = p; + return s; +} + +void decref(rsub *s) +{ + if(--s->ref == 0) { + s->sub[0] = (char*)freesub; + freesub = s; + } +} + +int re_classmatch(const int *pc, const char *sp) +{ + // pc points to "cnt" byte after opcode + int is_positive = (pc[-1] == CLASS); + int cnt = *pc++; + while (cnt--) { + if (*sp >= *pc && *sp <= pc[1]) return is_positive; + pc += 2; + } + return !is_positive; +} + +int re_namedclassmatch(const int *pc, const char *sp) +{ + // pc points to name of class + int off = (*pc >> 5) & 1; + if ((*pc | 0x20) == 'd') { + if (!(*sp >= '0' && *sp <= '9')) + off ^= 1; + } else if ((*pc | 0x20) == 's') { + if (!(*sp == ' ' || (*sp >= '\t' && *sp <= '\r'))) + off ^= 1; + } else { // w + if (!((*sp >= 'A' && *sp <= 'Z') || (*sp >= 'a' && *sp <= 'z') || + (*sp >= '0' && *sp <= '9') || *sp == '_')) + off ^= 1; + } + return off; +} + +void re_dumpcode(rcode *prog) +{ + int pc = 0; + int *code = prog->insts; + while (pc < prog->unilen) { + printf("%4d: ", pc); + switch(code[pc++]) { + default: + pc = prog->unilen; + break; + case SPLIT: + printf("split %d (%d)\n", pc + code[pc] + 1, code[pc]); + pc++; + break; + case RSPLIT: + printf("rsplit %d (%d)\n", pc + code[pc] + 1, code[pc]); + pc++; + break; + case JMP: + printf("jmp %d (%d)\n", pc + code[pc] + 1, code[pc]); + pc++; + break; + case CHAR: + printf("char %c\n", code[pc]); + pc++; + break; + case ANY: + printf("any\n"); + break; + case CLASS: + case CLASSNOT: { + int num = code[pc]; + printf("class%s %d", (code[pc - 1] == CLASSNOT ? "not" : ""), num); + pc++; + while (num--) { + printf(" 0x%02x-0x%02x", code[pc], code[pc + 1]); + pc += 2; + } + printf("\n"); + break; + } + case NAMEDCLASS: + printf("namedclass %c\n", code[pc++]); + break; + case MATCH: + printf("match\n"); + break; + case SAVE: + printf("save %d\n", code[pc++]); + break; + case BOL: + printf("assert bol\n"); + break; + case EOL: + printf("assert eol\n"); + break; + } + } + printf("Unilen: %d, insts: %d\n", prog->unilen, prog->len); +} + +static int _compilecode(const char **re_loc, rcode *prog, int sizecode) +{ + const char *re = *re_loc; + int *code = sizecode ? NULL : prog->insts; + int start = PC; + int term = PC; + int alt_label = 0; + + for (; *re && *re != ')'; re++) { + switch (*re) { + case '\\':; + re++; + if (!*re) goto syntax_error; // Trailing backslash + char c = *re | 0x20; + if (c == 'd' || c == 's' || c == 'w') { + term = PC; + EMIT(PC++, NAMEDCLASS); + EMIT(PC++, *re); + prog->len++; + break; + } + if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z')) + goto unsupported_escape; + default: + term = PC; + EMIT(PC++, CHAR); + EMIT(PC++, *re); + prog->len++; + break; + case '.': + term = PC; + EMIT(PC++, ANY); + prog->len++; + break; + case '[':; + int cnt; + term = PC; + re++; + if (*re == '^') { + EMIT(PC++, CLASSNOT); + re++; + } else { + EMIT(PC++, CLASS); + } + PC++; // Skip "# of pairs" byte + prog->len++; + for (cnt = 0; *re != ']'; re++, cnt++) { + if (!*re) goto syntax_error; + if (*re == '\\') { + re++; + if (!*re) goto syntax_error; + if (*re != '\\' && *re != ']') + goto unsupported_escape; + } + EMIT(PC++, *re); + if (re[1] == '-' && re[2] != ']') + re += 2; + EMIT(PC++, *re); + } + EMIT(term + 1, cnt); + break; + case '(':; + term = PC; + int sub; + int capture = 1; + re++; + if (*re == '?') { + re++; + if (*re == ':') { + capture = 0; + re++; + } else { + *re_loc = re; + return RE_UNSUPPORTED_SYNTAX; + } + } + + if (capture) { + sub = ++prog->sub; + EMIT(PC++, SAVE); + EMIT(PC++, 2 * sub); + prog->len++; + } + int res = _compilecode(&re, prog, sizecode); + *re_loc = re; + if (res < 0) return res; + if (*re != ')') return RE_SYNTAX_ERROR; + + if (capture) { + EMIT(PC++, SAVE); + EMIT(PC++, 2 * sub + 1); + prog->len++; + } + break; + case '{':; + int maxcnt = 0, mincnt = 0, + i = 0, icnt = 1, size, split; + re++; + while (isdigit((unsigned char) *re)) + mincnt = mincnt * 10 + *re++ - '0'; + if (*re == ',') { + re++; + if (*re == '}') + maxcnt = 256; + while (isdigit((unsigned char) *re)) + maxcnt = maxcnt * 10 + *re++ - '0'; + } else + maxcnt = mincnt; + for (size = PC - term; i < mincnt-1; i++) { + if (code) + memcpy(&code[PC], &code[term], size*sizeof(int)); + PC += size; + } + split = *(re+1) == '[' ? RSPLIT : SPLIT; + for (i = maxcnt-mincnt; i > 0; i--) + { + EMIT(PC++, split); + EMIT(PC++, REL(PC, PC+((size+2)*i))); + if (code) + memcpy(&code[PC], &code[term], size*sizeof(int)); + PC += size; + } + if (code) { + for (i = 0; i < size; i++) + switch (code[term]) { + case CLASS: + case CLASSNOT: + case NAMEDCLASS: + case JMP: + case SPLIT: + case RSPLIT: + case SAVE: + case CHAR: + icnt++; + } + } + prog->len += maxcnt * icnt; + break; + case '?': + if (PC == term) goto syntax_error; // nothing to repeat + INSERT_CODE(term, 2, PC); + if (re[1] == '?') { + EMIT(term, RSPLIT); + re++; + } else { + EMIT(term, SPLIT); + } + EMIT(term + 1, REL(term, PC)); + prog->len++; + term = PC; + break; + case '*': + if (PC == term) goto syntax_error; // nothing to repeat + INSERT_CODE(term, 2, PC); + EMIT(PC, JMP); + EMIT(PC + 1, REL(PC, term)); + PC += 2; + if (re[1] == '?') { + EMIT(term, RSPLIT); + re++; + } else { + EMIT(term, SPLIT); + } + EMIT(term + 1, REL(term, PC)); + prog->len += 2; + term = PC; + break; + case '+': + if (PC == term) goto syntax_error; // nothing to repeat + if (re[1] == '?') { + EMIT(PC, SPLIT); + re++; + } else { + EMIT(PC, RSPLIT); + } + EMIT(PC + 1, REL(PC, term)); + PC += 2; + prog->len++; + term = PC; + break; + case '|': + if (alt_label) { + EMIT(alt_label, REL(alt_label, PC) + 1); + } + INSERT_CODE(start, 2, PC); + EMIT(PC++, JMP); + alt_label = PC++; + EMIT(start, SPLIT); + EMIT(start + 1, REL(start, PC)); + prog->len += 2; + term = PC; + break; + case '^': + EMIT(PC++, BOL); + prog->len++; + term = PC; + break; + case '$': + EMIT(PC++, EOL); + prog->len++; + term = PC; + break; + } + } + if (alt_label) { + EMIT(alt_label, REL(alt_label, PC) + 1); + } + *re_loc = re; + return RE_SUCCESS; +syntax_error: + *re_loc = re; + return RE_SYNTAX_ERROR; +unsupported_escape: + *re_loc = re; + return RE_UNSUPPORTED_ESCAPE; +} + +int re_sizecode(const char *re) +{ + rcode dummyprog; + // SAVE 0, SAVE 1, MATCH; more bytes for "search" (vs "match") prefix code + dummyprog.unilen = 10; + + int res = _compilecode(&re, &dummyprog, /*sizecode*/1); + if (res < 0) return res; + // If unparsed chars left + if (*re) return RE_SYNTAX_ERROR; + + return dummyprog.unilen; +} + +int re_comp(rcode *prog, const char *re, int anchored) +{ + prog->len = 0; + prog->unilen = 0; + prog->sub = 0; + + // Add code to implement non-anchored operation ("search"). + // For anchored operation ("match"), this code will be just skipped. + // TODO: Implement search in much more efficient manner + if (!anchored) { + prog->insts[prog->unilen++] = RSPLIT; + prog->insts[prog->unilen++] = 3; + prog->insts[prog->unilen++] = ANY; + prog->insts[prog->unilen++] = JMP; + prog->insts[prog->unilen++] = -5; + + prog->insts[prog->unilen++] = SAVE; + prog->insts[prog->unilen++] = 0; + prog->len += 4; + } + int res = _compilecode(&re, prog, /*sizecode*/0); + if (res < 0) return res; + // If unparsed chars left + if (*re) return RE_SYNTAX_ERROR; + + prog->insts[prog->unilen++] = SAVE; + prog->insts[prog->unilen++] = 1; + + prog->insts[prog->unilen++] = MATCH; + prog->len += 2; + + return RE_SUCCESS; +} + +void cleanmarks(rcode *prog) +{ + int *pc = prog->insts; + int *end = pc + prog->unilen; + while (pc < end) { + *pc &= 0x7f; + switch (*pc) { + case CLASS: + case CLASSNOT: + pc += pc[1] * 2; + case NAMEDCLASS: + case JMP: + case SPLIT: + case RSPLIT: + case SAVE: + case CHAR: + pc++; + break; + } + pc++; + } +} + +static rthread thread(int *pc, rsub *sub) +{ + rthread t = {pc, sub}; + return t; +} + +static void addthread(rthreadlist *l, rthread t, const char *beg, const char *sp) +{ + int off; + if(*t.pc & 0x80) { + decref(t.sub); + return; // already on list + } + *t.pc |= 0x80; + + switch(*t.pc & 0x7f) { + default: + l->t[l->n++] = t; + break; + case JMP: + off = t.pc[1]; + t.pc += 2; + addthread(l, thread(t.pc + off, t.sub), beg, sp); + break; + case SPLIT: + off = t.pc[1]; + t.pc += 2; + addthread(l, thread(t.pc, incref(t.sub)), beg, sp); + addthread(l, thread(t.pc + off, t.sub), beg, sp); + break; + case RSPLIT: + off = t.pc[1]; + t.pc += 2; + addthread(l, thread(t.pc + off, incref(t.sub)), beg, sp); + addthread(l, thread(t.pc, t.sub), beg, sp); + break; + case SAVE: + off = t.pc[1]; + t.pc += 2; + addthread(l, thread(t.pc, update(t.sub, off, sp)), beg, sp); + break; + case BOL: + if(sp == beg) + addthread(l, thread(t.pc + 1, t.sub), beg, sp); + break; + case EOL: + if(!*sp) + addthread(l, thread(t.pc + 1, t.sub), beg, sp); + break; + } +} + +int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp) +{ + int i, *pc; + const char *sp; + rsub *sub, *matched = NULL; + rthreadlist _clist[1+prog->len]; + rthreadlist _nlist[1+prog->len]; + rthreadlist *clist = _clist, *nlist = _nlist, *tmp; + memset(clist, 0, (1+prog->len)*sizeof(rthread)); + memset(nlist, 0, (1+prog->len)*sizeof(rthread)); + + subidx = 0; + freesub = NULL; + for(i=0; isub[i] = NULL; + + cleanmarks(prog); + addthread(clist, thread(prog->insts, sub), s, s); + for(sp=s;; sp++) { + if(clist->n == 0) + break; + cleanmarks(prog); + for(i=0; in; i++) { + pc = clist->t[i].pc; + sub = clist->t[i].sub; + if (inst_is_consumer(*pc & 0x7f) && !*sp) { + // If we need to match a character, but there's none left, + // it's fail (we don't schedule current thread for continuation) + decref(sub); + continue; + } + switch(*pc++ & 0x7f) { + case CHAR: + if(*sp != *pc++) { + decref(sub); + break; + } + case ANY: + addthread: + addthread(nlist, thread(pc, sub), s, sp+1); + break; + case CLASS: + case CLASSNOT: + if (!re_classmatch(pc, sp)) { + decref(sub); + break; + } + pc += *pc * 2 + 1; + goto addthread; + case NAMEDCLASS: + if (!re_namedclassmatch(pc, sp)) { + decref(sub); + break; + } + pc++; + goto addthread; + case MATCH: + if(matched) + decref(matched); + matched = sub; + for(i++; i < clist->n; i++) + decref(clist->t[i].sub); + goto BreakFor; + } + } + BreakFor: + tmp = clist; + clist = nlist; + nlist = tmp; + nlist->n = 0; + } + if(matched) { + for(i=0; isub[i]; + decref(matched); + return 1; + } + return 0; +} + +int main(int argc, char *argv[]) +{ + int sz = re_sizecode(argv[1]) * sizeof(int); + printf("Precalculated size: %d\n", sz); + char code[sizeof(rcode)+sz]; + rcode *_code = (rcode*)&code; + if (re_comp(_code, argv[1], 0)) + re_fatal("Error in re_comp"); + re_dumpcode(_code); + if (argc > 2) { + int sub_els = (_code->sub + 1) * 2; + const char *sub[sub_els]; + for (int i = 2; i < argc; i++) { + printf("sub depth %d\n", subidx); + if(!re_pikevm(_code, argv[i], sub, sub_els)) + { printf("-no match-\n"); continue; } + for(int k=sub_els; k>0; k--) + if(sub[k-1]) + break; + for(int l=0; l