// Copyright 2007-2009 Russ Cox. All Rights Reserved.
// Use of this source code is governed by a BSD-style

/*
 * NOTE(review): the header names of the #include directives were lost in
 * this copy of the file. The headers below are the ones required by the
 * identifiers this file actually uses (isalnum/isdigit, printf,
 * memcpy/memmove/strlen, clock) -- confirm against the upstream file.
 */
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

/*
 * Byte length of a UTF-8 sequence, indexed by its first byte.
 * Entry 0 is 0, so "advance by the length" stays put on the terminating
 * NUL; every byte that is not a 2-, 3- or 4-byte lead maps to 1.
 */
const unsigned char utf8_length[256] = {
	/*	0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F */
	/* 0 */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 1 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 2 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 3 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 4 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 5 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 6 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 7 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* A */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* B */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* C */ 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* D */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* E */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* F */ 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};

/*
 * return the length of a utf-8 character
 * Expands to a full statement (trailing ';' included): call sites write
 * `uc_len(n, p)` with no semicolon.
 */
#define uc_len(dst, s) \
dst = utf8_length[(unsigned char)s[0]];

/*
 * the unicode codepoint of the given utf-8 character
 * Expands to an if/else chain ending in a complete statement; call sites
 * write `uc_code(c, p)` with no semicolon. A byte without both top bits
 * set is returned unchanged; a lead byte with all of bits 7..3 set
 * yields 0.
 * NOTE(review): continuation bytes s[1..3] are read without checking for
 * the terminating NUL, so a truncated sequence at the end of the string
 * reads past the terminator.
 */
#define uc_code(dst, s) \
dst = (unsigned char) s[0]; \
if (~dst & 0xc0); \
else if (~dst & 0x20) \
	dst = ((dst & 0x1f) << 6) | (s[1] & 0x3f); \
else if (~dst & 0x10) \
	dst = ((dst & 0x0f) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f); \
else if (~dst & 0x08) \
	dst = ((dst & 0x07) << 18) | ((s[1] & 0x3f) << 12) | \
		((s[2] & 0x3f) << 6) | (s[3] & 0x3f); \
else \
	dst = 0;

/*
 * Word-character test used by the \< and \> assertions: ASCII
 * alphanumerics, '_' and every byte above 127 count as word characters.
 */
static int isword(const char *s)
{
	int c = (unsigned char) s[0];
	return isalnum(c) || c == '_' || c > 127;
}

/* Compiled program. `insts` is a flat array of int-sized words. */
typedef struct rcode rcode;
struct rcode
{
	int unilen;	/* number of words used in insts (see PC) */
	int len;	/* instruction count; sizes the thread lists in re_pikevm */
	int sub;	/* number of capture groups compiled so far */
	int presub;	/* group count given to re_comp; offsets closing SAVE slots */
	int splits;	/* number of SPLIT/RSPLIT emitted; sizes the VM's split stack */
	int gen;	/* generation stamp carried between re_pikevm calls */
	int insts[];
};

enum
{
	// Instructions which consume input bytes (and thus fail if none left)
	CHAR = 1,
	ANY,
	CLASS,
	MATCH,
	// Assert position
	WBEG,
	WEND,
	BOL,
	EOL,
	// Instructions which take relative offset as arg
	JMP,
	SPLIT,
	RSPLIT,
	// Other (special) instructions
	SAVE,
};

// Return codes for re_sizecode() and re_comp()
enum {
	RE_SUCCESS = 0,
	RE_SYNTAX_ERROR = -2,
	RE_UNSUPPORTED_SYNTAX = -3,
};

/* Reference-counted capture vector shared between VM threads. */
typedef struct rsub rsub;
struct rsub
{
	int ref;
	const char *sub[];
};

/* One VM thread: program position plus its capture vector. */
typedef struct rthread rthread;
struct rthread
{
	int *pc;
	rsub *sub;
};

/*
 * Open a gap of `num` words at index `at` (shifting existing code up when
 * actually emitting) and advance `pc` by `num`. Expands to two statements.
 */
#define INSERT_CODE(at, num, pc) \
if (code) \
	memmove(code + at + num, code + at, (pc - at)*sizeof(int)); \
pc += num;
/*
 * Relative offset operand. WARNING: the arguments are intentionally NOT
 * parenthesized -- call sites such as REL(PC-1, term) rely on the textual
 * expansion `term - PC-1 - 2` (i.e. term - PC - 3). Adding parentheses
 * here changes the generated offsets.
 */
#define REL(at, to) (to - at - 2)
/* Store `byte` at index `at`; in size-only mode (code == NULL) only `at`
 * is evaluated. */
#define EMIT(at, byte) (code ? (code[at] = byte) : at)
#define PC (prog->unilen)

/*
 * Test codepoint `c` against a CLASS operand list.
 * `pc` points at the word after the opcode: a flag (1 = positive class,
 * 0 = negated), the number of ranges, then that many inclusive lo/hi pairs.
 * Returns the flag when some range contains `c`, its negation otherwise.
 */
static int re_classmatch(const int *pc, int c)
{
	// pc points to "classnot" byte after opcode
	int is_positive = *pc++;
	int cnt = *pc++;
	while (cnt--) {
		if (c >= *pc && c <= pc[1]) return is_positive;
		pc += 2;
	}
	return !is_positive;
}

/*
 * Print a disassembly of `prog` to stdout, one instruction per line,
 * followed by the unilen/len totals. An unknown opcode ends the listing.
 */
void re_dumpcode(rcode *prog)
{
	int pc = 0;
	int *code = prog->insts;
	while (pc < prog->unilen) {
		printf("%4d: ", pc);
		switch(code[pc++]) {
		default:
			pc = prog->unilen;
			break;
		case SPLIT:
			printf("split %d (%d)\n", pc + code[pc] + 2, code[pc]);
			pc+=2;
			break;
		case RSPLIT:
			printf("rsplit %d (%d)\n", pc + code[pc] + 2, code[pc]);
			pc+=2;
			break;
		case JMP:
			printf("jmp %d (%d)\n", pc + code[pc] + 1, code[pc]);
			pc++;
			break;
		case CHAR:
			printf("char %c\n", code[pc]);
			pc++;
			break;
		case ANY:
			printf("any\n");
			break;
		case CLASS: {
			pc += 2;
			int num = code[pc - 1];
			printf("class%s %d", (code[pc - 2] ? "" : "not"), num);
			while (num--) {
				printf(" 0x%02x-0x%02x", code[pc], code[pc + 1]);
				pc += 2;
			}
			printf("\n");
			break;
		}
		case MATCH:
			printf("match\n");
			break;
		case SAVE:
			printf("save %d\n", code[pc++]);
			break;
		case WBEG:
			printf("assert wbeg\n");
			break;
		case WEND:
			printf("assert wend\n");
			break;
		case BOL:
			printf("assert bol\n");
			break;
		case EOL:
			printf("assert eol\n");
			break;
		}
	}
	printf("Unilen: %d, insts: %d\n", prog->unilen, prog->len);
}

/*
 * Compile the regex at *re_loc up to the end of the string or an
 * unmatched ')'. With `sizecode` non-zero nothing is stored and only
 * prog->unilen (and the counters) advance; otherwise words are written to
 * prog->insts. On return *re_loc points at the first unconsumed character
 * (or at the offending one on error).
 * Returns RE_SUCCESS, RE_SYNTAX_ERROR or RE_UNSUPPORTED_SYNTAX.
 */
static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
{
	const char *re = *re_loc;
	int *code = sizecode ? NULL : prog->insts;
	/* start: first word of this (sub)expression, used by '|'.
	 * term: first word of the most recent atom, used by quantifiers. */
	int start = PC, term = PC;
	int alt_label = 0, c;

	for (; *re && *re != ')';) {
		switch (*re) {
		case '\\':
			re++;
			if (!*re) goto syntax_error; // Trailing backslash
			if (*re == '<' || *re == '>') {
				EMIT(PC++, *re == '<' ? WBEG : WEND);
				prog->len++;
				term = PC;
				break;
			}
			/* fallthrough: any other escaped character is a literal */
		default:
			term = PC;
			EMIT(PC++, CHAR);
			uc_code(c, re)
			EMIT(PC++, c);
			prog->len++;
			break;
		case '.':
			term = PC;
			EMIT(PC++, ANY);
			prog->len++;
			break;
		case '[': {
			int cnt;
			term = PC;
			re++;
			EMIT(PC++, CLASS);
			if (*re == '^') {
				EMIT(PC++, 0);
				re++;
			} else
				EMIT(PC++, 1);
			PC++; // Skip "# of pairs" byte
			prog->len++;
			for (cnt = 0; *re != ']'; cnt++) {
				if (!*re) goto syntax_error;
				uc_code(c, re)
				EMIT(PC++, c);
				uc_len(c, re)
				/* "lo-hi": step over the '-' so the second
				 * emit reads hi; otherwise lo is emitted twice */
				if (re[c] == '-' && re[c+1] != ']')
					re += c+1;
				uc_code(c, re)
				EMIT(PC++, c);
				uc_len(c, re)
				re += c;
			}
			EMIT(term + 2, cnt);
			break;
		}
		case '(': {
			int sub = 0;
			int capture = 1;
			term = PC;
			re++;
			if (*re == '?') {
				re++;
				if (*re == ':') {
					capture = 0;
					re++;
				} else {
					*re_loc = re;
					return RE_UNSUPPORTED_SYNTAX;
				}
			}
			if (capture) {
				sub = ++prog->sub;
				EMIT(PC++, SAVE);
				EMIT(PC++, sub);
				prog->len++;
			}
			int res = _compilecode(&re, prog, sizecode);
			*re_loc = re;
			if (res < 0) return res;
			if (*re != ')') return RE_SYNTAX_ERROR;
			if (capture) {
				EMIT(PC++, SAVE);
				EMIT(PC++, sub + prog->presub + 1);
				prog->len++;
			}
			break;
		}
		case '{': {
			int maxcnt = 0, mincnt = 0, i = 0, icnt = 0, size;
			re++;
			while (isdigit((unsigned char) *re))
				mincnt = mincnt * 10 + *re++ - '0';
			if (*re == ',') {
				re++;
				if (*re == '}')
					maxcnt = 256;
				while (isdigit((unsigned char) *re))
					maxcnt = maxcnt * 10 + *re++ - '0';
			} else
				maxcnt = mincnt;
			/* mandatory repetitions: plain copies of the atom */
			for (size = PC - term; i < mincnt-1; i++) {
				if (code)
					memcpy(&code[PC], &code[term], size*sizeof(int));
				PC += size;
			}
			/* optional repetitions: SPLIT over each extra copy */
			for (i = maxcnt-mincnt; i > 0; i--) {
				prog->splits++;
				/* count the SPLIT itself, as '?', '*' and '+' do,
				 * so len - splits cannot go non-positive */
				prog->len++;
				EMIT(PC++, SPLIT);
				/* Offset to the end of the remaining i copies.
				 * Same value the former REL(PC-1, PC+(size+3)*i)
				 * expanded to, without reading PC in the same
				 * expression that increments it. */
				EMIT(PC++, (size + 3) * i - 3);
				EMIT(PC++, 0);
				if (code)
					memcpy(&code[PC], &code[term], size*sizeof(int));
				PC += size;
			}
			/* count the instructions in one copy of the atom */
			if (code) {
				for (i = 0; i < size; i++)
					switch (code[term+i]) {
					case CLASS:
						i += code[term+i+2] * 2 + 1;
						/* fallthrough */
					case JMP:
					case SPLIT:
					case RSPLIT:
					case SAVE:
					case CHAR:
						i++;
						/* fallthrough */
					case ANY:
						/* ANY has no operand but still
						 * occupies a thread slot */
						icnt++;
					}
			}
			prog->len += maxcnt * icnt;
			break;
		}
		case '?':
			if (PC == term) goto syntax_error;
			INSERT_CODE(term, 3, PC);
			if (re[1] == '?') {
				EMIT(term, RSPLIT);
				re++;
			} else
				EMIT(term, SPLIT);
			EMIT(term + 1, REL(term, PC-1));
			EMIT(term + 2, 0);
			prog->len++;
			prog->splits++;
			term = PC;
			break;
		case '*':
			if (PC == term) goto syntax_error;
			INSERT_CODE(term, 3, PC);
			EMIT(PC, JMP);
			EMIT(PC + 1, REL(PC, term));
			PC += 2;
			if (re[1] == '?') {
				EMIT(term, RSPLIT);
				re++;
			} else
				EMIT(term, SPLIT);
			EMIT(term + 1, REL(term, PC-1));
			EMIT(term + 2, 0);
			prog->splits++;
			prog->len += 2;
			term = PC;
			break;
		case '+':
			if (PC == term) goto syntax_error;
			if (re[1] == '?') {
				EMIT(PC, SPLIT);
				re++;
			} else
				EMIT(PC, RSPLIT);
			EMIT(PC + 1, REL(PC-1, term));
			EMIT(PC + 2, 0);
			PC += 3;
			prog->splits++;
			prog->len++;
			term = PC;
			break;
		case '|':
			if (alt_label)
				EMIT(alt_label, REL(alt_label, PC) + 1);
			INSERT_CODE(start, 3, PC);
			EMIT(PC++, JMP);
			alt_label = PC++;
			EMIT(start, SPLIT);
			EMIT(start + 1, REL(start, PC-1));
			EMIT(start + 2, 0);
			prog->splits++;
			prog->len += 2;
			term = PC;
			break;
		case '^':
			EMIT(PC++, BOL);
			prog->len++;
			term = PC;
			break;
		case '$':
			EMIT(PC++, EOL);
			prog->len++;
			term = PC;
			break;
		}
		uc_len(c, re)
		re += c;
	}
	if (alt_label)
		EMIT(alt_label, REL(alt_label, PC) + 1);
	*re_loc = re;
	return RE_SUCCESS;
syntax_error:
	*re_loc = re;
	return RE_SYNTAX_ERROR;
}

/*
 * Dry-run compile of `re`.
 * Returns the number of int words re_comp() will write to rcode.insts
 * (including the trailing SAVE/MATCH), or a negative RE_* code. On
 * success *nsub receives the number of capture groups; on error it is
 * left untouched.
 */
int re_sizecode(const char *re, int *nsub)
{
	rcode dummyprog;
	/* Initialise every counter: _compilecode increments len and splits
	 * even in size-only mode. */
	dummyprog.unilen = 3;	/* room for the final SAVE n, MATCH */
	dummyprog.len = 0;
	dummyprog.sub = 0;
	dummyprog.presub = 0;
	dummyprog.splits = 0;
	dummyprog.gen = 0;

	int res = _compilecode(&re, &dummyprog, /*sizecode*/1);
	if (res < 0) return res;
	// If unparsed chars left
	if (*re) return RE_SYNTAX_ERROR;
	*nsub = dummyprog.sub;
	return dummyprog.unilen;
}

/*
 * Compile `re` into `prog`, whose insts array must hold at least the
 * number of words reported by re_sizecode(). `nsubs` is the group count
 * reported by re_sizecode(). Returns RE_SUCCESS or a negative RE_* code.
 */
int re_comp(rcode *prog, const char *re, int nsubs)
{
	prog->len = 0;
	prog->unilen = 0;
	prog->sub = 0;
	prog->presub = nsubs;
	prog->splits = 0;
	prog->gen = 1;

	int res = _compilecode(&re, prog, /*sizecode*/0);
	if (res < 0) return res;
	// If unparsed chars left
	if (*re) return RE_SYNTAX_ERROR;

	prog->insts[prog->unilen++] = SAVE;
	prog->insts[prog->unilen++] = prog->sub + 1;
	prog->insts[prog->unilen++] = MATCH;
	prog->len += 2;

	return RE_SUCCESS;
}

/*
 * The macros below are the body of re_pikevm() and use its locals by
 * name (prog, gen, s, sp, _sp, npc, nsub, s1, freesub, nsubs, rsubsize,
 * subidx, pcs, subs, nsubp, j). They expand to statements, labels and
 * `continue`, so they cannot be wrapped in do { } while (0).
 */

/* Store the next generation number back into the program and return. */
#define _return(state) \
{ prog->gen = gen + 1; return state; }

/*
 * Obtain a capture vector in s1: pop the free list (then run `copy`) or
 * carve a fresh one out of nsubs (then run `init`). Free-list links are
 * kept in sub[0].
 */
#define newsub(init, copy) \
s1 = freesub; \
if (s1) \
	{ freesub = (rsub*)s1->sub[0]; copy } \
else \
	{ s1 = (rsub*)&nsubs[rsubsize * subidx++]; init }

/* Drop one reference; push the vector on the free list when it hits 0. */
#define decref(csub) \
if (--csub->ref == 0) { \
	csub->sub[0] = (char*)freesub; \
	freesub = csub; \
}

/* Abandon the current path and resume any pending split branch. */
#define deccheck(nn) \
{ decref(nsub) goto rec_check##nn; }

/*
 * After a split: if the preferred branch is already at a consuming
 * instruction, queue it and continue with the other branch (pcs[i]);
 * otherwise push the other branch and keep following the preferred one.
 */
#define fastrec(nn, list, listidx) \
nsub->ref++; \
if (*npc < WBEG) { \
	list[listidx].sub = nsub; \
	list[listidx++].pc = npc; \
	npc = pcs[i]; \
	goto rec##nn; \
} \
subs[i++] = nsub; \
goto next##nn;

/* Copy-on-write of nsub into s1, variant selected for list `clist`. */
#define saveclist() \
newsub(for (j = 0; j < nsubp; j++) s1->sub[j] = nsub->sub[j];, \
for (j = 0; j < nsubp / 2 - 1; j++) s1->sub[j] = nsub->sub[j];)

/* Copy-on-write of nsub into s1, variant selected for list `nlist`. */
#define savenlist() \
newsub(/*nop*/, /*nop*/) \
for (j = 0; j < nsubp; j++) s1->sub[j] = nsub->sub[j];

/*
 * Follow npc through non-consuming instructions (JMP, SPLIT, RSPLIT,
 * SAVE and the assertions), appending every consuming instruction that
 * is reached to `list`. `nn` makes the labels unique per expansion;
 * `list` must be the identifier clist or nlist because it is pasted into
 * save##list(). Every path ends in `continue` or `goto`. The word after
 * a split's offset holds the generation in which it was last visited, so
 * each split is followed at most once per generation.
 */
#define addthread(nn, list, listidx) \
{ \
	int i = 0; \
	rec##nn: \
	if (*npc < WBEG) { \
		list[listidx].sub = nsub; \
		list[listidx++].pc = npc; \
		rec_check##nn: \
		if (i) { \
			npc = pcs[--i]; \
			nsub = subs[i]; \
			goto rec##nn; \
		} \
		continue; \
	} \
	next##nn: \
	switch(*npc) { \
	case JMP: \
		npc += 2 + npc[1]; \
		goto rec##nn; \
	case SPLIT: \
		if (npc[2] == gen) \
			deccheck(nn) \
		npc[2] = gen; \
		npc += 3; \
		pcs[i] = npc + npc[-2]; \
		fastrec(nn, list, listidx) \
	case RSPLIT: \
		if (npc[2] == gen) \
			deccheck(nn) \
		npc[2] = gen; \
		npc += 3; \
		pcs[i] = npc; \
		npc += npc[-2]; \
		fastrec(nn, list, listidx) \
	case SAVE: \
		if (nsub->ref > 1) { \
			nsub->ref--; \
			save##list() \
			nsub = s1; \
			nsub->ref = 1; \
		} \
		nsub->sub[npc[1]] = _sp; \
		npc += 2; \
		goto rec##nn; \
	case WBEG: \
		if ((sp != s && isword(sp)) || !isword(_sp)) \
			deccheck(nn) \
		npc++; goto rec##nn; \
	case WEND: \
		if (isword(_sp)) \
			deccheck(nn) \
		npc++; goto rec##nn; \
	case BOL: \
		if (_sp != s) { \
			if (!i && !listidx) \
				_return(0) \
			deccheck(nn) \
		} \
		npc++; goto rec##nn; \
	case EOL: \
		if (*_sp) \
			deccheck(nn) \
		npc++; goto rec##nn; \
	} \
}

/*
 * Run compiled program `prog` over the NUL-terminated string `s`.
 * A new thread is started at every input position until a match has been
 * found, so the search is not anchored at the start of `s`.
 *
 * subp/nsubp: output array of nsubp pointers, nsubp = (groups + 1) * 2.
 * On a match, subp[2k] and subp[2k+1] receive the start and end of group
 * k (k = 0 is the whole match); a group that did not take part is NULL.
 * Returns 1 on match, 0 otherwise; subp is only written on a match.
 *
 * All working storage is variable-length arrays sized from prog->len and
 * prog->splits. prog is modified: split instructions are stamped with
 * the current generation and prog->gen is advanced on return.
 * NOTE(review): nsubs is a char array reinterpreted as rsub objects; its
 * alignment is whatever the compiler gives the VLA -- confirm this is
 * acceptable on the target.
 */
int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
{
	int i, j, c, gen, subidx = 1, *npc;
	int rsubsize = sizeof(rsub)+(sizeof(char*)*nsubp);
	int clistidx = 0, nlistidx = 0;
	const char *sp = s, *_sp = s;
	int *insts = prog->insts;
	/* +1: a VLA must have a size greater than zero, and a pattern
	 * without any split has prog->splits == 0 */
	int *pcs[prog->splits + 1];
	rsub *subs[prog->splits + 1];
	char nsubs[rsubsize * (prog->len+3 - prog->splits)];
	rsub *nsub, *s1, *matched = NULL, *freesub = NULL;
	rthread _clist[prog->len], _nlist[prog->len];
	rthread *clist = _clist, *nlist = _nlist, *tmp;
	gen = prog->gen;
	/* seed the first thread before any input is consumed */
	goto jmp_start;
	for (;; sp = _sp) {
		gen++;
		uc_len(i, sp)
		uc_code(c, sp)
		_sp = sp+i;
		for (i = 0; i < clistidx; i++) {
			npc = clist[i].pc;
			nsub = clist[i].sub;
			switch(*npc++) {
			case CHAR:
				if (c != *npc++)
					break;
				/* fallthrough */
			case ANY:
			addthread:
				addthread(2, nlist, nlistidx)
			case CLASS:
				if (!re_classmatch(npc, c))
					break;
				npc += *(npc+1) * 2 + 2;
				goto addthread;
			case MATCH:
				if (matched) {
					decref(matched)
					subidx = 0;
				}
				matched = nsub;
				/* threads after this one in clist are dropped */
				goto break_for;
			}
			decref(nsub)
		}
		break_for:
		if (!c)
			break;
		tmp = clist;
		clist = nlist;
		nlist = tmp;
		clistidx = nlistidx;
		nlistidx = 0;
		if (!matched) {
			jmp_start:
			newsub(for (i = 1; i < nsubp; i++) s1->sub[i] = NULL;, /*nop*/)
			s1->ref = 1;
			s1->sub[0] = _sp;
			npc = insts;
			nsub = s1;
			addthread(1, clist, clistidx)
		} else if (!clistidx)
			break;
	}
	if (matched) {
		/* slots [0, nsubp/2) hold starts, [nsubp/2, nsubp) hold ends */
		for (i = 0, j = i; i < nsubp; i+=2, j++) {
			subp[i] = matched->sub[j];
			subp[i+1] = matched->sub[nsubp / 2 + j];
		}
		_return(1)
	}
	_return(0)
}

/*
 * NOTE(review): the command-line driver below is truncated in this copy
 * of the file (it stops in the middle of the final `for` header, and its
 * in-function #include has lost its header name). The surviving text is
 * kept verbatim and disabled so the rest of the file compiles; restore
 * the complete function from the upstream source before re-enabling it.
 */
#if 0
int main(int argc, char *argv[])
{
	if (argc < 2) {
		printf("usage: ...\n");
		return 0;
	}
	int sub_els;
	int sz = re_sizecode(argv[1], &sub_els) * sizeof(int);
	printf("Precalculated size: %d\n", sz);
	char code[sizeof(rcode)+sz];
	rcode *_code = (rcode*)code;
	if (re_comp(_code, argv[1], sub_els)) {
		printf("Error in re_comp");
		return 1;
	}
	re_dumpcode(_code);
	#include
	if (argc > 2) {
		sub_els = (sub_els + 1) * 2;
		const char *sub[sub_els];
		for (int i = 2; i < argc; i++) {
			printf("input bytelen: %ld\n", strlen(argv[i]));
			clock_t start_time = clock();
			sz = re_pikevm(_code, argv[i], sub, sub_els);
			double elapsed_time = (double)(clock() - start_time) / CLOCKS_PER_SEC;
			printf("Done in %f seconds\n", elapsed_time);
			if (!sz)
				{ printf("-nomatch-\n"); continue; }
			for (int k=sub_els; k>0; k--)
				if (sub[k-1])
					break;
			for (int l=0; l
#endif