diff --git a/src/lexer.c b/src/lexer.c index b362691..c71a869 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -1,5 +1,6 @@ #include #include +#include #include "main.h" struct patch_list @@ -11,10 +12,10 @@ struct patch_list struct frag { struct rgx_nfa *start; - struct patch_list *out; + struct patch_list *pl; }; -struct patch_list * append(struct patch_list * pl, struct rgx_nfa ** nfa) +struct patch_list * pl_append(struct patch_list * pl, struct rgx_nfa ** nfa) { struct patch_list * res, * root = pl; res = malloc(sizeof(struct patch_list)); @@ -33,7 +34,25 @@ struct patch_list * append(struct patch_list * pl, struct rgx_nfa ** nfa) return root; } -struct rgx_nfa * patch(struct patch_list *pl, struct rgx_nfa *to) { +void pl_merge(struct patch_list ** pl1, struct patch_list * pl2) +{ + struct patch_list * pl = *pl1; + + if (!pl ) { + *pl1 = pl2; + return; + } + + while (pl->next) { + pl = pl->next; + } + + pl->next = pl2; + +} + +struct rgx_nfa * patch(struct patch_list *pl, struct rgx_nfa *to) +{ while (pl) { *pl->ptr = to; pl = pl->next; @@ -41,6 +60,29 @@ struct rgx_nfa * patch(struct patch_list *pl, struct rgx_nfa *to) { return to; } +void collapse(struct frag * stack, unsigned char * i) +{ + if (stack[*i].start->op == BLOCK ) { + stack[(*i)--] ; + return; + } + + while (*i > 0 ) + { + struct frag to; + + to = stack[(*i)--]; + + if (stack[*i].start->op == BLOCK ) { + stack[*i]= to; + return; + } + + patch(stack[*i].pl, to.start); + stack[*i].pl = to.pl; + } +} + struct rgx_nfa * mknfa(struct rgx_nfa nfa) { struct rgx_nfa * res; @@ -50,91 +92,115 @@ struct rgx_nfa * mknfa(struct rgx_nfa nfa) } struct rgx_nfa * rgx_compile(struct rgx_nfa * l, char * s, int v) { - struct rgx_nfa *r; unsigned char i=-1; - struct frag stack[0xff], a, b; + struct frag stack[0xff]; + struct patch_list * union_pl = NULL; + + if (l) { + struct rgx_nfa *r; + r = mknfa((struct rgx_nfa){UNION, {l, NULL}, 1}); + stack[++i] = (struct frag){r, pl_append(NULL, &r->node[1])}; + } for(;*s; ++s) { + struct rgx_nfa *r; + struct frag a, b; switch(*s) { + case '(': + r = mknfa((struct rgx_nfa){BLOCK, {NULL}, 0}); + stack[++i] = (struct frag){r, NULL }; + break; + case ')': + collapse(stack, &i); + pl_merge(&stack[i].pl, union_pl); + union_pl=NULL; + break; case '|': + collapse(stack, &i); + pl_merge(&union_pl, stack[i].pl); + + if (i!= 0) { + a = stack[i--]; + r = mknfa((struct rgx_nfa){BLOCK, {NULL}, 0}); + stack[++i] = (struct frag){r, NULL }; + stack[++i]= a; + } + + r = mknfa((struct rgx_nfa){UNION, {stack[i].start, NULL}, 0}); + stack[i] = (struct frag){r, pl_append(NULL, &r->node[1])}; break; + case '\\': + switch(*(++s)) + { + case 's': + case 'd': + case 'w': + a.start = mknfa((struct rgx_nfa){CLASS, {NULL}, *s}); + a.pl = pl_append(NULL, &a.start->node[0]); + stack[++i] = a; + break; + case '\\': + case '|': + case '(': + case ')': + case '*': + case '+': + case '?': + a.start = mknfa((struct rgx_nfa){RUNE, {NULL}, *s}); + a.pl = pl_append(NULL, &a.start->node[0]); + stack[++i] = a; + break; + default: + a.start = mknfa((struct rgx_nfa){RUNE, {NULL}, '\\'}); + a.pl = pl_append(NULL, &a.start->node[0]); + stack[++i] = a; + + a.start = mknfa((struct rgx_nfa){RUNE, {NULL}, *s}); + a.pl = pl_append(NULL, &a.start->node[0]); + stack[++i] = a; + break; + } + break; case '*': - b = stack[i--]; - - if (i==0xff) - { - r = mknfa((struct rgx_nfa){UNION, {b.start, NULL}}); - patch(b.out, r); - stack[++i] = (struct frag){r, append(NULL, &r->node[1])}; - break; - } - else - { - a = stack[i--]; - r = mknfa((struct rgx_nfa){UNION, {b.start, NULL}}); - patch(a.out, r); - patch(b.out, r); - a.out = append(NULL, &r->node[1]); - stack[++i] = a; - break; - } + r = mknfa((struct rgx_nfa){UNION, {stack[i].start, NULL}, 0}); + patch(stack[i].pl, r); + stack[i] = (struct frag){r, pl_append(NULL, &r->node[1])}; + break; case '?': - b = stack[i--]; - - if (i==0xff) - { - r = mknfa((struct rgx_nfa){UNION, {b.start, NULL}}); - b.out = append(b.out, &r->node[1]); - stack[++i] = (struct frag){r, b.out }; - break; - } - else - { - a = stack[i--]; - r = mknfa((struct rgx_nfa){UNION, {b.start, NULL}}); - a.out = append(b.out, &patch(a.out, r)->node[1]); - stack[++i] = a; - break; - } - + r = mknfa((struct rgx_nfa){UNION, {stack[i].start, NULL}, 0}); + stack[i] = (struct frag){r, pl_append(stack[i].pl, &r->node[1]) }; + break; case '+': - // This implemention may cause a infinite loop that crashes. - a = stack[i--]; - r = mknfa((struct rgx_nfa){UNION, {NULL, a.start}}); - a.out = append(NULL, &patch(a.out, r)->node[0]); - stack[++i] = a; + r = mknfa((struct rgx_nfa){UNION, {stack[i].start, NULL}, 0}); + stack[i].pl = pl_append(NULL, &patch(stack[i].pl, r)->node[1]); break; case '.': - a.start = mknfa((struct rgx_nfa){C_ANY, {NULL}}); - a.out = append(NULL, &a.start->node[0]); + a.start = mknfa((struct rgx_nfa){ANY, {NULL}}); + a.pl = pl_append(NULL, &a.start->node[0]); stack[++i] = a; break; default: a.start = mknfa((struct rgx_nfa){RUNE, {NULL}, *s}); - a.out = append(NULL, &a.start->node[0]); + a.pl = pl_append(NULL, &a.start->node[0]); stack[++i] = a; break; } } - // Append Final - { - stack[++i].start = mknfa((struct rgx_nfa){FINAL, {NULL}, v}); - stack[i].out = append(NULL, &stack[i].start->node[0]); - } - - // Collapse Stack - while (i > 0) - { - struct frag to; - to = stack[i--]; - patch(stack[i].out, to.start); + // Collapse and add FINAL state + while ( i!=0 ) { + if (stack[i].start->op == BLOCK ) + stack[i--]; + collapse(stack, &i); } + pl_merge(&stack[i].pl, union_pl); + union_pl = NULL; + patch(stack[i].pl, mknfa((struct rgx_nfa){FINAL, {NULL}, v})); return stack[i].start; } @@ -144,19 +210,47 @@ int rgx_run(struct rgx_nfa *l, char *s) { switch (l->op) { case FINAL: + //printf("FINAL: %c\n", *s? *s:'0'); return *s ? 0: l->c; - case C_ANY: + case ANY: + //printf("ANY: %c\n", *s? *s:'0'); if (!*(s++)) return 0; l = l->node[0]; break; + case CLASS: + //printf("CLASS(%c): %c\n", l->c, *s? *s: '0'); + switch(l->c) + { + case 's': + if(!isspace(*(s++))) + return 0; + l = l->node[0]; + break; + case 'd': + if(!isdigit(*(s++))) + return 0; + l = l->node[0]; + break; + case 'w': + if(!isalpha(*(s++))) + return 0; + l = l->node[0]; + break; + default: + return 0; + } + break; case RUNE: + //printf("RUNE(%c): %c\n", l->c, *s?*s:'0'); if (*(s++) != l->c) return 0; l = l->node[0]; break; case UNION: + //printf("{\n"); int res = rgx_run(l->node[0], s); + //printf("}\n"); if (res) return res; l = l->node[1]; diff --git a/src/main.c b/src/main.c index e69de29..f949ea7 100644 --- a/src/main.c +++ b/src/main.c @@ -0,0 +1,34 @@ +#include +#include "main.h" + +char * tokens[] = +{ + "\"(\\\"|\"!)*\"", + "\\d+", + "L|R|N", + "\\w+", + "\\s+", + "[\\s*", + "\\s*]", + "\\s*:\\s*", + "\\s*\\|\\s*", + "\\s*<\\s*", + "\\s*;", + NULL, +}; + +int main(void) +{ + int i; + struct rgx_nfa * l = NULL; + + //for (i=0; tokens[i]; ++i) { + l=rgx_compile(l, tokens[0], 1); + //} + + printf("\n%d\n", rgx_run(l, "\"\"")) ; + printf("\n%d\n", rgx_run(l, "\"b\"")) ; + printf("\n%d\n", rgx_run(l, "\"bb\"")) ; + printf("\n%d\n", rgx_run(l, "\"\"\"")) ; + printf("\n%d\n", rgx_run(l, "\"\\\"\"")) ; +} diff --git a/src/main.h b/src/main.h index 2ba064d..c79a8a8 100644 --- a/src/main.h +++ b/src/main.h @@ -1,6 +1,6 @@ struct rgx_nfa { - enum {RUNE, C_ANY, UNION, FINAL} op; + enum {RUNE, ANY, UNION, FINAL, CLASS, BLOCK} op; struct rgx_nfa *node[2]; unsigned char c; }; diff --git a/src/test.c b/src/test.c index c2531f1..a0fb3a5 100644 --- a/src/test.c +++ b/src/test.c @@ -24,27 +24,46 @@ struct test {"a.*c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"acc", 1}, {"ac", 1}, {"abbc", 1}, {NULL}}}, {"a.?c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"acc", 1}, {"ac", 1}, {NULL}}}, + // TODO: Implement NFA to DFA to avoid loops + // Equal to * {"ab+?c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 1}, {NULL}}}, {"ab*?c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 1}, {NULL}}}, - {"ab?+c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"ababc", 1}, {"ac", 1}, {NULL}}}, - {"ab?*c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"ababc", 1}, {"ac", 1}, {NULL}}}, - - // . Alt - //{"a|b", {{"a", 1},{"abc", 0}, {"b", 1 }, {"ba", 0}, {NULL}}}, - - {"ab??c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 0}, {"ac", 1}, {"c", 1}, {NULL}}}, - {"ab++c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 0}, {"c", 0}, {"ababc",0}, {NULL}}}, - {"ab**c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 1}, {"c", 1}, {"ababc",1}, {NULL}}}, - - // Xd - {"a?", {{"aaaa", 0}, {"a", 1}, {NULL}}}, - {"a*", {{"aaaa", 1}, {"a", 1}, {NULL}}}, - // REDOS + //{"ab?+c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"ababc", 1}, {"ac", 1}, {NULL}}}, + //{"ab?*c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"ababc", 1}, {"ac", 1}, {NULL}}}, {"a+*", {{"aaaa", 1}, {"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1}, {NULL}}}, + //{"a**", {{"aaaa", 1}, {"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1}, {NULL}}}, + + // Nilpotent + {"ab??c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 0}, {"ac", 1}, {"c", 0}, {NULL}}}, + {"ab++c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 0}, {"c", 0}, {"ababc",0}, {NULL}}}, + //{"ab**c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 1}, {"c", 1}, {"ababc",1}, {NULL}}}, + + // UNION + {"a()", {{"a", 1},{"abc", 0}, {"b", 0 }, {"ba", 0}, {NULL}}}, + {"a(b)", {{"ab", 1},{"abc", 0}, {"b", 0 }, {"ba", 0}, {NULL}}}, + {"a(b)*", {{"ab", 1},{"abb", 1}, {"a", 1 }, {"ba", 0}, {NULL}}}, + {"a|b", {{"a", 1},{"abc", 0}, {"b", 1 }, {"ba", 0}, {NULL}}}, + {"(a|.)*", {{"a", 1},{"abc", 1}, {"b", 1 }, {"ba", 1}, {NULL}}}, + {"(.|a)*", {{"a", 1},{"abc", 1}, {"b", 1 }, {"ba", 1}, {NULL}}}, + {"(b|a)*", {{"a", 1},{"cab", 0}, {"b", 1 }, {"ab", 1}, {"cacb", 0}, {NULL}}}, + {"c(b|a)*", {{"ca", 1},{"cab", 1}, {"b", 0 }, {"bb", 0}, {"cacb", 0}, {NULL}}}, + {"ab|b", {{"ab", 1}, {"b", 1}, {"abb", 0}, {NULL}}}, + {"a|b|c", {{"a", 1}, {"b", 1}, {"c", 1},{"d",0}, {NULL}}}, + {"((a|b)|c)d", {{"ad", 1}, {"bd", 1}, {"cd", 1},{"a",0}, {NULL}}}, + + // BLOCK + {"(abb", {{"abb", 1}, {"ab", 0}, {"b", 0}, {NULL}}}, + {"abb)", {{"abb", 1}, {"ab", 0}, {"b", 0}, {NULL}}}, + {"a(bb", {{"abb", 1}, {"ab", 0}, {"b", 0}, {NULL}}}, + {"ab)b", {{"abb", 1}, {"ab", 0}, {"b", 0}, {NULL}}}, + {"(ab)b", {{"abb", 1}, {"ab", 0}, {"b", 0}, {NULL}}}, + {"(ab)+", {{"ab", 1}, {"abab", 1}, {"b", 0}, {NULL}}}, + {"(a(bc)*)", {{"a", 1}, {"abc", 1}, {"abb", 0}, {NULL}}}, + {"(ab|cd)", {{"ab", 1}, {"cd", 1}, {"abcd", 0}, {NULL}}}, + + // REDOS {"a*b?a*", {{"aaa", 1}, {"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaba", 1}, {NULL}}}, - {"aa*+", {{"aaaa", 1}, {NULL}}}, - {"a*+", {{"a", 1}, {NULL}}}, } ; @@ -60,6 +79,11 @@ int main(void) printf("\n\t%s\n", t.regex); nfa = rgx_compile(NULL, t.regex, 1); + if (!nfa ){ + printf("Malformed\n"); + continue; + } + for( struct match * m = t.matches; m->s; ++m) { int res;