From 6acef48f0927b2c9d1fa8f5c943d2732d862dac1 Mon Sep 17 00:00:00 2001
From: PedroEdiaz
Date: Thu, 4 Dec 2025 21:57:58 -0600
Subject: [PATCH] First working regex engine

---
Review notes (ignored by git am):
- The line structure of this patch was restored; it had been flattened.
- The system header names and the outer loop of main() in src/test.c were
  missing from the received text and are reconstructed; please confirm.
- src/lexer.c: operators without an operand and over-long patterns now
  return NULL instead of indexing outside the fragment stack, allocation
  failure aborts, patch lists are released once patched, the UNION case
  body is braced, and RUNE compares bytes as unsigned char.
- Blob ids on the "index" lines are those of the original submission.

 Makefile    |  15 ++++
 src/lexer.c | 223 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/main.c  |   0
 src/main.h  |  15 ++++
 src/test.c  |  70 +++++++++++++++++
 5 files changed, 323 insertions(+)
 create mode 100644 Makefile
 create mode 100644 src/lexer.c
 create mode 100644 src/main.c
 create mode 100644 src/main.h
 create mode 100644 src/test.c

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..8ded207
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,15 @@
+BIN = src/main
+TEST = src/test
+
+OBJ = \
+	src/lexer.o
+
+#all: $(BIN) $(TEST)
+
+test: $(TEST)
+
+$(BIN): $(OBJ) $(BIN).o
+	$(CC) -g -o $(BIN) $(BIN).o $(OBJ)
+
+$(TEST): $(OBJ) $(TEST).o
+	$(CC) -g -o $(TEST) $(TEST).o $(OBJ)
diff --git a/src/lexer.c b/src/lexer.c
new file mode 100644
index 0000000..b362691
--- /dev/null
+++ b/src/lexer.c
@@ -0,0 +1,223 @@
+#include <stddef.h>
+#include <stdlib.h>
+#include "main.h"
+
+/* Capacity of the fragment stack used by rgx_compile(). */
+enum { STACK_MAX = 0xff };
+
+/* Singly linked list of node slots that still need a target. */
+struct patch_list
+{
+	struct rgx_nfa **ptr;
+	struct patch_list *next;
+};
+
+/* Partially built automaton: entry node plus its unpatched exits. */
+struct frag
+{
+	struct rgx_nfa *start;
+	struct patch_list *out;
+};
+
+/*
+ * Append an entry for the slot `nfa` to the end of `pl` and return the head
+ * of the list (the new entry itself when `pl` is NULL).
+ * Aborts when memory is exhausted.
+ */
+static struct patch_list * append(struct patch_list * pl, struct rgx_nfa ** nfa)
+{
+	struct patch_list * res, * root = pl;
+
+	res = malloc(sizeof *res);
+	if (res == NULL)
+		abort();
+	*res = (struct patch_list){nfa, NULL};
+
+	if (pl == NULL)
+		return res;
+
+	while (pl->next)
+		pl = pl->next;
+
+	pl->next = res;
+
+	return root;
+}
+
+/*
+ * Store `to` in every slot recorded in `pl`, release the list (it is never
+ * reused once patched) and return `to`.
+ */
+static struct rgx_nfa * patch(struct patch_list *pl, struct rgx_nfa *to)
+{
+	while (pl) {
+		struct patch_list *next = pl->next;
+
+		*pl->ptr = to;
+		free(pl);
+		pl = next;
+	}
+	return to;
+}
+
+/* Heap-allocate a copy of `nfa`. Aborts when memory is exhausted. */
+static struct rgx_nfa * mknfa(struct rgx_nfa nfa)
+{
+	struct rgx_nfa * res;
+
+	res = malloc(sizeof *res);
+	if (res == NULL)
+		abort();
+	*res = nfa;
+	return res;
+}
+
+/*
+ * Compile the pattern `s` into an NFA and return its entry node.
+ *
+ * Handles literal bytes, `.` (any byte) and the postfix operators `*`, `?`
+ * and `+`; `|` is accepted but ignored for now.  `v` is stored in the
+ * accepting node and is what rgx_run() reports on a match; it is kept in an
+ * unsigned char, so pass a value in 1..255.  `l` is currently unused.
+ *
+ * Returns NULL when an operator has no operand or the pattern needs more
+ * fragments than the stack holds; nodes built so far are not released.
+ */
+struct rgx_nfa * rgx_compile(struct rgx_nfa * l, char * s, int v)
+{
+	struct rgx_nfa *r;
+	struct frag stack[STACK_MAX], a, b;
+	int top = -1;	/* index of the topmost fragment, -1 when empty */
+
+	(void)l;
+
+	for(;*s; ++s)
+	{
+		switch(*s)
+		{
+			case '|':
+				break;
+
+			case '*':
+				if (top < 0)
+					return NULL;
+				b = stack[top--];
+
+				if (top < 0)
+				{
+					r = mknfa((struct rgx_nfa){UNION, {b.start, NULL}});
+					patch(b.out, r);
+					stack[++top] = (struct frag){r, append(NULL, &r->node[1])};
+				}
+				else
+				{
+					a = stack[top--];
+					r = mknfa((struct rgx_nfa){UNION, {b.start, NULL}});
+					patch(a.out, r);
+					patch(b.out, r);
+					a.out = append(NULL, &r->node[1]);
+					stack[++top] = a;
+				}
+				break;
+
+			case '?':
+				if (top < 0)
+					return NULL;
+				b = stack[top--];
+
+				if (top < 0)
+				{
+					r = mknfa((struct rgx_nfa){UNION, {b.start, NULL}});
+					b.out = append(b.out, &r->node[1]);
+					stack[++top] = (struct frag){r, b.out};
+				}
+				else
+				{
+					a = stack[top--];
+					r = mknfa((struct rgx_nfa){UNION, {b.start, NULL}});
+					a.out = append(b.out, &patch(a.out, r)->node[1]);
+					stack[++top] = a;
+				}
+				break;
+
+			case '+':
+				// This implementation may cause an infinite loop that crashes.
+				if (top < 0)
+					return NULL;
+				a = stack[top--];
+				r = mknfa((struct rgx_nfa){UNION, {NULL, a.start}});
+				a.out = append(NULL, &patch(a.out, r)->node[0]);
+				stack[++top] = a;
+				break;
+
+			case '.':
+				/* keep the last slot free for the FINAL fragment */
+				if (top >= STACK_MAX - 2)
+					return NULL;
+				a.start = mknfa((struct rgx_nfa){C_ANY, {NULL}});
+				a.out = append(NULL, &a.start->node[0]);
+				stack[++top] = a;
+				break;
+
+			default:
+				/* keep the last slot free for the FINAL fragment */
+				if (top >= STACK_MAX - 2)
+					return NULL;
+				a.start = mknfa((struct rgx_nfa){RUNE, {NULL}, *s});
+				a.out = append(NULL, &a.start->node[0]);
+				stack[++top] = a;
+				break;
+		}
+	}
+
+	// Append Final; nothing follows it, so it has no exits to patch
+	stack[++top] = (struct frag){mknfa((struct rgx_nfa){FINAL, {NULL}, v}), NULL};
+
+	// Collapse Stack
+	while (top > 0)
+	{
+		struct frag to;
+		to = stack[top--];
+		patch(stack[top].out, to.start);
+	}
+
+	return stack[top].start;
+}
+
+/*
+ * Match the whole string `s` against the automaton `l`.
+ * Returns the value stored in the accepting node when it is reached with
+ * all of `s` consumed, and 0 otherwise (also when `l` is NULL).
+ */
+int rgx_run(struct rgx_nfa *l, char *s)
+{
+	while (l)
+	{
+		switch (l->op)
+		{
+			case FINAL:
+				return *s ? 0: l->c;
+			case C_ANY:
+				if (!*(s++))
+					return 0;
+				l = l->node[0];
+				break;
+			case RUNE:
+				/* l->c is unsigned: compare bytes, not signed chars */
+				if ((unsigned char)*(s++) != l->c)
+					return 0;
+				l = l->node[0];
+				break;
+			case UNION:
+			{
+				/* try node[0] first, fall back to node[1] */
+				int res = rgx_run(l->node[0], s);
+				if (res)
+					return res;
+				l = l->node[1];
+				break;
+			}
+		}
+	}
+	return 0;
+}
diff --git a/src/main.c b/src/main.c
new file mode 100644
index 0000000..e69de29
diff --git a/src/main.h b/src/main.h
new file mode 100644
index 0000000..2ba064d
--- /dev/null
+++ b/src/main.h
@@ -0,0 +1,15 @@
+#ifndef MAIN_H
+#define MAIN_H
+
+/* One NFA state; `node` holds up to two successor states. */
+struct rgx_nfa
+{
+	enum {RUNE, C_ANY, UNION, FINAL} op;
+	struct rgx_nfa *node[2];
+	unsigned char c;	/* RUNE: byte to match; FINAL: value reported on a match */
+};
+
+struct rgx_nfa * rgx_compile(struct rgx_nfa * l, char * s, int v);
+int rgx_run(struct rgx_nfa * l, char * s);
+
+#endif
diff --git a/src/test.c b/src/test.c
new file mode 100644
index 0000000..c2531f1
--- /dev/null
+++ b/src/test.c
@@ -0,0 +1,70 @@
+#include <stdio.h>
+#include "main.h"
+
+struct test
+{
+	char * regex;
+	struct match
+	{
+		char * s;
+		int expect;
+	} matches[0xff];
+} test_suite[] =
+{
+	// Basic
+	{"abc", {{"a", 0},{"abc", 1}, {"abcd",0 }, {NULL}}},
+	{"a.c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"acc", 1}, {"aac", 1}, {NULL}}},
+	{"ab+c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 0}, {NULL}}},
+	{"ab*c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 1}, {NULL}}},
+	{"ab?c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 0}, {"ac", 1}, {NULL}}},
+
+	// . Compose
+	{"a..c", {{"a", 0},{"abc", 0}, {"abcd",0 }, {"acc", 0}, {"ac", 0}, {"abbc", 1}, {NULL}}},
+	{"a.+c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"acc", 1}, {"ac", 0}, {"abbc", 1}, {NULL}}},
+	{"a.*c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"acc", 1}, {"ac", 1}, {"abbc", 1}, {NULL}}},
+	{"a.?c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"acc", 1}, {"ac", 1}, {NULL}}},
+
+	// Equal to *
+	{"ab+?c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 1}, {NULL}}},
+	{"ab*?c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 1}, {NULL}}},
+	{"ab?+c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"ababc", 1}, {"ac", 1}, {NULL}}},
+	{"ab?*c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"ababc", 1}, {"ac", 1}, {NULL}}},
+
+	// . Alt
+	//{"a|b", {{"a", 1},{"abc", 0}, {"b", 1 }, {"ba", 0}, {NULL}}},
+
+	{"ab??c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 0}, {"ac", 1}, {"c", 1}, {NULL}}},
+	{"ab++c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 0}, {"c", 0}, {"ababc",0}, {NULL}}},
+	{"ab**c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 1}, {"c", 1}, {"ababc",1}, {NULL}}},
+
+	// Xd
+	{"a?", {{"aaaa", 0}, {"a", 1}, {NULL}}},
+	{"a*", {{"aaaa", 1}, {"a", 1}, {NULL}}},
+	// REDOS
+	{"a+*", {{"aaaa", 1}, {"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1}, {NULL}}},
+	{"a*b?a*", {{"aaa", 1}, {"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaba", 1}, {NULL}}},
+	{"aa*+", {{"aaaa", 1}, {NULL}}},
+	{"a*+", {{"a", 1}, {NULL}}},
+} ;
+
+
+int main(void)
+{
+	int i=0;
+
+	for(i =0; i<(int)(sizeof(test_suite)/sizeof(*test_suite)); ++i)
+	{
+		struct match * m;
+		struct rgx_nfa * nfa;
+
+		nfa = rgx_compile(NULL, test_suite[i].regex, 1);
+		printf("%s\n", test_suite[i].regex);
+
+		for(m = test_suite[i].matches; m->s; ++m)
+		{
+			int res;
+			res = rgx_run(nfa, m->s);
+			printf("%s: %d %s\n", res == m->expect?"PASS":"FAIL", res, m->s);
+		}
+	}
+}