Make Union and Blocks, avoid postfix notation

This commit is contained in:
PedroEdiaz
2025-12-10 15:21:56 -06:00
parent 6acef48f09
commit e4f67ab3fe
4 changed files with 232 additions and 80 deletions

View File

@@ -1,5 +1,6 @@
#include <stdlib.h> #include <stdlib.h>
#include <stdio.h> #include <stdio.h>
#include <ctype.h>
#include "main.h" #include "main.h"
struct patch_list struct patch_list
@@ -11,10 +12,10 @@ struct patch_list
struct frag struct frag
{ {
struct rgx_nfa *start; struct rgx_nfa *start;
struct patch_list *out; struct patch_list *pl;
}; };
struct patch_list * append(struct patch_list * pl, struct rgx_nfa ** nfa) struct patch_list * pl_append(struct patch_list * pl, struct rgx_nfa ** nfa)
{ {
struct patch_list * res, * root = pl; struct patch_list * res, * root = pl;
res = malloc(sizeof(struct patch_list)); res = malloc(sizeof(struct patch_list));
@@ -33,7 +34,25 @@ struct patch_list * append(struct patch_list * pl, struct rgx_nfa ** nfa)
return root; return root;
} }
/*
 * pl_merge: concatenate two patch lists in place.
 *
 * pl1 - address of the head pointer of the destination list
 * pl2 - list to attach; may be NULL
 *
 * When *pl1 is NULL the head pointer is simply set to pl2. Otherwise the
 * destination list is walked to its last node and that node's next pointer
 * is set to pl2. No nodes are copied or freed: after the call the nodes of
 * pl2 are reachable from *pl1.
 */
struct rgx_nfa * patch(struct patch_list *pl, struct rgx_nfa *to) { void pl_merge(struct patch_list ** pl1, struct patch_list * pl2)
{
struct patch_list * pl = *pl1;
// Empty destination: pl2 becomes the whole list.
if (!pl ) {
*pl1 = pl2;
return;
}
// Advance to the last node of the destination list.
while (pl->next) {
pl = pl->next;
}
pl->next = pl2;
}
struct rgx_nfa * patch(struct patch_list *pl, struct rgx_nfa *to)
{
while (pl) { while (pl) {
*pl->ptr = to; *pl->ptr = to;
pl = pl->next; pl = pl->next;
@@ -41,6 +60,29 @@ struct rgx_nfa * patch(struct patch_list *pl, struct rgx_nfa *to) {
return to; return to;
} }
void collapse(struct frag * stack, unsigned char * i)
{
if (stack[*i].start->op == BLOCK ) {
stack[(*i)--] ;
return;
}
while (*i > 0 )
{
struct frag to;
to = stack[(*i)--];
if (stack[*i].start->op == BLOCK ) {
stack[*i]= to;
return;
}
patch(stack[*i].pl, to.start);
stack[*i].pl = to.pl;
}
}
struct rgx_nfa * mknfa(struct rgx_nfa nfa) struct rgx_nfa * mknfa(struct rgx_nfa nfa)
{ {
struct rgx_nfa * res; struct rgx_nfa * res;
@@ -50,91 +92,115 @@ struct rgx_nfa * mknfa(struct rgx_nfa nfa)
} }
struct rgx_nfa * rgx_compile(struct rgx_nfa * l, char * s, int v) { struct rgx_nfa * rgx_compile(struct rgx_nfa * l, char * s, int v) {
struct rgx_nfa *r;
unsigned char i=-1; unsigned char i=-1;
struct frag stack[0xff], a, b; struct frag stack[0xff];
struct patch_list * union_pl = NULL;
if (l) {
struct rgx_nfa *r;
r = mknfa((struct rgx_nfa){UNION, {l, NULL}, 1});
stack[++i] = (struct frag){r, pl_append(NULL, &r->node[1])};
}
for(;*s; ++s) for(;*s; ++s)
{ {
struct rgx_nfa *r;
struct frag a, b;
switch(*s) switch(*s)
{ {
case '(':
r = mknfa((struct rgx_nfa){BLOCK, {NULL}, 0});
stack[++i] = (struct frag){r, NULL };
break;
case ')':
collapse(stack, &i);
pl_merge(&stack[i].pl, union_pl);
union_pl=NULL;
break;
case '|': case '|':
collapse(stack, &i);
pl_merge(&union_pl, stack[i].pl);
if (i!= 0) {
a = stack[i--];
r = mknfa((struct rgx_nfa){BLOCK, {NULL}, 0});
stack[++i] = (struct frag){r, NULL };
stack[++i]= a;
}
r = mknfa((struct rgx_nfa){UNION, {stack[i].start, NULL}, 0});
stack[i] = (struct frag){r, pl_append(NULL, &r->node[1])};
break; break;
case '\\':
switch(*(++s))
{
case 's':
case 'd':
case 'w':
a.start = mknfa((struct rgx_nfa){CLASS, {NULL}, *s});
a.pl = pl_append(NULL, &a.start->node[0]);
stack[++i] = a;
break;
case '\\':
case '|':
case '(':
case ')':
case '*': case '*':
b = stack[i--];
if (i==0xff)
{
r = mknfa((struct rgx_nfa){UNION, {b.start, NULL}});
patch(b.out, r);
stack[++i] = (struct frag){r, append(NULL, &r->node[1])};
break;
}
else
{
a = stack[i--];
r = mknfa((struct rgx_nfa){UNION, {b.start, NULL}});
patch(a.out, r);
patch(b.out, r);
a.out = append(NULL, &r->node[1]);
stack[++i] = a;
break;
}
case '?':
b = stack[i--];
if (i==0xff)
{
r = mknfa((struct rgx_nfa){UNION, {b.start, NULL}});
b.out = append(b.out, &r->node[1]);
stack[++i] = (struct frag){r, b.out };
break;
}
else
{
a = stack[i--];
r = mknfa((struct rgx_nfa){UNION, {b.start, NULL}});
a.out = append(b.out, &patch(a.out, r)->node[1]);
stack[++i] = a;
break;
}
case '+': case '+':
// This implementation may cause an infinite loop that crashes. case '?':
a = stack[i--]; a.start = mknfa((struct rgx_nfa){RUNE, {NULL}, *s});
r = mknfa((struct rgx_nfa){UNION, {NULL, a.start}}); a.pl = pl_append(NULL, &a.start->node[0]);
a.out = append(NULL, &patch(a.out, r)->node[0]);
stack[++i] = a; stack[++i] = a;
break; break;
default:
a.start = mknfa((struct rgx_nfa){RUNE, {NULL}, '\\'});
a.pl = pl_append(NULL, &a.start->node[0]);
stack[++i] = a;
a.start = mknfa((struct rgx_nfa){RUNE, {NULL}, *s});
a.pl = pl_append(NULL, &a.start->node[0]);
stack[++i] = a;
break;
}
break;
case '*':
r = mknfa((struct rgx_nfa){UNION, {stack[i].start, NULL}, 0});
patch(stack[i].pl, r);
stack[i] = (struct frag){r, pl_append(NULL, &r->node[1])};
break;
case '?':
r = mknfa((struct rgx_nfa){UNION, {stack[i].start, NULL}, 0});
stack[i] = (struct frag){r, pl_append(stack[i].pl, &r->node[1]) };
break;
case '+':
r = mknfa((struct rgx_nfa){UNION, {stack[i].start, NULL}, 0});
stack[i].pl = pl_append(NULL, &patch(stack[i].pl, r)->node[1]);
break;
case '.': case '.':
a.start = mknfa((struct rgx_nfa){C_ANY, {NULL}}); a.start = mknfa((struct rgx_nfa){ANY, {NULL}});
a.out = append(NULL, &a.start->node[0]); a.pl = pl_append(NULL, &a.start->node[0]);
stack[++i] = a; stack[++i] = a;
break; break;
default: default:
a.start = mknfa((struct rgx_nfa){RUNE, {NULL}, *s}); a.start = mknfa((struct rgx_nfa){RUNE, {NULL}, *s});
a.out = append(NULL, &a.start->node[0]); a.pl = pl_append(NULL, &a.start->node[0]);
stack[++i] = a; stack[++i] = a;
break; break;
} }
} }
// Append Final // Collapse and add FINAL state
{ while ( i!=0 ) {
stack[++i].start = mknfa((struct rgx_nfa){FINAL, {NULL}, v}); if (stack[i].start->op == BLOCK )
stack[i].out = append(NULL, &stack[i].start->node[0]); stack[i--];
} collapse(stack, &i);
// Collapse Stack
while (i > 0)
{
struct frag to;
to = stack[i--];
patch(stack[i].out, to.start);
} }
pl_merge(&stack[i].pl, union_pl);
union_pl = NULL;
patch(stack[i].pl, mknfa((struct rgx_nfa){FINAL, {NULL}, v}));
return stack[i].start; return stack[i].start;
} }
@@ -144,19 +210,47 @@ int rgx_run(struct rgx_nfa *l, char *s) {
switch (l->op) switch (l->op)
{ {
case FINAL: case FINAL:
//printf("FINAL: %c\n", *s? *s:'0');
return *s ? 0: l->c; return *s ? 0: l->c;
case C_ANY: case ANY:
//printf("ANY: %c\n", *s? *s:'0');
if (!*(s++)) if (!*(s++))
return 0; return 0;
l = l->node[0]; l = l->node[0];
break; break;
case CLASS:
//printf("CLASS(%c): %c\n", l->c, *s? *s: '0');
switch(l->c)
{
case 's':
if(!isspace(*(s++)))
return 0;
l = l->node[0];
break;
case 'd':
if(!isdigit(*(s++)))
return 0;
l = l->node[0];
break;
case 'w':
if(!isalpha(*(s++)))
return 0;
l = l->node[0];
break;
default:
return 0;
}
break;
case RUNE: case RUNE:
//printf("RUNE(%c): %c\n", l->c, *s?*s:'0');
if (*(s++) != l->c) if (*(s++) != l->c)
return 0; return 0;
l = l->node[0]; l = l->node[0];
break; break;
case UNION: case UNION:
//printf("{\n");
int res = rgx_run(l->node[0], s); int res = rgx_run(l->node[0], s);
//printf("}\n");
if (res) if (res)
return res; return res;
l = l->node[1]; l = l->node[1];

View File

@@ -0,0 +1,34 @@
#include <stdio.h>
#include "main.h"
/*
 * Token patterns, written in the regex dialect understood by rgx_compile.
 * Each trailing comment shows the pattern as the regex compiler sees it
 * (after C string unescaping). The table ends with a NULL sentinel.
 */
char *tokens[] = {
	"\"(\\\"|\"!)*\"",	// "(\"|"!)*"   presumably a quoted string literal - TODO confirm
	"\\d+",			// \d+          run of digits
	"L|R|N",		// L|R|N        one of the letters L, R, N
	"\\w+",			// \w+          run of \w characters
	"\\s+",			// \s+          run of whitespace
	"[\\s*",		// [\s*         '[' then optional whitespace
	"\\s*]",		// \s*]         optional whitespace then ']'
	"\\s*:\\s*",		// \s*:\s*      ':' with optional surrounding whitespace
	"\\s*\\|\\s*",		// \s*\|\s*     literal '|' with optional surrounding whitespace
	"\\s*<\\s*",		// \s*<\s*      '<' with optional surrounding whitespace
	"\\s*;",		// \s*;         optional whitespace then ';'
	NULL
};
/*
 * Smoke test for the first token pattern.
 *
 * Compiles tokens[0] into a fresh NFA (no previous NFA, value argument 1)
 * and prints the result of rgx_run for a fixed set of inputs, each result
 * on its own line surrounded by newlines.
 */
int main(void)
{
	/* Inputs are matched and reported in this order. */
	char * inputs[] = {
		"\"\"",
		"\"b\"",
		"\"bb\"",
		"\"\"\"",
		"\"\\\"\"",
	};
	size_t k;
	struct rgx_nfa * nfa;

	/* Only tokens[0] is compiled; the remaining patterns are not used here. */
	nfa = rgx_compile(NULL, tokens[0], 1);

	for (k = 0; k < sizeof inputs / sizeof inputs[0]; ++k)
		printf("\n%d\n", rgx_run(nfa, inputs[k]));

	return 0;
}

View File

@@ -1,6 +1,6 @@
struct rgx_nfa struct rgx_nfa
{ {
enum {RUNE, C_ANY, UNION, FINAL} op; enum {RUNE, ANY, UNION, FINAL, CLASS, BLOCK} op;
struct rgx_nfa *node[2]; struct rgx_nfa *node[2];
unsigned char c; unsigned char c;
}; };

View File

@@ -24,27 +24,46 @@ struct test
{"a.*c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"acc", 1}, {"ac", 1}, {"abbc", 1}, {NULL}}}, {"a.*c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"acc", 1}, {"ac", 1}, {"abbc", 1}, {NULL}}},
{"a.?c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"acc", 1}, {"ac", 1}, {NULL}}}, {"a.?c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"acc", 1}, {"ac", 1}, {NULL}}},
// TODO: Implement NFA to DFA to avoid loops
// Equal to * // Equal to *
{"ab+?c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 1}, {NULL}}}, {"ab+?c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 1}, {NULL}}},
{"ab*?c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 1}, {NULL}}}, {"ab*?c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 1}, {NULL}}},
{"ab?+c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"ababc", 1}, {"ac", 1}, {NULL}}}, //{"ab?+c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"ababc", 1}, {"ac", 1}, {NULL}}},
{"ab?*c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"ababc", 1}, {"ac", 1}, {NULL}}}, //{"ab?*c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"ababc", 1}, {"ac", 1}, {NULL}}},
// . Alt
//{"a|b", {{"a", 1},{"abc", 0}, {"b", 1 }, {"ba", 0}, {NULL}}},
{"ab??c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 0}, {"ac", 1}, {"c", 1}, {NULL}}},
{"ab++c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 0}, {"c", 0}, {"ababc",0}, {NULL}}},
{"ab**c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 1}, {"c", 1}, {"ababc",1}, {NULL}}},
// Xd
{"a?", {{"aaaa", 0}, {"a", 1}, {NULL}}},
{"a*", {{"aaaa", 1}, {"a", 1}, {NULL}}},
// REDOS
{"a+*", {{"aaaa", 1}, {"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1}, {NULL}}}, {"a+*", {{"aaaa", 1}, {"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1}, {NULL}}},
//{"a**", {{"aaaa", 1}, {"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1}, {NULL}}},
// Nilpotent
{"ab??c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 0}, {"ac", 1}, {"c", 0}, {NULL}}},
{"ab++c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 0}, {"c", 0}, {"ababc",0}, {NULL}}},
//{"ab**c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 1}, {"c", 1}, {"ababc",1}, {NULL}}},
// UNION
{"a()", {{"a", 1},{"abc", 0}, {"b", 0 }, {"ba", 0}, {NULL}}},
{"a(b)", {{"ab", 1},{"abc", 0}, {"b", 0 }, {"ba", 0}, {NULL}}},
{"a(b)*", {{"ab", 1},{"abb", 1}, {"a", 1 }, {"ba", 0}, {NULL}}},
{"a|b", {{"a", 1},{"abc", 0}, {"b", 1 }, {"ba", 0}, {NULL}}},
{"(a|.)*", {{"a", 1},{"abc", 1}, {"b", 1 }, {"ba", 1}, {NULL}}},
{"(.|a)*", {{"a", 1},{"abc", 1}, {"b", 1 }, {"ba", 1}, {NULL}}},
{"(b|a)*", {{"a", 1},{"cab", 0}, {"b", 1 }, {"ab", 1}, {"cacb", 0}, {NULL}}},
{"c(b|a)*", {{"ca", 1},{"cab", 1}, {"b", 0 }, {"bb", 0}, {"cacb", 0}, {NULL}}},
{"ab|b", {{"ab", 1}, {"b", 1}, {"abb", 0}, {NULL}}},
{"a|b|c", {{"a", 1}, {"b", 1}, {"c", 1},{"d",0}, {NULL}}},
{"((a|b)|c)d", {{"ad", 1}, {"bd", 1}, {"cd", 1},{"a",0}, {NULL}}},
// BLOCK
{"(abb", {{"abb", 1}, {"ab", 0}, {"b", 0}, {NULL}}},
{"abb)", {{"abb", 1}, {"ab", 0}, {"b", 0}, {NULL}}},
{"a(bb", {{"abb", 1}, {"ab", 0}, {"b", 0}, {NULL}}},
{"ab)b", {{"abb", 1}, {"ab", 0}, {"b", 0}, {NULL}}},
{"(ab)b", {{"abb", 1}, {"ab", 0}, {"b", 0}, {NULL}}},
{"(ab)+", {{"ab", 1}, {"abab", 1}, {"b", 0}, {NULL}}},
{"(a(bc)*)", {{"a", 1}, {"abc", 1}, {"abb", 0}, {NULL}}},
{"(ab|cd)", {{"ab", 1}, {"cd", 1}, {"abcd", 0}, {NULL}}},
// REDOS
{"a*b?a*", {{"aaa", 1}, {"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaba", 1}, {NULL}}}, {"a*b?a*", {{"aaa", 1}, {"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaba", 1}, {NULL}}},
{"aa*+", {{"aaaa", 1}, {NULL}}},
{"a*+", {{"a", 1}, {NULL}}},
} ; } ;
@@ -60,6 +79,11 @@ int main(void)
printf("\n\t%s\n", t.regex); printf("\n\t%s\n", t.regex);
nfa = rgx_compile(NULL, t.regex, 1); nfa = rgx_compile(NULL, t.regex, 1);
if (!nfa ){
printf("Malformed\n");
continue;
}
for( struct match * m = t.matches; m->s; ++m) for( struct match * m = t.matches; m->s; ++m)
{ {
int res; int res;