Make Union and Blocks, avoid postfix notation

This commit is contained in:
PedroEdiaz
2025-12-10 15:21:56 -06:00
parent 6acef48f09
commit e4f67ab3fe
4 changed files with 232 additions and 80 deletions

View File

@@ -1,5 +1,6 @@
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include "main.h"
struct patch_list
@@ -11,10 +12,10 @@ struct patch_list
/* A partially built NFA fragment, as kept on the compile stack. */
struct frag
{
struct rgx_nfa *start; /* first node of the fragment; predecessors get patched to point here */
struct patch_list *out; /* name used by the older code shown in this diff (e.g. patch(b.out, r)) */
struct patch_list *pl; /* name used by the newer code: node pointers still waiting to be patched to a successor */
};
struct patch_list * append(struct patch_list * pl, struct rgx_nfa ** nfa)
struct patch_list * pl_append(struct patch_list * pl, struct rgx_nfa ** nfa)
{
struct patch_list * res, * root = pl;
res = malloc(sizeof(struct patch_list));
@@ -33,7 +34,25 @@ struct patch_list * append(struct patch_list * pl, struct rgx_nfa ** nfa)
return root;
}
struct rgx_nfa * patch(struct patch_list *pl, struct rgx_nfa *to) {
/*
 * Append the list pl2 to the end of the list *pl1.
 *
 * pl1: address of the head pointer of the destination list; when that list
 *      is empty the head itself is set to pl2.
 * pl2: list to attach (may be NULL, in which case nothing changes).
 *
 * No nodes are copied: pl2's nodes become the tail of *pl1.
 */
void pl_merge(struct patch_list ** pl1, struct patch_list * pl2)
{
	/* Walk the chain of "next" links until the terminating NULL slot. */
	struct patch_list ** tail = pl1;

	while (*tail)
		tail = &(*tail)->next;

	*tail = pl2;
}
struct rgx_nfa * patch(struct patch_list *pl, struct rgx_nfa *to)
{
while (pl) {
*pl->ptr = to;
pl = pl->next;
@@ -41,6 +60,29 @@ struct rgx_nfa * patch(struct patch_list *pl, struct rgx_nfa *to) {
return to;
}
void collapse(struct frag * stack, unsigned char * i)
{
if (stack[*i].start->op == BLOCK ) {
stack[(*i)--] ;
return;
}
while (*i > 0 )
{
struct frag to;
to = stack[(*i)--];
if (stack[*i].start->op == BLOCK ) {
stack[*i]= to;
return;
}
patch(stack[*i].pl, to.start);
stack[*i].pl = to.pl;
}
}
struct rgx_nfa * mknfa(struct rgx_nfa nfa)
{
struct rgx_nfa * res;
@@ -50,91 +92,115 @@ struct rgx_nfa * mknfa(struct rgx_nfa nfa)
}
struct rgx_nfa * rgx_compile(struct rgx_nfa * l, char * s, int v) {
struct rgx_nfa *r;
unsigned char i=-1;
struct frag stack[0xff], a, b;
struct frag stack[0xff];
struct patch_list * union_pl = NULL;
if (l) {
struct rgx_nfa *r;
r = mknfa((struct rgx_nfa){UNION, {l, NULL}, 1});
stack[++i] = (struct frag){r, pl_append(NULL, &r->node[1])};
}
for(;*s; ++s)
{
struct rgx_nfa *r;
struct frag a, b;
switch(*s)
{
case '(':
r = mknfa((struct rgx_nfa){BLOCK, {NULL}, 0});
stack[++i] = (struct frag){r, NULL };
break;
case ')':
collapse(stack, &i);
pl_merge(&stack[i].pl, union_pl);
union_pl=NULL;
break;
case '|':
collapse(stack, &i);
pl_merge(&union_pl, stack[i].pl);
if (i!= 0) {
a = stack[i--];
r = mknfa((struct rgx_nfa){BLOCK, {NULL}, 0});
stack[++i] = (struct frag){r, NULL };
stack[++i]= a;
}
r = mknfa((struct rgx_nfa){UNION, {stack[i].start, NULL}, 0});
stack[i] = (struct frag){r, pl_append(NULL, &r->node[1])};
break;
case '\\':
switch(*(++s))
{
case 's':
case 'd':
case 'w':
a.start = mknfa((struct rgx_nfa){CLASS, {NULL}, *s});
a.pl = pl_append(NULL, &a.start->node[0]);
stack[++i] = a;
break;
case '\\':
case '|':
case '(':
case ')':
case '*':
b = stack[i--];
if (i==0xff)
{
r = mknfa((struct rgx_nfa){UNION, {b.start, NULL}});
patch(b.out, r);
stack[++i] = (struct frag){r, append(NULL, &r->node[1])};
break;
}
else
{
a = stack[i--];
r = mknfa((struct rgx_nfa){UNION, {b.start, NULL}});
patch(a.out, r);
patch(b.out, r);
a.out = append(NULL, &r->node[1]);
stack[++i] = a;
break;
}
case '?':
b = stack[i--];
if (i==0xff)
{
r = mknfa((struct rgx_nfa){UNION, {b.start, NULL}});
b.out = append(b.out, &r->node[1]);
stack[++i] = (struct frag){r, b.out };
break;
}
else
{
a = stack[i--];
r = mknfa((struct rgx_nfa){UNION, {b.start, NULL}});
a.out = append(b.out, &patch(a.out, r)->node[1]);
stack[++i] = a;
break;
}
case '+':
// This implementation may cause an infinite loop that crashes.
a = stack[i--];
r = mknfa((struct rgx_nfa){UNION, {NULL, a.start}});
a.out = append(NULL, &patch(a.out, r)->node[0]);
case '?':
a.start = mknfa((struct rgx_nfa){RUNE, {NULL}, *s});
a.pl = pl_append(NULL, &a.start->node[0]);
stack[++i] = a;
break;
default:
a.start = mknfa((struct rgx_nfa){RUNE, {NULL}, '\\'});
a.pl = pl_append(NULL, &a.start->node[0]);
stack[++i] = a;
a.start = mknfa((struct rgx_nfa){RUNE, {NULL}, *s});
a.pl = pl_append(NULL, &a.start->node[0]);
stack[++i] = a;
break;
}
break;
case '*':
r = mknfa((struct rgx_nfa){UNION, {stack[i].start, NULL}, 0});
patch(stack[i].pl, r);
stack[i] = (struct frag){r, pl_append(NULL, &r->node[1])};
break;
case '?':
r = mknfa((struct rgx_nfa){UNION, {stack[i].start, NULL}, 0});
stack[i] = (struct frag){r, pl_append(stack[i].pl, &r->node[1]) };
break;
case '+':
r = mknfa((struct rgx_nfa){UNION, {stack[i].start, NULL}, 0});
stack[i].pl = pl_append(NULL, &patch(stack[i].pl, r)->node[1]);
break;
case '.':
a.start = mknfa((struct rgx_nfa){C_ANY, {NULL}});
a.out = append(NULL, &a.start->node[0]);
a.start = mknfa((struct rgx_nfa){ANY, {NULL}});
a.pl = pl_append(NULL, &a.start->node[0]);
stack[++i] = a;
break;
default:
a.start = mknfa((struct rgx_nfa){RUNE, {NULL}, *s});
a.out = append(NULL, &a.start->node[0]);
a.pl = pl_append(NULL, &a.start->node[0]);
stack[++i] = a;
break;
}
}
// Append Final
{
stack[++i].start = mknfa((struct rgx_nfa){FINAL, {NULL}, v});
stack[i].out = append(NULL, &stack[i].start->node[0]);
}
// Collapse Stack
while (i > 0)
{
struct frag to;
to = stack[i--];
patch(stack[i].out, to.start);
// Collapse and add FINAL state
while ( i!=0 ) {
if (stack[i].start->op == BLOCK )
stack[i--];
collapse(stack, &i);
}
pl_merge(&stack[i].pl, union_pl);
union_pl = NULL;
patch(stack[i].pl, mknfa((struct rgx_nfa){FINAL, {NULL}, v}));
return stack[i].start;
}
@@ -144,19 +210,47 @@ int rgx_run(struct rgx_nfa *l, char *s) {
switch (l->op)
{
case FINAL:
//printf("FINAL: %c\n", *s? *s:'0');
return *s ? 0: l->c;
case C_ANY:
case ANY:
//printf("ANY: %c\n", *s? *s:'0');
if (!*(s++))
return 0;
l = l->node[0];
break;
case CLASS:
//printf("CLASS(%c): %c\n", l->c, *s? *s: '0');
switch(l->c)
{
case 's':
if(!isspace(*(s++)))
return 0;
l = l->node[0];
break;
case 'd':
if(!isdigit(*(s++)))
return 0;
l = l->node[0];
break;
case 'w':
if(!isalpha(*(s++)))
return 0;
l = l->node[0];
break;
default:
return 0;
}
break;
case RUNE:
//printf("RUNE(%c): %c\n", l->c, *s?*s:'0');
if (*(s++) != l->c)
return 0;
l = l->node[0];
break;
case UNION:
//printf("{\n");
int res = rgx_run(l->node[0], s);
//printf("}\n");
if (res)
return res;
l = l->node[1];

View File

@@ -0,0 +1,34 @@
#include <stdio.h>
#include "main.h"
/*
 * NULL-terminated table of patterns for rgx_compile().
 * Each entry is followed by the pattern text as the regex compiler sees it,
 * i.e. after the C string escapes have been removed.
 * NOTE(review): these look like lexer token patterns; confirm with the caller.
 */
char *tokens[] = {
	"\"(\\\"|\"!)*\"",	/* "(\"|"!)*" */
	"\\d+",			/* \d+ */
	"L|R|N",		/* L|R|N */
	"\\w+",			/* \w+ */
	"\\s+",			/* \s+ */
	"[\\s*",		/* [\s* */
	"\\s*]",		/* \s*] */
	"\\s*:\\s*",		/* \s*:\s* */
	"\\s*\\|\\s*",		/* \s*\|\s* */
	"\\s*<\\s*",		/* \s*<\s* */
	"\\s*;",		/* \s*; */
	NULL			/* end-of-table sentinel */
};
/*
 * Manual smoke test: compiles the first entry of tokens[] and prints the
 * result of rgx_run() for a handful of sample inputs, one per line pair.
 *
 * Returns 0 after printing all results, or 1 if compilation yields NULL.
 */
int main(void)
{
	/* Sample inputs, run in this order against the compiled pattern. */
	char *inputs[] = {
		"\"\"",
		"\"b\"",
		"\"bb\"",
		"\"\"\"",
		"\"\\\"\"",
	};
	struct rgx_nfa *l;
	size_t k;

	/*
	 * Only tokens[0] is compiled for now. Compiling every entry would mean
	 * feeding the previous result back in as the first argument.
	 */
	l = rgx_compile(NULL, tokens[0], 1);
	if (!l) {
		/* rgx_run() dereferences its NFA argument, so stop here. */
		printf("Malformed\n");
		return 1;
	}

	for (k = 0; k < sizeof inputs / sizeof inputs[0]; ++k)
		printf("\n%d\n", rgx_run(l, inputs[k]));

	return 0;
}

View File

@@ -1,6 +1,6 @@
/* One node of the regex NFA. */
struct rgx_nfa
{
enum {RUNE, C_ANY, UNION, FINAL} op; /* earlier set of node kinds shown in this diff */
enum {RUNE, ANY, UNION, FINAL, CLASS, BLOCK} op; /* newer set: C_ANY becomes ANY, CLASS and BLOCK are added */
struct rgx_nfa *node[2]; /* successor pointers; the matcher follows node[0], and for UNION tries node[0] before node[1] */
unsigned char c; /* RUNE: character to match; CLASS: class letter ('s', 'd' or 'w'); FINAL: value returned on a match */
};

View File

@@ -24,27 +24,46 @@ struct test
{"a.*c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"acc", 1}, {"ac", 1}, {"abbc", 1}, {NULL}}},
{"a.?c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"acc", 1}, {"ac", 1}, {NULL}}},
// TODO: Implement NFA to DFA to avoid loops
// Equal to *
{"ab+?c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 1}, {NULL}}},
{"ab*?c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 1}, {NULL}}},
{"ab?+c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"ababc", 1}, {"ac", 1}, {NULL}}},
{"ab?*c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"ababc", 1}, {"ac", 1}, {NULL}}},
// . Alt
//{"a|b", {{"a", 1},{"abc", 0}, {"b", 1 }, {"ba", 0}, {NULL}}},
{"ab??c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 0}, {"ac", 1}, {"c", 1}, {NULL}}},
{"ab++c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 0}, {"c", 0}, {"ababc",0}, {NULL}}},
{"ab**c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 1}, {"c", 1}, {"ababc",1}, {NULL}}},
// Xd
{"a?", {{"aaaa", 0}, {"a", 1}, {NULL}}},
{"a*", {{"aaaa", 1}, {"a", 1}, {NULL}}},
// REDOS
//{"ab?+c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"ababc", 1}, {"ac", 1}, {NULL}}},
//{"ab?*c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"ababc", 1}, {"ac", 1}, {NULL}}},
{"a+*", {{"aaaa", 1}, {"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1}, {NULL}}},
//{"a**", {{"aaaa", 1}, {"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1}, {NULL}}},
// Nilpotent
{"ab??c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 0}, {"ac", 1}, {"c", 0}, {NULL}}},
{"ab++c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 0}, {"c", 0}, {"ababc",0}, {NULL}}},
//{"ab**c", {{"a", 0},{"abc", 1}, {"abcd",0 }, {"abbc", 1}, {"ac", 1}, {"c", 1}, {"ababc",1}, {NULL}}},
// UNION
{"a()", {{"a", 1},{"abc", 0}, {"b", 0 }, {"ba", 0}, {NULL}}},
{"a(b)", {{"ab", 1},{"abc", 0}, {"b", 0 }, {"ba", 0}, {NULL}}},
{"a(b)*", {{"ab", 1},{"abb", 1}, {"a", 1 }, {"ba", 0}, {NULL}}},
{"a|b", {{"a", 1},{"abc", 0}, {"b", 1 }, {"ba", 0}, {NULL}}},
{"(a|.)*", {{"a", 1},{"abc", 1}, {"b", 1 }, {"ba", 1}, {NULL}}},
{"(.|a)*", {{"a", 1},{"abc", 1}, {"b", 1 }, {"ba", 1}, {NULL}}},
{"(b|a)*", {{"a", 1},{"cab", 0}, {"b", 1 }, {"ab", 1}, {"cacb", 0}, {NULL}}},
{"c(b|a)*", {{"ca", 1},{"cab", 1}, {"b", 0 }, {"bb", 0}, {"cacb", 0}, {NULL}}},
{"ab|b", {{"ab", 1}, {"b", 1}, {"abb", 0}, {NULL}}},
{"a|b|c", {{"a", 1}, {"b", 1}, {"c", 1},{"d",0}, {NULL}}},
{"((a|b)|c)d", {{"ad", 1}, {"bd", 1}, {"cd", 1},{"a",0}, {NULL}}},
// BLOCK
{"(abb", {{"abb", 1}, {"ab", 0}, {"b", 0}, {NULL}}},
{"abb)", {{"abb", 1}, {"ab", 0}, {"b", 0}, {NULL}}},
{"a(bb", {{"abb", 1}, {"ab", 0}, {"b", 0}, {NULL}}},
{"ab)b", {{"abb", 1}, {"ab", 0}, {"b", 0}, {NULL}}},
{"(ab)b", {{"abb", 1}, {"ab", 0}, {"b", 0}, {NULL}}},
{"(ab)+", {{"ab", 1}, {"abab", 1}, {"b", 0}, {NULL}}},
{"(a(bc)*)", {{"a", 1}, {"abc", 1}, {"abb", 0}, {NULL}}},
{"(ab|cd)", {{"ab", 1}, {"cd", 1}, {"abcd", 0}, {NULL}}},
// REDOS
{"a*b?a*", {{"aaa", 1}, {"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaba", 1}, {NULL}}},
{"aa*+", {{"aaaa", 1}, {NULL}}},
{"a*+", {{"a", 1}, {NULL}}},
} ;
@@ -60,6 +79,11 @@ int main(void)
printf("\n\t%s\n", t.regex);
nfa = rgx_compile(NULL, t.regex, 1);
if (!nfa ){
printf("Malformed\n");
continue;
}
for( struct match * m = t.matches; m->s; ++m)
{
int res;