From 55a582230ce7109309cd2156ad73dff3b504ee07 Mon Sep 17 00:00:00 2001
From: Kyryl Melekhin <k.melekhin@gmail.com>
Date: Fri, 15 Oct 2021 17:44:52 +0000
Subject: [PATCH] replace error prone gen global state with better solution

---
 README | 47 ++++++++++++++++++++++++++++++++
 pike.c | 85 +++++++++++++++++++++++++---------------------------------
 2 files changed, 84 insertions(+), 48 deletions(-)

diff --git a/README b/README
index 420f220..703ec0a 100644
--- a/README
+++ b/README
@@ -94,6 +94,53 @@ is a must, this is the algorithm.
 Research has shown that it is possible to disambiguate NFA in polynomial time
 but it brings serious performance issues on non ambiguous inputs.
 
+This pikevm features an improved submatch extraction
+algorithm based on Russ Cox's original design.
+I, Kyryl Melekhin, have found a way to properly optimize the
+tracking of the 1st number in the submatch pair. Based on a simple
+observation of how the NFA is constructed, I noticed that
+there is no way for addthread1() to ever reach inner SAVE
+instructions in the regex, so that leaves tracking 2nd pairs by
+addthread1() irrelevant to the final results (except the need to
+initialize the sub after allocation). This improved the overall
+performance by 25%, which is massive considering that at the
+time there was nothing else left that could be done to make it faster.
+
+What are the on##list macros?
+Redundant states inside nlist can arise in a couple of
+ways, and have to do with the closure (star) operation a* and
+also with +. Due to the automata machine design, a split happens
+to be above the next consumed instruction, and if that
+state gets added onto the list we may segfault or give
+a wrong submatch result. Rsplit does not have this problem
+because it is generated below the consumer instruction, but
+it can still add redundant states. Overall this is extremely
+difficult to understand or explain, but this is just something
+we have to check for. We checked for this using an extra int inside
+the split instructions, so this left some global state inside the
+machine insts. Most of the time we just added to the next
+gen number and kept incrementing it forever. This leaves a small
+chance of overflowing the int and getting a run on a false state
+left from a previous use of the regex. Though if overflow never
+happens there is no chance of getting a false state. Overflows
+like this pose a high security threat if an attacker knows
+how many cycles are needed to overflow the gen variable and get
+an inconsistent result. It is possible to reset the marks if we
+near the overflow, but as you may guess that does not come
+for free.
+
+Currently I removed all dynamic global state from the instructions,
+fixing any overflow issue at the cost of a slight overhead of needing
+to look through the nlist states to prevent their readdition. This
+solution is still fast because it affects only nlist + split runs,
+so most other uses of regex don't suffer a big performance penalty.
+This does not solve the ambiguity problem with multiple
+continuous states though. Finding a fast solution for continuous
+ambiguity is the last thing preventing me from calling this regex engine
+PERFECT and limitation free. While this is yet to be invented, it
+takes a great deal of genius and creativity to make new algorithms
+or find improvements in what we already know.
+
 Author and License
 ==================
 licensed under BSD license, just as the original re1.
diff --git a/pike.c b/pike.c
index fb49c9d..15eb9ff 100644
--- a/pike.c
+++ b/pike.c
@@ -58,7 +58,6 @@ struct rcode
 	int sub;
 	int presub;
 	int splits;
-	int gen;
 	int insts[];
 };
 
@@ -93,6 +92,7 @@ typedef struct rsub rsub;
 struct rsub
 {
 	int ref;
+	rsub *freesub;
 	const char *sub[];
 };
 
@@ -134,12 +134,12 @@ void re_dumpcode(rcode *prog)
 			pc = prog->unilen;
 			break;
 		case SPLIT:
-			printf("split %d (%d)\n", pc + code[pc] + 2, code[pc]);
-			pc+=2;
+			printf("split %d (%d)\n", pc + code[pc] + 1, code[pc]);
+			pc++;
 			break;
 		case RSPLIT:
-			printf("rsplit %d (%d)\n", pc + code[pc] + 2, code[pc]);
-			pc+=2;
+			printf("rsplit %d (%d)\n", pc + code[pc] + 1, code[pc]);
+			pc++;
 			break;
 		case JMP:
 			printf("jmp %d (%d)\n", pc + code[pc] + 1, code[pc]);
@@ -295,17 +295,15 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
 			}
 			if (inf) {
 				EMIT(PC, RSPLIT);
-				EMIT(PC+1, REL(PC, PC - size -1));
-				EMIT(PC+2, 0);
-				PC += 3;
+				EMIT(PC+1, REL(PC, PC - size));
+				PC += 2;
 				prog->len++;
 				prog->splits++;
 				maxcnt = mincnt;
 			}
 			for (i = maxcnt-mincnt; i > 0; i--) {
 				EMIT(PC++, SPLIT);
-				EMIT(PC++, REL(PC-1, PC+((size+3)*i)));
-				EMIT(PC++, 0);
+				EMIT(PC++, REL(PC, PC+((size+2)*i)));
 				prog->splits++;
 				prog->len++;
 				if (code)
@@ -331,21 +329,20 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
 			break;
 		case '?':
 			if (PC == term) goto syntax_error;
-			INSERT_CODE(term, 3, PC);
+			INSERT_CODE(term, 2, PC);
 			if (re[1] == '?') {
 				EMIT(term, RSPLIT);
 				re++;
 			} else
 				EMIT(term, SPLIT);
-			EMIT(term + 1, REL(term, PC-1));
-			EMIT(term + 2, 0);
+			EMIT(term + 1, REL(term, PC));
 			prog->len++;
 			prog->splits++;
 			term = PC;
 			break;
 		case '*':
 			if (PC == term) goto syntax_error;
-			INSERT_CODE(term, 3, PC);
+			INSERT_CODE(term, 2, PC);
 			EMIT(PC, JMP);
 			EMIT(PC + 1, REL(PC, term));
 			PC += 2;
@@ -354,8 +351,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
 				re++;
 			} else
 				EMIT(term, SPLIT);
-			EMIT(term + 1, REL(term, PC-1));
-			EMIT(term + 2, 0);
+			EMIT(term + 1, REL(term, PC));
 			prog->splits++;
 			prog->len += 2;
 			term = PC;
@@ -367,9 +363,8 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
 				re++;
 			} else
 				EMIT(PC, RSPLIT);
-			EMIT(PC + 1, REL(PC-1, term));
-			EMIT(PC + 2, 0);
-			PC += 3;
+			EMIT(PC + 1, REL(PC, term));
+			PC += 2;
 			prog->splits++;
 			prog->len++;
 			term = PC;
@@ -377,12 +372,11 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
 		case '|':
 			if (alt_label)
 				alt_stack[altc++] = alt_label;
-			INSERT_CODE(start, 3, PC);
+			INSERT_CODE(start, 2, PC);
 			EMIT(PC++, JMP);
 			alt_label = PC++;
 			EMIT(start, SPLIT);
-			EMIT(start + 1, REL(start, PC-1));
-			EMIT(start + 2, 0);
+			EMIT(start + 1, REL(start, PC));
 			prog->splits++;
 			prog->len += 2;
 			term = PC;
@@ -403,7 +397,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
 	if (code && alt_label) {
 		EMIT(alt_label, REL(alt_label, PC) + 1);
 		for (int alts = altc; altc; altc--) {
-			int at = alt_stack[alts-altc]+altc*3;
+			int at = alt_stack[alts-altc]+altc*2;
 			EMIT(at, REL(at, PC) + 1);
 		}
 	}
@@ -435,7 +429,6 @@ int re_comp(rcode *prog, const char *re, int nsubs)
 	prog->sub = 0;
 	prog->presub = nsubs;
 	prog->splits = 0;
-	prog->gen = 1;
 
 	int res = _compilecode(&re, prog, /*sizecode*/0);
 	if (res < 0) return res;
@@ -450,27 +443,32 @@ int re_comp(rcode *prog, const char *re, int nsubs)
 	return RE_SUCCESS;
 }
 
-#define _return(state) \
-{ prog->gen = gen + 1; return state; } \
-
 #define newsub(init, copy) \
 if (freesub) \
-	{ s1 = freesub; freesub = (rsub*)s1->sub[0]; copy } \
+	{ s1 = freesub; freesub = s1->freesub; copy } \
 else \
 	{ s1 = (rsub*)&nsubs[suboff+=rsubsize]; init } \
 
 #define decref(csub) \
 if (--csub->ref == 0) { \
-	csub->sub[0] = (char*)freesub; \
+	csub->freesub = freesub; \
 	freesub = csub; \
 } \
 
 #define deccheck(nn) \
 { decref(nsub) goto rec_check##nn; } \
 
+#define onnlist(nn, list, listidx, when, pre) \
+when for (j = 0; j < listidx; j++) \
+	if (npc == list[j].pc) \
+		{ pre deccheck(nn) } \
+
+#define onclist(nn, list, listidx, i, pre) \
+
 #define fastrec(nn, list, listidx) \
 nsub->ref++; \
 if (*npc < WBEG) { \
+	on##list(nn, list, listidx, /*nop*/, subs[i++] = nsub;) \
 	list[listidx].sub = nsub; \
 	list[listidx++].pc = npc; \
 	npc = pcs[i]; \
@@ -487,18 +485,12 @@ memcpy(s1->sub, nsub->sub, osubp / 2);) \
 newsub(/*nop*/, /*nop*/) \
 memcpy(s1->sub, nsub->sub, osubp); \
 
-#define onnlist(nn) \
-if (npc[2] == gen) \
-	deccheck(nn) \
-npc[2] = gen; \
-
-#define onclist(nn) /* nop */ \
-
 #define addthread(nn, list, listidx) \
 { \
 	int i = 0; \
 	rec##nn: \
 	if (*npc < WBEG) { \
+		on##list(nn, list, listidx, if (i), /*nop*/) \
 		list[listidx].sub = nsub; \
 		list[listidx++].pc = npc; \
 		rec_check##nn: \
@@ -515,15 +507,13 @@ npc[2] = gen; \
 		npc += 2 + npc[1]; \
 		goto rec##nn; \
 	case SPLIT: \
-		on##list(nn) \
-		npc += 3; \
-		pcs[i] = npc + npc[-2]; \
+		npc += 2; \
+		pcs[i] = npc + npc[-1]; \
 		fastrec(nn, list, listidx) \
 	case RSPLIT: \
-		on##list(nn) \
-		npc += 3; \
+		npc += 2; \
 		pcs[i] = npc; \
-		npc += npc[-2]; \
+		npc += npc[-1]; \
 		fastrec(nn, list, listidx) \
 	case SAVE: \
 		if (nsub->ref > 1) { \
@@ -547,7 +537,7 @@ npc[2] = gen; \
 	case BOL: \
 		if (_sp != s) { \
 			if (!i && !listidx) \
-				_return(0) \
+				return 0; \
 			deccheck(nn) \
 		} \
 		npc++; goto rec##nn; \
@@ -561,7 +551,7 @@ npc[2] = gen; \
 int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
 {
 	int rsubsize = sizeof(rsub)+(sizeof(char*)*nsubp);
-	int i, j, c, gen, suboff = rsubsize, *npc;
+	int i, j, c, suboff = rsubsize, *npc;
 	int clistidx = 0, nlistidx = 0, osubp = nsubp * sizeof(char*);
 	const char *sp = s, *_sp = s;
 	int *insts = prog->insts;
@@ -571,10 +561,9 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
 	rsub *nsub, *s1, *matched = NULL, *freesub = NULL;
 	rthread _clist[prog->len], _nlist[prog->len];
 	rthread *clist = _clist, *nlist = _nlist, *tmp;
-	gen = prog->gen;
 	goto jmp_start;
 	for (;; sp = _sp) {
-		gen++; uc_len(i, sp) uc_code(c, sp)
+		uc_len(i, sp) uc_code(c, sp)
 		_sp = sp+i;
 		for (i = 0; i < clistidx; i++) {
 			npc = clist[i].pc;
@@ -625,9 +614,9 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
 			subp[i] = matched->sub[j];
 			subp[i+1] = matched->sub[nsubp / 2 + j];
 		}
-		_return(1)
+		return 1;
 	}
-	_return(0)
+	return 0;
 }
 
 int main(int argc, char *argv[])