From eb01f29134e0468f7289c64eb85be5a8d435a55d Mon Sep 17 00:00:00 2001
From: Kyryl Melekhin <k.melekhin@gmail.com>
Date: Thu, 17 Feb 2022 18:20:06 +0000
Subject: [PATCH] pike: improve size calculations

---
 README | 192 ++++++++++++++++++++++++++-------------------------------
 pike.c |  83 +++++++++++++------------
 2 files changed, 131 insertions(+), 144 deletions(-)

diff --git a/README b/README
index c50bc9a..86dc7b6 100644
--- a/README
+++ b/README
@@ -23,7 +23,7 @@ Features
 
 * UnLike re1.5, here is only pikevm, one file easy to use.
 * Unlike re1.5, regexes is compiled to type sized code rather than bytecode,
-eliviating the problem of byte overflow in splits/jmps on large regexes. 
+alleviating the problem of byte overflow in splits/jmps on large regexes.
 Currently the type used is int, and every atom in compiled code is aligned
 to that.
 * Matcher does not take size of string as param, it checks for '\0' instead,
@@ -54,121 +54,103 @@ NOTES
 The problem described in this paper has been fixed. Ambiguous matching is correct.
 HISTORY:
 https://re2c.org/2019_borsotti_trofimovich_efficient_posix_submatch_extraction_on_nfa.pdf
-"Cox, 2009 (incorrect). Cox came up with the idea of backward POSIX matching, 
-which is based on the observation that reversing the longest-match rule 
-simplifies the handling of iteration subexpressions: instead of maximizing 
-submatch from the first to the last iteration, one needs to maximize the 
-iterations in reverse order. This means that the disambiguation is always 
-based on the most recent iteration, removing the need to remember all previous 
-iterations (except for the backwards-first, i.e.  the last one, which contains 
-submatch result). The algorithm tracks two pairs of offsets per each submatch 
-group: the active pair (used for disambiguation) and the result pair. It gives 
-incorrect results under two conditions: (1) ambiguous matches have equal 
-offsets on some iteration, and (2) disambiguation happens too late, when 
-the active offsets have already been updated and the difference between 
-ambiguous matches is erased. We found that such situations may occur for two 
-reasons. First, the ε-closure algorithm may compare ambiguous paths after 
+"Cox, 2009 (incorrect). Cox came up with the idea of backward POSIX matching,
+which is based on the observation that reversing the longest-match rule
+simplifies the handling of iteration subexpressions: instead of maximizing
+submatch from the first to the last iteration, one needs to maximize the
+iterations in reverse order. This means that the disambiguation is always
+based on the most recent iteration, removing the need to remember all previous
+iterations (except for the backwards-first, i.e.  the last one, which contains
+submatch result). The algorithm tracks two pairs of offsets per each submatch
+group: the active pair (used for disambiguation) and the result pair. It gives
+incorrect results under two conditions: (1) ambiguous matches have equal
+offsets on some iteration, and (2) disambiguation happens too late, when
+the active offsets have already been updated and the difference between
+ambiguous matches is erased. We found that such situations may occur for two
+reasons. First, the ε-closure algorithm may compare ambiguous paths after
 their join point, when both paths have a common suffix with tagged
-transitions. This is the case with the Cox prototype implementation; for 
-example, it gives incorrect results for (aa|a)* and string aaaaa. Most of such 
-failures can be repaired by exploring states in topological order, but a 
-topological order does not exist in the presence of ε-loops. The second reason 
-is bounded repetition: ambiguous paths may not have an intermediate join point 
-at all. For example, in the case of (aaaa|aaa|a){3,4} and string aaaaaaaaaa we 
-have matches (aaaa)(aaaa)(a)(a) and (aaaa)(aaa)(aaa) with a different number 
-of iterations. Assuming that the bounded repetition is unrolled by chaining 
-three sub-automata for (aaaa|aaa|a) and an optional fourth one, by the time 
-ambiguous paths meet both have active offsets (0,4). Despite the flaw, Cox 
-algorithm is interesting: if somehow the delayed comparison problem was fixed, 
+transitions. This is the case with the Cox prototype implementation; for
+example, it gives incorrect results for (aa|a)* and string aaaaa. Most of such
+failures can be repaired by exploring states in topological order, but a
+topological order does not exist in the presence of ε-loops. The second reason
+is bounded repetition: ambiguous paths may not have an intermediate join point
+at all. For example, in the case of (aaaa|aaa|a){3,4} and string aaaaaaaaaa we
+have matches (aaaa)(aaaa)(a)(a) and (aaaa)(aaa)(aaa) with a different number
+of iterations. Assuming that the bounded repetition is unrolled by chaining
+three sub-automata for (aaaa|aaa|a) and an optional fourth one, by the time
+ambiguous paths meet both have active offsets (0,4). Despite the flaw, Cox
+algorithm is interesting: if somehow the delayed comparison problem was fixed,
 it would work.  The algorithm requires O(mt) memory and O(nm^2t) time
 (assuming a worst-case optimal closure algorithm), where n is the
-length of input, m it the size of RE and t is the number of submatch groups 
+length of input, m is the size of RE and t is the number of submatch groups
 and subexpressions that contain them."
 
 Research has shown that it is possible to disambiguate NFA in polynomial time
-but it brings serious performance issues on non ambiguous inputs.
-See the branch "disambiguate_paths" on this repo shows what is being
-done to solve it and the potential performance costs. In short it
-requires tracking the parent of every state added on nlist from clist.
-If the state from nlist matches the consumer, the alternative clist
-state related to that nlist state gets discarded and the nsub ref
-can be decremented (freed). The reason why this problem does not
-exist for non ambiguous regexes is because the alternative clist
-state will never match due to the next state having a different
-consumer. There is no need for any extra handling it gets freed normally.
-I decided to not apply this solution here because I think
-most use cases for regex are not ambiguious like say regex:
-"a{10000}". If you try matching 10000 'a' characters in a row
-like that you will have a problem where the stack usage will
-jump up to 10000*(subsize) but it will never exceed the size
-of regex though, but the number of NFA states will also increase
-by the same amount, so at the charater 9999 you will find
-9999 redundant nlist states, that will degrade performance
-linearly, however it will be very slow compared to uplimited
-regex like a+. The cost of this solution is somewhere around
-2% general performance decrease (broadly), but a magnitude of 
-complexity decrease for ambiguous cases, for example
-matching 64 characters went down from 30 to 9 microseconds.
-Another solution to this problem can be to determine the
-ambiguous paths at compile time and flag the inner
-states as ambiguous ahead of time, still this can't avoid
-having a loop though the alt states as their positioning
-in clist can't be precomputed due to the dynamic changes.
-
+but it brings serious performance issues on non ambiguous inputs.  The
+branch "disambiguate_paths" on this repo shows what is being done to solve it
+and the potential performance costs. In short it requires tracking the parent
+of every state added on nlist from clist.  If the state from nlist matches
+the consumer, the alternative clist state related to that nlist state gets
+discarded and the nsub ref can be decremented (freed). The reason why this
+problem does not exist for non ambiguous regexes is because the alternative
+clist state will never match due to the next state having a different
+consumer. There is no need for any extra handling; it gets freed normally.
+I decided to not apply this solution here because I think most use cases for
+regex are not ambiguous like say regex: "a{10000}". If you try matching 10000
+'a' characters in a row like that you will have a problem where the stack
+usage will jump up to 10000*(subsize) but it will never exceed the size of
+regex though, but the number of NFA states will also increase by the same
+amount, so at the character 9999 you will find 9999 redundant nlist states,
+that will degrade performance linearly, however it will be very slow compared
+to unlimited regex like a+. The cost of this solution is somewhere around 2%
+general performance decrease (broadly), but a magnitude of complexity
+decrease for ambiguous cases, for example matching 64 characters went down
+from 30 to 9 microseconds.  Another solution to this problem can be to
+determine the ambiguous paths at compile time and flag the inner states as
+ambiguous ahead of time, still this can't avoid having a loop through the alt
+states as their positioning in clist can't be precomputed due to the dynamic
+changes.
 (Comment about O(mt) memory complexity)
-This worst case scenario can only happen on ambiguous input, that is why nsubs
-size is set to half a MB just in case, this can match 5000000 
-ambiguous consumers (char, class, any) assuming t is 1. In practice there
-is almost never a situation where someone wants to search using regex this
-large. Use of alloca() instead of VLA, could remove this limit, I just wish
-it was standardized. If you ever wondered about a situation where alloca
-is a must, this is the algorithm.
-Most of the time memory usage is very low and the space
-complexity for non ambigious regex is O(nt) where n is
-the number of currently considering alternate paths
-in the regex and t is the number of submatch groups.
+This worst case scenario can only happen on ambiguous input, i.e. ambiguous
+consumers (char, class, any), assuming t is 1. In practice there is almost
+never a situation where someone wants to search using regex this large. Most
+of the time memory usage is very low and the space complexity for non
+ambiguous regex is O(nt) where n is the number of alternate paths currently
+being considered in the regex and t is the number of submatch groups.
 
-This pikevm features an improved submatch extraction
-algorithm based on Russ Cox's original design. 
-I - Kyryl Melekhin have found a way to optimize the tracking
-properly of 1st number in the submatch pair. Based on simple
-observation of how the NFA is constructed I noticed that
-there is no way for addthread1() to ever reach inner SAVE
-instructions in the regex, so that leaves tracking 2nd pairs by
-addthread1() irrelevant to the final results (except the need to 
-initialize the sub after allocation). This improved the overall 
-performance by 25% which is massive considering that at the 
-time there was nothing else left to can be done to make it faster.
+This pikevm implementation features an improved submatch extraction algorithm
+based on Russ Cox's original design.  I - Kyryl Melekhin have found a way to
+optimize the tracking properly of 1st number in the submatch pair. Based on
+simple observation of how the NFA is constructed I noticed that there is no
+way for addthread1() to ever reach inner SAVE instructions in the regex, so
+that leaves tracking 2nd pairs by addthread1() irrelevant to the final
+results (except the need to initialize the sub after allocation). This
+improved the overall performance by 25% which is massive considering that at
+the time there was nothing else left that could be done to make it faster.
 
 What are on##list macros?
-Redundant state inside nlist can happen in couple of
-ways, and has to do with the (closure) a* (star) operations and
-also +. Due to the automata machine design split happens
-to be above the next consumed instruction and if that
-state gets added onto the list we may segfault or give
-wrong submatch result. Rsplit does not have this problem
-because it is generated below the consumer instruction, but
-it can still add redundant states. Overall this is extremely
-difficult to understand or explain, but this is just something
-we have to check for. We checked for this using extra int inside
-the split instructions, so this left some global state inside the
-machine insts. Most of the time we just added to the next
-gen number and kept incrementing it forever. This leaves a small
-chance of overflowing the int and getting a run on a false state
-left from previous use of the regex. Though if overflow never
-happens there is no chance of getting a false state. Overflows
-like this pose a high security threat, if the hacker knows
-how many cycles he needs to overflow the gen variable and get
-inconsistent result. It is possible to reset the marks if we
-near the overflow, but as you may guess that does not come
-for free.
+Redundant state inside nlist can happen in a couple of ways, and has to do with
+the (closure) a* (star) operations and also +. Due to the automata machine
+design split happens to be above the next consumed instruction and if that
+state gets added onto the list we may segfault or give wrong submatch result.
+Rsplit does not have this problem because it is generated below the consumer
+instruction, but it can still add redundant states. Overall this is extremely
+difficult to understand or explain, but this is just something we have to
+check for. We checked for this using extra int inside the split instructions,
+so this left some global state inside the machine insts. Most of the time we
+just added to the next gen number and kept incrementing it forever. This
+leaves a small chance of overflowing the int and getting a run on a false
+state left from previous use of the regex. Though if overflow never happens
+there is no chance of getting a false state. Overflows like this pose a high
+security threat, if the hacker knows how many cycles he needs to overflow the
+gen variable and get inconsistent result. It is possible to reset the marks
+if we near the overflow, but as you may guess that does not come for free.
 
-Currently I removed all dynamic global state from the instructions
-fixing any overlow issue utilizing a sparse set datastructure trick
-which abuses the uninitialized varibles. This allows the redundant
-states to be excluded in O(1) operation. That said, don't run
-valgrind on pikevm as it will go crazy, or find a way to surpress
-errors from pikevm.
+Currently I removed all dynamic global state from the instructions fixing any
+overflow issue utilizing a sparse set data structure trick which abuses the
+uninitialized variables. This allows the redundant states to be excluded in
+O(1) operation. That said, don't run valgrind on pikevm as it will go crazy, or
+find a way to suppress errors from pikevm.
 
 Further reading
 ===============
diff --git a/pike.c b/pike.c
index 08951da..cab0eca 100644
--- a/pike.c
+++ b/pike.c
@@ -1,5 +1,8 @@
-// Copyright 2007-2009 Russ Cox.  All Rights Reserved.
-// Use of this source code is governed by a BSD-style
+/*
+Copyright 2007-2009 Russ Cox.  All Rights Reserved.
+Copyright 2020-2021 Kyryl Melekhin.  All Rights Reserved.
+Use of this source code is governed by a BSD-style license.
+*/
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -51,35 +54,36 @@ static int isword(const char *s)
 typedef struct rcode rcode;
 struct rcode
 {
-	int unilen;
-	int len;
-	int sub;
-	int presub;
-	int splits;
-	int insts[];
+	int unilen;	/* number of integers in insts */
+	int len;	/* number of atoms/instructions */
+	int sub;	/* interim val = save count; final val = nsubs size */
+	int presub;	/* interim val = save count; final val = 1 rsub size */
+	int splits;	/* number of split insts */
+	int sparsesz;	/* sdense size */
+	int insts[];	/* re code */
 };
 
 enum
 {
-	// Instructions which consume input bytes (and thus fail if none left)
+	/* Instructions which consume input bytes */
 	CHAR = 1,
 	CLASS,
 	MATCH,
 	ANY,
-	// Assert position
+	/* Assert position */
 	WBEG,
 	WEND,
 	BOL,
 	EOL,
-	// Other (special) instructions
+	/* Other (special) instructions */
 	SAVE,
-	// Instructions which take relative offset as arg
+	/* Instructions which take relative offset as arg */
 	JMP,
 	SPLIT,
 	RSPLIT,
 };
 
-// Return codes for re_sizecode() and re_comp()
+/* Return codes for re_sizecode() and re_comp() */
 enum {
 	RE_SUCCESS = 0,
 	RE_SYNTAX_ERROR = -2,
@@ -111,7 +115,7 @@ pc += num;
 
 static int re_classmatch(const int *pc, int c)
 {
-	// pc points to "classnot" byte after opcode
+	/* pc points to "classnot" byte after opcode */
 	int is_positive = *pc++;
 	int cnt = *pc++;
 	while (cnt--) {
@@ -176,7 +180,7 @@ void re_dumpcode(rcode *prog)
 			break;
 		}
 	}
-	printf("Unilen: %d, insts: %d, splits: %d, counted insts: %d\n",
+	printf("unilen: %d, insts: %d, splits: %d, counted insts: %d\n",
 		prog->unilen, prog->len, prog->splits, i);
 }
 
@@ -196,7 +200,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
 		switch (*re) {
 		case '\\':
 			re++;
-			if (!*re) goto syntax_error; // Trailing backslash
+			if (!*re) goto syntax_error; /* Trailing backslash */
 			if (*re == '<' || *re == '>') {
 				if (re - *re_loc > 2 && re[-2] == '\\')
 					break;
@@ -223,7 +227,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
 				re++;
 			} else
 				EMIT(PC++, 1);
-			PC++; // Skip "# of pairs" byte
+			PC++; /* Skip "# of pairs" byte */
 			for (cnt = 0; *re != ']'; cnt++) {
 				if (*re == '\\') re++;
 				if (!*re) goto syntax_error;
@@ -372,9 +376,8 @@ int re_sizecode(const char *re, int *nsub)
 	dummyprog.unilen = 3;
 	dummyprog.sub = 0;
 
-	int res = _compilecode(&re, &dummyprog, /*sizecode*/1);
+	int res = _compilecode(&re, &dummyprog, 1);
 	if (res < 0) return res;
-	// If unparsed chars left
 	if (*re) return RE_SYNTAX_ERROR;
 	*nsub = dummyprog.sub;
 	return dummyprog.unilen;
@@ -388,9 +391,8 @@ int re_comp(rcode *prog, const char *re, int nsubs)
 	prog->presub = nsubs;
 	prog->splits = 0;
 
-	int res = _compilecode(&re, prog, /*sizecode*/0);
+	int res = _compilecode(&re, prog, 0);
 	if (res < 0) return res;
-	// If unparsed chars left
 	if (*re) return RE_SYNTAX_ERROR;
 	int icnt = 0, scnt = SPLIT;
 	for (int i = 0; i < prog->unilen; i++)
@@ -417,8 +419,11 @@ int re_comp(rcode *prog, const char *re, int nsubs)
 	prog->insts[prog->unilen++] = SAVE;
 	prog->insts[prog->unilen++] = prog->sub + 1;
 	prog->insts[prog->unilen++] = MATCH;
-	prog->splits = (scnt - SPLIT) / 2 + SPLIT;
-	prog->len = icnt+2;
+	prog->splits = (scnt - SPLIT) / 2;
+	prog->len = icnt + 2;
+	prog->presub = sizeof(rsub)+(sizeof(char*) * (nsubs + 1) * 2);
+	prog->sub = prog->presub * (prog->len - prog->splits + 4);
+	prog->sparsesz = (scnt - 2) * 2;
 	return RE_SUCCESS;
 }
 
@@ -434,8 +439,14 @@ if (--csub->ref == 0) { \
 	freesub = csub; \
 } \
 
-#define deccheck(nn) \
-{ decref(nsub) goto rec_check##nn; } \
+#define rec_check(nn) \
+if (si) { \
+	npc = pcs[--si]; \
+	nsub = subs[si]; \
+	goto rec##nn; \
+} \
+
+#define deccheck(nn) { decref(nsub) rec_check(nn) continue; } \
 
 #define onclist(nn)
 #define onnlist(nn) \
@@ -493,19 +504,13 @@ if (spc == MATCH) \
 	} \
 
 #define addthread(nn, list, listidx) \
-si = 0; \
 rec##nn: \
 spc = *npc; \
 if ((unsigned int)spc < WBEG) { \
 	list[listidx].sub = nsub; \
 	list[listidx++].pc = npc; \
+	rec_check(nn) \
 	list##match() \
-	rec_check##nn: \
-	if (si) { \
-		npc = pcs[--si]; \
-		nsub = subs[si]; \
-		goto rec##nn; \
-	} \
 	continue; \
 } \
 next##nn: \
@@ -557,18 +562,18 @@ clistidx = nlistidx; \
 
 int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
 {
-	int rsubsize = sizeof(rsub)+(sizeof(char*)*nsubp);
-	int si, i, j, c, suboff = rsubsize, *npc, osubp = nsubp * sizeof(char*);
-	int clistidx = 0, nlistidx, spc, mcont = MATCH;
+	int rsubsize = prog->presub, suboff = rsubsize;
+	int spc, i, j, c, *npc, osubp = nsubp * sizeof(char*);
+	int si = 0, clistidx = 0, nlistidx, mcont = MATCH;
 	const char *sp = s, *_sp = s;
 	int *insts = prog->insts;
 	int *pcs[prog->splits];
-	unsigned int sdense[prog->splits * 2], sparsesz;
 	rsub *subs[prog->splits];
-	char nsubs[rsubsize * (prog->len-prog->splits+14)];
+	unsigned int sdense[prog->sparsesz], sparsesz;
 	rsub *nsub, *s1, *matched = NULL, *freesub = NULL;
 	rthread _clist[prog->len], _nlist[prog->len];
 	rthread *clist = _clist, *nlist = _nlist, *tmp;
+	char nsubs[prog->sub];
 	goto jmp_start;
 	for (;; sp = _sp) {
 		uc_len(i, sp) uc_code(c, sp)
@@ -651,10 +656,10 @@ int main(int argc, char *argv[])
 			printf("Done in %f seconds\n", elapsed_time);
 			if (!sz)
 				{ printf("-nomatch-\n"); continue; }
-			for (int k=sub_els; k>0; k--)
+			for (int k = sub_els; k > 0; k--)
 				if (sub[k-1])
 					break;
-			for (int l=0; l<sub_els; l+=2) {
+			for (int l = 0; l < sub_els; l+=2) {
 				printf("(");
 				if (sub[l] == NULL || sub[l+1] == NULL)
 					printf("?");