pike: improve size calculations

2022-02-17 18:20:06 +00:00
parent 281820d7c9
commit eb01f29134
2 changed files with 131 additions and 144 deletions
--- a/138
+++ b/138
@@ -86,89 +86,71 @@ length of input, m it the size of RE and t is the number of submatch groups
 and subexpressions that contain them."

 Research has shown that it is possible to disambiguate NFA in polynomial time
-but it brings serious performance issues on non ambiguous inputs.
-See the branch "disambiguate_paths" on this repo shows what is being
-done to solve it and the potential performance costs. In short it
-requires tracking the parent of every state added on nlist from clist.
-If the state from nlist matches the consumer, the alternative clist
-state related to that nlist state gets discarded and the nsub ref
-can be decremented (freed). The reason why this problem does not
-exist for non ambiguous regexes is because the alternative clist
-state will never match due to the next state having a different
-consumer. There is no need for any extra handling it gets freed normally.
-I decided to not apply this solution here because I think
-most use cases for regex are not ambiguious like say regex:
-"a{10000}". If you try matching 10000 'a' characters in a row
-like that you will have a problem where the stack usage will
-jump up to 10000*(subsize) but it will never exceed the size
-of regex though, but the number of NFA states will also increase
-by the same amount, so at the charater 9999 you will find
-9999 redundant nlist states, that will degrade performance
-linearly, however it will be very slow compared to uplimited
-regex like a+. The cost of this solution is somewhere around
-2% general performance decrease (broadly), but a magnitude of 
-complexity decrease for ambiguous cases, for example
-matching 64 characters went down from 30 to 9 microseconds.
-Another solution to this problem can be to determine the
-ambiguous paths at compile time and flag the inner
-states as ambiguous ahead of time, still this can't avoid
-having a loop though the alt states as their positioning
-in clist can't be precomputed due to the dynamic changes.
-
+but it brings serious performance issues on non ambiguous inputs.  The
+branch "disambiguate_paths" on this repo shows what is being done to solve it
+and the potential performance costs. In short it requires tracking the parent
+of every state added on nlist from clist.  If the state from nlist matches
+the consumer, the alternative clist state related to that nlist state gets
+discarded and the nsub ref can be decremented (freed). The reason why this
+problem does not exist for non ambiguous regexes is because the alternative
+clist state will never match due to the next state having a different consumer.
+There is no need for any extra handling; it gets freed normally.  I decided
+to not apply this solution here because I think most use cases for regex are
+not ambiguous like say regex: "a{10000}". If you try matching 10000 'a'
+characters in a row like that you will have a problem where the stack usage
+will jump up to 10000*(subsize) but it will never exceed the size of regex
+though, but the number of NFA states will also increase by the same amount,
+so at the character 9999 you will find 9999 redundant nlist states, that will
+degrade performance linearly, however it will be very slow compared to
+unlimited regex like a+. The cost of this solution is somewhere around 2%
+general performance decrease (broadly), but a magnitude of complexity
+decrease for ambiguous cases, for example matching 64 characters went down
+from 30 to 9 microseconds.  Another solution to this problem can be to
+determine the ambiguous paths at compile time and flag the inner states as
+ambiguous ahead of time, still this can't avoid having a loop through the alt
+states as their positioning in clist can't be precomputed due to the dynamic
+changes.
 (Comment about O(mt) memory complexity)
-This worst case scenario can only happen on ambiguous input, that is why nsubs
-size is set to half a MB just in case, this can match 5000000 
-ambiguous consumers (char, class, any) assuming t is 1. In practice there
-is almost never a situation where someone wants to search using regex this
-large. Use of alloca() instead of VLA, could remove this limit, I just wish
-it was standardized. If you ever wondered about a situation where alloca
-is a must, this is the algorithm.
-Most of the time memory usage is very low and the space
-complexity for non ambigious regex is O(nt) where n is
-the number of currently considering alternate paths
-in the regex and t is the number of submatch groups.
+This worst case scenario can only happen on ambiguous input, i.e. ambiguous
+consumers (char, class, any), assuming t is 1. In practice there is almost
+never a situation where someone wants to search using regex this large. Most
+of the time memory usage is very low and the space complexity for non
+ambiguous regex is O(nt) where n is the number of currently considered
+alternate paths in the regex and t is the number of submatch groups.

-This pikevm features an improved submatch extraction
-algorithm based on Russ Cox's original design. 
-I - Kyryl Melekhin have found a way to optimize the tracking
-properly of 1st number in the submatch pair. Based on simple
-observation of how the NFA is constructed I noticed that
-there is no way for addthread1() to ever reach inner SAVE
-instructions in the regex, so that leaves tracking 2nd pairs by
-addthread1() irrelevant to the final results (except the need to 
-initialize the sub after allocation). This improved the overall 
-performance by 25% which is massive considering that at the 
-time there was nothing else left to can be done to make it faster.
+This pikevm implementation features an improved submatch extraction algorithm
+based on Russ Cox's original design.  I - Kyryl Melekhin have found a way to
+optimize the tracking properly of 1st number in the submatch pair. Based on
+simple observation of how the NFA is constructed I noticed that there is no
+way for addthread1() to ever reach inner SAVE instructions in the regex, so
+that leaves tracking 2nd pairs by addthread1() irrelevant to the final
+results (except the need to initialize the sub after allocation). This
+improved the overall performance by 25% which is massive considering that at
+the time there was nothing else left that could be done to make it faster.

 What are on##list macros?
-Redundant state inside nlist can happen in couple of
-ways, and has to do with the (closure) a* (star) operations and
-also +. Due to the automata machine design split happens
-to be above the next consumed instruction and if that
-state gets added onto the list we may segfault or give
-wrong submatch result. Rsplit does not have this problem
-because it is generated below the consumer instruction, but
-it can still add redundant states. Overall this is extremely
-difficult to understand or explain, but this is just something
-we have to check for. We checked for this using extra int inside
-the split instructions, so this left some global state inside the
-machine insts. Most of the time we just added to the next
-gen number and kept incrementing it forever. This leaves a small
-chance of overflowing the int and getting a run on a false state
-left from previous use of the regex. Though if overflow never
-happens there is no chance of getting a false state. Overflows
-like this pose a high security threat, if the hacker knows
-how many cycles he needs to overflow the gen variable and get
-inconsistent result. It is possible to reset the marks if we
-near the overflow, but as you may guess that does not come
-for free.
+Redundant state inside nlist can happen in couple of ways, and has to do with 
+the (closure) a* (star) operations and also +. Due to the automata machine 
+design split happens to be above the next consumed instruction and if that 
+state gets added onto the list we may segfault or give wrong submatch result. 
+Rsplit does not have this problem because it is generated below the consumer 
+instruction, but it can still add redundant states. Overall this is extremely 
+difficult to understand or explain, but this is just something we have to 
+check for. We checked for this using extra int inside the split instructions, 
+so this left some global state inside the machine insts. Most of the time we 
+just added to the next gen number and kept incrementing it forever. This 
+leaves a small chance of overflowing the int and getting a run on a false 
+state left from previous use of the regex. Though if overflow never happens 
+there is no chance of getting a false state. Overflows like this pose a high 
+security threat, if the hacker knows how many cycles he needs to overflow the 
+gen variable and get inconsistent result. It is possible to reset the marks 
+if we near the overflow, but as you may guess that does not come for free.

-Currently I removed all dynamic global state from the instructions
-fixing any overlow issue utilizing a sparse set datastructure trick
-which abuses the uninitialized varibles. This allows the redundant
-states to be excluded in O(1) operation. That said, don't run
-valgrind on pikevm as it will go crazy, or find a way to surpress
-errors from pikevm.
+Currently I removed all dynamic global state from the instructions, fixing any
+overflow issue by utilizing a sparse set data structure trick which abuses the
+uninitialized variables. This allows the redundant states to be excluded in
+O(1) operation. That said, don't run valgrind on pikevm as it will go crazy, or
+find a way to suppress errors from pikevm.

 Further reading
 ===============
--- a/pike.c
+++ b/pike.c
@@ -1,5 +1,8 @@
-// Copyright 2007-2009 Russ Cox.  All Rights Reserved.
-// Use of this source code is governed by a BSD-style
+/* 
+Copyright 2007-2009 Russ Cox.  All Rights Reserved.
+Copyright 2020-2021 Kyryl Melekhin.  All Rights Reserved.
+Use of this source code is governed by a BSD-style license.
+*/

 #include <stdio.h>
 #include <stdlib.h>
@@ -51,35 +54,36 @@ static int isword(const char *s)
 typedef struct rcode rcode;
 struct rcode
 {
-	int unilen;
-	int len;
-	int sub;
-	int presub;
-	int splits;
-	int insts[];
+	int unilen;	/* number of integers in insts */
+	int len;	/* number of atoms/instructions */
+	int sub;	/* interim val = save count; final val = nsubs size */
+	int presub;	/* interim val = save count; final val = 1 rsub size */
+	int splits;	/* number of split insts */
+	int sparsesz;	/* sdense size */
+	int insts[];	/* re code */
 };

 enum
 {
-	// Instructions which consume input bytes (and thus fail if none left)
+	/* Instructions which consume input bytes */
 	CHAR = 1,
 	CLASS,
 	MATCH,
 	ANY,
-	// Assert position
+	/* Assert position */
 	WBEG,
 	WEND,
 	BOL,
 	EOL,
-	// Other (special) instructions
+	/* Other (special) instructions */
 	SAVE,
-	// Instructions which take relative offset as arg
+	/* Instructions which take relative offset as arg */
 	JMP,
 	SPLIT,
 	RSPLIT,
 };

-// Return codes for re_sizecode() and re_comp()
+/* Return codes for re_sizecode() and re_comp() */
 enum {
 	RE_SUCCESS = 0,
 	RE_SYNTAX_ERROR = -2,
@@ -111,7 +115,7 @@ pc += num;

 static int re_classmatch(const int *pc, int c)
 {
-	// pc points to "classnot" byte after opcode
+	/* pc points to "classnot" byte after opcode */
 	int is_positive = *pc++;
 	int cnt = *pc++;
 	while (cnt--) {
@@ -176,7 +180,7 @@ void re_dumpcode(rcode *prog)
 			break;
 		}
 	}
-	printf("Unilen: %d, insts: %d, splits: %d, counted insts: %d\n",
+	printf("unilen: %d, insts: %d, splits: %d, counted insts: %d\n",
 		prog->unilen, prog->len, prog->splits, i);
 }

@@ -196,7 +200,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
 		switch (*re) {
 		case '\\':
 			re++;
-			if (!*re) goto syntax_error; // Trailing backslash
+			if (!*re) goto syntax_error; /* Trailing backslash */
 			if (*re == '<' || *re == '>') {
 				if (re - *re_loc > 2 && re[-2] == '\\')
 					break;
@@ -223,7 +227,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
 				re++;
 			} else
 				EMIT(PC++, 1);
-			PC++; // Skip "# of pairs" byte
+			PC++; /* Skip "# of pairs" byte */
 			for (cnt = 0; *re != ']'; cnt++) {
 				if (*re == '\\') re++;
 				if (!*re) goto syntax_error;
@@ -372,9 +376,8 @@ int re_sizecode(const char *re, int *nsub)
 	dummyprog.unilen = 3;
 	dummyprog.sub = 0;

-	int res = _compilecode(&re, &dummyprog, /*sizecode*/1);
+	int res = _compilecode(&re, &dummyprog, 1);
 	if (res < 0) return res;
-	// If unparsed chars left
 	if (*re) return RE_SYNTAX_ERROR;
 	*nsub = dummyprog.sub;
 	return dummyprog.unilen;
@@ -388,9 +391,8 @@ int re_comp(rcode *prog, const char *re, int nsubs)
 	prog->presub = nsubs;
 	prog->splits = 0;

-	int res = _compilecode(&re, prog, /*sizecode*/0);
+	int res = _compilecode(&re, prog, 0);
 	if (res < 0) return res;
-	// If unparsed chars left
 	if (*re) return RE_SYNTAX_ERROR;
 	int icnt = 0, scnt = SPLIT;
 	for (int i = 0; i < prog->unilen; i++)
@@ -417,8 +419,11 @@ int re_comp(rcode *prog, const char *re, int nsubs)
 	prog->insts[prog->unilen++] = SAVE;
 	prog->insts[prog->unilen++] = prog->sub + 1;
 	prog->insts[prog->unilen++] = MATCH;
-	prog->splits = (scnt - SPLIT) / 2 + SPLIT;
+	prog->splits = (scnt - SPLIT) / 2;
 	prog->len = icnt + 2;
+	prog->presub = sizeof(rsub)+(sizeof(char*) * (nsubs + 1) * 2);
+	prog->sub = prog->presub * (prog->len - prog->splits + 4);
+	prog->sparsesz = (scnt - 2) * 2;
 	return RE_SUCCESS;
 }

@@ -434,8 +439,14 @@ if (--csub->ref == 0) { \
 	freesub = csub; \
 } \

-#define deccheck(nn) \
-{ decref(nsub) goto rec_check##nn; } \
+#define rec_check(nn) \
+if (si) { \
+	npc = pcs[--si]; \
+	nsub = subs[si]; \
+	goto rec##nn; \
+} \
+
+#define deccheck(nn) { decref(nsub) rec_check(nn) continue; } \

 #define onclist(nn)
 #define onnlist(nn) \
@@ -493,19 +504,13 @@ if (spc == MATCH) \
 	} \

 #define addthread(nn, list, listidx) \
-si = 0; \
 rec##nn: \
 spc = *npc; \
 if ((unsigned int)spc < WBEG) { \
 	list[listidx].sub = nsub; \
 	list[listidx++].pc = npc; \
+	rec_check(nn) \
 	list##match() \
-	rec_check##nn: \
-	if (si) { \
-		npc = pcs[--si]; \
-		nsub = subs[si]; \
-		goto rec##nn; \
-	} \
 	continue; \
 } \
 next##nn: \
@@ -557,18 +562,18 @@ clistidx = nlistidx; \

 int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
 {
-	int rsubsize = sizeof(rsub)+(sizeof(char*)*nsubp);
-	int si, i, j, c, suboff = rsubsize, *npc, osubp = nsubp * sizeof(char*);
-	int clistidx = 0, nlistidx, spc, mcont = MATCH;
+	int rsubsize = prog->presub, suboff = rsubsize;
+	int spc, i, j, c, *npc, osubp = nsubp * sizeof(char*);
+	int si = 0, clistidx = 0, nlistidx, mcont = MATCH;
 	const char *sp = s, *_sp = s;
 	int *insts = prog->insts;
 	int *pcs[prog->splits];
-	unsigned int sdense[prog->splits * 2], sparsesz;
 	rsub *subs[prog->splits];
-	char nsubs[rsubsize * (prog->len-prog->splits+14)];
+	unsigned int sdense[prog->sparsesz], sparsesz;
 	rsub *nsub, *s1, *matched = NULL, *freesub = NULL;
 	rthread _clist[prog->len], _nlist[prog->len];
 	rthread *clist = _clist, *nlist = _nlist, *tmp;
+	char nsubs[prog->sub];
 	goto jmp_start;
 	for (;; sp = _sp) {
 		uc_len(i, sp) uc_code(c, sp)