From eb01f29134e0468f7289c64eb85be5a8d435a55d Mon Sep 17 00:00:00 2001 From: Kyryl Melekhin Date: Thu, 17 Feb 2022 18:20:06 +0000 Subject: [PATCH] pike: improve size calculations --- README | 192 ++++++++++++++++++++++++++------------------------------- pike.c | 83 +++++++++++++------------ 2 files changed, 131 insertions(+), 144 deletions(-) diff --git a/README b/README index c50bc9a..86dc7b6 100644 --- a/README +++ b/README @@ -23,7 +23,7 @@ Features * UnLike re1.5, here is only pikevm, one file easy to use. * Unlike re1.5, regexes is compiled to type sized code rather than bytecode, -eliviating the problem of byte overflow in splits/jmps on large regexes. +eliviating the problem of byte overflow in splits/jmps on large regexes. Currently the type used is int, and every atom in compiled code is aligned to that. * Matcher does not take size of string as param, it checks for '\0' instead, @@ -54,121 +54,103 @@ NOTES The problem described in this paper has been fixed. Ambiguous matching is correct. HISTORY: https://re2c.org/2019_borsotti_trofimovich_efficient_posix_submatch_extraction_on_nfa.pdf -"Cox, 2009 (incorrect). Cox came up with the idea of backward POSIX matching, -which is based on the observation that reversing the longest-match rule -simplifies the handling of iteration subexpressions: instead of maximizing -submatch from the first to the last iteration, one needs to maximize the -iterations in reverse order. This means that the disambiguation is always -based on the most recent iteration, removing the need to remember all previous -iterations (except for the backwards-first, i.e. the last one, which contains -submatch result). The algorithm tracks two pairs of offsets per each submatch -group: the active pair (used for disambiguation) and the result pair. 
It gives -incorrect results under two conditions: (1) ambiguous matches have equal -offsets on some iteration, and (2) disambiguation happens too late, when -the active offsets have already been updated and the difference between -ambiguous matches is erased. We found that such situations may occur for two -reasons. First, the ε-closure algorithm may compare ambiguous paths after +"Cox, 2009 (incorrect). Cox came up with the idea of backward POSIX matching, +which is based on the observation that reversing the longest-match rule +simplifies the handling of iteration subexpressions: instead of maximizing +submatch from the first to the last iteration, one needs to maximize the +iterations in reverse order. This means that the disambiguation is always +based on the most recent iteration, removing the need to remember all previous +iterations (except for the backwards-first, i.e. the last one, which contains +submatch result). The algorithm tracks two pairs of offsets per each submatch +group: the active pair (used for disambiguation) and the result pair. It gives +incorrect results under two conditions: (1) ambiguous matches have equal +offsets on some iteration, and (2) disambiguation happens too late, when +the active offsets have already been updated and the difference between +ambiguous matches is erased. We found that such situations may occur for two +reasons. First, the ε-closure algorithm may compare ambiguous paths after their join point, when both paths have a common suffix with tagged -transitions. This is the case with the Cox prototype implementation; for -example, it gives incorrect results for (aa|a)* and string aaaaa. Most of such -failures can be repaired by exploring states in topological order, but a -topological order does not exist in the presence of ε-loops. The second reason -is bounded repetition: ambiguous paths may not have an intermediate join point -at all. 
For example, in the case of (aaaa|aaa|a){3,4} and string aaaaaaaaaa we -have matches (aaaa)(aaaa)(a)(a) and (aaaa)(aaa)(aaa) with a different number -of iterations. Assuming that the bounded repetition is unrolled by chaining -three sub-automata for (aaaa|aaa|a) and an optional fourth one, by the time -ambiguous paths meet both have active offsets (0,4). Despite the flaw, Cox -algorithm is interesting: if somehow the delayed comparison problem was fixed, +transitions. This is the case with the Cox prototype implementation; for +example, it gives incorrect results for (aa|a)* and string aaaaa. Most of such +failures can be repaired by exploring states in topological order, but a +topological order does not exist in the presence of ε-loops. The second reason +is bounded repetition: ambiguous paths may not have an intermediate join point +at all. For example, in the case of (aaaa|aaa|a){3,4} and string aaaaaaaaaa we +have matches (aaaa)(aaaa)(a)(a) and (aaaa)(aaa)(aaa) with a different number +of iterations. Assuming that the bounded repetition is unrolled by chaining +three sub-automata for (aaaa|aaa|a) and an optional fourth one, by the time +ambiguous paths meet both have active offsets (0,4). Despite the flaw, Cox +algorithm is interesting: if somehow the delayed comparison problem was fixed, it would work. The algorithm requires O(mt) memory and O(nm^2t) time (assuming a worst-case optimal closure algorithm), where n is the -length of input, m it the size of RE and t is the number of submatch groups +length of input, m it the size of RE and t is the number of submatch groups and subexpressions that contain them." Research has shown that it is possible to disambiguate NFA in polynomial time -but it brings serious performance issues on non ambiguous inputs. -See the branch "disambiguate_paths" on this repo shows what is being -done to solve it and the potential performance costs. In short it -requires tracking the parent of every state added on nlist from clist. 
-If the state from nlist matches the consumer, the alternative clist -state related to that nlist state gets discarded and the nsub ref -can be decremented (freed). The reason why this problem does not -exist for non ambiguous regexes is because the alternative clist -state will never match due to the next state having a different -consumer. There is no need for any extra handling it gets freed normally. -I decided to not apply this solution here because I think -most use cases for regex are not ambiguious like say regex: -"a{10000}". If you try matching 10000 'a' characters in a row -like that you will have a problem where the stack usage will -jump up to 10000*(subsize) but it will never exceed the size -of regex though, but the number of NFA states will also increase -by the same amount, so at the charater 9999 you will find -9999 redundant nlist states, that will degrade performance -linearly, however it will be very slow compared to uplimited -regex like a+. The cost of this solution is somewhere around -2% general performance decrease (broadly), but a magnitude of -complexity decrease for ambiguous cases, for example -matching 64 characters went down from 30 to 9 microseconds. -Another solution to this problem can be to determine the -ambiguous paths at compile time and flag the inner -states as ambiguous ahead of time, still this can't avoid -having a loop though the alt states as their positioning -in clist can't be precomputed due to the dynamic changes. - +but it brings serious performance issues on non ambiguous inputs. See the +branch "disambiguate_paths" on this repo shows what is being done to solve it +and the potential performance costs. In short it requires tracking the parent +of every state added on nlist from clist. If the state from nlist matches +the consumer, the alternative clist state related to that nlist state gets +discarded and the nsub ref can be decremented (freed). 
The reason why this +problem does not exist for non ambiguous regexes is because the alternative +clist state will never match due to the next state having a different consumer. +There is no need for any extra handling it gets freed normally. I decided +to not apply this solution here because I think most use cases for regex are +not ambiguous like say regex: "a{10000}". If you try matching 10000 'a' +characters in a row like that you will have a problem where the stack usage +will jump up to 10000*(subsize) but it will never exceed the size of regex +though, but the number of NFA states will also increase by the same amount, +so at the character 9999 you will find 9999 redundant nlist states, that will +degrade performance linearly, however it will be very slow compared to +unlimited regex like a+. The cost of this solution is somewhere around 2% +general performance decrease (broadly), but a magnitude of complexity +decrease for ambiguous cases, for example matching 64 characters went down +from 30 to 9 microseconds. Another solution to this problem can be to +determine the ambiguous paths at compile time and flag the inner states as +ambiguous ahead of time, still this can't avoid having a loop through the alt +states as their positioning in clist can't be precomputed due to the dynamic +changes. (Comment about O(mt) memory complexity) -This worst case scenario can only happen on ambiguous input, that is why nsubs -size is set to half a MB just in case, this can match 5000000 -ambiguous consumers (char, class, any) assuming t is 1. In practice there -is almost never a situation where someone wants to search using regex this -large. Use of alloca() instead of VLA, could remove this limit, I just wish -it was standardized. If you ever wondered about a situation where alloca -is a must, this is the algorithm.
-Most of the time memory usage is very low and the space -complexity for non ambigious regex is O(nt) where n is -the number of currently considering alternate paths -in the regex and t is the number of submatch groups. +This worst case scenario can only happen on ambiguous input. Ambiguous +consumers (char, class, any) assuming t is 1. In practice there is almost +never a situation where someone wants to search using regex this large. Most +of the time memory usage is very low and the space complexity for non +ambiguous regex is O(nt) where n is the number of currently considered +alternate paths in the regex and t is the number of submatch groups. -This pikevm features an improved submatch extraction -algorithm based on Russ Cox's original design. -I - Kyryl Melekhin have found a way to optimize the tracking -properly of 1st number in the submatch pair. Based on simple -observation of how the NFA is constructed I noticed that -there is no way for addthread1() to ever reach inner SAVE -instructions in the regex, so that leaves tracking 2nd pairs by -addthread1() irrelevant to the final results (except the need to -initialize the sub after allocation). This improved the overall -performance by 25% which is massive considering that at the -time there was nothing else left to can be done to make it faster. +This pikevm implementation features an improved submatch extraction algorithm +based on Russ Cox's original design. I, Kyryl Melekhin, have found a way to +properly optimize the tracking of the 1st number in the submatch pair. Based on +simple observation of how the NFA is constructed I noticed that there is no +way for addthread1() to ever reach inner SAVE instructions in the regex, so +that leaves tracking 2nd pairs by addthread1() irrelevant to the final +results (except the need to initialize the sub after allocation).
This +improved the overall performance by 25% which is massive considering that at +the time there was nothing else left that could be done to make it faster. What are on##list macros? -Redundant state inside nlist can happen in couple of -ways, and has to do with the (closure) a* (star) operations and -also +. Due to the automata machine design split happens -to be above the next consumed instruction and if that -state gets added onto the list we may segfault or give -wrong submatch result. Rsplit does not have this problem -because it is generated below the consumer instruction, but -it can still add redundant states. Overall this is extremely -difficult to understand or explain, but this is just something -we have to check for. We checked for this using extra int inside -the split instructions, so this left some global state inside the -machine insts. Most of the time we just added to the next -gen number and kept incrementing it forever. This leaves a small -chance of overflowing the int and getting a run on a false state -left from previous use of the regex. Though if overflow never -happens there is no chance of getting a false state. Overflows -like this pose a high security threat, if the hacker knows -how many cycles he needs to overflow the gen variable and get -inconsistent result. It is possible to reset the marks if we -near the overflow, but as you may guess that does not come -for free. +Redundant state inside nlist can happen in a couple of ways, and has to do with +the (closure) a* (star) operations and also +. Due to the automata machine +design split happens to be above the next consumed instruction and if that +state gets added onto the list we may segfault or give wrong submatch result. +Rsplit does not have this problem because it is generated below the consumer +instruction, but it can still add redundant states. Overall this is extremely +difficult to understand or explain, but this is just something we have to +check for.
We checked for this using extra int inside the split instructions, +so this left some global state inside the machine insts. Most of the time we +just added to the next gen number and kept incrementing it forever. This +leaves a small chance of overflowing the int and getting a run on a false +state left from previous use of the regex. Though if overflow never happens +there is no chance of getting a false state. Overflows like this pose a high +security threat, if the hacker knows how many cycles he needs to overflow the +gen variable and get inconsistent result. It is possible to reset the marks +if we near the overflow, but as you may guess that does not come for free. -Currently I removed all dynamic global state from the instructions -fixing any overlow issue utilizing a sparse set datastructure trick -which abuses the uninitialized varibles. This allows the redundant -states to be excluded in O(1) operation. That said, don't run -valgrind on pikevm as it will go crazy, or find a way to surpress -errors from pikevm. +Currently I removed all dynamic global state from the instructions fixing any +overflow issue utilizing a sparse set data structure trick which abuses the +uninitialized variables. This allows the redundant states to be excluded in +O(1) operation. That said, don't run valgrind on pikevm as it will go crazy, or +find a way to suppress errors from pikevm. Further reading =============== diff --git a/pike.c b/pike.c index 08951da..cab0eca 100644 --- a/pike.c +++ b/pike.c @@ -1,5 +1,8 @@ -// Copyright 2007-2009 Russ Cox. All Rights Reserved. -// Use of this source code is governed by a BSD-style +/* +Copyright 2007-2009 Russ Cox. All Rights Reserved. +Copyright 2020-2021 Kyryl Melekhin. All Rights Reserved.
+Use of this source code is governed by a BSD-style +*/ #include #include @@ -51,35 +54,36 @@ static int isword(const char *s) typedef struct rcode rcode; struct rcode { - int unilen; - int len; - int sub; - int presub; - int splits; - int insts[]; + int unilen; /* number of integers in insts */ + int len; /* number of atoms/instructions */ + int sub; /* interim val = save count; final val = nsubs size */ + int presub; /* interim val = save count; final val = 1 rsub size */ + int splits; /* number of split insts */ + int sparsesz; /* sdense size */ + int insts[]; /* re code */ }; enum { - // Instructions which consume input bytes (and thus fail if none left) + /* Instructions which consume input bytes */ CHAR = 1, CLASS, MATCH, ANY, - // Assert position + /* Assert position */ WBEG, WEND, BOL, EOL, - // Other (special) instructions + /* Other (special) instructions */ SAVE, - // Instructions which take relative offset as arg + /* Instructions which take relative offset as arg */ JMP, SPLIT, RSPLIT, }; -// Return codes for re_sizecode() and re_comp() +/* Return codes for re_sizecode() and re_comp() */ enum { RE_SUCCESS = 0, RE_SYNTAX_ERROR = -2, @@ -111,7 +115,7 @@ pc += num; static int re_classmatch(const int *pc, int c) { - // pc points to "classnot" byte after opcode + /* pc points to "classnot" byte after opcode */ int is_positive = *pc++; int cnt = *pc++; while (cnt--) { @@ -176,7 +180,7 @@ void re_dumpcode(rcode *prog) break; } } - printf("Unilen: %d, insts: %d, splits: %d, counted insts: %d\n", + printf("unilen: %d, insts: %d, splits: %d, counted insts: %d\n", prog->unilen, prog->len, prog->splits, i); } @@ -196,7 +200,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode) switch (*re) { case '\\': re++; - if (!*re) goto syntax_error; // Trailing backslash + if (!*re) goto syntax_error; /* Trailing backslash */ if (*re == '<' || *re == '>') { if (re - *re_loc > 2 && re[-2] == '\\') break; @@ -223,7 +227,7 @@ static int _compilecode(const 
char **re_loc, rcode *prog, int sizecode) re++; } else EMIT(PC++, 1); - PC++; // Skip "# of pairs" byte + PC++; /* Skip "# of pairs" byte */ for (cnt = 0; *re != ']'; cnt++) { if (*re == '\\') re++; if (!*re) goto syntax_error; @@ -372,9 +376,8 @@ int re_sizecode(const char *re, int *nsub) dummyprog.unilen = 3; dummyprog.sub = 0; - int res = _compilecode(&re, &dummyprog, /*sizecode*/1); + int res = _compilecode(&re, &dummyprog, 1); if (res < 0) return res; - // If unparsed chars left if (*re) return RE_SYNTAX_ERROR; *nsub = dummyprog.sub; return dummyprog.unilen; @@ -388,9 +391,8 @@ int re_comp(rcode *prog, const char *re, int nsubs) prog->presub = nsubs; prog->splits = 0; - int res = _compilecode(&re, prog, /*sizecode*/0); + int res = _compilecode(&re, prog, 0); if (res < 0) return res; - // If unparsed chars left if (*re) return RE_SYNTAX_ERROR; int icnt = 0, scnt = SPLIT; for (int i = 0; i < prog->unilen; i++) @@ -417,8 +419,11 @@ int re_comp(rcode *prog, const char *re, int nsubs) prog->insts[prog->unilen++] = SAVE; prog->insts[prog->unilen++] = prog->sub + 1; prog->insts[prog->unilen++] = MATCH; - prog->splits = (scnt - SPLIT) / 2 + SPLIT; - prog->len = icnt+2; + prog->splits = (scnt - SPLIT) / 2; + prog->len = icnt + 2; + prog->presub = sizeof(rsub)+(sizeof(char*) * (nsubs + 1) * 2); + prog->sub = prog->presub * (prog->len - prog->splits + 4); + prog->sparsesz = (scnt - 2) * 2; return RE_SUCCESS; } @@ -434,8 +439,14 @@ if (--csub->ref == 0) { \ freesub = csub; \ } \ -#define deccheck(nn) \ -{ decref(nsub) goto rec_check##nn; } \ +#define rec_check(nn) \ +if (si) { \ + npc = pcs[--si]; \ + nsub = subs[si]; \ + goto rec##nn; \ +} \ + +#define deccheck(nn) { decref(nsub) rec_check(nn) continue; } \ #define onclist(nn) #define onnlist(nn) \ @@ -493,19 +504,13 @@ if (spc == MATCH) \ } \ #define addthread(nn, list, listidx) \ -si = 0; \ rec##nn: \ spc = *npc; \ if ((unsigned int)spc < WBEG) { \ list[listidx].sub = nsub; \ list[listidx++].pc = npc; \ + rec_check(nn) 
\ list##match() \ - rec_check##nn: \ - if (si) { \ - npc = pcs[--si]; \ - nsub = subs[si]; \ - goto rec##nn; \ - } \ continue; \ } \ next##nn: \ @@ -557,18 +562,18 @@ clistidx = nlistidx; \ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp) { - int rsubsize = sizeof(rsub)+(sizeof(char*)*nsubp); - int si, i, j, c, suboff = rsubsize, *npc, osubp = nsubp * sizeof(char*); - int clistidx = 0, nlistidx, spc, mcont = MATCH; + int rsubsize = prog->presub, suboff = rsubsize; + int spc, i, j, c, *npc, osubp = nsubp * sizeof(char*); + int si = 0, clistidx = 0, nlistidx, mcont = MATCH; const char *sp = s, *_sp = s; int *insts = prog->insts; int *pcs[prog->splits]; - unsigned int sdense[prog->splits * 2], sparsesz; rsub *subs[prog->splits]; - char nsubs[rsubsize * (prog->len-prog->splits+14)]; + unsigned int sdense[prog->sparsesz], sparsesz; rsub *nsub, *s1, *matched = NULL, *freesub = NULL; rthread _clist[prog->len], _nlist[prog->len]; rthread *clist = _clist, *nlist = _nlist, *tmp; + char nsubs[prog->sub]; goto jmp_start; for (;; sp = _sp) { uc_len(i, sp) uc_code(c, sp) @@ -651,10 +656,10 @@ int main(int argc, char *argv[]) printf("Done in %f seconds\n", elapsed_time); if (!sz) { printf("-nomatch-\n"); continue; } - for (int k=sub_els; k>0; k--) + for (int k = sub_els; k > 0; k--) if (sub[k-1]) break; - for (int l=0; l