pike: improve size calculations
This commit is contained in:
192
README
192
README
@@ -23,7 +23,7 @@ Features
|
|||||||
|
|
||||||
* Unlike re1.5, here there is only pikevm; one file, easy to use.
|
* Unlike re1.5, here there is only pikevm; one file, easy to use.
|
||||||
* Unlike re1.5, regexes are compiled to type-sized code rather than bytecode,
|
* Unlike re1.5, regexes are compiled to type-sized code rather than bytecode,
|
||||||
alleviating the problem of byte overflow in splits/jmps on large regexes.
|
alleviating the problem of byte overflow in splits/jmps on large regexes.
|
||||||
Currently the type used is int, and every atom in compiled code is aligned
|
Currently the type used is int, and every atom in compiled code is aligned
|
||||||
to that.
|
to that.
|
||||||
* Matcher does not take size of string as param, it checks for '\0' instead,
|
* Matcher does not take size of string as param, it checks for '\0' instead,
|
||||||
@@ -54,121 +54,103 @@ NOTES
|
|||||||
The problem described in this paper has been fixed. Ambiguous matching is correct.
|
The problem described in this paper has been fixed. Ambiguous matching is correct.
|
||||||
HISTORY:
|
HISTORY:
|
||||||
https://re2c.org/2019_borsotti_trofimovich_efficient_posix_submatch_extraction_on_nfa.pdf
|
https://re2c.org/2019_borsotti_trofimovich_efficient_posix_submatch_extraction_on_nfa.pdf
|
||||||
"Cox, 2009 (incorrect). Cox came up with the idea of backward POSIX matching,
|
"Cox, 2009 (incorrect). Cox came up with the idea of backward POSIX matching,
|
||||||
which is based on the observation that reversing the longest-match rule
|
which is based on the observation that reversing the longest-match rule
|
||||||
simplifies the handling of iteration subexpressions: instead of maximizing
|
simplifies the handling of iteration subexpressions: instead of maximizing
|
||||||
submatch from the first to the last iteration, one needs to maximize the
|
submatch from the first to the last iteration, one needs to maximize the
|
||||||
iterations in reverse order. This means that the disambiguation is always
|
iterations in reverse order. This means that the disambiguation is always
|
||||||
based on the most recent iteration, removing the need to remember all previous
|
based on the most recent iteration, removing the need to remember all previous
|
||||||
iterations (except for the backwards-first, i.e. the last one, which contains
|
iterations (except for the backwards-first, i.e. the last one, which contains
|
||||||
submatch result). The algorithm tracks two pairs of offsets per each submatch
|
submatch result). The algorithm tracks two pairs of offsets per each submatch
|
||||||
group: the active pair (used for disambiguation) and the result pair. It gives
|
group: the active pair (used for disambiguation) and the result pair. It gives
|
||||||
incorrect results under two conditions: (1) ambiguous matches have equal
|
incorrect results under two conditions: (1) ambiguous matches have equal
|
||||||
offsets on some iteration, and (2) disambiguation happens too late, when
|
offsets on some iteration, and (2) disambiguation happens too late, when
|
||||||
the active offsets have already been updated and the difference between
|
the active offsets have already been updated and the difference between
|
||||||
ambiguous matches is erased. We found that such situations may occur for two
|
ambiguous matches is erased. We found that such situations may occur for two
|
||||||
reasons. First, the ε-closure algorithm may compare ambiguous paths after
|
reasons. First, the ε-closure algorithm may compare ambiguous paths after
|
||||||
their join point, when both paths have a common suffix with tagged
|
their join point, when both paths have a common suffix with tagged
|
||||||
transitions. This is the case with the Cox prototype implementation; for
|
transitions. This is the case with the Cox prototype implementation; for
|
||||||
example, it gives incorrect results for (aa|a)* and string aaaaa. Most of such
|
example, it gives incorrect results for (aa|a)* and string aaaaa. Most of such
|
||||||
failures can be repaired by exploring states in topological order, but a
|
failures can be repaired by exploring states in topological order, but a
|
||||||
topological order does not exist in the presence of ε-loops. The second reason
|
topological order does not exist in the presence of ε-loops. The second reason
|
||||||
is bounded repetition: ambiguous paths may not have an intermediate join point
|
is bounded repetition: ambiguous paths may not have an intermediate join point
|
||||||
at all. For example, in the case of (aaaa|aaa|a){3,4} and string aaaaaaaaaa we
|
at all. For example, in the case of (aaaa|aaa|a){3,4} and string aaaaaaaaaa we
|
||||||
have matches (aaaa)(aaaa)(a)(a) and (aaaa)(aaa)(aaa) with a different number
|
have matches (aaaa)(aaaa)(a)(a) and (aaaa)(aaa)(aaa) with a different number
|
||||||
of iterations. Assuming that the bounded repetition is unrolled by chaining
|
of iterations. Assuming that the bounded repetition is unrolled by chaining
|
||||||
three sub-automata for (aaaa|aaa|a) and an optional fourth one, by the time
|
three sub-automata for (aaaa|aaa|a) and an optional fourth one, by the time
|
||||||
ambiguous paths meet both have active offsets (0,4). Despite the flaw, Cox
|
ambiguous paths meet both have active offsets (0,4). Despite the flaw, Cox
|
||||||
algorithm is interesting: if somehow the delayed comparison problem was fixed,
|
algorithm is interesting: if somehow the delayed comparison problem was fixed,
|
||||||
it would work. The algorithm requires O(mt) memory and O(nm^2t) time
|
it would work. The algorithm requires O(mt) memory and O(nm^2t) time
|
||||||
(assuming a worst-case optimal closure algorithm), where n is the
|
(assuming a worst-case optimal closure algorithm), where n is the
|
||||||
length of input, m is the size of RE and t is the number of submatch groups
|
length of input, m is the size of RE and t is the number of submatch groups
|
||||||
and subexpressions that contain them."
|
and subexpressions that contain them."
|
||||||
|
|
||||||
Research has shown that it is possible to disambiguate NFA in polynomial time
|
Research has shown that it is possible to disambiguate NFA in polynomial time
|
||||||
but it brings serious performance issues on non ambiguous inputs.
|
but it brings serious performance issues on non ambiguous inputs. The
|
||||||
The branch "disambiguate_paths" on this repo shows what is being
|
branch "disambiguate_paths" on this repo shows what is being done to solve it
|
||||||
done to solve it and the potential performance costs. In short it
|
and the potential performance costs. In short it requires tracking the parent
|
||||||
requires tracking the parent of every state added on nlist from clist.
|
of every state added on nlist from clist. If the state from nlist matches
|
||||||
If the state from nlist matches the consumer, the alternative clist
|
the consumer, the alternative clist state related to that nlist state gets
|
||||||
state related to that nlist state gets discarded and the nsub ref
|
discarded and the nsub ref can be decremented (freed). The reason why this
|
||||||
can be decremented (freed). The reason why this problem does not
|
problem does not exist for non ambiguous regexes is because the alternative
|
||||||
exist for non ambiguous regexes is because the alternative clist
|
clist state will never match due to the next state having a different consumer.
|
||||||
state will never match due to the next state having a different
|
There is no need for any extra handling; it gets freed normally. I decided
|
||||||
consumer. There is no need for any extra handling; it gets freed normally.
|
to not apply this solution here because I think most use cases for regex are
|
||||||
I decided to not apply this solution here because I think
|
not ambiguous like say regex: "a{10000}". If you try matching 10000 'a'
|
||||||
most use cases for regex are not ambiguous like say regex:
|
characters in a row like that you will have a problem where the stack usage
|
||||||
"a{10000}". If you try matching 10000 'a' characters in a row
|
will jump up to 10000*(subsize) but it will never exceed the size of regex
|
||||||
like that you will have a problem where the stack usage will
|
though, but the number of NFA states will also increase by the same amount,
|
||||||
jump up to 10000*(subsize) but it will never exceed the size
|
so at the character 9999 you will find 9999 redundant nlist states, that will
|
||||||
of regex though, but the number of NFA states will also increase
|
degrade performance linearly, however it will be very slow compared to
|
||||||
by the same amount, so at the character 9999 you will find
|
unlimited regex like a+. The cost of this solution is somewhere around 2%
|
||||||
9999 redundant nlist states, that will degrade performance
|
general performance decrease (broadly), but a magnitude of complexity
|
||||||
linearly, however it will be very slow compared to unlimited
|
decrease for ambiguous cases, for example matching 64 characters went down
|
||||||
regex like a+. The cost of this solution is somewhere around
|
from 30 to 9 microseconds. Another solution to this problem can be to
|
||||||
2% general performance decrease (broadly), but a magnitude of
|
determine the ambiguous paths at compile time and flag the inner states as
|
||||||
complexity decrease for ambiguous cases, for example
|
ambiguous ahead of time, still this can't avoid having a loop through the alt
|
||||||
matching 64 characters went down from 30 to 9 microseconds.
|
states as their positioning in clist can't be precomputed due to the dynamic
|
||||||
Another solution to this problem can be to determine the
|
changes.
|
||||||
ambiguous paths at compile time and flag the inner
|
|
||||||
states as ambiguous ahead of time, still this can't avoid
|
|
||||||
having a loop through the alt states as their positioning
|
|
||||||
in clist can't be precomputed due to the dynamic changes.
|
|
||||||
|
|
||||||
(Comment about O(mt) memory complexity)
|
(Comment about O(mt) memory complexity)
|
||||||
This worst case scenario can only happen on ambiguous input, that is why nsubs
|
This worst case scenario can only happen on ambiguous input. Ambiguous
|
||||||
size is set to half a MB just in case, this can match 5000000
|
consumers (char, class, any) assuming t is 1. In practice there is almost
|
||||||
ambiguous consumers (char, class, any) assuming t is 1. In practice there
|
never a situation where someone wants to search using regex this large. Most
|
||||||
is almost never a situation where someone wants to search using regex this
|
of the time memory usage is very low and the space complexity for non
|
||||||
large. Use of alloca() instead of VLA, could remove this limit, I just wish
|
ambiguous regex is O(nt) where n is the number of currently considered
|
||||||
it was standardized. If you ever wondered about a situation where alloca
|
alternate paths in the regex and t is the number of submatch groups.
|
||||||
is a must, this is the algorithm.
|
|
||||||
Most of the time memory usage is very low and the space
|
|
||||||
complexity for non ambiguous regex is O(nt) where n is
|
|
||||||
the number of currently considering alternate paths
|
|
||||||
in the regex and t is the number of submatch groups.
|
|
||||||
|
|
||||||
This pikevm features an improved submatch extraction
|
This pikevm implementation features an improved submatch extraction algorithm
|
||||||
algorithm based on Russ Cox's original design.
|
based on Russ Cox's original design. I - Kyryl Melekhin have found a way to
|
||||||
I - Kyryl Melekhin have found a way to optimize the tracking
|
optimize the tracking properly of 1st number in the submatch pair. Based on
|
||||||
properly of 1st number in the submatch pair. Based on simple
|
simple observation of how the NFA is constructed I noticed that there is no
|
||||||
observation of how the NFA is constructed I noticed that
|
way for addthread1() to ever reach inner SAVE instructions in the regex, so
|
||||||
there is no way for addthread1() to ever reach inner SAVE
|
that leaves tracking 2nd pairs by addthread1() irrelevant to the final
|
||||||
instructions in the regex, so that leaves tracking 2nd pairs by
|
results (except the need to initialize the sub after allocation). This
|
||||||
addthread1() irrelevant to the final results (except the need to
|
improved the overall performance by 25% which is massive considering that at
|
||||||
initialize the sub after allocation). This improved the overall
|
the time there was nothing else left that could be done to make it faster.
|
||||||
performance by 25% which is massive considering that at the
|
|
||||||
time there was nothing else left that could be done to make it faster.
|
|
||||||
|
|
||||||
What are on##list macros?
|
What are on##list macros?
|
||||||
Redundant state inside nlist can happen in couple of
|
Redundant state inside nlist can happen in couple of ways, and has to do with
|
||||||
ways, and has to do with the (closure) a* (star) operations and
|
the (closure) a* (star) operations and also +. Due to the automata machine
|
||||||
also +. Due to the automata machine design split happens
|
design split happens to be above the next consumed instruction and if that
|
||||||
to be above the next consumed instruction and if that
|
state gets added onto the list we may segfault or give wrong submatch result.
|
||||||
state gets added onto the list we may segfault or give
|
Rsplit does not have this problem because it is generated below the consumer
|
||||||
wrong submatch result. Rsplit does not have this problem
|
instruction, but it can still add redundant states. Overall this is extremely
|
||||||
because it is generated below the consumer instruction, but
|
difficult to understand or explain, but this is just something we have to
|
||||||
it can still add redundant states. Overall this is extremely
|
check for. We checked for this using extra int inside the split instructions,
|
||||||
difficult to understand or explain, but this is just something
|
so this left some global state inside the machine insts. Most of the time we
|
||||||
we have to check for. We checked for this using extra int inside
|
just added to the next gen number and kept incrementing it forever. This
|
||||||
the split instructions, so this left some global state inside the
|
leaves a small chance of overflowing the int and getting a run on a false
|
||||||
machine insts. Most of the time we just added to the next
|
state left from previous use of the regex. Though if overflow never happens
|
||||||
gen number and kept incrementing it forever. This leaves a small
|
there is no chance of getting a false state. Overflows like this pose a high
|
||||||
chance of overflowing the int and getting a run on a false state
|
security threat, if the hacker knows how many cycles he needs to overflow the
|
||||||
left from previous use of the regex. Though if overflow never
|
gen variable and get inconsistent result. It is possible to reset the marks
|
||||||
happens there is no chance of getting a false state. Overflows
|
if we near the overflow, but as you may guess that does not come for free.
|
||||||
like this pose a high security threat, if the hacker knows
|
|
||||||
how many cycles he needs to overflow the gen variable and get
|
|
||||||
inconsistent result. It is possible to reset the marks if we
|
|
||||||
near the overflow, but as you may guess that does not come
|
|
||||||
for free.
|
|
||||||
|
|
||||||
Currently I removed all dynamic global state from the instructions
|
Currently I removed all dynamic global state from the instructions fixing any
|
||||||
fixing any overflow issue utilizing a sparse set data structure trick
|
overflow issue utilizing a sparse set data structure trick which abuses the
|
||||||
which abuses the uninitialized variables. This allows the redundant
|
uninitialized variables. This allows the redundant states to be excluded in
|
||||||
states to be excluded in O(1) operation. That said, don't run
|
O(1) operation. That said, don't run valgrind on pikevm as it will go crazy, or
|
||||||
valgrind on pikevm as it will go crazy, or find a way to suppress
|
find a way to suppress errors from pikevm.
|
||||||
errors from pikevm.
|
|
||||||
|
|
||||||
Further reading
|
Further reading
|
||||||
===============
|
===============
|
||||||
|
|||||||
83
pike.c
83
pike.c
@@ -1,5 +1,8 @@
|
|||||||
// Copyright 2007-2009 Russ Cox. All Rights Reserved.
|
/*
|
||||||
// Use of this source code is governed by a BSD-style
|
Copyright 2007-2009 Russ Cox. All Rights Reserved.
|
||||||
|
Copyright 2020-2021 Kyryl Melekhin. All Rights Reserved.
|
||||||
|
Use of this source code is governed by a BSD-style
|
||||||
|
*/
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
@@ -51,35 +54,36 @@ static int isword(const char *s)
|
|||||||
typedef struct rcode rcode;
|
typedef struct rcode rcode;
|
||||||
struct rcode
|
struct rcode
|
||||||
{
|
{
|
||||||
int unilen;
|
int unilen; /* number of integers in insts */
|
||||||
int len;
|
int len; /* number of atoms/instructions */
|
||||||
int sub;
|
int sub; /* interim val = save count; final val = nsubs size */
|
||||||
int presub;
|
int presub; /* interim val = save count; final val = 1 rsub size */
|
||||||
int splits;
|
int splits; /* number of split insts */
|
||||||
int insts[];
|
int sparsesz; /* sdense size */
|
||||||
|
int insts[]; /* re code */
|
||||||
};
|
};
|
||||||
|
|
||||||
enum
|
enum
|
||||||
{
|
{
|
||||||
// Instructions which consume input bytes (and thus fail if none left)
|
/* Instructions which consume input bytes */
|
||||||
CHAR = 1,
|
CHAR = 1,
|
||||||
CLASS,
|
CLASS,
|
||||||
MATCH,
|
MATCH,
|
||||||
ANY,
|
ANY,
|
||||||
// Assert position
|
/* Assert position */
|
||||||
WBEG,
|
WBEG,
|
||||||
WEND,
|
WEND,
|
||||||
BOL,
|
BOL,
|
||||||
EOL,
|
EOL,
|
||||||
// Other (special) instructions
|
/* Other (special) instructions */
|
||||||
SAVE,
|
SAVE,
|
||||||
// Instructions which take relative offset as arg
|
/* Instructions which take relative offset as arg */
|
||||||
JMP,
|
JMP,
|
||||||
SPLIT,
|
SPLIT,
|
||||||
RSPLIT,
|
RSPLIT,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Return codes for re_sizecode() and re_comp()
|
/* Return codes for re_sizecode() and re_comp() */
|
||||||
enum {
|
enum {
|
||||||
RE_SUCCESS = 0,
|
RE_SUCCESS = 0,
|
||||||
RE_SYNTAX_ERROR = -2,
|
RE_SYNTAX_ERROR = -2,
|
||||||
@@ -111,7 +115,7 @@ pc += num;
|
|||||||
|
|
||||||
static int re_classmatch(const int *pc, int c)
|
static int re_classmatch(const int *pc, int c)
|
||||||
{
|
{
|
||||||
// pc points to "classnot" byte after opcode
|
/* pc points to "classnot" byte after opcode */
|
||||||
int is_positive = *pc++;
|
int is_positive = *pc++;
|
||||||
int cnt = *pc++;
|
int cnt = *pc++;
|
||||||
while (cnt--) {
|
while (cnt--) {
|
||||||
@@ -176,7 +180,7 @@ void re_dumpcode(rcode *prog)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
printf("Unilen: %d, insts: %d, splits: %d, counted insts: %d\n",
|
printf("unilen: %d, insts: %d, splits: %d, counted insts: %d\n",
|
||||||
prog->unilen, prog->len, prog->splits, i);
|
prog->unilen, prog->len, prog->splits, i);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -196,7 +200,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
switch (*re) {
|
switch (*re) {
|
||||||
case '\\':
|
case '\\':
|
||||||
re++;
|
re++;
|
||||||
if (!*re) goto syntax_error; // Trailing backslash
|
if (!*re) goto syntax_error; /* Trailing backslash */
|
||||||
if (*re == '<' || *re == '>') {
|
if (*re == '<' || *re == '>') {
|
||||||
if (re - *re_loc > 2 && re[-2] == '\\')
|
if (re - *re_loc > 2 && re[-2] == '\\')
|
||||||
break;
|
break;
|
||||||
@@ -223,7 +227,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
re++;
|
re++;
|
||||||
} else
|
} else
|
||||||
EMIT(PC++, 1);
|
EMIT(PC++, 1);
|
||||||
PC++; // Skip "# of pairs" byte
|
PC++; /* Skip "# of pairs" byte */
|
||||||
for (cnt = 0; *re != ']'; cnt++) {
|
for (cnt = 0; *re != ']'; cnt++) {
|
||||||
if (*re == '\\') re++;
|
if (*re == '\\') re++;
|
||||||
if (!*re) goto syntax_error;
|
if (!*re) goto syntax_error;
|
||||||
@@ -372,9 +376,8 @@ int re_sizecode(const char *re, int *nsub)
|
|||||||
dummyprog.unilen = 3;
|
dummyprog.unilen = 3;
|
||||||
dummyprog.sub = 0;
|
dummyprog.sub = 0;
|
||||||
|
|
||||||
int res = _compilecode(&re, &dummyprog, /*sizecode*/1);
|
int res = _compilecode(&re, &dummyprog, 1);
|
||||||
if (res < 0) return res;
|
if (res < 0) return res;
|
||||||
// If unparsed chars left
|
|
||||||
if (*re) return RE_SYNTAX_ERROR;
|
if (*re) return RE_SYNTAX_ERROR;
|
||||||
*nsub = dummyprog.sub;
|
*nsub = dummyprog.sub;
|
||||||
return dummyprog.unilen;
|
return dummyprog.unilen;
|
||||||
@@ -388,9 +391,8 @@ int re_comp(rcode *prog, const char *re, int nsubs)
|
|||||||
prog->presub = nsubs;
|
prog->presub = nsubs;
|
||||||
prog->splits = 0;
|
prog->splits = 0;
|
||||||
|
|
||||||
int res = _compilecode(&re, prog, /*sizecode*/0);
|
int res = _compilecode(&re, prog, 0);
|
||||||
if (res < 0) return res;
|
if (res < 0) return res;
|
||||||
// If unparsed chars left
|
|
||||||
if (*re) return RE_SYNTAX_ERROR;
|
if (*re) return RE_SYNTAX_ERROR;
|
||||||
int icnt = 0, scnt = SPLIT;
|
int icnt = 0, scnt = SPLIT;
|
||||||
for (int i = 0; i < prog->unilen; i++)
|
for (int i = 0; i < prog->unilen; i++)
|
||||||
@@ -417,8 +419,11 @@ int re_comp(rcode *prog, const char *re, int nsubs)
|
|||||||
prog->insts[prog->unilen++] = SAVE;
|
prog->insts[prog->unilen++] = SAVE;
|
||||||
prog->insts[prog->unilen++] = prog->sub + 1;
|
prog->insts[prog->unilen++] = prog->sub + 1;
|
||||||
prog->insts[prog->unilen++] = MATCH;
|
prog->insts[prog->unilen++] = MATCH;
|
||||||
prog->splits = (scnt - SPLIT) / 2 + SPLIT;
|
prog->splits = (scnt - SPLIT) / 2;
|
||||||
prog->len = icnt+2;
|
prog->len = icnt + 2;
|
||||||
|
prog->presub = sizeof(rsub)+(sizeof(char*) * (nsubs + 1) * 2);
|
||||||
|
prog->sub = prog->presub * (prog->len - prog->splits + 4);
|
||||||
|
prog->sparsesz = (scnt - 2) * 2;
|
||||||
return RE_SUCCESS;
|
return RE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -434,8 +439,14 @@ if (--csub->ref == 0) { \
|
|||||||
freesub = csub; \
|
freesub = csub; \
|
||||||
} \
|
} \
|
||||||
|
|
||||||
#define deccheck(nn) \
|
#define rec_check(nn) \
|
||||||
{ decref(nsub) goto rec_check##nn; } \
|
if (si) { \
|
||||||
|
npc = pcs[--si]; \
|
||||||
|
nsub = subs[si]; \
|
||||||
|
goto rec##nn; \
|
||||||
|
} \
|
||||||
|
|
||||||
|
#define deccheck(nn) { decref(nsub) rec_check(nn) continue; } \
|
||||||
|
|
||||||
#define onclist(nn)
|
#define onclist(nn)
|
||||||
#define onnlist(nn) \
|
#define onnlist(nn) \
|
||||||
@@ -493,19 +504,13 @@ if (spc == MATCH) \
|
|||||||
} \
|
} \
|
||||||
|
|
||||||
#define addthread(nn, list, listidx) \
|
#define addthread(nn, list, listidx) \
|
||||||
si = 0; \
|
|
||||||
rec##nn: \
|
rec##nn: \
|
||||||
spc = *npc; \
|
spc = *npc; \
|
||||||
if ((unsigned int)spc < WBEG) { \
|
if ((unsigned int)spc < WBEG) { \
|
||||||
list[listidx].sub = nsub; \
|
list[listidx].sub = nsub; \
|
||||||
list[listidx++].pc = npc; \
|
list[listidx++].pc = npc; \
|
||||||
|
rec_check(nn) \
|
||||||
list##match() \
|
list##match() \
|
||||||
rec_check##nn: \
|
|
||||||
if (si) { \
|
|
||||||
npc = pcs[--si]; \
|
|
||||||
nsub = subs[si]; \
|
|
||||||
goto rec##nn; \
|
|
||||||
} \
|
|
||||||
continue; \
|
continue; \
|
||||||
} \
|
} \
|
||||||
next##nn: \
|
next##nn: \
|
||||||
@@ -557,18 +562,18 @@ clistidx = nlistidx; \
|
|||||||
|
|
||||||
int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
|
int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
|
||||||
{
|
{
|
||||||
int rsubsize = sizeof(rsub)+(sizeof(char*)*nsubp);
|
int rsubsize = prog->presub, suboff = rsubsize;
|
||||||
int si, i, j, c, suboff = rsubsize, *npc, osubp = nsubp * sizeof(char*);
|
int spc, i, j, c, *npc, osubp = nsubp * sizeof(char*);
|
||||||
int clistidx = 0, nlistidx, spc, mcont = MATCH;
|
int si = 0, clistidx = 0, nlistidx, mcont = MATCH;
|
||||||
const char *sp = s, *_sp = s;
|
const char *sp = s, *_sp = s;
|
||||||
int *insts = prog->insts;
|
int *insts = prog->insts;
|
||||||
int *pcs[prog->splits];
|
int *pcs[prog->splits];
|
||||||
unsigned int sdense[prog->splits * 2], sparsesz;
|
|
||||||
rsub *subs[prog->splits];
|
rsub *subs[prog->splits];
|
||||||
char nsubs[rsubsize * (prog->len-prog->splits+14)];
|
unsigned int sdense[prog->sparsesz], sparsesz;
|
||||||
rsub *nsub, *s1, *matched = NULL, *freesub = NULL;
|
rsub *nsub, *s1, *matched = NULL, *freesub = NULL;
|
||||||
rthread _clist[prog->len], _nlist[prog->len];
|
rthread _clist[prog->len], _nlist[prog->len];
|
||||||
rthread *clist = _clist, *nlist = _nlist, *tmp;
|
rthread *clist = _clist, *nlist = _nlist, *tmp;
|
||||||
|
char nsubs[prog->sub];
|
||||||
goto jmp_start;
|
goto jmp_start;
|
||||||
for (;; sp = _sp) {
|
for (;; sp = _sp) {
|
||||||
uc_len(i, sp) uc_code(c, sp)
|
uc_len(i, sp) uc_code(c, sp)
|
||||||
@@ -651,10 +656,10 @@ int main(int argc, char *argv[])
|
|||||||
printf("Done in %f seconds\n", elapsed_time);
|
printf("Done in %f seconds\n", elapsed_time);
|
||||||
if (!sz)
|
if (!sz)
|
||||||
{ printf("-nomatch-\n"); continue; }
|
{ printf("-nomatch-\n"); continue; }
|
||||||
for (int k=sub_els; k>0; k--)
|
for (int k = sub_els; k > 0; k--)
|
||||||
if (sub[k-1])
|
if (sub[k-1])
|
||||||
break;
|
break;
|
||||||
for (int l=0; l<sub_els; l+=2) {
|
for (int l = 0; l < sub_els; l+=2) {
|
||||||
printf("(");
|
printf("(");
|
||||||
if (sub[l] == NULL || sub[l+1] == NULL)
|
if (sub[l] == NULL || sub[l+1] == NULL)
|
||||||
printf("?");
|
printf("?");
|
||||||
|
|||||||
Reference in New Issue
Block a user