further improve submatch extraction

2021-09-02 16:36:19 +00:00
parent 695f6b1f56
commit 67c691f95b
2 changed files with 33 additions and 16 deletions
--- a/pike.c
+++ b/pike.c
@@ -56,6 +56,7 @@ struct rcode
 	int unilen;
 	int len;
 	int sub;
 	int presub;
 	int splits;
 	int gen;
 	int insts[];
@@ -254,7 +255,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
 			if (capture) {
 				sub = ++prog->sub;
 				EMIT(PC++, SAVE);
-				EMIT(PC++, 2 * sub);
+				EMIT(PC++, sub);
 				prog->len++;
 			}
 			int res = _compilecode(&re, prog, sizecode);
@@ -263,7 +264,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
 			if (*re != ')') return RE_SYNTAX_ERROR;
 			if (capture) {
 				EMIT(PC++, SAVE);
-				EMIT(PC++, 2 * sub + 1);
+				EMIT(PC++, sub + prog->presub + 1);
 				prog->len++;
 			}
 			break;
@@ -387,24 +388,26 @@ syntax_error:
 	return RE_SYNTAX_ERROR;
 }
-int re_sizecode(const char *re)
+int re_sizecode(const char *re, int *nsub)
 {
 	rcode dummyprog;
 	dummyprog.unilen = 3;
 	dummyprog.sub = 0;
 	int res = _compilecode(&re, &dummyprog, /*sizecode*/1);
 	if (res < 0) return res;
 	// If unparsed chars left
 	if (*re) return RE_SYNTAX_ERROR;
-
+	*nsub = dummyprog.sub;
 	return dummyprog.unilen;
 }
-int re_comp(rcode *prog, const char *re)
+int re_comp(rcode *prog, const char *re, int nsubs)
 {
 	prog->len = 0;
 	prog->unilen = 0;
 	prog->sub = 0;
 	prog->presub = nsubs;
 	prog->splits = 0;
 	prog->gen = 1;
@@ -414,7 +417,7 @@ int re_comp(rcode *prog, const char *re)
 	if (*re) return RE_SYNTAX_ERROR;
 	prog->insts[prog->unilen++] = SAVE;
-	prog->insts[prog->unilen++] = 1;
+	prog->insts[prog->unilen++] = prog->sub + 1;
 	prog->insts[prog->unilen++] = MATCH;
 	prog->len += 2;
@@ -450,6 +453,16 @@ if (*pc < WBEG) { \
 subs[i++] = sub; \
 goto next##nn; \
 #define save1() \
 newsub(for (j = nsubp / 2; j < nsubp; j++) s1->sub[j] = NULL;) \
 for (j = 0; j < nsubp / 2; j++) \
 	s1->sub[j] = sub->sub[j]; \
 #define save2() \
 newsub(/*nop*/) \
 for (j = 0; j < nsubp; j++) \
 	s1->sub[j] = sub->sub[j]; \
 #define addthread(nn, list, listidx, _pc, _sub) \
 { \
 	int i = 0, *pc = _pc; \
@@ -491,9 +504,7 @@ goto next##nn; \
 	case SAVE: \
 		if (sub->ref > 1) { \
 			sub->ref--; \
-			newsub(/*nop*/) \
+			save##nn() \
 			for (j = 0; j < nsubp; j++) \
 				s1->sub[j] = sub->sub[j]; \
 			sub = s1; \
 			sub->ref = 1; \
 		} \
@@ -583,8 +594,10 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
 			break;
 	}
 	if (matched) {
-		for (i = 0; i < nsubp; i++)
+		for (i = 0, j = i; i < nsubp; i+=2, j++) {
-			subp[i] = matched->sub[i];
+			subp[i] = matched->sub[j];
 			subp[i+1] = matched->sub[j+(nsubp/2)];
 		}
 		_return(1)
 	}
 	_return(0)
@@ -596,19 +609,20 @@ int main(int argc, char *argv[])
 		printf("usage: <regex> <str...> <str...> ...\n");
 		return 0;
 	}
-	int sz = re_sizecode(argv[1]) * sizeof(int);
+	int sub_els;
 	int sz = re_sizecode(argv[1], &sub_els) * sizeof(int);
 	printf("Precalculated size: %d\n", sz);
 	char code[(sizeof(rcode)+sz)*2];
 	memset(code+sizeof(rcode)+sz, 0, sizeof(rcode)+sz);
 	rcode *_code = (rcode*)code;
-	if (re_comp(_code, argv[1])) {
+	if (re_comp(_code, argv[1], sub_els)) {
 		printf("Error in re_comp");
 		return 1;
 	}
 	re_dumpcode(_code);
 	#include <time.h>
 	if (argc > 2) {
-		int sub_els = (_code->sub + 1) * 2;
+		sub_els = (sub_els + 1) * 2;
 		const char *sub[sub_els];
 		for (int i = 2; i < argc; i++) {
 			printf("input bytelen: %ld\n", strlen(argv[i]));
--- a/test.sh
+++ b/test.sh
@@ -91,6 +91,7 @@ abc\\\\>
 [-+]?\\\\<(0[xX][0-9a-fA-FUL]+|[0-9.]{1,}[0-9eEfFuULl]+|[0-9]+)\\\\>
 [-+]?\\\\<(0[xX][0-9a-fA-FUL]+|[0-9.]{1,}[0-9eEfFuULl]+|[0-9]+)\\\\>
 [-+]?\\\\<(0[xX][0-9a-fA-FUL]+|[0-9.]{1,}[0-9eEfFuULl]+|[0-9]+)\\\\>
 (([-+]?\\\\<(0[xX][0-9a-fA-FUL]+|[0-9.]{1,}[0-9eEfFuULl]+|[0-9]+)\\\\>))
 qwerty.*$
 ([a-zA-Z0-9_][^1]*[a-zA-Z0-9_])|(\\\\\$([^\$]+)\\\\\$)
 ([a-zA-Z0-9_][^1]*[a-zA-Z0-9_])|(\\\\\$([^\$]+)\\\\\$)
@@ -197,6 +198,7 @@ world
   0x663q
 x37247
  124435.7727ULL
 str + len - 1;
 jjdfjk sjdjjsqwerty jdfjdfhhdhfdjjjfj jjjdf
 $\"},  /* email */
 $\"},  /* email */$
@@ -303,6 +305,7 @@ expect="\
 -nomatch-
 -nomatch-
 (2,16)(2,16)
 (12,13)(12,13)(12,13)(12,13)
 (14,44)
 (9,14)(9,14)(?,?)(?,?)
 (0,18)(?,?)(0,18)(1,17)