pike.c: updated codebase

This commit is contained in:
Kyryl Melekhin
2025-11-01 18:30:11 +00:00
parent 2b7c8eb846
commit cad0f12966
2 changed files with 113 additions and 109 deletions

212
pike.c
View File

@@ -1,6 +1,6 @@
/* /*
Copyright 2007-2009 Russ Cox. All Rights Reserved. Copyright 2007-2009 Russ Cox. All Rights Reserved.
Copyright 2020-2021 Kyryl Melekhin. All Rights Reserved. Copyright 2020-2025 Kyryl Melekhin. All Rights Reserved.
Use of this source code is governed by a BSD-style Use of this source code is governed by a BSD-style
*/ */
@@ -8,6 +8,9 @@ Use of this source code is governed by a BSD-style
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <ctype.h> #include <ctype.h>
#include <time.h>
/* Larger of a and b (yields a when they compare equal). The selected
 * argument is expanded twice, so pass expressions without side effects. */
#define MAX(a, b) ((a) < (b) ? (b) : (a))
unsigned char utf8_length[256] = { unsigned char utf8_length[256] = {
/* 0 1 2 3 4 5 6 7 8 9 A B C D E F */ /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */
@@ -30,16 +33,17 @@ unsigned char utf8_length[256] = {
}; };
/* return the length of a utf-8 character */ /* return the length of a utf-8 character */
#define uc_len(dst, s) dst = utf8_length[(unsigned char)s[0]]; #define uc_len(s) utf8_length[(unsigned char)s[0]]
/* the unicode codepoint of the given utf-8 character */ /* the unicode codepoint of the given utf-8 character */
#define uc_code(dst, s) \ #define uc_code(dst, s, l) \
dst = (unsigned char)s[0]; \ dst = (unsigned char)s[0]; \
if (dst < 192){} \ l = utf8_length[dst]; \
else if (dst < 224) \ if (l == 1); \
else if (l == 2) \
dst = ((dst & 0x1f) << 6) | (s[1] & 0x3f); \ dst = ((dst & 0x1f) << 6) | (s[1] & 0x3f); \
else if (dst < 240) \ else if (l == 3) \
dst = ((dst & 0x0f) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f); \ dst = ((dst & 0x0f) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f); \
else if (dst < 248) \ else if (l == 4) \
dst = ((dst & 0x07) << 18) | ((s[1] & 0x3f) << 12) | \ dst = ((dst & 0x07) << 18) | ((s[1] & 0x3f) << 12) | \
((s[2] & 0x3f) << 6) | (s[3] & 0x3f); \ ((s[2] & 0x3f) << 6) | (s[3] & 0x3f); \
else \ else \
@@ -106,18 +110,6 @@ pc += num;
#define EMIT(at, byte) (code ? (code[at] = byte) : at) #define EMIT(at, byte) (code ? (code[at] = byte) : at)
#define PC (prog->unilen) #define PC (prog->unilen)
/*
 * Test codepoint c against a compiled character class.
 *
 * pc points just past the class opcode, at this layout:
 *   pc[0]      polarity word (the "classnot" slot)
 *   pc[1]      number of range pairs that follow
 *   pc[2..]    pairs of lo, hi bounds, both inclusive
 *
 * Returns pc[0] as soon as c falls inside one of the pairs, and the
 * logical negation of pc[0] when no pair contains c.
 */
static int re_classmatch(const int *pc, int c)
{
	const int polarity = pc[0];
	const int *range = pc + 2;
	int remaining;

	for (remaining = pc[1]; remaining != 0; remaining--, range += 2)
		if (range[0] <= c && c <= range[1])
			return polarity;
	return !polarity;
}
void re_dumpcode(rcode *prog) void re_dumpcode(rcode *prog)
{ {
int pc = 0, i = 0; int pc = 0, i = 0;
@@ -177,12 +169,12 @@ void re_dumpcode(rcode *prog)
prog->unilen, prog->len, prog->splits, i); prog->unilen, prog->len, prog->splits, i);
} }
static int _compilecode(const char *re_loc, rcode *prog, int sizecode) static int compilecode(const char *re_loc, rcode *prog, int sizecode)
{ {
const char *re = re_loc; const char *re = re_loc;
int *code = sizecode ? NULL : prog->insts; int *code = sizecode ? NULL : prog->insts;
int start = PC, term = PC; int start = PC, term = PC;
int alt_label = 0, c; int alt_label = 0, c, l, cnt;
int alt_stack[4096], altc = 0; int alt_stack[4096], altc = 0;
int cap_stack[4096 * 5], capc = 0; int cap_stack[4096 * 5], capc = 0;
@@ -190,10 +182,9 @@ static int _compilecode(const char *re_loc, rcode *prog, int sizecode)
switch (*re) { switch (*re) {
case '\\': case '\\':
re++; re++;
if (!*re) return -1; /* Trailing backslash */ if (!*re)
return -1; /* Trailing backslash */
if (*re == '<' || *re == '>') { if (*re == '<' || *re == '>') {
if (re - re_loc > 2 && re[-2] == '\\')
break;
EMIT(PC++, *re == '<' ? WBEG : WEND); EMIT(PC++, *re == '<' ? WBEG : WEND);
term = PC; term = PC;
break; break;
@@ -201,14 +192,14 @@ static int _compilecode(const char *re_loc, rcode *prog, int sizecode)
default: default:
term = PC; term = PC;
EMIT(PC++, CHAR); EMIT(PC++, CHAR);
uc_code(c, re) EMIT(PC++, c); uc_code(c, re, l)
EMIT(PC++, c);
break; break;
case '.': case '.':
term = PC; term = PC;
EMIT(PC++, ANY); EMIT(PC++, ANY);
break; break;
case '[':; case '[':;
int cnt;
term = PC; term = PC;
re++; re++;
EMIT(PC++, CLASS); EMIT(PC++, CLASS);
@@ -217,36 +208,39 @@ static int _compilecode(const char *re_loc, rcode *prog, int sizecode)
re++; re++;
} else } else
EMIT(PC++, 1); EMIT(PC++, 1);
PC++; /* Skip "# of pairs" byte */ PC++;
for (cnt = 0; *re != ']'; cnt++) { for (cnt = 0; *re != ']'; cnt++) {
if (*re == '\\') re++; if (*re == '\\')
if (!*re) return -1; re++;
uc_code(c, re) EMIT(PC++, c); uc_code(c, re, l)
uc_len(c, re) EMIT(PC++, c);
if (re[c] == '-' && re[c+1] != ']') if (re[l] == '-' && re[l+1] != ']') {
re += c+1; re += l + 1 + (re[l+1] == '\\');
uc_code(c, re) EMIT(PC++, c); uc_code(c, re, l)
uc_len(c, re) re += c; }
EMIT(PC++, c);
if (!l)
return -1;
re += l;
} }
EMIT(term + 2, cnt); EMIT(term + 2, cnt);
break; break;
case '(':; case '(':;
term = PC; term = PC;
int sub; int sub;
int capture = 1; if (re[1] == '?') {
if (*(re+1) == '?') {
re += 2; re += 2;
if (*re == ':') if (*re == ':') {
capture = 0; cap_stack[capc++] = 0;
else goto non_capture;
} else
return -1; return -1;
} }
if (capture) {
sub = ++prog->sub; sub = ++prog->sub;
EMIT(PC++, SAVE); EMIT(PC++, SAVE);
EMIT(PC++, sub); EMIT(PC++, sub);
} cap_stack[capc++] = 1;
cap_stack[capc++] = capture; non_capture:
cap_stack[capc++] = term; cap_stack[capc++] = term;
cap_stack[capc++] = alt_label; cap_stack[capc++] = alt_label;
cap_stack[capc++] = start; cap_stack[capc++] = start;
@@ -255,7 +249,8 @@ static int _compilecode(const char *re_loc, rcode *prog, int sizecode)
start = PC; start = PC;
break; break;
case ')': case ')':
if (--capc-4 < 0) return -1; if (--capc-4 < 0)
return -1;
if (code && alt_label) { if (code && alt_label) {
EMIT(alt_label, REL(alt_label, PC) + 1); EMIT(alt_label, REL(alt_label, PC) + 1);
int _altc = cap_stack[capc]; int _altc = cap_stack[capc];
@@ -303,7 +298,8 @@ static int _compilecode(const char *re_loc, rcode *prog, int sizecode)
} }
break; break;
case '?': case '?':
if (PC == term) return -1; if (PC == term)
return -1;
INSERT_CODE(term, 2, PC); INSERT_CODE(term, 2, PC);
if (re[1] == '?') { if (re[1] == '?') {
EMIT(term, RSPLIT); EMIT(term, RSPLIT);
@@ -314,7 +310,8 @@ static int _compilecode(const char *re_loc, rcode *prog, int sizecode)
term = PC; term = PC;
break; break;
case '*': case '*':
if (PC == term) return -1; if (PC == term)
return -1;
INSERT_CODE(term, 2, PC); INSERT_CODE(term, 2, PC);
EMIT(PC, JMP); EMIT(PC, JMP);
EMIT(PC + 1, REL(PC, term)); EMIT(PC + 1, REL(PC, term));
@@ -328,7 +325,8 @@ static int _compilecode(const char *re_loc, rcode *prog, int sizecode)
term = PC; term = PC;
break; break;
case '+': case '+':
if (PC == term) return -1; if (PC == term)
return -1;
if (re[1] == '?') { if (re[1] == '?') {
EMIT(PC, SPLIT); EMIT(PC, SPLIT);
re++; re++;
@@ -357,7 +355,7 @@ static int _compilecode(const char *re_loc, rcode *prog, int sizecode)
term = PC; term = PC;
break; break;
} }
uc_len(c, re) re += c; re += uc_len(re);
} }
if (code && alt_label) { if (code && alt_label) {
EMIT(alt_label, REL(alt_label, PC) + 1); EMIT(alt_label, REL(alt_label, PC) + 1);
@@ -369,28 +367,25 @@ static int _compilecode(const char *re_loc, rcode *prog, int sizecode)
return capc ? -1 : 0; return capc ? -1 : 0;
} }
int re_sizecode(const char *re, int *nsub) static int re_sizecode(const char *re, int *nsub)
{ {
rcode dummyprog; rcode dummyprog;
dummyprog.unilen = 3; dummyprog.unilen = 4;
dummyprog.sub = 0; dummyprog.sub = 0;
int res = compilecode(re, &dummyprog, 1);
int res = _compilecode(re, &dummyprog, 1);
if (res < 0) return res;
*nsub = dummyprog.sub; *nsub = dummyprog.sub;
return dummyprog.unilen; return res < 0 ? res : dummyprog.unilen;
} }
int re_comp(rcode *prog, const char *re, int nsubs) static int reg_comp(rcode *prog, const char *re, int nsubs)
{ {
prog->len = 0; prog->len = 0;
prog->unilen = 0; prog->unilen = 0;
prog->sub = 0; prog->sub = 0;
prog->presub = nsubs; prog->presub = nsubs;
prog->splits = 0; prog->splits = 0;
if (compilecode(re, prog, 0) < 0)
int res = _compilecode(re, prog, 0); return -1;
if (res < 0) return res;
int icnt = 0, scnt = SPLIT; int icnt = 0, scnt = SPLIT;
for (int i = 0; i < prog->unilen; i++) for (int i = 0; i < prog->unilen; i++)
switch (prog->insts[i]) { switch (prog->insts[i]) {
@@ -416,20 +411,30 @@ int re_comp(rcode *prog, const char *re, int nsubs)
prog->insts[prog->unilen++] = SAVE; prog->insts[prog->unilen++] = SAVE;
prog->insts[prog->unilen++] = prog->sub + 1; prog->insts[prog->unilen++] = prog->sub + 1;
prog->insts[prog->unilen++] = MATCH; prog->insts[prog->unilen++] = MATCH;
prog->splits = (scnt - SPLIT) / 2; prog->splits = MAX((scnt - SPLIT) / 2, 1);
prog->len = icnt + 2; prog->len = icnt + 3;
prog->presub = sizeof(rsub) + (sizeof(char*) * (nsubs + 1) * 2); prog->presub = sizeof(rsub) + (sizeof(char*) * (nsubs + 1) * 2);
prog->sub = prog->presub * (prog->len - prog->splits + 3); prog->sub = prog->presub * (icnt + 6);
prog->sparsesz = scnt; prog->sparsesz = scnt;
return 0; return 0;
} }
#define newsub(init, copy) \ #define newsub(init, copy) \
if (freesub) \ if (freesub) { \
{ s1 = freesub; freesub = s1->freesub; copy } \ sub = freesub; freesub = sub->freesub; copy \
else \ } else { \
{ if (suboff == prog->sub) suboff = 0; \ if (suboff == prog->sub) \
s1 = (rsub*)&nsubs[suboff]; suboff += rsubsize; init } \ suboff = 0; \
sub = (rsub*)&nsubs[suboff]; \
suboff += rsubsize; init \
} \
#define onlist(nn) \
if (sdense[spc] < sparsesz) \
if (sdense[sdense[spc] << 1] == (unsigned int)spc) \
deccheck(nn) \
sdense[spc] = sparsesz; \
sdense[sparsesz++ << 1] = spc; \
#define decref(csub) \ #define decref(csub) \
if (--csub->ref == 0) { \ if (--csub->ref == 0) { \
@@ -446,13 +451,6 @@ if (si) { \
#define deccheck(nn) { decref(nsub) rec_check(nn) continue; } \ #define deccheck(nn) { decref(nsub) rec_check(nn) continue; } \
#define onlist(nn) \
if (sdense[spc] < sparsesz) \
if (sdense[sdense[spc] * 2] == (unsigned int)spc) \
deccheck(nn) \
sdense[spc] = sparsesz; \
sdense[sparsesz++ * 2] = spc; \
#define fastrec(nn, list, listidx) \ #define fastrec(nn, list, listidx) \
nsub->ref++; \ nsub->ref++; \
spc = *npc; \ spc = *npc; \
@@ -466,20 +464,20 @@ subs[si++] = nsub; \
goto next##nn; \ goto next##nn; \
#define saveclist() \ #define saveclist() \
if (npc[1] > nsubp / 2 && nsub->ref > 1) { \ if (npc[1] > (nsubp >> 1) && nsub->ref > 1) { \
nsub->ref--; \ nsub->ref--; \
newsub(memcpy(s1->sub, nsub->sub, osubp);, \ newsub(memcpy(sub->sub, nsub->sub, osubp);, \
memcpy(s1->sub, nsub->sub, osubp / 2);) \ memcpy(sub->sub, nsub->sub, osubp >> 1);) \
nsub = s1; \ nsub = sub; \
nsub->ref = 1; \ nsub->ref = 1; \
} \ } \
#define savenlist() \ #define savenlist() \
if (nsub->ref > 1) { \ if (nsub->ref > 1) { \
nsub->ref--; \ nsub->ref--; \
newsub(/*nop*/, /*nop*/) \ newsub(,) \
memcpy(s1->sub, nsub->sub, osubp); \ memcpy(sub->sub, nsub->sub, osubp); \
nsub = s1; \ nsub = sub; \
nsub->ref = 1; \ nsub->ref = 1; \
} \ } \
@@ -513,8 +511,7 @@ if (spc > JMP) { \
} else if (spc == SAVE) { \ } else if (spc == SAVE) { \
save##list() \ save##list() \
nsub->sub[npc[1]] = _sp; \ nsub->sub[npc[1]] = _sp; \
npc += 2; \ npc += 2; goto rec##nn; \
goto rec##nn; \
} else if (spc == WBEG) { \ } else if (spc == WBEG) { \
if (((sp != s || sp != _sp) && isword(sp)) \ if (((sp != s || sp != _sp) && isword(sp)) \
|| !isword(_sp)) \ || !isword(_sp)) \
@@ -555,23 +552,24 @@ clistidx = nlistidx; \
#define deccont() { decref(nsub) continue; } #define deccont() { decref(nsub) continue; }
int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp) static int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
{ {
int rsubsize = prog->presub, suboff = 0; if (!*s)
int spc, i, j, c, *npc, osubp = nsubp * sizeof(char*); return 0;
int si = 0, clistidx = 0, nlistidx, mcont = MATCH;
const char *sp = s, *_sp = s; const char *sp = s, *_sp = s;
int *insts = prog->insts; int *pcs[prog->splits], *npc, *pc, *insts = prog->insts;
int *pcs[prog->splits];
rsub *subs[prog->splits]; rsub *subs[prog->splits];
unsigned int sdense[prog->sparsesz], sparsesz = 0; rsub *nsub, *sub, *matched = NULL, *freesub = NULL;
rsub *nsub, *s1, *matched = NULL, *freesub = NULL;
rthread _clist[prog->len], _nlist[prog->len]; rthread _clist[prog->len], _nlist[prog->len];
rthread *clist = _clist, *nlist = _nlist, *tmp; rthread *clist = _clist, *nlist = _nlist, *tmp;
int rsubsize = prog->presub, suboff = 0;
int cnt, spc, i, c, osubp = nsubp * sizeof(char*);
int si = 0, clistidx = 0, nlistidx, mcont = MATCH;
unsigned int sdense[prog->sparsesz], sparsesz = 0;
char nsubs[prog->sub]; char nsubs[prog->sub];
goto jmp_start; goto jmp_start;
for (;; sp = _sp) { for (;; sp = _sp) {
uc_len(i, sp) uc_code(c, sp) uc_code(c, sp, i)
_sp = sp+i; _sp = sp+i;
nlistidx = 0; sparsesz = 0; nlistidx = 0; sparsesz = 0;
for (i = 0; i < clistidx; i++) { for (i = 0; i < clistidx; i++) {
@@ -579,13 +577,20 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
nsub = clist[i].sub; nsub = clist[i].sub;
spc = *npc; spc = *npc;
if (spc == CHAR) { if (spc == CHAR) {
if (c != *(npc+1)) if (c != npc[1])
deccont() deccont()
npc += 2; npc += 2;
} else if (spc == CLASS) { } else if (spc == CLASS) {
if (!re_classmatch(npc+1, c)) pc = npc+1;
cnt = pc[1];
for (; cnt > 0; cnt--) {
pc += 2;
if (c >= *pc && c <= pc[1])
cnt = -1;
}
if ((!cnt && npc[1]) || (cnt < 0 && !npc[1]))
deccont() deccont()
npc += *(npc+2) * 2 + 3; npc += npc[2] * 2 + 3;
} else if (spc == MATCH) { } else if (spc == MATCH) {
matched: matched:
nlist[nlistidx++].pc = &mcont; nlist[nlistidx++].pc = &mcont;
@@ -595,9 +600,9 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
matched = nsub; matched = nsub;
} }
if (sp == _sp || nlistidx == 1) { if (sp == _sp || nlistidx == 1) {
for (i = 0, j = i; i < nsubp; i+=2, j++) { for (i = 0; i < nsubp; i+=2) {
subp[i] = matched->sub[j]; subp[i] = matched->sub[i >> 1];
subp[i+1] = matched->sub[nsubp / 2 + j]; subp[i+1] = matched->sub[(nsubp >> 1) + (i >> 1)];
} }
return 1; return 1;
} }
@@ -611,10 +616,10 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
break; break;
swaplist() swaplist()
jmp_start: jmp_start:
newsub(memset(s1->sub, 0, osubp);, /*nop*/) newsub(memset(sub->sub, 0, osubp);,)
s1->ref = 1; sub->ref = 1;
s1->sub[0] = _sp; sub->sub[0] = _sp;
nsub = s1; npc = insts; nsub = sub; npc = insts;
addthread(1, clist, clistidx) addthread(1, clist, clistidx)
_continue:; _continue:;
} }
@@ -636,12 +641,11 @@ int main(int argc, char *argv[])
} }
char code[sizeof(rcode)+sz]; char code[sizeof(rcode)+sz];
rcode *_code = (rcode*)code; rcode *_code = (rcode*)code;
if (re_comp(_code, argv[1], sub_els)) { if (reg_comp(_code, argv[1], sub_els)) {
printf("Error in re_comp\n"); printf("Error in reg_comp\n");
return 1; return 1;
} }
re_dumpcode(_code); re_dumpcode(_code);
#include <time.h>
if (argc > 2) { if (argc > 2) {
sub_els = (sub_els + 1) * 2; sub_els = (sub_els + 1) * 2;
const char *sub[sub_els]; const char *sub[sub_els];

View File

@@ -581,7 +581,7 @@ expect="\
(0,10)(0,10)(0,10)(?,?) (0,10)(0,10)(0,10)(?,?)
(0,0)(0,0)(0,0)(?,?) (0,0)(0,0)(0,0)(?,?)
(0,20)(0,20)(0,20)(?,?) (0,20)(0,20)(0,20)(?,?)
(0,0) -nomatch-
" "
if [ ! -f ./a.out ]; then if [ ! -f ./a.out ]; then