initial utf-8 support
This commit is contained in:
3
README
3
README
@@ -38,11 +38,12 @@ so that the user does not need to waste time taking strlen()
|
|||||||
* Support for "match" vs "search" operations, as common in other regex APIs.
|
* Support for "match" vs "search" operations, as common in other regex APIs.
|
||||||
* Support for named character classes: \d \D \s \S \w \W.
|
* Support for named character classes: \d \D \s \S \w \W.
|
||||||
* Support for repetition operator {n} and {n,m}.
|
* Support for repetition operator {n} and {n,m}.
|
||||||
|
* Support for Unicode (UTF-8).
|
||||||
|
* Unlike other engines, match positions are reported as byte offsets (which is more useful).
|
||||||
|
|
||||||
TODO
|
TODO
|
||||||
====
|
====
|
||||||
|
|
||||||
* Support for Unicode (UTF-8). (trivial to do, because of int type sized code)
|
|
||||||
* Support for matching flags like case-insensitive, dot matches all,
|
* Support for matching flags like case-insensitive, dot matches all,
|
||||||
multiline, etc.
|
multiline, etc.
|
||||||
* Support for more assertions like \A, \Z.
|
* Support for more assertions like \A, \Z.
|
||||||
|
|||||||
81
pike.c
81
pike.c
@@ -9,6 +9,44 @@
|
|||||||
|
|
||||||
#define nelem(x) (sizeof(x)/sizeof((x)[0]))
|
#define nelem(x) (sizeof(x)/sizeof((x)[0]))
|
||||||
|
|
||||||
|
/*
 * Byte length of a UTF-8 sequence, indexed by its first byte.
 *
 *   0x00        -> 0  (NUL terminator)
 *   0x01..0x7F  -> 1  (ASCII)
 *   0xC2..0xDF  -> 2
 *   0xE0..0xEF  -> 3
 *   0xF0..0xF4  -> 4
 *
 * Bytes that cannot start a valid sequence (continuation bytes 0x80..0xBF,
 * the overlong leads 0xC0/0xC1 and 0xF5..0xFF) map to 1 rather than 0:
 * callers advance their cursor by this length, so a zero here would make
 * them stall forever on malformed input.
 */
const unsigned char utf8_length[256] = {
	/*         0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F */
	/* 0 */    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 1 */    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 2 */    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 3 */    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 4 */    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 5 */    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 6 */    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 7 */    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 8 */    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 9 */    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* A */    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* B */    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* C */    1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* D */    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* E */    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* F */    4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
|
||||||
|
|
||||||
|
/*
 * Store in dst the byte length of the UTF-8 character starting at s,
 * looked up in utf8_length by the character's first byte.
 *
 * The expansion is a complete statement carrying its own ';', so call
 * sites write `uc_len(n, p)` with no trailing semicolon. The pointer
 * argument is parenthesized so expressions such as `p + 1` index correctly.
 */
#define uc_len(dst, s) \
	(dst) = utf8_length[(unsigned char)(s)[0]];
|
||||||
|
|
||||||
|
/*
 * Decode the UTF-8 character starting at s and store its Unicode code
 * point in dst (an int lvalue).
 *
 *   - ASCII and stray continuation bytes are stored as their raw byte value.
 *   - 2-, 3- and 4-byte sequences are assembled from the payload bits of the
 *     lead byte and of the following 1, 2 or 3 bytes; the continuation bytes
 *     are not validated.
 *   - Lead bytes 0xF8..0xFF yield 0.
 *
 * The expansion is a single braced block, so it cannot capture a caller's
 * `else`, and call sites may omit the trailing semicolon. The pointer
 * argument is parenthesized so expressions such as `p + 1` index correctly.
 * Both arguments are evaluated more than once: pass side-effect-free
 * expressions only.
 */
#define uc_code(dst, s) \
	{ \
		(dst) = (unsigned char)(s)[0]; \
		if (((dst) & 0xc0) == 0xc0) { /* multi-byte lead byte */ \
			if (~(dst) & 0x20) \
				(dst) = (((dst) & 0x1f) << 6) | ((s)[1] & 0x3f); \
			else if (~(dst) & 0x10) \
				(dst) = (((dst) & 0x0f) << 12) | (((s)[1] & 0x3f) << 6) | \
					((s)[2] & 0x3f); \
			else if (~(dst) & 0x08) \
				(dst) = (((dst) & 0x07) << 18) | (((s)[1] & 0x3f) << 12) | \
					(((s)[2] & 0x3f) << 6) | ((s)[3] & 0x3f); \
			else \
				(dst) = 0; \
		} \
	}
|
||||||
|
|
||||||
typedef struct rinst rinst;
|
typedef struct rinst rinst;
|
||||||
struct rinst
|
struct rinst
|
||||||
{
|
{
|
||||||
@@ -144,9 +182,10 @@ int re_classmatch(const int *pc, const char *sp)
|
|||||||
{
|
{
|
||||||
// pc points to "cnt" byte after opcode
|
// pc points to "cnt" byte after opcode
|
||||||
int is_positive = (pc[-1] == CLASS);
|
int is_positive = (pc[-1] == CLASS);
|
||||||
int cnt = *pc++;
|
int cnt = *pc++, c;
|
||||||
|
uc_code(c, sp)
|
||||||
while (cnt--) {
|
while (cnt--) {
|
||||||
if (*sp >= *pc && *sp <= pc[1]) return is_positive;
|
if (c >= *pc && c <= pc[1]) return is_positive;
|
||||||
pc += 2;
|
pc += 2;
|
||||||
}
|
}
|
||||||
return !is_positive;
|
return !is_positive;
|
||||||
@@ -234,16 +273,15 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
{
|
{
|
||||||
const char *re = *re_loc;
|
const char *re = *re_loc;
|
||||||
int *code = sizecode ? NULL : prog->insts;
|
int *code = sizecode ? NULL : prog->insts;
|
||||||
int start = PC;
|
int start = PC, term = PC;
|
||||||
int term = PC;
|
int alt_label = 0, c;
|
||||||
int alt_label = 0;
|
|
||||||
|
|
||||||
for (; *re && *re != ')'; re++) {
|
for (; *re && *re != ')';) {
|
||||||
switch (*re) {
|
switch (*re) {
|
||||||
case '\\':;
|
case '\\':
|
||||||
re++;
|
re++;
|
||||||
if (!*re) goto syntax_error; // Trailing backslash
|
if (!*re) goto syntax_error; // Trailing backslash
|
||||||
char c = *re | 0x20;
|
c = *re | 0x20;
|
||||||
if (c == 'd' || c == 's' || c == 'w') {
|
if (c == 'd' || c == 's' || c == 'w') {
|
||||||
term = PC;
|
term = PC;
|
||||||
EMIT(PC++, NAMEDCLASS);
|
EMIT(PC++, NAMEDCLASS);
|
||||||
@@ -256,7 +294,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
default:
|
default:
|
||||||
term = PC;
|
term = PC;
|
||||||
EMIT(PC++, CHAR);
|
EMIT(PC++, CHAR);
|
||||||
EMIT(PC++, *re);
|
uc_code(c, re) EMIT(PC++, c);
|
||||||
prog->len++;
|
prog->len++;
|
||||||
break;
|
break;
|
||||||
case '.':
|
case '.':
|
||||||
@@ -276,7 +314,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
}
|
}
|
||||||
PC++; // Skip "# of pairs" byte
|
PC++; // Skip "# of pairs" byte
|
||||||
prog->len++;
|
prog->len++;
|
||||||
for (cnt = 0; *re != ']'; re++, cnt++) {
|
for (cnt = 0; *re != ']'; cnt++) {
|
||||||
if (!*re) goto syntax_error;
|
if (!*re) goto syntax_error;
|
||||||
if (*re == '\\') {
|
if (*re == '\\') {
|
||||||
re++;
|
re++;
|
||||||
@@ -284,10 +322,12 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
if (*re != '\\' && *re != ']')
|
if (*re != '\\' && *re != ']')
|
||||||
goto unsupported_escape;
|
goto unsupported_escape;
|
||||||
}
|
}
|
||||||
EMIT(PC++, *re);
|
uc_code(c, re) EMIT(PC++, c);
|
||||||
if (re[1] == '-' && re[2] != ']')
|
uc_len(c, re)
|
||||||
re += 2;
|
if (re[c] == '-' && re[c+1] != ']')
|
||||||
EMIT(PC++, *re);
|
re += c+1;
|
||||||
|
uc_code(c, re) EMIT(PC++, c);
|
||||||
|
uc_len(c, re) re += c;
|
||||||
}
|
}
|
||||||
EMIT(term + 1, cnt);
|
EMIT(term + 1, cnt);
|
||||||
break;
|
break;
|
||||||
@@ -433,6 +473,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
|
|||||||
term = PC;
|
term = PC;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
uc_len(c, re) re += c;
|
||||||
}
|
}
|
||||||
if (alt_label) {
|
if (alt_label) {
|
||||||
EMIT(alt_label, REL(alt_label, PC) + 1);
|
EMIT(alt_label, REL(alt_label, PC) + 1);
|
||||||
@@ -551,7 +592,7 @@ static void addthread(const int *pbeg, int *plist, int gen, rthreadlist *l,
|
|||||||
|
|
||||||
int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
|
int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
|
||||||
{
|
{
|
||||||
int i, gen, *pc;
|
int i, c, l, gen, *pc;
|
||||||
const char *sp;
|
const char *sp;
|
||||||
int plist[prog->unilen];
|
int plist[prog->unilen];
|
||||||
rsub *sub, *matched = NULL;
|
rsub *sub, *matched = NULL;
|
||||||
@@ -572,10 +613,10 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
|
|||||||
|
|
||||||
gen = 1;
|
gen = 1;
|
||||||
addthread(prog->insts, plist, gen, clist, prog->insts, sub, s, s);
|
addthread(prog->insts, plist, gen, clist, prog->insts, sub, s, s);
|
||||||
for(sp=s;; sp++) {
|
for(sp=s;; sp += l) {
|
||||||
if(clist->n == 0)
|
if(clist->n == 0)
|
||||||
break;
|
break;
|
||||||
gen++;
|
gen++; uc_len(l, s)
|
||||||
for(i=0; i<clist->n; i++) {
|
for(i=0; i<clist->n; i++) {
|
||||||
pc = clist->t[i].pc;
|
pc = clist->t[i].pc;
|
||||||
sub = clist->t[i].sub;
|
sub = clist->t[i].sub;
|
||||||
@@ -587,13 +628,14 @@ int re_pikevm(rcode *prog, const char *s, const char **subp, int nsubp)
|
|||||||
}
|
}
|
||||||
switch(*pc++) {
|
switch(*pc++) {
|
||||||
case CHAR:
|
case CHAR:
|
||||||
if(*sp != *pc++) {
|
uc_code(c, sp)
|
||||||
|
if(c != *pc++) {
|
||||||
decref(sub);
|
decref(sub);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case ANY:
|
case ANY:
|
||||||
addthread:
|
addthread:
|
||||||
addthread(prog->insts, plist, gen, nlist, pc, sub, s, sp+1);
|
addthread(prog->insts, plist, gen, nlist, pc, sub, s, sp+l);
|
||||||
break;
|
break;
|
||||||
case CLASS:
|
case CLASS:
|
||||||
case CLASSNOT:
|
case CLASSNOT:
|
||||||
@@ -652,6 +694,7 @@ int main(int argc, char *argv[])
|
|||||||
const char *sub[sub_els];
|
const char *sub[sub_els];
|
||||||
for (int i = 2; i < argc; i++) {
|
for (int i = 2; i < argc; i++) {
|
||||||
printf("sub depth %d\n", subidx);
|
printf("sub depth %d\n", subidx);
|
||||||
|
/* strlen() returns size_t, which must be printed with %zu, not %d */
printf("input bytelen: %zu\n", strlen(argv[i]));
|
||||||
if(!re_pikevm(_code, argv[i], sub, sub_els))
|
if(!re_pikevm(_code, argv[i], sub, sub_els))
|
||||||
{ printf("-nomatch-\n"); continue; }
|
{ printf("-nomatch-\n"); continue; }
|
||||||
for(int k=sub_els; k>0; k--)
|
for(int k=sub_els; k>0; k--)
|
||||||
|
|||||||
24
test.sh
24
test.sh
@@ -38,6 +38,14 @@ a{5}
|
|||||||
(abc+){5}|[0-9]{1,}
|
(abc+){5}|[0-9]{1,}
|
||||||
(abc+){5}|[0-9]{1,}
|
(abc+){5}|[0-9]{1,}
|
||||||
b[^c]*
|
b[^c]*
|
||||||
|
۱۲۳۴۵۶۷۸۹۰
|
||||||
|
[йцукен]
|
||||||
|
日本語*
|
||||||
|
([^ひらがな])|(な+)
|
||||||
|
([^ひらがな])|(な+)
|
||||||
|
([^abc])|(a+)
|
||||||
|
[a-g]+
|
||||||
|
[а-г]+
|
||||||
"
|
"
|
||||||
input="\
|
input="\
|
||||||
abcdef
|
abcdef
|
||||||
@@ -77,6 +85,14 @@ vbcvb
|
|||||||
abcabcabcabcabcabchsdfhsdh
|
abcabcabcabcabcabchsdfhsdh
|
||||||
62374623
|
62374623
|
||||||
djfjgjsdfjbshdhfhshd
|
djfjgjsdfjbshdhfhshd
|
||||||
|
۱۲۳۴۵۶۷۸۹۰
|
||||||
|
ке
|
||||||
|
日本語語語語語語語本本本本
|
||||||
|
なななな
|
||||||
|
abc
|
||||||
|
aaaa
|
||||||
|
aaaabcdefghij
|
||||||
|
ааааабвг...
|
||||||
"
|
"
|
||||||
expect="\
|
expect="\
|
||||||
(0,3)
|
(0,3)
|
||||||
@@ -116,6 +132,14 @@ expect="\
|
|||||||
(0,15)(12,15)
|
(0,15)(12,15)
|
||||||
(0,8)(?,?)
|
(0,8)(?,?)
|
||||||
(10,20)
|
(10,20)
|
||||||
|
(0,20)
|
||||||
|
(0,2)
|
||||||
|
(0,27)
|
||||||
|
(0,12)(?,?)(0,12)
|
||||||
|
(0,1)(0,1)(?,?)
|
||||||
|
(0,4)(?,?)(0,4)
|
||||||
|
(0,10)
|
||||||
|
(0,16)
|
||||||
(0,0)
|
(0,0)
|
||||||
"
|
"
|
||||||
c=1
|
c=1
|
||||||
|
|||||||
Reference in New Issue
Block a user