Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix regexp case insensitive flag #531

Merged
merged 2 commits into from
Sep 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 1 addition & 56 deletions libregexp.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,6 @@
/*
TODO:

- Add full unicode canonicalize rules for character ranges (not
really useful but needed for exact "ignorecase" compatibility).

- Add a lock step execution mode (=linear time execution guaranteed)
when the regular expression is "simple" i.e. no backreference nor
complicated lookahead. The opcodes are designed for this execution
Expand Down Expand Up @@ -123,33 +120,6 @@ static int dbuf_insert(DynBuf *s, int pos, int len)
return 0;
}

/* Return the canonical form of code point 'c' used for case-insensitive
   regexp matching, following the JS regexp rules.

   - is_unicode != 0: ASCII 'A'..'Z' is mapped to lower case; any other
     code point >= 128 is replaced by the first code point produced by
     lre_case_conv() in mode 2 (NOTE(review): presumably case folding;
     confirm against lre_case_conv).
   - is_unicode == 0 (legacy regexp): ASCII 'a'..'z' is mapped to upper
     case; a code point >= 128 is replaced by its conversion only when
     that conversion is a single code point which is itself >= 128.

   Any code point not covered by the rules above is returned unchanged. */
static uint32_t lre_canonicalize(uint32_t c, BOOL is_unicode)
{
    uint32_t conv[LRE_CC_RES_LEN_MAX];

    if (likely(c < 128)) {
        /* ASCII fast path: no table lookup needed */
        if (is_unicode) {
            if (c >= 'A' && c <= 'Z')
                return c + ('a' - 'A');
        } else {
            if (c >= 'a' && c <= 'z')
                return c - ('a' - 'A');
        }
        return c;
    }

    if (is_unicode) {
        /* only the first resulting code point is kept */
        lre_case_conv(conv, c, 2);
        return conv[0];
    }

    /* legacy regexp: to upper case if single char >= 128 */
    if (lre_case_conv(conv, c, FALSE) == 1 && conv[0] >= 128)
        return conv[0];
    return c;
}

static const uint16_t char_range_d[] = {
1,
0x0030, 0x0039 + 1,
Expand Down Expand Up @@ -248,31 +218,6 @@ static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c)
return -1;
}

/* Extend the character range 'cr' for case-insensitive matching.

   Only the ASCII letters are handled: the part of 'cr' that falls in
   'a'..'z' is extracted, shifted to the corresponding upper case code
   points and merged back into 'cr'. The lower case ranges are kept in
   'cr' for simplicity.

   Returns the status of cr_op()/cr_union1(); the code treats a non-zero
   value as failure. The temporary range is always released. */
static int cr_canonicalize(CharRange *cr)
{
    CharRange lower;
    /* 'a'..'z' expressed as a [start, end + 1] pair of points */
    uint32_t az[2] = { 'a', 'z' + 1 };
    int status, k;

    cr_init(&lower, cr->mem_opaque, lre_realloc);
    status = cr_op(&lower, cr->points, cr->len, az, 2, CR_OP_INTER);
    if (status == 0) {
        /* convert to upper case */
        /* XXX: the generic unicode case would be much more complicated
           and not really useful */
        for (k = 0; k < lower.len; k++)
            lower.points[k] -= 'a' - 'A';
        /* Note: for simplicity we keep the lower case ranges */
        status = cr_union1(cr, lower.points, lower.len);
    }
    cr_free(&lower);
    return status;
}

#ifdef DUMP_REOP
static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
int buf_len)
Expand Down Expand Up @@ -955,7 +900,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
}
}
if (s->ignore_case) {
if (cr_canonicalize(cr))
if (cr_regexp_canonicalize(cr, s->is_unicode))
goto memory_error;
}
if (invert) {
Expand Down
124 changes: 61 additions & 63 deletions libunicode-table.h
Original file line number Diff line number Diff line change
Expand Up @@ -3777,72 +3777,70 @@ static const uint8_t unicode_prop_Changes_When_Titlecased1_table[22] = {
0x8b, 0x80, 0x8e, 0x80, 0xae, 0x80,
};

static const uint8_t unicode_prop_Changes_When_Casefolded1_table[33] = {
0x40, 0xde, 0x80, 0xcf, 0x80, 0x97, 0x80, 0x44,
0x3c, 0x80, 0x59, 0x11, 0x80, 0x40, 0xe4, 0x3f,
0x3f, 0x87, 0x89, 0x11, 0x05, 0x02, 0x11, 0x80,
0xa9, 0x11, 0x80, 0x60, 0xdb, 0x07, 0x86, 0x8b,
0x84,
static const uint8_t unicode_prop_Changes_When_Casefolded1_table[29] = {
0x41, 0xef, 0x80, 0x41, 0x9e, 0x80, 0x9e, 0x80,
0x5a, 0xe4, 0x83, 0x40, 0xb5, 0x00, 0x00, 0x00,
0x80, 0xde, 0x06, 0x06, 0x80, 0x8a, 0x09, 0x81,
0x89, 0x10, 0x81, 0x8d, 0x80,
};

static const uint8_t unicode_prop_Changes_When_NFKC_Casefolded1_table[451] = {
static const uint8_t unicode_prop_Changes_When_NFKC_Casefolded1_table[447] = {
0x40, 0x9f, 0x06, 0x00, 0x01, 0x00, 0x01, 0x12,
0x10, 0x82, 0x9f, 0x80, 0xcf, 0x01, 0x80, 0x8b,
0x07, 0x80, 0xfb, 0x01, 0x01, 0x80, 0xa5, 0x80,
0x40, 0xbb, 0x88, 0x9e, 0x29, 0x84, 0xda, 0x08,
0x81, 0x89, 0x80, 0xa3, 0x04, 0x02, 0x04, 0x08,
0x80, 0xc9, 0x82, 0x9c, 0x80, 0x41, 0x93, 0x80,
0x40, 0x93, 0x80, 0xd7, 0x83, 0x42, 0xde, 0x87,
0xfb, 0x08, 0x80, 0xd2, 0x01, 0x80, 0xa1, 0x11,
0x80, 0x40, 0xfc, 0x81, 0x42, 0xd4, 0x80, 0xfe,
0x80, 0xa7, 0x81, 0xad, 0x80, 0xb5, 0x80, 0x88,
0x03, 0x03, 0x03, 0x80, 0x8b, 0x80, 0x88, 0x00,
0x26, 0x80, 0x90, 0x80, 0x88, 0x03, 0x03, 0x03,
0x80, 0x8b, 0x80, 0x41, 0x41, 0x80, 0xe1, 0x81,
0x46, 0x52, 0x81, 0xd4, 0x84, 0x45, 0x1b, 0x10,
0x8a, 0x80, 0x91, 0x80, 0x9b, 0x8c, 0x80, 0xa1,
0xa4, 0x40, 0xd9, 0x80, 0x40, 0xd5, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x3f, 0x3f, 0x87,
0x89, 0x11, 0x04, 0x00, 0x29, 0x04, 0x12, 0x80,
0x88, 0x12, 0x80, 0x88, 0x11, 0x11, 0x04, 0x08,
0x8f, 0x00, 0x20, 0x8b, 0x12, 0x2a, 0x08, 0x0b,
0x00, 0x07, 0x82, 0x8c, 0x06, 0x92, 0x81, 0x9a,
0x80, 0x8c, 0x8a, 0x80, 0xd6, 0x18, 0x10, 0x8a,
0x01, 0x0c, 0x0a, 0x00, 0x10, 0x11, 0x02, 0x06,
0x05, 0x1c, 0x85, 0x8f, 0x8f, 0x8f, 0x88, 0x80,
0x40, 0xa1, 0x08, 0x81, 0x40, 0xf7, 0x81, 0x41,
0x34, 0xd5, 0x99, 0x9a, 0x45, 0x20, 0x80, 0xe6,
0x82, 0xe4, 0x80, 0x41, 0x9e, 0x81, 0x40, 0xf0,
0x80, 0x41, 0x2e, 0x80, 0xd2, 0x80, 0x8b, 0x40,
0xd5, 0xa9, 0x80, 0xb4, 0x00, 0x82, 0xdf, 0x09,
0x80, 0xde, 0x80, 0xb0, 0xdd, 0x82, 0x8d, 0xdf,
0x9e, 0x80, 0xa7, 0x87, 0xae, 0x80, 0x41, 0x7f,
0x60, 0x72, 0x9b, 0x81, 0x40, 0xd1, 0x80, 0x40,
0x80, 0x12, 0x81, 0x43, 0x61, 0x83, 0x88, 0x80,
0x60, 0x4d, 0x95, 0x41, 0x0d, 0x08, 0x00, 0x81,
0x89, 0x00, 0x00, 0x09, 0x82, 0xc3, 0x81, 0xe9,
0xa5, 0x86, 0x8b, 0x24, 0x00, 0x97, 0x04, 0x00,
0x01, 0x01, 0x80, 0xeb, 0xa0, 0x41, 0x6a, 0x91,
0xbf, 0x81, 0xb5, 0xa7, 0x8c, 0x82, 0x99, 0x95,
0x94, 0x81, 0x8b, 0x80, 0x92, 0x03, 0x1a, 0x00,
0x80, 0x40, 0x86, 0x08, 0x80, 0x9f, 0x99, 0x40,
0x83, 0x15, 0x0d, 0x0d, 0x0a, 0x16, 0x06, 0x80,
0x88, 0x47, 0x87, 0x20, 0xa9, 0x80, 0x88, 0x60,
0xb4, 0xe4, 0x83, 0x54, 0xb9, 0x86, 0x8d, 0x87,
0xbf, 0x85, 0x42, 0x3e, 0xd4, 0x80, 0xc6, 0x01,
0x08, 0x09, 0x0b, 0x80, 0x8b, 0x00, 0x06, 0x80,
0xc0, 0x03, 0x0f, 0x06, 0x80, 0x9b, 0x03, 0x04,
0x00, 0x16, 0x80, 0x41, 0x53, 0x81, 0x41, 0x23,
0x81, 0xb1, 0x48, 0x2f, 0xbd, 0x4d, 0x91, 0x18,
0x9a, 0x01, 0x00, 0x08, 0x80, 0x89, 0x03, 0x00,
0x00, 0x28, 0x18, 0x00, 0x00, 0x02, 0x01, 0x00,
0x08, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x0b,
0x06, 0x03, 0x03, 0x00, 0x80, 0x89, 0x80, 0x90,
0x22, 0x04, 0x80, 0x90, 0x42, 0x43, 0x8a, 0x84,
0x9e, 0x80, 0x9f, 0x99, 0x82, 0xa2, 0x80, 0xee,
0x82, 0x8c, 0xab, 0x83, 0x88, 0x31, 0x49, 0x9d,
0x89, 0x60, 0xfc, 0x05, 0x42, 0x1d, 0x6b, 0x05,
0xe1, 0x4f, 0xff,
0x10, 0x82, 0xf3, 0x80, 0x8b, 0x80, 0x40, 0x84,
0x01, 0x01, 0x80, 0xa2, 0x01, 0x80, 0x40, 0xbb,
0x88, 0x9e, 0x29, 0x84, 0xda, 0x08, 0x81, 0x89,
0x80, 0xa3, 0x04, 0x02, 0x04, 0x08, 0x07, 0x80,
0x9e, 0x80, 0xa0, 0x82, 0x9c, 0x80, 0x42, 0x28,
0x80, 0xd7, 0x83, 0x42, 0xde, 0x87, 0xfb, 0x08,
0x80, 0xd2, 0x01, 0x80, 0xa1, 0x11, 0x80, 0x40,
0xfc, 0x81, 0x42, 0xd4, 0x80, 0xfe, 0x80, 0xa7,
0x81, 0xad, 0x80, 0xb5, 0x80, 0x88, 0x03, 0x03,
0x03, 0x80, 0x8b, 0x80, 0x88, 0x00, 0x26, 0x80,
0x90, 0x80, 0x88, 0x03, 0x03, 0x03, 0x80, 0x8b,
0x80, 0x41, 0x41, 0x80, 0xe1, 0x81, 0x46, 0x52,
0x81, 0xd4, 0x84, 0x45, 0x1b, 0x10, 0x8a, 0x80,
0x91, 0x80, 0x9b, 0x8c, 0x80, 0xa1, 0xa4, 0x40,
0xd5, 0x83, 0x40, 0xb5, 0x00, 0x00, 0x00, 0x80,
0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
0xb7, 0x05, 0x00, 0x13, 0x05, 0x11, 0x02, 0x0c,
0x11, 0x00, 0x00, 0x0c, 0x15, 0x05, 0x08, 0x8f,
0x00, 0x20, 0x8b, 0x12, 0x2a, 0x08, 0x0b, 0x00,
0x07, 0x82, 0x8c, 0x06, 0x92, 0x81, 0x9a, 0x80,
0x8c, 0x8a, 0x80, 0xd6, 0x18, 0x10, 0x8a, 0x01,
0x0c, 0x0a, 0x00, 0x10, 0x11, 0x02, 0x06, 0x05,
0x1c, 0x85, 0x8f, 0x8f, 0x8f, 0x88, 0x80, 0x40,
0xa1, 0x08, 0x81, 0x40, 0xf7, 0x81, 0x41, 0x34,
0xd5, 0x99, 0x9a, 0x45, 0x20, 0x80, 0xe6, 0x82,
0xe4, 0x80, 0x41, 0x9e, 0x81, 0x40, 0xf0, 0x80,
0x41, 0x2e, 0x80, 0xd2, 0x80, 0x8b, 0x40, 0xd5,
0xa9, 0x80, 0xb4, 0x00, 0x82, 0xdf, 0x09, 0x80,
0xde, 0x80, 0xb0, 0xdd, 0x82, 0x8d, 0xdf, 0x9e,
0x80, 0xa7, 0x87, 0xae, 0x80, 0x41, 0x7f, 0x60,
0x72, 0x9b, 0x81, 0x40, 0xd1, 0x80, 0x40, 0x80,
0x12, 0x81, 0x43, 0x61, 0x83, 0x88, 0x80, 0x60,
0x4d, 0x95, 0x41, 0x0d, 0x08, 0x00, 0x81, 0x89,
0x00, 0x00, 0x09, 0x82, 0xc3, 0x81, 0xe9, 0xc2,
0x00, 0x97, 0x04, 0x00, 0x01, 0x01, 0x80, 0xeb,
0xa0, 0x41, 0x6a, 0x91, 0xbf, 0x81, 0xb5, 0xa7,
0x8c, 0x82, 0x99, 0x95, 0x94, 0x81, 0x8b, 0x80,
0x92, 0x03, 0x1a, 0x00, 0x80, 0x40, 0x86, 0x08,
0x80, 0x9f, 0x99, 0x40, 0x83, 0x15, 0x0d, 0x0d,
0x0a, 0x16, 0x06, 0x80, 0x88, 0x47, 0x87, 0x20,
0xa9, 0x80, 0x88, 0x60, 0xb4, 0xe4, 0x83, 0x54,
0xb9, 0x86, 0x8d, 0x87, 0xbf, 0x85, 0x42, 0x3e,
0xd4, 0x80, 0xc6, 0x01, 0x08, 0x09, 0x0b, 0x80,
0x8b, 0x00, 0x06, 0x80, 0xc0, 0x03, 0x0f, 0x06,
0x80, 0x9b, 0x03, 0x04, 0x00, 0x16, 0x80, 0x41,
0x53, 0x81, 0x41, 0x23, 0x81, 0xb1, 0x48, 0x2f,
0xbd, 0x4d, 0x91, 0x18, 0x9a, 0x01, 0x00, 0x08,
0x80, 0x89, 0x03, 0x00, 0x00, 0x28, 0x18, 0x00,
0x00, 0x02, 0x01, 0x00, 0x08, 0x00, 0x00, 0x00,
0x00, 0x01, 0x00, 0x0b, 0x06, 0x03, 0x03, 0x00,
0x80, 0x89, 0x80, 0x90, 0x22, 0x04, 0x80, 0x90,
0x42, 0x43, 0x8a, 0x84, 0x9e, 0x80, 0x9f, 0x99,
0x82, 0xa2, 0x80, 0xee, 0x82, 0x8c, 0xab, 0x83,
0x88, 0x31, 0x49, 0x9d, 0x89, 0x60, 0xfc, 0x05,
0x42, 0x1d, 0x6b, 0x05, 0xe1, 0x4f, 0xff,
};

static const uint8_t unicode_prop_ASCII_Hex_Digit_table[5] = {
Expand Down
Loading
Loading