From b55ee55e8750e2710a6148ffa78d1825ecbe427e Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Tue, 27 Jun 2023 18:25:27 +0200 Subject: [PATCH 01/18] mysql: introduce icuregex package Co-authored-by: Dirkjan Bussink Signed-off-by: Vicent Marti --- go/mysql/icuregex/compiler.go | 3645 +++++++++++++++++ go/mysql/icuregex/compiler_table.go | 357 ++ go/mysql/icuregex/debug.go | 157 + go/mysql/icuregex/error.go | 125 + go/mysql/icuregex/icu_test.go | 430 ++ .../icuregex/internal/bytestrie/bytes_trie.go | 373 ++ go/mysql/icuregex/internal/icudata/README.md | 46 + go/mysql/icuregex/internal/icudata/char.brk | Bin 0 -> 13680 bytes go/mysql/icuregex/internal/icudata/embed.go | 96 + go/mysql/icuregex/internal/icudata/nfc.nrm | Bin 0 -> 35124 bytes go/mysql/icuregex/internal/icudata/nfkc.nrm | Bin 0 -> 54136 bytes .../icuregex/internal/icudata/nfkc_cf.nrm | Bin 0 -> 51472 bytes go/mysql/icuregex/internal/icudata/pnames.icu | Bin 0 -> 42682 bytes go/mysql/icuregex/internal/icudata/ubidi.icu | Bin 0 -> 26636 bytes go/mysql/icuregex/internal/icudata/ucase.icu | Bin 0 -> 28898 bytes .../icuregex/internal/icudata/ulayout.icu | Bin 0 -> 13488 bytes go/mysql/icuregex/internal/icudata/unames.icu | Bin 0 -> 283932 bytes go/mysql/icuregex/internal/icudata/uprops.icu | Bin 0 -> 135656 bytes go/mysql/icuregex/internal/icudata/word.brk | Bin 0 -> 22232 bytes .../icuregex/internal/pattern/unescape.go | 201 + .../internal/pattern/unescape_test.go | 38 + go/mysql/icuregex/internal/pattern/utils.go | 111 + go/mysql/icuregex/internal/ubidi/ubidi.go | 460 +++ go/mysql/icuregex/internal/ucase/fold.go | 244 ++ go/mysql/icuregex/internal/ucase/ucase.go | 422 ++ go/mysql/icuregex/internal/uchar/constants.go | 238 ++ go/mysql/icuregex/internal/uchar/uchar.go | 400 ++ go/mysql/icuregex/internal/udata/udata.go | 160 + go/mysql/icuregex/internal/uerror/error.go | 159 + go/mysql/icuregex/internal/ulayout/ulayout.go | 133 + go/mysql/icuregex/internal/unames/unames.go | 484 +++ 
.../icuregex/internal/unames/unames_test.go | 64 + .../icuregex/internal/uprops/constants.go | 625 +++ go/mysql/icuregex/internal/uprops/uprops.go | 287 ++ .../icuregex/internal/uprops/uprops_binary.go | 229 ++ .../icuregex/internal/uprops/uprops_int.go | 287 ++ go/mysql/icuregex/internal/uprops/uscript.go | 507 +++ go/mysql/icuregex/internal/uset/close.go | 98 + go/mysql/icuregex/internal/uset/pattern.go | 107 + go/mysql/icuregex/internal/uset/properties.go | 417 ++ .../icuregex/internal/uset/unicode_set.go | 653 +++ .../internal/uset/unicode_set_test.go | 43 + go/mysql/icuregex/internal/utf16/helpers.go | 76 + go/mysql/icuregex/internal/utrie/ucptrie.go | 715 ++++ go/mysql/icuregex/internal/utrie/utrie2.go | 448 ++ go/mysql/icuregex/matcher.go | 1890 +++++++++ go/mysql/icuregex/ops.go | 417 ++ go/mysql/icuregex/pattern.go | 144 + go/mysql/icuregex/perl_test.go | 218 + go/mysql/icuregex/sets.go | 103 + go/mysql/icuregex/sets_test.go | 50 + go/mysql/icuregex/testdata/re_tests.txt | 923 +++++ go/mysql/icuregex/testdata/regextst.txt | 2793 +++++++++++++ .../icuregex/testdata/regextst_extended.txt | 88 + 54 files changed, 19461 insertions(+) create mode 100644 go/mysql/icuregex/compiler.go create mode 100644 go/mysql/icuregex/compiler_table.go create mode 100644 go/mysql/icuregex/debug.go create mode 100644 go/mysql/icuregex/error.go create mode 100644 go/mysql/icuregex/icu_test.go create mode 100644 go/mysql/icuregex/internal/bytestrie/bytes_trie.go create mode 100644 go/mysql/icuregex/internal/icudata/README.md create mode 100644 go/mysql/icuregex/internal/icudata/char.brk create mode 100644 go/mysql/icuregex/internal/icudata/embed.go create mode 100644 go/mysql/icuregex/internal/icudata/nfc.nrm create mode 100644 go/mysql/icuregex/internal/icudata/nfkc.nrm create mode 100644 go/mysql/icuregex/internal/icudata/nfkc_cf.nrm create mode 100644 go/mysql/icuregex/internal/icudata/pnames.icu create mode 100644 go/mysql/icuregex/internal/icudata/ubidi.icu create mode 
100644 go/mysql/icuregex/internal/icudata/ucase.icu create mode 100644 go/mysql/icuregex/internal/icudata/ulayout.icu create mode 100644 go/mysql/icuregex/internal/icudata/unames.icu create mode 100644 go/mysql/icuregex/internal/icudata/uprops.icu create mode 100644 go/mysql/icuregex/internal/icudata/word.brk create mode 100644 go/mysql/icuregex/internal/pattern/unescape.go create mode 100644 go/mysql/icuregex/internal/pattern/unescape_test.go create mode 100644 go/mysql/icuregex/internal/pattern/utils.go create mode 100644 go/mysql/icuregex/internal/ubidi/ubidi.go create mode 100644 go/mysql/icuregex/internal/ucase/fold.go create mode 100644 go/mysql/icuregex/internal/ucase/ucase.go create mode 100644 go/mysql/icuregex/internal/uchar/constants.go create mode 100644 go/mysql/icuregex/internal/uchar/uchar.go create mode 100644 go/mysql/icuregex/internal/udata/udata.go create mode 100644 go/mysql/icuregex/internal/uerror/error.go create mode 100644 go/mysql/icuregex/internal/ulayout/ulayout.go create mode 100644 go/mysql/icuregex/internal/unames/unames.go create mode 100644 go/mysql/icuregex/internal/unames/unames_test.go create mode 100644 go/mysql/icuregex/internal/uprops/constants.go create mode 100644 go/mysql/icuregex/internal/uprops/uprops.go create mode 100644 go/mysql/icuregex/internal/uprops/uprops_binary.go create mode 100644 go/mysql/icuregex/internal/uprops/uprops_int.go create mode 100644 go/mysql/icuregex/internal/uprops/uscript.go create mode 100644 go/mysql/icuregex/internal/uset/close.go create mode 100644 go/mysql/icuregex/internal/uset/pattern.go create mode 100644 go/mysql/icuregex/internal/uset/properties.go create mode 100644 go/mysql/icuregex/internal/uset/unicode_set.go create mode 100644 go/mysql/icuregex/internal/uset/unicode_set_test.go create mode 100644 go/mysql/icuregex/internal/utf16/helpers.go create mode 100644 go/mysql/icuregex/internal/utrie/ucptrie.go create mode 100644 go/mysql/icuregex/internal/utrie/utrie2.go create mode 100644 
go/mysql/icuregex/matcher.go create mode 100644 go/mysql/icuregex/ops.go create mode 100644 go/mysql/icuregex/pattern.go create mode 100644 go/mysql/icuregex/perl_test.go create mode 100644 go/mysql/icuregex/sets.go create mode 100644 go/mysql/icuregex/sets_test.go create mode 100644 go/mysql/icuregex/testdata/re_tests.txt create mode 100644 go/mysql/icuregex/testdata/regextst.txt create mode 100644 go/mysql/icuregex/testdata/regextst_extended.txt diff --git a/go/mysql/icuregex/compiler.go b/go/mysql/icuregex/compiler.go new file mode 100644 index 00000000000..cef7f26623b --- /dev/null +++ b/go/mysql/icuregex/compiler.go @@ -0,0 +1,3645 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package icuregex + +import ( + "fmt" + "math" + "os" + "strings" + "unicode/utf8" + + "golang.org/x/exp/slices" + + "vitess.io/vitess/go/mysql/icuregex/internal/pattern" + "vitess.io/vitess/go/mysql/icuregex/internal/ucase" + "vitess.io/vitess/go/mysql/icuregex/internal/uchar" + "vitess.io/vitess/go/mysql/icuregex/internal/uerror" + "vitess.io/vitess/go/mysql/icuregex/internal/unames" + "vitess.io/vitess/go/mysql/icuregex/internal/uprops" + "vitess.io/vitess/go/mysql/icuregex/internal/uset" + "vitess.io/vitess/go/mysql/icuregex/internal/utf16" +) + +const BreakIteration = false +// kStackSize is the capacity of the Compiler's state return stack. +const kStackSize = 100 + +// reChar is one character read from the pattern plus a flag recording whether +// it was quoted or escaped in the pattern source. +type reChar struct { + char rune + quoted bool +} + +// Markers identifying the kind of a parenthesized group. doParseActions pushes +// them on Compiler.parenStack when a group is opened. +const ( + parenPlain = -1 + parenCapturing = -2 + parenAtomic = -3 + parenLookahead = -4 + parenNegLookahead = -5 + parenFlags = -6 + parenLookBehind = -7 + parenLookBehindN = -8 +) + +// setOperation identifies an entry on the Compiler's set operator stack. +type setOperation uint32 + +// Each value packs a number in its upper 16 bits with a distinct identifier in +// the lower 16 bits. NOTE(review): the upper number looks like an operator +// precedence - confirm against the set expression parser. +const ( + setStart setOperation = 0<<16 | 1 + setEnd setOperation = 1<<16 | 2 + setNegation setOperation = 2<<16 | 3 + setCaseClose setOperation = 2<<16 | 9 + setDifference2 setOperation = 3<<16 | 4 // '--' set difference operator + setIntersection2 setOperation = 3<<16 | 5 // '&&' set intersection operator + setUnion setOperation = 4<<16 | 6 // implicit union of adjacent items + setDifference1 setOperation = 4<<16 | 7 // '-', single dash difference op, for compatibility with old UnicodeSet. + setIntersection1 setOperation = 4<<16 | 8 // '&', single amp intersection op, for compatibility with old UnicodeSet. 
+) + +// Compiler holds the state used while parsing a pattern string and emitting +// its compiled form into out. +type Compiler struct { + err error + out *Pattern + p string + + scanIndex int + quoteMode bool + inBackslashQuote bool + eolComments bool + + lineNum int + charNum int + lastChar rune + peekChar rune + + c reChar + stack [kStackSize]uint16 + stackPtr int + + modeFlags RegexpFlag + newModeFlags RegexpFlag + setModeFlag bool + + literalChars []rune + patternLength int + + parenStack []int + matchOpenParen int + matchCloseParen int + + intervalLow int + intervalUpper int + + setStack []*uset.UnicodeSet + setOpStack []setOperation + + lastSetLiteral rune + captureName *strings.Builder +} + +// NewCompiler returns a Compiler that writes its output into pat. The mode +// flags start from pat.flags with the top bit (0x80000000) set. +func NewCompiler(pat *Pattern) *Compiler { + return &Compiler{ + out: pat, + scanIndex: 0, + eolComments: true, + lineNum: 1, + charNum: 0, + lastChar: -1, + peekChar: -1, + modeFlags: RegexpFlag(uint32(pat.flags) | 0x80000000), + matchOpenParen: -1, + matchCloseParen: -1, + lastSetLiteral: -1, + } +} + +// nextCharLL returns the next rune of the pattern, or -1 at the end of the +// input or when the next bytes are not valid UTF-8. A pending peeked rune is +// returned first; lineNum, charNum and lastChar are updated for every rune +// that is decoded. +func (c *Compiler) nextCharLL() (ch rune) { + if c.peekChar != -1 { + ch, c.peekChar = c.peekChar, -1 + return + } + var w int + ch, w = utf8.DecodeRuneInString(c.p) + c.p = c.p[w:] + if ch == utf8.RuneError && (w == 0 || w == 1) { + return -1 + } + + // A LF that directly follows a CR belongs to the same line break, so it + // neither starts a new line nor advances the column. + if ch == chCR || ch == chNEL || ch == chLS || (ch == chLF && c.lastChar != chCR) { + c.lineNum++ + c.charNum = 0 + } else { + if ch != chLF { + c.charNum++ + } + } + c.lastChar = ch + return +} + +// peekCharLL returns the next rune without consuming it. The rune is cached +// in peekChar until the following nextCharLL call. +func (c *Compiler) peekCharLL() rune { + if c.peekChar == -1 { + c.peekChar = c.nextCharLL() + } + return c.peekChar +} + +// nextChar reads the next pattern character into ch, handling \Q...\E quoting, +// free-spacing comments and backslash escapes along the way. +func (c *Compiler) nextChar(ch *reChar) { + c.scanIndex++ + ch.char = c.nextCharLL() + ch.quoted = false + + if c.quoteMode { + ch.quoted = true + if (ch.char == chBackSlash && c.peekCharLL() == chE && ((c.modeFlags & UREGEX_LITERAL) == 0)) || + ch.char == -1 { + c.quoteMode = false // Exit quote mode, + c.nextCharLL() // discard the E + c.nextChar(ch) + return + } + } else if c.inBackslashQuote { + // The current character immediately follows a '\' + // Don't check for any further escapes, just return it as-is. 
+ // Don't set ch.quoted, because that would prevent the state machine from + // dispatching on the character. + c.inBackslashQuote = false + } else { + // We are not in a \Q quoted region \E of the source. + // + if (c.modeFlags & UREGEX_COMMENTS) != 0 { + // + // We are in free-spacing and comments mode. + // Scan through any white space and comments, until we + // reach a significant character or the end of input. + for { + if ch.char == -1 { + break // End of Input + } + if ch.char == chPound && c.eolComments { + // Start of a comment. Consume the rest of it, until EOF or a new line + for { + ch.char = c.nextCharLL() + if ch.char == -1 || // EOF + ch.char == chCR || + ch.char == chLF || + ch.char == chNEL || + ch.char == chLS { + break + } + } + } + // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061. + if !pattern.IsWhitespace(ch.char) { + break + } + ch.char = c.nextCharLL() + } + } + + // + // check for backslash escaped characters. + // + if ch.char == chBackSlash { + beforeEscape := c.p + if staticSetUnescape.ContainsRune(c.peekCharLL()) { + // + // A '\' sequence that is handled by ICU's standard unescapeAt function. + // Includes \uxxxx, \n, \r, many others. + // Return the single equivalent character. + // + c.nextCharLL() // get & discard the peeked char. + ch.quoted = true + + ch.char, c.p = pattern.UnescapeAt(beforeEscape) + if ch.char < 0 { + c.error(uerror.U_REGEX_BAD_ESCAPE_SEQUENCE) + } + c.charNum += len(beforeEscape) - len(c.p) + } else if c.peekCharLL() == chDigit0 { + // Octal Escape, using Java Regexp Conventions + // which are \0 followed by 1-3 octal digits. + // Different from ICU Unescape handling of Octal, which does not + // require the leading 0. + // Java also has the convention of only consuming 2 octal digits if + // the three digit number would be > 0xff + // + ch.char = 0 + c.nextCharLL() // Consume the initial 0. 
+ for index := 0; index < 3; index++ { + ch2 := c.peekCharLL() + if ch2 < chDigit0 || ch2 > chDigit7 { + if index == 0 { + // \0 is not followed by any octal digits. + c.error(uerror.U_REGEX_BAD_ESCAPE_SEQUENCE) + } + break + } + // The low three bits of an ASCII octal digit are its numeric value. + ch.char <<= 3 + ch.char += ch2 & 7 + if ch.char <= 255 { + c.nextCharLL() + } else { + // The last digit made the number too big. Forget we saw it. + ch.char >>= 3 + } + } + ch.quoted = true + } else if c.peekCharLL() == chQ { + // "\Q" enter quote mode, which will continue until "\E" + c.quoteMode = true + c.nextCharLL() // discard the 'Q'. + c.nextChar(ch) // recurse to get the real next char. + return + } else { + // We are in a '\' escape that will be handled by the state table scanner. + // Just return the backslash, but remember that the following char is to + // be taken literally. + c.inBackslashQuote = true + } + } + } + + // re-enable # to end-of-line comments, in case they were disabled. + // They are disabled by the parser upon seeing '(?', but this lasts for + // the fetching of the next character only. + c.eolComments = true +} + +// Code points that the pattern scanner compares against. +const ( + chCR = 0x0d // New lines, for terminating comments. + chLF = 0x0a // Line Feed + chPound = 0x23 // '#', introduces a comment. 
+ chDigit0 = 0x30 // '0' + chDigit7 = 0x37 // '7' + chColon = 0x3A // ':' + chE = 0x45 // 'E' + chQ = 0x51 // 'Q' + chN = 0x4E // 'N' + chP = 0x50 // 'P' + chBackSlash = 0x5c // '\' introduces a char escape + chLBracket = 0x5b // '[' + chRBracket = 0x5d // ']' + chUp = 0x5e // '^' + chLowerP = 0x70 // 'p' + chLBrace = 0x7b // '{' + chRBrace = 0x7d // '}' + chNEL = 0x85 // NEL newline variant + chLS = 0x2028 // Unicode Line Separator + chAmp = 0x26 // '&' + chDash = 0x2d // '-' +) + +// compile parses pat with the parseStateTable state machine and stores the +// result in c.out. It panics if c.out already holds a pattern and returns the +// error recorded in c.err, if any. +func (c *Compiler) compile(pat string) error { + if c.err != nil { + return c.err + } + if c.out.pattern != "" { + panic("cannot reuse pattern") + } + + c.out.pattern = pat + c.p = pat + c.patternLength = utf8.RuneCountInString(pat) + + var state uint16 = 1 + var table []regexTableEl + + // UREGEX_LITERAL forces the entire pattern to be treated as a literal string. + if c.modeFlags&UREGEX_LITERAL != 0 { + c.quoteMode = true + } + + c.nextChar(&c.c) + + // Main loop for the regex pattern parsing state machine. + // Runs once per state transition. + // Each time through optionally performs, depending on the state table, + // - an advance to the next pattern char + // - an action to be performed. + // - pushing or popping a state to/from the local state return stack. + // file regexcst.txt is the source for the state table. The logic behind + // recognizing the pattern syntax is there, not here. 
+ for { + if c.err != nil { + break + } + + if state == 0 { + panic("bad state?") + } + + // Find the first row of the current state that accepts the current character. + table = parseStateTable[state:] + for len(table) > 0 { + // A class below 127 is a literal code point, matched only when unquoted. + if table[0].charClass < 127 && !c.c.quoted && rune(table[0].charClass) == c.c.char { + break + } + // 255 matches unconditionally. + if table[0].charClass == 255 { + break + } + // 254 matches any quoted character. + if table[0].charClass == 254 && c.c.quoted { + break + } + // 253 matches the end of the input. + if table[0].charClass == 253 && c.c.char == -1 { + break + } + // 128-239 select an entry of staticRuleSet (index = class - 128). + if table[0].charClass >= 128 && table[0].charClass < 240 && !c.c.quoted && c.c.char != -1 { + if staticRuleSet[table[0].charClass-128].ContainsRune(c.c.char) { + break + } + } + + table = table[1:] + } + + if !c.doParseActions(table[0].action) { + break + } + + if table[0].pushState != 0 { + c.stackPtr++ + if c.stackPtr >= kStackSize { + c.error(uerror.U_REGEX_INTERNAL_ERROR) + c.stackPtr-- + } + c.stack[c.stackPtr] = uint16(table[0].pushState) + } + + if table[0].nextChar { + c.nextChar(&c.c) + } + + // A next state of 255 means: pop the next state from the return stack. + if table[0].nextState != 255 { + state = uint16(table[0].nextState) + } else { + state = c.stack[c.stackPtr] + c.stackPtr-- + if c.stackPtr < 0 { + c.stackPtr++ + c.error(uerror.U_REGEX_MISMATCHED_PAREN) + } + } + } + + if c.err != nil { + return c.err + } + + c.allocateStackData(RESTACKFRAME_HDRCOUNT) + c.stripNOPs() + + c.out.minMatchLen = c.minMatchLength(3, len(c.out.compiledPat)-1) + + c.matchStartType() + return c.err +} + +// DebugParseActions makes doParseActions print every action to stderr. +const DebugParseActions = false + +// doParseActions executes one parse action selected by the state table. A +// false result makes compile leave its main loop. +func (c *Compiler) doParseActions(action patternParseAction) bool { + if DebugParseActions { + fmt.Fprintf(os.Stderr, "doParseActions(action=%d)\n\t%s\n", action, c.p) + } + + switch action { + case doPatStart: + // Start of pattern compiles to: + //0 SAVE 2 Fall back to position of FAIL + //1 jmp 3 + //2 FAIL Stop if we ever reach here. + //3 NOP Dummy, so start of pattern looks the same as + // the start of an ( grouping. 
+ //4 NOP Resreved, will be replaced by a save if there are + // OR | operators at the top level + c.appendOp(URX_STATE_SAVE, 2) + c.appendOp(URX_JMP, 3) + c.appendOp(URX_FAIL, 0) + + // Standard open nonCapture paren action emits the two NOPs and + // sets up the paren stack frame. + c.doParseActions(doOpenNonCaptureParen) + + case doPatFinish: + // We've scanned to the end of the pattern + // The end of pattern compiles to: + // URX_END + // which will stop the runtime match engine. + // Encountering end of pattern also behaves like a close paren, + // and forces fixups of the State Save at the beginning of the compiled pattern + // and of any OR operations at the top level. + // + c.handleCloseParen() + if len(c.parenStack) > 0 { + // Missing close paren in pattern. + c.error(uerror.U_REGEX_MISMATCHED_PAREN) + } + + // add the END operation to the compiled pattern. + c.appendOp(URX_END, 0) + + // Terminate the pattern compilation state machine. + return false + + case doOrOperator: + // Scanning a '|', as in (A|B) + // Generate code for any pending literals preceding the '|' + c.fixLiterals(false) + + // Insert a SAVE operation at the start of the pattern section preceding + // this OR at this level. This SAVE will branch the match forward + // to the right hand side of the OR in the event that the left hand + // side fails to match and backtracks. Locate the position for the + // save from the location on the top of the parentheses stack. + var savePosition int + savePosition, c.parenStack = stackPop(c.parenStack) + op := c.out.compiledPat[savePosition] + + if op.Type() != URX_NOP { + panic("expected a NOP placeholder") + } + + op = c.buildOp(URX_STATE_SAVE, len(c.out.compiledPat)+1) + c.out.compiledPat[savePosition] = op + + // Append an JMP operation into the compiled pattern. The operand for + // the JMP will eventually be the location following the ')' for the + // group. This will be patched in later, when the ')' is encountered. 
+ c.appendOp(URX_JMP, 0) + + // Push the position of the newly added JMP op onto the parentheses stack. + // This registers if for fixup when this block's close paren is encountered. + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-1) + + // Append a NOP to the compiled pattern. This is the slot reserved + // for a SAVE in the event that there is yet another '|' following + // this one. + c.appendOp(URX_NOP, 0) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-1) + + case doBeginNamedCapture: + // Scanning (? + // Compile to a + // - NOP, which later may be replaced if the parenthesized group + // has a quantifier, followed by + // - STO_SP save state stack position, so it can be restored at the ")" + // - NOP, which may later be replaced by a save-state if there + // is an '|' alternation within the parens. + c.fixLiterals(false) + c.appendOp(URX_NOP, 0) + varLoc := c.allocateData(1) // Reserve a data location for saving the state stack ptr. + c.appendOp(URX_STO_SP, varLoc) + c.appendOp(URX_NOP, 0) + + // On the Parentheses stack, start a new frame and add the postions + // of the two NOPs. Depending on what follows in the pattern, the + // NOPs may be changed to SAVE_STATE or JMP ops, with a target + // address of the end of the parenthesized group. + c.parenStack = append(c.parenStack, int(c.modeFlags)) + c.parenStack = append(c.parenStack, parenAtomic) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-3) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-1) + + case doOpenLookAhead: + // Positive Look-ahead (?= stuff ) + // + // Note: Addition of transparent input regions, with the need to + // restore the original regions when failing out of a lookahead + // block, complicated this sequence. Some combined opcodes + // might make sense - or might not, lookahead aren't that common. + // + // Caution: min match length optimization knows about this + // sequence; don't change without making updates there too. 
+ // + // Compiles to + // 1 LA_START dataLoc Saves SP, Input Pos, Active input region. + // 2. STATE_SAVE 4 on failure of lookahead, goto 4 + // 3 JMP 6 continue ... + // + // 4. LA_END Look Ahead failed. Restore regions. + // 5. BACKTRACK and back track again. + // + // 6. NOP reserved for use by quantifiers on the block. + // Look-ahead can't have quantifiers, but paren stack + // compile time conventions require the slot anyhow. + // 7. NOP may be replaced if there is are '|' ops in the block. + // 8. code for parenthesized stuff. + // 9. LA_END + // + // Four data slots are reserved, for saving state on entry to the look-around + // 0: stack pointer on entry. + // 1: input position on entry. + // 2: fActiveStart, the active bounds start on entry. + // 3: fActiveLimit, the active bounds limit on entry. + c.fixLiterals(false) + dataLoc := c.allocateData(4) + c.appendOp(URX_LA_START, dataLoc) + c.appendOp(URX_STATE_SAVE, len(c.out.compiledPat)+2) + c.appendOp(URX_JMP, len(c.out.compiledPat)+3) + c.appendOp(URX_LA_END, dataLoc) + c.appendOp(URX_BACKTRACK, 0) + c.appendOp(URX_NOP, 0) + c.appendOp(URX_NOP, 0) + + // On the Parentheses stack, start a new frame and add the postions + // of the NOPs. + c.parenStack = append(c.parenStack, int(c.modeFlags)) + c.parenStack = append(c.parenStack, parenLookahead) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-2) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-1) + + case doOpenLookAheadNeg: + // Negated Lookahead. (?! stuff ) + // Compiles to + // 1. LA_START dataloc + // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state, + // // which continues with the match. + // 3. NOP // Std. Open Paren sequence, for possible '|' + // 4. code for parenthesized stuff. + // 5. LA_END // Cut back stack, remove saved state from step 2. + // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails. + // 7. 
END_LA // Restore match region, in case look-ahead was using + // an alternate (transparent) region. + // Four data slots are reserved, for saving state on entry to the look-around + // 0: stack pointer on entry. + // 1: input position on entry. + // 2: fActiveStart, the active bounds start on entry. + // 3: fActiveLimit, the active bounds limit on entry. + c.fixLiterals(false) + dataLoc := c.allocateData(4) + c.appendOp(URX_LA_START, dataLoc) + c.appendOp(URX_STATE_SAVE, 0) // dest address will be patched later. + c.appendOp(URX_NOP, 0) + + // On the Parentheses stack, start a new frame and add the postions + // of the StateSave and NOP. + c.parenStack = append(c.parenStack, int(c.modeFlags)) + c.parenStack = append(c.parenStack, parenNegLookahead) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-2) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-1) + + // Instructions #5 - #7 will be added when the ')' is encountered. + + case doOpenLookBehind: + // Compile a (?<= look-behind open paren. + // + // Compiles to + // 0 URX_LB_START dataLoc + // 1 URX_LB_CONT dataLoc + // 2 MinMatchLen + // 3 MaxMatchLen + // 4 URX_NOP Standard '(' boilerplate. + // 5 URX_NOP Reserved slot for use with '|' ops within (block). + // 6 + // 7 URX_LB_END dataLoc # Check match len, restore input len + // 8 URX_LA_END dataLoc # Restore stack, input pos + // + // Allocate a block of matcher data, to contain (when running a match) + // 0: Stack ptr on entry + // 1: Input Index on entry + // 2: fActiveStart, the active bounds start on entry. + // 3: fActiveLimit, the active bounds limit on entry. + // 4: Start index of match current match attempt. + // The first four items must match the layout of data for LA_START / LA_END + + // Generate match code for any pending literals. 
+ c.fixLiterals(false) + + // Allocate data space + dataLoc := c.allocateData(5) + + // Emit URX_LB_START + c.appendOp(URX_LB_START, dataLoc) + + // Emit URX_LB_CONT + c.appendOp(URX_LB_CONT, dataLoc) + c.appendOp(URX_RESERVED_OP, 0) // MinMatchLength. To be filled later. + c.appendOp(URX_RESERVED_OP, 0) // MaxMatchLength. To be filled later. + + // Emit the NOPs + c.appendOp(URX_NOP, 0) + c.appendOp(URX_NOP, 0) + + // On the Parentheses stack, start a new frame and add the postions + // of the URX_LB_CONT and the NOP. + c.parenStack = append(c.parenStack, int(c.modeFlags)) + c.parenStack = append(c.parenStack, parenLookBehind) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-2) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-1) + + // The final two instructions will be added when the ')' is encountered. + + case doOpenLookBehindNeg: + // Compile a (? + // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL + // 9 ... + // + // Allocate a block of matcher data, to contain (when running a match) + // 0: Stack ptr on entry + // 1: Input Index on entry + // 2: fActiveStart, the active bounds start on entry. + // 3: fActiveLimit, the active bounds limit on entry. + // 4: Start index of match current match attempt. + // The first four items must match the layout of data for LA_START / LA_END + + // Generate match code for any pending literals. + c.fixLiterals(false) + + // Allocate data space + dataLoc := c.allocateData(5) + + // Emit URX_LB_START + c.appendOp(URX_LB_START, dataLoc) + + // Emit URX_LBN_CONT + c.appendOp(URX_LBN_CONT, dataLoc) + c.appendOp(URX_RESERVED_OP, 0) // MinMatchLength. To be filled later. + c.appendOp(URX_RESERVED_OP, 0) // MaxMatchLength. To be filled later. + c.appendOp(URX_RESERVED_OP, 0) // Continue Loc. To be filled later. + + // Emit the NOPs + c.appendOp(URX_NOP, 0) + c.appendOp(URX_NOP, 0) + + // On the Parentheses stack, start a new frame and add the postions + // of the URX_LB_CONT and the NOP. 
+ c.parenStack = append(c.parenStack, int(c.modeFlags)) + c.parenStack = append(c.parenStack, parenLookBehindN) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-2) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-1) + + // The final two instructions will be added when the ')' is encountered. + + case doConditionalExpr, doPerlInline: + // Conditionals such as (?(1)a:b) + // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them. + c.error(uerror.U_REGEX_UNIMPLEMENTED) + + case doCloseParen: + c.handleCloseParen() + if len(c.parenStack) == 0 { + // Extra close paren, or missing open paren. + c.error(uerror.U_REGEX_MISMATCHED_PAREN) + } + + case doNOP: + + case doBadOpenParenType, doRuleError: + c.error(uerror.U_REGEX_RULE_SYNTAX) + + case doMismatchedParenErr: + c.error(uerror.U_REGEX_MISMATCHED_PAREN) + + case doPlus: + // Normal '+' compiles to + // 1. stuff to be repeated (already built) + // 2. jmp-sav 1 + // 3. ... + // + // Or, if the item to be repeated can match a zero length string, + // 1. STO_INP_LOC data-loc + // 2. body of stuff to be repeated + // 3. JMP_SAV_X 2 + // 4. ... + + // + // Or, if the item to be repeated is simple + // 1. Item to be repeated. + // 2. LOOP_SR_I set number (assuming repeated item is a set ref) + // 3. LOOP_C stack location + topLoc := c.blockTopLoc(false) // location of item #1 + + // Check for simple constructs, which may get special optimized code. + if topLoc == len(c.out.compiledPat)-1 { + repeatedOp := c.out.compiledPat[topLoc] + + if repeatedOp.Type() == URX_SETREF { + // Emit optimized code for [char set]+ + c.appendOp(URX_LOOP_SR_I, repeatedOp.Value()) + frameLoc := c.allocateStackData(1) + c.appendOp(URX_LOOP_C, frameLoc) + break + } + + if repeatedOp.Type() == URX_DOTANY || repeatedOp.Type() == URX_DOTANY_ALL || repeatedOp.Type() == URX_DOTANY_UNIX { + // Emit Optimized code for .+ operations. 
+ loopOpI := c.buildOp(URX_LOOP_DOT_I, 0) + if repeatedOp.Type() == URX_DOTANY_ALL { + // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode. + loopOpI |= 1 + } + if c.modeFlags&UREGEX_UNIX_LINES != 0 { + loopOpI |= 2 + } + c.appendIns(loopOpI) + frameLoc := c.allocateStackData(1) + c.appendOp(URX_LOOP_C, frameLoc) + break + } + } + + // General case. + + // Check for minimum match length of zero, which requires + // extra loop-breaking code. + if c.minMatchLength(topLoc, len(c.out.compiledPat)-1) == 0 { + // Zero length match is possible. + // Emit the code sequence that can handle it. + c.insertOp(topLoc) + frameLoc := c.allocateStackData(1) + op := c.buildOp(URX_STO_INP_LOC, frameLoc) + c.out.compiledPat[topLoc] = op + + c.appendOp(URX_JMP_SAV_X, topLoc+1) + } else { + // Simpler code when the repeated body must match something non-empty + c.appendOp(URX_JMP_SAV, topLoc) + } + + case doNGPlus: + // Non-greedy '+?' compiles to + // 1. stuff to be repeated (already built) + // 2. state-save 1 + // 3. ... + topLoc := c.blockTopLoc(false) + c.appendOp(URX_STATE_SAVE, topLoc) + + case doOpt: + // Normal (greedy) ? quantifier. + // Compiles to + // 1. state save 3 + // 2. body of optional block + // 3. ... + // Insert the state save into the compiled pattern, and we're done. + saveStateLoc := c.blockTopLoc(true) + saveStateOp := c.buildOp(URX_STATE_SAVE, len(c.out.compiledPat)) + c.out.compiledPat[saveStateLoc] = saveStateOp + + case doNGOpt: + // Non-greedy ?? quantifier + // compiles to + // 1. jmp 4 + // 2. body of optional block + // 3 jmp 5 + // 4. state save 2 + // 5 ... + // This code is less than ideal, with two jmps instead of one, because we can only + // insert one instruction at the top of the block being iterated. 
+ jmp1Loc := c.blockTopLoc(true) + jmp2Loc := len(c.out.compiledPat) + + jmp1Op := c.buildOp(URX_JMP, jmp2Loc+1) + c.out.compiledPat[jmp1Loc] = jmp1Op + + c.appendOp(URX_JMP, jmp2Loc+2) + c.appendOp(URX_STATE_SAVE, jmp1Loc+1) + + case doStar: + // Normal (greedy) * quantifier. + // Compiles to + // 1. STATE_SAVE 4 + // 2. body of stuff being iterated over + // 3. JMP_SAV 2 + // 4. ... + // + // Or, if the body is a simple [Set], + // 1. LOOP_SR_I set number + // 2. LOOP_C stack location + // ... + // + // Or if this is a .* + // 1. LOOP_DOT_I (. matches all mode flag) + // 2. LOOP_C stack location + // + // Or, if the body can match a zero-length string, to inhibit infinite loops, + // 1. STATE_SAVE 5 + // 2. STO_INP_LOC data-loc + // 3. body of stuff + // 4. JMP_SAV_X 2 + // 5. ... + // location of item #1, the STATE_SAVE + topLoc := c.blockTopLoc(false) + + // Check for simple *, where the construct being repeated + // compiled to single opcode, and might be optimizable. + if topLoc == len(c.out.compiledPat)-1 { + repeatedOp := c.out.compiledPat[topLoc] + + if repeatedOp.Type() == URX_SETREF { + // Emit optimized code for a [char set]* + loopOpI := c.buildOp(URX_LOOP_SR_I, repeatedOp.Value()) + c.out.compiledPat[topLoc] = loopOpI + dataLoc := c.allocateStackData(1) + c.appendOp(URX_LOOP_C, dataLoc) + break + } + + if repeatedOp.Type() == URX_DOTANY || repeatedOp.Type() == URX_DOTANY_ALL || repeatedOp.Type() == URX_DOTANY_UNIX { + // Emit Optimized code for .* operations. + loopOpI := c.buildOp(URX_LOOP_DOT_I, 0) + if repeatedOp.Type() == URX_DOTANY_ALL { + // URX_LOOP_DOT_I operand is a flag indicating . matches any mode. + loopOpI |= 1 + } + if (c.modeFlags & UREGEX_UNIX_LINES) != 0 { + loopOpI |= 2 + } + c.out.compiledPat[topLoc] = loopOpI + dataLoc := c.allocateStackData(1) + c.appendOp(URX_LOOP_C, dataLoc) + break + } + } + + // Emit general case code for this * + // The optimizations did not apply. 
+ + saveStateLoc := c.blockTopLoc(true) + jmpOp := c.buildOp(URX_JMP_SAV, saveStateLoc+1) + + // Check for minimum match length of zero, which requires + // extra loop-breaking code. + if c.minMatchLength(saveStateLoc, len(c.out.compiledPat)-1) == 0 { + c.insertOp(saveStateLoc) + dataLoc := c.allocateStackData(1) + + op := c.buildOp(URX_STO_INP_LOC, dataLoc) + c.out.compiledPat[saveStateLoc+1] = op + jmpOp = c.buildOp(URX_JMP_SAV_X, saveStateLoc+2) + } + + // Locate the position in the compiled pattern where the match will continue + // after completing the *. (4 or 5 in the comment above) + continueLoc := len(c.out.compiledPat) + 1 + + // Put together the save state op and store it into the compiled code. + saveStateOp := c.buildOp(URX_STATE_SAVE, continueLoc) + c.out.compiledPat[saveStateLoc] = saveStateOp + + // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern. + c.appendIns(jmpOp) + + case doNGStar: + // Non-greedy *? quantifier + // compiles to + // 1. JMP 3 + // 2. body of stuff being iterated over + // 3. STATE_SAVE 2 + // 4 ... + jmpLoc := c.blockTopLoc(true) // loc 1. + saveLoc := len(c.out.compiledPat) // loc 3. + jmpOp := c.buildOp(URX_JMP, saveLoc) + c.out.compiledPat[jmpLoc] = jmpOp + c.appendOp(URX_STATE_SAVE, jmpLoc+1) + + case doIntervalInit: + // The '{' opening an interval quantifier was just scanned. + // Init the counter varaiables that will accumulate the values as the digits + // are scanned. 
+ c.intervalLow = 0 + c.intervalUpper = -1 + + case doIntevalLowerDigit: + // Scanned a digit from the lower value of an {lower,upper} interval + digitValue := u_charDigitValue(c.c.char) + val := int64(c.intervalLow)*10 + digitValue + if val > math.MaxInt32 { + c.error(uerror.U_REGEX_NUMBER_TOO_BIG) + } else { + c.intervalLow = int(val) + } + + case doIntervalUpperDigit: + // Scanned a digit from the upper value of an {lower,upper} interval + if c.intervalUpper < 0 { + c.intervalUpper = 0 + } + digitValue := u_charDigitValue(c.c.char) + val := int64(c.intervalUpper)*10 + digitValue + if val > math.MaxInt32 { + c.error(uerror.U_REGEX_NUMBER_TOO_BIG) + } else { + c.intervalUpper = int(val) + } + + case doIntervalSame: + // Scanned a single value interval like {27}. Upper = Lower. + c.intervalUpper = c.intervalLow + + case doInterval: + // Finished scanning a normal {lower,upper} interval. Generate the code for it. + if !c.compileInlineInterval() { + c.compileInterval(URX_CTR_INIT, URX_CTR_LOOP) + } + + case doPossessiveInterval: + // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it. + + // Remember the loc for the top of the block being looped over. + // (Can not reserve a slot in the compiled pattern at this time, because + // compileInterval needs to reserve also, and blockTopLoc can only reserve + // once per block.) + topLoc := c.blockTopLoc(false) + + // Produce normal looping code. + c.compileInterval(URX_CTR_INIT, URX_CTR_LOOP) + + // Surround the just-emitted normal looping code with a STO_SP ... LD_SP + // just as if the loop was inclosed in atomic parentheses. 
+ + // First the STO_SP before the start of the loop + c.insertOp(topLoc) + + varLoc := c.allocateData(1) // Reserve a data location for saving the + op := c.buildOp(URX_STO_SP, varLoc) + c.out.compiledPat[topLoc] = op + + var loopOp Instruction + loopOp, c.out.compiledPat = stackPop(c.out.compiledPat) + if loopOp.Type() != URX_CTR_LOOP || loopOp.Value() != topLoc { + panic("bad instruction at the end of compiled pattern") + } + + loopOp++ // point LoopOp after the just-inserted STO_SP + c.appendIns(loopOp) + + // Then the LD_SP after the end of the loop + c.appendOp(URX_LD_SP, varLoc) + + case doNGInterval: + // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it. + c.compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG) + + case doIntervalError: + c.error(uerror.U_REGEX_BAD_INTERVAL) + + case doLiteralChar: + // We've just scanned a "normal" character from the pattern, + c.literalChar(c.c.char) + + case doEscapedLiteralChar: + // We've just scanned an backslashed escaped character with no + // special meaning. It represents itself. + if (c.modeFlags&UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && ((c.c.char >= 0x41 && c.c.char <= 0x5A) || /* in [A-Z] */ (c.c.char >= 0x61 && c.c.char <= 0x7a)) { // in [a-z] + c.error(uerror.U_REGEX_BAD_ESCAPE_SEQUENCE) + } + c.literalChar(c.c.char) + + case doDotAny: + // scanned a ".", match any single character. 
+ c.fixLiterals(false) + if (c.modeFlags & UREGEX_DOTALL) != 0 { + c.appendOp(URX_DOTANY_ALL, 0) + } else if (c.modeFlags & UREGEX_UNIX_LINES) != 0 { + c.appendOp(URX_DOTANY_UNIX, 0) + } else { + c.appendOp(URX_DOTANY, 0) + } + + case doCaret: + c.fixLiterals(false) + if (c.modeFlags&UREGEX_MULTILINE) == 0 && (c.modeFlags&UREGEX_UNIX_LINES) == 0 { + c.appendOp(URX_CARET, 0) + } else if (c.modeFlags&UREGEX_MULTILINE) != 0 && (c.modeFlags&UREGEX_UNIX_LINES) == 0 { + c.appendOp(URX_CARET_M, 0) + } else if (c.modeFlags&UREGEX_MULTILINE) == 0 && (c.modeFlags&UREGEX_UNIX_LINES) != 0 { + c.appendOp(URX_CARET, 0) // Only testing true start of input. + } else if (c.modeFlags&UREGEX_MULTILINE) != 0 && (c.modeFlags&UREGEX_UNIX_LINES) != 0 { + c.appendOp(URX_CARET_M_UNIX, 0) + } + + case doDollar: + c.fixLiterals(false) + if (c.modeFlags&UREGEX_MULTILINE) == 0 && (c.modeFlags&UREGEX_UNIX_LINES) == 0 { + c.appendOp(URX_DOLLAR, 0) + } else if (c.modeFlags&UREGEX_MULTILINE) != 0 && (c.modeFlags&UREGEX_UNIX_LINES) == 0 { + c.appendOp(URX_DOLLAR_M, 0) + } else if (c.modeFlags&UREGEX_MULTILINE) == 0 && (c.modeFlags&UREGEX_UNIX_LINES) != 0 { + c.appendOp(URX_DOLLAR_D, 0) + } else if (c.modeFlags&UREGEX_MULTILINE) != 0 && (c.modeFlags&UREGEX_UNIX_LINES) != 0 { + c.appendOp(URX_DOLLAR_MD, 0) + } + + case doBackslashA: + c.fixLiterals(false) + c.appendOp(URX_CARET, 0) + + case doBackslashB: + if !BreakIteration { + if (c.modeFlags & UREGEX_UWORD) != 0 { + c.error(uerror.U_REGEX_UNSUPPORTED_ERROR) + } + } + c.fixLiterals(false) + if c.modeFlags&UREGEX_UWORD != 0 { + c.appendOp(URX_BACKSLASH_BU, 1) + } else { + c.appendOp(URX_BACKSLASH_B, 1) + } + + case doBackslashb: + if !BreakIteration { + if (c.modeFlags & UREGEX_UWORD) != 0 { + c.error(uerror.U_REGEX_UNSUPPORTED_ERROR) + } + } + c.fixLiterals(false) + if c.modeFlags&UREGEX_UWORD != 0 { + c.appendOp(URX_BACKSLASH_BU, 0) + } else { + c.appendOp(URX_BACKSLASH_B, 0) + } + + case doBackslashD: + c.fixLiterals(false) + 
c.appendOp(URX_BACKSLASH_D, 1) + + case doBackslashd: + c.fixLiterals(false) + c.appendOp(URX_BACKSLASH_D, 0) + + case doBackslashG: + c.fixLiterals(false) + c.appendOp(URX_BACKSLASH_G, 0) + + case doBackslashH: + c.fixLiterals(false) + c.appendOp(URX_BACKSLASH_H, 1) + + case doBackslashh: + c.fixLiterals(false) + c.appendOp(URX_BACKSLASH_H, 0) + + case doBackslashR: + c.fixLiterals(false) + c.appendOp(URX_BACKSLASH_R, 0) + + case doBackslashS: + c.fixLiterals(false) + c.appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET) + + case doBackslashs: + c.fixLiterals(false) + c.appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET) + + case doBackslashV: + c.fixLiterals(false) + c.appendOp(URX_BACKSLASH_V, 1) + + case doBackslashv: + c.fixLiterals(false) + c.appendOp(URX_BACKSLASH_V, 0) + + case doBackslashW: + c.fixLiterals(false) + c.appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET) + + case doBackslashw: + c.fixLiterals(false) + c.appendOp(URX_STATIC_SETREF, URX_ISWORD_SET) + + case doBackslashX: + if !BreakIteration { + // Grapheme Cluster Boundary requires ICU break iteration. + c.error(uerror.U_REGEX_UNSUPPORTED_ERROR) + } + c.fixLiterals(false) + c.appendOp(URX_BACKSLASH_X, 0) + + case doBackslashZ: + c.fixLiterals(false) + c.appendOp(URX_DOLLAR, 0) + + case doBackslashz: + c.fixLiterals(false) + c.appendOp(URX_BACKSLASH_Z, 0) + + case doEscapeError: + c.error(uerror.U_REGEX_BAD_ESCAPE_SEQUENCE) + + case doExit: + c.fixLiterals(false) + return false + + case doProperty: + c.fixLiterals(false) + theSet := c.scanProp() + c.compileSet(theSet) + + case doNamedChar: + ch := c.scanNamedChar() + c.literalChar(ch) + + case doBackRef: + // BackReference. Somewhat unusual in that the front-end can not completely parse + // the regular expression, because the number of digits to be consumed + // depends on the number of capture groups that have been defined. So + // we have to do it here instead. 
+ numCaptureGroups := len(c.out.groupMap) + groupNum := int64(0) + ch := c.c.char + + for { + // Loop once per digit, for max allowed number of digits in a back reference. + digit := u_charDigitValue(ch) + groupNum = groupNum*10 + digit + if groupNum >= int64(numCaptureGroups) { + break + } + ch = c.peekCharLL() + if !staticRuleSet[kRuleSetDigitChar-128].ContainsRune(ch) { + break + } + c.nextCharLL() + } + + // Scan of the back reference in the source regexp is complete. Now generate + // the compiled code for it. + // Because capture groups can be forward-referenced by back-references, + // we fill the operand with the capture group number. At the end + // of compilation, it will be changed to the variable's location. + if groupNum == 0 { + panic("\\0 begins an octal escape sequence, and shouldn't enter this code path at all") + } + c.fixLiterals(false) + if (c.modeFlags & UREGEX_CASE_INSENSITIVE) != 0 { + c.appendOp(URX_BACKREF_I, int(groupNum)) + } else { + c.appendOp(URX_BACKREF, int(groupNum)) + } + + case doBeginNamedBackRef: + if c.captureName != nil { + panic("should not replace capture name") + } + c.captureName = &strings.Builder{} + + case doContinueNamedBackRef: + c.captureName.WriteRune(c.c.char) + + case doCompleteNamedBackRef: + { + groupNumber := c.out.namedCaptureMap[c.captureName.String()] + if groupNumber == 0 { + // Group name has not been defined. + // Could be a forward reference. If we choose to support them at some + // future time, extra mechanism will be required at this point. + c.error(uerror.U_REGEX_INVALID_CAPTURE_GROUP_NAME) + } else { + // Given the number, handle identically to a \n numbered back reference. + // See comments above, under doBackRef + c.fixLiterals(false) + if (c.modeFlags & UREGEX_CASE_INSENSITIVE) != 0 { + c.appendOp(URX_BACKREF_I, groupNumber) + } else { + c.appendOp(URX_BACKREF, groupNumber) + } + } + c.captureName = nil + } + + case doPossessivePlus: + // Possessive ++ quantifier. + // Compiles to + // 1. 
STO_SP + // 2. body of stuff being iterated over + // 3. STATE_SAVE 5 + // 4. JMP 2 + // 5. LD_SP + // 6. ... + // + // Note: TODO: This is pretty inefficient. A mass of saved state is built up + // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056 + // + // Emit the STO_SP + topLoc := c.blockTopLoc(true) + stoLoc := c.allocateData(1) // Reserve the data location for storing save stack ptr. + op := c.buildOp(URX_STO_SP, stoLoc) + c.out.compiledPat[topLoc] = op + + // Emit the STATE_SAVE + c.appendOp(URX_STATE_SAVE, len(c.out.compiledPat)+2) + + // Emit the JMP + c.appendOp(URX_JMP, topLoc+1) + + // Emit the LD_SP + c.appendOp(URX_LD_SP, stoLoc) + + case doPossessiveStar: + // Possessive *+ quantifier. + // Compiles to + // 1. STO_SP loc + // 2. STATE_SAVE 5 + // 3. body of stuff being iterated over + // 4. JMP 2 + // 5. LD_SP loc + // 6 ... + // TODO: do something to cut back the state stack each time through the loop. + // Reserve two slots at the top of the block. + topLoc := c.blockTopLoc(true) + c.insertOp(topLoc) + + // emit STO_SP loc + stoLoc := c.allocateData(1) // Reserve the data location for storing save stack ptr. + op := c.buildOp(URX_STO_SP, stoLoc) + c.out.compiledPat[topLoc] = op + + // Emit the SAVE_STATE 5 + L7 := len(c.out.compiledPat) + 1 + op = c.buildOp(URX_STATE_SAVE, L7) + c.out.compiledPat[topLoc+1] = op + + // Append the JMP operation. + c.appendOp(URX_JMP, topLoc+1) + + // Emit the LD_SP loc + c.appendOp(URX_LD_SP, stoLoc) + + case doPossessiveOpt: + // Possessive ?+ quantifier. + // Compiles to + // 1. STO_SP loc + // 2. SAVE_STATE 5 + // 3. body of optional block + // 4. LD_SP loc + // 5. ... + // + // Reserve two slots at the top of the block. + topLoc := c.blockTopLoc(true) + c.insertOp(topLoc) + + // Emit the STO_SP + stoLoc := c.allocateData(1) // Reserve the data location for storing save stack ptr. 
+ op := c.buildOp(URX_STO_SP, stoLoc) + c.out.compiledPat[topLoc] = op + + // Emit the SAVE_STATE + continueLoc := len(c.out.compiledPat) + 1 + op = c.buildOp(URX_STATE_SAVE, continueLoc) + c.out.compiledPat[topLoc+1] = op + + // Emit the LD_SP + c.appendOp(URX_LD_SP, stoLoc) + + case doBeginMatchMode: + c.newModeFlags = c.modeFlags + c.setModeFlag = true + case doMatchMode: // (?i) and similar + var bit RegexpFlag + switch c.c.char { + case 0x69: /* 'i' */ + bit = UREGEX_CASE_INSENSITIVE + case 0x64: /* 'd' */ + bit = UREGEX_UNIX_LINES + case 0x6d: /* 'm' */ + bit = UREGEX_MULTILINE + case 0x73: /* 's' */ + bit = UREGEX_DOTALL + case 0x75: /* 'u' */ + bit = 0 /* Unicode casing */ + case 0x77: /* 'w' */ + bit = UREGEX_UWORD + case 0x78: /* 'x' */ + bit = UREGEX_COMMENTS + case 0x2d: /* '-' */ + c.setModeFlag = false + default: + // Should never happen. Other chars are filtered out by the scanner. + panic("unreachable") + } + if c.setModeFlag { + c.newModeFlags |= bit + } else { + c.newModeFlags &= ^bit + } + + case doSetMatchMode: + // Emit code to match any pending literals, using the not-yet changed match mode. + c.fixLiterals(false) + + // We've got a (?i) or similar. The match mode is being changed, but + // the change is not scoped to a parenthesized block. + if c.newModeFlags >= 0 { + panic("cNewModeFlags not properly initialized") + } + c.modeFlags = c.newModeFlags + + case doMatchModeParen: + // We've got a (?i: or similar. Begin a parenthesized block, save old + // mode flags so they can be restored at the close of the block. + // + // Compile to a + // - NOP, which later may be replaced by a save-state if the + // parenthesized group gets a * quantifier, followed by + // - NOP, which may later be replaced by a save-state if there + // is an '|' alternation within the parens. 
+ c.fixLiterals(false) + c.appendOp(URX_NOP, 0) + c.appendOp(URX_NOP, 0) + + // On the Parentheses stack, start a new frame and add the postions + // of the two NOPs (a normal non-capturing () frame, except for the + // saving of the orignal mode flags.) + c.parenStack = append(c.parenStack, int(c.modeFlags)) + c.parenStack = append(c.parenStack, parenFlags) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-2) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-1) + + // Set the current mode flags to the new values. + if c.newModeFlags >= 0 { + panic("cNewModeFlags not properly initialized") + } + c.modeFlags = c.newModeFlags + + case doBadModeFlag: + c.error(uerror.U_REGEX_INVALID_FLAG) + + case doSuppressComments: + // We have just scanned a '(?'. We now need to prevent the character scanner from + // treating a '#' as a to-the-end-of-line comment. + // (This Perl compatibility just gets uglier and uglier to do...) + c.eolComments = false + + case doSetAddAmp: + set := c.setStack[len(c.setStack)-1] + set.AddRune(chAmp) + + case doSetAddDash: + set := c.setStack[len(c.setStack)-1] + set.AddRune(chDash) + + case doSetBackslash_s: + set := c.setStack[len(c.setStack)-1] + set.AddAll(staticPropertySets[URX_ISSPACE_SET]) + + case doSetBackslash_S: + sset := uset.New() + sset.AddAll(staticPropertySets[URX_ISSPACE_SET]) // TODO: add latin1 spaces + sset.Complement() + + set := c.setStack[len(c.setStack)-1] + set.AddAll(sset) + + case doSetBackslash_d: + set := c.setStack[len(c.setStack)-1] + set.AddCategory(uchar.U_GC_ND_MASK) + + case doSetBackslash_D: + digits := uset.New() + digits.ApplyIntPropertyValue(uprops.UCHAR_GENERAL_CATEGORY_MASK, int32(uchar.U_GC_ND_MASK)) + digits.Complement() + set := c.setStack[len(c.setStack)-1] + set.AddAll(digits) + + case doSetBackslash_h: + h := uset.New() + h.ApplyIntPropertyValue(uprops.UCHAR_GENERAL_CATEGORY_MASK, int32(uchar.U_GC_ZS_MASK)) + h.AddRune(9) // Tab + + set := c.setStack[len(c.setStack)-1] + 
set.AddAll(h) + + case doSetBackslash_H: + h := uset.New() + h.ApplyIntPropertyValue(uprops.UCHAR_GENERAL_CATEGORY_MASK, int32(uchar.U_GC_ZS_MASK)) + h.AddRune(9) // Tab + h.Complement() + + set := c.setStack[len(c.setStack)-1] + set.AddAll(h) + + case doSetBackslash_v: + set := c.setStack[len(c.setStack)-1] + set.AddRuneRange(0x0a, 0x0d) // add range + set.AddRune(0x85) + set.AddRuneRange(0x2028, 0x2029) + + case doSetBackslash_V: + v := uset.New() + v.AddRuneRange(0x0a, 0x0d) // add range + v.AddRune(0x85) + v.AddRuneRange(0x2028, 0x2029) + v.Complement() + + set := c.setStack[len(c.setStack)-1] + set.AddAll(v) + + case doSetBackslash_w: + set := c.setStack[len(c.setStack)-1] + set.AddAll(staticPropertySets[URX_ISWORD_SET]) + + case doSetBackslash_W: + sset := uset.New() + sset.AddAll(staticPropertySets[URX_ISWORD_SET]) + sset.Complement() + + set := c.setStack[len(c.setStack)-1] + set.AddAll(sset) + + case doSetBegin: + c.fixLiterals(false) + c.setStack = append(c.setStack, uset.New()) + c.setOpStack = append(c.setOpStack, setStart) + if (c.modeFlags & UREGEX_CASE_INSENSITIVE) != 0 { + c.setOpStack = append(c.setOpStack, setCaseClose) + } + + case doSetBeginDifference1: + // We have scanned something like [[abc]-[ + // Set up a new UnicodeSet for the set beginning with the just-scanned '[' + // Push a Difference operator, which will cause the new set to be subtracted from what + // went before once it is created. + c.setPushOp(setDifference1) + c.setOpStack = append(c.setOpStack, setStart) + if (c.modeFlags & UREGEX_CASE_INSENSITIVE) != 0 { + c.setOpStack = append(c.setOpStack, setCaseClose) + } + + case doSetBeginIntersection1: + // We have scanned something like [[abc]&[ + // Need both the '&' operator and the open '[' operator. 
+ c.setPushOp(setIntersection1) + c.setOpStack = append(c.setOpStack, setStart) + if (c.modeFlags & UREGEX_CASE_INSENSITIVE) != 0 { + c.setOpStack = append(c.setOpStack, setCaseClose) + } + + case doSetBeginUnion: + // We have scanned something like [[abc][ + // Need to handle the union operation explicitly [[abc] | [ + c.setPushOp(setUnion) + c.setOpStack = append(c.setOpStack, setStart) + if (c.modeFlags & UREGEX_CASE_INSENSITIVE) != 0 { + c.setOpStack = append(c.setOpStack, setCaseClose) + } + + case doSetDifference2: + // We have scanned something like [abc-- + // Consider this to unambiguously be a set difference operator. + c.setPushOp(setDifference2) + + case doSetEnd: + // Have encountered the ']' that closes a set. + // Force the evaluation of any pending operations within this set, + // leave the completed set on the top of the set stack. + c.setEval(setEnd) + var start setOperation + start, c.setOpStack = stackPop(c.setOpStack) + if start != setStart { + panic("bad set operation in stack") + } + + case doSetFinish: + // Finished a complete set expression, including all nested sets. + // The close bracket has already triggered clearing out pending set operators, + // the operator stack should be empty and the operand stack should have just + // one entry, the result set. + if len(c.setOpStack) > 0 { + panic("expected setOpStack to be empty") + } + var set *uset.UnicodeSet + set, c.setStack = stackPop(c.setStack) + c.compileSet(set) + + case doSetIntersection2: + // Have scanned something like [abc&& + c.setPushOp(setIntersection2) + + case doSetLiteral: + // Union the just-scanned literal character into the set being built. + // This operation is the highest precedence set operation, so we can always do + // it immediately, without waiting to see what follows. 
It is necessary to perform + // any pending '-' or '&' operation first, because these have the same precedence + // as union-ing in a literal' + c.setEval(setUnion) + set := c.setStack[len(c.setStack)-1] + set.AddRune(c.c.char) + c.lastSetLiteral = c.c.char + + case doSetLiteralEscaped: + // A back-slash escaped literal character was encountered. + // Processing is the same as with setLiteral, above, with the addition of + // the optional check for errors on escaped ASCII letters. + if (c.modeFlags&UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && + ((c.c.char >= 0x41 && c.c.char <= 0x5A) || // in [A-Z] + (c.c.char >= 0x61 && c.c.char <= 0x7a)) { // in [a-z] + c.error(uerror.U_REGEX_BAD_ESCAPE_SEQUENCE) + } + c.setEval(setUnion) + set := c.setStack[len(c.setStack)-1] + set.AddRune(c.c.char) + c.lastSetLiteral = c.c.char + + case doSetNamedChar: + // Scanning a \N{UNICODE CHARACTER NAME} + // Aside from the source of the character, the processing is identical to doSetLiteral, + // above. + ch := c.scanNamedChar() + c.setEval(setUnion) + set := c.setStack[len(c.setStack)-1] + set.AddRune(ch) + c.lastSetLiteral = ch + + case doSetNamedRange: + // We have scanned literal-\N{CHAR NAME}. Add the range to the set. + // The left character is already in the set, and is saved in fLastSetLiteral. + // The right side needs to be picked up, the scan is at the 'N'. + // Lower Limit > Upper limit being an error matches both Java + // and ICU UnicodeSet behavior. + ch := c.scanNamedChar() + if c.err == nil && (c.lastSetLiteral == -1 || c.lastSetLiteral > ch) { + c.error(uerror.U_REGEX_INVALID_RANGE) + } + set := c.setStack[len(c.setStack)-1] + set.AddRuneRange(c.lastSetLiteral, ch) + c.lastSetLiteral = ch + + case doSetNegate: + // Scanned a '^' at the start of a set. + // Push the negation operator onto the set op stack. + // A twist for case-insensitive matching: + // the case closure operation must happen _before_ negation. 
+ // But the case closure operation will already be on the stack if it's required. + // This requires checking for case closure, and swapping the stack order + // if it is present. + tosOp := c.setOpStack[len(c.setOpStack)-1] + if tosOp == setCaseClose { + _, c.setOpStack = stackPop(c.setOpStack) + c.setOpStack = append(c.setOpStack, setNegation) + c.setOpStack = append(c.setOpStack, setCaseClose) + } else { + c.setOpStack = append(c.setOpStack, setNegation) + } + + case doSetNoCloseError: + c.error(uerror.U_REGEX_MISSING_CLOSE_BRACKET) + + case doSetOpError: + c.error(uerror.U_REGEX_RULE_SYNTAX) // -- or && at the end of a set. Illegal. + + case doSetPosixProp: + if set := c.scanPosixProp(); set != nil { + c.setStack[len(c.setStack)-1].AddAll(set) + } + + case doSetProp: + // Scanned a \p \P within [brackets]. + if set := c.scanProp(); set != nil { + c.setStack[len(c.setStack)-1].AddAll(set) + } + + case doSetRange: + // We have scanned literal-literal. Add the range to the set. + // The left character is already in the set, and is saved in fLastSetLiteral. + // The right side is the current character. + // Lower Limit > Upper limit being an error matches both Java + // and ICU UnicodeSet behavior. 
+ + if c.lastSetLiteral == -1 || c.lastSetLiteral > c.c.char { + c.error(uerror.U_REGEX_INVALID_RANGE) + } + c.setStack[len(c.setStack)-1].AddRuneRange(c.lastSetLiteral, c.c.char) + + default: + panic("unexpected OP in parser") + } + + return c.err == nil +} + +func u_charDigitValue(char rune) int64 { + if char >= '0' && char <= '9' { + return int64(char - '0') + } + return -1 +} + +func stackPop[T any](stack []T) (T, []T) { + var out T + if len(stack) > 0 { + out = stack[len(stack)-1] + stack = stack[:len(stack)-1] + } + return out, stack +} + +func (c *Compiler) error(e uerror.URegexCompileErrorCode) { + c.err = &CompileError{ + Code: e, + Line: c.lineNum, + Offset: c.charNum, + Context: c.out.pattern, + } +} + +func (c *Compiler) stripNOPs() { + if c.err != nil { + return + } + + end := len(c.out.compiledPat) + deltas := make([]int, 0, end) + + // Make a first pass over the code, computing the amount that things + // will be offset at each location in the original code. + var loc, d int + for loc = 0; loc < end; loc++ { + deltas = append(deltas, d) + op := c.out.compiledPat[loc] + if op.Type() == URX_NOP { + d++ + } + } + + // Make a second pass over the code, removing the NOPs by moving following + // code up, and patching operands that refer to code locations that + // are being moved. The array of offsets from the first step is used + // to compute the new operand values. + var src, dst int + for src = 0; src < end; src++ { + op := c.out.compiledPat[src] + opType := op.Type() + + switch opType { + case URX_NOP: + // skip + + case URX_STATE_SAVE, + URX_JMP, + URX_CTR_LOOP, + URX_CTR_LOOP_NG, + URX_RELOC_OPRND, + URX_JMPX, + URX_JMP_SAV, + URX_JMP_SAV_X: + // These are instructions with operands that refer to code locations. 
+ operandAddress := op.Value() + // U_ASSERT(operandAddress >= 0 && operandAddress < deltas.size()); + fixedOperandAddress := operandAddress - deltas[operandAddress] + op = c.buildOp(opType, fixedOperandAddress) + c.out.compiledPat[dst] = op + dst++ + + case URX_BACKREF, URX_BACKREF_I: + where := op.Value() + if int(where) > len(c.out.groupMap) { + c.error(uerror.U_REGEX_INVALID_BACK_REF) + break + } + + where = int(c.out.groupMap[where-1]) + op = c.buildOp(opType, where) + c.out.compiledPat[dst] = op + dst++ + c.out.needsAltInput = true + + case URX_RESERVED_OP, + URX_RESERVED_OP_N, + URX_BACKTRACK, + URX_END, + URX_ONECHAR, + URX_STRING, + URX_STRING_LEN, + URX_START_CAPTURE, + URX_END_CAPTURE, + URX_STATIC_SETREF, + URX_STAT_SETREF_N, + URX_SETREF, + URX_DOTANY, + URX_FAIL, + URX_BACKSLASH_B, + URX_BACKSLASH_BU, + URX_BACKSLASH_G, + URX_BACKSLASH_X, + URX_BACKSLASH_Z, + URX_DOTANY_ALL, + URX_BACKSLASH_D, + URX_CARET, + URX_DOLLAR, + URX_CTR_INIT, + URX_CTR_INIT_NG, + URX_DOTANY_UNIX, + URX_STO_SP, + URX_LD_SP, + URX_STO_INP_LOC, + URX_LA_START, + URX_LA_END, + URX_ONECHAR_I, + URX_STRING_I, + URX_DOLLAR_M, + URX_CARET_M, + URX_CARET_M_UNIX, + URX_LB_START, + URX_LB_CONT, + URX_LB_END, + URX_LBN_CONT, + URX_LBN_END, + URX_LOOP_SR_I, + URX_LOOP_DOT_I, + URX_LOOP_C, + URX_DOLLAR_D, + URX_DOLLAR_MD, + URX_BACKSLASH_H, + URX_BACKSLASH_R, + URX_BACKSLASH_V: + // These instructions are unaltered by the relocation. + c.out.compiledPat[dst] = op + dst++ + + default: + // Some op is unaccounted for. + panic("unreachable") + } + } + + c.out.compiledPat = c.out.compiledPat[:dst] +} + +func (c *Compiler) matchStartType() { + var loc int // Location in the pattern of the current op being processed. + var currentLen int32 // Minimum length of a match to this point (loc) in the pattern + var numInitialStrings int // Number of strings encountered that could match at start. 
+ var atStart = true // True if no part of the pattern yet encountered + // could have advanced the position in a match. + // (Maximum match length so far == 0) + + // forwardedLength is a vector holding minimum-match-length values that + // are propagated forward in the pattern by JMP or STATE_SAVE operations. + // It must be one longer than the pattern being checked because some ops + // will jmp to a end-of-block+1 location from within a block, and we must + // count those when checking the block. + end := len(c.out.compiledPat) + forwardedLength := make([]int32, end+1) + + for loc = 3; loc < end; loc++ { + forwardedLength[loc] = math.MaxInt32 + } + + for loc = 3; loc < end; loc++ { + op := c.out.compiledPat[loc] + opType := op.Type() + + // The loop is advancing linearly through the pattern. + // If the op we are now at was the destination of a branch in the pattern, + // and that path has a shorter minimum length than the current accumulated value, + // replace the current accumulated value. + if forwardedLength[loc] < currentLen { + currentLen = forwardedLength[loc] + // U_ASSERT(currentLen >= 0 && currentLen < INT32_MAX); + } + + switch opType { + // Ops that don't change the total length matched + case URX_RESERVED_OP, + URX_END, + URX_FAIL, + URX_STRING_LEN, + URX_NOP, + URX_START_CAPTURE, + URX_END_CAPTURE, + URX_BACKSLASH_B, + URX_BACKSLASH_BU, + URX_BACKSLASH_G, + URX_BACKSLASH_Z, + URX_DOLLAR, + URX_DOLLAR_M, + URX_DOLLAR_D, + URX_DOLLAR_MD, + URX_RELOC_OPRND, + URX_STO_INP_LOC, + URX_BACKREF, // BackRef. Must assume that it might be a zero length match + URX_BACKREF_I, + URX_STO_SP, // Setup for atomic or possessive blocks. Doesn't change what can match. + URX_LD_SP: + // skip + + case URX_CARET: + if atStart { + c.out.startType = START_START + } + + case URX_CARET_M, URX_CARET_M_UNIX: + if atStart { + c.out.startType = START_LINE + } + + case URX_ONECHAR: + if currentLen == 0 { + // This character could appear at the start of a match. 
+ // Add it to the set of possible starting characters. + c.out.initialChars.AddRune(op.Value32()) + numInitialStrings += 2 + } + currentLen = safeIncrement(currentLen, 1) + atStart = false + + case URX_SETREF: + if currentLen == 0 { + sn := op.Value() + // U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); + set := c.out.sets[sn] + c.out.initialChars.AddAll(set) + numInitialStrings += 2 + } + currentLen = safeIncrement(currentLen, 1) + atStart = false + + case URX_LOOP_SR_I: + // [Set]*, like a SETREF, above, in what it can match, + // but may not match at all, so currentLen is not incremented. + if currentLen == 0 { + sn := op.Value() + // U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); + set := c.out.sets[sn] + c.out.initialChars.AddAll(set) + numInitialStrings += 2 + } + atStart = false + + case URX_LOOP_DOT_I: + if currentLen == 0 { + // .* at the start of a pattern. + // Any character can begin the match. + c.out.initialChars.Clear() + c.out.initialChars.Complement() + numInitialStrings += 2 + } + atStart = false + + case URX_STATIC_SETREF: + if currentLen == 0 { + sn := op.Value() + c.out.initialChars.AddAll(staticPropertySets[sn]) + numInitialStrings += 2 + } + currentLen = safeIncrement(currentLen, 1) + atStart = false + + case URX_STAT_SETREF_N: + if currentLen == 0 { + sn := op.Value() + sc := uset.New() + sc.AddAll(staticPropertySets[sn]) + sc.Complement() + + c.out.initialChars.AddAll(sc) + numInitialStrings += 2 + } + currentLen = safeIncrement(currentLen, 1) + atStart = false + + case URX_BACKSLASH_D: + // Digit Char + if currentLen == 0 { + s := uset.New() + s.ApplyIntPropertyValue(uprops.UCHAR_GENERAL_CATEGORY_MASK, int32(uchar.U_GC_ND_MASK)) + if op.Value() != 0 { + s.Complement() + } + c.out.initialChars.AddAll(s) + numInitialStrings += 2 + } + currentLen = safeIncrement(currentLen, 1) + atStart = false + + case URX_BACKSLASH_H: + // Horiz white space + if currentLen == 0 { + s := uset.New() + 
s.ApplyIntPropertyValue(uprops.UCHAR_GENERAL_CATEGORY_MASK, int32(uchar.U_GC_ZS_MASK)) + s.AddRune(9) // Tab + if op.Value() != 0 { + s.Complement() + } + c.out.initialChars.AddAll(s) + numInitialStrings += 2 + } + currentLen = safeIncrement(currentLen, 1) + atStart = false + + case URX_BACKSLASH_R, // Any line ending sequence + URX_BACKSLASH_V: // Any line ending code point, with optional negation + if currentLen == 0 { + s := uset.New() + s.AddRuneRange(0x0a, 0x0d) // add range + s.AddRune(0x85) + s.AddRuneRange(0x2028, 0x2029) + if op.Value() != 0 { + // Complement option applies to URX_BACKSLASH_V only. + s.Complement() + } + c.out.initialChars.AddAll(s) + numInitialStrings += 2 + } + currentLen = safeIncrement(currentLen, 1) + atStart = false + + case URX_ONECHAR_I: + // Case Insensitive Single Character. + if currentLen == 0 { + ch := op.Value32() + if uprops.HasBinaryProperty(ch, uprops.UCHAR_CASE_SENSITIVE) { + starters := uset.New() + starters.AddRuneRange(ch, ch) + starters.CloseOver(uset.USET_CASE_INSENSITIVE) + // findCaseInsensitiveStarters(c, &starters); + // For ONECHAR_I, no need to worry about text chars that expand on folding into + // strings. The expanded folding can't match the pattern. + c.out.initialChars.AddAll(starters) + } else { + // Char has no case variants. Just add it as-is to the + // set of possible starting chars. + c.out.initialChars.AddRune(ch) + } + numInitialStrings += 2 + } + currentLen = safeIncrement(currentLen, 1) + atStart = false + + case URX_BACKSLASH_X, // Grahpeme Cluster. Minimum is 1, max unbounded. + URX_DOTANY_ALL, // . matches one or two. + URX_DOTANY, + URX_DOTANY_UNIX: + if currentLen == 0 { + // These constructs are all bad news when they appear at the start + // of a match. Any character can begin the match. 
+ c.out.initialChars.Clear() + c.out.initialChars.Complement() + numInitialStrings += 2 + } + currentLen = safeIncrement(currentLen, 1) + atStart = false + + case URX_JMPX: + loc++ // Except for extra operand on URX_JMPX, same as URX_JMP. + fallthrough + + case URX_JMP: + jmpDest := op.Value() + if int(jmpDest) < loc { + // Loop of some kind. Can safely ignore, the worst that will happen + // is that we understate the true minimum length + currentLen = forwardedLength[loc+1] + } else { + // Forward jump. Propagate the current min length to the target loc of the jump. + // U_ASSERT(jmpDest <= end + 1); + if forwardedLength[jmpDest] > currentLen { + forwardedLength[jmpDest] = currentLen + } + } + atStart = false + + case URX_JMP_SAV, + URX_JMP_SAV_X: + // Combo of state save to the next loc, + jmp backwards. + // Net effect on min. length computation is nothing. + atStart = false + + case URX_BACKTRACK: + // Fails are kind of like a branch, except that the min length was + // propagated already, by the state save. + currentLen = forwardedLength[loc+1] + atStart = false + + case URX_STATE_SAVE: + // State Save, for forward jumps, propagate the current minimum. + // of the state save. + jmpDest := op.Value() + if jmpDest > loc { + if currentLen < forwardedLength[jmpDest] { + forwardedLength[jmpDest] = (currentLen) + } + } + atStart = false + + case URX_STRING: + loc++ + stringLenOp := c.out.compiledPat[loc] + stringLen := stringLenOp.Value() + // U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); + // U_ASSERT(stringLenOp >= 2); + if currentLen == 0 { + // Add the starting character of this string to the set of possible starting + // characters for this pattern. + stringStartIdx := op.Value() + ch := c.out.literalText[stringStartIdx] + c.out.initialChars.AddRune(ch) + + // Remember this string. After the entire pattern has been checked, + // if nothing else is identified that can start a match, we'll use it. 
+ numInitialStrings++ + c.out.initialStringIdx = stringStartIdx + c.out.initialStringLen = stringLen + } + + currentLen = safeIncrement(currentLen, stringLen) + atStart = false + + case URX_STRING_I: + // Case-insensitive string. Unlike exact-match strings, we won't + // attempt a string search for possible match positions. But we + // do update the set of possible starting characters. + loc++ + stringLenOp := c.out.compiledPat[loc] + stringLen := stringLenOp.Value() + // U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); + // U_ASSERT(stringLenOp >= 2); + if currentLen == 0 { + // Add the starting character of this string to the set of possible starting + // characters for this pattern. + stringStartIdx := op.Value() + ch := c.out.literalText[stringStartIdx] + s := uset.New() + c.findCaseInsensitiveStarters(ch, s) + c.out.initialChars.AddAll(s) + numInitialStrings += 2 // Matching on an initial string not possible. + } + currentLen = safeIncrement(currentLen, stringLen) + atStart = false + + case URX_CTR_INIT, + URX_CTR_INIT_NG: + // Loop Init Ops. These don't change the min length, but they are 4 word ops + // so location must be updated accordingly. + // Loop Init Ops. + // If the min loop count == 0 + // move loc forwards to the end of the loop, skipping over the body. + // If the min count is > 0, + // continue normal processing of the body of the loop. + loopEndLoc := c.out.compiledPat[loc+1].Value() + minLoopCount := int(c.out.compiledPat[loc+2]) + if minLoopCount == 0 { + // Min Loop Count of 0, treat like a forward branch and + // move the current minimum length up to the target + // (end of loop) location. + // U_ASSERT(loopEndLoc <= end + 1); + if forwardedLength[loopEndLoc] > currentLen { + forwardedLength[loopEndLoc] = currentLen + } + } + loc += 3 // Skips over operands of CTR_INIT + atStart = false + + case URX_CTR_LOOP, + URX_CTR_LOOP_NG: + // Loop ops. + // The jump is conditional, backwards only. 
+ atStart = false + + case URX_LOOP_C: + // More loop ops. These state-save to themselves. + // don't change the minimum match + atStart = false + + case URX_LA_START, + URX_LB_START: + // Look-around. Scan forward until the matching look-ahead end, + // without processing the look-around block. This is overly pessimistic. + + // Keep track of the nesting depth of look-around blocks. Boilerplate code for + // lookahead contains two LA_END instructions, so count goes up by two + // for each LA_START. + var depth int + if opType == URX_LA_START { + depth = 2 + } else { + depth = 1 + } + for { + loc++ + op = c.out.compiledPat[loc] + if op.Type() == URX_LA_START { + depth += 2 + } + if op.Type() == URX_LB_START { + depth++ + } + if op.Type() == URX_LA_END || op.Type() == URX_LBN_END { + depth-- + if depth == 0 { + break + } + } + if op.Type() == URX_STATE_SAVE { + // Need this because neg lookahead blocks will FAIL to outside + // of the block. + jmpDest := op.Value() + if int(jmpDest) > loc { + if currentLen < forwardedLength[jmpDest] { + forwardedLength[jmpDest] = (currentLen) + } + } + } + // U_ASSERT(loc <= end); + } + + case URX_LA_END, + URX_LB_CONT, + URX_LB_END, + URX_LBN_CONT, + URX_LBN_END: + panic("should be consumed in URX_LA_START") + + default: + panic("unreachable") + } + } + + // Sort out what we should check for when looking for candidate match start positions. + // In order of preference, + // 1. Start of input text buffer. + // 2. A literal string. + // 3. Start of line in multi-line mode. + // 4. A single literal character. + // 5. A character from a set of characters. + // + if c.out.startType == START_START { + // Match only at the start of an input text string. + // start type is already set. We're done. + } else if numInitialStrings == 1 && c.out.minMatchLen > 0 { + // Match beginning only with a literal string. 
+ ch := c.out.literalText[c.out.initialStringIdx] + // U_ASSERT(fRXPat->fInitialChars->contains(c)); + c.out.startType = START_STRING + c.out.initialChar = ch + } else if c.out.startType == START_LINE { + // Match at start of line in Multi-Line mode. + // Nothing to do here; everything is already set. + } else if c.out.minMatchLen == 0 { + // Zero length match possible. We could start anywhere. + c.out.startType = START_NO_INFO + } else if c.out.initialChars.Len() == 1 { + // All matches begin with the same char. + c.out.startType = START_CHAR + c.out.initialChar = c.out.initialChars.RuneAt(0) + // U_ASSERT(fRXPat->fInitialChar != (UChar32)-1); + } else if !c.out.initialChars.ContainsRuneRange(0, 0x10ffff) && c.out.minMatchLen > 0 { + // Matches start with a set of character smaller than the set of all chars. + c.out.startType = START_SET + } else { + // Matches can start with anything + c.out.startType = START_NO_INFO + } +} + +func (c *Compiler) appendOp(typ Opcode, arg int) { + c.appendIns(c.buildOp(typ, arg)) +} + +func (c *Compiler) appendIns(ins Instruction) { + if c.err != nil { + return + } + c.out.compiledPat = append(c.out.compiledPat, ins) +} + +func (c *Compiler) buildOp(typ Opcode, val int) Instruction { + if c.err != nil { + return 0 + } + if val > 0x00ffffff { + panic("bad argument to buildOp") + } + if val < 0 { + if !(typ == URX_RESERVED_OP_N || typ == URX_RESERVED_OP) { + panic("bad value to buildOp") + } + typ = URX_RESERVED_OP_N + } + return Instruction(int32(typ)<<24 | int32(val)) +} + +func (c *Compiler) handleCloseParen() { + if len(c.parenStack) == 0 { + c.error(uerror.U_REGEX_MISMATCHED_PAREN) + return + } + + c.fixLiterals(false) + + var patIdx int + var patOp Instruction + + for { + patIdx, c.parenStack = stackPop(c.parenStack) + if patIdx < 0 { + break + } + + patOp = c.out.compiledPat[patIdx] + if patOp.Value() != 0 { + panic("branch target for JMP should not be set") + } + patOp |= Instruction(len(c.out.compiledPat)) + 
c.out.compiledPat[patIdx] = patOp + c.matchOpenParen = patIdx + } + + var modeFlags int + modeFlags, c.parenStack = stackPop(c.parenStack) + if modeFlags >= 0 { + panic("modeFlags in paren stack was not negated") + } + + c.modeFlags = RegexpFlag(modeFlags) + + switch patIdx { + case parenPlain, parenFlags: + // No additional fixups required. + // (Grouping-only parentheses) + case parenCapturing: + // Capturing Parentheses. + // Insert a End Capture op into the pattern. + // The frame offset of the variables for this cg is obtained from the + // start capture op and put it into the end-capture op. + + captureOp := c.out.compiledPat[c.matchOpenParen+1] + if captureOp.Type() != URX_START_CAPTURE { + panic("bad type in capture op (expected URX_START_CAPTURE)") + } + frameVarLocation := captureOp.Value() + c.appendOp(URX_END_CAPTURE, frameVarLocation) + + case parenAtomic: + // Atomic Parenthesis. + // Insert a LD_SP operation to restore the state stack to the position + // it was when the atomic parens were entered. + stoOp := c.out.compiledPat[c.matchOpenParen+1] + if stoOp.Type() != URX_STO_SP { + panic("bad type in capture op (expected URX_STO_SP)") + } + stoLoc := stoOp.Value() + c.appendOp(URX_LD_SP, stoLoc) + + case parenLookahead: + startOp := c.out.compiledPat[c.matchOpenParen-5] + if startOp.Type() != URX_LA_START { + panic("bad type in capture op (expected URX_LA_START)") + } + dataLoc := startOp.Value() + c.appendOp(URX_LA_END, dataLoc) + + case parenNegLookahead: + startOp := c.out.compiledPat[c.matchOpenParen-1] + if startOp.Type() != URX_LA_START { + panic("bad type in capture op (expected URX_LA_START)") + } + dataLoc := startOp.Value() + c.appendOp(URX_LA_END, dataLoc) + c.appendOp(URX_BACKTRACK, 0) + c.appendOp(URX_LA_END, dataLoc) + + // Patch the URX_SAVE near the top of the block. + // The destination of the SAVE is the final LA_END that was just added. 
+ saveOp := c.out.compiledPat[c.matchOpenParen] + if saveOp.Type() != URX_STATE_SAVE { + panic("bad type in capture op (expected URX_STATE_SAVE)") + } + saveOp = c.buildOp(URX_STATE_SAVE, len(c.out.compiledPat)-1) + c.out.compiledPat[c.matchOpenParen] = saveOp + + case parenLookBehind: + startOp := c.out.compiledPat[c.matchOpenParen-4] + if startOp.Type() != URX_LB_START { + panic("bad type in capture op (expected URX_LB_START)") + } + dataLoc := startOp.Value() + c.appendOp(URX_LB_END, dataLoc) + c.appendOp(URX_LA_END, dataLoc) + + // Determine the min and max bounds for the length of the + // string that the pattern can match. + // An unbounded upper limit is an error. + patEnd := len(c.out.compiledPat) - 1 + minML := c.minMatchLength(c.matchOpenParen, patEnd) + maxML := c.maxMatchLength(c.matchOpenParen, patEnd) + + if maxML == math.MaxInt32 { + c.error(uerror.U_REGEX_LOOK_BEHIND_LIMIT) + break + } + if minML == math.MaxInt32 { + // This condition happens when no match is possible, such as with a + // [set] expression containing no elements. + // In principle, the generated code to evaluate the expression could be deleted, + // but it's probably not worth the complication. + minML = 0 + } + + c.out.compiledPat[c.matchOpenParen-2] = Instruction(minML) + c.out.compiledPat[c.matchOpenParen-1] = Instruction(maxML) + + case parenLookBehindN: + startOp := c.out.compiledPat[c.matchOpenParen-5] + if startOp.Type() != URX_LB_START { + panic("bad type in capture op (expected URX_LB_START)") + } + dataLoc := startOp.Value() + c.appendOp(URX_LBN_END, dataLoc) + + // Determine the min and max bounds for the length of the + // string that the pattern can match. + // An unbounded upper limit is an error. 
+ patEnd := len(c.out.compiledPat) - 1 + minML := c.minMatchLength(c.matchOpenParen, patEnd) + maxML := c.maxMatchLength(c.matchOpenParen, patEnd) + + if Instruction(maxML).Type() != 0 { + c.error(uerror.U_REGEX_LOOK_BEHIND_LIMIT) + break + } + if maxML == math.MaxInt32 { + c.error(uerror.U_REGEX_LOOK_BEHIND_LIMIT) + break + } + if minML == math.MaxInt32 { + // This condition happens when no match is possible, such as with a + // [set] expression containing no elements. + // In principle, the generated code to evaluate the expression could be deleted, + // but it's probably not worth the complication. + minML = 0 + } + + c.out.compiledPat[c.matchOpenParen-3] = Instruction(minML) + c.out.compiledPat[c.matchOpenParen-2] = Instruction(maxML) + + op := c.buildOp(URX_RELOC_OPRND, len(c.out.compiledPat)) + c.out.compiledPat[c.matchOpenParen-1] = op + + default: + panic("unexpected opcode in parenStack") + } + + c.matchCloseParen = len(c.out.compiledPat) +} + +func (c *Compiler) fixLiterals(split bool) { + if len(c.literalChars) == 0 { + return + } + + lastCodePoint := c.literalChars[len(c.literalChars)-1] + + // Split: We need to ensure that the last item in the compiled pattern + // refers only to the last literal scanned in the pattern, so that + // quantifiers (*, +, etc.) affect only it, and not a longer string. + // Split before case folding for case insensitive matches. 
+ if split { + c.literalChars = c.literalChars[:len(c.literalChars)-1] + c.fixLiterals(false) + + c.literalChar(lastCodePoint) + c.fixLiterals(false) + return + } + + if c.modeFlags&UREGEX_CASE_INSENSITIVE != 0 { + c.literalChars = ucase.FoldRunes(c.literalChars) + lastCodePoint = c.literalChars[len(c.literalChars)-1] + } + + if len(c.literalChars) == 1 { + if c.modeFlags&UREGEX_CASE_INSENSITIVE != 0 && uprops.HasBinaryProperty(lastCodePoint, uprops.UCHAR_CASE_SENSITIVE) { + c.appendOp(URX_ONECHAR_I, int(lastCodePoint)) + } else { + c.appendOp(URX_ONECHAR, int(lastCodePoint)) + } + } else { + if len(c.literalChars) > 0x00ffffff || len(c.out.literalText) > 0x00ffffff { + c.error(uerror.U_REGEX_PATTERN_TOO_BIG) + } + if c.modeFlags&UREGEX_CASE_INSENSITIVE != 0 { + c.appendOp(URX_STRING_I, len(c.out.literalText)) + } else { + c.appendOp(URX_STRING, len(c.out.literalText)) + } + c.appendOp(URX_STRING_LEN, len(c.literalChars)) + c.out.literalText = append(c.out.literalText, c.literalChars...) 
+ } + + c.literalChars = c.literalChars[:0] +} + +func (c *Compiler) literalChar(point rune) { + c.literalChars = append(c.literalChars, point) +} + +func (c *Compiler) allocateData(size int) int { + if c.err != nil { + return 0 + } + if size <= 0 || size > 0x100 || c.out.dataSize < 0 { + c.error(uerror.U_REGEX_INTERNAL_ERROR) + return 0 + } + + dataIndex := c.out.dataSize + c.out.dataSize += size + if c.out.dataSize >= 0x00fffff0 { + c.error(uerror.U_REGEX_INTERNAL_ERROR) + } + return dataIndex +} + +func (c *Compiler) allocateStackData(size int) int { + if c.err != nil { + return 0 + } + if size <= 0 || size > 0x100 || c.out.frameSize < 0 { + c.error(uerror.U_REGEX_INTERNAL_ERROR) + return 0 + } + dataIndex := c.out.frameSize + c.out.frameSize += size + if c.out.frameSize >= 0x00fffff0 { + c.error(uerror.U_REGEX_INTERNAL_ERROR) + } + return dataIndex +} + +func (c *Compiler) insertOp(where int) { + if where < 0 || where >= len(c.out.compiledPat) { + panic("insertOp: out of bounds") + } + + nop := c.buildOp(URX_NOP, 0) + c.out.compiledPat = slices.Insert(c.out.compiledPat, where, nop) + + // Walk through the pattern, looking for any ops with targets that + // were moved down by the insert. Fix them. + for loc, op := range c.out.compiledPat { + switch op.Type() { + case URX_JMP, URX_JMPX, URX_STATE_SAVE, URX_CTR_LOOP, URX_CTR_LOOP_NG, URX_JMP_SAV, URX_JMP_SAV_X, URX_RELOC_OPRND: + if int(op.Value()) > where { + op = c.buildOp(op.Type(), op.Value()+1) + c.out.compiledPat[loc] = op + } + } + } + + // Now fix up the parentheses stack. All positive values in it are locations in + // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.) 
// blockTopLoc returns the pattern location of the start of the item that was
// compiled most recently, i.e. the item a quantifier is about to apply to.
//
// Pending literals are flushed first, split so that only the last literal is
// affected. If the last item is a parenthesized block (the pattern ends exactly
// at c.matchCloseParen) the result is c.matchOpenParen. Otherwise it is the
// location of the last op, or of the string op when the last op is a
// URX_STRING_LEN. When reserve is true and the item is not a parenthesized
// block, a NOP is inserted at that location to make room for an op to be
// filled in by the caller.
func (c *Compiler) blockTopLoc(reserve bool) int {
	var loc int
	c.fixLiterals(true)

	if len(c.out.compiledPat) == c.matchCloseParen {
		// The item just processed is a parenthesized block.
		loc = c.matchOpenParen
	} else {
		// Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
		// No slot for STATE_SAVE was pre-reserved in the compiled code.
		// We need to make space now.
		loc = len(c.out.compiledPat) - 1
		op := c.out.compiledPat[loc]
		if op.Type() == URX_STRING_LEN {
			// Strings take two opcodes, we want the position of the first one.
			// We can have a string at this point if a single character case-folded to two.
			loc--
		}
		if reserve {
			nop := c.buildOp(URX_NOP, 0)
			c.out.compiledPat = slices.Insert(c.out.compiledPat, loc, nop)
		}
	}
	return loc
}

// compileInlineInterval tries to compile a {min,max} interval by repeating the
// quantified op inline instead of generating a counted loop.
//
// It returns false, leaving the interval to be compiled as a loop, when the
// upper bound is greater than 10 or smaller than the lower bound, or when the
// quantified item is more than a single op and the upper bound is not 1. Note
// that blockTopLoc has already flushed pending literals by the time the second
// condition is evaluated. It returns true once the inline code has been
// generated, which for an upper bound of 0 means discarding the item.
func (c *Compiler) compileInlineInterval() bool {
	if c.intervalUpper > 10 || c.intervalUpper < c.intervalLow {
		return false
	}

	topOfBlock := c.blockTopLoc(false)
	if c.intervalUpper == 0 {
		// Pathological case. Attempt no matches, as if the block doesn't exist.
		// Discard the generated code for the block.
		// If the block included parens, discard the info pertaining to them as well.
		c.out.compiledPat = c.out.compiledPat[:topOfBlock]
		if c.matchOpenParen >= topOfBlock {
			c.matchOpenParen = -1
		}
		if c.matchCloseParen >= topOfBlock {
			c.matchCloseParen = -1
		}
		return true
	}

	if topOfBlock != len(c.out.compiledPat)-1 && c.intervalUpper != 1 {
		// The thing being repeated is not a single op, but some
		// more complex block. Do it as a loop, not inlines.
		// Note that things "repeated" a max of once are handled as inline, because
		// the one copy of the code already generated is just fine.
		return false
	}

	// Pick up the opcode that is to be repeated
	//
	op := c.out.compiledPat[topOfBlock]

	// Compute the pattern location where the inline sequence
	// will end, and set up the state save op that will be needed.
	// The sequence holds intervalUpper copies of the op plus one STATE_SAVE
	// for each optional copy (intervalUpper - intervalLow of them).
	//
	endOfSequenceLoc := len(c.out.compiledPat) - 1 + c.intervalUpper + (c.intervalUpper - c.intervalLow)

	saveOp := c.buildOp(URX_STATE_SAVE, endOfSequenceLoc)
	if c.intervalLow == 0 {
		// Even the first copy is optional: put a STATE_SAVE in front of it.
		c.insertOp(topOfBlock)
		c.out.compiledPat[topOfBlock] = saveOp
	}

	// Loop, emitting the op for the thing being repeated each time.
	// Loop starts at 1 because one instance of the op already exists in the pattern,
	// it was put there when it was originally encountered.
	for i := 1; i < c.intervalUpper; i++ {
		if i >= c.intervalLow {
			c.appendIns(saveOp)
		}
		c.appendIns(op)
	}
	return true
}
+ return false + } + + // Pick up the opcode that is to be repeated + // + op := c.out.compiledPat[topOfBlock] + + // Compute the pattern location where the inline sequence + // will end, and set up the state save op that will be needed. + // + endOfSequenceLoc := len(c.out.compiledPat) - 1 + c.intervalUpper + (c.intervalUpper - c.intervalLow) + + saveOp := c.buildOp(URX_STATE_SAVE, endOfSequenceLoc) + if c.intervalLow == 0 { + c.insertOp(topOfBlock) + c.out.compiledPat[topOfBlock] = saveOp + } + + // Loop, emitting the op for the thing being repeated each time. + // Loop starts at 1 because one instance of the op already exists in the pattern, + // it was put there when it was originally encountered. + for i := 1; i < c.intervalUpper; i++ { + if i >= c.intervalLow { + c.appendIns(saveOp) + } + c.appendIns(op) + } + return true +} + +func (c *Compiler) compileInterval(init Opcode, loop Opcode) { + // The CTR_INIT op at the top of the block with the {n,m} quantifier takes + // four slots in the compiled code. Reserve them. + topOfBlock := c.blockTopLoc(true) + c.insertOp(topOfBlock) + c.insertOp(topOfBlock) + c.insertOp(topOfBlock) + + // The operands for the CTR_INIT opcode include the index in the matcher data + // of the counter. Allocate it now. There are two data items + // counterLoc --> Loop counter + // +1 --> Input index (for breaking non-progressing loops) + // (Only present if unbounded upper limit on loop) + var dataSize int + if c.intervalUpper < 0 { + dataSize = 2 + } else { + dataSize = 1 + } + counterLoc := c.allocateStackData(dataSize) + + op := c.buildOp(init, counterLoc) + c.out.compiledPat[topOfBlock] = op + + // The second operand of CTR_INIT is the location following the end of the loop. + // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the + // compilation of something later on causes the code to grow and the target + // position to move. 
+ loopEnd := len(c.out.compiledPat) + op = c.buildOp(URX_RELOC_OPRND, loopEnd) + c.out.compiledPat[topOfBlock+1] = op + + // Followed by the min and max counts. + c.out.compiledPat[topOfBlock+2] = Instruction(c.intervalLow) + c.out.compiledPat[topOfBlock+3] = Instruction(c.intervalUpper) + + // Append the CTR_LOOP op. The operand is the location of the CTR_INIT op. + // Goes at end of the block being looped over, so just append to the code so far. + c.appendOp(loop, topOfBlock) + + if (c.intervalLow&0xff000000) != 0 || (c.intervalUpper > 0 && (c.intervalUpper&0xff000000) != 0) { + c.error(uerror.U_REGEX_NUMBER_TOO_BIG) + } + + if c.intervalLow > c.intervalUpper && c.intervalUpper != -1 { + c.error(uerror.U_REGEX_MAX_LT_MIN) + } +} + +func (c *Compiler) scanNamedChar() rune { + c.nextChar(&c.c) + if c.c.char != chLBrace { + c.error(uerror.U_REGEX_PROPERTY_SYNTAX) + return 0 + } + + var charName []rune + for { + c.nextChar(&c.c) + if c.c.char == chRBrace { + break + } + if c.c.char == -1 { + c.error(uerror.U_REGEX_PROPERTY_SYNTAX) + return 0 + } + charName = append(charName, c.c.char) + } + + if !isInvariantUString(charName) { + // All Unicode character names have only invariant characters. 
// isInvariantUString reports whether every code point in name belongs to the
// invariant character set described by invariantChars. An empty name is
// invariant. It is legitimately called with names that contain variant
// characters, so it never panics on them.
func isInvariantUString(name []rune) bool {
	for _, c := range name {
		if !UCHAR_IS_INVARIANT(c) {
			return false // found a variant char
		}
	}
	return true
}

// invariantChars is a 128-bit bitmap, one bit per code point 0x00..0x7f, with
// the bit set for the characters that are considered invariant.
var invariantChars = [...]uint32{
	0xfffffbff, /* 00..1f but not 0a */
	0xffffffe5, /* 20..3f but not 21 23 24 */
	0x87fffffe, /* 40..5f but not 40 5b..5e */
	0x87fffffe, /* 60..7f but not 60 7b..7e */
}

// UCHAR_IS_INVARIANT reports whether c is an invariant character according to
// invariantChars. Anything outside 0x00..0x7f, including negative values such
// as an end-of-input marker, is not invariant.
func UCHAR_IS_INVARIANT(c rune) bool {
	// The lower bound matters: a negative c would otherwise produce a negative
	// table index and panic.
	if c < 0 || c > 0x7f {
		return false
	}
	return invariantChars[c>>5]&(uint32(1)<<(c&0x1f)) != 0
}
leftOperand = c.setStack[len(c.setStack)-1] + leftOperand.RetainAll(rightOperand) + + case setUnion: + c.setStack = c.setStack[:len(c.setStack)-1] + leftOperand = c.setStack[len(c.setStack)-1] + leftOperand.AddAll(rightOperand) + + default: + panic("unreachable") + } + } +} + +func safeIncrement(val int32, delta int) int32 { + if delta <= math.MaxInt32 && math.MaxInt32-val > int32(delta) { + return val + int32(delta) + } + return math.MaxInt32 +} + +func (c *Compiler) minMatchLength(start, end int) int32 { + if c.err != nil { + return 0 + } + + // U_ASSERT(start <= end); + // U_ASSERT(end < fRXPat->fCompiledPat->size()); + + var loc int + var currentLen int32 + + // forwardedLength is a vector holding minimum-match-length values that + // are propagated forward in the pattern by JMP or STATE_SAVE operations. + // It must be one longer than the pattern being checked because some ops + // will jmp to a end-of-block+1 location from within a block, and we must + // count those when checking the block. + forwardedLength := make([]int32, end+2) + for i := range forwardedLength { + forwardedLength[i] = math.MaxInt32 + } + + for loc = start; loc <= end; loc++ { + op := c.out.compiledPat[loc] + opType := op.Type() + + // The loop is advancing linearly through the pattern. + // If the op we are now at was the destination of a branch in the pattern, + // and that path has a shorter minimum length than the current accumulated value, + // replace the current accumulated value. + // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some + // no-match-possible cases. 
// minMatchLength computes a lower bound on the length of input that the
// compiled pattern ops in the inclusive range start..end can match.
//
// The ops are walked once, in order. currentLen accumulates the minimum length
// along the straight-line path, while forwardedLength[i] holds the smallest
// length propagated to location i by a forward JMP or STATE_SAVE. Loops and
// look-around blocks are treated pessimistically (as possibly matching nothing),
// so the result may understate the true minimum. The result is math.MaxInt32
// for some patterns that cannot match at all. It returns 0 when a compilation
// error is already pending.
func (c *Compiler) minMatchLength(start, end int) int32 {
	if c.err != nil {
		return 0
	}

	// U_ASSERT(start <= end);
	// U_ASSERT(end < fRXPat->fCompiledPat->size());

	var loc int
	var currentLen int32

	// forwardedLength is a vector holding minimum-match-length values that
	// are propagated forward in the pattern by JMP or STATE_SAVE operations.
	// It must be one longer than the pattern being checked because some ops
	// will jmp to a end-of-block+1 location from within a block, and we must
	// count those when checking the block.
	forwardedLength := make([]int32, end+2)
	for i := range forwardedLength {
		forwardedLength[i] = math.MaxInt32
	}

	for loc = start; loc <= end; loc++ {
		op := c.out.compiledPat[loc]
		opType := op.Type()

		// The loop is advancing linearly through the pattern.
		// If the op we are now at was the destination of a branch in the pattern,
		// and that path has a shorter minimum length than the current accumulated value,
		// replace the current accumulated value.
		// U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some
		// no-match-possible cases.
		if forwardedLength[loc] < currentLen {
			currentLen = forwardedLength[loc]
			// U_ASSERT(currentLen >= 0 && currentLen < INT32_MAX);
		}

		switch opType {
		// Ops that don't change the total length matched
		case URX_RESERVED_OP,
			URX_END,
			URX_STRING_LEN,
			URX_NOP,
			URX_START_CAPTURE,
			URX_END_CAPTURE,
			URX_BACKSLASH_B,
			URX_BACKSLASH_BU,
			URX_BACKSLASH_G,
			URX_BACKSLASH_Z,
			URX_CARET,
			URX_DOLLAR,
			URX_DOLLAR_M,
			URX_DOLLAR_D,
			URX_DOLLAR_MD,
			URX_RELOC_OPRND,
			URX_STO_INP_LOC,
			URX_CARET_M,
			URX_CARET_M_UNIX,
			URX_BACKREF, // BackRef. Must assume that it might be a zero length match
			URX_BACKREF_I,
			URX_STO_SP, // Setup for atomic or possessive blocks. Doesn't change what can match.
			URX_LD_SP,
			URX_JMP_SAV,
			URX_JMP_SAV_X:
			// no-op

			// Ops that match a minimum of one character (one or two 16 bit code units.)
			//
		case URX_ONECHAR,
			URX_STATIC_SETREF,
			URX_STAT_SETREF_N,
			URX_SETREF,
			URX_BACKSLASH_D,
			URX_BACKSLASH_H,
			URX_BACKSLASH_R,
			URX_BACKSLASH_V,
			URX_ONECHAR_I,
			URX_BACKSLASH_X, // Grapheme Cluster. Minimum is 1, max unbounded.
			URX_DOTANY_ALL,  // . matches one or two.
			URX_DOTANY,
			URX_DOTANY_UNIX:
			currentLen = safeIncrement(currentLen, 1)

		case URX_JMPX:
			loc++ // URX_JMPX has an extra operand, ignored here, otherwise processed identically to URX_JMP.
			fallthrough

		case URX_JMP:
			jmpDest := op.Value()
			if int(jmpDest) < loc {
				// Loop of some kind. Can safely ignore, the worst that will happen
				// is that we understate the true minimum length
				currentLen = forwardedLength[loc+1]
			} else {
				// Forward jump. Propagate the current min length to the target loc of the jump.
				// U_ASSERT(jmpDest <= end + 1);
				if forwardedLength[jmpDest] > currentLen {
					forwardedLength[jmpDest] = currentLen
				}
			}

		case URX_BACKTRACK:
			// Back-tracks are kind of like a branch, except that the min length was
			// propagated already, by the state save.
			currentLen = forwardedLength[loc+1]

		case URX_STATE_SAVE:
			// State Save, for forward jumps, propagate the current minimum
			// to the target of the state save.
			jmpDest := op.Value()
			if int(jmpDest) > loc {
				if currentLen < forwardedLength[jmpDest] {
					forwardedLength[jmpDest] = currentLen
				}
			}

		case URX_STRING:
			// The op following the string op holds the string length.
			loc++
			stringLenOp := c.out.compiledPat[loc]
			currentLen = safeIncrement(currentLen, stringLenOp.Value())

		case URX_STRING_I:
			loc++
			// TODO: with full case folding, matching input text may be shorter than
			// the string we have here. More smarts could put some bounds on it.
			// Assume a min length of one for now. A min length of zero causes
			// optimization failures for a pattern like "string"+
			// currentLen += URX_VAL(stringLenOp);
			currentLen = safeIncrement(currentLen, 1)

		case URX_CTR_INIT, URX_CTR_INIT_NG:
			// Loop Init Ops.
			// If the min loop count == 0
			// move loc forwards to the end of the loop, skipping over the body.
			// If the min count is > 0,
			// continue normal processing of the body of the loop.
			loopEndOp := c.out.compiledPat[loc+1]
			loopEndLoc := loopEndOp.Value()
			minLoopCount := c.out.compiledPat[loc+2]
			if minLoopCount == 0 {
				loc = int(loopEndLoc)
			} else {
				loc += 3 // Skips over operands of CTR_INIT
			}

		case URX_CTR_LOOP, URX_CTR_LOOP_NG:
			// Loop ops. The jump is conditional, backwards only.

		case URX_LOOP_SR_I, URX_LOOP_DOT_I, URX_LOOP_C:
			// More loop ops. These state-save to themselves. don't change the minimum match - could match nothing at all.

		case URX_LA_START, URX_LB_START:
			// Look-around. Scan forward until the matching look-ahead end,
			// without processing the look-around block. This is overly pessimistic for look-ahead,
			// it assumes that the look-ahead match might be zero-length.
			// TODO: Positive lookahead could recursively do the block, then continue
			// with the longer of the block or the value coming in. Ticket 6060
			var depth int32
			if opType == URX_LA_START {
				depth = 2
			} else {
				depth = 1
			}

			for {
				loc++
				op = c.out.compiledPat[loc]
				if op.Type() == URX_LA_START {
					// The boilerplate for look-ahead includes two LA_END instructions,
					// Depth will be decremented by each one when it is seen.
					depth += 2
				}
				if op.Type() == URX_LB_START {
					depth++
				}
				if op.Type() == URX_LA_END {
					depth--
					if depth == 0 {
						break
					}
				}
				if op.Type() == URX_LBN_END {
					depth--
					if depth == 0 {
						break
					}
				}
				if op.Type() == URX_STATE_SAVE {
					// Need this because neg lookahead blocks will FAIL to outside of the block.
					jmpDest := op.Value()
					if int(jmpDest) > loc {
						if currentLen < forwardedLength[jmpDest] {
							forwardedLength[jmpDest] = currentLen
						}
					}
				}
				// U_ASSERT(loc <= end);
			}

		case URX_LA_END, URX_LB_CONT, URX_LB_END, URX_LBN_CONT, URX_LBN_END:
			// Only come here if the matching URX_LA_START or URX_LB_START was not in the
			// range being sized, which happens when measuring size of look-behind blocks.

		default:
			panic("unreachable")
		}
	}

	// We have finished walking through the ops. Check whether some forward jump
	// propagated a shorter length to location end+1.
	if forwardedLength[end+1] < currentLen {
		currentLen = forwardedLength[end+1]
		// U_ASSERT(currentLen >= 0 && currentLen < INT32_MAX)
	}

	return currentLen
}
+ // If the op we are now at was the destination of a branch in the pattern, + // and that path has a longer maximum length than the current accumulated value, + // replace the current accumulated value. + if forwardedLength[loc] > currentLen { + currentLen = forwardedLength[loc] + } + + switch opType { + // Ops that don't change the total length matched + case URX_RESERVED_OP, + URX_END, + URX_STRING_LEN, + URX_NOP, + URX_START_CAPTURE, + URX_END_CAPTURE, + URX_BACKSLASH_B, + URX_BACKSLASH_BU, + URX_BACKSLASH_G, + URX_BACKSLASH_Z, + URX_CARET, + URX_DOLLAR, + URX_DOLLAR_M, + URX_DOLLAR_D, + URX_DOLLAR_MD, + URX_RELOC_OPRND, + URX_STO_INP_LOC, + URX_CARET_M, + URX_CARET_M_UNIX, + URX_STO_SP, // Setup for atomic or possessive blocks. Doesn't change what can match. + URX_LD_SP, + URX_LB_END, + URX_LB_CONT, + URX_LBN_CONT, + URX_LBN_END: + // no-op + + // Ops that increase that cause an unbounded increase in the length + // of a matched string, or that increase it a hard to characterize way. + // Call the max length unbounded, and stop further checking. + case URX_BACKREF, // BackRef. Must assume that it might be a zero length match + URX_BACKREF_I, + URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. + currentLen = math.MaxInt32 + + // Ops that match a max of one character (possibly two 16 bit code units.) + // + case URX_STATIC_SETREF, + URX_STAT_SETREF_N, + URX_SETREF, + URX_BACKSLASH_D, + URX_BACKSLASH_H, + URX_BACKSLASH_R, + URX_BACKSLASH_V, + URX_ONECHAR_I, + URX_DOTANY_ALL, + URX_DOTANY, + URX_DOTANY_UNIX: + currentLen = safeIncrement(currentLen, 2) + + // Single literal character. Increase current max length by one or two, + // depending on whether the char is in the supplementary range. + case URX_ONECHAR: + currentLen = safeIncrement(currentLen, 1) + if op.Value() > 0x10000 { + currentLen = safeIncrement(currentLen, 1) + } + + // Jumps. 
+ // + case URX_JMP, URX_JMPX, URX_JMP_SAV, URX_JMP_SAV_X: + jmpDest := int(op.Value()) + if jmpDest < loc { + // Loop of some kind. Max match length is unbounded. + currentLen = math.MaxInt32 + } else { + // Forward jump. Propagate the current min length to the target loc of the jump. + if forwardedLength[jmpDest] < currentLen { + forwardedLength[jmpDest] = currentLen + } + currentLen = 0 + } + + case URX_BACKTRACK: + // back-tracks are kind of like a branch, except that the max length was + // propagated already, by the state save. + currentLen = forwardedLength[loc+1] + + case URX_STATE_SAVE: + // State Save, for forward jumps, propagate the current minimum. + // of the state save. + // For backwards jumps, they create a loop, maximum + // match length is unbounded. + jmpDest := int(op.Value()) + if jmpDest > loc { + if currentLen > forwardedLength[jmpDest] { + forwardedLength[jmpDest] = currentLen + } + } else { + currentLen = math.MaxInt32 + } + + case URX_STRING: + loc++ + stringLenOp := c.out.compiledPat[loc] + currentLen = safeIncrement(currentLen, stringLenOp.Value()) + + case URX_STRING_I: + // TODO: This code assumes that any user string that matches will be no longer + // than our compiled string, with case insensitive matching. + // Our compiled string has been case-folded already. + // + // Any matching user string will have no more code points than our + // compiled (folded) string. Folding may add code points, but + // not remove them. + // + // There is a potential problem if a supplemental code point + // case-folds to a BMP code point. In this case our compiled string + // could be shorter (in code units) than a matching user string. + // + // At this time (Unicode 6.1) there are no such characters, and this case + // is not being handled. A test, intltest regex/Bug9283, will fail if + // any problematic characters are added to Unicode. 
+ // + // If this happens, we can make a set of the BMP chars that the + // troublesome supplementals fold to, scan our string, and bump the + // currentLen one extra for each that is found. + // + loc++ + stringLenOp := c.out.compiledPat[loc] + currentLen = safeIncrement(currentLen, stringLenOp.Value()) + + case URX_CTR_INIT, URX_CTR_INIT_NG: + // For Loops, recursively call this function on the pattern for the loop body, + // then multiply the result by the maximum loop count. + loopEndLoc := int(c.out.compiledPat[loc+1].Value()) + if loopEndLoc == loc+4 { + // Loop has an empty body. No affect on max match length. + // Continue processing with code after the loop end. + loc = loopEndLoc + break + } + + maxLoopCount := int(c.out.compiledPat[loc+3]) + if maxLoopCount == -1 { + // Unbounded Loop. No upper bound on match length. + currentLen = math.MaxInt32 + break + } + + // U_ASSERT(loopEndLoc >= loc + 4); + blockLen := c.maxMatchLength(loc+4, loopEndLoc-1) // Recursive call. + updatedLen := int(currentLen) + int(blockLen)*maxLoopCount + if updatedLen >= math.MaxInt32 { + currentLen = math.MaxInt32 + break + } + currentLen = int32(updatedLen) + loc = loopEndLoc + + case URX_CTR_LOOP, URX_CTR_LOOP_NG: + panic("should not encounter this opcode") + + case URX_LOOP_SR_I, URX_LOOP_DOT_I, URX_LOOP_C: + // For anything to do with loops, make the match length unbounded. + currentLen = math.MaxInt32 + + case URX_LA_START, URX_LA_END: + // Look-ahead. Just ignore, treat the look-ahead block as if + // it were normal pattern. Gives a too-long match length, + // but good enough for now. + + case URX_LB_START: + // Look-behind. Scan forward until the matching look-around end, + // without processing the look-behind block. 
+ dataLoc := op.Value() + for loc = loc + 1; loc <= end; loc++ { + op = c.out.compiledPat[loc] + if (op.Type() == URX_LA_END || op.Type() == URX_LBN_END) && (op.Value() == dataLoc) { + break + } + } + // U_ASSERT(loc <= end); + + default: + panic("unreachable") + } + + if currentLen == math.MaxInt32 { + // The maximum length is unbounded. + // Stop further processing of the pattern. + break + } + } + + return currentLen +} + +// Machine Generated below. +// It may need updating with new versions of Unicode. +// Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed. +// The update tool is here: +// svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing + +// Machine Generated Data. Do not hand edit. +var RECaseFixCodePoints = [...]rune{ + 0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc, + 0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565, + 0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07, + 0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61, + 0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000} + +var RECaseFixStringOffsets = [...]int16{ + 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, + 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f, 0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, + 0x3d, 0x3f, 0x41, 0x43, 0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, + 0x57, 0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0} + +var RECaseFixCounts = [...]int16{ + 0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, + 0x4, 0x1, 0x1, 0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, + 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0} + +var RECaseFixData = [...]uint16{ + 0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf, 0x1e9e, 0xfb05, + 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 
0x1fc4, 0x1fb3, 0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, + 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3, 0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, + 0x1f54, 0x1f56, 0x1fe2, 0x1fe3, 0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, + 0xfb13, 0xfb14, 0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83, + 0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90, 0x1f98, 0x1f91, + 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95, 0x1f9d, 0x1f96, 0x1f9e, 0x1f97, + 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2, 0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, + 0x1fad, 0x1fa6, 0x1fae, 0x1fa7, 0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0} + +func (c *Compiler) findCaseInsensitiveStarters(ch rune, starterChars *uset.UnicodeSet) { + if uprops.HasBinaryProperty(ch, uprops.UCHAR_CASE_SENSITIVE) { + caseFoldedC := ucase.Fold(ch) + starterChars.Clear() + starterChars.AddRune(caseFoldedC) + + var i int + for i = 0; RECaseFixCodePoints[i] < ch; i++ { + // Simple linear search through the sorted list of interesting code points. + } + + if RECaseFixCodePoints[i] == ch { + data := RECaseFixData[RECaseFixStringOffsets[i]:] + numCharsToAdd := RECaseFixCounts[i] + for j := int16(0); j < numCharsToAdd; j++ { + var cpToAdd rune + cpToAdd, data = utf16.NextUnsafe(data) + starterChars.AddRune(cpToAdd) + } + } + + starterChars.CloseOver(uset.USET_CASE_INSENSITIVE) + } else { + // Not a cased character. Just return it alone. 
+ starterChars.Clear() + starterChars.AddRune(ch) + } +} + +func (c *Compiler) scanProp() *uset.UnicodeSet { + if c.err != nil { + return nil + } + negated := c.c.char == chP + + c.nextChar(&c.c) + if c.c.char != chLBrace { + c.error(uerror.U_REGEX_PROPERTY_SYNTAX) + return nil + } + + var propertyName strings.Builder + for { + c.nextChar(&c.c) + if c.c.char == chRBrace { + break + } + if c.c.char == -1 { + c.error(uerror.U_REGEX_PROPERTY_SYNTAX) + return nil + } + propertyName.WriteRune(c.c.char) + } + + ss := c.createSetForProperty(propertyName.String(), negated) + c.nextChar(&c.c) + return ss +} + +func (c *Compiler) createSetForProperty(propName string, negated bool) *uset.UnicodeSet { + if c.err != nil { + return nil + } + + var set *uset.UnicodeSet + + var usetFlags uset.USet + if c.modeFlags&UREGEX_CASE_INSENSITIVE != 0 { + usetFlags |= uset.USET_CASE_INSENSITIVE + } + + var err error + set, err = uset.ParsePattern("\\p{"+propName+"}", usetFlags) + if err == nil { + goto done + } + + // + // The incoming property wasn't directly recognized by ICU. + + // Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet. + // Java accepts 'word' with mixed case. + // Java accepts 'all' only in all lower case. + if strings.EqualFold(propName, "word") { + set = staticPropertySets[URX_ISWORD_SET].Clone() + goto done + } + if propName == "all" { + set = uset.New() + set.AddRuneRange(0, 0x10ffff) + goto done + } + + // Do Java InBlock expressions + // + if strings.HasPrefix(propName, "In") && len(propName) >= 3 { + set = uset.New() + if set.ApplyPropertyAlias("Block", propName[2:]) != nil { + c.error(uerror.U_REGEX_PROPERTY_SYNTAX) + } + goto done + } + + // Check for the Java form "IsBooleanPropertyValue", which we will recast + // as "BooleanPropertyValue". The property value can be either a + // a General Category or a Script Name. 
+ if strings.HasPrefix(propName, "Is") && len(propName) >= 3 { + mPropName := propName[2:] + if strings.IndexByte(mPropName, '=') >= 0 { + c.error(uerror.U_REGEX_PROPERTY_SYNTAX) + goto done + } + + if strings.EqualFold(mPropName, "assigned") { + mPropName = "unassigned" + negated = !negated + } else if strings.EqualFold(mPropName, "TitleCase") { + mPropName = "Titlecase_Letter" + } + + set, err = uset.ParsePattern("\\p{"+mPropName+"}", 0) + if err != nil { + c.error(uerror.U_REGEX_PROPERTY_SYNTAX) + } else if !set.IsEmpty() && (usetFlags&uset.USET_CASE_INSENSITIVE) != 0 { + set.CloseOver(uset.USET_CASE_INSENSITIVE) + } + goto done + } + + if strings.HasPrefix(propName, "java") { + set = uset.New() + + // + // Try the various Java specific properties. + // These all begin with "java" + // + if propName == "javaDefined" { + set.AddCategory(uchar.U_GC_CN_MASK) + set.Complement() + } else if propName == "javaDigit" { + set.AddCategory(uchar.U_GC_ND_MASK) + } else if propName == "javaIdentifierIgnorable" { + addIdentifierIgnorable(set) + } else if propName == "javaISOControl" { + set.AddRuneRange(0, 0x1F) + set.AddRuneRange(0x7F, 0x9F) + } else if propName == "javaJavaIdentifierPart" { + set.AddCategory(uchar.U_GC_L_MASK) + set.AddCategory(uchar.U_GC_SC_MASK) + set.AddCategory(uchar.U_GC_PC_MASK) + set.AddCategory(uchar.U_GC_ND_MASK) + set.AddCategory(uchar.U_GC_NL_MASK) + set.AddCategory(uchar.U_GC_MC_MASK) + set.AddCategory(uchar.U_GC_MN_MASK) + addIdentifierIgnorable(set) + } else if propName == "javaJavaIdentifierStart" { + set.AddCategory(uchar.U_GC_L_MASK) + set.AddCategory(uchar.U_GC_NL_MASK) + set.AddCategory(uchar.U_GC_SC_MASK) + set.AddCategory(uchar.U_GC_PC_MASK) + } else if propName == "javaLetter" { + set.AddCategory(uchar.U_GC_L_MASK) + } else if propName == "javaLetterOrDigit" { + set.AddCategory(uchar.U_GC_L_MASK) + set.AddCategory(uchar.U_GC_ND_MASK) + } else if propName == "javaLowerCase" { + set.AddCategory(uchar.U_GC_LL_MASK) + } else if propName == 
"javaMirrored" { + set.ApplyIntPropertyValue(uprops.UCHAR_BIDI_MIRRORED, 1) + } else if propName == "javaSpaceChar" { + set.AddCategory(uchar.U_GC_Z_MASK) + } else if propName == "javaSupplementaryCodePoint" { + set.AddRuneRange(0x10000, uset.MAX_VALUE) + } else if propName == "javaTitleCase" { + set.AddCategory(uchar.U_GC_LT_MASK) + } else if propName == "javaUnicodeIdentifierStart" { + set.AddCategory(uchar.U_GC_L_MASK) + set.AddCategory(uchar.U_GC_NL_MASK) + } else if propName == "javaUnicodeIdentifierPart" { + set.AddCategory(uchar.U_GC_L_MASK) + set.AddCategory(uchar.U_GC_PC_MASK) + set.AddCategory(uchar.U_GC_ND_MASK) + set.AddCategory(uchar.U_GC_NL_MASK) + set.AddCategory(uchar.U_GC_MC_MASK) + set.AddCategory(uchar.U_GC_MN_MASK) + addIdentifierIgnorable(set) + } else if propName == "javaUpperCase" { + set.AddCategory(uchar.U_GC_LU_MASK) + } else if propName == "javaValidCodePoint" { + set.AddRuneRange(0, uset.MAX_VALUE) + } else if propName == "javaWhitespace" { + set.AddCategory(uchar.U_GC_Z_MASK) + excl := uset.New() + excl.AddRune(0x0a) + excl.AddRune(0x2007) + excl.AddRune(0x202f) + set.RemoveAll(excl) + set.AddRuneRange(9, 0x0d) + set.AddRuneRange(0x1c, 0x1f) + } else { + c.error(uerror.U_REGEX_PROPERTY_SYNTAX) + } + + if c.err == nil && !set.IsEmpty() && (usetFlags&uset.USET_CASE_INSENSITIVE) != 0 { + set.CloseOver(uset.USET_CASE_INSENSITIVE) + } + goto done + } + + // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility + // extensions matched it. 
+ c.error(uerror.U_REGEX_PROPERTY_SYNTAX) + +done: + if c.err != nil { + return nil + } + if negated { + set.Complement() + } + return set +} + +func addIdentifierIgnorable(set *uset.UnicodeSet) { + set.AddRuneRange(0, 8) + set.AddRuneRange(0x0e, 0x1b) + set.AddRuneRange(0x7f, 0x9f) + + set.AddCategory(uchar.U_GC_CF_MASK) +} + +func (c *Compiler) scanPosixProp() *uset.UnicodeSet { + var set *uset.UnicodeSet + + if !(c.c.char == chColon) { + panic("assertion failed: c.lastChar == ':'") + } + + savedScanIndex := c.scanIndex + savedScanPattern := c.p + savedQuoteMode := c.quoteMode + savedInBackslashQuote := c.inBackslashQuote + savedEOLComments := c.eolComments + savedLineNum := c.lineNum + savedCharNum := c.charNum + savedLastChar := c.lastChar + savedPeekChar := c.peekChar + savedC := c.c + + // Scan for a closing ]. A little tricky because there are some perverse + // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression, + // ending on the second closing ]. + var propName []rune + negated := false + + // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:] + c.nextChar(&c.c) + if c.c.char == chUp { + negated = true + c.nextChar(&c.c) + } + + // Scan for the closing ":]", collecting the property name along the way. + sawPropSetTerminator := false + for { + propName = append(propName, c.c.char) + c.nextChar(&c.c) + if c.c.quoted || c.c.char == -1 { + // Escaped characters or end of input - either says this isn't a [:Property:] + break + } + if c.c.char == chColon { + c.nextChar(&c.c) + if c.c.char == chRBracket { + sawPropSetTerminator = true + break + } + } + } + + if sawPropSetTerminator { + set = c.createSetForProperty(string(propName), negated) + } else { + // No closing ']' - not a [:Property:] + // Restore the original scan position. + // The main scanner will retry the input as a normal set expression, + // not a [:Property:] expression. 
+ c.scanIndex = savedScanIndex + c.p = savedScanPattern + c.quoteMode = savedQuoteMode + c.inBackslashQuote = savedInBackslashQuote + c.eolComments = savedEOLComments + c.lineNum = savedLineNum + c.charNum = savedCharNum + c.lastChar = savedLastChar + c.peekChar = savedPeekChar + c.c = savedC + } + + return set +} + +func (c *Compiler) compileSet(set *uset.UnicodeSet) { + if set == nil { + return + } + // Remove any strings from the set. + // There shoudn't be any, but just in case. + // (Case Closure can add them; if we had a simple case closure available that + // ignored strings, that would be better.) + setSize := set.Len() + + switch setSize { + case 0: + // Set of no elements. Always fails to match. + c.appendOp(URX_BACKTRACK, 0) + + case 1: + // The set contains only a single code point. Put it into + // the compiled pattern as a single char operation rather + // than a set, and discard the set itself. + c.literalChar(set.RuneAt(0)) + + default: + // The set contains two or more chars. (the normal case) + // Put it into the compiled pattern as a set. + // theSet->freeze(); + setNumber := len(c.out.sets) + c.out.sets = append(c.out.sets, set) + c.appendOp(URX_SETREF, setNumber) + } +} diff --git a/go/mysql/icuregex/compiler_table.go b/go/mysql/icuregex/compiler_table.go new file mode 100644 index 00000000000..609eb3764bf --- /dev/null +++ b/go/mysql/icuregex/compiler_table.go @@ -0,0 +1,357 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package icuregex + +type patternParseAction uint8 + +const ( + doSetBackslash_D patternParseAction = iota + doBackslashh + doBackslashH + doSetLiteralEscaped + doOpenLookAheadNeg + doCompleteNamedBackRef + doPatStart + doBackslashS + doBackslashD + doNGStar + doNOP + doBackslashX + doSetLiteral + doContinueNamedCapture + doBackslashG + doBackslashR + doSetBegin + doSetBackslash_v + doPossessivePlus + doPerlInline + doBackslashZ + doSetAddAmp + doSetBeginDifference1 + doIntervalError + doSetNegate + doIntervalInit + doSetIntersection2 + doPossessiveInterval + doRuleError + doBackslashW + doContinueNamedBackRef + doOpenNonCaptureParen + doExit + doSetNamedChar + doSetBackslash_V + doConditionalExpr + doEscapeError + doBadOpenParenType + doPossessiveStar + doSetAddDash + doEscapedLiteralChar + doSetBackslash_w + doIntervalUpperDigit + doBackslashv + doSetBackslash_S + doSetNoCloseError + doSetProp + doBackslashB + doSetEnd + doSetRange + doMatchModeParen + doPlus + doBackslashV + doSetMatchMode + doBackslashz + doSetNamedRange + doOpenLookBehindNeg + doInterval + doBadNamedCapture + doBeginMatchMode + doBackslashd + doPatFinish + doNamedChar + doNGPlus + doSetDifference2 + doSetBackslash_H + doCloseParen + doDotAny + doOpenCaptureParen + doEnterQuoteMode + doOpenAtomicParen + doBadModeFlag + doSetBackslash_d + doSetFinish + doProperty + doBeginNamedBackRef + doBackRef + doOpt + doDollar + doBeginNamedCapture + doNGInterval + doSetOpError + doSetPosixProp + doSetBeginIntersection1 + doBackslashb + doSetBeginUnion + doIntevalLowerDigit + doSetBackslash_h + doStar + 
// regexTableEl represents the structure of a row in the transition table
// (parseStateTable) for the pattern parser state machine.
type regexTableEl struct {
	// action is the parse action listed for this row.
	action patternParseAction
	// charClass selects the input this row applies to. In parseStateTable it
	// is either a literal ASCII value (e.g. 91 for '[') or a value of 128 and
	// above; presumably the latter name character classes or wildcards such as
	// "any character" or "end of pattern" - TODO confirm against the parser
	// loop that consumes this table.
	charClass uint8
	// nextState is presumably the parseStateTable index of the first row of
	// the state to enter next; 255 looks like a "pop the state stack" marker -
	// TODO confirm.
	nextState uint8
	// pushState is presumably a state index to push for a later pop, with 0
	// meaning nothing is pushed - TODO confirm.
	pushState uint8
	// nextChar presumably tells the parser whether to advance to the next
	// pattern character after taking this row - TODO confirm.
	nextChar bool
}
*/, 29, 0, true}, // 27 open-paren + {doOpenCaptureParen, 255, 2, 14, false}, // 28 + {doOpenNonCaptureParen, 58 /* : */, 2, 14, true}, // 29 open-paren-extended + {doOpenAtomicParen, 62 /* > */, 2, 14, true}, // 30 + {doOpenLookAhead, 61 /* = */, 2, 20, true}, // 31 + {doOpenLookAheadNeg, 33 /* ! */, 2, 20, true}, // 32 + {doNOP, 60 /* < */, 46, 0, true}, // 33 + {doNOP, 35 /* # */, 50, 2, true}, // 34 + {doBeginMatchMode, 105 /* i */, 53, 0, false}, // 35 + {doBeginMatchMode, 100 /* d */, 53, 0, false}, // 36 + {doBeginMatchMode, 109 /* m */, 53, 0, false}, // 37 + {doBeginMatchMode, 115 /* s */, 53, 0, false}, // 38 + {doBeginMatchMode, 117 /* u */, 53, 0, false}, // 39 + {doBeginMatchMode, 119 /* w */, 53, 0, false}, // 40 + {doBeginMatchMode, 120 /* x */, 53, 0, false}, // 41 + {doBeginMatchMode, 45 /* - */, 53, 0, false}, // 42 + {doConditionalExpr, 40 /* ( */, 206, 0, true}, // 43 + {doPerlInline, 123 /* { */, 206, 0, true}, // 44 + {doBadOpenParenType, 255, 206, 0, false}, // 45 + {doOpenLookBehind, 61 /* = */, 2, 20, true}, // 46 open-paren-lookbehind + {doOpenLookBehindNeg, 33 /* ! 
*/, 2, 20, true}, // 47 + {doBeginNamedCapture, 129, 64, 0, false}, // 48 + {doBadOpenParenType, 255, 206, 0, false}, // 49 + {doNOP, 41 /* ) */, 255, 0, true}, // 50 paren-comment + {doMismatchedParenErr, 253, 206, 0, false}, // 51 + {doNOP, 255, 50, 0, true}, // 52 + {doMatchMode, 105 /* i */, 53, 0, true}, // 53 paren-flag + {doMatchMode, 100 /* d */, 53, 0, true}, // 54 + {doMatchMode, 109 /* m */, 53, 0, true}, // 55 + {doMatchMode, 115 /* s */, 53, 0, true}, // 56 + {doMatchMode, 117 /* u */, 53, 0, true}, // 57 + {doMatchMode, 119 /* w */, 53, 0, true}, // 58 + {doMatchMode, 120 /* x */, 53, 0, true}, // 59 + {doMatchMode, 45 /* - */, 53, 0, true}, // 60 + {doSetMatchMode, 41 /* ) */, 2, 0, true}, // 61 + {doMatchModeParen, 58 /* : */, 2, 14, true}, // 62 + {doBadModeFlag, 255, 206, 0, false}, // 63 + {doContinueNamedCapture, 129, 64, 0, true}, // 64 named-capture + {doContinueNamedCapture, 128, 64, 0, true}, // 65 + {doOpenCaptureParen, 62 /* > */, 2, 14, true}, // 66 + {doBadNamedCapture, 255, 206, 0, false}, // 67 + {doNGStar, 63 /* ? */, 20, 0, true}, // 68 quant-star + {doPossessiveStar, 43 /* + */, 20, 0, true}, // 69 + {doStar, 255, 20, 0, false}, // 70 + {doNGPlus, 63 /* ? */, 20, 0, true}, // 71 quant-plus + {doPossessivePlus, 43 /* + */, 20, 0, true}, // 72 + {doPlus, 255, 20, 0, false}, // 73 + {doNGOpt, 63 /* ? */, 20, 0, true}, // 74 quant-opt + {doPossessiveOpt, 43 /* + */, 20, 0, true}, // 75 + {doOpt, 255, 20, 0, false}, // 76 + {doNOP, 128, 79, 0, false}, // 77 interval-open + {doIntervalError, 255, 206, 0, false}, // 78 + {doIntevalLowerDigit, 128, 79, 0, true}, // 79 interval-lower + {doNOP, 44 /* , */, 83, 0, true}, // 80 + {doIntervalSame, 125 /* } */, 86, 0, true}, // 81 + {doIntervalError, 255, 206, 0, false}, // 82 + {doIntervalUpperDigit, 128, 83, 0, true}, // 83 interval-upper + {doNOP, 125 /* } */, 86, 0, true}, // 84 + {doIntervalError, 255, 206, 0, false}, // 85 + {doNGInterval, 63 /* ? 
*/, 20, 0, true}, // 86 interval-type + {doPossessiveInterval, 43 /* + */, 20, 0, true}, // 87 + {doInterval, 255, 20, 0, false}, // 88 + {doBackslashA, 65 /* A */, 2, 0, true}, // 89 backslash + {doBackslashB, 66 /* B */, 2, 0, true}, // 90 + {doBackslashb, 98 /* b */, 2, 0, true}, // 91 + {doBackslashd, 100 /* d */, 14, 0, true}, // 92 + {doBackslashD, 68 /* D */, 14, 0, true}, // 93 + {doBackslashG, 71 /* G */, 2, 0, true}, // 94 + {doBackslashh, 104 /* h */, 14, 0, true}, // 95 + {doBackslashH, 72 /* H */, 14, 0, true}, // 96 + {doNOP, 107 /* k */, 115, 0, true}, // 97 + {doNamedChar, 78 /* N */, 14, 0, false}, // 98 + {doProperty, 112 /* p */, 14, 0, false}, // 99 + {doProperty, 80 /* P */, 14, 0, false}, // 100 + {doBackslashR, 82 /* R */, 14, 0, true}, // 101 + {doEnterQuoteMode, 81 /* Q */, 2, 0, true}, // 102 + {doBackslashS, 83 /* S */, 14, 0, true}, // 103 + {doBackslashs, 115 /* s */, 14, 0, true}, // 104 + {doBackslashv, 118 /* v */, 14, 0, true}, // 105 + {doBackslashV, 86 /* V */, 14, 0, true}, // 106 + {doBackslashW, 87 /* W */, 14, 0, true}, // 107 + {doBackslashw, 119 /* w */, 14, 0, true}, // 108 + {doBackslashX, 88 /* X */, 14, 0, true}, // 109 + {doBackslashZ, 90 /* Z */, 2, 0, true}, // 110 + {doBackslashz, 122 /* z */, 2, 0, true}, // 111 + {doBackRef, 128, 14, 0, true}, // 112 + {doEscapeError, 253, 206, 0, false}, // 113 + {doEscapedLiteralChar, 255, 14, 0, true}, // 114 + {doBeginNamedBackRef, 60 /* < */, 117, 0, true}, // 115 named-backref + {doBadNamedCapture, 255, 206, 0, false}, // 116 + {doContinueNamedBackRef, 129, 119, 0, true}, // 117 named-backref-2 + {doBadNamedCapture, 255, 206, 0, false}, // 118 + {doContinueNamedBackRef, 129, 119, 0, true}, // 119 named-backref-3 + {doContinueNamedBackRef, 128, 119, 0, true}, // 120 + {doCompleteNamedBackRef, 62 /* > */, 14, 0, true}, // 121 + {doBadNamedCapture, 255, 206, 0, false}, // 122 + {doSetNegate, 94 /* ^ */, 126, 0, true}, // 123 set-open + {doSetPosixProp, 58 /* : */, 128, 0, 
false}, // 124 + {doNOP, 255, 126, 0, false}, // 125 + {doSetLiteral, 93 /* ] */, 141, 0, true}, // 126 set-open2 + {doNOP, 255, 131, 0, false}, // 127 + {doSetEnd, 93 /* ] */, 255, 0, true}, // 128 set-posix + {doNOP, 58 /* : */, 131, 0, false}, // 129 + {doRuleError, 255, 206, 0, false}, // 130 + {doSetEnd, 93 /* ] */, 255, 0, true}, // 131 set-start + {doSetBeginUnion, 91 /* [ */, 123, 148, true}, // 132 + {doNOP, 92 /* \ */, 191, 0, true}, // 133 + {doNOP, 45 /* - */, 137, 0, true}, // 134 + {doNOP, 38 /* & */, 139, 0, true}, // 135 + {doSetLiteral, 255, 141, 0, true}, // 136 + {doRuleError, 45 /* - */, 206, 0, false}, // 137 set-start-dash + {doSetAddDash, 255, 141, 0, false}, // 138 + {doRuleError, 38 /* & */, 206, 0, false}, // 139 set-start-amp + {doSetAddAmp, 255, 141, 0, false}, // 140 + {doSetEnd, 93 /* ] */, 255, 0, true}, // 141 set-after-lit + {doSetBeginUnion, 91 /* [ */, 123, 148, true}, // 142 + {doNOP, 45 /* - */, 178, 0, true}, // 143 + {doNOP, 38 /* & */, 169, 0, true}, // 144 + {doNOP, 92 /* \ */, 191, 0, true}, // 145 + {doSetNoCloseError, 253, 206, 0, false}, // 146 + {doSetLiteral, 255, 141, 0, true}, // 147 + {doSetEnd, 93 /* ] */, 255, 0, true}, // 148 set-after-set + {doSetBeginUnion, 91 /* [ */, 123, 148, true}, // 149 + {doNOP, 45 /* - */, 171, 0, true}, // 150 + {doNOP, 38 /* & */, 166, 0, true}, // 151 + {doNOP, 92 /* \ */, 191, 0, true}, // 152 + {doSetNoCloseError, 253, 206, 0, false}, // 153 + {doSetLiteral, 255, 141, 0, true}, // 154 + {doSetEnd, 93 /* ] */, 255, 0, true}, // 155 set-after-range + {doSetBeginUnion, 91 /* [ */, 123, 148, true}, // 156 + {doNOP, 45 /* - */, 174, 0, true}, // 157 + {doNOP, 38 /* & */, 176, 0, true}, // 158 + {doNOP, 92 /* \ */, 191, 0, true}, // 159 + {doSetNoCloseError, 253, 206, 0, false}, // 160 + {doSetLiteral, 255, 141, 0, true}, // 161 + {doSetBeginUnion, 91 /* [ */, 123, 148, true}, // 162 set-after-op + {doSetOpError, 93 /* ] */, 206, 0, false}, // 163 + {doNOP, 92 /* \ */, 191, 0, true}, // 
164 + {doSetLiteral, 255, 141, 0, true}, // 165 + {doSetBeginIntersection1, 91 /* [ */, 123, 148, true}, // 166 set-set-amp + {doSetIntersection2, 38 /* & */, 162, 0, true}, // 167 + {doSetAddAmp, 255, 141, 0, false}, // 168 + {doSetIntersection2, 38 /* & */, 162, 0, true}, // 169 set-lit-amp + {doSetAddAmp, 255, 141, 0, false}, // 170 + {doSetBeginDifference1, 91 /* [ */, 123, 148, true}, // 171 set-set-dash + {doSetDifference2, 45 /* - */, 162, 0, true}, // 172 + {doSetAddDash, 255, 141, 0, false}, // 173 + {doSetDifference2, 45 /* - */, 162, 0, true}, // 174 set-range-dash + {doSetAddDash, 255, 141, 0, false}, // 175 + {doSetIntersection2, 38 /* & */, 162, 0, true}, // 176 set-range-amp + {doSetAddAmp, 255, 141, 0, false}, // 177 + {doSetDifference2, 45 /* - */, 162, 0, true}, // 178 set-lit-dash + {doSetAddDash, 91 /* [ */, 141, 0, false}, // 179 + {doSetAddDash, 93 /* ] */, 141, 0, false}, // 180 + {doNOP, 92 /* \ */, 183, 0, true}, // 181 + {doSetRange, 255, 155, 0, true}, // 182 + {doSetOpError, 115 /* s */, 206, 0, false}, // 183 set-lit-dash-escape + {doSetOpError, 83 /* S */, 206, 0, false}, // 184 + {doSetOpError, 119 /* w */, 206, 0, false}, // 185 + {doSetOpError, 87 /* W */, 206, 0, false}, // 186 + {doSetOpError, 100 /* d */, 206, 0, false}, // 187 + {doSetOpError, 68 /* D */, 206, 0, false}, // 188 + {doSetNamedRange, 78 /* N */, 155, 0, false}, // 189 + {doSetRange, 255, 155, 0, true}, // 190 + {doSetProp, 112 /* p */, 148, 0, false}, // 191 set-escape + {doSetProp, 80 /* P */, 148, 0, false}, // 192 + {doSetNamedChar, 78 /* N */, 141, 0, false}, // 193 + {doSetBackslash_s, 115 /* s */, 155, 0, true}, // 194 + {doSetBackslash_S, 83 /* S */, 155, 0, true}, // 195 + {doSetBackslash_w, 119 /* w */, 155, 0, true}, // 196 + {doSetBackslash_W, 87 /* W */, 155, 0, true}, // 197 + {doSetBackslash_d, 100 /* d */, 155, 0, true}, // 198 + {doSetBackslash_D, 68 /* D */, 155, 0, true}, // 199 + {doSetBackslash_h, 104 /* h */, 155, 0, true}, // 200 + 
{doSetBackslash_H, 72 /* H */, 155, 0, true}, // 201 + {doSetBackslash_v, 118 /* v */, 155, 0, true}, // 202 + {doSetBackslash_V, 86 /* V */, 155, 0, true}, // 203 + {doSetLiteralEscaped, 255, 141, 0, true}, // 204 + {doSetFinish, 255, 14, 0, false}, // 205 set-finish + {doExit, 255, 206, 0, true}, // 206 errorDeath +} diff --git a/go/mysql/icuregex/debug.go b/go/mysql/icuregex/debug.go new file mode 100644 index 00000000000..5cacc87d007 --- /dev/null +++ b/go/mysql/icuregex/debug.go @@ -0,0 +1,157 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package icuregex + +import ( + "fmt" + "io" +) + +func (pat *Pattern) Dump(w io.Writer) { + fmt.Fprintf(w, "Original Pattern: \"%s\"\n", pat.pattern) + fmt.Fprintf(w, " Min Match Length: %d\n", pat.minMatchLen) + fmt.Fprintf(w, " Match Start Type: %v\n", pat.startType) + if pat.startType == START_STRING { + fmt.Fprintf(w, " Initial match string: \"%s\"\n", string(pat.literalText[pat.initialStringIdx:pat.initialStringIdx+pat.initialStringLen])) + } else if pat.startType == START_SET { + fmt.Fprintf(w, " Match First Chars: %s\n", pat.initialChars.String()) + } else if pat.startType == START_CHAR { + fmt.Fprintf(w, " First char of Match: ") + if pat.initialChar > 0x20 { + fmt.Fprintf(w, "'%c'\n", pat.initialChar) + } else { + fmt.Fprintf(w, "%#x\n", pat.initialChar) + } + } + + fmt.Fprintf(w, "Named Capture Groups:\n") + if len(pat.namedCaptureMap) == 0 { + fmt.Fprintf(w, " None\n") + } else { + for name, number := range pat.namedCaptureMap { + fmt.Fprintf(w, " %d\t%s\n", number, name) + } + } + + fmt.Fprintf(w, "\nIndex Binary Type Operand\n-------------------------------------------\n") + for idx := range pat.compiledPat { + pat.dumpOp(w, idx) + } + fmt.Fprintf(w, "\n\n") +} + +func (pat *Pattern) dumpOp(w io.Writer, index int) { + op := pat.compiledPat[index] + val := op.Value() + opType := op.Type() + pinnedType := opType + if int(pinnedType) >= len(UrxOpcodeNames) { + pinnedType = 0 + } + + fmt.Fprintf(w, "%4d %08x %-15s ", index, op, UrxOpcodeNames[pinnedType]) + + switch opType { + case URX_NOP, + URX_DOTANY, + URX_DOTANY_ALL, + URX_FAIL, + URX_CARET, + URX_DOLLAR, + URX_BACKSLASH_G, + URX_BACKSLASH_X, + URX_END, + URX_DOLLAR_M, + URX_CARET_M: + // Types with no operand field of interest. 
+ + case URX_RESERVED_OP, + URX_START_CAPTURE, + URX_END_CAPTURE, + URX_STATE_SAVE, + URX_JMP, + URX_JMP_SAV, + URX_JMP_SAV_X, + URX_BACKSLASH_B, + URX_BACKSLASH_BU, + URX_BACKSLASH_D, + URX_BACKSLASH_Z, + URX_STRING_LEN, + URX_CTR_INIT, + URX_CTR_INIT_NG, + URX_CTR_LOOP, + URX_CTR_LOOP_NG, + URX_RELOC_OPRND, + URX_STO_SP, + URX_LD_SP, + URX_BACKREF, + URX_STO_INP_LOC, + URX_JMPX, + URX_LA_START, + URX_LA_END, + URX_BACKREF_I, + URX_LB_START, + URX_LB_CONT, + URX_LB_END, + URX_LBN_CONT, + URX_LBN_END, + URX_LOOP_C, + URX_LOOP_DOT_I, + URX_BACKSLASH_H, + URX_BACKSLASH_R, + URX_BACKSLASH_V: + // types with an integer operand field. + fmt.Fprintf(w, "%d", val) + + case URX_ONECHAR, URX_ONECHAR_I: + if val < 0x20 { + fmt.Fprintf(w, "%#x", val) + } else { + fmt.Fprintf(w, "'%c'", rune(val)) + } + + case URX_STRING, URX_STRING_I: + lengthOp := pat.compiledPat[index+1] + // U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN); + length := lengthOp.Value() + fmt.Fprintf(w, "%q", string(pat.literalText[val:val+length])) + + case URX_SETREF, URX_LOOP_SR_I: + // UnicodeString s; + // UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val); + //set->toPattern(s, TRUE); + fmt.Fprintf(w, "%s", pat.sets[val].String()) + + case URX_STATIC_SETREF, URX_STAT_SETREF_N: + if (val & URX_NEG_SET) != 0 { + fmt.Fprintf(w, "NOT ") + val &= ^URX_NEG_SET + } + // UnicodeSet &set = RegexStaticSets::gStaticSets->fPropSets[val]; + // set.toPattern(s, TRUE); + fmt.Fprintf(w, "%s", staticPropertySets[val].String()) + + default: + fmt.Fprintf(w, "??????") + } + fmt.Fprintf(w, "\n") +} diff --git a/go/mysql/icuregex/error.go b/go/mysql/icuregex/error.go new file mode 100644 index 00000000000..9bb77994cea --- /dev/null +++ b/go/mysql/icuregex/error.go @@ -0,0 +1,125 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. 
+License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package icuregex + +import ( + "fmt" + "strings" + + "vitess.io/vitess/go/mysql/icuregex/internal/uerror" +) + +type CompileError struct { + Code uerror.URegexCompileErrorCode + Line int + Offset int + Context string +} + +func (e *CompileError) Error() string { + var out strings.Builder + switch e.Code { + case uerror.U_REGEX_INTERNAL_ERROR: + out.WriteString("Internal Error") + case uerror.U_REGEX_RULE_SYNTAX: + out.WriteString("Syntax Error") + case uerror.U_REGEX_INVALID_STATE: + out.WriteString("Invalid State") + case uerror.U_REGEX_BAD_ESCAPE_SEQUENCE: + out.WriteString("Bad escape sequence") + case uerror.U_REGEX_PROPERTY_SYNTAX: + out.WriteString("Property syntax error") + case uerror.U_REGEX_UNIMPLEMENTED: + out.WriteString("Unimplemented") + case uerror.U_REGEX_MISMATCHED_PAREN: + out.WriteString("Mismatched parentheses") + case uerror.U_REGEX_NUMBER_TOO_BIG: + out.WriteString("Number too big") + case uerror.U_REGEX_BAD_INTERVAL: + out.WriteString("Bad interval") + case uerror.U_REGEX_MAX_LT_MIN: + out.WriteString("Max less than min") + case uerror.U_REGEX_INVALID_BACK_REF: + out.WriteString("Invalid back reference") + case uerror.U_REGEX_INVALID_FLAG: + out.WriteString("Invalid flag") + case uerror.U_REGEX_LOOK_BEHIND_LIMIT: + out.WriteString("Look behind limit") + case uerror.U_REGEX_SET_CONTAINS_STRING: + out.WriteString("Set contains 
string") + case uerror.U_REGEX_MISSING_CLOSE_BRACKET: + out.WriteString("Missing closing ]") + case uerror.U_REGEX_INVALID_RANGE: + out.WriteString("Invalid range") + case uerror.U_REGEX_PATTERN_TOO_BIG: + out.WriteString("Pattern too big") + case uerror.U_REGEX_INVALID_CAPTURE_GROUP_NAME: + out.WriteString("Invalid capture group name") + } + _, _ = fmt.Fprintf(&out, " at line %d, column %d: `%s`", e.Line, e.Offset, e.Context) + + return out.String() +} + +// MatchError reports a failure raised while matching Pattern against Input. +// Position is used as an index into Input when the message is built. +type MatchError struct { + Code uerror.URegexMatchErrorCode + Pattern string + Position int + Input []rune +} + +// maxMatchInputLength is the largest number of input runes quoted verbatim +// in a MatchError message. +const maxMatchInputLength = 20 + +// Error describes the error code, the pattern and the position, followed by +// the input. Inputs longer than maxMatchInputLength are cut down to a window +// starting maxMatchInputLength/2 runes before Position, and "..." is added +// only on the sides where runes were actually left out. +func (e *MatchError) Error() string { + var out strings.Builder + switch e.Code { + case uerror.U_REGEX_STACK_OVERFLOW: + out.WriteString("Stack overflow") + case uerror.U_REGEX_TIME_OUT: + out.WriteString("Timeout") + } + + input := e.Input + if len(input) > maxMatchInputLength { + var b []rune + start := e.Position - maxMatchInputLength/2 + if start > len(input) { + // Keep the slice expression below in range even when Position + // points past the end of the input. + start = len(input) + } + if start <= 0 { + // The window begins at the first rune: nothing is elided in front. + start = 0 + } else { + b = append(b, '.', '.', '.') + } + end := start + maxMatchInputLength + trailing := true + if end >= len(input) { + // The window reaches the last rune: nothing is elided behind it. + end = len(input) + trailing = false + } + b = append(b, input[start:end]...) + if trailing { + b = append(b, '.', '.', '.') + } + input = b + } + _, _ = fmt.Fprintf(&out, " for expression `%s` at position %d in: %q", e.Pattern, e.Position, string(input)) + + return out.String() +} diff --git a/go/mysql/icuregex/icu_test.go b/go/mysql/icuregex/icu_test.go new file mode 100644 index 00000000000..1e766ac35ed --- /dev/null +++ b/go/mysql/icuregex/icu_test.go @@ -0,0 +1,430 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library.
+License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package icuregex_test + +import ( + "bufio" + "errors" + "fmt" + "io" + "os" + "regexp" + "strconv" + "strings" + "testing" + + "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/mysql/icuregex" + "vitess.io/vitess/go/mysql/icuregex/internal/pattern" + "vitess.io/vitess/go/mysql/icuregex/internal/uprops" +) + +var ErrSkip = errors.New("ignored test") + +type Matcher int8 + +const ( + FuncFind Matcher = iota + FuncMatches + FuncLookingAt +) + +type Expectation int8 + +const ( + Unknown Expectation = iota + Expected + NotExpected +) + +type TestPattern struct { + Line string + Lineno int + + Pattern string + Flags icuregex.RegexpFlag + Options struct { + MatchFunc Matcher + FindCount int + MatchOnly bool + MustError bool + Dump bool + HitEnd Expectation + RequireEnd Expectation + } + Input string + Groups []TestGroup +} + +type TestGroup struct { + Start, End int +} + +var parsePattern = regexp.MustCompile(`<(/?)(r|[0-9]+)>`) + +func (tp *TestPattern) parseFlags(line string) (string, error) { + for len(line) > 0 { + switch line[0] { + case '"', '\'', '/': + return line, nil + case ' ', '\t': + case 'i': + tp.Flags |= icuregex.UREGEX_CASE_INSENSITIVE + case 'x': + tp.Flags |= icuregex.UREGEX_COMMENTS + case 's': + tp.Flags |= icuregex.UREGEX_DOTALL + case 'm': + tp.Flags |= icuregex.UREGEX_MULTILINE + case 'e': + tp.Flags |= 
icuregex.UREGEX_ERROR_ON_UNKNOWN_ESCAPES + case 'D': + tp.Flags |= icuregex.UREGEX_UNIX_LINES + case 'Q': + tp.Flags |= icuregex.UREGEX_LITERAL + case '2', '3', '4', '5', '6', '7', '8', '9': + tp.Options.FindCount = int(line[0] - '0') + case 'G': + tp.Options.MatchOnly = true + case 'E': + tp.Options.MustError = true + case 'd': + tp.Options.Dump = true + case 'L': + tp.Options.MatchFunc = FuncLookingAt + case 'M': + tp.Options.MatchFunc = FuncMatches + case 'v': + tp.Options.MustError = !icuregex.BreakIteration + case 'a', 'b': + return "", ErrSkip + case 'z': + tp.Options.HitEnd = Expected + case 'Z': + tp.Options.HitEnd = NotExpected + case 'y': + tp.Options.RequireEnd = Expected + case 'Y': + tp.Options.RequireEnd = NotExpected + default: + return "", fmt.Errorf("unexpected modifier '%c'", line[0]) + } + line = line[1:] + } + return "", io.ErrUnexpectedEOF +} + +func (tp *TestPattern) parseMatch(input string) error { + var ok bool + input, ok = pattern.Unescape(input) + if !ok { + return fmt.Errorf("failed to unquote input") + } + + var detagged []rune + var last int + + m := parsePattern.FindAllStringSubmatchIndex(input, -1) + for _, g := range m { + detagged = append(detagged, []rune(input[last:g[0]])...) + last = g[1] + + closing := input[g[2]:g[3]] == "/" + groupNum := input[g[4]:g[5]] + if groupNum == "r" { + return ErrSkip + } else { + num, err := strconv.Atoi(groupNum) + if err != nil { + return fmt.Errorf("bad group number %q: %v", groupNum, err) + } + + if num >= len(tp.Groups) { + grp := make([]TestGroup, num+1) + for i := range grp { + grp[i].Start = -1 + grp[i].End = -1 + } + copy(grp, tp.Groups) + tp.Groups = grp + } + + if closing { + tp.Groups[num].End = len(detagged) + } else { + tp.Groups[num].Start = len(detagged) + } + } + } + + detagged = append(detagged, []rune(input[last:])...) 
+ tp.Input = string(detagged) + return nil +} + +func ParseTestFile(t testing.TB, filename string) []TestPattern { + f, err := os.Open(filename) + if err != nil { + t.Fatalf("failed to open test data: %v", err) + } + + defer f.Close() + scanner := bufio.NewScanner(f) + var lineno int + var patterns []TestPattern + + error := func(err error) { + if err == ErrSkip { + return + } + t.Errorf("Parse error: %v\n%03d: %s", err, lineno, scanner.Text()) + } + + for scanner.Scan() { + lineno++ + line := scanner.Text() + line = strings.TrimSpace(line) + + if len(line) == 0 || line[0] == '#' { + continue + } + + var tp TestPattern + tp.Line = line + tp.Lineno = lineno + + idx := strings.IndexByte(line[1:], line[0]) + + tp.Pattern = line[1 : idx+1] + line, err = tp.parseFlags(line[idx+2:]) + if err != nil { + error(err) + continue + } + + idx = strings.IndexByte(line[1:], line[0]) + err = tp.parseMatch(line[1 : idx+1]) + if err != nil { + error(err) + continue + } + + patterns = append(patterns, tp) + } + + if err := scanner.Err(); err != nil { + t.Fatal(err) + } + return patterns +} + +func (tp *TestPattern) fail(t testing.TB, msg string, args ...any) bool { + t.Helper() + msg = fmt.Sprintf(msg, args...) 
+ t.Errorf("%s (in line %d)\nregexp: %s\ninput: %q\noriginal: %s", msg, tp.Lineno, tp.Pattern, tp.Input, tp.Line) + return false +} + +func (tp *TestPattern) Test(t testing.TB) bool { + re, err := func() (re *icuregex.Pattern, err error) { + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("PANIC: %v", r) + } + }() + re, err = icuregex.Compile(tp.Pattern, tp.Flags) + return + }() + if err != nil { + if tp.Options.MustError { + return true + } + + return tp.fail(t, "unexpected parser failure: %v", err) + } + if tp.Options.MustError { + return tp.fail(t, "parse failure expected") + } + + matcher := re.Match(tp.Input) + var isMatch bool + var findCount = tp.Options.FindCount + if findCount == 0 { + findCount = 1 + } + + for i := 0; i < findCount; i++ { + isMatch, err = func() (bool, error) { + defer func() { + if r := recover(); r != nil { + tp.fail(t, "unexpected match failure: %v", r) + } + }() + switch tp.Options.MatchFunc { + case FuncMatches: + return matcher.Matches() + case FuncLookingAt: + return matcher.LookingAt() + case FuncFind: + return matcher.Find() + default: + panic("invalid MatchFunc") + } + }() + } + + require.NoError(t, err) + + if !isMatch && len(tp.Groups) > 0 { + return tp.fail(t, "Match expected, but none found.") + } + if isMatch && len(tp.Groups) == 0 { + return tp.fail(t, "No match expected, but found one at position %d", matcher.Start()) + } + if tp.Options.MatchOnly { + return true + } + + for i := 0; i < matcher.GroupCount(); i++ { + expectedStart := -1 + expectedEnd := -1 + + if i < len(tp.Groups) { + expectedStart = tp.Groups[i].Start + expectedEnd = tp.Groups[i].End + } + if gotStart := matcher.StartForGroup(i); gotStart != expectedStart { + return tp.fail(t, "Incorrect start position for group %d. Expected %d, got %d", i, expectedStart, gotStart) + } + if gotEnd := matcher.EndForGroup(i); gotEnd != expectedEnd { + return tp.fail(t, "Incorrect end position for group %d. 
Expected %d, got %d", i, expectedEnd, gotEnd) + } + } + + if matcher.GroupCount()+1 < len(tp.Groups) { + return tp.fail(t, "Expected %d capture groups, found %d", len(tp.Groups)-1, matcher.GroupCount()) + } + + if tp.Options.HitEnd == Expected && !matcher.HitEnd() { + return tp.fail(t, "HitEnd() returned false. Expected true") + } + if tp.Options.HitEnd == NotExpected && matcher.HitEnd() { + return tp.fail(t, "HitEnd() returned true. Expected false") + } + + if tp.Options.RequireEnd == Expected && !matcher.RequireEnd() { + return tp.fail(t, "RequireEnd() returned false. Expected true") + } + if tp.Options.RequireEnd == NotExpected && matcher.RequireEnd() { + return tp.fail(t, "RequireEnd() returned true. Expected false") + } + + return true +} + +func TestICU(t *testing.T) { + pats := ParseTestFile(t, "testdata/regextst.txt") + + var valid int + + for _, p := range pats { + if p.Test(t) { + valid++ + } + } + + t.Logf("%d/%d (%.02f)", valid, len(pats), float64(valid)/float64(len(pats))) +} + +func TestICUExtended(t *testing.T) { + // This tests additional cases that aren't covered in the + // copied ICU test suite. 
+ pats := ParseTestFile(t, "testdata/regextst_extended.txt") + + var valid int + + for _, p := range pats { + if p.Test(t) { + valid++ + } + } + + t.Logf("%d/%d (%.02f)", valid, len(pats), float64(valid)/float64(len(pats))) +} + +func TestCornerCases(t *testing.T) { + var cases = []struct { + Pattern string + Input string + Flags icuregex.RegexpFlag + Match bool + }{ + {`xyz$`, "xyz\n", 0, true}, + {`a*+`, "abbxx", 0, true}, + {`(ABC){1,2}+ABC`, "ABCABCABC", 0, true}, + {`(ABC){2,3}+ABC`, "ABCABCABC", 0, false}, + {`(abc)*+a`, "abcabcabc", 0, false}, + {`(abc)*+a`, "abcabcab", 0, true}, + {`a\N{LATIN SMALL LETTER B}c`, "abc", 0, true}, + {`a.b`, "a\rb", icuregex.UREGEX_UNIX_LINES, true}, + {`a.b`, "a\rb", 0, false}, + {`(?d)abc$`, "abc\r", 0, false}, + {`[ \b]`, "b", 0, true}, + {`[abcd-\N{LATIN SMALL LETTER G}]+`, "xyz-abcdefghij-", 0, true}, + {`[[abcd]&&[ac]]+`, "bacacd", 0, true}, + } + + for _, tc := range cases { + t.Run(tc.Pattern, func(t *testing.T) { + _, err := icuregex.Compile(tc.Pattern, tc.Flags) + if err != nil { + t.Fatal(err) + } + }) + } +} + +func TestOne(t *testing.T) { + icuregex.Dumper = os.Stderr + + const Pattern = `\p{Indic_Syllabic_Category=Avagraha}` + const Input = "foo\u09BDbar" + const Flags = 0 + + re, err := icuregex.Compile(Pattern, Flags) + if err != nil { + t.Fatalf("compilation failed: %v", err) + } + + re.Dump(os.Stderr) + + m := re.Match(Input) + found, err := m.Find() + require.NoError(t, err) + t.Logf("match = %v", found) +} + +func TestTrie(t *testing.T) { + p := uprops.GetPropertyEnum("Block") + t.Logf("%v", p) +} diff --git a/go/mysql/icuregex/internal/bytestrie/bytes_trie.go b/go/mysql/icuregex/internal/bytestrie/bytes_trie.go new file mode 100644 index 00000000000..c46084ff21b --- /dev/null +++ b/go/mysql/icuregex/internal/bytestrie/bytes_trie.go @@ -0,0 +1,373 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. 
+Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package bytestrie + +type BytesTrie struct { + pos []byte + original []byte + remainingMatchLength int32 +} + +func New(pos []byte) BytesTrie { + return BytesTrie{pos: pos, original: pos, remainingMatchLength: -1} +} + +type Result int32 + +const ( /** + * The input unit(s) did not continue a matching string. + * Once current()/next() return NO_MATCH, + * all further calls to current()/next() will also return NO_MATCH, + * until the trie is reset to its original state or to a saved state. + * @stable ICU 4.8 + */ + NO_MATCH Result = iota + /** + * The input unit(s) continued a matching string + * but there is no value for the string so far. + * (It is a prefix of a longer string.) + * @stable ICU 4.8 + */ + NO_VALUE + /** + * The input unit(s) continued a matching string + * and there is a value for the string so far. + * This value will be returned by getValue(). + * No further input byte/unit can continue a matching string. + * @stable ICU 4.8 + */ + FINAL_VALUE + /** + * The input unit(s) continued a matching string + * and there is a value for the string so far. + * This value will be returned by getValue(). + * Another input byte/unit can continue a matching string. 
+ * @stable ICU 4.8 + */ + INTERMEDIATE_VALUE +) + +const ( + kMaxBranchLinearSubNodeLength = 5 + + // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node. + kMinLinearMatch = 0x10 + kMaxLinearMatchLength = 0x10 + + // 20..ff: Variable-length value node. + // If odd, the value is final. (Otherwise, intermediate value or jump delta.) + // Then shift-right by 1 bit. + // The remaining lead byte value indicates the number of following bytes (0..4) + // and contains the value's top bits. + kMinValueLead = kMinLinearMatch + kMaxLinearMatchLength // 0x20 + // It is a final value if bit 0 is set. + kValueIsFinal = 1 + + // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds. + kMinOneByteValueLead = kMinValueLead / 2 // 0x10 + kMaxOneByteValue = 0x40 // At least 6 bits in the first byte. + + kMinTwoByteValueLead = kMinOneByteValueLead + kMaxOneByteValue + 1 // 0x51 + kMaxTwoByteValue = 0x1aff + kMinThreeByteValueLead = kMinTwoByteValueLead + (kMaxTwoByteValue >> 8) + 1 // 0x6c + kFourByteValueLead = 0x7e + + // A little more than Unicode code points. (0x11ffff) + kMaxThreeByteValue = ((kFourByteValueLead - kMinThreeByteValueLead) << 16) - 1 + kFiveByteValueLead = 0x7f + + // Compact delta integers. + kMaxOneByteDelta = 0xbf + kMinTwoByteDeltaLead = kMaxOneByteDelta + 1 // 0xc0 + kMinThreeByteDeltaLead = 0xf0 + kFourByteDeltaLead = 0xfe + kFiveByteDeltaLead = 0xff + kMaxTwoByteDelta = ((kMinThreeByteDeltaLead - kMinTwoByteDeltaLead) << 8) - 1 // 0x2fff + kMaxThreeByteDelta = ((kFourByteDeltaLead - kMinThreeByteDeltaLead) << 16) - 1 // 0xdffff + + // For getState64(): + // The remainingMatchLength_ is -1..14=(kMaxLinearMatchLength=0x10)-2 + // so we need at least 5 bits for that. + // We add 2 to store it as a positive value 1..16=kMaxLinearMatchLength. 
+ kState64RemainingShift = 59 + kState64PosMask = (uint64(1) << kState64RemainingShift) - 1 +) + +func (bt *BytesTrie) ContainsName(name string) bool { + result := NO_VALUE + for _, c := range []byte(name) { + if 'A' <= c && c <= 'Z' { + c += 'a' - 'A' + } + if c == 0x2d || c == 0x5f || c == 0x20 || (0x09 <= c && c <= 0x0d) { + continue + } + if result&1 == 0 { + return false + } + result = bt.next(int32(c)) + } + return result >= FINAL_VALUE +} + +func (bt *BytesTrie) next(inByte int32) Result { + pos := bt.pos + if pos == nil { + return NO_MATCH + } + if inByte < 0 { + inByte += 0x100 + } + length := bt.remainingMatchLength // Actual remaining match length minus 1. + if length >= 0 { + match := inByte == int32(pos[0]) + pos = pos[1:] + // Remaining part of a linear-match node. + if match { + length = length - 1 + bt.remainingMatchLength = length + bt.pos = pos + if length < 0 { + node := int32(pos[0]) + if node >= kMinValueLead { + return bt.valueResult(node) + } + } + return NO_VALUE + } else { + bt.stop() + return NO_MATCH + } + } + return bt.nextImpl(pos, inByte) +} + +func (bt *BytesTrie) nextImpl(pos []byte, inByte int32) Result { + for { + node := int32(pos[0]) + pos = pos[1:] + if node < kMinLinearMatch { + return bt.branchNext(pos, node, inByte) + } else if node < kMinValueLead { + // Match the first of length+1 bytes. + length := node - kMinLinearMatch // Actual match length minus 1. + match := inByte == int32(pos[0]) + pos = pos[1:] + if match { + length = length - 1 + bt.remainingMatchLength = length + bt.pos = pos + if length < 0 { + node = int32(pos[0]) + if node >= kMinValueLead { + return bt.valueResult(node) + } + } + return NO_VALUE + } else { + // No match. + break + } + } else if (node & kValueIsFinal) != 0 { + // No further matching bytes. + break + } else { + // Skip intermediate value. + pos = bt.skipValue2(pos, node) + // The next node must not also be a value node. 
+ // U_ASSERT(*pos<kMinValueLead); + } + } + bt.stop() + return NO_MATCH +} + +// stop marks the trie as exhausted by clearing the current position. +func (bt *BytesTrie) stop() { + bt.pos = nil +} + +// valueResult maps a value node's lead byte to FINAL_VALUE when the +// kValueIsFinal bit is set and to INTERMEDIATE_VALUE otherwise. +func (bt *BytesTrie) valueResult(node int32) Result { + return INTERMEDIATE_VALUE - Result(node&kValueIsFinal) +} + +// branchNext selects the branch edge for inByte: a binary search narrows the +// candidates, then the last few are compared one by one. +func (bt *BytesTrie) branchNext(pos []byte, length int32, inByte int32) Result { + // Branch according to the current byte. + if length == 0 { + length = int32(pos[0]) + pos = pos[1:] + } + length++ + // The length of the branch is the number of bytes to select from. + // The data structure encodes a binary search. + for length > kMaxBranchLinearSubNodeLength { + p := int32(pos[0]) + pos = pos[1:] + if inByte < p { + length >>= 1 + pos = bt.jumpByDelta(pos) + } else { + length = length - (length >> 1) + pos = bt.skipDelta(pos) + } + } + // Drop down to linear search for the last few bytes. + // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 + // and divides length by 2. + for { + p := int32(pos[0]) + pos = pos[1:] + if inByte == p { + var result Result + node := int32(pos[0]) + // U_ASSERT(node>=kMinValueLead); + if (node & kValueIsFinal) != 0 { + // Leave the final value for getValue() to read. + result = FINAL_VALUE + } else { + // Use the non-final value as the jump delta. + pos = pos[1:] + // int32_t delta=readValue(pos, node>>1); + node >>= 1 + var delta int32 + if node < kMinTwoByteValueLead { + delta = node - kMinOneByteValueLead + } else if node < kMinThreeByteValueLead { + delta = ((node - kMinTwoByteValueLead) << 8) | int32(pos[0]) + pos = pos[1:] + } else if node < kFourByteValueLead { + delta = ((node - kMinThreeByteValueLead) << 16) | (int32(pos[0]) << 8) | int32(pos[1]) + pos = pos[2:] + } else if node == kFourByteValueLead { + delta = (int32(pos[0]) << 16) | (int32(pos[1]) << 8) | int32(pos[2]) + pos = pos[3:] + } else { + delta = (int32(pos[0]) << 24) | (int32(pos[1]) << 16) | (int32(pos[2]) << 8) | int32(pos[3]) + pos = pos[4:] + } + // end readValue() + pos = pos[delta:] + node = int32(pos[0]) + if node >= kMinValueLead { + result = bt.valueResult(node) + } else { + result = NO_VALUE + } + } + bt.pos = pos + return result + } + length-- + pos = bt.skipValue1(pos) + if length <= 1 { + break + } + } + p := int32(pos[0]) + pos = pos[1:] + if inByte == p { + bt.pos = pos + node := int32(pos[0]) + if node >= kMinValueLead { + return bt.valueResult(node) + } + return NO_VALUE + } else { + bt.stop() + return NO_MATCH + } +} + +func (bt *BytesTrie) skipValue1(pos []byte) []byte { + leadByte := int32(pos[0]) + return
bt.skipValue2(pos[1:], leadByte) +} + +func (bt *BytesTrie) skipValue2(pos []byte, leadByte int32) []byte { + if leadByte >= (kMinTwoByteValueLead << 1) { + if leadByte < (kMinThreeByteValueLead << 1) { + pos = pos[1:] + } else if leadByte < (kFourByteValueLead << 1) { + pos = pos[2:] + } else { + pos = pos[3+((leadByte>>1)&1):] + } + } + return pos +} + +func (bt *BytesTrie) skipDelta(pos []byte) []byte { + delta := int32(pos[0]) + pos = pos[1:] + if delta >= kMinTwoByteDeltaLead { + if delta < kMinThreeByteDeltaLead { + pos = pos[1:] + } else if delta < kFourByteDeltaLead { + pos = pos[2:] + } else { + pos = pos[3+(delta&1):] + } + } + return pos +} + +func (bt *BytesTrie) jumpByDelta(pos []byte) []byte { + delta := int32(pos[0]) + pos = pos[1:] + if delta < kMinTwoByteDeltaLead { + // nothing to do + } else if delta < kMinThreeByteDeltaLead { + delta = ((delta - kMinTwoByteDeltaLead) << 8) | int32(pos[0]) + pos = pos[1:] + } else if delta < kFourByteDeltaLead { + delta = ((delta - kMinThreeByteDeltaLead) << 16) | (int32(pos[0]) << 8) | int32(pos[1]) + pos = pos[2:] + } else if delta == kFourByteDeltaLead { + delta = (int32(pos[0]) << 16) | (int32(pos[1]) << 8) | int32(pos[2]) + pos = pos[3:] + } else { + delta = (int32(pos[0]) << 24) | (int32(pos[1]) << 16) | (int32(pos[2]) << 8) | int32(pos[3]) + pos = pos[4:] + } + return pos[delta:] +} + +func (bt *BytesTrie) GetValue() int32 { + pos := bt.pos + leadByte := int32(pos[0]) + return bt.readValue(pos[1:], leadByte>>1) +} + +func (bt *BytesTrie) readValue(pos []byte, leadByte int32) int32 { + var value int32 + if leadByte < kMinTwoByteValueLead { + value = leadByte - kMinOneByteValueLead + } else if leadByte < kMinThreeByteValueLead { + value = ((leadByte - kMinTwoByteValueLead) << 8) | int32(pos[0]) + } else if leadByte < kFourByteValueLead { + value = ((leadByte - kMinThreeByteValueLead) << 16) | (int32(pos[0]) << 8) | int32(pos[1]) + } else if leadByte == kFourByteValueLead { + value = (int32(pos[0]) << 16) | 
(int32(pos[1]) << 8) | int32(pos[2]) + } else { + value = (int32(pos[0]) << 24) | (int32(pos[1]) << 16) | (int32(pos[2]) << 8) | int32(pos[3]) + } + return value +} diff --git a/go/mysql/icuregex/internal/icudata/README.md b/go/mysql/icuregex/internal/icudata/README.md new file mode 100644 index 00000000000..070633b555e --- /dev/null +++ b/go/mysql/icuregex/internal/icudata/README.md @@ -0,0 +1,46 @@ +# ICU data files + +These are files copied from the ICU project that contain various types +of data, like character properties. + +## How to update + +Not all data files are immediately available in the source code; some +need to be built first. This applies to the character / word break +tables. + +### Copy from source data + +The `icu4c/source/data/in` directory in the source distribution contains +the following ICU data files we use: + +``` +pnames.icu +ubidi.icu +ucase.icu +unames.icu +ulayout.icu +uprops.icu +nfc.nrm +nfkc.nrm +nfkc_cf.nrm +``` + +The character and word break tables need to be compiled before they can +be copied. + +In `icu4c/source` run: + +```bash +./configure --with-data-packaging=files +make +``` + +This will compile the character and word break data into binary files +that we can use.
Once built, the following files we use are available in +`icu4c/source/data/out/build/icudtl/brkitr`: + +``` +char.brk +word.brk +``` diff --git a/go/mysql/icuregex/internal/icudata/char.brk b/go/mysql/icuregex/internal/icudata/char.brk new file mode 100644 index 0000000000000000000000000000000000000000..a243ae6580ac2a4271f4ed78d252ca11bae801b1 GIT binary patch literal 13680 zcmeHOeT-CB6@UBY&6_tf`^~~KJ1B3-Qh~s3LHpru*CN}}0!yT@yGH1;4zttUfnjEu zofTKvstHZ{MKDGZZ9;5}k{JKNkl2O7_c6eCm8+lWP_U`Ww(=2E2TE&D6mzIFGhfb6O;^lyL+i~g zw{O1F%oHchLZ*@{_nV`|e6}>1+i316W_w9cs!Zp~v%ThUKAS7f=FHnn03Mw+OOKdy zv$=kAx>A|x@9R5${CMviQ|>L5r~0zOP49H&Xu(vT{e_}n3~zrMgWYONqx2`AbDRV4~%cvIDg7S%Lei)ltYp~VwPaO>MR zUvyrwC5KuzJ4_|YhEFqTG{AM7`VR+<1Clks4d>>F_f;*{-kzYxD4UQvSh7-zE` zC)5ErKJhmJH-_=x*d#<$pP0J$3Gjq@TJ`Uq(%w1}!jbzReAI?Ri17|9SkiNgF- zQho+_Ah-st7VK9?JBW4QQ4#Ir#p%J;MR@`$Po8EaN8DyCw< za@eYT3n=$-S-EcUHZ?Hm!wt_gTy1DkO=YWkOg*Q*r2bXyi;PE1<;ln^k+`-)dtCdq zrbq9Oo{YX6)%0!pxAm9wPxLRu9*td!{U>&hQ8vyQe>AQeo8qJKGx68r?TNLC$;6Kn zrt+tRnp~6InY810Qlf3Hpq@{@pX_d&X#9Sosk{p3KjFmVj;60RJ==7xX?62>^V#Ob z=1-cDdshpS9_Kw9;q|$e_kh>cx~tVx9)nXH=YjrhYeO3w-%I^y+f!|eZP(gT?bq7& zwV!Uk*e=kfvWp`H=cD$XWr8-9exM6*?&g)WCEo$=WeJIrcU)r|eT%wPfm;>$|5iZl z*xE4|7co5AVJeq8KJMsWKD+$F@(-7*@Q?88*v_SIz?hkPu&pm^g#^mVfHma3$1bY5A6Hz-qS zO?{firt&GsW@GB}sUK0z?opY1duo)+bEyiam~VEyqMiiqr5KJ*r_QDZ^`EEGp1RzZ zdNH+Gs^wzps*f#U_lo*nYIVbgC>wvAx>K^3=<4bcI8%8;-5`x@KBID}{tzi*c6ROe zAo5-FzW4t2aA6$90>s@J#$bWRj(oCZzy_!?91(|-4 z>ML{HbSGF`+L7kfO-WS_w{%7AP4~fZSNa2MPPgj2G_TeJ=~Z<_HI|-CAA?#fNcE`D zI`!JF#`z$Q$Eh@4qltU&UjSjp z)9~eRkbg;>YJD|5(fVfk=~kxkm-L6}x7yxqIoPVM>JGIMo5Jfk&jr-@nv8c*@8(AFuUe^1{>ZrjeW|LoaZC%XPoTYIz8tgto5 zt~iztS>~l>`V0CS`bT<4Y<+A;Y-h~2wKVE*>c% zM6H2E=r^}negX)~dkm-uUYyOi!4|Kk?{ufbk5|y*aW}iRaW!WSIk=)fwVC#!64Y$I zWL%Hf>_AGw+oT(9xEXBB74Ww{Y*;oTy=Z`VnKFl2u&=VLW;$-SnsxYGv-r7IK&*tZ zLc~m2;(@*pQA4v#G9uv`JBN^B(qpr+mKIMMdNL_Wl!VrlOi~8G69#nMf7M&^IOw>W z5X!yI2YMr*k%mI+adNLi7^0(8%Tts#kC6CHD|JnsL{jSU5r6+SANgK~yc4 
zg!GaubXamw23UroNwpO${~q*{96p%JhTV7qif6+POqH`wHuJ+!@$JtDzi4=yqC_P%mQoxM7+V0@TWSz-aO+bc{o&NXj_ZP7iMK6XK= zmUmUBW-UQi^$nisc7#_nPgx)*cq(YMJ45jwK|y#m@YM$&L#&~BEEV2gu;JT5osB7q ztMyE&(~1ShxV%poOYBRqI}xOaxzu_y6A?QSuIii0h!uldp1rPoqY_L5(HIzJR#R#? zC)n5=p_aF-I9XN-NZ>NThHiOI&;ggl!Gue>dcG_o@wQ0d9^s$WB6;e)U(v*Ne}Ox6 zVBiQAYk~7DGu;Z_ae!xEc8r?KM$8w=Ku%c*&}Gv(4Ax@-Sy2H&n!yTA!~P>V=o_$v zQRx(#=zIo@Ro~T)x;{P3OVWUU`NOlrP9nK|xeO>K1lIvGLW9s`){e_bAy_n zzUl6c=KRW6+YOut!#j2}XIJTE#0f%wp5}lA#p?j^N4xLb<1LgoXBwgJt6(0KYI<%- zk6J-KRp=GB-Tiv|Otr>gV}whYu8tEDJ~A4<}L7pIF*(D^MCe+v{eiKL$%{GrH& zB%!T`d+TLU{eK}i!3VuUzs721R>Hna~STSft*i~_}V54Qx$ zloSvwxu$n_6v*}&oOB4wRXlNG6cY zzmF&YjFSD$23N3N#`#!)@i|3}{SpLm(jO~JE`)G!<>1?_VI-uP!qVd}gAlh=9O6nB zZcfqz4qs064FE$ZkRcW?&7Spav4Ww^ft6tqqYZ3gu6d7zW(I)bmH;YHXwCWlV!wf* zjK7d|xSHV8dzP&sB2gL?*1Efo&6O&-$%#T9{(nHGFgsBy7UlP3M{xN9D?Y5Gns6@ zIJG-dJ`(0if}z$}zMMIlVG47H%M0LYHaj?yE6j0ncVYf0C%;sl&v0^Et~kZX`*L5) zEZhcqlCgSRh?1BC)UgTrA}9H!cZA;u6f5{4L|l8l8xhRMqCSg0Kk z^AusRz^>Yp&q8JUAwQ)0#|OH*v(uS;5&A-@bmWWExyc*ahgj84vO8i!Dp-bu=;#>bsHJz9l$%_e*x8GX$=4X literal 0 HcmV?d00001 diff --git a/go/mysql/icuregex/internal/icudata/embed.go b/go/mysql/icuregex/internal/icudata/embed.go new file mode 100644 index 00000000000..3fd006496bd --- /dev/null +++ b/go/mysql/icuregex/internal/icudata/embed.go @@ -0,0 +1,96 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package icudata + +import _ "embed" + +// PNames is the list of property names. It is used, +// for example, to resolve Unicode property name aliases +// in regular expressions. +// +//go:embed pnames.icu +var PNames []byte + +// UBidi is the list of bidi properties. These are used +// by Bidi class aliases in regular expressions. +// +//go:embed ubidi.icu +var UBidi []byte + +// UCase is the list of case properties. These are used +// for case folding internally for case insensitive matching. +// +//go:embed ucase.icu +var UCase []byte + +// ULayout is used for property checks against the InPC, InSC +// and VO properties. +// +//go:embed ulayout.icu +var ULayout []byte + +// UNames is used for named character references in regular +// expressions. +// +//go:embed unames.icu +var UNames []byte + +// UProps is used for all the character properties. These +// are used to retrieve properties of characters for character +// classes, like letters, whitespace, digits etc. +// +//go:embed uprops.icu +var UProps []byte + +// NFC is the table for character normalization where canonical +// decomposition is done followed by canonical composition. +// This is used for property checks of characters about composition. +// +//go:embed nfc.nrm +var NFC []byte + +// NFKC is the table for character normalization where compatibility +// decomposition is done followed by canonical composition. +// This is used for property checks of characters about composition.
+// +//go:embed nfkc.nrm +var NFKC []byte + +// NFKC_CF is the table for character normalization where compatibility +// decomposition is done followed by canonical composition with +// case folding. +// This is used for property checks of characters about composition. +// +//go:embed nfkc_cf.nrm +var NFKC_CF []byte + +// BrkChar is used for matching against character break +// characters in regular expressions. +// +//go:embed char.brk +var BrkChar []byte + +// BrkWord is used for matching against word break +// characters in regular expressions. +// +//go:embed word.brk +var BrkWord []byte diff --git a/go/mysql/icuregex/internal/icudata/nfc.nrm b/go/mysql/icuregex/internal/icudata/nfc.nrm new file mode 100644 index 0000000000000000000000000000000000000000..a1254c0aa7551085e9cd8deba49cea933fe69a75 GIT binary patch literal 35124 zcmeIb34Dy#_dotT&rBvW*=65XNhW4bW+pR}L=ZvjTkKR2#1>m@rA3)iOHq5NrKPke zO6{d~YOB%?jkTJh+N#o)lKkKIOlA_nhc2J*_xFFjzMj{4-*eA7_ndRjbDw2yCK1h@ zPxR;5KY?pHxPP)3oG10|&>YR0MjYoq2{idh+QZC59Cy`?<1Qz0T+ej$A9CCzUyfUp z#&LS|Nd_DCFQs5`YmPfDITN{`OAub+Ij$D>GPi^Kle6%n_>KG>K0#1Yw}d5|@i;e|6DGUXmmTQzgeFGS?*6-mdSvUUL;m zYe|z_M@rw9mPiwXXj!UkxNN%YxGX^sxy8A4aqHu@!tJ=5n|nj|3GPeW6NK}iP!F?5 zp~o5z7RP>^ti(HH744s-)p{~XQ{V{V5`;~jXFUDAT6sJ=WPdA_CK39EWt97fkyxP@jOyldD<(nW3@m=Zrt#7PfH^2FQ2mS0L{vQ5PW%Mp& z>*bG3L60BP{SW&~0_p`kF8#mW|5f1sD)9ei1ttdU3;5P~y+#DGEVBcL2d)V8kZz^k z;XrASJ@;t;20_h(+68qDY8=!$s8i7Y_5RID%tn>Z1bGMlzqwBSXR~S+{ATdUVDFG- zA#aA94Dk+44jml&v*NzNJdNEmaL~^6Q1S&Waqz!O$2VDr z)syYu@KNEDoL-*P+a>>b-VdKxWl#E<|5%Lo9_v+DZwlWZekS}{_}y@EgnvZ!2z^A| zh}IE3B8Emxh?pJmUc{z|{Sjv(u0`C95J&n)R*%$2){Sf(*&}jjxHdCu~0eZyMkU+OVnD_PqbdNNpwndTO=22#dXD<#J$B+ z#mB@vcSd|y?Bk+!8SOH`WvR<5ms2j4$NwAr^}p!x4c{5Q*WBj%K5!Ge-*xeFU*x;o zH`slZ?>gU2zT17(?mV~0_n_}FUz2-N_pazYbUE#7^S$WX$Nicw%lD@5ZQrr(zxmE} zU+T_DK5&2NEBI}4m->16`TK?X9mR-fzXU&}-x>Ft?hnzEN|OCdel`6h9(DbAuAyIZ zzczm19?2e^{kr@0_8Z_g)NhpESU;=ZRKHpHeEJS-c&>p*8;|$=KJZ)b_le&wzkPm( z{Z9Cu@jLH#*{{U!mfsz}`+jA9Vt+S(X6xf0=pXJM>o50L`y2ez{A-bv=j!`6@o(kd 
z9=$I9J^cIn5Aq-8Ke}8u-hYz+oBnhB7y2(Pm-O%$?6J~+jsHgftt9!>f4~3d{$Kci zOj;)XE$LvS5D>O`I*I=AbT7EoqjD%+##`N^`V~^^j zH%PyXSazLV!+4&n;937znLUeRWv9!|;BcVb>S^ex&y^$xSJ8e~Nwy_uSI~{5eL+Wp zPU3u@1$`5ACFn*_aCz}DmiJi};jTQV1hGS3-nMpK&owGIKDZ+5f0I)L>)<(+ybpEF za7M`r&IxXY-U!NSEO3JB4A=Rti(GSq*OX_vzP#nRP37GF^7b&=W5k~#v;m7AKw92n!n?t%gFg!1OcpzWKMU4NP0~Zb$AiBNJ{No`_&bvS5d2H< z@4=6PMIo|~bnG2_hXjO#g~Ws;hNwbPLd+qVA=x2~LRy3{uWds*Q(FbN>9u2H+)&a%T8=U*CF8eAG$8eSS%8eJM&TD>#@hrF~#sj`&k)F5qXGLU(} z`33hsdsNKA&ZU>uEUi@~i*6ykLkdHRLS7E3TfP=wJjdo3(OPOOHI-UQ?Y`@j)-7ds zRCX7vh;HXu4gT{{uS#xJt;dD5lfM3(v)V~I4HjLc710^TJUJvR8%Lk#UNkp0qQP_W ztD5iQeYOf%4i2>6+a2i8Fs*~^c|PvtztwY8$KJ9Go{aSTGgx_y!(&C|9TMi-&O2Jx zt339_c>7iPVz$q>vY(fV93ST@Rtl3lZ0#A?uK>HfU1LA14iC?d{nspdNZ)$wD(Ed; zfPO#elEoXv~!YRXbyDOWc4XB|(QH?y_Ws8ypL`Dk3p z)3dz%cjSL3?|(Ow5z-BG1#Ah~^^_HYpLRP}RLdIe{*^jSz?$4w<*NHP=2Vd>^EOF3 zgVz2L?AV`u%FLd;R{nVovk~zd=gg|EyH%y1IBWUOWBoM`_H{O2dIi}z>a~mempMy@Aqz%-g-(lcT4TZe?PuG0(*b> zd$HI!M(ZCN|I$I5x8uLKKl|s17q@@9<7?j#e%dJe%BetR z^KWPKWZa5)_Rq(5WKtIUzaRg}j4Dg^OBJDqrMl2#($vsX(u~kE(k!}L+oU;F?%I_{ zy(sThT5+#0mNpB$EX@tQCcQ507@8M)OWNzH?|U!m`$;>;r}v7AKWX#NCI9(3_1FDB z>yzS(#Ietm=Y-OVwPjyfj#b8Vl@CXLkFPt&9f^%~Sg_Xa%O1sn_C1|5Woxj0XT$#A zI!<91`7m7*>}NYH%GR=e_&(N%&{skyhE55c89Fa?N$Bd(^`W1H?hM@A^r9*g+@B!f?!pDbC4WApnG<D2EG&WPQ}Ak%q{$$l8(hBO6DyjBFR#C6edzLG~lmOYYSta$w|$$nlX=Bj-jgjr;)LDETz< zaOCO8;>eQ7+mR0<#Zg{S!BMeM$|z%$(W{nM&8WIj4WpVzwTtQ&)h}vT)YzyuqGm_E z6SXR8W7PJjJy8dvjzu-}YVCD8$`*Aw>Som4sM08Dv~P5HbV9T?+7ewix@mN~=x)*d zqK8F~jea9~cJw>ZE2Gy$Z;aj=y(ju;^qJ_3(buEzL_ds{#Q4O7#>B?RV>B^_7)wm8 zn0hgdV;s4&wu{$Luc=-Oy;gaB;0rTk;VGOUi7);6A_yjtBXyGW%)etkygu& zZ5G=;wmbC$tI4bNj~yO6E_O=noY*ChS*r2eYP27NcEvWvx8+Awn^bLXwfCxRthT4x zF?@6WK{cswpl@~GWZzo8jeXxqI}p1vt&Q)xw9jKdN&6!9)3mQ+52khXJ(0%ZU5vdR z`+KZwTu@wMTxwkXxR!C9;`+wD95*HI?YIx)cElZwI~8{+?&r9&IPdCF)iu>E)$3Mo zRlR%lA=O{2{#NyOtFNoRwfdpzUswOW`tQ|U*KpC{y_Za@n6J$mG%|-8*ALgC-V#O-^JgGzlZ)%-?Dhugo|mO2_>_O%A z@ITG7dmFoZI_`1yJ1OfqGI4NrD|Xjnoa1in&^oODS^H`EXSuyK)aftkNh;$x50DSY 
zar~LUCp!55)VjWam1m+~Vo0Jtya$8AD@FNFp8u*YS{7BFOLbX1IJr#m0=ZR0QOb0Q zT3PB}7^5$rNe0H4WLb&n=;tKXk~J&$QTK&rnw!`_))6Bb%bHiQ&r9r;*gtVd;>g6- zvUajp6DKBil693$Nt~J3L)Kf?Pd1S3=Or#q94Z?jdpB`);%M1e*++?+tIT17Y*H1^ z6|=T;Q!D9b$Y!HAU$!Ihv&2J*$H~_s*;3hivQ?F=s(N^_{+Eg85-%lw_gH>EB>s~4 zd*Y+VWH09Jc~*yNeE!(Z#`Eke>rejek(Isteg4Hiu{MV{EwF3VF2Khw|!f8{~4gEpnyXF1glipIq;DL~e3B zDNlDhE3f7Djl8bg6?p@<@8PGh+mG_*ZokS~yWOwkwVhj;+$E{2n|o5Vq~2~pNfAi{ z-Qtpxl18{`k_<_XXwPoPx=nDK#=bczCw&=W8mSaZg)~ z&maG6i^|X0XIuR<>aX^J}#tzrXA2X?3kCtBCS%a`yL_7xVacXZK>>pKrz1A*+i0 zlkMZ@)!wUEsn7TJ&n%u+i>q#vegDq!|Ce!pw_-i51%=$&_m}iHbgR{qT zNdGt5dH8&Ew#K;{7iwIo@m-A@HGZt|3yym=?$>x!gIBmHWC~A3HAR3TL=mBgQN$~f z6e@*IVNjSA8H(D9Y(V6_XXy6f+fb z6blrK70VPW6sr|$6&nYrfFts=4cjZ z7HgJiR%ljh)@n9rHfy$Nc4AptxlV&Wm%Ue&#(`#0|uR-L|d>Qg<=(#_W`)qSAb zsN1UhRJUJuRCh}Em9AKK71q~vKk4r39_YAa{Qd;UBiR=ioE!^>$cAXd5Kb{Bn>dI#_GwJQ5$0IswW^e6292QTDT-7tred@+NT#@uq z68CT$)#&LN+dZFh`^s%!((&?I{wGUECQs&2kwG!d>TBnju~*r?2glIvqbhoodOSm8 zw|v$(yS77ekK_@_Gm=*&?@6{LuP<9q`+1Hr`_X4d)tD74pJ^&;{`8etVNunOyRAAG=ox#QbwkXNok=OmohPBa!Rh|&6KxN=B2!yvMgmq%IcK0DIce7N!gLIJ0(-Q zFXc$e@sux8&ZeA8xsY-t<-3$tTK2qZtG$u(V@iAFFDZGNdnxx*9;NVlcDU%>_1=0v zeULsvAES@gC+Ssso!+1~>ofGV_1XFyeG?onXrd*x)PIfR8~qji zclsRV4gHV$U-iH1|J0Z1MFv;H7>&EZ+u&yiF+>>R42cGXL1RcUq#Du;nT9MweM2Kd zGefSSy`i%q&oD@lZ|G|%#8G4ziDQgmoZ)rDG{a289K!;`V#6}S3d3r{TEhmzIL&6m z4#RH4KEs>ZeTGAZ$(qj%CksZ ze#RhU7=CaQXG}CIj2dH#ai%8Km}abP%r@p2n-~{pS{QSU?Twv{dB%KWfw8}lJyi!8 zi;N?UV~npEUpG!MzG+;ndCR!KxY)SNn5SD|Ty0!y++f^n++o~p+-E#wJZ}8Lc-DB% zI8AxMc*Xd=@kisY#@~&PjC`s~sw~wrb(!Xds#Vnk8smoH| zPhFe3A$4==w$xoXK1)52`la?r>haVsQqQKIOTCo(ZECLaM(WS0zoy<#ElYJV$xJ_J zJxzY55L1*X-jsynXKjH_Wzv}pCbKEiv`g{3Hp^7sbX?QK)XEf~X>aOmx}eE3<(mpj zg*b{#BTZvWubEyqO)5%ET=}XhsrVFMk zrteKZnC@wQHr+G*X(}_hm}O>fv!6N09A=I($C(q&Dszh2Y%VfnnzPLH&5g{>%&p9A z%^l4znY){Nnfsa-D+ig2%p=WX%;U@x&AfK9d762qd5(F3d5L+2`9t$Z=8w%=%sb4V znGcvhH=i_rX+CGZWd7E?TJgR42lLP7U(LUp|1|sQ%FH5*tHs^oZSk`NS;8z)mN-k2 zMP<=h3>LE`!;)pmu{5!?uq5gfx?D?p%S)E-mR^>=mO@LBsOme(w=Tc%kwx|x=F 
zmc^E3mKB!OmX9nSTeetsSaw_XSq@n~w|rqaYdL4RVENYaz2yhX&z4^;_bp`>QJQO- zds?-$fVBF$`ns^RsI>UBq%>8UF3pf;PRmHEoz_TKKdn((v$R%eZPPlY<)!7P^-UX; zHazX+v{%z!OM5+SO4^%gZ>6!%K|EK6e*>+>bq8AafG*bMBx89&GM*PpCh`)Am6u95 z!9(IAcuAUweI%24UrF5zf5{X+Sh7$QDdEJi60x|Rq;^I_376j7Dutamy`Dsx-p%Tf zE=W8CvBU>5e<4^BEHt&g$%9W1mV^uSB*E!TCDG{}B-LRXp3X^TAnq*0orAa|GNh7t z#Fb;LQt*+?Mckno!4ka?DUl+k2VxG&XewzTZZ6RZ9VFWHu2y}zmo*(RuZzSIlOUDU zO81bYV|*>dJ|=2vt&3b5pkFuLN75KMq{Bx8?d@y_C_v)5qlKo z+8?rE$a4hdIu!EJ>0Xkt>GiA=5cjr7kXQwYWI1Xx2|lL7ek$zWfc_XT8{jJkzcsjD5;HH7GyNHo`wHasLdM0U4=30 z5o0~-7!AAXn8#+sT7!IdLcbX`+6tRJu=xym97u00(TbZ|cS3dsvcr(sAk&MRO7ip$nOi(?kw`Np>Erd+cB({Vj)m+8TqY5esaWBBEOfA-{Fh~l56SB zCD-8V7S7@Ibcy5-)a4<3av6d|oFSH6M{JJ_ABi-hp4BG<>nFp<8l1sNu0g*YF>j%E zx3OM&h`l7?u)8hzN^ZgSI`nripJ;dF^5cL$0uPfp- zLp_fm=X%KbbFA@2$X7tV64WURYh1t@cY~b-YupWU_d?CQ5W^R;K*R{e8b68i7>HU$ zB3>wJ8;dob0NY5+=@jCgM%=TAI|?}`B5n6gkv{kA}#hWqKz`8|0wG z+-=C=JaQ;T4lUuM4RYv!_?HlW7UD0(oP7~L5M$?Jjdu~cT05kRC4Gco>t(E^Zt!~= z=dLIG4iLIo`yij8h(8+hEre_YavgwG-UrxkO*gRzrvuCAg!Z@|Y4*iVQ3 zRM^jj{VdpALvAI=?FMrD0q5mykI3&Gtn=lF^&WBzMUIhJM_I__ZLIUJ z;r|2FXC2~xfH4~pVj#nJRpfY)%>y~!MUKBA$1hR8uaV<<)bBp>JArj`5o`S_a$JQRYaq57 zIp!h9qgd-DSnDP5^%KtJO|10?s7)z+3RtTWto56S?}at&20vP?VPCA>K&2TehBPuLl=zx zO!VIZ*BqugO3~NcSgT9?6P1t4R%)8-GaOe{49r`rHDBPHhHk=UUPzV z4fMC6$KN-SjDw%oY9>eyXUeTdA-@axLX6vsaeFXs9&DCj+`AZe4Ep=fFNW+h$o3)L zkBC=-d8~Bxv91DnNfA%xV|`C1SS3>2>(F0=zJPW-+O43SGENdIl}h%@JS53dAM0V* z2BSSDlS+yq3zzyxF3SR~*I;`cbQ@%Yd^y^cuw4niOW|)F`peNyK)W6EDg17S--ECn z41Y&qyA|yT_`L|(PWZhFza_A}3AzJ14_YUUwB7>!1lvvMe*!uK`bxIaqprs)k9r;- zOl|0~YH9;eW6%W8RZ~}j#(IvJ)zWvwtZty*zN7u-`mK9wzTfD;xq)l&*Cf`>ogcVv zo;+xD(Cnaf^X3PQo;NpW&HNd`A1oLfG9hG6$Oj?QL)I*q5i&Y-cIdi=^Fv23oEthD z-yC}hV0mqWh}`VXKl zq}&!Fw+_hd9kOG&wLxx0l-pM1wjQ~yL2jF`AdD3(C+uOs0;>KJYA`MIz ziu(~a7+g!@wt^c>+;(uSzhf{i6!kaJgogPgyKBvbwFmCV^;p~l^lG3thje=MOw?mK zBYGC}wva9x8Rj6vCde=v-nP*A`j9q)v>BwanbF9UtP!z#W(@W0c_%n^EK|8t$9grK zI+m%@sbf7gBF&@txro#rkvb!imZU7lwvcv&^d(4>Gv$a#))=YJOroAWzR{^;5lv1V 
z>sg#S7BStaW4)S)w2k8TK&0Myr|O4_Y@@mNgrpB710Y#IHbWs90m*2Ru>6KWG76Gc zAlXee6HwJj)MI)pdT*e&gmkmfn@>Han~UB;^bU~jJ$PCLPisiR^eZ9x0Frf(te}iO zfn+-*pF(nkMje3U2)_O}h@G4|*6V^u8)ybSz-=LJ0=Of@T_Y{C8$epdO(!kme##WlQ<|fD zC{KvHJ)S8{l8xsALp*u${AH3HlH}kiCzLd0PXpXhnY$;n5t5TR?Gh$Y(c4#N;U(?rYUI$5H_Reg@nyGcZ@A0Y)NAW z)7X|Yb})@?Nuvf6wxZmIP;RZrW(Z+x!l7i}nk|$tTPvc|KtdNeT#aNK!!9j3kAG%}7#6 z*peiJ30sn6Fkvo9iU@N_Qp6DNPzk^xgslmOGQ`!z=G`0k6=v0p>iZkGLgLPnOoTe! zBN^k=s5yH_6pJ+A*gGPtVtcClNWxWAftggFIfU~F7Z5HXTtT=RcvO@{?~<&J3-AWX zo?Q$#lf^dR=dfV!mds)|;Q_+a<;bi9#Oxj$AZEOum|04Q4<$aBv~gl~rwS%C5gJHt zAj=f8Y(u<-coS)}$a1KdozEh|K#~{u@^ab)rMK9@yt`Ui3XU_UV_cxwvAe!51=pZNc1UHR zkiY(fO(^gFly?*I+Mglbv)K6>K=P&}A3)fYybU01N4cDPwbPPT374Y)-ihBy34I7}x=J7qOK(K0GCHH#&v@y_Sx7~)EylAY|K2gCO<C>~1K znqmzlY)!QoN|?)4hfNV-E}h#V!d%i80ed4N+u`6v1>o2Y2QTVFGBNJTeM!bRp`;DH znGIuD+=ikKV_4Fbo}$AE+ftQ=6Sk%GHk`0606VQ+>dL(3zwcczi_=JYDhs#cqm0rP-+v6k7M951>Gj_rW) zP8sKvaZZ`cDXSPOa>_)ar8qZiw?Yst1IKnMkR@tX8}D#DO37r5bI4eE?J_*k%I#!M z8L}*wiJY>E7!qd;cJ*h|obm~?>CEHgw@}j%o*sdV&lyOcO z=ak8uvWl@Hr%WU|hUjdUM-Uwc$98!H(Nffk?YZ%yW#HHz8}F2HP8sKv$(%BoQzmlC zP-i|1J4s=pI3~kR661v8Towyg5D?c1n_m&|BSd6-cf4plIJQ^EJ7t_x#yMp&r>tVE z$SD&sJ49j;;SbR$Hr~6~d(TLQ#qBANk(5V!%3~y9dph?cfj;;wO}jp#YQ(Wm)eaf& zl<}fqXlVyX6ha)cbI5q7j2CI4rQHybjyPuLknv7gg>M)Q zDXXxnt~ssp{Nv+qxRDI&35#B33E5W%sSZsj)i-NkH? 
zyR&bP>EZ0#V|qIK_LyGIzCEV5vu}?XjYw<68<;IpIQ#aPN@w35Q|0X2W2&8f zdrXb9Z;#m(k=UMDwg~%W;8@IV-cv}{p#XWt&PhqG^wneXh|WA=3R?J;{f`}UZ< z5sB@SWs9&!29Cw-Q*LA5Kg$;Nb@uHs`#JmenEjo7d&~jOzCC84vu}^d_R-k0Wdq({ z1|llkRl}-9n=M)+VtuwUKOqfUgKU?Z+0SI}J3w3oSb0poimM|U3OlwN$2@ESu7d~m z6NYTjLDJYg%weOjkITl+qp`cA=xT~$?_Rs3*j2WTqHIGH$LC3Au?_3q(Pvf*$Z7%Q zvVgpnlBZJgR7#!>kj(+uI6ikWPoJUA4E=!z;MJjHURThF736gVd0j?!t8iByjd`$L zMmEDEz!R7u`>9B_=r_{XXSjq$t!9Yx2wQ$ej0rf4a5mv7!ZU=Y2}QBYMoQ>T7)KaS zm`<2Qm`&J#FoR>G@`!Ijm`;+Kggr=-PdJ!x2;nfo;e?|IUnU$+_!^;=@O8rFgsTWw z6MjfIns5x^Ou|`&8wo!q+(fvUp|p{9fmV<*kLAFO435%ERDk05>o@h=nqGV%3@UrhKR;ab9v2saRZOt_hF3*k1x z9fZ3GcN2a_xS#MK;bFp)<;b!+P5fz!ewO&N#M_9s5&sSG-w?l&_@4=H6aGSYhwu?l z>?*b`C>Gnc7c=f{C%3}Rablb9g4mXLK@2Y0$<@Ckw$;4EWQ=PI&c(K^*oAp1=hi#9 zUBqp@;9~m-USLMI$e3j=itvt8= zb)NfU63@LkgXe@kJQvoV=gtkpUxXRLa~Tux*IF?4mjygG9zKiUQ`{FegYny77#sNx z&o%GNb3YXF-0_J#*JCoz9qGn%wsAZc1)oQ|@LW(Y_<+xc7;^$Ktj%zKTEKrxp4;Dp z=e|TtALzbEPW)K>wh{bx0{uJ<^O(nTg>U1p;VkC4sUvvq)sZ~6bPQs=hCJRvAMw9> zgXb=_L;odIYBc`33w(5gzoW3}OyNaEQ4O^^e22oIyM{c_?az@pqALA!?BT+nkxHqfb$zXAGL3W@rb<%bK45}6^yaM$NA>S0kzJ=xm$oao;hG8usuhGbB+7i?c zb5mo?WaRcT*4nS2+BnCH2H+||%=V~fUz{5s%&QAx^=|=t%zXsLDBwQ?Io^bQ4a|26 z=29262|~@S&?mvCH|Cqq)(+w{h3{3+eTQ=99(;X{xYu#5e2(=N{W8zZLjCp(##~X~c35+JPy-{@ zpaprjf>xs5Un2LfU^fwUiNyF#$SV!GH%9y>@IM9D;Fqwgg?aAD<+%=6cSB)w5&arC z4@ZabTsmsq1(XkZ6VwLh;ZykBg}L}zaX-L1GoikZKtH2q4?*$;h_x7gAlrgC>u}D# zZ;qJodjz>%1GT`s7bB*m4bSxk)k5r5pdOugZUAUD&TDVfcRJ_}s2R@bRnSc2HxYFm zj`@V5Zn1^1N8J>I!J~FFM__Fs&m2$#P;*clP-oCI&~#8O)NmYV59%m;1#18_81r$# zdi!x4&J(VmwpcelsLxywkNTQHU*aAioyK#YpoZUKUS+678s=NnRp6T2e!7SiijfI8 z6Pq#9-k`CVc{Xc7hcN30pj^y)0q82|W6Zd2d;A|Apl%&dIh?7}I3wpk1z2HgK{iyP zDJmcZbp`2hR`=s73da@M64VA&3Bw9$0~(1d>l;+%b5P_!o_im65d$jo5oi#qBPl}0 zpon3(>w>-k-32v-T`kZtRBr<+^D@qGM_dIpQI(chRX>A9zK%+v8dlJ5ob}H^KZ4#s zWlX5ZV4Up*Gx7go&B7Ho7rs#)cT~ssZGmfOYgo*%F0irf>?NH0G*qh=>RLHe>OU|EJ=!bryvsoz92kJ_$A>P!n1^55!wjP5uPXfhOn6MBH?AiD}>(?UL!0a z{GRYSA%WCuvVJz^AS0q7*?9lgNSwpp+q 
zTI2fZ0UF+dNy0H^o1W)-K{AabjUbta-g}^9@Vy4o9=1!6-h)JhtL_r))6muf4F(Md z1sHJ6<31z-<$yYXhJeO^UTuZ9G@BB!2M)xY5RywI>5jGg1A4!JB(0Gn$OLKt>J3^B z`VHiRtGgAbJ7@yvL(s>dy`aFhcy54Nw88xr^b_s^C(yqO@&rExZ8BONXiYnT%eGxB z#$6D--$4@GD?&lB?4Hq{X;yK!8|}DMoEfxxK+`(m=V7*M7?;(VaV6jebjH&XG#YdT zbQPpA;Yom2-Ua6wQ~>H`61WDo*Dk;k+Vzlp4Em&t!1cDxLflet4Qx7aJno*ch<*qo z8r!yFR_$!(F|HeAeL>%1{6e(HL0^OJgT~?RXffk%Xco8}+nI|P4M_vsCmVxCfi{BP z1APR#2=c-GGYr%jv;lMm)Vw>pJ9%6}*0{4igrqmx*Fe`u^9wYwxYr(lWHm@_!Q9Y) z1+*A69Mli8e)%kOw_@aqyYe4+R^@;~(olWSHCW8TdYl8|aJOCr-P*CZ+h_4y4EkL_ zt$HyJ`w%Y&BOX995!AA`z%{Y;hu`UV_CG0TB z;F{Xr$C%c5Dl~;;Fxn-c9i*9t`qsC(Utm@bFs^Aofoo>l3H#l6XE_e)1q#G-wkD_m zGy*iSKeLL&S<$1X!ESm1aj(af6b70L>xO8@ zfQEzC4rEt#JbY#kVo%cU@Ue)v4>3|c80TlOz_qYRAs>t09O@b1yQR$^`HmWbYX-EY zCeN)vyA^Hyq4;b7ng}`y3d_V_BPbHMRyHF>)Ff^eo;M9_OAyy*7}G3+$5ytjz}9$D zVI5$Ml%7w&!m_pP4q*r?*4mbWQIW%OEcJ{HgjRS3bp+|N*!3f)tEfJ_x2OI&aHC#jnwl6l=T)pR^xE6b zV{YwjahO%tv5Z>HvwI)aYH!;NY;1dra1rdq z<8f!iJ3(LA&ICUNv>5a;?z$UT3)%%L882`hZ0Q&g@fv0US__H;?E-xdY6Ut9ItD6! zP2f7>{4wl{s&%xTMO75pxEoKvoeUH>QQ$h+j>2CLa2;(*aGjyg%Vxa|$gh*_G{*FW zthp5{98>^W4Vnb^lT+ zK(+wwdXm|{`7<3d_MM@LXxaCRrlMuvBzhAq_E|&+xH={5Hzi6+#+7h{JfRv?(XUx_SwzOgIE3Eck&+6w*+ literal 0 HcmV?d00001 diff --git a/go/mysql/icuregex/internal/icudata/nfkc.nrm b/go/mysql/icuregex/internal/icudata/nfkc.nrm new file mode 100644 index 0000000000000000000000000000000000000000..2e6e3dda074ae9c7c3307dcad2bfb352869fac84 GIT binary patch literal 54136 zcmeFa2XvH0)IR?1ZnD`;C%re)vuWGX+t&2TCfRH%X+(MnMMR1aL_|PAKtw=6LAro| zf{1hkDT082hzf!Vf`WoV_&+mSl0XnF-}gKJb3W$WXYSm4=iWQ>&c0>#-QdT38REo{ ze{3dyNPnRjI2+p5Lm{dSPccmSG0^7^(<0~zy=TcVeZm;#&1N+2!p?zV{A$5A8e|;E z@lVt;q!+^sw%8JRgo$83%VL;xW;F95^Cy$U8qM0o`kfWQ*0Nt_A7eXkia0YlM>+Op zd1fz~MX)2-d(61zQu6`k5$x6G=ghsh5$ss*K<)z62e_6NYKvxzw=FJO1n@F=&+?ir zHt{a<0{Ic_YJMw!BmW@Z$})nTVOeMSy5(}q?=0P{G*+{%4qN?T6~PX&7FiFqe%X4T zwYd$6qc5SIc%!VN?nSjzD82(cYOuDkjkN7=yVCZ1+YmdQ-DEozkM#M_?v|azet`XI z`)l@5JqGkx-Q!x1D2D+Ks~xU6L^}3&T;O=l(c7uYX@=7wCu`?i=LybhosF^1ID5M2 
zT#T*9)ze*GhwbN`?5~4&L3UBD5$pjV%%1CKuC{LZZZElg=4R`jJgxa6#a?K#!n`px1&<2e}282hR;Y z8T=#n53YkntVIDWB^Li~CWEa1s{b_HIdm=A^uD6A%rb}hPQ^VGW?F`!=wma-}*oCm4!ycFCU(b8uL!bGF{oU^p z!z%n?3nK9hj|fi+&k5Itw}g)hpAtSVe0lh0Q1{jkOZSDJ3cnm)H5co9VJM$u>) z8WA6n8Bre5Ct_q5xl#Y*_45&PBbG&Mj@TD*D&lg)?FgR0O%N)G7i0>`1$_h~1`q!XWK))XPU1z|IGZFNp8yx;F5k4iW+Vs zcM|1}FjEcn!(N|MvV!|5m&N?QDP|0hv8J)@nL?(L8N@u#9K<+(Vt6bCE0a~p8payO zTEhB{#bSP9S+hgg3ib^4T=rV_X7*Wj=bUe{#hi3bFHS$sTFwT}5zci^s9B7e!mQG) z51z~qnz5MUW;f03&11};F`sC@$b6ak5%bRd{|Vm`*0onZw1=2v!zni=Y<$>6oG*yw zOE`PX1-Bq3IiG1&1ZE4Fo{qsTGdG1XDw znC)2TSn62m*xRv>;{e#O7&q%+>tT*hJC1W~bDZY*isM|zHyoEZzU#Qw@qNdwj-NR0 zcRcKP-0@3l`?cdG$DbU3alGx=?r830X)n^PLtuz3sHpsnEK@ z`aP%3PTQS!J2hAju>REPsM8ltXQ|yt>j~B`TEA-jjniePYfd+v{%~TPEu3wf9h}{q zeVhZG!<}QCrOwIDGUrU^eCJ~{7V-a2%OdL!tRLO?xb?LzTIQioS#YhmR#y8(*x9ge z!Y+qh3%eQC%elh2*15^KpYssski&ne$9D_^J?eK&Yw6RbY?MMIDhT@gY!-2 zd(Ipe5}W5@>*D0%;li_Vu<7X%=^}ASa>;P{Ba8{>hO1n1sZFs}%QBbME*o4vaQWC}kIO-qV=kv$zH<4_UE4n1n+@g2{k8Xu{O0?!P&VJwms$5_b!do8uR%X{yP@1Q z_Jn7Y_C~X&gZC_(?3=zuEWJ*zAv}xe;7R-7eSH=O?;pE=9L0!wtM1S^zuHdoQ)%Gq zO|pq$eZnWZ4i2B@`ikp3tgwZkrLHSn*SqozM>`O;7?j!R=r@2LW@wKz)HApry6$i- zx7q93+vbq#3D+eC`7YGQK|dMVBrj_V7UKmXCD2eSEc)3zYje)^Th|#jEQWt4_Rfsk zg|(Hn#W-&Z;!goVp_v*|E}*f$~0!uzT)~bMtTdxVk#~Eq&2m3vv9L< zV==Wft3JSf79%ajSxmHWbemJ60NOz~sjE{h`;r!3A|xV!yC zWw?qM>YIkzjHl0#huWI*zHULtAp*o=?0K=YCVZS*iksZj()ao+bmF=3vfK(D>d~=Y z{;K!?gihn8dnlWZICXB#ZvEYcQj1Y;W8J)Xe!NL;Q{85|&2d}c_7;^dcU$AO(d|RG z9d3Kw0`W9>$nA5t({AV8F1r2bcEjy=w|j0JcfPxwJMrr5?nRxGde0==Aky7a}! 
zZKa<$qty86?D&l8?Y7?Ehkj1&F70mZ9_?Q3KJ9+(0que9!6>2a;q8KUT%kcR?Lr{& zg4GW0f46juLeC|)r?#g*kcFRnuzRGt$UVVbU|5S!o?|?EB=e;0@$HH2df&2kMLRj8 zlCxk(bUjbz;IE6yl-qN7;+h^l`+*~t=AO<|K5$eY^r+zFz#@~^5uI?v$|xb(7~3r7 zNpmATGEMnC81+foWEGGcjHo}i8_}pCS|jQ6d=lUP-o`jP`kG|$u+PUI!One*9y?~< zC?US}ys;*&&V8SZ*MBOX%=YnC`s31(V^^+Yr4YH%R-b|X6QH-(YxGCe=;86P|DHuI zuc*tXppaLFb}wF47b%Oez58gt4u-XC9E;xaweUsZZ-=i8e-C>dABFFF;PdxrZ~wK% z7TGZmWM6JLKh?XpxEr(Y9Jj;%K==`4{>8t_{-3PJ&e=REf0kzXO{ZMR*dJB88#l4N z9DXhQCiU@0Cr^*^@;@X0Cwc$3nbh(I(oaCE`{-^~2;NgLttcjf8}VPn?uRuw{((>3 zf8n*nie=pX*wRUJHxfrw5dz-SLy#;%h8n2 zPwhW-lh4g|eR*64?h(EwPmiL%L=lPuQ6#hJJi z6Zc?5`2!mEeY5*+O}zVlO_>qBA}aA+hZ!WKke+7uLz~DQ68Qv>`ay5{_TUG666pw& zY>bI}&csbIaSwW&-bM3I=kY{zHWUh~Y=eE9P z;DSq~*Vji*xf6N7h^Z*YTg+pZ@9N$?dzh?C$#1y=C{(C+pw6)uYY{ z-A8&d>ApvtXP&J0KeO6OpBVpMU-De`_j)|ey8Aiqu9tr0bRe<$ud{hLZbv-*Q?(w6 zl*Im@kNPxHXi$U zjvfbjM|kcYz8=SUK^{+j?4BgH@tMD4#-F6u-?jPcYuMlQhTV)uJQqJn9Q{aH47$)?Zkw{;o#eZGT!uuSr<4$E!#FB{R5i29sMr^>_9a|$lj@X5F zJaDfRaWvvM-t{;e@pZ&^5ktbRL|l!y5pff5e%y^<3Csn2fwchN9SEER?t-~t-U9w@ z{%t=&Al?aaza1%v5oFw!2oeM-f((HY?})gCheAdqxm5D#(Gi64@-(QKd#4H?@|Ee6p4*x!S<{{s^ktmz*hkL|$NIjB0 zeUFbl4tN~%IOXw`$9EoA zJ%0DN?_us~>1prj;_2lX;2G)}T_Q>-oOtR?km7_j!KidEE1i=LOG8oF5X_=0p6kBQQl(jMDKL(Oz&dvO7AA`0p7#BM|+R= zZu5T8`(^LB-fws>@qX8Pt@kGHZQeV*_j@1qKH>eP_t)N+ynpik)%%Y3eQ$FgOCNh5 z7auR5bAmp###Qr!&25_ipHQC|pF|&pPrgsNPj8=oKEr*+`MlsW%jXTBWj^ouZ1vgW zbHwLMpYMFG`EYIRZGZE*>%;ct`P%z>;C<#Oywxo8&Gjwyt@VA%cZlyu-)DU%`g+&~ z+P>gB!*{Olo4zZ3-}n94_kizl->-Zx`~Kp4*O%*O@8{tc=ojS|?w!3W4+WugB+m>tRWEWr;WtVJcAyZ}uPTKiYqi|8)Ns z>|eK^=l_=fD*sLXJN!TOUv0nD{)GQ||118#Qhx{RPubt|w+QGF;2A*jyJUacJ}4kM zAR#~=kQYz}nPm?aQ-``Qs5QW=hpkMbT(J%;s|&|^lA*L%Fz<3Nv7JudaQ*@Nw1 z@8IPysCG=i$l73sakb9{JYPE{U`FlqfVs6X4vXk9d^KQEz?y*V0f+IX@6Q4E11x(w z_3YVG)KlKGq-TB4p*^4N`FzhgJ>TiMspsCFCwqS1^Ov4?ds+p021W!X1?B|m0-FK{ z1&$4z9=I@YP2l#x!+~E0ei!&_AScK%sArHUNFJ0IR1%~M>K)WKh{Ys2v<8g{dM;>6 z(6ZW>g1*6w>lft*zZ2w7GUyP&?jgbBO#j=qTPb{)Faw zA?QlbjiBFy?gbr0+bc3SGA5D}%n$An{6(#Muy1fsaAdF~I4L+I`0HAAaAKq?vLKSo 
zVL@=>19AoDLI-~c_HeLBck(u-ie>Cz$ z=NAb$C+L23&_fj>yIS`o4%%n)~qxWz;ELiBHjeh0Lh@s8Yx) zLA8c@YsBr^*CSlex{IrM1L|Fvqn4=tQG=seqsV-Zj2a)6Aea<2C2AV9`m%^#%QFQ| zUH0;4M$L;_9JL~9ebk3hJEK00`aJ4P)VEPTM*SM~XA~#eDtbeNeY8uoPjp~(XmnJx zG&((66`dVj82vHgFT%Y8-Up4Yj&6#M5%h~568&`a#OT8C7oulG=Z2RHZbvwEbCl*q z8#5yHDePLF1$|9-FZG(|f~VlK^7u0S39M!Z&ktT4{2jbs2L042%H#b%xy5zfJBD0t z@>YSn&0GHjxpnxz%i9vn@V9sBvpD)4%+s3aja_D|qlXz^&+kSb9{Z>1r z5PukdWGAZ!Jv>=k7g86}9MZo_enUe>g^UfE)J67W-X3SQEqcczj@;vVcIY2IGwzQ5 zH2P@t$>?*@-$q}F{yF+q^qgLY7}przn2?z0n7Ej<7q#mZxI zVoPFmvGuWiV~56$j(sk6YV0eq^JCwNT@kxJ_QTknv7g3%9(yMC+t?pte~tY!mLs$h zItpju>4KaQ$oc5Or;CGlvLNR)4Jtc)5Gb~A#UCGJKv;OW9ToSd7C z=lHo{euADtH@uNYo-twsUg3p!#}9Yy4_tvdX6s=tMEJ1me?}4`j2EW<<7n!C-1Fa# z#wk4TU$4@?&SADU!LGLA;56;=uBgpI-$VSnLZ zVXJVIaEx%gaFTF}aGG$YaJF!saDi~KaH(*)aFuYKaHDXGaGP+4aJO*3@R0DR@PzQR z@SO01@S^aF@T%~J@Rsn7@Scz*G8gehHliLPXOX+eTjVba5`~E(MX@4@C_$7W$`C0< zS)x2qkw_!bimF6)qDE1RsK02is8uvdG)6RDG)Xi?G)**9G+Q)Jv_Q02v{bZQv`Vy2 zv{AH0v`w@_v|F@abVzhmbV77mbWU_ZbWwCgbX9aibW3zcbWg+*n~V8k8*vY@v)Eni zE%p}&iNnN^;#jdnoFGmSXNZ;JEODN=NURZS#Z}@uaih3J++RFc+$tU=9wQzvo+O?k zo+h3to-LjyULam9UMgNLUL{^9-YDK8-X`85-YwoQJ|sRWJ|R9WJ}15)z9_yTzAC;U zz9qgRz9(i$%q4t@jiiUfS>i77miSA8Bw>vzaV4=|E-W&{N*8Y z`D;R6=Wh&I#Q!j434ce(GXCC>mHb1v?ylv39cYCQJLf_IEc(K9GFJ4+-7TCHKpw+<)RnhF<53 zLT~aDLT~faLhtgGp^RltsF`I^D9^Gi)Y`Ht)ZVf_)XB0X)Xj1re0y28hWc5K4h^&% zZ;HfXLMVXuRdZ(50d2mMcQng=SiA3fsDjxT&KDM{?ftHR_Q3|80mQFB4~nOV3F!NE?M0rB|d^r8lIlk35q;>73n?-jkZg*~B@=dB+9CMaD_u zQsR_xd2yP!s<`{L{5r=v@4Aq>*t(=TWnDp?wywUef8B_>@pV({X4fsP!xMj8|G3t; zF>#aPrp3*UTM)N2ZdKgIxNUK};||50h&vZ|G45*It+;z}=J7W1&hg&yLGh9ClK7N( zWqe+|CcY}ZF}{C%Yy6n_N%2$St3=b{AIy73{Ji*u@k`@Z#IK9r6u&KgXZ-&7!|^BL z&%|Gd|33a|{6Wca$tfIDaujdVb|HuTlY=f<+^KiH|zeW zV|sIYJM{MI&0>sm`)mB~@qfmT7v7IwCme;d`%;|uuZoWf2a7lf7711fb_tFNt_hwA zz6m`OLK3bBBNCz$LyTYl&|pzM1%T;=74!64xhgO8hW!d*aT-y@{VD9!~r`@nqte#Pf;YBwkAV zA@N$`uZe=*cM|2jIZ5TcEt4FR2KDw#3QUSjk|ZT3sguU{p4R*I-phLzC2i}iOFG%R zE@?p0_q~7ZZBsuwX=2i}q=5R_Neh#t_3tLVm!zmKtk>0ln6x8lPts>e4fVt7$JbA< zUs(Tc{l@xz^Xyx)6 
zZ%^Kxd?@)u^10-T$ybwaCErUnPq9gHPVr6&N{LL7q$H;(QnFJDQ_54SQyNl+raYJO zLdwjPIVlTL-bz`X@?Og3l#fz&r5s2(lJZ5$*_3Zmen`2Q@<$4j%1yOSbx3tf^+^p* zjY^fKDpCtmD^vTV4oDrA`gH2J)V9vFUQByAZEo5dX-m@HO!>=nyyICPA^U`Pp?jIN*|Ekn*MBhTl$OXFQ?B< zem%h9n~;Lz$76 zp~ zS&%GD7AcFBNn{DK6j_E$Da(@O$% znrx8x~DUJ-jM{gpw=FlD4NRw+>?C{vUfN~JPO znWqdA7b!JLt+Gm4r)*TVDEli1D_fPLlw*|Rm6McHl+%Z8QfX9LRh7yrp-$DP zYEku9c_s{2wW>y`#;C@tuE-~;rl_W=W~yeZ=AkT5Emkd6Emy5lty67OZBd<2Zd2`0 z?N;qq9a4R+BKeMrCj@m+PrM6Q$s$JEdYF~9v zb%;7b9j%U3iPW*GICYXbO)Xce)k~FmvK)1RdWxt@zFbkFE>q{Jbm}5ijk;dltX`$; zs~)Hxsve;ptzM@btA0*BSv^%fUHyuBylRemzWR!Mp?aq3E%iI<73wwW_3EXnP3oh{ z57o<6J>=Wft5iGH8&!MNpQ;b5KUberZ&U47o>A{momcNxeXCw7`Cffh{j2(p`ktDV zX`X47*(1|6(=+p*e%`z5=aU(j8J-!F8K0@j%zdQil1y!8O=e?e-%b{TGR5t~Gsk4M z;CsjMnUgZ7WKPSRnK?W2O61(kH!|POT${N$^P|jNnFlhDWPXu(HuIaz%jkVA^JeBB znM@Wp3%^eWIb^v3eX;_9;aM?R(yZhxSypCNepaunimbY zouw~hv-IV;EPa`rr7u&n^ksUMzPys9FLScyWX(rei1HT7J18qq)}X96lucRs@?n;~ zY|qk{omo4x_8Q6$4L>v-$kLa?S^9D;>sZz=4Zk#WH|Gz_PG;%LnXEHe=TW{zxsvrW z%C9KDqx@+o_p|P2{n7A8gIS|lqb-U%N>7wXlsH=Oz=Fcb)|Z?{O=At(eHtyYZL%G+ z-LrkO1G6KtW3%J3Q?ljRnb`%|nrvNmZFX~Z|LmdJqq4_lPs*N}Ju`bw_JZuUvX^JC z$=;a#VfK#fz1fGdKhHj$eLnkQ_K(>&vVYIMm(9uH=h)>q<#^`!=Y-@4a)dbvIcYh{ zoSdAZoU)wCocf%WoPjy5IiqvNy$ykI5yYAzA4&GUlSsVnXh`oGH1}bGPNp&Lz=Za_8qR&b^cK zPA-W{W@{$m%s0)}s$An-t%vW9IqEz^TXH+k(Ze%DW~XCbjB~O**El1_+1Q;+=HfuE zaVC!Dl1$vQS7C%C`pMk4vd`syn|mdfjHwm#w;P!s&ApcUTW))*d(7cg(WAi5EP0jl@cV^z4yajo0<^9usMnC6} z&*z5qqL-0XvMg^|-oxv>t7J{yCY0@Yd-D$Goyp{Vi;P|8qh@|#fxqKwFYHh*&dbd)*y3-jO3Uz5KHWqbbK{KNTQ!luFj zg~JPhf})b5vLankeV0C6pCyg&yXb48>t8gqXhhLy6!Ip|*rMl(CKpXDnqKru z(VU|BMdbVMg+*@_y;HP;dR+ENH(axg1MOPv} zE&8nJSkZOZoh&+2biU|YQ;wI4ek%F}t=pjXBC}%4V*6s3Vz1(W;?Ux#VsUX&@sh?R zjVn<$pln6iWhkk|sl{?bQ5UO=a}1@RSYJwt^`)#>Uv$O#Qd3-0T#wR>(idf*p$sk7 zml4JKGP;^qs6_c4Ivzb=(Ddv1fad&TBk(KZn zX8IV8#~!U?OCwyT?)ng=uwZ+^D9jkaHEezo}x!}H{pl5HhBO7@f-Dmhwm z0^UxSoGrOf@?FW5lB*>*sQ17AY?Hl?hj%r|dutudxeT^NfQZuyMmqu;^r$lmC+ z>zn+D(an4Pw%z;!Q{Uf~+IP)^#Y`}a#67HMSl5WgIn&pAA4ZGrB0uOEe>2UJxf>Z4 
z7slMpz-Y;D^SbM&_q@pRZOfc64Ym7gOJgPv`|8MGB^_0b!JF|{f%^9z3F>`3h}P}a zkI?8XAJtEQd$BZ=qRcfsjg7`$(;Duiao2cj{53(EFpWSn zB`QXfpcxgOqDj{%HCdWGO>I<>rk6(lJCkxvm8Mp68gFnnYFac?!}@CmYldmYN977e zX~t;AYbI*QFHxpwUewIg%+)N`EZ3~lY}V}1{8h~E?00MS(;d6+u9$x%>sYOX>sXr| zax(9oTMyb1y}opO4!U3Y9bOLJ@BGOpcR3}vXTf{kxLYa1{m5$EW6Z~W#ap;@nT-37 z5xDng#@$N}?tFB(vl)$hm8r#tG&{47YEEF>q(6&!hu%|^0lVwZVvPM+%xKt5rPd^K z(pO*feOb(WNc6GsduzldeTjt*XzaT)i^V(~Tc11ro?MsQyT;b%j=d;kZXEkNxLY!g zoy70j|KYJ4Gk$pHI`=-UIj^~>`B8I2^SkDrhEvKfwJUWlHTGpOo~1oYBT7Z3Nu~1A zoYIm~U1@!3-_oI_qf4JFom%=z>HN~SN>`MwFa5A|XX&SB~{kHVSQWv}<`Ag|- z`04I#$@bEB)6B{&%k0bC$~q!-a6V;$W#I-*Ot%&tw$ifXvg*W+_JdrJNM5Ec`>nCR zFsH1b?5eb+tgK8|HeOLvR$ta!*0*e6+0e2Jq7h}I%f^;HS2nq9YMBOaTTCy@NjNPn zNO+}ehj>oe{IZ2*Z_v%Dyf8vFz8fKTR5a zCY*AM@&|M3-0tp^*I92>ZddMD?pp3$eoN_J9#kGy9$9`z8CxzXPbj~qOe>d{tIKoB zi^?_S&1Ks1n)3Sc=JLMfXH)~rXA4`)N0pB$A78$vY*P7@@}E^?-||=0wDOtdVe;AK zeAT@21?7v&m(sGld`#p_I`fG!+eIokQ!d%}gceE?CYqT4+TeRD? zJGFbYpK1?lKi8hrp3$Dyeyjan`=j=T_LlaJ_MSFN#;S-ESyWh5*i`gDajtN$@T~}{ z2&;&!h^>%RBvhnSWK<|CvMPd9c@-rUWfi)L2(_-FrXo^RU(sCAw_;#Ltm<6G(25Zi zqbtT%JXbNfVrs?AirE$ODi%~Mu2@>JqGCTt#96{joC zRa~gJSdpT-QgOB7M#Zg)I~DgTSUPhZ*udb&qM3#F3fx>35Zx=FgJy6L*vx_P<(JU77k*-C^Ac-D%x9-M6|cx~n>!`iAb7?vC!Bj#X)1$*;7l zbgXo*^sek#S+5SMjHryQlvE~Erd28{vnul{iz+oJ+RCcRzUsQl#>$q;{*{9(M^uik z43dwpoLo7za%Sb6$_15+E1y#@tz1#Lu5wf5w#pqSld*TZyK;Z!p~|C`Co5;lrmN3X zp0Dh$x>)&RYwYch?sufjhs@7L+ zs`{{Md)4l$Ppghr*;JgYI#YGN>f5UCtA4Edx$4)d->d$tx?jbqZjtk=ZK``zJ6F3` zdsi<|`Bw*3hgC;b$5zKxr&PZ#S!t6!;Jp`KGczj|TyTh;GWuc%&Ay}o+3cvJPZ>K)a)tM^wQsy%>>8f3E(u`uFNTtM6Bv*I3oq)i~C;)_B+KRr}Y3)I`?A)<|j+YSL=tHR_ri zY|WI^l-1~J>S`KmT59^&46PYaGrDGM&2u$VYNpq`QZuJ!e$C>Vr8Vc(=hZ7}*3@jQ z*;2EuW=GBLn*BA0YL3=?t3FwCrsjOjw>96_{8)3N=2p$0HLTjf5{ugL602IfTE|+~ z+SwBFeHbbC@o!MuS;e5Xk3nB4816V0n>&GJ#+}6Ca@$xuF2lCwnzL=Wd1m(9DJ%!B zBE^Y2mF32LjpM^*%mTP(W-4w*N;a2CE^On$&MaBQ!3x)U( z=sPQ=kXy;FZ7WYUYwN>ib8F%A2uHL1J5@Q_(`7_D3 z+;PdOwuy*)ox|p~vANtgF*nb{$4jt(5%w>@eirO!z-9?@dmFheM{X-{Oy(iC*O6Nl zj>#(I_Iip7cM)Q}iTvD??k?o_HS#-){N6);8<5}o 
z$nPlT?ga8Xg?Zb8+zw*BoMpRkzeav#dd4^8f&RH{2s%xYl7ba?A~op zA)jH0{|v@E5VDcT^=ZT!4*9cK=M!)|+c5TD(D!*9t8XwrFTlqP*uMn(7h(S@>|ch> zcgXD$a=U`ue#CK^kKEqCcxrH5t|GrTu+HB^tR=|N13CI&9m$c)e5~`6@c%C6XEox! zi$3om#(S8{0N4d%ESnH(HS+xk`c0Up4`A~NY<41#y;##itn-f``vP+pg?Tst`InGO zvDWt>$8V71S7aQ>@g{Qo4LN>}`8$amzr_6AL4KcM-F$_${ta?mh8)8YI|@10A;$w) z>zAR3Aqn!kHYuo@O>M$7gE_=Eo^(kwidaupyQ-YKFgIyc!euBIje%^$iMTj{YHg&M6Po3Dd68h`V&xLF}{5+Q$%-xq3+I9f) zn~=YTexIP<$LKc)HjB~kE%ZAG{T=8RK(+(2orrf8@h+v=x7qR#hi~7ugwJl{@^DT= zdnMXz)Duu|25sdtTn`?PyNhql74q!c_QBQ-^+7(5dloV;o;~+#zDwJ8u)Per4mt(- zo2cK0?c4CX2>w>1{U+*(s6POG1iv4^?;h9=fxiQ=-HiG(`27m9t?>H|{9c0X51?N_ zUxHTieA<2jU4!igwBH9E2YtbR+gf40%vxpr?u*&h%U;X`<$@;KEPL^7&^VisFBdzE ze7QHMkHa&LuR5-NWv=5h_zV9l@pt}LzdF}t^_)=GXIy8wuAVd3^_e-Zx~`l%!|mO9 z2@&vt*;{U!I6^Jcg|<1x!)^=orIo_Xz6k6E4@Jm2u#;`!11t)7*Oa{Zoq-&*?A z`}QCwkQ>Mg6bKp)dKxqiGy&8m9n>b44{DQwDnYM--T=K0;^98W0pteC1m%JzfSw1v z1bP|tC1@AuYtT2K%b*`X*FnF480DZgHt1u}CmxmgDOF_paxJMP(RQh&@j+Q&@-TMpoyRtKrew_ zt`pWRsuN#$6ZPAum!V#XdNu0zP;WrJ8TALSy#~4t`UP|obhmEGh6zz(e}eH8OM#zc zRR$2O4wxdIaQM|=@F(U5PZhHTBg3YOCqxYoo4P?3{S5GvzQZC24v(0&ft@=@h`*e6 zDT86-jFpSCT4oBD+bhMPtuTd1c{s0a_;CwkZna3?Rvu}Hy+L0)|%-EMS=*w{5hkiAO z$9)&)(f1I$j{3=mpKSOkg`Z;hsiV1lMD--M-;i59&CLNm8)=^j=<_W4ynsG!=+i=T ztA>6c)xQn>o6xU@{$1z?(%fD{ZWYMw4QfYnD@AUtG`G#jZ7p(JiQG2OdEp}CTKK7i zpFZ%@KyxF~-jG&9`V^#1kSZB7NJEp^Z33DhX@#Q|g;oL8EkbJvjY@QHqV+afB@ByW z+qMtfP|6*kGHb{VQW@bs1E(R`w_OLXpTYi6*ON$ z^Gyo-KmikqxjTj9GLUj#()qH0>|=V=2=@uJN*p!ow)kZBfkw(DBHIFnXlpU=11Z-V z+&T`6^FmvnH1?%UlzSfB77kf4OVZd}KZ}WNt0CD4YX^(WTv z&U@9J&78J5Y%^>A2cGP+xlB()8kEdF+mCWXz!g(&Gq@p?`v6=CINP=(;D%D}7`O~@ zX58`=>$ZL=?6Y;WRe{z3w3MXP_9D1O%Dn`xfO2cW4W!(ASQYCyju?SCu6KM~=WK9& zW{~qbjuWI7kXk`%2Pr#^@s;bXF$-pCEZTw|?h%}SN zcSa<4MDj)?dnzUIT_N>^)E82RG#(;SYxHzV z(ncCTFqPdFno3$EZZKNmXw9ZNFElaHIubG7q$90Fla548Ht9$!6_K{k_zj5E2lq_Sm!em$r!R zRkU71Yd6&`fv08gw314Q{%uI!g=94(OKC>$L-GM6A3?I8_Sy}}e$3Ay%uhPTvY*=R zh4fQM4?`+TtHnG~Ym8TsR!3X#kCw`$BlDDL(veoSNk`@>*Q6t@d_-DK_QYH~Mmpqb#MOlsMO 
zVip~53&kutmKKUx)I$rw16ed~3yqsaeP+|XPZ2zjLp}GUo^$DV`w={lOXd9xax%t# zG~0Y?Gl1IUQ=0+QCZB2tP%NO)2U09xtk8EL#bVlb2<=-;`wpRfi)pVR6iaAsLuqa$ z)MhBfUKEE>`(D&?7)1?@(n?W7?hJ9qK~O^@wh}y`p)=4*XP}JwA3;$|ag>24E4G4W z_%y`|I#*9qte~-;rkFvoj$#(Y7K#NF2U65fYz3ZWC%OtZl z$vXNnCKPf@&0 zF@v##ypCc9(}VH^l~mG5QAy>E6tk$Lg<=+!v`{Rdl7SQpsAM3;Vk#Lzv6xDRP}ERK zD@6^Jv=YQUR4{NT#am2ognx2TM8QJ8abk7&k; z21o7@$t-H=ypN(dlg`X+igPH=qqvabQi{ug2RL$i7fEJt9_}5D^GtrT6TL`u<~VvK<&(PO9wXDX3UJVhm^DY{UUQgo#lN72oU%vn4|H!6vz=tD7; z;$@0gC`MAuqu7)7l2hzSC31tDoO$GC1VgAqO)-LE4#f~|2^U&!FJLCYvmsm!_-raU zL2(DgV+1Wi@mHhR7BLh>1ev}3k0_o5s;x?`2&Pgj1HNo^61d0eBj8?&dx%7*vM#k| zGuy3C0QVU13+n^ca{63Aq$-GZ`L6J|}lOEPSp8d$&_M@0fN7s+wr9A4dKgB$ncYm69 z9`)LvAnvNj@ftwo`BXlDVm|dYfMP!NIRI!php|lOFqTQiGs$?ITpS6y7r@B_H;^6^ zql{&ekuew2>;_RRq`3^DSWIyUumU3`PqJnl9XRqNOU7JGGaEt>I}2nCLy0Y-k$naW zjt(5zXRzSZQkgZlIw~VvZ+Ie4!WNu*aO6ptc<4p_3?q297i8?S!zlKmv4&CXMdxA| zMGX@On^uY%dTd)MYN)mq*as2Geg=!v0*>rwusBaqnHkQ+eW{Fa>`SHeK5RI_v!yiZ zaDtc0=rwZ$#WFfeBPf>9^)`ZH8O?hH&AW{D9YOn+Q~5}W<~yhAGR7ohOftSn z*3p+^l5seTa8$^y1e>!M9NCdTeweKc+^exLi9|*?ql`?fUWRL#!H#c|AxnddW0G~m z;F@BP&%Kh4sga_R9+O7k0gQ_56S6ozfFrwdxZ)aQj7i3rWPFp1Z<29LG7e`mMn!g3 z*qklk$j%CRy2zqYTL_-bBC@kB1o25C^E?oE95Km$B#Uzs9NGP2nPiMf#+YP$ldPjJ z$0Xx$4k9|)&0%v6fg`&)Y|bLg71>*3aTbFkduc3_j4{a=lZ~XC*fAm;`>ck18pcRe_xEsj$%CD1sMkNQkhTxpQv3)^|g31}RIi@zr zN69fWwH>jRlpJ$Y+ljWhrnVzT2EB!;?L^x=L?XL|N)C1k!I7Aj1{?h@qLO1}YU^WK zo7(!AHm0^drmd;1k7;LW>tp&M64~ifaj>HTw*ywivm7JcYwmxQ{sjZJ0WNPbU z2AkUYm?5UNK4vH)k=;Ed2RnPti;W+WMGHrnWw2v#G6**$0uxewdPj zy)bYj=2He6{XUtJ)7RA2$Lwcn>tpsewe>LvnA-Z715IsxOtN1_t}W|u=Q0RU$?h3e zEovoaC5N=h%6y+{$QmTO*~ES(xzhpSQ-GDH59#Hvw0pFZm8Z$=O3S`o6P? zJ$7nIuQ|jPpCjUJ3%nVh5{bnYtZ!qReCFm+t9dl$JnCaNmF$MZ`0Pmf?ZE62bOP>% z52KFwSW0^=r9PHYAB$o446-A;gCy_$z|W93`8Gqz`HgDyc`u~)%L(GRz?L<`KOXop z#aR@OP&`iY7)4G1vEfm)qS%vS5XEGQa*9fdnG{nP(yNa0c@&eWB$Z+Vl{8WuLUAa? 
z;S@(ue465DiW4Y4N3o6KWQuQ6Tt;y@#T68vp*V)(Oo}g4e2?OKiW?|yB*@E&nHR(6 zuhvGm3z6Sq8C_f~I@U4_@rTj?BkEHx) z%8#bJit-C6uAsPz;u?zUD6Xftk>VzbTPS`=aT~?$6n9YEMR5lANL{DtB@pqYi)sd;D3PJM8eaPv*vQa#6*or*nY zb}HnY891SdQ=d0Gm3p4Y2v-Kq{M44S=ERGETWjLBQEu}&^HXc!1$G9GdrESSdn)D} zexrgNaX;K!-*?{je|0j0DXFzlF`d?#JL7&LlC* za2HF*j@BmV=c0ZK*Yig63Fd4jggcDOW-=-6rI=07!kO-&xKMPZ=ta?=VgSVmiqRAo z^S5H>b$SJ=*@xELt2 z+hm8?q*z3;l%kGey4^-Q_JsHB#@W4RHy(C!@PpA}PgoCJZ;-9m%b1sGpVuiarua6+ z6%=Q{7mLZj9PI(gKnf7PF=1X}u^8;KGMONJUxK5_Vz85dW6WZ(1HfR{ox%4dOaSH{ zPumQ3(wSZ$d~d?wiIORVya>ehW--0dZU8lcw2yb zK4Oz~i|aYFg|1W?$G8?{ob9+fB|65$u}qwd(@wSI&cMjAOk9T-hBFDiJ63lsvw@Mp zE7|k#V{q;CBS+JZ;TUT6SwBXE9#QzrMWJvgl9td<=*OWb^kXv;KM7g{C20Ah5P#+v zQyL0ENI+?fnW2vm5n?b0VvqzR1;v5lK?$G;kV62D59&}*7-#@|6ytos!AwQti0NzW zr!e?Ni&+cgKpIWYMWiNc0Pkr-qXHKKE+0CqD_ECc6>_lRlvt5$)MQQKyAlSw*3|ll zIexJOItKdOoC8?~Qx5&d(7ys&1nLo>Kj+246FK)KV05I8#mMv$5o#eMq$V1Pp-pOX zb|kgfP!qirZHb{K?RbM7sfnD_akOSrf8^YXZ_Fs-`$*KK$gvb$DU}n%w`jCQIATFY zQ4X#gJclBYx1ew+;(JHb!-2!8j3B;)qb^#IK$sp7k8@t>=h$JxBcOITAt78Do%~ z!{`X;Cx?ETL=?poB^0F;<0!^cOaPK~6Jw}j4Yklv6CXrRypWpsAvN(tYT}F3#2cyg z{zx{Y9m>=pD^mA1)b)nC!B96E>Lx?oY^eJf>J~%&l%eixsQVe}{)T#hp&n?c2N~+Y zhI)vh9%`tE8R}L;J={=_Fw`Rr^(aIAG-~{=5rl8rnZcNS{rV#-F%cyNbDx63p@LSOvG?nnHpj;+Q2-~@68g@A^F z3Uw@I9OyR4t`gTuP&{Y=XfWtZ75+{g=mw}AG^85W70_7FO;9_?s)oflf(k)(prN3h zAYN~b7nB2vZp3*QG#NApv;?#T^fl-r=z0_E@mCN+L4KIC-k?#S^`K8d$3bU6cKuk4 z6G#l|I|zR@Y6xP0l0a!7HAoB6fmVVh4#l+*baoj2KP!-1E9L_f3W@>!6LGnKN*&G!yqIpo5^Jpb@jN20&h~ zV(oyofG&fI=CGJ*P;XER=u^;D(61oYTo$tobOv-D^y)nP|L&lbpk=S&fA;~M2L;W? 
zu?Go3$)F0*#n=D8-p(|O zSqPg5vMI- zr>d)}tGcV|*1cg4vw$`M?|>a(5BLlm13!U(gUSo(S0EC!1X-XHml zh%{uS7-_ItGA8?9L2mT#<6U4_logo31Hyq9JOX^+J`e$FfeJwW!dXR73EU4VgDRjZ zus}7?8&n51z+SKq90G^I3GfB@5 zARV*;--91OF6aunftSEUFd0k-Gr*f*0ayeo@LW05g?hn_K}8Tle}5Pb15b2g9OTLq z+OV7_wdu0QWv|OVm+y1Ag3A?MuHJZ~8&1r2;v@PyTv+OGaFD|dT>G)YE4Y5GMmpj9a~vL7 z=J28^4)=89H+6EtSKM-pcKr*wyY}G@hqZTjubV!{EpL&VkLAYS9Pjwug;}?B4l_pg z3Nv0CNJ$&OCQ{M{ zG;M&i^cZ?VQAb7y+!I6%py%);ZZ(ooz%Mq8sPN;b!!rfjmv3NLR`z z{Av&*izk8Ao+kmeJf7Cqd1Krqt|ge=omCIGGMt`=&tCG&!TxABDT|anpk)te*#lbk zpq4$TWe;lEgIe~WmOZHT4r9t z@w+VPCG6yo!Ftfc8nFy!tcW*^t^YUqxeW-ame{GXBE zdRX$`h|dA?*i8Ow)qk^=$+_wdw|v6;sK-9oq~#8i^04OVq0N^1KPHD${v~=%58BuG zd1=A5{$1FlrZ)0p(w-s4gLJIIQt$J6_1naVCjRHi{fP1{a`6(r>)*m())MavSmZBk znP}1`<#qLNUz~ie`xmR{O)cXsJa1AallvoW>9mGPAKX?dYZZ|(Dg!EmDq~f~scfdQ zxypE?(}NwB^iB_USn`mtysyI|haGl&bQ~Bkfy_lspshD^|JFBYKhQg(_RKYs5~L%5WPEe6vS>#TS$Rf4msVvk`k>RwH#KchM@~~(z8j@>@F8+?MXcAXsIBhCgK+`&5 zNgL2uCv8B}I&qRVKwXQB8nuVGas%y^;_C7ik^3yI;c#1*(_JnjHsA1a--RWY|7aWd z+F#m#g(bIA-J>m5xzjU5TR~*0_L*XpJ3Z2Wic~LD*;ZvKyLoc=Meg*7EK25|5BSG49Aa>yrSlTgz>z2A@lDB;a&8-9b zQ1pErIoY~blB?;vP{Jb8Co0SH;aWd=K8Osc462M(8K<(D%H}HLkut9RE=zg|OAf-4 zhp^-#EcpmaPQsFxu;eD}m-vk0mxuR@gwWVeekhj=DEydt}(;t$Q<`&=2IE) z-}^9YVFogBEbB$qSKD7^{l?nrwb#Q8^I6tnMaRIxSdXy&9K+hOFZmq6_ao~6k6Dz1_`W?@LvfAMl%WdcY(gH*#;_(Q{y1fL zm2_t*&mr(A>)8*8|DJ0MBYxbg%s8meFI+30`rlyoR;@Q}N!ye%`+JD?_!rmPMV(eq zj{EUBG>Ek*>-pN`vxoRzlp$;cYfjqoujI3x_M42)FXVe|0(Ht`&x5?ykai$t*bl}t z`@BJ&N{ByBnj_QM|Ddc%%tXJTO+O+3U#LqIZL)eM?aMlIHuvH%__ToaEZ6A!8tZg? 
zU*~=sTx%|EU57N8gIR~O4{(6AKT^gL>R%7vZz*3L?oSqN`w-V!0^a9(`{)ZFaWD6i z*Wak)D9XEr{994JVYI_l%KcLp>eZ9>qir@|8%X|h>0fE&9h^a*qb)PIpTB>L_HIvl zuDyvmZN+|xG?U2VbO*{n`##0}nnV2#Q_iH}^c%`GnsQC1UeWmfnR1PyPu&HH+|zl* z%+$GFH`=g}dr_4-_2N3w)cHC58X1U4JVYB$BK`^Tt3o~dNk8El z?Z|gIao4y!K>`K4dO`YTDXDzt?ec)Z%_XK7B9KVUQ!{emgM0r|L=ChL^{|)ZYHupe- zIpjZ|Jg{vc|25o~8y&b7dG4n?S3yVWIiKtKy72i7NZ`85L4FTbOx(8twBt1J18C1Z zy9}mNu8FkY2+IEmZB)C2eI44UVHrB@H2pdHAmz&d9i*IKDwqZm7=PozZrZED81649 zqwT%)qub-T4~&Og`a@NoGqXS#?HCVEFy14kvUfvUU7;?&QumhBaaeEOF7~(Q>3O8A zM~iQ!b_2jTYMUwy_EOWIL1$_@2V4dlsbxwx)@Pva)3g?MMp7Ks#z4 z0eXWVckg3H$D@q2PM`~|QIlREzj}IxmN*C=FXcDocpAjg8f(E|TEsVu(t^i^^V9@q zz>gpczXWiI7F$oNjN)!~r+2raMLN-Q?tqam(@M0&B(RISd=T6Quh1&ZX^k@O?wslD z?#*Bn%_47FrXnpA<`#D3Q15U))g9{Xr?L<^#S$%J+iM#Hibn`s>-2eaBy z?^dr_<0v+>#xa%0Ri03JQspU?pR2T0eyQ@b$}=j@sywgqqRLAuuc*AL^18|!D!)=G zWie~q6#2j@?>4Xb&@t~W8oKco|Fu zQ@}JZ8N3CigB4&c*a+5x?O+G^5PSfRgVkU!_!R8(jqoKA8j5}pd;<1^qaYI;;?wsA zzD6n=tBh8esxnh0zkOpwsZW$oe4>2f6Xg@1D4+O5`NXH8`ZQE~L$x^Gs2n6!l3}pH%fpRi9M#NmZXz^+{8oH1$bSpEUJJQy+fmhwG-RPrBOE)t;{Q zEVXB8+AK|*r9N5elO;ZN{Jv6TQNTZ{qq8Bw) z8KW`{8KssepID-NVri(BhH7c3mQ-!eG_|LxJx%RdYROVdmRcJ4(lsBEv2II>o+kD> z=~{*?^=YUwpfad64WfH$CHR~gB>0>fB>1ETrqN8>p_#T-yz23)w@|%>>H*aQss~jM zs@_=j#;P|`y^-qCsz<9Ht9q>JajM6u9@P61)cX?D8U?l10hO^T<3!f1?`zG^G{_iC zRhCgCVHrolGJ1riCpp~MAen%UB;;7C!aCunL$vBm;jMQ?LbTV2cEG&M)VwW*1b{UIye-cc6FWYLm+^1KNrFKP28XyPp;R z;L9KsPD$knD=9Lt$&=T~HPcs3{WE=>xx+5GMuxm2L*5aTJ%t#N;wL#Z^&L{3PqGYV zrhZn_paps@@DzBE@I(H3k#dqo&iOX~r79bHMfn<;ZQb8I>2N^)#adID6Ur@99 zpb6*&MuRb6;uFlEz`tm%MPLbd2P^|Cz!h)}+yHyPJ#g2b-;`_fS!Yva!-`Q3vx3T| z)lAuAHCGO|;+4Htg7PC)qO#9QQohehR*tYzlxtb3$`!0M<=_=7ITO0pQ!%#v;#;$V5HEx)kjl@Ge*emV^6hGb@Mr zMz--D;N-rs9qa(_gAV}T%Qij&d%(xw6R;0(THY{g{|%W)`eVvlJ>MyB_ncDx)bqXa zhn~-s<^0@Pa1InRE*60jMv0tB64?mZRAr3HfXblCSe0=qo2hKBG9D?Pk|(<7fs$NK zb~(l6RF~6SPIo!O@;rM15%`Q_KOw-rK^f`aW6z>3Ul*zFZ`8kr zU${a$ho`vw=k88;TdBj@eI5Q{xWnfTC(7L%B(^h+1iLolq&GV! 
zJAm1Nl7Knj2v`tgjbyiApCE^N<%1C&#qua=wgovytfAI~W)rs*93tOUSo7@*SUuF@ z0)8#wG%y0xh-L4GQush7cp3}=FMzSx>@nDlxL#=~J78EYs3nhmj9Y}{uf_Oir3Iil z$N&StV(@oRl@`kedEh0m5^Mw?fd_MWQ-h9O*k=RZa#s!${t?`d{wf>;HwCM@nntR9 z^(?z!g#H11-1FM9Z^Y;NsrGVqQBv)jl=B;GHQ7t54{Y=T_}e(%HXwrj(yF`3lXf3% zv4@Zi?yBYY)aX}2K6c%*xi90JaTm~cgQ-1O-`Q75liX9Z>uAM2*$D-s!6k4RG-=N6 zAKai9_Y@2SeVg-1j{V{}Jc(V0Wg~d67oYOjGcHJ6Q?48~f!c7@y?D2=w^Ey~_G!}e z#a0NekajNo8TbPH1jhH~%^lB9SUjI#+9%JG8cRkWKDhwTgAHIQSPRaBs`TxepeI-l zPJ#}3vO87j0wt#R-@`Hhei2;NnD2?H%|6>FSXO}O7SxOI7%(4<0E4g%>L(@t9e3yo zqwSxJ!c6dROIjUV#bXA&aV9X>shdaK>T&G!B{NzH_X63RooHk4;d+^*_!-MY&}jhU z-!3BGY3z6Y0aPEzNCr#5O|bk~@mNCr&pgY{+LPqSXSeO_Wu(a|WcLtD8N3ke(3q*T zZ@OLaoOu0Ax^{z1qrJTo|6S}ye+K%42iaR~1qOoWz{Dc)dYrowB-DiGLNN&ZiFPlR zm}BJh6EVdl66(dhZEsJbHlOm8iD&fG1heqYf?oh5z-m5oPqgcjXX;?tMa?6Rd8(}> zWrH&APnl_Sv?H*OBQ#S(vE<#!u1>pGpQ2tvZX|+V^|2+HH&vUNZ9!-8->}AA#i_J`u zkktFrQM{2seJr{5IC4EcnmJSwdzA1MV7wqPlkm-Zfzd-R+6$XwXh#rCmhsa-M^QSt zchm5fXwQ$8m{z2lIhH;~sGEJ7x^=VbP^;eKL|cwliq^({2ES9}y@8(U8PB>Ph4zCt zKP`UCXsvGcW@H=t4VClo51+u(@HIv)zSGfP1@pm1p0w+Q!TaF)1k-rhev%ZAy+|#< zYETEf4{m^LZ~znUFk`#5O|v2~b44+jIm3NRT|ewk+&+yE{FlfW1FUV_8;MCK>--jhwEr`?}+ z`~u5)5SvCT!aH~`^|IGczFu|#?e_upz2NOvWM6wZ*Imhb_d`%~Dm{OySesC$Bs-kA zgV=V0@31G?=dqle#&<~UpDI zMDudR0d{5j<^X#-*X`1l92W2fUdTEVJh+h0{B46j2=GD}GbVnwMSkBZ#Tes;ls zobDAVr7}6)D^f~ha=KTfl)~hMuSmJNS7WzW23HNozu2m14JEE3+G77P&T+{Zw^UBb z9pM|75|R^Lr;(2Ju3x^DxtP;p2eGdcYlNfAw?vn7wrwq`_H%M9EuuUm-}_wUPvH#R z96di)uHW^a_pc#_b$*0=(^TTu;3xV)a#_QPwB0Z(RkhvXRNEZFa@uYk=gKb1RrH*l zTwB8HNc|r@Eu_o&IXPvwlXBnIR1%W!tV(zzd7fi-of09MeA9Fb=iAC_=jf8_ZgO`@ zu+6{M&#yCK+d^HFIiEL&{Eqq4IF%=@=A7F*hQAa3*niZYiY}%92G5gt%C}w@(_V5~ z@FboRmQtN0KhZb(PpBq2%jrcqf9SMpc}TAMIXTHUXZKK&Wq6(@johL4(Na`9=szCP zAL(@*9lmDr>mYXnZ2YOc9P1qQz0!Qr0or9 zpN6!3A?;{L%M59aLR#aHmKxHsLR!<178BC=t$w#u(IG9}SDO;cZ;;hd&hh;bmfs<( z>r2vKBCAJOYLbGdKB=iwHlaG9ki^xM&{^3z>+IX>Pl}LUdj0zN>xM^Un92u`6Ryj% z)EGZu!r&@pRp6HfR~afycw7}uwOGvZz-q7#tOpywMz9HN27N$ZkO%TXKTrVrg8^V5 
zcor0bL7)f}gAz~*27@v%1PlekEF)rk#2gMOkB^w^4>XnCk-u88%B!q6<<(XT<#kp| z<@MH+${VadDQ~n|DQ~i#Qr>K}SMFnVQ0{AWRL-+HDd$_+%KfYy%)Lx2Dw6anc$1f(d^M2aFH zT|_`YM0#%`AWgb}hzJM?ko@17yPE<*5&eDN^M9U?dERr*oH=vm%-p$i=ic1CNSU7k zT^aI^%QWlXTg!)JPxCCv1AEY)VSe8O?0pf*C?{AYWSEZb4D+-QnWqfX&4yvBvKb}? z`NzOYsL4NFaew40t@1TT8DF8b6~iPkLz&IY6DEf9HfICp5hsG%g*%UXp6ktP&zr|P z&-3EH$)CgjfiDmw2;LE_5?mMf3Ofkj7akJ&3NH(#R?Vz@g+r}2TRpb&ux@Srk@X4d zKdi%TJgj@zd}MRd##iXjB%n#RCc~R-Y2qvVqlvR^uI-z)Gi>+S3hhjGBkkte`3iru zbGL70KgfQi{VjW6cx97)Q(vJ9XgrmmWI@v(o7y_~3gti!K+?MoOC5f32z2b^xXAHG zN3PQ&YhPiC(`ctHPEVZ-&gIVQogX?IT*_V6yF7F;xR$%FcYWZhckANzncG!2Klj$| zAG)7#cM)ZZ28k@~E)rc9$vrH&zvHbuI>UBOt^H3PHsVZiSFx{f7H}Hy6?#abB!eXz zC66Uho`XHtdEWEXc=hvI>vhk|SE%vs=e^eZp0~!QpU+yKdp>{H-v2BAe%$EBK&&WL#jUW9tUZQW@`v33RS&GvN55G73X8E1=^YDMef0qAgfA@fl zfOiA-23QAX1il-%J5Z=JDJzt_mBJun(72#oL4x44;9{%6R#UB3TOFe5n3=P=|B~Lr zF5%y#AF)q(c@n%(_D=Bg*6#-u%-H?TY$a=1a>S9l?$u77oP74bMTHmK?r8>KpOV3CAqUF)rxmnl7 zi&jMQn>pqqj*XZ53SaEk%Pi~btDhSSQpSY3zrw7vcyS2z4ONB4g!&4P1)D-!fwM{Z z(jBPS4;It@#H3#|@qs`62VsrV{am7l6Aw24Zh3RW?yKSQN}Q01;lP&HE(s`6A_ zRJ~Mhs~V53PW$rXKy1FmozCB-_h=h)RC82IA$|3d7I&(CP+e6$P;tYY!{lMwu#~VC zVMV~-dAF4J3#%553iI%oh8hcLt+fs6ZVx*ab}8&$7$@8*+*0TDxGY=~o*bSZUKHLh zd{p?f@P*-P!?%YY3%?Y8FPx)xQp?mDb+S5NU8L@(9;Ke9UZ`HH-mX5TzNEgV=4hNW zGL1%)ta-h**;=k{|1KH$E&1jAqs&oe27e`g2md1fDc?h25VR2V6TB}V`d3R!VY~Am z*uUuY>I=2+3hacT!W^N+wwG`ul|P_#op6Wnin;7&t&PR@pK=?kAS+TQ)2dK{o#V(2 z<>qi_aTjp6aCdTl<<_tNI&zVF!&+UC?@4u4g&d|2mCD_RxDRt5L*)wh$?j7j&2XRR zz8L&zRb6^qPqt_)8|XMpeVfXv74AK3`q-=y4zSrRJSyb#eiFWIGt%a&u)=1N&2*bt zHdR6n^NG!8HecBAtt3{VRvgBTx5nmcn=LjGR^Qujn7uaqs>48z6~F2v_)oykz;)nP z;13&46^G$hbwc@LB#3c zA>LX*Y6HC0WScu%rwei=K;I@i&6>VwaUe9q%w9)Bo&dZLyo0h2!RG?+LS6{|x!Gu z?r*?3*jxu5L;js`L|d1lY5(Iz$>zF7JWtE59UY4|%InpttN-6gj=b0Rl_zQ`dPCGs zBzqk6IR9}+Q5W`z|AUBoi28^Ih~5^B6jji=cikt6_Pb9N&G^Tj*j^lbR-0;SC018+ zkZ3G;cC==rIUTK8^UNY|toaaW<1;pW`0T^F5~5>EP>)+}PERd0;Tb8dFstz3v%JDq zg>ys)&HmXEVpmmICzJfa8a9f9HOFgyK(bKQsxfMr8S~BahV*)od90R+R@RQ%F!wd0 
z&?lQjeosik+6p{)^56-_?r~KmO@v#(YAksUL-KWYWc{_VbFg!>^R!dgxtiyT8axRo ztT%rKKpx9uXYC}jeg%(f?P?ujr?HE)i>|V%)KpHbJXrZv<*3TcNkuc!6Lq2U-<#^XEMtG%EHd8c_l^RDvZ?H-tW`q<2$nzu2fF7 z%e3P#%>fSM!EZ@9k=xo8*>xwmR~!wd%8Sph>TUOeJySPlXxp#o?*%`vQLkSWzpCuT zh%BD4G7quiSB;<+{Hk$wOqCRUmS^MA1V{k{F2+B3S&maH3WvPfI#T4K>!)*W_bB3jmNOU+)$uGQ|< z9@Ua+uWFxaX|=rCuQ~upSsh#*TFqg?0Clw%l+@xdevtmprn+9RQe1UHbrPZ=dq_(< zZFj-$n%!-?dv;3mIc6#Od#*=~D1JnBRCP=>>pP`7t(sgb$ogK_IxCYg`0He<9ijW8 zN>PnS;PKb?|NGKMc5(b?wQEb`?IrPf_8caSU)NtE5nGFf^xu-lzLk#5-;V~Vkx@G$ zjko>RTFY#;w8&m3EtEt@O6r4+^RK=BSEZKGVL6H7MbED`N|t&Sk98w&(GcIPY~dtl zsrTxfT?by>{;w_B<5Cx6L#b}25WU5gjetGpSbMgFJ*pNDf8FjsFU{cFV2;g;1Yk`TG+Uq$~q^Ra$3uPW!zC|lHvmGu2(sj+?&+c!Mk^5{Z+^sMLUWnTU_ z`qzp3zm23FKaidQDtlccGX!tO)Xu2B9s~cAaq59N8U5T@_h0m>E>hyH7k>bq`-%2O z`|L(~_Tstn?_(H@7V~R+_WZnCTK#qHSpNIA{+S1Mo(J;>k`oC zqpg4B|5;oA*4nw6#A1n(^)`wBAw8O9_CxGP{2w}+&b3FYVJvJ!v-smMYHI7r_A~4o zEC08n&{+N9wdddA>ce!D8e08W&8CLh|6OiOZ9KnL|Jq1A531SpcM(kRm{RNM?`;37 z{2qCR&`2UXi^lYSuOsyc)-U}d8^R6rd#1g zk8eG8dmQxOLqig&$*<`{pBj-fgzS`__r~T=KIfBY&(`WL)k-&Nr8~9K^B(UvD0$s+ zeBSFn$$!S5i!-f$dhvWC>l?dn)VHfIzgpy#9?v{DVu9F3Y%g{ayNShOZ?R15FAfr` z#2T?d94(F)CyR}ELorvJFK#JrBW^D)!rO|);@;wZ;&Sm2@o@1d@i_4W@nkWFnFd(W zjC!&~_uui#$E+_d-irPE^xw7nyr-(dszRpvRnzme|84$N+qKyJ+xA|+K6{t*`t@J0 z=BCQJ^r{}7-xs`UJ&WDHt#9%3_ngeVzqfjEr^D8H*;@K49_wsK46hnTU9H!V_P%88 ze;r@1T4xKri+@#X){fjO|Far@XZ`9k%JMA1!ineqI+_=wwX{ZhUYCfLwEe$t|HX*v zYnJjC+OOa*v0uYqY5x^}o&84sCVF+xI=9QU3WxckZw1`$ap;bC|jjf6?aOYyNs)@z4Fg?4JLx+LOMm#hkYp_UEWj^Uh~_BOyu3Fj|8|rcuX{#vPI5_dO|n+>Uo{7O5wH|k z1$+f;2EGG!0|$ZQz)!$c;5Xngz;WPLSpjxH8vM9|djWEw8S=s4dLRKX0XaYm;0@p{ zpbO9w=no77#sU+8slZ3T$G{?B8L%2y4}1fB2kZe3IUIL5?Qp^2szbDADBb(7zJB^-D?q8|#DCyPS?VopHM2bjRt@3-vEp|8C83 zwsm%M_H_<&);ZU&Ph_r*qfNYXx^upBTj$37``D;$x;mFS4{;vjJlT0BY_?R^M~Y3M z%|hqp_|4ol=Y7s6Uhsd`IVm=&wV~G zaItl{kelH7OD>0Ta`_{d^!aISR!)Lvnr9Z!%(Kvw->jWy(`Fn-J?CIIx)7wSi65^sqDTmRy#JZ6B3C%ho_P0Gpd%owH>XPY_?~>81jY|iYt}eZ4 zd5dQKb9Pov_go{I>q+Xp+3XX~F7Qz9(x=(rW<7I8HhVkg6VGAppP{tKy|HB3Qyr zirnj3YFx7N8cSDrTQ3JM7cY^Qr7_K){ 
zMOWPOYOhwaB>UYze#vzs!mEC~e=n1jom`_WD?EF>COJ!8i^N)yHJeD5E54=FV*Nai z^GfqNC&@unExg|J!W)E4CogjKWm7RmXDqn};~j^ol3q0SS&}lZc6g(#9ewX%pw}?3 zvCu0q^6#1XyOMXC4Z!t}!wkdK`4`EVDqO?er+9tfHPdS@9q&)PmUwN))pv>4a_W;r zHJDz@$yL98jH|sicy06A=XK2MjMpWvoA`BUr5DHB#@oqT>@D*S@*XP*^VWOEd2f`Y zcxQO$dB5pBRn*bDoA)HqIry7|=>NLh+*Vaxo2A}$GKcvAPqYub1eHQyH z$8W|r_-ysr;j_=@n9muXOFlR88}dpYj<1ccldsrU<{RXz@s0LP_RaLo_if`_9`ilpd&&2vZ%=o!B9Xg=UvQ_;@a|%&JG-Wl zRh!%wbQDd+d$LN;*QsMwyPnmJ)61lLzAr2Juk}^FT&cD6b-g;hcFl$@{*(9g{<&?# zwq7HLYk)}VDfMN0ORflJER)!HE|#2;m^G{(S1dNYA=)TWNCTx7>;G%KHX<6AId*p6 zKx1lH{-5S1V8{Je>D8YOzRZhd|MR;1Wqw~xA0mA;iu42NOzB+deCemsCDP^6)zWp+ z4bsihZPFdmJ<_t9)*zk zrpP{!&6LfR&6j;DTOwO7TP<5B+aTL4+a}u~+ao(5J0d$FJ0&|SyC}ONyCJ(RyDPgd zdn9`ztC8{KR&rapgWN?fl6%U1RaxU8Y_EU z-B)sY}Q*LMXoTqg8pn_NE=Y;#>I*oEE83c&%_HG*TVUkOgt+skYeoOiwA`kmmW z>s{Bqf`_h8T#pJkZdPum1@>+(ZuKMjD_LA?>G;&~>2$>`#m@p?xA_fXzgipnF9LtJ zUj-pO4+N2Jj|K5=PX(!Nj4;!UFKq5+Eo|v#Cv5BHC@gYw6?TVj4`FXNFJYOR zR5+xzCA@Vl9N{+3EmSzsZJL{2IMZ#OTb%Gyw`Fc=!qsl;>)E|5@*H8Fuu%90e(|=M zT5osT<95jH1eMzf&$xZoApVzm`FHv+6sr^i3cpkgE*w_)wc;Daw~CR4V+-FcoK!fq z@Wa9#irtF+io=Q%iXRm}DK023E3PYUDSlJ@q4-l#rI=Otap8i(&kC0ot|(kn_*LP? 
z!Yzg0749tDTX?YWXyFfqrwh*&{#G5A+ZBkMy6}Y+19-%?>rY*z8_2UURqRfz6|vXE$%(ytMhK z<{va)(tLY!4wLAg?%&+MwSNcy?*66zgZxMMkN2PAKhuA{{}TVz{u}(a`S0;R;(yBj zqW=y5yZ(>-Yy7PO90EiEz5#&&;Q^5Wi2>;W%>!BobO`7i5G(5*@O<1o1Ihx11dIq6 z7cenkTENVJc>$jWEDKm2us&dO!1jPW0beM-RD6wNN{-@>^!U{sy`#-fH~+c$FU@~% z{ zHkX|bxD;?L;AX&`fO`QC11bZa1#klQNCkm5f%bt;fo_4~K<_|VAbV{51A_upfto-= zV02)7U~-@_Ff%YWupqEaV28l2fhB?c0tW>S4;&LXA#h6IjKH~p3j>z~t_)lkxG8X3 z;I6;}fyV+*1)dMQ7I-J{L10xNS81(uQo1QUl-^32(od;WhAP!cy)sG}r%Y0&DKnHg z$~Ws$P0vRK($*-tq@IaoPNIZ`=RIYBvDIZZi3Ia@hTxlp-SxlFlIxmLMe zxkh=R$2u)1i1&vg7)N{$h(|(H!mcJ zmv0F2&W{bs2vX%o=NII+40axOwoy7B&SfgJlJ826qlF4lWHY4;~sk zDtLVGls489S3H~3L-O|VsnLx_8bPl#VgaELY} zDkLGq7?K-O7}6Hs%IF@_JESaRSjgy*cSGI}`7mTo$byj1LssDH7vF~L4A~!YG~~yS zb0L>QehK+KVCeDC)A(+})zDj^zlZ)A`V`=*tW{0%RRjqjRRyS2sz_Cw zDn*r{Dp0+t>Zt05?;-S84OS(DzM~qedQUY~HA}T1s&7@hRR>kaRi{-K zRM%B^RS#89RGcuYF#9l;>YZ=x)tZP{Bu=23sVdKIkhs_9^ z7q&QTW!U<#tzo;u4u$;?_CMXn<2^CU`P|r!mX_7K1NkShtQ4OQBR(zu|5yA!YvZr_ zN$x8Cs{B7FtN--cJXU6}mtLF$Bpw>kLy1MA@%@81TZ-1=?}69CD&gT~*qyLv5`6LE zW&NOS4SDSC`LNE0jp4tgqRS0PCH4vWt9X3f zzP*a?SFs`IIvEr8ul5bkOE&kvUkxmAzW!J|3)}ZtB%RjSlZ0@KcftkXHsSX8LX2Cu zINUp27VaM&6t2SeUJT*U;qmzDi!nSiJQv@7X&K%oygj}E(>1&}yf?lBQyxAfd^o-a zGcJ5W_+)$&W=8nz@Oj}2!xx7y3tx%v!>kYA6uuQ-iP;stFZ>X`6>~EDO!#?xG3HwM z&G0+;Zp_2*%J66SdW=AAqqfI4WZcwZwKu*bYt{$Quj&C82Q%_J& z#ut%hsAsF^;k!tS)yvc?@pYv2>P_mc_(sw$^*;3>d@1Rq`i%NKzL#`OeN%l0Url5vtDc1DX^wX4UhG>RsMrp=rCTJ#WrfFtqW^2Z&=V=yd7HgJiR%+I2)@wFtwraL( zc4_u$4rz{QPHN6*&TB4du4!&+*2(W^?r9!sDmBkE9IZfWqqWyMY2CD9t+!UD_1Er^ z25D7Vjn<%z*2Zg-wMOkwX{I(;o3CxDZKG|kEz)+?7HfNJ`)SLyL$t%SqqO6+6SR}H z)3h_Rv$gZI*E9>Yi?z$NE46F2>$RJ-TeXWd+qJv2`?QC&$FwK4XSCoAQI~I_y zYpH9aYp*NPb=4K?dh7b>R%*(1Lv+J+qjckR6Lgbx({wX*vvu=y3w4Wid*sV>D|Ksi z>vfxSTXoxYyL9_>hjhnuCv|6ZJo$OuCEYdMP2C;cJ>5fHrS6%IqZjCH^!9ouy_;UF zFO_@ihstGoe|?ai%rh0JhDru~v_4*+tT*a2^||_deM`NsoWrz1S$kSur0+^O_6z#n z`hNOy{Sf_d{V4r7{RI7F{WSdy{cQa_{X+d>{WASZ{T1n2{d)Z-{Z{>U{Vx4J{UQA^ z{Ym{9{dxT*{Wbkf{k4EQ`g{6^`bz!7fMKchZnIBV!GyQwkg 
zE*h>F&ggF#&g*X*?i%hJqV6yX`+8)0uyLxeB8J0lJvz0C88h%*rv zBd$fmVluMKd$rH&JNr4oI zgxlVzaC6c{u_-c&O|emIN{mX3N;M~Sfx5sH#ir~iHZ_ZC78O+xRq(PAZK#JKO|7F^ zN3})jfK(&u9Mw6hJ5mW!sX3KJl|>~LBo$;K6(V&&Dn=?p8cx&r0+I$rv1wYt+=9i( zuPPWCH7aU+)a0lSqGmTJ}ds2fptqV7jkM%6?K zqHUv{qD9f((TeDx=n#UBzw2di>=^oQNrYvS~%sVk-W8RCI8Z#?qLCn&aH8C4wHWlnZKMv8J zY>(L!Gdb!|4C%>Fxwx>(DI zSVki{mW)M0tYsvOu_ThoQL@;zQKa?kSW$F-Z0p$ev7}Eoqct(nh};<4DYj>9|Jb3i zBV)(MPKx~?c2?~C*w12@$F7Op5W6LIN9^9%Be6fko{jxE_D1ZlvG-#uV{70=5N8|b z6eo&%-Fg;3`G_Yynw>AKj?5CTIIp;dbNt1eA}$0;7Z)3s8kZec5Z4x|Q(Q@0S=`XL zF>w=-K8Tx(^eNKvxOGV1#O**jfOI16EYcOE+i`!yJ&EJR^WtseUE(G2iue#DU3_eO zYJ66FL3~@J&PXLlW${DfN5@Z${~&&D{HO6GqUG`H;y1_dh(8d20_kl0<@nq2_mQgN zxe2xj&Iz6g@`R8CZ9;59YC?8Gi-b25IwkZ>=%4U*!svwe5~e52Nm!JiLrY&Ie3`I0 z;roOG2`7Nf2|p!VPPmouM?w|yPY^vfu}R{KBk=Du&WVylMWQM(GBG(ZJF#VAhs5H< zeu*;^mzi6#>@8o$*ClR9+?=>AahYgG;-168dB2NS(8Zr2Vyc zKWv$38`|HIc%p$HQIcnpZ<4}Gkra>=oD__mdU%p9DKaURdLtealG2j0kjn#FCACZH zl++`sPtt&-x06ODRU}PH`XK36!L5QnkgAZlh1|j>=JZk0M@e(dX@1iDq)*LhNfMiu zC$VXD5}Vc~v1vonhNR6%+mLo3?J=hVNo+cj#HJHTB%Ly+vq@~an8cnVU?m1HxoBxfi+@`XOnNnj|+#CSyb1zub}Nlq{lS z=9%P{OpXy5&vl84q_v*nD~CowkZb>4GvND9Ys_D(*VIQO;Qlew5E zOTLompB$8|O5U7E=Ff&iP4Zr)AvroZJ~=trn4FoMo1CBAGPzCi|LRG9`{dsWAK$PVSdnjw3oGd06tOdGg7ou2nvm$yV) zr(q?s*05#lZw^TQc^eMXQFzg$JxA@kJVJ=~pn8jaqP-&!_SkM;bqaFH<8J(t@3wWTGJ7kyd2(f2Z57vWOc z>(=JhEWZ|K-*G0CZR2@rS8ex_64vr%_1Mx6lRrsbn*4S0&gA3CKPRuLSwr8Kw$#~) zJW0==cXd}q;-&7GG(J;_&2Pz%lesC5DLyHo^m!E9dSkTiy+nMQvEHwqUjAMp-bco} zjKmlD)f3)P#k=Q(PsMxIq-B=y=7g^+;NlQEIFR~hM zhrACz0PkE|ez`?{w?*nb9}jC+*YAe$Sl+*kO-ZC9my*K%cFUBKozg7DfL}!vq_j>M zTfVK1`>`=A@xDc!~{>Hv0LOR&4zjQ!GC?2E>v zBu0%-nTRnX^*Kxpz+qN{Hddd*SnA`;$FR9Vtw{}1mrZP4eEAp}wrzYb8Euog#9|$& zrEX#rhk3DWHg*ovF!qLRv$1oS@rc~g_nM@-zLWMF)_<|@mWW>*x%#!Iq|8W}o3b!v zNy^HUbt#)twx#S!IY8$aTfecloR6iP!*3f?RHtD_ex>pJ8GRw;&(NzWE8K6T{87)7 zB_4dQJEb~>pUUQ&n0Z}kL&-7KJvB<%P**<}F7r;6rN$LzOZ`)WQup|&QZ=cDRC`@? zYJ6&PsxdV)H8*veEI+knYMa#dsYR(>Q+0Tkr#RI=aGhUJVDHrV@_wo1sY6nSr;bX! 
z6wq2WE_Fic0En%ZLpL*#zP5tQb1?)zC*;C)* zTI$WzJE`|lAEl1cKS`}g<)vArjnmtvIi$IyP0)+eywhZ9{%Ij;;c3aKy0qxD__X9S zV_J7ZW?GrFd0IhQ>$J9M6H+^*bxs>#y~m$o5obK17F9cg>g4j_#;97#Kob}H>`+C2Tmv@2;h z(w^yVr`=7vpY|y2Nm@-B&uC?|H98sHjAEmIs<%;Q^f!hWHAaIm+8A$4HX4nY#$01Q zQcGi7V+Z46O=n|wV{cWkI6rhHRNQyWuzQ<15wsl-%jDl-i- z4Kcz(vG8A~#jXROWe)_6v&&)Aevso$2dD~vj=9vYVZ8FK;>_ilH;ijD z*Jp0Z+?u&Pb64iR%tM*SGEZin$vmI=P;({oM&|9zyP5YhA7us^o@CZ!^0KV5Y_puQ zL|NWhvMm3spe$9EE-NxCJ}WuPn3bKCpVczUM$;y%eO6Ic*R0~K-dT|m{j$olhGY%T z8kIFJYeLrKtTOquteIJJv*u@gnzbZrdDhyj4OyGBwq<2UG>@1f-;uR1>rmFQtdm)1 zvMy#_%et9$C+l9;qpX66Cs~|qt8CkBhisQ@akh81EZaXjC|i}S$u?xiW+!H+W~XQ8 zX6I+O%x;t2KD%>vadz+Qe%a;OL$gO@mqe6AjLV*oJtg~t?3vkfv*%}jn!O}@d3I^U z+U)h&o3giNZ_nP9eIWZt_Q~wC+07M~vO6fQW#7!clYK9{OhNwMhbtQX^{qIU*0ZZ7eVEs*!h~3|kT=CS3Y#H@1o+58 z3@zf?30onC(0E388ZrEY7|tPv7VyyuF}#WPFQWbFXnz6v?2Pu^QTLEoneH(2Jg#Mklroz#2riu#m z{SVZAmn#%r#`wGkA5&rfKI|vK{v+5=hs{;Qb{(cmi}DQ^ zr*B~MJ#2O$jy-WXLM^{p#kbJ?h_MU9IP8c1C+H*i&4jxV<7LEnf%F40K0u6rBE}OK zzf*|uCyZYu;ya9abAjtFyo?x^BF12}9flZ-5aa&%4B_>-JmGct`VGhOR-8~+g|VrI zPi{O0CY~?6h4!1qI|`e?k2>B|;T(^-6z^Ce!Ew6|n<}*X8%+Pe-08+|D)fPEHQN0R z_FCBAg3SZ;O^$kx5yMmTAt1g}MKGz$v#!|9<5pDjJSr~m7~#(eT;Us5*1}GScEXN{ z0YXlKQpigfThS{K#{p$;S_KGY3CapZ0weqd`F6-B!zL9rlVI~JbZ?>lBGg}y=qQ|l z@*}y@|?-{m?&veh%t=k9yys-b~mmM7>W@?;y$_qx@s& zwnMiAx?f?p$i}haa~p2O=hlpHfwi4*74nO~$AW(Yd~3r9zp=I!?zXWL_P2Jd*bm!{ z;D>Fjg%_aPV(lorY~x;W9k#cCKY*WrRn|TgzrpTn$UoYAZkJ}a6#t9t7n8E=mQKn5 za)7b+ODBB}jI79Tc&cj?ja#{7kESF*KAGxo<|ID)Lqgn2&W-3L)L?4P) z&736~HuEFVidj=VzL-5yJXSnI{Dt^^@rv0~#ls{YN>z}5 z0S_Pp$N@$J?*i`w(}ACWoxsn)W#AX!7Vtap2f!HnR&asufbRiopb6jzI0HL@-M|~b zn?O6@E#L>>N8mK@lTmv~S){!*tE2YP$G`&MGhivO0$2ll1#ASi0N(*Sf&IW?;CM%E z(a#Uj5D?Xuiq?J~D?)7ri zWrQ11zJ_NlT!D4tE84aY+D`EE7X0*tpKkEemEl7h9LKE)qdrJ(5ptg+SH^I7O)K_O z$qv$CNadv6ia)5t=Q&pV8OJ?c%6OvIH}L#5JbxeWEc_O0+U|Jn>2g}~BT9~=!eHYd2 zgr*}jhZDKXZoBX8zL@=yn9H28KVtvk+;2R&=Xx;;c<3L;J=dQ~-#{vZ)U@I_q#{Us z;hXVx6@B8l=em+y#ru#-sq`hrWi8JI?eejvSz|4;OCTCoXmHsUHi4!oH0!C2Ujny6 
zNpqw;0J&h~Hqf#(SZBaGhiXW<37RZunnAOP+H`@Y2Q6!wEb(Zy9d)d4XIaSwZ4nzfMP>JM~rqdYhAqX{n|$SfgXSbSq+HAHcEFRa|b|*o1o9T~s z`|VQc8cMV#`viOVqSR#H+Ma83#(uy3Ir~)bBap}%dlA&%G0c(62s{m*T*iu08%k{{ zwWrjH(ilqPC{3U=kK%9OeBzsLviu z7-|+XYTjK$K(4_gf)a8roGYOblpumgpVaH*`hw$+>kJ|SMT^&a({rXb_1&BL?nC81 zv}PY_*$0-EInSw;ta-Wg9L3eQRw6C;r5^jz$oqq~Ma$%RMq2Jq{NWl6E$WhMwH2?3 zN_LPs!;)O@NS_DLCsKm#)#l1d|tg7b* zlG@h?(e-^0jcO1XH)zO_8$?S76Z!gJsvkn-x5=t^?rqxg+qC5#H0lB7x@4^!LRZWp z#)fwpV@`HZoLY&gmFh})yoKl+xq@>MDv^Y!VUbAqtj?xZs;fcfL07ayD35mw?UCIV zr&eNWC7W7_$J>PV$R2@Q&x%QH%n&jfrH~)M2ic)qb>fq>*_R7)T8hT_%ZHvUzN_C|?qC+iO8zYQ0@Z1pUV+ig2 z5cHn$O=WjEJPwtxE2!0RYIU4iT}rJkh3AMGboaw^q7tcV(Q#^Z9G(=VbSK1oZOy$8;9!lk*R5nq09;M4ET}kO`O4m}lj?(p%ZlH7%rJE_; zO6fLAw^O>4(%qEqrSyoIlBkYT`8aL;B$ZE6`81VJQ~4~F&r1eu)3eX1iQgW4NaNU8#hor@mXC2L96=b$xZy+T* z5k><&?z+L#qjSQVOfiFd4yG5g0dxx&cNi$+Fa=!vPm(|@pfzxw zgI@)Jw*@)^oq=MY2T%(12g-qwz!>0N;5}d>Fr7>MlUDjr+J(Wr1v3L4KLNUi;!Sbz zZr~%qC!oe8pa*Qaf%gQ*oehI4IfFYB^qgE7rSzIDCD&v0SxGfKN^Q}9+&M9Wff?N2 znNbY(-^?b&Af-|)^!Ir@b}Uh{0r{ce!+_yH3w)K0{GUxNfmXm9KpWsqpe@jW&ttz&b{bZYl8r3)$joYLi#PDOqEFEJ|o{R7|vJb(`f z03l!nSOYdd6TlX*1MGpOfCJzNI04Rp3*ZX40q%eZ@BqYs1n>mB0B^tt@CBrR43Gl~ zzz^^T0s$4E#WB+Z2|zNC0%QT5f$YyY%uz>ti4r&toB)0RP69syr-0MI8Q>@2EN~7u z4_p8)0zU(nfXl!Y;3{wpxDMO^egSR*w}9KgufQGPH{dStJ8%#91Go=703HH=0*`>l zKqXKGJOiqM8UXjcOedfVfM*{J?vQaVa~RwKFxcJWT*G+a{K3AQ835o(2a|_wvF--M}7TI}ndCY5_Yu0byE#{|?>_=m5D0=nC`%<^jEd`9MFQ zFHi;y00sl2f#qn+3+<9~e=73RfQ?vxikNS4Zm#8RqGzuQT4kja^ErwuJhK6%^v|-SOA@L|BIjvMH)e#_?L?l9?0TLO$xsFzf|BVQ}(*p(|0*C~nfM_5DaF%0K!IeM|>i5TL z%flFiV~kmjm6=&XbDg0jt2p81R7=J_kVDo{Ji}u8L05)j&BO8SiZzc5PL4dDDKPl9 z4s#4R4xAA1aMX(Eib(#yTqikuWr##ga)pRS4+z&FdRC(Y*Fr-$DKVJygp+Fs;d(PC zij*bE!Xa)xi@mO7VOSd?4sRsv{K7#b}O5q;xRoU~nF#gsb5V zS3V{L-;3f=8Y0K>l!qZlXc)%XT8))QOmgTzecmWIx8Eu}h2^^_VYji5A=(kM!!LCNt}n>ndZ^27t-#0TNT3*p2M z;lvZ+#24Yj8{w=!5)H{KF$3AL>tyDg&Af}5cQx~FX5QV*dzg8#nfEmF5;N~*=Dp3l zkC~U6d0#W{XXgFQyv)o8n0dLG4>a>ZW;C%C(BeO3CDIVh;kHn)CInMdu zD4c60SmnsKQ-^~m0!hH360DT-aplA}n3X^TkO;H^#sQOn1;95zVm?+`pad8Vd;}~9 
zeg+-_ObfhI2UG%X_+6R=Aiqyb2HF9|z-ZuWU@ve5IMoVQ&epgZy@4x18(f(IHy{uw z1M=EpCk8wQ9NOW^3Pb^YfPTQ)_PD|U_ke1k|65qqff2w1pc=64fU7x>2NVHizz)E= z6Z#8e1L0k zxY7gnftVqXJ z6tEkOoetm&^a1(wx9&;Wwh2fPVf`xrZvPcSCHVqi0{ z12_a611f=MK-2k{SAZ0V1+st^z+~WKU;&VZIc_ZAT1y#nKP+W%PsqHheh+k=dM~aC zj1yu7xBw5}0|Gz-2mvd=8jt}tKoh_gumkLYrho(B2si=GfGgkz>;d)y2Z6)D3E&iP z2JiyT0~Y~rzz2{5azFtD0zp6s5DKV(Fd!Vz0uewGkPhSk_kcfu)&RcZ&$I!?0^@*p zfoZ^W;6vaOU?E_GHRWt8^a)%A*aBM2bul;x2x^0Kpfy>o87FqN=9)Rr%mrpHG;=F6 zw=r{DGq*GIre^M7=8k6WWaiFh&NO4&n`GuTW`4F6tN+lP@14))dFFhHIp5x#PcC4~ zt-G_lq%X_$y;!a^>yJ%i^S-TFeyuCZgL|;t#%z}`gw5L*vV2H?mM@&d^7iKPuUfGA zE9N+cneDr^HR}hmoYS1;d(8C<&GGg&`*AXt-x|f*-3R9|&0FFQ^(}m*JDCST^)3UlDaxtSBJWoF_;Ox z^Ko{7w+DiIVCG;IP8fnS0KY*Sf>m@OR+je2f7JqKHh3gvQ88AMVASi2Gw_#QIIFM< zI5o#A09|LS)YoyfxR0_}U}js~O8{2}V&=hS5BwEE@70FHLSom^*mX2^9gSU2W7pHz z^)z-pja^S;*VBIMXRkMuJcsrFu#Yltxe*Nof?N(V%26qc(F= zpX6B&J#g2>!?EZOj;8|Hv+}SHA3*QP7Gb6N4EP*a3VZ=92UY^BfwjOoU_Gz_*aU0_ zwgTIL?Z8f8H$Yac{lEd>AaDpc3>*QD11Evgz**n|a1r#2GzFhs8`W7sfgon|3O8u{c&3^b;5C5yF{dyV`+v;X>e1z|X|GnT`jND;(I81%=FlL{rcf!L- z^{2E=9;BPF6JP{atG7W%dU_gkC+eO>2@jOj5>EPkfwq1JB}5|j7vTK}<+tHQfc$;+ z2K5bUeG!^FkhnN5WK1l5<>9!ney^(+QOjF2#@n#Gg_yWFK4dJ}(coeZZt6>R^@M6E z)lsUa)IezjrID0IQ5p@(&R~|4`s@s5Iq^aA#0%lX58=cU;lvl=#2ewPKeDSQd9ntv zoJ2(OtUt06k$exVRz)~Bx=>n#`&b^HS`^`o;o=^a-1YUv{Vgdc_pyYM=aht#HK~~1 z_m;io*on)ix=kv?Gq*OKOSnwaZkv1Yu_`%qkTg#iLVS=m`cS%sN95jAE-}+iR6}TY zv`+4*i*TOu@vMZEh=*>}1EE`ZEG6f3Z+d2zP}&2}E_kTJ^4@0N-OLA2PHK}o@!s@) zybGlzly-xT#7O8C9-+Od+?mo6P#)Eg{u3Qpnc2}GWkgHt$sIWBo)05dLbvb;?M>yr^oaGLbPJCh zz1~zVF&|01KZ06>Zs8Hyo603-x)0AzdgJK?AMJ)gBF|hp(We-l(I+m@1@U)AL}a{4 zKTD}i7g&)aho`?7V{&xKtRl0c4`>Mzp#wmB!7HJCP`ejulOxy*&kV>I6hk7s6rA+1 z2ioduj)PoN`_j67>C7xMQ_{mhbjA!sROAR1QBKanYhB4%NT`ldJ*5UpBPfleG>Xz_ zP;&OD&79OHdE$X^;)8JFg>d4BaN>z@;)`(Njd0cZL?&9nZ<&Vb~}-hkz-zwYQSd6u4o=YT`J@m!sYog^?GC|Hi~yg*mL!OnFFcHq!0 zaKXL+SOk0udw39uAc27CdK|C3||uo74WtOnKqYk@C;b--7^df;n-{C^}H zfla_Sz-C|zK>p8>Z-H&VcffYwdte8!6W9gp2KE4ZfqlS!-~ezCI0PI9$p2Mx6gUPP 
z2TlM#04ISTfm6U~;0*8+K>nwabHI7v0&o%d8Mp*o2Ce{Cfos5Z;0Evua1*!%+y;IH z?f}05cYzG-fga$lcQ5#{KG>OK?{%#JcU{<(kH-!$1N_e-?6a_E8aD!Wx40YpW<2gT zaVI!sDu?0r#GPYr#Ml=xmElR_SUjJdhU@xlJOzNSa`@t7ALfbun+kP(a7UGgJ=yJ2 zjKetGDdH}2Unlf)6z)FZb3b;RBJBHm!sk=eIgBcux8cfb=`+%3N|5_gsG zvmdtKqyLXSLL4aX)DCxiXyYfu;D9*8;Un@LJV8MDal|kY_0A!lgMb(AzQ0BJJ+x7d z@{tp9T+yFuv=xp1-@ty;xg*9BW78M=RT0MH3EJC+J}pBW_OLnF3wO?V`XGm&9VmYb zF>nUq4jE(l1^j%2@p~6G)$n_5H2TyTPiWw473vlvhJC;&?3Zt#Po*e7jygxC;b{$G zO~KyzSB&W{_^(D^f-xp5XJCABS3DEP;xMqg8}7c*MyDxwssP)mINl7}nuD=cpiX8# zJT<^mi~Xqk05O)L{{gW54e=>(e6ldMBDD1>uodm?#a#Fv$8sNhJwzXeBHmT-pMdzv zF%DM|cV#Q|t3AdKWAi0+#qd7|^UDO^`stW+7|RSCPmfs`@8+nFw!cE3HbQ?1bt>TF zr#!@f@lC|>nvMP)Mw}@FF>espFvRsP`V|KI35aVb=G1*48OQYFKDf6)du=d=B{&w2 z=+j$hCk%Za4EqrH6(hD=unk5(C!!B&7!wi3xB}%t@aKSjb|v!!Z8U@5r6{|GVK#R#vtU<+`Tr@(T7QhEb2tt&;pYI_`~~NNbSUnm zF<#&GN8d1>Z(v@1hdxJOp2eVjE8ugC?+L{GBkaau9DGpsE5s9vm~-Lx1j>{0QLiPQ zcwkNqK-mT4gE8Lw2jWQx#`GKj_(710L^jCE(6mL z*Eo#dAjB`h7|BcV6bfS$(jPL$Y1&}SLBy8<oNOoyS;RL0@Xn_gM6?yd#&%JN?@Q%skW# zz=*F$?|J|u(OV(I-zHn^h%I=zX}!Vx)+BX9;N#td2soW=+= zL(i>&j({FVZzs+UFPv#DfL0g{Pt1Z=z*&sM0l=p(u4GsZ3>b~oKtGI#upH3>-UG2} z0%w5-Ko;zhfP)yZwHTG5IGSxSyAv=XEiiNL0z<}QlrRz%z&0G^1Hc{NJ&Z~eMx#HD z?(AuJ7CHU@>+Rg5tE%oiemEi5v{2w8&|0bl5Qs?#7xI9hrQzn@8xjzKprAZ75CVvV zS1J|}0*E@JIDkdLQK@JPECU9pKm-*<2nu3&=@EHZXl$j`w#@2OhgIkE{hfR7EdQCc z=AU7$@9(vL`+fG#+57H&U=%$~+O$k7TBiNKyH1%`c1AP;9p;@nPha0mOBKh4sxV-q&F<1wShjh--i(&#Cp9~t$Geq!{r(NB$@F?!DE1*4xCy=e53 z(PpEc8~uY(mBsD&h3K`XQX5j;A0A763;LmHMcopmb+raB!4|p5A366s;pb;DdhrnTQ0K5m9Kq)xF z351M5w$a{3bBsofmKx=dOn}4zSN6a%~ zo)PnmSeg;@iI`8+e4^$PHJ_;YM9n8^J~8u&nNQ4oV&)SwA8t^RZ;APon7zd8C1x)( zdzr;8v$$pEQ)WJ8^64H5%!c+gS_)0+n(hn>j5L00oDYxl+v0qLc=vQ?q+;&yqu0;XA21NnC21Wd@DkDjuzqLbuYpX(&3r!wq@<5YwP0lqr&*VIl zdz;+bz6$1mprRcp4B?nXui<`(ayaBL%GnTF&H(j zQ6ydCNL-^wTu+kEd&hYWT(P9fyDZL2;TmW1*QistxNrCUHTL{)J&*hnkC*&34yCK) zK3AQk>*@8m>>8s=Poq@2#-`$nCv?TrIF+uEn&673(JEbBe&Vug%*w9Osr1C9Ym_Cp z{H4oZV^HbRc`5O zH}QPt)Z_55@s>_6c!ZHGsW_ndrX^{q&HEt9TP5QxN+35d1g#|{)eNf`L$GP)^;0bk 
zv^W3K0OuXCE7kO*cY4x0jj~4w5tW}(>Kiy>GQY7oyn%*THG_f3UBS;lC-fhJ4(Y5Y zz}?s$1P_4@*gJxDAPBw>T7lN!2cQjT3&!C)0dxmFK_=)8ZUKY9cyJ@=0g4>AbGkE> zd7_AU!u^S6Zuk0h=hlFGL%K5@s+rI2lJ49F)$He{r#mB{ngLyAdS*dpM`*WnXB4z3 zFp77BuAmpl05^e5kOi_qZ;%7}fDq^l`hi@K2l7Dy=nn>fLNE{v24PSPA|MK4pacv7 z`a7n!X01TYdVxM*B3K9(f#o;xb_D*5R(cV<1bz)Rg3aI}_&xX>><3rD6|jN0FM&Gn z8}MteIi23?!5XtGLf z1H1{iH|FdI?*eX*IfuYC-Dm-D7P*>Me-X^04|L{6G#lF2Xg{O5M)Qp38!a%}-{=6N zh0qk9^m#n#+W+Kc47;V;3!TRft-n|5?~H2Kb|(9@qLoIquU~1Wd>;%ACH9Vxe&ZU$ z>gxz=(I2xKyU0rCWmafzhgl_UW=;4PR&j-_(L1mP zUBl|^_Q^b9E2%ZS9DNCE&IPQj((tKeb(YI2Y%cM4vI4wymFkc^wPQ&9d&7PHM4bQ2 zIA3p=?ep?UKL7ZBpPwD&^D$F2+Ml6(Us3*_Qtkm@@Nm~D^19J?_plD#1Kbgm1grxe zf*1056ZHnN>KaYGD#84bviyWN4SAf_G;Ju_Glabij*xC0)=KX@))Z=S9>1I6F)$x= z%x5J{DFUDr+y&-=hrr@;PW$p7EU~XOoBaeV=gl&iRpDi{zk&d*^a23q|3fKYO0iDJ$AA*s0uxkOnq*soj{|ER1^5gJ+@V=mKtm{O*OJ~?qK>JS+ zpwGK$70&OvQEwA#(x~?Z<@^)2&aBLP0S~zf{$&B<9i;KR3>xP;G4BvXyj5A*huUnw!Z!Ll5Tn#MVFZzkzJ$?j1X z&|x|w8N39(0GsZW$4k`z(|g&cxP=t?EpNEDkvOAguor`6E_^-MV<9VO-x9CYS$Ta! zyy5q`&TZaa{NG}C?*lLubYiD!5SR`g0Ly2}>jruy53LXLLN&0}!P9t`!5;$i!4`gV9qjcW&FCEMNlhk==S+Hqm>F~FpSi9x z(o4f$i?+tJe9|7}b)dWp<}pq{-5~yQ!FR(;YIqL-%fb7g^UruE__^zpdj-T8WYTJO zZ-#j5$+zA83VD$f%e~#u(d_&191th1NZJ)VM|*!WdL1n`+RG)@4fE*>@HmJSQ7?RN z_pT@A@CSGTOzTN2-|jtY^Z@0*{0mxbF!f)+uHXXKxx-sXdUtpm3HuV885FjNc~VoB@J1G){ot?NCBKcd z)}7vK&|%(FMxVpK^G{;t`&3Fdb|LkAXIiGKax4;4{D@;A4D0gSTLR`)lNT9&?=u-c;K0V=U)D zevDRx@8R##L~kqQo9I>1es5xL1i$)M?GkPx-&gp%`!?vjf+v54tbHg`*lS7HVQhQB zU$BS0b68G3$q9b%I5n;C{!DtmTuDtqaFr}M)UER>{^G2rB)|g;!Pigm*X)=4`s;mz z$Kz}0vKpV8mMC^=MmKc+_2W$cE63)rXFJhr_|($kdY zrnQoGlEUxSGUA6~S;w0tcnACw93AR9RbGf(i=N|e0602~9ak{*dCl}kN#!5IB&Cw$ zG_MWM<}_~=`QC9WDZId6;Pt%WgHG%D1=@4Yed~m|wVIYv=`3o&-Vt}e^|gc%EyH%7 zf28rWlybQ4^pAn-R{wao?)6WA>t_E%crmy4>r%q@1%WMaeM4X?Twf7*6|V0H9D(cm z0UyBi1%ab*eM7+MQq23~wcK*xhAdJC-FN5$oQT;S-JnS7x-Y?P%%sk(3-r{au6q>R z*-Yx!a?i3M<(j14!Sr-;(v8^$-GoI_p55X;{!QA3l>rC&hjI8?Fs0lasoiXJu0N^PG~@{XDCZx}WFNr0(ZAo1An9 zwt?P}q_){{O 
zX7j%iu>jivYzs+2cWWEyX-P`;?`Bh~58$O+xC`O(`aREL1$W{&y;-NzgAqH&S-}a* z3}6Fk8t%B2ot%Fm7uf#*%{YW9Pcu0GT;namW|5_1TTOa0>a7FOa_An=fD}oDmg*ho) z5j@lo=A>{%u*2Rkr-dtmb^F4c6s`zH4una+BDk_K%xU3@V8)R!r-dtmHAlmo7_JEJ zYYKB}xFXp4M3|Gq6~VEm!kiwi2o5-Mp^o=r^W7D6DB;yZV;ePgRfAT+nqd7wrCc4{ z!2iajlXC2wa4PRZ?ypMI>9NyL-+CpaJFXiznRXa^y{u`ztUI@xI7jy?vEJj9S}ZN0 z=@xBWD8ebdb#_)QAzuld3vDHYvZU$WuEMwCC;2d`Y~}RZKDb&?w#%HLTZgWbef6AS zyP#a`OrP?WUQg`*kz+|&XZ3WVZ!hKkv&E988^F?ckmgxda*;Giy0^QFvvi5t`La^o zN9ukF8bXaBz7>LP7j-S>ETBqvEEMAepjyp8BX|sdUw=2$6pA9N^nb$hB%ZoMyn*)8 z$;6X*N>`~)lAh!pp%W%4Wu27NSx3KJ6Pj}Uh?H~>c|RrDi05hIsE6J_ikNgbbUZ2R z9OY@su$$UO)8u!RcOe-q??NH_GYr5}!CA2RPZq}D$PQdlmn4W=Rc8*4~Z&6);*~ytcSES3cEef7z(H9DQ#|E((I&&wlI1vbZK)MGrqHA>C!oE z=eC7EJg04qIC^bc&Z7ibQGhL=9&7{K!4B|S@EWK9lfYz938sK5FcnM#)4{!92DlH* z1l3>`m<{HDxnLfs0Y48qX-m@9aUgn0+Ou4Dto`=FSH+9Z&;Iby#^$P6Wvpsk`_@dT zjx)KdJALar`@a3^@Z7gh^{>dOeavzGndV&M>;Ip0t+T)CLhb)0#}vov@xMjsb;ey^ F|KE5!;~oG2 literal 0 HcmV?d00001 diff --git a/go/mysql/icuregex/internal/icudata/pnames.icu b/go/mysql/icuregex/internal/icudata/pnames.icu new file mode 100644 index 0000000000000000000000000000000000000000..58af6c0157ab926ca08b4a68709e8b13ad07eebb GIT binary patch literal 42682 zcma&P37i~Pd9PhvBg@9v&^?Prk{4N)WXrZJYu_x(ni**{l18JEW+YjX?RIr{O?ORq zSGB6TXEd?_W4yB%8w_5+K$T>+1OkMxC14|RAtZ3QNgxoG5FkL7n@dRY0XG4@=RH-` zT|JVMtDpXLw(6==XL--NoikTye|>GYCjWG;W*b$V*rh5RUalheWRIo|jcVF6&$n<> zbkE!pJ z(I|X23f~`vpNYb+N8!(+@PZ4%=U*L#Yol;m6y6bqUKHLRg>Q|*4@cpbqVQW$_^(ko zmqz&^-#;%3S4H9OD7-5Q>rwb<6h0S)zaNEPjlv&Cp?+cb+>4{IGYS_*;nh)iLlka` z!o5*gioyq?(2v3wqVRiBs9zMm=ao@-T@-GM!lO}Gi^7MZ@cmJ!r8R8<_!8-CO4ELN zv8J6O%rI#Fo_1M9_%Q~Wy-L&mgz!s*e?hoxt)~4u;Z8gA`v}Jf<$Wik@HxUq$R8Tkw09A{Ncd4gZ=a@p zh42G}e@Xc0e(Fqk@wld?=~L}0O-lib^n6KE-xTmVMBy)^@RGyf^RJFVtwue-%~ARr^j~Lqi$-&+_*I1eMm|lX z{z*Q=Nh3?6d$gBE5Iv38f1VxvkM5F=mGOUFPIM&fQ8^Pa5=aFTnQF| ztH5G#HCO_c0%@}2pa{xf3Y-G31^0pb!2{q+;LG3};G5vD!FRy-!1uuqz>mO>!B2to z^FM%}ft6r8pnJ7DK?%$NdG_PrIq*U7N$|(u&%xh-e*(V(b7`b5a22=_YyrE#I52<> 
z?gkHoCxH(>4E_-O3HS#1A@~{iHMm5leZci#GZ+F#zyxT3$3Or+0sb8PE%*oUe}T?; ze>qqPmVmWj2RHHjpMs4HkoaBeiTo>hGHANM&EP1Q29JSHgCBwa1g~UJT@6-)?O;E+ z6Ewhs;A!wX;8Wm#fUkoegZ}`RFc^EmRbVyf2ZulroC0qIe+Iq2Z17s}R`6c%Rq$Q# zYj6V-WjB}xZvh_yUj*L=DJIZrFbZ7oJopPRpGmbDfp8Dt>CNR=U^U#ViY_KJ_i02`~oat;;sU>g4@BX!S8@S1OEyZ zGO0I%VQ>eq!CSy5!QX%`Cj54ACpZBf1!uvZgP((Lh`|Q14-~-T-~-?+_&0DRgkm!o z24zqO4}mk_Ecgca-#~|$Yy`(a4ZH?C0sa7d4*U@OUvMb|=W=i@7y@?z2fP}*5u65} z0$&9eLYNkU4WJK?tzZ)P;5*>r ztC{OyIT!}V!8CXrd8^I_z0iFk+06ztn zENA?K4WJL$;LYH(;9r1t9qR;G32NZ&;57IY_%7(U9y$c}ffL{!@CbMc{1Ny^aN!Nm zE3g{$gS)`%z`Mckg0Fyo054ymY3smFPynw2Zv*cIKLe>7>0huLOo3;?cfqf~MJs7H zun$at$H4o+kAQv?>ki{1#plRBrmVJ$ydz)|oC1%6r@{NdXTVp%_rcGA@C2^}SAvz` zX0Qj`2?iKX@pI>L>+8sS6L=Q94}2WF03J%*M^&`%lJ`#_wVJsMZUDD}QBVNW;9>9{ z@GJnf!Bj)fe+pfeh+*Cd=~sA_;(;O#f7)>EU*}S2>KJ( zeGYZMnY=-85Zn&_GEx2_7(9a=-?61L0*JCVU(CJ@6^;Meud- zx8P^sH{gb~R7|5q=4LAN&mb7F@cEH5yz4R)d}3c2EcRgSUdy;19ry;LpG}zz@LB zK*tbt5?l_h1p{Cd+zFz6%e+KBeO?(o_c&!HaKI@b^yz-WH-o3ZyTJQF{BC*g z$4Ebycvf%ptgn$T&-o7cd+-Y&&(rsXzwaWH-(O9*3P?SLhHWLj4+tHbUFSQ={|swc zTyJT|$H{vdya)VcqAX3>m&yBI;QJso!t=mQU<8=J1$TpIz*oReLGOOr2pk1f*1Y&# zOL^Y=$on|>68JjMM;XIlD>x4B0q+Mt02dr!jDSPnQScG)*C2C{xeE;NAb1jd68ul_ z9q^CfqA~gi>;M+H54;^b4?Y3D0R9f#!de>FZw~caG7kR)2EmYdoLs{PmZ6uo%p>VaZ`B1E6Iy)`sJ$G zJlfl-bIhl`k$93Pl#(h)w3tP&v#Cp znwp&U51vdFMy+%qm0#uMy6T2o&)+rAvFn!IFpVy&Xw_faVXQQVowM^j>q^%$>Xu`B zX2s0c9oI^`i|SV1s1P;nx+*(6zaw|HyR^pX_qu9E-puw|b<E7OzH$3z9c~zrUv+N>8odwf5p0-l?Z4;MSX2Bv^bd6f+ zc&}41SEG|QzqS447hCt&7SvYNH#E{b?q!|w&J?kW%#2$tT3aiA*6E$77o2+CEF9^o z)+%O4eza1mKiasm#%txJ%~!r`0jUY6Qcx9`d!;iy>xK)jwCa3(@;4XBdoRqL?aHq! 
z95L+)NzVQJ#rgz+yZq$0YFejNtAGCD1?I`RX?xOs8XD47SP0=^pNR#9r+2=Jty-`TW@o4RPC*^6Xl}&et2NpN?KJ`#UbK*6gX8|1T#_54gIq++1V5 zv%4U@d+5p{l{PAtElK+Qot;L#w12L%*;!LR`?ALP)VEe?op!#o$Z(B3)yva8HOI2+ z2fC|NGoPxjnd&ks4O8x$shMYAX86;2|Go)5|5U*rE2r{@3n{DrL@Iyw(nhb9FXu~U zzI^u5F57hL)u`l^iIP{Uk`qzMD-tEIP$f=Oa!I1(5>-hWUuD&Z(-P%~_gmXfEJk}|fi5Guqy|~r#oypTLtIWxry(s?%^C`wtjf1a9;R(?P)tgJogrSoTdscxz^=v_HesTescFHKl<+?lhzX=jngn3j>$%$&zB z=IO<=bDeAIXD`fHb}GMzPAH~|E9+{cT-fD0joR6{Lp=~B*QxBDc(Ca2DRmed%-g)u zOjoUB+P5sSi;aq>URW{fGQeJ*KDmX#3;DV^T|fKsd5wxpj@lw8tO4yoI%_A!p~pORW-aa`G)`Iod4PU=~ac(zcw$Z=BswiKYB7r zmYyH4)@zp;mTN-djeOaxe{=h#U8*a3mZmDJoNm>7%XqDKR=cL#a$ScBx5=7k7mJmd zTIpNcQ|ZFBy0NBwHsxfP{HcsxzB|n-keXY%r!;Oo;P}q;ocyZthRQ?r*Eb$XO}w%C zSSCO5_?pi$VvDuM>lIS2D0EM}T(j={p1eyf@;7x%th2Y(`)97snBJ1BAhw>l_^PzC`07r#VJ>-j z`M9&h+3F5YrL3jy+Ai0q8f(*2S9hCk#h6**|A3XgR1bbozO7QXJIXgXtK7wn&T`$j zX+hmJL()@NoHCY9t)6ciQ;wBqky-73ztfzk2j4I3ER2*Mwo=YDUhuPp46{bgqgu6k z@H5>A{vj9qz0=j8RZ@({F0^VY&yF6_^REj_paHucK^BqbJt$9HowkXU)*4= zx?^>Ajgej#jQcDgYR|n`TloWcYIIX@z*c6 zU-+_{bq76vXgXC|>aA^ZO>3`CS*t3#d5g8zxph9hRClBrSL;lkA!pu{>ALVmxAsnCJGNam(>c))_h2hQxj{f=y&_N z9k*)K`xd!|qV3I}-;|o@v-YK&;h|H^i4apdDSm{ss7?WPftz{&D2g^Vc15_ zIAyrriH6~tXXdAkl)b^72QkVsA%p+y%$J-qD^ljemOi=FEqT4s^5g|WtT9EuekBxF znz-uCRC7+n3+nTj2e4aSu)FJ&Rj|&itNTYAr`J#EEOw_nztZV7YQd{gR`4$O%63Ma z!UYt<+DeaePhqROKyl_1^rja)-fiXKKK&2Rix|-0!*h(1>HluY|2@n9q!g6 zunVxl{{Ng8*Xe)GDZUy~RrV*Ve#P!AHtMI-sr+^?)2N+ZJ1=_QnYHudWym{|(T(eK zYrNntMtU*Gb+O_Yf!R?xSU)^<=gD~u!=9ASzo+?%d(vi5gAc0)AL_Ksdhm&iYX#rQ zSd}69-qSZU-*Q7&rBQ6`rJ;?b)>y_iZqak=Cbqg=rAFNd{-evJ`bWB9V2Y*S&3gX6 z;uH2FSrUaoqGhGSoJEX4VT40w!ODBNUd5HOCIr8mF}z$S$>4W;3`JFgleM#ZYtI|* z`>yaVbYA?d<6AZVfrfv2Mz4%kb8e?)OGH&OX@O2 zgP4Exg2_~6dn0Aoc5f_q1aIn5gVab>M%^oCYOXZ}wQ6`KvxuGzs=2`KFmA7&toeN} zS&wemhiQ++rI-s!`QZO{@ltaUWJ?L75%j`JYnd>^4Eh#S;pPI)2Frb~mc&fUgG0!!;5|k}?j~W<;9sEwpIOue_^^!p^ z${aK5`H(^Nf?uTEl~W82JDBM-3P$iPBy78@WK@jND-1VZGOWC<2$&cAs;eLa|CblI z>)jn*ftCtJT1|DLH*On$q)VD77`e=-(783ER57Nk;QuqWm+q@9FlOl4yx|tS;Lr5Z 
zrplg148NAzi*@L;rwZ}0W{eY2qnuejUv(o5CsS{XohZw68?)N6b<}dd{Geex8{zcEMd#p*Ax{a4J(Y&%844pD6k#Dyh7mD_l_9 zRNLSku6H-wvXu`$hnyvHwd$ldQ0w4dV6ux_8uRH;7&!UZ&xCXC>QcWoZZCv3HcNii z+pvpje4LrnO*Q!WZCxd+Yz6O3yAKE@Gv_)FIu+M98h&o3+pt+AZxh18fc=A9@XuzK z%yIf!5~;#LyQk<>c;}=dC6~ri^30{Q|)M&-3V;o|d{;ZK-@ z)dkF>sMDz4L@uy;k>YIAX?S{di&shfFk{pgl~A0hzH0wX^vpNG-SL%MYYUpvj2C<* zoVtNW&A)jeLck7hq@J5u0Evjk%$cjY;JfTIm&~UtsznE-!hUIvZ_SN*pMjCRBDUV4WfXE2jU&2(ekS+58J82I;)2P26G`yC_VT;;PWOs znJE4Ty6aZO^nwq_ptWFC4bO1TEb1|4n8DTk9fgYdYTMuCc00m&r}A4%3+N$7NikmV zS3M2aLvIs&qt~1;tb%LVXBPL?O{;3mm}UQyG7S8$H<#hB&o7u|t8A1TR`8E=%Qu$~ zJI5M@nT%yDDnsXM4S3Losb=t*YiY74x^opLUtXN5GGj}PGd;a9LTZh_+t`s?Y^JAK zL|y^wYN2fjUZ2WsvU}kQ4H>UzuIVxA>XZLZ@e6AvU;Nah|Mjx}&y_jGxak{~f704yJ%Y67f|e{yDZCf5j@PL>8n z<4_Vpbj&&~hc-pyL8l5NnelGYF zGhv`K$JlO-R&D3O2CLyTyg~5ed9DMAFImAOS1@zDDoaEt!<<kSL>2>?D}|A z-H^&ZZT80cgH++L7tP4iFNbX{B$R+>UfyN$nTwFsAg)4j6~XnMzMOdmb0ITOX2+*f z1y*(?(<%K;%}ABjxZQj-6X=h+)ME5mQT*83yoE_xe&qtS(uG{;N9UOJihrNuKjZq( zHS|d&R=2y_D3`2(Zr)V4cFv#Vo#B+fJZ1ZXPO56VT}9q3V>E{RMx5S;ZnaQ%hV^Op?GQweaSjU1L#-8F-C6!w>gAUHhpIIiP+zx)w$SLRP0DO9K2eXAX)!gFZP1p zpbD${C)`f>+*frXrEcml_f#ip{(x|5{=Llxx|e3Ak^ej*D#4$NZr=Zk`SH?hERuy% zsqo@e@Gnvm{qB`5EutDkJmWwvIK)7wChHL?tR;o#OuxV6?}LZR)q10c)KU`xjQgx# z2cz>~285&Vzt*IcU+ZMSJagf~BFk*V+y&DaryBhFLg$22aQ(i9f7{Gur(m=#+pMaV zIi6r*j;CttPR=X1#x&#bk#48zOgQgWQ+ptq+5?dK(srw}QZb+CWn8no*xp+*RV(X*-km@5vT&BbB1Y-UUCXJ5a;n^XcSrS@?LS%b z-`(hy_A0tg!GESV=9+?M^xT%JpRKXRhtrym;V!)}SbDCC+)?)E=4b>=qdo}rhDJpc zINnO#+&=MqA5^QULwMahrQt>{{(i4fnNYU{2Qv;bwU~jz{tU^_M=ul2JXE0GKj(TpPColT6pO0aJl1U-M(d{)44m7H5B7lQv~p3x@GbQaI7WZW<^{GF+Zw^^y& zjzVu_MD+aq(#1iRUV;|N4dWKZ3E=O1<;UtdD!P+jU@sg8Puu-9qCERM1R3qmqN^grI!y3Z=4W>M8Py z9ebZ}!NZQoqgC!Yb7QJ}O*J^3sT#pwrd+rqFKXR03%Yo^d6yc*zm2OC{Ibi$D+ygu`Ucw_23xqwC|gmXL@j{ZsW{N^W&i-!)DdQ z1{vGI>u8za`}12vth*?p+2e?eRrFns|G|i@{$R#6&MdsBnyqfJhrCl${@&?M`f(s* z7S`~Mm9@IDJ-5+G<(AuNbFs38ZcMxDIuZES>(xys1{MP|XH=$BPdzywvWo#@<+gD>= zlAkT%m&c4x-Or!d6!go@VH^P39H&H1tNq3VGvVrxaX_+6W_x7UPs6N5&W 
zB}yYv0GWlw_M?u!t9H>u_r!7QsI%W)>g{f77zT_L`F(|5=H7GkzPIRwwUzbGhBlS& zjj7r_^-fg29hFUXe{I<7R&wN#P=R_ws7~FOP`~WI+_V!c*zSqvCLXr@y%qoV>Kx;) z{A%+i`$nfHG-08Tjh@GHBsBfH-dHR>kg`^ycR}LoyMoD*c;fLil!@g#nRj$^zd z_MkK3?QC4;CP>-XRh@OWa%+FuPT4Eq-<-;=id&X}dQq)J&Rhzq}mLSLsYr?*+@>R77c|HPhWg_1Zf z1XJDtZ(IHL>37fg{inJ_uN925#1~dpJI!(_80O`{-VS*3t))HFcuv%3f&=O4U|-4( zMyPzSzZ>OCp%jc#BVUYC6O$%FTD>HUGB+cts8{Sj*4tAUo#;f^e^;t)boNTs?!*Iz=Pds z6UhQ2;~7CNw{^0=lCDs2yh7&f*kgmEM2=C<1G=-KG0Kl0Pfad&()OW+xLnZUa0ZRq z;&7<5-vZHxn^hSL>w8dM&|i-| zm@ciA-)^iz^ETrdU~UV?$Ik6)Tny-?o2$3m3t5I^F(FmHzDCW=9?Q1Tl>~)_rwqp~ znI$!jSVT%YY;@z0VeRpVoqo9T&bF~Tpy#)k4< zpIBvYIFT_Xg4;4?34OQY*ujysy(DGsY2bp?K9jDUMx1}Zw7obrd56>M7UapMdq?`@V*4(~@2mNvCw!Z!wA6BDyWlu; z%x#t9&LG5SFm$rn)pN3ri;j7`*N(h|cBX4Xv&e4B+2#%}G$*W+&=%2*?c0N6Kr6X7 z-55bcgS&O?{iVB0qt=6yK1{t?#Y9vPE8>iDfVgqan(SJ`Tvw^iY*9WKH!lzwF{G_k zy|5gUUEkLE5i@1wZ`qcb*yd%-TFSWE>{d4NTjw_`(Wh5%8=5MnCI(pQ%n8#h48R4{ z^8O*y|8_B5*_3IRn{{gm%=)}|rCr(^%Qeg9thv^>E_XxuM&`ItW^K=Hi)FAQT@$i^ z;M?Ochp+8|V>NHMuzH}n)>&S&-1VvI6xzV8^@a2zy{1Wc|w$!j=ixa{t74D*qg&k75eRzv!-!P*vW7v=3+>$^9iwV z27j25OR6RYgp&a}6X#h3b<53ImX|q&EdcM3;D!30hX3g)7A-5|)q(Xm{4w-D;)=#f;%T+9kT9*PnQ?>xBP3 z*Z-k6*LoB(=npsi#!RX*=+4QTW{|a3Oa^z^5OSt`+Ij0dgf;`S;FEe`YjK!C7d&3d zVTv;H>4N{fdhheSv7Iic>BTMX!I`v!QWW_i_`Ou&8YlRq{|VSvJNN`0@zlK782D7i zF8C)Redh_|_980IlySYAE(CAa-Q`oOrXNy7bS|j8y(6YYB40`(?7$Q5Zl(7mThZqy6V5VJ2Y>-8E41f=-l##@?pG(&e^O1BexF1b=@U~7*u10_EUZq73J`icq->&CY%FxeYxs3GZkI`eF znHQN%jr%*XMLmEiD)=Cr#s}$;5)1F~boDqUsk<;q zZNwxs9-E|C*)U11P$sEo#Ugd|9E;Qg$|4nf@?4A5o0=A>U^ui$-H|F?Qx1^4P*zuU zYejh#d_op8Sr#tow$j;4%b#CbKHXb6{n6@0&Y&~o*6a5+9-Lw6t)|_hU1G61ic6op z2EHP2nKwbYco^R)1`w_0KS|3ynK7-v%T((AX$(D9a2hWKzi9b?RLRU#56!hV*=yX} zy$bIQ4y6j4EPp1pxXk!0gM8vRz4?BG2s?Ox%hnToxU``>T#4AFLk!Ky;N77q=G~0? 
z-|xg46a0Qh;iUOi+wXT%`QVL?e^|}P*|wO&u`T9}Df60%;N$ZmgG})8ra=ZfRKb5k zZ_@_zhK%6^x6ic~+UuN6wQcnbYW;J76Vj~WE+H;VP)AUf-@~jwGqYvZWDH3&1Jr?t8Q_J8bT35@RqmK&tElURiPGdtS}R@^S}RVaF%wlE){V2}5Ygsiau8i6ZU80uXn4ZxK z!Ol`JSo3#X*{np2Ks&|P_b2UnMr~aB$byY20U&#mF)QQr4F;#dCPRB~09+<|OlHW&{*MotM@~}PX zPMxFXdRMv{z!xm_PFxXr=tK@WCm7ApOFLkRSYz&R4!U&MA1QTLzl_YOw7Wv7xNeM< zVZk5GE$t}{(PdLpuzMc-Km%t59?=k;^LxZ#eU6&_9hiNqI*#8#-M+RJa>K2^YL!HyDFSGYR^LKc6 zHQsuXYNi|4haC3}sR~LRd0VK9y0Ig_!5pluuCGDaSY5j{_J+S@Wx94#y0%VsvaB}- z^9Rksgp1$ijT1LJ%RA9MU6-z3Ek9Y3sw{OkF6~54Se8DyNmXle+POw)wXW6Dm51H! z?gsD9=~E{kl}6huZFTePwz?TU?8L3}?PA3|(QxWGfO=+8d@{{_)V{Kzy49X?!d6O| zOD5KL$bj0f{i6y9p;xgFIzPp(v7x~2!glNaCA zFCDRyj_8+;*f|(?#IBxDh+L9(SL3*OqHn&q4y$kM)AMWWtxHQ+EDsS~c`S>Y>@D+gki{jmkuTqZkqllpe%%_k{zcxUIOyDooy2y+WQYPGwWV4N-GwU* zdW+qL|LL6nLLL_j-1JP-QQ4TP zK;cciPbRvolV;(T1!QC6LZ4>b3`sCc;?pm4Sy+ospg`xN<+^D;(eRsAUx5(~-zE{dMSj0--W0-_Q8JdKEJRUbQK@Q0 zik?sldo8;&8q0KdQ+=>;O%HC1=#8$usz-DssMb^N(#Eo@*@_@mHuQrjcMT>MbHafq)myM<4-mlm?%!aSJD(wk3l>JujY}ymP%O=pF z{I9oj9>w2NrC(pgVG6(HqMc)3;PXGLbrjyKRX(UWs~UW?>3&Z8sMcn0cu%VKgrdSf zzBkoT*>3Nt9ga){s=2~}Z-i!tk7)_JgH*U(2S(i|UiiX=LRi-*F+v zi9U=Is~gMiYBx??m%jz`M1SSFE-_EMLr5l#@NCE53EFoSBuX=CE0;QZ@^6&aqlZ_bmu_6v`Yq^MY|(p8F>F(%H5Q z>0m}>G`nuWd%+x4mW}sn?WPSzL(;hMUQO>42L(-Cm79V#XG9xuv_Wl9^wQd#quNo^ z<9M)fcCu*t$UzG0*@N+}6iwfMSlbVkXCsTI5A{(#WZ0A|Mu+?NY9kGHvS|9=!`faX zkL=-a`--L?KByf=D63L7Ffg!sl@?xAVzo-F4olXk#F{X%RwdSkiFGQmE=;UfiS=P( zgGy`&6B|`xW0=^a5}U%rW|i2i5^GcqxJnJys2Xrp2~~qNVGY)(8gNx5ss?Mq8mv(@ z;HpYg4c3G;SfgsdRh6h3tO;wdM%93;l&n=X;Hna;25Z9_tW`DOs!CK1)`m4$t7^bi zm8cr54QsGg)qty%Y*M#zmBeQCXs(i2y((-{f-o7j)~cwvR)_6Hph{^3`AyUotHbI` zpi09JSRH-A>af-lgr!m6@uI%*{$Z`J&H~0BpKQPEO3DticB_(E1(Mg-%IlN8ODgZ+ ztjz2|Srs&$M<<`yAEwAZ80AOz4Qt^m4{A|HA9a$a$5jbayo)!f{OG?1Dd|` zWL*Qo^tkj5E*jaLC$&Lr=RBpk|Bbw8pQfhoaz0{=MYUUFbXx?1XX!I zH;iTbVK3a7?0%SkF_Nf~aaA&o2XK4Q9$JVN4}-YHhxhN)hMl7LAY}K;R$uw{aIdeX z56f;viX#g)$4p?=($qyC8Nh23lGyUzZ0m;u`ykxf0Yn7F>SR??KL~N;O`4l zXQH0np$v4IK8~QG$wk}YEng{B`;&2T96z{xh7g96!fsi~{DA7~v%#%geG+7a0v 
zoekqkLeccwq@}qenwr#@%8D9$Oq_XSf2B6S!?LZUKBg*12E&KF{&6j#=VRJo@yi#U zjvs_)6ll6V0k4Q-h&D2~BfDG0$SXH9_a<`LHxb@Bkhrs%xwn}s!j>8oW8o&daEQoQ z9yRLOu^C$gmhPSXrcGWmp^RcF%D!mGY>6eqGUk(DU@ta=opvCk<)1;o^D zXEGI4P`}N_xe;yHDWoL&fubt1QCiav?$!<>R>?keIypMhfDYm-c%Kr)3NkQdJ8mfT zM6(9kGK!O1Ot=bJNwtVUE+0`*IedXhtB8Ac+=*9(h{{V&NcMM*XgeF>7RT&}V)6K# zu~yz#?2SV9cq=;^@QTdzjc9$cVI|*uz`&Tsx1x+=21Kks^qg@Q?dMEpk7>=}+q#!Q z+BP`%k7)Zv+mam#cX#q-2ccE0mCbLNi?Sv1PJIJ;KO!d#`S^~>%J zYS>Q&vr$4V$r{6$UL0nq+Z=HuL3TB(cTG#gqy3g~7PVCOF)p)xQD(UQ!P4|4OZ#V+ z_On8b_Nk>a3ZoqctS!;hVvq_2^E9E{svTz+$fDl#1(Z=2iVE~eGg~!2lVO)QLbDii z&Gy+cy~K+i+A-rQA6cy#(fg2}MG?zTi6X4p;#|POtpe_l5g!dDO&{YKMqY~oeY|AA ztr-n!zDrCteIJgS$YWX@>bnqzWRrx}jOcsN@QNVB!5CQ;*LULTB{l)A8PNxr*5W1#^^PJP zhX(q_w0LnJpydiYHw#ngh8t&D&%$#(JQkm*w3VgrFS7kj3$OaY21Z>iyy|;P_?u|q zl`&?ATi&!*EIF8)n}}!unF*~?-6{#kRK)$sxQcIrlqdL=UF>;e`$szu!@Po^$v}iZ z)>JCo?c>FnQIZkLz726(=EIdAuXavGM`FBtLWEkkzVwuMiTi{-Yk!6wPD9x>)qL)G zvh;8PQ8ds);uVEKlwiKdZdvAw3Ka9$Y*Nt*Fyz(hXG zyE@uKEr#O%4I!A7q3<6!sO^^tPd^B|sx}!RqBpm7vcM@%OQBoQzCP`o9C25Il=M5u zF5#X4D0(ZdD{`9M8F}`Hq_&lmNm5mNgtB95laNf7_Kfh8<9=@c;5ZkWXDiQUq?st9 z_lw!2Vro&CjaWbGI5QQV_a(v0gNv5jeN>*ut1u|(%bF5Mmt z-<}R{7(H~(J)_~?`0OE(e_Fp6aWI*lmFyVhgwfN&mWb-R9NB%SMFFgfCys*fEn%ql&qRBrvioGSlr}33b&W}Y z9^Rm?ZP_ReS|nC92u;!-6J8;+t>;LS%AD$diMf)PQ?yR20}^A0v0D2t2bbK?B!uHe z(SxVr(h>87;R7O0F?nPJG*YC z;ThvdxDrV7#4W*KZC?=D+OA1$86b^XHe6}>N$bPtgl2B?p8oURLsP2dx2*>K|1UL= zAxaI@4`XcE8S(WIM7)t2pP+`n?eS&nbMQB*OG~rxFc*%#2U5taI;gH z+A;5R)RNO{Ap8=D8r48mQ*FHuEhD?tv;E}A@*$(~JXw5CbX&AIL8H!-rT3Y#butp| zVc^=6`XP~OM2u=ibXc1y5d`8;-|um}hKMXth{T0b2?I*2g6HO_CccfMda`@uYo(=| zsn)k19#a$ZT$P(*b(#~p&(6!f*lZ~Lg>>mWc??YKI&c#XTtj<#$j$QoZBL4GW7^Hw zi%EBDxWj-nIPeDo%4Clsib{#R&-Ob-m*RMvJR1_7|AK)}z9bT7W2rWlu+&USCPd$j z#;lnobxtOljTQ@Xsusya>kio{uiVVEU9Bjy!H7;WU@A1DEuns#*&MT02w$NV4Mj#G zS06E_vubu>Z%f8?S*u1-3CIXb#@nWus`7cJn&dZYPrOYcYn3q$+W1w*DO<*~BTO_4 z89$LgFrGj?T`VCPM-lxn+FK=NgbVGgB*s8T6X^ZO{^6*IBMI?yV7AmbkkqB@m|eO9 zfwRPd1>X;ei6YV@kx#SGoj8g(8M((HN5-*MW^<}H3*($~8TkRUS;)r4>_N1m$%Onw 
zoxUS9mQ5S#J6kaX%32=<9fhF%X4r&y zoo?PNOOnxuj!@DPQC6H8qm&&Mu_ub?2TeID2nDSQ&`_XGP?si6h*_MN8pLVo1bk&2 zPdY(GP(*p?n1*NM^dc<^(4nI)ve90sfKt(LP^toi)x(;x=DIbE zqZ5k;7Y1{+QAL3$S6ykeqfryH#i2gR#HVjj8s)^Ka%e}%)Sft!78HfGfj@lfkWY}9IY*(6k6(G{pYU*qii>4qtU6t zY0)37i|1r03O436gb5aopRN-+)}EvHO{qg2v^b+W6yFDjWcD$sWp!(%J5L(Ow5FrS<+^IRp1 zqSZ|iyj=oFQ0wL$O<7t*1X;Uct<9{0CY_JPhk^fIe1ORA!_iLqEe;vkC?pu!a&4`e z;aF_0n@vK)Ktu(DUuQ zt>U<;cl`HF&59KXykS$e5>4_RWr?g1b5(6}L(5$!QG=3E#+e3kEkWC4mYLf&p+icb z4nvnaU3HjN7Tu~Gp2eNBGFrDJM|tzMWF|d?)?8hL9d<XEGvh`XE+o7%xI7PUBBLa>n<&QW(#Wfm9FVfZqa0Xi&dEZVChE}a&Kj!F zc!_5bKTl>mQ!35aQidcH&!M8ENncs98sP~KS~H^Wq(=DnYjJpvN>ugIY_(|OnRr>q zu}9)mp+dM#hvTr|G*v*d5uT?-P8>_{Ragz%wfQ?mvQv@}=Bqd&OR&M}Z$Sp4h!BQ& zlXxP^pn#vw?hVgRW6&g1ise-tF5}B(K!CPI32{2KqclGdC5JwO(54Pa(^@e_qetWj zz_wMRJxARHbIc()F>fqUEJpHo3goy#*pWzMe7-y!2TB~)m>t4&Tf|(ONKnK+TdWW! z`EiWQMs@(($dJ(>yA{_*J+NrEuSSgFC^1|TQ0`vqgS%RsmAZJx# zstUu{c14rGO){2ac`UMJos+Kw{zwNh(5{ha=EBmua+)g~eHiLTn7~4ck#n_VT<SDb$7XDlBXULSfw@%JQ58fsJ^?7I5{& zp%uLCbC2;-wXr)prp~I?&dHET;i0vOEu3vBWFK@~h(2;{nwsOKLfM_06qNFYEob;e zU(udo;EUH70&93M2s5BP9SJXEa)Mx!o`hmcdmalySsI9D3PUm=<}t4qINA=GvH3RKi~<3WRYQIgG^N`S|Q}=;wLZ?sM*88J9tMlhZCXsj)YN1smPKHt@BP|>e;k;`)J1IjN zV&8vJ+8ou&J3@`jW~@2uj~F;DI*@f0FbLtt)VWr$kj-dwhVAlmw6!qKRC#5%@KxLQQFJfa<;!$wZytZ_-ygpOL*j`Ds&<>NsNI zd8RH{;2`qYLU(fTW42GUm{|%CaP$$D19dtVf|1r2ALga$Ly)M*kf-TvNQmut!Y9Vx zpdSezuNSf0Mb0JzxDLu6y+J!><4qHvzy-V8XgSD6YFIKWE#KZ&CJRyQg4xQ%dnne~ zhW2rO?L_{0amGGdhC#dc+dear)kUslu|%BShszY=M>C^iq#)x;&J&4fucrPW4Btxq zlrLDEud_WP@(GOG7LSE)V&bRUbUHl7&tU4?vTEmbMC=$~9EM&aS{~<-3hg$*$dhxv zh&r>Z*J96#tm&v;sWpD-?MmZO4}TNfU)wavqxd<=p21nU>eaS7uxnOH{E=->850Mt zrd<#c$?w};gvdjA6?aoeG-T0^nk>#3wc?pG>$QCOxn9?kMddIqy|qiuccNFEthkW3 za?Z)n`c6F`)^FFs_Ko6+WQE#NxN(W=Te)oHqu?K+%vVh7-gc6%)9W%ZIjNF&lWqn2wwwvl31 zo7A|O{-7b%1;J~Ym6@L(SMT*I&qDopR{b+;s zR2LPc3sg!%{g}b$s0)LMu|%KYm_-+(B821RNL3oJ46|6x1c0!yF=2+8`OMCJR+Q5i410yH2AqW z$*1f<>gLR}w!5ONN+)mFC5PH2pAu#yZ&6byQJZi|B{Q%maUNdsDTt;j 
zEBPEX?Gir=r(q(4i8kw#qMYPY_Q@n{eUK!R#WDjE#cDz(();0oW+juqW;P~DWfmur z2RIclS*+5@8xC@mVDbi)PTmmC=ENhT>D`u7*CP zQMe~rMj}x(j3;;Ncamx=oK@OhJW@>lR#J)Ij^NMUda$J03TKthDu(7Iek&BPl{}a% zBatW?^O9wfN)(QxqH8@xQnO`yw6Zw9XO9}Qh5S~*IX}UEK1(AYCUt$X-ei50q@y1a z{x)WQ;18p%bm38q@GYpk;b9}@h{at~n^l_WcppwPL$($-&wXT5elFx&`|+EEPrmIo zICypV8+_F6q;w$LS(=Bd!$YmX?!-^B{kZXw?2D3pa&iM{wtWG6KAri>s-cFHeEGE!;q!N%^I5(CeWgZceR`_h<@@m_ABKp2CTGl zt=TnFux%<%9uL?sjz@vSb=g>Ib6{ksYY`3i@L4m$=j6_3&uv3bVG&5jz;YsiSIBTN3>(n-b8(57|{>?ok%L0eq@h!gtJ4`*pb=~ zt zXKBiIJNvbr@y4}%L}Lj;>br*u?_nn+dlZM}mK|mEZMsN>>PMvP zXuFGYm~Fbt;$RkSF%mW8q?_2b!=9c-Vq=d`86h=_Y855MHW?D^FN%7F1b*^KA9*G! z;zYEWNAzT*FiT4WzO~xi+A?h5XUau{in>jV$qea4=hvc`q?M0Ja~BMTDW1^W&$GXX zP!esT5jSCF1CU5)P=ba>JCN901VML`xE7;o*vB#(5n;m^n-lxya({$;jt1TyN!SH7*~&c*PbkU4;G%(W=dB2hfhSd^6;nshVHXbJ1Dt>g>k2m;HY*cZjMKvJ~x zTl`1L%sDO$5*G$q8awu*5sPfx`pO>vECkq(moG6xn=6R$RDn`d=>MBo}$f9k#q0r;TvYZ}S=_liq*FPM$oesf&K> z)TvV~ZiBVlk_BCSjr2XNAey?+lC)UMOH3Twn%-^V$JTUa+h1X`Y0Ite;3Rd5J?q2| zk1uNa<>_Pmkx1xA%P6QkB3M&n7ea^@%Z0k=`*72uGZNlfvPhuOn(cyxGnH3wc~TAF z8q#zS*(+?2y6Ah89!(-O2x}CZ{PFB#P|fzXiu|W2L#a5+qj-d#^W(^a zi++XfCcO8?jVUnzMviIvu_~OLy6DF|D7d;PuOjBJa<>~P7g1l!MaD=@4YgVBWa1sd z?M)XEambkEm634C5X*4OzoLf7zE>`&c_Dx5qTC*6{B~~#{^mn#cEo-(sHvkm&2m;4 z$)Qv7xcOLqet4XW@kEZCH*#P=9XoQM zVdcxXSDR}0Kd*+3qL!?m9EK6yC1Klbg9G8u36lBy=sOPGR6mXG;5?1CTPk@>i1Ib~ zFxoF=N36?DNWK<+-!FaFE<{r3QF!LgK&2t45ypZkqOmdq!IX@Myd!@Gia1i}!AS5F ze{t2aTo4lnrI~1@W@|CsIoV66siB|yVuMsD&5n+W6e#C$@UZ?}k#v;Z)pv}!!9~Q_ ztP-xX$EIZoW+xF+GnPya2MT5PzHzN@TsQ!IAd;c6SL|3>cEMVvYi9|RR;9N~x$ zvGYMjn=w6yP^M0N8Z|gSgY&Pj>XuCov=B?OZZkeb8fJpi`kIb~OVkhITRzHeWTtJN zW1i%69(9IF%h)->r0Ron#oU5BD&~ESi87Y5R?dg62y73r_as73>+OPDwHNX zT$E`YN7)#UEp1jZQ5?E8w93PD;)WrimeJg7-ImNv+#I<)$SjaQo`Tk-^^6$rwzx> z>@>rADm<@N)Nkrh=jQ054dx|LpuC6G5xDV~Q0h3fEg&Y~5Xo3kn22H>meVGqGbf?h z9K<0yI0@oII--|^x$!eNwZs)?W;qpCOU5xFhlc=jaD_Sr7#VI>hA1XN?xv|VJU0h= z(A?z*`D?|)0=4L>>~Q&wj>e2n$b`PgvBV<7s!&(VQOF{(fGbXk1OG%SL6I;!9o+zJ znRQDtN1t+<-aC*s*)@vxO!kAKvJVg1g=8Zrh_)UbI(Dl6zl}4O7gvM1j 
zc40Yy8f}`0A)#q{{Jud@)3P(b2aH69Vg8^cq>UEk9aig9Zpd;CUW|r4J3c-}`=`)h z$#{-1O#(6kfwDaU5;Pneba5XGD&a1KGry#Jo4#%ZUO5@hI6bpN!wSea1{zne0x%zp zyViFiX2~E3d_1Imaghvlu&}FgW!#(>gTQ0KsF00#4pTx>xCo~frSjXZ-Ax)XY0SC1 z0QM!DKwizc&Nw#5wUli@~ literal 0 HcmV?d00001 diff --git a/go/mysql/icuregex/internal/icudata/ubidi.icu b/go/mysql/icuregex/internal/icudata/ubidi.icu new file mode 100644 index 0000000000000000000000000000000000000000..bc85f3d35020cc5084ad875edb6b833200db043d GIT binary patch literal 26636 zcmeHQd5l%n89#6Cxy!q69t;daQK2(DP+^!+q3XDnL0r&^U@IR3ND~AM6E?@ZENaM+h}84h+5O2B~)>VrP}kI?|$zr?{4osm&D9H zx%b-w4 z1vb50Kv2T*ftD3)&cH<{F}9=pSTIspHKwcOa_1`NTIYIagLA+0u=BVx?A#i=Kj}Q@ zyy(2F$~Cx$a8$V)t^PzskK<=-c^_@B{y0XPwxGotxa-B)!w!B!d)vL8-Yedl-Y$>(f&YTr=1=mw{9$K?KOnLXcnA6Y{sMoozr|`NdJK3sN&YtfZt11hMP85iPxw#!5BRV6 zZ~EK)_xxQx4}#$NC`MZ_F#zP`K=&PXrblv5M7KHn2ZsgyLRuj4D~mqls5(|rERl2;4PE)j@coOYRnjW&e%J~ZXWyL zSl$NOBjs_`n9JK9Zrjy%*tm9vd<%f%p zRrR{ZbHyiP^e+nCYmxrV;=haU3jTxQC*l~memExV2;2Fj5Ng`aI|bcS$lc)_sp*qP zu=NCUg?`9A&po=T6Yr736Qp!%-wl z^Go4(d$>NlPsk5iDC*v@^Qg2u6aG0uyb!)BxYuLxKQ~r$NBCAOO2y8@eb`itPlUHy z%9AGzvzLfcl2O{5jzxJY?j8-rs+77*`$_>|I z_|jr=Ods2vQ%l>NGfHQdF37~DM~b)pp!8!aZrV!z#KOkMtVgR~_1BbU^Xp32m3|>; zeFNXife*Z&gZlyv^ZnhgY9bGKca~sHS9+n2u}%EmqQ(J{maLyYz083zp4lT{J_}e;9Bqp(`@)X6kcEuP z0bDpcd?#cM^W^S{Uny&c2e`#ze-=C%*}@Zr=X2oQod5Vnl7Ff`s;BAwRydP`USjvT z>;`ru917VcDT57G0Q!Xe5RJ&pn{PER@&hct?& zdm?Tl`(g*x57pS4u0gx5vC*HMoSfC{-Dg)pd}OQ(I>TAvGh9cAjq%m7AT*!n5RXVy z2h6xE?=od^UoXq+!(0iH%IvP7ED&Y3Q0#-zSSTp&3k8LtETpk%Z-gZMpp+?3HdWOt zPCPauMmdUvHI%D0QYNHSVH!0(nv(`)Rkhg`YP|z%S?uQQU9ldz2D%Tj(t=;@t8h!y zVOlMqN#ib=)$R<<7;7hqPz!|*?r_vAOy(hfTr`L|CMI2Fx5La88j%B@qV0BRVNl>rada#SZupwWk_Sm*RiaeVyi zPhRat0fZelJHFD#*~zL?h|u`fjBngL>`L;-&zc9kS2j14F+-NO=~*l3{=@^N`ia(h z>Y8jk#zdS#>z&l&#qqjVt=;YsL{_5~pHD+QR?%!fyFbZQ+DJQ6nw;#LY^jY$cU++G zxlC;og{ZRB&tcbD$^u+kzw%e3&rF|}nJZhuj3Wk6my)YBToQzIZ+o`bue?_X zdZP~=sSNwtAUjdQrO~+f;2*zHfBwL_`UEQOb&9+jTFj&_>uYqV$f`B0!@@M%l)ha#}*r2Z&o$3d#yOr7aL=OxU)+0E=>V^V!J(0t7qX2k$Bc7cv~)?pOV 
z5!IuSPN!hc&aZpK@p_gt-mE<8*B*f z2|W3&t@i62IWp`z$@5URy9ksf;aAnvEQwe zC+0*cY@;Y-)vHlFp=_$Wh(m-a*?gWk~2?hiH*c6{E1q21uxxY)}#Ix`yz0MjB_Okq7hIJpgZ87bj!Q zWKlU`KK1|<=&)L|W{N;m_lH^8u~c=4q}YI?7&^86;dgY{ZGDnlTSKBZoqP4u+|`IG z)2@yDneB{pkIJHy!S*LPtvbR;O6w3u?afUpr#f8;E3&j}HWI8YM z=A;-%p#q4nDlzSZ8j={Q0;?rtT;>bK1Bd?!H_3VFcJRSikaI|8$Ju~Pb0JC@hyq?8 zfFnQD<2-;@(eN4q9aW-GfE$uS*FjvBi!4+X$fVP&p$ed)tU+DJI4UA=lt;w|jgk<7 zp1L-Rhnh@`9+xV{$^fV1siseZVik5IHE-LJB&)eV#35^L%38z0^`0M%$of6345B`@ z={C~hyBm^_>NwrsMCFsyEJ>LO z5w{Zb-V`vp9b%~sBNwZEK}+?%L_Gxx7+ofv5l=-<=b$>Xk~BP4nS4z1O|xOdGz8g_ zbAu&mBd;WTygrW?V&eU6Eu9JKnF`V*twQpQ?AS`CPufJ`b94R;Dp6NXlBfL}XP$4? zmT7VAdh|Xd@wShW%_?nwh0Ee>Pn5H~lpX7!D4+*Eej^kiY#QfP8>?0m=Z9wiW&}Is zLaZs7{&BXXoN3pe85JaqqGBno7OOP1&uKVyJ!hgnJrT#NO!ac`)Jj2nvLr5gx(@o6 zjwq)-OASx-Ci`P-JO`4GbjQ=F-ic&SwCH|uJUnMC{*;tFMt1F$44*xg-y6yI6@Y^~ zjk*ga&o7Twexd(&KzH2zk;AdP%FT zz^8YqKZ=s4-i>75%vvkW6ZOuE)_9HNakXK+K@V2ckT2+$=bUBw-x*X_x8l@1bxzrJ z&OC!_B^{fOr<{NW-rmR#7R4iHW7Fi?$m!@jxd;Q1^2S${(~R^wR1}I$R7Yj=o(M8$1@QS zL62%$8UErb-`&&oS4(Z5blLWY&lyR-aH~jgW|~9U(m2(jG?cO_yy}dy+8m*6y(z<| zOX{*|$(HDujBI2Y0_&_rP;{a+AEMwvd{T_ssH7oJlG7-h6^K`##nbTdwh;cTbk40F zEQO5sQJ*3k=crftDAkjOCJfQ|GcaCDJ~mbP?DIldkh48)*~Di>12k-LUb@!Kk}OlC zWCU2?-_NEGii5Nk3!hP%8D{*OGpbSflRt5x1+KR=jD43TnfgU?It6qVBHbQ;MJXSu zCygp}o_k9kOxCRQjKrH2l9s4xjgPNm$d9OQ1D^D0lyJ7ME~oj&+p=?0Ui8OoI&a#+ zQoF_H*0rXOZi$OW=q#LRPm+>*KzWjA-1(k_&8vs+W$6{ZQ3??tPP4aq2pyF!-dWkoyDD3qsg)6@QrY56uWaG-D_t!|Rr(9ZSGo%8DvaM&>F0M;R`cIf zmhttK5x%*y+IgzN3hz|5w!B+m&dy4g^M0k@`Jl4Q`KZ$0^54p`mR*(AE&r>Gw0vCI z((=;YtmTH8;vMMBE$o(=tJ&t6%L?z!>}RiZuP(gay`}I*_ekNN-CGOWySK8BXDwsB z%yD|0UWa#cUM`nCud{R7w5d~P%qSE}rM|vCR-Dp4ankN>J-xj&r4-7ih;ZU(i=Lid z9&I8hY#wc$v`v$F3gYi;ifzi2(q5(Z_V$w8CW>u$u^rS=?jAR8Oi#4+?b|2H*k>Qs zom7I?sVvi`if_$2l#E;pb6Z<|f9&3K#?P9y-|Pur*=>*MVKBL?@>PG&nQp6Z;|=+M z{l8Y7mFU#8LP3--DxNe+6gyFLu3VnhS$3ShzTTdm0}m`uo7Qgle+h-9-(L2;)6Y0< z`FFm1W<#?RXWfkAl!=pidpmm$X5Q4&lFmXq6J74?J^9pA1TB{er|0x{{F++ zH`z(zzf+mF=v(vW&p-0(i;p_`_(Km}u;3fV9((L@One6{w`1m>aN-dS6*KDeCGNlq 
ToLfYy6FrNu73?fA*jfJvVw-2P literal 0 HcmV?d00001 diff --git a/go/mysql/icuregex/internal/icudata/ucase.icu b/go/mysql/icuregex/internal/icudata/ucase.icu new file mode 100644 index 0000000000000000000000000000000000000000..011e6053f79610d269ecf36ce21c97b189d6e235 GIT binary patch literal 28898 zcmeHQ33wDmzOPC5%=Bain3+Z;FwhBt8qi%p-6h}^HnK)t2||PbYS4g=!s=Vu^^41U z-bZxzY2qpdPZZFI2dJn)T0!GIbB(!ZcsO>o7FAqc6FP&Tm4S`QT2Pfd!ycJg?f><#9Qhe>^3w-XxSm52EJ`A4ZPXI6Xu7Dr$)yf(k_hs)!j^Flv z0KCP!4RDvY)60CCkFs|2?dR+38|XXCcZBaKE)mtFhZWpvyl=ekN|J%I1^2?EG{SSYE=F_@svvKFnR`%EW zX#=%tWe^7SVZaw-ebV=1yd$`6Ifs=Azx4L|H2YRy-wNzofqg5mZw3B8tiafx^o{pn z%DZ*~;-9J2X)}19S=wyvD!i*3v_@@?cDHt~_Mo=Jtl(*F73g)^D}bfmH?()P%MtHm zZJYKbB!AQXsrdrcN+=Kx^a>mjC^8C6h20i++u7~V?lZbS)O~CB0X^z^ z%GTlb{hgEhMD-;Jy|3cU^&7L+PJQ(p;SMgwxNEs9$Qz zuMPY;(8#%40?mQDO?l?HmiGR@BAd_SHrlcw>Y2dGz}mn|(7&F><{eZ1fk{;>$9lI0 zwg;ZYPxihF9EBb4yTCu8RSTDU1BH=7io8FEL=9K^LB7I!P~l;wtXkO`IFfTm7anhu zRx6d*dC04#a53Kvr*iI`!V4T$vl86w!mBv-r^3G^By$Rv`tB~guW%9g#}YD%=j^Rq zb6j9`!eVXVE1)*I+PAw(o$<19AM90(PvO0zaCQ!F7JD6XZKRBT=C+&Hv1hNUk?{!s zn0Hme+aK%^+%G62^))FevGBm)VZmX+;lbmACj_g4rNN28+Tgjt3xb1!q&Wnd!NG%q z7ejMt@Y>*w!CQk@1g{S^1)GCc2j^voD>BU1T7t8+#lfe7E3?{KHRVpN4Ze~TcViN7 zkM*kU3A4+)PH#rGEy3-<-5IiWJ-WMRNpKH3%d!Uw-_E_B@v+|7`20HC{JbhS3ALQ6 zaGytR(j%24=_{O{u+g_mR&#RcPjrWvLcx1jri zBpw!t$@}7Z^%=EO9ps(tZT7DBe(x>wnLdf=$$aBG$r3j{?%U?;4AmGw*vk_$=KU~=4iql{?7e`@AvBavO;jdxTNY9xuoWwrlJ_933(@p^d0C{KI(E$w z@_8eDREJHJIyA+2CXFVev@japgAomn2ok&`xIWlBgnbx;`rOc_5M#Z&5j}~+xg18m zXym=UTV!WTciL?`I4tc^-=nQZWM?2OcHqizL%1#6$+@x!$?7?beAtK{Ra+w+kplcq zM*%B~#-ouR`KP6g~6y=iI&* zkX&emm0g!iGg#6fSdpNexqmv&zFPLEwC~7nmrL3{Pcb)9Z@PHWFIVgyCwZl0W~-Vy zeg$adGPmp&ss2_LqNqYCp{%r8#M6}{oKFUdm5;q!Z8EFKO)Et;2uL#GoP%op-)!=J za^inw1!CS3()=m6Y=X*v-DNzLud3iAEip;E*65n$T2q~^0?O#@D!=05noCI7>cYN` z`;pZ07uTq3Gmu`izSc~fSDkdCNeils@~6*Cn)EG`F1?r3$STBVUbjq^h4e{>-MrJ; z+T|6T>8iJkWcQaeVOhT_4Krft>3pBsUC2Uo`EvrHm3i#8?zEM|U7swK`SSI%#}U=h z8pvpNEz!yKr_YvNlN{5Xrz*PI()%XY@18;WOYClyUTXIfD=AkrIX~?y?#&q^ot4vf 
zZ`WKzOJ5V|lPj_2=d8t@6SERN^wg#1BAt-enNN^=EW6Y#E=5XhB(uj)wa6|>tvZ`e z9nYG&dg=TqgUScZf;vg4i~7~XTS=U5+_KL*Xh(gAzlDT0k#2R9)v5r_#-^#xAeOvmr z_YL$b>lg3W+^?-)XTMUT#-OSjj8>xqR9Blg8}%;E+(#5!oif!FHx#!PcN7EkoSzhsWv2W+7MTVA)3U*RU~9?NXBr}xykHt-SnAcl&dSP zE-xoBBkAIv(^&_dcJs?2e;Od4L76QZro3{+Wu25+)Z^3$nT$v}ao<~zbmVPsPA}3} z_OzqRPSQHWB!2H2`CFCCxrmrlO|2li2&qbI21I3QvJSE#x|Yw8s>rR$xd?0W6A^1O z%tGdrn0#qn^KH_fFtz%UmXh|8K>xD-4gK5u2M#Pd&>8W(1G~~GhVXMnOXb9>(;a4Y zl1Z)tE|L~l#&FJ^?5HJQJLk6``E%Cy8eOLb+P-xWQPS7pWeo=kZ41XTtBfJBlxM@~ zGkI8BB9pW!F~3`~qKfO28|1sGDv~Er_AYp7ZP%pI>3{C*{C+N--L?Zu2c*;G zDvuB77!V)WF))5m$3e7Ky$??JU1M%FQQipt4XV9Ie{goYj_zx_mUz3KXuF>1eDrSd z;|E974vgkQB5Gx61e-HylItbskfH zYzMN%kLx&&ug~V_=G)6v78oUV!J1L+qbh&JSlh2EE23|T{(Xhek$?R7qR~`q$LRR4 zx&Maf2Dw+0zUYLu6Pha6aWYm6w*}Y0(zN@GWur=1s3kXsn$hnab+Q4QL0Tok8vd zVwWIJ)+i}qMV7)wT4v8vV44WZ!@t)o;Jcsrj+4Iqkyb)N(B6ofbc(Q=d&;I$gkD67 zcBfdf35xa&XAF_Agz=9F+&HpZ`lQEEEpw0jJ#!b4X$MN5Bbz4LEqjlAoxNwvW7p^Z z-JqKX_RYQec2InC-FwW4F_XsBk7*ube}gO=vuVtpF}j!FR#|Inr!A>FCH5P49ZBhF+`H};TRz|7d@T0?do6a_ErOej zqRLoq`rKq9LT6|9(ce`rzhDhg|O-{Q%#mv^S z*~lxjH#k$R^n7O0OP!a$9puw&_uAvS=AvHm?v3Bdv!3usuk`swjGZ+0lCg8gj)2Fq zv75&38Qc3*nLioRx{@sV`G`{|nO50W+pZ^X`W+;inX}1zu+9`qi7CiH(9Er7#js>< zRGq`i&B9&gcHbt;+&xKZuhf1X5<5TC1GP>M_{$wr)_wwW_!0A=AyO#gx@w(vxmgJx=l$)BlM7dui;=^|x^|X0 z+H>!EWJiA8%Ir?%T8;G1FS}(e{pC|(y$^&&<)`8b%PdEVc5Lg)Sd{o#V2(j-X3^rG zqT2!{k&zF7ik=kI#>#unCdz}%bw?2)Y2`&e!o*SUT65bQyBbU0V=6C2)+9JlZ`W$H zysT=iTqKjZEX&MPelxVPO3Lk|M^%uXnt+*W5O$V__*Tj8V?~j93FkLgD~3HX(Y48f zw7j@>5(+cPRiyQzpALxG)n%}8Q)P7dporBU_xu1J-mv(KPXkPYJ&Uam`ta63q zdZ%in1>t;~+@qwgjFlY6t$u0i^iB38-fronN#_#aNL;MSQXP(r5mn*E3V2PLm@!hHPbz7}${xnzVpF4K? 
zWY3Xo;q=d(*J+y^DVZ0sl6{hYvQI1J-ja8^%)E?|$x7NtOz*kW<}OXO6A~+*d4P@)fB0% z&`#o-3T+(M__XE99ZI?O3An)}tMI=?6!*V~UBt%mei6H?UxoG}*M(2;u&NDfk)@Ii z^;Bq;pyJ4QnO59?dBJbl@`6+N@FC|o7SPssUi5SWd;vJ55e#T;o^_D5dDa4lG=c%` zWzS2Hz3f>J9MT9<#Pe84IWaK;+Si`lu=(1v3pk_^3~Pf2g|$O4j>RxAVi^4}#w-YH zCt)0dQGxMYX;>=<{s-`97{3BNXhFEwe!$TM?A+1ATa{+T!&VqH`yq$7Dw^Vf#DW`* za_uo#yoz;N3j8hb(@Ft*P5A~?E2!g@6|h+b-A+iB124rjSOxqR_|?F>lqy^o4|@*$ z8t`udw*eo-*L=v~FY@C1Kv9|$li@gKb9BT)h8`}hUdu%h{9kF)- z?~J_%cvtLwz`J7~0NxY(5O7}XBfxuO9|PVO+X6U0wiR$e>=VF+u}=Z-k9`LCKjmSE{p8}d^YwEz!kB70mH)OYGW zZI`|aaJRl2@N4~Rz;E<#0Ke701stYERUfWiPrzf;et_lpizK6Q-TGs-uZ~ITs+bQ~ za1iuot7ie8wD81*;n1pJSW~YLYoB%xYs0TE*QWOKu-may)N)ME@40NKazbJdKZ|L~ z_{1RoVpgb3NDSiZLDwV(@l)A~g7U=hA%1PYn?fu57Gr0+h>`w9MOPLi#$_5kKf1!W zL_#U`aH)h+I*Lmryh+noaRD>%>>JIB;5^_0CRYHi(Bu?wL6cL#g-p&1uA9mEz;!n{ zKe!&mg&g@fXXfLanU8a3KF*o>IA`YLoSBbvW!Tr|c&H;A;aXh=3k8@@|&YAf*XXfLa znU8a3KF*o>I4ASb9e*KmFGKES#__mA1Grz8;I9ic;!Q=`hd^Hg`WoeU&M#3$K_>>? z^Td<&DA*80jF3IXXuQYhEhxNaA=i53T5slh3f^-PxgG)C0JnX&WlNgkXc(SHk z1Z6HOhIq>|=rugG0;YTW-o)qA%A--lQQ(C?aT=Es6i+tAqqnGpB5LZ}buo{+8oS24 z#1Oj*?FeGXc*IeB0m*7MR%Z`ZXOFo$ZJ2Khsv+$L&^Lmc|*Xq0{?%Sc_5>T3PI2C$Xx~Xy6xFIE0f-2s>^FrGuK7*XIK+&=&wM)E5HYuip>&0RBg6pVp$c06wTc z2)IaJ1o)8t5a7f5!+?+Ij{rWZKMJ^5UktcJUjq1;{uton`s094=uZGH)t3T3sXqz$ zl>QXp)B4kZt$HiqGJP4~Gx{@t%k|}e&+5+tuFzKiuGCiouF_WluGUurKBqqi_`Lo+ z;0yW-fG_GV070QP zzY6%8{u_@(|O;8*%rI^C<&*>pCV_v!qtIvpqa(Kz3a1{V4m zpijn_0e;BgWKR~thG;1zJ7GB;w(@htEPN6ek1>n+zE72Tzo#6tynpakct>OW!7KP_ zppV3umf+6^Jrd)5&i@zePJ#U?ru|Ilj>eejt;Fv&Df}Z5ogWyR+mE5*=_L7HP;sUt zs2NPu@CUc=pxJuzrpyu?|VAk zFg6Dsbgsc)mjgv78vK3{DEf4Se~kfO+NBwL!eET{P{v*WMLRQNPbR2VRL{b_IcxF>_fEEy$%^s9_1!wBbSX*-sUp$eG3%rbBt{TMLQu=K2A^` zWwElwAlWG8GcF^$DoH7)CMb_GK0%FAPU93s{uERjukUXOs#W=pQ&ivA#t#PVg^Yaz z8J*l2yB1>>t}A0bvCgw_MH$P-S2BUye zR~XkD3a73#ZZK3%U1j{q@N(*E(ZTLBLt#PA4=Mu(lGHx^IOu|^B(QMFp z1i#U0%r)pN!r0Bm?FOAg82gKHhe2l$#%?k0H0b<+d&9WPptA?AxpB8a=K#j;F&;4J z48Yhtqs5^2KV$bA4;u9DXY4*>kwNc$#^xIj8T8I)Y=QBxLGOFU78;Kj^sZ;@e&bPt 
Y-t>$;hE=2YF#cX4D0=5Gwj0#{01+=r-2eap literal 0 HcmV?d00001 diff --git a/go/mysql/icuregex/internal/icudata/ulayout.icu b/go/mysql/icuregex/internal/icudata/ulayout.icu new file mode 100644 index 0000000000000000000000000000000000000000..598d347cc1e5376fdf8b799223506fe87d5c4f60 GIT binary patch literal 13488 zcmcgy50G6|dH>G2f8YPjF0gUg>@IJ$ASPjpkrtYUh{zO;K`BL18Zkq0(xy=(*3!}# zF%?lJO_f0^kzpWpB4$#iFo+=_Q&T1-L6|~qOZa1fV%9{VHnols(%<);Klj|XZ{Kc+ z_IrEpIp;gy-`{u6z4zUHWAx<7#}SjJWgoigBMQBSjLoTzW@$f8ZzS5ZXzC$S(Rj;9 z6TNxW+KbMi(P&3agS83AQa5!>?Rt8Uj?md^oq7Prt7?f}r+4aK=*7{WM@#g5(c$RK z_)p{Q@pExIIXzjMd?k4@8A{)q{;S@v58-G1&yO#O zS0Ve)#W%%Ufx!ducjIS(#7|_{F4PCrb3o>bGJ z^wjjM^c}$L!|5l}Yf~=Q_36!M??}Iu?oPj#z9_uemhljz`Ouj;Sg)Q9XQyI3Go6*a zqeJff*(%HVxNSd`ZOk@zND*xl&b`@gw2x#9l260g(^l$0b{Ok`=x~g0h_0=C zxw1!a-mA>~mdf2$Vprt}FIzbK7`*_l$@itok;(M+3=lAI^=&N); zJ&n79rh`;fZ&c?iO{;Jm+vhs9wX2QY!riY@J+9xXKd3*YzpB4E<6i2_@RvQ#^L)zn zWp+ODOi>nR3Irp022ycGIgiYaH+I7QRLjtt%A6g?yC;fcq9iFU4S~&G?sIz0*%Mj! z94lXHku-Wm2wiiKYjPlC(1UJ+fL=G1>B48oVeQ!9LkU5E58EcGJO3Pwr|J9~0ef-w z(nr(DG824^;w4E^mo%|_#T9$dHuh$1#tZg@mBBF3;EHsVr1FYV6_7-hL7qIuYNu;88`z!DucF}gJdxQ7BVW`Aj*4=T=8r5 z1gjet2D2Oxi$PXwv5j118SgxC&yirvq|6?cJ};dNe;^x!@8q~!8E|98J{e9pcZ(0! 
z;a(7s6H?~ZnPF==uYRSXN>$=)l;&8PutmiU)Et1UglN#P3H0K)#xaB7&R2}JQ>n0t zEYDIytR+&=_YT>X*t15JiBse_cSlO**#zTPCd0MoK#td&<)SAr@vH!)6vrHCc@z!| zuw|8FDvQ0cjgX~EC;|%RK%v>p^Ja#M*+p**#xTT*GG<(2fJzJv<>t#82W2E-#(EQ* zj)_*{VL>L(m!}u5hK~`mNGZn_Wcywa&dr04SU{bv5r%`}P>u@-M|du% z5+7l;TxI1+j)ff=_Y$1*EDI~97~f=gZ@_F6x^cM4EQbSA;e`>1h&j9N%_swx;g^x5 zzyVk}p2F&Om@}pzJInTE>`%M{nXrg_L7XuThMxfO$8`*GkhFM;abxd7yp>%a?+h%N z_d(}JC-lX%)V>e8jBY^x*gj8Czq(LuQje;JUZy{+(29CZrX+T%zL8O+CD)q z&>@OdziO+~)j5cErCO`5LCm+RJJo}T{8{yqI)eD;=>__?k;SEYwZ0m;d_~`(cOauD z^$+#Wk=NYll;}5+-Nn%r(Vrm48=@`I-N^Kb=!NJI^6ih?@#)C=!gyu87P)VVZ;kH+ z0*}Stk6#83&1585m@G+_Co7UQz+_``OR^ovJd!+}{0R6o(s}8EbaA>Yy)<17tgZ)I z+vM1p?gefK(_f_g=4O7jI9mn`S7lda*JYD5Nq>`V$?nc}WshgiWiMlaA5BgB@{#;S z;$vanmoLfL-tzqY`Q`b?bAC(L`}3K6wfa(id%i$V(p~v?MElC<@%*{`ClbXZ=?Yuy zuZ&hstNd1FLf-rpHs4u!5wjntd>Av#{UoG6Te(qmzE-(M+J`ESSDvpFI;QvE6;*9j zJGtfYQ1#cUX9O}I$j%K$OX;H0sI1$2bfv}R^6Dgg3|1!T)78JK-ctP@{Q&*#)g9Fd zdbs*;)%}?NDUP&ue2p!>p~h$|s(o2c(7CmXYR_Zz`TX+Qm9+`X)>YcE5G>V5S|8mj-gji;MzFW#wj#{UcXS@qwkUszvRUkhuKbWPpqd=ca8 z^V{lwU;hX69})KhJq;dDRQ5sF#SpeEe}oP+4mYAE^Ns#MbG&Kf z+4Gy5OYMm3^6utk&5w#~Xvg(g{@Q%~wDX&qo12rgt+~@%TuOVvnWTNq1JF7w{k%1| zbxMo5i&{%s&h~8D<JJWToixZ&TnxAdd-C}M`b+v-`bYXt-0x!gA39DgQWvS1uEenc$FY63s9jxcykNNc_4-_W zx!$0+=w13*UEH%~(W?{9^)w6lS%|=72MrH*Fx+sesGI3%wbo+nv_JE6yin+#iiv%y6Cs#iK>y%VvEc6-$S%K<}{p`_D0 zc%C#3Q{kN1vvCfi@<)U4D1NMS=)31JA0`|1D`2kfuh%Jfty(xo{*1wlnX_jQ-rif> zB?e^9j>^=Txu4~)f`fyDLFy*$nQ`y*?s2!aEgs!)m!kO)@IRj8IbTkh-AeMNtPqqM z*^{faq%E1VQ-+<-IEu`r7KMuf_JU`6yxz4qx1Koy=OAni*Z~Ln2KP?yh{Xm6TO3|* z?>2Xn_bj}3RK1ILlUXv=`+PM5p=!VTno?ydqr z!R+U$E_o$6;uU23ke9Nf{2-EbV7iiPh&{(skh3+rmSMIuBsR=|+9RN<9_mr#!e!Xw zbZjZ7i@*tBK6Kx%;j$s_VUOLDF0V1R7rN@!FK{V(xC9R4N+#rDaxlAb*OoOLI@0Q~ zc3bNqd=r$fm#u=R=uY1=Axw8KGqb(UxY9c^Vd;^vfuR~G39`jKv5xdulBGB^2%w-z z(w4GHr?`3XJPR`9pk$=5H8R3$3x*1KV7c8kOT~oB<1lEhX=!Xiae-qaurouz^TZQ} z`>MmkLjxWbg^KT_4(M2 z4(7(&Mb(XAzkpCcAA~Cw04%}l3kzLY5lK-B-jGak!<1f7JsI;J;C<`UunW>Q;@!ZR 
z@Rl_2xKOZF=Z%D`yYy5eQet;(z?(NgxHHyav7^PMGfg8%lD)Z2jO*PPt8q7Duv1upCl^k8)T~%L&w&VGexUQ#1SNlWkV-mX zw0_;QKa9P7?u_C^vcxfr3Q}7$Pmd1rPrN{}+{>Y3?^+H7e=7>ya97mXM0?Pz7hD30 z=K)lTVKhbnBc3*l8{FW9G#=WRXBRAZVqXlJ(^1Dc(GN!}2<55V051`d!qlGRE z7?Y-jT?@ChjfPb@oWMV?h*1}{lMF!MdnD87pesi(NrXjj=H)C96*KV!(V5NIAX$|Bq%3Qg1j!S7)erRuB$KF^84#~@mUcAvOXtTN|(@0fh2by z>lpv)hK=at=q=GZqd$tSi*AqZ$A~{q`}URSSM%xg)A{!N5?V!9(RDy(E8R=G0YL9x zSDN2U@`~v#H}iaNtou~&Piuzx-r9Yg%ei+jjEeU+ljzs!MjBD`)pok0BpRv3@VZRB zPhFvE>J!%9hK@+qLg%L?y>PGzS~q!(2&3aMI!Ig9J*p`dAF_6fc#B?W$@PobK6OG5 z-B7Mi%~e&%{Sp|> zsdJ8KhyEu-`0qF-_0Kxy`M*(9HF|TpMbKL#+$SdUndE!Tm$B<^@06&U)a-fvYJ*(+ zw&MC{KD+U~bU*Hi$E!Meby}o;7yps?YmB~uV~;wZYI^(_KZjxWZMsg=*)g7{pC8vh zgN2o_U_RIQUY#~e{}nsqBuvZx?hRzeHQ6l7a#Ke;r)2z6R}W=P6K5xHuS77er;G8R z_}WdJV=oM*Loek2D9E-P$81NvR_Oo7Ttu>QaSAqY6a|A)6w~pY&`k4A4sCZIa9tv= zAmU;k^M;GqT~`v&Y;J@rWBc;LR-QC^Y$?i_=nh>9Wx3brV= zVqrJmb)RPzf8YDQzyE(Pc4wb?o^$SV_c^l`>fWtL)ye;Yb!VM?)a)R>i&H;qNamMw zr|Wcgo~P5j;0e{~qIJ4|5B;D27pY6u=_i3&*w$qFeAX$CG4)Stzl1u?Ff54?1Qk6!w!V~683jkXn0I`T6lhVMR;@ggm81XJA5?! 
zsPLKLv%=2^zaV^J_|ovJ!qCq$eQadyP~h$|x2Mr?@K9I-Xx$%tJMZ$^9=@m0ic z5r-n8B9kJsBTFKyBU>Z8BaM;HNMGa;kuxGsk328(lE}*=uZ+Ata%1F{$Oj`IkK7)) zJMx{#4ZRy8}mra(=o5cycP3N z%vUkL#{3%-78@Iz7Ml}W99tFJ5<4N*66=oj#ZHQy7JF>$$+73eUKqO|c3JG&*bT9_ z$377IRP2t}U9o#(_r-n|`%UaGvH!$|#>K>C#TCa@#x=!t#hKz9aU*fl;*N`(6?bOb z1#$D^mc*@&TNk%6?vA(z;+~9qA#QivTXFm1zK;7L?w7bf+V3;@jf8 z7 zZ^GjV+Y??)csb#%g#8I$C;XJ~M?z3yRAN$MR$@_NWnx2OSE4b|nK+VobmH-eCnuhr zcv0e_#3hNV64xf)khn4Nw!|%oTN58o+?Kc_@#Vz5iSH+VoOmGd*TkTt=%mD?%%tL^ zLx*aU+LL;d%t@{!Z_?zX=}9w^PD(mG>B6LiNvo39B&|!@lyqCtJxN=Wo=)1C^m@`e zNgpR2NIIDGTT*CpY;sC+VRA)sZE|yRXR;~Tl{_u^_~g@)&rQBKc|r1J$;*@1B(F=} zl)O3lp5(2`k0ozUeku8l6DiwMcBH(K@@~q9DW9esNclD8-<0sw_|){& zg4EK~y43d6?o?B%D|J%pajCOYPfwkbIyZG;>J_Q0Q?F0mlzM0CL#fZCzL2^*_5IW@ zQ-4bRJ@uc|khIve#I&rmvb4IiwzLUp<}`QOXxj9&nQ5n`%}Kj3ZC=`363;lD<9t<@9&bKTQ85{mb-&>3^mNWh7?gWt3#p zWHe`VXBaaa8QzR38OLPI$~Zma+>DDdF3GqwcsApu zjMprd$-F!B z;mmEBJ2PL+d^_`l%#Sk!+BbFweaUXZ;cdwKSn>~-0+%*)Ix z%&W|6%s-dlO^=k3q?Jnul>!Ms27Lh_^X6Y?|jOY^JqTk^Z|4f)o5U;ecG8TlvWpPoM_ ze{TNb{AKx9X2y*A;Fk zyuI+A!p91qFMPG|t-=orKPx;?c(Cx#!r-FlqU55?qWq%LqUxf?qV}TxB5Tob(MZwM zq8UZAi_R*#sAzG~ilS?ZHWb}a^ia{$Mb8zzQuIdA+eIG~eO~la(Qifn7KIhZ7N-+QT>D~hRkc-3sIpg0tQxC2vTA14$yH}pomX{9)fH7& zRb5lHp=xv0165B{ZLiu@wYTcMs!ywasQSAqvO2Z8u)3d(w4y%c-iLc41 z$*n1>DX(d$>8$Ck8K^PUSZka$!!^E|DK$seoKSOe%~>@U)Xb~7v}S3|s+y~7Zm8K< zb6d@pnulw)*X*p>Tk}!Pftp`yf@;HR6Kiv7%WLauJ8O-#uG+EM8MUX>o>x1s_R`wr zwO7_&Tf4sYmf9_~_tid8`)uu7wfky6tNo$&m)bvSb#>u&@pb8S`E})W^>wXvy>+HK zXPvKZYTb;wS#@XDT~N26?uxqAb=TJ2RCin5J#`P)Jze)g-D`Dk*1cc%aov}7-_`wG z_h+51KD<7*KDj=#zOcTszP`S-zPo;)ez4wEKT>~0{c-gt*PmH`Uj6*~OY4`_uc}{D ze{KDy`n&2Mu79R}SN)syAJu{F4b=^;4V?|W4Fe6P23NzRhNBy1 zHq2@`t>OHJxeW^&E^k=gu)5)fhK&ukHEd~ksNwO3=Nn#Xc&%Y?!-ox@HXJ(iRl|1; zzc>8dplb|mOl>S}tZb}pY;0_8>}>399B4E(S{vPsBaPDC zQ(u$0$=)=vX;Rblrem86*ivW1EwkvziN<%bIJNo0>bC`^ViKkHUHhLYYA912Y)Ng&Y{_k@ zY3Xh;wm4h-El0G>Y&o@MPRsn3MJ-pg+|Y7s%bhI`wmjLgqh(ji-j;nWpR|0_@=MFV zEg`M3t!b_KttG9Mtxc_MtzE5_R!6J5bz195t#euzw_eq{vGv~8r&_nS?r43f_0`r7 
zS`W1T(HhZ~+Lqf^+E&}v)YjhC(`IaQv`uO|w(Z2WbJ{LwThMk{+sd{Z+BUb{)AnH7 zqixT$?P%N6_I}$}Z3o+aZTqWD*B;g$-JaZ@(Vp90)n4D;(mv2`ZuhiLZ=cmZr~RV# zCGBh5Z)(4}{f_pB+8=M<*8Y6^EA4Nzzt{dr`+@eK+W%?~>4@n_>B#9Q>8R;w>FDV& zb~rk`9aB1v={Tw5jE+Nx&hNOSV@b#Ij;lLv?AX|`x#RARhdQ3>*xvC{#~U5*b$rtC zMaTCYzjYky2VeVtP~kLo<3^R&)6o%1>`>%6LS zedm_WM>?PDe5rGH=Ubg0bbiwLRp|LKnEN$5%G$>_=JDe0-|Y3u3k>F+W1*m_(&-kwQ4(|V5Xnb|X|=d_*+ zdKUMr>RH#bspsyVM|+;@d9CNYp3i#@_Wa%x(i`8K)mzwG(Oci!*4x`_?6vn!?48s* zz4y4@*}Z4>p5HsacWLkH-Wz-G=zXyF>E4%m-|XGr`%Uj}y}=XWCS*=1nNUBWYl3Nl zd&1-i$4)qP!ub;xO;|bM`U$sAxNpJ}6JD6`#)N$nzMAm!guf;P_eJ$3_GR?t_m%b4 z_OCw6+`h$q%lfYByS{H@-yMDT_C4J9RNwP`FZb=~ zd#~@~zOVXz==-hj-@dT^*#4CM?Ea$u%KnD__WlX|rhZ4ir+=&;|KC5e|K$F&`Y-6e zr2q2%mHlh`Z|uLNe@p)Z{g3rO-T!R=3;nzLclYn@f46^M|0n%l_J7;|WB+gcfA$K4O~93 zdSLCq^#ivI+%a(1z0SCs z`lI!;^k?hm>Mz%?*59DNO@E*MasBi9*Yxk|Kh=My|6Lz!h&7}e3JukURzsh`YVa7Q z8jdrZYBYGvmxdn=e;dM$iN7eO1(?6yVbCfy3oMz537nv)}b>>Sh&+6TRZQwNV7JbCb(!MTH%4XzrzZgA7!mcc`Zwhlfu_`=}s z!FLBg8T@AOm%)DrBP@xQEK9Ma#?oq;U>UTyEt4!qSx&T^VY$Gv(6Y>OwdE$u?Uws2 zk6E6z?6kaQdDHT~s=j zWj)<`uJt180_)}071lM@8?Col@3lT^-EQ4w-D}-v{lfZ#^$%;XEyk8=%d?f+8f=|5 zz0Gd(+NRo$wViA`$2Qk?nQfKrI@>1O7TZ?aQ?~849kyMz*KB)iZ`@oI4d#XLto@+0%m)ooD4fa-hr+tFmV7J&E_F=ovKFL1Kew6)K z`-%2b>}S}|v0q@HXJ2GrVqamu%6_f=M*Bwlt@h3Kd+ZO`AF)4S-)4W#{-XUA`|I{M z?eEz?w0~;<%Kn}GC;RXAf9%1I2uG|V$&v2JaTGer9Mz5nN2{aD(dRHatd1eaM8}9@ zietKChU0k0|Fi@7)@kcg*S38=U4soz-UcxGm1HVJoi=>7C9(ol!q*^Xd6! 
zNbj?nfz2pC8Qnv}d~dKhZN4!swpm=f=y#cIgKp1Ip>EXX;~qAX*=_OYhplxwvt?}9 zXVc47R=r1WfHtF7#yVtht6{nHCUrYZ z*5`JrZ+&`~#qTTBnQRst|1d^_&82SUv|02%zlSNBeO8-$Se`cG9yQCI><%X9_Xg%Q z=CL`QYA{BRZJ4`Q<)&Qc@(&qAT27nGtmj{!ZOF`k4EiBI^R+TiJr8l|3w6U*z1Li) zb663a%-7~II?bj+ore)C;aNNw7t%ql@}w~vw{bhUVnpw;Ard$55;5uwW0UnR#KE|| z6a9J_zt`+E8+~qs=kj~4etE%T8{}qQH`0{H47p9VK^Bq++ALPzsNQ2@oa%c-yRj+NbywTv%Gk2cB%CbBR!lC!+9coU4HkT~B$vnc37KX`S&7OcVIL(6y zYG_RF8WPEP^)7E6!*E#w>*BCE^bS978FG)9ht!gyI%Xvu76`TA)&{5Eh)^tMx5uJZ z6A7S42$2!#Js$UHp-u_OXhlgKW?q%&2Bv3m>g8z+=zp|Meqi{PNj%!fEtF8~jLT~V z498|N$$Zt_b;E9($XZ_4JN09Hv95Y2j~mjK&E(OK@_U47@((-ROqbsqm=cSz^Li4q{X*&9h8ODXe;%!|Qi&#jxLH^!Wq&Y3E6SQJCDLuD~)Jw(Wpq z4nJIZ6usZ0_gZ}(Y{Y8wphjXXNJn(lYBqSxEU{+bNYv$JATIRE>yzac`$N-_#h`c( zWg{*#R&5($ibG~I)ALy|ESZT5`EzUk@UYxk??L)XCc}EGepGy$-(Y98Wqvl7R~b4t z6jAuBZa?~K!g_fe>gc!QoqWvRXJ+6yGoEADV+BgIjL?K-Xa+GT@(J9BK{{|a+`!Eh zqEa3Q;|Yk#feUma39HCm+@Xol8qBPnQ{?RQi;gKvtE)#&$l zl}IrhTrA`A;c=7`$C#{slYSI8#%NguhuNp|nDGN%k)+8zWOKRsjeFQV^){4#od{>SI>2tAI4VP-eJ8_Gz2fNN0r1i`-cXdA`zuuq5?7~ zE6yIBM4TS83F&ymlJ!Q9d)VrB$z!a{hp9TiuQyu#4A`jm8}(kL!>AAo&;5tp7Fm}e zQEtQ{9$Lo4d~sRa9C;!Mugxnuj#S0*ssSKr30cbPiJ6P__{^;7kRD4H>-2~^$N;#L z7hN1zHd*9RZp{!3YA_bR%dSk#X&z=${KjcueloW~o0nA>W(4wF&0x`Ghe*%u^SO}& z-+0hw)YD_;dBilYc@j~9kniy;F?-xrlt3=w?v5e;^}79PElkQi|Cg}HZ8c9JgD?{j z#|M2lP+?0-7x4m4{Rs1Q=*d#}*EfoKX^y9k5xc$rBUt^2*{g-|v0;x*jLP6fTQJ&T zx5R1UlNL0H`pnpkw1#~85szOi*(_R*C-$IBL;5iZ`d;kCrNb|2t0|`I)06?Hq`4v- zmrEiz@^>o#?)JF!vU=E`+b*71@09qA$YdZxlM#yq2njDPf&*e576ovbNBK=aiA5in z3h#CJJ=mMutG9?HSdoORgx+jXD#^%fUJO{2%C5ZXpmGO;Zt-STx7$Gy;5AA(^XQZZ z;wHACVdvoy~V9C@Jf2zoEF zvw0mzPNb}!paKz+BRt`=xE;^Xlvr~Yep#tiHIE$u@Z> zxsD!;ACvqL~J5gAfx+FBBMdnFHtR-NBWczNBkLVrxh}v36EzlUk zoZF)tv5E6^>OGPs>0E6EW$WoNgA$nqgdfcrNzk@W4oBrpfMIIz$43t-qiDbk+ zJp&V`r#mk@|+l%k|*)SSE~_h9vi_xTIpDc(MXGUUGcG1Hbl|ErGmZq91GGd&Y)ExYfdYDG1a{q!ki#WYGL0 zH==Tpyg+YtV14<3{Q*w?#{FnFm#jMCxI$ax$U=N;n$C zEC`3P0$llDT8K*imq~cZbCtMU%2DEaaUbJJr*1Mit8N%88pbV;y3sokSU^rHQPn(0 
zAjoRb6Zd5vUq}Y37ShM0Ec#}V*dTsJJQQNI5s@u=>{H4e+=p2lqVl~W2M5=&Ak5@& z5R{2JB#w*s#kb+Ku_&)(CfJh=|AyKrDLcnBH)Bvy42|-8oMh!VS86k{E3a4CxQbJ_ zQio=OKCxo-LkuY3V8wAT3J=pV*}RBYtv;1Eo=cQOKg|6OhjEbvIoF_Df)EdtyjKER z9m>rp163a}RCz10Vl`hW&&|#;O&M|b$iYE#W2gm1XPGBI4aoXTWahN`VF z%fmX4Nsefibbafn`V2J~7Ln(L?DnqHNwJ5N7LMowAKO}C=VH-0O zqr5h^6ih@3v`<<6xSOQ17|;C!xhubR521TTd;y&haVR7)qhi)(e35dCMz=pu9EwMt zB-iM%8ph?u=}L|?tVdT+G}Sbb3eGAos0t3ZXi>38+&D>s2N@d)>SFK`Or_DFR7;eO zM;eg`E8{`vUfU#{6h30bxC%mxNSACR5Of_b&1fZw_7CYu#Kmumc8ga;m|QB; zA;9rsAbC(KhINx+icP2=|0jltBXQuf@okJsR70*JEd1{cv`kTr-RPmgBjIpJMFZB$ zImR7$AeH{8wy;d!g9}lCO4%!giOa9W7p+icc;r*)qzYbvg+qRmYVIW_3*$LrboOkzaz2@ z@y`&kK@t;}-s_;ZCZqCdUfP3>DHm~AMTw$X<|ld`XER@=v4qRT3d7Ua0fMZ{Y6vK|B>gC2tyKVZ@J!jCqF`4!<{=1JOVg6IMKp+io-9 zYnAXtG_=y>EyFlIze|Nw$)e@fbi_=4uS`z)bHcWoljd6_Rnf9K&1a&Q!>obSxOB9} zo3vi(I(d9v0)I0O9d{xF6VJ)eWWq!qw)md7!53PprrL{y@5a! z8qe5tlWmsClB&~~B%u}wQ}y#S*1+U95@&EkKAX?4888mOiI=C8B2!Zxt$;Q&6fGe$ z>r7*KM+p|<93}R8Y0HQd5H;DB$}O~{Txw9&(I#QS+$e{*gzKdI(Xr(qu1AZ=VwhgB zcD1cBOcNBo+(J&Fk;yc?vIJrJ;;i2oh1Fz?R283Q+T&yt2uxLNQUG%e5};D%8`X#7QW~c$bDeHb zp2A*~%!140LC8|P7dD1Oumw~Fl@a}uUcE}BbrO#psI7z}>UY)KgUhiE!+eyh3?1 zS#l9N3*j<5a91>mL`_YYIzf}Y8J{Lo5hp*Ol2(<2xK$ef$KvwiYfLgL<)`W3>I_UO zAVZvoRu>^0hCO1VQj$E4j0k=6Nef-&n&O5e*!_5?8Y~hBn6S_xdTmccCpR!yFg=zcaNvKQp-wW z$y%XoG-T!HZPM>y|w=l{boB8;~~2h|j~-i5HUW*TNPg4#P|8 z!b8zT{SaF2H0#l47LH7W8?hIP#nCKB-b`2Z5mf>p>!$o1uS?=8%{8@{N|u4-pY|DV z=iU5FzNxn3w6>;NcG-%O(92#4MKS8FWFx<#mUb|tz;2U+%8~hs60j0v=;ETWW_gRs zYa~%H%PJX&m`n|y>~fG-i{HT8%I5L^id|Z`2CG#%1BsiDpx63wNe8^RIO);@M`Z=_2tY%q_te)bW6k>l#OU=VW;Yri@J(59Q1qw zn+uGD2qrQbZ%g3sRf92*Wz##OUm-T4GGPQuF)fNWnHMFLiJY}Wn0TS2$GX$FU<_6p z20E;I0V?UDTSrGy@^faW$}f4aco}rEPU;D@?;}en*@-;Rp!E znlt0)s_`z6T)>0LjbxccWyBXCTcqx#SAb6!2QexNhQ+wabHoO?z3P&9bUvaInnatH zn@U;}XpqU=hGjh17{3(3iPdT9f|sCnuE%DQsVZ;&n@eFX{oI4Y;t;P+@&fe)hE zV7X-LU8NNwPZ9^0N;KI%r4flLt9F7zJ~^q;Vn^E? 
z&BC!9)@M+AUE->2z~D5^4!KTslf2Ro!KN6+Fzr+ZIA~+%m!H+DAOzx%YzXtOws(xm zsftaLH*1N59$~74Z+2DH1SO~D`Irhn8$33Xv`s0g=^JszMKzv?qx>wdk1A)5Z4i^j zs|beYB|f2qlHMNH5s?>3f|NVTv57pyVzkmpM~{nagE;1tc{5q5=VeNai=#K>3EWm{ z%JCx92 zdQ8lXhV8h1O39-#Jdvhy*HWdh$HCng@E~VYSQuZGZFFGL%tfyo?|s}<2` z*@JS=S}EW)v#ntuFGe#+@e!fNk1fd9@e5i5N0f;+o#e>m1*-g`Ju9^z6Om?8v*@^| zSpo|tY1Ow#9CI-e83n7VLW_P>JTpU{EHOyhxcnY1Ry$23Fk@mcLl({QTwIFi5*4i0XJGZp$9Xie>peSCWbyTkv$Evr8%xtt37^g~6*^!ha zSPJCBwKB#=*r!#3caT(A9WwDMH!rFeXcAL^5I)2V<-M$ngnciD>~d&<28E?zumsxPvehODyKXYBR?{T1T`f%_gpzLKs~I;(F07~y z4xI(0(BzyHnYUE!fw(-Rm5g$LgpsNIL+hDom8NK&A1}b+6qk4nw!bAHINTKC@~<*x z(OJn&ES$rV-?A@9??bO+?pC!0%)F(4fS!=z$w-FAkQRHQS|TS2zNCJvzZBg(&Ll2| zrID5fGREp_vxIa;z}Z8V z!fiy!oU*e%$mv9RCD1UF2#601coo?el6$D$q{q#&p?|6f7(WH6elvb~puwp{Bvor_ zo2i6NvrWzBa2H*Q!LR)QxJ0rbZI1-tNfC~gtFN}1?5!d5zOTN>ZBYYNm4S0)24O+ynbY= zDt&HC+N-uXd1GJ?UpXAvu_4;xcd(Xm??zs)1};OFHxe*$s$vdW5{LzxByb6=xYmEGlsgbEm z9%w?XZ5~t~DVj-9Bo}Ov;o|*Vq=ej#4h*+yyNK&4s9_;z49yDJBf0&`Jj-0?_8-EP`Z4UUp;=+jy){ zkJRdan`>AEO5Q4kk;*SrOHJC)_zm-rI4BA1WC4SNfg*-e2lT#W7+Tw2wE!78>o12N zNWtXy43g|r>Kx*-dWx4sMB4eIa-2cz3SAuH_vpQ{dUcXahRcZ~Gp}n6LyQAOmrj*d zAk}HfwipH#0Y$#-HqHQvVn@Hp&?n)exjdf{r zp^-9M+^1?@s)IX>Cg4FhBCJLiIFKweMla-@L&G*2-mVsjv(iB_YgGs4A!o{bdOsBd zdXGhsT;p)4NK|&sJ`SVNnvK}HEZOREX#i+5odUYC0m!{;!o2vc011W9rDeu{tWv-xn!#rVWwTBJe530X>1 za%HVllC!Sj2Lth04rwVZ7hU3QL>Z}Ej&MKR605fdmiZFpu#DWgFBgV%c)sDIoDN<|*x88LVf~|4i-2Na2{2-mS+xStGU=p9!$M+- zwB==qTs%Qbj%2rhTrJ>y@CvvM$=UtfMbALlQIvHUWW^AQe1d@hOX{VwsPbP`WLoD0 zaiTgrLv?^9Tv2gjCCzdiN;VwD!!azUUk>14AkxK^$+6EdqFEK^f~1^6x3uGHIMIl_{T}gNh=|QP zWx#+A&NP{=0yBHdVigF1 zHBiYRem4-x*d#FcrA{8-lawt>wRwr_q9q}oR*Et9?L4d~hm2KkOg@4lzY?ugCn?2? 
zGB(OdAN7nfKvkC!QzSYW<)j;$PRvz`A}^~>j>tC<71*8P1}eGXI^3t8;|?-$r&dZ) zEpko;YqO^zw;WK9~X zm#K0vd>pqC5m0?n{7S1(jyL>IPuoEL;STh|2s44bYHbIFERsBb{H!7;Ox&zA)1)>D zIB7~|0-Z}dg&hAQ7nT89NwB0AaY!$SOpaUxPsHz~G(-`#L|JWVp=k1v4OUah7#ZO4 z*mG3b3d5FSUGw|YLL{Vg0;zqZs8A=gd9+1#za8Sq#URjN>0^j>NGYro8Cwg~8jM|X z9xo>v&=7jOvc`xDQL7Dpu^X-GVB=QmF{hm>+|CJ78He-=0-3mvYzONn6QZVOMI_;5 zOM`Ztp99pqqCAk+g;g^Xp~$H_HonyWnV(WcrZ35o^!2DQ#dApfk#4FAzBpa&ti39s zC21AQl*C4>vS|^>ooy11+2tA;e-^8v|^=4>lzFOO#Z_&5|%MLrHCD)R%}AD z70P`5pgO@O0nH)5l#hTQ;c*{AA>e0$CEHeNk6IF4B_WYUGs9%=qz4kcSVPI<`HTyz z82ETmAV2lWK|QnVk4>IPUy^?f0UxEgettI7ZxDCrr`yXov{F!3Px^6$P^qy|1G|~_ zE0(3_862QeW5T7WEiGond^F#OnvCyN;jWkgb8wL;;nh`UL97B4{;EP->N-Jt*fGC7UCi#TKY--wdxt8M#a%NSp{vIQyGEk z^2jUFFjU?Tn<5GXb`w;)L~B{#Gte|?=MC$t=r0Yq^y=&d5Fdk zJMpRoJf5&B%jT>U&u2}={?+aw-C}WAB*;EKF{N^0(`4cg&fJXN^N}G&D7EPL`B%w6 zon##12at>^qwFg0vC}p3NY|UZl6DsFBfJsrOvp_+W|Oq+kf3xJN6{j2B76{1=Ba1X zUq0H&Iy-a@(M-vEw0%be%XtXL7=}Qp;o>?QC6{)bQR*4m(pZE+1vQLJK9QyQ9_^Qh zD;tI~M!Y4~@JKl@|} z+T&UPV>_7@VO@zV>cj{)JiOl`eMpv2HHR=ef;(+;BrHc;)fp#AweVyrqBHejmRphp z{JpAY@T!blELL(a<|5%=GG$fJvw14HFP~z2FrD!|+{2ZoS33%>87(Emc#d|s zIK%;u;dr?RvS4cg&!t)!9A)`$H-aZ)wmgj#)MzDcBPk_pE&VyXT~UW$6kAjdL|Sfv z4=8J?rFvDSNyL)$;PBQnGIe7zlx*k-layP_(NMkG15zTQc3tEF45{8WEg=S znA#@6Fyves3q47NHtA8Khw=$FoPrUfpm9m}gbzWBJJF5;$tJW4W+;f7p15Lrj~J&IX#N_UWknbLe|b1Eg*f*-od5~K2RwwA0^4(!#T z-KyiKo&N}=B2pu=y@RkgAV?-mB*jI@xj;rOcOlusZ%WyZkmQ3H6J;Mt$~4v9^!n%p zQK4DrIgROQB;bC`jFBo{Jeyn97Pv5{uC=ubbCT)^YXKqw+73S1;}r)nNxG!G#1~z6 z@mEd~Y?c}s5GAQQa10nKRpXdmb}XgzQ;kfqYR(vmccUv~VQ8GyQ2I$)DQd!NXmw!! 
z77}_yXCVl(%q4&+QIemUM;hg zqD(2AdakUV*GE~;r`vd&b|%m~#w{gz6P=}3&8Ws()v}m`9t@X=sCEd=h)SAeG=z9v zL_#X7en&e6|0|lN9Tt)%3r9vsibRX#(;HUe))2DQKIO>aI$1ci?-w}5!Wh(cJTXUF z7JO(}4$DZ6%|T|l3(6Zf1gW)eC3WL3BFHgkn%S6(QIoIc+mveJAG9MvJswWqj9xZYSlZB&C#7f#wOT5;&?L>9bY=%KZqMs)xwY z6*=+A+}MbbkArf*@hVkTjpLT;PeX73N1bg{Dj3*Fms(dlHpg};<}Yh=xKoT=oE)4K zxu_3`DUBKbY;_>l81>z^ju{kj% zS~N(Wt96SEZiz%Hxuf&;X(6={8kDK zx{c@y=@_OU-*E@2n$RmP{HqnjD7jVv?D%JPRf9K&4|U>Jd!Hn-Pfgo!IxKB4T$RS10~X zDm20#Ly|)w+L;IX*-TMNZBc)z;0=^n43 zG359x3TNf;qv~KXyCB*}J!CJM{Un;GssQP!&iIp{?V_YC-Bn=Xw$K#8*1x@A9O*Eg+f{TW$ zofZyDB2Vdpp(K3jdyqThC5b*{CTQIlTLFA@Tg;Zt5!DD}cUAMSR5ZMfNxCIbk%LL> zPmntDM+E4Sv9W>qOU9@k#DNMUr>|uv&h1m1w;c19P$A>g&Sc@}$Ur3mu$8Oz6aw#J zIiy!87prccw3KNI(PtIm5MYSf;>qbjsT)dREvZ|ew@ebCbwh9nT1SI#0)O5kP(YxH zC<7f&%rJ29oGFje%H)ptY3+~jXh#Pn&d8rOpnzaJnq}cMR=!eAZVB2WtD#>d`bPL)>KaLAs5>uw$M4vprV!pKrvG8;3HVj z>tp-LM%a{CgLlM{yZK2*<0hA8qC9X!H6?i9ki-|gsRbL7dk(TPoPVGbfcr(*nh_|u zOQ!(mX<~hH8u%;8J7%$5>nt_390a(#5DL@Qab!8IRkD@ z$rPq$!LhV#n4O*jKTXet>oW47D>EM^Wfj2pvI^mb>>_wvPBBc)ErIXmmcqyL%Ha0= za`9L!3HKFM!Pkqc;nO8G@Ql)0*i%*qd&}$LtcnI0U)c!fRyM&)tD50xbqfrs zX@v)B+F(pwJ4~+cfawjL(Ad}o?M>aVp}7Yh)6xsSX`KL*+WX)m?fr0e#{fK|Qx6SY z2AJDzgr9YrVQtSK9PYKi>IqgD-DiW>_t{}%zXN{W?}W<-hF}EiuhU(icf)rnl61OF zhKVrG=z*&^H?Grt!QoDw&dQ&C(CMBtjX;BW6t3U^icXg>I0mmDoD5qnQ{V>6R9J7F z1~0Q70YA6$A8F9l+m3|mY)8RT`_b@b`!O)WF#~RM9191X$H5)WnXr85czFKM2{71o zBD~gh66|)*g73Iz!w-f}hWjU;0^j$X3hmz0;HAFP;bs0a;nI<_;H=TJp>fi=@SjO@ z;3Jdy&qnC>PB|YQHT41*I_*OE`?QPTwj<`k)+6V^=Z~BZXCHM5>^yn_Jo=c0aQciz zF#p)aaMQ7u!lC0XgWn%_Is9$r5*T&D74VJ|mco-yTn4L8S`KxyR)Ftkt%NIQuY&C- zUkP74c{M!dl&j$1r>uc%PrVwpoVFIed)hVdoYSv`g=bs`pE=`t=s9y8EII22c-L7s zLhIQ#!98cMhpW%o08cu1BQ($11dGnQ8HS#J3w-bVTj9nFZi5RiydBQI=ngpf;>|FB z?w#+rs%cWc4s>>dNNlPAv zUoCkAPQKz%xc`dBpl9jh@R6lYK-aP-Vaf8Rpl!v|@T(Qu;Dsxnfl;fr!#7qv3op9z zIr#YM=i#!ec0m7{7hvqwFT&@q-U&}$`w}d?W*6Le&CBqoYhQuc*S`wCx&Afy!@AvY z;|;IFV?TWNj*sEpn?Hf4-T5h8yyY`E^RCa~k#~OqPr2tyxaHoj;L`iPhL-ye zz+dkF2I?OC7MQnw2U{Qd9tJ=B13d8XL74UEk8t3zpWs=K{|xVZ;urYblfS~hp85?s 
zxBU*6Kl2BCZ2O-u<+;CL`tyIo?j8TYe_!|)PTqM4+FsHHq364TVB*Wc@P(H{;Hp=| z-A{Wp4F2_MIE>pJ0ngqY34>mbf=O>g!#CcDfp_nTg{SR}lh@yjhrhm=0PlY*5nlLq z5}fi*GE96o1$Mlb3a7oF1|1)y!`6KnFz>@mxay-U=-QtRFZ(zLKKV&5{Or>_SoC>5 zO!%SzR(x3qN4_e8>%K0A-+WyHuQ*T&bH6ErQQwxsN4~9qCwx~4GrzBbo*$~=?FVaM z&`-6HKQ#?weyN8a{L%oQ{IwCT|E&og{d+T<_D2iM|Fab~{M80){%(g||8zj-znyUY zp)Lektm}p;K|L@pxEHpBOn_aXeQ+$SA9jQfz`_VU{4~M7gl85G7^ zgmG4RJ>CYdOt8c4i4Ith?1ZaRhTzqyF8D{98~&0$3_~*~!qzMgoSf~2qd7jfEY}bH z`6KX(f>C&X;UxG?(HMNZc(S}+G6gO!oeJlcO@o`ukAU}8OovZb9tpp#I!a!zJ{r!c zIR;v4XTZ_AV_{SMaWJi6CQNBO9)>oZ0Jk@t2+wRj37*|D3+`)~4QIEW3^UqJfseGE z3eRdk4NmPi9X55I2|wyQ3vTK<8?NX+2QKY77q<4!f#3I@2Vb3VKD@Q>0(egUh0rr_ z5p30844=}^g+qpUFw!_59yDG8&owQ8QRap4Uh^W@H@FzSI(R9ZZMh8Yv0M&Mw=RL9 zwkzQCwxw{XeHpBFEQg;vR=^{iD`CmdD){!$m2ih^HN4V&6`VG_1{P1e8b)~5!gZc& z;0o`x@J!!zaEAYSc=X6R_}Rz}aQWzsFmKYynCpbGQ{d;5br|Z(sX?$~S}^=EN`q0e(&3}CGT@xqnb2}_7Sx@R4WBtB2QEJ~7fv}X50;;v z4-cMR0B<>?5RRQ$1T)VnhF_mm0=J%B3TK~F1{2RMhfkkd0SD()!npIQ;8W*S!=uix zfw~K7;cXYx!LbYL;qMnVz-umQgxwc6!S63_hAnejV8gstm^!}={xrWGuD+xL_Acmz zdlz&;%c5>rvbYDjF71VtmramYF7JbBOZwr(O9tTkOZ4!AD-7_OrAD}DnF*e-+zfwR zJ_uK@u)yzDSmDhpZE(#hJDhW+1O9QP6TZEA2;Os*3$9<|hUZ*845zG}2$Qezz?-h| zLi@Eo*mRvAI<6mq1M5cN_3I|VV{aIP@7*vN?!IvftXw}8ezblXyn4eCaN@@4@YGFD z;Np+I`AN9v=BMDjx4eM6KmFDh;nBD4gr3`9f`&VG!RE~`!{R$%fyrB5g~4~d27kOu zg2cynzYbr&=MDJGy?fxM`}V@s_rD1jJn%NOKKKrN^TBuFJ6qp_uRZiWyy@W&;LVTh zgSS5VA%8 zUoRYhJ6`+-cE9v3+_>vI*#64*aQ!Pkz;j+b2peDf5l-6u6D)uIXZXVFzra`C_!Zu; z=Qnur-rwO#Z~g&)e)CVb{;j{D>+Qc`&^!OYt?&E`&wlq1{N`O9A@hXyg5Znq2gA&L zp-}%}7!2AU4m&@NfEzxEgin7O1!sO14UM12z~jG&g=c&j2e17q9^Us=0zB&LRC(n< z8Z7@N9q#%j1NMBI3IF^y3*PZvHk|%_4($FR7e*Y+gD)M-hiiW`$KpLo%kqr!%I(i!zUdPh}nj_hcOnf6Sf%T{*|XirnMiBe^r-s=VW& zA^!x}Q*Z|SsNhU^b>UfXTG6@ixuQAny5jR-ddY?G*^-N3UFpSePU&12Ts99bFPjg$ z$}fS>lrMnk6$|0B6^r14%Ehp#>QeYv)n)MV>dT?IW(nL?a|N7OyA_7aDM9s*w?lZ ze%H1MMz-G!*RQ@Zbi)jjva6+I6?UGIZ%bMIEz zKjC3GZ^9$6x9@5AdEYj;vi~{QHt;<3>vzDj3@^Y$&aVNab^fK%-zXG2zzY0Br zufcN$cf(rC>+na*8*q_z4@|f1g&*49gkko#V58$BIM=x!Mh<-p*A0CFQ(T|IdtINw 
z8SXFPLHAeip5d?INfQsiA0~bS@AZ5OhrPeS>%ITLHs8O{;X4G6@#~_H=SUEY8V!ae zlR{weSSak690s$dgv0i!5%7#@k?^7;qTu<{qv4bzW1#J*Sa`(Iaqz@r;^9d%65z69 z6X7+-CBfTgCc{gPPl0PsNQGNZOoP{-ln&R;%79PL&V)CeoCTjaB^z!zH3wdQS}wfm z^g=k}j3Q_|vlw1{Rtfy;>{9snIc0F~x#ckGyb3sQekHVDPz4uUSPk#GXfFKxqIvMK zi|4}?b1#82<}HBj^B2OXOBTV;FIfzqUT`T~v+y!_{G!XDe(@5Rap@KCz@&gzPbD=c=w7m@RF5R!;w{MVeOUIz~I%_!Z%l6 z2XDXXdU*Mob?}6%Z-Dx>H^S^|Zi2sEvmSnVZDBOt=DH%d`1)cvbzKQ;xS`_T8*AXpn`+^vo9p1&x75RtTN_~OZH+MO_9pn|?alC( zI|gF7=N)=@!kq^A*cKyZp9HI)9fPsYO@>*|Pk~#X zp9J1xA+GB^?_Bh~ad!4Z8%^|q^O&6T^mK!#` zJq-81JrS;a#{>26df~x$eQ^DIemL>{5g7ErD7^QBN${wBV{qk%li{|Hroe;yr^2F7 zrooAy9s%clHXYvk`H^t{7e~RQua1WLuaAMJ9hd>ve{(F{{q1ou==-^GNa_1|aMcg< z;qbvrV9bvT;4?oigy;OU2-g0*7=HBgrEuLZm&4=;WdIik`0p#>ZU3x>YyZ6pUV3OvT!`*m-PQ2;ppRg0@P4=>_+xl|$S1Hn z^ivof_8ELBY;JsrE;4)`^heBxcSc?U8>1J%uVWU%vA9JrGhs1&G4WD(d(vfabMobI zeaaH}aOxHCm9(Ys)AVI9By%~Oo3#QS%vuS5$zBD6bFYSdxohElx!1ri@~(yYg6m*c z;Y~25Xg%Cmv;nphZ-h@3Z-O&RZiWe^x4`pDZ-p6Ux4|7{w?j+$9dK{?W;mQei*B)3j zycgaytV;;dB~1*1Pfd)3k)9|R;f;m|z3Fh0F9U}82jKmFJ{tw&hlCr8-imFaeP!;#0po}*^KHAfu_k2(5ym~_ku@Y7>Xh40Nc4W4@J>F~2- z&w$?J&V)Z5cNV;L<~h)H{9JhM@$=xS6XwH(Ctd=tI%xsiFl!<7%w7bKKY1~{=#)!g z!Ks(QcTc?>t~zZAw48ng%s*o(eC~|puJi!-JRIA>UuV875wFC!Df$3tYbJF1T{}-EhH*d*H&A_rh;i-3KkJ z?}t}j^#HtP&4X~$)m!0~wGY9|uXz}rdF>V8_O1V9loOaOI|F;m(_%gLmEX0<_)wB0TrDo$&A5UxKBZcR}}^FT?gN zufY7fUWGMxzXtp6*$r>M_jOos{~Ivofj#i!2lm26555Vbx4sQ$ZhZ$@9(orZ_3(SJ z{L%N}NsoO1=Rdv=Wa$J86mnipWFsNf9e_d>(kp|_%qMK^zF~V@@Joi?a%Fi zgU`PJM|Qjjk9%P!Jp09$;G&(o;F_0ShPUi`1#W%$RrvfXufaE8)tw4o3xYqr77TCP z9Rg>)9tkJB5d~jXp#RFd%eN` z1g&^6Xz`2Hn?gr6h3?xF8Wa4%X+d*m@}H1C>y=Hx9btPyT|u7*R|UUS5VXG_=);1b zPYQy<3xZ}91YKMZ>?{Z}76c6x1lyQ zJH0PGPN(O~z?YFP6JKV&2Kln^W#!Apmz^&MUrxS;_;T^(=4+U*iF|qZ^71A3^Yb;r z*C=0;_!{GD@@Yrz4{i^BYvz&r{|L?unjRAW-$>-U*20jC;LP+`1{2F*Vi`;Fx3EFz>F)hZL|BtNq0g$RX@BU}*{M)7L`);w#wn~#Bm>6xTDQJud zZ`j$L*_~l$cXoH??lP+l+Oi8M1w&Q_g}fMDU{|F-NJK#lrs*01{rw`wpdvM}XwWpM z$x99GCeTP@!X{~@ecjTedHa68&$+Wp`-45-=Q+=L&N1cR;_$)3Snd$wGz2KW1tu2gh 
z_FnZqj`$2O0r268-fyy04_8xNbMT$Ww>{zK!55-`jCrrVR;jF8Jj_4f-gU#ns`OvY z41eE|4UX(}eh@=cF+>$ZRLPeEdF@QvFZQkCe<`_I)=F+r`^OAWF)}mE2L8;Y#jk zHT=@Z*G65?vk+HuM-{`B+)>%=N?sSTlFM^xrBYF&4?7$lrg|v*!{{23&%4Ha5cwis zG#|v;n+mqlTSZd~RJ8M}!tuYQDst5Ik1L(uyOQuU{Y8F9&9746RWqv2WQe2YH&gJN zEl^9{Q42TY@SAngQ|gW~1qIeQYJN)(zjsZw3O4)0!^?+jrXm`grbV9W_v zD*`(lHNV+`-|P^T9gecD1p1CTzq+ja>atLaZ#3u>yLQ3wZ*vf?^c1cxTnh9Zb$*4* zAB3yc@K$FgbqbA3&0V4@xufQ{lH<2dA-PR50b?4jE|6GVD6VxJM_DPYBrH-!-E5~4 zYPhR}HteB&_|<}rn%_DceyI7a=I|R0XN&8GE6Xt#CC5ZdXp;|Bzz;W zpz+mC480^2p66+0HG;38bzalNjR+mRBozIHt`r#cmIBmLsM0qh1BAY<#*WnjVQcr6 zAPq;|;8yGy@+t~nDML8Q(z%PIrT%yHDZSNH`Bp}8$@M=AI>NwUY(T^p;0 z)z+#N+EBgF2oq&r7FqnZ_?MAQk(u7y_l3gOS$$#CM#7AQxH@<)dI5Q;x7^WjxbA(A z{%+adp#|eb{`GzNeR@w>dw|5Zy^k8@MazIsv8NhuwRT%T_yvM#W!Igoxy>N!TT{}! zrZ`&1U6+}!Ln4~PiOk2Eq%mg1+St6BNj2m&Z7A4LH>l*NrYu~vc13mEIo4?*Rm%km z!KDKVmYkDboWl2~b;Z7^E{aV-+5xwJm(|qprX9muhF{$EqUpT#l*|8DGaaYvCz?uA zLBLp#R-3Wfs0VA&Q6U_4)4u$g_Jcy(EI^@c@q@A4t6MK`GoqzF0MYnRp&hl0hD{Vo)yIW{V}nvjW~EYvVodlConDn4-nDE0 z)PHIVtzR*>DRoCJ^*WmvNn4C9_D+R*6Jrt!a~BLwi!R}`0>&-FE-6GoS{UNA0>)YC zv}B}9ixtFa1&mvJoED-$i(cZi0>(O4U>Bl5ORLA@+bLkcxOKxM8v8JeVGh4Q!-!p6 zSo?Ez;a&N>&-L1iFAl%Rnl!v@-F;|J%~y8$6nzL$LANV_o9ZX-OraDyESd&^(DB7YS8uaZc)=fw1+nSC<8Q@rGoyRC;HtH?Z z-B`LQ8955=>J(MuYXGoVXgC;g*E716B!!8q(bXUh0@6v zkT-Sn>HQdZ)6gxDWX}?4#jp}H<}6L9&GRgC+>+D+@q&|RxvLAkJxv15@PR-^rIGO_ z|MC0pDDS`H`A2$|`7i9%Dx#6AR{u!r`$Bb>w8Giq$k%+uq%9q9s(=VrdzY{Lc+cV;k9R8l zZb7)t?&1TBSZT>#wWgU!No((_Qu1vJs5#^M;+ra^mR;GbHNh?zSY>$x5F|vW!PH(s zS=rd*fT7$UEd$_e%K#+Tg!aLX> z`i#gjf1tZGuy%O)u7S)4=Us*d@SQhPc<&t`k1OI94qdrpdRU%qgyp zS@M}Je)IhK*S1_Ya9ycqz%Q;{I$O!lR#nZWp=O5<&JI63JM#E!@0!_fZJiyybawck zAe*+%w(6YiT{-*q@Jsj{TIBHOXKSUM{igT$*4c7zJLakI&7>awyX_<|{7mGD?X#B& zJ`?`qt+S)Qh9<(Bw$G*^5z>(0>-F1jYPmK&zcgA}n(41p)_x-0I6HD_{r1_~Pf?q6 z=oskmKcWAE@GYX(;!!e><32t+@;epZ#yL!h2g{!|&kR zg^y>NvzHr&zvH#f4&N63Ly@c_w?(copAq_?vI^HJ^_#cPE@o2fKa(ee#mTMYdysm1 zd?1bKD}SeSq<6@FqQo9ls)l*5sT#nA;ojK``TQVO{w7fO)&h@njYKG9QqKuDgHu?z@M-5fory@E?t8W4n1bDyTWyuZ* 
ztRNU36vVSOFuyb0+TfC!e|l!2jt)pZvoNyjnT2SFkw79dR|K4{tC1zb&n%4Xd}g6y z(6M6wMkYo=HJ8BxA?sd1$O2>kmhlJFZwAu+J$LjDtb1f($a-Zw{^vBqhyR=+d>F1J zlYLYQnHMH}oAPK=j7(n8F6aPcU?(31V|@Qo|DrNvx}{PCN=ilfW?F!(5{ah; zUkAKHK<%Fa(fdXv{0tB|DOS$F-o!raaVw^!&=M$}Ww87yZhxD#|-R z7(i>@0tX=LREk2HacguP9wQ0~`~gD~JZ#}c6dw2kh7M4)iUl>PkrF`X_uv3zzmn`% zaJBiFc1eVPilq~@;Rn!6nXN{!>(I>FTM2TMPF2FgX5s0V2cGEr>WZCP?%X-Db9l?` ziui8?0nrynXJ1)+>+CD1E;wv+irH6&m)Cv_Tq9(&mu|LC&*1N}2(dZpSoDu1w@b<} z;Ha=2YD6I&kDI4NE#813*9T|Td`KA^HUG2D`>bqwkbA)!onH)9GGk`=CIpUZATafuB_E9bhlYnE z=zcY@E?^j5S6H_;qA?oS7BJMskW0@|>g%NXoX~h~g`HOnBla1m?dT;RtQpdHafG_r z>p~qhzm*fem6Po&HC~L!G8<+_=2AT~9oP@(r2C+ z`T6<93BcaK>K~YdO)^#w6)WOtTP<22N>coDC8W?64 zWF90mGGxnw5%byfi5Dt=U*H$D@TwqPmAE-LGUQg%$dH?j9aW5CkbfTYVP#AbLpeCg$*5dJK3K{Xl=wd-v3tDpARYRIj z+vOE;CA5i-65ezed&^BA`&WRRDF&uf06Xr1ea4E*lz>tJNIVI!;|!S3^(xxpj5M01 z%b5m#rN7@_#sj=ce`n3L&1s}r#apCszN41_Qxt_A^T4u_V%I^?I|TGd+dBuaS2VcD zDR(P=(IhlIl76kDFhl3|AhrJ&juKCJqhk29buy_4o z?rRQ*pM7m@&@ffg5A*0p^};obp=yV#0cwY8OaZDLM!B)#s2#3D3IMgk_@#DurYLi` zipLJ_+N4YEfYb)p4)R-V5T*ih;cubT2A31m1_S*NKyARC15_JCbjs8Q-M}QO4RR+p z?7y-9qcvM*+%)SCt{UDs^72hdLZ~3}P%>6Qx_h94WR23Jf*6?x4%dBPdnK~OrlIT+dWJ%$+k=j8+hPn3zt^Wu2i~Rw02V$w@V#j*+_#_ z=`6`I&leD|j|P~_`(;R)ZDApOY!BFd;5@H2u~||RLD`7CI|DTpr(nSptX*oNlnAZl`kqt|qAm`WD-~f9H+$P_{~L_<`D{I~w>`z# zgLrFyP}%Jt{wQBEM@xg;52Q<-2lmnk8RZwd#CsLOZaqnuz3ewe4Hx~c1Lt4FfBtC+ zqaSruDC6B$bwQJdqVob^n^cp;DfQjDS>wY+|DwikU{@2^nv#z2 z$?(dX_{g=94~(W-w3$dQIojY|-@rK}nWXoL1|2QI!>F38$cW9|x=B3oJF#DKPLA!Q z+}=LsEo&Lxt!|A`VI9Q#KxH9HM<7BFHdv|KN!Ad zruUJV@ejxoA84qn{mq8D_y*ki2JdJ?=(UFM(zo`4ufDc)czCEScqg&6=#e8&H-u^i zi5RN+MMH>WVjtORG@OS<-R|WvhEVN$4Wan0xa$bo z-4KesiR_58$6WaHRDd<=99<&X`qXeVHD%T z*Cc)q@dp~>FE_-$-w;3EK&??})LG5khWO_i?C7Oza%+R?K;;j&Q1GxS_I&ts;_n;! 
z*dU(`e9H}2)YtW(cSU`;E@0l;-tabh(VtSF$ijwbC-DdLFVj1)V*m2t&70Tv(s_8d z)$okrS;M&DO2dspFK>LaVWZ)m240zMi2oIREczl4E{Fdg?zX~tT&st|c~cdoVsss8did2dbS!{F(I)l_R?E-yXjKTU@RaxG4O+)g z(qRPa4z9R!*@ncXL(3|=AAj=EPwY}8tM>4tLCR1X@^*~Trp^r<`z$0_ZAvm)I|E|D z%r*29&+>#&yYju|tyoc~PLyVbM}cjA9QB=c)ihMYeU>o0laX#@#!Tx_+^$dtd`i!R zNyjMRx6}){18^N~Q4lXk$NR8lE*)6%A)aH1>B9_CY7$w|9po8J|LiAA*qvEqKDYN;`(hfkY* zHX_{NASP4Q5)`Dd@uKrCvF|3*8s5LQxn*=&<%a#GW5?DlTXFPmGTAsY98;xk)Y&$o zv43%0##n8hzf#+3PPxx zLsW!VlO6aeE?Tq76*z-%3+Y%DSV>47k9fP)xaw2fZG#CAT_)EpqA_hc5Bop4xk7QRl zCIsW%&mIjIX4;9Y>7=(*32J)OI0*&@)bnhr`B6a61oUh`;{jb6(2b6I`M}>C(8hr7 zp>qG(jkHINcs~-U=TbyxEJuH1iq)}9)$PKQTt2&QQP(q8@mueF?Z~5b^{o%y^}x01 z^g|C`Q0?jx(SF>y5VKDPE=0en*2Cxh1RnT`n2*O=ya6NOn+dD>zbPhsGjGSC8wlB7 zj?LoiFo_HC-z+p`SZ!gr(3;}e3X=`ic!|Aufm@9W+#vGn(gh~nfQkO;g%AgErt$$v z)-J!GOD6V{yudW=xFjjy&1NpJB3)pOgg$a16#L=@T|{LqPT5bRB21dz=Qrj4GSjj& z_hh=189to7KYjBBy(`E`(ihkxt^ER5QWv;|Xy=>{B&()4^6139Er@*o0yns_*BKDm zNanM!KHbC>9O7`As~IystkXr!>O^Vlg+ysPuIBm+iEHs(i|1NA-A&qDd}~0rUkE?n zCySEdzn@~+M3M{b?hDyK)zR(FKR5A~l6;FK-`akGV`s9dzK8GV8MwP<)3=WvUitfX z9lr6_Cy!jHdBc14LKl1EX?xz(g>Ju%?d`Iu3t=`yY2WI|_6wSD1luoopXIk#!|fM1 z0VCIZ!A5x-<@+hSyyZHe#8)-^w1w%wdqcAoTYcvGbfSv3l4Wi{s^uV<$4q7T+fI&b z_}hd96AqfWi%?jU1?1* zX4LZ*0;0wou1pk`<8|dAldVwjl(mkKRwWRyR2b8vv@9g>Y5|sF z;H@ev39fG4WOUudz8CP=<>5tv7y18da32zK{(@+^MMc~#sIcGwNN%Nv{J&J2frcM_B90qWvgFVpX0D}B)iRqhODBbtr|)WPqF)oR2FM1UODnVP53W2&r}8n#6PpN`gfK?;Ma8*DZ*B z8Yqe*F4!Vy6Xa9h@co+_D!X>=dT99rb-#5}sBXqKr4QbraSSid7~_pQCL3E7FjbwbUbK-;HqeQmehu&P zx3+q0H^#d`KZNSdAn4_VLXwxYIPw^tl5wD=ZyxV@l(rEs&mmw~$e~Ey)pjOWmFJu% z-tHLnu4ac+b|oI|uW;PWveJXOo`Kx6Tb5QPwMe&2@-|p$($?mVNw*?ThH_=ZsnjHo z_z^0cJ#i;lbpegTNh+0dmM68sPO`#Ivd5|R2t|@3LQ9Sa>fcJI%ST=7DoQ*OAVdkA zM-MmGOgLX?*8Z~W1U-4)duXyhl&C1H^Nw&EnaxaIQ7>1s6RRYk<1Zap+LA7ZcHZ$o z^jKx!(T{`(uuznojs&C@zRmm&vp!NEY0h+nas^!cbCc&gIBAbPlPC91F{u_x(lxo4 zB|DRz+}o7$;n|5_l)YxlOb$#klU(&o?rqQH*%h}r*`-q@C-?T{7^stbtG-mGWpXb^ 
zIVb@(ispb;Qs9XbPsy^(UM1zB5w^+i^OxD+@(>&SdNqMIG1iCk@21TkFI-Q!Ieh_c5U8w`|z%p2bSH=BJ&2TZg4f>ZqFCH)kbUP z6vw^d)Wr(7DO$j}Ravfu`uX5qOwUtToORt|cllFZyBe4=GdAMxk=z0CER0v#J=79C zBH-eL%Lfv5_p~tFM273QN(rCTql2g}dh{qHxT96}+x<^0iK=vElySw{31dUT_>x)UX8Pvy=Rc3VZqj|{AMbmbph&NHX@E(_=GQfge- znD&o?az}qLb(M@LaYN7N6KIzK>AJ}u6}0q3Pat1B1CdQ)93sr$aAp|E**zdyh` z1uHh+Eq!}H2@Rf6asZJ|B1|WB|6_NTZ^r%y52$`!`PSzzgnS3`_0WCLJ*%0*p>?ZSE}-|WX4PCBs#&_4B^v*N)vO)R$5ykhu4V~c z%?iDm1$s5B1~jxf6hE<=hvcMp2)*&utlO)Z;Z`#ZX9QqtsG?`x^{b!2HSsg{c+SBD-^Q*f#W?X3paYuBqowLigqx0H_1 z2^%Ag5wkaaBGs`dcY;2MTZMb~L@31{F2^6v=MT5D8<#&3$`FXO072S@o;PuVu6Ba1 z27mknT@8Bf1YPX}T@8McFzafKWZBqA?S;}Mdr=F3DV zl#WT31AI9FrLw*RW^j~@J|o7bD*l1;aUH|Ah?th*V}ozKc7o1*f_BFWwYfS%Iz61O zR9=5&-O@(Yl2uclB&aNF^Aj{S?hNiME{vPk8XbHM zH+sT*C31-9M$S4CcVwlJcQ~-Jm<+FuT*BoeC;V_MOQD;Pi+suGZ=K+%66W<2Y@|MC7&C>Jp9DUVAMy62(JM~TE8r&yo9-3qXk$9mSVF!7`FiL+=$=#b z3TWLadIj{}Q}l{c^a}U|r|1>X$4=2JPSGn)(JM~TD^4--oMO@;jYFsC8K>wOr|21{ z)GL5oiQnHkV}SSZCGF2^6v=MT5D81R%49fF|`+< z6(QOaErs;UuKr0zN$Pz$0p+w&463L!AWntVS^&yXUK(+X^IU^bnsGXh*>+!MouPg;H{YoSXn9+7sB>Eyx2ITuNaD zF$!x_6mJ4(6&Kkc?{T2n0?xrL5*MANfJq>-%C6dV<*wJtLw>(Pb|_@MxP9XG03}PX z4lZrny>Q%xe24}HA`5^f;~s-+7grW^3-W?k2#9Recb0QV~&qO~b?q!#c7!yi(E z3v>Mgx5sXd_0vKzHrMy#CZx2qw}GK5!^y<(D+A&v6J(6x=%rbE%j#`HKH1NZKDD8(Nx#~;q; z50~K&mqJqD50^sPhMsqPoR%M_&EY48I9rTMdep->6go^jfCoVFb2qHdh#9H;rl zX}xh8Z=6~hr~ROzaW3S>X+hFDM7;5Fns1!e8&{J7no~JZ*2bu&)sSR4z?TzbEr!zVfXmmVLgZtD|qvH(P@vBFdrvFq|L#l53H&H*^ z`}eIA3`yK7+`E$eV(SFk`BD87p&WlWpFiBrZd`sMl;IDTBBa0{E=6b?`rgC@<8gxF z2!DKnCV`%tpjjpuiSUzzO--+^X`&_MJCLu3?t|``pqZd`6AVD;y%RLq1kDA%V1ls+ zeQbiJo1p0?Xu1iSZi1$Rh9+pb37U?y4xu+bLDNmpbQ27@2{j*}3ngnlro`LRH-|!! 
z zCqDHk$WPGXxHGu3xHxVlZsPJ=Ks`3PS?lxhd+OsK7w0?d-pV^oDQY< z!{zwH`TXH_cH{D=LmB>XDMAYT;ZlUQq3=zcriYxShrk~{O%H*dJ53KcO%H*eBw`3~gkq5GhFPSZo6b*Jee(0fnQLr&8};1`^xhd>`YO%FLu4>?T_IZY2aO%H*F zPSZO`=MXyMr|BK1=^dw;l~2<{POFCi|EY)2hEGeD1AI9FCAA&`X5<7Em);>Kpv3f$ z)AW$ju7{lFx{043Pt!xIt1@$x{Zq|Z)#Zsu1zOiQP1nGk!JWm$aVv2fPt!G^&A3L~ z9*p0L+x%Zg_lS+<5n2wn3is}aT?UVYQvBg^{Na55a67wk`H@hDKU|8C0)My^p>61U z6C!AyLj8@rg!pdy73(LpgP*oM*5 z;e6O*Cx~0J()>d0-O@MwLj0@Zg|X*5s;S5R)Wly1PRoPLJ_BKCZc#33d^C#z58=W^boPdDHg)TV&=A%1m92Dv*= z?4JMZ-Kh8*nX2__-#}<=gPXPwMYk{>4bK>!HH;gsG~6ik^2RqCHX80Ri!Qb_c28{M zp}o`vcT;(KZ|x)dj_lh%^5hf8j*Q%QXhZGZUEy1wY%3%=a}_53T^`m{8MclHMC_0g zaD@(IcyW=3I<4@$|CQwJ47l3o_;FXkV)%V)N+hmWNEQo$q>zCr7kIDSlsKE5h};z2 z`egX$oxC011{c4^d(a4HBe&XRsA9*j>CAF+X-a(TPxxUEN_1)*Cj9e~1;no@acHA5 z6_;{0iRJub7f;qU6_QOU&UaS9Itxio-Gr$rlD=uR6p}4fi51EO9Y;oXJidS5{p*jc zY+$RbyF#ID7K?X&IRa2twHK1yySd_7K{z(H+n1zUxpJ^P+g8d9e&fFN$=*V;H)W+z z{qX1?8C*9%?m4b#QER`J&2=h{YKj+OU<=8F%3S-kjsmBVi|urCQ$aB!e=D8HGn9R9 zidT7@$Yg)ZZ(34tYTOyiq2E$Z8DxCX9*xmDV3v8O5+)8*~l~T zgXZHDOr!J{h9WnEw?0V=W1zaDO4&ig6oWbyDPW={vD%6Hs!^S5G^ErWyQfu5e8J1Ze{ zDa&J!b|qOB1RdGM>#j;Tm(@w~lJp8{ZsTc%2+u35=w?st*NjBY@lfN9ko0Q5CTk-5 z!*6gsB#1nt#*@#ybX)kGfnix}+0Yluvz<5UBjtR{Zzm!+36u=LO)FH4UO`UX=X1MLld(^4NaN+&d0B3N>M&{ZFNBl`QVh2QAN z@lydk55IaDa7J)e5ErZzYy{|KaLs~70gb9*EVTONz|4iMmcKgot=kSiHEYqKC!em&D7073$G1m+75#Tk+wH(0e$-1h|-==&s87 zrXsIeB%{A-Z!O?yOU+Hz{2=f#fp9i?nXeEy!9E%L=gN6rhsm|)bZ&9%^(GEA?yj6y z8i@nK*z1mWl%tWE^D-StezELmWqT(ybgXf*RbL#L{)gtK`}}k=zC6*z&Pva!m}FA( z7xh|IlF<`lc{8e2tan#^k(kNWW?u1_ivl*AbS~!<2+!@?iTpYI& zx3NOkfHvbAaeHtH(@L0L!ZZ`6n~*lU1W7+(F%lLh5ybkDT7*{-RJ!xLYEn!#^-+Jx z#4+Bjq>iWRtoD-jTx(Lbc9B-GwP5vfKDO7|leaUHb33!$e&u|tkM{D`a-n0IPZcZY znI@AqpC?_|#eTwh1mCR+&A{?In`2csQL7*ayaSf1a68DGThPb`_~R9B2chRG+zvt~ zk*|X8fUbw`gYMzgFX`9m)imh6dhx2lZ6Evsy&4C7j2GXa(L=b~D?Cr(eYd}=@Hnx; zGiTl!<5@GiYj0`{#WIm#4 zH0;m}CP$RFmB-uURBvso+khn1P|tDm$viJOF|6X1^PRj?M|`z3ygcY$iim(-FksVQGl zTjD(q(3)`P0NN975}-xlc8FUqZlAb4K*_y6M@jWBvuoR{gPVxB{elRC>o9Zl3M#EHTO%j 
zH}PKq)Z&(g7>gM6fi|B@wF!)U+2mI)@8+79Zc@3tQ{NjVTh+`} zp<2q#js$D&rfM#aPAXb-GO<5#U~cVd$F?$kC8vC~mx#65Ec#%YpUr}qO>>-Pcbm~$ z={^5mdlPH?)FdB>D(BmEemE<@lpt|HbN#xe?lc{lX^}$xm*wWa1_euxzAlaZ!agn) zF<<@6$U=Q>8vBK~%K3b7P9WAc~b8{I?bZ6hUZReLIim;fgFE0pFdoNz>fh1 z6#O;ZPCl4QK@4&*s4}m}2{O2pB(aZKKAm)zg+qc4ldU$n~+f_qtl%ah=o{SD7XyYL)df zYXYHH_eaz0ylHm+G<)qddmZLKWI52s99LGiacpbWa%;`-7{z;CtsH)$pcW55p{&DC zWauZ}OzYh<)xW)&>fzo@>)SK?j90(*W~$$MGu3Oond-CN%ogLDQUhC-S1NZtdQ;u| zR8Py+uD#iGD6uz_j`8(+Z>GA*6;SJg-b`5l(#J$smDhk?9c2}voFdmb@-dP4o*-0= zg~#_~blaq;R5xhxc%%zk`!3L(%78q!li+c{H|`(h{e|G?0H3dek&W{I z42VAqEXw7`o(#ohJGJgUppvq^(Q|XuhyYHfGemc#I?qtu7KX+vMIUim{wUVl! zDrcznA926H{roN~ql%~!s(@BLD_)4wS>Zyp=kBti@R1V?1yMPWkfBP!tJ8O>AZ!rD z>-%?w-c$)9mkwOD+xsKj&#u}X z`5Wks(4XQ)aJS*V7y2Uf+tAWgyJIup&tA1V`U-RmcLaA2?yqngaGRA{^vA}L7JgFj zj!7E+oN`F~yEy5_|DKR73Xg@6y=^*gNB(AjFMN4jdthL>2IJWASIC=CQUe`7&*^&e z0_i-Z5`J42qBBnxLLFiyJT`)|5*uRQfm#~gJS(-O15c?zmD^JE<`GA=Gfydd^Q`oi zI^in)JS+b^tATlz`aG44Cx9jP5o_ly_2NZ7x_xUP%5aZ`*0jz=f~p!duykOc<=Cs= z`s;yLUR}zXxs*nT;(l93foiPZmbvg$0H|jAZRx~bGCx&-6jMQ}0Hg|H{a5X-xlVMv^s}jrKhQ#!bCvI0vqTA4{1|UnUbQdwh|C4CH1-H#u}8?EFDxr9}g9Aib2=EhR=P_819UgY(l=hGbHfdg9B3sS1>G6)D&z`sgO6 zl}$A2x3Aj7FxzBJD%=2nBeIdJHZc{T$JB6}@%I{{kNh6PzcM6-H0Us!7?Olrnx(5Y z(Tg|HQ8v-XH+e_WEt|1=^YZMl=)eFnQlZapvOIsF9I;13Uv@ae5_qM#nUN!2aWrLt zfsIxMo4{llF~DO=lj%(S7FP_?~9cbbI(qqTIeaslm1JY56wJO|KKTvhhFqj?vahk7lSxR{p(cdFFXGS)K^KrR3GislPW+ap= z$Pt>CLnDs{U)TUr$;+XV??qDk6uOp*lHuJ;WLcMzC1q)Gu{mc(yBvwNqsO;9fD&t$ zBeC{g-b8VhX~|Fbq+909DE6UN>?2aKk63ukuXx2i17a50^kUA8ZaEaY(c}9?Kw8~$ z=F>OG z+1k*Y8C_^~k!%-n_}Bf&b*@QkHkpW7tNsiQ-WKEA8Dm|$19VZ1n1%pNp z8a>h|$)Qm~qxLXVX{g_n&?rfxSq_b6G@6x0vs9YpP-&j7EbDhI{j}&}v=$R(F;NyP z%3?)XEJu{Zs;tFIda;TMXRN8HvO4f~5W9mw9!dcU>X0LLha!5DFlBUV2E>?B*|f^2 z)pSTW-PJlvt|jNp2tP;U@N+~AKS%0#&99i@=M2ap{G76AlS8o$J-#6Wq}3*ePMgZm zW_l_^?FIQvK_zHY1+=LK+EfL2&2QnVg|@{7^(s}EdYGt$mYmgzc#GoFT&zSD+Dnqk z&q6KCLNKUT26;a>K*{?#T6oQ`e7&C=psCTO!KSfhL#wLfLJJSm@IS=TKiR-=?rvNo|MQHzw(Da*pN94cv4c%K_k=4m-p 
z(kf$QJ&fs?lGAc(re}(1R!t1XY44mFi-^65#21nHA|<{^i7%2P@kL6!RSuO_RQQe( zkV>l@Ds1!tn4W4G&itmLispwJf#O>NE(Mo_rD2IET84XsOp9E954$GxP^>z}lj^Yc zD^%PV?j;=Q#Sbb#nR@3KPg04!UyL%ZeFufFLr+R#)w5vE3?9#+v`vhZ*j^mtKmo<* zG{q>>TuC!w?HgjG$gW@L#M9!?D={6q>DVTgjNe(?k5nC`q#jaI7r|?O#Vo0pl+;ZC zq%CqNwxGv{m1Olz7<5`tX_2*^a%gm-!TS)5%{~}3I?>=+2$QyYO}n54m>rF-xPn*I z=FE1;2ab!1^PeqM@4T8jLvzC4er--~p*vmdOfd_E-|?=R<6S?;J37bv$sA5pYeAZX z4Zr%@9No^%;qD)I78l2@#BH3zJwLP=*NEFQhu4$lTReY)47(E1+y`rW4MI~Vq`gqVmghv zQ(k_1E3u1JwR%M}8$+p$AuzWwRM;5uH->sP^5AnLH}O#9tpv0Z(n?S(VOiu^{G63~mh zjJ%AzjJ%AzjJ%Azy=No4D`o^aK|$aP6wuzo)8K6B+HAI*Ee~#bxV3-QWzo5_F3Zfi zES1X5x=eq#0)M#j+;Bv6C`0%V@|4tOUDk^?{3qdvXfI;pp}mUhJn|9KJkVYx<2*3l zi1~X-%4AAX={gS#3i8V$+0aW6-@(ngY*lzed1ueqvR=)QnTJaLj)%T1>GuWyuJcP^ zysrj~ohBlIq--i{c-!s8Qg@*(Q^dzXj9~b-uRIHhS^69`LTz+V4Y&d>h3ll8Oh8!E zX-+^fI;(YL%n%bPwUkvLwxbVJYswf#rz@i|z_d44zO{VoUk+NKGCLj`-K@All^Fv} zdzE*!@{9qexUWNlFBnSsO-jZAIQ~<^%+_Rm);b8_A#qR7+q?TeSqn06_^H*H*20V*rv>`5ecETh*J& z{%LO4gVRO3mD;-C0!6(MVVg3aby@tgu^+M1S-|9~Orgr8V45isSIHRDY)(Ml&DA0q z(@gd#0o5WI(_Af*F+flx$2C`rWDF1#$#GnVJFATWroE|qmtIu9dk1|L%V{*j?k-_= zJ7#mcMuzR?1f<_yHQShW(nJZUnr#frp3B-8VA`u>s%9Gl1ZF#qb+a1R(yf|}9KSc> zXnUYTYS@|>!SC&EXQVyI?6LQ{pl8mytbdp^jGKVn?3bxL+odyg30G<*T_FlP5AtKs7gG^3^(n(TKYB`sv{% z-D({=E?=!fV}PIz9mgHIv)mY9+N*qnuvwS=47P~PTnb`p(VT$9idCD9DH07OplY)* z#j4H507TZ9a$K=$qcK2Wqqy>|_m*x-tQ#IGwX|e!U4QHCN;sz?02WELfX2yRIQYb~L8pBDA?D0a2KA#Li%~5j*XPcEk-#K$QyPeS zCvqs86A+m0stsgJxoWgA0Fl)|jw@FUHU!P z=mZhund%pg>vlnqpW#|alG_eaYB47ul@=x0LTti-sU#fNLQ5m=z_dgvduvNN+p<(= zX{=q_|LjtF$5&jKC_m?tq%<8D=OTocfk{15M=E1z-|P$$)ihe-7$`&H)qanvpwl4( z)UGAfOG(XHDq}+AD^doL3LR1EUvZgyC2Lx>Pn!r!crU$&?chl-OaJFF6j9jprc@l@ zkhDAcm!^BxvZJG93LvE}8QVoF=mKP%Hv{v^6JDn9hFegpSfiFw`xqtEDB-+Qq|7>6 z9T*bKE2^M(oYJ0`4=3<;YqWX0ZM=EAHQK!08flR~Q>{Cy8kS$RDyljaEoFOpTC(@u zH+(;>V*i-LWG`SyYFG}|ye>{9%TpyaYhBu+N?X*mn>gj-Bo-5i@{>gE=t6Y0+QX|L ze0h(-GpG)4203~&Wd97Z@n&eefhN^>f;YKV9*-whh9=iCqrqmQWDMP|ZEY+J#-KG>0Nw4i9xpLnCQOBMmTAutw8R zuYzUikW2>*6)eMtvKDSL7`(#t#LB4-drcazgb=^J074J~1G+L$6Dh(d8 
z2&K|06<(`0ltPO{YVea{)u8sf&@Jw1uZlA#ECa__1-V$)tYa?+;BX5YALpix=L_}&L0aa>vl^nOphBB&Td4f*OY4IG&&uCca zUw=>k9VqrGPM@OmNtMTxLh1J@O?8=d0$$=ZlxEFmjdkTQ-S~4ziqg=9&X6vEhH9CL zhI9cmq)VNl_%{GeQbU@yhHC#Fu&HZEa~3SX-Mpc+;%Q?oliD{J$i0;EYmLkszoX@I5Pw1T2>V?O4Os(G-Wg_M=m!@&V z%*8@C7lY12Du^_>L5L-UxEa;Pbikrs<)&Gp1wy3|ut9V!Zj!9hLdJx*b6usfWy@U; zukG7^pbJ^D^wXohxObc|TW#wDA3R2eim5c90HU--7UckaW@L?N=1y?903lcP&FO?}a)n|g zfT%6SH3PJnZ-tyM*#^^!t63%{!{(Aon^e|Pah{dwuO4~vTVsb;M)UdTYhPVCi+>M2 zbnIIwCHkal6_hI`*}EZ`!>{r!;xLSk7F2+|Z^yc^$r6@Mnn#m`XWU-JpiL9;is{cM5N1~8;O?#=# zTj=gcxLmfJjO=Q`etexK!C^;_CCmW)_6lPdgu_T+6`Fvm0fiHe_J02Oq z{w3W>d6dJHC!g}$P|I_YX)<4E?M~*=Nc0JLca`)U1G1|pis@9QH7UC|nv^f6I+AR# zOeqTkPi)zBEIn}Rkt3P%F5Aiu9Y(357lb^DP?d>GLYQ;62&GvSYvdia1Y%?N>Km+Mwq%J`5vXHj_< z)U{)CW7ikdLSk}NCZljkQZ_WiFiP~9a`1AyeGKV+ip@_y#lEMXV$ai0F%W=Ofp;8W zd(%%*Q2-l}eu@{-0Jbyz6s-ZU8|kNbd=0QU>8F^y0QM^V6fe&TNRJm{U53i^V5f5Q z=9A%kGMrC_^OfOzWjJ3M&R2%>mEnA4IA0mgSBCSI;e2H{pWzFT>wM)pU%Ad#uJe^E zK%#u$-R`G53+co?s@=pr%foD~Arth^iXHOyV#Z#~*ozr^WyW5au~%m7l^J_w#$K7R zS7z*$8GB{MUYW61X6%(2du7I6nXy-9?3EdSU#;lQYKAwE;SFSX0~y|+3~x||Hz>m! 
zl;I7^@CIdggEG898Q!1_Z%~FeD6boo*A2?+24w?~jXiD`@MBuKbd;2j@)9<*fXf0b zcSq@@NY6pxYfm2Kqqw8whinH_vhShT0aHR4se{7D4&E4A zJvi^s(NM7n!wb%>Yx#O}?b;1>i2=Q?=pUtO(8P76j)q6B=t^}t;(R&tmWB)sR#zc-k>w^8GSJZr!yoITwdfOYw$qCAG0@Qfz zTcGel1}Jm%i@)pG(VZK9cf-dw#E%?XcXa2mFNX-pDOfILpo9tqdHtjHd@h;dMZKf- z-5E}gp-lBkoC>w8O zbMmmMy7sy~TA#`mX+Rx^Q!cb6d7X|Q_>b~2e|Ij`ezd;1z?thxiD``uT`eXHwC^9MkR>%8u)#Z7hsyrhxIQTAh5P=48<;r(;B~oQe~@aw--q@#bH5-G=Kb z!$VQo*@%6EiKhl5`v<-22X*Z@=>23c+!t9m=xvLfjLseOPV$xr-*%_E%gN}9L2og7 z-eS}2V4H(hf3Rl!kK?};J`>Qh0gVq{;eBF|qT|ls&f?;@mAH+A6dl@(YsBs05OY~m z{g)XGr3OP_ZZK3B4EZ1)AD6*(5SC~gWK0e+O$;)i2AM|&nJ5MsV1talL5AWW<9Lt( z4u5Vivn(*Ex0zEH!;W$a)W%PGsp($P~N&@>u>isVPIfwrN3v8gZK!@VoU7( z#MlPKdRv?pM5Hk(x9e4ibAULwh z`Mn*EM%D*(kE2#M#H(gu^@cyPAkZ6k)M^2_RS-0lMZQfryVe)yYD!#W1EGt>nLWmt zMaD&ERYT1>r?+qallvce>N0k0;w>Q^+ud1JvJ6Zqa&J`@ky$QT>tDuB=U=rloCs<- zoS0UNJVsnh)0Jc1`i#ZaU4DT_z9~6Dub@*R_U*8rH$is&&<}J13upHZ?mnf&#veSI z6BOkwrpg0%_`S<}m-qMUM!(1BsS8ix6~v|-!d`jG1`@E_2dUW8w(l7MxM%g`(Os`S zfy1MlUVEOc5Z%Cb+YN8GVgiOHbz$7sdf($2llMJ-mh`@-pAx>OU&6kp-<7`CA>gM5 zfcxFpL;SW7xgJ!D?XmE~(UFI!pn8w|iuh-~Ka;I#TQ-hTPfqf`6k zhnIV}r`Ib!VBm3&k!y*_6A}w{k2HB$%Fhpaaw=2xED$o(YES8ECW41xy01s3V(=JA zPZ4;xi6`@!R1aT9*G%Lu;(1KYkHI=tBw0MBt3)1G1pn5C!ngDkdNcXCJR{-}U3d5v zCQpEE3~yoT1=xoG?{R=_4B^fJY-4x}AG`tVnF8|RI*N5b+PHh+xaYiuNn8O70JbND zdkkQU3&jJv1$-!f3llfMxBjS%1MEzA3s+SD-y>QE+P-XnCyj2qla+OGc8<7(>kvUs z;0xG-LP8U8_aNTM#p2hennGo)PdUT4=!|fo3ds4!tAr{ICuSO|0K9Kus8}4r336%E z#wX2Og}}GzJEVY5Jf6E*?}qTNy3n9OUkKmA0Rq6Q04t~9jNq&wE?6np2=IwNT(h82 zu;=0EY^)S6HuR2*j~)^hO;R>28Aj?P&l}(ABnu>IH@?NN$1rQym7-o5jcY@d3SX7- zPukvuFnYVN!}JVt@@F!JS)nf|6b#Mp!o;S0<5L!gueT)QJ1R%6b>uonKE|r>pZAQd`cpf?q| zWGZw?K1F5n;bJKC%C3=-kzM;9`EoTPgDOaZks5Fe%^U!_$hQ&p`pg+NfK#`UWq?7vY> zw#xZcrZB}!Y5AG*&Yi=<`|0s^X|gT!NDx2gltZg}Ob}YdIJX*F)e+G4fVKrR9ndDD z?CBY7txbS0$OsAo+G~_O74q8u zWi-Mm3ekEVruSAVy^lYkZGm7OeviqIIbVdS*nCX>&?Qc~gcr)qYoV+NQfUq%oU;(; zV=8lc5_*$BH#^TrB!(s6Jqj!DlB$=dtS&rIV?j}{^D(>OGleFn#Tui;A`6VLzDU^# 
zmjpfwWBpM`Idw;Wf2E@1fqN<&7BoD--ene(HB7b%SUBF(3I-V4Yf-sAKtfLAMqIh5 zW_&RvVM>G9?%I)inpd=4CTa)ZE_3(Rc6QRKK z_4-w+bI-uKhn5fTd^vPG33Bd6q4e;3 z^Z-HR8J=92&%ANI(U*^$|1VwN0v}a%=bv-$omZP!8mp@fTP7+#p;DgK6=af`WQI&; z#+giBVofW|Oq8~n76{YCnq7qid5J}k6s2N745?y75yNT#A3$Bp)(^Cc?Yd$MwOYIW z`&jFv`~QA_=iUkacmIL=d!2KB=XuYZdmg{T{*aKF2egr<36TwefPcN<7DDmKoWVhy znS)El{aGXUT~o{u2%KsbI2F)4jX%m7Pq*<5;ub8c*Mnd>iLu-o!bXT+z`R2sw&~B6 z#@CI*Xsex_=s;z#!=b~Ut##<>GnZjWDMK3iW5pY)XqU6~)M~yDwH- z>FWBcaY1D;65x&Fun}lHUFlj*H>Bt>bdomD*pcZ%ACM6=FilZ z*H?0jD_Wywtn8Y~9D$WZh^$dxd46e0lYe~=iv8?AijP6Q-_!fO?|rXt=ov(Ktz=;9xav@KsBY`H6b#?TYPaIDfm9&8wFGl` zb9od95&>rWC=et9Zwri&LgH=l{)r**K1DG|hQy)78*ChkW?Nxgvx3CE4griGnKx}k zm(2$Tt?1u)c`Ld&K6pV%%#+8@g(0~91@V3p7vMzdNO(>$ytmnNDiBQsFqak!+#L=F zH;e{@5nRi)cXadqEAG)A=+XBYX4^2KxD~@Z2B&eZt>_u)gUb|K(dTW&Fb?tjOI-lH z_g4IH@WDOq2!~?BZ^%}BUTj6b!0c|t=fzh18u(NAptcDHPvHrp%#&yjSS#9!e&9X; z)QpDkpNxP^rRyjm^5eeu2GDltl91xPg#DE+-!YN}cnL1*rZp1~o|{=Cg;hH?cdoJx zduJDUek3m9({r<}D?#(qkPD_3oUW<$6Clq&ae20K!o9&y0*69ieG5dWMo?o&P%Az# zDGj7r(2_{vEynW(p11J0@hrk~$5y<#z}0x>;(2gu0&~Ig!5f(sq| zp$>z_JaR-tEVkID&hE?lR{Q#Q96WgcDEvE+U(QB66%JjbJ26 z8ap@^6os}|p41fHk>d@-ZDb+HDskC;C`=+03x*OT`0QlVeMjB{Haa{=N`u!bNorJd?EE%VGtTK^O&Qz$w*!L zIGQdB1ST$RvIVb>76!03iQ3f+?jf|vdbW5i45g#2cV-&9n?vE_SlokZ@AF6|5mRUU z4k<*E(Wvpff#)qeZajwW@1=K+LAX-p`o5)#Efp`>0&%tB1S%nN{f zF96DV0xN6+%X$La2Y`@`fpi1rZe%v^nS~|Z!&@S7Lp>oi)PV*t0WE8S%z6P*iF|Vd z!Y98*2n41{!U`r}NKJThTpd9@L6jgykRV7BG>8mVxhBJ)D0uvk)v5{O(4zvHJ~d&d z?_ZK}gxraKr2#u#uqapWcVyL}9gQ_N&OYk7F4NIsfh5JZ#ncr8#P@&Nq8oi~FTcbr9-66<`YOY$Nxf(r4 zgenjeXwaxukf%;_5)NJheSO%J*V&7Yy57zMcW?3TFVP+Xw#UFco3FQVli+AXP$Z>M zSff~r#z{fZw4k&kt|T4QTr8%s_8JWd7fNUp)Sg1xPbg_Cii>N!L1!girP)S}ByP_h zEN{c|v*gkj@`)bbLk^WhuPVjC6R6ntA~Y6F#oNyMG{(Q0%xSv&($0guhtM~8dVBkN z`<5;1xD4~Is~STf?&@`LcOQa?=^Z?5bZb7 z(Y!-T-w^~OM-qkVe zg+vq*b4a9U5sBC}K@jadk)=H(V#g%Xme2P0-Gh29_3if^8S?ye*^2v*?=-)S5f*+e zLGWi;esoEvX_8XJ)$;od8GaxAu3yrp@gt4B%^Fu>6!}YBlQtD!2v{FNe+n7|dNL3= 
z=nK=zAh8tz7qd5FrU|rV`qJiMx?5(T7WP^j#jvef=?KRt_p1(Vn(K84igToiSkzbNdIUQUr(AfQ|Yc0hmBpk1`6oNODzI>#!clC8mFTQYiOYOmzo0i1ZLuMg;yjm?J`Az8uZLsc2ptYJM__cX-mk*yTi|Vk zonfd^M`-*$jct|KLIUIEb|fA@Jzq$KsZetbI1N^S;s+YOwWJ2sa=4*v1&_-y(T&;k z!X@eD3zx3U^ubKXOC9Hzl$Heqg!p9}2$431s)(uyf&|_rgyKWR1w|;BM^HzAk@z0s zdV*6i!Z?A7$D!8vdku>meIqugqQ^A7;5;fl@qns?Oo*l=;W2>?B*p9BYv_FSx3XiB zSdP`Fn~v1L0!_mL?q~`&G|Xk7MuH}SR)SD#`1}&%Z&7T_;GS13+a)6SZH0cXx_&6|h08#Pq6Q$j}aKZ5el)CBHB_!TT0sH z13=vX9@7YEC_RP*lp^v{L|p340w7X&Ba{*+#}WrdDoPP!DPk-&{+{p{L%~K%scQm& zMR2`w8qmyOV?<}?65K<+#kV=VsIRk^ZmxR(U0*t1)s&V*1SSQh1vXw!xivu$%9tDq zR?s~feF?RgX7j4@m#I@NR6^8d8q1!68~5;uwnPvQUgo z=k8iS>|!m)q)?;|8#$BlbZz+jQv6)$wrZrM$s#VaTo?w_QnkB$kG-4uArjP}jsx4A=MG9|_?TQ@Q6tZYjh}Wi&Nt;5>Y|8tFLp2s<%|#0N zwf%imuJ%cy)ME~X0#<)$Hzp&cA&|Cne2hMY)>L$G#~m*ymZIl)kw!vnDGJD@+_wpZ zrg2Q#ZZ)WCf4Q|0*t-OPF+|c!ng_S9>ThgrUDDjJb0Nevc#P}kp`bxrF0C1_R4MBN z$SMKg`vssS@M(U>m<$Pul5uN_S*I_&ex)c`VUCdqyT1exia0Ma5OyHGp{2$U-wA2a zC};8E-VF&_k*zW9HyFL~JOE!)+>UK)=PT37r>GK?`i*U-N;SH(XPDi9(chzo2ato9q% z5F2*~iM&e)Yb)7v0%ApfQB~yb8M#mtD83 z@wf#WD}S3u@bvYKjF39Nj1ewhuOkM=nAnedw83kvdAgIg-7y=j1tc)k^q0HtYggTn z6-f-HZ9N)mp25g76yICXV-q4VuF*b#h5xG^0vH2NwA{_@>gVuMB#TnUZcDqhUkMA5 z6IQ)KWe|{%a7eXrwU%iTLtQ)7g1J!Iw^MkmA(1tc2YUOI)y&5*iem6#h_9$OIgu6=|$_CXPHa ziOx#gHP$@c$=mKa3GCIRm4T2XXpHYGIJ}|@rAQW~EOA@PLI>%zoFZA2G9R{-EeAhT zk~NKmmz07BhKXG}sDsJA)myvA?{j@}-vF53o zJXHgme02XJ(-em>Ck40W;W8oz{3<~zchA)LT%@Fhfu0y4|3;v zDFuO<=|npk#Q(vO!9g^tK{$P+^Cd|7`9EmQ3Yh#~b%of3En!KD0bwWIp~M$tOsST!$~5JKbyBLOOsPZJKAl+Yl)}~Y=$b2% z;*(N4B25;hPE-{CVT}wcP>$@;(6vdoW^1xzr#}{ymOAl6Dg}XlKtvkrjzeN#w*f*V zRKYhKGEEkxOrkAiVc4XjfS%9R!4%nEm?BL&(K{*PD@nB)V;0O#p~k{fsrDe+>65B8 z#w+@3cSAG^flX*hqb+De%<$=2sYAF{@|VMT+QECUE#&4cgR#Iljq|u|(LCkajJLw>4r^A*q9k$dV?XuER z$s|aWewk=yS=nLTHffiYEoJI-SX6}L*HPn?6~4u#K8Yob>9I6|bkeA$k}xfGNGFXV z)yhR9;uDR6of3pdYb;77VU9*e>Zp-4vfbIwi1)Lf{Va9A~C zR3jQS>AZ%HMn~$X(V3I3!s!=u@%^q`^T138+NSWt!l!K>SooC9153Jh+u?zQf7a%K 
zzd3SH{{C0QGRGth@;uHa(NQ?$ciPP_jB&Y7ad6x5DYvsYl!AG-o90j!I;L4HH;{Kj>+Lr5r}t|EGVI)sx8+s)3CcGJ1gKJ7exy7Ky-*}bZ-@6KtgGHHvt%B0m5 z$+4X3M(X5LhbY#b1Uj1=%cbXeF^#!7IUS3MwTH&x zy2Ngdn=~#a9__|%p2ajnBz?zY1DGNO<-S?W-trWffmp=H+fXoipy zev?qrk6!@EHGc`1Yha{8cLSQVdjiL|P9vg_fPxWFu#QqX3*9{tA}Wpo`bPSY%@wzl z=teY!1-#oXaP+lO;OIu|sNqDCQMIZ4p0FI*Zc(AwKEvG?ofRkXhFwO0zu}92}UwbVp{= z^x*Wy_NSVEWc{~yzWa6WeCxlBPu#B?pBNv)!o*}!o*swt9d;;dm+{F?;}f6H_ypG? zC;i4J)>`v@4Y5aMvh{89*0W^znB|OpYXh;W&mt62R$yIp`J6Yr3Gb4r1$%hO7Hg* z@r2*w;nE=1AdygU^=%ec-)8Gao*M7D#wXr}dYV}A4|p5Q>E?t1>u}CGtQJ^@PvJkv zQ=~n2ArJ5MshcGIYQLsn!kHWWV8!`J4hN?^OU>=RjWsK0ErtQ!>iUUyZ3v}<6BD}{ zA|cmA*FXD=KniEqBB}at3rt|g{N~1|&7Ju+_2Y8SlUa>&JbHDQhp_Ge20`nIBkCC==iAi5!`JG(bBs@-UkG}-RxB#@ z9jYneGGMyV3|rs!{y30GL=g>VdR?RLb*@q4TGy!S3MpQAF4FrDCOcmMAblD4fg8PV5?3I zcIwnXX$_RtKxqw>)<9_ul-59L4V2bUX$_UuK&dRm1q6l!#sxMDbom5@1=b0S3al3x z6POT~64-F((52ZsTQkkqUAlSW3y%z4(b*nKC*zZi7YD3s2C8AA2Zn~?{(%~{wXnYi zi>%fDfhZPR{R3>f{(%_YTreoM5Ek19ti=OO@tA)=9m92`H^7$4fZX;vK&}3Px=6x5 z(2%ah0-b-r_m2axQv=H`us;)vwTKS*sZC?|LM-AR2&8Gn2pYo4mPQ=(_7C7{9>}AS z#w2dmjANrj9P1(jDa4JmASg~w)#7c!_E>CZ_ai9{>116b6~?XLOk)i8(6Ni){Q<-o zXz&jt)7TUPeb~(4AE=5nRL5~0?f~{_$X~iI9c)^%FpUL->$rdbpu!z^-RmpWZ8lzg z`&ScQvr>0aIGtH|JORhtG83>ji(vgN1|1kE;!&E^Cw^ue` zRm6H~O=ZvROlvqN-ZSdDauqPvR`xhV{`U1qGnS|-)#g23%Tnp8d7>LlnScF7>A^nr$p?ZYA8njiGlHZ|To<4IV3 zZ(58q3#m}D5vKzDN6ZTbpfeakBSMdtid08KP|*q_5pW%dhwIVuO9fGV>oNSaTP1a& zD0WhzLF0BV99V##%gd_hUL8s`z{U@+l^(vlG<10x{?{BUtGT?a{_@hex_Y<@djp4S zBlUQXhGE!?oa-|0_3Fa18(_gdg=?PKfPeaKX(9q(oJLv%BqGbXN$|<*%o{gmv)S7+ zoyawQP_On59{cVsWwiv+Qv{@1e){g*vLHaI0OV+HDfG=PotH?)X(Fn{Z=Q3^`mDEM zAW(;2loaY7W=M{e)>L%;#e2Q+ou^NyuyBcs;z`DMr9BDn2|~|Kc1-~*65F(sZX2FsyAf+BP`(A2Z)!dhd)Gj3h)+z zn_v;a9RNm@R5ig|f(HT2YXPj;1hW8)-vXTe1VFVAo*peF{f5A|1iA$-5_kup5f!>x z;9P+Z5}MZ%TC)YtA~b$Wc={7U=oe;mW9G)u?Bcm4#zhx5z=TgIK`-WnYJX6CjRN8l z1;A6MKf!OvDW>1(!qJV8|7&#n25eD9TWp&CPOBS6QBu}g*Nw-p1r(3Fr}t=YrgyM6 zpY1iu)cD@NsWW&E^s1{gIMds6d-(T^Hu% zy+>~^H~+(|(CC&y&1rNCBdImkf-`E(3boqUVflKE>4K1%+ts@&Y(#p1jlIgs5J%AG 
z#!tc%q3-Cc^PLXwsC0bhEjF&`>w|sH=H&~gO{>A)>|k2=kk15pBgGQYh0@dAd$hYZ)7?8rLf2aN?N-j00cYjB8_F?ocE}ErNMd(Rd}Qd~eXTBU za}FC13fi2*O78AOHI}M{gDjGxz56oA{O#o(Y*QV`F(LSz2mmPo>ajlD(VHR2X9?#; zOK030ZhT~8<{zd9BeSAYr*vT$sjs!=WwzEg%ml2m;U@;0?NSXXOzi*~e=tVkHO3|0WyGZfrr$HiJC6R{$4HfGfr(?K=-%Qr>=wcO_ z7_~IST}!gom6y~GEU3C@dT>K#Q*!3$z>E&-O5itV96^Y@b(jyyc zR*a732JTukK67w1xZ>cJOBMwWHQ}2J+pEm0j7!wh=DF%=ca`!o!8ZO#xQ|>rQ2I}i zHVei_)?YNu%^37+V{kYY3W-{vo~D6Bas5(5-2IFJh)Y_WuN(6x81g6MfofcC?>Eb$ z_{nQXHUw=sC?Ax}IK>>NCBU}SYLJrbNYBb1spg)JU~E{x_w zBva{X+?EzJ`tjm^jxX;=LIRQNXT35%#^)T$EWn3)z{w{3ISj?+y4WbR$@+A(A%b4Q zdswLA7o_*F^^-O>o;GUT9P4a4uY>U3hZ>x$V{Y_@ydY<32-PpA#E*%QoW zmoVFX^KA3Ob*?McS+VKUf*Wi4Huh(7qX&bN-kCmY$~yPDb?}}w-^iP9DBQb(olI|F zSjrFPc&t8-ol9sV=@4FAyrgPDhp{(nHs^qHz`CsBjUnR_dLkwfEJdmu;swsV$>R|hQY5WRp>g+psaIlYkC;Kgp5dyriln{N`RcZsu zz(QAJKJH#u6>q{YGH;nnir*qbfoR!HLOLNvw68KaL zqZI0tw_SQmwAgavFq4hdk=L_nz1NxRx`#Sck`ELwPoRrOU`OiWX2km-CN1~ZOiSK9f@`A z&@VQV>d?k}Me6+`b&0NS#><~JR=FWEr_#q za8M(#)&I+?K&`(XCx!7<6^QG~%#g5_17US_#Ymy6Us8`nuypXXMHcKkMq74jUAwj9iUDb zmkhMQ`c$&o^NkdO#gpa*`g<~zv~IUP*BA*k;lx0|uU1$;R}aYhfd1b?Z=42nNDL0@4DU-7k;yQkRZl8~-%iY~c;}NbKX3k0V?CPD zv~iKf^WqJfPz#Aqy6+Zbi&Hp46vk%U7y1S}`@VPgs%3+Z?p^J@yKA(AzN!M%b&e0M z#N4T|b&kgJ3n*z&c!MeH@f{?}Xk>oAr&s&T2*xxES@gz_py`70oYo04Kd;7n?$Tb( zjN`{nx|?b0>eY`Aswes#&#o8)b}*0)<`rMB7xBnCM1*yQ=$3@td*mQ=?ox2 zmpCoNH=e%Lc-KVj*C=x+OA6HWL%JJJh_Am_bwvP{pH2M_3 z=TF1INb1u9?>6|TPsCF)4D=6ox0S0wtTN!!1l`fFs|j;-{$ZGnuJ;em-e&AAkHyh7 z%vKfVRBQMoIGhG*V!zRgcQWoDu8+qsC=77waX8$Pc5MOe{*!M;h1#axO2un%l}|D>+56*R78e#^{@J`((u6TqV}#jq zZ^0Kp15hG~UJS>!Z~;clKa9yM40;_dZ?ul68v)Jv0K3(VSrGu5^RwBd%ldMCq0!NX z;2uo73_=PYTtFBx=MW(f;BIpcR}&y3LO3nU<2U@rR8G9YRVoP(*#pwWGL!rS05XhHs4F0 z^l*s3bAGmGbG}(=Ty0`l{EVI< zpN8H4gjL9RO^qLvl>q1WQ-pH-cu*kSQm=yNqiqy4dg%ep0%Sg|r%sYr~7Tj88aAB*Yb6#wSQO4}65G zu!2wr1NrFbQM92ZWCWVEo1fj|*-?t1Hh18G((&J%c$P-jJawaM9_;Z(lCF9Du7IFa zW7yqq@K>d)>!KmoJbbqJxu80)3YCM^FdUw)L*~(BaXp5;C^;`s&w-O)J>Y3~%`-p5 zQUrbmlYT3Qf*M9FSk%HS3>wINSjE6-*YRMSfNRb=hpU}u|INm+emrmCapPHp=Z@JE 
zj3{t5p1F7)oQ<1#XJ2W}#xo1_ukoA#9mewo(7W)wi{}rsaeMEr(1+(OJZ?OT@Z51L z^Z{4nnTzMaTcPh(=)*IMX~||e5q$7q=b`S)(rLK<8qXQHhVgs>^e#N_;`zg^h+)EM zTF9P&BfmuZG}Ee4iy)&*bAQz6aV@-w2$u&7@=O+FJt_!SRKYAFezSzD!HYb`xo#Gr z?|wmgA)@HjyGo>3lR#r}!W%!P@K#K$2MDSNstI_n?x+4J3=z~2)DnaVA_VgY>Ik9) z^#n13I6;Eo27)9(ilBiYP0&ctM9@soLeM%J8({+kRRq-pL4puL4M8nIm>@y`ySAVH zqp*%3N>EP_BZw0u2yP%q5~K(k2+{0;C9KLVy&Z zObF%?AO$EB0;B+CLVy&YOeD3Tu{2YN6rqd=kRp^30aAoAB0!2zMg&L^%LsNV_Oh)L z%mOffO>l;6nBWU!cM-fx@CN|n!W@FVLGTuVn_v;a9RNm@R5ig|f(LU5cI{qJZ(^W> zzy3${zm=n->e^Lj)PDf>3p``&BQ5X@rf5&tZVn|f?T4<~pRvjDSESC6vUNEW#~G>( z>ecu&?OD?+RDXNcSSQijj`U?@Y+w1JuYA#$ILg3+x{NYMmr+*K#Zd?9!USd?^skO{lx24pJ;sX5Pv%B{3x%*LuapIIb;lqfzk~HBjqusL>{wEv=D-IHu7~BQ@G&zYP1Jh(+rpu|p>T`ZR@-fHo8< zZvra)%+Af5Gtd6;QT%7QG6pqX$H6Kv$~2AA(F788R<@;9?TjObc9lU7@;H_OmM9bz zOB4|Z&0^^gih0l!>y0TtlXKd(Lr|hywnOB^XHJBezj+=d`#yrI@i|bx0hJqT`Ebn_ zUGqg(yOkEh8fC?>PGiR0iW8Quc37tocNBwmNXWbIXGyLa*^etr@YmnZWH=4Is0e;- zJ$Cby1nmg5wwqIm*8NTqoDovDVC#M-`H(vKJhOcy ztKQwMeQY1egVrzeb|c3M(GJ+N1zJ0pQ$hEOykF$~BJUUZnS8ru*}Ao;Ki{rHvYnnW zr#5DqvbEX9OsY1MzH!x4t45SZ{dV_h^{)aC0q!$Mi&(V?z0bUml0TABr`2!sn#_wl zuRY}%ATMF_rxo-7!aGiZso$P4KUU|NALAm9I=}hxh;|TY#F2LZe80sQd2V}G0M7Z)(Liz`hRSGpkID%}D}GrUHb&djt;7l-cb>|A~6g65T* z@8M??73zv0 zE8m?J0=3-%8=ss?<}Su_q*^Mlw!rGC(1A6{Gm3t-Q7uG}k-Ygd&~6)Lg&QG4IZ%jF zv5_qVVzQVjrm-7epey(piWYYs+lfGC;TxBwrjA~E+r`tDPWobcWxBDk;Ci{-cf|88 z_iNa#*o6ITn4$_b6pZ=Z#(ejAxHykH!SJP#toOa@`9Co%GP%A*D2qHXCR@5pZVB3EgF0Q)5Clkw_0 z7=iV~%uU!Kn(s4a7qS4|q`HkVLgCeTked$-*F{S!uH6(`vOPV2BCbYF)uHkD+dcn@ zrD`Sz+wt%@jD=p#{{}K+p%2>-*Jf$KAZi+o)XWO%)MRSDgYRR z3<0^X1F*{ulxv7GAAt2PLA(jKc4Dde0bG}eZHwLRsp{Xl4UhV_0ztRbh&4$aa=+yL zEg`=0FwxO6br!Z~WM(w)TcO9juIPXjw~h|XOJf}im4vIMlCd=Mm`=q)iH0-)teM8K zDG`%~0o;;^b$e__zy~HirIXQ69G5)R#-kB@gkq*LnZPbQSWv|U#|V+;CpBhNxL$Vy z_EAERzmpKD09bdBMc)J0`>f6DS9DpIf5mUP zH8r(JO)FB%YE2FqL2$!E}Nd1T*pGpHOXp zyzZxPvb-A{kebnLUT==?LPH$i1zUmUi(S5EFfVpt(FXHKel4>Zaqu-m*6kW7xCYRm z6KK$!iNMO$k8p*2GMGY(!K9RWfS3JYOM^5H%&&#y)u@V8d&k#e&e!q9#a}pKLc%{> 
z10!R}$-&U%D9r1%q8r)AxfD!^FHmho(#ZTU{ZeMMGkxop^Pg@EwRf$);!aFHZvQjThfi4cn2|BWg78tl}l!P~D^-ojgm|oA{5pzjtg+h=Q{!AE!%Jq9m}`qwr(?~j5Er=c9>E55Lnsx;HU*eK zS;d`gxKSy(&bK9#!eY@g`a3qbZbMbOl z*I|~=pNd8}mB>X1>`N-|HgdD=P;9cXlo~{=efx9xxD9do>alCwyEwSRmd|U ztimSI*PlgWWf5I6Rt1~<1O0l4kylTM#&zZ@xH6%l3&$72_+i0(xAm^~2`gvMe{&8G1?N3w4&ky}p0!A}@Os{=c%Y$8 zwOTtch`0+@j>l4=3 ztsF;{-c#kI~J)b;6XTsRUbC@ z<1w#j?#E-cUT@_3@u;{O1f|jWM$V|{1~gj%9_{;KXK(Go1*y!k!XueSeo!BX;*G#2 zv>etF9|#7plMQD#E;5u_qFUWQA?|kw)yf}$$&?ka4}Kj6DH`F$OyUPQj+xJ352S_e-GNW63vS@&NUm<~c=B9R_3OSn!g zu@eeCn(Eic=<@az+PhuL3a({e-8bnUmfiYfdUO<*hc`4l`{*d->Tz!$lH*!7ThajC zOdoP>eo-xPU+(4Rx*Gg6qfPkTm-l1OS|ge`NIR$fHr55C9%oRm6jhNl29F@0YT@2> zpmt|D0-H?1ijlKujDfXCP(dAjUq!lYcn@g&?3(9xqmJ)+fA|cRFVuMe{e0B($BgZ& z7k)rGRIVt?IU=0)ah&(%!r{)%BeWC2S>Voob+hBmc+4EIUK>YcjJDt0q;bK_stfxu z0x&mXcnDe#64iNFSX4+2Ru74+xi)XjwI%!}xd}^naOC%nko{Vy(=0+wI$ryWKF{R2 z3dP`OvJQnfxx6T`+Qd_w+j~O^dP^g>MvFAMLlAb3i0>)!2)Py9J5=krxNiGRV zFbJJwN+i)FB-tb+;i9?Un6x9)+yCl7KQ_Adn~x9l%a-td$hbcoKXx_p-DBJaiQY-l z5J@D^4`+H`V@LwiApGuu7s4xh3!D4rwQjt-u>XrsC;GKR0sAaI3}QoNVzPBZMx9Tl z3Z``9^_nk0)?Y9l8*GO@;G@9!vBjwzgzR@Mz{{HHRr}3*yMf*lC?d z{kvfbo|66%n=ofOBK3)Dr8(1)&+dn>Cfvmunrz;T#6NG|EO{oOS?0XIC>IKPDWHpa z5w_@5NeSS6#|K4>;~A(71#3f-jemsKG1B_bY|ZrMn1Vz121jz4)^u~Y6Fcft>5hHH zjW_twv##t`6~=xHZ>G54R&i?vo=hG$VElrcq1@+W$4Fy+`3bXzCx5&p)(qow-V)>I z*jZJ+ZO=Kz>*k{>%sn?;D?G>etNG|U?wA9JCozO5cefjd+qp3g8@4Qubv}BJTCn&E zC(3)T^bKbT)M8?2B(d8Zn>JOrr&op1h{8Ar z0R}tk!`ORImQbHzfJk=5)7>VpRiCFD;DT-Ui$nMVZfwL(`^IczHoNS$MY8UA+oH{1 z!84VZU>XU6sX_^yP8q>ep#)A7Qs7igYczv?38o4qaGH<;r)pZG=^__6Rnr5JuosO=~nw zGa4Z~b{N6aHK!4>q}?<|E`+r)b7uzy@xSD67V}3iRVabegcLYc(;7_|xxlHK)@Yh$ zG(wD8lg2ZI5ja)T8coxTM#!HXM(}jaX@vY~H;s`WVQtL(*+D@#^~SnewsK`ZvNcm& z1y0qpM$H;qwi!rGX%X$RHB49#kUs??4eqZV~o+tuM&y>vhg ziMx$ivpTp|%;6vY2lzv4QRaH_zsXf#bT8liq| zC4#4GP9xN>cGDR3DXfiIzjjbf%+Rbxs9)`BfQEw ztj2hYby(Zg;rV*g0dJ_d+n8^*4sH|3t*yZEQIs(Mv#xR8Yrfkn`}=o6=S&?!BfQ#L zp~iTtg|#u?a_wvrcw>dPG2e9UZWDaBwF1XSQNng1Zg?Zbi;ellYaKShS6l0Fd=w?} 
z+S3iSt^GKJy!l#c}$p;q_elqZq;xX9t0eYq9WavUVRVt3Z% zLI|v{x!^I=xg>ozue#EIG{7hSaUk>wIS>k%xqV38)`O#wKo}9;2EGTv@>kd6nBuGJ zQN&xCaY)hFf-{XrS#?GXR34}1%d#qf0r5ScacY7B^Byek2C!hR0(nhiB^?6sBmO_- zN6RHF^)RgXM_BuKY8j8O+DA`?*2h$_f@nna2~;7qZ$wq}t!eEW%fF6qAsyX(1svUS zXjhK5W=Yz*?p^SC)dRwMIG7&)-~+37o#oeP+{{!C%n;4t{8ZX zK}N-?m{=z%#k6S`)y9s+SmmCn-X1rEM)ZpDM<6ySoNZzQ(@39l&-?9dyw z%gLyZ^^Wc$BfcGP_((%o<1oI{1n2R7BA0Jwfsws8a})?c30?99`Ebi>gl=?ghk3bq zj`6U%$?_N(Dte(@tuP+u4t|{Cx~UuoJ!{Nz>&m`qjeUpl_cH#neaov;hhI3nc#7*W z9Z=(R;+S>iDeFoEQy-iXXndC*(g}^tsf?*y#m%2urRLD>eX?`nf6B0T;M4wO3&#JH z{SSZHc0~2*p!xqP%C`THgybPQCC8V|kmn|xg%VVHa*bdZ0TB$#A}4u1dp4%;X1g-8 zjhV*JXS?4%DWBTLUT$6YkcRWg@>CkfcOpEjfUn`^*}OEcnu{;Er7kpCU5L3Jql}Gl z;cTpe&o(!vhnC(_y|D4%veo1Nu5(&CbD-~@*=(t^d5q>{dFe^o{ZMUOs&YQ}Ce##R}OKCiJ3sdq(K8x2LUI~XfX z$vPg%u8#*>aHUVAnwvB1-207mLQ}1ZwQ|RLl95}q?nshqZ$-|pG>MHnE8&A^M+-Lj|uPdE|bX`HR%^1+mZRS3l zm7k$rYuCtFujL3`k9!Xl#cnB5_jEAsv!fgjQ zCb+XYg0*yBU5e>+-zv45%ZgZAA+MfurOG{@cNaWOz9)onxyBgd64YQc&DAj+ANM_B zJmhKm$hb{y;ACS6E0<8n*b!w92Tj0}1^|qu064h%5R395G-b?>ZA6<@8_=C(Q<2Qr85-g#Gc@xd2f6qiJVcBM3FpG=I$ULA*bX$|0Cl^`@1Felanz+5;X zj6nev)D-jFaplJLHAE-Y3-ebj5>xoPdHt*7OuReT*g^uWM=flWgNrzCY7C-TeuzM8 z2-OFc!tuXg8V5$o)yJNH$8r}=*ohZg1-#ihW-|mX;T1^^odfy_Mj(li7nokv6z^l^ zRbpa?D`NL5(?9_0LbU)=PsV!hJ&ozkbl>1e@JKe@mwBYO7wrR$#@ttq zsb|6x@B%nv2?6IU&3!{C{edCtC#3dj2AI4VYj^#I^)jF-W2t%q!F2bco`5z@=Gt&l zxXt>>S+4%}Vi~;F&SH9vld5ZSa=KU!8qaW_B%=h!lfJCls3BzVY>tX60l=N#<&hsAXx)*!ap;=9&7-3)&>^QjC~XX(Su3qY{39)p zqdbFg+^>u^qX5emh6%8e6hJqF#$X*{su%Ex8NMepfib}zjzQ=`#71H+G0zp#K-e~g zn;vm$99tOZ;q7v@MBj?R+NP}pyp^;&fT7H*sPQ2h8UxKSOwPK-{3S}_uj=g-EWD-m z6)7f$3$#PBOp7D_<%-6w_*-9o(yZZ+zhLrMlUgGJAs|77zmh27t4J5+>Rrxqzi{Z# zi!b&aI<)!FA&lV<9eVj?6x<634jekvx0}jzIPI6Qj`0%L+3crWhp^A_Q^9e6hz@W4 zKsx{E6PzTDB~iJ1e2F6I045P zgd|f8{Am`h%n{QrSa@WG*?4QGA-T4jIxB_nsZ&pa6||u z9NWtWbjORv!Xp)P3PwHQx!?00%4`a;kTO0yl%Z$R_A#OlFfg z_x_xF#OOzdXnc;Kzx>s4>$BbCFIf4*k!S9jw>{l>aP^|gwzt6*4sb>Xy&jAp+b#xQ zgEfY28bdR1k%U1gSa`hB^d;}ML5ZCl%4|}kVv<07>dds|vDAGnEZt-^R!?A=>Io}@ 
zzP2A#&)(w+<8zwAhmDmnJ~xE#8c_VWA#27+N_UOwI#CWz7NU%`MLQ1uavaX#VxgQL z!>>NJE8$3B4US^rw-g3HZA#rn6B@U1X=+ipb9G_Md5zsKWt$GJKyPv2zOcHDOIH{H z!=4}v{fSAJ@)QP`YJqK0BX2f?%>jw^3}Bd$I$^}}+VTK4PH|s4BX|TqOQP=)HcBdf z1hWQ=8QaqV7*kGt8I}M}X+&EK`ry02{8TIiLmC`fm^*p;_~aUa>qYO$^%yT=V8%fs z#zqT}peo+G+v0b9S2#Gr?&mdh(=UnO;;V>;uZvs1nRp}YdRSmyJ*H@*s6sz(VnEo9 zaW6+_#OX{m2KqIfxGtk_bExqaOx1LA0;yXJPYDE_&vy6r6nc*qdIt-fHzGJw5HA8j z=d*=g%wvo%a0kauyWRg;aR28v zJPfuF`)IKgNdSsFm;tz_xljrQPhp$e+l{Bt0gNXvR;~8+nCFAiYlv6`?2*YlOymjX zjGmq%!sH?dnI_~264Qi0GX1Bq8-$%MzQ9a+=?bhr7^Gumby8sQp%5xm9P@V8<4I_4^^1ZLUIS zOwGmJGpT?SW-}<-W=z%+5botN?f173Hh`(y&gO!_CTv!R4THXt!I76&FH85l277dP zGI)?cG&umaIUux0fd>FN2dNQd2t{LP0uho1(F7XPU)w>uaB8*2@Fz35@Tq5XZ4BSS zOAIyz^+M^G91^h!NloRsZF_K*QC`lzAYUDyLNSUrqjE2 z1%t<@ym08ip#zzWB+g>Ssi_Ah=Ri^eY|4S64hZ4K?1~jSjo?Kr(P`9Em<^L}1Q~)s z0%kv(#wG(@5FZ!Ge33GC2R^4Sy#jvF`*9vZC~mE?DeSBQW$!?jlhW*kx&Ib@Fwhx{^4Ar7 zzfu=El3*zliWH9qoD)(O!pCYKTl!{bPHT0@DBO4m(A|@RK<52ub=0x z&_E~L>M224wZaI1BQ<5uq->(ekyhl#4;R0#VhF_{6< z)is=;X)%zeV0;k!fukuL(+@>((A=M(O+f!}b_w>cFK9|P!h-73rgZurap<^}uqs{G z0)(rULqMLd0C?Xf_!Zd*!Q*88q$UC^6G#W|1psw{x;T*g69CsW0KV@4#ADR$tt5Dl z;1>j!1ElN3M{OGY~0K$u|ksc=I8$oEqK@6`atvjFaV(~{eo3+WvlPA=%@#{jrq znF;av!wl9!J*Cz%!WJC7$8RsCOrCY4mRp;|*}BmtJO&(H9|16GY^(<5$y1qkCvsCN zg@l99EF_#wW+9cbkZ`@`+XTNN8zFd{te@0G02UGsM^knIfD}?G3kgR5uW{ve!FvS1Ah;ZWg$D9(08(i1i$cTEY9>bt4Jv+0FINMg&=6N10EL7j)GQ>> zEF^FY$HD+~3JKCUcwI=qEF`ivfkHyEKLT(H3AsceAvzS&=1nW`+r4~wd*6yhS8Z`G z;1z7RFEM9CaB|xwM8a!*WPn@=eKtQs&B0OANs9DsNYUw{+< zu}N+T%Sz0L6MkzdTx_DOC!WQl^B6;e-<;+-^ygdpF#?-R}PbPLNKM=RAf)Qb5qR z@Pf#NSvG=DHyEV}ZxiAkVmZMBFnQl2!UBKWKBB4Yi0{jleoU>x(S`V!&{C(KSm8xu!!KGn*D>3!~X3 zogL!x|3pQW!L)u!NJ~Jcw9au8q9t&H?UlGCBFb9nB@hnO;Brz7g~=EVR{ zTHnSv4-J~lJKIL9M;Cu48V_9jnR)_PR=)T%DFI2JX##jnr2+_s zK`i_O62zgs8p7&oc&e_3C$PfP6HZJKz*B20gsrXc)Y=M9V1=ir)>Z+cDJ((5qAM)Q z!de^7_fk_>!?;dmsF>P|KXXG1g5J=AkTgcJCp1^|l{hAcK z)V07%Grcs^3(j*2D?Ify(^E4&)zVWfJ=M|^SmEiZmY!-`@H>n@v8U4{F`uRBQaB!KZOfbW|C9s?kh 
znk>HPDMIhwp8>e`rt3(59iSx*U?k{4t+AE?aTF;s5`|zUw;-5#@rogD=aTS_Od9(? z%t;oEfZU{>A(1yHy_v<6C;MhrpnAu>&uUliV498i8S7{8XZ;M@Pk#0$q$~339q$VG zCB$VhJOqfTWSob_WJJo`Hy_nzl>ga%3&FixaxkL*VDPwW0t>cVowbk7^4wa{hl7KWil+_<}obKCJmoItXYs>s z3?^{V7H*Nv`c@a|yl*wGu__BS+?82So53XysEAC+6I?cNPLgH-8Ya&1Et)vTT&La; z_?AHT#5vaJ#5wNp#5rQ6bK)GYmmUD&!Sm9@IjK+}I&n@Zg{2~UnxyAxn}!0=Q0zMw zq0WJSTHN*1k4-V6P2>SVyJjFkfyBf)=tAp42=^+YM?66q+@Cl{4D}~>Ea-b7ecO$D zZp~yCq#H9U+Ta~$esI_@h6FF7Hb7`|UpH}1BG4r935HC4O?fbVfHfAF|E)vFcoWpe zYbVZ`hZ&NIbE$mDHy$M?nM%|Ky4SlUc3t~?$N?_WPrSFms0E3?>$n^@)Ezka*~NdJP8U%r36 zP7T3SrT{9tlKQYU`F^gcW}^jWlM60)^!x*oe3>o7nRfQC=X-EgFv-b=AN;D=zdqMQ zMf~l3_$>aIOy*R3z97y~2yb58PEo~rosxL}4O)W;Gz*OFiqy38I3u|QF9a3t%<*lm zUPoDa?4E<|cDpcf@B>?NTPv@dpX(}{W-JVn2mQ1wi2If5B!ym$Y4mOH57myrmt ztmkP8JE7`ju|Bba4D><@nPz3SF8k{l%b~aJ*nxQe`a*}&vTPz?<^@&C=qFrjOzKb| zyA*DnLEj|<65^cdrviB4MV>^K(MfGsXR}9jB8^kKc1=I=?#5nV#WWV8q-lO&LXvQnm(xh&GDkt*Iz;~mQ3gkuN@n((g!fi#kk3?UMn8P<9V*53 zh^C;q>kkhWx9u+OD&!8%zq?b|s%Gl6aInTjAe+8uduYnFZlfJAq}0bt+U7SYw4v2& zRZ%&(iD~HVGr5q>ap=nI5kp=3xdPb*f*-?3R_b*O>EEquq4#RFW=p0%L67|zMcBBx ze?2pBB~K?9&#N^X>i**aAG=a${~Y}&PZ`W5#ONjCyx}O9G*v!3(d3&STsZUo!tQr( z$$fTbzJ0tffe>c5rb<$1`eB;IE^j%lXB9EK z(NK3t(?I=yr~H))CH<;85j#YXKj4fwwt3b4q^!>63WbA}o_pJybF7f?M?UoTzk=?- zAR^=XpVK8Hm-YQKmubGrwdcdJ8dpaMaam>P3f1vuoy%Usfre=HwGwxweg#AS1pW+f=)MyAi@a=d%n zX)Vl8@67GGdtc&b`x2}7Y1M^0a`x&tpTvPVo$e@S_C5CLeTmPU-SYH|^{!{X&&u{Hprn6rSvZH}v`#H6+{ff-|)$kBMh|3q>vLPV~)@!V~$Wmu!c~Lv>N;xydu1DLW58wZJf{`6p8WH5!z(T5o!q5 z5UP<@gI|MJgf~uT5Q?OY6B>jfGUJ2>p@{4_p+P8;XPnR=RLS#8t=Q%^bqhT5RFSPB zLo4o}6-B^RpcNqqO}I@|HDNXpZ%TX`rUeqFVNS!W?VuG&lQU^WVh~dZOo^`V;Kw&j z()$%I=N+`#4r{v|skfBt?RqShgDxYrGJvz7Q+C5X-#~Yrhbyy%4Ls z5Sw@*R(>H?d?7aZLTtkev6&ZQb40R)7NJR$7?BrZ!3(ka3$Z-^NeD{d65TkA2NTUV7egJ)amhp9!m!=T%prYKW)P{|Rgo ziDC1+Nx@OM!;(#=atpb_{6hQgMVNgY8(Xq`@97g3@MbHfKA&@qbZsWw4R=5`%kV9Wq!T|r$Oe(i;ux%6)4lDnBp z?PgtT6NCz3ZkD(%glVoqLLH?68#+zR@7}U&CpNOG-dXr}@s(}D^aNl=VBBD98nQSy zXHe{7!$^~B%^51rDJMdf)ChTDM&+z2nb0CgrX`Rg6s=qp+Y;59HmK5CVm2hx5U;Hw 
zT4qLTdSXx8Zxrn=2S8eJCR95&XE_BnXh_rGrjdU@!~TGV>j90?0~(wMG%62hI3CbA zJfH!1KqKsc#?Ju_paU8~2Q-8Z@DYmtJ^0{}dpe!j*}J!`NDOUTk^C$0M(`oR+oAUp zzPN2gdKvfv;Vfa1u#fPkguAw_D3@mQ^>QZIwxU+bcWNTqr7|1+iDJLZPDMl&qAH>R z(GbDLIDEggZAI_51ilIAe}lkl1g;i%9e`(KouY|-v$HMzriVCItX?xBQ|x#h3Ogrm zx+!<(mRxSzNsduC$uSBiIY!|m$0(fS7=@D@qi~XA6i#xC!by%%ILR>zCwrbITubQR zc5<5EUfWLc6emD1Ro`~9|I6D>@~bUd-gdIcr9h;fE#=#@+fMSun4%oS#ZB_IlYPHO z2~TZ1$y1(sbK6PQFWK0T8_R7c**=f-R=blUSKoWLo$UQSz{-VDTwiEWQlb_@z`WxYab@aD&^f$t_ zg#LBb-`075TStGRjlWE*s=uwX{zk<6+dAuSM7_VQqrXwYQ|r9H!Oyd)ZkwXmOLQHb z$Z?BR80|^9?xYTwL z7e}vs=g5e)&Gq{@B4?8K2YGW}WFgPuO-QpZX|^b)Hsh^`s0Jp)9;A88*Qr=M=uGf4 zSR7vB!z--gDZ)#eJcu$|b z-Hy(>9r<-T>g#rtiI-uN;gn&~-FCF#wcxejwcxejaU=iBf0OW#X(Q7{h9O`_jeIqD zHF&hgjvBloydu0Jydu0JyehmZyehmZyehm|c(cT3p_|Z6(woGaq&MMB!<&XTO?$O&=Td!9;UEWh^mVm6RP!O2kRHZJ%9+@Tr_Lc2U zlqLjfuz8egsih|GPzBvGyT=-sV9QdKMZC(&V;uuTbLJFfhKY{Y3bb2hC+E4m<>iGK z)`Gj`Joa_Vqt1su_zAY3+zFp5l~C5gZdFl3f5~9I zi&2!ri>{?^PDV%@$F+&5WCe?*uRfA$N05)k- zP^~p95t9(6IR#WNlKK3`fLsx^4Uf+$ABezghZ?Y|Yc+3{nnAb8`@$t)XIx(3FmVAa zX20mJwWzMk#AEU2Li}uqFNC-!#M?u>*KxwGe)45i$AbsobK*EZ`WF80CzmaTf;NsI z@PNmYZ+4to7vj|+|Ai2LH{{#+l6WljuS5PrA^wLD|D)r?uZQ@ycxZCsws>eXRA@HT z_HMBe(3iSFr<*8s>r>rEwwtT+Ka)5o*fZNLb?ma_)id6hHj-B8Ob15t@}y!$SMw!W zUQa}LjIK%Go!OC*{78=9y!_}L9a+^){6$c0wCJbZ-al5eauIgR{PISu?-QrxgOsEnj2yH^|A6L57N;gNYO1Dt$)~c|o-DI zsKfli{KD_&a=$Z@9m%fYKWVC&fwGfUN!V1?0Bj>UWDf>V8Sqk>z$1_D1c7TrL8%co zDeBh33Np63jj3*JipW&AI@K);W~_@grn|Li_|x6$bhj*Me9ly6mKpiXNOp5}bYx_{ zTh7#r)WTp;?T&BgX0zRFuFLcJ6duVk2H>|@E_E7_;jddRQ|*jBu9qv@-Ac7vsdahV zty`JF$(h7TE*G3u$H&EPquOmKG`lVSPj#oqyHgZh?fxHtcU(aohV;qf)}LMsd((?y zUwSd@NiW9z0(dd*7r=|*1@K}#8|L%={$kkQUkv;Ei(!9%G3EXJ#iaN57n9!KUrc#_ ze=+RuFDAXeznJv?{$kSm`-@?Je~}*T{auvzcTwKoMTK@kMV8(c*$OYT7AmwC%CFED z6J@VSsuX{g$)a-Z^I!S=LmztRcVGF+H=lXSo##IPyEiRfJnpyS1z|EjX7gZsTBRdeD?|$RTv%r<6G;#87>P|T zw1X(oDknv>+A5YIkZs19B&tOE9t7xFW+t^Yc^P*j+HwLGn{V=zc#oIS_qB5z+L1KQP8Q1S(jo1v)H1i!LckUDwB9tHE3L{djj$$|pDwx&scia19< 
z@^*zCt)UFuscQQQuLi!N^p+Z-|2F{R#Mm1oataSnWqJG`G*^+_`*Ul|1`h+p?{JP~ zQ;z28p6u~PN?xW{0^xGXE1$`zhD6xoLcIs~P&g+x%lACpvHFnqDr^2s>K~BF@pPM$ zo;e8Gdy;n3=X3#%;pQn0?}XB@<75r1fI4l)&Rv5Q&%DTE=uG-yi%r()uC0vD(NMu9 zenvDyGwl}D{7(u^%d3*OiFYCCtk#n3HViNQ1dQ=KuVqo_6S>h&J~!HKXLHx4+f(!N zjg8AnZNdzJ|BB`n&x?^)fEBI%f_B#J>?QDr3Gbcav&HZEg{cH=U- zssIV&=opuhank}cCi0MCOVc<{&FANH{A$Z@?&RnBKQE?&L`*x-NHJtUPf>(|I993( zwK6*@vm-?v1uLUTwD0poXO%ds#90L;&MFbOaT)Jq2@q{yIQx$XsBoYGF}Nu$OvzYT zx-hkS%Yv8^!sNaiz-BSfNM(`54f(7go=k4Y$cBt;U}Qr^HafBc2#}QxS=qqKhOBIu zl|W2&|GINB-VSAecn%PP+ot0w1SsswqAO(h_` zL{)-_87T9MRm=xis+muORI}cUm+e(`a<|+J2Z$lF&1xCV`3?Y%ST{(~X1ANoD|N(W zB(D|Z+f)2z(LcL9#8XbWrHWIkGTkh%Vz}9jy;h#=Qm3!Vav5%R7v$)I{9M4)f=uPb z7Xh)btRTw@(q6zr1!*ftNkQ%@prjz@@G^}>QBshS!t4z8E!aiOOTW5~#Z*9gN=m1s zbPA-t?90i%oOL`P zL-iMk%F-N>oGi^*er1=nIa!;NwK=TK$y(-ri()ND4+{`&qn};v1(YA?o!9z+F3Ziv zlz{AM%bqs&@M$2xdKFMiSp^VVZfqMWH^RZJBHzUtgad5=8A#nT2zdg9Dq=}8vRIRmKm*410k38Q9;+!z zki|et_JXV&FUE|M_L}*mG&R+p;)12!d2U2ADS@1B0?`Lb z%2?)N6<}^L#pV?GL>-GG#jq5Jx!h_Sl<5^Yq#`STRsoPljAx#x1T$O$QKzyiLn87F zhe`>$XFwh?p1G!C9P>}bi?`CvKis97xY>cB6yt@-R3c`p29#pO=7Ngmf{M)r6`u=0 z8<$yiD)O)@V|AepWb+DHeE<~A?+}}kq8lH^PcfB~x>!}Hf^u4YtU3VYG_Go-3UQkZ zP<1NQhBM>(^w89n*I}vbQii0&51aPH2A6-5OF^HQtjMb<}Y_x z=rOqrbN!xDLq7MI5;q@)PINE28zteiFS>q(Dj)&-JRx7cWCHQ5O)Y}r;s8t z5QybK1K3@aWsV#gPd=-fB|syMEgPz8Pe4Osrta-6FyM$Qn}A-Rk*-3ngpWldkbzV+g**Wzidd2j6EiZrID_E;V_RP=&Y1l|O0n)(oRR%N%VB&qdvV6gwxU%s zpPq^%$||KgS(U6tst{j}aR=n1zugPbk6B9=z@BM}o>tM*Rw&R& zF;w2PqAG7%6nv_IdsfP$DHW^8YzYhpOd0F z@uX-juNzF=SNaHKxAJo;)fB5AagvRIf)%PhG^Z{!r}1u1gZ3Qb-JFK(IUBKo8<+8} zuK=l8jLynvJ>@oNOq9S7Tc&4adKS}Jzyz4-KqJMF=~uWj4tqSKJC76r@>y1; zTA?yME7P->&dwkLG93tz=~f#jB13^Y;175Do)sd2dHcFVe(8e$sA1TZ{4kJcB zedESu&8B%!D7?S0s^|%n5@i$+VW0tu;uQ#E%PMFC##U=nlCd!k(%CYO2*$B&YcldB zv;mVbF9x zV3g-KuAJ8MC*sT!@WP{lU6kGNI#Fl*h*wKx(OBA zUFbdxo#COzj>F#uYx5>oG3Qy5 z*;O;UYGx47vSSdW%kZmSBT<1@SQ%jL29GQfkRO3|^_polGYx2k;7mx9F?SNdpS4*v zI1rgWyaOeXSvM1;ydZ~`3*$3RXUY7~qJ#JjDAMER`71rxUcD-&y*4uB^_4>KbS7#g7 
z^Ak?e$~QiaQa;QhWwyR?y&mt7Ocgqt7jnGb+{w*Pqi>}&~|(|iz15Dpsgyx;x2tXzXlN24+-q8H|96*x?b-#M{hPuUIy!iQy0;mmrTRQ_+iGXXRNlGg%qqS_W6fSM3h7D)U(U{X}pqTh$=`5YkigutzUY7lDY z3!;4I$b%oAhFOzzMW7{+&H(1DglC|NqGkm89|jaeOYRC9R3&mHq28j2~9a}bTOkVOL`om+(%DnV-jCBQLIPCxPW6S#g z-(xI+lhXTV$Cl+RORi{$uQQJ^GXi+0{uoQRg8fer{_8P*c`EX~WBiVTm=x&y?y+UF zB6xicaTa0novBflpYl`fTz+$oPoIymwnZ8b(-AOW8A`{Nl~f5ZkJYII1Ro+E<2k9S zAwZ}Kl*OVFKm!GcDMFjj|8dm-f(igDgr;Hymb{PgQ-z?EIRz2?wB>V`Y(YI^YMwV? zbJPVsBRF*-loeso3T=d{VuZehW6J{mVbT(vLXz{luoEBrl2IMphjRFOr{g# z{Yy0mE}Y+-%X87F`8U%orLPm*eGkSvUbys47<1HN%A+8Hs^?=;gB3XvEnVz#_4th-rvv z5e*1NB>;hUr)9na5~&1q1Yi`1P7&IKUb-!VlT`v(Av6^uFfLIELh1ul0Nnc!b>NEL zOyg$T3%hqswYMyA#ZF3CV3tr;gdqyMe}Pu8*<2M{K{B-o3cYYFvHF-s-(&GFoaajW zxGvfB$8~X~Kd#PuC7wX?N=7{)y*YL4O5RRBb|qU$Ko)}F*p>ZH9=nqFb%@gGg~}a4 z-hwsOH54-mozvffe+(pe>^I`i5uPPnAoLKn6ZRhCKyh%Ea6RDx*`ljE{qbD=2)jRa zcAq|Yn*EKA>^|Lj-DP|gTCL0muhX`8MrglSVt~tH&-c9(Qji^3V~_y0@cTDFR*= zOMX3KlJL? zB}db*IsWR`#WxsA{su^UBWTm@O#F6jwsoCt%TvjIT)HSTb+)QV{s?E5ZU?y<11 z1DF=*n?u2$V<<&e2vsbo$#BcT#IGW1@FvQ98Vk`!V&a*Vv0iH2FjER-E7YmVw6qc` zh%^Y5I;NEC^X*)rQ`ocT;l9?$^%_mkGOnU{eIC*Px!U9?6RH%csuZ%2@Kv zvRs~O1*D-b2&!gVoeh$uWTJ=#ul`}VzJbZ42G#DA8SC4n)=F+f7;Z`{HF&N}e0ODH z^~%JVm5ic`i)YT9II(!*;epB<-}uJL8wUmkUORPQ_mTNnawWID)S))vbo!qz(!I10 zBWRRxNu)Qnp%bwqsu#~jJW;9r@fh314JS(Tv5Qm>o12BaO%a9UrL=PW_vGQV~MsBmSTU+0U`6TzfOJ<@Nto!V&jd}8>q|=5|GbSP@mO-76PuFzUcrU=e^SG*;K~$I zEd3i6194F^w-GbM1wRK3U~CSc01O|Wh5~TW|M2p}Y9fx=joG$jZjlm58UR%nRTtF} z)e$u%O8YejYynLrE$S4!Sw&|_eLl;+)L$Ux#O{Nq7dXJ{V25ymz_!2$u^aUtBVx#6 z>|Je9+rwIWRj<4$VmZl}|LF=cf^6`ML`+#if(?)16ggL>3iBdu{PZE`fweKRBC1+L zS~v(qyJ26Xji36K^S~DGd&YL!M#`wmVjC{WdyzK2nGMfs!uz=+6lLf#`g}EvJ8@YI zPnx`84rg9mq$4a8rWSVbA^L8VPsy(U+U?V7RzdbtuA2J{S!}5yutJ)=oaUq4LhOaz zh6b>wbqfl>gj26U0_S5LF>@WcukUF<)ySGb?AEs3LUb&H04DqQsC+@u1)~c_PXND~ zc#k@z5cA-|&f7aro#R^wc7@)pntfHR2~gIiWo;TQI}-1ab&2=LWFS|}CJtVkk0rhf z`F0MEBDMehBmyW&8}3wAFQquUsY*l^=`s?%uVq<#Uw% zyl+cyDEYgw#H$#KFT*W=JRcjl3a3#2uU20{%JT(-4Z-+;u}T>>MVknW^KfCqm@W 
zTh_V#U$@w#$0oxpd~#>({=YQDD~KVE(mmeuS!ai-VPegrtZK=6wi-6fik71M zVl`ZK)O1v-AK1Nn&ygc%V?7HC^PNR}ZBjgWGu(3Id9Pk>)x3JSIp*r+=8UVCn>Vgr zUN_P7H?y=Lr4TEcLab>D4LD-0Q89Xy z5X+xJ2``0pP{mmK6mnQHnrUOto&&q@`20s^p1$M2-MT1@pEp=HxWO~9W~$sQmI~_e>D9+~0MoiV&f6$J82<=S#JTg)H|FHvIF5~L{ zEUcFuwai=IBt$ihze{Ds|2+9C=S6G{@efl+6#e1W)R7$fHs$l9Yr?_Sa|8y!A z@SjlPKcT~a!hKBzfrH-kR_z7zalWJE4^4{Iz_6q|hD6u)=C1E*LE2k$>c2RTLWfWGz=%jMvYo}y!% z!n>!a_bHn96uTc$-V>yK^Axx6r)b?%v2^UzXO^?CcC&Me%PIhr6a>VfdRH%}egXz7 zBB>&mn*wi5<19p3w11{`Z_{2yt|LR-FDDbSn1%V!c?E-rNiHfMK!1V$Ty!w_{KYw8;+_HSDBj5#N^$1ExUwQG>b#P1HzfWCzOYggH zNq^;*<*AdmOtNuJrd~zW>@C!Gc4-{g<5c{QlD~!Fj-GXN!BLN+?T+>uN>n_Sb#%R> z2ZS`D2r;4vaXDG+8ADvK7JI@F7rMoMVu%q%G0w{|%vho*V~G$?RD>8rgcxpw7+HiE z8ics`gcu=&7*d27pM;9R?k!t(ZQ1^^rw>_!0gON8QVevGa>#=ZMM?%4rion5V-X>T zmi1}|2(c_ZKFl_pjiM#QhxyjnZ3)TD#l+>=X)AX*kF0|DqI|U|7cFv?0jT<-Dz~VS zbWwF*R3#Tx??o;`0HZL%KLOy&Mb&*#RbHg}o%t>7Bg7dozBNx8CSF+7@JI<(PG1R< zH3h+0D+*FG6rdxJ7YHm~H`2Kdk^Rhaz6TW)XbKeLi}4o-Up&u^od4asu<+E`g{Nb& z7568eViPVPTT2Ok(cpy@_p`m#kij1q+$hL~UxLV!iahjbqyNevdEtBBMans#5`g3d zDplKE?K4a|vAb5>pFXLQk}qc?F`ziSFIAi1AxbBE%~G_jO;p;&75Ar}vrG@FI1^-x zAofVGJyt$<$C}Z#Prfr-*t~P1xU*R7o?G4sYO|C!5u96|Rim(^)LwmlcilO0q)M>iqU z$I`umsV|?S$0%e(CZ0J*$5627BqW+cI+MHsiJzQftUecOv$Lh*$ve-*_~gUr{{Ni% za-ophyfgRyiJg`H51qMd#hHVjM3!RucBuTEs2Sy}K{W`8zYu>MYTWfyA!i9qLLVnr zuuOc8K6#GbcaC0awt!Sd3r;ceR;Hb=bk1G9oc+3$1%jU^MohCUJaHkuVXu_RM6~-x zh~Vkd4dvjB3d_OiEI5Qusq%0e?8n|d_nGBoye!NH5));0M1=dA|v5W?rQ+GqDj<@T`O7wqcZsi01TW^ z`z7aM7Qd4?Lp8u4Q`Jt5+DLvBP~`z5e|)mT(7e#)<|8Jq()+&#U>?}1VSJ~C@txfL z0FvJXFuw29*u9eh*&sF3J&$1CAa&;Ug&F|`RCl1UW#4C(t8m6?gDUX(VZtU>G8 zgX)`!LF?Xw>cfe_&VW({ws87fxaabOybMQeeYOKmre`}hj{EA zu@(;*GU@%YJR+Ysc;p1fColaS`B3}=tnHNRhFPx;pXo(f^CPGe#H}RC# zi;0L$*^m+9P5obYfe>$Mh3O&Q)K~NL5O2zb=^@_K4&x!-)bm-F5Amj2m>%Ly<6%6+ zn;Ky}#GCjb?gc=EKH2@jsm?5Kir!=in+l;oh&NTkc!)R6hVc+@YKHL;Z>ogx5O10a z<00M@84&q2OcqLhnnSrJAg&Gq!ID5HZ&1LfP_S5Pn}`W^@{SA!o+Fr5dLaK^9aMqw z;pC_D<;D9oMnr#i-0`U}g(1gOOUj%pqQ!(s`jt#xLXpgsJo`|*|B+%{Hj>V5NWEmw 
z;r1Llw0Q2|eO;-DUlAW?+!U-EWLQ*$J4_*d?po{v9=p?H4|?nYk3HeBZwlG?WSARl z!vTYm6H5#uc>Y#mU8AN%4%K3#h%);k233pmE{pK8RQkMdvKSIq^XpcO_zOL}q}9WD zPwA=6cHq@(mbKZM!XT#`^~a3T;YP55{eSryK1G!P5CNdhKLSZc$ROt=<#gavP6s~a z?7*iSZwS>HPque0uTD=!c<_AI>T^n)iBZ#w)M9q?47Bq1^ik=2+XSJ@R05i0t> z_8K+xGMFu$Hf zQ(Lg~CEtO*pJ>6DMb54FayX_@_a;Pere!F{5^6}%tCr~W2I&U+>SnwyVh@ih9)9^l zvuB@t`Ar+oo_^+;$L>6WRD8fzcfZ_%uTwW`#rQmaa>Dz&QAnkg|@3yNY$s3xJBglZlneo?Rb4bl~8T`%fRy2*_j*;GVyR*T`!yM6(cOk& zZiz#Jm*1Sj)=(zGsfEf~?6{(i4b|Lhi`ihOQ6t!khkH!1CPITIqa=@{~mCj+4 zhI1k2DZK0BeLE+{qny+&liFs|R4XE>Sw!*^)RX!By6wsS6Avvu{BIT?Uf!!8KsrCe zvQpJZUX_gKs66hO`95BfOj=7+utd#rI(V3@(>@~lBBdE@iCyEjpUU!SjoWkR{x7k^ zusJW_ys=Ca(G4=Gxu;%3=IvW~8I^nfOA{cX`B(Op{>RyE{Oqzr_lTmtlJ}U&GezN<1L8ZI`|= z#D6Nxe&xVRR&liGb5GBIrPu>(eGW|h-QvAlH{Tom)+lsYw(|1X2c}b%2un$_?SO28 zmV)O|DFx57Ty+o2RN`wFbzzs1<-GbcEnEp=T>rGZ0wh=R zRaXA1#P1suzi&+1*th;3)~c`yxbM>6{i6Tful#CZ?ckuJAxFcGMjVYg8gsPD(P~F) z9IbT}I2w19b(C|IcT_Mml<->Bd%}6^^LrAeMepYsOukf{iCjb z)b)?L{!!OI>iS1r|ETL9b^W8Rf7JDly8cnuKkD@zbA4m3Z_M?LxxO*iH|F}rT;G`M z8*_bQu5Zlsjk&%t*Ei<+#$4anOV@Xm>tE&iSGoRGu78#5U*-B&x&BqIf0gTB<@#5- z{#CAjmFr*S`d7LBRbJoKu5Y#LTkZN*yS~+~Z?)@N?fO=`zSXX8wd-5$`c}KX)vj-~ z>s#&mR=;$8*SP*Qu78c|U*r1Mxc)V+e~s&3tEyb zUF-VRy1uopZ>{TF>-yHZzO}Azt?OIs`qsL>wXSci>s#yk*1Ep6u5ay2*Eewef$I-k zf8hE9*B`k4!1V{NKXCnl>knLi;Q9mCAGrR&^#@+xao0EQ`o>+~xa%8tedDfg-1UvS zzH!$#?)t`E-?-}=cYWioZ`}2bzjS@Gu0QMgv#vku`m?S->-w{-w{&v;moa@WEzMSjJxxSq1%elUs>&v;moa^K4Pfa|z zm#%N#_2*rG-u35Qf8O=yU4P#7=Uso^_2*rG-u35Qf8O=yU4P#7=e@oK*H>_T1=m+_ zeFfK7aD4^WS8#m=*H>_T1=m+_eFfK7aD4^WS9r<#20s4=gQhR?1D}5bpML|Ne*>R? z1D}5bpML|Ne*>R?1D}5bpML|Ne*>R?1D}6`m!5wEpML|Ne*>R?1D}5bpML|Ne*>R? 
z1D}5bpML|Ne*>R?1D}5bpML|Ne*>R?1D}5bpML|Ne*>R?1D}5bpML|Ne*>R?1D}5b zpML|Ne*>R?1D}5bpML|Ne*>R?gO{Fv1D}5bpML|Ne*>R?1D}5bpML|Ne*>R?1D}5b zpML|Ne*>R?1D}5bpML|Ne*>R?1D}5bpML|Ne*>R?1D}5bpML|Ne*>R?1D}5bpML|N ze*>R?1D}5bpML|Ne}kp@Hx}Jz>9P)d{tbNo4SfC$eEto5{tbNo4SfC$eEto5{tbNo z4SfC$eEto5{tbNo4SdE8e8vrY#tnSN4SdE8e8vrY#tnSN4SdE8e8vrY#tnSN4SdE8 ze8vrY#tmM2#tnSN4SdE8e8vrY#tnSN4SdE8e8vrY#tnSN4SdE8e8vrY#tnSN4SdE8 ze8vrY#tnSN4SdE8e8vrY#tnSN4SdE8e8vrY#tnSN4SdE8e8vrY#tnSN4SdE8e8vr4 zdd3ZW#tnSN4SdE8e8vrY#tnSN4SdE8e8vrY#tnSN4SdE8e8vrY#$|JS>oY#%20r5k zKH~;H;|4zC20r5kKH~;H;|4zC20r5kKH~;H;|4zC20r5kKH~;H;|4zC21_&UwDn+q zDVsGu;|4zC20r5kKH~;H;|4zC20r5kKH~;H;|4zC20r5kKH~;H;|4b4#{bsU_;~E| zZQ%25;PY+Z^KIbsZQ%25;PY+Z^KIbsZID~rnc8Yg+p)fbpNz$ped%Mb7`UpyLgiHj z7B#Ob@TM?PBaP;`-wdw?U)5?qIdE0)Vy!-Hm~*QQbNB>!DnFfxQfBhSM`L|cmlrwd zV8f${w@-F*`T8|29!u*Sbl&0QHKIqaO7HBO>YM7lW8s4fyLR2bcy{+4rw@JVLkB)} z_|aH;I`uyujivrwYFIGGVJ>YE#k^ooR50ARA3G=g<0!5i;(MOdCQnao^7PauPfu+!v(hPR?>JcL!k5P@rX`}` zU-{SXCA{y7E8}DFDx?sx$1MVM`rrwYh#fp3$vKw<5!-re;&Wn>FL@v=Ab9{z;Ad9xN;ylfe7$@SFkK@4F{5WAXyKR@f}O*SW4fms%GDE@$)?+EJ z4yQAS4Z8Wpg9jHDo_U5-0?wYf`1i~+&m25RyZ3lm!YU!zidPP1tfVLRnEHu<|d7U6PjipO@rH{m)BM%oR~4 z^GFo2%3O?GlEV7Fy`r61Y2)7Hu?Ru?5ct&;Kp+kp`{0{$)5U#i?d3PuY;WkC;k`Pvg z2rC1^MJA$@31{@|2owGws^A|uBk^2h&~u^0o=5`1ONa2%_goZ-oI;v=?rV|s*Fqr} zkr1S}3huCLhq*Xx)yP5OxEr{Cc)c}m5MJQx!2+ITEjSnNtr8Kvf z(SOD&wKp^dMr4d;2BqnuG$9dQHiVa*4$FoSl?`dx`ft2Eey6)NxonA)*it0fyu9yY zQX%5_<{OXP&((T{%k|LX{{QjEEiLuEP$<1FBK|TYA{F7tkz97kSd{w8C6UK2q2u4a zG=fE!Mm2`$^7f)2ych^8LYJ~-7ZGL^>Dg|HvfUD8zk5fFcvi|%w=ixxDkB}0K|c#q z!ZN(WMLaJ9o@T)r%_h-I@{vm;U%Djn*d=sguOezi%PYe2iLg8n$R9bv2*?i|Z1GN=K&oni3Q7<*BsQRVX3&QKA4r&S`AJFqmp$(~)0>@?Pry;{# z{a7dpwo<_+NN1It#z8Aegb9c+3H91gBy3>~l~y=IWr#KSm=HbB%YpE6AiNxUZYbjA zgdOuS?o6yV-FFa&J+8k`E%n;OUftEH(}g%>(36azo+Ls&Da4@!bG1plHk63}$Qj^J zLhdvX+-Vq}39G^(c1a9%Mmo~gygcDmZ+BApkN zPX(9CL%C0V5I*%m_|%7IRbT8&E)$V>ZAeg}b0{Vx+&vntl5&p`?w*J=o_vB?;+cDl zaQ8%{k8~Ij%9(qVC?7;TnQuIIM?^Z$-Es4c?nE3(|2x84$`m2@NH}y)L^|(c`Q{!g 
zRPKq0k3__mLc&LExkn6hj|g**5pqu`8(u#0vRy6ma!B~arVbr4oWPM$N_<9r5C!p% zM8uaw$TyZG!jeQ-l5$H#GJfe%oY)wZ>%?Ke8`!(Eq-@MZHeE_N~8TfIqKx(Ni!Nn0+3d5WFfuk6bsjka?mtM}nURW$ZD_tPm!)$Q+TF z`{ckI)o~tr;)&Slht8fp`{ciQc!0^#X0cEO^B9#gBb~iuhUUDeJwn=AL^I#T%u&~3 zNDr0k&J9u~35P`(X>M_*X@s1)D2ju^ObiM$OJu}E(wd7!Bjr=Nk(O?=Y1rVJSeLx1 z1PZn$cTyRh8&y^FP}D<=j0QEPqcE#crt&MQe&{eREv%v{XBW>tdF9zNXBYY7T778o z^y$UZ4-vse7mBV1yBaTMk48q5i;VCqGBN_|BjZqZNzE)FDwk{SVp-w!VLtY&!ZoV% z@_(td-FWK|OFDEr5$V%C#zi_3!6zY8_fk>1d)7|v+$S33l%tR>PI4p>;2T zG9$U0bxH$%WMXJJHI9^g@jS!Y_J4Es^E(E)%348%<@8RAMgQLQ{a2;_*K$qS8-kZ$ zO48PGuL{cJK~w8{S5@o0y~@Xe7NM-5Y7`%H)_s}ysDirxEb>7PbP zS6nqI5kOF>cX&ItSz=-N_!U=;ms*?=D&0Zznr2Ys;Odad=c}CYs{@g9haNk0|9?JA z9dCH+&<$@LzTvGSH@tQ9hPRI0@YYo~ymj>rZ(Vc4Ti22_NFaR(fgvP@5E(*dXjS~G z_{Zcvge!0pE{NLMMT9nCI3~( zYh0w6`<=B@@X{^zKc5BGpKK zVyi$hp*r{O6--(E2NJ6X=o$m+6a$Gf1Bver#J_M}+jb4eeFJ!A0H=U|_8bJjy#Or- z47}m^$k@X5?R+fumY4rYnz!+14-ceXHW0sk;LFDc(qB*22I8NO-*S8){#t|IKR%FH zmH3;1)YS2T)VGfhB>#KRzCt%)SFn{dTL|;uFa4_bUSTUmsjn8A!GzACy+R8{SJ=vs z{VK7gA^Ta9*4j{7ze|!$sALmL-xt?(%{uXe8@HV}@z4_|{%~Po>%I1@h^G`4lJx*G z{d#{=VHD$=c;RxtQ{tumXT)=ANiK0ySWLWcFzvn^Ab=0 z?1njJzjc7hZ+8zs%dek4F}rC4n*F++j-?agrxTLzzfpAmjiy__jAC||A4^%CYZi7dtjEl+se7y%Dmg^bGPJ^`b`EjKGRn8*e}9L zGr0+qn=rWv%ReE0k3E0vvFDFH_WZHOo5`#jcHnp=~oQT1I4rzGN9P;Z=1Tk;wSc+I{l1-7T+HI0L9n+ zm*w`G7&@lKbPoY-Wu?hu_p8x0axsEb81cS-F>+~&T^mbDc6oRVRZiS|W9s*kKkO~^ z{c7T8scdc}H!{|;5BWznZ$8QZbToe8{LwcYKgw8fl(FJy>Sae0|K%tH(oqJaqw(91 ze);&(#Ok9OWR7b1I-2;KqkNJq z?Whe;M;ZCTZ=xKv+xep!p^mckRv<7gkQK-Y69PqnlE9=uS)d|N6{rc+1sVby z1eyXZ0n^bDH6<`DFe5N4Fn9E}?sG>qavt3R!ESL!HH80?v7P+b(c~4R@EIA#@_C&A z#`2w$0LBi;SUv<3z*s&Y6Tn!$8WX@+zTgtTSUxQiz*s)S62Mr#uM)smJ~|V?SiWu& zz*s(U62MsIiKDkB{}l}d5&!Jb+wO|rt^K+6q1pSxkK~SSfq8hJTJErByu<7%`P^aV zqQg7xO?=_-4nCg(3tbS5$l@-8-3XW!B@QkTu&8Aavm>5mm=dV@I>xcw~+r?dxTfxtbB;Lq}7ePLz)J{nPQIY=@pXewah)tF6p; z6OSLh`@rEZ9yq*k;P8n9hwq5rd-$&Se7YI`R#4+)sivsJn*}-0zRHJY?Rv95KAF$9 zGd=&U?{Nk0PgrHrN|XQn=6Gh=_~780OyJbWnoQoQ;We3(Q>%tDb*F{~GaaV}hcdY^ 
z%jj4p7v>pUoymonhDI~FFxMEeVYac=naL>Is?21VZ3HruT{V`O3}siZ&XgnB(M&m% zT?IE(v3iit)7_jkLz!}zZOve&8fF_BVb>QISPdCjHkPS|*@odp6mpdnP#YBWGK@L zvkk7zw8Ctoa3jlxGp)$7HJMhJZES6(9c3HMv?IT)&a@+KW0`hj8M0Bf(adyIp|Q+# zq-`)W9cCLH&P<2dhE`|hBH7W*Tqrv{n3=1eev%C^{qJS5aA${@`*e6L6AU{!I+6)S zoLmDpG+{N}E>gU95IR&hx+YVLY#XLwI$SyhBEOBSaaq0;Lw^7uXQ{uarh&njz?FAI7}Z~=e-Vb{E%$cXponeXU$!oRUbYwWfzO9p-$jtxF`@Tr53 z{cdHA`uN%35e!p?GO3@1WLUzO8ef^zIb^{Y9kJuxQf&1pVf8)XwVA%Zb#m2Uh=)Qv zoS6uk`q>-CRkJEd;_~z4cnV7)phu za*3r+gsiZ@)nS3w-6?QQSm2tlKN;Zf#iD+M!JEb0#uOf8unPc8h~uyg`N;RK|mLvE3PW33@}Tkq^7(F#X{` z$oHxZ(-&IP^`>7j!**GJY1H?F>D=$wjLhU4C}*`gxz7oqh811F?ZDPSZH3@Rffm7%7do2^6*$#rVTa} z5_Xv3u)_?GSz+;?({)S|YItz)#HD{9i)j%4n5CFXqfDt$);>mAV;gm`Ec1@>tlf>W ziWn7XA0Dy>6K`lq$Iz0Fp(PzdOFD*@bPO%&7+TU{gR$8>w4`Hrsbm|PEp>P)@9
  • >2M;bjbx?U8F_9Ytzq2UDnBe0^&KaB)Ox$Voj}1OzoErp_qeh=J_zmM+ zZ}4q`=^5iMH+rqnuK=bV@B3e!8rx1lrQVqORpI1+bevir;-kX3=4>YQ_byW@X?9wQ zrv5(6^7qQJdy(yX4}SQ=2M;}X;^AEj+t+lj`$b|KdmlS`m!VuU$g%`3I=)vh@jatw zf97aWDET%+@!8ajMMp#YHpkzIZ*U?$8&Tg${zk;QNTsd_b31;U$CcX{mfI-h4rQ09 zw=Gel)mqTH?ia;8hh?za{dXFCGEroXq3|EYpS|uE>EAN=?)U}c068|AeJFGa%UJ)N z2)x^I@_q5WH*J3{A7yOW{nX-?bDy7?zhhz^FD2iXe6zvPc=<*ZsAx6M*(bFUPYV=F z{K#RVEI~PqOm7Rt&4)j7_#HX&M~?$hE3cx}ockj~*`PVgwmy_<(ONLGL??dq9l0Gj zP%d9-6jg&+uYIe{4#;NA+l=v@c<=G+0PJ&p<8|?|-nGZCdp!B;$FG}Ztj-+2j{6k* zNp)TSyN+L1AFuNy1gV}mqy*w~z_%|RI&$Pp{>lEAeI$M6$$?kC@#SZ3ja?U?l>i$h zvI}IsMK<&kPbINUBae0<+RfQ< z-85MVso#^?PaVIG4^%p>I-U9WbtMjM%W}RBKlEz#DvxW(6+gzI%k7|s1@U8@T2|V` zt1i*~{|}5@sandBW|3A&*M|w_X#ZxZW4XCxE=E|v=ci#iOv-f{b(FhY?|KT1WYSCfjotOeO7HG7e|X>{uk4GjPo$Uk{x?I73cF3_GCiv- z#)><0?8P|8s<*jYv$Xz~8%qDeQ15ewCW20_O-Hu~4bN_bD*S-K@%pskzAFu7E1hv8 zc(fH{*+M&vpfX$j|JZsP=qj)C&R6F{M^eL@68be>&CSTI*S)k-)mtfaCNY^*dGlmv7@Z^Ik{t?FRB>Ui#wAfdf0E4 z*Kl>$dc$}32A!Dwxk-1W6wXxx* za%7Z!X`~hh`7m$vx#4yv^;UZ0$bs9Nb8kDTv$r}D<*`dFz=@hvh>6I8;_Xo&JIMFE zLMb=%RfH0s{^5VA$|u^YBdh9CKjZb8KdUPfRx5M^U4#;$LKq`V5Jr@-hR{Q}jnGTz zBlL@!5mzM)5_(i@h%ihTQ4kbWq%4p$-gJI9A0!$X?SdKU;-itJcvSoB~-R5N0kl3Z*T3oBeVMmCPtd1r56%cxCnJPCl1YmOl#rsz0 zjcs7Bn_pHkoz2&7N=!~paVf5@?(y`R?6=N>T%0yuHs*{QjXQZ+UwKBcszzgdpH|UEnTkrq9x;jZ2iUWrkIASBwjH-BEwKX&@hHvtQjNxvsoCZ4{ zS-Jn90s{sk_CP|n8#uyC;S?Lik|2DuPNeCNg^mBGu;R9kyVQ7es(7<9iv1?2!?fvJ zChn9un!DxUYISN|b*%dE*w(!xow?M(x?(SHWjn~1WPq+>A1~xRs5cZJtjkOSd^O}C zcQ1hKyN>eS;e*)r2sl1a0&^WeW~bmapnewMMV7rvcmU#+fthffQ2SBzz2F|gZYAkgIFF7k!K5z`GRO1A|rUm3KA;Tt_}DIExl?&R0vfjf4JUx|Fu*%h!Q9t$ zPW|4lkl1uZg@y#Zf<8e%(6}BLQ0vQM@9^?Vx+^Rji-FxH1A!)Vmr`_B=u+ygB9TpZ z^+`5>?&>v*fV#__O#`X6dfr_MT2~%~j0}u`Jry>;RldEblxl5w69|nqifT>zH z&=2UsC?xm)DOqjfE388h92JMKXC~R=xwZ>mRc{B1>gK?}gi^S23l_)3bB$IhI-ZpZ zEVHRn@=#Gd;Y1PSloA{-k1;k|ZXe@j<`#CC%L3hO5Y{- zyGx_o3*p3M7w%KgU8(d#dDZ)_x{l(wc-|tvtB$Ys>|#+bc0ziB^cJai7aH*q{Z-zR zNqtWMC~Oz^9sz|ip+XoSR0+J(okXP$#2sV&5$|){JGEuQRBD%A5(&I6xGcyCHVSqE 
zTw;Rj5Udg$N{uj!4HY5IUkTkpp1u>|#m-=vCDPHdW3-vXNYNKbi1>I`5El-r?-~*9 zEl!SYnA~vBBp05dtxDxh_8=<>IA92&EHWC)xSln+8hEQ;`9D55AH ztP1#q3CK7p`2iuHjRE;s)Cg1AK%04MF?}Y1J~M~aXWpSppP5k6XBg@By?t=5n#=U} z0oDNzI4DJZ*TF%p1D~aCt`%iHS zBjnZAATJ~Z2%Q4v9k52|TxB}eqa@B=0V9(fIR@3jm#D!3!l0Pj1k6mpI4m~17vTK# zQ*7!1PNAw_&{Df+&$dT)@7w(dX`B^*iW5L!oWNwK?6W}Qr+_X$MhnCKJ@v0r&1{-R zOnK9XDKGjkE#X6yUtVo8!5yTCGh3BDp;z9D87U5%VhoCo4~&(}U_gpjhC%@2q$rmR zBAKtK9N6~A{zta$IVk!xKZ9|`?d7Bbz6%mXo1S$BAEmq9Ob_nJqQDh7WW%x$oFGZHXw}+0x;J(nPHY8;%|)t&x9{H2ki}NUcOCS0t3c zi_D`Irhc@85284yiD8g0GXE)#;4>_?6Lq(SQ51g}f&Kj!59XzDsha)HG>K z%ko6>3a978pz~30eyPRgm-<-lIqA^+Qa3Zd)UC`f_3`Lnm=^iaO1?E(-x^X{ zLrNHXp8IS*E~IdWO|dZWeAsuMyYEg6DcopvETnL|)v=HghMv!_xY60*Tqw*wA6B1l z3#n}(HKss5WSZA>STSVlL#l@9rWYB_>3ES6+b|DPA5Qbggjxl;Dv=sg&!zhG77s7T6 zVY`Je+Cms@A&j;VMq3D@EpTU8Qp0EqVYG!X+Cms@A*{8Kw?!1fS_|Qq3t_E=@XCd- z&O%scA*{0y_E-o*EQBEz!Vn8#h=nl3LKtEpTyY@`u~7Jz`x}N?2)A4aZ(IoDD}?bC z!uSede1)*O0uOtrzhQNSa9{S#3O(>N169@1&M3ZEx<56rLyz zhUd6~S<&1J)9{Jb=3bdZ!zc31y|Sr>Pqa1n%BmVZ(cau^Ls-TWtz)F{0@0S{+kkRP!Uc+ z0cG?mqgNTd%HXUNP)46J_;fX(3{F%5W%Mh94_^by;5-#5-)L2}ez0z^UWqEmr?CMQ zXLP29&|MEuaiOg$gKxOEG}bIL`$J z-ZKsKQID;u1F1i&%RFq7T^q4igX}lqWKl)9?qYIFkb@@me-N>IgFI-GSsi(^5ql*_ z$t3%Z2`9O(_JT?Mw#b{0#GNL&4~e9{_mqg92r{`GBxk~tKT71;9}ynyF*zA|Z6`zb zqatHlfYc||&rjCTOSo4B`vLk#+7p6X1P1~71$VFDL4Y2SHVbUxA{%i-36B1JQm5>s zT>$6_Y4eKi1n9#jQ&;yi+zR=&i8}I0J=z511RDjE)niPbq$}&z=@RHV{n6xggQ+rg zJ`5T*(e|=BK>Ss~eqa+lD61RfJR!J6a1fvyWYZ7gf=%=WRL+knkP~c7l{<#29Th#J-|VQm zT~@ct8o@H-SG}WNR~S(MbA{_0gX!O-+dy`$xK{zcfV6u_b{@XFtgI&h&cge}9|Y?E zK-|582Z79LrOk?aMNk5w^q>yUTmb5~iQ5U}K9v4Vnl2zpwWO{nC<`ir0l~Nv$Be5J zT+*tXLC5-x{lkaLELaZImc>7VNOrmqTz~-lOIm`cRh7=Q`291jh?+m$6ljfuhFg7)T`I91b{xh zrmnokbzOjt04hquU&9grn){ks`qhKeH?iJS|SS2_FWIh6< zZxZw<`V(+*$Lx#^77yH zySXUw!g^LG3h+Rs3cqc9PME$^$ODz)|JeAdQ&t)Oy^segmAlaV3iH1z35ob^G$xm(AzoEjmlF6YyOxqH4$XoB)vUiZwGlU67Mw0 zv}KmGm9wk3*v}Nr@~>1nS64j82eSxO!u}zkOc*$KOZq!!&n?`Ldie9_7N#~4=Fedd z=P;LZ_?>fgnFr3*btpb}j{7grYou+0?uBlL9)%t{hvg*hpj6`Xbz`gQGNdm!hY3M< 
zox>#0VW99U&ta|5!{@NobJ*}XjPx8fdaiDikUmb{#d8?xIo#VhF7VSHHE4uSLIwdx zVHc)f`0Tlb9YsDrd2ZnkKYuRuhFYL?qTG3;tZn!??!*u-6LN%&=dihRnA|xGi*WLs zjQZTu>F=IR4xu+Gf+tZbEPxBy0R}lw#?&;ovk?JsIcAV1}Zi(}_*08Vf?dQ_g z6OE5O`r@O1x@Sk~-=_X29C6Uh>Y&+!LF;D|x-Ch6B4}o35}!}#+Mu})1syJqBpD9| z&Abwni|ixZfL5KE3kfX+?M&K`<$kyFS&xF@i34sbB!yp1=>DKwy{{Fk|AR=&F_Y^s$^L^${r628K5v4EMDl-*LLspJ-#FH|&ZLVs zahPfPn<#YFq+!q`H)+!N(TK&7x!-YYqUg@3@3a#GGfCf6eRyxRx_@tV?BQps)rYHN zsV42RnzX-a;*mXcrj%EKu5P^pU9tTDt2>3BNHs02mc-oxOfWp$scK>ZCbX8oPOM2g zF$&%*co4X~I4IeCkM#w!`z2eYuTYTrM~gjXN{jvHEBlyP(<5` zCT%O4w5@1*I(-MgPPXak4L{6&gJ<=do~}_xaD@MkwA6|0%JS&wXqmq%PpYx{S5wQm zlSVKMm^1d9HI~Jeg$@E084y$y1QNtHemPIP0i4tT<3{d#E!Vv_z{vfs<+^FNTsPmA z>!#Uq-85US+jjs1NTO5G;&QY&x_--~KD)`8LR~#CE?RKMAnDxu16;b_a{IenUw@Y? zW2XJd{xo+sutY?+h?UIa&(gmq+oNf3l z%T!+~`{oZ*7Z==MQ(t&5PiEHfi%K8jx!d7p4oN+nB0+@QB=zErr;R4-#Q&GnS1@>C zcCGnWjr)b&Wv4%3yv2A>SpNs+?=?Os%&d0$toc`rC1Lg*;h^PYE{Lz+=Ja`CW~VUs zArPB0UN+{88-?0@C=r8=rUMK^hOz zc#y{D-T1s4pLgTIlhe}={Mz*N3-Kk*t>0p9{)MDHY=YNu?A0LqO)v!|*1F&b{zhr| z6_a~|JZOR^QDSr=5v;Ua6f;t4%mEY}he=5VnqDG`(swS(1npwzhycGW+4k=lNwz zO6Xzj#Vk3|S+QJ8NSMO|Ne*AS?jBvgernwm-+9$(>6Q(0_qq$JF}X;vr!oI}QOxsTy9Y=Gh%|3`*6CAPj=4 zdCU)0!walxct?2hSm~%*NRuJ7n1_uFMyZ-)v`a-stC}QKEsVoZH4#)TbP`JDAtYm{ znum(4YEu7+s@XpHKy=LEQZ)n>`>{z=shTvY8iI~Nsala}fK<(+N(N)7T9r!G6hzfv zNRX;6B2>+RB;b<;I{~)3hqTo{g!dJk)Jb*n_rR~c$mcmyQz>5MVO$-AE7wZnDzivt zlL;5k73&DX6J3h&6qN{1?unFwWVSoTWo;$af6Rny&x&zLTZHFTg2L< zK=>9Ph-XVIuIc~b8{T??o5(_bd%T+O(u_MYqA%1$IzKzBF~eM-r>Gt*GRjCTJz)b?a{Z(JqgzI+ z>mTN-ExY~EhsUc^sYjVlJiz*>HqejiIwrtQ=27jzAJqo<(Yg+vH3K=XG6~xBRU+K6 zc(ktRZ-5D&zkgKEjy_t~*yA{WB_e{6dQ@jJi2OXzxDLQ#IYz?u92tUm1WOeAe3a`J zhGu>V36onR=m0ncQ$xFooofBM+`76)!)ey3#E$?T$OSnGd$f+-^zoqfGOnzy0j!sIKr=N54E)-MF}Z^kj8# z#})Q~02lhf>{_wL%&V|m5Fl>kjGO#h1P2B83LXTwnsJ4z$l_iBxQs943tF4gOO#jw zvhM(?9ut1=L+S-!Qy`a_^+B&Fu5W0uW6tbXp`8F1J17}AQXEUl$Q_@XN^r5#s(L$y%J!!&O#^i_2i{t3b@goA{82@jIDT0Mlm zLMV~;&J`>R{=y%2uIuWmR=19nDlgvid$i#Sk51A9mB{Q=>#uNzH`0BD^RMCVE2&3F 
zy!nTpzjCwAn;<{@Y_O2@2Mo&lYo-3RxNZ}FSWMCX?ep~Nb#+R8Qg5z2ohoeRJsZQ< zndU{Ae|P2SR~MzftE9~Hj;0!vASLM!bGxG+tqrEO>I`6O-2x+L0mNSw><5@NTV-)u z>pH5u$D`D2o-+Va4+`!TJP5G!->Mr4aIXls;IF@^Oh_WPbGB+a!p*D9PJqonGyQeJ zWkF7`5n%T&X993%=EmRGBrE+v+6-`G2JThCetaZP0+p2?MWLTos)Nrj8yL;<0u)2X7NSP1+y%Q&|R~vXR};dg$&D zbAhN0qY!Ko%%|Ge-w*^91p$S0DXERiO=Kx$K#Wp)RFJJrn|3B`n1Z5b05(5xa{y~l z8}^{+HGn-5++K0p0s6=WiIk%N3m3_U0G21X1+|$Jw|HMF-$g20C$e?{>?ztX6h&78 zY{}pb1MG|1uoi&zuuZ$AHmzyYavaE91lV1*VKk{W*@uvB$smMuOeP_u#~N-Seb#_W zO>Y31rc_(%8&9R$%KiK-o#&SQYx`8&tt9EHK-&h87t8WjU&lQfPUH7%O|_+WkR!Nt z;b+sMsV8Yjw`s$m3EL;rj)d}?EyueO+Lur+=SK#wVRnK3gpMY3FrniKDAW2?dv8*y!W02Q_Tn>^8vN6a` z6ZU03pd-ktAZ+bQol{#lFB{uk<(ExUsas{Fz$U?b>Q))(tr+O7*ygR6<*gW^QhNCH z8>A~RB290R(%}>yJXiLbz=+UMrHzc3G}XcLy+tL-GDX?kiZp#2XgH;e$WC${84Qf1 z)sEnCbXZ@pI5jz09MU7 zq~o?uaKNnd1}Hqxd^U9}FI51s)>~;r)TnLut@=F=a9NNOY!vJS*lNRd2v!LWjZ~+& zb|_a^PMR*;z7+$$RaSf}?;NGkz3wVaaa;M3$E{4aVrnhEWG$n#mQh;ED6PeBtd+}H zE0?iWE@Q1+##(s`(&hlX#aej_qSpYt1>D}6AArVXdUCca=cvjY0&o~`3jq8B+%5qB zuvY$o=t=K9Ku>Tgta;iTdV24R?~Z}`n&cwhP#I<|70~S zoyRB%n+Wsg>x%p*RQOM*@}IDONI}r60F)_4{s4sh9*#)=^gNn4k0RjDoJSYXx$|h^ zJc@w7M&2gqUg&n{QRt!C+Q4tdxPwqe?1*G6I8XnfyUx@5^K>45<$1aeJ$#;ypT{qr zr|;+K`guIld3uk`Y`+Tyy>WhF`o8n*IS8LVKhSqRz2iIvbRHF-Pk-cm=ImKAn$F*{?(BJ9QZiI* zPJR5Av*(AaV$UKqJ6lG|0u+t~6J94=Cgcbk2|LeYyU-58D#D@j<(^{S`Gwg(K41Um z=R0^+7%xXjeDwL$_cDLkVr zDO$H~9p`1dmwx+=dWmQ*=^;0=Jexn^PGlGGYJ&>Fd+HF|m{wMJ8Qjh0iX zlfl92Xrk3>ed=-a`Zy|m9F;zfN*|XJAD0pzml7YBKYCo6BW(^qXMhpSk-bSdd&O-B zP!Q3hiXH+`AKU@}^}+1|(A?wlLPS>rXbSGI6^X&M>5sRekciSAm(m`W(jJ%69+%P{ zm(m`W(jJ#d{3n!2BsKsc)k{3%M6Zx>{1P3zM87UEwBD7jO19jn<&sPTs3-`k3W73$4q_Q% zs+X_^1iWzxzd(4MaG8)JY$WWw#Ds-*5LOWm-CfFXPP8h!DjVIOTX=K&uRc46D<^Cs z%+J*o`A?|upHSsLVgC@JcaE#51Nsah&{%bLeyqo!*@J#OLQw%^YrbPPzmtoOIO>){oRJ<<=Dc*DI|8R22mD(e53j zo6<8tN2O>~^Tymj-yE9?p0Jr)$lI9I+|<|68n_{~sXB_Hc&@RWJliOpS!ulpKy`xN zo97rP!Rvy{f}CKZU?-5KNL`W8Bj^{5R)9h7czdPewzL0NQ-jQ9f}o-xP%7?okh%@V zUa|aOoVF2gEQ56eVpNXlKd9+HsOdkb=}+_w(BH+B8GwL3#A;E13h4Ltt6 
zdiO)T#M*!LV@gG5vV_#2+%WK8rK$&9X4?xK8;fVB4=P{Le zECv3|JO%=to5wunu@3laHu)g&vy69-s^41oX!__`18-b&kwq4W#dy#}?)> za`=_=m;m(fJO(h&WS?i?=Na;OCOYztlXr2Rk)CIu=QYkiML|$i5YQKou&~!4EEBkw z1=1~s)KMYbV??<+(;@Y_jZo4y5#|YR%rnIEjPQKdC{L5kGlKI2ee;YV;dR1gLXNPJ zuydZ#gLV*B5e~7+mCLMjwZ98>Mg9{i{3lfTPuM@CAm~*9$`m7i078BbN2GsxfdRO{ z0KlKQzyLt!E-(NW7y$TdHu)g&w-V06-U9K;cOv(18{)>xWE7)?>KoEFE9WX7=Q~J0HC5Es458Piw6KK6X*r{7E`@| zrU{z}^Mp4_Z9MjS_T5>6KhVM+_QIm-37FLf&J@+^o|S6oeP=YOn+BUxsa`A z`)X`8<}YLlsV_DCA7?M5zW3Im+~Iqw+~Cs30QDB$;aXzqPct7iU%&pI_wyUxn~}OZ zVjI%mRB4{0T+|eTYsG3M7Nz$k2M(79N2!AvyDhv|D~ih9;yp zGBqK!$k>GHzRk^qt`m_4j}5dBGc}%43@Ieh$uSH>t%Tu6}gz);={3 z*Zq+E_e1j94=wDjs-1G)!0Up`f}CKZU?+g{hU*Zl5*%8$b?e@}k38v`1ovhB$D&l_ zzSK<(U%hWb!rvVf+~BNQky^q6%BB`1vHC;A>JLR3QyujE8i)08PIcoiHHq~Q1Qi8A zRY6cDprc9b1V$;?$)xOLQg$*aJ0W@oz)s-i0PJK^c0%+TfSthY6}Mg7QE`U=>;!HB zz|s%53&1WWWfw$OMk95JXw=s?NW@OaISgPYld_XZ*~z5rWKwp5{Nn&7GAR?8tgHW6 zYEr`r%%>)8EFfL7v4C{S#sbn?n*>OIZ4w;z2FJ3BPx35%`loKw^}%96oz#E>uL~{< za)OP5od5$4*CALX;3}SfprTiAY^iQ;cwkj(MoOKLQh_r7N|=#$W|;dXC}c(xWkw2^ zQQv2zWs1)MsBlJFn2{D{)ORwj0VrUG(VdZgDYgq>&?MD_u_N^`z+fp=!oR-ozex&2 z5*4d-afP72t9hvCp}0pstI^-Ohf_1?$Y899#eTr5!B}>IG2wt&7vSX=GYfn8TplmQ zoRJEF*9DgaIl)H3P5>3cbqG+FHas)BvoynH1%xcfE(8rj)n1HshF@6xGb`VL9teVp zf}pA(C=)nv2UZTF6b59624sg6wnJ7<^bCNN!_5Kce}@!D^csMb!|fHfUEEP|hX4i; zZUMk3!0iGU;2pAZqKTE`ALSm|uaU?&kaHMdq<6^5cW8WfXnc3b%8`E@z#Hz+aPQDa z?~tkQuz`d0#$y4}9~(G`H#C~iYg;==zisV1G*f_i!5aW}zXR{4A5%>87}bG;y!HK= zg9lT;nEDUA9PF*2#UT7((8=S~geK|Z0})5(&PU~@p0L$Z8UK`~8^FlLAfF|itF;5yL`B4LpUZIwV3;obm>xWk~C5;UI z@B*epKX8eD#D#v~LO*;-D$x&Itvk|B?RlYTU#8}VZzx2Q-sJTlmxCmE8zZ*U1P9=f z?wFi_V~0fK2t;rNCf1T(;O^KJ6JFx(lJ1&b!N-W; zWkm2ZB6u1Re2oZuFA*Ngg4petT1|F6j#^Q6KaN^s_BxJQb@DmHbmA=1kQ2fCfa$-k zf9WZHVEi?$xay=H99G7Ei&k%qR%@x!p?E8Zwqy{WP(w3u7N8`VoXP7!E(dXiT5p`U z(*%=qtRu*(AUfHJux4?sxnpoiNsX;u{Ym8V%h zpodShdYopMPqTKMW?)YnM|-o?`l=4nRqG{Z>v*l7(QP*D(66$Es}`)9CBpf@ZR zVydSZIKq6X<1R+!#vf^KoaTi1G^c5&>;Igx+@>gVZCvlhto{AquDaq6?xHe5wQLla 
z8wgaq=mN#6_`ej-RW^?8;b2q=bcxG6py(N(!x?jMYs_6!?k45#6}Mg7QE`WW5g)q% zu4HZ(T)((MN0X)5P2N-8v***~tai>}xIS^&m=-GYYGKe%ht*XbPU6Fdl{|DJOJXeJXhMNiQ95AMoLZ?(+K^d?szsyakf zji_qcn)$`ETSg!F&@CFCHr^}FJ>=34?#f;y{RelwRsT0Y_5iTKnCe!_)<4&;-%8Cc zlfC%fSmW0vG2T$kc(FtovRGaRmSEj z*Ubo@z8ac0;c&&$$4w}^>FW!-1U-U&vn8{F)q+03kf2vE+-(%lF6I+q{%SUR_9}{I zbgnWuS2Z?Q-|~g!tC?SXYf<`*d+N{Jle)W;A1;>Plj8VS?z%A%#ntkgX0A4Vkqaco zR=*R(z&pF3=~d~~=?-+5Fg-PnOvktAm-SF{3pmeJ083uBCL5Hk> z-1lmH=2O9C&Zb^S|F5;wRBNg``^b%0jaRAc|9DUAx}?n6_egrr(HzlQE6>*}mehZ_ zR!)*e`EEtAKj~;>zqjgIN=-nnCH=QI8n4(5VYS9bnH$ZIcuhhyUspU=y=MhIQdi_Z zp~8PcmH&kOLkfc4BXwm;kTn1ytB1nrpB}-J9Kn;opE-i(fX*Gka~#2Qz+WS86Lc?h zJM<{@&=EWbbit7utEd)H-sW2>k-qB)p5zF#0Dk2WJPGvh5j@EeJjM|`#t}Tm5j+O+ zj+1xs2oB;14&n$NPf$@1R22kt!>gyT7bp|x3cgZI^$2dBu!%5Fc%!4t54!lPxy2(a zqes#^jJi-HaQQM1gp^Ft_kc>_PT7vbvxc8ZG_JAl0{)#(U!vWplu zfN;XCJmF@Za692?=n1zJo~E8~Tj6Q!3Agryn|s3TJ>dqQaEnP_sYfSrdRc<*`|+n^ z)aDaz^a-~bq}eCj?h|hK3Ag-&n|{L0KH+wsaKovamY;CbPq^(T)OcB*up+1m%7TH^ zB5kXI2f1i7xQ^D;{!%ZK<-&}XsRD)8n43$_Z zM6cox3o!|h0S6gzkRb;-n*q@u$hi!N0f1Nl$cYTta9X&A=iWqlzy@OFibxTf+9pzk zrnd=KR1}lGG_y^lG0pDD-&yA$0=NS*gYD`H5raGAASb?2Ea!T-8SAt z(o=L(bn24+uC9>@FB?@{r!B&rLQY!5cNkY04>A0?yAmaeq&}?S7Z@vI=@?iw82J^o zN;sL{Pm6at%7I6nAc_Z3Jc!~!%mc(cKn4_KKtTop#9);2fRG^;ze~tm2QeNH<5Bu1 z^LvH$G3}?OHcvN9t0q_g}+F5e=Vk9%P_C{M~`R+Td!Q>yJc@mGE`rWn<2`(YRB{SgA~hah37< zZ+AwvRPSiCuy>`@ccr^`)wy@ow|Axbcct@prRH~~&3C27ccsO5r5p-h0~oG%)%SNb z1n+9x-jx#G#Q@&rUZq6Kwt=dkEEq_=J6Pn`NPT>ps4MkuhO_7FLiY@~D`8T9TgPqw z7hgR2{PQREllReeXKz3D;-fEYx%kaD(|?zHnj0iba*K#G{6<8Y?jcg<-S{IFenkPF z8|~!tU1po^C!Vbgmw6tobn%urHi{C5_jP(z96!kr+=7v>0mC8i-#fAOl-ir%6%A-8}aS@B))eFU* z9@r|JnLc`TJ<`LcUHU5*`Sb-Y9e(It=!6(XVu1n2NU9A5HghCsBy`fQf z>r)q-3Qt|Ur{P;qUF2h#`jquWzClprqiYv4UmYCaJ93PiK5urB56JTIF?yWtf9m4G zKE6|f0>+DKNKX-l)?9pAgWEV(*f9C&`?~h;J8=KWb=!{K*Zs8*E-Zd<;m;Nu>c1p5 zQ%Re4J1zJBI_i#MzHfc+R8Q`h$=_?A$!1}F1TL6>rC@ZFRgWI)Bl|?y|r@Jv-o+AI*4_)TiNEt7O0-`tZ#47RuC0#XbSe0A%eVZ4izuRZu9li!K-0`ofM&GAW?B 
z{v&)<@B^8DL^Pp9>q}LXoG_YV#Is))k)Y~oCeEgHTShlbE<42+2&X1X+qQoCZs%w; zs*D6s#Z*gGCQ3=82+*ip%>IV@4}{cywa{ZouglGg)!{N#mz%RoG00blA+fPngZd{k z8fSQ!4Wy5rL7HR=z3vu-wwK+na+ijrj5*o|{Nv)rA2UP#F>zclebku!$JGAF&QybH zzLoieCr#bgdC#foFK*kn|AB_-`?j1s@bdJj%_MK4z=R5aWz$+hB!0H zr|*BDY5K9R{?^H(PjqcPdGda=Lst5QnW3mc6O;tzXmc$|BS?~r5|UV>5@c9Vjh8Z> zpHWwpkTjw;a3qC)hmjpXG-FM$rJ&6-NdsnQhS3X&7>_2L29FpkUyU}`Dyly)BULK5 zs`UgWh69F=J~hvDb9|x;O>5;dN|2rk+78-mOAuep(}-w$(vxI0BD1fueR}r`yVoDx z_$#OGSU=Lar}L3eo$Q_bn(4z1Mn`1h#D+D!RbX$Bbu)Z0)NJFMQJ~5kSAl7h=(~)8 zw1&^kXhNs{DwVZ8bg?+dG*}{1`)d)3=2oP#xfS&zsjT$t0SB7@S+e{M@2C9W!lwVk z1pG0+!&`z(1@G)wYQJa*8NzqS;V@wee{LSd-lC>-|nf(*r^9{ zjDZHUi)DccDng?nw9&~7a#y7d7Xc#${#8sr^Acf3sz)G+WlX+NB2JANL1?t>oTXER%rrXj75wVBT7+@gxzc z7{*V;t6dEpaDPv$>Yu;Ax_A9UlOxm9d#3MRzkAs2YIakhyIBg89v2^-i_Ys4|pC!w4g)el_2u>kS=X2puDRfl4&?m z*%^fXb~|WZDTsqb8$1M1NEKd=f9A1U>LnD;=u@|n)kB8_%=EtlDKh8afULI`tTq>(tLeUiz zRKX#}=vG`Yk}J}dD(qvZR2597nlP@$`%Cp$D}`I%L)+3%+O31sph&5CCj9S{Rvt-q z=}&sUNlg12$m+`U)_o_Zzr`!>c27TZXJ)d1VtC4UIO#Xqt$O%w*q^ z_fnaVD}yGMEzfxRn+@Ys8OFs##sDL=LDWl#j1r=KsY!ImE5?(tVm#?(kW9RjyhCtF z&7_Sm2T9XXYtlT!g96nF$Fxx)QAQ3Kj3PrJWH`D&i>Id_oti$f<@viOU+fwG=G`xy z!PQYbecVJAMojJtl6&4{SO!3JxXO3Z%r$-4BxHMyd(vwhg_`cPlavs4L68X=(xU|r zVB`}PgX<}lNY!2)4u=620`Vphk;2|Hg?r=<0CuWO7cr9OG@S?V!zskb{OdoGdTg<_ zOd^hE|4C90POr6lYjx@~*;lH2Zy$N;q0W;BrcWV6&DCC5HH5b_VXVBccuMw!XM!-r zz)Bx2PyZ6lPWT>b?NI}mu3o8Y5lwenZ%@E)N@enhT^lUU}7 zoe)FSGgi&FEdSs_En>NyEFX_)c7Qkb65_oSJ8>||AX!C`3Auh!!pyjT>(T8m?D@?9 zvTyg1?Ju6B2J+Q+h<4@>Z(^Zd`XBR*Q5hx(M9veE`}>)M^DgGGWJ#wOJ`IDUNV0lA z=E*>u=^uNaV6|Z)Rq8j1mRePcOEp28XV@AD@2ibMna4^fymE^oI{*Z@>HDX=gNjV5T8!t|ntd zCp;1x1sWfk`5E$=pwZUo9`PhvJ7wmGro*w3!B(*{wSbwEdQ3i%@7ZdPk-5*lUnFUC z#^?5#`|3Y{8wNFT`9<}x76$EiuWx*FKf*G9i-p}p#xx_JAg9B(u?2m7n!%w&X^>Ue z7SQX7eaYwA=}RPSJGkwEUwZ*FnL02zvS;1MXPS|DDLm<=#f^UbCgP@seN~&_C7|JZ`RugpxR#a$@lT=cfk;ocAXfXXUabN%EJI%_u|(_4;sFa z%8K^sv<(O$)&5~^$@{89UFwG{Ia}4#2Q{{k3CHM)^SF#q#SzH*vendnCHAo=q09AN zP{E(gXx|SLM2%JP{$lIo@dtk8?ydK2`RJ+L_p?nn{^AiOYpu*pHMI!2gv0Nt7;s%8 
zi7X)<pQ|-W;Yw8GMnB*c)Ws%+KT!m9nmOFx(AE$<(Ak z<0tOB`@j>AZ1~LBfuqL`oILQ2YS${Y(wEg_k^R*cG@}Z#OPQ$hk7hLBW^~eEM*B~O zN2RW(Y077pN};v04e0)mmfEkfschm5k`+JmMJ}W@>JpO}Uqun)vwbJYA)5)$YgQA_ zjMfh`8WuCdm0n$@FfS9!e39!MA=XSYeblaeQ2Q{hdNObBgwZ+*sjQ!#oZ7l|`Wqvq z=hy3LHw-VSjYXdV_8S>_{A6J2|%Jti9AhPvb zf2Nug7$Q2PYP2(zd((a=bN_WS^dM#KS9?HhtV}onsuc^Oy#_>+3F6B?_j{v7Oyi&; z@ABq}ZFm3a$mI0&r@wGt=Xbw56@}R@k?j+8sI>%u(wf`OnHUO`;)qz^_@8xG9&x6SRGe_);T1^m)!1EYixTtj~cEtl?58; zvCb!RYqjRZ_T<^v)IA&fwrA}KQ_p&%57EjAk!#j2rs>NZhJ7_YK&h4|J~E4OP$jvUz+?v*OSU(f{sY>jeTDJ8hNR45=Y~G@3*oMx~|$rs$BRV(G;=i z0<>K?!Bvo~W;q3`&2nYcX8wtAuGyS^C6!ITQnSxULrvsaW06p8{!5k3(K<~@%|~^u z;d8Q~hR=B>L8L?ovqk^k%X^#~KBwL{d`>G>!{;)2A6B)SRlA#BTC9Bvgyp7G{KC|9 zS7pnA>FH;x>(+1Gy3aW=gDIl+_pR0`Zne(`-|4gU#%1qG0k}qnKe|~ zJ&676Fk?X9iE4p?$VEY7cEQM8AdHU}H8{qRgxns?n?N79$s+Y?9c}uOIs@t~3?i2f zkx8=yy4@&yU{d!fh7n2|6H>-Ro?XYh7>3I)2^CjG)q=r}aw1&Kgbbvo zIM`gqmsT_nbu*5!Ws#>{x|PyQ50O*5|Dc@)w_T{P;lUYsF5elDF1Z9P#X%I~;v+Y; zbYAU~5w6BA;gVIkmBw_bUERKRr89VK)7dSuZJa)`ZR@rp>#G~?|H|Gi+%$ED)*$jN z`bll~B}m7yQHE$RA*dO%fzc>JY{#7C_^QUGWJ0l@Z~Tk6ZqFG7ZD(VQbT+3Ngnr#Q zVdH1O7qTDossWXhh7GBk(!ZKopKM2(t4wSUE8>z8aJU<<)+>a@?0d|Y4T?7 zGs-HM91Fzl4dICsNl{?jq_I`Lq@wKi7ckv9IGG&WY;$BrQiUh|XTf|RX* zLXCSf`@e^!7aCRsGd^6oeWZK1*)g)MMYp73MU8(mHJknLBM0{HyZ@PIp8U$>i}7HF zYDm50iMAv6OK&zUBtcB(dy%6{?<&#|O#MB)ua<;*e!X4)Q8SsJn53^jQWMwBXuQTT z2W#Da`#SerjZ5Dd+|#Nx)1Cdw2cKTJIsIS3F9$a56MtK^SnA+cSN;1Q+s*m%Gt+;% z+n-x`>4?2{)+ctjZ*3*@tbi^)uNKq5CY2)popU7o6IAEI5IMcL~d%NnUdv<9LL6{JT?Di>ms%+xzeTbD-ukG|T#rN-&~2aX*$y8Ve? 
z*!Rp6k&#@P;@I9Q_wp2<>^|atW3qa31CXxi=2~*QjhkiB z9j@;t_uO)S;=9u7udeO#xC+Fo<(7-|^Cai$CW&o&#i#1W7HjrpwcWummaECG*7Z9}mq)SXEs?Q2imzxzRq=P=#h0bG>CF<*62e>B zEUvZ12e-DcnMl3UDl-*vGg@1hJC+a99*OOoJ*VEux4GSX2|Hpgl3yN)%h{w*WQB#b zwYfcQZLXp5Hs2aby#u4EYla6(d=0V|d0$++bs5AvZJ~~~f~#y>7Sf~2wxq5V?o`_f zx1()^Tg>;6rw_b%^uU3A_a8X+^2-ON_w7ElO#=CTuStsq*6;D?rQU(@b$>CrU z#x-~)YmzugXNG39Ej6RFjbPZBo6)|U%JSA5U(|K-vp662!W4KmCWDwz2uw0{l%~n< zQYhtyUdZs(1?~XW@^vQQf>GbHRu955sXcwEKHMOiOO_s8Gxj}FsJsN^J;I6U1fh?M z$`i$9-`cnPnfveAwtd?p`=5F2`1)s<6;)qR)eKHp-_7GHqvt+5QSXM?kb1X!b9kAX z7yCvNmEF8Jv1~Z`HyscU6v0&;kJC@(qq@xfG(NdBr8_U|WVuumt{j%8>cWoCO?=5_ zFYQ{#LlIg>V7Txaj4vZ`w*t>3ox&WLnO!~+uU&KHoVWZ&UDC3+fd7O!}FGjb0`NQ#Bn^>-eG<6t{gejs&oz-~KtA?TE9Osx;y-B{uq3!p2 z{^{?RKTuq{w8hwJ%p2Q`g}|aodtee-?btHoa^nhN%Tl8Swpd_`1-4jVOPh~wvD_95 zY_W)zWszY?Etb@3Nv)RDYDuk@)M`ns7SU=Et;^IM{-*fUi2CMEwOUrIW#ug^Z&`WE z%D1}0yhY|MGH;Q2i_E)Kd5g$fM4LslE%jk-megiRZ6T@6lG^f-Z-H$V*j9)n%WbpV zHp^|Z+=As6EU;jK1xqTlsUz{H&P`8Au?34PSXRNZ3YJx{tai(4x2$%{YPYO*H>cgl zw)@z2AKUI@+kNaZj$Lg_067-qSde2ujup}s2qHj4fQSGQ(cyzZ4i-{W1UnJwFv-)A zFuUj&eIk<@KxzOH4I&yuG-%N>s}79ILR;t{5!WN@g0~2n6}&~rtl%v|_SC_a$mnBb zPo(%**;C1F2T2D>2O}Ms@W=#_2_jQSQy?p%BOu>jRo|4V)`C<8TBMo@Z;@&yyhW;+ z@D{0NmfTFx$Eum|J~o;uvnR#Jw))ss%^Z>}Le^edDr*NVBH!X{89hmsBx{G)RD@Wd zj2_+s^UKwxsr6iI=R{o-%6XQoNl;oU>jy1TW)E+XvUhmPYHPQ6**i&YTwAD2W?y2j z5$IBdrC5p5uo96{kaCdDAYDPagH%l15X~BnlEFdTo`Ty`h%Rc*kl|((a25QiYob%p zIn5zTxM7+@rKOrfpe1V#!CR!}5WHn+4#9{1+n2cu?aSPS_GRuu`!aW-eVMz^zRX={ zUq%l zf>?&^4;~qvR*x|Gl~b)-nqMR%2Sg5t9FV4fhyf7;h8Wt=>G=rK2GGZ9KElgeLo7n- zfwu_h1>Pd0kWMQEw1`&CDI%66g}_^c6asG%QV6_7NE4mb1lU4fSf>Oam%^MEBBdaT zSfmsLZ;{duyhY|^a6~LQFI5w91)80mo}Hj2X?DU}l4d8oTsXu6JxPgJZd=IJTqVf@ zHAy>{YHbqwz(Nalt{8R>aV?s2U5u5wBrmAGX>vK0668e*+5t~HKpG%KD1-(GlL$2s zqa7ga=<@i1v_t4p8b%_PtMP-kT#X;R1#0}@EkfSED-0T9Ng6+Ri_rMNTZG0BKANVz zMKWIaLj5_@k~OSd9#+sIHGc3GsquriNR1zSh}19=iQZ}$iCB`xugl{HT9U>OUSKqC=D152tTa>!+Pc_2f}KNcFuw9^`l-RY4E~A_k-qki&$L&Cx_Sy3a=o$q>O( zA;Y**Q414?|L+U z;9Y^{Pv25+(IIZQrV+daYX00+9*&SZNpD$4a~K 
zKDK4q^)AW5(>3Q=R*Td@!R2%bV##t${Z=z*$Suh2LjzV zc_4U;um*`(l2=+H7MKr#QY1+hXn#b+a;49HG-wLw%5L>2XhdLbdr3MLlPtYTsz%}OQ~Qf#(5KBQTry|&!`YP1ca zwm~EfqP9UK4Wc$dBn=`d5VfgFn|75Ym8|YS)mD)t8nw91q(&`nGqI?} zZ6+4Age?$_+H3~Zumea)^Mo64M}*V_Aq%88Ak~2MMHt!CMMR!U1Va;Hu@=E)h~P3r za2XJ~4ldV5gg=%8?MkMmoJ!LMJT^j0Y|&0#F-w#Vh`2@SIlNn>e#5&d>NPx_21B4+ z|A0qH7!gZSw@I=j*+0De3&a9F4TxB-rUAU=Y8t?YT)CM6=7bN-w=e@^uTSq-8rD10 z^=Oh%P;Lg|>NGpxU4d6?Mcm_j=rA8T)RZC1m20jHc&>o%hMW$(#mniyTdr1Xcpt0P z8s5ihn>E0i?S}YVh!~OvJ@P_wAczGKE3Eze-+L}C3r?msGN}!uHjvstY6Gcl(Blar z|2E495ey<2L;{Ee5D6d>hFlV)q>%PNI6#<0P!ll@2RR(%a4-(10q`_n$PEzsIJJX_ z<+kKw^T5tAN7kr&+$x;m^ELp07w`A!A-m+wNEJv0l9mLWE z`q);D8WD?-D&Q?bs(`l$sRCZ(RP|N`v?OT+UULm%fl>v$C50+vcBEL6JZ%*-o8$=+ zp0Ds2t)E^$W;TAT@xjHg7liW z3$lM?SQ#FBB5sz(9^TE;*u%S7-v1Mc*2x4$F@dS%JjPaOm#EapbUi*hAkEduGT>!u z5LYJyfp>K>5cn7Z*$EL>r`_u)*1~aP45|@<@CXz}1_Fs75JVt|KoChFl7ud*88sH` z7Q_NIeDD_7(x$UCFJ0?(*;G!EJZ)egu0Z1k?+WbQiMSqb28p;j&9*Un9MBa=0(Q zDniLaD2Ox=X%iL$a=Z{>A~Zt;OBBHpA;=a+37zB-qw&H>qXHsYgCNucQV$sQpsNW_ zFp$9jU8e>E-jbws_z=-ja2+iLx1y!sR#q zfJEG_7PNslk1w_-*vT+ASSI86hU?@bJ(-bb?*ISm?3Rdy;Hmuk-!I#iI*b z_@_gwH@(ClS`&m!OfO-`oarSHnKQkd^~{;KocGL`7H)vZalK~(Ew?29k9VK*yCmrp z{;K)~y^X~(u~TPc;$H|`2qqiFPKj;Lgq_LiTs{dWlDB3)`tEZQe&(ZYcTq*v= ztS4DdvN)d6fF~86RCrQRT+g2HRAqmrwH_6vvM7~BsVqumQ7Vg4S(L7#bQMk4h>Vov zRY@vJ1uK%Kk~Ed1sU%JKT9Iy*q!8Z@Tc)N&>Pu2zl6qfyed+b3(3e7A3VkW`rOE#4 zf;*L0|52kaUA}acrK>DmW$7wQS6RBs(p8qOvUyc5%d2uZpsKQ*D@$Qn3M*1rk-~}; zR-~{Zg%v5RNMS{?70Fg4J1p5@$qq|)ShB;C9hU5HS$*O87uy*Y$F1~ZSh|L#YgoEQ zq-#XFMx<**x<;gHM7l=g$B1M{Dhm3DRE!K;hMEfNsR@)th4oYjY8mP+V6jy~^@a6R z4{Ajv#tJ#cs<56mL9NQSDtb^@PY*z=^E&-vFVk)&2%^|pR!j-~E3B7enm7e~o?{Wd zAW$Oz3UW;Ir=a;$(EI_lwz8^eRM0dkXc`qXjS8AZ1x=%ZrcpuDsGw;CDixYd1x=%Z zrcpuD2sa(k3r(AXrcI$(vdG6I*Kj z6f}Pdnm+|ioI*)))Kn>GsuVO;3YsbfO_hSCNs`G%E_46@_8NPP3q(Sy0d{C}l8YHsEO+@PuDa+khwFdIGK|;CceCC*XPl zt|#Dn0J>l0Aem&vW6MjA6 z*Asp{;nx#>y>S%TwG6V6op?<+9?XZMd7!|n*r<} zZsWjFVxD~}3eQF1xhOmrb&x4)1;*PA)P@Ky3c*DoxTqDls1>*<#1@6vqL$vG5L?vJ 
zTNG45<&F?r6k?0`j8CmaO;9A!gx^_FP4X+uLtq8h}MY*u2BO0Jq$-sU|>C6QDR!%*RGmC@mxStE~PlEJlomOtYb=*-+GMC~7v8G!06c z1|>~{lBPjP)1ahjP|`FgX&RI?4N95@rNa7r^47#vFTptYYChq+443qhmh_UQKuJ@e zq$yC+6ewv5lr#lOTEt750wqm>lBPgOQ=p_NP|_4AdEByA|8YU2B~6KvrbJ0oqNKS{ z(p)HM^)3ngC4s*r@RtPsl2-4MR_~Hl?~-tho0_RvxGo9TCE>axT$hCFl5kzp>W#af z*-|J?~>qM61+=-cS-Or3EsHZd39Bfx+O_1#3iA=B-ED#_mbdV65LCIdr5FF zX)P}a=Oy91q(!_WJmXGkDixkf0&+<}E(ypb0l6d~mjvXJfLs!gO9FC9KrRWvB`xA5 z9YRXNY)P0cY0)kX>t?pqA&N#w;Ke=M^h4k+3B0(ETO~v;Zsr!{wpxO;2s#Xa)_m9rJu*o=par{Kim ziN$>6A3>!wK2%;`J9SoEod)ou}f^ z>kaLxRz2*?^Qm@wYTDR|YC}!p-$?%_X~K9IH*EvQuuepAYBeX+c-nf`Lyy$WNyl{C ztqm649=a0tXDzO=I?&t;IT`N4VZ;C;t4xFMAi^7*X-q| z+Q2(KlSg{%u*|n6_yS_F{nkVaFGr@5!T(JP54-Zs^`38mwhp)EBK!-5vfGxA&}>FA z2|`0EA}yU;NGA+ra8M!7{uK)+4UZPpmbHK_x>{3TWUQ9&_46DzH(hG^uGy&NGgtPg zqn4j+PvtY86blV(J}EcwJT%(Uky?H(>|2*>d9-2o?_OAXVe!QNwHIBqcJA2PbDn(R z_>t^G$DcWNp4#JGUrO=I5a^Yus5bDhKcQ92$Jbf7vxk*!;icAL9*>UL=%WL8fhr=% z{?_sYw*frT*<{$%wbp8HAl##f1%RqBZczubX#miMk?wP7(T9mBLYbwg^PH-h3achL zu!us~NB6m!AXZhpq?%(?%TMF!S#t>MiZQ?o7mWPWOq;U-{UEeq%>q3ep&Ww%>&U1L z=Bi0!&PT&HHntiFvHAE_YC6@xi>IR=_Dy=KhLN!V1Fwu@YhmHUp`9ld?|$&E#l4HS zf8f~rUS2$M&I?QC&>XJHs}@2XCXGG+c+_yQWnwpZ%IN$nnVGDk3g4+d$!s zO05n3jh4ZhXcos-jkLOgE5^W@Of+48HIV~2!2LzbIWn8sDu||Jke|gu5iJivUb78t zu#RCZ6gJ*t!8-o8IAqEjX)pH9076fB@(FBj8U`GZz z!=9XW#KMj+S?8vkQHqGWX11a-2GJO~U69&Vu{Smbm(9I7H@<^c$`JA{beiOKJECFy zohuH7+_j_e=lLTVh#p4O!sEg`(cgG*;mPNox_xlZzHNJMo8K{izygT%#C1OcHb94Y zaoTh91mkJ#w7y3yPt6vYXiFhnR>L*HCAOZ&3*z(b$+~zKB7EjjevoHeG-(}Vf#Rs$N9vRA<}{b^=h0#%pBn@%x6DuZSadU zV(>9dfeEp0aU!9`bC&QUCIs!=C(lg4>Bffv6<{?qD8bxVWEZfAD!E2%I`rIug+q%A zcP%_Txc{b)J^A?Ir=GmmWXBX>NOJ2@QK!jG-)f_60bbar3!MC2z_91r?u<~m_Jk7$ znU28hst+5$O>!vEHVP(!NHDS`PO=(&@OAAP=qnF2I_txt(9H-&J*wl|if+5nYlYpe zQvq8%G1(Rn=I2~M=n-cvRJsAcSu7fgX^S@=(2fe^vs!cNMsjR!o*p~JQdgQ2^TUrsT;lYCk9y)O7u7{4S z9o)A2hQ;kK9+bzKrNk>@TAL%!r`_C(&MEj>%XQv9?7<^9ysL{F8|5ST%~*=*7=;2I 
za#1wj5E5hfHG~C_5QxeEdn=58V>)b4Pg?JxA3wS@sn(5XmH=8{Qgk4dWM+#}eh=JMn1e%DMz$1z|WV{-U~XYP7n31s%G`1x?<}10WeH>~EYWJO;p&7_VSP@uEeqjv6t!rt8mqU3Y zPaViuqBW|44U%1NaEVf?O|gMZLOI(T(aIK}MqXPZ5)NyF+l;l-Cz(c`$ZO!7U>3Ah zB1@lq~$?GI(lAB2G0ExpL{G1-iD9I!7UGe`I--TaJbt)d5-+R}C4{o22{qOio17WMx z6oVhZW4;?IX;=>7R~CDw(y{8e()u`lDv`|*Z!6i*R(^ao%HtPES-;te$7+3(vuqIJ z#a&;(T^|DnA9_Y^skT3Bt$Nl9zwSpBGgdekHe1T$m;YEii{F1Y&Zx=YM+#YKBKZA6 zW{k?;3t|I_-(ukfv8y8{U2#ldl}-J3R~`F){Fmo>p9*iIRC>df=O6y}_$I1m$qbXG zGEMQ-zB1Lma+WbZ60ue$QZ<=&TQcpo)ymkuA=elHtYZO0@glx##>RLz;3HJDN+ zu$oqsj<-}gJsB0_SNCLCRAiHAK>DNPt{&rrwA)sKxA|B)POn+suTF`;`wCu+`~T!Qhpn_!ATubPA5cdav7_$#>dg&p(XWO7a@X z>m+HCn@H{eiBFOXNJdE>dH&%K;2R@o`9O9rNTQVb86@egAhC_^|C!3%>yt=u5G48(|3-ea8 zd3Hk{gS3H7S`uh&Y{LEXHd*j+Ok*8~{txnj*hcM_(jGPBq zcp(;FX24aH;nx)o-G%cazV~vAot0PVJsL0y#!WCBC>gKLS^c^F31$iGq78n7 z))?#s#*A~gQ8UAcA#FtIT$J{@bQu*Bvr)s@4fXmbd-{dxu)eTZ@6~$`tQnZX)x|Y~ zYZe#2HR~hUjB8>VgcsY5;cDP&;+nu!#kB!fh^vMxz(v71l1W?}ag9&o{o71&wdRa( zQPU<|bzE(Vw{Ufs!BwUHxu0}22S;;oGzUj>a5QI*=HO_Kj^^NK4vyyFXbz6%;3z6^ zRdH>=72>Ml3UKvt!O3#r4Po_$luPK9IZ&R|Wj{aJ>P39M`+SKZWau zxZZr==f5ADxC#=!l4nWdl?S*dJa9gi?&<@vd)wpnet2D2@7KebM%dUgUq6Zc|0GTl zCkHMkGd&41#cLcSc7$vL*9|kO1WwXpdjGmQUET`Ktk}qF3?#k`<@hK{W8}$g z!j-rk(&Nnax?Rxv=FOn-e@1Kk{Ow8ng7?XR%=6Gj=5lh0_mho32kyHk@q6AU2R37U zMoW~Wzsiu`GOh0|!anO)huxyxi0S9Sd4K=ufN z-DyJG97ugLX)lBo3`7Uad0)Ohj-nQSEws5A6Rh1jdEQ{^RDUMiI&)or%dSuLdow6(hjXA!UR2kVCeSfv zsG>>vrW*9gsfJvi?fYsE%JzK~hir?#gn&{tItl-%q#DYVP6vGa?oesyGOTAyNmr?; zO+vp-cYEym>-&7sbn25KwmY10P&L=5Oq;gaQZ+LRw#j^`?%)sV;L$4P#jXK~e?iGT zU_XsP&v_(8k`jqeQYNX83=}M|;Zveai83Y1lqgf8Tsn)tWojr>Lzx=N)KI2|3N=)yp+XH6YN*g=g%TA? 
zR47rQM1>N=lo+POFeQd5F-(bJN(@tCm=eR37^cJsB}OPQLWvPdj8I~P5+mia_&Y)k zBh)Yg4S3pu1RC&k1qn3ZF$xlBC=erfE`kIScp3tP7%31V1!AN?jNs7+5=syw1!4rx zJCIO<7{S91B$OaV@T3C?C5RC`-#{|_lm6mq2hAV%qhLJ497k0yW+BX}@@jCxJOT+c-~j{@ zNZ{!M5=h|L0}@Dh#0Z`>03k;3paBUbh!H$xKtc&(#3M%VOaTcch!H$cKtc&(#PiNF z-|#R2Yp8*k!5ei+sDT*4^8+N*K#btw0TN0OBY1Lvgc8IE9vdK`1TlhV21qDDjNpL* z5=sywcv=927{RjwB$OaV@L&K5CH%9@H#{Z48fqYB@Q45jH4r0sK7fQ8h!H#-Ktc&( z1djxO5F>aXfP@mn2%ZKYp#(94M*&DEL5$!z01`?NBX|gagc8IE?)WF6#PC_>8*cnZ z4Kahe{z<5T7{Sf{B-B8R;6{HE+9XDBdp`*zaC1KiByej#2_$f1KM5poTR#aUh!Nb( zPeKV|1b6O}P=Xl2ZTlpYAVzS{{!h*~+``WqY9MBC|2_#d5F@yGpM)BSks>jIJN8K^ zL5$%3d=g3!BSm5ax8ajef*8R)_au}cMsUMD2_=XT+-*-n31S4d+LKViKg)c>efO-P z24V&`-IGuQF@k&TL5LCDXiq|$#0c)MC!qu}f_v*pC_#+i26_@o5F@y4o`e#_2=19D zp#(948|FcX5!@(GLJ4A|IDD4*hP&ihLk+}Ck?RU>jRzq{aAQ0PH4r03Vx&ln6p0bs z{!T&(Vg&cTlRyG@zLP)#_q~%q0(ZTWKmzx?lTd;fDG?*Mxt)X(#0c(eCppV}!+q|o zp$1|GH@TBg12KX-+)1c`7{TrBB$OaVa4$OuF@ihUNhm>#;5K#=N)RKshn<8H#0YL+ zC!qu}g1gsAC_#+i)^!p}l+H5Wa34Erh#B0zPC^aD2<}}cp$1|EH?EV=CNYBB(@7{n zjNsmM5=sywxG|lC62u7ZN(UiEa8Ei3C5RE+kWNAgVgz@ilTd;f!L8^&Ip1(+I%}wb zn89u7B-B8R;GT36Y9K~%13C#Mh!NaY9K~@V#Fs#d}72WMtow#Cq{f?#3x34V#Fs#d}72WMtow#Cq{f? 
z#3x34V#Fs#d}72WMtow#Cq{f?#IKxXzWKzAPt5qljL&t&Cq{f?#3x34V#Fs#d}72W zMtow#Cq{f?#3x34V#Fs#d}72WMtow#Cq{f?#3x3|#7LPKDH9`QVx&xrl!=itF;XT* z@HYCN#BZ6HDHAhgVy4V>rA&;JiIFlfQYJ>q#7LPKDH9`QVx&xrl!=itF;XT*%EU;S z7%3AYWn!dEjFgFyGBHvnM#{uUnHVV(BY1f|2_=Y;GBHvvon^k2iJ3AnQzmB0Tvy7( zNSPQZ6C-#7J_$7tBV}TwOpKI?kuot-CPvD{NSPQZ6C-6}q)d#IiIFlfQYJ>q#7LPK zDH9`QVg#>aC!qu}QYJ>qm9xyZGBHyoX3E4&nd?fK7%3AYWn!dEjFgFyGBHvnM#{uU znHVV(BV}TwOpKI?kuot-CPvD{NSPQZ6C-6}q)d#IiIFlfQYJ6xVD#S>I7^x5=6=I}9j8uq`3Ncb4Mk>Teg&3(2BNbw#LX1?1 z5&UQX2_=Y;3Ncb4Mk>Teg&3(2BNbw#LX1?1kqR+VDV}A%Rfw4iF;gLCDqL48#7Ko0 zsSqO-Vx&ThREUuZF;XE$@Y4k##7Ko0sSqO-Vx&ThREUuZF;XE$D#S>I7^x5=6=I}9 zj8uq`3Ncb4Mk>Teg&3)nhY$X%fB2%}J$M)7`t^<}IL32K(J>{*_>L(%rs9}k$Ba0p z>X@2i0>{)H6FR05F$L#O!TD2g{uG=)1?Nw}`BQNI6r4W==TE`;Q*izioIeHUPr><9 zaQ+k;|0-MuBmX?-pXdDZoPVD4&vX8H&Oguj=Q;m8=bz{N^PGR4^UrhsdCou2`RBQK z6`emt=TFi3Q*{0moj*nAPto~Pbp8~bKSk$H(fLz!{uG@*Mdwe^`BQwSc$J)gCFfts z`B!rOm7ISi=U>VBS91Q9oPQE%Fe&C^RMjURdN1QoIe%kPsRCDasE`CKNaUs#racl{#2Yl73WXI z`BQQJRGdE*=TGIG;x+928+QH;JO75Af5XndVdvkl^KaPsH|+cycK!`J|Aw7^!_L29 z=ijjNZ`j3a#Q8Jg{26inj5vQroIfMZpAqNJi1TN}`7`4D8FBuMIDbZ*KO@ec5$Dgy zJH@N&{Hr?us?NWv^RMdst2+Ox&cCYjuj>4(I{&K9zpC@E>inxZ|EkWvs*6|6`BQWL z)SN#x=TFV~Q*-{*oIf?^PtEyLbN?)<4cf9lSk zy7Q;*{HZ&C>dv3K^QZ3osXKq_?-Z}l`4>9>Lg!!T{0p6bq4O_v{)NuJ(D@fS|3c?q z===+vf1&d)bpC}dUJd6@!}-&2{xqCF4d+k8`O|RzG@L&T=TF1=({TPYoIefcPs91s zaQ-yjAzt{289P|u=VBZa`4_B@{0r7c{srqJ|AO_Af5H05zhHgjU$8#%FIXS>7p#x` z3)V;e1?waKg7uMq!TKm(fjfT%?)(+F^H<=`Ux7P+1@8P6xbs)w&R>B$e+BOR6}a

    zXEsu3f%cCaObbUoxcKi{tDdr zD{$wpz@5JWcm4|8`73beufUzZ0(brj-1#eT=dZw>zXEsu3f%cCaObbUoxcKi{tDdr zD{$wpz@5JWcm4|8`73y*^H<=`Ux7P+1@8P6xbs)w&R>B$e+BOR6}aB$e+BOR6}azXEsu3f%cCaObbUoxcKi{tDdrD{$wpz@5JW zcm4|8`73beufUzZ0(brj-1#eT=dZw>zXEsu3f%cCaObbUoxcKi{tDdrD{$wpz@5JW zcm4|8`73beufUzZ0(brj-s$`mxbs)w&R>B$e+BOR6}aB$e+BOR6}aB$e+BOR6}aB$e+BOR6}a$-?HvC~UH@d2#7O1AUhDS=MJ+ z-^zTdL0QPMkY(Y;Dl30zpiPN3Yul_%zkzJ}4P?`AD2vAFH_$l!1{$Z|sE`bkjBIg^ z^943r`+XJ`){9EVMEIeB!FjNQ^H4W94|Rj{P&YUab%XOzH#iS6tdU?KEm9)p_9W3ZQb41RSLWKc+gLKGB6LD3kLy||Noyb~=3??i*a zJK4uOQ9XF4YT3s-sY@LIr7m>}7RNRe+)d+d0tqM@fua>Cnt}EJt4s$P zG96TAI;hKZP?hOGL#6`_nGQqKL3c8p5sdX3G)}KUl6ZGN)y*L5E zak&nPMxbozCN%B{q49O@+tVc0yKM3~Hu)Tz ze2z^%hbB|PHz-O$r!e$`uh7X?P%!ukoqUB(zCtHofu_u(sLecz+RURxTLxzoXD%o# zMC;6>Xq|a<4r7(hp?Nxo=II=irE{oG9YR^^5Xw@AP?kERGKi!O!9YjrCMfDb(Et=J zvd=>*`#egs&!aH=yvo?Y=h?yM(ZPe8wNN?mQ~YE##&Tm?lNP{@L!4Jho*Y8e7$t(GBB)@m7=l^>w2)xrXb+AN`$ zJ%Q3Jp_e7}vV>li(905ft&6jt(;yk+MK>-E^zBB*f!`m<%{ECwlJ>wrzsDwgx`h9J zjnyO}NgJf~(f1T;xHrm@f+e0MMN3MS_?DC{saP^>$%rLYOKO$`meef?Eoqn|r>TmE z&Hk|2A2$2LW`Ef151ajAvp;P1ht2-5*&jCh!)AZj><^p$VY5Hnki8MJH)8fi%-)FE z8!>w$W^csojhMX=vo~V)M$F!b*&8u?BW7>J?2Vkgzg4qeHTzYwUp4zxvtKp)RkL3; z`&F}FHTzYwUp4zxvtKp)RkL5U{?^Q1&Ft08Ud`;)%wEmx)y!Va?A6R(&Ft08Ud`;) z%wEmx)y!V)?EMYQeqiQqVD<4B)F#Cb^w{G_8 zX0LAc>SnKQ_UdM@ZuaVCuWt70X0LAc>SnKQ_UdM@ZuaVD?{8@KL$e>6{m|@(WPWEvRqvd2D_c2;d_VG(2mXm$_ z(un0`AHOuBT*2%W%wECl70h12>=n#j!R!^xUcu}Y%wECl70h12>=n#j!R!^z-e1q` zduHD=`<~hN%)V##J+tqbeb4NBX5TaWp4s=zzGwD5v+r4di)OEA_KIe&X!eR`uW0s) zX0K@Wie|59_KIe&X!eR`uW0s)X0Lem{+7&s$?TWRe#z{Y%znx2m&|_2?3c`b$?TWR ze#z{Y%znx2m&|_2`s?D=NTH+#O>^Ua=b_WZN= zw`}&yX1{Fq%Vxi9_RD6!Z1&4$zijr)X1{Fq%Vxi9_RD6!Z1&66U)z6i=c&+LF?$uW zS224PvsW>D6|+|{dlj=+F?$uWS224PvsW>Dm9zKP_Fvq`D*Lwo;$BwEnSI-TaYw5a zntj`UaUZJ{ntj`UJ==dh+kZXVe?8lOanGv+Z2!f5ua+}=w*TVpS1UAow*TTDSSvJp zw*Pvz|9ZCndba<1w*Pu(-+w*Ze?8lOJ==dh+kZXVe?8lOJ==dh+kZXVe?8lOJ==dh z+kZXVe?8lOJ==dh+kZXVe?8lOJ==dh+kZXVe?8lOJ==dh+kZXVe?8lOJ==dh+kZXV 
ze?8lOJ==f1v+uv2?Z2Mwzn<;Cp6$P$?Z2Mwzn<;Cp6$P$?Z2Mwzn<;Cp6$P$?Z2Mw zzn<;Cp6$P$?Z2Mwzn<;Cp6$P$?Z2Mwzn<;Cp6$P$?Z2Mwzn<;Cp6$P$?Z2Mwzn<;C z-oM^|2L=QX+kZXVe?8lOJ==dh+kZXVe?8lOJ==dh+kZXVe?8lOJ==dh+kZXVe?8lO zJ==dh+kZXVe?8lOJ==dh+kZXVe?8lOJ==dh+kZXVe?8lOy+#c;OXDBjcZH|u@HdA3 z{(TL2e>Hg<@uV zkWOAw2jw*Z;p8P7pc5)4QP9DKrVv3Pnom(0FETe#6htYLS^SUQ%vu}BcG-;ys~oeWo1gf^b$&6dUw?g2@AvxkddPNY zw11$=(6phY4X%xR8?kThA3%#HuGo#>`^+Pgyd;FF!6V3mW$ifFIWUk3aczPgyq}a6 z$PFF==|duU37B>poD3l|3uCjeHVgULu-Wfl{*^7G0|VQ)@4Y>jIk+@1z-lN>rMg(O|EbgLz?t8a;l zEsYIWns{2piKk=#kY2Mj;(K^%Fl_apCGm9pVkKcdcDb0QW1>dL?%08!hQbF>rrQQ? z500Z@Y#?^KlEE-yV*lBS$0G*NQnTxoB+gZmdR$5TSteJTzt4+n#k%~S0&uar_H|)pb>cK=F-|J0=L9gA}f5S`rp9t`#>2`Y}&xdhu-~1@L zaK^vn_f!9QAX8@&@NKHCKAy15TS07X7Y~!ND!zu?-0b|$h2slLM-MJ6-nKvU(7ySj z%U54`eqt|3z1j_%RWPwxkilH?(@IirAjuS!%M_KHsKUu^uZfE8TxSZ8nNE=B8$Bl` zn^A6Vx;@cp;%&~TS>^L4k9XQsO};uEcH4O=4>v_j4_%idc)v=o3H5ZUI^FK}I_*ZA z`Z5Qlr#sgQXQo&Sf7Gfy*&J)P^6?(ZQl66OZgaBTLLP5xn2OjLbYKoeO}z0rYJ80v zCr>EpRyR4j=u|Tpmjb*(V5ZaV=HW^`#H#`p7Iz+4I5Kkk?B1>`1tL)k;TVvfj4-OEB(c*y`!i(-Fdun zid{<&M+~4+9UG$+!D6ffClozpHSFqNgctFHIVI9AY)`cN=!FKr3fpItSkTTaas9W* zB~PS(CHp9RN?j8%b3`JC0qA9HGuH#Z^{_hyNT~p`q|?zrMar!}QABJT=mf+J#?*tr zqbSebP+ig7Hvk#$hQajXWxX~Ns+d-oTgXFha!zrghWDCZKa;xWr6XGxx9?n7T7L0` zmtKD1#TTx~>|J_cVd3zR0kp&O>9niqj+b{@geKGZI$0vnA)c!!bNuPvM2&u;;O%ZZxwn@_vKXIe z0=j0WOUBxru{N@pV^N3x_H<~{Ew-U0rRUFW3ay3 zs+qiT3U5tTOIm0iUpl`0Fn)I8*@thxXzcd2H!dE3@z~F0k7ZxHceTxFb+RNeRK|a2 zy3@w{UGjJDGynFY!QVKuZZ!v2&X7Y65W%dMmh@00k{-(TBK9=XRSAAzVWNqp zI|KxQz2Aml=K3>psh2g}t7=m(XTFI>sh6`1fBbi_sm-nWEtlU`lI=!bWPT54K?ihZ zh(UvW*s69m;axd{Z^mz#5rDYIL{N9zjYd0uOSOYFkASL&b6k4x3PbG31YP`=#3!b? 
zSdfW!hxje6>S|O`T<2)TZ%N&VF~>}$8Y@YBVj_Gvch2+sfAy)O-<;3VNmvoI6Dw2<7 zb}{+gC{b(V9^?#7{GnB?&3@Rie-Un4E&lEHM9(#P>`aNrGOH4gWll>xmN`QLXK|ba z5noIZb|K1Ry|R4jr!o(6(_%-&hhun6i85MkQa{Dj5NiX95;JG=>95yul$>h!4<7sF zhcEKC&(H6C_Tj7!!D^fSdiFm=Oy1k>NXes;^2kL}qy|Ipkg_U?UVaq;6gj(;ur$c5+MKyoulysRV< zXC`(d>8;71`uv6GZxv1cRN})h8}Ei=tu_F)DZt*s?Nr&IU9VyV{h7=q1xq|jiWi>W z;KtiSZN%h1jdkJDmvG|1D`oLo&I`}4b*e#l;rag$%Ku&Lk5hk%ZU>ge|A(cSJ1uQA zXTpn(&3@gb*!}s*WbiAHzw?X?KCHYCV9950Onxc*+7~{4_|d^V_*>|6i(lCCar`2X ztI8l2c=tP+HXN0J^59rM_&@Q!*T%)@Tg0(@|33MHWF8U3J9#?ONE?&Y9>Tbzk4$_= zf=w34rx(blUL>D-k$h|^=0#FuO)$W3x?mN-kr}ZF@cS#q(RYlz+(>_qeENHIX{tZj z!<>Z!gO`%VGNMbOe?gl1N;1FqE3@;54<0;xbnnrReQa-I3p6zo{}*f1Gs({}{eEm? z{MMKRHszBTjXx#%j`7#T-?3uIyGPoh{lGs{T6kF@c z;%iphfn{m@duEU+^M9GD#@}b65vx|4=^|b{RF}inwAG)SR5ts6nhvIF{UL?uI)#XI zDS6$Ny?^u~e&X-NEn9Bbvwh!Vb0;1}`ClYai=crjNJteVq%KHcFG&A)knVIR493So z`~bvQUM0y6&~ZVsgW&}Q?$f`ZA|`|a5)@ug?T_J&sy)3zDIfnjO41KdF#P}p(+{vs z`hk+D4@YjTh1Drc2;b7=t@v3JW|bsw?c&9@%vzefb$YymkJ~J9+y(I=DtiG0@5Jl( zFlR5o`3^Ud^_pF-eaKH?E5s(YxUd_qFx{=jZn>E-u_~_Y1SzZo2S^?OX2Na*Gws0#B zP}FVd*PJ$QZgQ{y>H-$g!9siul32!aNp@*4$+7t923AFUaaL_kb`i1a6lzHMp%$Ul zR}uJ_hWAimGS%=!K@-hVzs6~r`ZZ1*yvP_I-(8w#QR>%K{p;i#{Vskn@b7Wdx$&l4 z?o#}JX=U%&f#;rj@x`y7*ca8ap-kilALnXgeDeP$Z#K1rrGAYCX3z?LodwCCP901C zV*LNh{>RgN=8e-lbl!siCx1He$ETYaZ_SK{xigzFi}*MxW-;UeJ|b@dj{nZ~g3RC9 zZg5QvD^0V*7~$)gYOjwV1(Qt$s~Z5g%^)h8s*1xYE$HJ_jcnh+A|Z)YHs6>GH?eY6 zcD2L_;w`=1K7Q3hD)1o=(_*^c3NTwSttZU8!~~&}%H2M`KbOm8&iK}DfQ@eE8kU%V zb$or*!=9N3L=qF&C{&KE!qO#7n$Hs}i3zNV;E5jae{n{+cq@+ zc>S+q<$@)iB}Ge0miU&GEvZ;CY{`fvh4oPy=_y2t3X!Hlq^c05Kp<5LWD4Qn6&%l0qR$BU$vw zG&;>ubM(NW-@B$McTb`R36EBQTyV0D=$WNicz~_)UFumFGlT4QTvkXQ)I6c z*(pVGrAV$6$@!5SA}0TQZ|`H9&P3lpR37Q`OBa0jLLAS!c-c6(!QUj=_2CQgMJ$fS zybI9&nTq)1{{ydO26>I-b&@p6O(b`K;I+))0+LaZM?eynfFv&?sWAIJk~f$gCwVus zPm%nPG>bgG)R4 zt$7DuJA}QtrJYr7UA?89si)DdQ^h-ed1g)j_0rCI*uW}*Ny?vF$j3j84;5Gn_=-i; zp}2#OJ3+X#bFSU*jW_y}Yz7lQzqAuS`NOYItZ65#2IEURlb^-)YWkm+cFtDum3eDy 
zY3JaRFkhd6O=#)hOK_YJ$l${h{)WLqE7{P7pLCjEkJCE#S_A~U{NCe>2cCUy-?4We zU)Z^LVE^5RZohEx8hF>oDFnXYV>EUe_9VckDeK76&MRP{J&7h%HC+wxb8v8GG8|Ju zfEV1V$4fg=fj4_kFYUy;HOKK437hd7z1CO{9h$}u@GyYrdDL}!UbmRw*OCyiDSU4} ziICg_6>Ww!_Cvg47{Nv(d@X>__HYtGoq>;Ih?9QUg0++QRtCuAxZr~@+QCy;>sO%- zhnKN&RN%w5+QZH)|F}T!5F~{TZsBZ$4>61(#pP+=k~+R_Xf?azra$(j#P?ynU+vVx zx(#9Ge>GbVUsT!g$!8ay`~1Phg=4?+H0J?yR&g++9h|oMIOj(rj9>Z;@nw$FSa0`h z@Nc%-0^Dr12*JTmV?3%Id>t!IaE8B205><`ID&3)P`V+|We7lYrl;{HWrf%>hb+|2j9?}27G40mv^Qjc|&52&`Q`g8rT={0fq3Gtk!@b zS;wDQ5GJsz?O_vspU?iFjQ zxN5ipTy* zu_&9u)xy=rHH~W{t`4p)t{$#Ft{GgjxHjRM!v(wO??{bP%2e$JwQy>AtdV%jCE`31Ea~&WZlNKo=nzJHil~ht_fT%Ty0#_xHjVI z;_Bh*(G~V=*l|uWF0!P4*gh%ZmdHu)}a&YIIipj2%^ZYfFO$NIIinBuIvT~ zqPULZ${v9rit9M8>o~6L9SEX`{yCpq%0fhS9mkp720>KWaS%k6T?au_*?ACp#BpW+ zLFf_3bsfi*aR8x599M<|gdTBR837P_#Bp86ab+Yx=mE!-!2qEL99P7n-$yVpa&?Uz z@x=gv&?AmBW3-GAEh9q9h|MxWvy8|rBQVQ|#xjDjj94rq6w8RjG6J!Tab8BZm(k&6 z^miHET}E%0(b;A6bs1eCkV}z}HEfes>0th`K;5qyt^oW2b0zl{y0Z$Ns&?5q#H~^tX1U#VtLXQY|q5%YeCmulP z0Rc`(fY5{7X6L6%&;7);h!IGQBp|7igd`1;F_LkTCdmep36e>YDUudRn`D|~BT0v( zOVT6hlgyCJl58TGBiRgsn1Ij&!~}#MASNIU=ck?-#P`}Pq(_Jm2t7iKK439RX^n}M7 zQu@Lp4(TM^ecL~k(mfu0Na-LCKcsY#2Ov^9$wLq+-Q+=tl#cQ+L`qkAAOcnRcqk$- zUllv!pQMxL@M7DJRXo(BHwbIw{->?)yY>ylt(AFmahsb z4^S+TuZk;A7N5A5M+;K=%EJXIedY0jlyT(&gOtAVh(StUdB`B8uRLauGLAfGkkVHk zHAv|j4;!TPjmHgA#t{dOxA$j$UZwPvM-EWMm4^=U@>Ox=xq~J0RdMCfgC+7+apmcQ zCGu5qOx=8H6SBRlwvigeCG-apXyaC5$7_BBUB~3nUIl8g&aLa&>-nb>=@v zpjbe#=prZBT6~cc7%alb2@V!xq z28%FGR+?rOW8?&09`4xE;A?S4&b&4RTc}aE!S&T`t|n={FjVvSCw0QL4FDvCY#RdP zH1Bv6R7=e~8wS(~&o&UqSuYII9E%kodIh7f0yM7x%PR=>3UIuFFs}f^D+urk@VkQ0 zt^m6$2J=vIKu6$EVs5nDmH zR)EPB1Zo9QS^*kYfW;LcaRrfBK^Rtm!4)8I1-)MZ`c}~I6?A$9xLX10R&q1-`WBuF z^baQ{mccH1y@D>UpueNJ_=owiBmQCRO>5#OL6SdD?)xB#EhN9hvK5f@-!c1RW(P=q zf!V(SnZgFOrbV&|+uWMOg&@=9QV%hE@DE6-KYb2J{A(bodr96;a$Z zk$Mv!)#y=<9_8p!jvnRcQH~zv=uwUy<>*n49_8p!jvnFk0YZ;*^e9J-xJ>L1ES}~p?o)=D)NQCq9ry!$$b-kD zj3K!1gTP~uUt-w`2s}pi$IK3p`~tIo0|JjRGfChv$}R+f$D@oPvWOulKaj#>sQwxV 
zJO+6`$qANy>D*Do3}Xoo7&CD205LOvqv+x>eHOwa#&DD|L>3+~h9K~WF~mIM$4Pp` z7@`awF@~dz;izJWLVjGOM~oqA;1Od80*@HOQN?hSF&tGV7{gJ-5WXU2ql_8W6b`eZ zMjiu=zDCRWR;1U9S1E>Gkp$)7qa8+H!6e1_qXYm@JV6NoMU>?Lfno|u7@&m6L;^G@ zuAqd1B1;M|D7GvI92i|J8S+(_RDe-JRD#ks;Zi|m34IeZ8CP_Io#CZrPBQdWXjQ1e z6R(h@uR^TC4PKZ<1$|}2LFKD(tDvKV$W_>{u()5Lz+a)r!+pY6u?J7UD*9L=8Ga_f zjB?*sf?hF)ZmDiX34*S|A6wyoCBJ|U;Xai4ZEpRxMt`FOIHCxKf(iO31s5s-o|waM zs1W0!)XrV#C3EPe+T3f&`z<+PbzfqFaRKGPFtJ_``^;G{I5?_PFNk^7sTUj|)%}7M z%4?1Z3(aeeiUi~}#)X~b<^HIoTI+>IN{!VEjums(3yv0ZfR+;Vg7XA9%@m}ZD@X`V zBpfs(oH5K{*qC6}Akn;0YJJw+QQmqDXsg?rF)Fu?qg*}LyfJz89p&o0W{=9P`w7WybZXmlWUT{kDZAGzjsFR8p-P!NAVn?&y*g&^Am>_D_kY4cF&sOTf;2z8OZCi=Q) zTJ$E-J4m66Z359zQOd#{3L~L35{e_CJQ6BELJjJNa@2uBDnatdxhvEG@*2tOBx#bH zNbUfEX>d46Q3m7@>>%5ood3eMefyp`dU(&#Jwu;~T@b%)=ri9TNq$6hf;9G)=#`=u zyqfq3+K9$3IF_8?CwunU3`C(`p)R?xWB3+?njnU7|HHV{S~b#iCwTey(^ZCIziSs5r0dK z;%^;&crE_D_xR#N$JU;I4F5fMWbONoE=DeX^OW;8z3WsWW~HlTiRHuxP9^?SrF4Gz z?U~5+#PaU=8dn1rGVwo-G&%221#3du!rKb1U^ z{-QI3OybCD<}%0aK6QGS*#57h{^C!N==|d1(S3)Pj{KV=xZ^f=(fbw_j^r+FEbV!a zjdn}@s#V&0l>==xt1hCWbSJ(ewmG@M(s+^qVF|dXE}mq>tSs(#@n^8!&}UM2vWbOK zoUcf}fhwJn_}NPn_6$aN%jzNfpK+@+0<^ zDP_eqgSXj2iJ7@O{uLPh03@)kk;nSOB=%O6Ug`1-j26FL0vDITU4k51M%)FldlDX7 z#qNo%#m#urlVRQ_{uu>>KjT{Pxv>tOx;VXn#O|?qkA>RU?^ps1S|2W(EL=d8KbfDe|wn0%M@-)l;vY(o{A+{??wodv@qZ0*|fx8Du$B|j2r zLM%*m?8?)6$eORat^4M%7TWJb%Pm~C?RjNY307fBg$ArRwdA9x2;I#En9jpb1?Bv zC7$E}h>h<*x^(oJyN|8gzIXfHZ3hqD_SDhUx@d+$!W|vJ)-Yf4XVgSTlvC-4UX6BF zv6>8k6ILi%(h`F-e$J^xgB@CBS&LE)qqU6@QXkRmKyu2~ozrRjDl~g5nnjT18g!+b zK(C&%Xj=6JHd?1_!N$o83#_^h#&F3JxoV5t1w17f65Of;sra=HD_jwLZj306CCNUu zI39hAvw3#w_1m^#8P*B!*{Br+w3C{^W;?TGpJgj~HTdq>$)Uk_C)Sg^GBg-48!#In zoB1VVGrxo^$SXssuOplKIIipX|;})!LI?aL6XdVh^A`IhGTR zJ%Dz|h$fE>rE029)mY68wbodjswI!z_5|*&%Y;5o z*{;ED>{qRFA2bd+Q5RclsoKn*(P|b0h5E!bu^%R`Np2q6c3<-R%ObIM^Krt(X$1mD874Z!VFV*d;1V ztWzNpwOpsd)MgbTVIh*MhT;eAi#>K91}0*D6fx+4O73@6_s62D`>iT7#s-N$OCGis z;?#s(YH?`WW2r%!V*+zb;4Kmc__4&7RfmLiNLZKnvg+d3Mv`J!FXk%U%>8L@whDh=d5-tOX+z0&lTK=#}bW5y9!R<)e6hNFeOg1 
zLcA7rmw~Fqqfb6{EVuTeOLJ>;=RbF7etr?(uAbSN`q?Gtj3ayu{DJy_~gX3tt&9z-nmFM`*JDmhJzpAYqoG70_RolklB@hxCYF| z*1ATC--~?{pM#hrYf8{G^aw|a({PY zKMXe`Hp%^&|1y+au1(@D<~+jeSf14O$1-Z1{6!_P44NkQ&+-;<;tB_x?D8z+hEiXK zq12ZX-r^0r_ikDE%)#yb%O77ndT{c*htJ8V6TMF#kr}S!Z|7q%L+1=lJX&)*znG4epD| z#bz#Tw#M;_jG@e>yq_CPd?5BEC})B%HI(}-lI*P@UA&$KAOHX5nWOVN9@_EM`8~%b z*S=@|$a{}n^yF$SLK`2zn?Pn;E``7!h???z1nu)27KYQ?S9f$7)%Uh z{_Rk>2~Tlp;tnNm%N*?7WhvF?f1cEhba1;>d zBNcd5fSLI3>FhT+dVg;g^~CehTDv}%r%2}520TR312WIGEy2$GsdJM-0cFVmGaeM! z1}%yDqGi#_kUe)eGVuf_rMZ<+}57&8NgH5D84`bZYAC)q6n|qVcIQy)RQz0c1PYHp z;Snf20xhRGE;xrq_6LUI*RVBdQM4rLi%3D8$ z+eWXO<*gUj-;(L~FZj)Y>Soefe_{^FOjr+*WC|dI|Bd8tm~E2Fllw)G?V>6hLUy7$ z(MFQ~F-U3{B>U?i?H1JzUO@5>B!5M+4x~S!n*RkBU+a)^dnP;U@GH$t+C7Yt$gr$Kcla_QX*=$KGssuuhGV8Mb(Lx@Q?;b2qP}R`s3oB?p(V4-#B()ErYzZD$&4jkOU5jzGdXW(a=}u( z>!YQb^L9?Q`2cvkWffIE={sqBvxQSgFRbCgGFd%u(`*yh8)8h^?6u}=5 zEqTq7*DXm~a+4)@D2dAk{+wuqqn13vBym-1uByIV2G61|l<;;){2z|RS_9Q839cGR zKvE|eBbg&fTnaKtu0=9U^8Xk1E?{<5Rl4Y2YrmmDyEqULXxE`>5J^1yT{UR}+0Uw- zy{q1ViX$e0;3<6YNDSz~08)t`m_tKC<=RyN6!@g&s#l2L zhp1@qDy`j(*W+nhz5hSv*p;$dy!F){W6b9m^Rea{YtAv}Y)L>5s1m6rf%_!DYEo+n zMhQj;Dg-tBvTc9{T|)%!&j1EV74a+6=j#BBRILtXN5)6mSC36jip)M=1l|F%z}7rK z?5_sQVtEQETme8Eh(ADa7yw~_?ymvb;?XG&I>Sz))vhZV22UqZsdW+EsC9x@30@-z z5o{v33BawA$`h<2xX&3L9%v7H`)2p=-_PIN+>7>~wKfUFop;#>LRmzr1c$KWB zl3t}PR3`0JibWv}Nul~OUZqwOGSv@~RJ_k1jV4cdqnc&f8&$uIH!4}i6QFFC@pwN- zNgYR$Zk9-6vo!E*OGfcj5;1L_nYcGPV#$;NDf!2fB&~R{K5wWzEaJubz0grh8E>d< zJ<}kImIQ~Pnvn4{$adD!{fw+p;xR90xy9pN&K4$~@N%{=X@qv`rw`H{ZzK}r`(dO2IVc(Na)NEAFf8pWiejU@=V0Mre&IlW1qRC%_O|2^oe*p<25bYBq(hcG;mYf z1)gP1o%X7lW5%nhU!PZ%9Ao_{ud21vmjo&0MnzY(C~0`wEK$qOphf`qg4Rr*SIF*v zl9f68%_oE4Za(m|MeR$YsFseQ z;W{?YWWwuco=LA`^Nbm9=IeBi5E9PgPG{3MQG5h-wE|p>WkdI^;$5>IaohpXThmSCc ziY`6e^);UV1xXc0xb1auR3Q$@5?_oQgH}!yW6);W4?insD&|4cmOKqSEtN`mFldm} z47cU&Pco+D6i@Usg3SRP3NJc|jHbcnWXY_*4;(C+Ms6yWL0l_VEJYDCvWa=aIOtIl zV$_LPsy{(r+br=kX>DvIr9)VzGhTFyPeX$9<1ve@wHd^{x~+y-!mD?C zeFmiDnM!+@WRjQ|XNK%R8ppFaWJl5%hPB8)OH_0{OU3#aY%>*7`#~RuYCDL=(X`0G 
zwhLYi&R{owj87S2DffhRN&a}$D)sW*{E?G6_lkGn$@6m!3Iw&et)iWW3Q zQrjGF70{9w-UJmbY&=bG%P*1g2DG&36a!jXyetQ_wCH>T`r?S=eFa*orVkCNGROMR zfLcm;H?;pGJsdnDtP|*01@%Rbw4-4hy2_krnAVjc<;3fdK zN-9sVir~IaZ~4#f!9YLUw1macopc4yAoQm!b}qL#G=A&OvGJ>~9=UGp>g^k0aJ?!rY)BsDy9`3567-B(IV3f1I#m;|2DL z*9(N9yaY(}Hp$h>CHR+gK`{lqJew7fpbo!mWRHNaO&UyuM!{)b#Q=91UCUd2M3iWSVeH3vwWL#2hweGo_((S1tr2kzBj-1 z9_Lkp*9bxcn+R?KaI2*91gi+{1MpuA;GRiPq4OuCehLu4&OloEkBk8DZ<4`71ivQu z8o=@20XS*G!$PhV*dcUp5c*AlkBHBegvaM44dG!S*9z16IOCtRD%T5hGhadgFf;gW7UX8Sm!|#K@|1aFY!55dE8o2_$tCyX+Ham!fNaS8d zR_>w|VQYqx_zpKsZz+Zb&dMcOu4MA~hXW*bG>Mp?E|m~E718*|!5 zdA3oYZOn2TMcPK0wo#yMlxQ18+D4hSQJ`&s=X~pZA^eSn)~^SOL?Ft|Wa!>ZUZ9a3 zAQ-tfi?}zFi+eNKAP!g{6g=RAtO1d?dowdfe2J;WyqQG|t-m%jwjK`VVapqWL_y|qG zoH+{S%t=EaBgw)xhBL>8z>^KZ_=Mx{IyQm3j6K6$CJnYsYGxVxhr3KUE+~$(3MIcJO4K{J0Xyr^+0x(Kz2efJL`k& zEP?E75V8}3**)FOjs&w44`wF>vm+tdi4U`TdiUhI^6opK^XPt}Y39QGB`)(PQLna# zfND4ayIcSfSqXqc*E=}R6t=)a5{ffUIaN9F%Vrx#S2jRoF&#TJ3Ud#}y|OCl#j@rxj-e7d2fmX44g8HeE4h(-mVjT`^|U6=ODCDHhX` zE+WaVr1_LIpOWTN(tJvqPf7EsDF2G4t7y84rmJYWil(b*x=S_PrJC+iO?RoLyHwL% zs_8D(beCzm%QW3(n(i`9cbTTUOw(Pa=~ijFRhn*e{~*DO|r5K z=UQp3xHc9sE1j5?PRvRtW~CFe(urB=@sMpSVw=u37Tq?TZ7jNNI@?(B5E@I;+s2{W z^0SRYx8-LWhi=QyHcmWb8;97YvyDTyO=la2ZkrB`bD}Ejm?HdbW5q+Zv52kQwz258 za@)p=hiqdJ+w$7RqT8mkjYYRjXB#UXvW-P-)7i$N+onTfi9EJ(=%49uSlhk(j;O_V z|D*Ha(3=2}ALG%@g5ZE91cwCi;E*5$hXfKbB!~}(1R*#ANDxN=68@AD;U~I5mv@7H zr9|KkCCJ=;RhbI`=GKFwrG$XF5Hebd4@XNOI9if$Ia-n+b0Nsw27$Q^0&^kA+!E)6 z4{2@O)y2p|WV){_6A8*B9+XK4%0xnB5+BMW1Z9#S%0xnBA|WzK5M`1e%0xnBA~6|0 z1SoCtT@cbHBp`xcJlG~eKm;MAO~i+7A_Uun1gk{|X%qTLn@AAbgoK}MLPFYv#8mhY z6cOE6XcxHQi`m>Nko%dqz=Xi0z?8tWz>L5?f&BuFzucj$8Gw!%^+@bh}$gU zHjlW?ByMv_*jy6vEyZHHz2+`l_GSOx75ML5ya#go-gO*5Ck=3z#9D$5fO`X}Hwhl0 z^GdhR`MZ^}Pty6jCoHit@ZWp=Us~zhzGG!zr)WAD6k}OK;WRjvcS~>D*~$mhXf7_tO=~MACHZF;Px{tF}7a3FBjMl*c8|j z*cLb6%8auXzeg%tKf5J0QK)n$q{c7Il0(!19`GFYo*XFgoPeO)sjZf0lx+8H=4 zlSt)+NaxahmPqFFGaoJ##gQuZ=dG`a+XVflm{gyH(Ky+Z5SdIOCPZI65w|{RW8NUFh@KsEN3Z(N{<%Y`*DuNr?V@R`cyIS(fQk()wgdag_%5 
z)k@?{GHrC>RTbW&?D(Pzt*PUkFl6HgV6cK^_HCr`5h@n?r;X|TRT{)CGq zS?cOv5|+;OFA3|5<(9ZugcIrdGKIbqt~4%mJ>klv(|s$yEGIIzaO5sXT&hrSjcuR5 z){3Uk-E|gDJtnLQY~pevy4I8ln_R0Luyj6K-cs$^`t%bUCZC!A+WfjO{|9z&`9i*f z1AK>zC0sqzalR7z_AqVkyj~n$+-Nr2!#tSVa{{>k2G&_Ha#Sza8+LF$eZA;~u)N-^ zz`I$5*>E7Tw}EK5m>}?XAus%~z^33sBeiNCcQwGow9M`QL7hM;AT18 zR6vg`J|96gg@)0<3LZh+z=V4?bWbnCaEH=x(|OVT@xf8tO8^Vazzv`S%dzKU(JOir zXs8eL5X)cji2PLJh?h-vL+ESFtJgVZa{ytZfnWwe!LI`hz-W}r+;s;(zV(4EU%Yc* z=lp!>6GsQ{KC}vzU87+n0>gJq7EqvSu{9O_hbg4&310&g?o$+8-W%@eg642-6m;lx z02nY1XL-}w0N50~=#9>?=vNE10o=Y(%e8S#W-w)o6Xu#14n8PmUHDME+Rmi zmBc1i!(f2yIrk1_%WI34jQ$E~a|^dVyZx@UcTQ|A-nRG51L55Zcg{O&aYA^>L&t$E zL%3%Pw!_~q;nD~Q<^R5fov@|${0D*uhT2Ws*H#;9;pV?qE!)na3@}oK={j!V9Kv;A zFhj+3fl!#!#ibMo<>isXIo!O6TP@(sa*uN09E)BSREMDWI=FHZ z6^M+QFzH1XzPDapyB57H+wuY%qd%$FU}ukd3H>FyUD3H zUo!B-uG^RIXw+cykKCG=o@!y08xvKyB?4>cIFfw0iFzqBDZ&R~s9z8LANg#h==BEg z!~Klqfnoo5A`M(*k{!Y(30Q{W{;{HZ;BfDOLG%j>TgFW@wR{a5@r#~|$7s0Fsn&21 zc)eCfL*r2{z=EOJA$RoJMvFXP-;4XULWA%_?L-erKEngJNu`1kJ18)=%X@)WVH{Dw zYnxJGS8EGxJ%IppS3LXI=WF>2#)T@#$lV1RtYfjd0i&pa$i^kdSdIsflbRVdZw~_$JQhDNbe}ROC6O!LvKi* zD>qt$g>1)TaaN-NSJYNiF*}7lPnf8J)iSgTJQyW8Gv15bP{JAf)HRx2!b3|oU}E10 zk3{~0augdlx4p!sMK3@<%;vDdfGqr9z##_%Vpr{jf9@YGpmH(%HR`zel#Nqt$m5Ht zvZ12;|@ysog4ZXa=^fVDvk7>AGxoWr9>xh7s|Nt@MpmX(76#9_<3cxmTLxo8vd8_L!Sx$ zGTYylS^Hp#IC^Y&t^iQDFK+xMRL_<@T@@0mKS^MUI&o!#?xlm{b8 zvFZhn;~p;5pEt_Qw+)mP{PWU{8@TpTt|h>Qg6Ig5TKH!$DlZqk$lGx61Fo0E{Ro9R zJ6jg_6Tl?Cj;#mSRN+1teTW+)FjDdE04oNLZeW#r(T@V6bI1dRbj=YIss>AypTN613+PL7PaenCi ztwwtYm#L1FVdFo1eiQwy82Tm()8N}K3;XBdFwWxlq8lcHyPCL5Zh`JgKBK^uzehm zk1B2Aj+a)`3w;cieAWj$;a4!|Kz?=~KC@nQs?i?E!Nh*5Im)Ym8iB)Da7K3?s8!h_ z5*v1icxmvhjf44_+>7qdBRzzH)nUtn^myS`iyewqDi^B-ST7+IH&$GOut?*CtBy@1 zt!xersb#Nc3wYF-1xvv0$)_#wMjm$pffMeR!a&y>`gX2et`^8zZaHYQaW_S6IJ73R zcTp`s$%`0wTV7~Q@cn3K^uBx+J?s4#)$n4n5ckL8aVxg`{OAXfWh_PAnwDgMtcN7a@;~u<{c4uhS3-7Ly8>+=zy{XN_=si2?9Iiwb@FxlZD}; zj4uAs$a^DSjl6dX*K6SI)S6TrzGy1AFMQD?z?#YM8r*77ge-XYFw&xVH%vey;|R)8 
z6P?$a*nx`W<;DSXwg3MNygR-kp;S_FN^x3oCUIfny2PE0zsg^*>g{G=Ho7*~9^mXh-z6B`t6A&^0Y-@; zwZV%9FBp01=(dfY-Fx0$;}327;B@5fNR3;QIAjFBILOhDeJ1i!Xd1tdyT^#5KSJH% z86B?FvKS8VQp6EGxY@agn<+T3LnFwc4R8f2hfBCH^wyCN6jCat$539tRv9mPU|V7k z!5QZm%Fi3Qonv_JaK{K(OX0(>IL9z5^NB#58mNOE-<0dH7_R(P;@q8EAGms5u=UWEkqx7xj|Ofj!YVV{i2l-l zJb2NRd&*9K$6Yz)oNe)Di^c`rgzNmz2Zo;Fu47{O&vHb_#3_pAY$~_!B?;Z zRLE9?uW)p1Vm`5DYa$0#gWBF>BydYBsT%HhZd5}@gMTRT4)jjwJrkpNE^%qjvCtML z9iF!se$X9B2Y+s#>V}Qho5KLw@lcJ;96kb|8I2UnRe!J-JqrqR__WY_dSQa-1@9Tk z)4a%5DefURfP@nyy1L&OP;piEmY3iG}bBp)n zV3dWnjowzp9cUOG1km)GT>|P^=dN9M-<|(&8?L{;Fu$)*Xsv2iaOq6&0t{#sFE9^D z@D4)ppy1){&)r*m)$wtI?!}8)oa=dY3%==auPk1Nc!4yau6oh0;5uqFaIub;T;z@C zqCN83Y`BlZuE>i9xEW6M1Kd|bg}c{3K3Xhr6h{A)-t8X`9Q2Q4CgxSE*08_$$8iC< zC?o!Hs2V#$KE!(2KOSC=U%&W8dc)s#U-jP|{`ta3^G8N6UijPm!~4c2HuaoWzGc08 z5%f-c<@NsdyZ%zJ_w`)6nhU>lFo%u6p=Af5aiUDrD-PmxE4)L=5_&FPLkIPH$o=j? z6iM{&T8%fGp$p|Og4de`=;IylU@#vZ39S#0OhwLiF0aeBGVdTN9FHeDsQV1|wxP2Q z6!!#1_Jt#S#|M}7M*e9ZAbbG%Dd~}jg!kEWy)1AulIeY(R|FNHru@^sCWMN-5c(=Q zQL%*YOsZA*rflYeKMOw{?T|`cebr;TC(pZk>WteTD-=dI6bmDT&@F-QIyJ(`zxD?1 ziu|jKL%^aJLpg4&KtJ`O7h@)Uos$?Js?{NApZ<4*?&qyNjZN#`_|QYUCU)PQo4Vwp1HB8kt;2SA zYH#Igf?bN<^f`ai(u z1I{mYhCdp*-Z>%&xZ{G>lk7(m3ws{P*{y*tH*!%pApA24gJn+%Tu7h}=LSOX!7w??= z$aNDV56{luQX0E@blZ+=gD;$Q!x^W9ek-`g;^8*zuNzHz>2W*~8v1SMk{~BV@`I!D zS&f%5+6!N~-UQCamRC=_Y5P29UK>Nd|8+)wd z2co@){g?SOqC#KVCv;UeA~qg5ms#Jjjy&bphN>yP{Q5IJd!536&n z;*Q1c4eEYV@gwSgrN#b~#o-ke2Od`Uxo(Bo`|}pN|7vk~+N~U#pMP>;;nsVO-oNQR zx4bxi-IwmhDFuuy7dP8@(Pu5@?PJ3$EG>J1mBvI~QAs@9-3WMw7agz#s^T7Kw=%8| z&Up99Az>dKyttuz595_47!`&GienoGs^~)aWX_|YyKn@l)-lC`uiQ5$<%RD}g#Rgc zRlSTaunvqti=p@U4?yXGSQssv_~OJKrV+VV6EC7_5x4T;Zn$Q>!|T?(@W0|K1s_Nj zV!K)n*XoI$5PA>w2C(1kewzcg{~frWv&akn@YS_<+j#^KQE;Zp+mt+8!VE=fu9QtbHzKPJ4hjE*tXp;qe`VQ5x?-Z?wE(2W0c#2PO zeC6U+J52F4CIh#*S!x$kUBdqz>(Qb{1x*HeCf?PfycyC9z5jUFAG#76G2meWyioC# zh>HIh^_IYG;in@PC@;(cVwwuCZsKUi4HAdm*&7)Jh`w(kyc#&XI&zT(IGUr_gd?(T 
z&1B%a=RN+}@$*^(=Uy@O?wuE2a{I2oSx*%QzYib7U6Rrm~ocNhJnKcH#I(ianXN9;3CZPbM6x#dL`UBiM@ntix~fk|Iv?)K}`j3@Q-2Q zknw3(o5L@GA{fHgtQXiG ze&LS@|Ao%QP)4el)bd~W?ls~hTBkJd>;C`socON`<7=sbDI83)%MuKB*saEzK&4Z| zZbKYrf%jdj&DTM-jQ1(lWXc%JFo_6N&bfDW7T>HxwNAENKDu}L^76jDPvfe^E3QBC zpPt^m_maXF0#jK2m~yV~b;E&MfT-0rnXcAx!*Dw_|8XT;-vlF_a7Dfo`Ii%}=zDRt zhz-U3LQMiVwl<^x#W^M}%c3cFg|eD)9cMXT=E_C%P-MSG!sKD2wZeT_v%)}$Yg6So z4y?-j59e5MO}CX%_mK2hdpL`g1w1|ozrL3;VSM8KqVV7&qG2R4WMM#ofACX`+`@8% zd>8$epH}d*wIBq`!t4e^Uk+ZRgugZs{V({~UgDH8hknR;0$%FMyl<&4U$gh5TW>#k za{g-z`?f!Ir1H$%7`|RGmEjx~h7CHuqvUzzpSJ=!d}v zStFqYc(DnPzKt(OVNgO1;hIyV!&jE%iq}iA&g@$G(D&^boM9e34|t7bNc zVY*viIH$!o!Af(k#|MKhaQF|92cBMBDc)s@yQ*!X&@xP!$afnv47}<;j%+bkBa?ev z%MVOp=0&TL#2*i&QX*nx;gkM8x3s``W8r38R3H;np$`E#WHpYiV370%gG=4e9NGI$l&Td>g)bLR~z=b&;d<=7Sqa~F>;{NEZ?##6Wp*Qvge-6#2>0wT3#(5#~qrgUf#$!im zBX|`Sejwn!=(`UFuL6Mjgfjv7{fenVtnzq)m+*1px#y|p!RYycO$V_fe1#C0HR7U2 z@G2#mkxz%-8~JqjAEAh&_%drZ%-Pr~CM_^ojITppk&Mrwrgu>07lCDV59`pO>4U8r z)S^~klUy^+Wl_9N2cDjK$5e3qNal-kcW&}e)5-#0*n|AF+PONW1ULb}We>I@msV>- zO-xe_K=;9Nc(y#W8QTAU2(acbG!Ri1}}(Qh6z!K!8uVNOyyxgWdcfi zJ+ura9{5(o(l8q~vIn^~d6_f5ckwZjD;5GTIrrjC!v?^NJg2tND*GaPv_D~BgW?YF zLKf3-g*KKwaPe{x3l5yzphEiTo!4*qz`|-@^x>mZrw<-DbKd6*4^G(c?8sTD4X=PD zGFe_iLq{)Sx2cE|%V617aPQ90xq%LlfSy1H|L9TJ41l-A3iw7oANf?|^St4F1QowH zsT7#*c4@5HaeXo3XHBA)qvcr~e7s}23;h#9dg`FXNRg9a;ZvL0Hs(^B6?ABsOXJFp z#}zM3O^()qu}mGkMxH7pXSAoTZZ8}-35RceZPUAVtvPVg(awRh9=+|X8!pE>LkpS% zSAF2b3@l*i5`aLBRFs~Vx3gG_#Ns6!;d>jHJIJ1(RwRVxw;gi`-z${(qRf~QV~)`? z87q2b5-^eiz=8$LFR+DKPZ+O-{6NnF_H&e;@uG6B08iJ_NY{Ejj)WZrmB( zU_7vFU=i~KCG4@lQ}LpVCsY$>e~N$f`|yHVh`|9wxxs~~`??g?y=;kKgUlfKhxS`? 
zD!RAt$l1{Y7w$dVJoDh#=jNZ9ojyQasy?>7dRI&Fu2jhl!5PQZbEeP5{I?} z6e@ADJc~p(BU@v|fi^T7WE9#U^}d0%9@@Yb%T_Cx`d=g@KBi_cDTrqo-l*Ocn3{r) zl+801G`it3DGbb2+Y6MEq*+{Uj&&ohD_LH2#mgWA?hwjop2FwS8by z_}1HlM-ObeeYuCYHdr!2??n@%yHJ1}hVnICEWzTlfkbf}Xh6&|z={Y2>J`R8 z(t`#h9ClRjuGdB}7XboaXLPr)LfH`~$LAqD32lkpqyZWpSbiRiP=Y8E@e|e-3NJjY zo$w{u^{HY8f@-AHS1nsdbl&k!XZV#udw2i@CSi+g0UDun>BtYngJK~+abzk-R!}F6A{T{i&UvhTm+_iJNbsHPL&0~l0fO&j9}2!R4Tx{v zA-u6b@hyVtLpNeh>KehCCl=)3fur|O*&y%%$Epo~3;@8~;-AY%HfcNR-y`FwhG$K>7eKNbDY zl*(U?MlW+WzK)-BI>0*o+-oe{0~*o*Mx*{+ulsj7r~7xU^Y7Z@-vv?qyATF%m@Yfc z76>SKoke>rLNrStswG~>W&??C@j8q4ScK@7Ky*vIjvWa$cfsqdbB{&LUA(tV+Kfwo(dd_GL}k8|mqQ4cSddG(it z?k9}=mH3>4j>K7aK*R(@Owun4-A{;^+qs#1_imqW=T2VzHak0gV{eahmUG7QipLbs zDn>X;Wg}PuF)V>7mSCK_x^aYH93dD-2;s9+`2Y(aVBrJIJZvx#)7wCzJsP;@dF#DL zgZD7^Zunh7F@~kA2n=N`@ngxkF>Q8sY%G~96sEuZ;Elb3FZkd7e3uyO60==G;-Jsl zI3y$v35i2O;;hpsz!C#kA^=PH9t{VUa9{}smT+_xjUkl85lUhSH645{cx=b>0>=c- z3Y?zV@%+q=v6&sSGdrf;U+s7vzcKt~@oTq}N%kOIqq=nzczO>b5R4Jb5=`%b58QC= znEvPoc07;Y7=E+(;UiYy0w6zjEu!D%aR6Sn0E9Z7+q!l4?C#w-$%+nl`GeE1JJ}t; z&nq5NJgXQvFH#Fik;4hH8$PWWSu|$RtVPJBODtG|c`Q;IIE{{Kh3OJwU1GLNFeiif0ugw?%3}Ddu&8lyDx{zp%gtYPWI9!`9n&v*ys+n)@O( zwn)t`QsA+KURb2bXQ@MDP0J%nFh(#-Fgk`wUkE>6Ym{O?}L#i8hJ|B4V_oL9AZrnK* zcr);da}ga+J+=GREgL`g#mbYTNBU>)8Qb$HmKSL?S%i58?ghJSsWZLAx!DPSPa+6` zcA9gb5%P6EUN`ddfc@`78gpcyQ({m|lwu zq@e1%KC!JTc7(z-TlfyFyK-if=Zbb`a~sqFQf|~x#fUM84`*iP9D6%9 zGM*yS$IwZuUik808aKjVermtK!Va+5o-#+1fn{EVil6C~f6EyL`Bn1|k zAm{bHmKjF&K7g)kA+b59B0jL&Y^f?5+OE>^bMFvP*> z)x6aUmeH_1Z)2vi0P`Cdb>fReEPq#{MOgR}znAfO4uyP}OMXzK2Whd-$rM*X#A);tV`sv4wg&=nCsI3!q%Cr>i0&2f%jv#4z@kOsuU+aVTr|2SHO^XK-V;)X+gR*=bRnAkMAqQQ>LWeLnu!3&uOZfRy4 zWmwIk(AgG@83wS-Gf2Zl?z^b9+{xev1A8WosBlwh(1(@r_?zf|quBs^GIgLxI7{cCAE zi>JSaUXPw5`!Zl+fyG)Z_@jl9a4jeLfiWJ!l&^-#E^cV56yo`p#^H1Tj1_WLpiMn4T7;boFjE+=z{rVKS2da!U?)!z zJ4hOse1J70XDLu1n9;+~7b7sM7@eXRFo7Qj&IzAJ?EKDRJT3*gr^2U2e-ir3=-0y6 z%QrZNj+zi7thVxS_#bA#c)vti10(%#!&d7&|l7P zdwg{4!7t2R-?)DMsRJ8EKQlrDGd!%YZ5$|Kp9#BT@b@yzqX3YfGp(UPR4ixehtb-w 
zdd`-Ln9nX@ps)?f2{lY+;*Ep7GT3WmS8FjNim?$-fb_Nb3Px}lA_DUTJO!}cL|^0_ zIVPELp+>b8Jw1F(2vi(Q&++1$$0`nAUEKOn@(29~WR;-fac7f12*W;n>2t-$e*g`F z$^Gu}`$G`AgxSW<2o346r32;yXhcknR7*Hlvs_%ui&cuP2R3w$lyAH4=tssE_OBh8 z-#+C-FDnlR4*AEq`2ixp#7daUxQD4ybZGRdX5gFwY!rZX9e1|qUZ&M$^hyj)?X}pg z|Azvs;x=-{V-1(jFA8JvhkT7tQ#EcP!Pl5bMOBlrD4(JOB12>6J zj4*r(86uJ(+f;}qF56fy2BozS6M3>kI40wA>?S|aWG6hj!k_R#8dkh`ynV6xmN;2= zOPp+kCC){DHZ3c$8w%-{<_ssBWJ&1KoEdpZ&TuZx8P26SvlL6Sewu$rk3QA@(2?-g zt-8CHk(M%~dnrfBWC=^Um-3{0DO0W7!_MWw_bomQ>guLdC!z-DAaihz0l_)C|LbBu z?~)Rx-fV>@g69efv7ZE4FDDG`BMcrF_y!^MmD(^VOwVE!so{QpaxvK?Q%y3_B-2a=kz}%)Vp0=uXu>)z zH0WY8e6F?T<1Q;2>xoS`-t*A7n%vw18+Lx0U>y-_Z_d5gX@Etw0w$_ySfXDo82J$u zBQVw;KmDM5E%uz=ytBOFimS$gdpg*X*@8tBUhZ6gi%ekjgaIes+64@YJhc#mF<(Qn zk&v%SjP0yealz7>6G*k!I)wR&M;U7|4z*qR%$NDFY`{c#P>3qnE_a7=J4+W!xB}tg_at{#o8*FyXb4Mu^Kb8m9 za){TC(NtED69aj92tY^X&^>#!)HF2yMurYK=?Qk{yyTC-CkZJib+)TKD7PQ@r_H?1X7 z>Q;=xYPb?8o%QGvEO_S8&5rZHquVz-Z#?R5SmJQji(_tSh;=yYF(#M7H^e%+AxQtn0o%JF&Mc%|bdk@ZgF$nPE@rXlkf3g{ z7UQw_bKQ92Sj@c}B)Ojm>SiG|gWTs9%YhnP;?R(84N6G22CajVBZ*|q+)&&l4y0#D zw}zHDx*THse0~ z!~^ceCmzre!*T9YJ3n#z;BAjm6R>|%duX+SFIa5FfbG=gz@<|(=-;k*T=5e#&a=-2 zuCWJn1+EF~@5N+7vpe+)cXXvYM*rwaXT5~WF19Hwh+}eL1ZFN>hH`Qn9PT6Ua(+ng zD#2?6A%aZ=Hvza+Qh9<^1oxddGtyFj{kGE*rT$nJZRQ^fjpH%)y<)2jkN#&nZ{>itfkCe)jQh8D;PfF!UsXQr_C#CYF zRGyT|lTvw7Do;w~NvS+3l_#b0q_jTL%0I3A)5<@s{L{)mt^Cuoa@?=z= zjMhh=^6yjreagR2`S&URKIPx1{QH!DpYrch{(Z{7Px<#L|32m4r~LcgqCWano_>|5 zU*+jndHPkJewC+R<>^;>`c1Pgdp0sytbh zC#&*gRi3QMlT~@LS|2&(pHu!h<)2gjIpv>I{yF8JQ~o*SpHu!h<)2gjIpv>I{yF8J zdyD$Wt2}v?C$IA4Ri3=clUI52Do!YCj3(CKs z{0qvzp!^HUzo7gJ%DMU|(h@)T8`qRLZLd5S7eQROLWeUy}cN%@zQe@Xe5lz&P2my~}=`InS`N%@zQ ze@Xe5lz&P2mz00$PuGW0{lmnpJSJx4F)=F-4qg;J!NjaQCT8U^F)NRWS$Ry%%41?y z9uu?jn3$Ev#H>6fX5}$4TOUUC52N~rQT@ZH{(*xQEq~QNjOrgo^$(-^hf)0lr(bG( z<*)jOQT@ZH{$bui|1he57}YK{h+52N~rQT@ZH{$W)AFsgqT)jy2tA4c^Lqxy$Y z{llpKVO0Mxs(%>OKaA=hM)eP)`iD{d!>ImYRR1ule;Cz2jOrgo^$(-^hf)2*sQzKz zLjN$Te;Cz2jOrgo^$(-^hf)2*sQzJ8|1he57}YK{h+52N~rQT@ZH{$W)AFsgqT 
z)jy2tA4c^Lqxy$Y{llpKVO0Mxs(%>OKaA=hM)eP)`iD{d!>ImY-a`K{s(%>OKaA=h zM)eP)`iD{d!>ImYRR1ule;Cz2jOrgo^$(-^hf)2*sQzJ8|1he57}YK{h+52N~r zQT@ZH{$W)AFsgqT)jy2tA4c^Lqxy$Y{llpKVctUjFsgsxfK~h4ss3S9|1he57}YK{h+52N~rQT@ZH{$W)AFsgqT)jy2tA4c^Lqxy$Y{llpKVO0Mxs(%>OKaA=hM)eP) z`iD{d!>ImYRR1ule;Cz2jOrieE%Xng`iD{d!>ImYRR1ule;Cz2jOrgo^$(-^hf)2* zsQzJ8|1he57}YK{h+52N~rQT@ZH{$W)AFsgqT)jy2tA4c^Lqxy$Y{llpKVO0Mx zs(%>OKaA=h<}LIOqxy$Y{llpKVO0Mxs(%>OKaA=hM)eP)`iD{d!>ImYRR1ule;Cz2 zjOrgo^$(-^hf)2*sQzJ8|1he57}YK{h+52N~rQT@ZH{$W)AFsgqT)jy2tALcFe z52N~rQT@ZH{(-}q?enAhhf)2*sQzJ8|1he57}YK{h+52N~rQT@ZH{$W)AFsgqT z)jy2tA4c^Lqxy$Y{llpKVO0Mxs(%>OKaA=hM)eP)`iD{d!>ImY{)7JElk#R&|H!KT zkyZU8tNKS)^^dITA6eBuvZ{Y%RsYDU{*hJvBdhvHR`rjp>K|FvKeDQSWL5vjs{WBx z{UfXTM^^QZtm+?G)jzVTe`Hnv$g2L4RsAEY^B-B&KeF*cPj?Ebi?P;)!@ppg!taW@p;jKTd$%3H$#6 zA-L9HfABkjVff%m1h`*v_G6-o;8lXx2tov#2yOy!tEBP-s|fBxs@L&bKpxLH{}q`! z{|~|WmVWj*YrC%ymsmRc{lCbzoc%oZUSLija)eJoU{PR6;DEqEfn|ZK1y%$?7^E5! zh*E&A39JjeTwp_BQ=sMB78;Tu{;0q;0y_fNqNdiP4!D^Y^+oc3yYE1e-seB{j|ww3 z0#W?S7>2XxSs2_qDUgN1>GT3wm>wSsgL@wZvM{`fl8}YL4Uhs^7~E+okcHtzm4qw| zZk!az!u0r97~D50kcGi@lLA?o9v=&X3oHe)FwXB#glqk$vhhhek7Ml`Tglmz&s_QbL=3+;ehK`N_@(emf!5if;!Da1=5UJCJ2h?hdV6yl{2FV(+96d~b_8k**)<`jM_5>H3kbAL;s$&LDq-cn0wd;u*vi=PC$Axhd6VHGlw{H zkQS9DWt+y?s5r@=fIQi*aW-t6EgNUk#@V*G&64oNbPqfeDxcn?25UkF(+9 zY>hFUk9a?GV2{9^#01QNodOpV6ObqSMVuWY&Ylrx*TB`t1Wdu+5oh;^ zvwy_dLE`Koadwe7`$&R)B*8wCU>`}ak0jVf66_-h_K^hpNFvt%ALUK3&m`Dq66`aH zICEehNwAM3*hdoVBMJ7A1p7!ro_l%l?UT`}ak0jVf z66_;ba3>%S_K^hpNP>MN!9J2m#k%r#Uvczg4(u}t_L&3+2(0WAFbDRL1p7#WeI&s? 
zl3*W6u#e#G0|KUCAAuRc?Ci_r;fRX@mbo-)X+neP`n;V7!uX>~;s%sjw;cd;`9L0c2Ic1|wg?H{{Ft zayehq*KGQHt9`3i`|`euujQ-z27Osywun21@n7^6Yd+9`ouaSi8_nV$1iWg#wr>>v zqeb6n+1F|Nifvz~>?`{$qk$%P4X^eM4}xRWR~_(GhkVtXuiErgTfS=DR~@u6Kmy7z zJOC+9h~T@{@%^tIj?WkLe6SaSQSTpB$*k{_jxU1WTZa2w-zyKg2>&zn`jG2OJ@Mz` ntU~xXhyHvV58?mgAO3utM-l#mpa1zduut->KF$AUW&ZyFkheXm literal 0 HcmV?d00001 diff --git a/go/mysql/icuregex/internal/icudata/uprops.icu b/go/mysql/icuregex/internal/icudata/uprops.icu new file mode 100644 index 0000000000000000000000000000000000000000..245db9a0584d2f5466573fb0a25b1d6cf24eca2c GIT binary patch literal 135656 zcmeF42e{Nk_wchN+1-?F*}nH8Dp*hy5mXcuQ53iqL_|eEL=jX(MO3h2L4hm2hz+dR zu>+!DL9q*01v`ShSL_YjHz%{_&ZKU`*Z2QD?(a$F%xNFn|?1c9GsjTVFeF@1fmAdEYYO-s8d!Zw3%^>d+y*!ie&0|jAWVql`D z+$Gu8e{Ari;2FWm!E1sy2k#2r7hDp2D%cWyIk+bHLGX*<_rYI-J~1LD#FW?|wiCA$ zyNKPz9mJkuAF;o9fH+hfDIP1HAWjs|7N?3Ai?hYc#B0PG#M{Mr;$m^B_^h~6TqV9M zeky(?{wV$;3Q|~#OBtz+)F`!=I!axo-ck=~Z|MMOdT^w4j5Jg_O*%uGES)G_CS4`n zC{35{l^&2DlkSjSlwOzKk(NncOFv3~NS{brC>?4HiJ^|6Z9_YS+J~A#`-Kh;?G`#J zG&Xc{=&;cFq3NN^LT85Z(lw!*L(_wIg%*Y$Cgqc%=R&W9-Uz)H`ZV+nsjm`$2?^nF zI36A;whiaPt4M#XY(})5!d=4MN!g1`eZu|22NC~~;_&cj#|UeNW5XvIerJRyho^@x z3tvNOH;3;E-xq$0NY95?gkKLYA=2vb+VK0~&%-|v`Ok1LqDE4YRbsoymXR)zhRE)b z>A}4s`$i6obdQXR97-%>$#h2K2Y1LTqNvGVEi+46XKs)dW? 
z*|HkBTE0oXL!KuuCi0{5Bk~IQB_gSjXN);WYUDlnGx?2DeJ!U4+Yj>ZL=#jLh5E8N zwx=so<$|&ynVV4~n=74^F3OIj$dO_{rI#{5p-VYf(ZmtTDsi+jK{=k#MVzEeQD!L9 zgR>0(>rCz4s@$b4QSMXn(kk&O1-9h#%B#v7Bz~3nHkqrDkCX@Guaxge)K|)HrdaH8 zK&9JSDV5eOU%sBGIx(*17 zH&m5pZdKmpmty^U%&WPF_thM^ewX6GQa>L3AL$P+-81S7>Pw|Qz41TRw->^ntfwBm zqpFdw)yd?1#LN%s@9L7Ess%OBr?dubOKnMT8?BqRi?*lMPTN;IkXR1Yj?>0zqqI}C zj?y{WRPBZEY0_-%V(ofjyG@&~JwVFGv}M|h+8f$?+DF<~nkH%_-;YLCbBq7h{#^tA z7uLWZS|sX=P7l^cB3d8$=)e3gtpU2{ZWtxs1=Y=>9Z3mvHOe=_(}PXXoua!Lwf&=q zl&t2xeRy;<(T^e1xai5zGo$B6r$;XW8JL-pS(*{DJ+qUt3$yQL z+vX0+jn6I2O_G<&g3?o&r1X@RlIeXiQ8rT7_}2JFs5hvnd0+KwAotSF(H__Q(O%JW zqK`*^je=ieY`@qV-$XJ|ZfTWh{XtUyF4j4|#@B;Pz=`pm^1X)qJ(2$9-!<@maSd!s zuH)5%5UPh06ZJ$3p%WAJefrfOy)ycG^d0gkOpl?TM1PKc9gW8Rib}CetPpDx!xmm3 zSBExXFfJVZGy5oJg)U~ z|FYQh*sR!`*xK-15;HHhIJVS?T_vuJt%^Mx`!x1-?A_SUvA>8m77xW&iOF~&zEQkG zymK7-1{rrG(nzsaTod~d`v60a{ifw$q90-CkB*OvPc)>_(X-Sd{pH6 z_-*lf;?Kkv#UG8gkoubV%kdB5-^af&yw}D3iB6K7C?wh@HcKR1w;Q=Kn%E)HGeNG3 zCi=H(p^^xR^eokUu;{L>3L-TZEdE$}8 z`-ztlt4Z{CiOX~t6zUEX)4J)^6UM!mhh1u3^7lN#yP znt4~F{*c_aRdcXDTpy_)qo1gsrk|@1)tmK8^sDqa`ZygdLx^RrK3KnlSnkyy(x1?m z>-X!A=uhj<>ksI!>iZe_%zZG|x`-d?UBoZ-@AbD@+pJ`{D)m{X`%}CxImV}ClZIJW zi3Jx6^Ql(i_n(bO_K{-eREMft*)i2G)ys$)oEl(?J~}lnb!zII)U?!0q90+hL0{F# zl_^-`*D2SfZcN>(++ox}a~t{P=HAppsYg@Kq*k=XnYYgCt=?w6bM2kfCnbOSThQ03 zpHtBKUt|iU)imu7#HP|PFB)^`Tif(z>8;Y+5)aJ06R|xapYHHEQ{5KWEe%_spD`z& z-=rK|lKCz3_sNrtYk(sh+dMiw&QV``Z`RlK*@^49Uv5h@X+5`d!FtZ;)^DmGM$Q|( zJ^l`BvuPQJku&0dk|W94Z#~Q~jtbSir_J8j`Yh9WK`nfHF#E6X8f&R?3GT^QU(8L< zF~EFX$bBS?d3CQ}%C)7trF*q&KWEui)vo)TZCm+>=D#`FTeEA{ zZ-4k~u)bh_$2O@%HPnJuvxP;mSf-;{Ssinr#!vPLhy%nDK5};+*!els-6wYoAH{Nl z#P_7Kcs^G(GF>`4$5tU8TXpLTANP02qS1%yGY5t86v(?7aPvgFQ^{h?)}WEktaGYK z!aob{8_-{n;T0^cYdEUm3;OOO$343J1;fJj52|&WgSVCHwj}O>kcCEKzbh@VJ@AL4 zFMU^m*4nebr;hE{T2{K8UVcSxUxS|oynNTc%_}1wUH-n(X&)@VrRlqYo|(~kt=o^q z!5JOAiiA8V|GxLXPI70B{A_MC&CT$0thy2tU=*wy=U1GZKOyW~m7e9o8gO3vEQt|F z&TQ-FEVfi_8Fb9x+^Un-?YtZNCOPJ>wwLw$!6WEII{R9XaZA-?t 
z7WAasSSoF6ss%Uu*5ftRtzBzdP&fCxk{aJPxAsdbw}Jw)#mQ7FmGqeUIQP}Li|)OF zTwsN*;I!|+*7fQW%^ddh@rtMIhe;1aA#7mnE7gJrNYbZVK}%@&BFFVsWLzPzBNEz=S_tS{&nePK&j zhtg?zVO`L6VH-QWz{9${=tok2c#x}{qZw+^&{GL&?ifzJHG+;lxc z4Sk)K7p|U|U*5B)*u_?DlRkb?^rr*Z}xu19FW&*|3*Jr zW`!oPS!@v%iO%7i3fiZ~Q@)RM?Sh6azuAtpa(1>h=$5AK!IIEAmUMYpf9X=uR+tv* z-pdOf?k|Xj9@x*&aXGNW<+GN~c3umuv;CW2UXbzKY+YVz<9m+l-fMa3{>{Im;T_I^ zSjfyYC}EGWM13_SWVe0}3t0A1(w74 zGYPD8IV)Ql@UP4h`s?HgI(rU7MS6lS=rJDQpoh3ls2=9!*EXy_^Jfyv59c8aJcov@ zLUB7srZG+9o3?}_Bu?<#7EDgtjb9?xM=!}(OfsWe z%6X3a4p`~65SGz7!aJXHj9E*s@bK%(yymE#t!wlI4STVE@)CPDz`K=im)6kk{QaJ_ zX+CpYkc&o zHLhEFZM$4nwiGC_JrpvbmX=UwI=dab%Gu2sz5Od`kI%>2=Cx*-`8fCA?Co5GNV@m% z`x=#%0Q`>H5>SFo!DjL-3PmKJy>g4%6l+N|rCKsg`Icf+o8}3SgZ0I^9r#4FLnG;& z-~Q!fh~W1JsOy$3?A8x=JsdmTt!Pmz@IhZz!cmmIn2wj|8lM(f9+72U53>1XwX!$)!^90j(2p4`8nMp zL9$N(aNgnlIgS0r?;A~0v(zFf<|eBc9pu=-Y3H)o_Zx?;u(>kfF zXKr5na@&2Nk4%d_HkZt9me9+rb*`Z=lupa2zypcM%YNjoUgO*YH!W^2x;^-Pmdg0P zqBZCv-QLdgP*r;teP5e1L4LRQJMKSpk21$WjrPuI{TimytehRs%gt-v-kRACHTq(E zHs5OHGPYF7H!!EIO4lFDg6~K6J~T&R&z#l^zdyjaCCf@>)Idh3c)lfwt#z3&4=x+s z6R<{?5SIf>?7LIHs1MERWXJZPC$?J284BNuNY1YrkcB^QqB1TSEz6yI zIolzYT3Ao1WX_K7bNs}@Ezg&@^-+h*WBzoZHR{WMe%hChYS>z+M{>*VZ~Y$Q?%`!& zeZr-#ro^=hFpMKYbw&_$&eyguj{s6h-kjEOs+dyV^`}*}8>lp6HQ?|9Vkfd-*tjz z(8)JB1bWaw?%hq1IhNL5X+o6i2i~2{dqE`DGeV2s{=Js<53bF*0c%ts-vsJ|W^tdp zzwhx6;!^3a-fo`mxxs?x7d|4fiS8vuviIQr!tR{)8!@n7fsL*!n6vjH+;a_i!Lzxm zvR%`5&0U&zP`WDJlpe~iOMa%OyF~FElN*JS2zvg_kG#wUKA_n^4Z)Z$iIW(V?^?`bz5%55Bm*+{*I#Ad>fT%jB$wcdoO=UF#Hq=W(@_=JQ_sOlKvY!@FmP zPe|%dKb4_|6Y4=jbu15@J(=zNa$^p9?gBOT!gI8yE+Dat;KTb-8NU^}l`JnvaJ|Xv zW&Uo-|CU4Z^Vjo0Q?BmTE7aTj0?Q0%s_bl)O016S2TFS77v^Y(65m5uvIz6_Hq^s; zHj}d?9&7dzwxZWMz(32hcBlyP&^ZdYc@Lf{Q=^Z zhx*CORk+?xuK?mVlp^sHg?=J#E53v{z7Dc(XwoF68lczEJNRY@@62jqd0+UQmd4|H zu)YI$Z!fvBIJ##v8+UFsNlx2@^}{TAwGB4R$$p(_2|49=(^-E5CEs}D7Fki6lxC%c zJVw2VJlVNLRkS9pS!>ahs9PSUZ;CcYTcS#=Db^foi7D}>cr$tKlagpkG$&dTY-278X~*0uSrTu%mv!=~URF07Y?0n%?LAfD62qGAJ)yV!l*(u99ph`B zvLsS0OYRwC@$FqfUrT#-YoxVad3&`-(l#cCdfV=Pl;rG2xQ*c(^@63_W*3+ 
zimAqz%R0HnIP918y-~?l##k(kBN*2w)H{`IoioYF4|L@{9F8Fs_5<5|ircHL>jHhs zm3)qJzG(05x$n`>b@n;wEMczlrSsX3Iji|x0&3;*;+b_hKeyil?JYWM@S9V$C7iXv z8K74j+`B=qE<67&XZ??p{TgdO5!&FtHCUH|{(4mDoStp+b89~LIxH#u-IHCz7L7At zuXTXmWW9XAhW9J7@24z+N%&0}{9&2VLZ3?kHShNDn>yq{SWe6aCA?^z|JLokcB~fI zZ}HnNmP}J_uSPrmS{)|eTqtP; z%|7N@QmBuFP#@;rwAQ#YRM4?+9OciCF$bKX)^e8G8ejHX2CQxLb}w@tK72gLw9e1n z^EzA1doS(}){=QKi}h2;IX`I2oz=0NZjyWMN-Xd%rE;%wc_7})r(Axk-7OcM|L~tm zPBMO;&=Rfyo3Fk*#qc`#jX)=N@nmdQlJUwiWTts(8{ms~{jkq7dfl0ifoB=&rC-Rv zk3J&?m!B=S_xfahZW3gKyNt|vDz6)%#@#wOvWLQ%C;aoB{k8|6y-qKBFA}rTwNvZ! z9@!mP>^)Xq!sj*8?Sos%X-%2WC!9RW>)_Me&C9&@-Mmm=sW#>`pOM z$F{pNPg&eFp=nanl%^R?vzq2K&25_3w77Xf^Q7h}%`=*3HP30D+dQv%F^!<-Yy6z< z9hlGQ*9H53rpO>WFc|7AWsWj;Ik|tSgu0xJv;dGhnWo6I5aW&|a<3Bkw;@#Te?t<*Dq_dugkmi>oQFwcTpwCT~o8B%_33=@1Y`ft~PA~xl1ZUdsqcxQuRj zt$Osg=Cdw0`eA9cTkGg&F3sl%W*eFmEmK-%w9IOm(=xYZUd!T^?V7r_bZPF=vIlu4 z@;>mH($w3Y-CKigw+w6Q0-m_W`QO#~qbc||wc$Vc%%8k?e;n=+O1GC|=5tjz{RxD9pNeNkLKFrTweW@e^??cvDqPFc$v(UYWWrUvLi|v>E8gn{2 zuy`t&pQd89I@f8d=9Zj2owX2;`wwn^XHTmZm$$Zf77Y^i->hXda}Sx`S+e$&9zUIZ zowU}^5$C#7B-L3zZ=CDqc)Hh+fA^CkGWH+eKW5MJpTGEi_I`SR?Rd}yj!VruIKMmT zyN^DV75eA(!v0rZ;&z7fJL3q}299vnS0#CI-I+`JzHMrQ`*4{cizX|hUPjPwZHE*Z&Lx07h z{uu3Dnzt&p0DB`lvb&vG(C3geO~Ad&T)MT(w@CLWy2WrQp@h#w7$U#=N;%z`%~K- zTXZY!y@q~LEnONqhj=(EwZ5GC&-=KwOZDc~A{)W&0E?7k&rFt-#(B_^AP{&=>Q? 
zR%tHGY_?-N%vw&OZDVWImrnVy7xvP81cA2UD2h0>?(}}bGk)(g9h!}{;nYjiRl7ex z9RJRTr8Ix`TdmZ?s-<;0w=X&N&aH$9`}#F&VBfI5oYp7rV{Wg~cQGHu_km`{^#eWT zJyFK@KfK3rUBP;R$$I?77$|G$9gT-ppjFsADtlS`VU2>{tIJ`HVfCz_JWrl{+oG&g zmMbfoTAEfW%gA%Mw{P06rEAl&mR%Hb9TI-mYU!>#Z*7G>XMy}q)pdCnj`?v)E{G~V3{^IJVo?_IJecR%I>l9M0k>8L_*wTuF~T3+U(HDoYr z%GY2E(d|rq`Mn2whb8V0SV9EKe2((1@{yH!SGRUpe&_dlxonlbU)}Olat_ISn#o-Z z=1eeNt3UYX1h&i4v?#k<+ec`K{Yr-{*Ln&F(4(31Tb4Vale z?*MDGbY2?%{~^x3$1LU+VVhh1ts1u$F{5*7ZHKi2`RR-4sJC+_sx|w1*Szw$*}OC2 zKXqud(^@MR4^PZ;YujCKjw#=U)x#X`-VRvIzSd0EI?ZK`q5ngw=O&0sf~GP2|nf*?(=3p`fNh*^lBfrGVJ%5 zqn6U#8>gCmnM%Jm)933?Xqwn`w%w;#8qF$itdr&mEiS)%;q&=zzpz(uuPfX!xfQ$?}b2hBBvVQu74zZN4jAT6^D3%U2pdz^#8yYmC>y6PmnUxc|8CiRj^6(m0sI z3$MvoeOY$&;rqnCC((HHfh;tVYEW-2F^c-KDAo%n3ua+nC^2u;*h|)bYU5*g*&d1h zGy@(tttIBS>YeN0?QB79IeUxW=^1xTB7N#3W^pgEzvX-4p2O>#fLD~44f|#HK?^Ox z?)Bp1z5Lv4e0=3H`^Qk_G1SxA5BGTLS-agW(%gf}yv#9t%^J_!K(^YU&cCd7-UhPO z?o>xVYb;-Piu2Zax9x2;XU6_fE$tOAo9$q+zswn(YUUWThuOy0%K4e?ZZ-F{L5`|r z|99wiECzjV%ReW*e66;6V#%|S>S!5zVt&?-u{>p-_v7_EpbG!}S!=&*i>ob)Y*_=3 z=xVnfzgJf;;}&x(%RQq9uk_Vg;(rCJ?>UVye^0R2%H^%+U(bB*&PRMfm0Ln(+JK&%@t^*Msj zEpuZ2wESjzw@6!kTzXu3e7d{dL+_+_*1PCeX0Fcc8o4^tH?mJ;P~`f|5F(F=93APV z-j=y7GcNK#=D`Y>pE))1c;@lUIgx{v!<3_x6O@-S%QMp=Gb2|<-pqWMxgoMTb9leIIo^R+!=ePZ46JLl)B^VECti}HzLkLaWM z@6>hbGx?w5mnSxjU8EhA+b?%m?%>?7xo>k1&NSFM?Q{B(AP$OicHd{=riN{TY3Q{<{9Q{;|GR|4IK- z_oV`#Odq?vZ~i2W+S=%k8Qr^0U;<)olnX^1l2f`E%7NYDWG& z^LwU^9LzE{k-w1!r%z9BC7+kxPQEaGY5MB)&hl>ZP3b$*ePlJeVLqYum!*uBxi_88 zG-Mu1Kbl@eSe%U78pviR<|gLePyfrmYvA8C@b4P3{iO zS_9vv&HR-9GuB$+3#$CA$`kC!LNljH@d2U3rvrpU`u>j*cbZcn|K`Zo1l%AbBC zb!BRX{9bC7{8?&_e3Lv!mD6|13*}@wmu^fyEI%iYNFSZvJ3S!%iu|Vhq})AI8@p#H zTyRMcc+9S!+d^)B@* zwIjK1aI@B!tc(L=Lz6S(FU7A(z7=oOfh%H*Vn4@*>O=MIrFCxoeBM#Z6?$pA#JXlbls}h+_-o2%xzp6J*e}~7+bcUY_qOt}(j~ED z;@F0vv7rsWkn5S0=?zxq#TLhw#?Zc2`Bqt`z1r|gObCU=nei62pV6kj;jd^Y*4TJ} zIz&A}y|i#&Ls-15VP)*SVDV(W^ZM8A$c8!I%lY1pKp zXZ(`btXMnZeP#TH_!$jzVkgB;i=9hG62bo8-grY}yTq1>KN@~)7#&A#Sp2AAvv{ZY z4)MDhZf$H)682PKCYJ^do_b>ip5 
z+l{X^o}750@#)6yp|=`xrdWtt9e*eONqlWQnn)STJvcEeab)7!#7&9ajjXIkAU({j zn?9j^+U$@Vmam1w^DnEb4UWkJ@2fjDGWt8TZ0-lEZCotJ5z$X$2N@ZMgtm`8-|$pJ zI3AB5QAHb@v}x1kxzds+8cY15)CleRvF3$a`d`{le zl-slOv$C=?R+VnKu zb4*jebWH_etO@k8Rn3F8{iX`I&hVEoj^?bJH1ktqJzP&R{e zJKlHIhN!D+~CqdvDzm?_wm;KKZR@y99PpJPE`Z2UR z^kV4Q&{Ls@L$8M(54}KYUy}RYUJ3m|?ty#2fpN|{ERWC8r7S1LnXdG zMmkbDitwqQawL(U4jR;>j(QAjjQ}68{p=qn{p?>$O6TbZ|8c}_^|97z>;wsXLFVnp zNvDx}I59tZ&5D2x~qc@b_`q%lTfE0i^lc}v$>-+(uP4EkpUTQ!M2xMaONZ@SAYg*@r>^MQcQ0B`&{4^)% zK977J`6}{Bgtu2JLmW(>M?N&ldz3{g$XG|awOphE_iMld1e`;;F8W6akjzsoaGn$k zcu~&o6&njAJ#+K1-$alH)d$s016JAuc#45yAV-)c<%oW|v>}-QD8X-r)XuPj4{(7* zNfRj-Xs@vB=#dNj8u^vnS^GVimZ;^pMEYmBT)JFpiokrI$YzR_G3vRRnMb0O$D-$x zJCx7YW=pfBEtG|-8*^hf$6(qv&^-WUM2HBz0zwSv5cnenbzql3$H0DpPGkZi0-g2? z^a)%il_MgUJs?A$R<0)Zp?lLiV79Z^xv_zPfq?^!J`D^s1%8h#R_;^wAtgU`3v3r~ zn;^a^&^s_nIYL>WJfSXCpH^Z1u=c2dBHtk$9yl^^xB-@+QDmsim8{%h(^x(D`VEqm zn{67a2h>IRW@%_(cwndjmY@-3s2yI#bR>y#@*f;HFfusOov?4>ugG7KN67DTVc9Rk zAO=?S;7hCCT32L|Y&RN<f#Hpj&LG_xpjdhB_v~j#` z+-sVWn4CC4AFFSd-ZsrosxTo?J*uDzrv;8OIF8K61a{4j3)Ig^B+r?Fs0{hZ&|~R_ zR7^NCQjTXecaO6Jb|NvVn@()!8TJ{0vm-6q3tEe29B}Ai$CzIds0GZjd!9y53tSLD zow;12QZ`5h((_}$-Go=PXTq$|Z&zt}amS!ufStFqiF-#Bgs=$XTZRynj1+=u~qVP%Jlfat5TN;gH z^H)omio#ccuZaAW(MN~}pCco7pkMD1-fjitVTl`Z*X8!i7lk&$C+atW&($`| zq6a{OC9|3(b0qlx9{7u!(+uAe&94FA8v^D>8OonZ+9ffx((*5o?JuJrkd^1Ch#km8 z`?n<4FVA~Zc|*Y;^aqvThC(#R^Sv5v)md%4AM-*j_2y$X6gCt@l3g)?_9(1vu!Ko_ z5CwC0qQURUe*nHx!DCFs&PJ(k3>(YPK6`Txm}}5ZNCevw+6kKk&9oD!4okC@)vS>i z^^=M|kRe5~CkSngc_O$p2HLj4jY-SxgpEm=H6*A}?kj*l);14r-rC2dkN#?a#LYbVawnS!A*r7$Q%JJYG}EW;e%SxQ9biy64=laGeY^UMr+gx?RE~r z)Vbsj63AVHz}ANLi5PlAxn;?d*KI2N932<2a$E$Mr~RrTzY~eYsp)<}VDJ3iK{Fi$ zTF26CWi@AeEX{j#5IP9Y#dZz?w7q`84g%D6EtNY5cPA1LSRv}U*k)O#hpbP>o{oWL z&!8RV2~xhWEN14TY%)D4I4IaZKO|_Tqd@CenyswnZ0EHdg^t2ss$V0m1P_L`$TTR} zQGnXPq=b3@;QmD7=}&Zg9Wv6aG4cY-NfA38g=8#gSk19myCauQj}DFwE-s7>n&~9a zI+kWD)pHCPerBtV|g!A`<4L109%pZ;mKlR)_*3mh8!G|RI^7V@l!L3Af!WYEq_ zF$Rr%HMWXlTj8O?&hpOkw!-dmdA1dR@_y9A$rE(;95n9^h6UuVq(?i;T@9eNtpN31 
zjcHd>f`&mJ)L{dx5VMoXlf`T&Obyn@coKhpa6HGU!R-Xf3BmugGBtQ%@Dz^mL_Q;U zt^<>TlUg%irfSP>!nMI`gD~$VY)}9h=2YIGzyJ$cQReG314Oed)Ndw@R;9`!LpBpqv*uFLHq~T@aZ;q!~m;<{SEHk!g|Xk;}<^T5CV{*Y`G{o)CXtWKa1# z(m#x)I;goxd&&U%({fMw0+I*lF7%iC%X@S5t_VTVEV;5nI)Gh)-<=cbaoFVgkr=2195>V4iU^3M~o$2iMa`BkiT9%8eq+LEI z>m#paa=G@vUJ0-U2I+(JFSW0=Q^@~w$CsEyd0pODd4p7w>= zpCY-RB0OGd!)kw=SkMOjHrodZW;^za<)?OxgACxq?9Y<62MWu{fKh-FKT!BNMR`5Q z*hqXw87vGI-jg;Gknq`kPk*k0U5+(H~I{H7rX>t-P9 zU}0z(`igGsAyWE^zsY~fA1PlDJ}u#krel=762k>ACW=5vX%kDtDR(5d&u*VRT%af+ zN;JfnEKU~BDRHNW=!&i!DOAfnq7z36m9mvmJ!1rn*`~tiZIm&BGhLL*IZm*1lCT?j zcA<6Z5$;K}l}tT|pE;gtP>Z#m;Y-v@)RTmhgn^NPkv4=q!hORt)CmG|NVqn3Oce6I z!fC>=aCuG>HjQjbp0nuAsBo>!(q?IA3VSGSoGk!*Ddo6Yr5;e8Cs=tW;mw)|gzAc90~gjg&K{B6+$e$ky^0 z{TO|^P!2v4Gqy4Em5VGIZ5Bzp6zD;&q=S-zIv=Y%PjhJic7_k^Bjqz=-iY6s~Ofyz*4({|FfQb(y1(KwZ$ zah(4^31p;@6*NI5AqTR1c{qQs%0LrMK|8yk^-OK+sS9XUAEjXVo> zws33cd?We{GM^ecH8h@-mxV42og2D|NY^_sGxQ(g0o)oI9U55&vju?9P`YO3ZWeA98sogCoZN#v&%zAksGK>9JJ%>Ya|OPJ zGDM?XO*u!ID?sdA0ZKY&CAC<4hWVi2ZqXLizRrCYjN z`axk!4cMUslOGf)U}1KsZEb+|&Ki|=)^^sqIXJqbHT-xl5$dPI9f`z9#w3C`T0db0z0))b>x+MV9% zCAlTJo%2g_9|}KYPtltb-DBNj2Sf)%VHyza%W-Mq;si_=8FWc)llnwJ_Kogk&^HRw zK~?d&V8tB&xlnC=Sk#+CqISL#7>7j}-w4~Mz7Y^AFOGUMD(c1e!V%GW_+FS8J=WkS z0m`2QgldnCPAu^g@dU6$eYtW-en=j29vuZR`|;5eEgWokq5UV}H7>TVtuu zl&GC66IUjF6)4Qw90j0G=gfi<#y~kaYUhel)^nogL@zG!yFkG@mKU{pTpUFozT780 zFuh;;>fF!Sr8#6o{?PnLV`^V`%-}DfUhazm3kbIn7DVqU@npg815!FbZ-R{KQ9VJX zK|Th*SMEl}n-G0G4@J+=`MOm;RUfB%agzFJ^o{7t(T}3_{>{Ep|AO?2u;&TCn@j{0U#ew*`5s(iR5G3(pi_ z0&S&$|IqYasl8H@^zm_ITC6fIh+PnyPDuI8Tpp{O88OO@i5nAHA8<+R@N~88l`Z({ z=eihhRN9$F-_i8eo&3$G*j(;3~%K_f2bekO90!R2I z?!=#QUt&w26TijD0m4Ek!GNDUpoZnvJ}WdjWXYi$G2p8u?0{ZQWDHJ9)AONP=M9Nk z*&xx?*VVUCqE>eB0qqmz*vaR`7K!TF*@tYF*vf@neUxnym9e|encEVS1(x!*#4U+E zeS7-uN$}i9YA^x!5%%>7m=1lSg&)PY~2hX89)#t<+dC)FTd2?OTi!*%t7I+2}_9{&D^PHJ;V`X8IZ~btg zUOhi0f7HLz&-JfSt{+re1p^1Sp@dbP~-@jR=0ahdNjpO^iDw730F z-P``GenGl&pf^_ZRJ@g|$@qwr>(W~Gx-{@cc3N&)?g_$0g>J=e#aTWx)QiqB_UbhG 
z(|eZsU1wyxE)7|xCGHbuGKhf;MVjf461K00@bXW7vQyb&79;?V~((MYO6y|tuXG@iCj zqgYQ^-7K*h^TjrKja|mRV+Jh2lb7w0RKwzHla5XuZLD#(=yS*@ri|lL$XP@>kwEJw zr%q1Y>Ru*>6c9+ZPyj`6$~IZb{wfNtc2djHKlpG9nRhn=AISsd+x0 zXHs@}`|~L?_xj9iOupl6tpAm4t8YTs+?ci`y3Gmj{pX#O6Dw11q+o(MTsE_ha;c?&k;JLIDPx%lhE7hR~EuH+pvMl{X z`e`y&?@Jb*^_h9F^c)Sh^3`;uT$rb{_;@ZZIK%sR$tPzUvTvt1%D&+HByHveAJt)L zwz8Tv5~FyJOksoUCuD@g{uw*G--t{-eC8XQK}KgzXq8bnHgk*v%=)G86f*W{?R@P!x6J*BynRvz`Ob&H z(^0M^T%Vbl`O#-*R>sbczL~_E@uScC87sV%wLrDEXR75FAJ4s+^899c#?ibCg=$~O zcyT`&2l3g={UzWBO{ljJK=*9s;miY>M>8)H9w$6y@M30i)}3>*l(VvDWsM(5>CdD! z&x8I~GL_N#xCZ*rhSL1?j58rW#o5ZV=!Ye_3oyP3B#UlTqc6M0mniH|dOc#j`5cD~D4(frQ*mbrqAI#n(I$;bV1{}eJh=alTF z1S)}yQgaqOfQz$sF3Mg+#<&75%3foFT->jdd$N^~_k&m7kKCC>7LXCS7!CdfS(qqH zLXEj}w%nJuw7~aysjt!o9DN51zLX67=!9b^2{iZp2;pJ(=$YB@ArZ|VQRx1C1<>nr3VZ2 zENtedw8_~)TQOJ6b+@sF->xgC>6Y8n#1?+O-YwUGw6Y6f+gvBY7JfV14OGM(4cVOJ0^^^z{SFNCwc+`Lf(|L|#EyW$366&R@#eK^v4i8?v-0jh#BBviQ?c0(&>EFt zetX`TTl2tGHg3&dpT9PLQ~r?Bb1=P+Zf@3L{$YMsv(vw`B8o+Ue8y;i}@Gx zuMk!fj`s7sm9LCr{m#6buZ9ouUmE_$`;o8nJm2L{^qcuDU!EWHPW+Om{Fz5i=AI>c zK%p8=_ggulVCF3US$?yoTNvo%K*c$n{r zg%bwp&DFr(h`;p1q)02NF@LcMzjAW7L3jY=U zWRcE!J?NlL=Vpt!wyqEU;){-=*FFX;5yZb50k(>{(Jl@ zqtyd(^>Uk14cirY7WfzV?^J5%KL36GUz3%x*zZJ{%R!zOiyrbnA*pWo%I!g-jP0$C4nV@0}?ywJLxY4%p8z__~!!esVf4t0vT3@Uk)E0 zv9m1jU~ExrR~?bkha?V3s4+FRT7$X0&6h0wpmkomlfJY51o4999BX;%=h)z$x?N}1 z?5W?KyF0fkb|`t;r}tCwtk!3gXOxe%kG0prpJ|(=0QS_q+N67f!0`!WUa)?SO)%~^ z^3zu8nLiqQAXxA80LcM799$9+<6=Bardo1B98UNgFf7^gzNmzHEUostrMsp1(r3XX z(u0IW61f>&d@+a|q%4(w46c&aNV2$8`X%^F@Uvi042pM4aZwW&N*@GQ6W(g&-IDZ5 zutCflKPf*a{Tci%xJ-IldO~_t`ZD;E^pfNw8CFOujPhfW5eSLj1-}lyseC8@D1Tkz zWrgyF64ey#N9`BwHRTsF2cY(gwo>US^7IgQ6NiL)ic2H!MShWi_m%gRKQ+q4#OZ_~ z`VXl#=?w{8vh#EMiu;PA#LuO@#80Gs#UsSMh&+sFfMEnGeNXJ8#IMNwkdg-Udl3eZ zxNi*WUgA$id4PDV_^Sl=gG%v76M2|;ScxDR93`QJqCYF^2tO#_mRP6!Tw*QJ1f#iF zF&2%Vl>q*dewU6HPi*CP=`X_X(utxNa^`fAQu-On^wZACBIBIaC!0~}EO82f%9F*j z4CyRV0-z0t!XcEcu^2VE^t?1BbdCrU4`ji7nCC2#mFi^0${&g|iBPi$e`UHzY$l+@ 
z>Y%$)yuz@72lYZ<@Tc-c#1AGU8@ia(E*39tjiDBrfwn|t+A_}hmYFX~_A*uKJGKiR zAUBH_iK$SE^a(SYOUwjz@aIdknM=B)klk5NwU7re`&et%99Dbzx>Gw$cd607pt^I7 zs43TojN8OpM3|W78nF=KYuAVygf|GI=0@>GlkXg{eR!i*D|*#hYKo+iLlxo9*5?G) zCT$$Dvp_sft&FMKmZ8eHPh9MXTQqvR*-^iB$eV}6heY&F zAv{?Ed_dm8@CC~wqMNQW$@V0%f=_4SkA5`DdES+@4ijq;TB8}rO;RRQ<3&#B)M*4& zeOaWe6#Iqt3@sOzi#r)}kai7$ypl-Ih%>dBT3Su3dNi**FFr5s9D0pR&l}d2#+-UW z97NC(C0eICXz%%w$8vFR(gIA#W?^U0ZZ21n*nXim$b6^JuA!Br@4FKTdcBjO?-wGM z-<6zdrD$m4(PyG>ik0z>_^}SWBbw=u{zCjg{DMGjKakpI`e*vj;?Lrz;#xAn{GHZ` zMT3@M&kVM;MjZ13Y5x#kr&(Ao$is4BpWjZ>n2$utXni#CdtdyP^cH&fskouqMs=I| zhwRWL;(KD2Ss%=LTKibH-GJYEW4`H<4tbiP8Fjc&}j7s`!VEMk8WT)mycjWg4_p?>z4jzFQp--YaaK_6qMyw3SZ~&wKAh>Xdy;dTO=K(Ti^%b+lJl z+n4ll0Lew!Fw!n^K=`2W4!UoiZF9u8pg-tF-0OBf=dc&RnKZjt$qt zg2Hx@avUFKT&q#KMG$7IU4on^hU>$+6eox4=lN*m^hi)9hU@3_@aZHAonUUqYKI8s z`BCbll+&+7yv}*As$AQsQTK1ze^gh$R$COAUE&g> zyu+AZ9=Rbhx0O30mqe~3698itQHjp)j@aStD8pz+5DnAKk*keqVPs+Ci8^^YQXk7A z_3=`qcAkqo7kRY~Rzz0R(K_vSJ?%s6s?zuZIL<(&Wvy|e<Eij9y5 zl^99NPh!9z`Cxg3JVgE~_A3E2f5i?{?2MJi%7>Pqndy9_47mo8_9&2nx7Q+{VH&Tg z((#+jQTAH-Je7{xOWokoAEM)x=w$?0+VDFT(;=C{kY zlhL8BHvuC@x%pSBXX7uuAiq%MSWYa~Dt%mYCVDlT&Ev+^aaMPh*q|T{N`2tSH?@|P zk9Oni*;A>GFLj_}5!p?tpH4+PZllK7^?hAOn6dl1Yha9-t5qCp#(hj0H;?0$dgzp% zs4zCsHqlO2s%K+uV{LpHSSz(jm5+Ym?Afd!=O`B`mm1J9FD_Olw{n(ZhWf$0-1r!k z*r5lkWtO|z{21xpCN3+K?q20AFy6-;#XNR{^Jo)RtvSYdWC$;(fed$e-SkJPfSa|v{>W17b&mJSKS6(N9D6)P@)gDwVXx=1kvT>#rSLVs8!EUuz$AW+RK9 z<+Mwz6}xQ+X%}F6z%6*g6i3)as*_(O&qNa_5ER?A485 zV)wpKtBoU zyrokP)q!{Q5rlX3c4^9{X<%)ZvM`57b$T?n&pXGx8B9hY0mhI~NG3Q;Fg6M3D`jXY zAI(a~I2l8-pB+1sOu(6j1T{L1j01BLL&$tcoa)Yu4IwgcXnX{*%}JaWJ1=%hd~Dop z8tY+$m}wP`OXB7?JGMS3KQ;*+N2K;STuc8c_B-J(L!vU&spc=^dON-54jxW?LNd_n z@-&XlKPSGw$G&&q=)}>9QHdiG-^9L&WlN8Tq-S^YlZ(iA6UzML2l^s%#s`{32-Y;T2qnbkVS>9~4$-m-Q4ljPD8)I#U|DyX~_)B0Me<1 zN%8aRV3Lu8=ArVr@d2rQ36S*+2M$Ovt}2Z)Q)|gajHyhzH4fZv%(bTzX6z zrg4OQvj?SMRGkm%0LSIAmfyu0pT&_cz?ALDc!9{2gzaXTzydC59jPqp>-Wyh=W z_SmX4-pp<;u4I%CMoyo=XC4$RiOZq)$tqNq~`Kyl6+P?F_mlyOqjLMCxYfH}#D5 
z%8Rz9(ug&+mWL;6Wq1;(bW9mHztfWSadz_T$7@+QkQdZEzg!<}AniXT7PCBtH3CvAS%D=9^lzd%x z0`xDD^U?Xqk^0?aqTHQa*m`aXXQ&H}bJaDaGuB09f--3xKnoX<^B539)=GiM(j(-3m*8t#!IAholZkZDq8+vaniT z?T}+d5+YaYuU7mNgkN1e;e6UU^-T>Y=hnm1>#}3>Q1;CNyBX67`LTIAZY7Lexmf|n zrm3}jn2Zneu>NSxu+e3xRPCrQR{zRIEphkyP5(vTIA1%#RHgXHz@5w&Whik%YpB(a zTF@8mdD~Dv9-&$ zfim_9dp12i-RL3M$={_3KGHXuBb1t*o^4oZpJ8ra9;(6o?j>#6M>0U$G~>%z#67MO z3#`G*)6jeHrpsZSm(OF3aIVulY%NnSY}MQfcrj12&ASF$fdcDmWp-GK0{bRxv+3ze zTG!&r>_M&G7-cPC?K5DTLG5y~ed!hk%>_B;0X8~O9rZ^$)-Esgh4zTwk3UVxWk3%o z`3d6CMss}1)u|`5Gs)UsC>zSYM$l3KwNe@8=V!cW{aMoKr^w#giK>&YN!5qJtF&eY=6 zBXzQv4ZPLMHgcvUdWq^JbsBTbZZ_ik-yl6WkQ!A%t z0Pjytv-QFZFx3DpSCMgEy!cOs=Xx?8%w^mNFEc%3z4D^%)=V|rl(ACnC^1(57d*## zDQ|Tehol;>e(`OOrdS)qZcB8En6QCRWSO9mro_5pLqGOkwEH^*eE3K zS-3x(Fir~sqrLF(F*8U8 z^uuyS4#t0BErK*X7c2GQ(gGhJ<9e`pp0>iz8=Q4j`2}W zf{{UvGe!o1(Ls=MQRVDzqi3;PVV1wjy`6h6_hIfQ z!e#kSa-Wn&p};KFmx+8O{GRjY|H@IK`BbZ{kZ9ENWhFVOPwiJ^X(YanXvx3xDwDmR z23+1+?z#Bo?9QWiRIY{bFzUf=Jd8^9ALr}ivwVGgm#-B%4hG#zx|4Cc_%XVmPBr*f z6B!=AM2_q*htCc4?pxvbFK&KpB#7Fia-$X8Q@DqWX8@CxvkHLQcm#pp$q7zQ zaxGIC*1EZpX>(IeAk~-VEf@I^3Mo zcR+c#Z`2kjll9z@leaPCwlE*nnUi-kWMejJCr<9p$vbm$Pfp&IlXvIjKAhZ_llyUU ze@@z zoO~T8U(d-ma`H`_d^0EC!pXOC@?1{7jgxOjnH=}-;N&||ChO-ePQIIy@8RV6oV69%B24vaq`EU{0S$2%E_N`^5>lV1t)*W$zO5uT2B6&lfUKU?>YHL zPX39Lf9B+0IQdsjUWYPS-rrCr%lkVg|G~+Ba@ zA5F_HZDkTHzEtf*-dbBVl-Bj7wNHQsu&oA#h*W8+_)<%kQ8BbynxP%Mjr zUH7e9y6$$nZg;!C^E#u%ink}mST0PA zvFxb$qKYr6c&_40E8bc0Wfjj?Y@^DhRlDWtim$EsO%?xW#b2rTk1Dox#pcF}FD`72 zO}J9eupxrs(V%C$3(wt{d2j?9kaUiG28q$B+Je!xyUtKxUEZv$?R2Mu&1TTUq{~Oh zx3Vq!TK?l^O8O1-3(wlrny^E;*li}cb!#1PH}uVQ#B3{O+pN>iI<%AZi=B?!AF6C- z?wbum-nDMWdYP>&9$H!JK2V1&ee!i7Kh<^MgWG|8gX@pj#5iKuOd`r)x8JyZ%G=>) zJlORcv|Z~uh_m-qc-GddEn=S8{iJ-zI4@+iP3*Bs?6FGh@k!i>dzxRD2hZBbDewpW zd%tAaPwe&+yUoOIbIqvQB8HhU-pqD>uxwH;9}>GwGUMDQBc5Y+U$;4J9qiAxou!$> zMwXwM(N|`~*11ES0omup$3-3Y;mR92uNtR8rY?5WO@M1%j{lp9CD>(W}?0t<-l)lL7i>$uLvKbt961$xZ8;c#X?2u)L zEIYwr=iJjuTe8}c)t0RB2FE;-WizplH@LE|`Rz}ZzR0pqmYu|ICphdR_VLbaDs~d@ 
zL;jOxlWgH9S@wg&=BCZX4q0u<(ogLE1V`I5PcLoBYD-qXWZ6mVc7nr>O~DRhyVRWL z%zQo=JmWbm-dC~a!u4bQ4vcNpFS%*gzM{D!?D$$U@hKHItCXK{X6cu#e#x>)md)U> zleh!>WZ4gS*dIF^_T2}}UEx{Vc#h;l{Z8WE&}HIh*E&y~7dqVU#N6+)95wuW=x9$# z%zYut7ej}6i2Cnx@~qi5u-{AEeQ)9R$tL7yWO;*_H?Q4I9p_HPJR8ss@@)(DGH}o1 zCdT~wIYIVqhI=dZU49_T%@)+J%NH|u#Hibl52!y3US})hID16fd=8-x(Dyx8oa`WH zc36vpCFIVX7OMh`aSeLQH^`rnfybnQobkE-Q={bGi1}V)7r5NI>wR3}4Pr>L-1$T~ z#tZ$B7hdmU&+=fL9I_o}+SwlcGCrA?8WVSHsc9~R4mxl@oqIiBJMei&YKy-Vb1uZR z4|d-;Uz8(n-5BK+BY%dl?^6!$Ah!!6uUwxxIBU5bFwbLRKO5LZ;m@#P7Bcs2%9%EC z-V^gomE~xAG20>^;2Ri!p1{qC@=3d-(h*Zf%s6D77p&+nHO?3*?1Sg3ehW{!Jo}LO zfE79B^wO&B&WbN1ch`6v=99c;SiU^T!B?D=CtaR>*j4Ei?po&6igAW7@`~HTE|<-& zT}hkB4fqxuG0(hQ$?+_Z<-ODv`slaFCkroEyjpR4c+DdERX&{NoX&e-_|_f5TvHC6 z2IsfLor;IRQ`X1@dMu+H&u}TKi!z>+08= zj&rg}mQ68jifL2qHgCBeZ4qw%)9FlfX>-903|W_JN4&umo%#Gy^J2)yP?N_cXFk_F zz_S}|VvbOIGb86S`?FqWqF1#IUc$2;+9LiNx*PGV7tH51G54=5M@?8$>4>Q#j_1Lo zQ}p@lMxEn)-kIu#4%-eTeXQ1DaEgOCp5q|rx%#xw;q%&Q)~tnGbC=l9REd{np24$H zv0vDqcZxi*`;$1Hi9&~UZyPwpFf$@4j{Q+OZK8N<3#Etcv%`aD5UxVVNVjH`zL%FvdlRmfs4-M4T z*Y!ehC-bw|i=W5rK#sLz7|TYnk1_EY@Mg)^j3Ex+7y75VJTc@GS3|z$>9vf*wa)Ot zV?qwOT^DV~J?3J@Gs|H=Gv)$}JrUPW-=^GaLTVcvI==2YzOLD}9Uganb@VmnePyoY zXq)B8rOe1Bu=hLW{eqF(-7&8}iBFLCs81cQMbPQ7ZS-~CiFU@u9%+-Eu`#Tdv9ST| zn=Kc{dX#+3*VJa{Smc!rHc%h(-S+B=?>QSf=*z5U$F54eYLC5UH_o4|^Nq8|Z4@yn zyg<48KWcYT4jvnA@3beNZ?o;1x6GW^ka6xBMZP_D)iqj2u#*{fz_5>X5%S=W!zSdg z4;?US#hv!0XbbmN6GJCB&O*J$b%IwXos<2-3(XU5hxz{@EBrj-0J+xY6E6dIao;}H zYaV;~HH-;j)jR<N_qFX!}(Xs!7yZWA~9+K18&w^Yr zIbZB&c(U}#H!4=tIn;S`(vd&ZArGQ0d9mUH6)$J@`>4`(p>c-G$jQ(NPC7AO>hq5D z`=f2Q<^6ofkxLkRV!!*vePGXH;?Ebj^XVZRu#a!Cet?Ysd8oPN2P^)eir)%W3~zZ` z#c%Dn9sYC6+X`>IW|VzB+5JwF`0L#@9lPGPLK*$;hYuYwo&meK?MVBtn9pTdjy*cd z!9zvghQZjQ;~5RCcOOHQ9PgyEeY|6u!w7iCMOMFwYdb_pB1e;@OLhA zF_vC1?euN>RLGS}WaST8`9@|Odeo2k4URa39mug>X2Pb$d!^a+urE7g^}F?q(l1%L zN>*QF-K!_|d-dR)GueOS&z#ABvg{OQo-4L}+p;D3aIhh>t(g9ZWrsTO$8cAlcrcq7 z{@`vr^P?w79L0k_A%kqSFEJN*bet#hw@q5wzb$KOW!%o#o5zI 
z=YSPnXq+4O2hSsvtF85uZv*Rr)BK)317@BaY{-fgS+OGb?&iA;vc_9@V1<`xC*lxt z#U$*j-cp}Apnly?#5Ux3R|r09*2m~~`%fBK`5e5m#_xnf2jfL-ow+V-k(_*RdGO<* zKT`CO&aD5mqgau3#!y@C7nJwz-pIS?ZH)hn_lvuCx9|7xekY%J6Y3lM_r7?io!2|= zv=fhD_m>XrIKv0rOHbn7IOhZIhg~Q7nmZ|XaW`9ZuBdod=I+JcliBYZY2WV~Q4hUs zyek>*sE#cf?vC=hEBuU2ykr#m9gGWSU9q1BCcZc1JjcEi-1^v8&77?X{j0z@hjefr z9(pC1F~`~X`&}Mq;|u;#k&|OCnhX9hhq;N_@i$O#IttG9z=KCBV=~YYr)EYvU29s*Fi2m<@JfDtuUVB zex$UuH#qCPm2-N>;mkFNJ6P&?411ZI%q{m>>S)hx>b7shSm3AMEgQKV9>O_?3_tnq z!{uc7w&XtW-4On8-{Rfb#G6JhP4hDGqt5XxgMDykjF;bF?DW3ePsC7iey5Rju&-r# z;aT6aB<~yck<56{m>FwoXS$1a?lEp#eckyNOzB-7bhJjOpWoBDZ!=n_J28!1r&AY% zeSY7v-R-;oT&FudWP0}mZh0=nn%&t;bF?GrLlKNWV~)g(xpRthX2kho>W4pjm{;5cz|5phJai z7_4(LYDLy5%z8q(zbip~7^EB{t2UB#{v>M+X2!h%Svey)zlp-RSN-zaq0G2Y$&BxF za=&DqSMlvlmXq1_3t4S({*j#LqTCj>4vc*& zzDXE#QhnIra((YNNA|km?{f#eq_Zo_4K=~_UB_!e;z{gN@|k;<81?Fnumj(GznFVF zbhO^d@}I0R2FHF%R@{PD_&n$wIrf;%?^nNm_Pdq-FjgZ!Rs321V#c%8diX;**lYMy zr9qgWI66e7EF6CS-D4M+?KsQETo(yQ_k-d7gCPlTS3|6_ko$Q9y8-va>1glf8&(p zi8tWx0?%s;rkp2JpU-qzAM4cZL|$bc;N4=_^t{D$f%;nVZ(SC8`}$3CKLd#I?(B_K+nd43pUela?RSbmZ<_Vqtc z9jpbM{hXEOowo3<17OV^S#8Ox55X&}56vCWS6^%I1WUj0zzQ!1vtN8~?E2(cTlWiz zF?R=x9kTo!yDRBT+m&~pjA!k8?txtXkTn+aQp@upS$-BCL|gUc^0c?w`(RW1;P?I_ zm~Go8IO-?a+oCoGuad9&CE8*AagOo!ZT&9?a~{Qf$5hGf{p>69GxNZT9Ph{=$8$7v z2KJHWYAM)sztUOQb_m`8%zj(HABN4y6EgBd_sOv7xgq_b zT8=!4d(PZ1-j9f}PBSAnwt1XKaQxfqXM;QztmWJmX8@P`TK2Pnc(vQ?1*diQ2Ff{K zShKmW!d$;Tr%@Ab=KO*W__~&Sp~2Z8_2uV;tdD&^`-Anq$5e~R$%pqrt~HCjrr60m z@cl?~^qYC$dz0kwAv5=^J)PvU%ZK1U)}8B&d&K-Zmir)QJ|MPYKaVBmzL|Bn9`_)# z@gGFvez%GFg3T%CWJk=lSzhbR+O59=ebpbb@`S8WPHjgXD!$$DtHlTBWRonLWYuTs z(}%*3*~k&JMYw%jWQ~`sHBMIi3-?2xa_miRNB+P4*T6SMtjHQKS@8*u_&BFAUi)#h z)!dQg4_S7|8l!V+`?gP%wq*HFR=;G;d&t9wS3F$WlGQdi>I_+Sf=k=vgMAWq)Yd*# zxN+9n{E1J4<^PX;rtnYxdeJ9K$3C0nt6lqzif#b zpf5a=-UpUG&J6OcXYYLktlS_g|H<;f899dWN}qQ6Y#X^ZW7~c++3(w1{wbJt4mM=j zDJ=O?%e|Fy+HXT&YmIWvOJd(!gE21{BiZYWIQ+S@xnnu`JTT`Nd!HOSyY2SRK`!6O z8l!WJcMv)nujH&_qhBa($!bfM{;R&2;&9l=>WeIW=hRooW4)7=&t%n&kYl{aH|I2# 
zEl1IpVn|MB50|??h!5q8)z3ekd^^~X>6=(~Tpqr)p(8ug_j(JPvF4X@jgj1Kd9Dt5jL|ubapP}mEWRGaz8-)2OOPv`&MBUhYc77_%SDGQ`Oo}L zk`K%|wWVA-Klen^=^0t_cmHmYlO=!cG04T8m7?>`C&5~4WUV!_a)Yc`ku@%|Vig?m zN$k%Z&M8)uqagUt1z-Jp#h=1=RPt9g_Eh+}25TJOt}~wec6}EaHN@XBC0?zV-*US? zboec|7(b7*JM8#-6T2<71wU2E#r#HH%x~1ir+QoG2KA?lZ`8%7Rt)(hzISEJ`Mv8d zaQHKCS0)~x$9}Xo?MHYg0@i)PZAE9}V_?i3?L196?^I39d(^`7lZ9>KD{eF1Yv23m zAm_kgXFppawhxe7<~dn;^6^Ul;fjC%^JuFaD7@Hwt&*=+{I_3(zRq3vPLFNo5tmL+TNC3-TcHFyt{W>z++M^ljmvfn|rRagk-y zIr$Uv*uTi?D>&>+pL^)-|Ge~Ne*mV<)vg^+j0iu~k(_mateSwhAr4Uo$f^lRj`w1t zuJ7~i-@S|R-egx|$geyZw@2~4ist2c_&^=Re0&~tKt3RU3eQje3+dzi?B>K{X!{4L zE&6>fG5XD1%iE*&he-$W%#eSzl4ov@PW%z%syWA=zJ|VGz0c!!Q0&9l;q#K;+cV%{poxStbLZ8 z-YL60)wDmheefJ{gT6E5Lx{;c{scN&3yJ-EU9$Wkv+W?{I_FTXwGbR@!5LZ2wQ$>? z7W-t`B&%(3v?Z&~J10B8{AXn>*Z+CpH~a-y<0Z>xaM&d4Ir_u@viL(*ObQPyIMzE^ zHpx1#hCJ*i_S){8e170xp)bXXthu=LU%Nc!;v@g2_(Rq_lGTbK{6W%ijY`=}LJzwp58 z#{Q@q!6WpI_ksJz$sq^t@b6eqH?nVqc}JK1#67u~_vCH|>wVyN=2?z5u!~x<8#0u=h#6+O_8r_pIVdeuBa{S54E3C&nc4x!{b!o;9{#(d3M`YIF-uS%nE1qQ8aZY{R{SCC099ym8*0X!Q36>qQ`XZ|)B=+|>WYvd| zM|~ixu72!W#U@!c$@0?~zHz^xTz!#cKjdLw`mFi){`+E|Ec@geTgHd1bxoH2;IN4w z%i0eQ@E@!-XT}G2-7jYCr`P?5q|>(}{}HTy$r?-H#n4gA-}Rr0K3V#OUohv0$$%XG z^zE1ab7@OfTe9qb;=e#g^GH@4$kGpveTS@=1V>DqQ_kG~SFouVlGT@U(%JG~p~L+V z>(^QOWY@>~CF`!^1Apy0h&d9RESv9tp~%USzwU1!*SN@%+kY$i6*q-%ZGPZ8(9s-` z)vq)BXYC4k#DpySWaYE;nR|!+d+C=f{ZIZ6$i?sZ+oIzfef5lT>3r(%prdh-`nAq28aLlRN(*LdhtN3Q$E&R~`F8+|^4_W?@6~o|&8(BUi zb{~SnhY$XJ=_@$ek{Od}E8MRbXX)ogl`}l^i+ScR@~>7pncJhyUoJ|gXG8zf+20S| zYlk0gb8a+$c_{OmZywL{LRK8e8Y7+s+BDZVV`Rp;BJ*%B@w64jShhnRbK2nd;fQgk zy%(&$@V$DL<2>Lz8wkZ4$j8X>G}DyO?wboK9jYk3NMDf?6`bf=XUHD**CIuP9|eQD3>RjAN{|{hrSiQqvBUK!~c+UrtPMt$@Bj44p?i1 zET11GEBL8aP30qg%4uNSO>n?M96ldu}DgTl4heD_4zw<52aZYymjOQ)BdCNLDyB#I(O?E|0#J$^M?se@S+|?kTrNet3F?01K+#Y>!-MT}%<82-s z2jdymfB(RP6NQK0Kk&B5+kyY`fFlu~Ua;bZ`hO(l5?QfA{%1K^aYN0Qoh5&MbmqFi zv*D2#JD>Zq9BbrA-V5HJ^aqCL5!bsnz-NN$b zVu!pf)&*I1$n;?l`s79OcrDkwQ(t{0PJKb2IWT=X`jyV-&H$^{ku@%|d?2&0o)sQc 
zycjGWs8ibqyALi;KH&YkeBk@{Be{M)4LSQ-?c%O7bdWQ+S4?uRjVIyu=;3vc#~HGj z*a+5rQrazjX}8&m_y zPCGoSh-b|nEPde|vYhgktbXwh&iiG}$FqO(&Ce&fFWdpbX2dEOF+|NoNTt8p#hxDh zVt+@?$!(E`@(21;V9oF6&jc&q$XZ8alkua{`b09&yewqbHEBe zS@A;i+q1>am(K$$=gG=XGUsK{$nrTjeE!n;XsdZ7YaWB+3_#YLlJ!h&+vq292|x37 z;mLSV$rqd7*-`o;v#+I)Yh6&TzK|P=3C;u0S3IourA4a4Ow*?IkS>-X7U0s z^8r6Um>53~XxmfU)%oil9ppKB?}bmMdGX)<@}$9MKf~Z1?s3zZ6X%5Ew29ia^$f7) z0{M0v`YGQs<7Wr(mapY`75S#N%snx3?~`oH*dhiE;UvWIH(GKRo3*315`;e|pfPQ)ws+Zk z?9=$0$Ng{c?<{qC_}imx7ymC9{u?4U;n#hiY3JgXY0TI?_F;SI|MNGs-1WDz#Y@oU zUi*kWWJl~#`@DU@zFf60dUJL%%~5+N+I_wq#g8p_q45s1eLH-5xE;q2EpD)nwHu(( zw|m+?ez0&8Z2R@i7Mq9W7P}K^Fx76i3-LtA_}eM8^Z)CKjW%hU?SjJXgmXLLa{M_3U60;7Ir@)r{GZ@R zBBp1)_;1ipiua4!@NXWlQR_qFui?QKySCkBuk!iAzki2cb$qE^*Wyq7ueI$xh{Rp( F{{u2gN96zj literal 0 HcmV?d00001 diff --git a/go/mysql/icuregex/internal/icudata/word.brk b/go/mysql/icuregex/internal/icudata/word.brk new file mode 100644 index 0000000000000000000000000000000000000000..80460c6012812b2ac0702a64ffd82695b7c7ace3 GIT binary patch literal 22232 zcmeHPdvILUc|W^%ckgOfTD>ewtC!a=d8PGB*v3>ILqUyW@Dm)H48nF6X^~aSl98nv z*Es1+pi`P@OKa#K&9nn;rg$iA$b%-+4wFoQU^g{!#*=1@At5Chm=+p_Ev9WINx$zq z=iKwS_wKIj;UAiFrF+i#zTfve-#K^pTH{C9D=QW-Y*BX0%;8)DMzIKEx#8)fCuSzb z56$NC!~MDI)(&2uD^88&CX2JBnW5a?sfp3)vC_)ity80`i7`EUs5EnIb#D8_Xld$L zDc6?+;E`jw>3ef?$4W!FL$k9-ht{k)e*F0AIWD|K_wiRv-*==oY z3Fa57snu>*{c#DD#SJ%Nk_V_TUXGO7cYy zg_TWZU6y5TmPlHC;5@L#R6azVM(sLQM;0j8Y4e?Q-f6N zy;SScg5EDm!CAzAfNaIW7({zrYMsm0Gxs{-hZl?74N-eHc(F(XTawy9`vuRdz*>(& z71HX5?uIDr2HH`fyCDj@!PR$zkq@%m$F8US(ii53s?(ra4_sT@ind?G3FAXVVMn>N zqrwot+(>&4wjG^PwhnF2g~bwuUFp)U3_}1Do-nX);RjlT`F2Cp+zk;t8AX$oY}cvb zDb=Z2g;pg8)|j**dY47*Dm1K+j`gWuqdHwrLYk+t3ZvDl5bA>}s88sytgw6()`eCs zbX}s5Q_8x`sigyAF+FVsTYO*eXubW#r{Q;eeml`q2G&MCh zwS)`$IO>@6u_U2puRs9&(4JdTgXohC9-8PWJRMVfFZ&Rv10SCJ$6{l&ES0XTgH^VY zV3e`SI&3Sj4}+UO3QTAX>k_ZY4>HR0(TFLj)^PO$Fpx2s(-4FuBr0g##LOq6ES?@( z`6*{Zdjy^czs^pwe`ICj>WYj+4oChb@^S>}_0clxj_!y)82u-x(Tz@-9dgb%uQ?w$ 
z8)L^~UyJ=)tTVnZ{%HJSyg9Ku@rQ}?iL|@Teb{};jVC{r{A}|1WTfuqy3f>oqwbe= zH>N(FdM@?z)aLZ5^w-nxr#IBk)qkb_oqA<|QNzB5M;o4Rc&j0w*_wIJ@C#ADmpPkx zE0b?L)Oe=x)yC$g!KV9~o@n|}Q&02$=F`m=n}5-KW6P1bCH|7DTkELC;Q= z;Ou;LY4@^UT_08kZdi8TGJe|r>aKS0mzKTI`=e!xmlv1cw|u=HFQjZ}q+1S6HzywLW!w>S*c_xc)r#52;sDKmJgzSo&MAU%t%_uh_wMFkCOW zqj-OmXo~(?%v{rxE6h2{?*xr_J?VS_&^UT3R%SUqqs%Aj#@`>p@9Tya(%Vad%IAo~OY|!SU1{I$5#{8SPPBi0 zy3F=QF#GZBJw51tHvdNa9Q%{}!|q?^pUFS2`M@O*S!Vkp z-^^biulal=@_IfvMy~nY{1;-ZpW8>uNTi{k<8ih-k^|4V{+m)g{R90&gq1$(J>qwP zr#21ggYKEwZTt6x4%%180M*nwcbbI87nqP7ib?xT( z97u1EoJ&9HpvO1C>~!>O8drl~A3!yg*-!g_0roQkP%X{912Q&=G5a!GJz!Gg>`C}8 z!7tQ#Etjf))Te90-J;L+anOF1^4r;*7qj%T)?V7U?5nQYG;n#W+U>CBx(uF9kBhx` zyyXW_RX4KBu=+Yi#cNU({fc`Rd{Y~V+&Iu0m6)9aAMZXeaBbJU732?54U|GEL3(lj zz*o8}UvG4C>_Y>8T)`T~aZeRnDj%5dqi`-=iajy##Jtw*nSqhWb71G3X3NRQXy(_; zswuPcVlTYNzCZAjftST>Ny9Acz21iR2EH4(SAGZT;JfT4;NdUoI1lxMHB@G}tMT`j z0}TbdaP>zbiDtaNm2OXVBtM8g>eWJZzt5$63WY*A4rAHLuC#0l#UNr=#H{WheJU&F z=(QL>lO%OpAy^k?@2oe=%OQJC>A(%wj1 zAU)^AxqQBT0gcYXeYs}g9F5P@dyOD}6wS0`&jwwEyt+|3t3Jcpqbu`x7jgwY{{~-I zLg>M0oBBVYdTnIT{I}t>d!d)dVdt*ad63>9+PlkwHFhO8wDa?V-))r6^N#@?+Z&sU zJr;X5_PyBqv4;4n_>TCI_#edoHhw<-PCS{&C)Ol(B|hcID-V=uWBy0lOrp#_pZFqR zzMc3F>w25!^{$Bynyqw)J=CZ-lE^9d_uZ%5Zvy>w;<02~5_ucwzBf6Sd@T8F@Y=_!N3A*A`2|zPdS!@;Pg+Tl>>G={NQl*x%D!NAU5#8aQ31*721($7`{8 zJmF2ef$Jo3Ot_SjcIv&>kRnhfv?06E31w$UJWlt@wc4a8lt&^MUtq- z4^gYg!VyIX88u&HBemcy#0xFOgLX0vFo6sZ*SZA~Elj{vi8;dM08kAirN;#G0J!HS z$C}9)=ao3{nSNvqwS+f>AU6Xw=Ofm zlaT_#DBv)bWBfmEg=y!SbV>*NeNV?8PL#PD76d%^;RVs84zY5?F-E|i3&kqJi&=!=YAWEDkB<0vIt29{PCM`|s`@hl@C7X~R|97m`G8hIxz zsmO7sWF!V)$~ZE)=`0{xJK(7uk!A<&Xr>x?YvXUgl$?Zb+}iuu7Z{$ zdDzL46T1!AGV?ezSKYJAl^8WA_4?LWF?b~jSqMaMKr5MqG*Bi`Ej4aVUwQOLwT^kV zY)40yH_{=b1u3hSx3W=T)Y0L+`s9PRL}-b&<;;=Yc&ckjo-hN9;O{Y(q4b8ggHZh4kF3WuWJ`%2+O~-gnq};W`k4 zBkX911ZbQ@EDn?<#JO3ejiUpzrxC@;sj6S&g$`+n)P9lYYbahYOtEmZi8E(Xil}l@ 
zsSg$pm7ub!w(PdFjko!=&1FJmUz=3W%DLg9Z!vq_-5=h&sP)@nS~C>{49;l9X&UG{j=@=N#{(kXlhAeb!A9{j)o2ye>ONu`($Lmmkh-sR1K&S`E{`zD20leiHd9uu)0f$Xm7A;ndQ&Yp$stcFnZhOAN zphm|mn3s~w0yV5SvJ_O9>ONhid#D|Qq?^}K*pkL4S94dc&sUl|C8VHMMReF{L!nY8x22K#|VSR)G~N)2VwAj;f8L>#96< zk=@zW6^iPTjiC^J%TD6ef|$XA`%20=jo6dq)uKfN7Jb(s(Gl1 zLxO7-^ML$J<9DS?EkCE}C_d&2fQ#QFv@hPC;(&qFqGeg2B!_zCh978X1t1C-wCa;k zTVTzOn_d`buX+`qIOaaBWZE#Q)*eJW_pyeRex3yLg@rMLcOS;QhKctco3Qv=JM^Cv z2%!Z;{?l@3TaEMOBB@?1pb(cylD($YHs>!mRwXpVFN7wQnjrI05Y zEe(yWlM-xrUrs;}y_f2^0kQ;=pi-T&JZSC@j>#svtNr^uk-1MGtL$)h(H4xzC5k|d zNE6yr#>fLwt|y!0Jq<<@NTG_ucvJ2e@4`(>^rF5)h9#DS)4FJ)!&%B2_ZO>)kH}{L$)o@kMTAvf7Qv*G zS?F;c2mU`EqbU}C0n?k}zzFxHBydOY4isUU+@>kGLKDp=BJf}_U=+j~Z=&dc1mcn; zd0O%CG7_p$IX4L-Y1=L}Zbe8?zR)oGA=P7#Q;ba9Xig3yg>&IfAPy%PAxJ<;KsFHQ z-PZ!+^_)IKTk!qPYd9CvcB}PxAU=>dm}v@{Wc&=9{C{Au9~1MlqJIR4_7=QY+AOpIJJ3X2<}7sH!T~!WAn(} zM^D~4Ju`OjmYGuV@aEw=P62)U7Ms3(3(|L%j!#ZZm29RUBjRKC&z7dfYyhX8T7}jD zbl07?*$5z^_#LJ3iRr20hcinBJBQ<1-|I9ohioGRJ~ zNk`1q(mgY!;|I5wW+7oV&|o6^=3ty4BVy0Q)c9oS;O)2!wmO}M*x#6*yC)2*^ANjZ zVr=K!5ioBn^%w#b>>;2yL+J`ugnTemnwc220fLMOzO=y(={&^Vxu-O~=V)=%mky7i zG(zR>o*3mZay^1FJw8)BdT3%4p<5?risQ7bLvn~^ed}0BqR4Y?j#9(ZQ?oPElf?Qs zDk~i+9UPvVI|l16w*iNcjpEe!+~l4UlaobQA`qG@j;>w1>h8I*jm34O?a0x|(*3Bk zbU*kzv>kSc*i6&JIw*MP+BWbOet&af^9XJRF9NSZ0Rxpki@RXv#PDs~0O!e|gyxF@ z1>7(cCp}!mC+VhEj3oimLp)0 zFQV9~cm(XKogD%k3|S z>$wh5R0dHiA)TI=dBm(fU>+g;99R*usUlY@@V4v&YV$u}oG-Q53eSe2ssc= 0x0030 && c <= 0x0037 { + return (c - 0x0030) + } + return -1 +} + +/* Convert one hex digit to a numeric value 0..F, or -1 on failure */ +func _digit16(c rune) rune { + if c >= 0x0030 && c <= 0x0039 { + return (c - 0x0030) + } + if c >= 0x0041 && c <= 0x0046 { + return (c - (0x0041 - 10)) + } + if c >= 0x0061 && c <= 0x0066 { + return (c - (0x0061 - 10)) + } + return -1 +} + +var UNESCAPE_MAP = []byte{ + /*" 0x22, 0x22 */ + /*' 0x27, 0x27 */ + /*? 
// UNESCAPE_MAP lists the single-letter escapes as consecutive
// (escape letter, replacement byte) pairs. The pairs are ordered by escape
// letter: the lookup in UnescapeAt stops as soon as it passes the byte it is
// searching for. The commented-out entries map a character to itself, which
// is what UnescapeAt already does for any character it does not recognize.
var UNESCAPE_MAP = []byte{
	/*" 0x22, 0x22 */
	/*' 0x27, 0x27 */
	/*? 0x3F, 0x3F */
	/*\ 0x5C, 0x5C */
	/*a*/ 0x61, 0x07,
	/*b*/ 0x62, 0x08,
	/*e*/ 0x65, 0x1b,
	/*f*/ 0x66, 0x0c,
	/*n*/ 0x6E, 0x0a,
	/*r*/ 0x72, 0x0d,
	/*t*/ 0x74, 0x09,
	/*v*/ 0x76, 0x0b,
}

// Unescape returns str with every backslash escape replaced by the rune it
// denotes, as decoded by UnescapeAt.
//
// The boolean result is false (and the string empty) when any escape fails to
// decode. A string without backslashes is returned as is.
func Unescape(str string) (string, bool) {
	var idx int
	if idx = strings.IndexByte(str, '\\'); idx < 0 {
		// Fast path: nothing to unescape.
		return str, true
	}

	var result strings.Builder
	// Everything before the first backslash is copied verbatim.
	result.WriteString(str[:idx])
	str = str[idx:]

	for len(str) > 0 {
		if str[0] == '\\' {
			var r rune
			// UnescapeAt expects the text after the backslash and hands back
			// the unconsumed remainder.
			r, str = UnescapeAt(str[1:])
			if r < 0 {
				return "", false
			}
			result.WriteRune(r)
		} else {
			// Non-escape bytes are copied one at a time, so multi-byte UTF-8
			// sequences pass through unchanged.
			result.WriteByte(str[0])
			str = str[1:]
		}
	}
	return result.String(), true
}

// UnescapeAt decodes one escape sequence. str must start immediately after
// the backslash. It returns the decoded rune and the remaining, unconsumed
// part of str; the rune is -1 when the escape is malformed.
//
// Forms handled, as implemented below:
//   - uXXXX: exactly 4 hex digits
//   - UXXXXXXXX: exactly 8 hex digits
//   - xH or xHH: 1 to 2 hex digits; x{H...}: hex digits in braces
//   - a leading octal digit followed by further octal digits (at most 4 in total)
//   - the letters listed in UNESCAPE_MAP
//   - cX: the following rune masked with 0x1f
//
// Any other rune is returned unchanged.
func UnescapeAt(str string) (rune, string) {
	c, w := utf8.DecodeRuneInString(str)
	str = str[w:]
	// Empty input (w == 0) or an invalid UTF-8 byte (w == 1).
	if c == utf8.RuneError && (w == 0 || w == 1) {
		return -1, str
	}

	var minDig, maxDig, n int
	var braces bool
	var bitsPerDigit = 4
	var result rune

	// Decide whether this is a numeric escape and how many digits it takes.
	// minDig stays 0 for non-numeric escapes.
	switch c {
	case 'u':
		minDig = 4
		maxDig = 4
	case 'U':
		minDig = 8
		maxDig = 8
	case 'x':
		minDig = 1
		if len(str) > 0 && str[0] == '{' {
			str = str[1:]
			braces = true
			maxDig = 8
		} else {
			maxDig = 2
		}
	default:
		if dig := _digit8(c); dig >= 0 {
			// Octal escape: the rune just read is already the first digit.
			minDig = 1
			maxDig = 4
			n = 1
			bitsPerDigit = 3
			result = dig
		}
	}

	if minDig != 0 {
		// Accumulate digits until maxDig is reached, the input ends, or a
		// non-digit is seen. A non-digit is left unconsumed in str.
		for n < maxDig && len(str) > 0 {
			c, w = utf8.DecodeRuneInString(str)
			if c == utf8.RuneError && w == 1 {
				return -1, str
			}

			var dig rune
			if bitsPerDigit == 3 {
				dig = _digit8(c)
			} else {
				dig = _digit16(c)
			}
			if dig < 0 {
				break
			}
			result = (result << bitsPerDigit) | dig
			str = str[w:]
			n++
		}
		if n < minDig {
			return -1, str
		}
		if braces {
			// c is the last rune examined by the loop above, so this only
			// succeeds when the loop stopped on the closing brace.
			// NOTE(review): after a full 8 digits the loop exits without
			// looking at the next rune, so c is still a digit and the escape
			// is rejected even if '}' follows - confirm this is intended.
			if c != '}' {
				return -1, str
			}
			str = str[1:]
		}
		// result < 0 catches values that overflowed into the sign bit.
		if result < 0 || result > utf8.MaxRune {
			return -1, str
		}
		// Surrogate handling: utf16.IsLead / IsTrail / DecodeRune come from a
		// package not shown here; presumably they test for UTF-16 lead/trail
		// surrogates and combine a pair into one code point - verify there.
		// If the lead is directly followed by another escape that decodes to
		// a trail, both are consumed and merged; otherwise the lead is
		// returned alone and the following text is left untouched.
		if len(str) > 0 && utf16.IsLead(result) {
			c, w = utf8.DecodeRuneInString(str)
			if c == utf8.RuneError && (w == 0 || w == 1) {
				return -1, str
			}
			if c == '\\' {
				var str2 string
				c, str2 = UnescapeAt(str[1:])
				if utf16.IsTrail(c) {
					result = utf16.DecodeRune(result, c)
					str = str2
				}
			}
		}
		return result, str
	}

	// Single-letter escapes; only single-byte runes can appear in the table.
	if c < utf8.RuneSelf {
		for i := 0; i < len(UNESCAPE_MAP); i += 2 {
			if byte(c) == UNESCAPE_MAP[i] {
				return rune(UNESCAPE_MAP[i+1]), str
			}
			// The table is sorted by key, so stop once we are past c.
			if byte(c) < UNESCAPE_MAP[i] {
				break
			}
		}
	}

	// cX: keep the low five bits of the rune that follows.
	if c == 'c' && len(str) > 0 {
		c, w = utf8.DecodeRuneInString(str)
		if c == utf8.RuneError && (w == 0 || w == 1) {
			return -1, str
		}
		return 0x1f & c, str[w:]
	}

	// Anything else stands for itself.
	return c, str
}
// TestUnescapeAt checks how UnescapeAt treats escaped surrogates: a lead
// surrogate followed by another lead is returned on its own, while a lead
// followed by an escaped trail is merged into a single code point.
func TestUnescapeAt(t *testing.T) {
	// First escape is a lead surrogate, and the next escape is also a lead,
	// so nothing beyond the first escape is consumed.
	r, str := UnescapeAt("ud800\\ud800\\udc00")
	assert.Equal(t, rune(0xd800), r)
	assert.Equal(t, "\\ud800\\udc00", str)

	// Skip the backslash; the lead/trail pair now decodes to U+10000 and the
	// whole input is consumed.
	r, str = UnescapeAt(str[1:])
	assert.Equal(t, rune(0x00010000), r)
	assert.Equal(t, "", str)
}
// patternPropsLatin1 holds one flag byte for each code point U+0000..U+00FF.
// In this table whitespace entries are 5 and syntax entries are 3; the bit
// with value 4 is the one IsWhitespace tests.
var patternPropsLatin1 = [256]uint8{
	// WS: 9..D
	0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	// WS: 20 Syntax: 21..2F
	5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	// Syntax: 3A..40
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3,
	3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	// Syntax: 5B..5E
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
	// Syntax: 60
	3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	// Syntax: 7B..7E
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
	// WS: 85
	0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	// Syntax: A1..A7, A9, AB, AC, AE
	0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, 0,
	// Syntax: B0, B1, B6, BB, BF
	3, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 3,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	// Syntax: D7
	0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	// Syntax: F7
	0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
}

// IsWhitespace reports whether c is treated as pattern whitespace: a code
// point up to U+00FF whose table entry has the whitespace bit set, or one of
// U+200E, U+200F, U+2028, U+2029. Negative values are never whitespace.
func IsWhitespace(c rune) bool {
	switch {
	case c < 0:
		return false
	case c <= 0xff:
		return (patternPropsLatin1[c]>>2)&1 != 0
	case 0x200e <= c && c <= 0x2029:
		// Only the two ends of this range qualify.
		return c <= 0x200f || 0x2028 <= c
	default:
		return false
	}
}

// SkipWhitespace returns str with its leading pattern whitespace removed.
//
// Scanning also stops at the end of the string or at a byte that is not
// valid UTF-8; in the latter case that single byte is dropped from the
// result as well.
func SkipWhitespace(str string) string {
	for {
		r, w := utf8.DecodeRuneInString(str)
		if r == utf8.RuneError && (w == 0 || w == 1) {
			// w == 0: empty string; w == 1: invalid byte, which is skipped.
			return str[w:]
		}
		if !IsWhitespace(r) {
			return str
		}
		str = str[w:]
	}
}

// IsUnprintable reports whether c lies outside the printable ASCII range
// U+0020..U+007E.
func IsUnprintable(c rune) bool {
	return c < 0x20 || c > 0x7E
}

// DIGITS maps a value 0..35 to its ASCII digit: '0'..'9' followed by 'A'..'Z'.
//
// "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
var DIGITS = [...]byte{
	48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
	65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
	75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
	85, 86, 87, 88, 89, 90,
}

// EscapeUnprintable appends c to w as a backslash escape using uppercase hex
// digits: \uXXXX when c fits in 16 bits, \UXXXXXXXX otherwise.
func EscapeUnprintable(w *strings.Builder, c rune) {
	w.WriteByte('\\')
	if (c & ^0xFFFF) != 0 {
		// Anything above 0xFFFF needs the 8-digit form; emit the upper
		// four digits here.
		w.WriteByte('U')
		w.WriteByte(DIGITS[0xF&(c>>28)])
		w.WriteByte(DIGITS[0xF&(c>>24)])
		w.WriteByte(DIGITS[0xF&(c>>20)])
		w.WriteByte(DIGITS[0xF&(c>>16)])
	} else {
		w.WriteByte('u')
	}
	// The lower four digits are common to both forms.
	w.WriteByte(DIGITS[0xF&(c>>12)])
	w.WriteByte(DIGITS[0xF&(c>>8)])
	w.WriteByte(DIGITS[0xF&(c>>4)])
	w.WriteByte(DIGITS[0xF&c])
}
+*/ + +package ubidi + +import ( + "fmt" + + "vitess.io/vitess/go/mysql/icuregex/internal/icudata" + "vitess.io/vitess/go/mysql/icuregex/internal/udata" + "vitess.io/vitess/go/mysql/icuregex/internal/utrie" +) + +const ( + UBIDI_IX_INDEX_TOP = iota + UBIDI_IX_LENGTH + UBIDI_IX_TRIE_SIZE + UBIDI_IX_MIRROR_LENGTH + + UBIDI_IX_JG_START + UBIDI_IX_JG_LIMIT + UBIDI_IX_JG_START2 /* new in format version 2.2, ICU 54 */ + UBIDI_IX_JG_LIMIT2 + + UBIDI_MAX_VALUES_INDEX + UBIDI_IX_TOP +) + +var ubidi struct { + indexes []int32 + trie *utrie.UTrie2 + mirrors []uint32 + jg []uint8 + jg2 []uint8 +} + +func readData(bytes *udata.Bytes) error { + err := bytes.ReadHeader(func(info *udata.DataInfo) bool { + return info.FormatVersion[0] == 2 + }) + if err != nil { + return err + } + + count := int32(bytes.Uint32()) + if count < UBIDI_IX_TOP { + return fmt.Errorf("indexes[0] too small in ucase.icu") + } + + ubidi.indexes = make([]int32, count) + ubidi.indexes[0] = count + + for i := int32(1); i < count; i++ { + ubidi.indexes[i] = int32(bytes.Uint32()) + } + + ubidi.trie, err = utrie.UTrie2FromBytes(bytes) + if err != nil { + return err + } + + expectedTrieLength := ubidi.indexes[UBIDI_IX_TRIE_SIZE] + trieLength := ubidi.trie.SerializedLength() + + if trieLength > expectedTrieLength { + return fmt.Errorf("ucase.icu: not enough bytes for the trie") + } + + bytes.Skip(expectedTrieLength - trieLength) + + if n := ubidi.indexes[UBIDI_IX_MIRROR_LENGTH]; n > 0 { + ubidi.mirrors = bytes.Uint32Slice(n) + } + if n := ubidi.indexes[UBIDI_IX_JG_LIMIT] - ubidi.indexes[UBIDI_IX_JG_START]; n > 0 { + ubidi.jg = bytes.Uint8Slice(n) + } + if n := ubidi.indexes[UBIDI_IX_JG_LIMIT2] - ubidi.indexes[UBIDI_IX_JG_START2]; n > 0 { + ubidi.jg2 = bytes.Uint8Slice(n) + } + + return nil +} + +func init() { + b := udata.NewBytes(icudata.UBidi) + if err := readData(b); err != nil { + panic(err) + } +} + +const ( + /* UBIDI_CLASS_SHIFT=0, */ /* bidi class: 5 bits (4..0) */ + UBIDI_JT_SHIFT = 5 /* joining type: 3 
bits (7..5) */ + + UBIDI_BPT_SHIFT = 8 /* Bidi_Paired_Bracket_Type(bpt): 2 bits (9..8) */ + + UBIDI_JOIN_CONTROL_SHIFT = 10 + UBIDI_BIDI_CONTROL_SHIFT = 11 + + UBIDI_IS_MIRRORED_SHIFT = 12 /* 'is mirrored' */ + UBIDI_MIRROR_DELTA_SHIFT = 13 /* bidi mirroring delta: 3 bits (15..13) */ + + UBIDI_MAX_JG_SHIFT = 16 /* max JG value in indexes[UBIDI_MAX_VALUES_INDEX] bits 23..16 */ +) + +/** + * Bidi Paired Bracket Type constants. + * + * @see UCHAR_BIDI_PAIRED_BRACKET_TYPE + * @stable ICU 52 + */ +type UBidiPairedBracketType int32 + +/* + * Note: UBidiPairedBracketType constants are parsed by preparseucd.py. + * It matches lines like + * U_BPT_ + */ +const ( + /** Not a paired bracket. @stable ICU 52 */ + U_BPT_NONE = iota + /** Open paired bracket. @stable ICU 52 */ + U_BPT_OPEN + /** Close paired bracket. @stable ICU 52 */ + U_BPT_CLOSE +) + +const UBIDI_CLASS_MASK = 0x0000001f +const UBIDI_JT_MASK = 0x000000e0 +const UBIDI_BPT_MASK = 0x00000300 + +/** + * Joining Type constants. + * + * @see UCHAR_JOINING_TYPE + * @stable ICU 2.2 + */ +type UJoiningType int32 + +/* + * Note: UJoiningType constants are parsed by preparseucd.py. + * It matches lines like + * U_JT_ + */ +const ( + U_JT_NON_JOINING UJoiningType = iota /*[U]*/ + U_JT_JOIN_CAUSING /*[C]*/ + U_JT_DUAL_JOINING /*[D]*/ + U_JT_LEFT_JOINING /*[L]*/ + U_JT_RIGHT_JOINING /*[R]*/ + U_JT_TRANSPARENT /*[T]*/ +) + +/** + * Joining Group constants. + * + * @see UCHAR_JOINING_GROUP + * @stable ICU 2.2 + */ +type UJoiningGroup int32 + +/* + * Note: UJoiningGroup constants are parsed by preparseucd.py. 
/*
 * Note: UJoiningGroup constants are parsed by preparseucd.py.
 * It matches lines like
 *     U_JG_<Unicode Joining_Group value name>
 */

// Joining_Group values. The numeric value of each constant is its position in
// this list (iota), and JoiningGroup converts the raw bytes of the
// Joining_Group tables directly to this type, so entries must not be
// reordered or removed.
const (
	U_JG_NO_JOINING_GROUP UJoiningGroup = iota
	U_JG_AIN
	U_JG_ALAPH
	U_JG_ALEF
	U_JG_BEH
	U_JG_BETH
	U_JG_DAL
	U_JG_DALATH_RISH
	U_JG_E
	U_JG_FEH
	U_JG_FINAL_SEMKATH
	U_JG_GAF
	U_JG_GAMAL
	U_JG_HAH
	U_JG_TEH_MARBUTA_GOAL /**< @stable ICU 4.6 */
	U_JG_HE
	U_JG_HEH
	U_JG_HEH_GOAL
	U_JG_HETH
	U_JG_KAF
	U_JG_KAPH
	U_JG_KNOTTED_HEH
	U_JG_LAM
	U_JG_LAMADH
	U_JG_MEEM
	U_JG_MIM
	U_JG_NOON
	U_JG_NUN
	U_JG_PE
	U_JG_QAF
	U_JG_QAPH
	U_JG_REH
	U_JG_REVERSED_PE
	U_JG_SAD
	U_JG_SADHE
	U_JG_SEEN
	U_JG_SEMKATH
	U_JG_SHIN
	U_JG_SWASH_KAF
	U_JG_SYRIAC_WAW
	U_JG_TAH
	U_JG_TAW
	U_JG_TEH_MARBUTA
	U_JG_TETH
	U_JG_WAW
	U_JG_YEH
	U_JG_YEH_BARREE
	U_JG_YEH_WITH_TAIL
	U_JG_YUDH
	U_JG_YUDH_HE
	U_JG_ZAIN
	U_JG_FE                    /**< @stable ICU 2.6 */
	U_JG_KHAPH                 /**< @stable ICU 2.6 */
	U_JG_ZHAIN                 /**< @stable ICU 2.6 */
	U_JG_BURUSHASKI_YEH_BARREE /**< @stable ICU 4.0 */
	U_JG_FARSI_YEH             /**< @stable ICU 4.4 */
	U_JG_NYA                   /**< @stable ICU 4.4 */
	U_JG_ROHINGYA_YEH          /**< @stable ICU 49 */
	U_JG_MANICHAEAN_ALEPH      /**< @stable ICU 54 */
	U_JG_MANICHAEAN_AYIN       /**< @stable ICU 54 */
	U_JG_MANICHAEAN_BETH       /**< @stable ICU 54 */
	U_JG_MANICHAEAN_DALETH     /**< @stable ICU 54 */
	U_JG_MANICHAEAN_DHAMEDH    /**< @stable ICU 54 */
	U_JG_MANICHAEAN_FIVE       /**< @stable ICU 54 */
	U_JG_MANICHAEAN_GIMEL      /**< @stable ICU 54 */
	U_JG_MANICHAEAN_HETH       /**< @stable ICU 54 */
	U_JG_MANICHAEAN_HUNDRED    /**< @stable ICU 54 */
	U_JG_MANICHAEAN_KAPH       /**< @stable ICU 54 */
	U_JG_MANICHAEAN_LAMEDH     /**< @stable ICU 54 */
	U_JG_MANICHAEAN_MEM        /**< @stable ICU 54 */
	U_JG_MANICHAEAN_NUN        /**< @stable ICU 54 */
	U_JG_MANICHAEAN_ONE        /**< @stable ICU 54 */
	U_JG_MANICHAEAN_PE         /**< @stable ICU 54 */
	U_JG_MANICHAEAN_QOPH       /**< @stable ICU 54 */
	U_JG_MANICHAEAN_RESH       /**< @stable ICU 54 */
	U_JG_MANICHAEAN_SADHE      /**< @stable ICU 54 */
	U_JG_MANICHAEAN_SAMEKH     /**< @stable ICU 54 */
	U_JG_MANICHAEAN_TAW        /**< @stable ICU 54 */
	U_JG_MANICHAEAN_TEN        /**< @stable ICU 54 */
	U_JG_MANICHAEAN_TETH       /**< @stable ICU 54 */
	U_JG_MANICHAEAN_THAMEDH    /**< @stable ICU 54 */
	U_JG_MANICHAEAN_TWENTY     /**< @stable ICU 54 */
	U_JG_MANICHAEAN_WAW        /**< @stable ICU 54 */
	U_JG_MANICHAEAN_YODH       /**< @stable ICU 54 */
	U_JG_MANICHAEAN_ZAYIN      /**< @stable ICU 54 */
	U_JG_STRAIGHT_WAW          /**< @stable ICU 54 */
	U_JG_AFRICAN_FEH           /**< @stable ICU 58 */
	U_JG_AFRICAN_NOON          /**< @stable ICU 58 */
	U_JG_AFRICAN_QAF           /**< @stable ICU 58 */

	U_JG_MALAYALAM_BHA  /**< @stable ICU 60 */
	U_JG_MALAYALAM_JA   /**< @stable ICU 60 */
	U_JG_MALAYALAM_LLA  /**< @stable ICU 60 */
	U_JG_MALAYALAM_LLLA /**< @stable ICU 60 */
	U_JG_MALAYALAM_NGA  /**< @stable ICU 60 */
	U_JG_MALAYALAM_NNA  /**< @stable ICU 60 */
	U_JG_MALAYALAM_NNNA /**< @stable ICU 60 */
	U_JG_MALAYALAM_NYA  /**< @stable ICU 60 */
	U_JG_MALAYALAM_RA   /**< @stable ICU 60 */
	U_JG_MALAYALAM_SSA  /**< @stable ICU 60 */
	U_JG_MALAYALAM_TTA  /**< @stable ICU 60 */

	U_JG_HANIFI_ROHINGYA_KINNA_YA /**< @stable ICU 62 */
	U_JG_HANIFI_ROHINGYA_PA       /**< @stable ICU 62 */

	U_JG_THIN_YEH      /**< @stable ICU 70 */
	U_JG_VERTICAL_TAIL /**< @stable ICU 70 */
)

/**
 * This specifies the language directional property of a character set.
 * @stable ICU 2.0
 */
type UCharDirection int32
* / + * U_<[A-Z_]+> = , + */ + +const ( + /** L @stable ICU 2.0 */ + U_LEFT_TO_RIGHT UCharDirection = 0 + /** R @stable ICU 2.0 */ + U_RIGHT_TO_LEFT UCharDirection = 1 + /** EN @stable ICU 2.0 */ + U_EUROPEAN_NUMBER UCharDirection = 2 + /** ES @stable ICU 2.0 */ + U_EUROPEAN_NUMBER_SEPARATOR UCharDirection = 3 + /** ET @stable ICU 2.0 */ + U_EUROPEAN_NUMBER_TERMINATOR UCharDirection = 4 + /** AN @stable ICU 2.0 */ + U_ARABIC_NUMBER UCharDirection = 5 + /** CS @stable ICU 2.0 */ + U_COMMON_NUMBER_SEPARATOR UCharDirection = 6 + /** B @stable ICU 2.0 */ + U_BLOCK_SEPARATOR UCharDirection = 7 + /** S @stable ICU 2.0 */ + U_SEGMENT_SEPARATOR UCharDirection = 8 + /** WS @stable ICU 2.0 */ + U_WHITE_SPACE_NEUTRAL UCharDirection = 9 + /** ON @stable ICU 2.0 */ + U_OTHER_NEUTRAL UCharDirection = 10 + /** LRE @stable ICU 2.0 */ + U_LEFT_TO_RIGHT_EMBEDDING UCharDirection = 11 + /** LRO @stable ICU 2.0 */ + U_LEFT_TO_RIGHT_OVERRIDE UCharDirection = 12 + /** AL @stable ICU 2.0 */ + U_RIGHT_TO_LEFT_ARABIC UCharDirection = 13 + /** RLE @stable ICU 2.0 */ + U_RIGHT_TO_LEFT_EMBEDDING UCharDirection = 14 + /** RLO @stable ICU 2.0 */ + U_RIGHT_TO_LEFT_OVERRIDE UCharDirection = 15 + /** PDF @stable ICU 2.0 */ + U_POP_DIRECTIONAL_FORMAT UCharDirection = 16 + /** NSM @stable ICU 2.0 */ + U_DIR_NON_SPACING_MARK UCharDirection = 17 + /** BN @stable ICU 2.0 */ + U_BOUNDARY_NEUTRAL UCharDirection = 18 + /** FSI @stable ICU 52 */ + U_FIRST_STRONG_ISOLATE UCharDirection = 19 + /** LRI @stable ICU 52 */ + U_LEFT_TO_RIGHT_ISOLATE UCharDirection = 20 + /** RLI @stable ICU 52 */ + U_RIGHT_TO_LEFT_ISOLATE UCharDirection = 21 + /** PDI @stable ICU 52 */ + U_POP_DIRECTIONAL_ISOLATE UCharDirection = 22 +) + +type PropertySet interface { + AddRune(ch rune) + AddRuneRange(from rune, to rune) +} + +func AddPropertyStarts(sa PropertySet) { + /* add the start code point of each same-value range of the trie */ + ubidi.trie.Enum(nil, func(start, _ rune, _ uint32) bool { + sa.AddRune(start) + return true + 
}) + + /* add the code points from the bidi mirroring table */ + length := ubidi.indexes[UBIDI_IX_MIRROR_LENGTH] + for i := int32(0); i < length; i++ { + c := mirrorCodePoint(rune(ubidi.mirrors[i])) + sa.AddRuneRange(c, c+1) + } + + /* add the code points from the Joining_Group array where the value changes */ + start := ubidi.indexes[UBIDI_IX_JG_START] + limit := ubidi.indexes[UBIDI_IX_JG_LIMIT] + jgArray := ubidi.jg[:] + for { + prev := uint8(0) + for start < limit { + jg := jgArray[0] + jgArray = jgArray[1:] + if jg != prev { + sa.AddRune(start) + prev = jg + } + start++ + } + if prev != 0 { + /* add the limit code point if the last value was not 0 (it is now start==limit) */ + sa.AddRune(limit) + } + if limit == ubidi.indexes[UBIDI_IX_JG_LIMIT] { + /* switch to the second Joining_Group range */ + start = ubidi.indexes[UBIDI_IX_JG_START2] + limit = ubidi.indexes[UBIDI_IX_JG_LIMIT2] + jgArray = ubidi.jg2[:] + } else { + break + } + } + + /* add code points with hardcoded properties, plus the ones following them */ + + /* (none right now) */ +} + +func HasFlag(props uint16, shift int) bool { + return ((props >> shift) & 1) != 0 +} + +func mirrorCodePoint(m rune) rune { + return m & 0x1fffff +} + +func IsJoinControl(c rune) bool { + props := ubidi.trie.Get16(c) + return HasFlag(props, UBIDI_JOIN_CONTROL_SHIFT) +} + +func JoiningType(c rune) UJoiningType { + props := ubidi.trie.Get16(c) + return UJoiningType((props & UBIDI_JT_MASK) >> UBIDI_JT_SHIFT) +} + +func JoiningGroup(c rune) UJoiningGroup { + start := ubidi.indexes[UBIDI_IX_JG_START] + limit := ubidi.indexes[UBIDI_IX_JG_LIMIT] + if start <= c && c < limit { + return UJoiningGroup(ubidi.jg[c-start]) + } + start = ubidi.indexes[UBIDI_IX_JG_START2] + limit = ubidi.indexes[UBIDI_IX_JG_LIMIT2] + if start <= c && c < limit { + return UJoiningGroup(ubidi.jg2[c-start]) + } + return U_JG_NO_JOINING_GROUP +} + +func IsMirrored(c rune) bool { + props := ubidi.trie.Get16(c) + return HasFlag(props, 
UBIDI_IS_MIRRORED_SHIFT) +} + +func IsBidiControl(c rune) bool { + props := ubidi.trie.Get16(c) + return HasFlag(props, UBIDI_BIDI_CONTROL_SHIFT) +} + +func PairedBracketType(c rune) UBidiPairedBracketType { + props := ubidi.trie.Get16(c) + return UBidiPairedBracketType((props & UBIDI_BPT_MASK) >> UBIDI_BPT_SHIFT) +} + +func Class(c rune) UCharDirection { + props := ubidi.trie.Get16(c) + return UCharDirection(props & UBIDI_CLASS_MASK) +} diff --git a/go/mysql/icuregex/internal/ucase/fold.go b/go/mysql/icuregex/internal/ucase/fold.go new file mode 100644 index 00000000000..bb10ba8cb35 --- /dev/null +++ b/go/mysql/icuregex/internal/ucase/fold.go @@ -0,0 +1,244 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package ucase + +import ( + "math/bits" + + "vitess.io/vitess/go/mysql/icuregex/internal/utf16" +) + +func FoldRunes(str []rune) []rune { + out := make([]rune, 0, len(str)) + for _, c := range str { + r, exp := FullFolding(c) + if exp == nil { + out = append(out, r) + continue + } + + for len(exp) > 0 { + r, exp = utf16.NextUnsafe(exp) + out = append(out, r) + } + } + return out +} + +/* + - Case folding is similar to lowercasing. 
+ - The result may be a simple mapping, i.e., a single code point, or + - a full mapping, i.e., a string. + - If the case folding for a code point is the same as its simple (1:1) lowercase mapping, + - then only the lowercase mapping is stored. + * + - Some special cases are hardcoded because their conditions cannot be + - parsed and processed from CaseFolding.txt. + * + - Unicode 3.2 CaseFolding.txt specifies for its status field: + +# C: common case folding, common mappings shared by both simple and full mappings. +# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces. +# S: simple case folding, mappings to single characters where different from F. +# T: special case for uppercase I and dotted uppercase I +# - For non-Turkic languages, this mapping is normally not used. +# - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters. +# +# Usage: +# A. To do a simple case folding, use the mappings with status C + S. +# B. To do a full case folding, use the mappings with status C + F. +# +# The mappings with status T can be used or omitted depending on the desired case-folding +# behavior. (The default option is to exclude them.) + + - Unicode 3.2 has 'T' mappings as follows: + +0049; T; 0131; # LATIN CAPITAL LETTER I +0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE + + - while the default mappings for these code points are: + +0049; C; 0069; # LATIN CAPITAL LETTER I +0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE + + - U+0130 has no simple case folding (simple-case-folds to itself). 
+*/ +func Fold(c rune) rune { + props := ucase.trie.Get16(c) + if !hasException(props) { + if isUpperOrTitle(props) { + c += getDelta(props) + } + } else { + pe := getExceptions(props) + excWord := pe[0] + pe = pe[1:] + if (excWord & UCASE_EXC_CONDITIONAL_FOLD) != 0 { + /* special case folding mappings, hardcoded */ + /* default mappings */ + if c == 0x49 { + /* 0049; C; 0069; # LATIN CAPITAL LETTER I */ + return 0x69 + } else if c == 0x130 { + /* no simple case folding for U+0130 */ + return c + } + } + if (excWord & UCASE_EXC_NO_SIMPLE_CASE_FOLDING) != 0 { + return c + } + if hasSlot(excWord, UCASE_EXC_DELTA) && isUpperOrTitle(props) { + var delta int32 + delta, _ = getSlotValue(excWord, UCASE_EXC_DELTA, pe) + if excWord&UCASE_EXC_DELTA_IS_NEGATIVE == 0 { + return c + delta + } + return c - delta + } + + var idx int32 + if hasSlot(excWord, UCASE_EXC_FOLD) { + idx = UCASE_EXC_FOLD + } else if hasSlot(excWord, UCASE_EXC_LOWER) { + idx = UCASE_EXC_LOWER + } else { + return c + } + c, _ = getSlotValue(excWord, idx, pe) + } + return c +} + +func FullFolding(c rune) (rune, []uint16) { + result := c + props := ucase.trie.Get16(c) + + if !hasException(props) { + if isUpperOrTitle(props) { + result = c + getDelta(props) + } + return result, nil + } + + pe := getExceptions(props) + excWord := pe[0] + pe = pe[1:] + var idx int32 + + if excWord&UCASE_EXC_CONDITIONAL_FOLD != 0 { + /* use hardcoded conditions and mappings */ + /* default mappings */ + if c == 0x49 { + /* 0049; C; 0069; # LATIN CAPITAL LETTER I */ + return 0x69, nil + } else if c == 0x130 { + /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ + return -1, []uint16{0x69, 0x307} + } + } else if hasSlot(excWord, UCASE_EXC_FULL_MAPPINGS) { + full, pe := getSlotValue(excWord, UCASE_EXC_FULL_MAPPINGS, pe) + + /* start of full case mapping strings */ + pe = pe[1:] + + /* skip the lowercase result string */ + pe = pe[full&UCASE_FULL_LOWER:] + full = (full >> 4) & 0xf + + if full != 0 { + /* set the 
output pointer to the result string */ + return -1, pe[:full] + } + } + + if excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING != 0 { + return result, nil + } + if hasSlot(excWord, UCASE_EXC_DELTA) && isUpperOrTitle(props) { + delta, _ := getSlotValue(excWord, UCASE_EXC_DELTA, pe) + if excWord&UCASE_EXC_DELTA_IS_NEGATIVE == 0 { + return c + delta, nil + } + return c - delta, nil + } + if hasSlot(excWord, UCASE_EXC_FOLD) { + idx = UCASE_EXC_FOLD + } else if hasSlot(excWord, UCASE_EXC_LOWER) { + idx = UCASE_EXC_LOWER + } else { + return c, nil + } + result, _ = getSlotValue(excWord, idx, pe) + return result, nil +} + +const ( + UCASE_EXC_LOWER = iota + UCASE_EXC_FOLD + UCASE_EXC_UPPER + UCASE_EXC_TITLE + UCASE_EXC_DELTA + UCASE_EXC_5 /* reserved */ + UCASE_EXC_CLOSURE + UCASE_EXC_FULL_MAPPINGS + UCASE_EXC_ALL_SLOTS /* one past the last slot */ +) + +const ( + /* complex/conditional mappings */ + UCASE_EXC_CONDITIONAL_SPECIAL = 0x4000 + UCASE_EXC_CONDITIONAL_FOLD = 0x8000 + UCASE_EXC_NO_SIMPLE_CASE_FOLDING = 0x200 + UCASE_EXC_DELTA_IS_NEGATIVE = 0x400 + UCASE_EXC_SENSITIVE = 0x800 + + UCASE_EXC_DOUBLE_SLOTS = 0x100 +) + +func isUpperOrTitle(props uint16) bool { + return props&2 != 0 +} + +func getDelta(props uint16) rune { + return rune(int16(props) >> 7) +} + +func getExceptions(props uint16) []uint16 { + return ucase.exceptions[props>>4:] +} + +func hasSlot(flags uint16, idx int32) bool { + return (flags & (1 << idx)) != 0 +} + +func slotOffset(flags uint16, idx int32) int { + return bits.OnesCount8(uint8(flags & ((1 << idx) - 1))) +} + +func getSlotValue(excWord uint16, idx int32, pExc16 []uint16) (int32, []uint16) { + if excWord&UCASE_EXC_DOUBLE_SLOTS == 0 { + pExc16 = pExc16[slotOffset(excWord, idx):] + return int32(pExc16[0]), pExc16 + } + pExc16 = pExc16[2*slotOffset(excWord, idx):] + return (int32(pExc16[0]) << 16) | int32(pExc16[1]), pExc16[1:] +} diff --git a/go/mysql/icuregex/internal/ucase/ucase.go b/go/mysql/icuregex/internal/ucase/ucase.go new file mode 100644 
// ucase holds the tables decoded from the embedded case-mapping data.
// It is filled once by init and only read afterwards.
var ucase struct {
	trie       *utrie.UTrie2
	exceptions []uint16
	unfold     []uint16
}

// readData decodes the case-mapping data in bytes into the package-level
// ucase tables.
//
// It validates the header (format version 4), reads the index array, the
// trie, and then the exceptions and unfold arrays whose lengths are taken
// from the index array. An error is returned when the header is rejected,
// the index array has fewer than IX_TOP entries, the trie cannot be parsed,
// or the trie is longer than the size recorded in the indexes.
func readData(bytes *udata.Bytes) error {
	// Positions of the header values in the index array.
	const (
		IX_INDEX_TOP       = 0
		IX_LENGTH          = 1
		IX_TRIE_SIZE       = 2
		IX_EXC_LENGTH      = 3
		IX_UNFOLD_LENGTH   = 4
		IX_MAX_FULL_LENGTH = 15
		IX_TOP             = 16
	)

	err := bytes.ReadHeader(func(info *udata.DataInfo) bool {
		return info.FormatVersion[0] == 4
	})
	if err != nil {
		return err
	}

	count := int32(bytes.Uint32())
	if count < IX_TOP {
		return fmt.Errorf("indexes[0] too small in ucase.icu")
	}

	// indexes[0] is the count itself and has already been consumed.
	indexes := make([]int32, count)
	indexes[0] = count

	for i := int32(1); i < count; i++ {
		indexes[i] = int32(bytes.Uint32())
	}

	ucase.trie, err = utrie.UTrie2FromBytes(bytes)
	if err != nil {
		return err
	}

	expectedTrieLength := indexes[IX_TRIE_SIZE]
	trieLength := ucase.trie.SerializedLength()

	if trieLength > expectedTrieLength {
		return fmt.Errorf("ucase.icu: not enough bytes for the trie")
	}

	// Skip whatever lies between the end of the trie and the recorded trie size.
	bytes.Skip(expectedTrieLength - trieLength)

	if n := indexes[IX_EXC_LENGTH]; n > 0 {
		ucase.exceptions = bytes.Uint16Slice(n)
	}
	if n := indexes[IX_UNFOLD_LENGTH]; n > 0 {
		ucase.unfold = bytes.Uint16Slice(n)
	}

	return nil
}

// init loads the embedded case-mapping data and panics if it cannot be
// decoded.
func init() {
	b := udata.NewBytes(icudata.UCase)
	if err := readData(b); err != nil {
		panic(err)
	}
}

// PropertySet is the sink that AddPropertyStarts and AddCaseClosure write
// code points into.
type PropertySet interface {
	AddRune(ch rune)
}

// AddPropertyStarts adds to sa the start code point of every same-value range
// of the case-properties trie.
func AddPropertyStarts(sa PropertySet) {
	/* add the start code point of each same-value range of the trie */
	ucase.trie.Enum(nil, func(start, _ rune, _ uint32) bool {
		sa.AddRune(start)
		return true
	})

	/* add code points with hardcoded properties, plus the ones following them */

	/* (none right now, see comment below) */

	/*
	 * Omit code points with hardcoded specialcasing properties
	 * because we do not build property UnicodeSets for them right now.
	 */
}

// Limits and nibble masks for the "full mappings" slot value: each of the
// four nibbles holds the length of one mapping string (lower, folding, upper,
// title, from the lowest nibble up).
const (
	UCASE_FULL_MAPPINGS_MAX_LENGTH = (4 * 0xf)
	UCASE_CLOSURE_MAX_LENGTH       = 0xf

	UCASE_FULL_LOWER   = 0xf
	UCASE_FULL_FOLDING = 0xf0
	UCASE_FULL_UPPER   = 0xf00
	UCASE_FULL_TITLE   = 0xf000
)

// AddCaseClosure adds to sa the code points that are case-related to c.
//
// For c without an exception entry this is the single code point c+delta (if
// the delta is non-zero and c has a case type). For c with an exception entry
// it is every simple mapping slot (lower, fold, upper, title), the delta
// mapping, and every code point of the closure string. Full (string) mappings
// are skipped, not added. I, i, U+0130 and U+0131 are handled by hardcoded
// rules and never consult the data.
func AddCaseClosure(c rune, sa PropertySet) {
	/*
	 * Hardcode the case closure of i and its relatives and ignore the
	 * data file data for these characters.
	 * The Turkic dotless i and dotted I with their case mapping conditions
	 * and case folding option make the related characters behave specially.
	 * This code matches their closure behavior to their case folding behavior.
	 */

	switch c {
	case 0x49:
		/* regular i and I are in one equivalence class */
		sa.AddRune(0x69)
		return
	case 0x69:
		sa.AddRune(0x49)
		return
	case 0x130:
		/* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
		// the Regex engine calls removeAllStrings() on all UnicodeSets, so we don't need to insert them
		// sa->addString(sa->set, iDot, 2);
		return
	case 0x131:
		/* dotless i is in a class by itself */
		return
	default:
		/* otherwise use the data file data */
		break
	}

	props := ucase.trie.Get16(c)
	if !hasException(props) {
		if getPropsType(props) != UCASE_NONE {
			/* add the one simple case mapping, no matter what type it is */
			delta := getDelta(props)
			if delta != 0 {
				sa.AddRune(c + delta)
			}
		}
	} else {
		/*
		 * c has exceptions, so there may be multiple simple and/or
		 * full case mappings. Add them all.
		 */
		pe := getExceptions(props)
		excWord := pe[0]
		pe = pe[1:] // pe now points at the first slot and is kept there
		var idx int32
		var closure []uint16

		/* add all simple case mappings */
		for idx = UCASE_EXC_LOWER; idx <= UCASE_EXC_TITLE; idx++ {
			if hasSlot(excWord, idx) {
				// Note: c is overwritten with the slot value here.
				c, _ = getSlotValue(excWord, idx, pe)
				sa.AddRune(c)
			}
		}
		if hasSlot(excWord, UCASE_EXC_DELTA) {
			delta, _ := getSlotValue(excWord, UCASE_EXC_DELTA, pe)
			if excWord&UCASE_EXC_DELTA_IS_NEGATIVE == 0 {
				sa.AddRune(c + delta)
			} else {
				sa.AddRune(c - delta)
			}
		}

		/* get the closure string pointer & length */
		if hasSlot(excWord, UCASE_EXC_CLOSURE) {
			closureLength, pe1 := getSlotValue(excWord, UCASE_EXC_CLOSURE, pe)
			closureLength &= UCASE_CLOSURE_MAX_LENGTH /* higher bits are reserved */
			closure = pe1[1 : 1+closureLength]        /* behind this slot, unless there are full case mappings */
		}

		/* add the full case folding */
		if hasSlot(excWord, UCASE_EXC_FULL_MAPPINGS) {
			fullLength, pe1 := getSlotValue(excWord, UCASE_EXC_FULL_MAPPINGS, pe)

			/* start of full case mapping strings */
			pe1 = pe1[1:]

			fullLength &= 0xffff /* bits 16 and higher are reserved */

			/* skip the lowercase result string */
			pe1 = pe1[fullLength&UCASE_FULL_LOWER:]
			fullLength >>= 4

			/* skip adding the case folding strings */
			length := fullLength & 0xf
			pe1 = pe1[length:]

			/* skip the uppercase and titlecase strings */
			fullLength >>= 4
			pe1 = pe1[fullLength&0xf:]
			fullLength >>= 4
			pe1 = pe1[fullLength:]

			// The closure string sits behind the full mapping strings; keep
			// the length determined above and re-anchor it here.
			closure = pe1[:len(closure)]
		}

		/* add each code point in the closure string */
		for len(closure) > 0 {
			c, closure = utf16.NextUnsafe(closure)
			sa.AddRune(c)
		}
	}
}

// UCASE_DOT_MASK selects the dot-type bits; the values below are the possible
// results after masking.
const UCASE_DOT_MASK = 0x60

const (
	UCASE_NO_DOT       = 0    /* normal characters with cc=0 */
	UCASE_SOFT_DOTTED  = 0x20 /* soft-dotted characters with cc=0 */
	UCASE_ABOVE        = 0x40 /* "above" accents with cc=230 */
	UCASE_OTHER_ACCENT = 0x60 /* other accent character (0<cc!=230) */
)

// NOTE(review): this copy of the patch is damaged between the
// UCASE_OTHER_ACCENT comment above and IsCaseSensitive below. Only the
// fragment ">> UCASE_EXC_DOT_SHIFT) & UCASE_DOT_MASK) }" survives from that
// span. Identifiers used in this file but not defined in the visible text
// (hasException, UCASE_SENSITIVE, UCASE_EXC_DOT_SHIFT) presumably live there;
// restore the span from the upstream file rather than from this chunk.

// IsCaseSensitive reports whether the case-sensitive bit is set for c, read
// from the properties word or, for code points with an exception entry, from
// the exception word.
func IsCaseSensitive(c rune) bool {
	props := ucase.trie.Get16(c)
	if !hasException(props) {
		return (props & UCASE_SENSITIVE) != 0
	} else {
		pe := getExceptions(props)
		return (pe[0] & UCASE_EXC_SENSITIVE) != 0
	}
}

// ToFullLower returns the lowercase mapping of c in an encoded form.
//
// The result is ^c (negative for non-negative c) when the mapping leaves c
// unchanged. When c has a full (string) lowercase mapping, the result is the
// length of that string rather than a code point; the string itself is not
// returned, and 2 is hardcoded for U+0130. Otherwise the result is the single
// mapped code point.
func ToFullLower(c rune) rune {
	// The sign of the result has meaning, input must be non-negative so that it can be returned as is.
	result := c
	props := ucase.trie.Get16(c)
	if !hasException(props) {
		if isUpperOrTitle(props) {
			result = c + getDelta(props)
		}
	} else {
		pe := getExceptions(props)
		excWord := pe[0]
		pe = pe[1:]

		if excWord&UCASE_EXC_CONDITIONAL_SPECIAL != 0 {
			/* use hardcoded conditions and mappings */
			if c == 0x130 {
				return 2
			}
			/* no known conditional special case mapping, use a normal mapping */
		} else if hasSlot(excWord, UCASE_EXC_FULL_MAPPINGS) {
			full, _ := getSlotValue(excWord, UCASE_EXC_FULL_MAPPINGS, pe)
			full = full & UCASE_FULL_LOWER
			if full != 0 {
				/* return the string length */
				return full
			}
		}

		if hasSlot(excWord, UCASE_EXC_DELTA) && isUpperOrTitle(props) {
			delta, _ := getSlotValue(excWord, UCASE_EXC_DELTA, pe)
			if (excWord & UCASE_EXC_DELTA_IS_NEGATIVE) == 0 {
				return c + delta
			}
			return c - delta
		}
		if hasSlot(excWord, UCASE_EXC_LOWER) {
			result, _ = getSlotValue(excWord, UCASE_EXC_LOWER, pe)
		}
	}

	// An unchanged code point is reported as its bitwise complement.
	if result == c {
		return ^result
	}
	return result
}

// ToFullUpper returns the uppercase mapping of c, encoded as described on
// toUpperOrTitle.
func ToFullUpper(c rune) rune {
	return toUpperOrTitle(c, true)
}

// ToFullTitle returns the titlecase mapping of c, encoded as described on
// toUpperOrTitle.
func ToFullTitle(c rune) rune {
	return toUpperOrTitle(c, false)
}

// toUpperOrTitle returns the uppercase (upperNotTitle == true) or titlecase
// mapping of c in an encoded form.
//
// The result is ^c when the mapping leaves c unchanged or no mapping slot
// exists. When c has a full (string) mapping, the result is the length of
// that string rather than a code point; 2 is hardcoded for U+0587. Otherwise
// the result is the single mapped code point. The titlecase slot is used only
// when titlecasing and present; otherwise the uppercase slot is used.
func toUpperOrTitle(c rune, upperNotTitle bool) rune {
	result := c
	props := ucase.trie.Get16(c)
	if !hasException(props) {
		if getPropsType(props) == UCASE_LOWER {
			result = c + getDelta(props)
		}
	} else {
		pe := getExceptions(props)
		excWord := pe[0]
		pe = pe[1:]

		if excWord&UCASE_EXC_CONDITIONAL_SPECIAL != 0 {
			if c == 0x0587 {
				return 2
			}
			/* no known conditional special case mapping, use a normal mapping */
		} else if hasSlot(excWord, UCASE_EXC_FULL_MAPPINGS) {
			full, _ := getSlotValue(excWord, UCASE_EXC_FULL_MAPPINGS, pe)

			/* skip the lowercase and case-folding result strings */
			full >>= 8

			if upperNotTitle {
				full &= 0xf
			} else {
				/* skip the uppercase result string */
				full = (full >> 4) & 0xf
			}

			if full != 0 {
				/* return the string length */
				return full
			}
		}

		if hasSlot(excWord, UCASE_EXC_DELTA) && getPropsType(props) == UCASE_LOWER {
			delta, _ := getSlotValue(excWord, UCASE_EXC_DELTA, pe)
			if (excWord & UCASE_EXC_DELTA_IS_NEGATIVE) == 0 {
				return c + delta
			}
			return c - delta
		}
		var idx int32
		if !upperNotTitle && hasSlot(excWord, UCASE_EXC_TITLE) {
			idx = UCASE_EXC_TITLE
		} else if hasSlot(excWord, UCASE_EXC_UPPER) {
			/* here, titlecase is same as uppercase */
			idx = UCASE_EXC_UPPER
		} else {
			return ^c
		}
		result, _ = getSlotValue(excWord, idx, pe)
	}

	// An unchanged code point is reported as its bitwise complement.
	if result == c {
		return ^result
	}
	return result
}

// GetTypeOrIgnorable returns the low three bits of the properties word of c:
// the two case-type bits plus the bit above them.
func GetTypeOrIgnorable(c rune) int32 {
	props := ucase.trie.Get16(c)
	return int32(props & 7)
}

// UCaseType is the case type stored in the low two bits of the properties
// word.
type UCaseType int32

const (
	UCASE_NONE UCaseType = iota
	UCASE_LOWER
	UCASE_UPPER
	UCASE_TITLE
)

// UCASE_TYPE_MASK selects the case-type bits of the properties word.
const UCASE_TYPE_MASK = 3

// GetType returns the case type of c.
func GetType(c rune) UCaseType {
	props := ucase.trie.Get16(c)
	return getPropsType(props)
}

// getPropsType extracts the case type from a properties word.
func getPropsType(props uint16) UCaseType {
	return UCaseType(props & UCASE_TYPE_MASK)
}
// U_MASK returns a uint32 with only bit x set. It is used below to build the
// general-category bit masks from the category numbers.
func U_MASK[T constraints.Integer](x T) uint32 {
	return 1 << x
}

// General category values. The two-letter abbreviation in each comment is the
// category's short name.
const (
	/*
	 * Note: UCharCategory constants and their API comments are parsed by preparseucd.py.
	 * It matches pairs of lines like
	 *     / ** <Unicode 2-letter General_Category value> comment... * /
	 *     U_<[A-Z_]+> = <integer>,
	 */

	/** Non-category for unassigned and non-character code points. @stable ICU 2.0 */
	U_UNASSIGNED = 0
	/** Cn "Other, Not Assigned (no characters in [UnicodeData.txt] have this property)" (same as U_UNASSIGNED!) @stable ICU 2.0 */
	U_GENERAL_OTHER_TYPES = 0
	/** Lu @stable ICU 2.0 */
	U_UPPERCASE_LETTER = 1
	/** Ll @stable ICU 2.0 */
	U_LOWERCASE_LETTER = 2
	/** Lt @stable ICU 2.0 */
	U_TITLECASE_LETTER = 3
	/** Lm @stable ICU 2.0 */
	U_MODIFIER_LETTER = 4
	/** Lo @stable ICU 2.0 */
	U_OTHER_LETTER = 5
	/** Mn @stable ICU 2.0 */
	U_NON_SPACING_MARK = 6
	/** Me @stable ICU 2.0 */
	U_ENCLOSING_MARK = 7
	/** Mc @stable ICU 2.0 */
	U_COMBINING_SPACING_MARK = 8
	/** Nd @stable ICU 2.0 */
	U_DECIMAL_DIGIT_NUMBER = 9
	/** Nl @stable ICU 2.0 */
	U_LETTER_NUMBER = 10
	/** No @stable ICU 2.0 */
	U_OTHER_NUMBER = 11
	/** Zs @stable ICU 2.0 */
	U_SPACE_SEPARATOR = 12
	/** Zl @stable ICU 2.0 */
	U_LINE_SEPARATOR = 13
	/** Zp @stable ICU 2.0 */
	U_PARAGRAPH_SEPARATOR = 14
	/** Cc @stable ICU 2.0 */
	U_CONTROL_CHAR = 15
	/** Cf @stable ICU 2.0 */
	U_FORMAT_CHAR = 16
	/** Co @stable ICU 2.0 */
	U_PRIVATE_USE_CHAR = 17
	/** Cs @stable ICU 2.0 */
	U_SURROGATE = 18
	/** Pd @stable ICU 2.0 */
	U_DASH_PUNCTUATION = 19
	/** Ps @stable ICU 2.0 */
	U_START_PUNCTUATION = 20
	/** Pe @stable ICU 2.0 */
	U_END_PUNCTUATION = 21
	/** Pc @stable ICU 2.0 */
	U_CONNECTOR_PUNCTUATION = 22
	/** Po @stable ICU 2.0 */
	U_OTHER_PUNCTUATION = 23
	/** Sm @stable ICU 2.0 */
	U_MATH_SYMBOL = 24
	/** Sc @stable ICU 2.0 */
	U_CURRENCY_SYMBOL = 25
	/** Sk @stable ICU 2.0 */
	U_MODIFIER_SYMBOL = 26
	/** So @stable ICU 2.0 */
	U_OTHER_SYMBOL = 27
	/** Pi @stable ICU 2.0 */
	U_INITIAL_PUNCTUATION = 28
	/** Pf @stable ICU 2.0 */
	U_FINAL_PUNCTUATION = 29
	/**
	 * One higher than the last enum UCharCategory constant.
	 * This numeric value is stable (will not change), see
	 * http://www.unicode.org/policies/stability_policy.html#Property_Value
	 *
	 * @stable ICU 2.0
	 */
	U_CHAR_CATEGORY_COUNT = 30
)

// Single-bit masks, one per general category (bit number == category value),
// followed by combined masks for the major category groups. They are package
// variables because they are computed through U_MASK.
var (
	U_GC_CN_MASK = U_MASK(U_GENERAL_OTHER_TYPES)

	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_LU_MASK = U_MASK(U_UPPERCASE_LETTER)
	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_LL_MASK = U_MASK(U_LOWERCASE_LETTER)
	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_LT_MASK = U_MASK(U_TITLECASE_LETTER)
	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_LM_MASK = U_MASK(U_MODIFIER_LETTER)
	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_LO_MASK = U_MASK(U_OTHER_LETTER)

	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_MN_MASK = U_MASK(U_NON_SPACING_MARK)
	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_ME_MASK = U_MASK(U_ENCLOSING_MARK)
	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_MC_MASK = U_MASK(U_COMBINING_SPACING_MARK)

	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_ND_MASK = U_MASK(U_DECIMAL_DIGIT_NUMBER)
	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_NL_MASK = U_MASK(U_LETTER_NUMBER)
	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_NO_MASK = U_MASK(U_OTHER_NUMBER)

	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_ZS_MASK = U_MASK(U_SPACE_SEPARATOR)
	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_ZL_MASK = U_MASK(U_LINE_SEPARATOR)
	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_ZP_MASK = U_MASK(U_PARAGRAPH_SEPARATOR)

	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_CC_MASK = U_MASK(U_CONTROL_CHAR)
	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_CF_MASK = U_MASK(U_FORMAT_CHAR)
	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_CO_MASK = U_MASK(U_PRIVATE_USE_CHAR)
	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_CS_MASK = U_MASK(U_SURROGATE)

	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_PD_MASK = U_MASK(U_DASH_PUNCTUATION)
	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_PS_MASK = U_MASK(U_START_PUNCTUATION)
	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_PE_MASK = U_MASK(U_END_PUNCTUATION)
	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_PC_MASK = U_MASK(U_CONNECTOR_PUNCTUATION)
	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_PO_MASK = U_MASK(U_OTHER_PUNCTUATION)

	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_SM_MASK = U_MASK(U_MATH_SYMBOL)
	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_SC_MASK = U_MASK(U_CURRENCY_SYMBOL)
	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_SK_MASK = U_MASK(U_MODIFIER_SYMBOL)
	/** Mask constant for a UCharCategory. @stable ICU 2.1 */
	U_GC_SO_MASK = U_MASK(U_OTHER_SYMBOL)

	/** Mask constant for multiple UCharCategory bits (L Letters). @stable ICU 2.1 */
	U_GC_L_MASK = (U_GC_LU_MASK | U_GC_LL_MASK | U_GC_LT_MASK | U_GC_LM_MASK | U_GC_LO_MASK)

	/** Mask constant for multiple UCharCategory bits (LC Cased Letters). @stable ICU 2.1 */
	U_GC_LC_MASK = (U_GC_LU_MASK | U_GC_LL_MASK | U_GC_LT_MASK)

	/** Mask constant for multiple UCharCategory bits (M Marks). @stable ICU 2.1 */
	U_GC_M_MASK = (U_GC_MN_MASK | U_GC_ME_MASK | U_GC_MC_MASK)

	/** Mask constant for multiple UCharCategory bits (N Numbers). @stable ICU 2.1 */
	U_GC_N_MASK = (U_GC_ND_MASK | U_GC_NL_MASK | U_GC_NO_MASK)

	/** Mask constant for multiple UCharCategory bits (Z Separators). @stable ICU 2.1 */
	U_GC_Z_MASK = (U_GC_ZS_MASK | U_GC_ZL_MASK | U_GC_ZP_MASK)
)

const UPROPS_AGE_SHIFT = 24
const U_MAX_VERSION_LENGTH = 4
const U_VERSION_DELIMITER = '.'

// UVersionInfo is a version number with U_MAX_VERSION_LENGTH one-byte fields.
type UVersionInfo [U_MAX_VERSION_LENGTH]uint8

const (
	/** No numeric value. */
	UPROPS_NTV_NONE = 0
	/** Decimal digits: nv=0..9 */
	UPROPS_NTV_DECIMAL_START = 1
	/** Other digits: nv=0..9 */
	UPROPS_NTV_DIGIT_START = 11
	/** Small integers: nv=0..154 */
	UPROPS_NTV_NUMERIC_START = 21
	/** Fractions: ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16 */
	UPROPS_NTV_FRACTION_START = 0xb0
	/**
	 * Large integers:
	 * ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33)
	 * (only one significant decimal digit)
	 */
	UPROPS_NTV_LARGE_START = 0x1e0
	/**
	 * Sexagesimal numbers:
	 * ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4)
	 */
	UPROPS_NTV_BASE60_START = 0x300
	/**
	 * Fraction-20 values:
	 * frac20 = ntv-0x324 = 0..0x17 -> 1|3|5|7 / 20|40|80|160|320|640
	 * numerator: num = 2*(frac20&3)+1
	 * denominator: den = 20<<(frac20>>2)
	 */
	UPROPS_NTV_FRACTION20_START = UPROPS_NTV_BASE60_START + 36 // 0x300+9*4=0x324
	/**
	 * Fraction-32 values:
	 * frac32 = ntv-0x34c = 0..15 -> 1|3|5|7 / 32|64|128|256
	 * numerator: num = 2*(frac32&3)+1
	 * denominator: den = 32<<(frac32>>2)
	 */
	UPROPS_NTV_FRACTION32_START = UPROPS_NTV_FRACTION20_START + 24 // 0x324+6*4=0x34c
	/** No numeric value (yet).
*/ + UPROPS_NTV_RESERVED_START = UPROPS_NTV_FRACTION32_START + 16 // 0x34c+4*4=0x35c + + UPROPS_NTV_MAX_SMALL_INT = UPROPS_NTV_FRACTION_START - UPROPS_NTV_NUMERIC_START - 1 +) + +const U_NO_NUMERIC_VALUE = -123456789.0 diff --git a/go/mysql/icuregex/internal/uchar/uchar.go b/go/mysql/icuregex/internal/uchar/uchar.go new file mode 100644 index 00000000000..50167902a49 --- /dev/null +++ b/go/mysql/icuregex/internal/uchar/uchar.go @@ -0,0 +1,400 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package uchar + +import ( + "fmt" + "strconv" + + "vitess.io/vitess/go/mysql/icuregex/internal/icudata" + "vitess.io/vitess/go/mysql/icuregex/internal/udata" + "vitess.io/vitess/go/mysql/icuregex/internal/utrie" +) + +var uprops struct { + trie *utrie.UTrie2 + trie2 *utrie.UTrie2 + vectorsColumns int32 + vectors []uint32 + scriptExtensions []uint16 +} + +func readData(bytes *udata.Bytes) error { + err := bytes.ReadHeader(func(info *udata.DataInfo) bool { + return info.FormatVersion[0] == 7 + }) + if err != nil { + return err + } + + propertyOffset := bytes.Int32() + /* exceptionOffset = */ bytes.Int32() + /* caseOffset = */ bytes.Int32() + additionalOffset := bytes.Int32() + additionalVectorsOffset := bytes.Int32() + uprops.vectorsColumns = bytes.Int32() + scriptExtensionsOffset := bytes.Int32() + reservedOffset7 := bytes.Int32() + /* reservedOffset8 = */ bytes.Int32() + /* dataTopOffset = */ bytes.Int32() + _ = bytes.Int32() + _ = bytes.Int32() + bytes.Skip((16 - 12) << 2) + + uprops.trie, err = utrie.UTrie2FromBytes(bytes) + if err != nil { + return err + } + + expectedTrieLength := (propertyOffset - 16) * 4 + trieLength := uprops.trie.SerializedLength() + + if trieLength > expectedTrieLength { + return fmt.Errorf("uprops.icu: not enough bytes for the trie") /* main properties trie; this loader is fed icudata.UProps */ + } + + bytes.Skip(expectedTrieLength - trieLength) + bytes.Skip((additionalOffset - propertyOffset) * 4) + + if uprops.vectorsColumns > 0 { + uprops.trie2, err = utrie.UTrie2FromBytes(bytes) + if err != nil { + return err + } + + expectedTrieLength = (additionalVectorsOffset - additionalOffset) * 4 + trieLength = uprops.trie2.SerializedLength() + + if trieLength > expectedTrieLength { + return fmt.Errorf("uprops.icu: not enough bytes for the trie") /* additional-properties (vectors) trie */ + } + + bytes.Skip(expectedTrieLength - trieLength) + uprops.vectors = bytes.Uint32Slice(scriptExtensionsOffset - additionalVectorsOffset) + } + + if n := (reservedOffset7 - scriptExtensionsOffset) * 2; n > 0 { + uprops.scriptExtensions = bytes.Uint16Slice(n)
+ } + + return nil +} + +func init() { + b := udata.NewBytes(icudata.UProps) + if err := readData(b); err != nil { + panic(err) + } +} + +type PropertySet interface { + AddRune(ch rune) +} + +func VecAddPropertyStarts(sa PropertySet) { + uprops.trie2.Enum(nil, func(start, _ rune, _ uint32) bool { + sa.AddRune(start) + return true + }) +} + +func AddPropertyStarts(sa PropertySet) { + const ( + TAB = 0x0009 + LF = 0x000a + FF = 0x000c + CR = 0x000d + NBSP = 0x00a0 + CGJ = 0x034f + FIGURESP = 0x2007 + HAIRSP = 0x200a + ZWNJ = 0x200c + ZWJ = 0x200d + RLM = 0x200f + NNBSP = 0x202f + ZWNBSP = 0xfeff /* U+FEFF ZERO WIDTH NO-BREAK SPACE (was 0xfef, one hex digit short) */ + ) + + /* add the start code point of each same-value range of the main trie */ + uprops.trie.Enum(nil, func(start, _ rune, _ uint32) bool { + sa.AddRune(start) + return true + }) + + /* add code points with hardcoded properties, plus the ones following them */ + + /* add for u_isblank() */ + sa.AddRune(TAB) + sa.AddRune(TAB + 1) + + /* add for IS_THAT_CONTROL_SPACE() */ + sa.AddRune(CR + 1) /* range TAB..CR */ + sa.AddRune(0x1c) + sa.AddRune(0x1f + 1) + sa.AddRune(0x85) // NEXT LINE (NEL) + sa.AddRune(0x85 + 1) + + /* add for u_isIDIgnorable() what was not added above */ + sa.AddRune(0x7f) /* range DEL..NBSP-1, NBSP added below */ + sa.AddRune(HAIRSP) + sa.AddRune(RLM + 1) + sa.AddRune(0x206a) // INHIBIT SYMMETRIC SWAPPING + sa.AddRune(0x206f + 1) // NOMINAL DIGIT SHAPES + sa.AddRune(ZWNBSP) + sa.AddRune(ZWNBSP + 1) + + /* add no-break spaces for u_isWhitespace() what was not added above */ + sa.AddRune(NBSP) + sa.AddRune(NBSP + 1) + sa.AddRune(FIGURESP) + sa.AddRune(FIGURESP + 1) + sa.AddRune(NNBSP) + sa.AddRune(NNBSP + 1) + + /* add for u_digit() */ + sa.AddRune('a') + sa.AddRune('z' + 1) + sa.AddRune('A') + sa.AddRune('Z' + 1) + // fullwidth + sa.AddRune(0xff41) /* FULLWIDTH LATIN SMALL LETTER A; written as hex so it cannot be confused with ASCII 'a' */ + sa.AddRune(0xff5a + 1) /* one past FULLWIDTH LATIN SMALL LETTER Z */ + sa.AddRune(0xff21) /* FULLWIDTH LATIN CAPITAL LETTER A */ + sa.AddRune(0xff3a + 1) /* one past FULLWIDTH LATIN CAPITAL LETTER Z */ + + /* add for u_isxdigit() */ + sa.AddRune('f' + 1) + sa.AddRune('F' + 1) + // fullwidth + sa.AddRune(0xff46 + 1) /* one past fullwidth 'f', the upper bound IsXDigit tests */ + sa.AddRune(0xff26 + 1) /* one past fullwidth 'F' */ + + /* add for
UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */ + sa.AddRune(0x2060) /* range 2060..206f */ + sa.AddRune(0xfff0) + sa.AddRune(0xfffb + 1) + sa.AddRune(0xe0000) + sa.AddRune(0xe0fff + 1) + + /* add for UCHAR_GRAPHEME_BASE and others */ + sa.AddRune(CGJ) + sa.AddRune(CGJ + 1) +} + +func CharType(c rune) int8 { + props := uprops.trie.Get16(c) + return GET_CATEGORY(props) +} + +func GetProperties(c rune) uint16 { + return uprops.trie.Get16(c) +} + +func GET_CATEGORY(props uint16) int8 { + return int8(props & 0x1f) +} + +func GetUnicodeProperties(c rune, column int) uint32 { + if column >= int(uprops.vectorsColumns) { + return 0 + } + vecIndex := uprops.trie2.Get16(c) + return uprops.vectors[int(vecIndex)+column] +} + +func ScriptExtension(idx uint32) uint16 { + return uprops.scriptExtensions[idx] +} + +func ScriptExtensions(idx uint32) []uint16 { + return uprops.scriptExtensions[idx:] +} + +func IsDigit(c rune) bool { + return CharType(c) == U_DECIMAL_DIGIT_NUMBER +} + +func IsPOSIXPrint(c rune) bool { + return CharType(c) == U_SPACE_SEPARATOR || IsGraphPOSIX(c) +} + +func IsGraphPOSIX(c rune) bool { + props := uprops.trie.Get16(c) + /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */ + /* comparing ==0 returns FALSE for the categories mentioned */ + return U_MASK(GET_CATEGORY(props))&(U_GC_CC_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK) == 0 +} + +func IsXDigit(c rune) bool { + /* check ASCII and Fullwidth ASCII a-fA-F */ + if (c <= 0x66 && c >= 0x41 && (c <= 0x46 || c >= 0x61)) || + (c >= 0xff21 && c <= 0xff46 && (c <= 0xff26 || c >= 0xff41)) { + return true + } + return IsDigit(c) +} + +func IsBlank(c rune) bool { + if c <= 0x9f { + return c == 9 || c == 0x20 /* TAB or SPACE */ + } + /* Zs */ + return CharType(c) == U_SPACE_SEPARATOR +} + +func CharAge(c rune) UVersionInfo { + version := GetUnicodeProperties(c, 0) >> UPROPS_AGE_SHIFT + return UVersionInfo{uint8(version >> 4), uint8(version & 0xf), 0, 0} +} + +func VersionFromString(str string) 
(version UVersionInfo) { + part := 0 + for len(str) > 0 && part < U_MAX_VERSION_LENGTH { + if str[0] == U_VERSION_DELIMITER { + str = str[1:] + } + str, version[part] = parseInt(str) + part++ + } + return +} + +// parseInt is simplified but aims to mimic strtoul usage +// as it is used for ICU version parsing. +func parseInt(str string) (string, uint8) { + if str == "" { + return str, 0 + } + + start := 0 + end := 0 + for start < len(str) { /* skip leading whitespace only, as strtoul does */ + switch str[start] { + case ' ', '\f', '\n', '\r', '\t', '\v': + start++ + continue + default: + } + break /* first non-space byte ends the skip; a break inside the switch would not leave the loop */ + } + str = str[start:] + + for i := 0; i < len(str); i++ { + if str[i] < '0' || str[i] > '9' { + end = i + break + } + end++ + } + + val, err := strconv.ParseUint(str[:end], 10, 8) /* str already starts after the whitespace, so slice from 0 */ + if err != nil { + return str[end:], 0 + } + return str[end:], uint8(val) +} + +const UPROPS_NUMERIC_TYPE_VALUE_SHIFT = 6 + +func NumericTypeValue(c rune) uint16 { + props := uprops.trie.Get16(c) + return props >> UPROPS_NUMERIC_TYPE_VALUE_SHIFT +} + +func NumericValue(c rune) float64 { + ntv := int32(NumericTypeValue(c)) + + if ntv == UPROPS_NTV_NONE { + return U_NO_NUMERIC_VALUE + } else if ntv < UPROPS_NTV_DIGIT_START { + /* decimal digit */ + return float64(ntv - UPROPS_NTV_DECIMAL_START) + } else if ntv < UPROPS_NTV_NUMERIC_START { + /* other digit */ + return float64(ntv - UPROPS_NTV_DIGIT_START) + } else if ntv < UPROPS_NTV_FRACTION_START { + /* small integer */ + return float64(ntv - UPROPS_NTV_NUMERIC_START) + } else if ntv < UPROPS_NTV_LARGE_START { + /* fraction */ + numerator := (ntv >> 4) - 12 + denominator := (ntv & 0xf) + 1 + return float64(numerator) / float64(denominator) + } else if ntv < UPROPS_NTV_BASE60_START { + /* large, single-significant-digit integer */ + mant := (ntv >> 5) - 14 + exp := (ntv & 0x1f) + 2 + numValue := float64(mant) + + /* multiply by 10^exp without math.h */ + for exp >= 4 { + numValue *= 10000.
+ exp -= 4 + } + switch exp { + case 3: + numValue *= 1000.0 + case 2: + numValue *= 100.0 + case 1: + numValue *= 10.0 + case 0: + default: + } + + return numValue + } else if ntv < UPROPS_NTV_FRACTION20_START { + /* sexagesimal (base 60) integer */ + numValue := (ntv >> 2) - 0xbf + exp := (ntv & 3) + 1 + + switch exp { + case 4: + numValue *= 60 * 60 * 60 * 60 + case 3: + numValue *= 60 * 60 * 60 + case 2: + numValue *= 60 * 60 + case 1: + numValue *= 60 + case 0: + default: + } + + return float64(numValue) + } else if ntv < UPROPS_NTV_FRACTION32_START { + // fraction-20 e.g. 3/80 + frac20 := ntv - UPROPS_NTV_FRACTION20_START // 0..0x17 + numerator := 2*(frac20&3) + 1 + denominator := 20 << (frac20 >> 2) + return float64(numerator) / float64(denominator) + } else if ntv < UPROPS_NTV_RESERVED_START { + // fraction-32 e.g. 3/64 + frac32 := ntv - UPROPS_NTV_FRACTION32_START // 0..15 + numerator := 2*(frac32&3) + 1 + denominator := 32 << (frac32 >> 2) + return float64(numerator) / float64(denominator) + } else { + /* reserved */ + return U_NO_NUMERIC_VALUE + } +} diff --git a/go/mysql/icuregex/internal/udata/udata.go b/go/mysql/icuregex/internal/udata/udata.go new file mode 100644 index 00000000000..50b67b5f427 --- /dev/null +++ b/go/mysql/icuregex/internal/udata/udata.go @@ -0,0 +1,160 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package udata + +import ( + "encoding/binary" + "fmt" + "unsafe" +) + +type DataInfo struct { + /** sizeof(UDataInfo) + * @stable ICU 2.0 */ + Size uint16 + + /** unused, set to 0 + * @stable ICU 2.0*/ + ReservedWord uint16 + + /* platform data properties */ + /** 0 for little-endian machine, 1 for big-endian + * @stable ICU 2.0 */ + IsBigEndian uint8 + + /** see U_CHARSET_FAMILY values in utypes.h + * @stable ICU 2.0*/ + CharsetFamily uint8 + + /** sizeof(UChar), one of { 1, 2, 4 } + * @stable ICU 2.0*/ + SizeofUChar uint8 + + /** unused, set to 0 + * @stable ICU 2.0*/ + ReservedByte uint8 + + /** data format identifier + * @stable ICU 2.0*/ + DataFormat [4]uint8 + + /** versions: [0] major [1] minor [2] milli [3] micro + * @stable ICU 2.0*/ + FormatVersion [4]uint8 + + /** versions: [0] major [1] minor [2] milli [3] micro + * @stable ICU 2.0*/ + DataVersion [4]uint8 +} + +type Bytes struct { + buf []byte + orig []byte + enc binary.ByteOrder +} + +func NewBytes(b []byte) *Bytes { + return &Bytes{buf: b, orig: b, enc: binary.LittleEndian} +} + +func (b *Bytes) ReadHeader(isValid func(info *DataInfo) bool) error { + type MappedData struct { + headerSize uint16 + magic1 uint8 + magic2 uint8 + } + + type DataHeader struct { + dataHeader MappedData + info DataInfo + } + + data := unsafe.SliceData(b.buf) + header := (*DataHeader)(unsafe.Pointer(data)) + + if header.dataHeader.magic1 != 0xda || header.dataHeader.magic2 != 0x27 { + return fmt.Errorf("invalid magic number") + } + + if header.info.IsBigEndian != 0 { + return fmt.Errorf("unsupported: BigEndian data source") 
+ } + + if !isValid(&header.info) { + return fmt.Errorf("failed to validate data header") + } + + b.buf = b.buf[header.dataHeader.headerSize:] + return nil +} + +func (b *Bytes) Uint16() uint16 { + u := b.enc.Uint16(b.buf) + b.buf = b.buf[2:] + return u +} + +func (b *Bytes) Uint16Slice(size int32) []uint16 { + s := unsafe.Slice((*uint16)(unsafe.Pointer(unsafe.SliceData(b.buf))), size) + b.buf = b.buf[2*size:] + return s +} + +func (b *Bytes) Uint32Slice(size int32) []uint32 { + s := unsafe.Slice((*uint32)(unsafe.Pointer(unsafe.SliceData(b.buf))), size) + b.buf = b.buf[4*size:] + return s +} + +func (b *Bytes) Uint32() uint32 { + u := b.enc.Uint32(b.buf) + b.buf = b.buf[4:] + return u +} + +func (b *Bytes) Int32() int32 { + return int32(b.Uint32()) +} + +func (b *Bytes) Pointer() unsafe.Pointer { + return unsafe.Pointer(unsafe.SliceData(b.buf)) +} + +func (b *Bytes) Skip(size int32) { + b.buf = b.buf[size:] +} + +func (b *Bytes) Uint8Slice(n int32) []uint8 { + s := b.buf[:n] + b.buf = b.buf[n:] + return s +} + +func (b *Bytes) String(size int32) string { + s := unsafe.String(&b.buf[0], size) + b.buf = b.buf[size:] + return s +} + +func (b *Bytes) Position() int32 { + return int32(len(b.orig) - len(b.buf)) +} diff --git a/go/mysql/icuregex/internal/uerror/error.go b/go/mysql/icuregex/internal/uerror/error.go new file mode 100644 index 00000000000..9d23d8dc4a3 --- /dev/null +++ b/go/mysql/icuregex/internal/uerror/error.go @@ -0,0 +1,159 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package uerror + +import "fmt" + +type UErrorCode int32 + +const ( + U_ZERO_ERROR UErrorCode = iota /**< No error, no warning. */ + U_ILLEGAL_ARGUMENT_ERROR /**< Start of codes indicating failure */ + U_MISSING_RESOURCE_ERROR /**< The requested resource cannot be found */ + U_INVALID_FORMAT_ERROR /**< Data format is not what is expected */ + U_FILE_ACCESS_ERROR /**< The requested file cannot be found */ + U_INTERNAL_PROGRAM_ERROR /**< Indicates a bug in the library code */ + U_MESSAGE_PARSE_ERROR /**< Unable to parse a message (message format) */ + U_MEMORY_ALLOCATION_ERROR /**< Memory allocation error */ + U_INDEX_OUTOFBOUNDS_ERROR /**< Trying to access the index that is out of bounds */ + U_PARSE_ERROR /**< Equivalent to Java ParseException */ + U_INVALID_CHAR_FOUND /**< Character conversion: Unmappable input sequence. In other APIs: Invalid character. */ + U_TRUNCATED_CHAR_FOUND /**< Character conversion: Incomplete input sequence. */ + U_ILLEGAL_CHAR_FOUND /**< Character conversion: Illegal input sequence/combination of input units. 
*/ + U_INVALID_TABLE_FORMAT /**< Conversion table file found, but corrupted */ + U_INVALID_TABLE_FILE /**< Conversion table file not found */ + U_BUFFER_OVERFLOW_ERROR /**< A result would not fit in the supplied buffer */ + U_UNSUPPORTED_ERROR /**< Requested operation not supported in current context */ + U_RESOURCE_TYPE_MISMATCH /**< an operation is requested over a resource that does not support it */ + U_ILLEGAL_ESCAPE_SEQUENCE /**< ISO-2022 illegal escape sequence */ + U_UNSUPPORTED_ESCAPE_SEQUENCE /**< ISO-2022 unsupported escape sequence */ + U_NO_SPACE_AVAILABLE /**< No space available for in-buffer expansion for Arabic shaping */ + U_CE_NOT_FOUND_ERROR /**< Currently used only while setting variable top, but can be used generally */ + U_PRIMARY_TOO_LONG_ERROR /**< User tried to set variable top to a primary that is longer than two bytes */ + U_STATE_TOO_OLD_ERROR /**< ICU cannot construct a service from this state, as it is no longer supported */ + U_TOO_MANY_ALIASES_ERROR /**< There are too many aliases in the path to the requested resource. + It is very possible that a circular alias definition has occurred */ + U_ENUM_OUT_OF_SYNC_ERROR /**< UEnumeration out of sync with underlying collection */ + U_INVARIANT_CONVERSION_ERROR /**< Unable to convert a UChar* string to char* with the invariant converter. */ + U_INVALID_STATE_ERROR /**< Requested operation can not be completed with ICU in its current state */ + U_COLLATOR_VERSION_MISMATCH /**< Collator version is not compatible with the base version */ + U_USELESS_COLLATOR_ERROR /**< Collator is options only and no base is specified */ + U_NO_WRITE_PERMISSION /**< Attempt to modify read-only or constant data. */ + U_INPUT_TOO_LONG_ERROR +) + +/* + * Error codes in the range 0x10000 0x10100 are reserved for Transliterator. 
+ */ +const ( + U_BAD_VARIABLE_DEFINITION UErrorCode = iota + 0x10000 /**< Missing '$' or duplicate variable name */ + U_MALFORMED_RULE /**< Elements of a rule are misplaced */ + U_MALFORMED_SET /**< A UnicodeSet pattern is invalid*/ + U_MALFORMED_SYMBOL_REFERENCE /**< UNUSED as of ICU 2.4 */ + U_MALFORMED_UNICODE_ESCAPE /**< A Unicode escape pattern is invalid*/ + U_MALFORMED_VARIABLE_DEFINITION /**< A variable definition is invalid */ + U_MALFORMED_VARIABLE_REFERENCE /**< A variable reference is invalid */ + U_MISMATCHED_SEGMENT_DELIMITERS /**< UNUSED as of ICU 2.4 */ + U_MISPLACED_ANCHOR_START /**< A start anchor appears at an illegal position */ + U_MISPLACED_CURSOR_OFFSET /**< A cursor offset occurs at an illegal position */ + U_MISPLACED_QUANTIFIER /**< A quantifier appears after a segment close delimiter */ + U_MISSING_OPERATOR /**< A rule contains no operator */ + U_MISSING_SEGMENT_CLOSE /**< UNUSED as of ICU 2.4 */ + U_MULTIPLE_ANTE_CONTEXTS /**< More than one ante context */ + U_MULTIPLE_CURSORS /**< More than one cursor */ + U_MULTIPLE_POST_CONTEXTS /**< More than one post context */ + U_TRAILING_BACKSLASH /**< A dangling backslash */ + U_UNDEFINED_SEGMENT_REFERENCE /**< A segment reference does not correspond to a defined segment */ + U_UNDEFINED_VARIABLE /**< A variable reference does not correspond to a defined variable */ + U_UNQUOTED_SPECIAL /**< A special character was not quoted or escaped */ + U_UNTERMINATED_QUOTE /**< A closing single quote is missing */ + U_RULE_MASK_ERROR /**< A rule is hidden by an earlier more general rule */ + U_MISPLACED_COMPOUND_FILTER /**< A compound filter is in an invalid location */ + U_MULTIPLE_COMPOUND_FILTERS /**< More than one compound filter */ + U_INVALID_RBT_SYNTAX /**< A "::id" rule was passed to the RuleBasedTransliterator parser */ + U_INVALID_PROPERTY_PATTERN /**< UNUSED as of ICU 2.4 */ + U_MALFORMED_PRAGMA /**< A 'use' pragma is invalid */ + U_UNCLOSED_SEGMENT /**< A closing ')' is missing */ + 
U_ILLEGAL_CHAR_IN_SEGMENT /**< UNUSED as of ICU 2.4 */ + U_VARIABLE_RANGE_EXHAUSTED /**< Too many stand-ins generated for the given variable range */ + U_VARIABLE_RANGE_OVERLAP /**< The variable range overlaps characters used in rules */ + U_ILLEGAL_CHARACTER /**< A special character is outside its allowed context */ + U_INTERNAL_TRANSLITERATOR_ERROR /**< Internal transliterator system error */ + U_INVALID_ID /**< A "::id" rule specifies an unknown transliterator */ + U_INVALID_FUNCTION /**< A "&fn()" rule specifies an unknown transliterator */ +) + +/* + * Error codes in the range 0x10200 0x102ff are reserved for BreakIterator. + */ +const ( + U_BRK_INTERNAL_ERROR UErrorCode = iota + 0x10200 /**< An internal error (bug) was detected. */ + U_BRK_HEX_DIGITS_EXPECTED /**< Hex digits expected as part of a escaped char in a rule. */ + U_BRK_SEMICOLON_EXPECTED /**< Missing ';' at the end of a RBBI rule. */ + U_BRK_RULE_SYNTAX /**< Syntax error in RBBI rule. */ + U_BRK_UNCLOSED_SET /**< UnicodeSet writing an RBBI rule missing a closing ']'. */ + U_BRK_ASSIGN_ERROR /**< Syntax error in RBBI rule assignment statement. */ + U_BRK_VARIABLE_REDFINITION /**< RBBI rule $Variable redefined. */ + U_BRK_MISMATCHED_PAREN /**< Mis-matched parentheses in an RBBI rule. */ + U_BRK_NEW_LINE_IN_QUOTED_STRING /**< Missing closing quote in an RBBI rule. */ + U_BRK_UNDEFINED_VARIABLE /**< Use of an undefined $Variable in an RBBI rule. */ + U_BRK_INIT_ERROR /**< Initialization failure. Probable missing ICU Data. */ + U_BRK_RULE_EMPTY_SET /**< Rule contains an empty Unicode Set. */ + U_BRK_UNRECOGNIZED_OPTION /**< !!option in RBBI rules not recognized. */ + U_BRK_MALFORMED_RULE_TAG /**< The {nnn} tag on a rule is malformed */ +) + +type URegexCompileErrorCode int32 + +const ( + U_REGEX_ZERO_ERROR URegexCompileErrorCode = iota + U_REGEX_INTERNAL_ERROR /**< An internal error (bug) was detected. */ + U_REGEX_RULE_SYNTAX /**< Syntax error in regexp pattern. 
*/ + U_REGEX_INVALID_STATE /**< RegexMatcher in invalid state for requested operation */ + U_REGEX_BAD_ESCAPE_SEQUENCE /**< Unrecognized backslash escape sequence in pattern */ + U_REGEX_PROPERTY_SYNTAX /**< Incorrect Unicode property */ + U_REGEX_UNIMPLEMENTED /**< Use of regexp feature that is not yet implemented. */ + U_REGEX_MISMATCHED_PAREN /**< Incorrectly nested parentheses in regexp pattern. */ + U_REGEX_NUMBER_TOO_BIG /**< Decimal number is too large. */ + U_REGEX_BAD_INTERVAL /**< Error in {min,max} interval */ + U_REGEX_MAX_LT_MIN /**< In {min,max}, max is less than min. */ + U_REGEX_INVALID_BACK_REF /**< Back-reference to a non-existent capture group. */ + U_REGEX_INVALID_FLAG /**< Invalid value for match mode flags. */ + U_REGEX_LOOK_BEHIND_LIMIT /**< Look-Behind pattern matches must have a bounded maximum length. */ + U_REGEX_SET_CONTAINS_STRING /**< Regexps cannot have UnicodeSets containing strings.*/ + U_REGEX_MISSING_CLOSE_BRACKET /**< Missing closing bracket on a bracket expression. */ + U_REGEX_INVALID_RANGE /**< In a character range [x-y], x is greater than y. */ + U_REGEX_PATTERN_TOO_BIG /**< Pattern exceeds limits on size or complexity. @stable ICU 55 */ + U_REGEX_INVALID_CAPTURE_GROUP_NAME /**< Invalid capture group name. @stable ICU 55 */ + U_REGEX_UNSUPPORTED_ERROR /**< Use of an unsupported feature. @stable ICU 55 */ +) + +type URegexMatchErrorCode int32 + +const ( + U_REGEX_STACK_OVERFLOW URegexMatchErrorCode = iota /**< Regular expression backtrack stack overflow. */ + U_REGEX_TIME_OUT /**< Maximum allowed match time exceeded */ +) + +func (e UErrorCode) Error() string { + return fmt.Sprintf("UErrorCode: %d", e) +} diff --git a/go/mysql/icuregex/internal/ulayout/ulayout.go b/go/mysql/icuregex/internal/ulayout/ulayout.go new file mode 100644 index 00000000000..744c9727461 --- /dev/null +++ b/go/mysql/icuregex/internal/ulayout/ulayout.go @@ -0,0 +1,133 @@ +/* +© 2016 and later: Unicode, Inc. and others. 
+Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package ulayout + +import ( + "fmt" + "sync" + + "vitess.io/vitess/go/mysql/icuregex/internal/icudata" + "vitess.io/vitess/go/mysql/icuregex/internal/udata" + "vitess.io/vitess/go/mysql/icuregex/internal/utrie" +) + +var inpcTrie *utrie.UcpTrie +var inscTrie *utrie.UcpTrie +var voTrie *utrie.UcpTrie + +const ( + IX_INPC_TRIE_TOP = 1 + IX_INSC_TRIE_TOP = 2 + IX_VO_TRIE_TOP = 3 + IX_RESERVED_TOP = 4 + + IX_TRIES_TOP = 7 + + IX_MAX_VALUES = 9 + + IX_COUNT = 12 + + MAX_INPC_SHIFT = 24 + MAX_INSC_SHIFT = 16 + MAX_VO_SHIFT = 8 +) + +func InpcTrie() *utrie.UcpTrie { + loadLayouts() + return inpcTrie +} + +func InscTrie() *utrie.UcpTrie { + loadLayouts() + return inscTrie +} + +func VoTrie() *utrie.UcpTrie { + loadLayouts() + return voTrie +} + +var layoutsOnce sync.Once + +func loadLayouts() { + layoutsOnce.Do(func() { + b := udata.NewBytes(icudata.ULayout) + if err := readData(b); err != nil { + panic(err) + } + }) +} + +func readData(bytes *udata.Bytes) error { + err := bytes.ReadHeader(func(info *udata.DataInfo) bool { + return info.FormatVersion[0] == 1 + }) + if err != nil { + return err + } + + startPos := bytes.Position() + indexesLength := int32(bytes.Uint32()) 
// inIndexes[IX_INDEXES_LENGTH] + if indexesLength < IX_COUNT { + return fmt.Errorf("Text layout properties data: not enough indexes") + } + index := make([]int32, indexesLength) + index[0] = indexesLength + for i := int32(1); i < indexesLength; i++ { + index[i] = int32(bytes.Uint32()) + } + + offset := indexesLength * 4 + top := index[IX_INPC_TRIE_TOP] + trieSize := top - offset + if trieSize >= 16 { + inpcTrie, err = utrie.UcpTrieFromBytes(bytes) + if err != nil { + return err + } + } + + pos := bytes.Position() - startPos + bytes.Skip(top - pos) + offset = top + top = index[IX_INSC_TRIE_TOP] + trieSize = top - offset + if trieSize >= 16 { + inscTrie, err = utrie.UcpTrieFromBytes(bytes) + if err != nil { + return err + } + } + + pos = bytes.Position() - startPos + bytes.Skip(top - pos) + offset = top + top = index[IX_VO_TRIE_TOP] + trieSize = top - offset + if trieSize >= 16 { + voTrie, err = utrie.UcpTrieFromBytes(bytes) + if err != nil { + return err + } + } + return nil +} diff --git a/go/mysql/icuregex/internal/unames/unames.go b/go/mysql/icuregex/internal/unames/unames.go new file mode 100644 index 00000000000..dad3f5a0bd3 --- /dev/null +++ b/go/mysql/icuregex/internal/unames/unames.go @@ -0,0 +1,484 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package unames + +import ( + "bytes" + _ "embed" + "math" + "strconv" + "strings" + "sync" + "unsafe" + + "vitess.io/vitess/go/mysql/icuregex/internal/icudata" + "vitess.io/vitess/go/mysql/icuregex/internal/udata" +) + +var charNamesOnce sync.Once +var charNames *UCharNames + +func loadCharNames() { + validCharNames := func(info *udata.DataInfo) bool { + return info.Size >= 20 && + info.IsBigEndian == 0 && + info.CharsetFamily == 0 && + info.DataFormat[0] == 0x75 && /* dataFormat="unam" */ + info.DataFormat[1] == 0x6e && + info.DataFormat[2] == 0x61 && + info.DataFormat[3] == 0x6d && + info.FormatVersion[0] == 1 + } + + charNamesOnce.Do(func() { + b := udata.NewBytes(icudata.UNames) + if err := b.ReadHeader(validCharNames); err != nil { + panic(err) + } + charNames = (*UCharNames)(b.Pointer()) + }) +} + +type NameChoice int32 + +const ( + U_UNICODE_CHAR_NAME NameChoice = iota + /** + * The Unicode_1_Name property value which is of little practical value. + * Beginning with ICU 49, ICU APIs return an empty string for this name choice. + * @deprecated ICU 49 + */ + U_UNICODE_10_CHAR_NAME + /** Standard or synthetic character name. @stable ICU 2.0 */ + U_EXTENDED_CHAR_NAME + /** Corrected name from NameAliases.txt. @stable ICU 4.4 */ + U_CHAR_NAME_ALIAS + /** + * One more than the highest normal UCharNameChoice value. + * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. 
+ */ + U_CHAR_NAME_CHOICE_COUNT +) + +type algorithmicRange struct { + start, end uint32 + type_, variant uint8 + size uint16 +} + +func (ar *algorithmicRange) next() *algorithmicRange { + return (*algorithmicRange)(unsafe.Add(unsafe.Pointer(ar), ar.size)) +} + +func (ar *algorithmicRange) ptrend(offset uintptr) unsafe.Pointer { + return unsafe.Add(unsafe.Pointer(ar), unsafe.Sizeof(algorithmicRange{})+offset) +} + +func (ar *algorithmicRange) slice8(offset uintptr) []uint8 { + return unsafe.Slice((*uint8)(ar.ptrend(offset)), ar.size) +} + +func (ar *algorithmicRange) slice16() []uint16 { + return unsafe.Slice((*uint16)(ar.ptrend(0)), ar.size/2) +} + +func (ar *algorithmicRange) findAlgName(choice NameChoice, otherName string) rune { + switch ar.type_ { + case 0: + s := ar.slice8(0) + + for s[0] != 0 && len(otherName) > 0 { + if s[0] != otherName[0] { + return -1 + } + s = s[1:] + otherName = otherName[1:] + } + + var code rune + count := int(ar.variant) + for i := 0; i < count && len(otherName) > 0; i++ { + c := rune(otherName[0]) + otherName = otherName[1:] + if '0' <= c && c <= '9' { + code = (code << 4) | (c - '0') + } else if 'A' <= c && c <= 'F' { + code = (code << 4) | (c - 'A' + 10) + } else { + return -1 + } + } + + if len(otherName) == 0 && ar.start <= uint32(code) && uint32(code) <= ar.end { + return code + } + case 1: + factors := ar.slice16() + count := int(ar.variant) + s := ar.slice8(2 * uintptr(count)) + + for s[0] != 0 && len(otherName) > 0 { + if s[0] != otherName[0] { + return -1 + } + s = s[1:] + otherName = otherName[1:] + } + s = s[1:] + + start := rune(ar.start) + limit := rune(ar.end + 1) + + var indexes [8]uint16 + var buf strings.Builder + var elements [8][]byte + var elementBases [8][]byte + + ar.writeFactorSuffix0(factors, count, s, &buf, &elements, &elementBases) + if buf.String() == otherName { + return start + } + + for start+1 < limit { + start++ + i := count + + for { + i-- + idx := indexes[i] + 1 + if idx < factors[i] { + indexes[i] 
= idx + s = elements[i] + s = s[bytes.IndexByte(s, 0)+1:] + elements[i] = s + break + } else { + indexes[i] = 0 + elements[i] = elementBases[i] + } + } + + t := otherName + for i = 0; i < count; i++ { + s = elements[i] + + for s[0] != 0 && len(t) > 0 { + if s[0] != t[0] { + s = nil + i = 99 + break + } + s = s[1:] + t = t[1:] + } + } + if i < 99 && len(t) == 0 { + return start + } + } + } + return -1 +} + +func (ar *algorithmicRange) writeFactorSuffix0(factors []uint16, count int, s []uint8, buf *strings.Builder, elements, elementBases *[8][]byte) { + i := 0 + + /* write each element */ + for { + (*elements)[i] = s + (*elementBases)[i] = s + + nul := bytes.IndexByte(s, 0) + buf.Write(s[:nul]) + s = s[nul+1:] + + if i >= count { + break + } + + factor := int(factors[i] - 1) + for factor > 0 { + s = s[bytes.IndexByte(s, 0)+1:] + factor-- + } + + i++ + } +} + +func CharForName(nameChoice NameChoice, name string) rune { + loadCharNames() + + lower := strings.ToLower(name) + upper := strings.ToUpper(name) + + if lower[0] == '<' { + if nameChoice == U_EXTENDED_CHAR_NAME && lower[len(lower)-1] == '>' { + if limit := strings.LastIndexByte(lower, '-'); limit >= 2 { + cp, err := strconv.ParseUint(lower[limit+1:len(lower)-1], 16, 32) + if err != nil || cp > 0x10ffff { + return -1 + } + return rune(cp) + } + } + return -1 + } + + p := charNames.ptr32(charNames.algNamesOffset) + i := p[0] + algRange := (*algorithmicRange)(unsafe.Pointer(unsafe.SliceData(p[1:]))) + for i > 0 { + if cp := algRange.findAlgName(nameChoice, upper); cp != -1 { + return cp + } + algRange = algRange.next() + i-- + } + + return charNames.enumNames(0, 0x10ffff+1, upper, nameChoice) +} + +type UCharNames struct { + tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset uint32 +} + +const GROUP_SHIFT = 5 +const LINES_PER_GROUP = 1 << GROUP_SHIFT +const GROUP_MASK = LINES_PER_GROUP - 1 + +const ( + GROUP_MSB = iota + GROUP_OFFSET_HIGH + GROUP_OFFSET_LOW + GROUP_LENGTH +) + +func (names 
*UCharNames) enumNames(start, limit rune, otherName string, nameChoice NameChoice) rune { + startGroupMSB := uint16(start >> GROUP_SHIFT) + endGroupMSB := uint16((limit - 1) >> GROUP_SHIFT) + + group := names.getGroup(start) + + if startGroupMSB < group[GROUP_MSB] && nameChoice == U_EXTENDED_CHAR_NAME { + extLimit := rune(group[GROUP_MSB]) << GROUP_SHIFT + if extLimit > limit { + extLimit = limit + } + start = extLimit + } + + if startGroupMSB == endGroupMSB { + if startGroupMSB == group[GROUP_MSB] { + return names.enumGroupNames(group, start, limit-1, otherName, nameChoice) + } + } else { + if startGroupMSB == group[GROUP_MSB] { + if start&GROUP_MASK != 0 { + if cp := names.enumGroupNames(group, start, (rune(startGroupMSB)< group[GROUP_MSB] { + group = group[GROUP_LENGTH:] + } + + for len(group) > 0 && group[GROUP_MSB] < endGroupMSB { + start = rune(group[GROUP_MSB]) << GROUP_SHIFT + if cp := names.enumGroupNames(group, start, start+LINES_PER_GROUP-1, otherName, nameChoice); cp != -1 { + return cp + } + group = group[GROUP_LENGTH:] + } + + if len(group) > 0 && group[GROUP_MSB] == endGroupMSB { + return names.enumGroupNames(group, (limit-1)&^GROUP_MASK, limit-1, otherName, nameChoice) + } + } + + return -1 +} + +func (names *UCharNames) ptr8(offset8 uint32) []byte { + return unsafe.Slice((*uint8)(unsafe.Add(unsafe.Pointer(names), offset8)), math.MaxInt) +} + +func (names *UCharNames) ptr16(offset8 uint32) []uint16 { + return unsafe.Slice((*uint16)(unsafe.Add(unsafe.Pointer(names), offset8)), math.MaxInt/2) +} + +func (names *UCharNames) ptr32(offset8 uint32) []uint32 { + return unsafe.Slice((*uint32)(unsafe.Add(unsafe.Pointer(names), offset8)), math.MaxInt/4) +} + +func (names *UCharNames) getGroup(code rune) []uint16 { + groups := names.ptr16(names.groupsOffset) + groupMSB := uint16(code >> GROUP_SHIFT) + + start := 0 + groupCount := int(groups[0]) + limit := groupCount + groups = groups[1:] + + for start < limit-1 { + number := (start + limit) / 2 + if groupMSB < 
groups[number*GROUP_LENGTH+GROUP_MSB] { + limit = number + } else { + start = number + } + } + + return groups[start*GROUP_LENGTH : (groupCount-start)*GROUP_LENGTH] +} + +func (names *UCharNames) getGroupOffset(group []uint16) uint32 { + return (uint32(group[GROUP_OFFSET_HIGH]) << 16) | uint32(group[GROUP_OFFSET_LOW]) +} + +func (names *UCharNames) enumGroupNames(group []uint16, start, end rune, otherName string, choice NameChoice) rune { + var offsets [LINES_PER_GROUP + 2]uint16 + var lengths [LINES_PER_GROUP + 2]uint16 + + s := names.ptr8(names.groupStringOffset + names.getGroupOffset(group)) + s = expandGroupLengths(s, offsets[:0], lengths[:0]) + + for start < end { + name := s[offsets[start&GROUP_MASK]:] + nameLen := lengths[start&GROUP_MASK] + if names.compareName(name[:nameLen], choice, otherName) { + return start + } + start++ + } + return -1 +} + +func expandGroupLengths(s []uint8, offsets []uint16, lengths []uint16) []uint8 { + /* read the lengths of the 32 strings in this group and get each string's offset */ + var i, offset, length uint16 + var lengthByte uint8 + + /* all 32 lengths must be read to get the offset of the first group string */ + for i < LINES_PER_GROUP { + lengthByte = s[0] + s = s[1:] + + /* read even nibble - MSBs of lengthByte */ + if length >= 12 { + /* double-nibble length spread across two bytes */ + length = ((length&0x3)<<4 | uint16(lengthByte)>>4) + 12 + lengthByte &= 0xf + } else if (lengthByte /* &0xf0 */) >= 0xc0 { + /* double-nibble length spread across this one byte */ + length = (uint16(lengthByte) & 0x3f) + 12 + } else { + /* single-nibble length in MSBs */ + length = uint16(lengthByte) >> 4 + lengthByte &= 0xf + } + + offsets = append(offsets, offset) + lengths = append(lengths, length) + + offset += length + i++ + + /* read odd nibble - LSBs of lengthByte */ + if (lengthByte & 0xf0) == 0 { + /* this nibble was not consumed for a double-nibble length above */ + length = uint16(lengthByte) + if length < 12 { + /* 
single-nibble length in LSBs */ + offsets = append(offsets, offset) + lengths = append(lengths, length) + + offset += length + i++ + } + } else { + length = 0 /* prevent double-nibble detection in the next iteration */ + } + } + + /* now, s is at the first group string */ + return s +} + +func (names *UCharNames) compareName(name []byte, choice NameChoice, otherName string) bool { + tokens := names.ptr16(0)[8:] + + tokenCount := tokens[0] + tokens = tokens[1:] + + tokenStrings := names.ptr8(names.tokenStringOffset) + otherNameLen := len(otherName) + + for len(name) > 0 && len(otherName) > 0 { + c := name[0] + name = name[1:] + + if uint16(c) >= tokenCount { + if c != ';' { + if c != otherName[0] { + return false + } + otherName = otherName[1:] + } else { + break + } + } else { + token := tokens[c] + if int16(token) == -2 { + token = tokens[int(c)<<8|int(name[0])] + name = name[1:] + } + if int16(token) == -1 { + if c != ';' { + if c != otherName[0] { + return false + } + otherName = otherName[1:] + } else { + if len(otherName) == otherNameLen && choice == U_EXTENDED_CHAR_NAME { + if ';' >= tokenCount || int16(tokens[';']) == -1 { + continue + } + } + break + } + } else { + tokenString := tokenStrings[token:] + for tokenString[0] != 0 && len(otherName) > 0 { + if tokenString[0] != otherName[0] { + return false + } + tokenString = tokenString[1:] + otherName = otherName[1:] + } + } + } + } + + return len(otherName) == 0 +} diff --git a/go/mysql/icuregex/internal/unames/unames_test.go b/go/mysql/icuregex/internal/unames/unames_test.go new file mode 100644 index 00000000000..941556b70d3 --- /dev/null +++ b/go/mysql/icuregex/internal/unames/unames_test.go @@ -0,0 +1,64 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. 
+License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package unames + +import ( + "testing" +) + +func TestCharForName(t *testing.T) { + var TestNames = []struct { + code rune + name, oldName, extName string + }{ + {0x0061, "LATIN SMALL LETTER A", "", "LATIN SMALL LETTER A"}, + {0x01a2, "LATIN CAPITAL LETTER OI", "", "LATIN CAPITAL LETTER OI"}, + {0x0284, "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK", "", "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK"}, + {0x0fd0, "TIBETAN MARK BSKA- SHOG GI MGO RGYAN", "", "TIBETAN MARK BSKA- SHOG GI MGO RGYAN"}, + {0x3401, "CJK UNIFIED IDEOGRAPH-3401", "", "CJK UNIFIED IDEOGRAPH-3401"}, + {0x7fed, "CJK UNIFIED IDEOGRAPH-7FED", "", "CJK UNIFIED IDEOGRAPH-7FED"}, + {0xac00, "HANGUL SYLLABLE GA", "", "HANGUL SYLLABLE GA"}, + {0xd7a3, "HANGUL SYLLABLE HIH", "", "HANGUL SYLLABLE HIH"}, + {0xd800, "", "", ""}, + {0xdc00, "", "", ""}, + {0xff08, "FULLWIDTH LEFT PARENTHESIS", "", "FULLWIDTH LEFT PARENTHESIS"}, + {0xffe5, "FULLWIDTH YEN SIGN", "", "FULLWIDTH YEN SIGN"}, + {0xffff, "", "", ""}, + {0x1d0c5, "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", "", "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS"}, + {0x23456, "CJK UNIFIED IDEOGRAPH-23456", "", "CJK UNIFIED IDEOGRAPH-23456"}, + } + + for _, tn := range TestNames { + if tn.name != "" { + r := CharForName(U_UNICODE_CHAR_NAME, tn.name) + if r != tn.code { + 
t.Errorf("CharFromName(U_UNICODE_CHAR_NAME, %q) = '%c' (U+%d), expected %c (U+%d)", tn.name, r, r, tn.code, tn.code) + } + } + if tn.extName != "" { + r := CharForName(U_EXTENDED_CHAR_NAME, tn.extName) + if r != tn.code { + t.Errorf("CharFromName(U_EXTENDED_CHAR_NAME, %q) = '%c' (U+%d), expected %c (U+%d)", tn.extName, r, r, tn.code, tn.code) + } + } + } +} diff --git a/go/mysql/icuregex/internal/uprops/constants.go b/go/mysql/icuregex/internal/uprops/constants.go new file mode 100644 index 00000000000..a3a6f0d3d5d --- /dev/null +++ b/go/mysql/icuregex/internal/uprops/constants.go @@ -0,0 +1,625 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package uprops + +const ( + UPROPS_PROPS32_INDEX = iota + UPROPS_EXCEPTIONS_INDEX + UPROPS_EXCEPTIONS_TOP_INDEX + + UPROPS_ADDITIONAL_TRIE_INDEX + UPROPS_ADDITIONAL_VECTORS_INDEX + UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX + + UPROPS_SCRIPT_EXTENSIONS_INDEX + + UPROPS_RESERVED_INDEX_7 + UPROPS_RESERVED_INDEX_8 + + /* size of the data file (number of 32-bit units after the header) */ + UPROPS_DATA_TOP_INDEX + + /* maximum values for code values in vector word 0 */ + UPROPS_MAX_VALUES_INDEX = 10 + /* maximum values for code values in vector word 2 */ + UPROPS_MAX_VALUES_2_INDEX = 11 + + UPROPS_INDEX_COUNT = 16 +) + +type Property int32 + +const ( + /* + * Note: UProperty constants are parsed by preparseucd.py. + * It matches lines like + * UCHAR_=, + */ + + /* Note: Place UCHAR_ALPHABETIC before UCHAR_BINARY_START so that + debuggers display UCHAR_ALPHABETIC as the symbolic name for 0, + rather than UCHAR_BINARY_START. Likewise for other *_START + identifiers. */ + + /** Binary property Alphabetic. Same as u_isUAlphabetic, different from u_isalpha. + Lu+Ll+Lt+Lm+Lo+Nl+Other_Alphabetic @stable ICU 2.1 */ + UCHAR_ALPHABETIC Property = 0 + /** First constant for binary Unicode properties. @stable ICU 2.1 */ + UCHAR_BINARY_START = UCHAR_ALPHABETIC + /** Binary property ASCII_Hex_Digit. 0-9 A-F a-f @stable ICU 2.1 */ + UCHAR_ASCII_HEX_DIGIT Property = 1 + /** Binary property Bidi_Control. + Format controls which have specific functions + in the Bidi Algorithm. @stable ICU 2.1 */ + UCHAR_BIDI_CONTROL Property = 2 + /** Binary property Bidi_Mirrored. + Characters that may change display in RTL text. + Same as u_isMirrored. + See Bidi Algorithm, UTR 9. @stable ICU 2.1 */ + UCHAR_BIDI_MIRRORED Property = 3 + /** Binary property Dash. Variations of dashes. @stable ICU 2.1 */ + UCHAR_DASH Property = 4 + /** Binary property Default_Ignorable_Code_Point (new in Unicode 3.2). + Ignorable in most processing. 
+ <2060..206F, FFF0..FFFB, E0000..E0FFF>+Other_Default_Ignorable_Code_Point+(Cf+Cc+Cs-White_Space) @stable ICU 2.1 */ + UCHAR_DEFAULT_IGNORABLE_CODE_POINT Property = 5 + /** Binary property Deprecated (new in Unicode 3.2). + The usage of deprecated characters is strongly discouraged. @stable ICU 2.1 */ + UCHAR_DEPRECATED Property = 6 + /** Binary property Diacritic. Characters that linguistically modify + the meaning of another character to which they apply. @stable ICU 2.1 */ + UCHAR_DIACRITIC Property = 7 + /** Binary property Extender. + Extend the value or shape of a preceding alphabetic character, + e.g., length and iteration marks. @stable ICU 2.1 */ + UCHAR_EXTENDER Property = 8 + /** Binary property Full_Composition_Exclusion. + CompositionExclusions.txt+Singleton Decompositions+ + Non-Starter Decompositions. @stable ICU 2.1 */ + UCHAR_FULL_COMPOSITION_EXCLUSION Property = 9 + /** Binary property Grapheme_Base (new in Unicode 3.2). + For programmatic determination of grapheme cluster boundaries. + [0..10FFFF]-Cc-Cf-Cs-Co-Cn-Zl-Zp-Grapheme_Link-Grapheme_Extend-CGJ @stable ICU 2.1 */ + UCHAR_GRAPHEME_BASE Property = 10 + /** Binary property Grapheme_Extend (new in Unicode 3.2). + For programmatic determination of grapheme cluster boundaries. + Me+Mn+Mc+Other_Grapheme_Extend-Grapheme_Link-CGJ @stable ICU 2.1 */ + UCHAR_GRAPHEME_EXTEND Property = 11 + /** Binary property Grapheme_Link (new in Unicode 3.2). + For programmatic determination of grapheme cluster boundaries. @stable ICU 2.1 */ + UCHAR_GRAPHEME_LINK Property = 12 + /** Binary property Hex_Digit. + Characters commonly used for hexadecimal numbers. @stable ICU 2.1 */ + UCHAR_HEX_DIGIT Property = 13 + /** Binary property Hyphen. Dashes used to mark connections + between pieces of words, plus the Katakana middle dot. @stable ICU 2.1 */ + UCHAR_HYPHEN Property = 14 + /** Binary property ID_Continue. + Characters that can continue an identifier. 
+ DerivedCoreProperties.txt also says "NOTE: Cf characters should be filtered out." + ID_Start+Mn+Mc+Nd+Pc @stable ICU 2.1 */ + UCHAR_ID_CONTINUE Property = 15 + /** Binary property ID_Start. + Characters that can start an identifier. + Lu+Ll+Lt+Lm+Lo+Nl @stable ICU 2.1 */ + UCHAR_ID_START Property = 16 + /** Binary property Ideographic. + CJKV ideographs. @stable ICU 2.1 */ + UCHAR_IDEOGRAPHIC Property = 17 + /** Binary property IDS_Binary_Operator (new in Unicode 3.2). + For programmatic determination of + Ideographic Description Sequences. @stable ICU 2.1 */ + UCHAR_IDS_BINARY_OPERATOR Property = 18 + /** Binary property IDS_Trinary_Operator (new in Unicode 3.2). + For programmatic determination of + Ideographic Description Sequences. @stable ICU 2.1 */ + UCHAR_IDS_TRINARY_OPERATOR Property = 19 + /** Binary property Join_Control. + Format controls for cursive joining and ligation. @stable ICU 2.1 */ + UCHAR_JOIN_CONTROL Property = 20 + /** Binary property Logical_Order_Exception (new in Unicode 3.2). + Characters that do not use logical order and + require special handling in most processing. @stable ICU 2.1 */ + UCHAR_LOGICAL_ORDER_EXCEPTION Property = 21 + /** Binary property Lowercase. Same as u_isULowercase, different from u_islower. + Ll+Other_Lowercase @stable ICU 2.1 */ + UCHAR_LOWERCASE Property = 22 + /** Binary property Math. Sm+Other_Math @stable ICU 2.1 */ + UCHAR_MATH Property = 23 + /** Binary property Noncharacter_Code_Point. + Code points that are explicitly defined as illegal + for the encoding of characters. @stable ICU 2.1 */ + UCHAR_NONCHARACTER_CODE_POINT Property = 24 + /** Binary property Quotation_Mark. @stable ICU 2.1 */ + UCHAR_QUOTATION_MARK Property = 25 + /** Binary property Radical (new in Unicode 3.2). + For programmatic determination of + Ideographic Description Sequences. @stable ICU 2.1 */ + UCHAR_RADICAL Property = 26 + /** Binary property Soft_Dotted (new in Unicode 3.2). + Characters with a "soft dot", like i or j. 
+ An accent placed on these characters causes + the dot to disappear. @stable ICU 2.1 */ + UCHAR_SOFT_DOTTED Property = 27 + /** Binary property Terminal_Punctuation. + Punctuation characters that generally mark + the end of textual units. @stable ICU 2.1 */ + UCHAR_TERMINAL_PUNCTUATION Property = 28 + /** Binary property Unified_Ideograph (new in Unicode 3.2). + For programmatic determination of + Ideographic Description Sequences. @stable ICU 2.1 */ + UCHAR_UNIFIED_IDEOGRAPH Property = 29 + /** Binary property Uppercase. Same as u_isUUppercase, different from u_isupper. + Lu+Other_Uppercase @stable ICU 2.1 */ + UCHAR_UPPERCASE Property = 30 + /** Binary property White_Space. + Same as u_isUWhiteSpace, different from u_isspace and u_isWhitespace. + Space characters+TAB+CR+LF-ZWSP-ZWNBSP @stable ICU 2.1 */ + UCHAR_WHITE_SPACE Property = 31 + /** Binary property XID_Continue. + ID_Continue modified to allow closure under + normalization forms NFKC and NFKD. @stable ICU 2.1 */ + UCHAR_XID_CONTINUE Property = 32 + /** Binary property XID_Start. ID_Start modified to allow + closure under normalization forms NFKC and NFKD. @stable ICU 2.1 */ + UCHAR_XID_START Property = 33 + /** Binary property Case_Sensitive. Either the source of a case + mapping or _in_ the target of a case mapping. Not the same as + the general category Cased_Letter. @stable ICU 2.6 */ + UCHAR_CASE_SENSITIVE Property = 34 + /** Binary property STerm (new in Unicode 4.0.1). + Sentence Terminal. Used in UAX #29: Text Boundaries + (http://www.unicode.org/reports/tr29/) + @stable ICU 3.0 */ + UCHAR_S_TERM Property = 35 + /** Binary property Variation_Selector (new in Unicode 4.0.1). + Indicates all those characters that qualify as Variation Selectors. + For details on the behavior of these characters, + see StandardizedVariants.html and 15.6 Variation Selectors. + @stable ICU 3.0 */ + UCHAR_VARIATION_SELECTOR Property = 36 + /** Binary property NFD_Inert. 
+ ICU-specific property for characters that are inert under NFD, + i.e., they do not interact with adjacent characters. + See the documentation for the Normalizer2 class and the + Normalizer2::isInert() method. + @stable ICU 3.0 */ + UCHAR_NFD_INERT Property = 37 + /** Binary property NFKD_Inert. + ICU-specific property for characters that are inert under NFKD, + i.e., they do not interact with adjacent characters. + See the documentation for the Normalizer2 class and the + Normalizer2::isInert() method. + @stable ICU 3.0 */ + UCHAR_NFKD_INERT Property = 38 + /** Binary property NFC_Inert. + ICU-specific property for characters that are inert under NFC, + i.e., they do not interact with adjacent characters. + See the documentation for the Normalizer2 class and the + Normalizer2::isInert() method. + @stable ICU 3.0 */ + UCHAR_NFC_INERT Property = 39 + /** Binary property NFKC_Inert. + ICU-specific property for characters that are inert under NFKC, + i.e., they do not interact with adjacent characters. + See the documentation for the Normalizer2 class and the + Normalizer2::isInert() method. + @stable ICU 3.0 */ + UCHAR_NFKC_INERT Property = 40 + /** Binary Property Segment_Starter. + ICU-specific property for characters that are starters in terms of + Unicode normalization and combining character sequences. + They have ccc=0 and do not occur in non-initial position of the + canonical decomposition of any character + (like a-umlaut in NFD and a Jamo T in an NFD(Hangul LVT)). + ICU uses this property for segmenting a string for generating a set of + canonically equivalent strings, e.g. for canonical closure while + processing collation tailoring rules. + @stable ICU 3.0 */ + UCHAR_SEGMENT_STARTER Property = 41 + /** Binary property Pattern_Syntax (new in Unicode 4.1). 
+ See UAX #31 Identifier and Pattern Syntax + (http://www.unicode.org/reports/tr31/) + @stable ICU 3.4 */ + UCHAR_PATTERN_SYNTAX Property = 42 + /** Binary property Pattern_White_Space (new in Unicode 4.1). + See UAX #31 Identifier and Pattern Syntax + (http://www.unicode.org/reports/tr31/) + @stable ICU 3.4 */ + UCHAR_PATTERN_WHITE_SPACE Property = 43 + /** Binary property alnum (a C/POSIX character class). + Implemented according to the UTS #18 Annex C Standard Recommendation. + See the uchar.h file documentation. + @stable ICU 3.4 */ + UCHAR_POSIX_ALNUM Property = 44 + /** Binary property blank (a C/POSIX character class). + Implemented according to the UTS #18 Annex C Standard Recommendation. + See the uchar.h file documentation. + @stable ICU 3.4 */ + UCHAR_POSIX_BLANK Property = 45 + /** Binary property graph (a C/POSIX character class). + Implemented according to the UTS #18 Annex C Standard Recommendation. + See the uchar.h file documentation. + @stable ICU 3.4 */ + UCHAR_POSIX_GRAPH Property = 46 + /** Binary property print (a C/POSIX character class). + Implemented according to the UTS #18 Annex C Standard Recommendation. + See the uchar.h file documentation. + @stable ICU 3.4 */ + UCHAR_POSIX_PRINT Property = 47 + /** Binary property xdigit (a C/POSIX character class). + Implemented according to the UTS #18 Annex C Standard Recommendation. + See the uchar.h file documentation. + @stable ICU 3.4 */ + UCHAR_POSIX_XDIGIT Property = 48 + /** Binary property Cased. For Lowercase, Uppercase and Titlecase characters. @stable ICU 4.4 */ + UCHAR_CASED Property = 49 + /** Binary property Case_Ignorable. Used in context-sensitive case mappings. @stable ICU 4.4 */ + UCHAR_CASE_IGNORABLE Property = 50 + /** Binary property Changes_When_Lowercased. @stable ICU 4.4 */ + UCHAR_CHANGES_WHEN_LOWERCASED Property = 51 + /** Binary property Changes_When_Uppercased. @stable ICU 4.4 */ + UCHAR_CHANGES_WHEN_UPPERCASED Property = 52 + /** Binary property Changes_When_Titlecased. 
@stable ICU 4.4 */ + UCHAR_CHANGES_WHEN_TITLECASED Property = 53 + /** Binary property Changes_When_Casefolded. @stable ICU 4.4 */ + UCHAR_CHANGES_WHEN_CASEFOLDED Property = 54 + /** Binary property Changes_When_Casemapped. @stable ICU 4.4 */ + UCHAR_CHANGES_WHEN_CASEMAPPED Property = 55 + /** Binary property Changes_When_NFKC_Casefolded. @stable ICU 4.4 */ + UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED Property = 56 + /** + * Binary property Emoji. + * See http://www.unicode.org/reports/tr51/#Emoji_Properties + * + * @stable ICU 57 + */ + UCHAR_EMOJI Property = 57 + /** + * Binary property Emoji_Presentation. + * See http://www.unicode.org/reports/tr51/#Emoji_Properties + * + * @stable ICU 57 + */ + UCHAR_EMOJI_PRESENTATION Property = 58 + /** + * Binary property Emoji_Modifier. + * See http://www.unicode.org/reports/tr51/#Emoji_Properties + * + * @stable ICU 57 + */ + UCHAR_EMOJI_MODIFIER Property = 59 + /** + * Binary property Emoji_Modifier_Base. + * See http://www.unicode.org/reports/tr51/#Emoji_Properties + * + * @stable ICU 57 + */ + UCHAR_EMOJI_MODIFIER_BASE Property = 60 + /** + * Binary property Emoji_Component. + * See http://www.unicode.org/reports/tr51/#Emoji_Properties + * + * @stable ICU 60 + */ + UCHAR_EMOJI_COMPONENT Property = 61 + /** + * Binary property Regional_Indicator. + * @stable ICU 60 + */ + UCHAR_REGIONAL_INDICATOR Property = 62 + /** + * Binary property Prepended_Concatenation_Mark. + * @stable ICU 60 + */ + UCHAR_PREPENDED_CONCATENATION_MARK Property = 63 + /** + * Binary property Extended_Pictographic. + * See http://www.unicode.org/reports/tr51/#Emoji_Properties + * + * @stable ICU 62 + */ + UCHAR_EXTENDED_PICTOGRAPHIC Property = 64 + + /** Enumerated property Bidi_Class. + Same as u_charDirection, returns UCharDirection values. @stable ICU 2.2 */ + UCHAR_BIDI_CLASS Property = 0x1000 + /** First constant for enumerated/integer Unicode properties. @stable ICU 2.2 */ + UCHAR_INT_START = UCHAR_BIDI_CLASS + /** Enumerated property Block. 
+ Same as ublock_getCode, returns UBlockCode values. @stable ICU 2.2 */ + UCHAR_BLOCK Property = 0x1001 + /** Enumerated property Canonical_Combining_Class. + Same as u_getCombiningClass, returns 8-bit numeric values. @stable ICU 2.2 */ + UCHAR_CANONICAL_COMBINING_CLASS Property = 0x1002 + /** Enumerated property Decomposition_Type. + Returns UDecompositionType values. @stable ICU 2.2 */ + UCHAR_DECOMPOSITION_TYPE Property = 0x1003 + /** Enumerated property East_Asian_Width. + See http://www.unicode.org/reports/tr11/ + Returns UEastAsianWidth values. @stable ICU 2.2 */ + UCHAR_EAST_ASIAN_WIDTH Property = 0x1004 + /** Enumerated property General_Category. + Same as u_charType, returns UCharCategory values. @stable ICU 2.2 */ + UCHAR_GENERAL_CATEGORY Property = 0x1005 + /** Enumerated property Joining_Group. + Returns UJoiningGroup values. @stable ICU 2.2 */ + UCHAR_JOINING_GROUP Property = 0x1006 + /** Enumerated property Joining_Type. + Returns UJoiningType values. @stable ICU 2.2 */ + UCHAR_JOINING_TYPE Property = 0x1007 + /** Enumerated property Line_Break. + Returns ULineBreak values. @stable ICU 2.2 */ + UCHAR_LINE_BREAK Property = 0x1008 + /** Enumerated property Numeric_Type. + Returns UNumericType values. @stable ICU 2.2 */ + UCHAR_NUMERIC_TYPE Property = 0x1009 + /** Enumerated property Script. + Same as uscript_getScript, returns UScriptCode values. @stable ICU 2.2 */ + UCHAR_SCRIPT Property = 0x100A + /** Enumerated property Hangul_Syllable_Type, new in Unicode 4. + Returns UHangulSyllableType values. @stable ICU 2.6 */ + UCHAR_HANGUL_SYLLABLE_TYPE Property = 0x100B + /** Enumerated property NFD_Quick_Check. + Returns UNormalizationCheckResult values. @stable ICU 3.0 */ + UCHAR_NFD_QUICK_CHECK Property = 0x100C + /** Enumerated property NFKD_Quick_Check. + Returns UNormalizationCheckResult values. @stable ICU 3.0 */ + UCHAR_NFKD_QUICK_CHECK Property = 0x100D + /** Enumerated property NFC_Quick_Check. + Returns UNormalizationCheckResult values. 
@stable ICU 3.0 */ + UCHAR_NFC_QUICK_CHECK Property = 0x100E + /** Enumerated property NFKC_Quick_Check. + Returns UNormalizationCheckResult values. @stable ICU 3.0 */ + UCHAR_NFKC_QUICK_CHECK Property = 0x100F + /** Enumerated property Lead_Canonical_Combining_Class. + ICU-specific property for the ccc of the first code point + of the decomposition, or lccc(c)=ccc(NFD(c)[0]). + Useful for checking for canonically ordered text; + see UNORM_FCD and http://www.unicode.org/notes/tn5/#FCD . + Returns 8-bit numeric values like UCHAR_CANONICAL_COMBINING_CLASS. @stable ICU 3.0 */ + UCHAR_LEAD_CANONICAL_COMBINING_CLASS Property = 0x1010 + /** Enumerated property Trail_Canonical_Combining_Class. + ICU-specific property for the ccc of the last code point + of the decomposition, or tccc(c)=ccc(NFD(c)[last]). + Useful for checking for canonically ordered text; + see UNORM_FCD and http://www.unicode.org/notes/tn5/#FCD . + Returns 8-bit numeric values like UCHAR_CANONICAL_COMBINING_CLASS. @stable ICU 3.0 */ + UCHAR_TRAIL_CANONICAL_COMBINING_CLASS Property = 0x1011 + /** Enumerated property Grapheme_Cluster_Break (new in Unicode 4.1). + Used in UAX #29: Text Boundaries + (http://www.unicode.org/reports/tr29/) + Returns UGraphemeClusterBreak values. @stable ICU 3.4 */ + UCHAR_GRAPHEME_CLUSTER_BREAK Property = 0x1012 + /** Enumerated property Sentence_Break (new in Unicode 4.1). + Used in UAX #29: Text Boundaries + (http://www.unicode.org/reports/tr29/) + Returns USentenceBreak values. @stable ICU 3.4 */ + UCHAR_SENTENCE_BREAK Property = 0x1013 + /** Enumerated property Word_Break (new in Unicode 4.1). + Used in UAX #29: Text Boundaries + (http://www.unicode.org/reports/tr29/) + Returns UWordBreakValues values. @stable ICU 3.4 */ + UCHAR_WORD_BREAK Property = 0x1014 + /** Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3). + Used in UAX #9: Unicode Bidirectional Algorithm + (http://www.unicode.org/reports/tr9/) + Returns UBidiPairedBracketType values. 
@stable ICU 52 */ + UCHAR_BIDI_PAIRED_BRACKET_TYPE Property = 0x1015 + /** + * Enumerated property Indic_Positional_Category. + * New in Unicode 6.0 as provisional property Indic_Matra_Category; + * renamed and changed to informative in Unicode 8.0. + * See http://www.unicode.org/reports/tr44/#IndicPositionalCategory.txt + * @stable ICU 63 + */ + UCHAR_INDIC_POSITIONAL_CATEGORY Property = 0x1016 + /** + * Enumerated property Indic_Syllabic_Category. + * New in Unicode 6.0 as provisional; informative since Unicode 8.0. + * See http://www.unicode.org/reports/tr44/#IndicSyllabicCategory.txt + * @stable ICU 63 + */ + UCHAR_INDIC_SYLLABIC_CATEGORY Property = 0x1017 + /** + * Enumerated property Vertical_Orientation. + * Used for UAX #50 Unicode Vertical Text Layout (https://www.unicode.org/reports/tr50/). + * New as a UCD property in Unicode 10.0. + * @stable ICU 63 + */ + UCHAR_VERTICAL_ORIENTATION Property = 0x1018 + + /** Bitmask property General_Category_Mask. + This is the General_Category property returned as a bit mask. + When used in u_getIntPropertyValue(c), same as U_MASK(u_charType(c)), + returns bit masks for UCharCategory values where exactly one bit is set. + When used with u_getPropertyValueName() and u_getPropertyValueEnum(), + a multi-bit mask is used for sets of categories like "Letters". + Mask values should be cast to uint32_t. + @stable ICU 2.4 */ + UCHAR_GENERAL_CATEGORY_MASK Property = 0x2000 + /** First constant for bit-mask Unicode properties. @stable ICU 2.4 */ + UCHAR_MASK_START = UCHAR_GENERAL_CATEGORY_MASK + /** Double property Numeric_Value. + Corresponds to u_getNumericValue. @stable ICU 2.4 */ + UCHAR_NUMERIC_VALUE Property = 0x3000 + /** First constant for double Unicode properties. @stable ICU 2.4 */ + UCHAR_DOUBLE_START = UCHAR_NUMERIC_VALUE + /** String property Age. + Corresponds to u_charAge. @stable ICU 2.4 */ + UCHAR_AGE Property = 0x4000 + /** First constant for string Unicode properties. 
@stable ICU 2.4 */ + UCHAR_STRING_START = UCHAR_AGE + /** String property Bidi_Mirroring_Glyph. + Corresponds to u_charMirror. @stable ICU 2.4 */ + UCHAR_BIDI_MIRRORING_GLYPH Property = 0x4001 + /** String property Case_Folding. + Corresponds to u_strFoldCase in ustring.h. @stable ICU 2.4 */ + UCHAR_CASE_FOLDING Property = 0x4002 + /** String property Lowercase_Mapping. + Corresponds to u_strToLower in ustring.h. @stable ICU 2.4 */ + UCHAR_LOWERCASE_MAPPING Property = 0x4004 + /** String property Name. + Corresponds to u_charName. @stable ICU 2.4 */ + UCHAR_NAME Property = 0x4005 + /** String property Simple_Case_Folding. + Corresponds to u_foldCase. @stable ICU 2.4 */ + UCHAR_SIMPLE_CASE_FOLDING Property = 0x4006 + /** String property Simple_Lowercase_Mapping. + Corresponds to u_tolower. @stable ICU 2.4 */ + UCHAR_SIMPLE_LOWERCASE_MAPPING Property = 0x4007 + /** String property Simple_Titlecase_Mapping. + Corresponds to u_totitle. @stable ICU 2.4 */ + UCHAR_SIMPLE_TITLECASE_MAPPING Property = 0x4008 + /** String property Simple_Uppercase_Mapping. + Corresponds to u_toupper. @stable ICU 2.4 */ + UCHAR_SIMPLE_UPPERCASE_MAPPING Property = 0x4009 + /** String property Titlecase_Mapping. + Corresponds to u_strToTitle in ustring.h. @stable ICU 2.4 */ + UCHAR_TITLECASE_MAPPING Property = 0x400A + /** String property Uppercase_Mapping. + Corresponds to u_strToUpper in ustring.h. @stable ICU 2.4 */ + UCHAR_UPPERCASE_MAPPING Property = 0x400C + /** String property Bidi_Paired_Bracket (new in Unicode 6.3). + Corresponds to u_getBidiPairedBracket. @stable ICU 52 */ + UCHAR_BIDI_PAIRED_BRACKET Property = 0x400D + + /** Miscellaneous property Script_Extensions (new in Unicode 6.0). + Some characters are commonly used in multiple scripts. + For more information, see UAX #24: http://www.unicode.org/reports/tr24/. + Corresponds to uscript_hasScript and uscript_getScriptExtensions in uscript.h. 
+ @stable ICU 4.6 */ + UCHAR_SCRIPT_EXTENSIONS Property = 0x7000 + /** First constant for Unicode properties with unusual value types. @stable ICU 4.6 */ + UCHAR_OTHER_PROPERTY_START = UCHAR_SCRIPT_EXTENSIONS + + /** Represents a nonexistent or invalid property or property value. @stable ICU 2.4 */ + UCHAR_INVALID_CODE Property = -1 +) + +const ( + UCHAR_BINARY_LIMIT = 65 + UCHAR_INT_LIMIT = 0x1019 + UCHAR_MASK_LIMIT = 0x2001 + UCHAR_STRING_LIMIT = 0x400E +) + +/* + * Properties in vector word 1 + * Each bit encodes one binary property. + * The following constants represent the bit number, use 1< 0; numRanges-- { + start := int32(pnames.valueMaps[i]) + limit := int32(pnames.valueMaps[i+1]) + i += 2 + if int32(prop) < start { + break + } + if int32(prop) < limit { + return i + (int32(prop)-start)*2 + } + i += (limit - start) * 2 + } + return 0 +} + +func getPropertyOrValueEnum(offset int32, alias string) int32 { + trie := bytestrie.New(pnames.byteTrie[offset:]) + if trie.ContainsName(alias) { + return trie.GetValue() + } + return -1 +} + +func ComparePropertyNames(name1, name2 string) int { + next := func(s string) (byte, string) { + for len(s) > 0 && (s[0] == 0x2d || s[0] == 0x5f || s[0] == 0x20 || (0x09 <= s[0] && s[0] <= 0x0d)) { + s = s[1:] + } + if len(s) == 0 { + return 0, "" + } + c := s[0] + s = s[1:] + if 'A' <= c && c <= 'Z' { + c += 'a' - 'A' + } + return c, s + } + + var r1, r2 byte + for { + r1, name1 = next(name1) + r2, name2 = next(name2) + + if r1 == 0 && r2 == 0 { + return 0 + } + + /* Compare the lowercased characters */ + if r1 != r2 { + return int(r1) - int(r2) + } + } +} + +func GetIntPropertyValue(c rune, which Property) int32 { + if which < UCHAR_INT_START { + if UCHAR_BINARY_START <= which && which < UCHAR_BINARY_LIMIT { + prop := binProps[which] + if prop.contains(prop, c, which) { + return 1 + } + return 0 + } + } else if which < UCHAR_INT_LIMIT { + iprop := intProps[which-UCHAR_INT_START] + return iprop.getValue(iprop, c, which) + } else 
if which == UCHAR_GENERAL_CATEGORY_MASK { + return int32(U_MASK(uchar.CharType(c))) + } + return 0 // undefined +} + +const ( + UPROPS_SCRIPT_X_MASK = 0x00f000ff + UPROPS_SCRIPT_X_SHIFT = 22 + + UPROPS_SCRIPT_HIGH_MASK = 0x00300000 + UPROPS_SCRIPT_HIGH_SHIFT = 12 + UPROPS_MAX_SCRIPT = 0x3ff + + UPROPS_SCRIPT_LOW_MASK = 0x000000ff + + UPROPS_SCRIPT_X_WITH_COMMON = 0x400000 + UPROPS_SCRIPT_X_WITH_INHERITED = 0x800000 + UPROPS_SCRIPT_X_WITH_OTHER = 0xc00000 +) + +func mergeScriptCodeOrIndex(scriptX uint32) uint32 { + return ((scriptX & UPROPS_SCRIPT_HIGH_MASK) >> UPROPS_SCRIPT_HIGH_SHIFT) | + (scriptX & UPROPS_SCRIPT_LOW_MASK) +} + +func GetScript(c rune) int32 { + if c > 0x10ffff { + return -1 + } + scriptX := uchar.GetUnicodeProperties(c, 0) & UPROPS_SCRIPT_X_MASK + codeOrIndex := mergeScriptCodeOrIndex(scriptX) + + if scriptX < UPROPS_SCRIPT_X_WITH_COMMON { + return int32(codeOrIndex) + } else if scriptX < UPROPS_SCRIPT_X_WITH_INHERITED { + return 0 + } else if scriptX < UPROPS_SCRIPT_X_WITH_OTHER { + return 1 + } else { + return int32(uchar.ScriptExtension(codeOrIndex)) + } +} diff --git a/go/mysql/icuregex/internal/uprops/uprops_binary.go b/go/mysql/icuregex/internal/uprops/uprops_binary.go new file mode 100644 index 00000000000..19e3141cbc5 --- /dev/null +++ b/go/mysql/icuregex/internal/uprops/uprops_binary.go @@ -0,0 +1,229 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
// BinaryProperty describes how one binary Unicode property is evaluated.
type BinaryProperty struct {
	// column is the properties-vector column read by defaultContains when
	// mask is non-zero. For entries with mask == 0 it holds the
	// PropertySource value instead (see the comment inside binProps).
	column PropertySource
	// mask selects the bit(s) of the properties-vector word that encode the
	// property; 0 means the property is computed by contains in code.
	mask uint32
	// contains reports whether code point c has the property. It receives
	// the entry itself and the Property being queried.
	contains func(prop *BinaryProperty, c rune, which Property) bool
}

// U_MASK returns a uint32 with only bit number x set (1 << x).
func U_MASK[T constraints.Integer](x T) uint32 {
	return 1 << x
}

// defaultContains is the contains implementation for properties stored
// directly in the properties vector: it reads column prop.column for c and
// tests it against prop.mask.
func defaultContains(prop *BinaryProperty, c rune, _ Property) bool {
	return (uchar.GetUnicodeProperties(c, int(prop.column)) & prop.mask) != 0
}

// binProps holds one BinaryProperty per binary Property. It is indexed
// directly by the Property value (see HasBinaryProperty), so the order of the
// entries must not change.
var binProps = [UCHAR_BINARY_LIMIT]*BinaryProperty{
	/*
	 * column and mask values for binary properties from u_getUnicodeProperties().
	 * Must be in order of corresponding UProperty,
	 * and there must be exactly one entry per binary UProperty.
	 *
	 * Properties with mask==0 are handled in code.
	 * For them, column is the UPropertySource value.
	 */
	{1, U_MASK(UPROPS_ALPHABETIC), defaultContains},
	{1, U_MASK(UPROPS_ASCII_HEX_DIGIT), defaultContains},
	{UPROPS_SRC_BIDI, 0, isBidiControl},
	{UPROPS_SRC_BIDI, 0, isMirrored},
	{1, U_MASK(UPROPS_DASH), defaultContains},
	{1, U_MASK(UPROPS_DEFAULT_IGNORABLE_CODE_POINT), defaultContains},
	{1, U_MASK(UPROPS_DEPRECATED), defaultContains},
	{1, U_MASK(UPROPS_DIACRITIC), defaultContains},
	{1, U_MASK(UPROPS_EXTENDER), defaultContains},
	{UPROPS_SRC_NFC, 0, hasFullCompositionExclusion},
	{1, U_MASK(UPROPS_GRAPHEME_BASE), defaultContains},
	{1, U_MASK(UPROPS_GRAPHEME_EXTEND), defaultContains},
	{1, U_MASK(UPROPS_GRAPHEME_LINK), defaultContains},
	{1, U_MASK(UPROPS_HEX_DIGIT), defaultContains},
	{1, U_MASK(UPROPS_HYPHEN), defaultContains},
	{1, U_MASK(UPROPS_ID_CONTINUE), defaultContains},
	{1, U_MASK(UPROPS_ID_START), defaultContains},
	{1, U_MASK(UPROPS_IDEOGRAPHIC), defaultContains},
	{1, U_MASK(UPROPS_IDS_BINARY_OPERATOR), defaultContains},
	{1, U_MASK(UPROPS_IDS_TRINARY_OPERATOR), defaultContains},
	{UPROPS_SRC_BIDI, 0, isJoinControl},
	{1, U_MASK(UPROPS_LOGICAL_ORDER_EXCEPTION), defaultContains},
	{UPROPS_SRC_CASE, 0, caseBinaryPropertyContains}, // UCHAR_LOWERCASE
	{1, U_MASK(UPROPS_MATH), defaultContains},
	{1, U_MASK(UPROPS_NONCHARACTER_CODE_POINT), defaultContains},
	{1, U_MASK(UPROPS_QUOTATION_MARK), defaultContains},
	{1, U_MASK(UPROPS_RADICAL), defaultContains},
	{UPROPS_SRC_CASE, 0, caseBinaryPropertyContains}, // UCHAR_SOFT_DOTTED
	{1, U_MASK(UPROPS_TERMINAL_PUNCTUATION), defaultContains},
	{1, U_MASK(UPROPS_UNIFIED_IDEOGRAPH), defaultContains},
	{UPROPS_SRC_CASE, 0, caseBinaryPropertyContains}, // UCHAR_UPPERCASE
	{1, U_MASK(UPROPS_WHITE_SPACE), defaultContains},
	{1, U_MASK(UPROPS_XID_CONTINUE), defaultContains},
	{1, U_MASK(UPROPS_XID_START), defaultContains},
	{UPROPS_SRC_CASE, 0, caseBinaryPropertyContains}, // UCHAR_CASE_SENSITIVE
	{1, U_MASK(UPROPS_S_TERM), defaultContains},
	{1, U_MASK(UPROPS_VARIATION_SELECTOR), defaultContains},
	{UPROPS_SRC_NFC, 0, isNormInert},  // UCHAR_NFD_INERT
	{UPROPS_SRC_NFKC, 0, isNormInert}, // UCHAR_NFKD_INERT
	{UPROPS_SRC_NFC, 0, isNormInert},  // UCHAR_NFC_INERT
	{UPROPS_SRC_NFKC, 0, isNormInert}, // UCHAR_NFKC_INERT
	{UPROPS_SRC_NFC_CANON_ITER, 0, isCanonSegmentStarter},
	{1, U_MASK(UPROPS_PATTERN_SYNTAX), defaultContains},
	{1, U_MASK(UPROPS_PATTERN_WHITE_SPACE), defaultContains},
	{UPROPS_SRC_CHAR_AND_PROPSVEC, 0, isPOSIX_alnum},
	{UPROPS_SRC_CHAR, 0, isPOSIX_blank},
	{UPROPS_SRC_CHAR, 0, isPOSIX_graph},
	{UPROPS_SRC_CHAR, 0, isPOSIX_print},
	{UPROPS_SRC_CHAR, 0, isPOSIX_xdigit},
	{UPROPS_SRC_CASE, 0, caseBinaryPropertyContains}, // UCHAR_CASED
	{UPROPS_SRC_CASE, 0, caseBinaryPropertyContains}, // UCHAR_CASE_IGNORABLE
	{UPROPS_SRC_CASE, 0, caseBinaryPropertyContains}, // UCHAR_CHANGES_WHEN_LOWERCASED
	{UPROPS_SRC_CASE, 0, caseBinaryPropertyContains}, // UCHAR_CHANGES_WHEN_UPPERCASED
	{UPROPS_SRC_CASE, 0, caseBinaryPropertyContains}, // UCHAR_CHANGES_WHEN_TITLECASED
	{UPROPS_SRC_CASE_AND_NORM, 0, changesWhenCasefolded},
	{UPROPS_SRC_CASE, 0, caseBinaryPropertyContains}, // UCHAR_CHANGES_WHEN_CASEMAPPED
	{UPROPS_SRC_NFKC_CF, 0, changesWhenNFKC_Casefolded},
	{2, U_MASK(UPROPS_2_EMOJI), defaultContains},
	{2, U_MASK(UPROPS_2_EMOJI_PRESENTATION), defaultContains},
	{2, U_MASK(UPROPS_2_EMOJI_MODIFIER), defaultContains},
	{2, U_MASK(UPROPS_2_EMOJI_MODIFIER_BASE), defaultContains},
	{2, U_MASK(UPROPS_2_EMOJI_COMPONENT), defaultContains},
	{2, 0, isRegionalIndicator},
	{1, U_MASK(UPROPS_PREPENDED_CONCATENATION_MARK), defaultContains},
	{2, U_MASK(UPROPS_2_EXTENDED_PICTOGRAPHIC), defaultContains},
}

// isBidiControl delegates to ubidi.IsBidiControl; prop and which are unused.
func isBidiControl(prop *BinaryProperty, c rune, which Property) bool {
	return ubidi.IsBidiControl(c)
}

// isMirrored delegates to ubidi.IsMirrored; prop and which are unused.
func isMirrored(prop *BinaryProperty, c rune, which Property) bool {
	return ubidi.IsMirrored(c)
}

// isRegionalIndicator reports whether c lies in U+1F1E6..U+1F1FF (inclusive).
func isRegionalIndicator(prop *BinaryProperty, c rune, which Property) bool {
	return 0x1F1E6 <= c && c <= 0x1F1FF
}

// changesWhenNFKC_Casefolded is not implemented yet: calling it panics.
func changesWhenNFKC_Casefolded(prop *BinaryProperty, c rune, which Property) bool {
	panic("TODO")
}

// changesWhenCasefolded is not implemented yet: calling it panics.
func changesWhenCasefolded(prop *BinaryProperty, c rune, which Property) bool {
	panic("TODO")
}

// isPOSIX_xdigit delegates to uchar.IsXDigit; prop and which are unused.
func isPOSIX_xdigit(prop *BinaryProperty, c rune, which Property) bool {
	return uchar.IsXDigit(c)
}

// isPOSIX_print delegates to uchar.IsPOSIXPrint; prop and which are unused.
func isPOSIX_print(prop *BinaryProperty, c rune, which Property) bool {
	return uchar.IsPOSIXPrint(c)
}

// isPOSIX_graph delegates to uchar.IsGraphPOSIX; prop and which are unused.
func isPOSIX_graph(prop *BinaryProperty, c rune, which Property) bool {
	return uchar.IsGraphPOSIX(c)
}

// isPOSIX_blank delegates to uchar.IsBlank; prop and which are unused.
func isPOSIX_blank(prop *BinaryProperty, c rune, which Property) bool {
	return uchar.IsBlank(c)
}

// isPOSIX_alnum reports whether c has the Alphabetic bit set in column 1 of
// the properties vector or is a digit according to uchar.IsDigit.
func isPOSIX_alnum(prop *BinaryProperty, c rune, which Property) bool {
	return (uchar.GetUnicodeProperties(c, 1)&U_MASK(UPROPS_ALPHABETIC)) != 0 || uchar.IsDigit(c)
}

// isCanonSegmentStarter is not implemented yet: calling it panics.
func isCanonSegmentStarter(prop *BinaryProperty, c rune, which Property) bool {
	panic("TODO")
}

// isJoinControl delegates to ubidi.IsJoinControl; prop and which are unused.
func isJoinControl(prop *BinaryProperty, c rune, which Property) bool {
	return ubidi.IsJoinControl(c)
}

// hasFullCompositionExclusion is not implemented yet: calling it panics.
func hasFullCompositionExclusion(prop *BinaryProperty, c rune, which Property) bool {
	panic("TODO")
}

// caseBinaryPropertyContains is the contains implementation shared by the
// case-related table entries; it forwards to HasBinaryPropertyUcase, which
// dispatches on which.
func caseBinaryPropertyContains(prop *BinaryProperty, c rune, which Property) bool {
	return HasBinaryPropertyUcase(c, which)
}

// HasBinaryPropertyUcase evaluates the case-related binary properties for c
// using the ucase package. It returns false for every Property it does not
// handle.
func HasBinaryPropertyUcase(c rune, which Property) bool {
	/* case mapping properties */
	switch which {
	case UCHAR_LOWERCASE:
		return ucase.UCASE_LOWER == ucase.GetType(c)
	case UCHAR_UPPERCASE:
		return ucase.UCASE_UPPER == ucase.GetType(c)
	case UCHAR_SOFT_DOTTED:
		return ucase.IsSoftDotted(c)
	case UCHAR_CASE_SENSITIVE:
		return ucase.IsCaseSensitive(c)
	case UCHAR_CASED:
		return ucase.UCASE_NONE != ucase.GetType(c)
	case UCHAR_CASE_IGNORABLE:
		return (ucase.GetTypeOrIgnorable(c) >> 2) != 0
	/*
	 * Note: The following Changes_When_Xyz are defined as testing whether
	 * the NFD form of the input changes when Xyz-case-mapped.
	 * However, this simpler implementation of these properties,
	 * ignoring NFD, passes the tests.
	 * The implementation needs to be changed if the tests start failing.
	 * When that happens, optimizations should be used to work with the
	 * per-single-code point ucase_toFullXyz() functions unless
	 * the NFD form has more than one code point,
	 * and the property starts set needs to be the union of the
	 * start sets for normalization and case mappings.
	 */
	case UCHAR_CHANGES_WHEN_LOWERCASED:
		return ucase.ToFullLower(c) >= 0
	case UCHAR_CHANGES_WHEN_UPPERCASED:
		return ucase.ToFullUpper(c) >= 0
	case UCHAR_CHANGES_WHEN_TITLECASED:
		return ucase.ToFullTitle(c) >= 0
	/* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
	case UCHAR_CHANGES_WHEN_CASEMAPPED:
		return ucase.ToFullLower(c) >= 0 || ucase.ToFullUpper(c) >= 0 || ucase.ToFullTitle(c) >= 0
	default:
		return false
	}
}

// isNormInert is not implemented yet: calling it panics.
func isNormInert(prop *BinaryProperty, c rune, which Property) bool {
	panic("TODO")
}

// HasBinaryProperty reports whether code point c has the binary property
// which. It returns false when which is outside the range
// [UCHAR_BINARY_START, UCHAR_BINARY_LIMIT); otherwise it evaluates the
// binProps entry for which.
//
// NOTE(review): several table entries are backed by functions that panic
// with "TODO", so querying those properties panics.
func HasBinaryProperty(c rune, which Property) bool {
	if which < UCHAR_BINARY_START || UCHAR_BINARY_LIMIT <= which {
		return false
	}
	prop := binProps[which]
	return prop.contains(prop, c, which)
}
// IntPropertyGetValue computes the value of an integer/enumerated property
// for code point c. It receives the table entry itself and the Property
// being queried.
type IntPropertyGetValue func(prop *IntProperty, c rune, which Property) int32

// IntProperty describes how one integer/enumerated Unicode property is
// evaluated.
type IntProperty struct {
	// column is the properties-vector column read by defaultGetValue when
	// mask is non-zero. For entries with mask == 0 it holds the
	// PropertySource value instead (see the comment inside intProps).
	column PropertySource
	// mask selects the bits of the properties-vector word that hold the
	// value; 0 means the value is computed by getValue in code.
	mask uint32
	// shift is the right-shift applied by defaultGetValue after masking.
	// NOTE(review): entries with mask == 0 store values such as 0xff or
	// "COUNT - 1" here, which looks like the property's maximum value as in
	// the original ICU table - confirm.
	shift int32
	// getValue computes the property value for a code point.
	getValue IntPropertyGetValue
}

// Masks and shifts used by the intProps entries that read their value
// straight out of a properties-vector word.
const (
	UPROPS_BLOCK_MASK  = 0x0001ff00
	UPROPS_BLOCK_SHIFT = 8

	UPROPS_EA_MASK  = 0x000e0000
	UPROPS_EA_SHIFT = 17

	UPROPS_LB_MASK  = 0x03f00000
	UPROPS_LB_SHIFT = 20

	UPROPS_SB_MASK  = 0x000f8000
	UPROPS_SB_SHIFT = 15

	UPROPS_WB_MASK  = 0x00007c00
	UPROPS_WB_SHIFT = 10

	UPROPS_GCB_MASK  = 0x000003e0
	UPROPS_GCB_SHIFT = 5

	UPROPS_DT_MASK = 0x0000001f
)

// NormalizationCheckResult is the result type of the normalization
// quick-check properties.
type NormalizationCheckResult int32

const (
	/**
	 * The input string is not in the normalization form.
	 * @stable ICU 2.0
	 */
	UNORM_NO NormalizationCheckResult = iota
	/**
	 * The input string is in the normalization form.
	 * @stable ICU 2.0
	 */
	UNORM_YES
	/**
	 * The input string may or may not be in the normalization form.
	 * This value is only returned for composition forms like NFC and FCC,
	 * when a backward-combining character is found for which the surrounding text
	 * would have to be analyzed further.
	 * @stable ICU 2.0
	 */
	UNORM_MAYBE
)

// NumericType is the value type of the Numeric_Type property.
type NumericType int32

/**
 * Numeric Type constants.
 *
 * @see UCHAR_NUMERIC_TYPE
 * @stable ICU 2.2
 */
const (
	/*
	 * Note: UNumericType constants are parsed by preparseucd.py.
	 * It matches lines like
	 *     U_NT_<Unicode Numeric_Type value name>
	 */

	U_NT_NONE    NumericType = iota /*[None]*/
	U_NT_DECIMAL                    /*[de]*/
	U_NT_DIGIT                      /*[di]*/
	U_NT_NUMERIC                    /*[nu]*/
	/**
	 * One more than the highest normal UNumericType value.
	 * The highest value is available via u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE).
	 *
	 * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
	 */
	U_NT_COUNT
)

/**
 * Hangul Syllable Type constants.
 *
 * @see UCHAR_HANGUL_SYLLABLE_TYPE
 * @stable ICU 2.6
 */

// HangunSyllableType is the value type of the Hangul_Syllable_Type property.
// NOTE(review): the identifier is spelled "Hangun" rather than "Hangul";
// renaming it would change the package's exported API.
type HangunSyllableType int32

const (
	/*
	 * Note: UHangulSyllableType constants are parsed by preparseucd.py.
	 * It matches lines like
	 *     U_HST_<Unicode Hangul_Syllable_Type value name>
	 */

	U_HST_NOT_APPLICABLE HangunSyllableType = iota /*[NA]*/
	U_HST_LEADING_JAMO                             /*[L]*/
	U_HST_VOWEL_JAMO                               /*[V]*/
	U_HST_TRAILING_JAMO                            /*[T]*/
	U_HST_LV_SYLLABLE                              /*[LV]*/
	U_HST_LVT_SYLLABLE                             /*[LVT]*/
	/**
	 * One more than the highest normal UHangulSyllableType value.
	 * The highest value is available via u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE).
	 *
	 * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
	 */
	U_HST_COUNT
)

// intProps holds one IntProperty per integer/enumerated Property. The array
// length is UCHAR_INT_LIMIT - UCHAR_INT_START and, per the comment below, the
// entries follow the order of the Property constants, so that order must not
// change.
var intProps = [UCHAR_INT_LIMIT - UCHAR_INT_START]*IntProperty{
	/*
	 * column, mask and shift values for int-value properties from u_getUnicodeProperties().
	 * Must be in order of corresponding UProperty,
	 * and there must be exactly one entry per int UProperty.
	 *
	 * Properties with mask==0 are handled in code.
	 * For them, column is the UPropertySource value.
	 */
	{UPROPS_SRC_BIDI, 0, 0, getBiDiClass},
	{0, UPROPS_BLOCK_MASK, UPROPS_BLOCK_SHIFT, defaultGetValue},
	{UPROPS_SRC_NFC, 0, 0xff, getCombiningClass},
	{2, UPROPS_DT_MASK, 0, defaultGetValue},
	{0, UPROPS_EA_MASK, UPROPS_EA_SHIFT, defaultGetValue},
	{UPROPS_SRC_CHAR, 0, uchar2.U_CHAR_CATEGORY_COUNT - 1, getGeneralCategory},
	{UPROPS_SRC_BIDI, 0, 0, getJoiningGroup},
	{UPROPS_SRC_BIDI, 0, 0, getJoiningType},
	{2, UPROPS_LB_MASK, UPROPS_LB_SHIFT, defaultGetValue},
	{UPROPS_SRC_CHAR, 0, int32(U_NT_COUNT - 1), getNumericType},
	{UPROPS_SRC_PROPSVEC, 0, 0, getScript},
	{UPROPS_SRC_PROPSVEC, 0, int32(U_HST_COUNT - 1), getHangulSyllableType},
	// UCHAR_NFD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes"
	{UPROPS_SRC_NFC, 0, int32(UNORM_YES), getNormQuickCheck},
	// UCHAR_NFKD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes"
	{UPROPS_SRC_NFKC, 0, int32(UNORM_YES), getNormQuickCheck},
	// UCHAR_NFC_QUICK_CHECK: max=2=MAYBE
	{UPROPS_SRC_NFC, 0, int32(UNORM_MAYBE), getNormQuickCheck},
	// UCHAR_NFKC_QUICK_CHECK: max=2=MAYBE
	{UPROPS_SRC_NFKC, 0, int32(UNORM_MAYBE), getNormQuickCheck},
	{UPROPS_SRC_NFC, 0, 0xff, getLeadCombiningClass},
	{UPROPS_SRC_NFC, 0, 0xff, getTrailCombiningClass},
	{2, UPROPS_GCB_MASK, UPROPS_GCB_SHIFT, defaultGetValue},
	{2, UPROPS_SB_MASK, UPROPS_SB_SHIFT, defaultGetValue},
	{2, UPROPS_WB_MASK, UPROPS_WB_SHIFT, defaultGetValue},
	{UPROPS_SRC_BIDI, 0, 0, getBiDiPairedBracketType},
	{UPROPS_SRC_INPC, 0, 0, getInPC},
	{UPROPS_SRC_INSC, 0, 0, getInSC},
	{UPROPS_SRC_VO, 0, 0, getVo},
}

// getVo returns the value stored for c in ulayout.VoTrie().
func getVo(prop *IntProperty, c rune, which Property) int32 {
	return int32(ulayout.VoTrie().Get(c))
}

// getInSC returns the value stored for c in ulayout.InscTrie().
func getInSC(prop *IntProperty, c rune, which Property) int32 {
	return int32(ulayout.InscTrie().Get(c))
}

// getInPC returns the value stored for c in ulayout.InpcTrie().
func getInPC(prop *IntProperty, c rune, which Property) int32 {
	return int32(ulayout.InpcTrie().Get(c))
}

// getBiDiPairedBracketType delegates to ubidi.PairedBracketType.
func getBiDiPairedBracketType(prop *IntProperty, c rune, which Property) int32 {
	return int32(ubidi.PairedBracketType(c))
}

// getTrailCombiningClass is not implemented yet: calling it panics.
func getTrailCombiningClass(prop *IntProperty, c rune, which Property) int32 {
	panic("TODO")
}

// getLeadCombiningClass is not implemented yet: calling it panics.
func getLeadCombiningClass(prop *IntProperty, c rune, which Property) int32 {
	panic("TODO")
}

// getNormQuickCheck is not implemented yet: calling it panics.
func getNormQuickCheck(prop *IntProperty, c rune, which Property) int32 {
	panic("TODO")
}

/*
 * Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
 * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
 */
var gcbToHst = []HangunSyllableType{
	U_HST_NOT_APPLICABLE, /* U_GCB_OTHER */
	U_HST_NOT_APPLICABLE, /* U_GCB_CONTROL */
	U_HST_NOT_APPLICABLE, /* U_GCB_CR */
	U_HST_NOT_APPLICABLE, /* U_GCB_EXTEND */
	U_HST_LEADING_JAMO,   /* U_GCB_L */
	U_HST_NOT_APPLICABLE, /* U_GCB_LF */
	U_HST_LV_SYLLABLE,    /* U_GCB_LV */
	U_HST_LVT_SYLLABLE,   /* U_GCB_LVT */
	U_HST_TRAILING_JAMO,  /* U_GCB_T */
	U_HST_VOWEL_JAMO,     /* U_GCB_V */
	/*
	 * Omit GCB values beyond what we need for hst.
	 * The code below checks for the array length.
	 */
}

// getHangulSyllableType derives the Hangul syllable type of c from its
// Grapheme_Cluster_Break value: the GCB bits are extracted from column 2 of
// the properties vector and mapped through gcbToHst. GCB values past the end
// of that table yield U_HST_NOT_APPLICABLE.
func getHangulSyllableType(prop *IntProperty, c rune, which Property) int32 {
	/* see comments on gcbToHst[] above */
	gcb := (int32(uchar2.GetUnicodeProperties(c, 2)) & UPROPS_GCB_MASK) >> UPROPS_GCB_SHIFT

	if gcb < int32(len(gcbToHst)) {
		return int32(gcbToHst[gcb])
	} else {
		return int32(U_HST_NOT_APPLICABLE)
	}
}

// getScript adapts GetScript to the IntPropertyGetValue signature.
func getScript(_ *IntProperty, c rune, _ Property) int32 {
	return GetScript(c)
}

// getNumericType classifies the numeric-type value of c with ntvGetType.
func getNumericType(prop *IntProperty, c rune, which Property) int32 {
	ntv := uchar2.NumericTypeValue(c)
	return int32(ntvGetType(ntv))
}

// getJoiningType delegates to ubidi.JoiningType.
func getJoiningType(prop *IntProperty, c rune, which Property) int32 {
	return int32(ubidi.JoiningType(c))
}

// getJoiningGroup delegates to ubidi.JoiningGroup.
func getJoiningGroup(prop *IntProperty, c rune, which Property) int32 {
	return int32(ubidi.JoiningGroup(c))
}

// getGeneralCategory delegates to uchar.CharType.
func getGeneralCategory(prop *IntProperty, c rune, which Property) int32 {
	return int32(uchar2.CharType(c))
}

// getCombiningClass is not implemented yet: calling it panics.
func getCombiningClass(prop *IntProperty, c rune, which Property) int32 {
	panic("TODO")
}

// defaultGetValue is the getValue implementation for properties stored
// directly in the properties vector: it reads column prop.column for c,
// keeps the bits selected by prop.mask and shifts the result right by
// prop.shift.
func defaultGetValue(prop *IntProperty, c rune, which Property) int32 {
	return int32(uchar2.GetUnicodeProperties(c, int(prop.column))&prop.mask) >> prop.shift
}

// getBiDiClass delegates to ubidi.Class.
func getBiDiClass(prop *IntProperty, c rune, which Property) int32 {
	return int32(ubidi.Class(c))
}

// ntvGetType maps a raw numeric-type value to its NumericType:
// UPROPS_NTV_NONE is U_NT_NONE, values below UPROPS_NTV_DIGIT_START are
// U_NT_DECIMAL, values below UPROPS_NTV_NUMERIC_START are U_NT_DIGIT, and
// everything else is U_NT_NUMERIC.
func ntvGetType(ntv uint16) NumericType {
	switch {
	case ntv == uchar2.UPROPS_NTV_NONE:
		return U_NT_NONE
	case ntv < uchar2.UPROPS_NTV_DIGIT_START:
		return U_NT_DECIMAL
	case ntv < uchar2.UPROPS_NTV_NUMERIC_START:
		return U_NT_DIGIT
	default:
		return U_NT_NUMERIC
	}
}
+ +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package uprops + +import ( + "vitess.io/vitess/go/mysql/icuregex/internal/uchar" +) + +/** + * Constants for ISO 15924 script codes. + * + * The current set of script code constants supports at least all scripts + * that are encoded in the version of Unicode which ICU currently supports. + * The names of the constants are usually derived from the + * Unicode script property value aliases. + * See UAX #24 Unicode Script Property (http://www.unicode.org/reports/tr24/) + * and http://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt . + * + * In addition, constants for many ISO 15924 script codes + * are included, for use with language tags, CLDR data, and similar. + * Some of those codes are not used in the Unicode Character Database (UCD). + * For example, there are no characters that have a UCD script property value of + * Hans or Hant. All Han ideographs have the Hani script property value in Unicode. + * + * Private-use codes Qaaa..Qabx are not included, except as used in the UCD or in CLDR. 
+ * + * Starting with ICU 55, script codes are only added when their scripts + * have been or will certainly be encoded in Unicode, + * and have been assigned Unicode script property value aliases, + * to ensure that their script names are stable and match the names of the constants. + * Script codes like Latf and Aran that are not subject to separate encoding + * may be added at any time. + * + * @stable ICU 2.2 + */ +type UScriptCode int32 + +/* + * Note: UScriptCode constants and their ISO script code comments + * are parsed by preparseucd.py. + * It matches lines like + * USCRIPT_ = , / * * / + */ + +const ( + /** @stable ICU 2.2 */ + USCRIPT_INVALID_CODE UScriptCode = -1 + /** @stable ICU 2.2 */ + USCRIPT_COMMON UScriptCode = 0 /* Zyyy */ + /** @stable ICU 2.2 */ + USCRIPT_INHERITED UScriptCode = 1 /* Zinh */ /* "Code for inherited script", for non-spacing combining marks; also Qaai */ + /** @stable ICU 2.2 */ + USCRIPT_ARABIC UScriptCode = 2 /* Arab */ + /** @stable ICU 2.2 */ + USCRIPT_ARMENIAN UScriptCode = 3 /* Armn */ + /** @stable ICU 2.2 */ + USCRIPT_BENGALI UScriptCode = 4 /* Beng */ + /** @stable ICU 2.2 */ + USCRIPT_BOPOMOFO UScriptCode = 5 /* Bopo */ + /** @stable ICU 2.2 */ + USCRIPT_CHEROKEE UScriptCode = 6 /* Cher */ + /** @stable ICU 2.2 */ + USCRIPT_COPTIC UScriptCode = 7 /* Copt */ + /** @stable ICU 2.2 */ + USCRIPT_CYRILLIC UScriptCode = 8 /* Cyrl */ + /** @stable ICU 2.2 */ + USCRIPT_DESERET UScriptCode = 9 /* Dsrt */ + /** @stable ICU 2.2 */ + USCRIPT_DEVANAGARI UScriptCode = 10 /* Deva */ + /** @stable ICU 2.2 */ + USCRIPT_ETHIOPIC UScriptCode = 11 /* Ethi */ + /** @stable ICU 2.2 */ + USCRIPT_GEORGIAN UScriptCode = 12 /* Geor */ + /** @stable ICU 2.2 */ + USCRIPT_GOTHIC UScriptCode = 13 /* Goth */ + /** @stable ICU 2.2 */ + USCRIPT_GREEK UScriptCode = 14 /* Grek */ + /** @stable ICU 2.2 */ + USCRIPT_GUJARATI UScriptCode = 15 /* Gujr */ + /** @stable ICU 2.2 */ + USCRIPT_GURMUKHI UScriptCode = 16 /* Guru */ + /** @stable ICU 2.2 */ + 
USCRIPT_HAN UScriptCode = 17 /* Hani */ + /** @stable ICU 2.2 */ + USCRIPT_HANGUL UScriptCode = 18 /* Hang */ + /** @stable ICU 2.2 */ + USCRIPT_HEBREW UScriptCode = 19 /* Hebr */ + /** @stable ICU 2.2 */ + USCRIPT_HIRAGANA UScriptCode = 20 /* Hira */ + /** @stable ICU 2.2 */ + USCRIPT_KANNADA UScriptCode = 21 /* Knda */ + /** @stable ICU 2.2 */ + USCRIPT_KATAKANA UScriptCode = 22 /* Kana */ + /** @stable ICU 2.2 */ + USCRIPT_KHMER UScriptCode = 23 /* Khmr */ + /** @stable ICU 2.2 */ + USCRIPT_LAO UScriptCode = 24 /* Laoo */ + /** @stable ICU 2.2 */ + USCRIPT_LATIN UScriptCode = 25 /* Latn */ + /** @stable ICU 2.2 */ + USCRIPT_MALAYALAM UScriptCode = 26 /* Mlym */ + /** @stable ICU 2.2 */ + USCRIPT_MONGOLIAN UScriptCode = 27 /* Mong */ + /** @stable ICU 2.2 */ + USCRIPT_MYANMAR UScriptCode = 28 /* Mymr */ + /** @stable ICU 2.2 */ + USCRIPT_OGHAM UScriptCode = 29 /* Ogam */ + /** @stable ICU 2.2 */ + USCRIPT_OLD_ITALIC UScriptCode = 30 /* Ital */ + /** @stable ICU 2.2 */ + USCRIPT_ORIYA UScriptCode = 31 /* Orya */ + /** @stable ICU 2.2 */ + USCRIPT_RUNIC UScriptCode = 32 /* Runr */ + /** @stable ICU 2.2 */ + USCRIPT_SINHALA UScriptCode = 33 /* Sinh */ + /** @stable ICU 2.2 */ + USCRIPT_SYRIAC UScriptCode = 34 /* Syrc */ + /** @stable ICU 2.2 */ + USCRIPT_TAMIL UScriptCode = 35 /* Taml */ + /** @stable ICU 2.2 */ + USCRIPT_TELUGU UScriptCode = 36 /* Telu */ + /** @stable ICU 2.2 */ + USCRIPT_THAANA UScriptCode = 37 /* Thaa */ + /** @stable ICU 2.2 */ + USCRIPT_THAI UScriptCode = 38 /* Thai */ + /** @stable ICU 2.2 */ + USCRIPT_TIBETAN UScriptCode = 39 /* Tibt */ + /** Canadian_Aboriginal script. @stable ICU 2.6 */ + USCRIPT_CANADIAN_ABORIGINAL UScriptCode = 40 /* Cans */ + /** Canadian_Aboriginal script (alias). 
@stable ICU 2.2 */ + USCRIPT_UCAS UScriptCode = USCRIPT_CANADIAN_ABORIGINAL + /** @stable ICU 2.2 */ + USCRIPT_YI UScriptCode = 41 /* Yiii */ + /* New scripts in Unicode 3.2 */ + /** @stable ICU 2.2 */ + USCRIPT_TAGALOG UScriptCode = 42 /* Tglg */ + /** @stable ICU 2.2 */ + USCRIPT_HANUNOO UScriptCode = 43 /* Hano */ + /** @stable ICU 2.2 */ + USCRIPT_BUHID UScriptCode = 44 /* Buhd */ + /** @stable ICU 2.2 */ + USCRIPT_TAGBANWA UScriptCode = 45 /* Tagb */ + + /* New scripts in Unicode 4 */ + /** @stable ICU 2.6 */ + USCRIPT_BRAILLE UScriptCode = 46 /* Brai */ + /** @stable ICU 2.6 */ + USCRIPT_CYPRIOT UScriptCode = 47 /* Cprt */ + /** @stable ICU 2.6 */ + USCRIPT_LIMBU UScriptCode = 48 /* Limb */ + /** @stable ICU 2.6 */ + USCRIPT_LINEAR_B UScriptCode = 49 /* Linb */ + /** @stable ICU 2.6 */ + USCRIPT_OSMANYA UScriptCode = 50 /* Osma */ + /** @stable ICU 2.6 */ + USCRIPT_SHAVIAN UScriptCode = 51 /* Shaw */ + /** @stable ICU 2.6 */ + USCRIPT_TAI_LE UScriptCode = 52 /* Tale */ + /** @stable ICU 2.6 */ + USCRIPT_UGARITIC UScriptCode = 53 /* Ugar */ + + /** New script code in Unicode 4.0.1 @stable ICU 3.0 */ + USCRIPT_KATAKANA_OR_HIRAGANA = 54 /*Hrkt */ + + /* New scripts in Unicode 4.1 */ + /** @stable ICU 3.4 */ + USCRIPT_BUGINESE UScriptCode = 55 /* Bugi */ + /** @stable ICU 3.4 */ + USCRIPT_GLAGOLITIC UScriptCode = 56 /* Glag */ + /** @stable ICU 3.4 */ + USCRIPT_KHAROSHTHI UScriptCode = 57 /* Khar */ + /** @stable ICU 3.4 */ + USCRIPT_SYLOTI_NAGRI UScriptCode = 58 /* Sylo */ + /** @stable ICU 3.4 */ + USCRIPT_NEW_TAI_LUE UScriptCode = 59 /* Talu */ + /** @stable ICU 3.4 */ + USCRIPT_TIFINAGH UScriptCode = 60 /* Tfng */ + /** @stable ICU 3.4 */ + USCRIPT_OLD_PERSIAN UScriptCode = 61 /* Xpeo */ + + /* New script codes from Unicode and ISO 15924 */ + /** @stable ICU 3.6 */ + USCRIPT_BALINESE UScriptCode = 62 /* Bali */ + /** @stable ICU 3.6 */ + USCRIPT_BATAK UScriptCode = 63 /* Batk */ + /** @stable ICU 3.6 */ + USCRIPT_BLISSYMBOLS UScriptCode = 64 /* Blis */ + /** 
@stable ICU 3.6 */ + USCRIPT_BRAHMI UScriptCode = 65 /* Brah */ + /** @stable ICU 3.6 */ + USCRIPT_CHAM UScriptCode = 66 /* Cham */ + /** @stable ICU 3.6 */ + USCRIPT_CIRTH UScriptCode = 67 /* Cirt */ + /** @stable ICU 3.6 */ + USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC UScriptCode = 68 /* Cyrs */ + /** @stable ICU 3.6 */ + USCRIPT_DEMOTIC_EGYPTIAN UScriptCode = 69 /* Egyd */ + /** @stable ICU 3.6 */ + USCRIPT_HIERATIC_EGYPTIAN UScriptCode = 70 /* Egyh */ + /** @stable ICU 3.6 */ + USCRIPT_EGYPTIAN_HIEROGLYPHS UScriptCode = 71 /* Egyp */ + /** @stable ICU 3.6 */ + USCRIPT_KHUTSURI UScriptCode = 72 /* Geok */ + /** @stable ICU 3.6 */ + USCRIPT_SIMPLIFIED_HAN UScriptCode = 73 /* Hans */ + /** @stable ICU 3.6 */ + USCRIPT_TRADITIONAL_HAN UScriptCode = 74 /* Hant */ + /** @stable ICU 3.6 */ + USCRIPT_PAHAWH_HMONG UScriptCode = 75 /* Hmng */ + /** @stable ICU 3.6 */ + USCRIPT_OLD_HUNGARIAN UScriptCode = 76 /* Hung */ + /** @stable ICU 3.6 */ + USCRIPT_HARAPPAN_INDUS UScriptCode = 77 /* Inds */ + /** @stable ICU 3.6 */ + USCRIPT_JAVANESE UScriptCode = 78 /* Java */ + /** @stable ICU 3.6 */ + USCRIPT_KAYAH_LI UScriptCode = 79 /* Kali */ + /** @stable ICU 3.6 */ + USCRIPT_LATIN_FRAKTUR UScriptCode = 80 /* Latf */ + /** @stable ICU 3.6 */ + USCRIPT_LATIN_GAELIC UScriptCode = 81 /* Latg */ + /** @stable ICU 3.6 */ + USCRIPT_LEPCHA UScriptCode = 82 /* Lepc */ + /** @stable ICU 3.6 */ + USCRIPT_LINEAR_A UScriptCode = 83 /* Lina */ + /** @stable ICU 4.6 */ + USCRIPT_MANDAIC UScriptCode = 84 /* Mand */ + /** @stable ICU 3.6 */ + USCRIPT_MANDAEAN UScriptCode = USCRIPT_MANDAIC + /** @stable ICU 3.6 */ + USCRIPT_MAYAN_HIEROGLYPHS UScriptCode = 85 /* Maya */ + /** @stable ICU 4.6 */ + USCRIPT_MEROITIC_HIEROGLYPHS UScriptCode = 86 /* Mero */ + /** @stable ICU 3.6 */ + USCRIPT_MEROITIC UScriptCode = USCRIPT_MEROITIC_HIEROGLYPHS + /** @stable ICU 3.6 */ + USCRIPT_NKO UScriptCode = 87 /* Nkoo */ + /** @stable ICU 3.6 */ + USCRIPT_ORKHON UScriptCode = 88 /* Orkh */ + /** @stable ICU 3.6 */ + 
USCRIPT_OLD_PERMIC UScriptCode = 89 /* Perm */ + /** @stable ICU 3.6 */ + USCRIPT_PHAGS_PA UScriptCode = 90 /* Phag */ + /** @stable ICU 3.6 */ + USCRIPT_PHOENICIAN UScriptCode = 91 /* Phnx */ + /** @stable ICU 52 */ + USCRIPT_MIAO UScriptCode = 92 /* Plrd */ + /** @stable ICU 3.6 */ + USCRIPT_PHONETIC_POLLARD UScriptCode = USCRIPT_MIAO + /** @stable ICU 3.6 */ + USCRIPT_RONGORONGO UScriptCode = 93 /* Roro */ + /** @stable ICU 3.6 */ + USCRIPT_SARATI UScriptCode = 94 /* Sara */ + /** @stable ICU 3.6 */ + USCRIPT_ESTRANGELO_SYRIAC UScriptCode = 95 /* Syre */ + /** @stable ICU 3.6 */ + USCRIPT_WESTERN_SYRIAC UScriptCode = 96 /* Syrj */ + /** @stable ICU 3.6 */ + USCRIPT_EASTERN_SYRIAC UScriptCode = 97 /* Syrn */ + /** @stable ICU 3.6 */ + USCRIPT_TENGWAR UScriptCode = 98 /* Teng */ + /** @stable ICU 3.6 */ + USCRIPT_VAI UScriptCode = 99 /* Vaii */ + /** @stable ICU 3.6 */ + USCRIPT_VISIBLE_SPEECH UScriptCode = 100 /* Visp */ + /** @stable ICU 3.6 */ + USCRIPT_CUNEIFORM UScriptCode = 101 /* Xsux */ + /** @stable ICU 3.6 */ + USCRIPT_UNWRITTEN_LANGUAGES UScriptCode = 102 /* Zxxx */ + /** @stable ICU 3.6 */ + USCRIPT_UNKNOWN UScriptCode = 103 /* Zzzz */ /* Unknown="Code for uncoded script", for unassigned code points */ + + /** @stable ICU 3.8 */ + USCRIPT_CARIAN UScriptCode = 104 /* Cari */ + /** @stable ICU 3.8 */ + USCRIPT_JAPANESE UScriptCode = 105 /* Jpan */ + /** @stable ICU 3.8 */ + USCRIPT_LANNA UScriptCode = 106 /* Lana */ + /** @stable ICU 3.8 */ + USCRIPT_LYCIAN UScriptCode = 107 /* Lyci */ + /** @stable ICU 3.8 */ + USCRIPT_LYDIAN UScriptCode = 108 /* Lydi */ + /** @stable ICU 3.8 */ + USCRIPT_OL_CHIKI UScriptCode = 109 /* Olck */ + /** @stable ICU 3.8 */ + USCRIPT_REJANG UScriptCode = 110 /* Rjng */ + /** @stable ICU 3.8 */ + USCRIPT_SAURASHTRA UScriptCode = 111 /* Saur */ + /** Sutton SignWriting @stable ICU 3.8 */ + USCRIPT_SIGN_WRITING UScriptCode = 112 /* Sgnw */ + /** @stable ICU 3.8 */ + USCRIPT_SUNDANESE UScriptCode = 113 /* Sund */ + /** @stable ICU 
3.8 */ + USCRIPT_MOON UScriptCode = 114 /* Moon */ + /** @stable ICU 3.8 */ + USCRIPT_MEITEI_MAYEK UScriptCode = 115 /* Mtei */ + + /** @stable ICU 4.0 */ + USCRIPT_IMPERIAL_ARAMAIC UScriptCode = 116 /* Armi */ + /** @stable ICU 4.0 */ + USCRIPT_AVESTAN UScriptCode = 117 /* Avst */ + /** @stable ICU 4.0 */ + USCRIPT_CHAKMA UScriptCode = 118 /* Cakm */ + /** @stable ICU 4.0 */ + USCRIPT_KOREAN UScriptCode = 119 /* Kore */ + /** @stable ICU 4.0 */ + USCRIPT_KAITHI UScriptCode = 120 /* Kthi */ + /** @stable ICU 4.0 */ + USCRIPT_MANICHAEAN UScriptCode = 121 /* Mani */ + /** @stable ICU 4.0 */ + USCRIPT_INSCRIPTIONAL_PAHLAVI UScriptCode = 122 /* Phli */ + /** @stable ICU 4.0 */ + USCRIPT_PSALTER_PAHLAVI UScriptCode = 123 /* Phlp */ + /** @stable ICU 4.0 */ + USCRIPT_BOOK_PAHLAVI UScriptCode = 124 /* Phlv */ + /** @stable ICU 4.0 */ + USCRIPT_INSCRIPTIONAL_PARTHIAN UScriptCode = 125 /* Prti */ + /** @stable ICU 4.0 */ + USCRIPT_SAMARITAN UScriptCode = 126 /* Samr */ + /** @stable ICU 4.0 */ + USCRIPT_TAI_VIET UScriptCode = 127 /* Tavt */ + /** @stable ICU 4.0 */ + USCRIPT_MATHEMATICAL_NOTATION UScriptCode = 128 /* Zmth */ + /** @stable ICU 4.0 */ + USCRIPT_SYMBOLS UScriptCode = 129 /* Zsym */ + + /** @stable ICU 4.4 */ + USCRIPT_BAMUM UScriptCode = 130 /* Bamu */ + /** @stable ICU 4.4 */ + USCRIPT_LISU UScriptCode = 131 /* Lisu */ + /** @stable ICU 4.4 */ + USCRIPT_NAKHI_GEBA UScriptCode = 132 /* Nkgb */ + /** @stable ICU 4.4 */ + USCRIPT_OLD_SOUTH_ARABIAN UScriptCode = 133 /* Sarb */ + + /** @stable ICU 4.6 */ + USCRIPT_BASSA_VAH UScriptCode = 134 /* Bass */ + /** @stable ICU 54 */ + USCRIPT_DUPLOYAN UScriptCode = 135 /* Dupl */ + /** @stable ICU 4.6 */ + USCRIPT_ELBASAN UScriptCode = 136 /* Elba */ + /** @stable ICU 4.6 */ + USCRIPT_GRANTHA UScriptCode = 137 /* Gran */ + /** @stable ICU 4.6 */ + USCRIPT_KPELLE UScriptCode = 138 /* Kpel */ + /** @stable ICU 4.6 */ + USCRIPT_LOMA UScriptCode = 139 /* Loma */ + /** Mende Kikakui @stable ICU 4.6 */ + USCRIPT_MENDE 
UScriptCode = 140 /* Mend */ + /** @stable ICU 4.6 */ + USCRIPT_MEROITIC_CURSIVE UScriptCode = 141 /* Merc */ + /** @stable ICU 4.6 */ + USCRIPT_OLD_NORTH_ARABIAN UScriptCode = 142 /* Narb */ + /** @stable ICU 4.6 */ + USCRIPT_NABATAEAN UScriptCode = 143 /* Nbat */ + /** @stable ICU 4.6 */ + USCRIPT_PALMYRENE UScriptCode = 144 /* Palm */ + /** @stable ICU 54 */ + USCRIPT_KHUDAWADI UScriptCode = 145 /* Sind */ + /** @stable ICU 4.6 */ + USCRIPT_SINDHI UScriptCode = USCRIPT_KHUDAWADI + /** @stable ICU 4.6 */ + USCRIPT_WARANG_CITI UScriptCode = 146 /* Wara */ + + /** @stable ICU 4.8 */ + USCRIPT_AFAKA UScriptCode = 147 /* Afak */ + /** @stable ICU 4.8 */ + USCRIPT_JURCHEN UScriptCode = 148 /* Jurc */ + /** @stable ICU 4.8 */ + USCRIPT_MRO UScriptCode = 149 /* Mroo */ + /** @stable ICU 4.8 */ + USCRIPT_NUSHU UScriptCode = 150 /* Nshu */ + /** @stable ICU 4.8 */ + USCRIPT_SHARADA UScriptCode = 151 /* Shrd */ + /** @stable ICU 4.8 */ + USCRIPT_SORA_SOMPENG UScriptCode = 152 /* Sora */ + /** @stable ICU 4.8 */ + USCRIPT_TAKRI UScriptCode = 153 /* Takr */ + /** @stable ICU 4.8 */ + USCRIPT_TANGUT UScriptCode = 154 /* Tang */ + /** @stable ICU 4.8 */ + USCRIPT_WOLEAI UScriptCode = 155 /* Wole */ + + /** @stable ICU 49 */ + USCRIPT_ANATOLIAN_HIEROGLYPHS UScriptCode = 156 /* Hluw */ + /** @stable ICU 49 */ + USCRIPT_KHOJKI UScriptCode = 157 /* Khoj */ + /** @stable ICU 49 */ + USCRIPT_TIRHUTA UScriptCode = 158 /* Tirh */ + + /** @stable ICU 52 */ + USCRIPT_CAUCASIAN_ALBANIAN UScriptCode = 159 /* Aghb */ + /** @stable ICU 52 */ + USCRIPT_MAHAJANI UScriptCode = 160 /* Mahj */ + + /** @stable ICU 54 */ + USCRIPT_AHOM UScriptCode = 161 /* Ahom */ + /** @stable ICU 54 */ + USCRIPT_HATRAN UScriptCode = 162 /* Hatr */ + /** @stable ICU 54 */ + USCRIPT_MODI UScriptCode = 163 /* Modi */ + /** @stable ICU 54 */ + USCRIPT_MULTANI UScriptCode = 164 /* Mult */ + /** @stable ICU 54 */ + USCRIPT_PAU_CIN_HAU UScriptCode = 165 /* Pauc */ + /** @stable ICU 54 */ + USCRIPT_SIDDHAM UScriptCode = 
166 /* Sidd */ + + /** @stable ICU 58 */ + USCRIPT_ADLAM UScriptCode = 167 /* Adlm */ + /** @stable ICU 58 */ + USCRIPT_BHAIKSUKI UScriptCode = 168 /* Bhks */ + /** @stable ICU 58 */ + USCRIPT_MARCHEN UScriptCode = 169 /* Marc */ + /** @stable ICU 58 */ + USCRIPT_NEWA UScriptCode = 170 /* Newa */ + /** @stable ICU 58 */ + USCRIPT_OSAGE UScriptCode = 171 /* Osge */ + + /** @stable ICU 58 */ + USCRIPT_HAN_WITH_BOPOMOFO UScriptCode = 172 /* Hanb */ + /** @stable ICU 58 */ + USCRIPT_JAMO UScriptCode = 173 /* Jamo */ + /** @stable ICU 58 */ + USCRIPT_SYMBOLS_EMOJI UScriptCode = 174 /* Zsye */ + + /** @stable ICU 60 */ + USCRIPT_MASARAM_GONDI UScriptCode = 175 /* Gonm */ + /** @stable ICU 60 */ + USCRIPT_SOYOMBO UScriptCode = 176 /* Soyo */ + /** @stable ICU 60 */ + USCRIPT_ZANABAZAR_SQUARE UScriptCode = 177 /* Zanb */ + + /** @stable ICU 62 */ + USCRIPT_DOGRA UScriptCode = 178 /* Dogr */ + /** @stable ICU 62 */ + USCRIPT_GUNJALA_GONDI UScriptCode = 179 /* Gong */ + /** @stable ICU 62 */ + USCRIPT_MAKASAR UScriptCode = 180 /* Maka */ + /** @stable ICU 62 */ + USCRIPT_MEDEFAIDRIN UScriptCode = 181 /* Medf */ + /** @stable ICU 62 */ + USCRIPT_HANIFI_ROHINGYA UScriptCode = 182 /* Rohg */ + /** @stable ICU 62 */ + USCRIPT_SOGDIAN UScriptCode = 183 /* Sogd */ + /** @stable ICU 62 */ + USCRIPT_OLD_SOGDIAN UScriptCode = 184 /* Sogo */ + + /** @stable ICU 64 */ + USCRIPT_ELYMAIC UScriptCode = 185 /* Elym */ + /** @stable ICU 64 */ + USCRIPT_NYIAKENG_PUACHUE_HMONG UScriptCode = 186 /* Hmnp */ + /** @stable ICU 64 */ + USCRIPT_NANDINAGARI UScriptCode = 187 /* Nand */ + /** @stable ICU 64 */ + USCRIPT_WANCHO UScriptCode = 188 /* Wcho */ + + /** @stable ICU 66 */ + USCRIPT_CHORASMIAN UScriptCode = 189 /* Chrs */ + /** @stable ICU 66 */ + USCRIPT_DIVES_AKURU UScriptCode = 190 /* Diak */ + /** @stable ICU 66 */ + USCRIPT_KHITAN_SMALL_SCRIPT UScriptCode = 191 /* Kits */ + /** @stable ICU 66 */ + USCRIPT_YEZIDI UScriptCode = 192 /* Yezi */ +) + +func UScriptHasScript(c rune, sc 
UScriptCode) bool { + scriptX := uchar.GetUnicodeProperties(c, 0) & UPROPS_SCRIPT_X_MASK + codeOrIndex := mergeScriptCodeOrIndex(scriptX) + if scriptX < UPROPS_SCRIPT_X_WITH_COMMON { + return sc == UScriptCode(codeOrIndex) + } + + scx := uchar.ScriptExtensions(codeOrIndex) + if scriptX >= UPROPS_SCRIPT_X_WITH_OTHER { + scx = uchar.ScriptExtensions(uint32(scx[1])) + } + sc32 := uint32(sc) + if sc32 > 0x7fff { + /* Guard against bogus input that would make us go past the Script_Extensions terminator. */ + return false + } + for sc32 > uint32(scx[0]) { + scx = scx[1:] + } + return sc32 == uint32(scx[0]&0x7fff) +} diff --git a/go/mysql/icuregex/internal/uset/close.go b/go/mysql/icuregex/internal/uset/close.go new file mode 100644 index 00000000000..02e1d117b52 --- /dev/null +++ b/go/mysql/icuregex/internal/uset/close.go @@ -0,0 +1,98 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package uset + +import ( + "vitess.io/vitess/go/mysql/icuregex/internal/ucase" +) + +type USet uint32 + +const ( + /** + * Ignore white space within patterns unless quoted or escaped. + * @stable ICU 2.4 + */ + USET_IGNORE_SPACE USet = 1 + + /** + * Enable case insensitive matching. 
E.g., "[ab]" with this flag + * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will + * match all except 'a', 'A', 'b', and 'B'. This performs a full + * closure over case mappings, e.g. U+017F for s. + * + * The resulting set is a superset of the input for the code points but + * not for the strings. + * It performs a case mapping closure of the code points and adds + * full case folding strings for the code points, and reduces strings of + * the original set to their full case folding equivalents. + * + * This is designed for case-insensitive matches, for example + * in regular expressions. The full code point case closure allows checking of + * an input character directly against the closure set. + * Strings are matched by comparing the case-folded form from the closure + * set with an incremental case folding of the string in question. + * + * The closure set will also contain single code points if the original + * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). + * This is not necessary (that is, redundant) for the above matching method + * but results in the same closure sets regardless of whether the original + * set contained the code point or a string. + * + * @stable ICU 2.4 + */ + USET_CASE_INSENSITIVE USet = 2 + + /** + * Enable case insensitive matching. E.g., "[ab]" with this flag + * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will + * match all except 'a', 'A', 'b', and 'B'. This adds the lower-, + * title-, and uppercase mappings as well as the case folding + * of each existing element in the set. 
+ * @stable ICU 3.2 + */ + USET_ADD_CASE_MAPPINGS USet = 4 +) + +func (u *UnicodeSet) CloseOver(attribute USet) { + if attribute&USET_ADD_CASE_MAPPINGS != 0 { + panic("USET_ADD_CASE_MAPPINGS is unsupported") + } + if (attribute & USET_CASE_INSENSITIVE) == 0 { + return + } + + foldSet := u.Clone() + n := u.rangeCount() + + for i := 0; i < n; i++ { + start := u.rangeStart(i) + end := u.rangeEnd(i) + + // full case closure + for cp := start; cp <= end; cp++ { + ucase.AddCaseClosure(cp, foldSet) + } + } + + *u = *foldSet +} diff --git a/go/mysql/icuregex/internal/uset/pattern.go b/go/mysql/icuregex/internal/uset/pattern.go new file mode 100644 index 00000000000..468e439c06e --- /dev/null +++ b/go/mysql/icuregex/internal/uset/pattern.go @@ -0,0 +1,107 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package uset + +import ( + "strings" + + "vitess.io/vitess/go/mysql/icuregex/internal/pattern" +) + +func (u *UnicodeSet) String() string { + var buf strings.Builder + u.ToPattern(&buf, true) + return buf.String() +} + +func (u *UnicodeSet) ToPattern(w *strings.Builder, escapeUnprintable bool) { + w.WriteByte('[') + + // // Check against the predefined categories. 
We implicitly build + // // up ALL category sets the first time toPattern() is called. + // for (int8_t cat=0; cat 1 && u.rangeStart(0) == MIN_VALUE && u.rangeEnd(count-1) == MAX_VALUE { + + // Emit the inverse + w.WriteByte('^') + + for i := 1; i < count; i++ { + start := u.rangeEnd(i-1) + 1 + end := u.rangeStart(i) - 1 + u.appendToPattern(w, start, escapeUnprintable) + if start != end { + if (start + 1) != end { + w.WriteByte('-') + } + u.appendToPattern(w, end, escapeUnprintable) + } + } + } else { + // Default; emit the ranges as pairs + for i := 0; i < count; i++ { + start := u.rangeStart(i) + end := u.rangeEnd(i) + u.appendToPattern(w, start, escapeUnprintable) + if start != end { + if (start + 1) != end { + w.WriteByte('-') + } + u.appendToPattern(w, end, escapeUnprintable) + } + } + } + + w.WriteByte(']') +} + +func (u *UnicodeSet) appendToPattern(w *strings.Builder, c rune, escapeUnprintable bool) { + if escapeUnprintable && pattern.IsUnprintable(c) { + // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything + // unprintable + pattern.EscapeUnprintable(w, c) + return + } + + // Okay to let ':' pass through + switch c { + case '[', ']', '-', '^', '&', '\\', '{', '}', ':', '$': + w.WriteByte('\\') + default: + // Escape whitespace + if pattern.IsWhitespace(c) { + w.WriteByte('\\') + } + } + w.WriteRune(c) +} diff --git a/go/mysql/icuregex/internal/uset/properties.go b/go/mysql/icuregex/internal/uset/properties.go new file mode 100644 index 00000000000..f9403f584ea --- /dev/null +++ b/go/mysql/icuregex/internal/uset/properties.go @@ -0,0 +1,417 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. 
+License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package uset + +import ( + "fmt" + "strconv" + "strings" + "sync" + + "vitess.io/vitess/go/mysql/icuregex/internal/pattern" + "vitess.io/vitess/go/mysql/icuregex/internal/ubidi" + "vitess.io/vitess/go/mysql/icuregex/internal/ucase" + uchar2 "vitess.io/vitess/go/mysql/icuregex/internal/uchar" + "vitess.io/vitess/go/mysql/icuregex/internal/uerror" + "vitess.io/vitess/go/mysql/icuregex/internal/ulayout" + "vitess.io/vitess/go/mysql/icuregex/internal/unames" + uprops2 "vitess.io/vitess/go/mysql/icuregex/internal/uprops" + "vitess.io/vitess/go/mysql/icuregex/internal/utrie" +) + +var inclusionsMu sync.Mutex +var inclusionsForSource = make(map[uprops2.PropertySource]*UnicodeSet) +var inclusionsForProperty = make(map[uprops2.Property]*UnicodeSet) + +func GetInclusionsForBinaryProperty(prop uprops2.Property) *UnicodeSet { + inclusionsMu.Lock() + defer inclusionsMu.Unlock() + return getInclusionsForBinaryProperty(prop) +} + +func getInclusionsForSource(src uprops2.PropertySource) *UnicodeSet { + if inc, ok := inclusionsForSource[src]; ok { + return inc + } + + u := New() + + switch src { + case uprops2.UPROPS_SRC_CHAR: + uchar2.AddPropertyStarts(u) + case uprops2.UPROPS_SRC_PROPSVEC: + uchar2.VecAddPropertyStarts(u) + case uprops2.UPROPS_SRC_CHAR_AND_PROPSVEC: + uchar2.AddPropertyStarts(u) + uchar2.VecAddPropertyStarts(u) + case 
uprops2.UPROPS_SRC_CASE_AND_NORM: + panic("TODO") + case uprops2.UPROPS_SRC_NFC: + panic("TODO") + case uprops2.UPROPS_SRC_NFKC: + panic("TODO") + case uprops2.UPROPS_SRC_NFKC_CF: + panic("TODO") + case uprops2.UPROPS_SRC_NFC_CANON_ITER: + panic("TODO") + case uprops2.UPROPS_SRC_CASE: + ucase.AddPropertyStarts(u) + case uprops2.UPROPS_SRC_BIDI: + ubidi.AddPropertyStarts(u) + case uprops2.UPROPS_SRC_INPC, uprops2.UPROPS_SRC_INSC, uprops2.UPROPS_SRC_VO: + AddULayoutPropertyStarts(src, u) + default: + panic(fmt.Sprintf("unsupported property source: %v", src)) + } + + inclusionsForSource[src] = u + return u +} + +func getInclusionsForProperty(prop uprops2.Property) *UnicodeSet { + if uprops2.UCHAR_INT_START <= prop && prop < uprops2.UCHAR_INT_LIMIT { + return getInclusionsForIntProperty(prop) + } + return getInclusionsForSource(prop.Source()) +} + +func GetInclusionsForProperty(prop uprops2.Property) *UnicodeSet { + inclusionsMu.Lock() + defer inclusionsMu.Unlock() + return getInclusionsForProperty(prop) +} + +func getInclusionsForBinaryProperty(prop uprops2.Property) *UnicodeSet { + if inc, ok := inclusionsForProperty[prop]; ok { + return inc + } + + incl := getInclusionsForProperty(prop) + set := New() + + numRanges := incl.rangeCount() + startHasProperty := rune(-1) + + for i := 0; i < numRanges; i++ { + rangeEnd := incl.rangeEnd(i) + for c := incl.rangeStart(i); c <= rangeEnd; c++ { + if uprops2.HasBinaryProperty(c, prop) { + if startHasProperty < 0 { + startHasProperty = c + } + } else if startHasProperty >= 0 { + set.AddRuneRange(startHasProperty, c-1) + startHasProperty = -1 + } + } + } + if startHasProperty >= 0 { + set.AddRuneRange(startHasProperty, MAX_VALUE) + } + + inclusionsForProperty[prop] = set + return set +} + +func getInclusionsForIntProperty(prop uprops2.Property) *UnicodeSet { + if inc, ok := inclusionsForProperty[prop]; ok { + return inc + } + + src := prop.Source() + incl := getInclusionsForSource(src) + + intPropIncl := New() + 
// mungeCharName normalizes the spacing of a character name so that it can be
// handed to an exact-match name lookup: leading spaces are dropped, runs of
// consecutive spaces are collapsed into a single space, and a trailing space
// is removed. Only the ASCII space character (' ') is treated as whitespace;
// all other bytes are copied through unchanged.
func mungeCharName(charname string) string {
	out := make([]byte, 0, len(charname))
	for i := 0; i < len(charname); i++ {
		ch := charname[i]
		// Skip a space that would start the output or follow another space.
		if ch == ' ' && (len(out) == 0 || out[len(out)-1] == ' ') {
			continue
		}
		out = append(out, ch)
	}

	// Collapsing leaves at most one space at the end; strip it so that
	// "NAME  " compares equal to "NAME", matching ICU's mungeCharName.
	if n := len(out); n > 0 && out[n-1] == ' ' {
		out = out[:n-1]
	}
	return string(out)
}
// isPOSIXOpen reports whether pattern begins with the POSIX-style property
// opener "[:". Inputs shorter than two bytes yield false.
func isPOSIXOpen(pattern string) bool {
	return strings.HasPrefix(pattern, "[:")
}

// isNameOpen reports whether pattern begins with the character-name opener
// `\N`. Inputs shorter than two bytes yield false.
func isNameOpen(pattern string) bool {
	return strings.HasPrefix(pattern, `\N`)
}

// isPerlOpen reports whether pattern begins with a Perl-style property
// opener, either `\p` or its negated form `\P`. Inputs shorter than two bytes
// yield false.
func isPerlOpen(pattern string) bool {
	return strings.HasPrefix(pattern, `\p`) || strings.HasPrefix(pattern, `\P`)
}
uprops2.UCHAR_LEAD_CANONICAL_COMBINING_CLASS { + val, err := strconv.ParseUint(value, 10, 8) + if err != nil { + return uerror.U_ILLEGAL_ARGUMENT_ERROR + } + v = int32(val) + } else { + return uerror.U_ILLEGAL_ARGUMENT_ERROR + } + } + } else { + switch p { + case uprops2.UCHAR_NUMERIC_VALUE: + val, err := strconv.ParseFloat(value, 64) + if err != nil { + return uerror.U_ILLEGAL_ARGUMENT_ERROR + } + u.applyFilter(GetInclusionsForProperty(p), func(ch rune) bool { + return uchar2.NumericValue(ch) == val + }) + return nil + case uprops2.UCHAR_NAME: + // Must munge name, since u_charFromName() does not do + // 'loose' matching. + charName := mungeCharName(value) + ch := unames.CharForName(unames.U_EXTENDED_CHAR_NAME, charName) + if ch < 0 { + return uerror.U_ILLEGAL_ARGUMENT_ERROR + } + u.Clear() + u.AddRune(ch) + return nil + case uprops2.UCHAR_AGE: + // Must munge name, since u_versionFromString() does not do + // 'loose' matching. + charName := mungeCharName(value) + version := uchar2.VersionFromString(charName) + u.applyFilter(GetInclusionsForProperty(p), func(ch rune) bool { + return uchar2.CharAge(ch) == version + }) + return nil + case uprops2.UCHAR_SCRIPT_EXTENSIONS: + v = uprops2.GetPropertyValueEnum(uprops2.UCHAR_SCRIPT, value) + if v == -1 { + return uerror.U_ILLEGAL_ARGUMENT_ERROR + } + default: + // p is a non-binary, non-enumerated property that we + // don't support (yet). + return uerror.U_ILLEGAL_ARGUMENT_ERROR + } + } + } else { + // value is empty. Interpret as General Category, Script, or + // Binary property. 
+ p = uprops2.UCHAR_GENERAL_CATEGORY_MASK + v = uprops2.GetPropertyValueEnum(p, prop) + if v == -1 { + p = uprops2.UCHAR_SCRIPT + v = uprops2.GetPropertyValueEnum(p, prop) + if v == -1 { + p = uprops2.GetPropertyEnum(prop) + if p >= uprops2.UCHAR_BINARY_START && p < uprops2.UCHAR_BINARY_LIMIT { + v = 1 + } else if 0 == uprops2.ComparePropertyNames("ANY", prop) { + u.Clear() + u.AddRuneRange(MIN_VALUE, MAX_VALUE) + return nil + } else if 0 == uprops2.ComparePropertyNames("ASCII", prop) { + u.Clear() + u.AddRuneRange(0, 0x7F) + return nil + } else if 0 == uprops2.ComparePropertyNames("Assigned", prop) { + // [:Assigned:]=[:^Cn:] + p = uprops2.UCHAR_GENERAL_CATEGORY_MASK + v = int32(uchar2.U_GC_CN_MASK) + invert = true + } else { + return uerror.U_ILLEGAL_ARGUMENT_ERROR + } + } + } + } + + u.ApplyIntPropertyValue(p, v) + if invert { + u.Complement() + } + return nil +} + +func AddULayoutPropertyStarts(src uprops2.PropertySource, u *UnicodeSet) { + var trie *utrie.UcpTrie + switch src { + case uprops2.UPROPS_SRC_INPC: + trie = ulayout.InpcTrie() + case uprops2.UPROPS_SRC_INSC: + trie = ulayout.InscTrie() + case uprops2.UPROPS_SRC_VO: + trie = ulayout.VoTrie() + default: + panic("unreachable") + } + + // Add the start code point of each same-value range of the trie. + var start, end rune + for { + end, _ = trie.GetRange(start, utrie.UCPMAP_RANGE_NORMAL, 0, nil) + if end < 0 { + break + } + u.AddRune(start) + start = end + 1 + } +} diff --git a/go/mysql/icuregex/internal/uset/unicode_set.go b/go/mysql/icuregex/internal/uset/unicode_set.go new file mode 100644 index 00000000000..56280265444 --- /dev/null +++ b/go/mysql/icuregex/internal/uset/unicode_set.go @@ -0,0 +1,653 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. 
+License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package uset + +import ( + "golang.org/x/exp/slices" + + "vitess.io/vitess/go/mysql/icuregex/internal/uprops" +) + +// HIGH_VALUE > all valid values. 110000 for codepoints +const UNICODESET_HIGH = 0x0110000 + +// LOW <= all valid values. ZERO for codepoints +const UNICODESET_LOW = 0x000000 + +/** Max list [0, 1, 2, ..., max code point, HIGH] */ +const MAX_LENGTH = UNICODESET_HIGH + 1 + +const ( + /** + * Minimum value that can be stored in a UnicodeSet. + * @stable ICU 2.4 + */ + MIN_VALUE = 0 + + /** + * Maximum value that can be stored in a UnicodeSet. 
+ * @stable ICU 2.4 + */ + MAX_VALUE = 0x10ffff +) + +type UnicodeSet struct { + list []rune + buffer []rune +} + +func New() *UnicodeSet { + buf := make([]rune, 1, 25) + buf[0] = UNICODESET_HIGH + return &UnicodeSet{list: buf} +} + +func FromRunes(list []rune) *UnicodeSet { + return &UnicodeSet{list: list} +} + +func ParsePattern(pattern string, flags USet) (*UnicodeSet, error) { + u := New() + if err := u.ApplyPropertyPattern(pattern); err != nil { + return nil, err + } + if flags&USET_CASE_INSENSITIVE != 0 { + u.CloseOver(USET_CASE_INSENSITIVE) + } + return u, nil +} + +func MustParsePattern(pattern string, flags USet) *UnicodeSet { + u, err := ParsePattern(pattern, flags) + if err != nil { + panic(err) + } + return u +} + +func (u *UnicodeSet) ensureBufferCapacity(c int) { + if cap(u.buffer) < c { + u.buffer = make([]rune, c) + return + } + u.buffer = u.buffer[:cap(u.buffer)] +} + +func (u *UnicodeSet) addbuffer(other []rune, polarity int8) { + u.ensureBufferCapacity(len(u.list) + len(other)) + + i := 1 + j := 1 + k := 0 + + a := u.list[0] + b := other[0] + + for { + switch polarity { + case 0: + if a < b { + if k > 0 && a <= u.buffer[k-1] { + k-- + a = max(u.list[i], u.buffer[k]) + } else { + u.buffer[k] = a + k++ + a = u.list[i] + } + i++ + polarity ^= 1 + } else if b < a { + if k > 0 && b <= u.buffer[k-1] { + k-- + b = max(other[j], u.buffer[k]) + } else { + u.buffer[k] = b + k++ + b = other[j] + } + j++ + polarity ^= 2 + } else { + if a == UNICODESET_HIGH { + goto loopEnd + } + if k > 0 && a <= u.buffer[k-1] { + k-- + a = max(u.list[i], u.buffer[k]) + } else { + u.buffer[k] = a + k++ + a = u.list[i] + } + i++ + polarity ^= 1 + b = other[j] + j++ + polarity ^= 2 + } + case 3: + if b <= a { + if a == UNICODESET_HIGH { + goto loopEnd + } + u.buffer[k] = a + k++ + } else { + if b == UNICODESET_HIGH { + goto loopEnd + } + u.buffer[k] = b + k++ + } + a = u.list[i] + i++ + polarity ^= 1 + b = other[j] + j++ + polarity ^= 2 + case 1: + if a < b { + u.buffer[k] = a 
+ k++ + a = u.list[i] + i++ + polarity ^= 1 + } else if b < a { + b = other[j] + j++ + polarity ^= 2 + } else { + if a == UNICODESET_HIGH { + goto loopEnd + } + a = u.list[i] + i++ + polarity ^= 1 + b = other[j] + j++ + polarity ^= 2 + } + case 2: + if b < a { + u.buffer[k] = b + k++ + b = other[j] + j++ + polarity ^= 2 + } else if a < b { + a = u.list[i] + i++ + polarity ^= 1 + } else { + if a == UNICODESET_HIGH { + goto loopEnd + } + a = u.list[i] + i++ + polarity ^= 1 + b = other[j] + j++ + polarity ^= 2 + } + } + } + +loopEnd: + u.buffer[k] = UNICODESET_HIGH + k++ + + u.list, u.buffer = u.buffer[:k], u.list +} + +func max(a, b rune) rune { + if a > b { + return a + } + return b +} + +func pinCodePoint(c *rune) rune { + if *c < UNICODESET_LOW { + *c = UNICODESET_LOW + } else if *c > (UNICODESET_HIGH - 1) { + *c = UNICODESET_HIGH - 1 + } + return *c +} + +func (u *UnicodeSet) AddRune(c rune) { + // find smallest i such that c < list[i] + // if odd, then it is IN the set + // if even, then it is OUT of the set + i := u.findCodePoint(pinCodePoint(&c)) + + // already in set? 
+ if (i & 1) != 0 { + return + } + + // HIGH is 0x110000 + // assert(list[len-1] == HIGH); + + // empty = [HIGH] + // [start_0, limit_0, start_1, limit_1, HIGH] + + // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] + // ^ + // list[i] + + // i == 0 means c is before the first range + if c == u.list[i]-1 { + // c is before start of next range + u.list[i] = c + // if we touched the HIGH mark, then add a new one + if c == (UNICODESET_HIGH - 1) { + u.list = append(u.list, UNICODESET_HIGH) + } + if i > 0 && c == u.list[i-1] { + // collapse adjacent ranges + + // [..., start_k-1, c, c, limit_k, ..., HIGH] + // ^ + // list[i] + for k := i - 1; k < len(u.list)-2; k++ { + u.list[k] = u.list[k+2] + } + u.list = u.list[:len(u.list)-2] + } + } else if i > 0 && c == u.list[i-1] { + // c is after end of prior range + u.list[i-1]++ + // no need to check for collapse here + } else { + // At this point we know the new char is not adjacent to + // any existing ranges, and it is not 10FFFF. + + // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] + // ^ + // list[i] + + // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH] + // ^ + // list[i] + u.list = slices.Insert(u.list, i, c, c+1) + } +} + +func (u *UnicodeSet) AddRuneRange(start, end rune) { + if pinCodePoint(&start) < pinCodePoint(&end) { + limit := end + 1 + // Fast path for adding a new range after the last one. + // Odd list length: [..., lastStart, lastLimit, HIGH] + if (len(u.list) & 1) != 0 { + // If the list is empty, set lastLimit low enough to not be adjacent to 0. + var lastLimit rune + if len(u.list) == 1 { + lastLimit = -2 + } else { + lastLimit = u.list[len(u.list)-2] + } + if lastLimit <= start { + if lastLimit == start { + // Extend the last range. 
+ u.list[len(u.list)-2] = limit + if limit == UNICODESET_HIGH { + u.list = u.list[:len(u.list)-1] + } + } else { + u.list[len(u.list)-1] = start + if limit < UNICODESET_HIGH { + u.list = append(u.list, limit) + u.list = append(u.list, UNICODESET_HIGH) + } else { // limit == UNICODESET_HIGH + u.list = append(u.list, UNICODESET_HIGH) + } + } + return + } + } + // This is slow. Could be much faster using findCodePoint(start) + // and modifying the list, dealing with adjacent & overlapping ranges. + addRange := [3]rune{start, limit, UNICODESET_HIGH} + u.addbuffer(addRange[:], 0) + } else if start == end { + u.AddRune(start) + } +} + +func (u *UnicodeSet) AddAll(u2 *UnicodeSet) { + if len(u2.list) > 0 { + u.addbuffer(u2.list, 0) + } +} + +func (u *UnicodeSet) Complement() { + if u.list[0] == UNICODESET_LOW { + copy(u.list, u.list[1:]) + u.list = u.list[:len(u.list)-1] + } else { + u.list = slices.Insert(u.list, 0, UNICODESET_LOW) + } +} + +func (u *UnicodeSet) RemoveRuneRange(start, end rune) { + if pinCodePoint(&start) < pinCodePoint(&end) { + range_ := [3]rune{start, end + 1, UNICODESET_HIGH} + u.retain(range_[:], 2) + } +} + +func (u *UnicodeSet) RemoveAll(c *UnicodeSet) { + u.retain(c.list, 2) +} + +func (u *UnicodeSet) RetainAll(c *UnicodeSet) { + u.retain(c.list, 0) +} + +func (u *UnicodeSet) retain(other []rune, polarity int8) { + u.ensureBufferCapacity(len(u.list) + len(other)) + + i := 1 + j := 1 + k := 0 + + a := u.list[0] + b := other[0] + + // change from xor is that we have to check overlapping pairs + // polarity bit 1 means a is second, bit 2 means b is. 
+ for { + switch polarity { + case 0: // both first; drop the smaller + if a < b { // drop a + a = u.list[i] + i++ + polarity ^= 1 + } else if b < a { // drop b + b = other[j] + j++ + polarity ^= 2 + } else { // a == b, take one, drop other + if a == UNICODESET_HIGH { + goto loop_end + } + u.buffer[k] = a + k++ + a = u.list[i] + i++ + polarity ^= 1 + b = other[j] + j++ + polarity ^= 2 + } + case 3: // both second; take lower if unequal + if a < b { // take a + u.buffer[k] = a + k++ + a = u.list[i] + i++ + polarity ^= 1 + } else if b < a { // take b + u.buffer[k] = b + k++ + b = other[j] + j++ + polarity ^= 2 + } else { // a == b, take one, drop other + if a == UNICODESET_HIGH { + goto loop_end + } + u.buffer[k] = a + k++ + a = u.list[i] + i++ + polarity ^= 1 + b = other[j] + j++ + polarity ^= 2 + } + case 1: // a second, b first; + if a < b { // NO OVERLAP, drop a + a = u.list[i] + i++ + polarity ^= 1 + } else if b < a { // OVERLAP, take b + u.buffer[k] = b + k++ + b = other[j] + j++ + polarity ^= 2 + } else { // a == b, drop both! + if a == UNICODESET_HIGH { + goto loop_end + } + a = u.list[i] + i++ + polarity ^= 1 + b = other[j] + j++ + polarity ^= 2 + } + case 2: // a first, b second; if a < b, overlap + if b < a { // no overlap, drop b + b = other[j] + j++ + polarity ^= 2 + } else if a < b { // OVERLAP, take a + u.buffer[k] = a + k++ + a = u.list[i] + i++ + polarity ^= 1 + } else { // a == b, drop both! 
+ if a == UNICODESET_HIGH { + goto loop_end + } + a = u.list[i] + i++ + polarity ^= 1 + b = other[j] + j++ + polarity ^= 2 + } + } + } + +loop_end: + u.buffer[k] = UNICODESET_HIGH // terminate + k++ + u.list, u.buffer = u.buffer[:k], u.list +} + +func (u *UnicodeSet) Clear() { + u.list = u.list[:1] + u.list[0] = UNICODESET_HIGH +} + +func (u *UnicodeSet) Len() (n int) { + count := u.rangeCount() + for i := 0; i < count; i++ { + n += int(u.rangeEnd(i)) - int(u.rangeStart(i)) + 1 + } + return +} + +func (u *UnicodeSet) rangeCount() int { + return len(u.list) / 2 +} + +func (u *UnicodeSet) rangeStart(idx int) rune { + return u.list[idx*2] +} + +func (u *UnicodeSet) rangeEnd(idx int) rune { + return u.list[idx*2+1] - 1 +} + +func (u *UnicodeSet) RuneAt(idx int) rune { + if idx >= 0 { + // len2 is the largest even integer <= len, that is, it is len + // for even values and len-1 for odd values. With odd values + // the last entry is UNICODESET_HIGH. + len2 := len(u.list) + if (len2 & 0x1) != 0 { + len2-- + } + + var i int + for i < len2 { + start := u.list[i] + count := int(u.list[i+1] - start) + i += 2 + if idx < count { + return start + rune(idx) + } + idx -= count + } + } + return -1 +} + +func (u *UnicodeSet) ContainsRune(c rune) bool { + if c >= UNICODESET_HIGH { + return false + } + i := u.findCodePoint(c) + return (i & 1) != 0 +} + +func (u *UnicodeSet) ContainsRuneRange(from, to rune) bool { + i := u.findCodePoint(from) + return (i&1) != 0 && to < u.list[i] +} + +func (u *UnicodeSet) findCodePoint(c rune) int { + /* Examples: + findCodePoint(c) + set list[] c=0 1 3 4 7 8 + === ============== =========== + [] [110000] 0 0 0 0 0 0 + [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 + [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 + [:Any:] [0, 110000] 1 1 1 1 1 1 + */ + + // Return the smallest i such that c < list[i]. Assume + // list[len - 1] == HIGH and that c is legal (0..HIGH-1). + if c < u.list[0] { + return 0 + } + + // High runner test. 
c is often after the last range, so an + // initial check for this condition pays off. + lo := 0 + hi := len(u.list) - 1 + if lo >= hi || c >= u.list[hi-1] { + return hi + } + + // invariant: c >= list[lo] + // invariant: c < list[hi] + for { + i := (lo + hi) >> 1 + if i == lo { + break // Found! + } else if c < u.list[i] { + hi = i + } else { + lo = i + } + } + return hi +} + +func (u *UnicodeSet) AddCategory(mask uint32) { + set := New() + set.ApplyIntPropertyValue(uprops.UCHAR_GENERAL_CATEGORY_MASK, int32(mask)) + u.AddAll(set) +} + +func (u *UnicodeSet) AddString(chars string) { + for _, c := range chars { + u.AddRune(c) + } +} + +type Filter func(ch rune) bool + +func (u *UnicodeSet) applyFilter(inclusions *UnicodeSet, filter Filter) { + // Logically, walk through all Unicode characters, noting the start + // and end of each range for which filter.contain(c) is + // true. Add each range to a set. + // + // To improve performance, use an inclusions set which + // encodes information about character ranges that are known + // to have identical properties. + // inclusions contains the first characters of + // same-value ranges for the given property. 
+ + u.Clear() + + startHasProperty := rune(-1) + limitRange := inclusions.rangeCount() + + for j := 0; j < limitRange; j++ { + // get current range + start := inclusions.rangeStart(j) + end := inclusions.rangeEnd(j) + + // for all the code points in the range, process + for ch := start; ch <= end; ch++ { + // only add to this UnicodeSet on inflection points -- + // where the hasProperty value changes to false + if filter(ch) { + if startHasProperty < 0 { + startHasProperty = ch + } + } else if startHasProperty >= 0 { + u.AddRuneRange(startHasProperty, ch-1) + startHasProperty = -1 + } + } + } + if startHasProperty >= 0 { + u.AddRuneRange(startHasProperty, 0x10FFFF) + } +} + +func (u *UnicodeSet) Clone() *UnicodeSet { + return &UnicodeSet{list: slices.Clone(u.list)} +} + +func (u *UnicodeSet) IsEmpty() bool { + return len(u.list) == 1 +} + +func (u *UnicodeSet) CopyFrom(set *UnicodeSet) { + u.list = slices.Clone(set.list) +} + +func (u *UnicodeSet) Equals(other *UnicodeSet) bool { + return slices.Equal(u.list, other.list) +} diff --git a/go/mysql/icuregex/internal/uset/unicode_set_test.go b/go/mysql/icuregex/internal/uset/unicode_set_test.go new file mode 100644 index 00000000000..908abd8889d --- /dev/null +++ b/go/mysql/icuregex/internal/uset/unicode_set_test.go @@ -0,0 +1,43 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package uset + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestSimpleBelong(t *testing.T) { + ss1 := New() + ss1.AddString("*?+[(){}^$|\\.") + ss2 := New() + ss2.AddString("*?+[(){}^$|\\.") + ss2.Complement() + ss3 := New() + ss3.AddRune('*') + ss3.AddRune('?') + + assert.True(t, ss1.ContainsRune('(')) + assert.False(t, ss2.ContainsRune('(')) + assert.True(t, ss3.ContainsRune('*')) +} diff --git a/go/mysql/icuregex/internal/utf16/helpers.go b/go/mysql/icuregex/internal/utf16/helpers.go new file mode 100644 index 00000000000..b87af5222fc --- /dev/null +++ b/go/mysql/icuregex/internal/utf16/helpers.go @@ -0,0 +1,76 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
// IsLead reports whether c is a lead (high) surrogate, U+D800..U+DBFF.
func IsLead(c rune) bool {
	// Dropping the low 10 bits leaves the surrogate block prefix.
	return uint32(c)>>10 == 0xd800>>10
}

// IsTrail reports whether c is a trail (low) surrogate, U+DC00..U+DFFF.
func IsTrail(c rune) bool {
	return uint32(c)>>10 == 0xdc00>>10
}

// IsSurrogate reports whether c is a surrogate code point (U+D800..U+DFFF).
func IsSurrogate(c rune) bool {
	return uint32(c)>>11 == 0xd800>>11
}

// IsSurrogateLead reports whether c is a lead surrogate, assuming the caller
// already knows that c is a surrogate (see IsSurrogate). Only bit 10 is tested.
func IsSurrogateLead(c rune) bool {
	return uint32(c)&0x400 != 0x400
}

// IsSurrogateTrail reports whether c is a trail surrogate, assuming the caller
// already knows that c is a surrogate (see IsSurrogate). Only bit 10 is tested.
func IsSurrogateTrail(c rune) bool {
	return uint32(c)&0x400 == 0x400
}

// DecodeRune combines the surrogate pair (a, b) into one code point by
// delegating to the standard library's utf16.DecodeRune.
func DecodeRune(a, b rune) rune {
	return utf16.DecodeRune(a, b)
}

// NextUnsafe decodes the first code point of s and returns it together with
// the remainder of s. It performs no bounds checking: s must be non-empty,
// and a lead surrogate must be followed by at least one more unit.
func NextUnsafe(s []uint16) (rune, []uint16) {
	first := rune(s[0])
	if IsLead(first) {
		return DecodeRune(first, rune(s[1])), s[2:]
	}
	return first, s[1:]
}
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package utrie + +import ( + "fmt" + + "vitess.io/vitess/go/mysql/icuregex/internal/udata" +) + +type UcpTrie struct { + Index []uint16 + Data8 []uint8 + Data16 []uint16 + Data32 []uint32 + + IndexLength, DataLength int + /** Start of the last range which ends at U+10FFFF. @internal */ + HighStart rune + Shifted12HighStart uint16 + + Type UCPTrieType + ValueWidth UCPTrieValueWidth + + /** + * Internal index-3 null block offset. + * Set to an impossibly high value (e.g., 0xffff) if there is no dedicated index-3 null block. + * @internal + */ + Index3NullOffset uint16 + /** + * Internal data null block offset, not shifted. + * Set to an impossibly high value (e.g., 0xfffff) if there is no dedicated data null block. + * @internal + */ + DataNullOffset int32 + + NullValue uint32 +} + +/** + * Selectors for the type of a UCPTrie. + * Different trade-offs for size vs. speed. + * + * @see umutablecptrie_buildImmutable + * @see ucptrie_openFromBinary + * @see ucptrie_getType + * @stable ICU 63 + */ +type UCPTrieType int8 + +const ( + /** + * For ucptrie_openFromBinary() to accept any type. + * ucptrie_getType() will return the actual type. + * @stable ICU 63 + */ + UCPTRIE_TYPE_ANY UCPTrieType = iota - 1 + /** + * Fast/simple/larger BMP data structure. Use functions and "fast" macros. + * @stable ICU 63 + */ + UCPTRIE_TYPE_FAST + /** + * Small/slower BMP data structure. Use functions and "small" macros. + * @stable ICU 63 + */ + UCPTRIE_TYPE_SMALL +) + +/** + * Selectors for the number of bits in a UCPTrie data value. 
+ * + * @see umutablecptrie_buildImmutable + * @see ucptrie_openFromBinary + * @see ucptrie_getValueWidth + * @stable ICU 63 + */ +type UCPTrieValueWidth int8 + +const ( + /** + * For ucptrie_openFromBinary() to accept any data value width. + * ucptrie_getValueWidth() will return the actual data value width. + * @stable ICU 63 + */ + UCPTRIE_VALUE_BITS_ANY UCPTrieValueWidth = iota - 1 + /** + * The trie stores 16 bits per data value. + * It returns them as unsigned values 0..0xffff=65535. + * @stable ICU 63 + */ + UCPTRIE_VALUE_BITS_16 + /** + * The trie stores 32 bits per data value. + * @stable ICU 63 + */ + UCPTRIE_VALUE_BITS_32 + /** + * The trie stores 8 bits per data value. + * It returns them as unsigned values 0..0xff=255. + * @stable ICU 63 + */ + UCPTRIE_VALUE_BITS_8 +) + +const UCPTRIE_SIG = 0x54726933 +const UCPTRIE_OE_SIG = 0x33697254 + +/** + * Constants for use with UCPTrieHeader.options. + * @internal + */ +const ( + UCPTRIE_OPTIONS_DATA_LENGTH_MASK = 0xf000 + UCPTRIE_OPTIONS_DATA_NULL_OFFSET_MASK = 0xf00 + UCPTRIE_OPTIONS_RESERVED_MASK = 0x38 + UCPTRIE_OPTIONS_VALUE_BITS_MASK = 7 + /** + * Value for index3NullOffset which indicates that there is no index-3 null block. + * Bit 15 is unused for this value because this bit is used if the index-3 contains + * 18-bit indexes. + */ + UCPTRIE_NO_INDEX3_NULL_OFFSET = 0x7fff + UCPTRIE_NO_DATA_NULL_OFFSET = 0xfffff +) + +const ( + /** @internal */ + UCPTRIE_FAST_SHIFT = 6 + + /** Number of entries in a data block for code points below the fast limit. 64=0x40 @internal */ + UCPTRIE_FAST_DATA_BLOCK_LENGTH = 1 << UCPTRIE_FAST_SHIFT + + /** Mask for getting the lower bits for the in-fast-data-block offset. @internal */ + UCPTRIE_FAST_DATA_MASK = UCPTRIE_FAST_DATA_BLOCK_LENGTH - 1 + + /** @internal */ + UCPTRIE_SMALL_MAX = 0xfff + + /** + * Offset from dataLength (to be subtracted) for fetching the + * value returned for out-of-range code points and ill-formed UTF-8/16. 
+ * @internal + */ + UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET = 1 + /** + * Offset from dataLength (to be subtracted) for fetching the + * value returned for code points highStart..U+10FFFF. + * @internal + */ + UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET = 2 +) + +// Internal constants. +const ( + /** The length of the BMP index table. 1024=0x400 */ + UCPTRIE_BMP_INDEX_LENGTH = 0x10000 >> UCPTRIE_FAST_SHIFT + + UCPTRIE_SMALL_LIMIT = 0x1000 + UCPTRIE_SMALL_INDEX_LENGTH = UCPTRIE_SMALL_LIMIT >> UCPTRIE_FAST_SHIFT + + /** Shift size for getting the index-3 table offset. */ + UCPTRIE_SHIFT_3 = 4 + + /** Shift size for getting the index-2 table offset. */ + UCPTRIE_SHIFT_2 = 5 + UCPTRIE_SHIFT_3 + + /** Shift size for getting the index-1 table offset. */ + UCPTRIE_SHIFT_1 = 5 + UCPTRIE_SHIFT_2 + + /** + * Difference between two shift sizes, + * for getting an index-2 offset from an index-3 offset. 5=9-4 + */ + UCPTRIE_SHIFT_2_3 = UCPTRIE_SHIFT_2 - UCPTRIE_SHIFT_3 + + /** + * Difference between two shift sizes, + * for getting an index-1 offset from an index-2 offset. 5=14-9 + */ + UCPTRIE_SHIFT_1_2 = UCPTRIE_SHIFT_1 - UCPTRIE_SHIFT_2 + + /** + * Number of index-1 entries for the BMP. (4) + * This part of the index-1 table is omitted from the serialized form. + */ + UCPTRIE_OMITTED_BMP_INDEX_1_LENGTH = 0x10000 >> UCPTRIE_SHIFT_1 + + /** Number of entries in an index-2 block. 32=0x20 */ + UCPTRIE_INDEX_2_BLOCK_LENGTH = 1 << UCPTRIE_SHIFT_1_2 + + /** Mask for getting the lower bits for the in-index-2-block offset. */ + UCPTRIE_INDEX_2_MASK = UCPTRIE_INDEX_2_BLOCK_LENGTH - 1 + + /** Number of code points per index-2 table entry. 512=0x200 */ + UCPTRIE_CP_PER_INDEX_2_ENTRY = 1 << UCPTRIE_SHIFT_2 + + /** Number of entries in an index-3 block. 32=0x20 */ + UCPTRIE_INDEX_3_BLOCK_LENGTH = 1 << UCPTRIE_SHIFT_2_3 + + /** Mask for getting the lower bits for the in-index-3-block offset. */ + UCPTRIE_INDEX_3_MASK = UCPTRIE_INDEX_3_BLOCK_LENGTH - 1 + + /** Number of entries in a small data block. 
16=0x10 */ + UCPTRIE_SMALL_DATA_BLOCK_LENGTH = 1 << UCPTRIE_SHIFT_3 + + /** Mask for getting the lower bits for the in-small-data-block offset. */ + UCPTRIE_SMALL_DATA_MASK = UCPTRIE_SMALL_DATA_BLOCK_LENGTH - 1 +) + +func UcpTrieFromBytes(bytes *udata.Bytes) (*UcpTrie, error) { + type UcpHeader struct { + /** "Tri3" in big-endian US-ASCII (0x54726933) */ + signature uint32 + + /** + * Options bit field: + * Bits 15..12: Data length bits 19..16. + * Bits 11..8: Data null block offset bits 19..16. + * Bits 7..6: UCPTrieType + * Bits 5..3: Reserved (0). + * Bits 2..0: UCPTrieValueWidth + */ + options uint16 + + /** Total length of the index tables. */ + indexLength uint16 + + /** Data length bits 15..0. */ + dataLength uint16 + + /** Index-3 null block offset, 0x7fff or 0xffff if none. */ + index3NullOffset uint16 + + /** Data null block offset bits 15..0, 0xfffff if none. */ + dataNullOffset uint16 + + /** + * First code point of the single-value range ending with U+10ffff, + * rounded up and then shifted right by UCPTRIE_SHIFT_2. 
+ */ + shiftedHighStart uint16 + } + + var header UcpHeader + header.signature = bytes.Uint32() + + switch header.signature { + case UCPTRIE_SIG: + case UCPTRIE_OE_SIG: + return nil, fmt.Errorf("unsupported: BigEndian encoding") + default: + return nil, fmt.Errorf("invalid signature for UcpTrie: 0x%08x", header.signature) + } + + header.options = bytes.Uint16() + header.indexLength = bytes.Uint16() + header.dataLength = bytes.Uint16() + header.index3NullOffset = bytes.Uint16() + header.dataNullOffset = bytes.Uint16() + header.shiftedHighStart = bytes.Uint16() + + typeInt := (header.options >> 6) & 3 + valueWidthInt := header.options & UCPTRIE_OPTIONS_VALUE_BITS_MASK + if typeInt > uint16(UCPTRIE_TYPE_SMALL) || valueWidthInt > uint16(UCPTRIE_VALUE_BITS_8) || + (header.options&UCPTRIE_OPTIONS_RESERVED_MASK) != 0 { + return nil, fmt.Errorf("invalid options for serialized UcpTrie") + } + actualType := UCPTrieType(typeInt) + actualValueWidth := UCPTrieValueWidth(valueWidthInt) + + trie := &UcpTrie{ + IndexLength: int(header.indexLength), + DataLength: int(((header.options & UCPTRIE_OPTIONS_DATA_LENGTH_MASK) << 4) | header.dataLength), + Index3NullOffset: header.index3NullOffset, + DataNullOffset: int32(((header.options & UCPTRIE_OPTIONS_DATA_NULL_OFFSET_MASK) << 8) | header.dataNullOffset), + HighStart: rune(header.shiftedHighStart) << UCPTRIE_SHIFT_2, + Type: actualType, + ValueWidth: actualValueWidth, + } + nullValueOffset := trie.DataNullOffset + if nullValueOffset >= int32(trie.DataLength) { + nullValueOffset = int32(trie.DataLength) - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET + } + + trie.Shifted12HighStart = uint16((trie.HighStart + 0xfff) >> 12) + trie.Index = bytes.Uint16Slice(int32(header.indexLength)) + switch actualValueWidth { + case UCPTRIE_VALUE_BITS_16: + trie.Data16 = trie.Index[trie.IndexLength:] + trie.NullValue = uint32(trie.Index[nullValueOffset]) + case UCPTRIE_VALUE_BITS_32: + trie.Data32 = bytes.Uint32Slice(int32(trie.DataLength)) + trie.NullValue = 
trie.Data32[nullValueOffset] + case UCPTRIE_VALUE_BITS_8: + trie.Data8 = bytes.Uint8Slice(int32(trie.DataLength)) + trie.NullValue = uint32(trie.Data8[nullValueOffset]) + } + + return trie, nil +} + +func (trie *UcpTrie) Get(c rune) uint32 { + var dataIndex int32 + if c <= 0x7f { + // linear ASCII + dataIndex = c + } else { + var fastMax rune + if trie.Type == UCPTRIE_TYPE_FAST { + fastMax = 0xffff + } else { + fastMax = UCPTRIE_SMALL_MAX + } + dataIndex = trie.cpIndex(fastMax, c) + } + return trie.getValue(dataIndex) +} + +func (trie *UcpTrie) getValue(dataIndex int32) uint32 { + switch trie.ValueWidth { + case UCPTRIE_VALUE_BITS_16: + return uint32(trie.Data16[dataIndex]) + case UCPTRIE_VALUE_BITS_32: + return trie.Data32[dataIndex] + case UCPTRIE_VALUE_BITS_8: + return uint32(trie.Data8[dataIndex]) + default: + // Unreachable if the trie is properly initialized. + return 0xffffffff + } +} + +/** Internal trie getter for a code point below the fast limit. Returns the data index. @internal */ +func (trie *UcpTrie) fastIndex(c rune) int32 { + return int32(trie.Index[c>>UCPTRIE_FAST_SHIFT]) + (c & UCPTRIE_FAST_DATA_MASK) +} + +/** Internal trie getter for a code point at or above the fast limit. Returns the data index. 
@internal */ +func (trie *UcpTrie) smallIndex(c rune) int32 { + if c >= trie.HighStart { + return int32(trie.DataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET) + } + return trie.internalSmallIndex(c) +} + +func (trie *UcpTrie) internalSmallIndex(c rune) int32 { + i1 := c >> UCPTRIE_SHIFT_1 + if trie.Type == UCPTRIE_TYPE_FAST { + i1 += UCPTRIE_BMP_INDEX_LENGTH - UCPTRIE_OMITTED_BMP_INDEX_1_LENGTH + } else { + i1 += UCPTRIE_SMALL_INDEX_LENGTH + } + i3Block := int32(trie.Index[int32(trie.Index[i1])+((c>>UCPTRIE_SHIFT_2)&UCPTRIE_INDEX_2_MASK)]) + i3 := (c >> UCPTRIE_SHIFT_3) & UCPTRIE_INDEX_3_MASK + var dataBlock int32 + if (i3Block & 0x8000) == 0 { + // 16-bit indexes + dataBlock = int32(trie.Index[i3Block+i3]) + } else { + // 18-bit indexes stored in groups of 9 entries per 8 indexes. + i3Block = (i3Block & 0x7fff) + (i3 & ^7) + (i3 >> 3) + i3 &= 7 + dataBlock = int32(trie.Index[i3Block]) << (2 + (2 * i3)) & 0x30000 + i3Block++ + dataBlock |= int32(trie.Index[i3Block+i3]) + } + return dataBlock + (c & UCPTRIE_SMALL_DATA_MASK) +} + +/** + * Internal trie getter for a code point, with checking that c is in U+0000..10FFFF. + * Returns the data index. + * @internal + */ +func (trie *UcpTrie) cpIndex(fastMax, c rune) int32 { + if c <= fastMax { + return trie.fastIndex(c) + } + if c <= 0x10ffff { + return trie.smallIndex(c) + } + return int32(trie.DataLength) - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET +} + +/** + * Selectors for how ucpmap_getRange() etc. should report value ranges overlapping with surrogates. + * Most users should use UCPMAP_RANGE_NORMAL. + * + * @see ucpmap_getRange + * @see ucptrie_getRange + * @see umutablecptrie_getRange + * @stable ICU 63 + */ +type UCPMapRangeOption int8 + +const ( + /** + * ucpmap_getRange() enumerates all same-value ranges as stored in the map. + * Most users should use this option. 
+ * @stable ICU 63 + */ + UCPMAP_RANGE_NORMAL UCPMapRangeOption = iota + /** + * ucpmap_getRange() enumerates all same-value ranges as stored in the map, + * except that lead surrogates (U+D800..U+DBFF) are treated as having the + * surrogateValue, which is passed to getRange() as a separate parameter. + * The surrogateValue is not transformed via filter(). + * See U_IS_LEAD(c). + * + * Most users should use UCPMAP_RANGE_NORMAL instead. + * + * This option is useful for maps that map surrogate code *units* to + * special values optimized for UTF-16 string processing + * or for special error behavior for unpaired surrogates, + * but those values are not to be associated with the lead surrogate code *points*. + * @stable ICU 63 + */ + UCPMAP_RANGE_FIXED_LEAD_SURROGATES + /** + * ucpmap_getRange() enumerates all same-value ranges as stored in the map, + * except that all surrogates (U+D800..U+DFFF) are treated as having the + * surrogateValue, which is passed to getRange() as a separate parameter. + * The surrogateValue is not transformed via filter(). + * See U_IS_SURROGATE(c). + * + * Most users should use UCPMAP_RANGE_NORMAL instead. + * + * This option is useful for maps that map surrogate code *units* to + * special values optimized for UTF-16 string processing + * or for special error behavior for unpaired surrogates, + * but those values are not to be associated with the lead surrogate code *points*. + * @stable ICU 63 + */ + UCPMAP_RANGE_FIXED_ALL_SURROGATES +) + +/** + * Callback function type: Modifies a map value. + * Optionally called by ucpmap_getRange()/ucptrie_getRange()/umutablecptrie_getRange(). + * The modified value will be returned by the getRange function. + * + * Can be used to ignore some of the value bits, + * make a filter for one of several values, + * return a value index computed from the map value, etc. 
+ * + * @param context an opaque pointer, as passed into the getRange function + * @param value a value from the map + * @return the modified value + * @stable ICU 63 + */ +type UCPMapValueFilter func(value uint32) uint32 + +/** + * GetRange returns the last code point such that all those from start to there have the same value. + * Can be used to efficiently iterate over all same-value ranges in a trie. + * (This is normally faster than iterating over code points and get()ting each value, + * but much slower than a data structure that stores ranges directly.) + * + * If the UCPMapValueFilter function pointer is not NULL, then + * the value to be delivered is passed through that function, and the return value is the end + * of the range where all values are modified to the same actual value. + * The value is unchanged if that function pointer is NULL. + * + * Example: + * \code + * UChar32 start = 0, end; + * uint32_t value; + * while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0, + * NULL, NULL, &value)) >= 0) { + * // Work with the range start..end and its value. 
+ * start = end + 1; + * } + * \endcode + * + * @param trie the trie + * @param start range start + * @param option defines whether surrogates are treated normally, + * or as having the surrogateValue; usually UCPMAP_RANGE_NORMAL + * @param surrogateValue value for surrogates; ignored if option==UCPMAP_RANGE_NORMAL + * @param filter a pointer to a function that may modify the trie data value, + * or NULL if the values from the trie are to be used unmodified + * @param context an opaque pointer that is passed on to the filter function + * @param pValue if not NULL, receives the value that every code point start..end has; + * may have been modified by filter(context, trie value) + * if that function pointer is not NULL + * @return the range end code point, or -1 if start is not a valid code point + * @stable ICU 63 + */ +func (trie *UcpTrie) GetRange(start rune, option UCPMapRangeOption, surrogateValue uint32, filter UCPMapValueFilter) (rune, uint32) { + if option == UCPMAP_RANGE_NORMAL { + return trie.getRange(start, filter) + } + + var surrEnd rune + if option == UCPMAP_RANGE_FIXED_ALL_SURROGATES { + surrEnd = 0xdfff + } else { + surrEnd = 0xdbff + } + end, value := trie.getRange(start, filter) + if end < 0xd7ff || start > surrEnd { + return end, value + } + if value == surrogateValue { + if end >= surrEnd { + // Surrogates followed by a non-surrogateValue range, + // or surrogates are part of a larger surrogateValue range. + return end, value + } + } else { + if start <= 0xd7ff { + return 0xd7ff, value // Non-surrogateValue range ends before surrogateValue surrogates. + } + // Start is a surrogate with a non-surrogateValue code *unit* value. + // Return a surrogateValue code *point* range. + value = surrogateValue + if end > surrEnd { + return surrEnd, value // Surrogate range ends before non-surrogateValue rest of range. + } + } + // See if the surrogateValue surrogate range can be merged with + // an immediately following range. 
+ end2, value2 := trie.getRange(surrEnd+1, filter) + if value2 == surrogateValue { + return end2, value + } + return surrEnd, value +} + +const MAX_UNICODE = 0x10ffff + +func (trie *UcpTrie) getRange(start rune, filter UCPMapValueFilter) (rune, uint32) { + if start > MAX_UNICODE { + return -1, 0 + } + + if start >= trie.HighStart { + di := int32(trie.DataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET) + value := trie.getValue(di) + if filter != nil { + value = filter(value) + } + return MAX_UNICODE, value + } + + nullValue := trie.NullValue + if filter != nil { + nullValue = filter(nullValue) + } + index := trie.Index + + prevI3Block := int32(-1) + prevBlock := int32(-1) + c := start + var trieValue uint32 + value := nullValue + haveValue := false + for { + var i3Block, i3, i3BlockLength, dataBlockLength int32 + if c <= 0xffff && (trie.Type == UCPTRIE_TYPE_FAST || c <= UCPTRIE_SMALL_MAX) { + i3Block = 0 + i3 = c >> UCPTRIE_FAST_SHIFT + if trie.Type == UCPTRIE_TYPE_FAST { + i3BlockLength = UCPTRIE_BMP_INDEX_LENGTH + } else { + i3BlockLength = UCPTRIE_SMALL_INDEX_LENGTH + } + dataBlockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH + } else { + // Use the multi-stage index. + i1 := c >> UCPTRIE_SHIFT_1 + if trie.Type == UCPTRIE_TYPE_FAST { + i1 += UCPTRIE_BMP_INDEX_LENGTH - UCPTRIE_OMITTED_BMP_INDEX_1_LENGTH + } else { + i1 += UCPTRIE_SMALL_INDEX_LENGTH + } + shft := (c >> UCPTRIE_SHIFT_2) + idx := int32(trie.Index[i1]) + (shft & UCPTRIE_INDEX_2_MASK) + i3Block = int32(trie.Index[idx]) + if i3Block == prevI3Block && (c-start) >= UCPTRIE_CP_PER_INDEX_2_ENTRY { + // The index-3 block is the same as the previous one, and filled with value. + c += UCPTRIE_CP_PER_INDEX_2_ENTRY + continue + } + prevI3Block = i3Block + if i3Block == int32(trie.Index3NullOffset) { + // This is the index-3 null block. 
+ if haveValue { + if nullValue != value { + return c - 1, value + } + } else { + trieValue = trie.NullValue + value = nullValue + haveValue = true + } + prevBlock = trie.DataNullOffset + c = (c + UCPTRIE_CP_PER_INDEX_2_ENTRY) & ^(UCPTRIE_CP_PER_INDEX_2_ENTRY - 1) + continue + } + i3 = (c >> UCPTRIE_SHIFT_3) & UCPTRIE_INDEX_3_MASK + i3BlockLength = UCPTRIE_INDEX_3_BLOCK_LENGTH + dataBlockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH + } + + // Enumerate data blocks for one index-3 block. + for { + var block int32 + if (i3Block & 0x8000) == 0 { + block = int32(index[i3Block+i3]) + } else { + // 18-bit indexes stored in groups of 9 entries per 8 indexes. + group := (i3Block & 0x7fff) + (i3 & ^7) + (i3 >> 3) + gi := i3 & 7 + block = (int32(index[group]) << (2 + (2 * gi))) & 0x30000 + group++ + block |= int32(index[group+gi]) + } + if block == prevBlock && (c-start) >= dataBlockLength { + // The block is the same as the previous one, and filled with value. + c += dataBlockLength + } else { + dataMask := dataBlockLength - 1 + prevBlock = block + if block == trie.DataNullOffset { + // This is the data null block. 
+ if haveValue { + if nullValue != value { + return c - 1, value + } + } else { + trieValue = trie.NullValue + value = nullValue + haveValue = true + } + c = (c + dataBlockLength) & ^dataMask + } else { + di := block + (c & dataMask) + trieValue2 := trie.getValue(di) + if haveValue { + if trieValue2 != trieValue { + if filter == nil || maybeFilterValue(trieValue2, trie.NullValue, nullValue, filter) != value { + return c - 1, value + } + trieValue = trieValue2 // may or may not help + } + } else { + trieValue = trieValue2 + value = maybeFilterValue(trieValue2, trie.NullValue, nullValue, filter) + haveValue = true + } + for { + c++ + if c&dataMask == 0 { + break + } + di++ + trieValue2 = trie.getValue(di) + if trieValue2 != trieValue { + if filter == nil || maybeFilterValue(trieValue2, trie.NullValue, nullValue, filter) != value { + return c - 1, value + } + trieValue = trieValue2 // may or may not help + } + } + } + } + i3++ + if i3 >= i3BlockLength { + break + } + } + if c >= trie.HighStart { + break + } + } + + di := int32(trie.DataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET) + highValue := trie.getValue(di) + if maybeFilterValue(highValue, trie.NullValue, nullValue, filter) != value { + return c - 1, value + } else { + return MAX_UNICODE, value + } +} + +func maybeFilterValue(value uint32, trieNullValue uint32, nullValue uint32, filter UCPMapValueFilter) uint32 { + if value == trieNullValue { + value = nullValue + } else if filter != nil { + value = filter(value) + } + return value +} diff --git a/go/mysql/icuregex/internal/utrie/utrie2.go b/go/mysql/icuregex/internal/utrie/utrie2.go new file mode 100644 index 00000000000..6fd2ccd7120 --- /dev/null +++ b/go/mysql/icuregex/internal/utrie/utrie2.go @@ -0,0 +1,448 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. 
+License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package utrie + +import ( + "fmt" + + "vitess.io/vitess/go/mysql/icuregex/internal/udata" + "vitess.io/vitess/go/mysql/icuregex/internal/utf16" +) + +type UTrie2 struct { + Index []uint16 + Data16 []uint16 + Data32 []uint32 + + IndexLength, DataLength int + Index2NullOffset uint16 + DataNullOffset uint16 + InitialValue uint32 + ErrorValue uint32 + + HighStart rune + HighValueIndex int +} + +func (t *UTrie2) SerializedLength() int32 { + return 16 + int32(t.IndexLength+t.DataLength)*2 +} + +func (t *UTrie2) getIndex(asciiOffset int, c rune) uint16 { + return t.Index[t.indexFromCp(asciiOffset, c)] +} + +func (t *UTrie2) Get16(c rune) uint16 { + return t.getIndex(t.IndexLength, c) +} + +func (t *UTrie2) indexFromCp(asciiOffset int, c rune) int { + switch { + case c < 0xd800: + return indexRaw(0, t.Index, c) + case c <= 0xffff: + var offset int32 + if c <= 0xdbff { + offset = UTRIE2_LSCP_INDEX_2_OFFSET - (0xd800 >> UTRIE2_SHIFT_2) + } + return indexRaw(offset, t.Index, c) + case c > 0x10ffff: + return asciiOffset + UTRIE2_BAD_UTF8_DATA_OFFSET + case c >= t.HighStart: + return t.HighValueIndex + default: + return indexFromSupp(t.Index, c) + } +} + +type EnumRange func(start, end rune, value uint32) bool +type EnumValue func(value uint32) uint32 + +func (t *UTrie2) Enum(enumValue EnumValue, enumRange EnumRange) { + t.enumEitherTrie(0, 0x110000, enumValue, 
enumRange) +} + +func enumSameValue(value uint32) uint32 { + return value +} + +func min(a, b rune) rune { + if a < b { + return a + } + return b +} + +func (t *UTrie2) enumEitherTrie(start, limit rune, enumValue EnumValue, enumRange EnumRange) { + if enumRange == nil { + return + } + if enumValue == nil { + enumValue = enumSameValue + } + + /* frozen trie */ + var ( + idx = t.Index + data32 = t.Data32 + index2NullOffset = int(t.Index2NullOffset) + nullBlock = int(t.DataNullOffset) + + c rune + prev = start + highStart = t.HighStart + + /* get the enumeration value that corresponds to an initial-value trie data entry */ + initialValue = enumValue(t.InitialValue) + + /* set variables for previous range */ + i2Block int + block int + prevI2Block = -1 + prevBlock = -1 + prevValue = uint32(0) + ) + + /* enumerate index-2 blocks */ + for c = start; c < limit && c < highStart; { + /* Code point limit for iterating inside this i2Block. */ + tempLimit := c + UTRIE2_CP_PER_INDEX_1_ENTRY + if limit < tempLimit { + tempLimit = limit + } + if c <= 0xffff { + if !utf16.IsSurrogate(c) { + i2Block = int(c >> UTRIE2_SHIFT_2) + } else if utf16.IsSurrogateLead(c) { + /* + * Enumerate values for lead surrogate code points, not code units: + * This special block has half the normal length. + */ + i2Block = UTRIE2_LSCP_INDEX_2_OFFSET + tempLimit = min(0xdc00, limit) + } else { + /* + * Switch back to the normal part of the index-2 table. + * Enumerate the second half of the surrogates block. + */ + i2Block = 0xd800 >> UTRIE2_SHIFT_2 + tempLimit = min(0xe000, limit) + } + } else { + /* supplementary code points */ + i2Block = int(idx[(UTRIE2_INDEX_1_OFFSET-UTRIE2_OMITTED_BMP_INDEX_1_LENGTH)+(c>>UTRIE2_SHIFT_1)]) + if i2Block == prevI2Block && (c-prev) >= UTRIE2_CP_PER_INDEX_1_ENTRY { + /* + * The index-2 block is the same as the previous one, and filled with prevValue. + * Only possible for supplementary code points because the linear-BMP index-2 + * table creates unique i2Block values. 
+ */ + c += UTRIE2_CP_PER_INDEX_1_ENTRY + continue + } + } + prevI2Block = i2Block + if i2Block == index2NullOffset { + /* this is the null index-2 block */ + if prevValue != initialValue { + if prev < c && !enumRange(prev, c-1, prevValue) { + return + } + prevBlock = nullBlock + prev = c + prevValue = initialValue + } + c += UTRIE2_CP_PER_INDEX_1_ENTRY + } else { + /* enumerate data blocks for one index-2 block */ + var i2Limit int + if (c >> UTRIE2_SHIFT_1) == (tempLimit >> UTRIE2_SHIFT_1) { + i2Limit = int(tempLimit>>UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK + } else { + i2Limit = UTRIE2_INDEX_2_BLOCK_LENGTH + } + for i2 := int(c>>UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK; i2 < i2Limit; i2++ { + block = int(idx[i2Block+i2] << UTRIE2_INDEX_SHIFT) + if block == prevBlock && (c-prev) >= UTRIE2_DATA_BLOCK_LENGTH { + /* the block is the same as the previous one, and filled with prevValue */ + c += UTRIE2_DATA_BLOCK_LENGTH + continue + } + prevBlock = block + if block == nullBlock { + /* this is the null data block */ + if prevValue != initialValue { + if prev < c && !enumRange(prev, c-1, prevValue) { + return + } + prev = c + prevValue = initialValue + } + c += UTRIE2_DATA_BLOCK_LENGTH + } else { + for j := 0; j < UTRIE2_DATA_BLOCK_LENGTH; j++ { + var value uint32 + if data32 != nil { + value = data32[block+j] + } else { + value = uint32(idx[block+j]) + } + value = enumValue(value) + if value != prevValue { + if prev < c && !enumRange(prev, c-1, prevValue) { + return + } + prev = c + prevValue = value + } + c++ + } + } + } + } + } + + if c > limit { + c = limit /* could be higher if in the index2NullOffset */ + } else if c < limit { + /* c==highStart>UTRIE2_SHIFT_1)]) + return (int(index[i1+int((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK)]) << UTRIE2_INDEX_SHIFT) + int(c&UTRIE2_DATA_MASK) +} + +func indexRaw(offset int32, index []uint16, c rune) int { + return int(index[offset+(c>>UTRIE2_SHIFT_2)]<> UTRIE2_SHIFT_1 + + /** Number of code points per index-1 table entry. 
2048=0x800 */ + UTRIE2_CP_PER_INDEX_1_ENTRY = 1 << UTRIE2_SHIFT_1 + + /** Number of entries in an index-2 block. 64=0x40 */ + UTRIE2_INDEX_2_BLOCK_LENGTH = 1 << UTRIE2_SHIFT_1_2 + + /** Mask for getting the lower bits for the in-index-2-block offset. */ + UTRIE2_INDEX_2_MASK = UTRIE2_INDEX_2_BLOCK_LENGTH - 1 + + /** Number of entries in a data block. 32=0x20 */ + UTRIE2_DATA_BLOCK_LENGTH = 1 << UTRIE2_SHIFT_2 + + /** Mask for getting the lower bits for the in-data-block offset. */ + UTRIE2_DATA_MASK = UTRIE2_DATA_BLOCK_LENGTH - 1 + + /** + * Shift size for shifting left the index array values. + * Increases possible data size with 16-bit index values at the cost + * of compactability. + * This requires data blocks to be aligned by UTRIE2_DATA_GRANULARITY. + */ + UTRIE2_INDEX_SHIFT = 2 + + /** The alignment size of a data block. Also the granularity for compaction. */ + UTRIE2_DATA_GRANULARITY = 1 << UTRIE2_INDEX_SHIFT + + /* Fixed layout of the first part of the index array. ------------------- */ + + /** + * The BMP part of the index-2 table is fixed and linear and starts at offset 0. + * Length=2048=0x800=0x10000>>UTRIE2_SHIFT_2 + */ + UTRIE2_INDEX_2_OFFSET = 0 + + /** + * The part of the index-2 table for U+D800..U+DBFF stores values for + * lead surrogate code _units_ not code _points_. + * Values for lead surrogate code _points_ are indexed with this portion of the table. + * Length=32=0x20=0x400>>UTRIE2_SHIFT_2. (There are 1024=0x400 lead surrogates.) + */ + UTRIE2_LSCP_INDEX_2_OFFSET = 0x10000 >> UTRIE2_SHIFT_2 + UTRIE2_LSCP_INDEX_2_LENGTH = 0x400 >> UTRIE2_SHIFT_2 + + /** Count the lengths of both BMP pieces. 2080=0x820 */ + UTRIE2_INDEX_2_BMP_LENGTH = UTRIE2_LSCP_INDEX_2_OFFSET + UTRIE2_LSCP_INDEX_2_LENGTH + + /** + * The 2-byte UTF-8 version of the index-2 table follows at offset 2080=0x820. + * Length 32=0x20 for lead bytes C0..DF, regardless of UTRIE2_SHIFT_2. 
+ */ + UTRIE2_UTF8_2B_INDEX_2_OFFSET = UTRIE2_INDEX_2_BMP_LENGTH + UTRIE2_UTF8_2B_INDEX_2_LENGTH = 0x800 >> 6 /* U+0800 is the first code point after 2-byte UTF-8 */ + + /** + * The index-1 table, only used for supplementary code points, at offset 2112=0x840. + * Variable length, for code points up to highStart, where the last single-value range starts. + * Maximum length 512=0x200=0x100000>>UTRIE2_SHIFT_1. + * (For 0x100000 supplementary code points U+10000..U+10ffff.) + * + * The part of the index-2 table for supplementary code points starts + * after this index-1 table. + * + * Both the index-1 table and the following part of the index-2 table + * are omitted completely if there is only BMP data. + */ + UTRIE2_INDEX_1_OFFSET = UTRIE2_UTF8_2B_INDEX_2_OFFSET + UTRIE2_UTF8_2B_INDEX_2_LENGTH + UTRIE2_MAX_INDEX_1_LENGTH = 0x100000 >> UTRIE2_SHIFT_1 + + /* + * Fixed layout of the first part of the data array. ----------------------- + * Starts with 4 blocks (128=0x80 entries) for ASCII. + */ + + /** + * The illegal-UTF-8 data block follows the ASCII block, at offset 128=0x80. + * Used with linear access for single bytes 0..0xbf for simple error handling. + * Length 64=0x40, not UTRIE2_DATA_BLOCK_LENGTH. + */ + UTRIE2_BAD_UTF8_DATA_OFFSET = 0x80 + + /** The start of non-linear-ASCII data blocks, at offset 192=0xc0. */ + UTRIE2_DATA_START_OFFSET = 0xc0 +) + +func UTrie2FromBytes(bytes *udata.Bytes) (*UTrie2, error) { + type UTrie2Header struct { + /** "Tri2" in big-endian US-ASCII (0x54726932) */ + signature uint32 + + /** + * options bit field: + * 15.. 4 reserved (0) + * 3.. 0 UTrie2ValueBits valueBits + */ + options uint16 + + /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH */ + indexLength uint16 + + /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT */ + shiftedDataLength uint16 + + /** Null index and data blocks, not shifted. 
*/ + index2NullOffset, dataNullOffset uint16 + + /** + * First code point of the single-value range ending with U+10ffff, + * rounded up and then shifted right by UTRIE2_SHIFT_1. + */ + shiftedHighStart uint16 + } + + var header UTrie2Header + header.signature = bytes.Uint32() + + switch header.signature { + case 0x54726932: + case 0x32697254: + return nil, fmt.Errorf("unsupported: BigEndian encoding") + default: + return nil, fmt.Errorf("invalid signature for Trie2: 0x%08x", header.signature) + } + + header.options = bytes.Uint16() + header.indexLength = bytes.Uint16() + header.shiftedDataLength = bytes.Uint16() + header.index2NullOffset = bytes.Uint16() + header.dataNullOffset = bytes.Uint16() + header.shiftedHighStart = bytes.Uint16() + + var width int + switch header.options & 0xf { + case 0: + width = 16 + case 1: + width = 32 + default: + return nil, fmt.Errorf("invalid width for serialized UTrie2") + } + + trie := &UTrie2{ + IndexLength: int(header.indexLength), + DataLength: int(header.shiftedDataLength) << UTRIE2_INDEX_SHIFT, + Index2NullOffset: header.index2NullOffset, + DataNullOffset: header.dataNullOffset, + HighStart: rune(header.shiftedHighStart) << UTRIE2_SHIFT_1, + } + + trie.HighValueIndex = trie.DataLength - UTRIE2_DATA_GRANULARITY + if width == 16 { + trie.HighValueIndex += trie.IndexLength + } + + indexArraySize := trie.IndexLength + if width == 16 { + indexArraySize += trie.DataLength + } + + trie.Index = bytes.Uint16Slice(int32(indexArraySize)) + + if width == 16 { + trie.Data16 = trie.Index[trie.IndexLength:] + trie.InitialValue = uint32(trie.Index[trie.DataNullOffset]) + trie.ErrorValue = uint32(trie.Index[trie.IndexLength+UTRIE2_BAD_UTF8_DATA_OFFSET]) + } else { + trie.Data32 = bytes.Uint32Slice(int32(trie.DataLength)) + trie.InitialValue = trie.Data32[trie.DataNullOffset] + trie.ErrorValue = trie.Data32[UTRIE2_BAD_UTF8_DATA_OFFSET] + } + + return trie, nil +} diff --git a/go/mysql/icuregex/matcher.go b/go/mysql/icuregex/matcher.go new 
file mode 100644 index 00000000000..842b3b458d9 --- /dev/null +++ b/go/mysql/icuregex/matcher.go @@ -0,0 +1,1890 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package icuregex + +import ( + "fmt" + "io" + + "vitess.io/vitess/go/mysql/icuregex/internal/ucase" + "vitess.io/vitess/go/mysql/icuregex/internal/uchar" + "vitess.io/vitess/go/mysql/icuregex/internal/uerror" + "vitess.io/vitess/go/mysql/icuregex/internal/uprops" +) + +type BreakIterator interface { +} + +const TIMER_INITIAL_VALUE = 10000 +const DEFAULT_TIMEOUT = 3 +const DEFAULT_STACK_LIMIT = 0 + +type Matcher struct { + pattern *Pattern + + input []rune + + regionStart int // Start of the input region, default = 0. + regionLimit int // End of input region, default to input.length. + + anchorStart int // Region bounds for anchoring operations (^ or $). + anchorLimit int // See useAnchoringBounds + + lookStart int // Region bounds for look-ahead/behind and + lookLimit int // and other boundary tests. See + // useTransparentBounds + + activeStart int // Currently active bounds for matching. 
+ activeLimit int // Usually is the same as region, but + // is changed to fLookStart/Limit when + // entering look around regions. + + match bool // True if the last attempted match was successful. + matchStart int // Position of the start of the most recent match + matchEnd int // First position after the end of the most recent match + // Zero if no previous match, even when a region + // is active. + lastMatchEnd int // First position after the end of the previous match, + // or -1 if there was no previous match. + appendPosition int // First position after the end of the previous + // appendReplacement(). As described by the + // JavaDoc for Java Matcher, where it is called + // "append position" + hitEnd bool // True if the last match touched the end of input. + requireEnd bool // True if the last match required end-of-input + // (matched $ or Z) + + stack Stack + frame StackFrame // After finding a match, the last active stack frame, + // which will contain the capture group results. + // NOT valid while match engine is running. + + data []int // Data area for use by the compiled pattern. + + timeLimit int32 // Max time (in arbitrary steps) to let the + // match engine run. Zero for unlimited. + + time int32 // Match time, accumulates while matching. + tickCounter int32 // Low bits counter for time. Counts down StateSaves. + // Kept separately from fTime to keep as much + // code as possible out of the inline + // StateSave function. 
+ + wordBreakItr BreakIterator + gcBreakItr BreakIterator +} + +func NewMatcher(pat *Pattern) *Matcher { + m := &Matcher{ + pattern: pat, + data: make([]int, pat.dataSize), + stack: Stack{ + frameSize: pat.frameSize, + stackLimit: DEFAULT_STACK_LIMIT, + }, + timeLimit: DEFAULT_TIMEOUT, + } + m.reset() + return m +} + +var Dumper io.Writer + +func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { + //-------------------------------------------------------------------------------- + // + // MatchAt This is the actual matching engine. + // + // startIdx: begin matching a this index. + // toEnd: if true, match must extend to end of the input region + // + //-------------------------------------------------------------------------------- + var err error + var isMatch bool // True if the we have a match. + + if Dumper != nil { + fmt.Fprintf(Dumper, "MatchAt(startIdx=%d)\n", startIdx) + fmt.Fprintf(Dumper, "Original Pattern: \"%s\"\n", m.pattern.pattern) + fmt.Fprintf(Dumper, "Input String: \"%s\"\n\n", string(m.input)) + } + + pat := m.pattern.compiledPat + inputText := m.input + inputLength := len(inputText) + litText := m.pattern.literalText + sets := m.pattern.sets + + fp := m.resetStack() + *fp.inputIdx() = startIdx + *fp.patIdx() = 0 + for i := 0; i < len(m.data); i++ { + m.data[i] = 0 + } + + for { + op := pat[*fp.patIdx()] + + if Dumper != nil { + fmt.Fprintf(Dumper, "inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", *fp.inputIdx(), + charAt(inputText, *fp.inputIdx()), m.stack.sp(), m.activeLimit) + m.pattern.dumpOp(Dumper, *fp.patIdx()) + } + + *fp.patIdx()++ + + switch op.Type() { + case URX_NOP: + // Nothing to do. + case URX_BACKTRACK: + // Force a backtrack. In some circumstances, the pattern compiler + // will notice that the pattern can't possibly match anything, and will + // emit one of these at that point. 
+ fp = m.stack.popFrame() + case URX_ONECHAR: + if *fp.inputIdx() < m.activeLimit { + c := charAt(inputText, *fp.inputIdx()) + *fp.inputIdx()++ + if c == rune(op.Value()) { + break + } + } else { + m.hitEnd = true + } + fp = m.stack.popFrame() + case URX_STRING: + // Test input against a literal string. + // Strings require two slots in the compiled pattern, one for the + // offset to the string text, and one for the length. + stringStartIdx := op.Value() + nextOp := pat[*fp.patIdx()] // Fetch the second operand + *fp.patIdx()++ + stringLen := nextOp.Value() + if nextOp.Type() != URX_STRING_LEN { + panic("URX_STRING_LEN expected") + } + if stringLen < 2 { + panic("stringLen < 2, would have expected URX_ONECHAR for a single character") + } + + patternString := litText[stringStartIdx:] + var patternStringIndex int + success := true + for patternStringIndex < stringLen { + if *fp.inputIdx() >= m.activeLimit { + m.hitEnd = true + success = false + break + } + if charAt(patternString, patternStringIndex) != charAt(inputText, *fp.inputIdx()) { + success = false + break + } + patternStringIndex++ + *fp.inputIdx()++ + } + + if !success { + fp = m.stack.popFrame() + } + case URX_STATE_SAVE: + fp, err = m.StateSave(*fp.inputIdx(), op.Value()) + if err != nil { + return err + } + case URX_END: + // The match loop will exit via this path on a successful match, + // when we reach the end of the pattern. + if toEnd && *fp.inputIdx() != m.activeLimit { + // The pattern matched, but not to the end of input. Try some more. + fp = m.stack.popFrame() + break + } + isMatch = true + goto breakFromLoop + + // Start and End Capture stack frame variables are laid out out like this: + // fp->fExtra[opValue] - The start of a completed capture group + // opValue+1 - The end of a completed capture group + // opValue+2 - the start of a capture group whose end + // has not yet been reached (and might not ever be). 
+ case URX_START_CAPTURE: + if !(op.Value() >= 0 && op.Value() < m.stack.frameSize-3) { + panic("failed assertion: opValue >= 0 && opValue < fFrameSize-3") + } + *fp.extra(op.Value() + 2) = *fp.inputIdx() + case URX_END_CAPTURE: + if !(op.Value() >= 0 && op.Value() < m.stack.frameSize-3) { + panic("failed assertion: opValue >= 0 && opValue < fFrameSize-3") + } + if *fp.extra(op.Value() + 2) < 0 { + panic("start pos for this group must be set") + } + + *fp.extra(op.Value()) = *fp.extra(op.Value() + 2) // Tentative start becomes real. + *fp.extra(op.Value() + 1) = *fp.inputIdx() // End position + if !(*fp.extra(op.Value()) <= *fp.extra(op.Value() + 1)) { + panic("failed assertion: fp->fExtra[opValue] <= fp->fExtra[opValue+1]") + } + + case URX_DOLLAR: // $, test for End of line + if *fp.inputIdx() < m.anchorLimit-2 { + fp = m.stack.popFrame() + break + } + // or for position before new line at end of input + if *fp.inputIdx() >= m.anchorLimit { + // We really are at the end of input. Success. + m.hitEnd = true + m.requireEnd = true + break + } + + if *fp.inputIdx() == m.anchorLimit-1 { + c := m.input[*fp.inputIdx()] + if isLineTerminator(c) { + if !(c == 0x0a && *fp.inputIdx() > m.anchorStart && m.input[*fp.inputIdx()-1] == 0x0d) { + // At new-line at end of input. Success + m.hitEnd = true + m.requireEnd = true + break + } + } + } else if *fp.inputIdx() == m.anchorLimit-2 && m.input[*fp.inputIdx()] == 0x0d && m.input[*fp.inputIdx()+1] == 0x0a { + m.hitEnd = true + m.requireEnd = true + break // At CR/LF at end of input. Success + } + fp = m.stack.popFrame() + + case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. + if *fp.inputIdx() >= m.anchorLimit { + // Off the end of input. Success. + m.hitEnd = true + m.requireEnd = true + break + } else { + c := charAt(inputText, *fp.inputIdx()) + *fp.inputIdx()++ + // Either at the last character of input, or off the end. 
+ if c == 0x0a && *fp.inputIdx() == m.anchorLimit { + m.hitEnd = true + m.requireEnd = true + break + } + } + + // Not at end of input. Back-track out. + fp = m.stack.popFrame() + case URX_DOLLAR_M: // $, test for End of line in multi-line mode + if *fp.inputIdx() >= m.anchorLimit { + // We really are at the end of input. Success. + m.hitEnd = true + m.requireEnd = true + break + } + // If we are positioned just before a new-line, succeed. + // It makes no difference where the new-line is within the input. + c := charAt(inputText, *fp.inputIdx()) + if isLineTerminator(c) { + // At a line end, except for the odd chance of being in the middle of a CR/LF sequence + // In multi-line mode, hitting a new-line just before the end of input does not + // set the hitEnd or requireEnd flags + if !(c == 0x0a && *fp.inputIdx() > m.anchorStart && charAt(inputText, *fp.inputIdx()-1) == 0x0d) { + break + } + } + // not at a new line. Fail. + fp = m.stack.popFrame() + case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode + if *fp.inputIdx() >= m.anchorLimit { + // We really are at the end of input. Success. + m.hitEnd = true + m.requireEnd = true // Java set requireEnd in this case, even though + break // adding a new-line would not lose the match. + } + // If we are not positioned just before a new-line, the test fails; backtrack out. + // It makes no difference where the new-line is within the input. + if charAt(inputText, *fp.inputIdx()) != 0x0a { + fp = m.stack.popFrame() + } + case URX_CARET: // ^, test for start of line + if *fp.inputIdx() != m.anchorStart { + fp = m.stack.popFrame() + } + case URX_CARET_M: // ^, test for start of line in mulit-line mode + if *fp.inputIdx() == m.anchorStart { + // We are at the start input. Success. 
+ break + } + // Check whether character just before the current pos is a new-line + // unless we are at the end of input + c := charAt(inputText, *fp.inputIdx()-1) + if (*fp.inputIdx() < m.anchorLimit) && isLineTerminator(c) { + // It's a new-line. ^ is true. Success. + // TODO: what should be done with positions between a CR and LF? + break + } + // Not at the start of a line. Fail. + fp = m.stack.popFrame() + case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode + if !(*fp.inputIdx() >= m.anchorStart) { + panic("failed assertion: *fp.inputIdx() >= m.anchorStart") + } + if *fp.inputIdx() <= m.anchorStart { + // We are at the start input. Success. + break + } + // Check whether character just before the current pos is a new-line + if !(*fp.inputIdx() <= m.anchorLimit) { + panic("failed assertion: *fp.inputIdx() <= m.anchorLimit") + } + + c := charAt(inputText, *fp.inputIdx()-1) + if c != 0x0a { + // Not at the start of a line. Back-track out. + fp = m.stack.popFrame() + } + case URX_BACKSLASH_B: // Test for word boundaries + success := m.isWordBoundary(*fp.inputIdx()) + success = success != (op.Value() != 0) // flip sense for \B + if !success { + fp = m.stack.popFrame() + } + case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style + success := m.isUWordBoundary(*fp.inputIdx()) + success = success != (op.Value() != 0) // flip sense for \B + if !success { + fp = m.stack.popFrame() + } + case URX_BACKSLASH_D: // Test for decimal digit + if *fp.inputIdx() >= m.activeLimit { + m.hitEnd = true + fp = m.stack.popFrame() + break + } + + c := charAt(inputText, *fp.inputIdx()) + + success := m.isDecimalDigit(c) + success = success != (op.Value() != 0) // flip sense for \D + if success { + *fp.inputIdx()++ + } else { + fp = m.stack.popFrame() + } + + case URX_BACKSLASH_G: // Test for position at end of previous match + if !((m.match && *fp.inputIdx() == m.matchEnd) || (!m.match && *fp.inputIdx() == m.activeStart)) { + fp = 
m.stack.popFrame() + } + + case URX_BACKSLASH_H: // Test for \h, horizontal white space. + if *fp.inputIdx() >= m.activeLimit { + m.hitEnd = true + fp = m.stack.popFrame() + break + } + + c := charAt(inputText, *fp.inputIdx()) + success := m.isHorizWS(c) || c == 9 + success = success != (op.Value() != 0) // flip sense for \H + if success { + *fp.inputIdx()++ + } else { + fp = m.stack.popFrame() + } + + case URX_BACKSLASH_R: // Test for \R, any line break sequence. + if *fp.inputIdx() >= m.activeLimit { + m.hitEnd = true + fp = m.stack.popFrame() + break + } + c := charAt(inputText, *fp.inputIdx()) + if isLineTerminator(c) { + if c == 0x0d && charAt(inputText, *fp.inputIdx()+1) == 0x0a { + *fp.inputIdx()++ + } + *fp.inputIdx()++ + } else { + fp = m.stack.popFrame() + } + + case URX_BACKSLASH_V: // \v, any single line ending character. + if *fp.inputIdx() >= m.activeLimit { + m.hitEnd = true + fp = m.stack.popFrame() + break + } + c := charAt(inputText, *fp.inputIdx()) + success := isLineTerminator(c) + success = success != (op.Value() != 0) // flip sense for \V + if success { + *fp.inputIdx()++ + } else { + fp = m.stack.popFrame() + } + + case URX_BACKSLASH_X: + // Match a Grapheme, as defined by Unicode UAX 29. + + // Fail if at end of input + if *fp.inputIdx() >= m.activeLimit { + m.hitEnd = true + fp = m.stack.popFrame() + break + } + + *fp.inputIdx() = m.followingGCBoundary(*fp.inputIdx()) + if *fp.inputIdx() >= m.activeLimit { + m.hitEnd = true + *fp.inputIdx() = m.activeLimit + } + + case URX_BACKSLASH_Z: // Test for end of Input + if *fp.inputIdx() < m.anchorLimit { + fp = m.stack.popFrame() + } else { + m.hitEnd = true + m.requireEnd = true + } + case URX_STATIC_SETREF: + // Test input character against one of the predefined sets + // (Word Characters, for example) + // The high bit of the op value is a flag for the match polarity. + // 0: success if input char is in set. + // 1: success if input char is not in set. 
+ if *fp.inputIdx() >= m.activeLimit { + m.hitEnd = true + fp = m.stack.popFrame() + break + } + + success := (op.Value() & URX_NEG_SET) == URX_NEG_SET + negOp := op.Value() & ^URX_NEG_SET + + if !(negOp > 0 && negOp < URX_LAST_SET) { + panic("assertion failed: negOp > 0 && negOp < URX_LAST_SET") + } + + c := charAt(inputText, *fp.inputIdx()) + s := staticPropertySets[op.Value()] + if s.ContainsRune(c) { + success = !success + } + + if success { + *fp.inputIdx()++ + } else { + // the character wasn't in the set. + fp = m.stack.popFrame() + } + case URX_STAT_SETREF_N: + // Test input character for NOT being a member of one of + // the predefined sets (Word Characters, for example) + if *fp.inputIdx() >= m.activeLimit { + m.hitEnd = true + fp = m.stack.popFrame() + break + } + + if !(op.Value() > 0 && op.Value() < URX_LAST_SET) { + panic("assertion failed: op.Value() > 0 && op.Value() < URX_LAST_SET") + } + + c := charAt(inputText, *fp.inputIdx()) + s := staticPropertySets[op.Value()] + if !s.ContainsRune(c) { + *fp.inputIdx()++ + break + } + // the character wasn't in the set. + fp = m.stack.popFrame() + + case URX_SETREF: + if *fp.inputIdx() >= m.activeLimit { + m.hitEnd = true + fp = m.stack.popFrame() + break + } + + // There is input left. Pick up one char and test it for set membership. + c := charAt(inputText, *fp.inputIdx()) + + if !(op.Value() > 0 && op.Value() < len(m.pattern.sets)) { + panic("assertion failed: op.Value() > 0 && op.Value() < fSets->size()") + } + s := sets[op.Value()] + if s.ContainsRune(c) { + *fp.inputIdx()++ + break + } + + // the character wasn't in the set. + fp = m.stack.popFrame() + + case URX_DOTANY: + // . matches anything, but stops at end-of-line. + if *fp.inputIdx() >= m.activeLimit { + m.hitEnd = true + fp = m.stack.popFrame() + break + } + + c := charAt(inputText, *fp.inputIdx()) + if isLineTerminator(c) { + // End of line in normal mode. . does not match. 
+ fp = m.stack.popFrame() + break + } + *fp.inputIdx()++ + + case URX_DOTANY_ALL: + // ., in dot-matches-all (including new lines) mode + if *fp.inputIdx() >= m.activeLimit { + // At end of input. Match failed. Backtrack out. + m.hitEnd = true + fp = m.stack.popFrame() + break + } + + c := charAt(inputText, *fp.inputIdx()) + *fp.inputIdx()++ + if c == 0x0d && *fp.inputIdx() < m.activeLimit { + // In the case of a CR/LF, we need to advance over both. + nextc := charAt(inputText, *fp.inputIdx()) + if nextc == 0x0a { + *fp.inputIdx()++ + } + } + + case URX_DOTANY_UNIX: + // '.' operator, matches all, but stops at end-of-line. + // UNIX_LINES mode, so 0x0a is the only recognized line ending. + if *fp.inputIdx() >= m.activeLimit { + // At end of input. Match failed. Backtrack out. + m.hitEnd = true + fp = m.stack.popFrame() + break + } + + // There is input left. Advance over one char, unless we've hit end-of-line + c := charAt(inputText, *fp.inputIdx()) + if c == 0x0a { + // End of line in normal mode. '.' does not match the \n + fp = m.stack.popFrame() + } else { + *fp.inputIdx()++ + } + case URX_JMP: + *fp.patIdx() = int(op.Value()) + + case URX_FAIL: + isMatch = false + goto breakFromLoop + + case URX_JMP_SAV: + if !(op.Value() > 0 && int(op.Value()) < len(pat)) { + panic("assertion failed: op.Value() > 0 && op.Value() < fPattern->fCompiledPat->size()") + } + fp, err = m.StateSave(*fp.inputIdx(), *fp.patIdx()) // State save to loc following current + if err != nil { + return err + } + *fp.patIdx() = int(op.Value()) // Then JMP. + + case URX_JMP_SAV_X: + // This opcode is used with (x)+, when x can match a zero length string. + // Same as JMP_SAV, except conditional on the match having made forward progress. + // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the + // data address of the input position at the start of the loop. 
+ if !(op.Value() > 0 && int(op.Value()) < len(pat)) { + panic("assertion failed: op.Value() > 0 && op.Value() < fPattern->fCompiledPat->size()") + } + stoOp := pat[op.Value()-1] + if !(stoOp.Type() == URX_STO_INP_LOC) { + panic("assertion failed: stoOp.Type() == URX_STO_INP_LOC") + } + + frameLoc := int(stoOp.Value()) + if !(frameLoc >= 0 && frameLoc < m.stack.frameSize) { + panic("assertion failed: frameLoc >= 0 && frameLoc < fFrameSize") + } + + prevInputIdx := *fp.extra(frameLoc) + if !(prevInputIdx <= *fp.inputIdx()) { + panic("assertion failed: prevInputIdx <= *fp.inputIdx()") + } + if prevInputIdx < *fp.inputIdx() { + // The match did make progress. Repeat the loop. + fp, err = m.StateSave(*fp.inputIdx(), *fp.patIdx()) // State save to loc following current + if err != nil { + return err + } + *fp.patIdx() = int(op.Value()) // Then JMP. + *fp.extra(frameLoc) = *fp.inputIdx() + } + // If the input position did not advance, we do nothing here, + // execution will fall out of the loop. + + case URX_CTR_INIT: + if !(op.Value() >= 0 && int(op.Value()) < m.stack.frameSize-2) { + panic("assertion failed: op.Value() >= 0 && op.Value() < fFrameSize-2") + } + *fp.extra(op.Value()) = 0 // Set the loop counter variable to zero + + // Pick up the three extra operands that CTR_INIT has, and + // skip the pattern location counter past + instOperandLoc := *fp.patIdx() + *fp.patIdx() += 3 // Skip over the three operands that CTR_INIT has. 
+ + loopLoc := pat[instOperandLoc].Value() + minCount := int(pat[instOperandLoc+1]) + maxCount := int(pat[instOperandLoc+2]) + + if !(minCount >= 0 && maxCount >= minCount || maxCount == -1) { + panic("assertion failed: minCount >= 0 && maxCount >= minCount || maxCount == -1") + } + if !(int(loopLoc) >= *fp.patIdx()) { + panic("assertion failed: loopLoc >= *fp.patIdx()") + } + + if minCount == 0 { + fp, err = m.StateSave(*fp.inputIdx(), loopLoc+1) + if err != nil { + return err + } + } + if maxCount == -1 { + *fp.extra(op.Value() + 1) = *fp.inputIdx() // For loop breaking. + } else if maxCount == 0 { + fp = m.stack.popFrame() + } + + case URX_CTR_LOOP: + if !(op.Value() >= 0 && op.Value() < *fp.patIdx()-2) { + panic("assertion failed: op.Value() >= 0 && op.Value() < *fp.patIdx()-2") + } + initOp := pat[op.Value()] + if !(initOp.Type() == URX_CTR_INIT) { + panic("assertion failed: initOp.Type() == URX_CTR_INIT") + } + opValue := initOp.Value() + pCounter := fp.extra(opValue) + minCount := int(pat[op.Value()+2]) + maxCount := int(pat[op.Value()+3]) + *pCounter++ + if *pCounter >= maxCount && maxCount != -1 { + if !(*pCounter == maxCount) { + panic("assertion failed: *pCounter == maxCount") + } + break + } + + if *pCounter >= minCount { + if maxCount == -1 { + // Loop has no hard upper bound. + // Check that it is progressing through the input, break if it is not. + pLastIntputIdx := fp.extra(opValue + 1) + if *pLastIntputIdx == *fp.inputIdx() { + break + } else { + *pLastIntputIdx = *fp.inputIdx() + } + } + fp, err = m.StateSave(*fp.inputIdx(), *fp.patIdx()) + if err != nil { + return err + } + } else { + // Increment time-out counter. (StateSave() does it if count >= minCount) + m.tickCounter-- + if m.tickCounter <= 0 { + if err = m.incrementTime(*fp.inputIdx()); err != nil { + return err + } // Re-initializes fTickCounter + } + } + + *fp.patIdx() = op.Value() + 4 // Loop back. 
+ + case URX_CTR_INIT_NG: + if !(op.Value() >= 0 && int(op.Value()) < m.stack.frameSize-2) { + panic("assertion failed: op.Value() >= 0 && op.Value() < fFrameSize-2") + } + *fp.extra(op.Value()) = 0 // Set the loop counter variable to zero + + // Pick up the three extra operands that CTR_INIT_NG has, and + // skip the pattern location counter past + instrOperandLoc := *fp.patIdx() + *fp.patIdx() += 3 + loopLoc := pat[instrOperandLoc].Value() + minCount := pat[instrOperandLoc+1].Value() + maxCount := pat[instrOperandLoc+2].Value() + + if !(minCount >= 0 && maxCount >= minCount || maxCount == -1) { + panic("assertion failed: minCount >= 0 && maxCount >= minCount || maxCount == -1") + } + + if maxCount == -1 { + *fp.extra(op.Value() + 1) = *fp.inputIdx() // Save initial input index for loop breaking. + } + + if minCount == 0 { + if maxCount != 0 { + fp, err = m.StateSave(*fp.inputIdx(), *fp.patIdx()) + if err != nil { + return err + } + } + *fp.patIdx() = loopLoc + 1 + } + + case URX_CTR_LOOP_NG: + if !(op.Value() >= 0 && int(op.Value()) < *fp.patIdx()-2) { + panic("assertion failed: op.Value() >= 0 && op.Value() < *fp.patIdx()-2") + } + initOp := pat[op.Value()] + if !(initOp.Type() == URX_CTR_INIT_NG) { + panic("assertion failed: initOp.Type() == URX_CTR_INIT_NG") + } + pCounter := fp.extra(initOp.Value()) + minCount := int(pat[op.Value()+2]) + maxCount := int(pat[op.Value()+3]) + *pCounter++ + if *pCounter >= maxCount && maxCount != -1 { + // The loop has matched the maximum permitted number of times. + // Break out of here with no action. Matching will + // continue with the following pattern. + if !(*pCounter == maxCount) { + panic("assertion failed: *pCounter == maxCount") + } + break + } + + if *pCounter < minCount { + // We haven't met the minimum number of matches yet. + // Loop back for another one. + *fp.patIdx() = op.Value() + 4 // Loop back. + // Increment time-out counter. 
(StateSave() does it if count >= minCount) + m.tickCounter-- + if m.tickCounter <= 0 { + if err = m.incrementTime(*fp.inputIdx()); err != nil { + return err + } // Re-initializes fTickCounter + } + } else { + // We do have the minimum number of matches. + + // If there is no upper bound on the loop iterations, check that the input index + // is progressing, and stop the loop if it is not. + if maxCount == -1 { + lastInputIdx := fp.extra(initOp.Value() + 1) + if *fp.inputIdx() == *lastInputIdx { + break + } + *lastInputIdx = *fp.inputIdx() + } + } + + // Loop Continuation: we will fall into the pattern following the loop + // (non-greedy, don't execute loop body first), but first do + // a state save to the top of the loop, so that a match failure + // in the following pattern will try another iteration of the loop. + fp, err = m.StateSave(*fp.inputIdx(), op.Value()+4) + if err != nil { + return err + } + + case URX_STO_SP: + if !(op.Value() >= 0 && op.Value() < m.pattern.dataSize) { + panic("assertion failed: op.Value() >= 0 && op.Value() < fPattern->fDataSize") + } + m.data[op.Value()] = m.stack.len() + + case URX_LD_SP: + if !(op.Value() >= 0 && op.Value() < m.pattern.dataSize) { + panic("assertion failed: op.Value() >= 0 && op.Value() < fPattern->fDataSize") + } + newStackSize := m.data[op.Value()] + if !(newStackSize <= m.stack.len()) { + panic("assertion failed: newStackSize <= fStack->size()") + } + newFp := m.stack.offset(newStackSize) + if newFp.equals(fp) { + break + } + copy(newFp, fp) + fp = newFp + + m.stack.setSize(newStackSize) + case URX_BACKREF: + if !(op.Value() < m.stack.frameSize) { + panic("assertion failed: op.Value() < fFrameSize") + } + + groupStartIdx := *fp.extra(op.Value()) + groupEndIdx := *fp.extra(op.Value() + 1) + + if !(groupStartIdx <= groupEndIdx) { + panic("assertion failed: groupStartIdx <= groupEndIdx") + } + + if groupStartIdx < 0 { + // This capture group has not participated in the match thus far, + fp = m.stack.popFrame() // 
FAIL, no match. + break + } + + success := true + for { + if groupStartIdx >= groupEndIdx { + success = true + break + } + + if *fp.inputIdx() >= m.activeLimit { + success = false + m.hitEnd = true + break + } + + captureGroupChar := charAt(inputText, groupStartIdx) + inputChar := charAt(inputText, *fp.inputIdx()) + groupStartIdx++ + *fp.inputIdx()++ + if inputChar != captureGroupChar { + success = false + break + } + } + + if !success { + fp = m.stack.popFrame() + } + case URX_BACKREF_I: + if !(op.Value() < m.stack.frameSize) { + panic("assertion failed: op.Value() < fFrameSize") + } + + groupStartIdx := *fp.extra(op.Value()) + groupEndIdx := *fp.extra(op.Value() + 1) + if !(groupStartIdx <= groupEndIdx) { + panic("assertion failed: groupStartIdx <= groupEndIdx") + } + if !(groupStartIdx <= groupEndIdx) { + panic("assertion failed: groupStartIdx <= groupEndIdx") + } + + if groupStartIdx < 0 { + // This capture group has not participated in the match thus far, + fp = m.stack.popFrame() // FAIL, no match. + break + } + + captureGroupItr := newCaseFoldIterator(m.input, groupStartIdx, groupEndIdx) + inputItr := newCaseFoldIterator(m.input, *fp.inputIdx(), m.activeLimit) + success := true + + for { + captureGroupChar := captureGroupItr.next() + if captureGroupChar == -1 { + success = true + break + } + inputChar := inputItr.next() + if inputChar == -1 { + success = false + m.hitEnd = true + break + } + if inputChar != captureGroupChar { + success = false + break + } + } + + if success && inputItr.inExpansion() { + // We otained a match by consuming part of a string obtained from + // case-folding a single code point of the input text. + // This does not count as an overall match. 
+ success = false + } + + if success { + *fp.inputIdx() = inputItr.index + } else { + fp = m.stack.popFrame() + } + + case URX_STO_INP_LOC: + if !(op.Value() >= 0 && op.Value() < m.stack.frameSize) { + panic("assertion failed: op.Value() >= 0 && op.Value() < fFrameSize") + } + *fp.extra(op.Value()) = *fp.inputIdx() + + case URX_JMPX: + instrOperandLoc := *fp.patIdx() + *fp.patIdx()++ + dataLoc := pat[instrOperandLoc].Value() + if !(dataLoc >= 0 && dataLoc < m.stack.frameSize) { + panic("assertion failed: dataLoc >= 0 && dataLoc < fFrameSize") + } + + saveInputIdx := *fp.extra(dataLoc) + if !(saveInputIdx <= *fp.inputIdx()) { + panic("assertion failed: saveInputIdx <= *fp.inputIdx()") + } + + if saveInputIdx < *fp.inputIdx() { + *fp.patIdx() = op.Value() // JMP + } else { + fp = m.stack.popFrame() // FAIL, no progress in loop. + } + + case URX_LA_START: + if !(op.Value() >= 0 && op.Value()+3 < m.pattern.dataSize) { + panic("assertion failed: op.Value() >= 0 && op.Value()+3 < fDataSize") + } + m.data[op.Value()] = m.stack.len() + m.data[op.Value()+1] = *fp.inputIdx() + m.data[op.Value()+2] = m.activeStart + m.data[op.Value()+3] = m.activeLimit + m.activeStart = m.lookStart // Set the match region change for + m.activeLimit = m.lookLimit // transparent bounds. + + case URX_LA_END: + if !(op.Value() >= 0 && op.Value()+3 < m.pattern.dataSize) { + panic("assertion failed: op.Value() >= 0 && op.Value()+3 < fDataSize") + } + stackSize := m.stack.len() + newStackSize := m.data[op.Value()] + if !(stackSize >= newStackSize) { + panic("assertion failed: stackSize >= newStackSize") + } + if stackSize > newStackSize { + // Copy the current top frame back to the new (cut back) top frame. + // This makes the capture groups from within the look-ahead + // expression available. 
+ newFp := m.stack.offset(newStackSize) + copy(newFp, fp) + fp = newFp + m.stack.setSize(newStackSize) + } + + *fp.inputIdx() = m.data[op.Value()+1] + + m.activeStart = m.data[op.Value()+2] + m.activeLimit = m.data[op.Value()+3] + if !(m.activeStart >= 0) { + panic("assertion failed: m.activeStart >= 0") + } + if !(m.activeLimit <= len(inputText)) { + panic("assertion failed: m.activeLimit <= len(inputText)") + } + + case URX_ONECHAR_I: + // Case insensitive one char. The char from the pattern is already case folded. + // Input text is not, but case folding the input can not reduce two or more code + // points to one. + if *fp.inputIdx() < m.activeLimit { + c := charAt(inputText, *fp.inputIdx()) + if ucase.Fold(c) == op.Value32() { + *fp.inputIdx()++ + break + } + } else { + m.hitEnd = true + } + + fp = m.stack.popFrame() + + case URX_STRING_I: + // Case-insensitive test input against a literal string. + // Strings require two slots in the compiled pattern, one for the + // offset to the string text, and one for the length. + // The compiled string has already been case folded. + patternString := litText[op.Value():] + var patternStringIdx int + nextOp := pat[*fp.patIdx()] + *fp.patIdx()++ + if !(nextOp.Type() == URX_STRING_LEN) { + panic("assertion failed: nextOp.Type() == URX_STRING_LEN") + } + patternStringLen := nextOp.Value() + + success := true + + it := newCaseFoldIterator(inputText, *fp.inputIdx(), m.activeLimit) + for patternStringIdx < patternStringLen { + cText := it.next() + cPattern := patternString[patternStringIdx] + patternStringIdx++ + + if cText != cPattern { + success = false + if cText == -1 { + m.hitEnd = true + } + break + } + } + if it.inExpansion() { + success = false + } + + if success { + *fp.inputIdx() = it.index + } else { + fp = m.stack.popFrame() + } + + case URX_LB_START: + // Entering a look-behind block. + // Save Stack Ptr, Input Pos and active input region. + // TODO: implement transparent bounds. 
Ticket #6067 + if !(op.Value() >= 0 && op.Value()+4 < m.pattern.dataSize) { + panic("assertion failed: op.Value() >= 0 && op.Value()+4 < fDataSize") + } + m.data[op.Value()] = m.stack.len() + m.data[op.Value()+1] = *fp.inputIdx() + // Save input string length, then reset to pin any matches to end at + // the current position. + m.data[op.Value()+2] = m.activeStart + m.data[op.Value()+3] = m.activeLimit + m.activeStart = m.regionStart + m.activeLimit = *fp.inputIdx() + // Init the variable containing the start index for attempted matches. + m.data[op.Value()+4] = -1 + case URX_LB_CONT: + // Positive Look-Behind, at top of loop checking for matches of LB expression + // at all possible input starting positions. + + // Fetch the min and max possible match lengths. They are the operands + // of this op in the pattern. + minML := pat[*fp.patIdx()] + *fp.patIdx()++ + maxML := pat[*fp.patIdx()] + *fp.patIdx()++ + if !(minML <= maxML) { + panic("assertion failed: minML <= maxML") + } + if !(minML >= 0) { + panic("assertion failed: minML >= 0") + } + + if !(op.Value() >= 0 && op.Value()+4 < m.pattern.dataSize) { + panic("assertion failed: op.Value() >= 0 && op.Value()+4 < fDataSize") + } + lbStartIdx := &m.data[op.Value()+4] + if *lbStartIdx < 0 { + // First time through loop. + *lbStartIdx = *fp.inputIdx() - int(minML) + if *lbStartIdx > 0 { + *lbStartIdx = *fp.inputIdx() + } + } else { + // 2nd through nth time through the loop. + // Back up start position for match by one. + *lbStartIdx-- + } + + if *lbStartIdx < 0 || *lbStartIdx < *fp.inputIdx()-int(maxML) { + // We have tried all potential match starting points without + // getting a match. Backtrack out, and out of the + // Look Behind altogether. 
+ fp = m.stack.popFrame() + m.activeStart = m.data[op.Value()+2] + m.activeLimit = m.data[op.Value()+3] + if !(m.activeStart >= 0) { + panic("assertion failed: fActiveStart >= 0") + } + if !(m.activeLimit <= inputLength) { + panic("assertion failed: fActiveLimit <= fInputLength") + } + break + } + + // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. + // (successful match will fall off the end of the loop.) + fp, err = m.StateSave(*fp.inputIdx(), *fp.patIdx()-3) + if err != nil { + return err + } + *fp.inputIdx() = *lbStartIdx + + case URX_LB_END: + // End of a look-behind block, after a successful match. + if !(op.Value() >= 0 && op.Value()+4 < m.pattern.dataSize) { + panic("assertion failed: op.Value() >= 0 && op.Value()+4 < fDataSize") + } + if *fp.inputIdx() != m.activeLimit { + // The look-behind expression matched, but the match did not + // extend all the way to the point that we are looking behind from. + // FAIL out of here, which will take us back to the LB_CONT, which + // will retry the match starting at another position or fail + // the look-behind altogether, whichever is appropriate. + fp = m.stack.popFrame() + break + } + + // Look-behind match is good. Restore the orignal input string region, + // which had been truncated to pin the end of the lookbehind match to the + // position being looked-behind. + m.activeStart = m.data[op.Value()+2] + m.activeLimit = m.data[op.Value()+3] + if !(m.activeStart >= 0) { + panic("assertion failed: fActiveStart >= 0") + } + if !(m.activeLimit <= inputLength) { + panic("assertion failed: fActiveLimit <= fInputLength") + } + case URX_LBN_CONT: + // Negative Look-Behind, at top of loop checking for matches of LB expression + // at all possible input starting positions. + + // Fetch the extra parameters of this op. 
+ minML := pat[*fp.patIdx()] + *fp.patIdx()++ + maxML := pat[*fp.patIdx()] + *fp.patIdx()++ + + continueLoc := pat[*fp.patIdx()].Value() + *fp.patIdx()++ + + if !(minML <= maxML) { + panic("assertion failed: minML <= maxML") + } + if !(minML >= 0) { + panic("assertion failed: minML >= 0") + } + if !(continueLoc > *fp.patIdx()) { + panic("assertion failed: continueLoc > *fp.patIdx()") + } + + // Fetch (from data) the last input index where a match was attempted. + if !(op.Value() >= 0 && op.Value()+4 < m.pattern.dataSize) { + panic("assertion failed: op.Value() >= 0 && op.Value()+4 < fDataSize") + } + + lbStartIdx := &m.data[op.Value()+4] + + if *lbStartIdx < 0 { + // First time through loop. + *lbStartIdx = *fp.inputIdx() - int(minML) + if *lbStartIdx > 0 { + // move index to a code point boundary, if it's not on one already. + *lbStartIdx = *fp.inputIdx() + } + } else { + // 2nd through nth time through the loop. + // Back up start position for match by one. + *lbStartIdx-- + } + + if *lbStartIdx < 0 || *lbStartIdx < *fp.inputIdx()-int(maxML) { + // We have tried all potential match starting points without + // getting a match, which means that the negative lookbehind as + // a whole has succeeded. Jump forward to the continue location + m.activeStart = m.data[op.Value()+2] + m.activeLimit = m.data[op.Value()+3] + if !(m.activeStart >= 0) { + panic("assertion failed: fActiveStart >= 0") + } + if !(m.activeLimit <= inputLength) { + panic("assertion failed: fActiveLimit <= fInputLength") + } + *fp.patIdx() = continueLoc + break + } + + // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. + // (successful match will cause a FAIL out of the loop altogether.) + fp, err = m.StateSave(*fp.inputIdx(), *fp.patIdx()-4) + if err != nil { + return err + } + *fp.inputIdx() = *lbStartIdx + case URX_LBN_END: + // End of a negative look-behind block, after a successful match. 
+ if !(op.Value() >= 0 && op.Value()+4 < m.pattern.dataSize) { + panic("assertion failed: op.Value() >= 0 && op.Value()+4 < fDataSize") + } + + if *fp.inputIdx() != m.activeLimit { + // The look-behind expression matched, but the match did not + // extend all the way to the point that we are looking behind from. + // FAIL out of here, which will take us back to the LB_CONT, which + // will retry the match starting at another position or succeed + // the look-behind altogether, whichever is appropriate. + fp = m.stack.popFrame() + break + } + + // Look-behind expression matched, which means look-behind test as + // a whole Fails + + // Restore the orignal input string length, which had been truncated + // inorder to pin the end of the lookbehind match + // to the position being looked-behind. + m.activeStart = m.data[op.Value()+2] + m.activeLimit = m.data[op.Value()+3] + if !(m.activeStart >= 0) { + panic("assertion failed: fActiveStart >= 0") + } + if !(m.activeLimit <= inputLength) { + panic("assertion failed: fActiveLimit <= fInputLength") + } + + // Restore original stack position, discarding any state saved + // by the successful pattern match. + if !(op.Value() >= 0 && op.Value()+1 < m.pattern.dataSize) { + panic("assertion failed: op.Value() >= 0 && op.Value()+1 < fDataSize") + } + newStackSize := m.data[op.Value()] + if !(m.stack.len() > newStackSize) { + panic("assertion failed: fStack.size() > newStackSize") + } + m.stack.setSize(newStackSize) + + // FAIL, which will take control back to someplace + // prior to entering the look-behind test. + fp = m.stack.popFrame() + case URX_LOOP_SR_I: + // Loop Initialization for the optimized implementation of + // [some character set]* + // This op scans through all matching input. + // The following LOOP_C op emulates stack unwinding if the following pattern fails. 
+ if !(op.Value() >= 0 && op.Value() < len(sets)) { + panic("assertion failed: op.Value() >= 0 && op.Value() < fSets.size()") + } + s := sets[op.Value()] + + // Loop through input, until either the input is exhausted or + // we reach a character that is not a member of the set. + ix := *fp.inputIdx() + + for { + if ix >= m.activeLimit { + m.hitEnd = true + break + } + c := charAt(inputText, ix) + if !s.ContainsRune(c) { + break + } + ix++ + } + + // If there were no matching characters, skip over the loop altogether. + // The loop doesn't run at all, a * op always succeeds. + if ix == *fp.inputIdx() { + *fp.patIdx()++ // skip the URX_LOOP_C op. + break + } + + // Peek ahead in the compiled pattern, to the URX_LOOP_C that + // must follow. It's operand is the stack location + // that holds the starting input index for the match of this [set]* + loopcOp := pat[*fp.patIdx()] + if !(loopcOp.Type() == URX_LOOP_C) { + panic("assertion failed: loopcOp.Type() == URX_LOOP_C") + } + stackLoc := loopcOp.Value() + if !(stackLoc >= 0 && stackLoc < m.stack.frameSize) { + panic("assertion failed: stackLoc >= 0 && stackLoc < fFrameSize") + } + *fp.extra(stackLoc) = *fp.inputIdx() + *fp.inputIdx() = ix + + // Save State to the URX_LOOP_C op that follows this one, + // so that match failures in the following code will return to there. + // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. + fp, err = m.StateSave(*fp.inputIdx(), *fp.patIdx()) + if err != nil { + return err + } + *fp.patIdx()++ + case URX_LOOP_DOT_I: + // Loop Initialization for the optimized implementation of .* + // This op scans through all remaining input. + // The following LOOP_C op emulates stack unwinding if the following pattern fails. + + // Loop through input until the input is exhausted (we reach an end-of-line) + // In DOTALL mode, we can just go straight to the end of the input. + var ix int + if (op.Value() & 1) == 1 { + // Dot-matches-All mode. 
Jump straight to the end of the string. + ix = m.activeLimit + m.hitEnd = true + } else { + // NOT DOT ALL mode. Line endings do not match '.' + // Scan forward until a line ending or end of input. + ix = *fp.inputIdx() + for { + if ix >= m.activeLimit { + m.hitEnd = true + break + } + c := charAt(inputText, ix) + if (c & 0x7f) <= 0x29 { // Fast filter of non-new-line-s + if (c == 0x0a) || // 0x0a is newline in both modes. + (((op.Value() & 2) == 0) && // IF not UNIX_LINES mode + isLineTerminator(c)) { + // char is a line ending. Exit the scanning loop. + break + } + } + ix++ + } + } + + // If there were no matching characters, skip over the loop altogether. + // The loop doesn't run at all, a * op always succeeds. + if ix == *fp.inputIdx() { + *fp.patIdx()++ // skip the URX_LOOP_C op. + break + } + + // Peek ahead in the compiled pattern, to the URX_LOOP_C that + // must follow. It's operand is the stack location + // that holds the starting input index for the match of this .* + loopcOp := pat[*fp.patIdx()] + if !(loopcOp.Type() == URX_LOOP_C) { + panic("assertion failed: loopcOp.Type() == URX_LOOP_C") + } + stackLoc := loopcOp.Value() + if !(stackLoc >= 0 && stackLoc < m.stack.frameSize) { + panic("assertion failed: stackLoc >= 0 && stackLoc < fFrameSize") + } + *fp.extra(stackLoc) = *fp.inputIdx() + *fp.inputIdx() = ix + + // Save State to the URX_LOOP_C op that follows this one, + // so that match failures in the following code will return to there. + // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 
+ fp, err = m.StateSave(*fp.inputIdx(), *fp.patIdx()) + if err != nil { + return err + } + *fp.patIdx()++ + + case URX_LOOP_C: + if !(op.Value() >= 0 && op.Value() < m.stack.frameSize) { + panic("assertion failed: op.Value() >= 0 && op.Value() < fFrameSize") + } + backSearchIndex := *fp.extra(op.Value()) + if !(backSearchIndex <= *fp.inputIdx()) { + panic("assertion failed: backSearchIndex <= *fp.inputIdx()") + } + + if backSearchIndex == *fp.inputIdx() { + // We've backed up the input idx to the point that the loop started. + // The loop is done. Leave here without saving state. + // Subsequent failures won't come back here. + break + } + // Set up for the next iteration of the loop, with input index + // backed up by one from the last time through, + // and a state save to this instruction in case the following code fails again. + // (We're going backwards because this loop emulates stack unwinding, not + // the initial scan forward.) + if !(*fp.inputIdx() > 0) { + panic("assertion failed: *fp.inputIdx() > 0") + } + + prevC := charAt(inputText, *fp.inputIdx()-1) + *fp.inputIdx()-- + twoPrevC := charAt(inputText, *fp.inputIdx()-1) + + if prevC == 0x0a && + *fp.inputIdx() > backSearchIndex && + twoPrevC == 0x0d { + prevOp := pat[*fp.patIdx()-2] + if prevOp.Type() == URX_LOOP_DOT_I { + // .*, stepping back over CRLF pair. + *fp.inputIdx()-- + } + } + + fp, err = m.StateSave(*fp.inputIdx(), *fp.patIdx()-1) + if err != nil { + return err + } + default: + // Trouble. The compiled pattern contains an entry with an + // unrecognized type tag. + panic("unreachable") + } + } + +breakFromLoop: + m.match = isMatch + if isMatch { + m.lastMatchEnd = m.matchEnd + m.matchStart = startIdx + m.matchEnd = *fp.inputIdx() + } + + if Dumper != nil { + if isMatch { + fmt.Fprintf(Dumper, "Match. start=%d end=%d\n\n", m.matchStart, m.matchEnd) + } else { + fmt.Fprintf(Dumper, "No match\n\n") + } + } + + m.frame = fp // The active stack frame when the engine stopped. 
	// Contains the capture group results that we need to
	// access later.
	return nil
}

// charAt returns the rune at position idx of str, or -1 when idx is
// negative or not less than len(str). Callers use the -1 sentinel to detect
// reads outside the input instead of bounds-checking themselves.
func charAt(str []rune, idx int) rune {
	if idx >= 0 && idx < len(str) {
		return str[idx]
	}
	return -1
}

// isWordBoundary reports whether pos sits on a word boundary.
//
// The character at pos is classified by membership in
// staticPropertySets[URX_ISWORD_SET]; if pos is at or beyond m.lookLimit the
// "current" side counts as non-word and m.hitEnd is set. A character at pos
// that has the grapheme-extend property or is a format character never
// starts a boundary, so the function returns false immediately.
//
// The "previous" side is the nearest preceding character (not scanning
// below m.lookStart) that is neither grapheme-extend nor a format
// character. The position is a boundary when the two sides differ in
// word-set membership.
func (m *Matcher) isWordBoundary(pos int) bool {
	cIsWord := false

	if pos >= m.lookLimit {
		m.hitEnd = true
	} else {
		c := charAt(m.input, pos)
		if uprops.HasBinaryProperty(c, uprops.UCHAR_GRAPHEME_EXTEND) || uchar.CharType(c) == uchar.U_FORMAT_CHAR {
			return false
		}
		cIsWord = staticPropertySets[URX_ISWORD_SET].ContainsRune(c)
	}

	// Walk backwards over grapheme-extend / format characters until a
	// character that can be classified is found or lookStart is reached.
	prevCIsWord := false
	for {
		if pos <= m.lookStart {
			break
		}
		prevChar := charAt(m.input, pos-1)
		pos--
		if !(uprops.HasBinaryProperty(prevChar, uprops.UCHAR_GRAPHEME_EXTEND) || uchar.CharType(prevChar) == uchar.U_FORMAT_CHAR) {
			prevCIsWord = staticPropertySets[URX_ISWORD_SET].ContainsRune(prevChar)
			break
		}
	}
	return cIsWord != prevCIsWord
}

// isUWordBoundary is a stub: it ignores pos and always returns false.
//
// The commented-out C++ below is kept as a reference for a future
// implementation (NOTE(review): it appears to be the upstream code this
// function was ported from - confirm). It relies on a word break iterator,
// which is not available here.
func (m *Matcher) isUWordBoundary(pos int) bool {
	// TODO: implement
	/*
		UBool returnVal = FALSE;

		#if UCONFIG_NO_BREAK_ITERATION==0
		// Note: this point will never be reached if break iteration is configured out.
		// Regex patterns that would require this function will fail to compile.

		// If we haven't yet created a break iterator for this matcher, do it now.
		if (fWordBreakItr == nullptr) {
			fWordBreakItr = BreakIterator::createWordInstance(Locale::getEnglish(), status);
			if (U_FAILURE(status)) {
				return FALSE;
			}
			fWordBreakItr->setText(fInputText, status);
		}

		// Note: zero width boundary tests like \b see through transparent region bounds,
		// which is why fLookLimit is used here, rather than fActiveLimit.
		if (pos >= fLookLimit) {
			fHitEnd = TRUE;
			returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
			// words are not boundaries. All non-word chars stand by themselves,
			// with word boundaries on both sides.
		} else {
			returnVal = fWordBreakItr->isBoundary((int32_t)pos);
		}
		#endif
		return returnVal;
	*/
	return false
}

// resetStack empties the backtracking stack, pushes one fresh frame, clears
// that frame's extra slots and returns it.
//
// NOTE(review): the error returned by m.stack.newFrame is discarded here;
// presumably pushing the first frame onto an empty stack cannot fail -
// confirm against the stack implementation.
func (m *Matcher) resetStack() StackFrame {
	m.stack.reset()
	frame, _ := m.stack.newFrame(0, nil, "")
	frame.clearExtra()
	return frame
}

// StateSave pushes a backtracking point onto the stack.
//
// A new top frame is allocated and filled with a copy of the previous top
// frame. savePatIdx is then written into the previous frame (the one
// returned by m.stack.prevFromTop()), and the new top frame is returned
// for the caller to keep matching with.
//
// Each call also decrements m.tickCounter; when it reaches zero or below,
// incrementTime is invoked with the previous frame's input index.
//
// It returns a nil frame and a non-nil error if the frame cannot be
// allocated or if incrementTime reports that the time limit was reached.
func (m *Matcher) StateSave(inputIdx, savePatIdx int) (StackFrame, error) {
	// push storage for a new frame.
	newFP, err := m.stack.newFrame(inputIdx, m.input, m.pattern.pattern)
	if err != nil {
		return nil, err
	}
	// Fetched only after newFrame has run, so it refers to the frame that
	// now sits directly below the new top.
	fp := m.stack.prevFromTop()

	// New stack frame = copy of old top frame.
	copy(newFP, fp)

	m.tickCounter--
	if m.tickCounter <= 0 {
		if err := m.incrementTime(*fp.inputIdx()); err != nil {
			return nil, err
		}
	}
	*fp.patIdx() = savePatIdx
	return newFP, nil
}

// incrementTime re-arms m.tickCounter to TIMER_INITIAL_VALUE and advances
// m.time by one.
//
// When a positive m.timeLimit is configured and m.time has reached it, a
// *MatchError with code uerror.U_REGEX_TIME_OUT is returned; inputIdx is
// recorded as the error's Position. Otherwise it returns nil.
func (m *Matcher) incrementTime(inputIdx int) error {
	m.tickCounter = TIMER_INITIAL_VALUE
	m.time++
	if m.timeLimit > 0 && m.time >= m.timeLimit {
		return &MatchError{
			Code:     uerror.U_REGEX_TIME_OUT,
			Pattern:  m.pattern.pattern,
			Position: inputIdx,
			Input:    m.input,
		}
	}
	return nil
}

// isDecimalDigit reports whether c is a digit according to uchar.IsDigit.
func (m *Matcher) isDecimalDigit(c rune) bool {
	return uchar.IsDigit(c)
}

// isHorizWS reports whether c is horizontal white space: a character whose
// type is uchar.U_SPACE_SEPARATOR, or a tab (code point 9).
func (m *Matcher) isHorizWS(c rune) bool {
	return uchar.CharType(c) == uchar.U_SPACE_SEPARATOR || c == 9
}

// followingGCBoundary is not implemented: it ignores pos and always panics
// with "TODO".
//
// The commented-out code below sketches the intended behavior using a
// character break iterator: return the boundary following pos, or pos
// itself when the iterator is exhausted.
func (m *Matcher) followingGCBoundary(pos int) int {
	// TODO: implement
	/*
		// Note: this point will never be reached if break iteration is configured out.
		// Regex patterns that would require this function will fail to compile.

		// If we haven't yet created a break iterator for this matcher, do it now.
		if (m.gcBreakItr == nil) {
			m.gcBreakItr = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
			if (U_FAILURE(status)) {
				return pos;
			}
			fGCBreakItr->setText(fInputText, status);
		}
		result = fGCBreakItr->following(pos);
		if (result == BreakIterator::DONE) {
			result = pos;
		}
	*/
	panic("TODO")
}

// Reset replaces the matcher's input with input, converted to a slice of
// runes, and then calls m.reset().
func (m *Matcher) Reset(input string) {
	m.input = []rune(input)
	m.reset()
}

// Matches runs MatchAt from m.activeStart with its second argument set to
// true, and returns the resulting m.match together with any error from
// MatchAt.
//
// NOTE(review): the second argument presumably requires the match to
// extend to the end of the active region - confirm against MatchAt.
func (m *Matcher) Matches() (bool, error) {
	err := m.MatchAt(m.activeStart, true)
	return m.match, err
}

// LookingAt runs MatchAt from m.activeStart with its second argument set to
// false, and returns the resulting m.match together with any error from
// MatchAt.
//
// NOTE(review): presumably this anchors the match at the start of the
// active region without requiring it to reach the end - confirm against
// MatchAt.
func (m *Matcher) LookingAt() (bool, error) {
	err := m.MatchAt(m.activeStart, false)
	return m.match, err
}

func (m *Matcher) Find() (bool, error) {
	startPos := m.matchEnd
	if startPos == 0 {
		startPos = m.activeStart
	}

	if m.match {
		// Save the position of any previous successful match.
		m.lastMatchEnd = m.matchEnd
		if m.matchStart == m.matchEnd {
			// Previous match had zero length. Move start position up one position
			// to avoid sending find() into a loop on zero-length matches.
			if startPos >= m.activeLimit {
				m.match = false
				m.hitEnd = true
				return false, nil
			}
			startPos++
		}
	} else {
		if m.lastMatchEnd >= 0 {
			// A previous find() failed to match. Don't try again.
			// (without this test, a pattern with a zero-length match
			// could match again at the end of an input string.)
			m.hitEnd = true
			return false, nil
		}
	}

	testStartLimit := m.activeLimit - int(m.pattern.minMatchLen)
	if startPos > testStartLimit {
		m.match = false
		m.hitEnd = true
		return false, nil
	}

	if !(startPos >= 0) {
		panic("assertion failed: startPos >= 0")
	}

	switch m.pattern.startType {
	case START_NO_INFO:
		// No optimization was found.
		// Try a match at each input position.
+ for { + err := m.MatchAt(startPos, false) + if err != nil { + return false, err + } + if m.match { + return true, nil + } + if startPos >= testStartLimit { + m.hitEnd = true + return false, nil + } + startPos++ + } + case START_SET: + // Match may start on any char from a pre-computed set. + if !(m.pattern.minMatchLen > 0) { + panic("assertion failed: minMatchLen > 0") + } + + for { + pos := startPos + c := charAt(m.input, startPos) + startPos++ + // c will be -1 (U_SENTINEL) at end of text, in which case we + // skip this next block (so we don't have a negative array index) + // and handle end of text in the following block. + if c >= 0 && m.pattern.initialChars.ContainsRune(c) { + err := m.MatchAt(pos, false) + if err != nil { + return false, err + } + if m.match { + return true, nil + } + } + + if startPos > testStartLimit { + m.match = false + m.hitEnd = true + return false, nil + } + } + case START_START: + // Matches are only possible at the start of the input string + // (pattern begins with ^ or \A) + if startPos > m.activeStart { + m.match = false + return false, nil + } + err := m.MatchAt(startPos, false) + return m.match, err + case START_LINE: + var ch rune + if startPos == m.anchorStart { + err := m.MatchAt(startPos, false) + if err != nil { + return false, err + } + if m.match { + return true, nil + } + ch = charAt(m.input, startPos) + startPos++ + } else { + ch = charAt(m.input, startPos-1) + } + + if m.pattern.flags&UREGEX_UNIX_LINES != 0 { + for { + if ch == 0x0a { + err := m.MatchAt(startPos, false) + if err != nil { + return false, err + } + if m.match { + return true, nil + } + } + if startPos >= testStartLimit { + m.match = false + m.hitEnd = true + return false, nil + } + ch = charAt(m.input, startPos) + startPos++ + } + } else { + for { + if isLineTerminator(ch) { + if ch == 0x0d && startPos < m.activeLimit && charAt(m.input, startPos) == 0x0a { + startPos++ + } + err := m.MatchAt(startPos, false) + if err != nil { + return false, err + } + 
if m.match { + return true, nil + } + } + if startPos >= testStartLimit { + m.match = false + m.hitEnd = true + return false, nil + } + ch = charAt(m.input, startPos) + startPos++ + } + } + case START_CHAR, START_STRING: + // Match starts on exactly one char. + if !(m.pattern.minMatchLen > 0) { + panic("assertion failed: minMatchLen > 0") + } + + theChar := m.pattern.initialChar + for { + pos := startPos + c := charAt(m.input, startPos) + startPos++ + if c == theChar { + err := m.MatchAt(pos, false) + if err != nil { + return false, err + } + if m.match { + return true, nil + } + } + if startPos > testStartLimit { + m.match = false + m.hitEnd = true + return false, nil + } + } + default: + panic("unreachable") + } +} + +func (m *Matcher) Start() int { + if !m.match { + return -1 + } + + return m.matchStart +} + +func (m *Matcher) reset() { + m.regionStart = 0 + m.regionLimit = len(m.input) + m.activeStart = 0 + m.activeLimit = len(m.input) + m.anchorStart = 0 + m.anchorLimit = len(m.input) + m.lookStart = 0 + m.lookLimit = len(m.input) + m.resetPreserveRegion() +} + +func (m *Matcher) resetPreserveRegion() { + m.matchStart = 0 + m.matchEnd = 0 + m.lastMatchEnd = -1 + m.appendPosition = 0 + m.match = false + m.hitEnd = false + m.requireEnd = false + m.time = 0 + m.tickCounter = TIMER_INITIAL_VALUE +} + +func (m *Matcher) GroupCount() int { + return len(m.pattern.groupMap) +} + +func (m *Matcher) StartForGroup(group int) int { + if !m.match { + return -1 + } + if group < 0 || group > len(m.pattern.groupMap) { + return -1 + } + if group == 0 { + return m.matchStart + } + groupOffset := int(m.pattern.groupMap[group-1]) + return *m.frame.extra(groupOffset) +} + +func (m *Matcher) EndForGroup(group int) int { + if !m.match { + return -1 + } + if group < 0 || group > len(m.pattern.groupMap) { + return -1 + } + if group == 0 { + return m.matchEnd + } + groupOffset := int(m.pattern.groupMap[group-1]) + return *m.frame.extra(groupOffset + 1) +} + +func (m *Matcher) HitEnd() 
// isLineTerminator reports whether c is one of the line terminating
// characters recognized by the matcher: U+000A through U+000D, U+0085,
// U+2028 or U+2029.
func isLineTerminator(c rune) bool {
	// Union of every bit that appears in any terminator. A code point with
	// a bit outside this mask cannot be a terminator, which rejects almost
	// all input (including negative sentinels) with a single test.
	const terminatorBits = 0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029

	if c&^terminatorBits != 0 {
		return false
	}

	switch c {
	case 0x0a, 0x0b, 0x0c, 0x0d, 0x85, 0x2028, 0x2029:
		return true
	}
	return false
}
+ URX_BACKTRACK // Force a backtrack, as if a match test had failed. + URX_END + URX_ONECHAR // Value field is the 21 bit unicode char to match + URX_STRING // Value field is index of string start + URX_STRING_LEN // Value field is string length (code units) + URX_STATE_SAVE // Value field is pattern position to push + URX_NOP + URX_START_CAPTURE // Value field is capture group number. + URX_END_CAPTURE // Value field is capture group number + URX_STATIC_SETREF // Value field is index of set in array of sets. + URX_SETREF // Value field is index of set in array of sets. + URX_DOTANY + URX_JMP // Value field is destination position in the pattern. + URX_FAIL // Stop match operation, No match. + + URX_JMP_SAV // Operand: JMP destination location + URX_BACKSLASH_B // Value field: 0: \b 1: \B + URX_BACKSLASH_G + URX_JMP_SAV_X // Conditional JMP_SAV, + // Used in (x)+, breaks loop on zero length match. + // Operand: Jmp destination. + URX_BACKSLASH_X + URX_BACKSLASH_Z // \z Unconditional end of line. + + URX_DOTANY_ALL // ., in the . matches any mode. + URX_BACKSLASH_D // Value field: 0: \d 1: \D + URX_CARET // Value field: 1: multi-line mode. + URX_DOLLAR // Also for \Z + + URX_CTR_INIT // Counter Inits for {Interval} loops. + URX_CTR_INIT_NG // 2 kinds, normal and non-greedy. + // These are 4 word opcodes. See description. + // First Operand: Data loc of counter variable + // 2nd Operand: Pat loc of the URX_CTR_LOOPx + // at the end of the loop. + // 3rd Operand: Minimum count. + // 4th Operand: Max count, -1 for unbounded. + + URX_DOTANY_UNIX // '.' operator in UNIX_LINES mode, only \n marks end of line. + + URX_CTR_LOOP // Loop Ops for {interval} loops. + URX_CTR_LOOP_NG // Also in three flavors. + // Operand is loc of corresponding CTR_INIT. + + URX_CARET_M_UNIX // '^' operator, test for start of line in multi-line + // plus UNIX_LINES mode. 
+ + URX_RELOC_OPRND // Operand value in multi-operand ops that refers + // back into compiled pattern code, and thus must + // be relocated when inserting/deleting ops in code. + + URX_STO_SP // Store the stack ptr. Operand is location within + // matcher data (not stack data) to store it. + URX_LD_SP // Load the stack pointer. Operand is location + // to load from. + URX_BACKREF // Back Reference. Parameter is the index of the + // capture group variables in the state stack frame. + URX_STO_INP_LOC // Store the input location. Operand is location + // within the matcher stack frame. + URX_JMPX // Conditional JMP. + // First Operand: JMP target location. + // Second Operand: Data location containing an + // input position. If current input position == + // saved input position, FAIL rather than taking + // the JMP + URX_LA_START // Starting a LookAround expression. + // Save InputPos, SP and active region in static data. + // Operand: Static data offset for the save + URX_LA_END // Ending a Lookaround expression. + // Restore InputPos and Stack to saved values. + // Operand: Static data offset for saved data. + URX_ONECHAR_I // Test for case-insensitive match of a literal character. + // Operand: the literal char. + URX_STRING_I // Case insensitive string compare. + // First Operand: Index of start of string in string literals + // Second Operand (next word in compiled code): + // the length of the string. + URX_BACKREF_I // Case insensitive back reference. + // Parameter is the index of the + // capture group variables in the state stack frame. + URX_DOLLAR_M // $ in multi-line mode. + URX_CARET_M // ^ in multi-line mode. + URX_LB_START // LookBehind Start. + // Parameter is data location + URX_LB_CONT // LookBehind Continue. + // Param 0: the data location + // Param 1: The minimum length of the look-behind match + // Param 2: The max length of the look-behind match + URX_LB_END // LookBehind End. + // Parameter is the data location. 
+ // Check that match ended at the right spot, + // Restore original input string len. + URX_LBN_CONT // Negative LookBehind Continue + // Param 0: the data location + // Param 1: The minimum length of the look-behind match + // Param 2: The max length of the look-behind match + // Param 3: The pattern loc following the look-behind block. + URX_LBN_END // Negative LookBehind end + // Parameter is the data location. + // Check that the match ended at the right spot. + URX_STAT_SETREF_N // Reference to a prebuilt set (e.g. \w), negated + // Operand is index of set in array of sets. + URX_LOOP_SR_I // Init a [set]* loop. + // Operand is the sets index in array of user sets. + URX_LOOP_C // Continue a [set]* or OneChar* loop. + // Operand is a matcher static data location. + // Must always immediately follow LOOP_x_I instruction. + URX_LOOP_DOT_I // .*, initialization of the optimized loop. + // Operand value: + // bit 0: + // 0: Normal (. doesn't match new-line) mode. + // 1: . matches new-line mode. + // bit 1: controls what new-lines are recognized by this operation. + // 0: All Unicode New-lines + // 1: UNIX_LINES, \u000a only. + URX_BACKSLASH_BU // \b or \B in UREGEX_UWORD mode, using Unicode style + // word boundaries. + URX_DOLLAR_D // $ end of input test, in UNIX_LINES mode. + URX_DOLLAR_MD // $ end of input test, in MULTI_LINE and UNIX_LINES mode. + URX_BACKSLASH_H // Value field: 0: \h 1: \H + URX_BACKSLASH_R // Any line break sequence. + URX_BACKSLASH_V // Value field: 0: \v 1: \V + + URX_RESERVED_OP_N Opcode = 255 // For multi-operand ops, negative operand values. +) + +// Keep this list of opcode names in sync with the above enum +// +// Used for debug printing only. 
+var UrxOpcodeNames = []string{ + " ", + "BACKTRACK", + "END", + "ONECHAR", + "STRING", + "STRING_LEN", + "STATE_SAVE", + "NOP", + "START_CAPTURE", + "END_CAPTURE", + "URX_STATIC_SETREF", + "SETREF", + "DOTANY", + "JMP", + "FAIL", + "JMP_SAV", + "BACKSLASH_B", + "BACKSLASH_G", + "JMP_SAV_X", + "BACKSLASH_X", + "BACKSLASH_Z", + "DOTANY_ALL", + "BACKSLASH_D", + "CARET", + "DOLLAR", + "CTR_INIT", + "CTR_INIT_NG", + "DOTANY_UNIX", + "CTR_LOOP", + "CTR_LOOP_NG", + "URX_CARET_M_UNIX", + "RELOC_OPRND", + "STO_SP", + "LD_SP", + "BACKREF", + "STO_INP_LOC", + "JMPX", + "LA_START", + "LA_END", + "ONECHAR_I", + "STRING_I", + "BACKREF_I", + "DOLLAR_M", + "CARET_M", + "LB_START", + "LB_CONT", + "LB_END", + "LBN_CONT", + "LBN_END", + "STAT_SETREF_N", + "LOOP_SR_I", + "LOOP_C", + "LOOP_DOT_I", + "BACKSLASH_BU", + "DOLLAR_D", + "DOLLAR_MD", + "URX_BACKSLASH_H", + "URX_BACKSLASH_R", + "URX_BACKSLASH_V", +} + +type Instruction int32 + +func (ins Instruction) Type() Opcode { + return Opcode(uint32(ins) >> 24) +} + +func (ins Instruction) Value32() int32 { + return int32(ins) & 0xffffff +} + +func (ins Instruction) Value() int { + return int(ins.Value32()) +} + +// Access to Unicode Sets composite character properties +// +// The sets are accessed by the match engine for things like \w (word boundary) +const ( + URX_ISWORD_SET = 1 + URX_ISALNUM_SET = 2 + URX_ISALPHA_SET = 3 + URX_ISSPACE_SET = 4 + + URX_GC_NORMAL = iota + 1 // Sets for finding grapheme cluster boundaries. + URX_GC_EXTEND + URX_GC_CONTROL + URX_GC_L + URX_GC_LV + URX_GC_LVT + URX_GC_V + URX_GC_T + + URX_LAST_SET + + URX_NEG_SET = 0x800000 // Flag bit to reverse sense of set + // membership test. 
+) + +type Stack struct { + ary []int + frameSize int + stackLimit int +} + +type StackFrame []int + +func (f StackFrame) inputIdx() *int { + return &f[0] +} + +func (f StackFrame) patIdx() *int { + return &f[1] +} + +func (f StackFrame) extra(n int) *int { + return &f[2+n] +} + +func (f StackFrame) equals(f2 StackFrame) bool { + return &f[0] == &f2[0] +} + +func (stack *Stack) len() int { + return len(stack.ary) +} + +func (stack *Stack) sp() int { + return len(stack.ary) - stack.frameSize +} + +func (stack *Stack) newFrame(inputIdx int, input []rune, pattern string) (StackFrame, error) { + if stack.stackLimit != 0 && len(stack.ary)+stack.frameSize > stack.stackLimit { + return nil, &MatchError{ + Code: uerror.U_REGEX_STACK_OVERFLOW, + Pattern: pattern, + Position: inputIdx, + Input: input, + } + } + stack.ary = slices.Grow(stack.ary, stack.frameSize) + + f := stack.ary[len(stack.ary) : len(stack.ary)+stack.frameSize] + stack.ary = stack.ary[:len(stack.ary)+stack.frameSize] + return f, nil +} + +func (stack *Stack) prevFromTop() StackFrame { + return stack.ary[len(stack.ary)-2*stack.frameSize:] +} + +func (stack *Stack) popFrame() StackFrame { + stack.ary = stack.ary[:len(stack.ary)-stack.frameSize] + return stack.ary[len(stack.ary)-stack.frameSize:] +} + +func (stack *Stack) reset() { + stack.ary = stack.ary[:0] +} + +func (stack *Stack) offset(size int) StackFrame { + return stack.ary[size-stack.frameSize : size] +} + +func (stack *Stack) setSize(size int) { + stack.ary = stack.ary[:size] +} + +func (f StackFrame) clearExtra() { + for i := 2; i < len(f); i++ { + f[i] = -1 + } +} + +// number of UVector elements in the header +const RESTACKFRAME_HDRCOUNT = 2 + +// Start-Of-Match type. Used by find() to quickly scan to positions where a +// +// match might start before firing up the full match engine. +type StartOfMatch int8 + +const ( + START_NO_INFO StartOfMatch = iota // No hint available. + START_CHAR // Match starts with a literal code point. 
+ START_SET // Match starts with something matching a set. + START_START // Match starts at start of buffer only (^ or \A) + START_LINE // Match starts with ^ in multi-line mode. + START_STRING // Match starts with a literal string. +) + +func (som StartOfMatch) String() string { + switch som { + case START_NO_INFO: + return "START_NO_INFO" + case START_CHAR: + return "START_CHAR" + case START_SET: + return "START_SET" + case START_START: + return "START_START" + case START_LINE: + return "START_LINE" + case START_STRING: + return "START_STRING" + default: + panic("unknown StartOfMatch") + } +} + +type CaseFoldIterator struct { + chars []rune + index int + limit int + + foldChars []uint16 +} + +func (it *CaseFoldIterator) next() rune { + if len(it.foldChars) == 0 { + // We are not in a string folding of an earlier character. + // Start handling the next char from the input UText. + if it.index >= it.limit { + return -1 + } + + originalC := it.chars[it.index] + it.index++ + + originalC, it.foldChars = ucase.FullFolding(originalC) + if len(it.foldChars) == 0 { + // input code point folds to a single code point, possibly itself. + return originalC + } + } + + var res rune + res, it.foldChars = utf16.NextUnsafe(it.foldChars) + return res +} + +func (it *CaseFoldIterator) inExpansion() bool { + return len(it.foldChars) > 0 +} + +func newCaseFoldIterator(chars []rune, start, limit int) CaseFoldIterator { + return CaseFoldIterator{ + chars: chars, + index: start, + limit: limit, + } +} diff --git a/go/mysql/icuregex/pattern.go b/go/mysql/icuregex/pattern.go new file mode 100644 index 00000000000..26fbc5ff88f --- /dev/null +++ b/go/mysql/icuregex/pattern.go @@ -0,0 +1,144 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. 
// RegexpFlag is a bit set of options that control how a pattern is compiled
// and matched. Flags are combined with bitwise OR.
type RegexpFlag int32

// The flags are listed in ascending bit order. The values 64 and 128 are not
// assigned in this block.
const (
	// UREGEX_UNIX_LINES enables Unix-only line endings: only \u000a is
	// recognized as a line ending in the behavior of ., ^, and $.
	UREGEX_UNIX_LINES RegexpFlag = 1 << 0

	// UREGEX_CASE_INSENSITIVE enables case insensitive matching.
	UREGEX_CASE_INSENSITIVE RegexpFlag = 1 << 1

	// UREGEX_COMMENTS allows white space and comments within patterns.
	UREGEX_COMMENTS RegexpFlag = 1 << 2

	// UREGEX_MULTILINE controls the behavior of "$" and "^". If set, line
	// terminators within the string are recognized; otherwise they match
	// only at the start and end of the input string.
	UREGEX_MULTILINE RegexpFlag = 1 << 3

	// UREGEX_LITERAL treats the entire pattern as a literal string:
	// metacharacters and escape sequences are given no special meaning.
	// UREGEX_CASE_INSENSITIVE keeps its effect when combined with this
	// flag; the other flags become superfluous.
	UREGEX_LITERAL RegexpFlag = 1 << 4

	// UREGEX_DOTALL makes '.' match line terminators; otherwise '.'
	// matching stops at line end.
	UREGEX_DOTALL RegexpFlag = 1 << 5

	// UREGEX_UWORD makes \b use the Unicode TR 29 definition of word
	// boundaries. Unicode word boundaries are quite different from
	// traditional regular expression word boundaries; see
	// http://unicode.org/reports/tr29/#Word_Boundaries
	UREGEX_UWORD RegexpFlag = 1 << 8

	// UREGEX_ERROR_ON_UNKNOWN_ESCAPES makes compilation fail on patterns
	// that contain backslash-escaped ASCII letters without a known special
	// meaning. If the flag is not set, such escaped letters represent
	// themselves.
	UREGEX_ERROR_ON_UNKNOWN_ESCAPES RegexpFlag = 1 << 9
)
and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package icuregex + +import ( + "bufio" + "os" + "strconv" + "strings" + "testing" + + "vitess.io/vitess/go/mysql/icuregex/internal/uerror" +) + +func TestPerl(t *testing.T) { + f, err := os.Open("testdata/re_tests.txt") + if err != nil { + t.Fatalf("failed to open test data: %v", err) + } + defer f.Close() + + flagPat := MustCompile(`('?)(.*)\1(.*)`, 0) + flagMat := flagPat.Matcher() + + groupsPat := MustCompile(`\$([+\-])\[(\d+)\]`, 0) + groupsMat := groupsPat.Matcher() + + cgPat := MustCompile(`\$(\d+)`, 0) + cgMat := cgPat.Matcher() + + group := func(m *Matcher, idx int) string { + g, _ := m.Group(idx) + return g + } + + lookingAt := func(m *Matcher) bool { + ok, err := m.LookingAt() + if err != nil { + t.Fatalf("failed to match with LookingAt(): %v", err) + } + return ok + } + + replacer := strings.NewReplacer( + `${bang}`, "!", + `${nulnul}`, "\x00\x00", + `${ffff}`, "\uffff", + ) + + scanner := bufio.NewScanner(f) + var lineno int + + for scanner.Scan() { + lineno++ + fields := strings.Split(scanner.Text(), "\t") + + flagMat.Reset(fields[0]) + ok, _ := flagMat.Matches() + if !ok { + t.Fatalf("could not match pattern+flags (line %d)", lineno) + 
} + + pattern, _ := flagMat.Group(2) + pattern = replacer.Replace(pattern) + + flagStr, _ := flagMat.Group(3) + var flags RegexpFlag + if strings.IndexByte(flagStr, 'i') >= 0 { + flags |= UREGEX_CASE_INSENSITIVE + } + if strings.IndexByte(flagStr, 'm') >= 0 { + flags |= UREGEX_MULTILINE + } + if strings.IndexByte(flagStr, 'x') >= 0 { + flags |= UREGEX_COMMENTS + } + + testPat, err := Compile(pattern, flags) + if err != nil { + if cerr, ok := err.(*CompileError); ok && cerr.Code == uerror.U_REGEX_UNIMPLEMENTED { + continue + } + if strings.IndexByte(fields[2], 'c') == -1 && strings.IndexByte(fields[2], 'i') == -1 { + t.Errorf("line %d: ICU error %q", lineno, err) + } + continue + } + + if strings.IndexByte(fields[2], 'i') >= 0 { + continue + } + if strings.IndexByte(fields[2], 'c') >= 0 { + t.Errorf("line %d: expected error", lineno) + continue + } + + matchString := fields[1] + matchString = replacer.Replace(matchString) + matchString = strings.ReplaceAll(matchString, `\n`, "\n") + + testMat := testPat.Match(matchString) + found, _ := testMat.Find() + expected := strings.IndexByte(fields[2], 'y') >= 0 + + if expected != found { + t.Errorf("line %d: expected %v, found %v", lineno, expected, found) + continue + } + + if !found { + continue + } + + var result []byte + var perlExpr = fields[3] + + for len(perlExpr) > 0 { + groupsMat.Reset(perlExpr) + cgMat.Reset(perlExpr) + + switch { + case strings.HasPrefix(perlExpr, "$&"): + result = append(result, group(testMat, 0)...) 
+ perlExpr = perlExpr[2:] + + case lookingAt(groupsMat): + groupNum, err := strconv.ParseInt(group(groupsMat, 2), 10, 32) + if err != nil { + t.Fatalf("failed to parse Perl pattern: %v", err) + } + + var matchPosition int + if group(groupsMat, 1) == "+" { + matchPosition = testMat.EndForGroup(int(groupNum)) + } else { + matchPosition = testMat.StartForGroup(int(groupNum)) + } + if matchPosition != -1 { + result = strconv.AppendInt(result, int64(matchPosition), 10) + } + + perlExpr = perlExpr[groupsMat.EndForGroup(0):] + + case lookingAt(cgMat): + groupNum, err := strconv.ParseInt(group(cgMat, 1), 10, 32) + if err != nil { + t.Fatalf("failed to parse Perl pattern: %v", err) + } + result = append(result, group(testMat, int(groupNum))...) + perlExpr = perlExpr[cgMat.EndForGroup(0):] + + case strings.HasPrefix(perlExpr, "@-"): + for i := 0; i <= testMat.GroupCount(); i++ { + if i > 0 { + result = append(result, ' ') + } + result = strconv.AppendInt(result, int64(testMat.StartForGroup(i)), 10) + } + perlExpr = perlExpr[2:] + + case strings.HasPrefix(perlExpr, "@+"): + for i := 0; i <= testMat.GroupCount(); i++ { + if i > 0 { + result = append(result, ' ') + } + result = strconv.AppendInt(result, int64(testMat.EndForGroup(i)), 10) + } + perlExpr = perlExpr[2:] + + case strings.HasPrefix(perlExpr, "\\"): + if len(perlExpr) > 1 { + perlExpr = perlExpr[1:] + } + c := perlExpr[0] + switch c { + case 'n': + c = '\n' + } + result = append(result, c) + perlExpr = perlExpr[1:] + + default: + result = append(result, perlExpr[0]) + perlExpr = perlExpr[1:] + } + } + + var expectedS string + if len(fields) > 4 { + expectedS = fields[4] + expectedS = replacer.Replace(expectedS) + expectedS = strings.ReplaceAll(expectedS, `\n`, "\n") + } + + if expectedS != string(result) { + t.Errorf("line %d: Incorrect Perl expression results\nwant: %s\ngot: %s", lineno, expectedS, result) + } + } +} diff --git a/go/mysql/icuregex/sets.go b/go/mysql/icuregex/sets.go new file mode 100644 index 
00000000000..9d362e748cb --- /dev/null +++ b/go/mysql/icuregex/sets.go @@ -0,0 +1,103 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package icuregex + +import ( + "vitess.io/vitess/go/mysql/icuregex/internal/uset" +) + +var staticPropertySets [13]*uset.UnicodeSet + +func init() { + staticPropertySets[URX_ISWORD_SET] = func() *uset.UnicodeSet { + s := uset.New() + s.AddAll(uset.MustParsePattern(`\p{Alphabetic}`, 0)) + s.AddAll(uset.MustParsePattern(`\p{M}`, 0)) + s.AddAll(uset.MustParsePattern(`\p{Nd}`, 0)) + s.AddAll(uset.MustParsePattern(`\p{Pc}`, 0)) + s.AddRune(0x200c) + s.AddRune(0x200d) + return s + }() + + staticPropertySets[URX_ISSPACE_SET] = uset.MustParsePattern(`\p{Whitespace}`, 0) + + staticPropertySets[URX_GC_EXTEND] = uset.MustParsePattern(`\p{Grapheme_Extend}`, 0) + staticPropertySets[URX_GC_CONTROL] = func() *uset.UnicodeSet { + s := uset.New() + s.AddAll(uset.MustParsePattern(`[:Zl:]`, 0)) + s.AddAll(uset.MustParsePattern(`[:Zp:]`, 0)) + s.AddAll(uset.MustParsePattern(`[:Cc:]`, 0)) + s.AddAll(uset.MustParsePattern(`[:Cf:]`, 0)) + s.RemoveAll(uset.MustParsePattern(`[:Grapheme_Extend:]`, 0)) + return s + }() + staticPropertySets[URX_GC_L] = 
uset.MustParsePattern(`\p{Hangul_Syllable_Type=L}`, 0) + staticPropertySets[URX_GC_LV] = uset.MustParsePattern(`\p{Hangul_Syllable_Type=LV}`, 0) + staticPropertySets[URX_GC_LVT] = uset.MustParsePattern(`\p{Hangul_Syllable_Type=LVT}`, 0) + staticPropertySets[URX_GC_V] = uset.MustParsePattern(`\p{Hangul_Syllable_Type=V}`, 0) + staticPropertySets[URX_GC_T] = uset.MustParsePattern(`\p{Hangul_Syllable_Type=T}`, 0) + + staticPropertySets[URX_GC_NORMAL] = func() *uset.UnicodeSet { + s := uset.New() + s.Complement() + s.RemoveRuneRange(0xac00, 0xd7a4) + s.RemoveAll(staticPropertySets[URX_GC_CONTROL]) + s.RemoveAll(staticPropertySets[URX_GC_L]) + s.RemoveAll(staticPropertySets[URX_GC_V]) + s.RemoveAll(staticPropertySets[URX_GC_T]) + return s + }() +} + +var staticSetUnescape = func() *uset.UnicodeSet { + u := uset.New() + u.AddString("acefnrtuUx") + return u +}() + +const ( + kRuleSetDigitChar = 128 + kRuleSetAsciiLetter = 129 + kRuleSetRuleChar = 130 + kRuleSetCount = 131 - 128 +) + +var staticRuleSet = [kRuleSetCount]*uset.UnicodeSet{ + func() *uset.UnicodeSet { + u := uset.New() + u.AddRuneRange('0', '9') + return u + }(), + func() *uset.UnicodeSet { + u := uset.New() + u.AddRuneRange('A', 'Z') + u.AddRuneRange('a', 'z') + return u + }(), + func() *uset.UnicodeSet { + u := uset.New() + u.AddString("*?+[(){}^$|\\.") + u.Complement() + return u + }(), +} diff --git a/go/mysql/icuregex/sets_test.go b/go/mysql/icuregex/sets_test.go new file mode 100644 index 00000000000..e5e5200227a --- /dev/null +++ b/go/mysql/icuregex/sets_test.go @@ -0,0 +1,50 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. 
+License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package icuregex + +import ( + "testing" +) + +func TestStaticSetContents(t *testing.T) { + // These are the number of codepoints contained in each of the static sets as of ICU69-1, + // as to sanity check that we're re-creating the sets properly. + // This table must be re-created when updating Unicode versions. + var ExpectedSetSizes = map[int]int{ + 1: 134564, + 4: 25, + 5: 1102451, + 6: 1979, + 7: 131, + 8: 125, + 9: 399, + 10: 10773, + 11: 95, + 12: 137, + } + + for setid, expected := range ExpectedSetSizes { + if got := staticPropertySets[setid].Len(); got != expected { + t.Fatalf("static set [%d] has wrong size: got %d, expected %d", setid, got, expected) + } + } +} diff --git a/go/mysql/icuregex/testdata/re_tests.txt b/go/mysql/icuregex/testdata/re_tests.txt new file mode 100644 index 00000000000..c18b638f9b3 --- /dev/null +++ b/go/mysql/icuregex/testdata/re_tests.txt @@ -0,0 +1,923 @@ +abc abc y $& abc +abc abc y $-[0] 0 +abc abc y $+[0] 3 +abc xbc n - - +abc axc n - - +abc abx n - - +abc xabcy y $& abc +abc xabcy y $-[0] 1 +abc xabcy y $+[0] 4 +abc ababc y $& abc +abc ababc y $-[0] 2 +abc ababc y $+[0] 5 +ab*c abc y $& abc +ab*c abc y $-[0] 0 +ab*c abc y $+[0] 3 +ab*bc abc y $& abc +ab*bc abc y $-[0] 0 +ab*bc abc y $+[0] 3 +ab*bc abbc y $& abbc +ab*bc abbc y $-[0] 0 +ab*bc abbc y $+[0] 4 +ab*bc abbbbc y $& abbbbc +ab*bc abbbbc y $-[0] 0 
+ab*bc abbbbc y $+[0] 6 +.{1} abbbbc y $& a +.{1} abbbbc y $-[0] 0 +.{1} abbbbc y $+[0] 1 +.{3,4} abbbbc y $& abbb +.{3,4} abbbbc y $-[0] 0 +.{3,4} abbbbc y $+[0] 4 +ab{0,}bc abbbbc y $& abbbbc +ab{0,}bc abbbbc y $-[0] 0 +ab{0,}bc abbbbc y $+[0] 6 +ab+bc abbc y $& abbc +ab+bc abbc y $-[0] 0 +ab+bc abbc y $+[0] 4 +ab+bc abc n - - +ab+bc abq n - - +ab{1,}bc abq n - - +ab+bc abbbbc y $& abbbbc +ab+bc abbbbc y $-[0] 0 +ab+bc abbbbc y $+[0] 6 +ab{1,}bc abbbbc y $& abbbbc +ab{1,}bc abbbbc y $-[0] 0 +ab{1,}bc abbbbc y $+[0] 6 +ab{1,3}bc abbbbc y $& abbbbc +ab{1,3}bc abbbbc y $-[0] 0 +ab{1,3}bc abbbbc y $+[0] 6 +ab{3,4}bc abbbbc y $& abbbbc +ab{3,4}bc abbbbc y $-[0] 0 +ab{3,4}bc abbbbc y $+[0] 6 +ab{4,5}bc abbbbc n - - +ab?bc abbc y $& abbc +ab?bc abc y $& abc +ab{0,1}bc abc y $& abc +ab?bc abbbbc n - - +ab?c abc y $& abc +ab{0,1}c abc y $& abc +^abc$ abc y $& abc +^abc$ abcc n - - +^abc abcc y $& abc +^abc$ aabc n - - +abc$ aabc y $& abc +abc$ aabcd n - - +^ abc y $& +$ abc y $& +a.c abc y $& abc +a.c axc y $& axc +a.*c axyzc y $& axyzc +a.*c axyzd n - - +a[bc]d abc n - - +a[bc]d abd y $& abd +a[b-d]e abd n - - +a[b-d]e ace y $& ace +a[b-d] aac y $& ac +a[-b] a- y $& a- +a[b-] a- y $& a- +a[b-a] - c - Invalid [] range "b-a" +a[]b - ci - Unmatched [ +a[ - c - Unmatched [ +a] a] y $& a] +a[]]b a]b y $& a]b +a[^bc]d aed y $& aed +a[^bc]d abd n - - +a[^-b]c adc y $& adc +a[^-b]c a-c n - - +a[^]b]c a]c n - - +a[^]b]c adc y $& adc +\ba\b a- y - - +\ba\b -a y - - +\ba\b -a- y - - +\by\b xy n - - +\by\b yz n - - +\by\b xyz n - - +\Ba\B a- n - - +\Ba\B -a n - - +\Ba\B -a- n - - +\By\b xy y - - +\By\b xy y $-[0] 1 +\By\b xy y $+[0] 2 +\By\b xy y - - +\by\B yz y - - +\By\B xyz y - - +\w a y - - +\w - n - - +\W a n - - +\W - y - - +a\sb a b y - - +a\sb a-b n - - +a\Sb a b n - - +a\Sb a-b y - - +\d 1 y - - +\d - n - - +\D 1 n - - +\D - y - - +[\w] a y - - +[\w] - n - - +[\W] a n - - +[\W] - y - - +a[\s]b a b y - - +a[\s]b a-b n - - +a[\S]b a b n - - +a[\S]b a-b y - - +[\d] 1 y - - 
+[\d] - n - - +[\D] 1 n - - +[\D] - y - - +ab|cd abc y $& ab +ab|cd abcd y $& ab +()ef def y $&-$1 ef- +()ef def y $-[0] 1 +()ef def y $+[0] 3 +()ef def y $-[1] 1 +()ef def y $+[1] 1 +*a - c - Quantifier follows nothing +(*)b - c - Quantifier follows nothing +$b b n - - +a\ - c - Search pattern not terminated +a\(b a(b y $&-$1 a(b- +a\(*b ab y $& ab +a\(*b a((b y $& a((b +a\\b a\b y $& a\b +abc) - c - Unmatched ) +(abc - c - Unmatched ( +((a)) abc y $&-$1-$2 a-a-a +((a)) abc y $-[0]-$-[1]-$-[2] 0-0-0 +((a)) abc y $+[0]-$+[1]-$+[2] 1-1-1 +((a)) abc by @- 0 0 0 +((a)) abc by @+ 1 1 1 +(a)b(c) abc y $&-$1-$2 abc-a-c +(a)b(c) abc y $-[0]-$-[1]-$-[2] 0-0-2 +(a)b(c) abc y $+[0]-$+[1]-$+[2] 3-1-3 +a+b+c aabbabc y $& abc +a{1,}b{1,}c aabbabc y $& abc +a** - c - Nested quantifiers +a.+?c abcabc y $& abc +(a+|b)* ab y $&-$1 ab-b +(a+|b)* ab y $-[0] 0 +(a+|b)* ab y $+[0] 2 +(a+|b)* ab y $-[1] 1 +(a+|b)* ab y $+[1] 2 +(a+|b){0,} ab y $&-$1 ab-b +(a+|b)+ ab y $&-$1 ab-b +(a+|b){1,} ab y $&-$1 ab-b +(a+|b)? 
ab y $&-$1 a-a +(a+|b){0,1} ab y $&-$1 a-a +)( - c - Unmatched ) +[^ab]* cde y $& cde +abc n - - +a* y $& +([abc])*d abbbcd y $&-$1 abbbcd-c +([abc])*bcd abcd y $&-$1 abcd-a +a|b|c|d|e e y $& e +(a|b|c|d|e)f ef y $&-$1 ef-e +(a|b|c|d|e)f ef y $-[0] 0 +(a|b|c|d|e)f ef y $+[0] 2 +(a|b|c|d|e)f ef y $-[1] 0 +(a|b|c|d|e)f ef y $+[1] 1 +abcd*efg abcdefg y $& abcdefg +ab* xabyabbbz y $& ab +ab* xayabbbz y $& a +(ab|cd)e abcde y $&-$1 cde-cd +[abhgefdc]ij hij y $& hij +^(ab|cd)e abcde n x$1y xy +(abc|)ef abcdef y $&-$1 ef- +(a|b)c*d abcd y $&-$1 bcd-b +(ab|ab*)bc abc y $&-$1 abc-a +a([bc]*)c* abc y $&-$1 abc-bc +a([bc]*)(c*d) abcd y $&-$1-$2 abcd-bc-d +a([bc]*)(c*d) abcd y $-[0] 0 +a([bc]*)(c*d) abcd y $+[0] 4 +a([bc]*)(c*d) abcd y $-[1] 1 +a([bc]*)(c*d) abcd y $+[1] 3 +a([bc]*)(c*d) abcd y $-[2] 3 +a([bc]*)(c*d) abcd y $+[2] 4 +a([bc]+)(c*d) abcd y $&-$1-$2 abcd-bc-d +a([bc]*)(c+d) abcd y $&-$1-$2 abcd-b-cd +a([bc]*)(c+d) abcd y $-[0] 0 +a([bc]*)(c+d) abcd y $+[0] 4 +a([bc]*)(c+d) abcd y $-[1] 1 +a([bc]*)(c+d) abcd y $+[1] 2 +a([bc]*)(c+d) abcd y $-[2] 2 +a([bc]*)(c+d) abcd y $+[2] 4 +a[bcd]*dcdcde adcdcde y $& adcdcde +a[bcd]+dcdcde adcdcde n - - +(ab|a)b*c abc y $&-$1 abc-ab +(ab|a)b*c abc y $-[0] 0 +(ab|a)b*c abc y $+[0] 3 +(ab|a)b*c abc y $-[1] 0 +(ab|a)b*c abc y $+[1] 2 +((a)(b)c)(d) abcd y $1-$2-$3-$4 abc-a-b-d +((a)(b)c)(d) abcd y $-[0] 0 +((a)(b)c)(d) abcd y $+[0] 4 +((a)(b)c)(d) abcd y $-[1] 0 +((a)(b)c)(d) abcd y $+[1] 3 +((a)(b)c)(d) abcd y $-[2] 0 +((a)(b)c)(d) abcd y $+[2] 1 +((a)(b)c)(d) abcd y $-[3] 1 +((a)(b)c)(d) abcd y $+[3] 2 +((a)(b)c)(d) abcd y $-[4] 3 +((a)(b)c)(d) abcd y $+[4] 4 +[a-zA-Z_][a-zA-Z0-9_]* alpha y $& alpha +^a(bc+|b[eh])g|.h$ abh y $&-$1 bh- +(bc+d$|ef*g.|h?i(j|k)) effgz y $&-$1-$2 effgz-effgz- +(bc+d$|ef*g.|h?i(j|k)) ij y $&-$1-$2 ij-ij-j +(bc+d$|ef*g.|h?i(j|k)) effg n - - +(bc+d$|ef*g.|h?i(j|k)) bcdd n - - +(bc+d$|ef*g.|h?i(j|k)) reffgz y $&-$1-$2 effgz-effgz- +((((((((((a)))))))))) a y $10 a +((((((((((a)))))))))) a y $-[0] 0 
+((((((((((a)))))))))) a y $+[0] 1 +((((((((((a)))))))))) a y $-[10] 0 +((((((((((a)))))))))) a y $+[10] 1 +((((((((((a))))))))))\10 aa y $& aa +((((((((((a))))))))))${bang} aa n - - +((((((((((a))))))))))${bang} a! y $& a! +(((((((((a))))))))) a y $& a +multiple words of text uh-uh n - - +multiple words multiple words, yeah y $& multiple words +(.*)c(.*) abcde y $&-$1-$2 abcde-ab-de +\((.*), (.*)\) (a, b) y ($2, $1) (b, a) +[k] ab n - - +abcd abcd y $&-\$&-\\$& abcd-$&-\abcd +a(bc)d abcd y $1-\$1-\\$1 bc-$1-\bc +a[-]?c ac y $& ac +(abc)\1 abcabc y $1 abc +([a-c]*)\1 abcabc y $1 abc +\1 - c - Reference to nonexistent group +\2 - c - Reference to nonexistent group +(a)|\1 a y - - +(a)|\1 x n - - +(a)|\2 - c - Reference to nonexistent group +(([a-c])b*?\2)* ababbbcbc y $&-$1-$2 ababb-bb-b +(([a-c])b*?\2){3} ababbbcbc y $&-$1-$2 ababbbcbc-cbc-c +((\3|b)\2(a)x)+ aaxabxbaxbbx n - - +((\3|b)\2(a)x)+ aaaxabaxbaaxbbax y $&-$1-$2-$3 bbax-bbax-b-a +((\3|b)\2(a)){2,} bbaababbabaaaaabbaaaabba y $&-$1-$2-$3 bbaaaabba-bba-b-a +(a)|(b) b y $-[0] 0 +(a)|(b) b y $+[0] 1 +(a)|(b) b y x$-[1] x +(a)|(b) b y x$+[1] x +(a)|(b) b y $-[2] 0 +(a)|(b) b y $+[2] 1 +'abc'i ABC y $& ABC +'abc'i XBC n - - +'abc'i AXC n - - +'abc'i ABX n - - +'abc'i XABCY y $& ABC +'abc'i ABABC y $& ABC +'ab*c'i ABC y $& ABC +'ab*bc'i ABC y $& ABC +'ab*bc'i ABBC y $& ABBC +'ab*?bc'i ABBBBC y $& ABBBBC +'ab{0,}?bc'i ABBBBC y $& ABBBBC +'ab+?bc'i ABBC y $& ABBC +'ab+bc'i ABC n - - +'ab+bc'i ABQ n - - +'ab{1,}bc'i ABQ n - - +'ab+bc'i ABBBBC y $& ABBBBC +'ab{1,}?bc'i ABBBBC y $& ABBBBC +'ab{1,3}?bc'i ABBBBC y $& ABBBBC +'ab{3,4}?bc'i ABBBBC y $& ABBBBC +'ab{4,5}?bc'i ABBBBC n - - +'ab??bc'i ABBC y $& ABBC +'ab??bc'i ABC y $& ABC +'ab{0,1}?bc'i ABC y $& ABC +'ab??bc'i ABBBBC n - - +'ab??c'i ABC y $& ABC +'ab{0,1}?c'i ABC y $& ABC +'^abc$'i ABC y $& ABC +'^abc$'i ABCC n - - +'^abc'i ABCC y $& ABC +'^abc$'i AABC n - - +'abc$'i AABC y $& ABC +'^'i ABC y $& +'$'i ABC y $& +'a.c'i ABC y $& ABC +'a.c'i AXC y $& AXC 
+'a.*?c'i AXYZC y $& AXYZC +'a.*c'i AXYZD n - - +'a[bc]d'i ABC n - - +'a[bc]d'i ABD y $& ABD +'a[b-d]e'i ABD n - - +'a[b-d]e'i ACE y $& ACE +'a[b-d]'i AAC y $& AC +'a[-b]'i A- y $& A- +'a[b-]'i A- y $& A- +'a[b-a]'i - c - Invalid [] range "b-a" +'a[]b'i - ci - Unmatched [ +'a['i - c - Unmatched [ +'a]'i A] y $& A] +'a[]]b'i A]B y $& A]B +'a[^bc]d'i AED y $& AED +'a[^bc]d'i ABD n - - +'a[^-b]c'i ADC y $& ADC +'a[^-b]c'i A-C n - - +'a[^]b]c'i A]C n - - +'a[^]b]c'i ADC y $& ADC +'ab|cd'i ABC y $& AB +'ab|cd'i ABCD y $& AB +'()ef'i DEF y $&-$1 EF- +'*a'i - c - Quantifier follows nothing +'(*)b'i - c - Quantifier follows nothing +'$b'i B n - - +'a\'i - c - Search pattern not terminated +'a\(b'i A(B y $&-$1 A(B- +'a\(*b'i AB y $& AB +'a\(*b'i A((B y $& A((B +'a\\b'i A\B y $& A\B +'abc)'i - c - Unmatched ) +'(abc'i - c - Unmatched ( +'((a))'i ABC y $&-$1-$2 A-A-A +'(a)b(c)'i ABC y $&-$1-$2 ABC-A-C +'a+b+c'i AABBABC y $& ABC +'a{1,}b{1,}c'i AABBABC y $& ABC +'a**'i - c - Nested quantifiers +'a.+?c'i ABCABC y $& ABC +'a.*?c'i ABCABC y $& ABC +'a.{0,5}?c'i ABCABC y $& ABC +'(a+|b)*'i AB y $&-$1 AB-B +'(a+|b){0,}'i AB y $&-$1 AB-B +'(a+|b)+'i AB y $&-$1 AB-B +'(a+|b){1,}'i AB y $&-$1 AB-B +'(a+|b)?'i AB y $&-$1 A-A +'(a+|b){0,1}'i AB y $&-$1 A-A +'(a+|b){0,1}?'i AB y $&-$1 - +')('i - c - Unmatched ) +'[^ab]*'i CDE y $& CDE +'abc'i n - - +'a*'i y $& +'([abc])*d'i ABBBCD y $&-$1 ABBBCD-C +'([abc])*bcd'i ABCD y $&-$1 ABCD-A +'a|b|c|d|e'i E y $& E +'(a|b|c|d|e)f'i EF y $&-$1 EF-E +'abcd*efg'i ABCDEFG y $& ABCDEFG +'ab*'i XABYABBBZ y $& AB +'ab*'i XAYABBBZ y $& A +'(ab|cd)e'i ABCDE y $&-$1 CDE-CD +'[abhgefdc]ij'i HIJ y $& HIJ +'^(ab|cd)e'i ABCDE n x$1y XY +'(abc|)ef'i ABCDEF y $&-$1 EF- +'(a|b)c*d'i ABCD y $&-$1 BCD-B +'(ab|ab*)bc'i ABC y $&-$1 ABC-A +'a([bc]*)c*'i ABC y $&-$1 ABC-BC +'a([bc]*)(c*d)'i ABCD y $&-$1-$2 ABCD-BC-D +'a([bc]+)(c*d)'i ABCD y $&-$1-$2 ABCD-BC-D +'a([bc]*)(c+d)'i ABCD y $&-$1-$2 ABCD-B-CD +'a[bcd]*dcdcde'i ADCDCDE y $& ADCDCDE +'a[bcd]+dcdcde'i ADCDCDE n - 
- +'(ab|a)b*c'i ABC y $&-$1 ABC-AB +'((a)(b)c)(d)'i ABCD y $1-$2-$3-$4 ABC-A-B-D +'[a-zA-Z_][a-zA-Z0-9_]*'i ALPHA y $& ALPHA +'^a(bc+|b[eh])g|.h$'i ABH y $&-$1 BH- +'(bc+d$|ef*g.|h?i(j|k))'i EFFGZ y $&-$1-$2 EFFGZ-EFFGZ- +'(bc+d$|ef*g.|h?i(j|k))'i IJ y $&-$1-$2 IJ-IJ-J +'(bc+d$|ef*g.|h?i(j|k))'i EFFG n - - +'(bc+d$|ef*g.|h?i(j|k))'i BCDD n - - +'(bc+d$|ef*g.|h?i(j|k))'i REFFGZ y $&-$1-$2 EFFGZ-EFFGZ- +'((((((((((a))))))))))'i A y $10 A +'((((((((((a))))))))))\10'i AA y $& AA +'((((((((((a))))))))))${bang}'i AA n - - +'((((((((((a))))))))))${bang}'i A! y $& A! +'(((((((((a)))))))))'i A y $& A +'(?:(?:(?:(?:(?:(?:(?:(?:(?:(a))))))))))'i A y $1 A +'(?:(?:(?:(?:(?:(?:(?:(?:(?:(a|b|c))))))))))'i C y $1 C +'multiple words of text'i UH-UH n - - +'multiple words'i MULTIPLE WORDS, YEAH y $& MULTIPLE WORDS +'(.*)c(.*)'i ABCDE y $&-$1-$2 ABCDE-AB-DE +'\((.*), (.*)\)'i (A, B) y ($2, $1) (B, A) +'[k]'i AB n - - +'abcd'i ABCD y $&-\$&-\\$& ABCD-$&-\ABCD +'a(bc)d'i ABCD y $1-\$1-\\$1 BC-$1-\BC +'a[-]?c'i AC y $& AC +'(abc)\1'i ABCABC y $1 ABC +'([a-c]*)\1'i ABCABC y $1 ABC +a(?!b). abad y $& ad +a(?=d). abad y $& ad +a(?=c|d). abad y $& ad +a(?:b|c|d)(.) ace y $1 e +a(?:b|c|d)*(.) ace y $1 e +a(?:b|c|d)+?(.) ace y $1 e +a(?:b|c|d)+?(.) acdbcdbe y $1 d +a(?:b|c|d)+(.) acdbcdbe y $1 e +a(?:b|c|d){2}(.) acdbcdbe y $1 b +a(?:b|c|d){4,5}(.) acdbcdbe y $1 b +a(?:b|c|d){4,5}?(.) acdbcdbe y $1 d +((foo)|(bar))* foobar y $1-$2-$3 bar-foo-bar +:(?: - c - Sequence (? incomplete +a(?:b|c|d){6,7}(.) acdbcdbe y $1 e +a(?:b|c|d){6,7}?(.) acdbcdbe y $1 e +a(?:b|c|d){5,6}(.) acdbcdbe y $1 e +a(?:b|c|d){5,6}?(.) acdbcdbe y $1 b +a(?:b|c|d){5,7}(.) acdbcdbe y $1 e +a(?:b|c|d){5,7}?(.) acdbcdbe y $1 b +a(?:b|(c|e){1,2}?|d)+?(.) ace y $1$2 ce +^(.+)?B AB y $1 A +^([^a-z])|(\^)$ . y $1 . 
+^[<>]& <&OUT y $& <& +^(a\1?){4}$ aaaaaaaaaa y $1 aaaa +^(a\1?){4}$ aaaaaaaaa n - - +^(a\1?){4}$ aaaaaaaaaaa n - - +^(a(?(1)\1)){4}$ aaaaaaaaaa y $1 aaaa +^(a(?(1)\1)){4}$ aaaaaaaaa n - - +^(a(?(1)\1)){4}$ aaaaaaaaaaa n - - +((a{4})+) aaaaaaaaa y $1 aaaaaaaa +(((aa){2})+) aaaaaaaaaa y $1 aaaaaaaa +(((a{2}){2})+) aaaaaaaaaa y $1 aaaaaaaa +(?:(f)(o)(o)|(b)(a)(r))* foobar y $1:$2:$3:$4:$5:$6 f:o:o:b:a:r +(?<=a)b ab y $& b +(?<=a)b cb n - - +(?<=a)b b n - - +(?a+)ab aaab n - - +(?>a+)b aaab y - - +([[:]+) a:[b]: yi $1 :[ Java and ICU dont escape [[xyz +([[=]+) a=[b]= yi $1 =[ Java and ICU dont escape [[xyz +([[.]+) a.[b]. yi $1 .[ Java and ICU dont escape [[xyz +[a[:xyz: - c - Unmatched [ +[a[:xyz:] - c - POSIX class [:xyz:] unknown +[a[:]b[:c] abc yi $& abc Java and ICU embedded [ is nested set +([a[:xyz:]b]+) pbaq c - POSIX class [:xyz:] unknown +[a[:]b[:c] abc iy $& abc Java and ICU embedded [ is nested set +([[:alpha:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd +([[:alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy +([[:ascii:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__-- ${nulnul} +([[:cntrl:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${nulnul} +([[:digit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 01 +([[:graph:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__-- +([[:lower:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 cd +([[:print:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__-- +([[:punct:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 __-- +([[:space:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 +([[:word:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ABcd01Xy__ +([[:upper:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 AB +([[:xdigit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01 +([[:^alpha:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 01 +([[:^alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 __-- ${nulnul}${ffff} +([[:^ascii:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${ffff} +([[:^cntrl:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__-- +([[:^digit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 
ABcd +([[:^lower:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 AB +([[:^print:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${nulnul}${ffff} +([[:^punct:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy +([[:^space:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__-- +([[:^word:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 -- ${nulnul}${ffff} +([[:^upper:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 cd01 +([[:^xdigit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 Xy__-- ${nulnul}${ffff} +[[:foo:]] - c - POSIX class [:foo:] unknown +[[:^foo:]] - c - POSIX class [:^foo:] unknown +((?>a+)b) aaab y $1 aaab +(?>(a+))b aaab y $1 aaa +((?>[^()]+)|\([^()]*\))+ ((abc(ade)ufh()()x y $& abc(ade)ufh()()x +(?<=x+)y - c - Variable length lookbehind not implemented +a{37,17} - c - Can't do {n,m} with n > m +\Z a\nb\n y $-[0] 3 +\z a\nb\n y $-[0] 4 +$ a\nb\n y $-[0] 3 +\Z b\na\n y $-[0] 3 +\z b\na\n y $-[0] 4 +$ b\na\n y $-[0] 3 +\Z b\na y $-[0] 3 +\z b\na y $-[0] 3 +$ b\na y $-[0] 3 +'\Z'm a\nb\n y $-[0] 3 +'\z'm a\nb\n y $-[0] 4 +'$'m a\nb\n y $-[0] 1 +'\Z'm b\na\n y $-[0] 3 +'\z'm b\na\n y $-[0] 4 +'$'m b\na\n y $-[0] 1 +'\Z'm b\na y $-[0] 3 +'\z'm b\na y $-[0] 3 +'$'m b\na y $-[0] 1 +a\Z a\nb\n n - - +a\z a\nb\n n - - +a$ a\nb\n n - - +a\Z b\na\n y $-[0] 2 +a\z b\na\n n - - +a$ b\na\n y $-[0] 2 +a\Z b\na y $-[0] 2 +a\z b\na y $-[0] 2 +a$ b\na y $-[0] 2 +'a\Z'm a\nb\n n - - +'a\z'm a\nb\n n - - +'a$'m a\nb\n y $-[0] 0 +'a\Z'm b\na\n y $-[0] 2 +'a\z'm b\na\n n - - +'a$'m b\na\n y $-[0] 2 +'a\Z'm b\na y $-[0] 2 +'a\z'm b\na y $-[0] 2 +'a$'m b\na y $-[0] 2 +aa\Z aa\nb\n n - - +aa\z aa\nb\n n - - +aa$ aa\nb\n n - - +aa\Z b\naa\n y $-[0] 2 +aa\z b\naa\n n - - +aa$ b\naa\n y $-[0] 2 +aa\Z b\naa y $-[0] 2 +aa\z b\naa y $-[0] 2 +aa$ b\naa y $-[0] 2 +'aa\Z'm aa\nb\n n - - +'aa\z'm aa\nb\n n - - +'aa$'m aa\nb\n y $-[0] 0 +'aa\Z'm b\naa\n y $-[0] 2 +'aa\z'm b\naa\n n - - +'aa$'m b\naa\n y $-[0] 2 +'aa\Z'm b\naa y $-[0] 2 +'aa\z'm b\naa y $-[0] 2 +'aa$'m b\naa y $-[0] 2 +aa\Z ac\nb\n n - - +aa\z ac\nb\n n - - +aa$ 
ac\nb\n n - - +aa\Z b\nac\n n - - +aa\z b\nac\n n - - +aa$ b\nac\n n - - +aa\Z b\nac n - - +aa\z b\nac n - - +aa$ b\nac n - - +'aa\Z'm ac\nb\n n - - +'aa\z'm ac\nb\n n - - +'aa$'m ac\nb\n n - - +'aa\Z'm b\nac\n n - - +'aa\z'm b\nac\n n - - +'aa$'m b\nac\n n - - +'aa\Z'm b\nac n - - +'aa\z'm b\nac n - - +'aa$'m b\nac n - - +aa\Z ca\nb\n n - - +aa\z ca\nb\n n - - +aa$ ca\nb\n n - - +aa\Z b\nca\n n - - +aa\z b\nca\n n - - +aa$ b\nca\n n - - +aa\Z b\nca n - - +aa\z b\nca n - - +aa$ b\nca n - - +'aa\Z'm ca\nb\n n - - +'aa\z'm ca\nb\n n - - +'aa$'m ca\nb\n n - - +'aa\Z'm b\nca\n n - - +'aa\z'm b\nca\n n - - +'aa$'m b\nca\n n - - +'aa\Z'm b\nca n - - +'aa\z'm b\nca n - - +'aa$'m b\nca n - - +ab\Z ab\nb\n n - - +ab\z ab\nb\n n - - +ab$ ab\nb\n n - - +ab\Z b\nab\n y $-[0] 2 +ab\z b\nab\n n - - +ab$ b\nab\n y $-[0] 2 +ab\Z b\nab y $-[0] 2 +ab\z b\nab y $-[0] 2 +ab$ b\nab y $-[0] 2 +'ab\Z'm ab\nb\n n - - +'ab\z'm ab\nb\n n - - +'ab$'m ab\nb\n y $-[0] 0 +'ab\Z'm b\nab\n y $-[0] 2 +'ab\z'm b\nab\n n - - +'ab$'m b\nab\n y $-[0] 2 +'ab\Z'm b\nab y $-[0] 2 +'ab\z'm b\nab y $-[0] 2 +'ab$'m b\nab y $-[0] 2 +ab\Z ac\nb\n n - - +ab\z ac\nb\n n - - +ab$ ac\nb\n n - - +ab\Z b\nac\n n - - +ab\z b\nac\n n - - +ab$ b\nac\n n - - +ab\Z b\nac n - - +ab\z b\nac n - - +ab$ b\nac n - - +'ab\Z'm ac\nb\n n - - +'ab\z'm ac\nb\n n - - +'ab$'m ac\nb\n n - - +'ab\Z'm b\nac\n n - - +'ab\z'm b\nac\n n - - +'ab$'m b\nac\n n - - +'ab\Z'm b\nac n - - +'ab\z'm b\nac n - - +'ab$'m b\nac n - - +ab\Z ca\nb\n n - - +ab\z ca\nb\n n - - +ab$ ca\nb\n n - - +ab\Z b\nca\n n - - +ab\z b\nca\n n - - +ab$ b\nca\n n - - +ab\Z b\nca n - - +ab\z b\nca n - - +ab$ b\nca n - - +'ab\Z'm ca\nb\n n - - +'ab\z'm ca\nb\n n - - +'ab$'m ca\nb\n n - - +'ab\Z'm b\nca\n n - - +'ab\z'm b\nca\n n - - +'ab$'m b\nca\n n - - +'ab\Z'm b\nca n - - +'ab\z'm b\nca n - - +'ab$'m b\nca n - - +abb\Z abb\nb\n n - - +abb\z abb\nb\n n - - +abb$ abb\nb\n n - - +abb\Z b\nabb\n y $-[0] 2 +abb\z b\nabb\n n - - +abb$ b\nabb\n y $-[0] 2 +abb\Z b\nabb y 
$-[0] 2 +abb\z b\nabb y $-[0] 2 +abb$ b\nabb y $-[0] 2 +'abb\Z'm abb\nb\n n - - +'abb\z'm abb\nb\n n - - +'abb$'m abb\nb\n y $-[0] 0 +'abb\Z'm b\nabb\n y $-[0] 2 +'abb\z'm b\nabb\n n - - +'abb$'m b\nabb\n y $-[0] 2 +'abb\Z'm b\nabb y $-[0] 2 +'abb\z'm b\nabb y $-[0] 2 +'abb$'m b\nabb y $-[0] 2 +abb\Z ac\nb\n n - - +abb\z ac\nb\n n - - +abb$ ac\nb\n n - - +abb\Z b\nac\n n - - +abb\z b\nac\n n - - +abb$ b\nac\n n - - +abb\Z b\nac n - - +abb\z b\nac n - - +abb$ b\nac n - - +'abb\Z'm ac\nb\n n - - +'abb\z'm ac\nb\n n - - +'abb$'m ac\nb\n n - - +'abb\Z'm b\nac\n n - - +'abb\z'm b\nac\n n - - +'abb$'m b\nac\n n - - +'abb\Z'm b\nac n - - +'abb\z'm b\nac n - - +'abb$'m b\nac n - - +abb\Z ca\nb\n n - - +abb\z ca\nb\n n - - +abb$ ca\nb\n n - - +abb\Z b\nca\n n - - +abb\z b\nca\n n - - +abb$ b\nca\n n - - +abb\Z b\nca n - - +abb\z b\nca n - - +abb$ b\nca n - - +'abb\Z'm ca\nb\n n - - +'abb\z'm ca\nb\n n - - +'abb$'m ca\nb\n n - - +'abb\Z'm b\nca\n n - - +'abb\z'm b\nca\n n - - +'abb$'m b\nca\n n - - +'abb\Z'm b\nca n - - +'abb\z'm b\nca n - - +'abb$'m b\nca n - - +(^|x)(c) ca y $2 c +a*abc?xyz+pqr{3}ab{2,}xy{4,5}pq{0,6}AB{0,}zz x n - - +a(?{$a=2;$b=3;($b)=$a})b yabz y $b 2 +round\(((?>[^()]+))\) _I(round(xs * sz),1) y $1 xs * sz +'((?x:.) )' x y $1- x - +'((?-x:.) )'x x y $1- x- +foo.bart foo.bart y - - +'^d[x][x][x]'m abcd\ndxxx y - - +.X(.+)+X bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - # TODO: ICU doesn't optimize on trailing literals in pattern. 
+.X(.+)+XX bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.XX(.+)+X bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.X(.+)+X bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.X(.+)+XX bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.XX(.+)+X bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.X(.+)+[X] bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.X(.+)+[X][X] bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.XX(.+)+[X] bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.X(.+)+[X] bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.X(.+)+[X][X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.XX(.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.[X](.+)+[X] bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.[X](.+)+[X][X] bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.[X][X](.+)+[X] bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.[X](.+)+[X] bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.[X](.+)+[X][X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.[X][X](.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +tt+$ xxxtt y - - +([a-\d]+) za-9z yi $1 a-9 +([\d-z]+) a0-za y $1 0-z +([\d-\s]+) a0- z y $1 0- +([a-[:digit:]]+) za-9z y $1 a-9 +([[:digit:]-z]+) =0-z= y $1 0-z +([[:digit:]-[:alpha:]]+) =0-z= iy $1 0-z Set difference in ICU +\GX.*X aaaXbX n - - +(\d+\.\d+) 3.1415926 y $1 3.1415926 +(\ba.{0,10}br) have a web browser y $1 a web br +'\.c(pp|xx|c)?$'i Changes n - - +'\.c(pp|xx|c)?$'i IO.c y - - +'(\.c(pp|xx|c)?$)'i IO.c y $1 .c +^([a-z]:) C:/ n - - +'^\S\s+aa$'m \nx aa y - - +(^|a)b ab y - - +^([ab]*?)(b)?(c)$ abac y -$2- -- +(\w)?(abc)\1b abcab n - - +^(?:.,){2}c a,b,c y - - +^(.,){2}c a,b,c y $1 b, +^(?:[^,]*,){2}c a,b,c y - - +^([^,]*,){2}c a,b,c y $1 b, +^([^,]*,){3}d aaa,b,c,d y $1 c, +^([^,]*,){3,}d aaa,b,c,d y $1 c, +^([^,]*,){0,3}d aaa,b,c,d y $1 c, +^([^,]{1,3},){3}d aaa,b,c,d y $1 c, +^([^,]{1,3},){3,}d aaa,b,c,d y $1 c, +^([^,]{1,3},){0,3}d aaa,b,c,d y $1 c, +^([^,]{1,},){3}d aaa,b,c,d y $1 c, +^([^,]{1,},){3,}d aaa,b,c,d y $1 c, +^([^,]{1,},){0,3}d 
aaa,b,c,d y $1 c, +^([^,]{0,3},){3}d aaa,b,c,d y $1 c, +^([^,]{0,3},){3,}d aaa,b,c,d y $1 c, +^([^,]{0,3},){0,3}d aaa,b,c,d y $1 c, +(?i) y - - +'(?!\A)x'm a\nxb\n y - - +^(a(b)?)+$ aba yi -$1-$2- -a-- Java disagrees. Not clear who is right. +'^.{9}abc.*\n'm 123\nabcabcabcabc\n y - - +^(a)?a$ a y -$1- -- +^(a)?(?(1)a|b)+$ a n - - +^(a\1?)(a\1?)(a\2?)(a\3?)$ aaaaaa y $1,$2,$3,$4 a,aa,a,aa +^(a\1?){4}$ aaaaaa y $1 aa +^(0+)?(?:x(1))? x1 y - - +^([0-9a-fA-F]+)(?:x([0-9a-fA-F]+)?)(?:x([0-9a-fA-F]+))? 012cxx0190 y - - +^(b+?|a){1,2}c bbbac y $1 a +^(b+?|a){1,2}c bbbbac y $1 a +\((\w\. \w+)\) cd. (A. Tw) y -$1- -A. Tw- +((?:aaaa|bbbb)cccc)? aaaacccc y - - +((?:aaaa|bbbb)cccc)? bbbbcccc y - - +(a)?(a)+ a y $1:$2 :a - +(ab)?(ab)+ ab y $1:$2 :ab - +(abc)?(abc)+ abc y $1:$2 :abc - +'b\s^'m a\nb\n n - - +\ba a y - - +^(a(??{"(?!)"})|(a)(?{1}))b ab yi $2 a # [ID 20010811.006] +ab(?i)cd AbCd n - - # [ID 20010809.023] +ab(?i)cd abCd y - - +(A|B)*(?(1)(CD)|(CD)) CD y $2-$3 -CD +(A|B)*(?(1)(CD)|(CD)) ABCD y $2-$3 CD- +(A|B)*?(?(1)(CD)|(CD)) CD y $2-$3 -CD # [ID 20010803.016] +(A|B)*?(?(1)(CD)|(CD)) ABCD y $2-$3 CD- +'^(o)(?!.*\1)'i Oo n - - +(.*)\d+\1 abc12bc y $1 bc +(?m:(foo\s*$)) foo\n bar y $1 foo +(.*)c abcd y $1 ab +(.*)(?=c) abcd y $1 ab +(.*)(?=c)c abcd yB $1 ab +(.*)(?=b|c) abcd y $1 ab +(.*)(?=b|c)c abcd y $1 ab +(.*)(?=c|b) abcd y $1 ab +(.*)(?=c|b)c abcd y $1 ab +(.*)(?=[bc]) abcd y $1 ab +(.*)(?=[bc])c abcd yB $1 ab +(.*)(?<=b) abcd y $1 ab +(.*)(?<=b)c abcd y $1 ab +(.*)(?<=b|c) abcd y $1 abc +(.*)(?<=b|c)c abcd y $1 ab +(.*)(?<=c|b) abcd y $1 abc +(.*)(?<=c|b)c abcd y $1 ab +(.*)(?<=[bc]) abcd y $1 abc +(.*)(?<=[bc])c abcd y $1 ab +(.*?)c abcd y $1 ab +(.*?)(?=c) abcd y $1 ab +(.*?)(?=c)c abcd yB $1 ab +(.*?)(?=b|c) abcd y $1 a +(.*?)(?=b|c)c abcd y $1 ab +(.*?)(?=c|b) abcd y $1 a +(.*?)(?=c|b)c abcd y $1 ab +(.*?)(?=[bc]) abcd y $1 a +(.*?)(?=[bc])c abcd yB $1 ab +(.*?)(?<=b) abcd y $1 ab +(.*?)(?<=b)c abcd y $1 ab +(.*?)(?<=b|c) abcd y $1 ab +(.*?)(?<=b|c)c abcd y 
$1 ab +(.*?)(?<=c|b) abcd y $1 ab +(.*?)(?<=c|b)c abcd y $1 ab +(.*?)(?<=[bc]) abcd y $1 ab +(.*?)(?<=[bc])c abcd y $1 ab +2(]*)?$\1 2 y $& 2 +(??{}) x yi - - diff --git a/go/mysql/icuregex/testdata/regextst.txt b/go/mysql/icuregex/testdata/regextst.txt new file mode 100644 index 00000000000..8d5d2c34a8e --- /dev/null +++ b/go/mysql/icuregex/testdata/regextst.txt @@ -0,0 +1,2793 @@ +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (c) 2001-2015 International Business Machines +# Corporation and others. All Rights Reserved. +# +# file: +# +# ICU regular expression test cases. +# +# format: one test case per line, +# = [# comment] +# = "" +# = "" +# the quotes on the pattern and match string can be " or ' or / +# = text, with the start and end of each +# capture group tagged with .... The overall match, +# if any, is group 0, as in <0>matched text +# A region can be specified with ... tags. +# Standard ICU unescape will be applied, allowing \u, \U, etc. to appear. +# +# = any combination of +# i case insensitive match +# x free spacing and comments +# s dot-matches-all mode +# m multi-line mode. +# ($ and ^ match at embedded new-lines) +# D Unix Lines mode (only recognize 0x0a as new-line) +# Q UREGEX_LITERAL flag. Entire pattern is literal string. +# v If icu configured without break iteration, this +# regex test pattern should not compile. +# e set the UREGEX_ERROR_ON_UNKNOWN_ESCAPES flag +# d dump the compiled pattern +# t trace operation of match engine. +# 2-9 a digit between 2 and 9, specifies the number of +# times to execute find(). The expected results are +# for the last find() in the sequence. +# G Only check match / no match. Do not check capture groups. +# E Pattern compilation error expected +# L Use LookingAt() rather than find() +# M Use matches() rather than find(). +# +# a Use non-Anchoring Bounds. +# b Use Transparent Bounds. 
+# The a and b options only make a difference if +# a region has been specified in the string. +# z|Z hitEnd was expected(z) or not expected (Z). +# With neither, hitEnd is not checked. +# y|Y Require End expected(y) or not expected (Y). +# +# White space must be present between the flags and the match string. +# + +# Look-ahead expressions +# +"(?!0{5})(\d{5})" "<0><1>00001zzzz" +"(?!0{5})(\d{5})z" "<0><1>00001zzzz" +"(?!0{5})(\d{5})(?!y)" "<0><1>00001zzzz" +"abc(?=def)" "<0>abcdef" +"(.*)(?=c)" "<0><1>abcdef" + +"(?:.*)(?=c)" "abcdef" +"(?:.*)(?=c)" b "<0>abcdef" # transparent bounds +"(?:.*)(?=c)" bM "<0>abcdef" # transparent bounds + +"(?:.*)(?=(c))" b "<0>ab<1>cdef" # Capture in look-ahead +"(?=(.)\1\1)\1" "abcc<0><1>dddefg" # Backrefs to look-ahead capture + +".(?!\p{L})" "abc<0>d " # Negated look-ahead +".(?!(\p{L}))" "abc<0>d " # Negated look-ahead, no capture + # visible outside of look-ahead +"and(?=roid)" L "<0>android" +"and(?=roid)" M "android" +"and(?=roid)" bM "<0>android" + +"and(?!roid)" L "<0>androix" +"and(?!roid)" L "android" + +"and(?!roid)" M "<0>android" # Opaque bounds +"and(?!roid)" bM "android" +"and(?!roid)" bM "<0>androix" + +# +# Negated Lookahead, various regions and region transparency +# +"abc(?!def)" "<0>abcxyz" +"abc(?!def)" "abcdef" +"abc(?!def)" "<0>abcdef" +"abc(?!def)" b "abcdef" +"abc(?!def)" b "<0>abcxyz" + +# +# Nested Lookahead / Behind +# +"one(?=(?:(?!).)*)" "<0>one stuff" +"one(?=(?:(?!).)*)" "one " + +# More nesting lookaround: pattern matches "qq" when not preceded by 'a' and followed by 'z' +"(?qqc" +"(?qqc" +"(?A<0>jk<2>B" +"(?=(?<=(\p{Lu})(?=..(\p{Lu})))).." "ajkB" +"(?=(?<=(\p{Lu})(?=..(\p{Lu})))).." 
"Ajkb" + +# Nested lookaround cases from bug ICU-20564 +"(?<=(?<=((?=)){0}+))" "<0>abc" +"(?<=c(?<=c((?=c)){1}+))" "c<0><1>cc" + +# +# Anchoring Bounds +# +"^def$" "abc<0>defghi" # anchoring (default) bounds +"^def$" a "abcdefghi" # non-anchoring bounds +"^def" a "<0>defghi" # non-anchoring bounds +"def$" a "abc<0>def" # non-anchoring bounds + +"^.*$" m "<0>line 1\n line 2" +"^.*$" m2 "line 1\n<0> line 2" +"^.*$" m3 "line 1\n line 2" +"^.*$" m "li<0>ne 1\n line 2" # anchoring bounds +"^.*$" m2 "line 1\n line 2" # anchoring bounds +"^.*$" am "line 1\n line 2" # non-anchoring bounds +"^.*$" am "li\n<0>ne \n1\n line 2" # non-anchoring bounds + +# +# HitEnd and RequireEnd for new-lines just before end-of-input +# +"xyz$" yz "<0>xyz\n" +"xyz$" yz "<0>xyz\x{d}\x{a}" + +"xyz$" myz "<0>xyz" # multi-line mode +"xyz$" mYZ "<0>xyz\n" +"xyz$" mYZ "<0>xyz\r\n" +"xyz$" mYZ "<0>xyz\x{85}abcd" + +"xyz$" Yz "xyz\nx" +"xyz$" Yz "xyza" +"xyz$" yz "<0>xyz" + +# +# HitEnd +# +"abcd" Lz "a" +"abcd" Lz "ab" +"abcd" Lz "abc" +"abcd" LZ "<0>abcd" +"abcd" LZ "<0>abcde" +"abcd" LZ "abcx" +"abcd" LZ "abx" +"abcd" Lzi "a" +"abcd" Lzi "ab" +"abcd" Lzi "abc" +"abcd" LZi "<0>abcd" +"abcd" LZi "<0>abcde" +"abcd" LZi "abcx" +"abcd" LZi "abx" + +# +# All Unicode line endings recognized. +# 0a, 0b, 0c, 0d, 0x85, 0x2028, 0x2029 +# Multi-line and non-multiline mode take different paths, so repeated tests. 
+# +"^def$" mYZ "abc\x{a}<0>def\x{a}ghi" +"^def$" mYZ "abc\x{b}<0>def\x{b}ghi" +"^def$" mYZ "abc\x{c}<0>def\x{c}ghi" +"^def$" mYZ "abc\x{d}<0>def\x{d}ghi" +"^def$" mYZ "abc\x{85}<0>def\x{85}ghi" +"^def$" mYZ "abc\x{2028}<0>def\x{2028}ghi" +"^def$" mYZ "abc\x{2029}<0>def\x{2029}ghi" +"^def$" mYZ "abc\r\n<0>def\r\nghi" + +"^def$" yz "<0>def\x{a}" +"^def$" yz "<0>def\x{b}" +"^def$" yz "<0>def\x{c}" +"^def$" yz "<0>def\x{d}" +"^def$" yz "<0>def\x{85}" +"^def$" yz "<0>def\x{2028}" +"^def$" yz "<0>def\x{2029}" +"^def$" yz "<0>def\r\n" +"^def$" yz "<0>def" + + +# "^def$" "<0>def\x{2028" #TODO: should be an error of some sort. + +# +# UNIX_LINES mode +# +"abc$" D "<0>abc\n" +"abc$" D "abc\r" +"abc$" D "abc\u0085" +"a.b" D "<0>a\rb" +"a.b" D "a\nb" +"(?d)abc$" "<0>abc\n" +"(?d)abc$" "abc\r" +"abc$" mD "<0>abc\ndef" +"abc$" mD "abc\rdef" + +".*def" L "abc\r def xyz" # Normal mode, LookingAt() stops at \r +".*def" DL "<0>abc\r def xyz" # Unix Lines mode, \r not line end. +".*def" DL "abc\n def xyz" + +"(?d)a.b" "a\nb" +"(?d)a.b" "<0>a\rb" + +"^abc" m "xyz\r<0>abc" +"^abc" Dm "xyz\rabc" +"^abc" Dm "xyz\n<0>abc" + + + +# Capturing parens +".(..)." "<0>a<1>bcd" + ".*\A( +hello)" "<0><1> hello" +"(hello)|(goodbye)" "<0><1>hello" +"(hello)|(goodbye)" "<0><2>goodbye" +"abc( +( inner(X?) +) xyz)" "leading cruft <0>abc<1> <2> inner<3> xyz cruft" +"\s*([ixsmdt]*)([:letter:]*)" "<0> <1>d<2> " +"(a|b)c*d" "a<0><1>bcd" + +# Non-capturing parens (?: stuff). Groups, but does not capture. +"(?:abc)*(tail)" "<0>abcabcabc<1>tail" + +# Non-greedy *? quantifier +".*?(abc)" "<0> abx <1>abc abc abc abc" +".*(abc)" "<0> abx abc abc abc <1>abc" + +"((?:abc |xyz )*?)abc " "<0><1>xyz abc abc abc " +"((?:abc |xyz )*)abc " "<0><1>xyz abc abc abc " + +# Non-greedy +? quantifier +"(a+?)(a*)" "<0><1>a<2>aaaaaaaaaaaa" +"(a+)(a*)" "<0><1>aaaaaaaaaaaaa<2>" + +"((ab)+?)((ab)*)" "<0><1><2>ab<3>ababababab<4>ab" +"((ab)+)((ab)*)" "<0><1>abababababab<2>ab<3>" + +# Non-greedy ?? 
quantifier +"(ab)(ab)??(ab)??(ab)??(ab)??c" "<0><1>ab<4>ab<5>abc" + +# Unicode Properties as naked elements in a pattern +"\p{Lu}+" "here we go ... <0>ABC and no more." +"(\p{L}+)(\P{L}*?) (\p{Zs}*)" "7999<0><1>letters<2>4949%^&*( <3> " + +# \w and \W +"\w+" " $%^&*( <0>hello123%^&*(" +"\W+" "<0> $%^&*( hello123%^&*(" + +# \A match at beginning of input only. + ".*\Ahello" "<0>hello hello" + ".*hello" "<0>hello hello" +".*\Ahello" "stuff\nhello" # don't match after embedded new-line. + +# \b \B +# +".*?\b(.).*" "<0> $%^&*( <1>hello123%^&*()gxx" +"\ba\b" "-<0>a" +"\by\b" "xy" +"[ \b]" "<0>b" # in a set, \b is a literal b. + +# Finds first chars of up to 5 words +"(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?" "<0><1>Tthe <2>qick <3>brown <4>fox" + +"H.*?((?:\B.)+)" "<0>H<1>ello " +".*?((?:\B.)+).*?((?:\B.)+).*?((?:\B.)+)" "<0>H<1>ello <2> g<3>oodbye " + +"(?:.*?\b(.))?(?:.*?\b(.))?(?:.*?\b(.))?(?:.*?\b(.))?(?:.*?\b(.))?.*" "<0> \u0301 \u0301<1>A\u0302BC\u0303\u0304<2> \u0305 \u0306<3>X\u0307Y\u0308" + + +# +# Unicode word boundary mode +# +"(?w).*?\b" v "<0>hello, world" +"(?w).*?(\b.+?\b).*" v "<0><1> 123.45 " +"(?w).*?(\b\d.*?\b).*" v "<0> <1>123.45 " +".*?(\b.+?\b).*" "<0> <1>123.45 " +"(?w:.*?(\b\d.*?\b).*)" v "<0> <1>123.45 " +"(?w:.*?(\b.+?\b).*)" v "<0><1>don't " +"(?w:.+?(\b\S.+?\b).*)" v "<0> <1>don't " +"(?w:(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?).*)" v "<0><1>.<2> <3>,<4>:<5>$<6>37,000.50<7> " + +# +# Unicode word boundaries with Regions +# +"(?w).*?\b" v "abc<0>defghi" +"(?w).*?\b" v2 "abcdef<0>ghi" +"(?w).*?\b" v3 "abcdefghi" +#"(?w).*?\b" vb "abc<0>defghi" # TODO: bug. Ticket 6073 +#"(?w).*?\b" vb2 "abcdefghi" + + + +# . does not match new-lines +"." "\u000a\u000d\u0085\u000c\u000b\u2028\u2029<0>X\u000aY" +"A." 
"A\u000a "# no match + +# \d for decimal digits +"\d*" "<0>0123456789\u0660\u06F9\u0969\u0A66\u17E2\uFF10\U0001D7CE\U0001D7FFnon-digits" +"\D+" "<0>non digits" +"\D*(\d*)(\D*)" "<0>non-digits<1>3456666<2>more non digits" + +# \Q...\E quote mode +"hel\Qlo, worl\Ed" "<0>hello, world" +"\Q$*^^(*)?\A\E(a*)" "<0>$*^^(*)?\\A<1>aaaaaaaaaaaaaaa" +"[abc\Q]\r\E]+" "<0>aaaccc]]]\\\\\\\r..." # \Q ... \E escape in a [set] + +# UREGEX_LITERAL - entire pattern is a literal string, no escapes recognized. +# Note that data strings in test cases still get escape processing. +"abc\an\r\E\\abcd\u0031bye" Q "lead<0>abc\\an\\r\\E\\\\abcd\\u0031byeextra" +"case insensitive \\ (l)iteral" Qi "stuff!! <0>cAsE InSenSiTiVE \\\\ (L)ITeral" + +# \S and \s space characters +"\s+" "not_space<0> \t \r \n \u3000 \u2004 \u2028 \u2029xyz" +"(\S+).*?(\S+).*" "<0><1>Not-spaces <2>more-non-spaces " + +# \X consume one Grapheme Cluster. +"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>A<2>B<3> <4>\r\n" +"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>A\u0301<2>\n<3>\u0305<4>a\u0302\u0303\u0304" +"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>\u1100\u1161\u11a8<2>\u115f\u11a2\u11f9" +"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>\u1100\uac01<2>\uac02<3>\uac03\u11b0" +"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>\u1100\u1101\uac02\u0301<2>\u1100" +# Regional indicator pairs are grapheme clusters +"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>\U0001f1e6\U0001f1e8<2>\U0001f1ea\U0001f1ff" +# Grapheme Break rule 9b: Prepend x +"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>\U000111C2x" + +# Grapheme clusters that straddle a match region. Matching is pinned to the region limits, +# giving boundaries inside grapheme clusters +"(\X)?(\X)?(\X)?" 
v "a\u0301<0><1>\u0301\u0301<2>z\u0302\u0302\u0302" +# Same as previous test case, but without the region limits. +"(\X)?(\X)?(\X)?" v "<0><1>a\u0301\u0301\u0301<2>z\u0302\u0302\u0302" + +# ^ matches only at beginning of line +".*^(Hello)" "<0><1>Hello Hello Hello Hello Goodbye" +".*(Hello)" "<0>Hello Hello Hello <1>Hello Goodbye" +".*^(Hello)" " Hello Hello Hello Hello Goodbye"# No Match + +# $ matches only at end of line, or before a newline preceding the end of line +".*?(Goodbye)$" zy "<0>Hello Goodbye Goodbye <1>Goodbye" +".*?(Goodbye)" ZY "<0>Hello <1>Goodbye Goodbye Goodbye" +".*?(Goodbye)$" z "Hello Goodbye> Goodbye Goodbye "# No Match + +".*?(Goodbye)$" zy "<0>Hello Goodbye Goodbye <1>Goodbye\n" +".*?(Goodbye)$" zy "<0>Hello Goodbye Goodbye <1>Goodbye\n" +".*?(Goodbye)$" zy "<0>Hello Goodbye Goodbye <1>Goodbye\r\n" +".*?(Goodbye)$" z "Hello Goodbye Goodbye Goodbye\n\n"# No Match + +# \Z matches at end of input, like $ with default flags. +".*?(Goodbye)\Z" zy "<0>Hello Goodbye Goodbye <1>Goodbye" +".*?(Goodbye)" ZY "<0>Hello <1>Goodbye Goodbye Goodbye" +".*?(Goodbye)\Z" z "Hello Goodbye> Goodbye Goodbye "# No Match +"here$" z "here\nthe end"# No Match + +".*?(Goodbye)\Z" "<0>Hello Goodbye Goodbye <1>Goodbye\n" +".*?(Goodbye)\Z" "<0>Hello Goodbye Goodbye <1>Goodbye\n" +".*?(Goodbye)\Z" "<0>Hello Goodbye Goodbye <1>Goodbye\r\n" +".*?(Goodbye)\Z" "Hello Goodbye Goodbye Goodbye\n\n"# No Match + +# \z matches only at the end of string. +# no special treatment of new lines. +# no dependencies on flag settings. +".*?(Goodbye)\z" zy "<0>Hello Goodbye Goodbye <1>Goodbye" +".*?(Goodbye)\z" z "Hello Goodbye Goodbye Goodbye "# No Match +"here$" z "here\nthe end"# No Match + +".*?(Goodbye)\z" z "Hello Goodbye Goodbye Goodbye\n"# No Match +".*?(Goodbye)\n\z" zy "<0>Hello Goodbye Goodbye <1>Goodbye\n" +"abc\z|def" ZY "abc<0>def" + +# (?# comment) doesn't muck up pattern +"Hello (?# this is a comment) world" " <0>Hello world..." 
+ +# Check some implementation corner cases base on the way literal strings are compiled. +"A" "<0>A" +"AB" "<0>ABABABAB" +"AB+" "<0>ABBBA" +"AB+" "<0>ABABAB" +"ABC+" "<0>ABCABC" +"ABC+" "<0>ABCCCCABC" +"(?:ABC)+" "<0>ABCABCABCD" +"(?:ABC)DEF+" "<0>ABCDEFFFD" +"AB\.C\eD\u0666E" "<0>AB.C\u001BD\u0666EF" +"ab\Bde" "<0>abde" + +# loop breaking +"(a?)*" "<0><1>xyz" +"(a?)+" "<0><1>xyz" +"^(?:a?b?)*$" "a--" +"(x?)*xyz" "<0>xx<1>xyz" # Sligthtly weird, but correct. The "last" time through (x?), + # it matches the empty string. + +# Set expressions, basic operators and escapes work +# +"[\d]+" "<0>0123abc/.," +"[^\d]+" "0123<0>abc/.," +"[\D]+" "0123<0>abc/.," +"[^\D]+" "<0>0123abc/.," + +"[\s]+" "<0> \tabc/.," +"[^\s]+" " \t<0>abc/.," +"[\S]+" " \t<0>abc/.," +"[^\S]+" "<0> \tabc/.," + +"[\w]+" "<0>abc123 .,;" +"[^\w]+" "abc123<0> .,;" +"[\W]+" "abc123<0> .,;" +"[^\W]+" "<0>abc123 .,;" + +"[\z]+" "abc<0>zzzdef" # \z has no special meaning +"[^\z]+" "<0>abczzzdef" +"[\^]+" "abc<0>^^" +"[^\^]+" "<0>abc^^" + +"[\u0041c]+" "<0>AcAcdef" +"[\U00010002]+" "<0>\ud800\udc02\U00010003" +"[^\U00010002]+" "<0>Hello\x{10002}" +"[\x61b]+" "<0>ababcde" +#"[\x6z]+" "\x06" #TODO: single hex digits should fail +"[\x{9}\x{75}\x{6d6}\x{6ba6}\x{6146B}\x{10ffe3}]+" "<0>\u0009\u0075\u06d6\u6ba6\U0006146B\U0010ffe3abc" + +"[\N{LATIN CAPITAL LETTER TONE SIX}ab\N{VARIATION SELECTOR-70} ]+" "x<0> \u0184\U000E0135 abc" +"[\N{LATIN SMALL LETTER C}-\N{LATIN SMALL LETTER F}]+" "ab<0>cdefghi" + + + +# +# [set expressions], check the precedence of '-', '&', '--', '&&' +# '-' and '&', for compatibility with ICU UnicodeSet, have the same +# precedence as the implicit Union between adjacent items. +# '--' and '&&', for compatibility with Java, have lower precedence than +# the implicit Union operations. '--' and '&&' themselves +# have the same precedence, and group left to right. 
+# +"[[a-m]-[f-w]p]+" "<0>depfgwxyz" +"[^[a-m]-[f-w]p]+" "dep<0>fgwxyz" + +"[[a-m]--[f-w]p]+" "<0>depfgwxyz" +"[^[a-m]--[f-w]p]+" "de<0>pfgwxyz" + +"[[a-m]&[e-s]w]+" "<0>efmwadnst" +"[^[a-m]&[e-s]w]+" "efmw<0>adnst" + +"[[a-m]&[e-s]]+" "<0>efmadnst" + + + +# {min,max} iteration qualifier +"A{3}BC" "<0>AAABC" + +"(ABC){2,3}AB" "no matchAB" +"(ABC){2,3}AB" "ABCAB" +"(ABC){2,3}AB" "<0>ABC<1>ABCAB" +"(ABC){2,3}AB" "<0>ABCABC<1>ABCAB" +"(ABC){2,3}AB" "<0>ABCABC<1>ABCABCAB" + +"(ABC){2}AB" "ABCAB" +"(ABC){2}AB" "<0>ABC<1>ABCAB" +"(ABC){2}AB" "<0>ABC<1>ABCABCAB" +"(ABC){2}AB" "<0>ABC<1>ABCABCABCAB" + +"(ABC){2,}AB" "ABCAB" +"(ABC){2,}AB" "<0>ABC<1>ABCAB" +"(ABC){2,}AB" "<0>ABCABC<1>ABCAB" +"(ABC){2,}AB" "<0>ABCABCABC<1>ABCAB" + +"X{0,0}ABC" "<0>ABC" +"X{0,1}ABC" "<0>ABC" + +"(?:Hello(!{1,3}) there){1}" "Hello there" +"(?:Hello(!{1,3}) there){1}" "<0>Hello<1>! there" +"(?:Hello(!{1,3}) there){1}" "<0>Hello<1>!! there" +"(?:Hello(!{1,3}) there){1}" "<0>Hello<1>!!! there" +"(?:Hello(!{1,3}) there){1}" "Hello!!!! there" + +# Nongreedy {min,max}? intervals +"(ABC){2,3}?AB" "no matchAB" +"(ABC){2,3}?AB" "ABCAB" +"(ABC){2,3}?AB" "<0>ABC<1>ABCAB" +"(ABC){2,3}?AB" "<0>ABC<1>ABCABCAB" +"(ABC){2,3}?AB" "<0>ABC<1>ABCABCABCAB" +"(ABC){2,3}?AX" "<0>ABCABC<1>ABCAX" +"(ABC){2,3}?AX" "ABC<0>ABCABC<1>ABCAX" + +# Possessive {min,max}+ intervals +"(ABC){2,3}+ABC" "ABCABCABC" +"(ABC){1,2}+ABC" "<0>ABC<1>ABCABC" +"(?:(.)\1){2,5}+." "<0>aabbcc<1>ddex" + + +# Atomic Grouping +"(?>.*)abc" "abcabcabc" # no match. .* consumed entire string. 
+"(?>(abc{2,4}?))(c*)" "<0><1>abcc<2>cccddd" +"(\.\d\d(?>[1-9]?))\d+" "1.625" +"(\.\d\d(?>[1-9]?))\d+" "1<0><1>.6250" + +# Possessive *+ +"(abc)*+a" "abcabcabc" +"(abc)*+a" "<0>abc<1>abcab" +"(a*b)*+a" "<0><1>aaaabaaaa" + +# Possessive ?+ +"c?+ddd" "<0>cddd" +"c?+cddd" "cddd" +"c?cddd" "<0>cddd" + +# Back Reference +"(?:ab(..)cd\1)*" "<0>ab23cd23ab<1>wwcdwwabxxcdyy" +"ab(?:c|(d?))(\1)" "<0>ab<1><2>c" +"ab(?:c|(d?))(\1)" "<0>ab<1>d<2>d" +"ab(?:c|(d?))(\1)" "<0>ab<1><2>e" +"ab(?:c|(d?))(\1)" "<0>ab<1><2>" + +# Back References that hit/don't hit end +"(abcd) \1" z "abcd abc" +"(abcd) \1" Z "<0><1>abcd abcd" +"(abcd) \1" Z "<0><1>abcd abcd " + +# Case Insensitive back references that hit/don't hit end. +"(abcd) \1" zi "abcd abc" +"(abcd) \1" Zi "<0><1>abcd ABCD" +"(abcd) \1" Zi "<0><1>abcd ABCD " + +# Back references that hit/don't hit boundary limits. + +"(abcd) \1" z "abcd abcd " +"(abcd) \1" Z "<0><1>abcd abcd " +"(abcd) \1" Z "<0><1>abcd abcd " + +"(abcd) \1" zi "abcd abcd " +"(abcd) \1" Zi "<0><1>abcd abcd " +"(abcd) \1" Zi "<0><1>abcd abcd " + +# Back reference that fails match near the end of input without actually hitting the end. +"(abcd) \1" ZL "abcd abd" +"(abcd) \1" ZLi "abcd abd" + +# Back reference to a zero-length match. They are always a successful match. +"ab(x?)cd(\1)ef" "<0>ab<1>cd<2>ef" +"ab(x?)cd(\1)ef" i "<0>ab<1>cd<2>ef" + +# Back refs to capture groups that didn't participate in the match. +"ab(?:(c)|(d))\1" "abde" +"ab(?:(c)|(d))\1" "<0>ab<1>cce" +"ab(?:(c)|(d))\1" i "abde" +"ab(?:(c)|(d))\1" i "<0>ab<1>cce" + +# Named back references +"(?abcd)\k" "<0><1>abcdabcd" +"(no)?(?abcd)\k" "<0><2>abcdabcd" + +"(?...)" E " " # backref names are ascii letters & numbers only" +"(?<1a>...)" E " " # backref names must begin with a letter" +"(?.)(?.)" E " " # Repeated names are illegal. 
+ + +# Case Insensitive +"aBc" i "<0>ABC" +"a[^bc]d" i "ABD" +'((((((((((a))))))))))\10' i "<0><1><2><3><4><5><6><7><8><9><10>AA" + +"(?:(?i)a)b" "<0>Ab" +"ab(?i)cd" "<0>abCd" +"ab$cd" "abcd" + +"ssl" i "abc<0>ßlxyz" +"ssl" i "abc<0>ẞlxyz" +"FIND" i "can <0>find ?" # fi ligature, \ufb01 +"find" i "can <0>FIND ?" +"ῧ" i "xxx<0>ῧxxx" # Composed char (match string) decomposes when case-folded (pattern) + +# White space handling +"a b" "ab" +"abc " "abc" +"abc " "<0>abc " +"ab[cd e]z" "<0>ab z" +"ab\ c" "<0>ab c " +"ab c" "<0>ab c " +"ab c" x "ab c " +"ab\ c" x "<0>ab c " + +# +# Pattern Flags +# +"(?u)abc" "<0>abc" +"(?-u)abc" "<0>abc" + +# +# \c escapes (Control-whatever) +# +"\cA" "<0>\u0001" +"\ca" "<0>\u0001" +"\c\x" "<0>\u001cx" + + +#Multi-line mode +'b\s^' m "a\nb\n" +"(?m)^abc$" "abc \n abc\n<0>abc\nabc" +"(?m)^abc$" 2 "abc \n abc\nabc\n<0>abc" +"^abc$" 2 "abc \n abc\nabc\nabc" + +# Empty and full range +"[\u0000-\U0010ffff]+" "<0>abc\u0000\uffff\U00010000\U0010ffffzz" +"[^\u0000-\U0010ffff]" "abc\u0000\uffff\U00010000\U0010ffffzz" +"[^a--a]+" "<0>abc\u0000\uffff\U00010000\U0010ffffzz" + +# Free-spacing mode +"a b c # this is a comment" x "<0>abc " +'^a (?#xxx) (?#yyy) {3}c' x "<0>aaac" +"a b c [x y z]" x "abc " +"a b c [x y z]" x "a b c " +"a b c [x y z]" x "<0>abcxyz" +"a b c [x y z]" x "<0>abcyyz" + +# +# Look Behind +# +"(?<=a)b" "a<0>b" +"(.*)(?<=[bc])" "<0><1>abcd" +"(?<=(abc))def" "<1>abc<0>def" # lookbehind precedes main match. +"(?<=ab|abc)xyz" "abwxyz" # ab matches, but not far enough. 
+"(?<=abc)cde" "abcde" +"(?<=abc|ab)cde" "ab<0>cde" +"(?<=abc|ab)cde" "abc<0>cde" + +"(?<=bc?c?c?)cd" "ab<0>cd" +"(?<=bc?c?c?)cd" "abc<0>cd" +"(?<=bc?c?c?)cd" "abcc<0>cd" +"(?<=bc?c?c?)cd" "abccc<0>cd" +"(?<=bc?c?c?)cd" "abcccccd" +"(?<=bc?c?c?)c+d" "ab<0>cccccd" + +".*(?<=: ?)(\w*)" "<0>1:one 2: two 3:<1>three " + +# +# Named Characters +# +"a\N{LATIN SMALL LETTER B}c" "<0>abc" +"a\N{LATIN SMALL LETTER B}c" i "<0>abc" +"a\N{LATIN SMALL LETTER B}c" i "<0>aBc" +"a\N{LATIN SMALL LETTER B}c" "aBc" + +"\N{FULL STOP}*" "<0>...abc" + +"$" "abc<0>" + +# +# Optimizations of .* at end of patterns +# +"abc.*" "<0>abcdef" +"abc.*$" "<0>abcdef" +"abc(.*)" "<0>abc<1>def" +"abc(.*)" "<0>abc<1>" +"abc.*" "<0>abc\ndef" +"abc.*" s "<0>abc\ndef" +"abc.*$" s "<0>abc\ndef" +"abc.*$" "abc\ndef" +"abc.*$" m "<0>abc\ndef" +"abc.*\Z" m "abc\ndef" +"abc.*\Z" sm "<0>abc\ndef" + +"abc*" "<0>abcccd" +"abc*$" "<0>abccc" +"ab(?:ab[xyz]\s)*" "<0>ababy abx abc" + +"(?:(abc)|a)(?:bc)+" "<0>abc" +"(?:(abc)|a)(?:bc)*" "<0><1>abc" +"^[+\-]?[0-9]*\.?[0-9]*" "<0>123.456" + +"ab.+yz" "<0>abc12345xyzttt" +"ab.+yz" s "<0>abc12345xyzttt" + +"ab.+yz" "abc123\n45xyzttt" +"ab.+yz" s "<0>abc12\n345xyzttt" + +"ab[0-9]+yz" "---abyz+++" +"ab[0-9]+yz" "---<0>ab1yz+++" +"ab[0-9]+yz" "---<0>ab12yz+++" +"ab[0-9]+yz" "---<0>ab123456yz+++" + +"ab([0-9]+|[A-Z]+)yz" "---abyz+++" +"ab([0-9]+|[A-Z]+)yz" "---<0>ab<1>1yz+++" +"ab([0-9]+|[A-Z]+)yz" "---<0>ab<1>12yz+++" +"ab([0-9]+|[A-Z]+)yz" "---<0>ab<1>Ayz+++" +"ab([0-9]+|[A-Z]+)yz" "---<0>ab<1>AByz+++" +"ab([0-9]+|[A-Z]+)yz" "---<0>ab<1>ABCDEyz+++" + +# +# Hex format \x escaping +# +"ab\x63" "<0>abc" +"ab\x09w" "<0>ab\u0009w" +"ab\xabcdc" "<0>ab\u00abcdc" +"ab\x{abcd}c" "<0>ab\uabcdc" +"ab\x{101234}c" "<0>ab\U00101234c" +"abα" "<0>abα" + +# +# Octal Escaping. This conforms to Java conventions, not Perl. +"\0101\00\03\073\0154\01442" "<0>A\u0000\u0003\u003b\u006c\u0064\u0032" +"\0776" "<0>\u003f\u0036" # overflow, the 6 is literal. 
+"\0376xyz" "<0>\u00fexyz" +"\08" E "<0>\u00008" +"\0" E "x" + +# +# \u Surrogate Pairs +# +"\ud800\udc00" "<0>\U00010000" +"\ud800\udc00*" "<0>\U00010000\U00010000\U00010000\U00010001" +# TODO (Vitess): The next case has invalid UTF-8, so it's not supported right now for testing. It likely works in practice though! +# "\ud800\ud800\udc00" "<0>\ud800\U00010000\U00010000\U00010000\U00010001" +"(\ud800)(\udc00)" "\U00010000" +"\U00010001+" "<0>\U00010001\U00010001\udc01" + +# +# hitEnd with find() +# +"abc" Z "aa<0>abc abcab" +"abc" 2Z "aaabc <0>abcab" +"abc" 3z "aa>abc abcab" + +# +# \ escaping +# +"abc\jkl" "<0>abcjkl" # escape of a non-special letter is just itself. +"abc[ \j]kl" "<0>abcjkl" + +# +# \R all newline sequences. +# +"abc\Rxyz" "<0>abc\u000axyzgh" +"abc\Rxyz" "<0>abc\u000bxyzgh" +"abc\Rxyz" "<0>abc\u000cxyzgh" +"abc\Rxyz" "<0>abc\u000dxyzgh" +"abc\Rxyz" "<0>abc\u0085xyzgh" +"abc\Rxyz" "<0>abc\u2028xyzgh" +"abc\Rxyz" "<0>abc\u2029xyzgh" +"abc\Rxyz" "<0>abc\u000d\u000axyzgh" + +"abc\R\nxyz" "abc\u000d\u000axyzgh" # \R cannot match only the CR from a CR/LF sequence. +"abc\r\nxyz" "<0>abc\u000d\u000axyzgh" + +"abc\Rxyz" "abc\u0009xyz" # Assorted non-matches. +"abc\Rxyz" "abc\u000exyz" +"abc\Rxyz" "abc\u202axyz" + +# \v \V single character new line sequences. 
+ +"abc\vxyz" "<0>abc\u000axyzgh" +"abc\vxyz" "<0>abc\u000bxyzgh" +"abc\vxyz" "<0>abc\u000cxyzgh" +"abc\vxyz" "<0>abc\u000dxyzgh" +"abc\vxyz" "<0>abc\u0085xyzgh" +"abc\vxyz" "<0>abc\u2028xyzgh" +"abc\vxyz" "<0>abc\u2029xyzgh" +"abc\vxyz" "abc\u000d\u000axyzgh" +"abc\vxyz" "abc?xyzgh" + +"abc[\v]xyz" "<0>abc\u000axyzgh" +"abc[\v]xyz" "<0>abc\u000bxyzgh" +"abc[\v]xyz" "<0>abc\u000cxyzgh" +"abc[\v]xyz" "<0>abc\u000dxyzgh" +"abc[\v]xyz" "<0>abc\u0085xyzgh" +"abc[\v]xyz" "<0>abc\u2028xyzgh" +"abc[\v]xyz" "<0>abc\u2029xyzgh" +"abc[\v]xyz" "abc\u000d\u000axyzgh" +"abc[\v]xyz" "abc?xyzgh" + +"abc\Vxyz" "abc\u000axyzgh" +"abc\Vxyz" "abc\u000bxyzgh" +"abc\Vxyz" "abc\u000cxyzgh" +"abc\Vxyz" "abc\u000dxyzgh" +"abc\Vxyz" "abc\u0085xyzgh" +"abc\Vxyz" "abc\u2028xyzgh" +"abc\Vxyz" "abc\u2029xyzgh" +"abc\Vxyz" "abc\u000d\u000axyzgh" +"abc\Vxyz" "<0>abc?xyzgh" + +# \h \H horizontal white space. Defined as gc=space_separator plus ascii tab + +"abc\hxyz" "<0>abc xyzgh" +"abc\Hxyz" "abc xyzgh" +"abc\hxyz" "<0>abc\u2003xyzgh" +"abc\Hxyz" "abc\u2003xyzgh" +"abc\hxyz" "<0>abc\u0009xyzgh" +"abc\Hxyz" "abc\u0009xyzgh" +"abc\hxyz" "abc?xyzgh" +"abc\Hxyz" "<0>abc?xyzgh" + +"abc[\h]xyz" "<0>abc xyzgh" +"abc[\H]xyz" "abc xyzgh" +"abc[\h]xyz" "<0>abc\u2003xyzgh" +"abc[\H]xyz" "abc\u2003xyzgh" +"abc[\h]xyz" "<0>abc\u0009xyzgh" +"abc[\H]xyz" "abc\u0009xyzgh" +"abc[\h]xyz" "abc?xyzgh" +"abc[\H]xyz" "<0>abc?xyzgh" + + +# +# Bug xxxx +# +"(?:\-|(\-?\d+\d\d\d))?(?:\-|\-(\d\d))?(?:\-|\-(\d\d))?(T)?(?:(\d\d):(\d\d):(\d\d)(\.\d+)?)?(?:(?:((?:\+|\-)\d\d):(\d\d))|(Z))?" 
MG "<0>-1234-21-31T41:51:61.789+71:81" + + +# +# A random, complex, meaningless pattern that should at least compile +# +"(?![^\\G)(?![^|\]\070\ne\{\t\[\053\?\\\x51\a\075\0023-\[&&[|\022-\xEA\00-\u41C2&&[^|a-\xCC&&[^\037\uECB3\u3D9A\x31\|\[^\016\r\{\,\uA29D\034\02[\02-\[|\t\056\uF599\x62\e\<\032\uF0AC\0026\0205Q\|\\\06\0164[|\057-\u7A98&&[\061-g|\|\0276\n\042\011\e\xE8\x64B\04\u6D0EDW^\p{Lower}]]]]?)(?<=[^\n\\\t\u8E13\,\0114\u656E\xA5\]&&[\03-\026|\uF39D\01\{i\u3BC2\u14FE]])(?<=[^|\uAE62\054H\|\}&&^\p{Space}])(?sxx)(?<=[\f\006\a\r\xB4]{1,5})|(?x-xd:^{5}+)()" "<0>abc" + + +# +# Bug 3225 + +"1|9" "<0>1" +"1|9" "<0>9" +"1*|9" "<0>1" +"1*|9" "<0>9" + +"(?:a|ac)d" "<0>acd" +"a|ac" "<0>ac" + +# +# Bug 3320 +# +"(a([^ ]+)){0,} (c)" "<0><1>a<2>b <3>c " +"(a([^ ]+))* (c)" "<0><1>a<2>b <3>c " + +# +# Bug 3436 +# +"(.*?) *$" "<0><1>test " + +# +# Bug 4034 +# +"\D" "<0>ABC\u00ffDEF" +"\d" "ABC\u00ffDEF" +"\D" "<0>\u00ffDEF" +"\d" "\u00ffDEF" +"\D" "123<0>\u00ffDEF" +"\D" "<0>\u0100DEF" +"\D" "123<0>\u0100DEF" + +# +#bug 4024, new line sequence handling +# +"(?m)^" "<0>AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a" +"(?m)^" 2 "AA\u000d\u000a<0>BB\u000d\u000aCC\u000d\u000a" +"(?m)^" 3 "AA\u000d\u000aBB\u000d\u000a<0>CC\u000d\u000a" +"(?m)^" 4 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a" + +"(?m)$" "AA<0>\u000d\u000aBB\u000d\u000aCC\u000d\u000a" +"(?m)$" 2 "AA\u000d\u000aBB<0>\u000d\u000aCC\u000d\u000a" +"(?m)$" 3 "AA\u000d\u000aBB\u000d\u000aCC<0>\u000d\u000a" +"(?m)$" 4 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a<0>" +"(?m)$" 5 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a" + +"$" "AA\u000d\u000aBB\u000d\u000aCC<0>\u000d\u000a" +"$" 2 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a<0>" +"$" 3 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a" + +"$" "\u000a\u0000a<0>\u000a" +"$" 2 "\u000a\u0000a\u000a<0>" +"$" 3 "\u000a\u0000a\u000a" + +"$" "<0>" +"$" 2 "" + +"$" "<0>\u000a" +"$" 2 "\u000a<0>" +"$" 3 "\u000a" + +"^" "<0>" +"^" 2 "" + +"\Z" "<0>" +"\Z" 2 "" +"\Z" 2 "\u000a<0>" +"\Z" 
"<0>\u000d\u000a" +"\Z" 2 "\u000d\u000a<0>" + + +# No matching ^ at interior new-lines if not in multi-line mode. +"^" "<0>AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a" +"^" 2 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a" + +# +# Dot-matches-any mode, and stopping at new-lines if off. +# +"." "<0>123\u000aXYZ" +"." 2 "1<0>23\u000aXYZ" +"." 3 "12<0>3\u000aXYZ" +"." 4 "123\u000a<0>XYZ" # . doesn't match newlines +"." 4 "123\u000b<0>XYZ" +"." 4 "123\u000c<0>XYZ" +"." 4 "123\u000d<0>XYZ" +"." 4 "123\u000d\u000a<0>XYZ" +"." 4 "123\u0085<0>XYZ" +"." 4 "123\u2028<0>XYZ" +"." 4 "123\u2029<0>XYZ" +"." 4s "123<0>\u000aXYZ" # . matches any +"." 4s "123<0>\u000bXYZ" +"." 4s "123<0>\u000cXYZ" +"." 4s "123<0>\u000dXYZ" +"." 4s "123<0>\u000d\u000aXYZ" +"." 4s "123<0>\u0085XYZ" +"." 4s "123<0>\u2028XYZ" +"." 4s "123<0>\u2029XYZ" +".{6}" "123\u000a\u000dXYZ" +".{6}" s "<0>123\u000a\u000dXY" + + +# +# Ranges +# +".*" "abc<0>defghi" +"a" "aaa<0>aaaaaa" +"a" 2 "aaaa<0>aaaaa" +"a" 3 "aaaaa<0>aaaa" +"a" 4 "aaaaaaaaa" +"a" "aaa<0>aaaaaa" + +# +# [set] parsing, systematically run through all of the parser states. +# +# +"[def]+" "abc<0>ddeeffghi" # set-open +"[^def]+" "<0>abcdefghi" +"[:digit:]+" "abc<0>123def" +"[:^digit:]+" "<0>abc123def" +"[\u005edef]+" "abc<0>de^fghi" + +"[]]+" "abc<0>]]][def" # set-open2 +"[^]]+" "<0>abc]]][def" + +"[:Lu:]+" "abc<0>ABCdef" # set-posix +"[:Lu]+" "abc<0>uL::Lu" +"[:^Lu]+" "abc<0>uL:^:Lu" +"[:]+" "abc<0>:::def" +"[:whats this:]" E " " +"[--]+" dE "-------" + +"[[nested]]+" "xyz[<0>nnetsteed]abc" #set-start +"[\x{41}]+" "CB<0>AAZYX" +"[\[\]\\]+" "&*<0>[]\\..." 
+"[*({<]+" "^&<0>{{(<<*)))" + + +"[-def]+" "abc<0>def-ef-dxyz" # set-start-dash +"[abc[--def]]" E " " + +"[x[&def]]+" "abc<0>def&ghi" # set-start-amp +"[&& is bad at start]" E " " + +"[abc" E " " # set-after-lit +"[def]]" "abcdef" +"[def]]" "abcde<0>f]]" + +"[[def][ghi]]+" "abc]<0>defghi[xyz" # set-after-set +"[[def]ghi]+" "abc]<0>defghi[xyz" +"[[[[[[[[[[[abc]" E " " +"[[abc]\p{Lu}]+" "def<0>abcABCxyz" + +"[d-f]+" "abc<0>defghi" # set-after-range +"[d-f[x-z]]+" "abc<0>defxyzzzgw" +"[\s\d]+" "abc<0> 123def" +"[d-f\d]+" "abc<0>def123ghi" +"[d-fr-t]+" "abc<0>defrstuvw" + +"[abc--]" E " " # set-after-op +"[[def]&&]" E " " +"[-abcd---]+" "<0>abc--" #[-abcd]--[-] +"[&abcd&&&ac]+" "b<0>ac&&cad" #[&abcd]&&[&ac] + +"[[abcd]&[ac]]+" "b<0>acacd" # set-set-amp +"[[abcd]&&[ac]]+" "b<0>acacd" +"[[abcd]&&ac]+" "b<0>acacd" +"[[abcd]&ac]+" "<0>bacacd&&&" + +"[abcd&[ac]]+" "<0>bacacd&&&" #set-lit-amp +"[abcd&&[ac]]+" "b<0>acacd" +"[abcd&&ac]+" "b<0>acacd" + +"[[abcd]-[ac]]+" "a<0>bdbdc" # set-set-dash +"[[abcd]--[ac]]+" "a<0>bdbdc" +"[[abcd]--ac]+" "a<0>bdbdc" +"[[abcd]-ac]+" "<0>bacacd---" + +"[a-d--[b-c]]+" "b<0>adadc" # set-range-dash +"[a-d--b-c]+" "b<0>adadc" +"[a-d-[b-c]]+" "<0>bad-adc" +"[a-d-b-c]+" "<0>bad-adc" +"[\w--[b-c]]+" "b<0>adadc" +"[\w--b-c]+" "b<0>adadc" +"[\w-[b-c]]+" "<0>bad-adc" +"[\w-b-c]+" "<0>bad-adc" + +"[a-d&&[b-c]]+" "a<0>bcbcd" # set-range-amp +"[a-d&&b-c]+" "a<0>bcbcd" +"[a-d&[b-c]]+" "<0>abc&bcd" +"[a-d&b-c]+" "<0>abc&bcd" + +"[abcd--bc]+" "b<0>addac" # set-lit-dash +"[abcd--[bc]]+" "b<0>addac" +"[abcd-[bc]]+" "<0>bad--dacxyz" +"[abcd-]+" "<0>bad--dacxyz" + +"[abcd-\s]+" E "xyz<0>abcd --xyz" # set-lit-dash-esc +"[abcd-\N{LATIN SMALL LETTER G}]+" "xyz-<0>abcdefghij-" +"[bcd-\{]+" "a<0>bcdefyz{|}" + +"[\p{Ll}]+" "ABC<0>abc^&*&" # set-escape +"[\P{Ll}]+" "abc<0>ABC^&*&xyz" +"[\N{LATIN SMALL LETTER Q}]+" "mnop<0>qqqrst" +"[\sa]+" "cb<0>a a (*&" +"[\S]+" " <0>hello " +"[\w]+" " <0>hello_world! 
" +"[\W]+" "a<0> *$%#,hello " +"[\d]+" "abc<0>123def" +"[\D]+" "123<0>abc567" +"[\$\#]+" "123<0>$#$#\\" + +# +# Try each of the Java compatibility properties. +# These are checked here, while normal Unicode properties aren't, because +# these Java compatibility properties are implemented directly by regexp, while other +# properties are handled by ICU's Property and UnicodeSet APIs. +# +# These tests are only to verify that the names are recognized and the +# implementation isn't dead. They are not intended to verify that the +# function definitions are 100% correct. +# +"[:InBasic Latin:]+" "ΓΔΕΖΗΘ<0>hello, world.ニヌネノハバパ" +"[:^InBasic Latin:]+" "<0>ΓΔΕΖΗΘhello, world.ニヌネノハバパ" +"\p{InBasicLatin}+" "ΓΔΕΖΗΘ<0>hello, world.ニヌネノハバパ" +"\P{InBasicLatin}+" "<0>ΓΔΕΖΗΘhello, world.ニヌネノハバパ" +"\p{InGreek}+" "<0>ΓΔΕΖΗΘhello, world.ニヌネノハバパ" +"\p{InCombining Marks for Symbols}" "<0>\u20d0" +"\p{Incombiningmarksforsymbols}" "<0>\u20d0" + + +"\p{javaDefined}+" "\uffff<0>abcd\U00045678" +"\p{javaDigit}+" "abc<0>1234xyz" +"\p{javaIdentifierIgnorable}+" "abc<0>\u0000\u000e\u009fxyz" +"\p{javaISOControl}+" "abc<0>\u0000\u000d\u0083xyz" +"\p{javaJavaIdentifierPart}+" "#@!<0>abc123_$;" +"\p{javaJavaIdentifierStart}+" "123\u0301<0>abc$_%^&" +"\p{javaLetter}+" "123<0>abcDEF&*()(" +"\p{javaLetterOrDigit}+" "$%^&*<0>123abcகஙசஜஞ☺♘♚☔☎♬⚄⚡" +"\p{javaLowerCase}+" "ABC<0>def&^%#:=" +"\p{javaMirrored}+" "ab$%<0>(){}[]xyz" +"\p{javaSpaceChar}+" "abc<0> \u00a0\u2028!@#" +"\p{javaSupplementaryCodePoint}+" "abc\uffff<0>\U00010000\U0010ffff\u0000" +"\p{javaTitleCase}+" "abCE<0>Džῌᾨ123" +"\p{javaUnicodeIdentifierStart}+" "123<0>abcⅣ%^&&*" +"\p{javaUnicodeIdentifierPart}+" "%&&^<0>abc123\u0301\u0002..." 
+"\p{javaUpperCase}+" "abc<0>ABC123" +"\p{javaValidCodePoint}+" "<0>\u0000abc\ud800 unpaired \udfff |\U0010ffff" +"\p{javaWhitespace}+" "abc\u00a0\u2007\u202f<0> \u0009\u001c\u001f\u202842" +"\p{all}+" "<0>123\u0000\U0010ffff" +"\P{all}+" "123\u0000\U0010ffff" + +# [:word:] is implemented directly by regexp. Not a java compat property, but PCRE and others. + +"[:word:]+" ".??$<0>abc123ΓΔΕΖΗ_%%%" +"\P{WORD}+" "<0>.??$abc123ΓΔΕΖΗ_%%%" + +# +# Errors on unrecognized ASCII letter escape sequences. +# +"[abc\Y]+" "<0>abcY" +"[abc\Y]+" eE "<0>abcY" + +"(?:a|b|c|\Y)+" "<0>abcY" +"(?:a|b|c|\Y)+" eE "<0>abcY" + +"\Q\Y\E" e "<0>\\Y" + +# +# Reported problem +# +"[a-\w]" E "x" + +# +# Bug 4045 +# +"A*" "<0>AAAA" +"A*" 2 "AAAA<0>" +"A*" 3 "AAAA" +"A*" 4 "AAAA" +"A*" 5 "AAAA" +"A*" 6 "AAAA" +"A*" "<0>" +"A*" 2 "" +"A*" 3 "" +"A*" 4 "" +"A*" 5 "" + +# +# Bug 4046 +# +"(?m)^" "<0>AA\u000dBB\u000dCC\u000d" +"(?m)^" 2 "AA\u000d<0>BB\u000dCC\u000d" +"(?m)^" 3 "AA\u000dBB\u000d<0>CC\u000d" +"(?m)^" 4 "AA\u000dBB\u000dCC\u000d" +"(?m)^" 5 "AA\u000dBB\u000dCC\u000d" +"(?m)^" 6 "AA\u000dBB\u000dCC\u000d" + +"(?m)^" "<0>AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a" +"(?m)^" 2 "AA\u000d\u000a<0>BB\u000d\u000aCC\u000d\u000a" +"(?m)^" 3 "AA\u000d\u000aBB\u000d\u000a<0>CC\u000d\u000a" +"(?m)^" 4 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a" + +# +# Bug 4059 +# +"\w+" "<0>イチロー" +"\b....\b." "<0>イチロー?" + + +# +# Bug 4058 ICU Unicode Set patterns have an odd feature - +# A $ as the last character before the close bracket means match +# a \uffff, which means off the end of the string in transliterators. +# Didn't make sense for regular expressions, and is now fixed. +# +"[\$](P|C|D);" "<0>$<1>P;" +"[$](P|C|D);" "<0>$<1>P;" +"[$$](P|C|D);" "<0>$<1>P;" + +# +# bug 4888 Flag settings lost in some cases. 
+# +"((a){2})|(#)" is "no" +"((a){2})|(#)" is "<0><1>a<2>a#" +"((a){2})|(#)" is "a<0><3>#" + +"((a|b){2})|c" is "<0>c" +"((a|b){2})|c" is "<0>C" +"((a|b){2})|c" s "C" + +# +# bug 5617 ZWJ \u200d shouldn't cause word boundaries +# +".+?\b" "<0> \u0935\u0915\u094D\u200D\u0924\u0947 " +".+?\b" 2 " <0>\u0935\u0915\u094D\u200D\u0924\u0947 " +".+?\b" 3 " \u0935\u0915\u094D\u200D\u0924\u0947 " + +# +# bug 5386 "^.*$" should match empty input +# +"^.*$" "<0>" +"^.*$" m "<0>" +"^.*$" "<0>\n" +"(?s)^.*$" "<0>\n" + +# +# bug 5386 Empty pattern and empty input should match. +# +"" "<0>abc" +"" "<0>" + +# +# bug 5386 Range upper and lower bounds can be equal +# +"[a-a]" "<0>a" + +# +# bug 5386 $* should not fail, should match empty string. +# +"$*" "<0>abc" + +# +# bug 5386 \Q ... \E escaping problem +# +"[a-z\Q-$\E]+" "QE<0>abc-def$." + +# More reported 5386 Java comaptibility failures +# +"[^]*abb]*" "<0>kkkk" +"\xa" "huh" # Java would like to be warned. +"^.*$" "<0>" + +# +# bug 5386 Empty left alternation should produce a zero length match. +# +"|a" "<0>a" +"$|ab" "<0>ab" +"$|ba" "ab<0>" + +# +# bug 5386 Java compatibility for set expressions +# +"[a-z&&[cde]]+" "ab<0>cdefg" + +# +# bug 6019 matches() needs to backtrack and check for a longer match if the +# first match(es) found don't match the entire input. +# +"a?|b" "<0>b" +"a?|b" M "<0>b" +"a?|.*?u|stuff|d" M "<0>stuff" +"a?|.*?(u)|stuff|d" M "<0>stuff<1>u" +"a+?" "<0>aaaaaaaaaaaaa" +"a+?" M "<0>aaaaaaaaaaaaa" + +# +# Bug 7724. Expression to validate zip codes. +# +"(?!0{5})(\d{5})(?!-?0{4})(-?\d{4})?" "<0><1>94040<2>-3344" +"(?!0{5})(\d{5})(?!-?0{4})(-?\d{4})?" "94040-0000" +"(?!0{5})(\d{5})(?!-?0{4})(-?\d{4})?" "00000-3344" + +# +# Bug 8666. Assertion failure on match, bad operand to JMP_SAV_X opcode. +# +"((.??)+|A)*" "<0><1><2>AAAAABBBBBCCCCCDDDDEEEEE" + +# +# Bug 8826. Incorrect results with case insensitive matches. +# +"AS(X)" i "aßx" +"AS.*" i "aßx" # Expansion of sharp s can't split between pattern terms. 
+"ASßS" i "<0>aßß" # All one literal string, does match. +"ASß{1}S" i "aßß" # Pattern with terms, no match. +"aßx" i "<0>assx" +"aßx" i "<0>ASSX" +"aßx" i "<0>aßx" +"ASS(.)" i "<0>aß<1>x" + +# Case Insensitive, probe some corner cases. +"ass+" i "aß" # Second 's' in pattern is qualified, can't combine with first. +"as+" i "aß" +"aßs" i "as" # Can't match half of a ß +"aß+" i "<0>asssssssss" +"aß+" i "<0>assßSssSSSs" +"a(ß?)+" i "<0>assssssss<1>s" +"a(ß?)+" i "<0>a<1>zzzzzzzzs" + +"\U00010400" i "<0>\U00010428" # case folded supplemental code point. + +"sstuff" i "<0>ßtuff" # exercise optimizations on what chars can start a match. +"sstuff" i "s<0>ßtuff" # exercise optimizations on what chars can start a match. +"ßtuff" i "s<0>sstuff" +"ßtuff" i "s<0>Sstuff" + +"a(..)\1" i "<0>A<1>bcBCdef" +"(ß)\1" i "aa<0><1>ssßzz" # Case insensitive back reference +"..(.)\1" i "<0>aa<1>ßss" +"ab(..)\1" i "xx<0>ab<1>ssßss" + +" (ss) ((\1.*)|(.*))" i "<0> <1>ss <2><4>sß" # The back reference 'ss' must not match in 'sß' + +# Bug 9057 +# \u200c and \u200d should be word characters. +# +"\w+" " <0>abc\u200cdef\u200dghi " +"\w+" i " <0>abc\u200cdef\u200dghi " +"[\w]+" " <0>abc\u200cdef\u200dghi " +"[\w]+" i " <0>abc\u200cdef\u200dghi " + +# Bug 9283 +# uregex_open fails for look-behind assertion + case-insensitive + +"(ab)?(?<=ab)cd|ef" i "<0><1>abcd" + +# Bug 9719 Loop breaking on (zero length match){3,} (unlimited upper bound). +# + +"(?:abc){1,}abc" "<0>abcabcabcabcabc" +"(?:2*){2,}?a2\z" "<0>2a2" +"(?:2*){2,}?a2\z" "2a3" +"(?:x?+){3,}+yz" "w<0>yz" +"(2*){2,}?a2\\z" "2a3" +"(2*){2,}?a2\\z" "<0>2<1>a2\\z" +"(2*){2,}?a2\z" "<0>2<1>a2" + + +# Bug 10024 +# Incorrect (unbounded) longest match length with {1, 20} style quantifiers. +# Unbounded match is disallowed in look-behind expressions. +# Max match length is used to limit where to check for look-behind matches. 
+ +"(?<=a{1,5})bc" "aaaa<0>bcdef" +"(?<=(?:aa){3,20})bc" "aaaaaa<0>bcdef" +"(?jkl" +"(?<=a{11})bc" "aaaaaaaaaaa<0>bc" +"(?<=a{11})bc" "aaaaaaaaaabc" +"(?<=a{1,})bc" E "aaaa<0>bcdef" # U_REGEX_LOOK_BEHIND_LIMIT error. +"(?<=(?:){11})bc" "<0>bc" # Empty (?:) expression. + +# Bug 10835 +# Match Start Set not being correctly computed for case insensitive patterns. +# (Test here is to dump the compiled pattern & manually check the start set.) + +"(private|secret|confidential|classified|restricted)" i "hmm, <0><1>Classified stuff" +"(private|secret|confidential|classified|restricted)" "hmm, Classified stuff" + +# Bug 10844 + +"^([\w\d:]+)$" "<0><1>DiesIst1Beispiel:text" +"^([\w\d:]+)$" i "<0><1>DiesIst1Beispiel:text" +"^(\w+\d\w+:\w+)$" "<0><1>DiesIst1Beispiel:text" +"^(\w+\d\w+:\w+)$" i "<0><1>DiesIst1Beispiel:text" + +# Bug 11049 +# Edge cases in find() when pattern match begins with set of code points +# and the match begins at the end of the string. + +"A|B|C" "hello <0>A" +"A|B|C" "hello \U00011234" +"A|B|\U00012345" "hello <0>\U00012345" +"A|B|\U00010000" "hello \ud800" + +# Bug 11369 +# Incorrect optimization of patterns with a zero length quantifier {0} + +"(.|b)(|b){0}\$(?#xxx){3}(?>\D*)" "AAAAABBBBBCCCCCDDDDEEEEE" +"(|b)ab(c)" "<0><1>ab<2>c" +"(|b){0}a{3}(D*)" "<0>aaa<2>" +"(|b){0,1}a{3}(D*)" "<0><1>aaa<2>" +"((|b){0})a{3}(D*)" "<0><1>aaa<3>" + +# Bug 11370 +# Max match length computation of look-behind expression gives result that is too big to fit in the +# in the 24 bit operand portion of the compiled code. Expressions should fail to compile +# (Look-behind match length must be bounded. This case is treated as unbounded, an error.) + +"(?pre<1>\ud800post\ud800 fin" +"pre(.)post\1" i "pre\ud800post\ud800\udc00" # case insensiteve backrefs take a different code path +"pre(.)post\1" i "<0>pre<1>\ud800post\ud800 fin" + +# Bug 11554 +# +# Maximum match length computation was assuming UTF-16. +# Used in look-behind matches to constrain how far back to look. 
+ +"(?<=a\x{100000})spam" "***a\x{100000}<0>spam**" +"(?<=aą)spam" "**aą<0>spam**" +"(?<=ąabc)spam" "**ąabc<0>spam**" + +"(?<=a\x{100000})spam" "***a\x{100001}spam**" +"(?<=aą)spam" "**bąspam**" +"(?<=ąabc)spam" "**ąabxspam**" + +# with negative look-behind + +"(?spam**" +"(?spam**" +"(?spam**" + +# Bug #12930 +# +# Minimum Match Length computation, int32_t overflow on an empty set in the pattern. +# The empty set, with no match possible, has a min match length of INT32_MAX. +# Was incremented subsequently. Caused assertion failure on pattern compile. + +"[^\u0000-\U0010ffff]bc?" "bc no match" +"[^\u0000-\U0010ffff]?bc?" "<0>bc has a match" + +# Bug #12160 Hit End behavior after find fails to find. +# To match Java, should be true if find fails to find. +# +"abc" Z "<0>abc abc abc xyz" +"abc" Z2 "abc <0>abc abc xyz" +"abc" Z3 "abc abc <0>abc xyz" +"abc" z4 "abc abc abc xyz" + +# Bug #13844 Verify that non-standard Java property names are recognized. +"[\p{IsAlphabetic}]" " <0>A" +"[\P{IsAlphabetic}]" "A<0> " +"[\p{IsIdeographic}]" "A<0>〆" +"[\P{IsIdeographic}]" "〆<0>A" +"[\p{IsLetter}]" " <0>A" +"[\P{IsLetter}]" "A<0> " +"[\p{Letter}]" " <0>A" +"[\p{IsLowercase}]" "A<0>a" +"[\P{IsLowercase}]" "a<0>A" +"[\p{IsUppercase}]" "a<0>A" +"[\P{IsUppercase}]" "A<0>a" +"[\p{IsTitlecase}]" "D<0>Dz" +"[\P{IsTitlecase}]" "Dz<0>D" +"[\p{IsPunctuation}]" " <0>&" +"[\P{IsPunctuation}]" "&<0> " +"[\p{IsControl}]" " <0>\x{82}" +"[\P{IsControl}]" "\x{82}<0> " +"[\p{IsWhite_Space}]" "x<0> " +"[\P{IsWhite_Space}]" " <0>x" +"[\p{IsDigit}]" " <0>4" +"[\P{IsDigit}]" "4<0> " +"[\p{IsHex_Digit}]" " <0>F" +"[\P{IsHex_Digit}]" "F<0> " +"[\p{IsJoin_Control}]" " <0>\x{200d}" +"[\P{IsJoin_Control}]" "\x{200d}<0> " +"[\p{IsNoncharacter_Code_Point}]" "A<0>\x{5fffe}" +"[\p{IsAssigned}]" "\x{10ffff}<0>a" +"[\P{IsAssigned}]" "a<0>\x{10ffff}" + +"[\p{InBasic Latin}]" "〆<0>A" +"[\p{InBasicLatin}]" "〆<0>A" +"[\p{InBasic-Latin}]" "〆<0>A" # ICU accepts '-'; Java does not. 
+"[\p{InBasic_Latin}]" "〆<0>A" +"[\p{Inbasiclatin}]" "〆<0>A" +"[\p{inbasiclatin}]" E "〆<0>A" # "In" must be cased as shown. Property name part is case insensitive. +"[\p{InCombining_Marks_for_Symbols}]" "a<0>\x{20DD}" # COMBINING ENCLOSING CIRCLE + +"[\p{all}]*" "<0>\x{00}abc\x{10ffff}" +"[\p{javaBadProperty}]" E "whatever" +"[\p{IsBadProperty}]" E "whatever" +"[\p{InBadBlock}]" E "whatever" +"[\p{In}]" E "whatever" +"[\p{Is}]" E "whatever" +"[\p{java}]" "x<0>ꦉ" # Note: "java" is a valid script code. + +"[\p{javaLowerCase}]+" "A<0>a" +"[\p{javaLowerCase}]+" i "<0>Aa" +"[\P{javaLowerCase}]+" "<0>Aa" +"[\P{javaLowerCase}]+" i "Aa" # No Match because case fold of the set happens first, then negation. + # JDK is not case insensitive w named properties, even though + # the insensitive match flag is set. A JDK bug? + +"[a-z]+" i "<0>Aa" # Matches JDK behavior. +"[^a-z]+" i "Aa" # (no match) which is JDK behavior. Case fold first, then negation. + +# Bug 20385. Assertion failure while compiling a negative look-behind expression consisting of a set with +# no contents. Meaning the [set] can never match. There is no syntax to directly express +# an empty set, so generate it by negating (^) a set of all code points. +# Also check empty sets in other contexts. + +"(?abc" + +"(?abc" +"x(?xabc" +"x(?xabc" +"x(?xabc" + +"[^\u0000-\U0010ffff]" "a" +"[^[^\u0000-\U0010ffff]]" "<0>a" + +"This is a string with (?:one |two |three )endings" "<0>This is a string with two endings" + +# Bug ICU-20544. Similar to 20385, above. Assertion failure with a negative look-behind assertion containing +# a set with no contents. Look-behind pattern includes more than just the empty set. + +"(?abc" # note: first 'ⰿ' is \u2c3f, hence empty set. +"(?abc" +"(?<=[^[^]]†)" "abc" # Problem also exists w positive look-behind + +# Bug ICU-20391. Crash in computation of minimum match length with nested look-around patterns. 
+# +"(?<=(?<=((?=)){0}+)" E "aaa" +"(?<=(?<=((?=)){0}+))" "<0>" +"(?<=c(?<=b((?=a)){1}+))" "aaa" +"abc(?=de(?=f))...g" "<0>abcdefg" +"abc(?=de(?=f))...g" "abcdxfg" + +# Bug ICU-20618 Assertion failure with nested look-around expressions. +# +"(?<=(?<=b?(?=a)))" "hello, world." + +# Bug ICU-20939 +# Incorrect word \b boundaries w UTF-8 input and non-ASCII text +# +"(?w)\b" v2 "äää<0> äää" + +# Bug ICU-21492 Assertion failure with nested look-around expressions. +# +"(?<=(?:(?<=(?:(?<=(?:(?<=)){2})){3})){4}" E "<0>" # orig failure from bug report, w mismatched parens. +"(?:(?<=(?:(?<=)){2}))" "<0>" # Simplified case, with a valid pattern. + +# Random debugging, Temporary +# + +# +# Regexps from http://www.regexlib.com +# +"^[a-zA-Z]{1,2}[0-9][0-9A-Za-z]{0,1} {0,1}[0-9][A-Za-z]{2}$" G "<0>G1 1AA" +"^[a-zA-Z]{1,2}[0-9][0-9A-Za-z]{0,1} {0,1}[0-9][A-Za-z]{2}$" G "<0>EH10 2QQ" +"^[a-zA-Z]{1,2}[0-9][0-9A-Za-z]{0,1} {0,1}[0-9][A-Za-z]{2}$" G "<0>SW1 1ZZ" +"^[a-zA-Z]{1,2}[0-9][0-9A-Za-z]{0,1} {0,1}[0-9][A-Za-z]{2}$" "G111 1AA" +"^[a-zA-Z]{1,2}[0-9][0-9A-Za-z]{0,1} {0,1}[0-9][A-Za-z]{2}$" "X10 WW" +"^[a-zA-Z]{1,2}[0-9][0-9A-Za-z]{0,1} {0,1}[0-9][A-Za-z]{2}$" "DDD 5WW" +#"^[\w\-]+(?:\.[\w\-]+)*@(?:[\w\-]+\.)+[a-zA-Z]{2,7}$" dG "<0>joe.tillis@unit.army.mil" # TODO: \w in pattern +#"^[\w-]+(?:\.[\w-]+)*@(?:[\w-]+\.)+[a-zA-Z]{2,7}$" G "<0>jack_rabbit@slims.com" # TODO: \w in pattern +#"^[\w-]+(?:\.[\w-]+)*@(?:[\w-]+\.)+[a-zA-Z]{2,7}$" G "<0>foo99@foo.co.uk" # TODO: \w in pattern +#"^[\w-]+(?:\.[\w-]+)*@(?:[\w-]+\.)+[a-zA-Z]{2,7}$" "find_the_mistake.@foo.org" # TODO: \w in pattern +#"^[\w-]+(?:\.[\w-]+)*@(?:[\w-]+\.)+[a-zA-Z]{2,7}$" ".prefix.@some.net" +"^([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?)$" G "<0>asmith@mactec.com" +"^([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?)$" G "<0>foo12@foo.edu" 
+"^([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?)$" G "<0>bob.smith@foo.tv" +"^([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?)$" "joe" +"^([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?)$" "@foo.com" +"^([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?)$" "a@a" +"^\d{1,2}\/\d{1,2}\/\d{4}$" G "<0>4/1/2001" +"^\d{1,2}\/\d{1,2}\/\d{4}$" G "<0>12/12/2001" +"^\d{1,2}\/\d{1,2}\/\d{4}$" G "<0>55/5/3434" +"^\d{1,2}\/\d{1,2}\/\d{4}$" "1/1/01" +"^\d{1,2}\/\d{1,2}\/\d{4}$" "12 Jan 01" +"^\d{1,2}\/\d{1,2}\/\d{4}$" "1-1-2001" +"^(?:(?:(?:0?[13578]|1[02])(\/|-|\.)31)\1|(?:(?:0?[1,3-9]|1[0-2])(\/|-|\.)(?:29|30)\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:0?2(\/|-|\.)29\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:(?:0?[1-9])|(?:1[0-2]))(\/|-|\.)(?:0?[1-9]|1\d|2[0-8])\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" G "<0>01.1.02" +"^(?:(?:(?:0?[13578]|1[02])(\/|-|\.)31)\1|(?:(?:0?[1,3-9]|1[0-2])(\/|-|\.)(?:29|30)\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:0?2(\/|-|\.)29\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:(?:0?[1-9])|(?:1[0-2]))(\/|-|\.)(?:0?[1-9]|1\d|2[0-8])\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" G "<0>11-30-2001" +"^(?:(?:(?:0?[13578]|1[02])(\/|-|\.)31)\1|(?:(?:0?[1,3-9]|1[0-2])(\/|-|\.)(?:29|30)\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:0?2(\/|-|\.)29\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:(?:0?[1-9])|(?:1[0-2]))(\/|-|\.)(?:0?[1-9]|1\d|2[0-8])\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" G "<0>2/29/2000" 
+"^(?:(?:(?:0?[13578]|1[02])(\/|-|\.)31)\1|(?:(?:0?[1,3-9]|1[0-2])(\/|-|\.)(?:29|30)\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:0?2(\/|-|\.)29\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:(?:0?[1-9])|(?:1[0-2]))(\/|-|\.)(?:0?[1-9]|1\d|2[0-8])\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" "02/29/01" +"^(?:(?:(?:0?[13578]|1[02])(\/|-|\.)31)\1|(?:(?:0?[1,3-9]|1[0-2])(\/|-|\.)(?:29|30)\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:0?2(\/|-|\.)29\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:(?:0?[1-9])|(?:1[0-2]))(\/|-|\.)(?:0?[1-9]|1\d|2[0-8])\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" "13/01/2002" +"^(?:(?:(?:0?[13578]|1[02])(\/|-|\.)31)\1|(?:(?:0?[1,3-9]|1[0-2])(\/|-|\.)(?:29|30)\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:0?2(\/|-|\.)29\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:(?:0?[1-9])|(?:1[0-2]))(\/|-|\.)(?:0?[1-9]|1\d|2[0-8])\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" "11/00/02" +"^(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])$" G "<0>127.0.0.1" +"^(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])$" G "<0>255.255.255.0" +"^(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])$" G "<0>192.168.0.1" 
+"^(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])$" "1200.5.4.3" +"^(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])$" "abc.def.ghi.jkl" +"^(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])$" "255.foo.bar.1" +"(AUX|PRN|NUL|COM\d|LPT\d)+\s*$" G "<0>COM1" +"(AUX|PRN|NUL|COM\d|LPT\d)+\s*$" G "<0>AUX" +"(AUX|PRN|NUL|COM\d|LPT\d)+\s*$" G "<0>LPT1" +"(AUX|PRN|NUL|COM\d|LPT\d)+\s*$" "image.jpg" +"(AUX|PRN|NUL|COM\d|LPT\d)+\s*$" "index.html" +"(AUX|PRN|NUL|COM\d|LPT\d)+\s*$" "readme.txt" +"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[1,3-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" G "<0>29/02/1972" +"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[1,3-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" G "<0>5-9-98" 
+"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[1,3-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" G "<0>10-11-2002" +"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[1,3-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" "29/02/2003" +"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[1,3-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" "12/13/2002" +"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[1,3-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" "1-1-1500" +"^(user=([a-z0-9]+,)*(([a-z0-9]+){1});)?(group=([a-z0-9]+,)*(([a-z0-9]+){1});)?(level=[0-9]+;)?$" G "<0>user=foo,bar,quux;group=manager,admin;level=100;" +"^(user=([a-z0-9]+,)*(([a-z0-9]+){1});)?(group=([a-z0-9]+,)*(([a-z0-9]+){1});)?(level=[0-9]+;)?$" G "<0>group=nobody;level=24;" +"^(user=([a-z0-9]+,)*(([a-z0-9]+){1});)?(group=([a-z0-9]+,)*(([a-z0-9]+){1});)?(level=[0-9]+;)?$" "user=foo" +"^(user=([a-z0-9]+,)*(([a-z0-9]+){1});)?(group=([a-z0-9]+,)*(([a-z0-9]+){1});)?(level=[0-9]+;)?$" "blahh" +"^(\(?\+?[0-9]*\)?)?[0-9_\- \(\)]*$" G "<0>(+44)(0)20-12341234" +"^(\(?\+?[0-9]*\)?)?[0-9_\- \(\)]*$" G "<0>02012341234" 
+"^(\(?\+?[0-9]*\)?)?[0-9_\- \(\)]*$" G "<0>+44 (0) 1234-1234" +"^(\(?\+?[0-9]*\)?)?[0-9_\- \(\)]*$" "(44+)020-12341234" +"^(\(?\+?[0-9]*\)?)?[0-9_\- \(\)]*$" "12341234(+020)" +"\b(\w+)\s+\1\b" G "<0>Tell the the preacher" +"\b(\w+)\s+\1\b" G "<0>some some" +"\b(\w+)\s+\1\b" G "<0>hubba hubba" +"\b(\w+)\s+\1\b" "once an annual report" +"\b(\w+)\s+\1\b" "mandate dated submissions" +"\b(\w+)\s+\1\b" "Hubba hubba" +"(^\+[0-9]{2}|^\+[0-9]{2}\(0\)|^\(\+[0-9]{2}\)\(0\)|^00[0-9]{2}|^0)([0-9]{9}$|[0-9\-\s]{10}$)" G "<0>+31235256677" +"(^\+[0-9]{2}|^\+[0-9]{2}\(0\)|^\(\+[0-9]{2}\)\(0\)|^00[0-9]{2}|^0)([0-9]{9}$|[0-9\-\s]{10}$)" G "<0>+31(0)235256677" +"(^\+[0-9]{2}|^\+[0-9]{2}\(0\)|^\(\+[0-9]{2}\)\(0\)|^00[0-9]{2}|^0)([0-9]{9}$|[0-9\-\s]{10}$)" G "<0>023-5256677" +"(^\+[0-9]{2}|^\+[0-9]{2}\(0\)|^\(\+[0-9]{2}\)\(0\)|^00[0-9]{2}|^0)([0-9]{9}$|[0-9\-\s]{10}$)" "+3123525667788999" +"(^\+[0-9]{2}|^\+[0-9]{2}\(0\)|^\(\+[0-9]{2}\)\(0\)|^00[0-9]{2}|^0)([0-9]{9}$|[0-9\-\s]{10}$)" "3123525667788" +"(^\+[0-9]{2}|^\+[0-9]{2}\(0\)|^\(\+[0-9]{2}\)\(0\)|^00[0-9]{2}|^0)([0-9]{9}$|[0-9\-\s]{10}$)" "232-2566778" +"^[-+]?\d*\.?\d*$" G "<0>123" +"^[-+]?\d*\.?\d*$" G "<0>+3.14159" +"^[-+]?\d*\.?\d*$" G "<0>-3.14159" +"^[-+]?\d*\.?\d*$" "abc" +"^[-+]?\d*\.?\d*$" "3.4.5" +"^[-+]?\d*\.?\d*$" "$99.95" +"^\$?([1-9]{1}[0-9]{0,2}(\,[0-9]{3})*(\.[0-9]{0,2})?|[1-9]{1}[0-9]{0,}(\.[0-9]{0,2})?|0(\.[0-9]{0,2})?|(\.[0-9]{1,2})?)$" G "<0>$1,234.50" +"^\$?([1-9]{1}[0-9]{0,2}(\,[0-9]{3})*(\.[0-9]{0,2})?|[1-9]{1}[0-9]{0,}(\.[0-9]{0,2})?|0(\.[0-9]{0,2})?|(\.[0-9]{1,2})?)$" G "<0>$0.70" +"^\$?([1-9]{1}[0-9]{0,2}(\,[0-9]{3})*(\.[0-9]{0,2})?|[1-9]{1}[0-9]{0,}(\.[0-9]{0,2})?|0(\.[0-9]{0,2})?|(\.[0-9]{1,2})?)$" G "<0>.7" +"^\$?([1-9]{1}[0-9]{0,2}(\,[0-9]{3})*(\.[0-9]{0,2})?|[1-9]{1}[0-9]{0,}(\.[0-9]{0,2})?|0(\.[0-9]{0,2})?|(\.[0-9]{1,2})?)$" "$0,123.50" +"^\$?([1-9]{1}[0-9]{0,2}(\,[0-9]{3})*(\.[0-9]{0,2})?|[1-9]{1}[0-9]{0,}(\.[0-9]{0,2})?|0(\.[0-9]{0,2})?|(\.[0-9]{1,2})?)$" "$00.5" +"^[A-Z]{2}[0-9]{6}[A-DFM]{1}$" G 
"<0>AB123456D" +"^[A-Z]{2}[0-9]{6}[A-DFM]{1}$" G "<0>AB123456F" +"^[A-Z]{2}[0-9]{6}[A-DFM]{1}$" G "<0>AB123456M" +"^[A-Z]{2}[0-9]{6}[A-DFM]{1}$" "AB123456E" +"^[A-Z]{2}[0-9]{6}[A-DFM]{1}$" "ab123456d" +#"(http|ftp|https):\/\/[\w]+(.[\w]+)([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?" G "<0>http://regxlib.com/Default.aspx" # TODO: \w in pattern +#"(http|ftp|https):\/\/[\w]+(.[\w]+)([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?" G "<0>http://electronics.cnet.com/electronics/0-6342366-8-8994967-1.html" # TODO: \w in pattern +#"(http|ftp|https):\/\/[\w]+(.[\w]+)([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?" "www.yahoo.com" # TODO: \w in pattern +"^[0-9]{4}\s{0,1}[a-zA-Z]{2}$" G "<0>2034AK" +"^[0-9]{4}\s{0,1}[a-zA-Z]{2}$" G "<0>2034 AK" +"^[0-9]{4}\s{0,1}[a-zA-Z]{2}$" G "<0>2034 ak" +"^[0-9]{4}\s{0,1}[a-zA-Z]{2}$" "2034 AK" +"^[0-9]{4}\s{0,1}[a-zA-Z]{2}$" "321321 AKSSAA" +"((\d{2})|(\d))\/((\d{2})|(\d))\/((\d{4})|(\d{2}))" G "<0>4/5/91" +"((\d{2})|(\d))\/((\d{2})|(\d))\/((\d{4})|(\d{2}))" G "<0>04/5/1991" +"((\d{2})|(\d))\/((\d{2})|(\d))\/((\d{4})|(\d{2}))" G "<0>4/05/89" +"((\d{2})|(\d))\/((\d{2})|(\d))\/((\d{4})|(\d{2}))" "4/5/1" +#"(^|\s|\()((([1-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-]((2[0-9]){1}|(3[01]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:])|(^|\s|\()((([0-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-](([11-31]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:|$|\>])){1}){1}){1}){1}" G "<0>01/01/2001 " #TODO - \s in pattern. 
+"(^|\s|\()((([1-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-]((2[0-9]){1}|(3[01]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:])|(^|\s|\()((([0-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-](([11-31]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:|$|\>])){1}){1}){1}){1}" G "<0>01-01-2001:" +"(^|\s|\()((([1-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-]((2[0-9]){1}|(3[01]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:])|(^|\s|\()((([0-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-](([11-31]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:|$|\>])){1}){1}){1}){1}" G "<0>(1-1-01)" +"(^|\s|\()((([1-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-]((2[0-9]){1}|(3[01]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:])|(^|\s|\()((([0-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-](([11-31]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:|$|\>])){1}){1}){1}){1}" "13/1/2001" +"(^|\s|\()((([1-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-]((2[0-9]){1}|(3[01]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:])|(^|\s|\()((([0-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-](([11-31]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:|$|\>])){1}){1}){1}){1}" "1-32-2001" +"(^|\s|\()((([1-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-]((2[0-9]){1}|(3[01]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:])|(^|\s|\()((([0-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-](([11-31]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:|$|\>])){1}){1}){1}){1}" "1-1-1801" +"^\d{3}\s?\d{3}$" G "<0>400 099" +"^\d{3}\s?\d{3}$" G "<0>400099" +"^\d{3}\s?\d{3}$" G "<0>400050" +"^\d{3}\s?\d{3}$" 
"2345678" +"^\d{3}\s?\d{3}$" "12345" +"^\d{3}\s?\d{3}$" "asdf" +"^\D?(\d{3})\D?\D?(\d{3})\D?(\d{4})$" G "<0>(111) 222-3333" +"^\D?(\d{3})\D?\D?(\d{3})\D?(\d{4})$" G "<0>1112223333" +"^\D?(\d{3})\D?\D?(\d{3})\D?(\d{4})$" G "<0>111-222-3333" +"^\D?(\d{3})\D?\D?(\d{3})\D?(\d{4})$" "11122223333" +"^\D?(\d{3})\D?\D?(\d{3})\D?(\d{4})$" "11112223333" +"^\D?(\d{3})\D?\D?(\d{3})\D?(\d{4})$" "11122233333" +"^#?([a-f]|[A-F]|[0-9]){3}(([a-f]|[A-F]|[0-9]){3})?$" G "<0>#00ccff" +"^#?([a-f]|[A-F]|[0-9]){3}(([a-f]|[A-F]|[0-9]){3})?$" G "<0>#039" +"^#?([a-f]|[A-F]|[0-9]){3}(([a-f]|[A-F]|[0-9]){3})?$" G "<0>ffffcc" +"^#?([a-f]|[A-F]|[0-9]){3}(([a-f]|[A-F]|[0-9]){3})?$" "blue" +"^#?([a-f]|[A-F]|[0-9]){3}(([a-f]|[A-F]|[0-9]){3})?$" "0x000000" +"^#?([a-f]|[A-F]|[0-9]){3}(([a-f]|[A-F]|[0-9]){3})?$" "#ff000" +"^([0-9a-fA-F][0-9a-fA-F]:){5}([0-9a-fA-F][0-9a-fA-F])$" G "<0>01:23:45:67:89:ab" +"^([0-9a-fA-F][0-9a-fA-F]:){5}([0-9a-fA-F][0-9a-fA-F])$" G "<0>01:23:45:67:89:AB" +"^([0-9a-fA-F][0-9a-fA-F]:){5}([0-9a-fA-F][0-9a-fA-F])$" G "<0>fE:dC:bA:98:76:54" +"^([0-9a-fA-F][0-9a-fA-F]:){5}([0-9a-fA-F][0-9a-fA-F])$" "01:23:45:67:89:ab:cd" +"^([0-9a-fA-F][0-9a-fA-F]:){5}([0-9a-fA-F][0-9a-fA-F])$" "01:23:45:67:89:Az" +"^([0-9a-fA-F][0-9a-fA-F]:){5}([0-9a-fA-F][0-9a-fA-F])$" "01:23:45:56:" +"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" G "<0>http://www.blah.com/~joe" +"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" G "<0>ftp://ftp.blah.co.uk:2828/blah%20blah.gif" +"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" G "<0>https://blah.gov/blah-blah.as" +"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" "www.blah.com" +"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" 
"http://www.blah.com/I have spaces!" +"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" "ftp://blah_underscore/[nope]" +"^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2})$|^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2}\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$" G "<0>12/01/2002" +"^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2})$|^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2}\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$" G "<0>12/01/2002 12:32:10" +"^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2})$|^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2}\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$" "32/12/2002" +"^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2})$|^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2}\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$" "12/13/2001" +"^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2})$|^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2}\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$" "12/02/06" +"^[0-9](\.[0-9]+)?$" G "<0>1.2345" +"^[0-9](\.[0-9]+)?$" G "<0>0.00001" +"^[0-9](\.[0-9]+)?$" G "<0>7" +"^[0-9](\.[0-9]+)?$" "12.2" +"^[0-9](\.[0-9]+)?$" "1.10.1" +"^[0-9](\.[0-9]+)?$" "15.98" +"^(?:[mM]{1,3})?(?:(?:[cC][dDmM])|(?:[dD]?(?:[cC]{1,3})?))?[lL]?(([xX])(?:\2{1,2}|[lL]|[cC])?)?((([iI])((\5{1,2})|[vV]|[xX]|[lL])?)|([vV]?([iI]{1,3})?))?$" G "<0>III" +"^(?:[mM]{1,3})?(?:(?:[cC][dDmM])|(?:[dD]?(?:[cC]{1,3})?))?[lL]?(([xX])(?:\2{1,2}|[lL]|[cC])?)?((([iI])((\5{1,2})|[vV]|[xX]|[lL])?)|([vV]?([iI]{1,3})?))?$" G "<0>xiv" +"^(?:[mM]{1,3})?(?:(?:[cC][dDmM])|(?:[dD]?(?:[cC]{1,3})?))?[lL]?(([xX])(?:\2{1,2}|[lL]|[cC])?)?((([iI])((\5{1,2})|[vV]|[xX]|[lL])?)|([vV]?([iI]{1,3})?))?$" G "<0>MCMLXLIX" +"^(?:[mM]{1,3})?(?:(?:[cC][dDmM])|(?:[dD]?(?:[cC]{1,3})?))?[lL]?(([xX])(?:\2{1,2}|[lL]|[cC])?)?((([iI])((\5{1,2})|[vV]|[xX]|[lL])?)|([vV]?([iI]{1,3})?))?$" "iiV" 
+"^(?:[mM]{1,3})?(?:(?:[cC][dDmM])|(?:[dD]?(?:[cC]{1,3})?))?[lL]?(([xX])(?:\2{1,2}|[lL]|[cC])?)?((([iI])((\5{1,2})|[vV]|[xX]|[lL])?)|([vV]?([iI]{1,3})?))?$" "MCCM" +"^(?:[mM]{1,3})?(?:(?:[cC][dDmM])|(?:[dD]?(?:[cC]{1,3})?))?[lL]?(([xX])(?:\2{1,2}|[lL]|[cC])?)?((([iI])((\5{1,2})|[vV]|[xX]|[lL])?)|([vV]?([iI]{1,3})?))?$" "XXXX" +"^[-+]?[0-9]+[.]?[0-9]*([eE][-+]?[0-9]+)?$" G "<0>123" +"^[-+]?[0-9]+[.]?[0-9]*([eE][-+]?[0-9]+)?$" G "<0>-123.35" +"^[-+]?[0-9]+[.]?[0-9]*([eE][-+]?[0-9]+)?$" G "<0>-123.35e-2" +"^[-+]?[0-9]+[.]?[0-9]*([eE][-+]?[0-9]+)?$" "abc" +"^[-+]?[0-9]+[.]?[0-9]*([eE][-+]?[0-9]+)?$" "123.32e" +"^[-+]?[0-9]+[.]?[0-9]*([eE][-+]?[0-9]+)?$" "123.32.3" +"^[a-zA-Z]+(([\'\,\.\- ][a-zA-Z ])?[a-zA-Z]*)*$" G "<0>T.F. Johnson" +"^[a-zA-Z]+(([\'\,\.\- ][a-zA-Z ])?[a-zA-Z]*)*$" G "<0>John O'Neil" +"^[a-zA-Z]+(([\'\,\.\- ][a-zA-Z ])?[a-zA-Z]*)*$" G "<0>Mary-Kate Johnson" +"^[a-zA-Z]+(([\'\,\.\- ][a-zA-Z ])?[a-zA-Z]*)*$" "sam_johnson" +"^[a-zA-Z]+(([\'\,\.\- ][a-zA-Z ])?[a-zA-Z]*)*$" "Joe--Bob Jones" +"^[a-zA-Z]+(([\'\,\.\- ][a-zA-Z ])?[a-zA-Z]*)*$" "dfjsd0rd" +"^(20|21|22|23|[0-1]\d)[0-5]\d$" G "<0>1200" +"^(20|21|22|23|[0-1]\d)[0-5]\d$" G "<0>1645" +"^(20|21|22|23|[0-1]\d)[0-5]\d$" G "<0>2359" +"^(20|21|22|23|[0-1]\d)[0-5]\d$" "2400" +"^(20|21|22|23|[0-1]\d)[0-5]\d$" "asbc" +"^(20|21|22|23|[0-1]\d)[0-5]\d$" "12:45" +/<[^>]*\n?.*=("|')?(.*\.jpg)("|')?.*\n?[^<]*>/ G '<0>' +/<[^>]*\n?.*=("|')?(.*\.jpg)("|')?.*\n?[^<]*>/ G "<0>" +/<[^>]*\n?.*=("|')?(.*\.jpg)("|')?.*\n?[^<]*>/ G "<0>" +/<[^>]*\n?.*=("|')?(.*\.jpg)("|')?.*\n?[^<]*>/ "= img.jpg" +/<[^>]*\n?.*=("|')?(.*\.jpg)("|')?.*\n?[^<]*>/ "img.jpg" +"^(\d{5}-\d{4}|\d{5})$|^([a-zA-Z]\d[a-zA-Z] \d[a-zA-Z]\d)$" G "<0>78754" +"^(\d{5}-\d{4}|\d{5})$|^([a-zA-Z]\d[a-zA-Z] \d[a-zA-Z]\d)$" G "<0>78754-1234" +"^(\d{5}-\d{4}|\d{5})$|^([a-zA-Z]\d[a-zA-Z] \d[a-zA-Z]\d)$" G "<0>G3H 6A3" +"^(\d{5}-\d{4}|\d{5})$|^([a-zA-Z]\d[a-zA-Z] \d[a-zA-Z]\d)$" "78754-12aA" +"^(\d{5}-\d{4}|\d{5})$|^([a-zA-Z]\d[a-zA-Z] \d[a-zA-Z]\d)$" "7875A" 
+"^(\d{5}-\d{4}|\d{5})$|^([a-zA-Z]\d[a-zA-Z] \d[a-zA-Z]\d)$" "g3h6a3" +#"^([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))$" G "<0>bob@somewhere.com" # TODO: \w in pattern +#"^([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))$" G "<0>bob.jones@[1.1.1.1]" +#"^([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))$" G "<0>bob@a.b.c.d.info" # TODO: \w in pattern +#"^([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))$" "bob@com" # TODO: \w in pattern +#"^([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))$" "bob.jones@some.where" # TODO: \w in pattern +#"^([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))$" "bob@1.1.1.123" # TODO: \w in pattern +#"^(([-\w \.]+)|(""[-\w \.]+"") )?<([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))>$" G "<0>" # TODO: \w in pattern +#"^(([-\w \.]+)|(""[-\w \.]+"") )?<([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))>$" G "<0>bob A. jones " # TODO: \w in pattern +#"^(([-\w \.]+)|(""[-\w \.]+"") )?<([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))>$" G "<0>bob A. jones " # TODO: \w in pattern +#"^(([-\w \.]+)|(""[-\w \.]+"") )?<([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))>$" "ab@cd.ef" # TODO: \w in pattern +#"^(([-\w \.]+)|(""[-\w \.]+"") )?<([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))>$" ""bob A. jones " # TODO: \w in pattern +#"^(([-\w \.]+)|(""[-\w \.]+"") )?<([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))>$" "bob A. 
jones " # TODO: \w in pattern +"^[A-Za-z]{1,2}[0-9A-Za-z]{1,2}[ ]?[0-9]{0,1}[A-Za-z]{2}$" G "<0>SW112LE" +"^[A-Za-z]{1,2}[0-9A-Za-z]{1,2}[ ]?[0-9]{0,1}[A-Za-z]{2}$" G "<0>SW11 2LE" +"^[A-Za-z]{1,2}[0-9A-Za-z]{1,2}[ ]?[0-9]{0,1}[A-Za-z]{2}$" G "<0>CR05LE" +"^[A-Za-z]{1,2}[0-9A-Za-z]{1,2}[ ]?[0-9]{0,1}[A-Za-z]{2}$" "12CR0LE" +"^[A-Za-z]{1,2}[0-9A-Za-z]{1,2}[ ]?[0-9]{0,1}[A-Za-z]{2}$" "12CR 0LE" +"^[A-Za-z]{1,2}[0-9A-Za-z]{1,2}[ ]?[0-9]{0,1}[A-Za-z]{2}$" "SWLE05" +"20\d{2}(-|\/)((0[1-9])|(1[0-2]))(-|\/)((0[1-9])|([1-2][0-9])|(3[0-1]))(T|\s)(([0-1][0-9])|(2[0-3])):([0-5][0-9]):([0-5][0-9])" G "<0>2099-12-31T23:59:59" +"20\d{2}(-|\/)((0[1-9])|(1[0-2]))(-|\/)((0[1-9])|([1-2][0-9])|(3[0-1]))(T|\s)(([0-1][0-9])|(2[0-3])):([0-5][0-9]):([0-5][0-9])" G "<0>2002/02/09 16:30:00" +"20\d{2}(-|\/)((0[1-9])|(1[0-2]))(-|\/)((0[1-9])|([1-2][0-9])|(3[0-1]))(T|\s)(([0-1][0-9])|(2[0-3])):([0-5][0-9]):([0-5][0-9])" G "<0>2000-01-01T00:00:00" +"20\d{2}(-|\/)((0[1-9])|(1[0-2]))(-|\/)((0[1-9])|([1-2][0-9])|(3[0-1]))(T|\s)(([0-1][0-9])|(2[0-3])):([0-5][0-9]):([0-5][0-9])" "2000-13-31T00:00:00" +"20\d{2}(-|\/)((0[1-9])|(1[0-2]))(-|\/)((0[1-9])|([1-2][0-9])|(3[0-1]))(T|\s)(([0-1][0-9])|(2[0-3])):([0-5][0-9]):([0-5][0-9])" "2002/02/33 24:00:00" +"20\d{2}(-|\/)((0[1-9])|(1[0-2]))(-|\/)((0[1-9])|([1-2][0-9])|(3[0-1]))(T|\s)(([0-1][0-9])|(2[0-3])):([0-5][0-9]):([0-5][0-9])" "2000-01-01 60:00:00" +"^((?:4\d{3})|(?:5[1-5]\d{2})|(?:6011)|(?:3[68]\d{2})|(?:30[012345]\d))[ -]?(\d{4})[ -]?(\d{4})[ -]?(\d{4}|3[4,7]\d{13})$" G "<0>6011567812345678" +"^((?:4\d{3})|(?:5[1-5]\d{2})|(?:6011)|(?:3[68]\d{2})|(?:30[012345]\d))[ -]?(\d{4})[ -]?(\d{4})[ -]?(\d{4}|3[4,7]\d{13})$" G "<0>6011 5678 1234 5678" +"^((?:4\d{3})|(?:5[1-5]\d{2})|(?:6011)|(?:3[68]\d{2})|(?:30[012345]\d))[ -]?(\d{4})[ -]?(\d{4})[ -]?(\d{4}|3[4,7]\d{13})$" G "<0>6011-5678-1234-5678" +"^((?:4\d{3})|(?:5[1-5]\d{2})|(?:6011)|(?:3[68]\d{2})|(?:30[012345]\d))[ -]?(\d{4})[ -]?(\d{4})[ -]?(\d{4}|3[4,7]\d{13})$" "1234567890123456" 
+"^((((0[13578])|(1[02]))[\/]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\/]?(([0-2][0-9])|(30)))|(02[\/]?[0-2][0-9]))[\/]?\d{4}$" G "<0>01/01/2001" +"^((((0[13578])|(1[02]))[\/]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\/]?(([0-2][0-9])|(30)))|(02[\/]?[0-2][0-9]))[\/]?\d{4}$" G "<0>02/29/2002" +"^((((0[13578])|(1[02]))[\/]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\/]?(([0-2][0-9])|(30)))|(02[\/]?[0-2][0-9]))[\/]?\d{4}$" G "<0>12/31/2002" +"^((((0[13578])|(1[02]))[\/]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\/]?(([0-2][0-9])|(30)))|(02[\/]?[0-2][0-9]))[\/]?\d{4}$" "1/1/02" +"^((((0[13578])|(1[02]))[\/]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\/]?(([0-2][0-9])|(30)))|(02[\/]?[0-2][0-9]))[\/]?\d{4}$" "02/30/2002" +"^((((0[13578])|(1[02]))[\/]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\/]?(([0-2][0-9])|(30)))|(02[\/]?[0-2][0-9]))[\/]?\d{4}$" "1/25/2002" +#"^(?=[^\&])(?:(?[^:/?#]+):)?(?://(?[^/?#]*))?(?[^?#]*)(?:\?(?[^#]*))?(?:#(?.*))?" G "<0>http://regexlib.com/REDetails.aspx?regexp_id=x#Details" # out of context, can't work stand-alone +#"^(?=[^\&])(?:(?[^:/?#]+):)?(?://(?[^/?#]*))?(?[^?#]*)(?:\?(?[^#]*))?(?:#(?.*))?" "&" # out of context, can't work stand-alone +"^[-+]?\d+(\.\d+)?$" G "<0>123" +"^[-+]?\d+(\.\d+)?$" G "<0>-123.45" +"^[-+]?\d+(\.\d+)?$" G "<0>+123.56" +"^[-+]?\d+(\.\d+)?$" "123x" +"^[-+]?\d+(\.\d+)?$" ".123" +"^[-+]?\d+(\.\d+)?$" "-123." 
+"^(\d{4}[- ]){3}\d{4}|\d{16}$" G "<0>1234-1234-1234-1234" +"^(\d{4}[- ]){3}\d{4}|\d{16}$" G "<0>1234 1234 1234 1234" +"^(\d{4}[- ]){3}\d{4}|\d{16}$" G "<0>1234123412341234" +"^(\d{4}[- ]){3}\d{4}|\d{16}$" "Visa" +"^(\d{4}[- ]){3}\d{4}|\d{16}$" "1234" +"^(\d{4}[- ]){3}\d{4}|\d{16}$" "123-1234-12345" +"^((4\d{3})|(5[1-5]\d{2})|(6011))-?\d{4}-?\d{4}-?\d{4}|3[4,7]\d{13}$" G "<0>6011-1111-1111-1111" +"^((4\d{3})|(5[1-5]\d{2})|(6011))-?\d{4}-?\d{4}-?\d{4}|3[4,7]\d{13}$" G "<0>5423-1111-1111-1111" +"^((4\d{3})|(5[1-5]\d{2})|(6011))-?\d{4}-?\d{4}-?\d{4}|3[4,7]\d{13}$" G "<0>341111111111111" +"^((4\d{3})|(5[1-5]\d{2})|(6011))-?\d{4}-?\d{4}-?\d{4}|3[4,7]\d{13}$" "4111-111-111-111" +"^((4\d{3})|(5[1-5]\d{2})|(6011))-?\d{4}-?\d{4}-?\d{4}|3[4,7]\d{13}$" "3411-1111-1111-111" +"^((4\d{3})|(5[1-5]\d{2})|(6011))-?\d{4}-?\d{4}-?\d{4}|3[4,7]\d{13}$" "Visa" +"^[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}$" G "<0>4D28C5AD-6482-41CD-B84E-4573F384BB5C" +"^[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}$" G "<0>B1E1282C-A35C-4D5A-BF8B-7A3A51D9E388" +"^[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}$" G "91036A4A-A0F4-43F0-8CD" +"^[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}$" "{B1E1282C-A35C-4D3A-BF8B-7A3A51D9E388}" +"^[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}$" "AAAAAAAAAAAAAAAAA" +"^[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}$" "B;E1282C-A35C-4D3A-BF8B-7A3A51D9E38" +"(^(4|5)\d{3}-?\d{4}-?\d{4}-?\d{4}|(4|5)\d{15})|(^(6011)-?\d{4}-?\d{4}-?\d{4}|(6011)-?\d{12})|(^((3\d{3}))-\d{6}-\d{5}|^((3\d{14})))" G "<0>4111-1234-1234-1234" +"(^(4|5)\d{3}-?\d{4}-?\d{4}-?\d{4}|(4|5)\d{15})|(^(6011)-?\d{4}-?\d{4}-?\d{4}|(6011)-?\d{12})|(^((3\d{3}))-\d{6}-\d{5}|^((3\d{14})))" G "<0>6011123412341234" +"(^(4|5)\d{3}-?\d{4}-?\d{4}-?\d{4}|(4|5)\d{15})|(^(6011)-?\d{4}-?\d{4}-?\d{4}|(6011)-?\d{12})|(^((3\d{3}))-\d{6}-\d{5}|^((3\d{14})))" G "<0>3711-123456-12345" 
+"(^(4|5)\d{3}-?\d{4}-?\d{4}-?\d{4}|(4|5)\d{15})|(^(6011)-?\d{4}-?\d{4}-?\d{4}|(6011)-?\d{12})|(^((3\d{3}))-\d{6}-\d{5}|^((3\d{14})))" "1234567890123456" +"(^(4|5)\d{3}-?\d{4}-?\d{4}-?\d{4}|(4|5)\d{15})|(^(6011)-?\d{4}-?\d{4}-?\d{4}|(6011)-?\d{12})|(^((3\d{3}))-\d{6}-\d{5}|^((3\d{14})))" "4111-123-1234-1234" +"(^(4|5)\d{3}-?\d{4}-?\d{4}-?\d{4}|(4|5)\d{15})|(^(6011)-?\d{4}-?\d{4}-?\d{4}|(6011)-?\d{12})|(^((3\d{3}))-\d{6}-\d{5}|^((3\d{14})))" "412-1234-1234-1234" +#'\[link="(?((.|\n)*?))"\](?((.|\n)*?))\[\/link\]' G '<0>[link="http://www.yahoo.com"]Yahoo[/link]' #named capture +#'\[link="(?((.|\n)*?))"\](?((.|\n)*?))\[\/link\]' "[link]http://www.yahoo.com[/link]" #named capture +#'\[link="(?((.|\n)*?))"\](?((.|\n)*?))\[\/link\]' "[link=http://www.yahoo.com]Yahoo[/link]" #named capture +"^[a-zA-Z0-9]+$" G "<0>10a" +"^[a-zA-Z0-9]+$" G "<0>ABC" +"^[a-zA-Z0-9]+$" G "<0>A3fg" +"^[a-zA-Z0-9]+$" "45.3" +"^[a-zA-Z0-9]+$" "this or that" +"^[a-zA-Z0-9]+$" "$23" +"((\(\d{3}\) ?)|(\d{3}-))?\d{3}-\d{4}" G "<0>(123) 456-7890" +"((\(\d{3}\) ?)|(\d{3}-))?\d{3}-\d{4}" G "<0>123-456-7890" +"((\(\d{3}\) ?)|(\d{3}-))?\d{3}-\d{4}" "1234567890" +"^[a-zA-Z]\w{3,14}$" G "<0>abcd" +"^[a-zA-Z]\w{3,14}$" G "<0>aBc45DSD_sdf" +"^[a-zA-Z]\w{3,14}$" G "<0>password" +"^[a-zA-Z]\w{3,14}$" "afv" +"^[a-zA-Z]\w{3,14}$" "1234" +"^[a-zA-Z]\w{3,14}$" "reallylongpassword" +"^[A-Z]{1,2}[1-9][0-9]?[A-Z]? [0-9][A-Z]{2,}|GIR 0AA$" G "<0>G1 1AA " +"^[A-Z]{1,2}[1-9][0-9]?[A-Z]? [0-9][A-Z]{2,}|GIR 0AA$" G "<0>GIR 0AA" +"^[A-Z]{1,2}[1-9][0-9]?[A-Z]? [0-9][A-Z]{2,}|GIR 0AA$" G "<0>SW1 1ZZ" +"^[A-Z]{1,2}[1-9][0-9]?[A-Z]? [0-9][A-Z]{2,}|GIR 0AA$" "BT01 3RT" +"^[A-Z]{1,2}[1-9][0-9]?[A-Z]? 
[0-9][A-Z]{2,}|GIR 0AA$" "G111 1AA" +"^0[23489]{1}(\-)?[^0\D]{1}\d{6}$" G "<0>03-6106666" +"^0[23489]{1}(\-)?[^0\D]{1}\d{6}$" G "<0>036106666" +"^0[23489]{1}(\-)?[^0\D]{1}\d{6}$" G "<0>02-5523344" +"^0[23489]{1}(\-)?[^0\D]{1}\d{6}$" "00-6106666" +"^0[23489]{1}(\-)?[^0\D]{1}\d{6}$" "03-0106666" +"^0[23489]{1}(\-)?[^0\D]{1}\d{6}$" "02-55812346" +"^0(5[012345678]|6[47]){1}(\-)?[^0\D]{1}\d{5}$" G "<0>050-346634" +"^0(5[012345678]|6[47]){1}(\-)?[^0\D]{1}\d{5}$" G "<0>058633633" +"^0(5[012345678]|6[47]){1}(\-)?[^0\D]{1}\d{5}$" G "<0>064-228226" +"^0(5[012345678]|6[47]){1}(\-)?[^0\D]{1}\d{5}$" "059-336622" +"^0(5[012345678]|6[47]){1}(\-)?[^0\D]{1}\d{5}$" "064-022663" +"^0(5[012345678]|6[47]){1}(\-)?[^0\D]{1}\d{5}$" "0545454545" +"^([A-Z]{1,2}[0-9]{1,2}|[A-Z]{3}|[A-Z]{1,2}[0-9][A-Z])( |-)[0-9][A-Z]{2}" G "<0>AA11 1AA" +"^([A-Z]{1,2}[0-9]{1,2}|[A-Z]{3}|[A-Z]{1,2}[0-9][A-Z])( |-)[0-9][A-Z]{2}" G "<0>AA1A 1AA" +"^([A-Z]{1,2}[0-9]{1,2}|[A-Z]{3}|[A-Z]{1,2}[0-9][A-Z])( |-)[0-9][A-Z]{2}" G "<0>A11-1AA" +"^([A-Z]{1,2}[0-9]{1,2}|[A-Z]{3}|[A-Z]{1,2}[0-9][A-Z])( |-)[0-9][A-Z]{2}" "111 AAA" +"^([A-Z]{1,2}[0-9]{1,2}|[A-Z]{3}|[A-Z]{1,2}[0-9][A-Z])( |-)[0-9][A-Z]{2}" "1AAA 1AA" +"^([A-Z]{1,2}[0-9]{1,2}|[A-Z]{3}|[A-Z]{1,2}[0-9][A-Z])( |-)[0-9][A-Z]{2}" "A1AA 1AA" +"@{2}((\S)+)@{2}" G "<0>@@test@@" +"@{2}((\S)+)@{2}" G "<0>@@name@@" +"@{2}((\S)+)@{2}" G "<0>@@2342@@" +"@{2}((\S)+)@{2}" "@test@" +"@{2}((\S)+)@{2}" "@@na me@@" +"@{2}((\S)+)@{2}" "@@ name@@" +"([0-1][0-9]|2[0-3]):[0-5][0-9]" G "<0>00:00" +"([0-1][0-9]|2[0-3]):[0-5][0-9]" G "<0>13:59" +"([0-1][0-9]|2[0-3]):[0-5][0-9]" G "<0>23:59" +"([0-1][0-9]|2[0-3]):[0-5][0-9]" "24:00" +"([0-1][0-9]|2[0-3]):[0-5][0-9]" "23:60" +"^[+-]?([0-9]*\.?[0-9]+|[0-9]+\.?[0-9]*)([eE][+-]?[0-9]+)?$" G "<0>23" +"^[+-]?([0-9]*\.?[0-9]+|[0-9]+\.?[0-9]*)([eE][+-]?[0-9]+)?$" G "<0>-17.e23" +"^[+-]?([0-9]*\.?[0-9]+|[0-9]+\.?[0-9]*)([eE][+-]?[0-9]+)?$" G "<0>+.23e+2" +"^[+-]?([0-9]*\.?[0-9]+|[0-9]+\.?[0-9]*)([eE][+-]?[0-9]+)?$" "+.e2" 
+"^[+-]?([0-9]*\.?[0-9]+|[0-9]+\.?[0-9]*)([eE][+-]?[0-9]+)?$" "23.17.5" +"^[+-]?([0-9]*\.?[0-9]+|[0-9]+\.?[0-9]*)([eE][+-]?[0-9]+)?$" "10e2.0" +"^([1-zA-Z0-1@.\s ]{1,255})$" G "<0>email@email.com" +"^([1-zA-Z0-1@.\s ]{1,255})$" G "<0>My Name" +"^([1-zA-Z0-1@.\s ]{1,255})$" G "<0>asdf12df" +"^([1-zA-Z0-1@.\s ]{1,255})$" "‘,\*&$<>" +"^([1-zA-Z0-1@.\s ]{1,255})$" "1001' string" +"^((0[1-9])|(1[0-2]))\/(\d{4})$" G "<0>12/2002" +"^((0[1-9])|(1[0-2]))\/(\d{4})$" G "<0>11/1900" +"^((0[1-9])|(1[0-2]))\/(\d{4})$" G "<0>02/1977" +"^((0[1-9])|(1[0-2]))\/(\d{4})$" "1/1977" +"^((0[1-9])|(1[0-2]))\/(\d{4})$" "00/000" +"^((0[1-9])|(1[0-2]))\/(\d{4})$" "15/2002" +"^\(\d{1,2}(\s\d{1,2}){1,2}\)\s(\d{1,2}(\s\d{1,2}){1,2})((-(\d{1,4})){0,1})$" G "<0>(0 34 56) 34 56 67" +"^\(\d{1,2}(\s\d{1,2}){1,2}\)\s(\d{1,2}(\s\d{1,2}){1,2})((-(\d{1,4})){0,1})$" G "<0>(03 45) 5 67 67" +"^\(\d{1,2}(\s\d{1,2}){1,2}\)\s(\d{1,2}(\s\d{1,2}){1,2})((-(\d{1,4})){0,1})$" G "<0>(0 45) 2 33 45-45" +"^\(\d{1,2}(\s\d{1,2}){1,2}\)\s(\d{1,2}(\s\d{1,2}){1,2})((-(\d{1,4})){0,1})$" "(2345) 34 34" +"^\(\d{1,2}(\s\d{1,2}){1,2}\)\s(\d{1,2}(\s\d{1,2}){1,2})((-(\d{1,4})){0,1})$" "(0 56) 456 456" +"^\(\d{1,2}(\s\d{1,2}){1,2}\)\s(\d{1,2}(\s\d{1,2}){1,2})((-(\d{1,4})){0,1})$" "(3 45) 2 34-45678" +"(?:\d|I{1,3})?\s?\w{2,}\.?\s*\d{1,}\:\d{1,}-?,?\d{0,2}(?:,\d{0,2}){0,2}" G "<0>Genesis 3:3-4,6" +"(?:\d|I{1,3})?\s?\w{2,}\.?\s*\d{1,}\:\d{1,}-?,?\d{0,2}(?:,\d{0,2}){0,2}" G "<0>II Sam 2:11,2" +"(?:\d|I{1,3})?\s?\w{2,}\.?\s*\d{1,}\:\d{1,}-?,?\d{0,2}(?:,\d{0,2}){0,2}" G "<0>2 Tim 3:16" +"(?:\d|I{1,3})?\s?\w{2,}\.?\s*\d{1,}\:\d{1,}-?,?\d{0,2}(?:,\d{0,2}){0,2}" "Genesis chap 3, verse 3" +"(?:\d|I{1,3})?\s?\w{2,}\.?\s*\d{1,}\:\d{1,}-?,?\d{0,2}(?:,\d{0,2}){0,2}" "2nd Samuel 2" +"(\[[Ii][Mm][Gg]\])(\S+?)(\[\/[Ii][Mm][Gg]\])" G "<0>[IMG]http://bleh.jpg[/IMG]" +"(\[[Ii][Mm][Gg]\])(\S+?)(\[\/[Ii][Mm][Gg]\])" G "<0>[ImG]bleh[/imG]" +"(\[[Ii][Mm][Gg]\])(\S+?)(\[\/[Ii][Mm][Gg]\])" G "<0>[img]ftp://login:pass@bleh.gif[/img]" 
+"(\[[Ii][Mm][Gg]\])(\S+?)(\[\/[Ii][Mm][Gg]\])" '' +"^([0-9]{1,2})[./-]+([0-9]{1,2})[./-]+([0-9]{2}|[0-9]{4})$" G "<0>10/03/1979" +"^([0-9]{1,2})[./-]+([0-9]{1,2})[./-]+([0-9]{2}|[0-9]{4})$" G "<0>1-1-02" +"^([0-9]{1,2})[./-]+([0-9]{1,2})[./-]+([0-9]{2}|[0-9]{4})$" G "<0>01.1.2003" +"^([0-9]{1,2})[./-]+([0-9]{1,2})[./-]+([0-9]{2}|[0-9]{4})$" "10/03/197" +"^([0-9]{1,2})[./-]+([0-9]{1,2})[./-]+([0-9]{2}|[0-9]{4})$" "01-02-003" +"^([0-9]{1,2})[./-]+([0-9]{1,2})[./-]+([0-9]{2}|[0-9]{4})$" "01 02 03" +#"^(?(^00000(|-0000))|(\d{5}(|-\d{4})))$" G "<0>12345" # No Conditionals? +#"^(?(^00000(|-0000))|(\d{5}(|-\d{4})))$" G "<0>12345-6789" # No Conditionals? +#"^(?(^00000(|-0000))|(\d{5}(|-\d{4})))$" "00000" # No Conditionals? +#"^(?(^00000(|-0000))|(\d{5}(|-\d{4})))$" "00000-0000" # No Conditionals? +#"^(?(^00000(|-0000))|(\d{5}(|-\d{4})))$" "a4650-465s" # No Conditionals? +"^((0?[1-9])|((1|2)[0-9])|30|31)$" G "<0>01" +"^((0?[1-9])|((1|2)[0-9])|30|31)$" G "<0>12" +"^((0?[1-9])|((1|2)[0-9])|30|31)$" G "<0>31" +"^((0?[1-9])|((1|2)[0-9])|30|31)$" "123" +"^((0?[1-9])|((1|2)[0-9])|30|31)$" "32" +"^((0?[1-9])|((1|2)[0-9])|30|31)$" "abc" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?(\d{3}([\s\-./\\])?\d{4}|[a-zA-Z0-9]{7})$" G "<0>1.222.333.1234" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?(\d{3}([\s\-./\\])?\d{4}|[a-zA-Z0-9]{7})$" G "<0>1-223-123-1232" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?(\d{3}([\s\-./\\])?\d{4}|[a-zA-Z0-9]{7})$" G "<0>12223334444" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?(\d{3}([\s\-./\\])?\d{4}|[a-zA-Z0-9]{7})$" "1.1.123123.123" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?(\d{3}([\s\-./\\])?\d{4}|[a-zA-Z0-9]{7})$" "12-1322-112-31" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?(\d{3}([\s\-./\\])?\d{4}|[a-zA-Z0-9]{7})$" "11231321131" +"^([A-PR-UWYZ0-9][A-HK-Y0-9][AEHMNPRTVXY0-9]?[ABEHMNPRVWXY0-9]? 
{1,2}[0-9][ABD-HJLN-UW-Z]{2}|GIR 0AA)$" G "<0>DN3 6GB" +"^([A-PR-UWYZ0-9][A-HK-Y0-9][AEHMNPRTVXY0-9]?[ABEHMNPRVWXY0-9]? {1,2}[0-9][ABD-HJLN-UW-Z]{2}|GIR 0AA)$" G "<0>SW42 4RG" +"^([A-PR-UWYZ0-9][A-HK-Y0-9][AEHMNPRTVXY0-9]?[ABEHMNPRVWXY0-9]? {1,2}[0-9][ABD-HJLN-UW-Z]{2}|GIR 0AA)$" G "<0>GIR 0AA" +"^([A-PR-UWYZ0-9][A-HK-Y0-9][AEHMNPRTVXY0-9]?[ABEHMNPRVWXY0-9]? {1,2}[0-9][ABD-HJLN-UW-Z]{2}|GIR 0AA)$" "SEW4 5TY" +"^([A-PR-UWYZ0-9][A-HK-Y0-9][AEHMNPRTVXY0-9]?[ABEHMNPRVWXY0-9]? {1,2}[0-9][ABD-HJLN-UW-Z]{2}|GIR 0AA)$" "AA2C 4FG" +"^([A-PR-UWYZ0-9][A-HK-Y0-9][AEHMNPRTVXY0-9]?[ABEHMNPRVWXY0-9]? {1,2}[0-9][ABD-HJLN-UW-Z]{2}|GIR 0AA)$" "AA2 4CV" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z]).{4,8}$" G "<0>asD1" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z]).{4,8}$" G "<0>asDF1234" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z]).{4,8}$" G "<0>ASPgo123" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z]).{4,8}$" "asdf" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z]).{4,8}$" "1234" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z]).{4,8}$" "ASDF12345" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?([0-9]{3}([\s\-./\\])?[0-9]{4}|[a-zA-Z0-9]{7}|([0-9]{3}[-][a-zA-Z0-9]{4}))" G "<0>1.222.333.1234" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?([0-9]{3}([\s\-./\\])?[0-9]{4}|[a-zA-Z0-9]{7}|([0-9]{3}[-][a-zA-Z0-9]{4}))" G "<0>1-223-123-1232" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?([0-9]{3}([\s\-./\\])?[0-9]{4}|[a-zA-Z0-9]{7}|([0-9]{3}[-][a-zA-Z0-9]{4}))" G "<0>1-888-425-DELL" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?([0-9]{3}([\s\-./\\])?[0-9]{4}|[a-zA-Z0-9]{7}|([0-9]{3}[-][a-zA-Z0-9]{4}))" "1.1.123123.123" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?([0-9]{3}([\s\-./\\])?[0-9]{4}|[a-zA-Z0-9]{7}|([0-9]{3}[-][a-zA-Z0-9]{4}))" "12-1322-112-31" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?([0-9]{3}([\s\-./\\])?[0-9]{4}|[a-zA-Z0-9]{7}|([0-9]{3}[-][a-zA-Z0-9]{4}))" "1-800-CALL-DEL" +"^(([0]?[1-9]|1[0-2])(:)([0-5][0-9]))$" G "<0>09:00" 
+"^(([0]?[1-9]|1[0-2])(:)([0-5][0-9]))$" G "<0>9:00" +"^(([0]?[1-9]|1[0-2])(:)([0-5][0-9]))$" G "<0>11:35" +"^(([0]?[1-9]|1[0-2])(:)([0-5][0-9]))$" "13:00" +"^(([0]?[1-9]|1[0-2])(:)([0-5][0-9]))$" "9.00" +"^(([0]?[1-9]|1[0-2])(:)([0-5][0-9]))$" "6:60" +"^([1-9]|[1-9]\d|1\d{2}|2[0-4]\d|25[0-5])$" G "<0>1" +"^([1-9]|[1-9]\d|1\d{2}|2[0-4]\d|25[0-5])$" G "<0>108" +"^([1-9]|[1-9]\d|1\d{2}|2[0-4]\d|25[0-5])$" G "<0>255" +"^([1-9]|[1-9]\d|1\d{2}|2[0-4]\d|25[0-5])$" "01" +"^([1-9]|[1-9]\d|1\d{2}|2[0-4]\d|25[0-5])$" "256" +"^((((0[13578])|([13578])|(1[02]))[\/](([1-9])|([0-2][0-9])|(3[01])))|(((0[469])|([469])|(11))[\/](([1-9])|([0-2][0-9])|(30)))|((2|02)[\/](([1-9])|([0-2][0-9]))))[\/]\d{4}$|^\d{4}$" G "<0>01/01/2001" +"^((((0[13578])|([13578])|(1[02]))[\/](([1-9])|([0-2][0-9])|(3[01])))|(((0[469])|([469])|(11))[\/](([1-9])|([0-2][0-9])|(30)))|((2|02)[\/](([1-9])|([0-2][0-9]))))[\/]\d{4}$|^\d{4}$" G "<0>1/01/2001" +"^((((0[13578])|([13578])|(1[02]))[\/](([1-9])|([0-2][0-9])|(3[01])))|(((0[469])|([469])|(11))[\/](([1-9])|([0-2][0-9])|(30)))|((2|02)[\/](([1-9])|([0-2][0-9]))))[\/]\d{4}$|^\d{4}$" G "<0>2002" +"^((((0[13578])|([13578])|(1[02]))[\/](([1-9])|([0-2][0-9])|(3[01])))|(((0[469])|([469])|(11))[\/](([1-9])|([0-2][0-9])|(30)))|((2|02)[\/](([1-9])|([0-2][0-9]))))[\/]\d{4}$|^\d{4}$" "2/30/2002" +"^((((0[13578])|([13578])|(1[02]))[\/](([1-9])|([0-2][0-9])|(3[01])))|(((0[469])|([469])|(11))[\/](([1-9])|([0-2][0-9])|(30)))|((2|02)[\/](([1-9])|([0-2][0-9]))))[\/]\d{4}$|^\d{4}$" "13/23/2002" +"^((((0[13578])|([13578])|(1[02]))[\/](([1-9])|([0-2][0-9])|(3[01])))|(((0[469])|([469])|(11))[\/](([1-9])|([0-2][0-9])|(30)))|((2|02)[\/](([1-9])|([0-2][0-9]))))[\/]\d{4}$|^\d{4}$" "12345" +"^[A-Za-z]{2}[0-9]{6}[A-Za-z]{1}$" G "<0>SP939393H" +"^[A-Za-z]{2}[0-9]{6}[A-Za-z]{1}$" G "<0>PX123456D" +"^[A-Za-z]{2}[0-9]{6}[A-Za-z]{1}$" G "<0>SW355667G" +"^[A-Za-z]{2}[0-9]{6}[A-Za-z]{1}$" "12SP9393H" +"^[A-Za-z]{2}[0-9]{6}[A-Za-z]{1}$" "S3P93930D" +"^[A-Za-z]{2}[0-9]{6}[A-Za-z]{1}$" 
"11223344SP00ddSS" +"(^0[78][2347][0-9]{7})" G "<0>0834128458" +"(^0[78][2347][0-9]{7})" G "<0>0749526308" +"(^0[78][2347][0-9]{7})" "0861212308" +"(^0[78][2347][0-9]{7})" "0892549851" +"^([A-HJ-TP-Z]{1}\d{4}[A-Z]{3}|[a-z]{1}\d{4}[a-hj-tp-z]{3})$" G "<0>C1406HHA" +"^([A-HJ-TP-Z]{1}\d{4}[A-Z]{3}|[a-z]{1}\d{4}[a-hj-tp-z]{3})$" G "<0>A4126AAB" +"^([A-HJ-TP-Z]{1}\d{4}[A-Z]{3}|[a-z]{1}\d{4}[a-hj-tp-z]{3})$" G "<0>c1406hha" +"^([A-HJ-TP-Z]{1}\d{4}[A-Z]{3}|[a-z]{1}\d{4}[a-hj-tp-z]{3})$" "c1406HHA" +"^([A-HJ-TP-Z]{1}\d{4}[A-Z]{3}|[a-z]{1}\d{4}[a-hj-tp-z]{3})$" "4126" +"^([A-HJ-TP-Z]{1}\d{4}[A-Z]{3}|[a-z]{1}\d{4}[a-hj-tp-z]{3})$" "C1406hha" +"^(((25[0-5]|2[0-4][0-9]|19[0-1]|19[3-9]|18[0-9]|17[0-1]|17[3-9]|1[0-6][0-9]|1[1-9]|[2-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]))|(192\.(25[0-5]|2[0-4][0-9]|16[0-7]|169|1[0-5][0-9]|1[7-9][0-9]|[1-9][0-9]|[0-9]))|(172\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|1[0-5]|3[2-9]|[4-9][0-9]|[0-9])))\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$" G "<0>66.129.71.120" +"^(((25[0-5]|2[0-4][0-9]|19[0-1]|19[3-9]|18[0-9]|17[0-1]|17[3-9]|1[0-6][0-9]|1[1-9]|[2-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]))|(192\.(25[0-5]|2[0-4][0-9]|16[0-7]|169|1[0-5][0-9]|1[7-9][0-9]|[1-9][0-9]|[0-9]))|(172\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|1[0-5]|3[2-9]|[4-9][0-9]|[0-9])))\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$" G "<0>207.46.230.218" +"^(((25[0-5]|2[0-4][0-9]|19[0-1]|19[3-9]|18[0-9]|17[0-1]|17[3-9]|1[0-6][0-9]|1[1-9]|[2-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]))|(192\.(25[0-5]|2[0-4][0-9]|16[0-7]|169|1[0-5][0-9]|1[7-9][0-9]|[1-9][0-9]|[0-9]))|(172\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|1[0-5]|3[2-9]|[4-9][0-9]|[0-9])))\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$" G "<0>64.58.76.225" 
+"^(((25[0-5]|2[0-4][0-9]|19[0-1]|19[3-9]|18[0-9]|17[0-1]|17[3-9]|1[0-6][0-9]|1[1-9]|[2-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]))|(192\.(25[0-5]|2[0-4][0-9]|16[0-7]|169|1[0-5][0-9]|1[7-9][0-9]|[1-9][0-9]|[0-9]))|(172\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|1[0-5]|3[2-9]|[4-9][0-9]|[0-9])))\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$" "10.0.5.4" +"^(((25[0-5]|2[0-4][0-9]|19[0-1]|19[3-9]|18[0-9]|17[0-1]|17[3-9]|1[0-6][0-9]|1[1-9]|[2-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]))|(192\.(25[0-5]|2[0-4][0-9]|16[0-7]|169|1[0-5][0-9]|1[7-9][0-9]|[1-9][0-9]|[0-9]))|(172\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|1[0-5]|3[2-9]|[4-9][0-9]|[0-9])))\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$" "192.168.0.1" +"^(((25[0-5]|2[0-4][0-9]|19[0-1]|19[3-9]|18[0-9]|17[0-1]|17[3-9]|1[0-6][0-9]|1[1-9]|[2-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]))|(192\.(25[0-5]|2[0-4][0-9]|16[0-7]|169|1[0-5][0-9]|1[7-9][0-9]|[1-9][0-9]|[0-9]))|(172\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|1[0-5]|3[2-9]|[4-9][0-9]|[0-9])))\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$" "my ip address" +"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" G "<0>foo@foo.com" +"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" G "<0>foo@foo-foo.com.au" +"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" G "<0>foo@foo.foo.info" +"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" "foo@.com" +"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" "foo@foo..com" 
+"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" "foo@me@.com" +"/\*[\d\D]*?\*/" G "<0>/* my comment */" +"/\*[\d\D]*?\*/" G "<0>/* my multiline comment */" +"/\*[\d\D]*?\*/" G "<0>/* my nested comment */" +"/\*[\d\D]*?\*/" "*/ anything here /*" +"/\*[\d\D]*?\*/" "anything between 2 separate comments" +"/\*[\d\D]*?\*/" "\* *\\" +"/\*[\p{N}\P{N}]*?\*/" G "<0>/* my comment */" +"/\*[\p{N}\P{N}]*?\*/" G "<0>/* my multiline comment */" +"/\*[\p{N}\P{N}]*?\*/" G "<0>/* my nested comment */" +"/\*[\p{N}\P{N}]*?\*/" "*/ anything here /*" +"/\*[\p{N}\P{N}]*?\*/" "anything between 2 separate comments" +"/\*[\p{N}\P{N}]*?\*/" "\* *\\" +"((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((\d{4})|(\d{2}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((\d{4}|\d{2})))" G "<0>1/31/2002" +"((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((\d{4})|(\d{2}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((\d{4}|\d{2})))" G "<0>04-30-02" +"((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((\d{4})|(\d{2}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((\d{4}|\d{2})))" G "<0>12-01/2002" +"((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((\d{4})|(\d{2}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((\d{4}|\d{2})))" "2/31/2002" +"((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((\d{4})|(\d{2}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((\d{4}|\d{2})))" "13/0/02" +"((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((\d{4})|(\d{2}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((\d{4}|\d{2})))" "Jan 1, 2001" 
+'^(([^<>;()\[\]\\.,;:@"]+(\.[^<>()\[\]\\.,;:@"]+)*)|(".+"))@((([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))\.)*(([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))$' G "<0>blah@[10.0.0.1]" +'^(([^<>;()\[\]\\.,;:@"]+(\.[^<>()\[\]\\.,;:@"]+)*)|(".+"))@((([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))\.)*(([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))$' G "<0>a@b.c" +'^(([^<>;()\[\]\\.,;:@"]+(\.[^<>()\[\]\\.,;:@"]+)*)|(".+"))@((([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))\.)*(([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))$' "non@match@." 
+"^\d{9}[\d|X]$" G "<0>1234123412" +"^\d{9}[\d|X]$" G "<0>123412341X" +"^\d{9}[\d|X]$" "not an isbn" +"^\d{9}(\d|X)$" G "<0>1234123412" +"^\d{9}(\d|X)$" G "<0>123412341X" +"^\d{9}(\d|X)$" "not an isbn" +"^(([1-9])|(0[1-9])|(1[0-2]))\/(([0-9])|([0-2][0-9])|(3[0-1]))\/(([0-9][0-9])|([1-2][0,9][0-9][0-9]))$" G "<0>01/01/2001" +"^(([1-9])|(0[1-9])|(1[0-2]))\/(([0-9])|([0-2][0-9])|(3[0-1]))\/(([0-9][0-9])|([1-2][0,9][0-9][0-9]))$" G "<0>1/1/1999" +"^(([1-9])|(0[1-9])|(1[0-2]))\/(([0-9])|([0-2][0-9])|(3[0-1]))\/(([0-9][0-9])|([1-2][0,9][0-9][0-9]))$" G "<0>10/20/2080" +"^(([1-9])|(0[1-9])|(1[0-2]))\/(([0-9])|([0-2][0-9])|(3[0-1]))\/(([0-9][0-9])|([1-2][0,9][0-9][0-9]))$" "13/01/2001" +"^(([1-9])|(0[1-9])|(1[0-2]))\/(([0-9])|([0-2][0-9])|(3[0-1]))\/(([0-9][0-9])|([1-2][0,9][0-9][0-9]))$" "1/1/1800" +"^(([1-9])|(0[1-9])|(1[0-2]))\/(([0-9])|([0-2][0-9])|(3[0-1]))\/(([0-9][0-9])|([1-2][0,9][0-9][0-9]))$" "10/32/2080" +"^\d*\.?((25)|(50)|(5)|(75)|(0)|(00))?$" G "<0>0.25" +"^\d*\.?((25)|(50)|(5)|(75)|(0)|(00))?$" G "<0>.75" +"^\d*\.?((25)|(50)|(5)|(75)|(0)|(00))?$" G "<0>123.50" +"^\d*\.?((25)|(50)|(5)|(75)|(0)|(00))?$" ".77" +"^\d*\.?((25)|(50)|(5)|(75)|(0)|(00))?$" "1.435" +"^(s-|S-){0,1}[0-9]{3}\s?[0-9]{2}$" G "<0>12345" +"^(s-|S-){0,1}[0-9]{3}\s?[0-9]{2}$" G "<0>932 68" +"^(s-|S-){0,1}[0-9]{3}\s?[0-9]{2}$" G "<0>S-621 46" +"^(s-|S-){0,1}[0-9]{3}\s?[0-9]{2}$" "5367" +"^(s-|S-){0,1}[0-9]{3}\s?[0-9]{2}$" "425611" +"^(s-|S-){0,1}[0-9]{3}\s?[0-9]{2}$" "31 545" +"^\d{5}(-\d{4})?$" G "<0>48222" +"^\d{5}(-\d{4})?$" G "<0>48222-1746" +"^\d{5}(-\d{4})?$" "4632" +"^\d{5}(-\d{4})?$" "Blake" +"^\d{5}(-\d{4})?$" "37333-32" +'^(?!^(PRN|AUX|CLOCK\$|NUL|CON|COM\d|LPT\d|\..*)(\..+)?$)[^\x00-\x1f\\?*:\";|/]+$' G "<0>test.txt" +'^(?!^(PRN|AUX|CLOCK\$|NUL|CON|COM\d|LPT\d|\..*)(\..+)?$)[^\x00-\x1f\\?*:\";|/]+$' G "<0>test.jpg.txt" +'^(?!^(PRN|AUX|CLOCK\$|NUL|CON|COM\d|LPT\d|\..*)(\..+)?$)[^\x00-\x1f\\?*:\";|/]+$' G "<0>a&b c.bmp" 
+'^(?!^(PRN|AUX|CLOCK\$|NUL|CON|COM\d|LPT\d|\..*)(\..+)?$)[^\x00-\x1f\\?*:\";|/]+$' "CON" +'^(?!^(PRN|AUX|CLOCK\$|NUL|CON|COM\d|LPT\d|\..*)(\..+)?$)[^\x00-\x1f\\?*:\";|/]+$' ".pdf" +'^(?!^(PRN|AUX|CLOCK\$|NUL|CON|COM\d|LPT\d|\..*)(\..+)?$)[^\x00-\x1f\\?*:\";|/]+$' "test:2.pdf" +"^(\d{1,3}'(\d{3}')*\d{3}(\.\d{1,3})?|\d{1,3}(\.\d{3})?)$" G "<0>1'235.140" +"^(\d{1,3}'(\d{3}')*\d{3}(\.\d{1,3})?|\d{1,3}(\.\d{3})?)$" G "<0>1'222'333.120" +"^(\d{1,3}'(\d{3}')*\d{3}(\.\d{1,3})?|\d{1,3}(\.\d{3})?)$" G "<0>456" +"^(\d{1,3}'(\d{3}')*\d{3}(\.\d{1,3})?|\d{1,3}(\.\d{3})?)$" "1234.500" +"^(\d{1,3}'(\d{3}')*\d{3}(\.\d{1,3})?|\d{1,3}(\.\d{3})?)$" "78'45.123" +"^(\d{1,3}'(\d{3}')*\d{3}(\.\d{1,3})?|\d{1,3}(\.\d{3})?)$" "123,0012" +"^[a-zA-Z][0-9][a-zA-Z]\s?[0-9][a-zA-Z][0-9]$" G "<0>T2p 3c7" +"^[a-zA-Z][0-9][a-zA-Z]\s?[0-9][a-zA-Z][0-9]$" G "<0>T3P3c7" +"^[a-zA-Z][0-9][a-zA-Z]\s?[0-9][a-zA-Z][0-9]$" G "<0>T2P 3C7" +"^[a-zA-Z][0-9][a-zA-Z]\s?[0-9][a-zA-Z][0-9]$" "123456" +"^[a-zA-Z][0-9][a-zA-Z]\s?[0-9][a-zA-Z][0-9]$" "3C7T2P" +"^[a-zA-Z][0-9][a-zA-Z]\s?[0-9][a-zA-Z][0-9]$" "11T21RWW" +"^\$[0-9]+(\.[0-9][0-9])?$" G "<0>$1.50" +"^\$[0-9]+(\.[0-9][0-9])?$" G "<0>$49" +"^\$[0-9]+(\.[0-9][0-9])?$" G "<0>$0.50" +"^\$[0-9]+(\.[0-9][0-9])?$" "1.5" +"^\$[0-9]+(\.[0-9][0-9])?$" "$1.333" +"^\$[0-9]+(\.[0-9][0-9])?$" "this $5.12 fails" +"\b((25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\.){3}(25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\b" G "<0>217.6.9.89" +"\b((25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\.){3}(25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\b" G "<0>0.0.0.0" +"\b((25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\.){3}(25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\b" G "<0>255.255.255.255" +"\b((25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\.){3}(25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\b" "256.0.0.0" +"\b((25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\.){3}(25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\b" "0978.3.3.3" +"\b((25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\.){3}(25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\b" "65.4t.54.3" +"((mailto\:|(news|(ht|f)tp(s?))\://){1}\S+)" G "<0>http://www.aspemporium.com" 
+"((mailto\:|(news|(ht|f)tp(s?))\://){1}\S+)" G "<0>mailto:dominionx@hotmail.com" +"((mailto\:|(news|(ht|f)tp(s?))\://){1}\S+)" G "<0>ftp://ftp.test.com" +"((mailto\:|(news|(ht|f)tp(s?))\://){1}\S+)" "www.aspemporium.com" +"((mailto\:|(news|(ht|f)tp(s?))\://){1}\S+)" "dominionx@hotmail.com" +"((mailto\:|(news|(ht|f)tp(s?))\://){1}\S+)" "bloggs" +"\(([0-9]{2}|0{1}((x|[0-9]){2}[0-9]{2}))\)\s*[0-9]{3,4}[- ]*[0-9]{4}" G "<0>(12) 123 1234" +"\(([0-9]{2}|0{1}((x|[0-9]){2}[0-9]{2}))\)\s*[0-9]{3,4}[- ]*[0-9]{4}" G "<0>(01512) 123 1234" +"\(([0-9]{2}|0{1}((x|[0-9]){2}[0-9]{2}))\)\s*[0-9]{3,4}[- ]*[0-9]{4}" G "<0>(0xx12) 1234 1234" +"\(([0-9]{2}|0{1}((x|[0-9]){2}[0-9]{2}))\)\s*[0-9]{3,4}[- ]*[0-9]{4}" "12 123 1234" +"\(([0-9]{2}|0{1}((x|[0-9]){2}[0-9]{2}))\)\s*[0-9]{3,4}[- ]*[0-9]{4}" "(012) 123/1234" +"\(([0-9]{2}|0{1}((x|[0-9]){2}[0-9]{2}))\)\s*[0-9]{3,4}[- ]*[0-9]{4}" "(012) 123 12345" +"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" G "<0>bob-smith@foo.com" +"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" G "<0>bob.smith@foo.com" +"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" G "<0>bob_smith@foo.com" +"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" "-smith@foo.com" +"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" ".smith@foo.com" +"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" "smith@foo_com" +"^(?=.*\d).{4,8}$" G "<0>1234" +"^(?=.*\d).{4,8}$" G "<0>asdf1234" +"^(?=.*\d).{4,8}$" G "<0>asp123" +"^(?=.*\d).{4,8}$" "asdf" +"^(?=.*\d).{4,8}$" "asdf12345" +"^(?=.*\d).{4,8}$" "password" +"[^A-Za-z0-9_@\.]|@{2,}|\.{5,}" G "<0>user name" +"[^A-Za-z0-9_@\.]|@{2,}|\.{5,}" G "<0>user#name" +"[^A-Za-z0-9_@\.]|@{2,}|\.{5,}" G "<0>....." 
+"[^A-Za-z0-9_@\.]|@{2,}|\.{5,}" "User_Name1" +"[^A-Za-z0-9_@\.]|@{2,}|\.{5,}" "username@foo.com" +"[^A-Za-z0-9_@\.]|@{2,}|\.{5,}" "user.name@mail.foo.com" +"^100$|^[0-9]{1,2}$|^[0-9]{1,2}\,[0-9]{1,3}$" G "<0>12,654" +"^100$|^[0-9]{1,2}$|^[0-9]{1,2}\,[0-9]{1,3}$" G "<0>1,987" +"^100$|^[0-9]{1,2}$|^[0-9]{1,2}\,[0-9]{1,3}$" "128,2" +"^100$|^[0-9]{1,2}$|^[0-9]{1,2}\,[0-9]{1,3}$" "12," +"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*[^\.\,\)\(\s]$" G "<0>https://www.restrictd.com/~myhome/" +"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*[^\.\,\)\(\s]$" "http://www.krumedia.com." +"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*[^\.\,\)\(\s]$" "(http://www.krumedia.com)" +"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*[^\.\,\)\(\s]$" "http://www.krumedia.com," +"(\d{1,3},(\d{3},)*\d{3}(\.\d{1,3})?|\d{1,3}(\.\d{3})?)$" G "<0>2&651.50" +"(\d{1,3},(\d{3},)*\d{3}(\.\d{1,3})?|\d{1,3}(\.\d{3})?)$" G "<0>987.895" +"(\d{1,3},(\d{3},)*\d{3}(\.\d{1,3})?|\d{1,3}(\.\d{3})?)$" "25$%787*" +"\$[0-9]?[0-9]?[0-9]?((\,[0-9][0-9][0-9])*)?(\.[0-9][0-9]?)?$" G "<0>$1,456,983.00" +"\$[0-9]?[0-9]?[0-9]?((\,[0-9][0-9][0-9])*)?(\.[0-9][0-9]?)?$" G "<0>$1,700.07" +"\$[0-9]?[0-9]?[0-9]?((\,[0-9][0-9][0-9])*)?(\.[0-9][0-9]?)?$" G "<0>$68,944.23" +"\$[0-9]?[0-9]?[0-9]?((\,[0-9][0-9][0-9])*)?(\.[0-9][0-9]?)?$" "$20,86.93" +"\$[0-9]?[0-9]?[0-9]?((\,[0-9][0-9][0-9])*)?(\.[0-9][0-9]?)?$" "$1098.84" +"\$[0-9]?[0-9]?[0-9]?((\,[0-9][0-9][0-9])*)?(\.[0-9][0-9]?)?$" "$150." 
+"\$[0-9]?[0-9]?[0-9]?((\,[0-9][0-9][0-9])*)?(\.[0-9][0-9])?$" G "<0>$28,009,987.88" +"\$[0-9]?[0-9]?[0-9]?((\,[0-9][0-9][0-9])*)?(\.[0-9][0-9])?$" G "<0>$23,099.05" +"\$[0-9]?[0-9]?[0-9]?((\,[0-9][0-9][0-9])*)?(\.[0-9][0-9])?$" G "<0>$.88" +"\$[0-9]?[0-9]?[0-9]?((\,[0-9][0-9][0-9])*)?(\.[0-9][0-9])?$" "$234,5.99" +"^((((31\/(0?[13578]|1[02]))|((29|30)\/(0?[1,3-9]|1[0-2])))\/(1[6-9]|[2-9]\d)?\d{2})|(29\/0?2\/(((1[6-9]|[2-9]\d)?(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00))))|(0?[1-9]|1\d|2[0-8])\/((0?[1-9])|(1[0-2]))\/((1[6-9]|[2-9]\d)?\d{2})) (20|21|22|23|[0-1]?\d):[0-5]?\d:[0-5]?\d$" G "<0>29/02/2004 20:15:27" +"^((((31\/(0?[13578]|1[02]))|((29|30)\/(0?[1,3-9]|1[0-2])))\/(1[6-9]|[2-9]\d)?\d{2})|(29\/0?2\/(((1[6-9]|[2-9]\d)?(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00))))|(0?[1-9]|1\d|2[0-8])\/((0?[1-9])|(1[0-2]))\/((1[6-9]|[2-9]\d)?\d{2})) (20|21|22|23|[0-1]?\d):[0-5]?\d:[0-5]?\d$" G "<0>29/2/04 8:9:5" +"^((((31\/(0?[13578]|1[02]))|((29|30)\/(0?[1,3-9]|1[0-2])))\/(1[6-9]|[2-9]\d)?\d{2})|(29\/0?2\/(((1[6-9]|[2-9]\d)?(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00))))|(0?[1-9]|1\d|2[0-8])\/((0?[1-9])|(1[0-2]))\/((1[6-9]|[2-9]\d)?\d{2})) (20|21|22|23|[0-1]?\d):[0-5]?\d:[0-5]?\d$" G "<0>31/3/2004 9:20:17" +"^((((31\/(0?[13578]|1[02]))|((29|30)\/(0?[1,3-9]|1[0-2])))\/(1[6-9]|[2-9]\d)?\d{2})|(29\/0?2\/(((1[6-9]|[2-9]\d)?(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00))))|(0?[1-9]|1\d|2[0-8])\/((0?[1-9])|(1[0-2]))\/((1[6-9]|[2-9]\d)?\d{2})) (20|21|22|23|[0-1]?\d):[0-5]?\d:[0-5]?\d$" "29/02/2003 20:15:15" +"^((((31\/(0?[13578]|1[02]))|((29|30)\/(0?[1,3-9]|1[0-2])))\/(1[6-9]|[2-9]\d)?\d{2})|(29\/0?2\/(((1[6-9]|[2-9]\d)?(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00))))|(0?[1-9]|1\d|2[0-8])\/((0?[1-9])|(1[0-2]))\/((1[6-9]|[2-9]\d)?\d{2})) (20|21|22|23|[0-1]?\d):[0-5]?\d:[0-5]?\d$" "2/29/04 20:15:15" 
+"^((((31\/(0?[13578]|1[02]))|((29|30)\/(0?[1,3-9]|1[0-2])))\/(1[6-9]|[2-9]\d)?\d{2})|(29\/0?2\/(((1[6-9]|[2-9]\d)?(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00))))|(0?[1-9]|1\d|2[0-8])\/((0?[1-9])|(1[0-2]))\/((1[6-9]|[2-9]\d)?\d{2})) (20|21|22|23|[0-1]?\d):[0-5]?\d:[0-5]?\d$" "31/3/4 9:20:17" +"^([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})$" G "<0>something@someserver.com" +"^([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})$" G "<0>firstname.lastname@mailserver.domain.com" +"^([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})$" G "<0>username-something@some-server.nl" +"^([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})$" "username@someserver.domain.c" +"^([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})$" "somename@server.domain-com" +"^([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})$" "someone@something.se_eo" +"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)|(^([0-9]|[1][0-9]|[2][0-3])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)" G "<0>8am" +"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)|(^([0-9]|[1][0-9]|[2][0-3])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)" G "<0>8 am" +"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)|(^([0-9]|[1][0-9]|[2][0-3])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)" G "<0>8:00 am" +"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)|(^([0-9]|[1][0-9]|[2][0-3])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)" "8a" +"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)|(^([0-9]|[1][0-9]|[2][0-3])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)" "8 a" +"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)|(^([0-9]|[1][0-9]|[2][0-3])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)" "8:00 a" +"^([0-9]{2})?(\([0-9]{2})\)([0-9]{3}|[0-9]{4})-[0-9]{4}$" G "<0>55(21)123-4567" 
+"^([0-9]{2})?(\([0-9]{2})\)([0-9]{3}|[0-9]{4})-[0-9]{4}$" G "<0>(11)1234-5678" +"^([0-9]{2})?(\([0-9]{2})\)([0-9]{3}|[0-9]{4})-[0-9]{4}$" G "<0>55(71)4562-2234" +"^([0-9]{2})?(\([0-9]{2})\)([0-9]{3}|[0-9]{4})-[0-9]{4}$" "3434-3432" +"^([0-9]{2})?(\([0-9]{2})\)([0-9]{3}|[0-9]{4})-[0-9]{4}$" "4(23)232-3232" +"^([0-9]{2})?(\([0-9]{2})\)([0-9]{3}|[0-9]{4})-[0-9]{4}$" "55(2)232-232" +"^((([0]?[1-9]|1[0-2])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?))$" G "<0>1:01 AM" +"^((([0]?[1-9]|1[0-2])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?))$" G "<0>23:52:01" +"^((([0]?[1-9]|1[0-2])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?))$" G "<0>03.24.36 AM" +"^((([0]?[1-9]|1[0-2])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?))$" "19:31 AM" +"^((([0]?[1-9]|1[0-2])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?))$" "9:9 PM" +"^((([0]?[1-9]|1[0-2])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?))$" "25:60:61" +"^\d{0,2}(\.\d{1,2})?$" G "<0>99.99" +"^\d{0,2}(\.\d{1,2})?$" G "<0>99" +"^\d{0,2}(\.\d{1,2})?$" G "<0>.99" +"^\d{0,2}(\.\d{1,2})?$" "999.999" +"^\d{0,2}(\.\d{1,2})?$" "999" +"^\d{0,2}(\.\d{1,2})?$" ".999" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z])(?!.*\s).{4,8}$" G "<0>1agdA*$#" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z])(?!.*\s).{4,8}$" G "<0>1agdA*$#" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z])(?!.*\s).{4,8}$" G "<0>1agdA*$#" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z])(?!.*\s).{4,8}$" "wyrn%@*&$# f" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z])(?!.*\s).{4,8}$" "mbndkfh782" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z])(?!.*\s).{4,8}$" "BNfhjdhfjd&*)%#$)" 
+"^([a-zA-Z0-9][-a-zA-Z0-9]*[a-zA-Z0-9]\.)+([a-zA-Z0-9]{3,5})$" G "<0>freshmeat.net" +"^([a-zA-Z0-9][-a-zA-Z0-9]*[a-zA-Z0-9]\.)+([a-zA-Z0-9]{3,5})$" G "<0>123.com" +"^([a-zA-Z0-9][-a-zA-Z0-9]*[a-zA-Z0-9]\.)+([a-zA-Z0-9]{3,5})$" G "<0>TempLate-toolkKt.orG" +"^([a-zA-Z0-9][-a-zA-Z0-9]*[a-zA-Z0-9]\.)+([a-zA-Z0-9]{3,5})$" "-dog.com" +"^([a-zA-Z0-9][-a-zA-Z0-9]*[a-zA-Z0-9]\.)+([a-zA-Z0-9]{3,5})$" "?boy.net" +"^([a-zA-Z0-9][-a-zA-Z0-9]*[a-zA-Z0-9]\.)+([a-zA-Z0-9]{3,5})$" "this.domain" +"^[^']*$" G "<0>asljas" +"^[^']*$" G "<0>%/&89uhuhadjkh" +"^[^']*$" G '<0>"hi there!"' +"^[^']*$" "'hi there!'" +"^[^']*$" "It's 9 o'clock" +"^[^']*$" "'''''" +"(^\(\)$|^\(((\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\),)*(\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\)){1}\)))$" G "<0>((24,((1,2,3),(3,4,5))))" +"(^\(\)$|^\(((\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\),)*(\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\)){1}\)))$" G "<0>((1,((2,3,4),(4,5,6),(96,34,26))),(12,((1,3,4),(4,5,6),(7,8,9))))" +"(^\(\)$|^\(((\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\),)*(\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\)){1}\)))$" G "<0>()" +"(^\(\)$|^\(((\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\),)*(\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\)){1}\)))$" "(24,((1,2,3),(3,4,5)))" +"(^\(\)$|^\(((\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\),)*(\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\)){1}\)))$" "( )" +"(^\(\)$|^\(((\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\),)*(\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\)){1}\)))$" "((23,(12,3,4),(4,5,6)))" +"^[a-zA-Z0-9\s .\-_']+$" G "<0>dony d'gsa" +"^[a-zA-Z0-9\s .\-_']+$" "^[a-zA-Z0-9\s.\-_']+$" 
+"^[_a-zA-Z0-9-]+(\.[_a-zA-Z0-9-]+)*@[a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)*\.(([0-9]{1,3})|([a-zA-Z]{2,3})|(aero|coop|info|museum|name))$" G "<0>example@example.com" +"^[_a-zA-Z0-9-]+(\.[_a-zA-Z0-9-]+)*@[a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)*\.(([0-9]{1,3})|([a-zA-Z]{2,3})|(aero|coop|info|museum|name))$" G "<0>foo@bar.info" +"^[_a-zA-Z0-9-]+(\.[_a-zA-Z0-9-]+)*@[a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)*\.(([0-9]{1,3})|([a-zA-Z]{2,3})|(aero|coop|info|museum|name))$" G "<0>blah@127.0.0.1" +"^[_a-zA-Z0-9-]+(\.[_a-zA-Z0-9-]+)*@[a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)*\.(([0-9]{1,3})|([a-zA-Z]{2,3})|(aero|coop|info|museum|name))$" "broken@@example.com" +"^[_a-zA-Z0-9-]+(\.[_a-zA-Z0-9-]+)*@[a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)*\.(([0-9]{1,3})|([a-zA-Z]{2,3})|(aero|coop|info|museum|name))$" "foo@bar.infp" +"^[_a-zA-Z0-9-]+(\.[_a-zA-Z0-9-]+)*@[a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)*\.(([0-9]{1,3})|([a-zA-Z]{2,3})|(aero|coop|info|museum|name))$" "blah@.nospam.biz" +"^\d{5}(-\d{3})?$" G "<0>13165-000" +"^\d{5}(-\d{3})?$" G "<0>38175-000" +"^\d{5}(-\d{3})?$" G "<0>81470-276" +"^\d{5}(-\d{3})?$" "13165-00" +"^\d{5}(-\d{3})?$" "38175-abc" +"^\d{5}(-\d{3})?$" "81470-2763" +"^\$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})?$" G "<0>$0.84" +"^\$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})?$" G "<0>$123458" +"^\$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})?$" G "<0>$1,234,567.89" +"^\$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})?$" "$12,3456.01" +"^\$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})?$" "12345" +"^\$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})?$" "$1.234" +"([A-Z]:\\[^/:\*\?<>\|]+\.\w{2,6})|(\\{2}[^/:\*\?<>\|]+\.\w{2,6})" G "<0>C:\\temp\\this allows spaces\\web.config" +"([A-Z]:\\[^/:\*\?<>\|]+\.\w{2,6})|(\\{2}[^/:\*\?<>\|]+\.\w{2,6})" G "<0>\\\\Andromeda\\share\\file name.123" +"([A-Z]:\\[^/:\*\?<>\|]+\.\w{2,6})|(\\{2}[^/:\*\?<>\|]+\.\w{2,6})" "tz:\temp\ fi*le?na:m.doc" +"([A-Z]:\\[^/:\*\?<>\|]+\.\w{2,6})|(\\{2}[^/:\*\?<>\|]+\.\w{2,6})" "\\Andromeda\share\filename.a" +"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])$)|(^([0-9]|[1][0-9]|[2][0-3])$)" G "<0>10:35" 
+"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])$)|(^([0-9]|[1][0-9]|[2][0-3])$)" G "<0>9:20" +"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])$)|(^([0-9]|[1][0-9]|[2][0-3])$)" G "<0>23" +"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])$)|(^([0-9]|[1][0-9]|[2][0-3])$)" "24:00" +"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])$)|(^([0-9]|[1][0-9]|[2][0-3])$)" "20 PM" +"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])$)|(^([0-9]|[1][0-9]|[2][0-3])$)" "20:15 PM" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(\.[0-9][0-9])?$" G "<0>$3,023,123.34" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(\.[0-9][0-9])?$" G "<0>9,876,453" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(\.[0-9][0-9])?$" G "<0>123456.78" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(\.[0-9][0-9])?$" "4,33,234.34" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(\.[0-9][0-9])?$" "$1.234" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(\.[0-9][0-9])?$" "abc" +"^\$?\d+(\.(\d{2}))?$" G "<0>$2.43" +"^\$?\d+(\.(\d{2}))?$" G "<0>2.02" +"^\$?\d+(\.(\d{2}))?$" G "<0>$2112" +"^\$?\d+(\.(\d{2}))?$" "2.1" +"^\$?\d+(\.(\d{2}))?$" "$.14" +"^\$?\d+(\.(\d{2}))?$" "$2,222.12" +/("[^"]*")|('[^\r]*)(\r\n)?/ G '<0>"my string"' +/("[^"]*")|('[^\r]*)(\r\n)?/ G '<0>"a string with \u0027 in it"' +/("[^"]*")|('[^\r]*)(\r\n)?/ G "<0>' comment" +/("[^"]*")|('[^\r]*)(\r\n)?/ /asd "/ +"^[A-Za-z0-9]{8}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{12}$" G "<0>BFDB4D31-3E35-4DAB-AFCA-5E6E5C8F61EA" +"^[A-Za-z0-9]{8}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{12}$" G "<0>BFDB4d31-3e35-4dab-afca-5e6e5c8f61ea" +"^[A-Za-z0-9]{8}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{12}$" "qqqBFDB4D31-3E35-4DAB-AFCA-5E6E5C8F61EA" +"^[A-Za-z0-9]{8}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{12}$" "BFDB4D31-3E-4DAB-AFCA-5E6E5C8F61EA" +"^[A-Za-z0-9]{8}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{12}$" "BFDB4D31-3E35-4DAB-AF" +"^\d{2}(\x2e)(\d{3})(-\d{3})?$" G "<0>12.345-678" +"^\d{2}(\x2e)(\d{3})(-\d{3})?$" G 
"<0>23.345-123" +"^\d{2}(\x2e)(\d{3})(-\d{3})?$" G "<0>99.999" +"^\d{2}(\x2e)(\d{3})(-\d{3})?$" "41222-222" +"^\d{2}(\x2e)(\d{3})(-\d{3})?$" "3.444-233" +"^\d{2}(\x2e)(\d{3})(-\d{3})?$" "43.324444" +"^\d{2}(\u002e)(\d{3})(-\d{3})?$" G "<0>12.345-678" +"^\d{2}(\u002e)(\d{3})(-\d{3})?$" G "<0>23.345-123" +"^\d{2}(\u002e)(\d{3})(-\d{3})?$" G "<0>99.999" +"^\d{2}(\u002e)(\d{3})(-\d{3})?$" "41222-222" +"^\d{2}(\u002e)(\d{3})(-\d{3})?$" "3.444-233" +"^\d{2}(\u002e)(\d{3})(-\d{3})?$" "43.324444" +#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" G "<0>c:\file.txt" # TODO: debug +#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" G "<0>c:\folder\sub folder\file.txt" # TODO: debug +#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" G "<0>\\network\folder\file.txt" # TODO: debug +"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" "C:" +"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" "C:\file.xls" +"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" "folder.txt" +"^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" G "<0>my.domain.com" +"^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" G "<0>regexlib.com" +"^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" G "<0>big-reg.com" +"^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" ".mydomain.com" +"^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" "regexlib.comm" +"^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" "-bigreg.com" +"^\d{4}[\-\/\s]?((((0[13578])|(1[02]))[\-\/\s]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\-\/\s]?(([0-2][0-9])|(30)))|(02[\-\/\s]?[0-2][0-9]))$" G "<0>0001-12-31" +"^\d{4}[\-\/\s ]?((((0[13578])|(1[02]))[\-\/\s ]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\-\/\s ]?(([0-2][0-9])|(30)))|(02[\-\/\s ]?[0-2][0-9]))$" G "<0>9999 09 30" 
+"^\d{4}[\-\/\s]?((((0[13578])|(1[02]))[\-\/\s]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\-\/\s]?(([0-2][0-9])|(30)))|(02[\-\/\s]?[0-2][0-9]))$" G "<0>2002/03/03" +"^\d{4}[\-\/\s]?((((0[13578])|(1[02]))[\-\/\s]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\-\/\s]?(([0-2][0-9])|(30)))|(02[\-\/\s]?[0-2][0-9]))$" "0001\\02\\30" +"^\d{4}[\-\/\s]?((((0[13578])|(1[02]))[\-\/\s]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\-\/\s]?(([0-2][0-9])|(30)))|(02[\-\/\s]?[0-2][0-9]))$" "9999.15.01" +"^\d{4}[\-\/\s]?((((0[13578])|(1[02]))[\-\/\s]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\-\/\s]?(([0-2][0-9])|(30)))|(02[\-\/\s]?[0-2][0-9]))$" "2002/3/3" +"^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$" G "<0>http://psychopop.org" +"^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$" G "<0>http://www.edsroom.com/newUser.asp" +"^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$" G "<0>http://unpleasant.jarrin.net/markov/inde" +"^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$" "ftp://psychopop.org" +"^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$" "http://www.edsroom/" +"^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$" "http://un/pleasant.jarrin.net/markov/index.asp" +"^( [1-9]|[1-9]|0[1-9]|10|11|12)[0-5]\d$" G "<0>1145" +"^( [1-9]|[1-9]|0[1-9]|10|11|12)[0-5]\d$" G "<0>933" +"^( [1-9]|[1-9]|0[1-9]|10|11|12)[0-5]\d$" G "<0> 801" +"^( [1-9]|[1-9]|0[1-9]|10|11|12)[0-5]\d$" "0000" +"^( [1-9]|[1-9]|0[1-9]|10|11|12)[0-5]\d$" "1330" +"^( [1-9]|[1-9]|0[1-9]|10|11|12)[0-5]\d$" "8:30" +"^\d{1,2}\/\d{2,4}$" G "<0>9/02" +"^\d{1,2}\/\d{2,4}$" G "<0>09/2002" +"^\d{1,2}\/\d{2,4}$" G "<0>09/02" +"^\d{1,2}\/\d{2,4}$" "Fall 2002" +"^\d{1,2}\/\d{2,4}$" "Sept 2002" +"^(|(0[1-9])|(1[0-2]))\/((0[1-9])|(1\d)|(2\d)|(3[0-1]))\/((\d{4}))$" G "<0>01/01/2001" +"^(|(0[1-9])|(1[0-2]))\/((0[1-9])|(1\d)|(2\d)|(3[0-1]))\/((\d{4}))$" G "<0>02/30/2001" +"^(|(0[1-9])|(1[0-2]))\/((0[1-9])|(1\d)|(2\d)|(3[0-1]))\/((\d{4}))$" G "<0>12/31/2002" +"^(|(0[1-9])|(1[0-2]))\/((0[1-9])|(1\d)|(2\d)|(3[0-1]))\/((\d{4}))$" "1/1/02" 
+"^(|(0[1-9])|(1[0-2]))\/((0[1-9])|(1\d)|(2\d)|(3[0-1]))\/((\d{4}))$" "1/1/2002" +"^(|(0[1-9])|(1[0-2]))\/((0[1-9])|(1\d)|(2\d)|(3[0-1]))\/((\d{4}))$" "1/25/2002" +"^(1?(-?\d{3})-?)?(\d{3})(-?\d{4})$" G "<0>15615552323" +"^(1?(-?\d{3})-?)?(\d{3})(-?\d{4})$" G "<0>1-561-555-1212" +"^(1?(-?\d{3})-?)?(\d{3})(-?\d{4})$" G "<0>5613333" +"^(1?(-?\d{3})-?)?(\d{3})(-?\d{4})$" "1-555-5555" +"^(1?(-?\d{3})-?)?(\d{3})(-?\d{4})$" "15553333" +"^(1?(-?\d{3})-?)?(\d{3})(-?\d{4})$" "0-561-555-1212" +'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>' G '<0>' +'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>' G '<0>" # TODO: \w in pattern +'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>' '' # TODO: \w in pattern +'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>' "The dirty brown fox stank like" +"^(1|01|2|02|3|03|4|04|5|05|6|06|7|07|8|08|9|09|10|11|12{1,2}):(([0-5]{1}[0-9]{1}\s{0,1})([AM|PM|am|pm]{2,2}))\W{0}$" G "<0>1:00 AM" +"^(1|01|2|02|3|03|4|04|5|05|6|06|7|07|8|08|9|09|10|11|12{1,2}):(([0-5]{1}[0-9]{1}\s{0,1})([AM|PM|am|pm]{2,2}))\W{0}$" G "<0>12:00 PM" +"^(1|01|2|02|3|03|4|04|5|05|6|06|7|07|8|08|9|09|10|11|12{1,2}):(([0-5]{1}[0-9]{1}\s{0,1})([AM|PM|am|pm]{2,2}))\W{0}$" G "<0>1:00am" +"^(1|01|2|02|3|03|4|04|5|05|6|06|7|07|8|08|9|09|10|11|12{1,2}):(([0-5]{1}[0-9]{1}\s{0,1})([AM|PM|am|pm]{2,2}))\W{0}$" "24:00" +"^\d*$" G "<0>123" +"^\d*$" G "<0>000" +"^\d*$" G "<0>43" +"^\d*$" "asbc" +"^\d*$" "-34" +"^\d*$" "3.1415" +"^[-+]?\d*$" G "<0>123" +"^[-+]?\d*$" G "<0>-123" +"^[-+]?\d*$" G "<0>+123" +"^[-+]?\d*$" "abc" +"^[-+]?\d*$" "3.14159" +"^[-+]?\d*$" "-3.14159" +"^\d*\.?\d*$" G "<0>123" +"^\d*\.?\d*$" G "<0>3.14159" +"^\d*\.?\d*$" G "<0>.234" +"^\d*\.?\d*$" "abc" +"^\d*\.?\d*$" "-3.14159" +"^\d*\.?\d*$" "3.4.2" +"^((\d{5}-\d{4})|(\d{5})|([A-Z]\d[A-Z]\s\d[A-Z]\d))$" G "<0>44240" +"^((\d{5}-\d{4})|(\d{5})|([A-Z]\d[A-Z]\s\d[A-Z]\d))$" G "<0>44240-5555" +"^((\d{5}-\d{4})|(\d{5})|([A-Z]\d[A-Z]\s\d[A-Z]\d))$" G "<0>T2P 3C7" +"^((\d{5}-\d{4})|(\d{5})|([A-Z]\d[A-Z]\s\d[A-Z]\d))$" "44240ddd" 
+"^((\d{5}-\d{4})|(\d{5})|([A-Z]\d[A-Z]\s\d[A-Z]\d))$" "t44240-55" +"^((\d{5}-\d{4})|(\d{5})|([A-Z]\d[A-Z]\s\d[A-Z]\d))$" "t2p3c7" +"^[\\(]{0,1}([0-9]){3}[\\)]{0,1}[ ]?([^0-1]){1}([0-9]){2}[ ]?[-]?[ ]?([0-9]){4}[ ]*((x){0,1}([0-9]){1,5}){0,1}$" G "<0>(910)456-7890" +"^[\\(]{0,1}([0-9]){3}[\\)]{0,1}[ ]?([^0-1]){1}([0-9]){2}[ ]?[-]?[ ]?([0-9]){4}[ ]*((x){0,1}([0-9]){1,5}){0,1}$" G "<0>(910)456-8970 x12" +"^[\\(]{0,1}([0-9]){3}[\\)]{0,1}[ ]?([^0-1]){1}([0-9]){2}[ ]?[-]?[ ]?([0-9]){4}[ ]*((x){0,1}([0-9]){1,5}){0,1}$" G "<0>(910)456-8970 1211" +"^[\\(]{0,1}([0-9]){3}[\\)]{0,1}[ ]?([^0-1]){1}([0-9]){2}[ ]?[-]?[ ]?([0-9]){4}[ ]*((x){0,1}([0-9]){1,5}){0,1}$" "(910) 156-7890" +"^[\\(]{0,1}([0-9]){3}[\\)]{0,1}[ ]?([^0-1]){1}([0-9]){2}[ ]?[-]?[ ]?([0-9]){4}[ ]*((x){0,1}([0-9]){1,5}){0,1}$" "(910) 056-7890" +"^[\\(]{0,1}([0-9]){3}[\\)]{0,1}[ ]?([^0-1]){1}([0-9]){2}[ ]?[-]?[ ]?([0-9]){4}[ ]*((x){0,1}([0-9]){1,5}){0,1}$" "(910) 556-7890 x" +"^((0?[1-9]|[12][1-9]|3[01])\.(0?[13578]|1[02])\.20[0-9]{2}|(0?[1-9]|[12][1-9]|30)\.(0?[13456789]|1[012])\.20[0-9]{2}|(0?[1-9]|1[1-9]|2[0-8])\.(0?[123456789]|1[012])\.20[0-9]{2}|(0?[1-9]|[12][1-9])\.(0?[123456789]|1[012])\.20(00|04|08|12|16|20|24|28|32|36|40|44|48|52|56|60|64|68|72|76|80|84|88|92|96))$" G "<0>31.01.2002" +"^((0?[1-9]|[12][1-9]|3[01])\.(0?[13578]|1[02])\.20[0-9]{2}|(0?[1-9]|[12][1-9]|30)\.(0?[13456789]|1[012])\.20[0-9]{2}|(0?[1-9]|1[1-9]|2[0-8])\.(0?[123456789]|1[012])\.20[0-9]{2}|(0?[1-9]|[12][1-9])\.(0?[123456789]|1[012])\.20(00|04|08|12|16|20|24|28|32|36|40|44|48|52|56|60|64|68|72|76|80|84|88|92|96))$" G "<0>29.2.2004" +"^((0?[1-9]|[12][1-9]|3[01])\.(0?[13578]|1[02])\.20[0-9]{2}|(0?[1-9]|[12][1-9]|30)\.(0?[13456789]|1[012])\.20[0-9]{2}|(0?[1-9]|1[1-9]|2[0-8])\.(0?[123456789]|1[012])\.20[0-9]{2}|(0?[1-9]|[12][1-9])\.(0?[123456789]|1[012])\.20(00|04|08|12|16|20|24|28|32|36|40|44|48|52|56|60|64|68|72|76|80|84|88|92|96))$" G "<0>09.02.2005" 
+"^((0?[1-9]|[12][1-9]|3[01])\.(0?[13578]|1[02])\.20[0-9]{2}|(0?[1-9]|[12][1-9]|30)\.(0?[13456789]|1[012])\.20[0-9]{2}|(0?[1-9]|1[1-9]|2[0-8])\.(0?[123456789]|1[012])\.20[0-9]{2}|(0?[1-9]|[12][1-9])\.(0?[123456789]|1[012])\.20(00|04|08|12|16|20|24|28|32|36|40|44|48|52|56|60|64|68|72|76|80|84|88|92|96))$" "31.11.2002" +"^((0?[1-9]|[12][1-9]|3[01])\.(0?[13578]|1[02])\.20[0-9]{2}|(0?[1-9]|[12][1-9]|30)\.(0?[13456789]|1[012])\.20[0-9]{2}|(0?[1-9]|1[1-9]|2[0-8])\.(0?[123456789]|1[012])\.20[0-9]{2}|(0?[1-9]|[12][1-9])\.(0?[123456789]|1[012])\.20(00|04|08|12|16|20|24|28|32|36|40|44|48|52|56|60|64|68|72|76|80|84|88|92|96))$" "29.2.2002" +"^((0?[1-9]|[12][1-9]|3[01])\.(0?[13578]|1[02])\.20[0-9]{2}|(0?[1-9]|[12][1-9]|30)\.(0?[13456789]|1[012])\.20[0-9]{2}|(0?[1-9]|1[1-9]|2[0-8])\.(0?[123456789]|1[012])\.20[0-9]{2}|(0?[1-9]|[12][1-9])\.(0?[123456789]|1[012])\.20(00|04|08|12|16|20|24|28|32|36|40|44|48|52|56|60|64|68|72|76|80|84|88|92|96))$" "33.06.2000" +"^(0[1-9]|1[0-2])\/((0[1-9]|2\d)|3[0-1])\/(19\d\d|200[0-3])$" G "<0>12/31/2003" +"^(0[1-9]|1[0-2])\/((0[1-9]|2\d)|3[0-1])\/(19\d\d|200[0-3])$" G "<0>01/01/1900" +"^(0[1-9]|1[0-2])\/((0[1-9]|2\d)|3[0-1])\/(19\d\d|200[0-3])$" G "<0>11/31/2002" +"^(0[1-9]|1[0-2])\/((0[1-9]|2\d)|3[0-1])\/(19\d\d|200[0-3])$" "1/1/2002" +"^(0[1-9]|1[0-2])\/((0[1-9]|2\d)|3[0-1])\/(19\d\d|200[0-3])$" "01/01/02" +"^(0[1-9]|1[0-2])\/((0[1-9]|2\d)|3[0-1])\/(19\d\d|200[0-3])$" "01/01/2004" +"^((((([13578])|(1[0-2]))[\-\/\s]?(([1-9])|([1-2][0-9])|(3[01])))|((([469])|(11))[\-\/\s]?(([1-9])|([1-2][0-9])|(30)))|(2[\-\/\s]?(([1-9])|([1-2][0-9]))))[\-\/\s]?\d{4})(\s((([1-9])|(1[02]))\:([0-5][0-9])((\s)|(\:([0-5][0-9])\s))([AM|PM|am|pm]{2,2})))?$" G "<0>3/3/2003" +"^((((([13578])|(1[0-2]))[\-\/\s]?(([1-9])|([1-2][0-9])|(3[01])))|((([469])|(11))[\-\/\s]?(([1-9])|([1-2][0-9])|(30)))|(2[\-\/\s]?(([1-9])|([1-2][0-9]))))[\-\/\s]?\d{4})(\s((([1-9])|(1[02]))\:([0-5][0-9])((\s)|(\:([0-5][0-9])\s))([AM|PM|am|pm]{2,2})))?$" G "<0>3/3/2002 3:33 pm" 
+"^((((([13578])|(1[0-2]))[\-\/\s]?(([1-9])|([1-2][0-9])|(3[01])))|((([469])|(11))[\-\/\s]?(([1-9])|([1-2][0-9])|(30)))|(2[\-\/\s]?(([1-9])|([1-2][0-9]))))[\-\/\s]?\d{4})(\s((([1-9])|(1[02]))\:([0-5][0-9])((\s)|(\:([0-5][0-9])\s))([AM|PM|am|pm]{2,2})))?$" G "<0>3/3/2003 3:33:33 am" +"^((((([13578])|(1[0-2]))[\-\/\s]?(([1-9])|([1-2][0-9])|(3[01])))|((([469])|(11))[\-\/\s]?(([1-9])|([1-2][0-9])|(30)))|(2[\-\/\s]?(([1-9])|([1-2][0-9]))))[\-\/\s]?\d{4})(\s((([1-9])|(1[02]))\:([0-5][0-9])((\s)|(\:([0-5][0-9])\s))([AM|PM|am|pm]{2,2})))?$" "13/1/2002" +"^((((([13578])|(1[0-2]))[\-\/\s]?(([1-9])|([1-2][0-9])|(3[01])))|((([469])|(11))[\-\/\s]?(([1-9])|([1-2][0-9])|(30)))|(2[\-\/\s]?(([1-9])|([1-2][0-9]))))[\-\/\s]?\d{4})(\s((([1-9])|(1[02]))\:([0-5][0-9])((\s)|(\:([0-5][0-9])\s))([AM|PM|am|pm]{2,2})))?$" "3/3/2002 3:33" +"^((((([13578])|(1[0-2]))[\-\/\s]?(([1-9])|([1-2][0-9])|(3[01])))|((([469])|(11))[\-\/\s]?(([1-9])|([1-2][0-9])|(30)))|(2[\-\/\s]?(([1-9])|([1-2][0-9]))))[\-\/\s]?\d{4})(\s((([1-9])|(1[02]))\:([0-5][0-9])((\s)|(\:([0-5][0-9])\s))([AM|PM|am|pm]{2,2})))?$" "31/3/2002" +"([a-zA-Z]:(\\w+)*\\[a-zA-Z0_9]+)?.xls" G "<0>E:\DyAGT\SD01A_specV2.xls" +"([a-zA-Z]:(\\w+)*\\[a-zA-Z0_9]+)?.xls" "E:\DyAGT\SD01A_specV2.txt" +"(((0[13578]|10|12)([-./])(0[1-9]|[12][0-9]|3[01])([-./])(\d{4}))|((0[469]|11)([-./])([0][1-9]|[12][0-9]|30)([-./])(\d{4}))|((2)([-./])(0[1-9]|1[0-9]|2[0-8])([-./])(\d{4}))|((2)(\.|-|\/)(29)([-./])([02468][048]00))|((2)([-./])(29)([-./])([13579][26]00))|((2)([-./])(29)([-./])([0-9][0-9][0][48]))|((2)([-./])(29)([-./])([0-9][0-9][2468][048]))|((2)([-./])(29)([-./])([0-9][0-9][13579][26])))" G "<0>02/29/2084" 
+"(((0[13578]|10|12)([-./])(0[1-9]|[12][0-9]|3[01])([-./])(\d{4}))|((0[469]|11)([-./])([0][1-9]|[12][0-9]|30)([-./])(\d{4}))|((2)([-./])(0[1-9]|1[0-9]|2[0-8])([-./])(\d{4}))|((2)(\.|-|\/)(29)([-./])([02468][048]00))|((2)([-./])(29)([-./])([13579][26]00))|((2)([-./])(29)([-./])([0-9][0-9][0][48]))|((2)([-./])(29)([-./])([0-9][0-9][2468][048]))|((2)([-./])(29)([-./])([0-9][0-9][13579][26])))" G "<0>01/31/2000" +"(((0[13578]|10|12)([-./])(0[1-9]|[12][0-9]|3[01])([-./])(\d{4}))|((0[469]|11)([-./])([0][1-9]|[12][0-9]|30)([-./])(\d{4}))|((2)([-./])(0[1-9]|1[0-9]|2[0-8])([-./])(\d{4}))|((2)(\.|-|\/)(29)([-./])([02468][048]00))|((2)([-./])(29)([-./])([13579][26]00))|((2)([-./])(29)([-./])([0-9][0-9][0][48]))|((2)([-./])(29)([-./])([0-9][0-9][2468][048]))|((2)([-./])(29)([-./])([0-9][0-9][13579][26])))" G "<0>11/30/2000" +"(((0[13578]|10|12)([-./])(0[1-9]|[12][0-9]|3[01])([-./])(\d{4}))|((0[469]|11)([-./])([0][1-9]|[12][0-9]|30)([-./])(\d{4}))|((2)([-./])(0[1-9]|1[0-9]|2[0-8])([-./])(\d{4}))|((2)(\.|-|\/)(29)([-./])([02468][048]00))|((2)([-./])(29)([-./])([13579][26]00))|((2)([-./])(29)([-./])([0-9][0-9][0][48]))|((2)([-./])(29)([-./])([0-9][0-9][2468][048]))|((2)([-./])(29)([-./])([0-9][0-9][13579][26])))" "02/29/2083" +"(((0[13578]|10|12)([-./])(0[1-9]|[12][0-9]|3[01])([-./])(\d{4}))|((0[469]|11)([-./])([0][1-9]|[12][0-9]|30)([-./])(\d{4}))|((2)([-./])(0[1-9]|1[0-9]|2[0-8])([-./])(\d{4}))|((2)(\.|-|\/)(29)([-./])([02468][048]00))|((2)([-./])(29)([-./])([13579][26]00))|((2)([-./])(29)([-./])([0-9][0-9][0][48]))|((2)([-./])(29)([-./])([0-9][0-9][2468][048]))|((2)([-./])(29)([-./])([0-9][0-9][13579][26])))" "11/31/2000" 
+"(((0[13578]|10|12)([-./])(0[1-9]|[12][0-9]|3[01])([-./])(\d{4}))|((0[469]|11)([-./])([0][1-9]|[12][0-9]|30)([-./])(\d{4}))|((2)([-./])(0[1-9]|1[0-9]|2[0-8])([-./])(\d{4}))|((2)(\.|-|\/)(29)([-./])([02468][048]00))|((2)([-./])(29)([-./])([13579][26]00))|((2)([-./])(29)([-./])([0-9][0-9][0][48]))|((2)([-./])(29)([-./])([0-9][0-9][2468][048]))|((2)([-./])(29)([-./])([0-9][0-9][13579][26])))" "01/32/2000" +"^[a-zA-Z0-9\s .\-]+$" G "<0>2222 Mock St." # TODO: \s in patterns not implemented +"^[a-zA-Z0-9\s .\-]+$" G "<0>1 A St." +"^[a-zA-Z0-9\s .\-]+$" G "<0>555-1212" +"^[a-zA-Z0-9\s.\-]+$" "[A Street]" +"^[a-zA-Z0-9\s.\-]+$" "(3 A St.)" +"^[a-zA-Z0-9\s.\-]+$" "{34 C Ave.}" +"^[a-zA-Z0-9\s.\-]+$" "Last.*?(\d+.?\d*)" +"^[a-zA-Z0-9\s .\-]+$" G " Last1-(123)-123-1234" +"^([0-9]( |-)?)?(\(?[0-9]{3}\)?|[0-9]{3})( |-)?([0-9]{3}( |-)?[0-9]{4}|[a-zA-Z0-9]{7})$" G "<0>123 123 1234" +"^([0-9]( |-)?)?(\(?[0-9]{3}\)?|[0-9]{3})( |-)?([0-9]{3}( |-)?[0-9]{4}|[a-zA-Z0-9]{7})$" G "<0>1-800-ALPHNUM" +"^([0-9]( |-)?)?(\(?[0-9]{3}\)?|[0-9]{3})( |-)?([0-9]{3}( |-)?[0-9]{4}|[a-zA-Z0-9]{7})$" "1.123.123.1234" +"^([0-9]( |-)?)?(\(?[0-9]{3}\)?|[0-9]{3})( |-)?([0-9]{3}( |-)?[0-9]{4}|[a-zA-Z0-9]{7})$" "(123)-1234-123" +"^([0-9]( |-)?)?(\(?[0-9]{3}\)?|[0-9]{3})( |-)?([0-9]{3}( |-)?[0-9]{4}|[a-zA-Z0-9]{7})$" "123-1234" +"^([0-1][0-9]|[2][0-3]):([0-5][0-9])$" G "<0>02:04" +"^([0-1][0-9]|[2][0-3]):([0-5][0-9])$" G "<0>16:56" +"^([0-1][0-9]|[2][0-3]):([0-5][0-9])$" G "<0>23:59" +"^([0-1][0-9]|[2][0-3]):([0-5][0-9])$" "02:00 PM" +"^([0-1][0-9]|[2][0-3]):([0-5][0-9])$" "PM2:00" +"^([0-1][0-9]|[2][0-3]):([0-5][0-9])$" "24:00" +"^[0,1]?\d{1}\/(([0-2]?\d{1})|([3][0,1]{1}))\/(([1]{1}[9]{1}[9]{1}\d{1})|([2-9]{1}\d{3}))$" G "<0>01/01/1990" +"^[0,1]?\d{1}\/(([0-2]?\d{1})|([3][0,1]{1}))\/(([1]{1}[9]{1}[9]{1}\d{1})|([2-9]{1}\d{3}))$" G "<0>12/12/9999" +"^[0,1]?\d{1}\/(([0-2]?\d{1})|([3][0,1]{1}))\/(([1]{1}[9]{1}[9]{1}\d{1})|([2-9]{1}\d{3}))$" G "<0>3/28/2001" 
+"^[0,1]?\d{1}\/(([0-2]?\d{1})|([3][0,1]{1}))\/(([1]{1}[9]{1}[9]{1}\d{1})|([2-9]{1}\d{3}))$" "3-8-01" +"^[0,1]?\d{1}\/(([0-2]?\d{1})|([3][0,1]{1}))\/(([1]{1}[9]{1}[9]{1}\d{1})|([2-9]{1}\d{3}))$" "13/32/1001" +"^[0,1]?\d{1}\/(([0-2]?\d{1})|([3][0,1]{1}))\/(([1]{1}[9]{1}[9]{1}\d{1})|([2-9]{1}\d{3}))$" "03/32/1989" +"((\(\d{3}\)?)|(\d{3}))([\s \-./]?)(\d{3})([\s \-./]?)(\d{4})" G "<0>1.2123644567" +"((\(\d{3}\)?)|(\d{3}))([\s \-./]?)(\d{3})([\s \-./]?)(\d{4})" G "<0>0-234.567/8912" +"((\(\d{3}\)?)|(\d{3}))([\s \-./]?)(\d{3})([\s \-./]?)(\d{4})" G "<0>1-(212)-123 4567" +"((\(\d{3}\)?)|(\d{3}))([\s \-./]?)(\d{3})([\s \-./]?)(\d{4})" "0-212364345" +"((\(\d{3}\)?)|(\d{3}))([\s \-./]?)(\d{3})([\s \-./]?)(\d{4})" "1212-364,4321" +"((\(\d{3}\)?)|(\d{3}))([\s \-./]?)(\d{3})([\s \-./]?)(\d{4})" "0212\345/6789" +"^([0-9]{6}[\s \-]{1}[0-9]{12}|[0-9]{18})$" G "<0>000000 000000000000" +"^([0-9]{6}[\s \-]{1}[0-9]{12}|[0-9]{18})$" G "<0>000000-000000000000" +"^([0-9]{6}[\s \-]{1}[0-9]{12}|[0-9]{18})$" G "<0>000000000000000000" +"^([0-9]{6}[\s \-]{1}[0-9]{12}|[0-9]{18})$" "000000_000000000000" +"^(([1-9])|(0[1-9])|(1[0-2]))\/((0[1-9])|([1-31]))\/((\d{2})|(\d{4}))$" G "<0>01/01/2001" +"^(([1-9])|(0[1-9])|(1[0-2]))\/((0[1-9])|([1-31]))\/((\d{2})|(\d{4}))$" G "<0>1/1/2001" +"^(([1-9])|(0[1-9])|(1[0-2]))\/((0[1-9])|([1-31]))\/((\d{2})|(\d{4}))$" G "<0>01/1/01" +"^(([1-9])|(0[1-9])|(1[0-2]))\/((0[1-9])|([1-31]))\/((\d{2})|(\d{4}))$" "13/01/2001" +"^(([1-9])|(0[1-9])|(1[0-2]))\/((0[1-9])|([1-31]))\/((\d{2})|(\d{4}))$" "1/2/100" +"^(([1-9])|(0[1-9])|(1[0-2]))\/((0[1-9])|([1-31]))\/((\d{2})|(\d{4}))$" "09/32/2001" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(.[0-9][0-9])?$" G "<0>$3,023,123.34" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(.[0-9][0-9])?$" G "<0>9,876,453" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(.[0-9][0-9])?$" G "<0>123456.78" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(.[0-9][0-9])?$" "4,33,234.34" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(.[0-9][0-9])?$" 
"$1.234" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(.[0-9][0-9])?$" "abc" +"^\d{5}$|^\d{5}-\d{4}$" G "<0>55555-5555" +"^\d{5}$|^\d{5}-\d{4}$" G "<0>34564-3342" +"^\d{5}$|^\d{5}-\d{4}$" G "<0>90210" +"^\d{5}$|^\d{5}-\d{4}$" "434454444" +"^\d{5}$|^\d{5}-\d{4}$" "645-32-2345" +"^\d{5}$|^\d{5}-\d{4}$" "abc" +"^\d{3}-\d{2}-\d{4}$" G "<0>333-22-4444" +"^\d{3}-\d{2}-\d{4}$" G "<0>123-45-6789" +"^\d{3}-\d{2}-\d{4}$" "123456789" +"^\d{3}-\d{2}-\d{4}$" "SSN" +"^[2-9]\d{2}-\d{3}-\d{4}$" G "<0>800-555-5555" +"^[2-9]\d{2}-\d{3}-\d{4}$" G "<0>333-444-5555" +"^[2-9]\d{2}-\d{3}-\d{4}$" G "<0>212-666-1234" +"^[2-9]\d{2}-\d{3}-\d{4}$" "000-000-0000" +"^[2-9]\d{2}-\d{3}-\d{4}$" "123-456-7890" +"^[2-9]\d{2}-\d{3}-\d{4}$" "2126661234" +"^\d{5}-\d{4}|\d{5}|[A-Z]\d[A-Z] \d[A-Z]\d$" G "<0>44240" +"^\d{5}-\d{4}|\d{5}|[A-Z]\d[A-Z] \d[A-Z]\d$" G "<0>44240-5555" +"^\d{5}-\d{4}|\d{5}|[A-Z]\d[A-Z] \d[A-Z]\d$" G "<0>G3H 6A3" +"^\d{5}-\d{4}|\d{5}|[A-Z]\d[A-Z] \d[A-Z]\d$" "Ohio" +"^\d{5}-\d{4}|\d{5}|[A-Z]\d[A-Z] \d[A-Z]\d$" "abc" +"^\d{5}-\d{4}|\d{5}|[A-Z]\d[A-Z] \d[A-Z]\d$" "g3h6a3" +"[0-9]{4}\s*[a-zA-Z]{2}" G "<0>1054 WD" +"[0-9]{4}\s*[a-zA-Z]{2}" G "<0>1054WD" +"[0-9]{4}\s*[a-zA-Z]{2}" G "<0>1054 wd" +"[0-9]{4}\s*[a-zA-Z]{2}" "10543" +"(^1300\d{6}$)|(^1800|1900|1902\d{6}$)|(^0[2|3|7|8]{1}[0-9]{8}$)|(^13\d{4}$)|(^04\d{2,3}\d{6}$)" G "<0>0732105432" +"(^1300\d{6}$)|(^1800|1900|1902\d{6}$)|(^0[2|3|7|8]{1}[0-9]{8}$)|(^13\d{4}$)|(^04\d{2,3}\d{6}$)" G "<0>1300333444" +"(^1300\d{6}$)|(^1800|1900|1902\d{6}$)|(^0[2|3|7|8]{1}[0-9]{8}$)|(^13\d{4}$)|(^04\d{2,3}\d{6}$)" G "<0>131313" +"(^1300\d{6}$)|(^1800|1900|1902\d{6}$)|(^0[2|3|7|8]{1}[0-9]{8}$)|(^13\d{4}$)|(^04\d{2,3}\d{6}$)" "32105432" +"(^1300\d{6}$)|(^1800|1900|1902\d{6}$)|(^0[2|3|7|8]{1}[0-9]{8}$)|(^13\d{4}$)|(^04\d{2,3}\d{6}$)" "13000456" +"^((https?|ftp)\://((\[?(\d{1,3}\.){3}\d{1,3}\]?)|(([\-a-zA-Z0-9]+\.)+[a-zA-Z]{2,4}))(\:\d+)?(/[\-a-zA-Z0-9._?,'+\&%$#=~\\]+)*/?)$" G "<0>http://207.68.172.254/home.ashx" 
+"^((https?|ftp)\://((\[?(\d{1,3}\.){3}\d{1,3}\]?)|(([\-a-zA-Z0-9]+\.)+[a-zA-Z]{2,4}))(\:\d+)?(/[\-a-zA-Z0-9._?,'+\&%$#=~\\]+)*/?)$" G "<0>ftp://ftp.netscape.com/" +"^((https?|ftp)\://((\[?(\d{1,3}\.){3}\d{1,3}\]?)|(([\-a-zA-Z0-9]+\.)+[a-zA-Z]{2,4}))(\:\d+)?(/[\-a-zA-Z0-9._?,'+\&%$#=~\\]+)*/?)$" G "<0>https://www.brinkster.com/login.asp" +"^((https?|ftp)\://((\[?(\d{1,3}\.){3}\d{1,3}\]?)|(([\-a-zA-Z0-9]+\.)+[a-zA-Z]{2,4}))(\:\d+)?(/[\-a-zA-Z0-9._?,'+\&%$#=~\\]+)*/?)$" "htp://mistake.com/" +"^((https?|ftp)\://((\[?(\d{1,3}\.){3}\d{1,3}\]?)|(([\-a-zA-Z0-9]+\.)+[a-zA-Z]{2,4}))(\:\d+)?(/[\-a-zA-Z0-9._?,'+\&%$#=~\\]+)*/?)$" "http://www_address.com/" +"^((https?|ftp)\://((\[?(\d{1,3}\.){3}\d{1,3}\]?)|(([\-a-zA-Z0-9]+\.)+[a-zA-Z]{2,4}))(\:\d+)?(/[\-a-zA-Z0-9._?,'+\&%$#=~\\]+)*/?)$" "ftp://www.files.com/file with spaces.txt" +"([0-9]{4})-([0-9]{1,2})-([0-9]{1,2})" G "<0>2002-11-03" +"([0-9]{4})-([0-9]{1,2})-([0-9]{1,2})" G "<0>2007-17-08" +"([0-9]{4})-([0-9]{1,2})-([0-9]{1,2})" G "<0>9999-99-99" +"([0-9]{4})-([0-9]{1,2})-([0-9]{1,2})" "2002/17/18" +"([0-9]{4})-([0-9]{1,2})-([0-9]{1,2})" "2002.18.45" +"([0-9]{4})-([0-9]{1,2})-([0-9]{1,2})" "18.45.2002" +"^\$?(\d{1,3}(\,\d{3})*|(\d+))(\.\d{0,2})?$" G "<0>$0,234.50" +"^\$?(\d{1,3}(\,\d{3})*|(\d+))(\.\d{0,2})?$" G "<0>0234.5" +"^\$?(\d{1,3}(\,\d{3})*|(\d+))(\.\d{0,2})?$" G "<0>0,234." 
+"^\$?(\d{1,3}(\,\d{3})*|(\d+))(\.\d{0,2})?$" "$1,23,50" +"^\$?(\d{1,3}(\,\d{3})*|(\d+))(\.\d{0,2})?$" "$123.123" +"(^\d{5}-\d{3}|^\d{2}.\d{3}-\d{3}|\d{8})" G "<0>12.345-678" +"(^\d{5}-\d{3}|^\d{2}.\d{3}-\d{3}|\d{8})" G "<0>12345-678" +"(^\d{5}-\d{3}|^\d{2}.\d{3}-\d{3}|\d{8})" G "<0>12345678" +"(^\d{5}-\d{3}|^\d{2}.\d{3}-\d{3}|\d{8})" "12.345678" +"(^\d{5}-\d{3}|^\d{2}.\d{3}-\d{3}|\d{8})" "12345-1" +"(^\d{5}-\d{3}|^\d{2}.\d{3}-\d{3}|\d{8})" "123" +'^([a-zA-Z]\:|\\)\\([^\\]+\\)*[^\/:*?"<>|]+\.htm(l)?$' G "<0>x:\\test\\testing.htm" +'^([a-zA-Z]\:|\\)\\([^\\]+\\)*[^\/:*?"<>|]+\.htm(l)?$' G "<0>x:\\test\\test#$ ing.html" +'^([a-zA-Z]\:|\\)\\([^\\]+\\)*[^\/:*?"<>|]+\.htm(l)?$' G "<0>\\\\test\testing.html" +'^([a-zA-Z]\:|\\)\\([^\\]+\\)*[^\/:*?"<>|]+\.htm(l)?$' "x:\test\test/ing.htm" +'^([a-zA-Z]\:|\\)\\([^\\]+\\)*[^\/:*?"<>|]+\.htm(l)?$' "x:\test\test*.htm" +'^([a-zA-Z]\:|\\)\\([^\\]+\\)*[^\/:*?"<>|]+\.htm(l)?$' "\\test?<.htm" +"^[1-9]{1}[0-9]{3}$" G "<0>1234" +"^[1-9]{1}[0-9]{3}$" "123" +"^[1-9]{1}[0-9]{3}$" "123A" +"^[A-Z]{1}( |-)?[1-9]{1}[0-9]{3}$" G "<0>A-1234" +"^[A-Z]{1}( |-)?[1-9]{1}[0-9]{3}$" G "<0>A 1234" +"^[A-Z]{1}( |-)?[1-9]{1}[0-9]{3}$" G "<0>A1234" +"^[A-Z]{1}( |-)?[1-9]{1}[0-9]{3}$" "AA-1234" +"^[A-Z]{1}( |-)?[1-9]{1}[0-9]{3}$" "A12345" +"^(F-)?[0-9]{5}$" G "<0>12345" +"^(F-)?[0-9]{5}$" G "<0>F-12345" +"^(F-)?[0-9]{5}$" "F12345" +"^(F-)?[0-9]{5}$" "F-123456" +"^(F-)?[0-9]{5}$" "123456" +"^(V-|I-)?[0-9]{4}$" G "<0>1234" +"^(V-|I-)?[0-9]{4}$" G "<0>V-1234" +"^(V-|I-)?[0-9]{4}$" "12345" +"^[1-9]{1}[0-9]{3} ?[A-Z]{2}$" G "<0>1234 AB" +"^[1-9]{1}[0-9]{3} ?[A-Z]{2}$" G "<0>1234AB" +"^[1-9]{1}[0-9]{3} ?[A-Z]{2}$" "123AB" +"^[1-9]{1}[0-9]{3} ?[A-Z]{2}$" "1234AAA" +"^([1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3}$" G "<0>12345" +"^([1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3}$" G "<0>10234" +"^([1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3}$" G "<0>01234" +"^([1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3}$" "00123" +"^(/w|/W|[^<>+?$%\{}\&])+$" G "<0>John Doe Sr." 
+"^(/w|/W|[^<>+?$%\{}\&])+$" G "<0>100 Elm St., Suite 25" +"^(/w|/W|[^<>+?$%\{}\&])+$" G "<0>Valerie's Gift Shop" +"^(/w|/W|[^<>+?$%\{}\&])+$" "

    Hey

    " +/<[a-zA-Z][^>]*\son\w+=(\w+|'[^']*'|"[^"]*")[^>]*>/ G '<0>' +/<[a-zA-Z][^>]*\son\w+=(\w+|'[^']*'|"[^"]*")[^>]*>/ '' +"(?!^0*$)(?!^0*\.0*$)^\d{1,5}(\.\d{1,3})?$" G "<0>1" +"(?!^0*$)(?!^0*\.0*$)^\d{1,5}(\.\d{1,3})?$" G "<0>12345.123" +"(?!^0*$)(?!^0*\.0*$)^\d{1,5}(\.\d{1,3})?$" G "<0>0.5" +"(?!^0*$)(?!^0*\.0*$)^\d{1,5}(\.\d{1,3})?$" "0" +"(?!^0*$)(?!^0*\.0*$)^\d{1,5}(\.\d{1,3})?$" "0.0" +"(?!^0*$)(?!^0*\.0*$)^\d{1,5}(\.\d{1,3})?$" "123456.1234" +"^.+@[^\.].*\.[a-z]{2,}$" G "<0>whatever@somewhere.museum" +"^.+@[^\.].*\.[a-z]{2,}$" G "<0>foreignchars@myforeigncharsdomain.nu" +"^.+@[^\.].*\.[a-z]{2,}$" G "<0>me+mysomething@mydomain.com" +"^.+@[^\.].*\.[a-z]{2,}$" "a@b.c" +"^.+@[^\.].*\.[a-z]{2,}$" "me@.my.com" +"^.+@[^\.].*\.[a-z]{2,}$" "a@b.comFOREIGNCHAR" +"^(\d{5}-\d{4}|\d{5})$" G "<0>12345" +"^(\d{5}-\d{4}|\d{5})$" G "<0>12345-1234" +"^(\d{5}-\d{4}|\d{5})$" "12345-12345" +"^(\d{5}-\d{4}|\d{5})$" "123" +"^(\d{5}-\d{4}|\d{5})$" "12345-abcd" +"^(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])$" G "<0>0.0.0.0" +"^(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])$" G "<0>255.255.255.02" +"^(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])$" G "<0>192.168.0.136" +"^(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])$" "256.1.3.4" +"^(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])$" "023.44.33.22" +"^(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])$" "10.57.98.23." 
+"]*[^/])>" G '<0>' +"]*[^/])>" '' +"" G "<0>" +"" G "<0>" +"" "this is a comment" +"" G "<0>" +"" G "<0>" +"" "this is a comment" +/<\u002f?(\w+)(\s+\w+=(\w+|"[^"]*"|'[^']*'))*>/ G "<0>" +/<\u002f?(\w+)(\s+\w+=(\w+|"[^"]*"|'[^']*'))*>/ G '<0>' +/<\u002f?(\w+)(\s+\w+=(\w+|"[^"]*"|'[^']*'))*>/ G "<0>" +/<\u002f?(\w+)(\s+\w+=(\w+|"[^"]*"|'[^']*'))*>/ "No Tag Here ..." +"(\{\\f\d*)\\([^;]+;)" G "<0>{\\f0\\Some Font names here;" +"(\{\\f\d*)\\([^;]+;)" G "<0>{\\f1\\fswiss\\fcharset0\\fprq2{\\*\\panose 020b0604020202020204}Arial;" +"(\{\\f\d*)\\([^;]+;)" G "{\\f" +"(\{\\f\d*)\\([^;]+;)" "{f0fs20 some text}" +#"" G '<0>space' # TODO: Can't quote this pattern with the test syntax! +#"" "this is not a tag" +"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0-1]\d|[2][0-3])(\:[0-5]\d){1,2})?$" G "<0>12/30/2002" +"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0-1]\d|[2][0-3])(\:[0-5]\d){1,2})?$" G "<0>01/12/1998 13:30" +"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0-1]\d|[2][0-3])(\:[0-5]\d){1,2})?$" G "<0>01/28/2002 22:35:00" +"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0-1]\d|[2][0-3])(\:[0-5]\d){1,2})?$" "13/30/2002" +"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0-1]\d|[2][0-3])(\:[0-5]\d){1,2})?$" "01/12/1998 24:30" +"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0-1]\d|[2][0-3])(\:[0-5]\d){1,2})?$" "01/28/2002 22:35:64" +#"((?(^[A-Z0-9-;=]*:))(?(.*)))" G "<0>BEGIN:" #named capture +#"((?(^[A-Z0-9-;=]*:))(?(.*)))" G "<0>TEL;WORK;VOICE:" #named capture +#"((?(^[A-Z0-9-;=]*:))(?(.*)))" G "<0>TEL:" #named capture +#"((?(^[A-Z0-9-;=]*:))(?(.*)))" "begin:" #named capture +#"((?(^[A-Z0-9-;=]*:))(?(.*)))" "TEL;PREF;" #named capture +'^]*)>(.*?(?=<\/a>))<\/a>$' G '<0>my external link' +'^]*)>(.*?(?=<\/a>))<\/a>$' G ']*)>(.*?(?=<\/a>))<\/a>$' 'my internal link' 
+"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0]\d|[1][0-2])(\:[0-5]\d){1,2})*\s*([aApP][mM]{0,2})?$" G "<0>12/31/2002" +"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0]\d|[1][0-2])(\:[0-5]\d){1,2})*\s*([aApP][mM]{0,2})?$" G "<0>12/31/2002 08:00" +"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0]\d|[1][0-2])(\:[0-5]\d){1,2})*\s*([aApP][mM]{0,2})?$" G "<0>12/31/2002 08:00 AM" +"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0]\d|[1][0-2])(\:[0-5]\d){1,2})*\s*([aApP][mM]{0,2})?$" "12/31/02" +"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0]\d|[1][0-2])(\:[0-5]\d){1,2})*\s*([aApP][mM]{0,2})?$" "12/31/2002 14:00" +"
    (?:\s*([^<]+)
    \s*)+
    " G "<0>
    string1
    string2
    string3
    " +"
    (?:\s*([^<]+)
    \s*)+
    " ".." +"^((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1})))$" G "<0>1/2/03" +"^((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1})))$" G "<0>2/30/1999" +"^((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1})))$" G "<0>03/04/19" +"^((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1})))$" "3/4/2020" +"^((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1})))$" "3/4/1919" +']*))*|/?>' G '<0>' +']*))*|/?>' G "<0>" +']*))*|/?>' G "<0>
    " +']*))*|/?>' "this is a test..." +"^ *(1[0-2]|[1-9]):[0-5][0-9] *(a|p|A|P)(m|M) *$" G "<0>12:00am" +"^ *(1[0-2]|[1-9]):[0-5][0-9] *(a|p|A|P)(m|M) *$" G "<0>1:00 PM" +"^ *(1[0-2]|[1-9]):[0-5][0-9] *(a|p|A|P)(m|M) *$" G "<0> 12:59 pm" +"^ *(1[0-2]|[1-9]):[0-5][0-9] *(a|p|A|P)(m|M) *$" "0:00" +"^ *(1[0-2]|[1-9]):[0-5][0-9] *(a|p|A|P)(m|M) *$" "0:01 am" +"^ *(1[0-2]|[1-9]):[0-5][0-9] *(a|p|A|P)(m|M) *$" "13:00 pm" +"\({1}[0-9]{3}\){1}\-{1}[0-9]{3}\-{1}[0-9]{4}" G "<0>(111)-111-1111" +"\({1}[0-9]{3}\){1}\-{1}[0-9]{3}\-{1}[0-9]{4}" "11111111111" +"[^abc]" G "<0>def" +"[^abc]" "abc" +"^(([0]?[1-9]|[1][0-2])[\/|\-|\.]([0-2]\d|[3][0-1]|[1-9])[\/|\-|\.]([2][0])?\d{2}\s+((([0][0-9]|[1][0-2]|[0-9])[\:|\-|\.]([0-5]\d)\s*([aApP][mM])?)|(([0-1][0-9]|[2][0-3]|[0-9])[\:|\-|\.]([0-5]\d))))$" G "<0>01/01/2002 04:42" +"^(([0]?[1-9]|[1][0-2])[\/|\-|\.]([0-2]\d|[3][0-1]|[1-9])[\/|\-|\.]([2][0])?\d{2}\s+((([0][0-9]|[1][0-2]|[0-9])[\:|\-|\.]([0-5]\d)\s*([aApP][mM])?)|(([0-1][0-9]|[2][0-3]|[0-9])[\:|\-|\.]([0-5]\d))))$" G "<0>5-12-02 04:42 AM" +"^(([0]?[1-9]|[1][0-2])[\/|\-|\.]([0-2]\d|[3][0-1]|[1-9])[\/|\-|\.]([2][0])?\d{2}\s+((([0][0-9]|[1][0-2]|[0-9])[\:|\-|\.]([0-5]\d)\s*([aApP][mM])?)|(([0-1][0-9]|[2][0-3]|[0-9])[\:|\-|\.]([0-5]\d))))$" G "<0>01.01/02 04-42aM" +"^(([0]?[1-9]|[1][0-2])[\/|\-|\.]([0-2]\d|[3][0-1]|[1-9])[\/|\-|\.]([2][0])?\d{2}\s+((([0][0-9]|[1][0-2]|[0-9])[\:|\-|\.]([0-5]\d)\s*([aApP][mM])?)|(([0-1][0-9]|[2][0-3]|[0-9])[\:|\-|\.]([0-5]\d))))$" "01-12-1999 4:50PM" +"^(([0]?[1-9]|[1][0-2])[\/|\-|\.]([0-2]\d|[3][0-1]|[1-9])[\/|\-|\.]([2][0])?\d{2}\s+((([0][0-9]|[1][0-2]|[0-9])[\:|\-|\.]([0-5]\d)\s*([aApP][mM])?)|(([0-1][0-9]|[2][0-3]|[0-9])[\:|\-|\.]([0-5]\d))))$" "01-12-2002 15:10PM" +"^(([0]?[1-9]|[1][0-2])[\/|\-|\.]([0-2]\d|[3][0-1]|[1-9])[\/|\-|\.]([2][0])?\d{2}\s+((([0][0-9]|[1][0-2]|[0-9])[\:|\-|\.]([0-5]\d)\s*([aApP][mM])?)|(([0-1][0-9]|[2][0-3]|[0-9])[\:|\-|\.]([0-5]\d))))$" "01-12-002 8:20PM" 
+"^([1][12]|[0]?[1-9])[\/-]([3][01]|[12]\d|[0]?[1-9])[\/-](\d{4}|\d{2})$" G "<0>11-02-02" +"^([1][12]|[0]?[1-9])[\/-]([3][01]|[12]\d|[0]?[1-9])[\/-](\d{4}|\d{2})$" G "<0>1-25-2002" +"^([1][12]|[0]?[1-9])[\/-]([3][01]|[12]\d|[0]?[1-9])[\/-](\d{4}|\d{2})$" G "<0>01/25/2002" +"^([1][12]|[0]?[1-9])[\/-]([3][01]|[12]\d|[0]?[1-9])[\/-](\d{4}|\d{2})$" "13-02-02" +"^([1][12]|[0]?[1-9])[\/-]([3][01]|[12]\d|[0]?[1-9])[\/-](\d{4}|\d{2})$" "11.02.02" +"^([1][12]|[0]?[1-9])[\/-]([3][01]|[12]\d|[0]?[1-9])[\/-](\d{4}|\d{2})$" "11/32/2002" +"(([0-1][0-9])|([2][0-3])):([0-5][0-9]):([0-5][0-9])" G "<0>09:30:00" +"(([0-1][0-9])|([2][0-3])):([0-5][0-9]):([0-5][0-9])" G "<0>17:45:20" +"(([0-1][0-9])|([2][0-3])):([0-5][0-9]):([0-5][0-9])" G "<0>23:59:59" +"(([0-1][0-9])|([2][0-3])):([0-5][0-9]):([0-5][0-9])" "24:00:00" +"(((0[1-9]|[12][0-9]|3[01])([-./])(0[13578]|10|12)([-./])(\d{4}))|(([0][1-9]|[12][0-9]|30)([-./])(0[469]|11)([-./])(\d{4}))|((0[1-9]|1[0-9]|2[0-8])([-./])(02)([-./])(\d{4}))|((29)(\.|-|\/)(02)([-./])([02468][048]00))|((29)([-./])(02)([-./])([13579][26]00))|((29)([-./])(02)([-./])([0-9][0-9][0][48]))|((29)([-./])(02)([-./])([0-9][0-9][2468][048]))|((29)([-./])(02)([-./])([0-9][0-9][13579][26])))" G "<0>29/02/2000" +"(((0[1-9]|[12][0-9]|3[01])([-./])(0[13578]|10|12)([-./])(\d{4}))|(([0][1-9]|[12][0-9]|30)([-./])(0[469]|11)([-./])(\d{4}))|((0[1-9]|1[0-9]|2[0-8])([-./])(02)([-./])(\d{4}))|((29)(\.|-|\/)(02)([-./])([02468][048]00))|((29)([-./])(02)([-./])([13579][26]00))|((29)([-./])(02)([-./])([0-9][0-9][0][48]))|((29)([-./])(02)([-./])([0-9][0-9][2468][048]))|((29)([-./])(02)([-./])([0-9][0-9][13579][26])))" G "<0>31/01/2000" 
+"(((0[1-9]|[12][0-9]|3[01])([-./])(0[13578]|10|12)([-./])(\d{4}))|(([0][1-9]|[12][0-9]|30)([-./])(0[469]|11)([-./])(\d{4}))|((0[1-9]|1[0-9]|2[0-8])([-./])(02)([-./])(\d{4}))|((29)(\.|-|\/)(02)([-./])([02468][048]00))|((29)([-./])(02)([-./])([13579][26]00))|((29)([-./])(02)([-./])([0-9][0-9][0][48]))|((29)([-./])(02)([-./])([0-9][0-9][2468][048]))|((29)([-./])(02)([-./])([0-9][0-9][13579][26])))" G "<0>30-01-2000" +"(((0[1-9]|[12][0-9]|3[01])([-./])(0[13578]|10|12)([-./])(\d{4}))|(([0][1-9]|[12][0-9]|30)([-./])(0[469]|11)([-./])(\d{4}))|((0[1-9]|1[0-9]|2[0-8])([-./])(02)([-./])(\d{4}))|((29)(\.|-|\/)(02)([-./])([02468][048]00))|((29)([-./])(02)([-./])([13579][26]00))|((29)([-./])(02)([-./])([0-9][0-9][0][48]))|((29)([-./])(02)([-./])([0-9][0-9][2468][048]))|((29)([-./])(02)([-./])([0-9][0-9][13579][26])))" "29/02/2002" +"(((0[1-9]|[12][0-9]|3[01])([-./])(0[13578]|10|12)([-./])(\d{4}))|(([0][1-9]|[12][0-9]|30)([-./])(0[469]|11)([-./])(\d{4}))|((0[1-9]|1[0-9]|2[0-8])([-./])(02)([-./])(\d{4}))|((29)(\.|-|\/)(02)([-./])([02468][048]00))|((29)([-./])(02)([-./])([13579][26]00))|((29)([-./])(02)([-./])([0-9][0-9][0][48]))|((29)([-./])(02)([-./])([0-9][0-9][2468][048]))|((29)([-./])(02)([-./])([0-9][0-9][13579][26])))" "32/01/2002" +"(((0[1-9]|[12][0-9]|3[01])([-./])(0[13578]|10|12)([-./])(\d{4}))|(([0][1-9]|[12][0-9]|30)([-./])(0[469]|11)([-./])(\d{4}))|((0[1-9]|1[0-9]|2[0-8])([-./])(02)([-./])(\d{4}))|((29)(\.|-|\/)(02)([-./])([02468][048]00))|((29)([-./])(02)([-./])([13579][26]00))|((29)([-./])(02)([-./])([0-9][0-9][0][48]))|((29)([-./])(02)([-./])([0-9][0-9][2468][048]))|((29)([-./])(02)([-./])([0-9][0-9][13579][26])))" "10/2/2002" +"^0[1-6]{1}(([0-9]{2}){4})|((\s[0-9]{2}){4})|((-[0-9]{2}){4})$" G "<0>01 46 70 89 12" +"^0[1-6]{1}(([0-9]{2}){4})|((\s[0-9]{2}){4})|((-[0-9]{2}){4})$" G "<0>01-46-70-89-12" +"^0[1-6]{1}(([0-9]{2}){4})|((\s[0-9]{2}){4})|((-[0-9]{2}){4})$" G "<0>0146708912" +"^0[1-6]{1}(([0-9]{2}){4})|((\s[0-9]{2}){4})|((-[0-9]{2}){4})$" "01-46708912" 
+"^0[1-6]{1}(([0-9]{2}){4})|((\s[0-9]{2}){4})|((-[0-9]{2}){4})$" "01 46708912" +"^0[1-6]{1}(([0-9]{2}){4})|((\s[0-9]{2}){4})|((-[0-9]{2}){4})$" "+33235256677" +"^[0-9A-Za-z_ ]+(.[jJ][pP][gG]|.[gG][iI][fF])$" G "<0>good.gif" +"^[0-9A-Za-z_ ]+(.[jJ][pP][gG]|.[gG][iI][fF])$" G "<0>go d.GIf" +"^[0-9A-Za-z_ ]+(.[jJ][pP][gG]|.[gG][iI][fF])$" G "<0>goo_d.jPg" +"^[0-9A-Za-z_ ]+(.[jJ][pP][gG]|.[gG][iI][fF])$" "junk" +"^[0-9A-Za-z_ ]+(.[jJ][pP][gG]|.[gG][iI][fF])$" "bad.bad.gif" +"^[0-9A-Za-z_ ]+(.[jJ][pP][gG]|.[gG][iI][fF])$" "slash\gif." +"<[^>\s]*\bauthor\b[^>]*>" G '<0>' +"<[^>\s]*\bauthor\b[^>]*>" G "<0>" +# "<[^>\s]*\bauthor\b[^>]*>" G '<0>' #Debug should work +"<[^> ]*\bauthor\b[^>]*>" G "<0>" +"<[^> ]*\bauthor\b[^>]*>" G '<0>' +"<[^>\s]*\bauthor\b[^>]*>" "" +"<[^>\s]*\bauthor\b[^>]*>" "" +"<[^>\s]*\bauthor\b[^>]*>" "author" +"^(?:(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00)))(\/|-|\.)(?:0?2\1(?:29))$)|(?:(?:1[6-9]|[2-9]\d)?\d{2})(\/|-|\.)(?:(?:(?:0?[13578]|1[02])\2(?:31))|(?:(?:0?[1,3-9]|1[0-2])\2(29|30))|(?:(?:0?[1-9])|(?:1[0-2]))\2(?:0?[1-9]|1\d|2[0-8]))$" G "<0>04/2/29" +"^(?:(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00)))(\/|-|\.)(?:0?2\1(?:29))$)|(?:(?:1[6-9]|[2-9]\d)?\d{2})(\/|-|\.)(?:(?:(?:0?[13578]|1[02])\2(?:31))|(?:(?:0?[1,3-9]|1[0-2])\2(29|30))|(?:(?:0?[1-9])|(?:1[0-2]))\2(?:0?[1-9]|1\d|2[0-8]))$" G "<0>2002-4-30" +"^(?:(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00)))(\/|-|\.)(?:0?2\1(?:29))$)|(?:(?:1[6-9]|[2-9]\d)?\d{2})(\/|-|\.)(?:(?:(?:0?[13578]|1[02])\2(?:31))|(?:(?:0?[1,3-9]|1[0-2])\2(29|30))|(?:(?:0?[1-9])|(?:1[0-2]))\2(?:0?[1-9]|1\d|2[0-8]))$" G "<0>02.10.31" 
+"^(?:(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00)))(\/|-|\.)(?:0?2\1(?:29))$)|(?:(?:1[6-9]|[2-9]\d)?\d{2})(\/|-|\.)(?:(?:(?:0?[13578]|1[02])\2(?:31))|(?:(?:0?[1,3-9]|1[0-2])\2(29|30))|(?:(?:0?[1-9])|(?:1[0-2]))\2(?:0?[1-9]|1\d|2[0-8]))$" "2003/2/29" +"^(?:(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00)))(\/|-|\.)(?:0?2\1(?:29))$)|(?:(?:1[6-9]|[2-9]\d)?\d{2})(\/|-|\.)(?:(?:(?:0?[13578]|1[02])\2(?:31))|(?:(?:0?[1,3-9]|1[0-2])\2(29|30))|(?:(?:0?[1-9])|(?:1[0-2]))\2(?:0?[1-9]|1\d|2[0-8]))$" "02.4.31" +"^(?:(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00)))(\/|-|\.)(?:0?2\1(?:29))$)|(?:(?:1[6-9]|[2-9]\d)?\d{2})(\/|-|\.)(?:(?:(?:0?[13578]|1[02])\2(?:31))|(?:(?:0?[1,3-9]|1[0-2])\2(29|30))|(?:(?:0?[1-9])|(?:1[0-2]))\2(?:0?[1-9]|1\d|2[0-8]))$" "00/00/00" +'(\d*)\u0027*-*(\d*)/*(\d*)"' G '<0>5\u0027-3/16"' +'(\d*)\u0027*-*(\d*)/*(\d*)"' G '<0>1\u0027-2"' +'(\d*)\u0027*-*(\d*)/*(\d*)"' G '<0>5/16"' +'(\d*)\u0027*-*(\d*)/*(\d*)"' '1 3/16' +"^[1-9]{1}$|^[1-4]{1}[0-9]{1}$|^50$" G "<0>1" +"^[1-9]{1}$|^[1-4]{1}[0-9]{1}$|^50$" G "<0>23" +"^[1-9]{1}$|^[1-4]{1}[0-9]{1}$|^50$" G "<0>50" +"^[1-9]{1}$|^[1-4]{1}[0-9]{1}$|^50$" "0" +"^[1-9]{1}$|^[1-4]{1}[0-9]{1}$|^50$" "111" +"^[1-9]{1}$|^[1-4]{1}[0-9]{1}$|^50$" "xyz" +"^([ \u00c0-\u01ffa-zA-Z'])+$" G "<0>Jon Doe" +"^([ \u00c0-\u01ffa-zA-Z'])+$" G "<0>J\u00f8rn" +"^([ \u00c0-\u01ffa-zA-Z'])+$" G "<0>Mc'Neelan" +"^([ \u00c0-\u01ffa-zA-Z'])+$" "Henry); hacking attempt" +"^((([0]?[1-9]|1[0-2])(:|\.)(00|15|30|45)?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)(00|15|30|45)?))$" G "<0>1:00 PM" +"^((([0]?[1-9]|1[0-2])(:|\.)(00|15|30|45)?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)(00|15|30|45)?))$" G "<0>6:45 am" +"^((([0]?[1-9]|1[0-2])(:|\.)(00|15|30|45)?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)(00|15|30|45)?))$" G "<0>17:30" 
+"^((([0]?[1-9]|1[0-2])(:|\.)(00|15|30|45)?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)(00|15|30|45)?))$" "4:32 am" +"^((([0]?[1-9]|1[0-2])(:|\.)(00|15|30|45)?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)(00|15|30|45)?))$" "5:30:00 am" +"^((([0]?[1-9]|1[0-2])(:|\.)(00|15|30|45)?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)(00|15|30|45)?))$" "17:01" +"(^\d*\.?\d*[1-9]+\d*$)|(^[1-9]+\d*\.\d*$)" G "<0>0.050" +"(^\d*\.?\d*[1-9]+\d*$)|(^[1-9]+\d*\.\d*$)" G "<0>5.0000" +"(^\d*\.?\d*[1-9]+\d*$)|(^[1-9]+\d*\.\d*$)" G "<0>5000" +"(^\d*\.?\d*[1-9]+\d*$)|(^[1-9]+\d*\.\d*$)" "0" +"(^\d*\.?\d*[1-9]+\d*$)|(^[1-9]+\d*\.\d*$)" "0.0" +"(^\d*\.?\d*[1-9]+\d*$)|(^[1-9]+\d*\.\d*$)" ".0" +"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" G "<0>Sacramento" +"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "<0><2>San Francisco" +"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "<0><3>San Luis Obispo" +"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "SanFrancisco" +"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "SanLuisObispo" +"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "San francisco" +"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$" G "<0>{e02ff0e4-00ad-090A-c030-0d00a0008ba0}" +"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$" G "<0>e02ff0e4-00ad-090A-c030-0d00a0008ba0" +"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$" "0xe02ff0e400ad090Ac0300d00a0008ba0" 
+"^\{?[a-fA-F0-9]{8}-([a-fA-F0-9]{4}-){3}[a-fA-F0-9]{12}\}?$" G "<0>{e02ff0e4-00ad-090A-c030-0d00a0008ba0}" +"^\{?[a-fA-F0-9]{8}-([a-fA-F0-9]{4}-){3}[a-fA-F0-9]{12}\}?$" G "<0>e02ff0e4-00ad-090A-c030-0d00a0008ba0" +"^\{?[a-fA-F0-9]{8}-([a-fA-F0-9]{4}-){3}[a-fA-F0-9]{12}\}?$" "0xe02ff0e400ad090Ac0300d00a0008ba0" +"^([a-zA-Z0-9@*#]{8,15})$" G "<0>@12X*567" +"^([a-zA-Z0-9@*#]{8,15})$" G "<0>1#Zv96g@*Yfasd4" +"^([a-zA-Z0-9@*#]{8,15})$" G "<0>#67jhgt@erd" +"^([a-zA-Z0-9@*#]{8,15})$" "$12X*567" +"^([a-zA-Z0-9@*#]{8,15})$" "1#Zv_96" +"^([a-zA-Z0-9@*#]{8,15})$" "+678jhgt@erd" +'(("|\u0027)[a-z0-9\/\.\?\=\&]*(\.htm|\.asp|\.php|\.jsp)[a-z0-9\/\.\?\=\&]*("|\u0027))|(href=*?[a-z0-9\/\.\?\=\&"\u0027]*)' G '<0>href="produktsida.asp?kategori2=218"' +'(("|\u0027)[a-z0-9\/\.\?\=\&]*(\.htm|\.asp|\.php|\.jsp)[a-z0-9\/\.\?\=\&]*("|\u0027))|(href=*?[a-z0-9\/\.\?\=\&"\u0027]*)' G '<0>href="NuclearTesting.htm"' +'(("|\u0027)[a-z0-9\/\.\?\=\&]*(\.htm|\.asp|\.php|\.jsp)[a-z0-9\/\.\?\=\&]*("|\u0027))|(href=*?[a-z0-9\/\.\?\=\&"\u0027]*)' 'U Suck' +"^(((((0[1-9])|(1\d)|(2[0-8]))-((0[1-9])|(1[0-2])))|((31-((0[13578])|(1[02])))|((29|30)-((0[1,3-9])|(1[0-2])))))-((20[0-9][0-9]))|(29-02-20(([02468][048])|([13579][26]))))$" G "<0>05-01-2002" +"^(((((0[1-9])|(1\d)|(2[0-8]))-((0[1-9])|(1[0-2])))|((31-((0[13578])|(1[02])))|((29|30)-((0[1,3-9])|(1[0-2])))))-((20[0-9][0-9]))|(29-02-20(([02468][048])|([13579][26]))))$" G "<0>29-02-2004" +"^(((((0[1-9])|(1\d)|(2[0-8]))-((0[1-9])|(1[0-2])))|((31-((0[13578])|(1[02])))|((29|30)-((0[1,3-9])|(1[0-2])))))-((20[0-9][0-9]))|(29-02-20(([02468][048])|([13579][26]))))$" G "<0>31-12-2002" +"^(((((0[1-9])|(1\d)|(2[0-8]))-((0[1-9])|(1[0-2])))|((31-((0[13578])|(1[02])))|((29|30)-((0[1,3-9])|(1[0-2])))))-((20[0-9][0-9]))|(29-02-20(([02468][048])|([13579][26]))))$" "1-1-02" +"^(((((0[1-9])|(1\d)|(2[0-8]))-((0[1-9])|(1[0-2])))|((31-((0[13578])|(1[02])))|((29|30)-((0[1,3-9])|(1[0-2])))))-((20[0-9][0-9]))|(29-02-20(([02468][048])|([13579][26]))))$" "29-02-2002" 
+"^(((((0[1-9])|(1\d)|(2[0-8]))-((0[1-9])|(1[0-2])))|((31-((0[13578])|(1[02])))|((29|30)-((0[1,3-9])|(1[0-2])))))-((20[0-9][0-9]))|(29-02-20(([02468][048])|([13579][26]))))$" "31-11-2002" +"^\d*[0-9](|.\d*[0-9]|,\d*[0-9])?$" G "<0>123456.123456" +"^\d*[0-9](|.\d*[0-9]|,\d*[0-9])?$" G "<0>123456,123456" +"^\d*[0-9](|.\d*[0-9]|,\d*[0-9])?$" G "<0>123456" +"^\d*[0-9](|.\d*[0-9]|,\d*[0-9])?$" "123a.123" +"^\d*[0-9](|.\d*[0-9]|,\d*[0-9])?$" "123a,123" +"^\d*[0-9](|.\d*[0-9]|,\d*[0-9])?$" "a" +"^(ac|AC|al|AL|am|AM|ap|AP|ba|BA|ce|CE|df|DF|es|ES|go|GO|ma|MA|mg|MG|ms|MS|mt|MT|pa|PA|pb|PB|pe|PE|pi|PI|pr|PR|rj|RJ|rn|RN|ro|RO|rr|RR|rs|RS|sc|SC|se|SE|sp|SP|to|TO)$" G "<0>AC" +"^(ac|AC|al|AL|am|AM|ap|AP|ba|BA|ce|CE|df|DF|es|ES|go|GO|ma|MA|mg|MG|ms|MS|mt|MT|pa|PA|pb|PB|pe|PE|pi|PI|pr|PR|rj|RJ|rn|RN|ro|RO|rr|RR|rs|RS|sc|SC|se|SE|sp|SP|to|TO)$" G "<0>RJ" +"^(ac|AC|al|AL|am|AM|ap|AP|ba|BA|ce|CE|df|DF|es|ES|go|GO|ma|MA|mg|MG|ms|MS|mt|MT|pa|PA|pb|PB|pe|PE|pi|PI|pr|PR|rj|RJ|rn|RN|ro|RO|rr|RR|rs|RS|sc|SC|se|SE|sp|SP|to|TO)$" G "<0>SP" +"^(ac|AC|al|AL|am|AM|ap|AP|ba|BA|ce|CE|df|DF|es|ES|go|GO|ma|MA|mg|MG|ms|MS|mt|MT|pa|PA|pb|PB|pe|PE|pi|PI|pr|PR|rj|RJ|rn|RN|ro|RO|rr|RR|rs|RS|sc|SC|se|SE|sp|SP|to|TO)$" "XX" +"^(ac|AC|al|AL|am|AM|ap|AP|ba|BA|ce|CE|df|DF|es|ES|go|GO|ma|MA|mg|MG|ms|MS|mt|MT|pa|PA|pb|PB|pe|PE|pi|PI|pr|PR|rj|RJ|rn|RN|ro|RO|rr|RR|rs|RS|sc|SC|se|SE|sp|SP|to|TO)$" "AB" +"^(ac|AC|al|AL|am|AM|ap|AP|ba|BA|ce|CE|df|DF|es|ES|go|GO|ma|MA|mg|MG|ms|MS|mt|MT|pa|PA|pb|PB|pe|PE|pi|PI|pr|PR|rj|RJ|rn|RN|ro|RO|rr|RR|rs|RS|sc|SC|se|SE|sp|SP|to|TO)$" "HJ" +"^[+]?\d*$" G "<0>0123456789" +"^[+]?\d*$" G "<0>1234" +"^[+]?\d*$" G "<0>1" +"^[+]?\d*$" "1.0?&" +"^[+]?\d*$" "a1" +"^[+]?\d*$" "2a-" +#/<[aA][ ]{0,}([a-zA-Z0-9"'_,.:;!?@$\&()%=\u002f ]|[\-]|[ \f]){0,}>((<(([a-zA-Z0-9"'_,.:;!?@$\&()%=\u002f ]|[\-]|[ \f]){0,})>([a-zA-Z0-9"'_,.:;!?@$\&()%=\u002f ]|[\-]|[ \f]){0,})|(([a-zA-Z0-9"'_,.:;!?@$\&()%=\u002f ]|[\-]|[ \f]){0,})){1,}/ G "<0>this text is italicized" #TODO: Need infinite loop breaking 
+#/<[aA][ ]{0,}([a-zA-Z0-9"'_,.:;!?@$\&()%=\u002f ]|[\-]|[ \f]){0,}>((<(([a-zA-Z0-9"'_,.:;!?@$\&()%=\u002f ]|[\-]|[ \f]){0,})>([a-zA-Z0-9"'_,.:;!?@$\&()%=\u002f ]|[\-]|[ \f]){0,})|(([a-zA-Z0-9"'_,.:;!?@$\&()%=\u002f ]|[\-]|[ \f]){0,})){1,}/ "

    " #TODO: need infinite loop breaking. +"^([0-1]?[0-9]|[2][0-3]):([0-5][0-9])$" G "<0>0:00" +"^([0-1]?[0-9]|[2][0-3]):([0-5][0-9])$" G "<0>23:00" +"^([0-1]?[0-9]|[2][0-3]):([0-5][0-9])$" G "<0>00:59" +"^([0-1]?[0-9]|[2][0-3]):([0-5][0-9])$" "0:0" +"^([0-1]?[0-9]|[2][0-3]):([0-5][0-9])$" "24:00" +"^([0-1]?[0-9]|[2][0-3]):([0-5][0-9])$" "00:60" +"^((0[1-9])|(1[0-2]))\/(\d{2})$" G "<0>11/03" +"^((0[1-9])|(1[0-2]))\/(\d{2})$" G "<0>01/04" +"^((0[1-9])|(1[0-2]))\/(\d{2})$" "13/03" +"^((0[1-9])|(1[0-2]))\/(\d{2})$" "10/2003" +"]*>[\w|\t|\r|\W]*" G '<0>' +"]*>[\w|\t|\r|\W]*" "--" +"]*>[\w|\t|\r|\W]*" "A-Z][a-z]+" +#"]*>[\w|\t|\r|\W]*" G "<0>strFirstName" # Test Case damaged? +#"]*>[\w|\t|\r|\W]*" G "<0>intAgeInYears" # Test Case damaged? +#"]*>[\w|\t|\r|\W]*" G "<0>Where the Wild Things Are" # Test Case damaged? +"]*>[\w|\t|\r|\W]*" "123" +"]*>[\w|\t|\r|\W]*" "abc" +"]*>[\w|\t|\r|\W]*" "this has no caps in it" +"(^-\d*\.?\d*[1-9]+\d*$)|(^-[1-9]+\d*\.\d*$)" G "<0>-0.050" +"(^-\d*\.?\d*[1-9]+\d*$)|(^-[1-9]+\d*\.\d*$)" G "<0>-5.000" +"(^-\d*\.?\d*[1-9]+\d*$)|(^-[1-9]+\d*\.\d*$)" G "<0>-5" +"(^-\d*\.?\d*[1-9]+\d*$)|(^-[1-9]+\d*\.\d*$)" "0" +"(^-\d*\.?\d*[1-9]+\d*$)|(^-[1-9]+\d*\.\d*$)" "0.0" +"(^-\d*\.?\d*[1-9]+\d*$)|(^-[1-9]+\d*\.\d*$)" ".0" +"^([2][0]\d{2}\/([0]\d|[1][0-2])\/([0-2]\d|[3][0-1]))$|^([2][0]\d{2}\/([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$" G "<0>2002/02/03" +"^([2][0]\d{2}\/([0]\d|[1][0-2])\/([0-2]\d|[3][0-1]))$|^([2][0]\d{2}\/([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$" G "<0>2002/02/03 12:12:18" +"^([2][0]\d{2}\/([0]\d|[1][0-2])\/([0-2]\d|[3][0-1]))$|^([2][0]\d{2}\/([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$" "2002/02/36" +"^([2][0]\d{2}\/([0]\d|[1][0-2])\/([0-2]\d|[3][0-1]))$|^([2][0]\d{2}\/([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$" "02/03/2002" +"^(\d|,)*\.?\d*$" G "<0>1,000" +"^(\d|,)*\.?\d*$" G "<0>3,000.05" 
+"^(\d|,)*\.?\d*$" G "<0>5,000,000" +"^(\d|,)*\.?\d*$" "abc" +"^(\d|,)*\.?\d*$" "$100,000" +"^(\d|,)*\.?\d*$" "Forty" +"^\d$" G "<0>1" +"^\d$" G "<0>2" +"^\d$" G "<0>3" +"^\d$" "a" +"^\d$" "324" +"^\d$" "num" +"^[0-9]+$" G "<0>1234567890" +"^[0-9]+$" G "<0>1234567890" +"^[0-9]+$" G "<0>1234567890" +"^[0-9]+$" "http://none" +"^[0-9]+$" "http://none" +"^[0-9]+$" "http://none" +"^.{4,8}$" G "<0>asdf" +"^.{4,8}$" G "<0>1234" +"^.{4,8}$" G "<0>asdf1234" +"^.{4,8}$" "asd" +"^.{4,8}$" "123" +"^.{4,8}$" "asdfe12345" +"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" G "<0>a@a.com" +"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" G "<0>a@a.com.au" +"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" G "<0>a@a.au" +"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" "word" +"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" "word@" +"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" "@word" +"^\d{5}-\d{4}$" G "<0>22222-3333" +"^\d{5}-\d{4}$" G "<0>34545-2367" +"^\d{5}-\d{4}$" G "<0>56334-2343" +"^\d{5}-\d{4}$" "123456789" +"^\d{5}-\d{4}$" "A3B 4C5" +"^\d{5}-\d{4}$" "55335" +"(a|b|c).(a.b)*.b+.c" G "<0>autbfc" +"(a|b|c).(a.b)*.b+.c" "attc" +'"((\\")|[^"(\\")])+"' G '<0>"test"' +'"((\\")|[^"(\\")])+"' G '<0>"escape\"quote"' +'"((\\")|[^"(\\")])+"' G '<0>"\\""' +'"((\\")|[^"(\\")])+"' "test" +'"((\\")|[^"(\\")])+"' '"test' +'"((\\")|[^"(\\")])+"' '""test\\"' +"((0[1-9])|(1[02]))/\d{2}" G "<0>01/00" +"((0[1-9])|(1[02]))/\d{2}" G "<0>12/99" +"((0[1-9])|(1[02]))/\d{2}" "13/00" +"((0[1-9])|(1[02]))/\d{2}" "12/AS" +"^[a-zA-Z]$" G "<0>a" +"^[a-zA-Z]$" G "<0>B" +"^[a-zA-Z]$" G "<0>c" +"^[a-zA-Z]$" "0" +"^[a-zA-Z]$" "&" +"^[a-zA-Z]$" "AbC" +"^[a-zA-Z]+$" G "<0>abc" +"^[a-zA-Z]+$" G "<0>ABC" +"^[a-zA-Z]+$" G "<0>aBcDeF" +"^[a-zA-Z]+$" "abc123" +"^[a-zA-Z]+$" "mr." +"^[a-zA-Z]+$" "a word" +"^\s*[a-zA-Z,\p{Zs}]+\s*$" G "<0>Smith, Ed" +"^\s*[a-zA-Z,\p{Zs}]+\s*$" G "<0>Ed Smith" +"^\s*[a-zA-Z,\p{Zs}]+\s*$" G "<0>aBcDeFgH" +"^\s*[a-zA-Z,\p{Zs}]+\s*$" "a123" +"^\s*[a-zA-Z,\p{Zs}]+\s*$" "AB5" +"^\s*[a-zA-Z,\p{Zs}]+\s*$" "Mr. 
Ed" +"(\w+?@\w+?\u002E.+)" G "<0>bob@vsnl.com" +"(\w+?@\w+?\u002E.+)" "[AABB]" +"^\d+$" G "<0>123" +"^\d+$" G "<0>10" +"^\d+$" G "<0>54" +"^\d+$" "-54" +"^\d+$" "54.234" +"^\d+$" "abc" +"^(\+|-)?\d+$" G "<0>-34" +"^(\+|-)?\d+$" G "<0>34" +"^(\+|-)?\d+$" G "<0>+5" +"^(\+|-)?\d+$" "abc" +"^(\+|-)?\d+$" "3.1415" +"^(\+|-)?\d+$" "-5.3" +"foo" G "<0>foo" +"foo" "bar" +"^[1-5]$" G "<0>1" +"^[1-5]$" G "<0>3" +"^[1-5]$" G "<0>4" +"^[1-5]$" "6" +"^[1-5]$" "23" +"^[1-5]$" "a" +"^[12345]$" G "<0>1" +"^[12345]$" G "<0>2" +"^[12345]$" G "<0>4" +"^[12345]$" "6" +"^[12345]$" "-1" +"^[12345]$" "abc" +"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" G "<0>joe@aol.com" +"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" G "<0>joe@wrox.co.uk" +"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" G "<0>joe@domain.info" +"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" "a@b" +"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" "notanemail" +"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" "joe@@." +"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" G "<0>joe@aol.com" +"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" G "<0>ssmith@aspalliance.com" +"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" G "<0>a@b.cc" +"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" "joe@123aspx.com" +"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" "joe@web.info" +"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" "joe@company.co.uk" +"[\w-]+@([\w-]+\.)+[\w-]+" G "<0>joe@aol.com" +"[\w-]+@([\w-]+\.)+[\w-]+" G "<0>a@b.c" +"[\w-]+@([\w-]+\.)+[\w-]+" "asdf" +"[\w-]+@([\w-]+\.)+[\w-]+" "1234" +"\d{4}-?\d{4}-?\d{4}-?\d{4}" G "<0>1234-1234-1234-1234" +"\d{4}-?\d{4}-?\d{4}-?\d{4}" G "<0>1234123412341234" +"\d{4}-?\d{4}-?\d{4}-?\d{4}" "1234123412345" +"^\d{5}$" G "<0>33333" +"^\d{5}$" G "<0>55555" +"^\d{5}$" G "<0>23445" +"^\d{5}$" "abcd" +"^\d{5}$" "1324" +"^\d{5}$" "as;lkjdf" +"(\w+)\s+\1" G "<0>hubba hubba" +"(\w+)\s+\1" G "<0>mandate dated" +"(\w+)\s+\1" G "<0>an annual" +"(\w+)\s+\1" "may day" +"(\w+)\s+\1" "gogo" +"(\w+)\s+\1" "1212" +"^[a-zA-Z0-9\-\.]+\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" G "<0>3SquareBand.com" +"^[a-zA-Z0-9\-\.]+\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" G 
"<0>asp.net" +"^[a-zA-Z0-9\-\.]+\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" G "<0>army.mil" +"^[a-zA-Z0-9\-\.]+\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" "$SquareBand.com" +"^[a-zA-Z0-9\-\.]+\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" "asp/dot.net" +"^[a-zA-Z0-9\-\.]+\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" "army.military" + diff --git a/go/mysql/icuregex/testdata/regextst_extended.txt b/go/mysql/icuregex/testdata/regextst_extended.txt new file mode 100644 index 00000000000..7824d8028a1 --- /dev/null +++ b/go/mysql/icuregex/testdata/regextst_extended.txt @@ -0,0 +1,88 @@ +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (c) 2001-2015 International Business Machines +# Corporation and others. All Rights Reserved. +# +# file: +# +# ICU regular expression test cases. +# +# format: one test case per line, +# <test case> = <pattern> <flags> <match string> [# comment] +# <pattern> = "<regular expression pattern>" +# <match string> = "<tagged string>" +# the quotes on the pattern and match string can be " or ' or / +# <tagged string> = text, with the start and end of each +# capture group tagged with <n>...</n>. The overall match, +# if any, is group 0, as in <0>matched text</0> +# A region can be specified with <r>...</r> tags. +# Standard ICU unescape will be applied, allowing \u, \U, etc. to appear. +# +# <flags> = any combination of +# i case insensitive match +# x free spacing and comments +# s dot-matches-all mode +# m multi-line mode. +# ($ and ^ match at embedded new-lines) +# D Unix Lines mode (only recognize 0x0a as new-line) +# Q UREGEX_LITERAL flag. Entire pattern is literal string. +# v If icu configured without break iteration, this +# regex test pattern should not compile. +# e set the UREGEX_ERROR_ON_UNKNOWN_ESCAPES flag +# d dump the compiled pattern +# t trace operation of match engine. +# 2-9 a digit between 2 and 9, specifies the number of +# times to execute find(). The expected results are +# for the last find() in the sequence. +# G Only check match / no match. Do not check capture groups.
+# E Pattern compilation error expected +# L Use LookingAt() rather than find() +# M Use matches() rather than find(). +# +# a Use non-Anchoring Bounds. +# b Use Transparent Bounds. +# The a and b options only make a difference if +# a region has been specified in the string. +# z|Z hitEnd was expected(z) or not expected (Z). +# With neither, hitEnd is not checked. +# y|Y Require End expected(y) or not expected (Y). +# +# White space must be present between the flags and the match string. +# + +"[:xdigit:]" " <0>4f" +"\P{XDIGIT}+" "4f<0> " + +"[:blank:]" "<0> 4f" +"\P{BLANK}+" "<0>4f " + +"[:print:]" "<0> 4f\x07" +"\P{PRINT}+" " 4f<0>\x07" + +"\p{Age=1.1}" "<0>4f🥱" +"\p{Age=11}" "4f🥱" +"\p{Age=12}" "4f<0>🥱" + +"\p{Name=LATIN SMALL LETTER B}" "Good<0>bye" + +"\p{Numeric_Value=3}" "Good<0>3ye" +"\p{Numeric_Value=14}" "Good<0>⑭ye" + +"\p{Script_Extensions=Greek}" "Good<0>βye" + +"\p{Bidi_Control}" "Good<0>\u200Eye" +"\p{Bidi_Class=LeftToRight}" "<0>Goodbye" +"\p{Bidi_Class=RightToLeft}" "Goodbye" +"\p{Bidi_Class=LeftToRight}" "؈" + +"\p{Soft_Dotted}" "Good<0>iye" + +"\p{Changes_When_Lowercased}" "<0>Goodbye" +"\p{Changes_When_Titlecased}" "<0>goodbye" +"\p{Changes_When_Uppercased}" "G<0>oodbye" +"\p{Changes_When_CaseMapped}" " <0>Goodbye3" +"\p{Cased}" " <0>Goodbye3" + +"\p{Indic_Syllabic_Category=Avagraha}" "foo<0>\u09BDbar" +"\p{IndicPositionalCategory=Top_And_Left_And_Right}" "foo<0>\u0B4Cbar" +"\p{VerticalOrientation=U}" "foo<0>\uA015bar" \ No newline at end of file From 8dfdd6998dd0db74952eb09e6a4c8af3815190d4 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Wed, 28 Jun 2023 11:01:10 +0200 Subject: [PATCH 02/18] icuregex: implement freeze set optimization Signed-off-by: Vicent Marti --- go/mysql/icuregex/internal/uset/frozen.go | 321 ++++++++++++++++++ .../icuregex/internal/uset/unicode_set.go | 71 ++++ go/mysql/icuregex/sets.go | 28 +- go/mysql/icuregex/sets_test.go | 16 + 4 files changed, 422 insertions(+), 14 deletions(-) create mode 100644 
// go/mysql/icuregex/internal/uset/frozen.go
//
// Patch metadata for this new file, preserved verbatim from the enclosing
// patch series (the file itself declares `package uset`):
//
//	diff --git a/go/mysql/icuregex/internal/uset/frozen.go b/go/mysql/icuregex/internal/uset/frozen.go
//	new file mode 100644
//	index 00000000000..308b8fb6aca
//	--- /dev/null
//	+++ b/go/mysql/icuregex/internal/uset/frozen.go
//	@@ -0,0 +1,321 @@

// frozen holds the lookup tables that freeze precomputes from a set's
// inversion list. The tables answer membership for Latin-1, for
// U+0100..U+07FF and for whole 64-code-point BMP blocks without a binary
// search; everything else falls back to a binary search restricted by
// list4kStarts.
//
// frozen does not keep a reference to the inversion list: containsSlow and
// findCodePoint must be handed the same list that was passed to freeze.
type frozen struct {
	// One byte 0 or 1 per Latin-1 character.
	latin1Contains [0x100]byte

	// true if contains(U+FFFD)
	containsFFFD bool

	/*
	 * One bit per code point from U+0000..U+07FF.
	 * The bits are organized vertically; consecutive code points
	 * correspond to the same bit positions in consecutive table words.
	 * With code point parts
	 *   lead=c{10..6}
	 *   trail=c{5..0}
	 * it is set.contains(c)==(table7FF[trail] bit lead)
	 *
	 * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD)
	 * for faster validity checking at runtime.
	 */
	table7FF [64]uint32

	/*
	 * One bit per 64 BMP code points.
	 * The bits are organized vertically; consecutive 64-code point blocks
	 * correspond to the same bit position in consecutive table words.
	 * With code point parts
	 *   lead=c{15..12}
	 *   t1=c{11..6}
	 * test bits (lead+16) and lead in bmpBlockBits[t1].
	 * If the upper bit is 0, then the lower bit indicates if contains(c)
	 * for all code points in the 64-block.
	 * If the upper bit is 1, then the block is mixed and set.contains(c)
	 * must be called.
	 *
	 * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to
	 * the result of contains(FFFD) for faster validity checking at runtime.
	 */
	bmpBlockBits [64]uint32

	/*
	 * Inversion list indexes for restricted binary searches in
	 * findCodePoint(), from
	 * findCodePoint(U+0800, U+1000, U+2000, .., U+F000, U+10000).
	 * U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
	 * always looked up in the bit tables.
	 * The last pair of indexes is for finding supplementary code points.
	 */
	list4kStarts [18]int32
}

// freeze builds the lookup tables for the inversion list `list`.
//
// list must be a non-empty inversion list whose last element is the
// terminating HIGH value (0x110000); the function indexes list[0] and
// list[len(list)-1] unconditionally. list is only read, never modified.
func freeze(list []rune) *frozen {
	f := &frozen{}

	listEnd := int32(len(list) - 1)

	// Each 4k-block start index is searched for from the previous one, so the
	// whole table costs a single left-to-right sweep over the list.
	f.list4kStarts[0] = f.findCodePoint(list, 0x800, 0, listEnd)
	for i := 1; i <= 0x10; i++ {
		f.list4kStarts[i] = f.findCodePoint(list, rune(i)<<12, f.list4kStarts[i-1], listEnd)
	}
	f.list4kStarts[0x11] = listEnd
	f.containsFFFD = f.containsSlow(list, 0xfffd, f.list4kStarts[0xf], f.list4kStarts[0x10])

	f.initBits(list)
	f.overrideIllegal()

	return f
}

// containsSlow reports whether c is in the set described by list, using a
// binary search restricted to the index window [lo, hi]. An odd result from
// findCodePoint means c lies inside a range.
func (f *frozen) containsSlow(list []rune, c rune, lo, hi int32) bool {
	return (f.findCodePoint(list, c, lo, hi) & 1) != 0
}

// findCodePoint returns the smallest index i in [lo, hi] such that
// c < list[i]. The caller must guarantee c < list[hi].
func (f *frozen) findCodePoint(list []rune, c rune, lo, hi int32) int32 {
	/* Examples:
	                                   findCodePoint(c)
	   set              list[]         c=0 1 3 4 7 8
	   ===              ==============   ===========
	   []               [110000]         0 0 0 0 0 0
	   [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
	   [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
	   [:Any:]          [0, 110000]      1 1 1 1 1 1
	*/

	// Return the smallest i such that c < list[i].  Assume
	// list[len - 1] == HIGH and that c is legal (0..HIGH-1).
	if c < list[lo] {
		return lo
	}
	// High runner test.  c is often after the last range, so an
	// initial check for this condition pays off.
	if lo >= hi || c >= list[hi-1] {
		return hi
	}
	// invariant: c >= list[lo]
	// invariant: c < list[hi]
	for {
		i := (lo + hi) >> 1
		if i == lo {
			break // Found!
		} else if c < list[i] {
			hi = i
		} else {
			lo = i
		}
	}
	return hi
}

// set32x64bits sets the bits for the half-open range [start, limit) in a
// vertically organized 64-word table: value v maps to bit (v>>6) of word
// (v&0x3f). Requires start < limit and limit <= 0x800.
func (f *frozen) set32x64bits(table *[64]uint32, start, limit int32) {
	// U_ASSERT(start < limit)
	// U_ASSERT(limit <= 0x800)

	lead := start >> 6    // Named for UTF-8 2-byte lead byte with upper 5 bits.
	trail := start & 0x3f // Named for UTF-8 2-byte trail byte with lower 6 bits.

	// Set one bit indicating an all-one block.
	bits := uint32(1) << lead
	if (start + 1) == limit { // Single-character shortcut.
		table[trail] |= bits
		return
	}

	limitLead := limit >> 6
	limitTrail := limit & 0x3f

	if lead == limitLead {
		// Partial vertical bit column.
		for trail < limitTrail {
			table[trail] |= bits
			trail++
		}
	} else {
		// Partial vertical bit column,
		// followed by a bit rectangle,
		// followed by another partial vertical bit column.
		if trail > 0 {
			for {
				table[trail] |= bits
				trail++
				if trail >= 64 {
					break
				}
			}
			lead++
		}
		if lead < limitLead {
			bits = ^((uint32(1) << lead) - 1)
			if limitLead < 0x20 {
				bits &= (uint32(1) << limitLead) - 1
			}
			for trail = 0; trail < 64; trail++ {
				table[trail] |= bits
			}
		}
		// limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
		// In that case, bits=1<<limitLead is undefined in the C++ original, but
		// the bits value is not used because trail<limitTrail is already false.
		// NOTE(review): this tail was lost from the patch text and is restored
		// from ICU's BMPSet::set32x64Bits — confirm against upstream.
		if limitLead == 0x20 {
			bits = uint32(1) << (limitLead - 1)
		} else {
			bits = uint32(1) << limitLead
		}
		for trail = 0; trail < limitTrail; trail++ {
			table[trail] |= bits
		}
	}
}

// overrideIllegal overwrites the table bits that correspond to ill-formed
// UTF-8 (the non-shortest forms below U+0080 in table7FF and below U+0800 in
// bmpBlockBits, and the surrogates U+D800..U+DFFF in bmpBlockBits) with the
// result of contains(U+FFFD).
//
// NOTE(review): the text of this function was lost from the patch and is
// restored from ICU's BMPSet::overrideIllegal — confirm against upstream.
func (f *frozen) overrideIllegal() {
	var bits, mask uint32
	var i int

	if f.containsFFFD {
		bits = 3 // Lead bytes 0xC0 and 0xC1.
		for i = 0; i < 64; i++ {
			f.table7FF[i] |= bits
		}

		bits = 1                 // Lead byte 0xE0.
		for i = 0; i < 32; i++ { // First half of 4k block.
			f.bmpBlockBits[i] |= bits
		}

		mask = ^(uint32(0x10001) << 0xd) // Lead byte 0xED.
		bits = uint32(1) << 0xd
		for i = 32; i < 64; i++ { // Second half of 4k block.
			f.bmpBlockBits[i] = (f.bmpBlockBits[i] & mask) | bits
		}
	} else {
		mask = ^(uint32(0x10001) << 0xd) // Lead byte 0xED.
		for i = 32; i < 64; i++ {        // Second half of 4k block.
			f.bmpBlockBits[i] &= mask
		}
	}
}

// initBits fills latin1Contains, table7FF and bmpBlockBits by walking the
// [start, limit) pairs of the inversion list once per table. A range that
// has no closing element is treated as extending to 0x110000.
//
// NOTE(review): the opening of this function (up to the first
// `start >= 0x100` check) was lost from the patch text and is restored from
// ICU's BMPSet::initBits — confirm against upstream.
func (f *frozen) initBits(list []rune) {
	var start, limit rune
	var listIndex int

	// Set latin1Contains[].
	for {
		start = list[listIndex]
		listIndex++
		if listIndex < len(list) {
			limit = list[listIndex]
			listIndex++
		} else {
			limit = 0x110000
		}
		if start >= 0x100 {
			break
		}
		for {
			f.latin1Contains[start] = 1
			start++
			if start >= limit || start >= 0x100 {
				break
			}
		}
		if limit > 0x100 {
			break
		}
	}

	// Find the first range overlapping with (or after) 80..FF again,
	// to include them in table7FF as well.
	listIndex = 0
	for {
		start = list[listIndex]
		listIndex++
		if listIndex < len(list) {
			limit = list[listIndex]
			listIndex++
		} else {
			limit = 0x110000
		}
		if limit > 0x80 {
			if start < 0x80 {
				start = 0x80
			}
			break
		}
	}

	// Set table7FF[].
	for start < 0x800 {
		var end rune
		if limit <= 0x800 {
			end = limit
		} else {
			end = 0x800
		}
		f.set32x64bits(&f.table7FF, start, end)
		if limit > 0x800 {
			start = 0x800
			break
		}

		start = list[listIndex]
		listIndex++
		if listIndex < len(list) {
			limit = list[listIndex]
			listIndex++
		} else {
			limit = 0x110000
		}
	}

	// Set bmpBlockBits[].
	minStart := rune(0x800)
	for start < 0x10000 {
		if limit > 0x10000 {
			limit = 0x10000
		}

		if start < minStart {
			start = minStart
		}
		if start < limit { // Else: Another range entirely in a known mixed-value block.
			if (start & 0x3f) != 0 {
				// Mixed-value block of 64 code points.
				start >>= 6
				f.bmpBlockBits[start&0x3f] |= 0x10001 << (start >> 6)
				start = (start + 1) << 6 // Round up to the next block boundary.
				minStart = start         // Ignore further ranges in this block.
			}
			if start < limit {
				if start < (limit &^ 0x3f) {
					// Multiple all-ones blocks of 64 code points each.
					f.set32x64bits(&f.bmpBlockBits, start>>6, limit>>6)
				}

				if (limit & 0x3f) != 0 {
					// Mixed-value block of 64 code points.
					limit >>= 6
					f.bmpBlockBits[limit&0x3f] |= 0x10001 << (limit >> 6)
					limit = (limit + 1) << 6 // Round up to the next block boundary.
					minStart = limit         // Ignore further ranges in this block.
				}
			}
		}

		if limit == 0x10000 {
			break
		}

		start = list[listIndex]
		listIndex++
		if listIndex < len(list) {
			limit = list[listIndex]
			listIndex++
		} else {
			limit = 0x110000
		}
	}
}

// Patch metadata introducing the next file of the series, preserved verbatim
// because it shared a physical line with the end of frozen.go:
//
//	diff --git a/go/mysql/icuregex/internal/uset/unicode_set.go b/go/mysql/icuregex/internal/uset/unicode_set.go
//	index 56280265444..7e1b1de20c2 100644
//	--- a/go/mysql/icuregex/internal/uset/unicode_set.go
//	+++ b/go/mysql/icuregex/internal/uset/unicode_set.go
//	@@ -22,6 +22,8 @@ limitations under the License.
package uset import ( + "fmt" + "golang.org/x/exp/slices" "vitess.io/vitess/go/mysql/icuregex/internal/uprops" @@ -53,6 +55,7 @@ const ( type UnicodeSet struct { list []rune buffer []rune + frozen *frozen } func New() *UnicodeSet { @@ -93,6 +96,9 @@ func (u *UnicodeSet) ensureBufferCapacity(c int) { } func (u *UnicodeSet) addbuffer(other []rune, polarity int8) { + if u.frozen != nil { + panic("UnicodeSet is frozen") + } u.ensureBufferCapacity(len(u.list) + len(other)) i := 1 @@ -236,6 +242,10 @@ func pinCodePoint(c *rune) rune { } func (u *UnicodeSet) AddRune(c rune) { + if u.frozen != nil { + panic("UnicodeSet is frozen") + } + // find smallest i such that c < list[i] // if odd, then it is IN the set // if even, then it is OUT of the set @@ -342,6 +352,9 @@ func (u *UnicodeSet) AddAll(u2 *UnicodeSet) { } func (u *UnicodeSet) Complement() { + if u.frozen != nil { + panic("UnicodeSet is frozen") + } if u.list[0] == UNICODESET_LOW { copy(u.list, u.list[1:]) u.list = u.list[:len(u.list)-1] @@ -366,6 +379,10 @@ func (u *UnicodeSet) RetainAll(c *UnicodeSet) { } func (u *UnicodeSet) retain(other []rune, polarity int8) { + if u.frozen != nil { + panic("UnicodeSet is frozen") + } + u.ensureBufferCapacity(len(u.list) + len(other)) i := 1 @@ -481,6 +498,9 @@ loop_end: } func (u *UnicodeSet) Clear() { + if u.frozen != nil { + panic("UnicodeSet is frozen") + } u.list = u.list[:1] u.list[0] = UNICODESET_HIGH } @@ -530,6 +550,32 @@ func (u *UnicodeSet) RuneAt(idx int) rune { } func (u *UnicodeSet) ContainsRune(c rune) bool { + if f := u.frozen; f != nil { + if c <= 0xff { + return f.latin1Contains[c] != 0 + } else if c <= 0x7ff { + return (f.table7FF[c&0x3f] & (uint32(1) << (c >> 6))) != 0 + } else if c < 0xd800 || (c >= 0xe000 && c <= 0xffff) { + lead := c >> 12 + twoBits := (f.bmpBlockBits[(c>>6)&0x3f] >> lead) & 0x10001 + if twoBits <= 1 { + // All 64 code points with the same bits 15..6 + // are either in the set or not. 
+ return twoBits != 0 + } else { + // Look up the code point in its 4k block of code points. + return f.containsSlow(u.list, c, f.list4kStarts[lead], f.list4kStarts[lead+1]) + } + } else if c <= 0x10ffff { + // surrogate or supplementary code point + return f.containsSlow(u.list, c, f.list4kStarts[0xd], f.list4kStarts[0x11]) + } else { + // Out-of-range code points get FALSE, consistent with long-standing + // behavior of UnicodeSet::contains(c). + return false + } + } + if c >= UNICODESET_HIGH { return false } @@ -645,9 +691,34 @@ func (u *UnicodeSet) IsEmpty() bool { } func (u *UnicodeSet) CopyFrom(set *UnicodeSet) { + if u.frozen != nil { + panic("UnicodeSet is frozen") + } u.list = slices.Clone(set.list) } func (u *UnicodeSet) Equals(other *UnicodeSet) bool { return slices.Equal(u.list, other.list) } + +func (u *UnicodeSet) Freeze() *UnicodeSet { + u.frozen = freeze(u.list) + return u +} + +func (u *UnicodeSet) FreezeCheck_() error { + if u == nil { + return nil + } + if u.frozen == nil { + return fmt.Errorf("UnicodeSet is not frozen") + } + for r := rune(0); r <= 0x10ffff; r++ { + want := (u.findCodePoint(r) & 1) != 0 + got := u.ContainsRune(r) + if want != got { + return fmt.Errorf("rune '%c' (U+%04X) did not freeze", r, r) + } + } + return nil +} diff --git a/go/mysql/icuregex/sets.go b/go/mysql/icuregex/sets.go index 9d362e748cb..04304f93820 100644 --- a/go/mysql/icuregex/sets.go +++ b/go/mysql/icuregex/sets.go @@ -36,12 +36,12 @@ func init() { s.AddAll(uset.MustParsePattern(`\p{Pc}`, 0)) s.AddRune(0x200c) s.AddRune(0x200d) - return s + return s.Freeze() }() - staticPropertySets[URX_ISSPACE_SET] = uset.MustParsePattern(`\p{Whitespace}`, 0) + staticPropertySets[URX_ISSPACE_SET] = uset.MustParsePattern(`\p{Whitespace}`, 0).Freeze() - staticPropertySets[URX_GC_EXTEND] = uset.MustParsePattern(`\p{Grapheme_Extend}`, 0) + staticPropertySets[URX_GC_EXTEND] = uset.MustParsePattern(`\p{Grapheme_Extend}`, 0).Freeze() staticPropertySets[URX_GC_CONTROL] = func() 
*uset.UnicodeSet { s := uset.New() s.AddAll(uset.MustParsePattern(`[:Zl:]`, 0)) @@ -49,13 +49,13 @@ func init() { s.AddAll(uset.MustParsePattern(`[:Cc:]`, 0)) s.AddAll(uset.MustParsePattern(`[:Cf:]`, 0)) s.RemoveAll(uset.MustParsePattern(`[:Grapheme_Extend:]`, 0)) - return s + return s.Freeze() }() - staticPropertySets[URX_GC_L] = uset.MustParsePattern(`\p{Hangul_Syllable_Type=L}`, 0) - staticPropertySets[URX_GC_LV] = uset.MustParsePattern(`\p{Hangul_Syllable_Type=LV}`, 0) - staticPropertySets[URX_GC_LVT] = uset.MustParsePattern(`\p{Hangul_Syllable_Type=LVT}`, 0) - staticPropertySets[URX_GC_V] = uset.MustParsePattern(`\p{Hangul_Syllable_Type=V}`, 0) - staticPropertySets[URX_GC_T] = uset.MustParsePattern(`\p{Hangul_Syllable_Type=T}`, 0) + staticPropertySets[URX_GC_L] = uset.MustParsePattern(`\p{Hangul_Syllable_Type=L}`, 0).Freeze() + staticPropertySets[URX_GC_LV] = uset.MustParsePattern(`\p{Hangul_Syllable_Type=LV}`, 0).Freeze() + staticPropertySets[URX_GC_LVT] = uset.MustParsePattern(`\p{Hangul_Syllable_Type=LVT}`, 0).Freeze() + staticPropertySets[URX_GC_V] = uset.MustParsePattern(`\p{Hangul_Syllable_Type=V}`, 0).Freeze() + staticPropertySets[URX_GC_T] = uset.MustParsePattern(`\p{Hangul_Syllable_Type=T}`, 0).Freeze() staticPropertySets[URX_GC_NORMAL] = func() *uset.UnicodeSet { s := uset.New() @@ -65,14 +65,14 @@ func init() { s.RemoveAll(staticPropertySets[URX_GC_L]) s.RemoveAll(staticPropertySets[URX_GC_V]) s.RemoveAll(staticPropertySets[URX_GC_T]) - return s + return s.Freeze() }() } var staticSetUnescape = func() *uset.UnicodeSet { u := uset.New() u.AddString("acefnrtuUx") - return u + return u.Freeze() }() const ( @@ -86,18 +86,18 @@ var staticRuleSet = [kRuleSetCount]*uset.UnicodeSet{ func() *uset.UnicodeSet { u := uset.New() u.AddRuneRange('0', '9') - return u + return u.Freeze() }(), func() *uset.UnicodeSet { u := uset.New() u.AddRuneRange('A', 'Z') u.AddRuneRange('a', 'z') - return u + return u.Freeze() }(), func() *uset.UnicodeSet { u := uset.New() 
u.AddString("*?+[(){}^$|\\.") u.Complement() - return u + return u.Freeze() }(), } diff --git a/go/mysql/icuregex/sets_test.go b/go/mysql/icuregex/sets_test.go index e5e5200227a..d33552732f2 100644 --- a/go/mysql/icuregex/sets_test.go +++ b/go/mysql/icuregex/sets_test.go @@ -48,3 +48,19 @@ func TestStaticSetContents(t *testing.T) { } } } + +func TestStaticFreeze(t *testing.T) { + for _, s := range staticPropertySets { + if err := s.FreezeCheck_(); err != nil { + t.Error(err) + } + } + for _, s := range staticRuleSet { + if err := s.FreezeCheck_(); err != nil { + t.Error(err) + } + } + if err := staticSetUnescape.FreezeCheck_(); err != nil { + t.Error(err) + } +} From 70938f3528b61d9d78eb29b29134bb63ea33a5bf Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Wed, 28 Jun 2023 18:29:11 +0200 Subject: [PATCH 03/18] evalengine: wire up regex Signed-off-by: Vicent Marti --- go/mysql/collations/charset/convert.go | 32 ++++++ go/mysql/icuregex/compiler.go | 4 +- go/mysql/icuregex/icu_test.go | 6 +- go/mysql/icuregex/matcher.go | 8 +- go/mysql/icuregex/pattern.go | 8 +- go/mysql/icuregex/perl_test.go | 14 +-- go/vt/vtgate/evalengine/fn_regexp.go | 114 +++++++++++++++++++ go/vt/vtgate/evalengine/mysql_test.go | 2 +- go/vt/vtgate/evalengine/testcases/cases.go | 40 +++++++ go/vt/vtgate/evalengine/translate.go | 8 ++ go/vt/vtgate/evalengine/translate_builtin.go | 26 +++++ 11 files changed, 242 insertions(+), 20 deletions(-) create mode 100644 go/vt/vtgate/evalengine/fn_regexp.go diff --git a/go/mysql/collations/charset/convert.go b/go/mysql/collations/charset/convert.go index 1c0ced27e4e..a54e8f4d718 100644 --- a/go/mysql/collations/charset/convert.go +++ b/go/mysql/collations/charset/convert.go @@ -126,6 +126,38 @@ func Convert(dst []byte, dstCharset Charset, src []byte, srcCharset Charset) ([] } } +func Expand(dst []rune, src []byte, srcCharset Charset) []rune { + switch srcCharset := srcCharset.(type) { + case Charset_utf8mb3, Charset_utf8mb4: + if dst == nil { + return 
[]rune(string(src)) + } + dst = make([]rune, 0, len(src)) + for _, cp := range string(src) { + dst = append(dst, cp) + } + return dst + case Charset_binary: + if dst == nil { + dst = make([]rune, 0, len(src)) + } + for _, c := range src { + dst = append(dst, rune(c)) + } + return dst + default: + if dst == nil { + dst = make([]rune, 0, len(src)) + } + for len(src) > 0 { + cp, width := srcCharset.DecodeRune(src) + src = src[width:] + dst = append(dst, cp) + } + return dst + } +} + func ConvertFromUTF8(dst []byte, dstCharset Charset, src []byte) ([]byte, error) { return Convert(dst, dstCharset, src, Charset_utf8mb4{}) } diff --git a/go/mysql/icuregex/compiler.go b/go/mysql/icuregex/compiler.go index cef7f26623b..c98fe0efb46 100644 --- a/go/mysql/icuregex/compiler.go +++ b/go/mysql/icuregex/compiler.go @@ -96,8 +96,7 @@ type Compiler struct { newModeFlags RegexpFlag setModeFlag bool - literalChars []rune - patternLength int + literalChars []rune parenStack []int matchOpenParen int @@ -316,7 +315,6 @@ func (c *Compiler) compile(pat string) error { c.out.pattern = pat c.p = pat - c.patternLength = utf8.RuneCountInString(pat) var state uint16 = 1 var table []regexTableEl diff --git a/go/mysql/icuregex/icu_test.go b/go/mysql/icuregex/icu_test.go index 1e766ac35ed..7c7c266783d 100644 --- a/go/mysql/icuregex/icu_test.go +++ b/go/mysql/icuregex/icu_test.go @@ -252,7 +252,7 @@ func (tp *TestPattern) Test(t testing.TB) bool { err = fmt.Errorf("PANIC: %v", r) } }() - re, err = icuregex.Compile(tp.Pattern, tp.Flags) + re, err = icuregex.CompileString(tp.Pattern, tp.Flags) return }() if err != nil { @@ -396,7 +396,7 @@ func TestCornerCases(t *testing.T) { for _, tc := range cases { t.Run(tc.Pattern, func(t *testing.T) { - _, err := icuregex.Compile(tc.Pattern, tc.Flags) + _, err := icuregex.CompileString(tc.Pattern, tc.Flags) if err != nil { t.Fatal(err) } @@ -411,7 +411,7 @@ func TestOne(t *testing.T) { const Input = "foo\u09BDbar" const Flags = 0 - re, err := 
icuregex.Compile(Pattern, Flags) + re, err := icuregex.CompileString(Pattern, Flags) if err != nil { t.Fatalf("compilation failed: %v", err) } diff --git a/go/mysql/icuregex/matcher.go b/go/mysql/icuregex/matcher.go index 842b3b458d9..7fc727be083 100644 --- a/go/mysql/icuregex/matcher.go +++ b/go/mysql/icuregex/matcher.go @@ -1600,8 +1600,12 @@ func (m *Matcher) followingGCBoundary(pos int) int { panic("TODO") } -func (m *Matcher) Reset(input string) { - m.input = []rune(input) +func (m *Matcher) ResetString(input string) { + m.Reset([]rune(input)) +} + +func (m *Matcher) Reset(input []rune) { + m.input = input m.reset() } diff --git a/go/mysql/icuregex/pattern.go b/go/mysql/icuregex/pattern.go index 26fbc5ff88f..6e67410bd2c 100644 --- a/go/mysql/icuregex/pattern.go +++ b/go/mysql/icuregex/pattern.go @@ -59,15 +59,15 @@ func NewPattern(flags RegexpFlag) *Pattern { } } -func MustCompile(in string, flags RegexpFlag) *Pattern { - pat, err := Compile(in, flags) +func MustCompileString(in string, flags RegexpFlag) *Pattern { + pat, err := CompileString(in, flags) if err != nil { panic(err) } return pat } -func Compile(in string, flags RegexpFlag) (*Pattern, error) { +func CompileString(in string, flags RegexpFlag) (*Pattern, error) { pat := NewPattern(flags) cmp := NewCompiler(pat) if err := cmp.compile(in); err != nil { @@ -78,7 +78,7 @@ func Compile(in string, flags RegexpFlag) (*Pattern, error) { func (p *Pattern) Match(input string) *Matcher { m := NewMatcher(p) - m.Reset(input) + m.ResetString(input) return m } diff --git a/go/mysql/icuregex/perl_test.go b/go/mysql/icuregex/perl_test.go index b607df5349c..8958bc4fcfd 100644 --- a/go/mysql/icuregex/perl_test.go +++ b/go/mysql/icuregex/perl_test.go @@ -38,13 +38,13 @@ func TestPerl(t *testing.T) { } defer f.Close() - flagPat := MustCompile(`('?)(.*)\1(.*)`, 0) + flagPat := MustCompileString(`('?)(.*)\1(.*)`, 0) flagMat := flagPat.Matcher() - groupsPat := MustCompile(`\$([+\-])\[(\d+)\]`, 0) + groupsPat := 
MustCompileString(`\$([+\-])\[(\d+)\]`, 0) groupsMat := groupsPat.Matcher() - cgPat := MustCompile(`\$(\d+)`, 0) + cgPat := MustCompileString(`\$(\d+)`, 0) cgMat := cgPat.Matcher() group := func(m *Matcher, idx int) string { @@ -73,7 +73,7 @@ func TestPerl(t *testing.T) { lineno++ fields := strings.Split(scanner.Text(), "\t") - flagMat.Reset(fields[0]) + flagMat.ResetString(fields[0]) ok, _ := flagMat.Matches() if !ok { t.Fatalf("could not match pattern+flags (line %d)", lineno) @@ -94,7 +94,7 @@ func TestPerl(t *testing.T) { flags |= UREGEX_COMMENTS } - testPat, err := Compile(pattern, flags) + testPat, err := CompileString(pattern, flags) if err != nil { if cerr, ok := err.(*CompileError); ok && cerr.Code == uerror.U_REGEX_UNIMPLEMENTED { continue @@ -134,8 +134,8 @@ func TestPerl(t *testing.T) { var perlExpr = fields[3] for len(perlExpr) > 0 { - groupsMat.Reset(perlExpr) - cgMat.Reset(perlExpr) + groupsMat.ResetString(perlExpr) + cgMat.ResetString(perlExpr) switch { case strings.HasPrefix(perlExpr, "$&"): diff --git a/go/vt/vtgate/evalengine/fn_regexp.go b/go/vt/vtgate/evalengine/fn_regexp.go new file mode 100644 index 00000000000..1128b1155d9 --- /dev/null +++ b/go/vt/vtgate/evalengine/fn_regexp.go @@ -0,0 +1,114 @@ +package evalengine + +import ( + "strings" + + "vitess.io/vitess/go/hack" + "vitess.io/vitess/go/mysql/collations" + "vitess.io/vitess/go/mysql/collations/charset" + "vitess.io/vitess/go/mysql/icuregex" + "vitess.io/vitess/go/sqltypes" + querypb "vitess.io/vitess/go/vt/proto/query" +) + +type builtinRegexpLike struct { + CallExpr + Negate bool +} + +func evalRegexpFlags(env *ExpressionEnv, match Expr, flags icuregex.RegexpFlag) (icuregex.RegexpFlag, error) { + m, err := match.eval(env) + if err != nil || m == nil { + return flags, err + } + + switch m := m.(type) { + case *evalBytes: + for _, b := range m.bytes { + switch b { + case 'c': + flags &= ^icuregex.UREGEX_CASE_INSENSITIVE + case 'i': + flags |= icuregex.UREGEX_CASE_INSENSITIVE + case 'm': 
+ flags |= icuregex.UREGEX_MULTILINE + case 'n': + flags |= icuregex.UREGEX_DOTALL + case 'u': + flags |= icuregex.UREGEX_UNIX_LINES + } + } + } + + return flags, nil +} + +func regexpMatcher(input, pat, flags eval) (*icuregex.Matcher, error) { + +} + +func (r *builtinRegexpLike) eval(env *ExpressionEnv) (eval, error) { + input, err := r.Arguments[0].eval(env) + if err != nil || input == nil { + return nil, err + } + + pat, err := r.Arguments[1].eval(env) + if err != nil || pat == nil { + return nil, err + } + + var colid collations.ID + input, pat, colid, err = mergeAndCoerceCollations(input, pat) + if err != nil { + return nil, err + } + + var flags icuregex.RegexpFlag + var collation = colid.Get() + if strings.Contains(collation.Name(), "_ci") { + flags |= icuregex.UREGEX_CASE_INSENSITIVE + } + + if len(r.Arguments) > 2 { + flags, err = evalRegexpFlags(env, r.Arguments[2], flags) + if err != nil { + return nil, err + } + } + + patUtf8, err := charset.Convert(nil, &charset.Charset_utf8mb4{}, pat.ToRawBytes(), collation.Charset()) + if err != nil { + return nil, err + } + + regexp, err := icuregex.CompileString(hack.String(patUtf8), flags) + if err != nil { + return nil, err + } + + inputRunes := charset.Expand(nil, input.ToRawBytes(), collation.Charset()) + m := icuregex.NewMatcher(regexp) + m.Reset(inputRunes) + + ok, err := m.Matches() + if err != nil { + return nil, err + } + if r.Negate { + ok = !ok + } + return newEvalBool(ok), nil +} + +func (r *builtinRegexpLike) typeof(env *ExpressionEnv, fields []*querypb.Field) (sqltypes.Type, typeFlag) { + _, f1 := r.Arguments[0].typeof(env, fields) + _, f2 := r.Arguments[1].typeof(env, fields) + return sqltypes.Int64, f1 | f2 +} + +func (r *builtinRegexpLike) compile(c *compiler) (ctype, error) { + return ctype{}, c.unsupported(r) +} + +var _ Expr = (*builtinRegexpLike)(nil) diff --git a/go/vt/vtgate/evalengine/mysql_test.go b/go/vt/vtgate/evalengine/mysql_test.go index 18802cfb8dc..987ad906b88 100644 --- 
a/go/vt/vtgate/evalengine/mysql_test.go +++ b/go/vt/vtgate/evalengine/mysql_test.go @@ -147,6 +147,6 @@ func TestMySQLGolden(t *testing.T) { func TestDebug1(t *testing.T) { // Debug - eval, err := testSingle(t, `SELECT DATE_SUB(TIMESTAMP'2025-01-01 00:00:00', INTERVAL '1.999999' year_month)`) + eval, err := testSingle(t, `SELECT _latin1 0xFF regexp _latin1 '[[:lower:]]' COLLATE latin1_bin`) t.Logf("eval=%s err=%v coll=%s", eval.String(), err, eval.Collation().Get().Name()) } diff --git a/go/vt/vtgate/evalengine/testcases/cases.go b/go/vt/vtgate/evalengine/testcases/cases.go index b72c5dae816..603b24498dd 100644 --- a/go/vt/vtgate/evalengine/testcases/cases.go +++ b/go/vt/vtgate/evalengine/testcases/cases.go @@ -151,6 +151,7 @@ var Cases = []TestCase{ {Run: FnUUID}, {Run: FnUUIDToBin}, {Run: DateMath}, + {Run: Regexp}, } func JSONPathOperations(yield Query) { @@ -1898,3 +1899,42 @@ func DateMath(yield Query) { } } } + +func Regexp(yield Query) { + mysqlDocSamples := []string{ + `'Michael!' 
REGEXP '.*'`, + `'new*\n*line' REGEXP 'new\\*.\\*line'`, + `'a' REGEXP '^[a-d]'`, + `REGEXP_LIKE('CamelCase', 'CAMELCASE')`, + `REGEXP_LIKE('CamelCase', 'CAMELCASE' COLLATE utf8mb4_0900_as_cs)`, + `REGEXP_LIKE('abc', 'ABC'`, + `REGEXP_LIKE('abc', 'ABC', 'c')`, + `' ' REGEXP '[[:blank:]]'`, + `'\t' REGEXP '[[:blank:]]'`, + `' ' REGEXP '[[:space:]]'`, + `'\t' REGEXP '[[:space:]]'`, + `_latin1 0xFF regexp _latin1 '[[:lower:]]' COLLATE latin1_bin`, + `_koi8r 0xFF regexp _koi8r '[[:lower:]]' COLLATE koi8r_bin`, + `_latin1 0xFF regexp _latin1 '[[:upper:]]' COLLATE latin1_bin`, + `_koi8r 0xFF regexp _koi8r '[[:upper:]]' COLLATE koi8r_bin`, + `_latin1 0xF7 regexp _latin1 '[[:alpha:]]'`, + `_koi8r 0xF7 regexp _koi8r '[[:alpha:]]'`, + `_latin1'a' regexp _latin1'A' collate latin1_general_ci`, + `_latin1'a' regexp _latin1'A' collate latin1_bin`, + `'a' regexp '\\p{alphabetic}'`, + `'a' regexp '\\P{alphabetic}'`, + `'👌🏾regexp '\\p{Emoji}\\p{Emoji_modifier}'`, + `'a' regexp '\\p{Lowercase_letter}'`, + `'a' regexp '\\p{Uppercase_letter}'`, + `'A' regexp '\\p{Lowercase_letter}'`, + `'A' regexp '\\p{Uppercase_letter}'`, + `'a' collate utf8mb4_0900_as_cs regexp '\\p{Lowercase_letter}'`, + `'A' collate utf8mb4_0900_as_cs regexp '\\p{Lowercase_letter}'`, + `'a' collate utf8mb4_0900_as_cs regexp '\\p{Uppercase_letter}'`, + `'A' collate utf8mb4_0900_as_cs regexp '\\p{Uppercase_letter}'`, + } + + for _, q := range mysqlDocSamples { + yield(q, nil) + } +} diff --git a/go/vt/vtgate/evalengine/translate.go b/go/vt/vtgate/evalengine/translate.go index 7690201f2a3..8cc6df7bd02 100644 --- a/go/vt/vtgate/evalengine/translate.go +++ b/go/vt/vtgate/evalengine/translate.go @@ -75,6 +75,14 @@ func (ast *astCompiler) translateComparisonExpr2(op sqlparser.ComparisonExprOper return &LikeExpr{BinaryExpr: binaryExpr}, nil case sqlparser.NotLikeOp: return &LikeExpr{BinaryExpr: binaryExpr, Negate: true}, nil + case sqlparser.RegexpOp, sqlparser.NotRegexpOp: + return &builtinRegexpLike{ + CallExpr: 
CallExpr{ + Arguments: []Expr{left, right}, + Method: "REGEXP_LIKE", + }, + Negate: op == sqlparser.NotRegexpOp, + }, nil default: return nil, vterrors.Errorf(vtrpcpb.Code_UNIMPLEMENTED, op.ToString()) } diff --git a/go/vt/vtgate/evalengine/translate_builtin.go b/go/vt/vtgate/evalengine/translate_builtin.go index fb6f988af7d..f4a27dad704 100644 --- a/go/vt/vtgate/evalengine/translate_builtin.go +++ b/go/vt/vtgate/evalengine/translate_builtin.go @@ -765,6 +765,32 @@ func (ast *astCompiler) translateCallable(call sqlparser.Callable) (Expr, error) collate: ast.cfg.Collation, }, nil + case *sqlparser.RegexpLikeExpr: + input, err := ast.translateExpr(call.Expr) + if err != nil { + return nil, err + } + + pattern, err := ast.translateExpr(call.Pattern) + if err != nil { + return nil, err + } + + args := []Expr{input, pattern} + + if call.MatchType != nil { + matchType, err := ast.translateExpr(call.MatchType) + if err != nil { + return nil, err + } + args = append(args, matchType) + } + + return &builtinRegexpLike{ + CallExpr: CallExpr{Arguments: args, Method: "REGEXP_LIKE"}, + Negate: false, + }, nil + default: return nil, translateExprNotSupported(call) } From 8261873f89f8cc5e2f9e7c9bdd4d1b5817a0069c Mon Sep 17 00:00:00 2001 From: Dirkjan Bussink Date: Thu, 29 Jun 2023 17:18:26 +0200 Subject: [PATCH 04/18] Fix remaining TODOs and fix a bunch of bugs Signed-off-by: Dirkjan Bussink --- go/mysql/icuregex/compiler.go | 146 ++++-- go/mysql/icuregex/icu_test.go | 6 +- .../icuregex/internal/bytestrie/bytes_trie.go | 14 - .../icuregex/internal/normalizer/constants.go | 124 +++++ .../internal/normalizer/normalizer.go | 482 ++++++++++++++++++ go/mysql/icuregex/internal/ubidi/ubidi.go | 6 +- go/mysql/icuregex/internal/ucase/ucase.go | 6 +- go/mysql/icuregex/internal/uchar/uchar.go | 6 +- go/mysql/icuregex/internal/uerror/error.go | 106 +--- go/mysql/icuregex/internal/ulayout/ulayout.go | 6 +- go/mysql/icuregex/internal/unames/unames.go | 23 +- 
.../icuregex/internal/uprops/properties.go | 476 +++++++++++++++++ go/mysql/icuregex/internal/uprops/uprops.go | 9 +- .../icuregex/internal/uprops/uprops_binary.go | 44 +- .../icuregex/internal/uprops/uprops_int.go | 27 +- go/mysql/icuregex/internal/uprops/uscript.go | 4 +- go/mysql/icuregex/internal/uset/close.go | 10 +- go/mysql/icuregex/internal/uset/frozen.go | 21 + go/mysql/icuregex/internal/uset/pattern.go | 12 +- go/mysql/icuregex/internal/uset/properties.go | 417 --------------- .../icuregex/internal/uset/unicode_set.go | 49 +- go/mysql/icuregex/internal/utf16/helpers.go | 11 - go/mysql/icuregex/internal/utrie/ucptrie.go | 128 ++--- go/mysql/icuregex/matcher.go | 260 +--------- go/mysql/icuregex/perl_test.go | 2 +- go/mysql/icuregex/sets.go | 33 +- .../icuregex/testdata/regextst_extended.txt | 40 +- go/vt/vtgate/evalengine/fn_regexp.go | 4 - 28 files changed, 1431 insertions(+), 1041 deletions(-) create mode 100644 go/mysql/icuregex/internal/normalizer/constants.go create mode 100644 go/mysql/icuregex/internal/normalizer/normalizer.go create mode 100644 go/mysql/icuregex/internal/uprops/properties.go delete mode 100644 go/mysql/icuregex/internal/uset/properties.go diff --git a/go/mysql/icuregex/compiler.go b/go/mysql/icuregex/compiler.go index c98fe0efb46..0bee3e49b26 100644 --- a/go/mysql/icuregex/compiler.go +++ b/go/mysql/icuregex/compiler.go @@ -1486,18 +1486,18 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { case doSetBackslash_d: set := c.setStack[len(c.setStack)-1] - set.AddCategory(uchar.U_GC_ND_MASK) + c.err = uprops.AddCategory(set, uchar.U_GC_ND_MASK) case doSetBackslash_D: digits := uset.New() - digits.ApplyIntPropertyValue(uprops.UCHAR_GENERAL_CATEGORY_MASK, int32(uchar.U_GC_ND_MASK)) + c.err = uprops.ApplyIntPropertyValue(digits, uprops.UCHAR_GENERAL_CATEGORY_MASK, int32(uchar.U_GC_ND_MASK)) digits.Complement() set := c.setStack[len(c.setStack)-1] set.AddAll(digits) case doSetBackslash_h: h := uset.New() - 
h.ApplyIntPropertyValue(uprops.UCHAR_GENERAL_CATEGORY_MASK, int32(uchar.U_GC_ZS_MASK)) + c.err = uprops.ApplyIntPropertyValue(h, uprops.UCHAR_GENERAL_CATEGORY_MASK, int32(uchar.U_GC_ZS_MASK)) h.AddRune(9) // Tab set := c.setStack[len(c.setStack)-1] @@ -1505,7 +1505,7 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { case doSetBackslash_H: h := uset.New() - h.ApplyIntPropertyValue(uprops.UCHAR_GENERAL_CATEGORY_MASK, int32(uchar.U_GC_ZS_MASK)) + c.err = uprops.ApplyIntPropertyValue(h, uprops.UCHAR_GENERAL_CATEGORY_MASK, int32(uchar.U_GC_ZS_MASK)) h.AddRune(9) // Tab h.Complement() @@ -1786,7 +1786,7 @@ func (c *Compiler) stripNOPs() { case URX_BACKREF, URX_BACKREF_I: where := op.Value() - if int(where) > len(c.out.groupMap) { + if where > len(c.out.groupMap) { c.error(uerror.U_REGEX_INVALID_BACK_REF) break } @@ -1996,7 +1996,7 @@ func (c *Compiler) matchStartType() { // Digit Char if currentLen == 0 { s := uset.New() - s.ApplyIntPropertyValue(uprops.UCHAR_GENERAL_CATEGORY_MASK, int32(uchar.U_GC_ND_MASK)) + c.err = uprops.ApplyIntPropertyValue(s, uprops.UCHAR_GENERAL_CATEGORY_MASK, int32(uchar.U_GC_ND_MASK)) if op.Value() != 0 { s.Complement() } @@ -2010,7 +2010,7 @@ func (c *Compiler) matchStartType() { // Horiz white space if currentLen == 0 { s := uset.New() - s.ApplyIntPropertyValue(uprops.UCHAR_GENERAL_CATEGORY_MASK, int32(uchar.U_GC_ZS_MASK)) + c.err = uprops.ApplyIntPropertyValue(s, uprops.UCHAR_GENERAL_CATEGORY_MASK, int32(uchar.U_GC_ZS_MASK)) s.AddRune(9) // Tab if op.Value() != 0 { s.Complement() @@ -2080,7 +2080,7 @@ func (c *Compiler) matchStartType() { case URX_JMP: jmpDest := op.Value() - if int(jmpDest) < loc { + if jmpDest < loc { // Loop of some kind. Can safely ignore, the worst that will happen // is that we understate the true minimum length currentLen = forwardedLength[loc+1] @@ -2228,7 +2228,7 @@ func (c *Compiler) matchStartType() { // Need this because neg lookahead blocks will FAIL to outside // of the block. 
jmpDest := op.Value() - if int(jmpDest) > loc { + if jmpDest > loc { if currentLen < forwardedLength[jmpDest] { forwardedLength[jmpDest] = (currentLen) } @@ -2574,7 +2574,7 @@ func (c *Compiler) insertOp(where int) { for loc, op := range c.out.compiledPat { switch op.Type() { case URX_JMP, URX_JMPX, URX_STATE_SAVE, URX_CTR_LOOP, URX_CTR_LOOP_NG, URX_JMP_SAV, URX_JMP_SAV_X, URX_RELOC_OPRND: - if int(op.Value()) > where { + if op.Value() > where { op = c.buildOp(op.Type(), op.Value()+1) c.out.compiledPat[loc] = op } @@ -2930,7 +2930,7 @@ func (c *Compiler) minMatchLength(start, end int) int32 { case URX_JMP: jmpDest := op.Value() - if int(jmpDest) < loc { + if jmpDest < loc { // Loop of some kind. Can safely ignore, the worst that will happen // is that we understate the true minimum length currentLen = forwardedLength[loc+1] @@ -2951,7 +2951,7 @@ func (c *Compiler) minMatchLength(start, end int) int32 { // State Save, for forward jumps, propagate the current minimum. // of the state save. jmpDest := op.Value() - if int(jmpDest) > loc { + if jmpDest > loc { if currentLen < forwardedLength[jmpDest] { forwardedLength[jmpDest] = currentLen } @@ -2981,7 +2981,7 @@ func (c *Compiler) minMatchLength(start, end int) int32 { loopEndLoc := loopEndOp.Value() minLoopCount := c.out.compiledPat[loc+2] if minLoopCount == 0 { - loc = int(loopEndLoc) + loc = loopEndLoc } else { loc += 3 // Skips over operands of CTR_INIT } @@ -3031,7 +3031,7 @@ func (c *Compiler) minMatchLength(start, end int) int32 { if op.Type() == URX_STATE_SAVE { // Need this because neg lookahead blocks will FAIL to outside of the block. jmpDest := op.Value() - if int(jmpDest) > loc { + if jmpDest > loc { if currentLen < forwardedLength[jmpDest] { forwardedLength[jmpDest] = currentLen } @@ -3146,7 +3146,7 @@ func (c *Compiler) maxMatchLength(start, end int) int32 { // Jumps. 
// case URX_JMP, URX_JMPX, URX_JMP_SAV, URX_JMP_SAV_X: - jmpDest := int(op.Value()) + jmpDest := op.Value() if jmpDest < loc { // Loop of some kind. Max match length is unbounded. currentLen = math.MaxInt32 @@ -3168,7 +3168,7 @@ func (c *Compiler) maxMatchLength(start, end int) int32 { // of the state save. // For backwards jumps, they create a loop, maximum // match length is unbounded. - jmpDest := int(op.Value()) + jmpDest := op.Value() if jmpDest > loc { if currentLen > forwardedLength[jmpDest] { forwardedLength[jmpDest] = currentLen @@ -3210,7 +3210,7 @@ func (c *Compiler) maxMatchLength(start, end int) int32 { case URX_CTR_INIT, URX_CTR_INIT_NG: // For Loops, recursively call this function on the pattern for the loop body, // then multiply the result by the maximum loop count. - loopEndLoc := int(c.out.compiledPat[loc+1].Value()) + loopEndLoc := c.out.compiledPat[loc+1].Value() if loopEndLoc == loc+4 { // Loop has an empty body. No affect on max match length. // Continue processing with code after the loop end. 
@@ -3381,7 +3381,7 @@ func (c *Compiler) createSetForProperty(propName string, negated bool) *uset.Uni } var err error - set, err = uset.ParsePattern("\\p{"+propName+"}", usetFlags) + set, err = uprops.NewUnicodeSetFomPattern("\\p{"+propName+"}", usetFlags) if err == nil { goto done } @@ -3406,7 +3406,7 @@ func (c *Compiler) createSetForProperty(propName string, negated bool) *uset.Uni // if strings.HasPrefix(propName, "In") && len(propName) >= 3 { set = uset.New() - if set.ApplyPropertyAlias("Block", propName[2:]) != nil { + if uprops.ApplyPropertyAlias(set, "Block", propName[2:]) != nil { c.error(uerror.U_REGEX_PROPERTY_SYNTAX) } goto done @@ -3429,7 +3429,7 @@ func (c *Compiler) createSetForProperty(propName string, negated bool) *uset.Uni mPropName = "Titlecase_Letter" } - set, err = uset.ParsePattern("\\p{"+mPropName+"}", 0) + set, err = uprops.NewUnicodeSetFomPattern("\\p{"+mPropName+"}", 0) if err != nil { c.error(uerror.U_REGEX_PROPERTY_SYNTAX) } else if !set.IsEmpty() && (usetFlags&uset.USET_CASE_INSENSITIVE) != 0 { @@ -3446,61 +3446,97 @@ func (c *Compiler) createSetForProperty(propName string, negated bool) *uset.Uni // These all begin with "java" // if propName == "javaDefined" { - set.AddCategory(uchar.U_GC_CN_MASK) + c.err = uprops.AddCategory(set, uchar.U_GC_CN_MASK) set.Complement() } else if propName == "javaDigit" { - set.AddCategory(uchar.U_GC_ND_MASK) + c.err = uprops.AddCategory(set, uchar.U_GC_ND_MASK) } else if propName == "javaIdentifierIgnorable" { - addIdentifierIgnorable(set) + c.err = addIdentifierIgnorable(set) } else if propName == "javaISOControl" { set.AddRuneRange(0, 0x1F) set.AddRuneRange(0x7F, 0x9F) } else if propName == "javaJavaIdentifierPart" { - set.AddCategory(uchar.U_GC_L_MASK) - set.AddCategory(uchar.U_GC_SC_MASK) - set.AddCategory(uchar.U_GC_PC_MASK) - set.AddCategory(uchar.U_GC_ND_MASK) - set.AddCategory(uchar.U_GC_NL_MASK) - set.AddCategory(uchar.U_GC_MC_MASK) - set.AddCategory(uchar.U_GC_MN_MASK) - 
addIdentifierIgnorable(set) + c.err = uprops.AddCategory(set, uchar.U_GC_L_MASK) + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.U_GC_SC_MASK) + } + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.U_GC_PC_MASK) + } + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.U_GC_ND_MASK) + } + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.U_GC_NL_MASK) + } + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.U_GC_MC_MASK) + } + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.U_GC_MN_MASK) + } + if c.err == nil { + c.err = addIdentifierIgnorable(set) + } } else if propName == "javaJavaIdentifierStart" { - set.AddCategory(uchar.U_GC_L_MASK) - set.AddCategory(uchar.U_GC_NL_MASK) - set.AddCategory(uchar.U_GC_SC_MASK) - set.AddCategory(uchar.U_GC_PC_MASK) + c.err = uprops.AddCategory(set, uchar.U_GC_L_MASK) + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.U_GC_NL_MASK) + } + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.U_GC_SC_MASK) + } + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.U_GC_PC_MASK) + } } else if propName == "javaLetter" { - set.AddCategory(uchar.U_GC_L_MASK) + c.err = uprops.AddCategory(set, uchar.U_GC_L_MASK) } else if propName == "javaLetterOrDigit" { - set.AddCategory(uchar.U_GC_L_MASK) - set.AddCategory(uchar.U_GC_ND_MASK) + c.err = uprops.AddCategory(set, uchar.U_GC_L_MASK) + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.U_GC_ND_MASK) + } } else if propName == "javaLowerCase" { - set.AddCategory(uchar.U_GC_LL_MASK) + c.err = uprops.AddCategory(set, uchar.U_GC_LL_MASK) } else if propName == "javaMirrored" { - set.ApplyIntPropertyValue(uprops.UCHAR_BIDI_MIRRORED, 1) + c.err = uprops.ApplyIntPropertyValue(set, uprops.UCHAR_BIDI_MIRRORED, 1) } else if propName == "javaSpaceChar" { - set.AddCategory(uchar.U_GC_Z_MASK) + c.err = uprops.AddCategory(set, uchar.U_GC_Z_MASK) } else if propName == "javaSupplementaryCodePoint" { set.AddRuneRange(0x10000, 
uset.MAX_VALUE) } else if propName == "javaTitleCase" { - set.AddCategory(uchar.U_GC_LT_MASK) + c.err = uprops.AddCategory(set, uchar.U_GC_LT_MASK) } else if propName == "javaUnicodeIdentifierStart" { - set.AddCategory(uchar.U_GC_L_MASK) - set.AddCategory(uchar.U_GC_NL_MASK) + c.err = uprops.AddCategory(set, uchar.U_GC_L_MASK) + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.U_GC_NL_MASK) + } } else if propName == "javaUnicodeIdentifierPart" { - set.AddCategory(uchar.U_GC_L_MASK) - set.AddCategory(uchar.U_GC_PC_MASK) - set.AddCategory(uchar.U_GC_ND_MASK) - set.AddCategory(uchar.U_GC_NL_MASK) - set.AddCategory(uchar.U_GC_MC_MASK) - set.AddCategory(uchar.U_GC_MN_MASK) - addIdentifierIgnorable(set) + c.err = uprops.AddCategory(set, uchar.U_GC_L_MASK) + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.U_GC_PC_MASK) + } + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.U_GC_ND_MASK) + } + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.U_GC_NL_MASK) + } + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.U_GC_MC_MASK) + } + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.U_GC_MN_MASK) + } + if c.err == nil { + c.err = addIdentifierIgnorable(set) + } } else if propName == "javaUpperCase" { - set.AddCategory(uchar.U_GC_LU_MASK) + c.err = uprops.AddCategory(set, uchar.U_GC_LU_MASK) } else if propName == "javaValidCodePoint" { set.AddRuneRange(0, uset.MAX_VALUE) } else if propName == "javaWhitespace" { - set.AddCategory(uchar.U_GC_Z_MASK) + c.err = uprops.AddCategory(set, uchar.U_GC_Z_MASK) excl := uset.New() excl.AddRune(0x0a) excl.AddRune(0x2007) @@ -3532,12 +3568,12 @@ done: return set } -func addIdentifierIgnorable(set *uset.UnicodeSet) { +func addIdentifierIgnorable(set *uset.UnicodeSet) error { set.AddRuneRange(0, 8) set.AddRuneRange(0x0e, 0x1b) set.AddRuneRange(0x7f, 0x9f) - set.AddCategory(uchar.U_GC_CF_MASK) + return uprops.AddCategory(set, uchar.U_GC_CF_MASK) } func (c *Compiler) scanPosixProp() 
*uset.UnicodeSet { diff --git a/go/mysql/icuregex/icu_test.go b/go/mysql/icuregex/icu_test.go index 7c7c266783d..64f56637fd7 100644 --- a/go/mysql/icuregex/icu_test.go +++ b/go/mysql/icuregex/icu_test.go @@ -156,7 +156,7 @@ func (tp *TestPattern) parseMatch(input string) error { } else { num, err := strconv.Atoi(groupNum) if err != nil { - return fmt.Errorf("bad group number %q: %v", groupNum, err) + return fmt.Errorf("bad group number %q: %w", groupNum, err) } if num >= len(tp.Groups) { @@ -407,8 +407,8 @@ func TestCornerCases(t *testing.T) { func TestOne(t *testing.T) { icuregex.Dumper = os.Stderr - const Pattern = `\p{Indic_Syllabic_Category=Avagraha}` - const Input = "foo\u09BDbar" + const Pattern = `\p{CaseIgnorable}` + const Input = "foo.bar" const Flags = 0 re, err := icuregex.CompileString(Pattern, Flags) diff --git a/go/mysql/icuregex/internal/bytestrie/bytes_trie.go b/go/mysql/icuregex/internal/bytestrie/bytes_trie.go index c46084ff21b..732fddc231d 100644 --- a/go/mysql/icuregex/internal/bytestrie/bytes_trie.go +++ b/go/mysql/icuregex/internal/bytestrie/bytes_trie.go @@ -91,25 +91,11 @@ const ( kMinThreeByteValueLead = kMinTwoByteValueLead + (kMaxTwoByteValue >> 8) + 1 // 0x6c kFourByteValueLead = 0x7e - // A little more than Unicode code points. (0x11ffff) - kMaxThreeByteValue = ((kFourByteValueLead - kMinThreeByteValueLead) << 16) - 1 - kFiveByteValueLead = 0x7f - // Compact delta integers. kMaxOneByteDelta = 0xbf kMinTwoByteDeltaLead = kMaxOneByteDelta + 1 // 0xc0 kMinThreeByteDeltaLead = 0xf0 kFourByteDeltaLead = 0xfe - kFiveByteDeltaLead = 0xff - kMaxTwoByteDelta = ((kMinThreeByteDeltaLead - kMinTwoByteDeltaLead) << 8) - 1 // 0x2fff - kMaxThreeByteDelta = ((kFourByteDeltaLead - kMinThreeByteDeltaLead) << 16) - 1 // 0xdffff - - // For getState64(): - // The remainingMatchLength_ is -1..14=(kMaxLinearMatchLength=0x10)-2 - // so we need at least 5 bits for that. - // We add 2 to store it as a positive value 1..16=kMaxLinearMatchLength. 
- kState64RemainingShift = 59 - kState64PosMask = (uint64(1) << kState64RemainingShift) - 1 ) func (bt *BytesTrie) ContainsName(name string) bool { diff --git a/go/mysql/icuregex/internal/normalizer/constants.go b/go/mysql/icuregex/internal/normalizer/constants.go new file mode 100644 index 00000000000..85b19de4b82 --- /dev/null +++ b/go/mysql/icuregex/internal/normalizer/constants.go @@ -0,0 +1,124 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package normalizer + +const ( + // Fixed norm16 values. + MIN_YES_YES_WITH_CC = 0xfe02 + JAMO_VT = 0xfe00 + MIN_NORMAL_MAYBE_YES = 0xfc00 + JAMO_L = 2 // offset=1 hasCompBoundaryAfter=false + INERT = 1 // offset=0 hasCompBoundaryAfter=true + + // norm16 bit 0 is comp-boundary-after. + HAS_COMP_BOUNDARY_AFTER = 1 + OFFSET_SHIFT = 1 + + // For algorithmic one-way mappings, norm16 bits 2..1 indicate the + // tccc (0, 1, >1) for quick FCC boundary-after tests. 
+ DELTA_TCCC_0 = 0 + DELTA_TCCC_1 = 2 + DELTA_TCCC_GT_1 = 4 + DELTA_TCCC_MASK = 6 + DELTA_SHIFT = 3 + + MAX_DELTA = 0x40 +) + +const ( + JAMO_L_BASE rune = 0x1100 /* "lead" jamo */ + JAMO_L_END rune = 0x1112 + JAMO_V_BASE rune = 0x1161 /* "vowel" jamo */ + JAMO_V_END rune = 0x1175 + JAMO_T_BASE rune = 0x11a7 /* "trail" jamo */ + JAMO_T_END rune = 0x11c2 + + HANGUL_BASE rune = 0xac00 + HANGUL_END rune = 0xd7a3 + + JAMO_L_COUNT rune = 19 + JAMO_V_COUNT rune = 21 + JAMO_T_COUNT rune = 28 + + JAMO_VT_COUNT = JAMO_V_COUNT * JAMO_T_COUNT + + HANGUL_COUNT = JAMO_L_COUNT * JAMO_V_COUNT * JAMO_T_COUNT + HANGUL_LIMIT = HANGUL_BASE + HANGUL_COUNT +) + +const ( + MAPPING_HAS_CCC_LCCC_WORD = 0x80 + MAPPING_HAS_RAW_MAPPING = 0x40 + // unused bit 0x20, + MAPPING_LENGTH_MASK = 0x1f +) + +/** + * Constants for normalization modes. + * @deprecated ICU 56 Use unorm2.h instead. + */ +type UNormalizationMode int32 + +const ( + /** No decomposition/composition. @deprecated ICU 56 Use unorm2.h instead. */ + UNORM_NONE UNormalizationMode = 1 + /** Canonical decomposition. @deprecated ICU 56 Use unorm2.h instead. */ + UNORM_NFD UNormalizationMode = 2 + /** Compatibility decomposition. @deprecated ICU 56 Use unorm2.h instead. */ + UNORM_NFKD UNormalizationMode = 3 + /** Canonical decomposition followed by canonical composition. @deprecated ICU 56 Use unorm2.h instead. */ + UNORM_NFC UNormalizationMode = 4 + /** Default normalization. @deprecated ICU 56 Use unorm2.h instead. */ + UNORM_DEFAULT UNormalizationMode = UNORM_NFC + /** Compatibility decomposition followed by canonical composition. @deprecated ICU 56 Use unorm2.h instead. */ + UNORM_NFKC UNormalizationMode = 5 + /** "Fast C or D" form. @deprecated ICU 56 Use unorm2.h instead. */ + UNORM_FCD UNormalizationMode = 6 +) + +/** + * Result values for normalization quick check functions. 
+ * For details see http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms + * @stable ICU 2.0 + */ +type UNormalizationCheckResult int + +const ( + /** + * The input string is not in the normalization form. + * @stable ICU 2.0 + */ + UNORM_NO UNormalizationCheckResult = iota + /** + * The input string is in the normalization form. + * @stable ICU 2.0 + */ + UNORM_YES + /** + * The input string may or may not be in the normalization form. + * This value is only returned for composition forms like NFC and FCC, + * when a backward-combining character is found for which the surrounding text + * would have to be analyzed further. + * @stable ICU 2.0 + */ + UNORM_MAYBE +) diff --git a/go/mysql/icuregex/internal/normalizer/normalizer.go b/go/mysql/icuregex/internal/normalizer/normalizer.go new file mode 100644 index 00000000000..87f370ea0ab --- /dev/null +++ b/go/mysql/icuregex/internal/normalizer/normalizer.go @@ -0,0 +1,482 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package normalizer + +import ( + "fmt" + "sync" + + "vitess.io/vitess/go/mysql/icuregex/internal/icudata" + "vitess.io/vitess/go/mysql/icuregex/internal/udata" + "vitess.io/vitess/go/mysql/icuregex/internal/uset" + "vitess.io/vitess/go/mysql/icuregex/internal/utf16" + "vitess.io/vitess/go/mysql/icuregex/internal/utrie" +) + +type normalizer struct { + minDecompNoCP rune + minCompNoMaybeCP rune + minLcccCP rune + + // Norm16 value thresholds for quick check combinations and types of extra data. + minYesNo uint16 + minYesNoMappingsOnly uint16 + minNoNo uint16 + minNoNoCompBoundaryBefore uint16 + minNoNoCompNoMaybeCC uint16 + minNoNoEmpty uint16 + limitNoNo uint16 + centerNoNoDelta uint16 + minMaybeYes uint16 + + normTrie *utrie.UcpTrie + + maybeYesCompositions []uint16 + extraData []uint16 // mappings and/or compositions for yesYes, yesNo & noNo characters + smallFCD []uint8 // [0x100] one bit per 32 BMP code points, set if any FCD!=0 +} + +var nfc *normalizer +var nfkc *normalizer + +var normalizerOnce sync.Once + +func loadNormalizer() { + normalizerOnce.Do(func() { + nfc = &normalizer{} + if err := nfc.load(icudata.NFC); err != nil { + panic(err) + } + + nfkc = &normalizer{} + if err := nfkc.load(icudata.NFKC); err != nil { + panic(err) + } + }) +} + +const IX_NORM_TRIE_OFFSET = 0 +const IX_EXTRA_DATA_OFFSET = 1 +const IX_SMALL_FCD_OFFSET = 2 +const IX_RESERVED3_OFFSET = 3 +const IX_TOTAL_SIZE = 7 + +const IX_MIN_DECOMP_NO_CP = 8 +const IX_MIN_COMP_NO_MAYBE_CP = 9 + +/** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */ +const IX_MIN_YES_NO = 10 + +/** Mappings are comp-normalized. */ +const IX_MIN_NO_NO = 11 +const IX_LIMIT_NO_NO = 12 +const IX_MIN_MAYBE_YES = 13 + +/** Mappings only in [minYesNoMappingsOnly..minNoNo[. */ +const IX_MIN_YES_NO_MAPPINGS_ONLY = 14 + +/** Mappings are not comp-normalized but have a comp boundary before. */ +const IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE = 15 + +/** Mappings do not have a comp boundary before. 
*/ +const IX_MIN_NO_NO_COMP_NO_MAYBE_CC = 16 + +/** Mappings to the empty string. */ +const IX_MIN_NO_NO_EMPTY = 17 + +const IX_MIN_LCCC_CP = 18 +const IX_COUNT = 20 + +func (n *normalizer) load(data []byte) error { + bytes := udata.NewBytes(data) + + err := bytes.ReadHeader(func(info *udata.DataInfo) bool { + return info.Size >= 20 && + info.IsBigEndian == 0 && + info.CharsetFamily == 0 && + info.DataFormat[0] == 0x4e && /* dataFormat="Nrm2" */ + info.DataFormat[1] == 0x72 && + info.DataFormat[2] == 0x6d && + info.DataFormat[3] == 0x32 && + info.FormatVersion[0] == 4 + }) + if err != nil { + return err + } + + indexesLength := int32(bytes.Uint32()) / 4 + if indexesLength <= IX_MIN_LCCC_CP { + return fmt.Errorf("normalizer2 data: not enough indexes") + } + indexes := make([]int32, indexesLength) + indexes[0] = indexesLength * 4 + for i := int32(1); i < indexesLength; i++ { + indexes[i] = bytes.Int32() + } + + n.minDecompNoCP = indexes[IX_MIN_DECOMP_NO_CP] + n.minCompNoMaybeCP = indexes[IX_MIN_COMP_NO_MAYBE_CP] + n.minLcccCP = indexes[IX_MIN_LCCC_CP] + + n.minYesNo = uint16(indexes[IX_MIN_YES_NO]) + n.minYesNoMappingsOnly = uint16(indexes[IX_MIN_YES_NO_MAPPINGS_ONLY]) + n.minNoNo = uint16(indexes[IX_MIN_NO_NO]) + n.minNoNoCompBoundaryBefore = uint16(indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]) + n.minNoNoCompNoMaybeCC = uint16(indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]) + n.minNoNoEmpty = uint16(indexes[IX_MIN_NO_NO_EMPTY]) + n.limitNoNo = uint16(indexes[IX_LIMIT_NO_NO]) + n.minMaybeYes = uint16(indexes[IX_MIN_MAYBE_YES]) + + n.centerNoNoDelta = uint16(indexes[IX_MIN_MAYBE_YES]>>DELTA_SHIFT) - MAX_DELTA - 1 + + offset := indexes[IX_NORM_TRIE_OFFSET] + nextOffset := indexes[IX_EXTRA_DATA_OFFSET] + triePosition := bytes.Position() + + n.normTrie, err = utrie.UcpTrieFromBytes(bytes) + if err != nil { + return err + } + + trieLength := bytes.Position() - triePosition + if trieLength > nextOffset-offset { + return fmt.Errorf("normalizer2 data: not enough bytes for normTrie")
+ } + bytes.Skip((nextOffset - offset) - trieLength) // skip padding after trie bytes + + // Read the composition and mapping data. + offset = nextOffset + nextOffset = indexes[IX_SMALL_FCD_OFFSET] + numChars := (nextOffset - offset) / 2 + if numChars != 0 { + n.maybeYesCompositions = bytes.Uint16Slice(numChars) + n.extraData = n.maybeYesCompositions[((MIN_NORMAL_MAYBE_YES - n.minMaybeYes) >> OFFSET_SHIFT):] + } + + // smallFCD: new in formatVersion 2 + n.smallFCD = bytes.Uint8Slice(0x100) + return nil +} + +func Nfc() *normalizer { + loadNormalizer() + return nfc +} + +func Nfkc() *normalizer { + loadNormalizer() + return nfkc +} + +func (n *normalizer) AddPropertyStarts(u *uset.UnicodeSet) { + var start, end rune + var value uint32 + for { + end, value = nfc.normTrie.GetRange(start, utrie.UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT, nil) + if end < 0 { + break + } + u.AddRune(start) + if start != end && n.isAlgorithmicNoNo(uint16(value)) && (value&DELTA_TCCC_MASK) > DELTA_TCCC_1 { + // Range of code points with same-norm16-value algorithmic decompositions. + // They might have different non-zero FCD16 values. 
+ prevFCD16 := n.GetFCD16(start) + for { + start++ + if start > end { + break + } + fcd16 := n.GetFCD16(start) + if fcd16 != prevFCD16 { + u.AddRune(start) + prevFCD16 = fcd16 + } + } + } + start = end + 1 + } + + // add Hangul LV syllables and LV+1 because of skippables + for c := HANGUL_BASE; c < HANGUL_LIMIT; c += JAMO_T_COUNT { + u.AddRune(c) + u.AddRune(c + 1) + } + u.AddRune(HANGUL_LIMIT) +} + +func (n *normalizer) isAlgorithmicNoNo(norm16 uint16) bool { + return n.limitNoNo <= norm16 && norm16 < n.minMaybeYes +} + +func (n *normalizer) GetFCD16(c rune) uint16 { + if c < n.minDecompNoCP { + return 0 + } else if c <= 0xffff { + if !n.singleLeadMightHaveNonZeroFCD16(c) { + return 0 + } + } + return n.getFCD16FromNormData(c) +} + +func (n *normalizer) singleLeadMightHaveNonZeroFCD16(lead rune) bool { + // 0<=lead<=0xffff + bits := n.smallFCD[lead>>8] + if bits == 0 { + return false + } + return ((bits >> ((lead >> 5) & 7)) & 1) != 0 +} + +func (n *normalizer) getFCD16FromNormData(c rune) uint16 { + norm16 := n.GetNorm16(c) + if norm16 >= n.limitNoNo { + if norm16 >= MIN_NORMAL_MAYBE_YES { + // combining mark + norm16 = uint16(n.getCCFromNormalYesOrMaybe(norm16)) + return norm16 | (norm16 << 8) + } else if norm16 >= n.minMaybeYes { + return 0 + } else { // isDecompNoAlgorithmic(norm16) + deltaTrailCC := norm16 & DELTA_TCCC_MASK + if deltaTrailCC <= DELTA_TCCC_1 { + return deltaTrailCC >> OFFSET_SHIFT + } + // Maps to an isCompYesAndZeroCC. 
+ c = n.mapAlgorithmic(c, norm16) + norm16 = n.getRawNorm16(c) + } + } + + if norm16 <= n.minYesNo || n.isHangulLVT(norm16) { + // no decomposition or Hangul syllable, all zeros + return 0 + } + // c decomposes, get everything from the variable-length extra data + mapping := n.getMapping(norm16) + firstUnit := mapping[1] + if firstUnit&MAPPING_HAS_CCC_LCCC_WORD != 0 { + norm16 |= mapping[0] & 0xff00 + } + return norm16 +} + +func (n *normalizer) getMapping(norm16 uint16) []uint16 { + return n.extraData[(norm16>>OFFSET_SHIFT)-1:] +} + +func (n *normalizer) GetNorm16(c rune) uint16 { + if utf16.IsLead(c) { + return INERT + } + return n.getRawNorm16(c) +} + +func (n *normalizer) getRawNorm16(c rune) uint16 { + return uint16(n.normTrie.Get(c)) +} + +func (n *normalizer) getCCFromNormalYesOrMaybe(norm16 uint16) uint8 { + return uint8(norm16 >> OFFSET_SHIFT) +} + +func (n *normalizer) mapAlgorithmic(c rune, norm16 uint16) rune { + return c + rune(norm16>>DELTA_SHIFT) - rune(n.centerNoNoDelta) +} + +func (n *normalizer) isHangulLV(norm16 uint16) bool { + return norm16 == n.minYesNo +} + +func (n *normalizer) isHangulLVT(norm16 uint16) bool { + return norm16 == n.hangulLVT() +} + +func (n *normalizer) hangulLVT() uint16 { + return n.minYesNoMappingsOnly | HAS_COMP_BOUNDARY_AFTER +} + +func (n *normalizer) getComposeQuickCheck(c rune) UNormalizationCheckResult { + return n.getCompQuickCheck(n.GetNorm16(c)) +} + +func (n *normalizer) getDecomposeQuickCheck(c rune) UNormalizationCheckResult { + if n.isDecompYes(n.GetNorm16(c)) { + return UNORM_YES + } + return UNORM_NO +} + +func QuickCheck(c rune, mode UNormalizationMode) UNormalizationCheckResult { + if mode <= UNORM_NONE || UNORM_FCD <= mode { + return UNORM_YES + } + switch mode { + case UNORM_NFC: + return Nfc().getComposeQuickCheck(c) + case UNORM_NFD: + return Nfc().getDecomposeQuickCheck(c) + case UNORM_NFKC: + return Nfkc().getComposeQuickCheck(c) + case UNORM_NFKD: + return Nfkc().getDecomposeQuickCheck(c) + 
default: + return UNORM_MAYBE + } +} + +func IsInert(c rune, mode UNormalizationMode) bool { + switch mode { + case UNORM_NFC: + return Nfc().isCompInert(c) + case UNORM_NFD: + return Nfc().isDecompInert(c) + case UNORM_NFKC: + return Nfkc().isCompInert(c) + case UNORM_NFKD: + return Nfkc().isDecompInert(c) + default: + return true + } +} + +func (n *normalizer) isDecompYes(norm16 uint16) bool { + return norm16 < n.minYesNo || n.minMaybeYes <= norm16 +} + +func (n *normalizer) getCompQuickCheck(norm16 uint16) UNormalizationCheckResult { + if norm16 < n.minNoNo || MIN_YES_YES_WITH_CC <= norm16 { + return UNORM_YES + } else if n.minMaybeYes <= norm16 { + return UNORM_MAYBE + } else { + return UNORM_NO + } +} + +func (n *normalizer) isMaybeOrNonZeroCC(norm16 uint16) bool { + return norm16 >= n.minMaybeYes +} + +func (n *normalizer) isDecompNoAlgorithmic(norm16 uint16) bool { + return norm16 >= n.limitNoNo +} + +func (n *normalizer) IsCompNo(norm16 uint16) bool { + return n.minNoNo <= norm16 && norm16 < n.minMaybeYes +} + +func (n *normalizer) Decompose(c rune) []rune { + norm16 := n.GetNorm16(c) + if c < n.minDecompNoCP || n.isMaybeOrNonZeroCC(norm16) { + // c does not decompose + return nil + } + var decomp []rune + + if n.isDecompNoAlgorithmic(norm16) { + // Maps to an isCompYesAndZeroCC. + c = n.mapAlgorithmic(c, norm16) + decomp = append(decomp, c) + // The mapping might decompose further. 
+ norm16 = n.getRawNorm16(c) + } + if norm16 < n.minYesNo { + return decomp + } else if n.isHangulLV(norm16) || n.isHangulLVT(norm16) { + // Hangul syllable: decompose algorithmically + parts := hangulDecompose(c) + for len(parts) > 0 { + c = rune(parts[0]) + decomp = append(decomp, c) + parts = parts[1:] + } + return decomp + } + // c decomposes, get everything from the variable-length extra data + mapping := n.getMapping(norm16) + length := mapping[1] & MAPPING_LENGTH_MASK + mapping = mapping[2 : 2+length] + + for len(mapping) > 0 { + c, mapping = utf16.NextUnsafe(mapping) + decomp = append(decomp, c) + } + + return decomp +} + +func hangulDecompose(c rune) []uint16 { + c -= HANGUL_BASE + c2 := c % JAMO_T_COUNT + c /= JAMO_T_COUNT + var buffer []uint16 + buffer = append(buffer, uint16(JAMO_L_BASE+c/JAMO_V_COUNT)) + buffer = append(buffer, uint16(JAMO_V_BASE+c%JAMO_V_COUNT)) + if c2 != 0 { + buffer = append(buffer, uint16(JAMO_T_BASE+c2)) + } + return buffer +} + +func (n *normalizer) isCompInert(c rune) bool { + norm16 := n.GetNorm16(c) + return n.isCompYesAndZeroCC(norm16) && (norm16&HAS_COMP_BOUNDARY_AFTER) != 0 +} + +func (n *normalizer) isDecompInert(c rune) bool { + return n.isDecompYesAndZeroCC(n.GetNorm16(c)) +} + +func (n *normalizer) isCompYesAndZeroCC(norm16 uint16) bool { + return norm16 < n.minNoNo +} + +func (n *normalizer) isDecompYesAndZeroCC(norm16 uint16) bool { + return norm16 < n.minYesNo || + norm16 == JAMO_VT || + (n.minMaybeYes <= norm16 && norm16 <= MIN_NORMAL_MAYBE_YES) +} + +func (n *normalizer) CombiningClass(c rune) uint8 { + return n.getCC(n.GetNorm16(c)) +} + +func (n *normalizer) getCC(norm16 uint16) uint8 { + if norm16 >= MIN_NORMAL_MAYBE_YES { + return n.getCCFromNormalYesOrMaybe(norm16) + } + if norm16 < n.minNoNo || n.limitNoNo <= norm16 { + return 0 + } + return n.getCCFromNoNo(norm16) + +} + +func (n *normalizer) getCCFromNoNo(norm16 uint16) uint8 { + mapping := n.getMapping(norm16) + if mapping[1]&MAPPING_HAS_CCC_LCCC_WORD != 
0 { + return uint8(mapping[0]) + } else { + return 0 + } +} diff --git a/go/mysql/icuregex/internal/ubidi/ubidi.go b/go/mysql/icuregex/internal/ubidi/ubidi.go index 97d137cbed8..b8c67d75368 100644 --- a/go/mysql/icuregex/internal/ubidi/ubidi.go +++ b/go/mysql/icuregex/internal/ubidi/ubidi.go @@ -54,7 +54,11 @@ var ubidi struct { func readData(bytes *udata.Bytes) error { err := bytes.ReadHeader(func(info *udata.DataInfo) bool { - return info.FormatVersion[0] == 2 + return info.DataFormat[0] == 0x42 && + info.DataFormat[1] == 0x69 && + info.DataFormat[2] == 0x44 && + info.DataFormat[3] == 0x69 && + info.FormatVersion[0] == 2 }) if err != nil { return err diff --git a/go/mysql/icuregex/internal/ucase/ucase.go b/go/mysql/icuregex/internal/ucase/ucase.go index e2f8acd2a92..1542745390d 100644 --- a/go/mysql/icuregex/internal/ucase/ucase.go +++ b/go/mysql/icuregex/internal/ucase/ucase.go @@ -48,7 +48,11 @@ func readData(bytes *udata.Bytes) error { ) err := bytes.ReadHeader(func(info *udata.DataInfo) bool { - return info.FormatVersion[0] == 4 + return info.DataFormat[0] == 0x63 && + info.DataFormat[1] == 0x41 && + info.DataFormat[2] == 0x53 && + info.DataFormat[3] == 0x45 && + info.FormatVersion[0] == 4 }) if err != nil { return err diff --git a/go/mysql/icuregex/internal/uchar/uchar.go b/go/mysql/icuregex/internal/uchar/uchar.go index 50167902a49..55fb6100017 100644 --- a/go/mysql/icuregex/internal/uchar/uchar.go +++ b/go/mysql/icuregex/internal/uchar/uchar.go @@ -40,7 +40,11 @@ var uprops struct { func readData(bytes *udata.Bytes) error { err := bytes.ReadHeader(func(info *udata.DataInfo) bool { - return info.FormatVersion[0] == 7 + return info.DataFormat[0] == 0x55 && + info.DataFormat[1] == 0x50 && + info.DataFormat[2] == 0x72 && + info.DataFormat[3] == 0x6f && + info.FormatVersion[0] == 7 }) if err != nil { return err diff --git a/go/mysql/icuregex/internal/uerror/error.go b/go/mysql/icuregex/internal/uerror/error.go index 9d23d8dc4a3..7feb86fe805 100644 --- 
a/go/mysql/icuregex/internal/uerror/error.go +++ b/go/mysql/icuregex/internal/uerror/error.go @@ -21,106 +21,14 @@ limitations under the License. package uerror -import "fmt" - -type UErrorCode int32 - -const ( - U_ZERO_ERROR UErrorCode = iota /**< No error, no warning. */ - U_ILLEGAL_ARGUMENT_ERROR /**< Start of codes indicating failure */ - U_MISSING_RESOURCE_ERROR /**< The requested resource cannot be found */ - U_INVALID_FORMAT_ERROR /**< Data format is not what is expected */ - U_FILE_ACCESS_ERROR /**< The requested file cannot be found */ - U_INTERNAL_PROGRAM_ERROR /**< Indicates a bug in the library code */ - U_MESSAGE_PARSE_ERROR /**< Unable to parse a message (message format) */ - U_MEMORY_ALLOCATION_ERROR /**< Memory allocation error */ - U_INDEX_OUTOFBOUNDS_ERROR /**< Trying to access the index that is out of bounds */ - U_PARSE_ERROR /**< Equivalent to Java ParseException */ - U_INVALID_CHAR_FOUND /**< Character conversion: Unmappable input sequence. In other APIs: Invalid character. */ - U_TRUNCATED_CHAR_FOUND /**< Character conversion: Incomplete input sequence. */ - U_ILLEGAL_CHAR_FOUND /**< Character conversion: Illegal input sequence/combination of input units. 
*/ - U_INVALID_TABLE_FORMAT /**< Conversion table file found, but corrupted */ - U_INVALID_TABLE_FILE /**< Conversion table file not found */ - U_BUFFER_OVERFLOW_ERROR /**< A result would not fit in the supplied buffer */ - U_UNSUPPORTED_ERROR /**< Requested operation not supported in current context */ - U_RESOURCE_TYPE_MISMATCH /**< an operation is requested over a resource that does not support it */ - U_ILLEGAL_ESCAPE_SEQUENCE /**< ISO-2022 illegal escape sequence */ - U_UNSUPPORTED_ESCAPE_SEQUENCE /**< ISO-2022 unsupported escape sequence */ - U_NO_SPACE_AVAILABLE /**< No space available for in-buffer expansion for Arabic shaping */ - U_CE_NOT_FOUND_ERROR /**< Currently used only while setting variable top, but can be used generally */ - U_PRIMARY_TOO_LONG_ERROR /**< User tried to set variable top to a primary that is longer than two bytes */ - U_STATE_TOO_OLD_ERROR /**< ICU cannot construct a service from this state, as it is no longer supported */ - U_TOO_MANY_ALIASES_ERROR /**< There are too many aliases in the path to the requested resource. - It is very possible that a circular alias definition has occurred */ - U_ENUM_OUT_OF_SYNC_ERROR /**< UEnumeration out of sync with underlying collection */ - U_INVARIANT_CONVERSION_ERROR /**< Unable to convert a UChar* string to char* with the invariant converter. */ - U_INVALID_STATE_ERROR /**< Requested operation can not be completed with ICU in its current state */ - U_COLLATOR_VERSION_MISMATCH /**< Collator version is not compatible with the base version */ - U_USELESS_COLLATOR_ERROR /**< Collator is options only and no base is specified */ - U_NO_WRITE_PERMISSION /**< Attempt to modify read-only or constant data. */ - U_INPUT_TOO_LONG_ERROR +import ( + "errors" ) -/* - * Error codes in the range 0x10000 0x10100 are reserved for Transliterator. 
- */ -const ( - U_BAD_VARIABLE_DEFINITION UErrorCode = iota + 0x10000 /**< Missing '$' or duplicate variable name */ - U_MALFORMED_RULE /**< Elements of a rule are misplaced */ - U_MALFORMED_SET /**< A UnicodeSet pattern is invalid*/ - U_MALFORMED_SYMBOL_REFERENCE /**< UNUSED as of ICU 2.4 */ - U_MALFORMED_UNICODE_ESCAPE /**< A Unicode escape pattern is invalid*/ - U_MALFORMED_VARIABLE_DEFINITION /**< A variable definition is invalid */ - U_MALFORMED_VARIABLE_REFERENCE /**< A variable reference is invalid */ - U_MISMATCHED_SEGMENT_DELIMITERS /**< UNUSED as of ICU 2.4 */ - U_MISPLACED_ANCHOR_START /**< A start anchor appears at an illegal position */ - U_MISPLACED_CURSOR_OFFSET /**< A cursor offset occurs at an illegal position */ - U_MISPLACED_QUANTIFIER /**< A quantifier appears after a segment close delimiter */ - U_MISSING_OPERATOR /**< A rule contains no operator */ - U_MISSING_SEGMENT_CLOSE /**< UNUSED as of ICU 2.4 */ - U_MULTIPLE_ANTE_CONTEXTS /**< More than one ante context */ - U_MULTIPLE_CURSORS /**< More than one cursor */ - U_MULTIPLE_POST_CONTEXTS /**< More than one post context */ - U_TRAILING_BACKSLASH /**< A dangling backslash */ - U_UNDEFINED_SEGMENT_REFERENCE /**< A segment reference does not correspond to a defined segment */ - U_UNDEFINED_VARIABLE /**< A variable reference does not correspond to a defined variable */ - U_UNQUOTED_SPECIAL /**< A special character was not quoted or escaped */ - U_UNTERMINATED_QUOTE /**< A closing single quote is missing */ - U_RULE_MASK_ERROR /**< A rule is hidden by an earlier more general rule */ - U_MISPLACED_COMPOUND_FILTER /**< A compound filter is in an invalid location */ - U_MULTIPLE_COMPOUND_FILTERS /**< More than one compound filter */ - U_INVALID_RBT_SYNTAX /**< A "::id" rule was passed to the RuleBasedTransliterator parser */ - U_INVALID_PROPERTY_PATTERN /**< UNUSED as of ICU 2.4 */ - U_MALFORMED_PRAGMA /**< A 'use' pragma is invalid */ - U_UNCLOSED_SEGMENT /**< A closing ')' is missing */ - 
U_ILLEGAL_CHAR_IN_SEGMENT /**< UNUSED as of ICU 2.4 */ - U_VARIABLE_RANGE_EXHAUSTED /**< Too many stand-ins generated for the given variable range */ - U_VARIABLE_RANGE_OVERLAP /**< The variable range overlaps characters used in rules */ - U_ILLEGAL_CHARACTER /**< A special character is outside its allowed context */ - U_INTERNAL_TRANSLITERATOR_ERROR /**< Internal transliterator system error */ - U_INVALID_ID /**< A "::id" rule specifies an unknown transliterator */ - U_INVALID_FUNCTION /**< A "&fn()" rule specifies an unknown transliterator */ -) +type UErrorCode int32 -/* - * Error codes in the range 0x10200 0x102ff are reserved for BreakIterator. - */ -const ( - U_BRK_INTERNAL_ERROR UErrorCode = iota + 0x10200 /**< An internal error (bug) was detected. */ - U_BRK_HEX_DIGITS_EXPECTED /**< Hex digits expected as part of a escaped char in a rule. */ - U_BRK_SEMICOLON_EXPECTED /**< Missing ';' at the end of a RBBI rule. */ - U_BRK_RULE_SYNTAX /**< Syntax error in RBBI rule. */ - U_BRK_UNCLOSED_SET /**< UnicodeSet writing an RBBI rule missing a closing ']'. */ - U_BRK_ASSIGN_ERROR /**< Syntax error in RBBI rule assignment statement. */ - U_BRK_VARIABLE_REDFINITION /**< RBBI rule $Variable redefined. */ - U_BRK_MISMATCHED_PAREN /**< Mis-matched parentheses in an RBBI rule. */ - U_BRK_NEW_LINE_IN_QUOTED_STRING /**< Missing closing quote in an RBBI rule. */ - U_BRK_UNDEFINED_VARIABLE /**< Use of an undefined $Variable in an RBBI rule. */ - U_BRK_INIT_ERROR /**< Initialization failure. Probable missing ICU Data. */ - U_BRK_RULE_EMPTY_SET /**< Rule contains an empty Unicode Set. */ - U_BRK_UNRECOGNIZED_OPTION /**< !!option in RBBI rules not recognized. 
*/ - U_BRK_MALFORMED_RULE_TAG /**< The {nnn} tag on a rule is malformed */ -) +var IllegalArgumentError = errors.New("illegal argument") +var UnsupportedError = errors.New("unsupported") type URegexCompileErrorCode int32 @@ -153,7 +61,3 @@ const ( U_REGEX_STACK_OVERFLOW URegexMatchErrorCode = iota /**< Regular expression backtrack stack overflow. */ U_REGEX_TIME_OUT /**< Maximum allowed match time exceeded */ ) - -func (e UErrorCode) Error() string { - return fmt.Sprintf("UErrorCode: %d", e) -} diff --git a/go/mysql/icuregex/internal/ulayout/ulayout.go b/go/mysql/icuregex/internal/ulayout/ulayout.go index 744c9727461..5e86d508895 100644 --- a/go/mysql/icuregex/internal/ulayout/ulayout.go +++ b/go/mysql/icuregex/internal/ulayout/ulayout.go @@ -79,7 +79,11 @@ func loadLayouts() { func readData(bytes *udata.Bytes) error { err := bytes.ReadHeader(func(info *udata.DataInfo) bool { - return info.FormatVersion[0] == 1 + return info.DataFormat[0] == 0x4c && + info.DataFormat[1] == 0x61 && + info.DataFormat[2] == 0x79 && + info.DataFormat[3] == 0x6f && + info.FormatVersion[0] == 1 }) if err != nil { return err diff --git a/go/mysql/icuregex/internal/unames/unames.go b/go/mysql/icuregex/internal/unames/unames.go index dad3f5a0bd3..f015a77485e 100644 --- a/go/mysql/icuregex/internal/unames/unames.go +++ b/go/mysql/icuregex/internal/unames/unames.go @@ -23,7 +23,6 @@ package unames import ( "bytes" - _ "embed" "math" "strconv" "strings" @@ -38,20 +37,18 @@ var charNamesOnce sync.Once var charNames *UCharNames func loadCharNames() { - validCharNames := func(info *udata.DataInfo) bool { - return info.Size >= 20 && - info.IsBigEndian == 0 && - info.CharsetFamily == 0 && - info.DataFormat[0] == 0x75 && /* dataFormat="unam" */ - info.DataFormat[1] == 0x6e && - info.DataFormat[2] == 0x61 && - info.DataFormat[3] == 0x6d && - info.FormatVersion[0] == 1 - } - charNamesOnce.Do(func() { b := udata.NewBytes(icudata.UNames) - if err := b.ReadHeader(validCharNames); err != nil { + if err := 
b.ReadHeader(func(info *udata.DataInfo) bool { + return info.Size >= 20 && + info.IsBigEndian == 0 && + info.CharsetFamily == 0 && + info.DataFormat[0] == 0x75 && /* dataFormat="unam" */ + info.DataFormat[1] == 0x6e && + info.DataFormat[2] == 0x61 && + info.DataFormat[3] == 0x6d && + info.FormatVersion[0] == 1 + }); err != nil { panic(err) } charNames = (*UCharNames)(b.Pointer()) diff --git a/go/mysql/icuregex/internal/uprops/properties.go b/go/mysql/icuregex/internal/uprops/properties.go new file mode 100644 index 00000000000..6f55575cc68 --- /dev/null +++ b/go/mysql/icuregex/internal/uprops/properties.go @@ -0,0 +1,476 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package uprops + +import ( + "strconv" + "strings" + "sync" + + "vitess.io/vitess/go/mysql/icuregex/internal/normalizer" + "vitess.io/vitess/go/mysql/icuregex/internal/pattern" + "vitess.io/vitess/go/mysql/icuregex/internal/ubidi" + "vitess.io/vitess/go/mysql/icuregex/internal/ucase" + "vitess.io/vitess/go/mysql/icuregex/internal/uchar" + "vitess.io/vitess/go/mysql/icuregex/internal/uerror" + "vitess.io/vitess/go/mysql/icuregex/internal/ulayout" + "vitess.io/vitess/go/mysql/icuregex/internal/unames" + "vitess.io/vitess/go/mysql/icuregex/internal/uset" + "vitess.io/vitess/go/mysql/icuregex/internal/utrie" +) + +var inclusionsMu sync.Mutex +var inclusionsForSource = make(map[PropertySource]*uset.UnicodeSet) +var inclusionsForProperty = make(map[Property]*uset.UnicodeSet) + +func GetInclusionsForBinaryProperty(prop Property) (*uset.UnicodeSet, error) { + inclusionsMu.Lock() + defer inclusionsMu.Unlock() + return getInclusionsForBinaryProperty(prop) +} + +func getInclusionsForSource(src PropertySource) (*uset.UnicodeSet, error) { + if inc, ok := inclusionsForSource[src]; ok { + return inc, nil + } + + u := uset.New() + + switch src { + case UPROPS_SRC_CHAR: + uchar.AddPropertyStarts(u) + case UPROPS_SRC_PROPSVEC: + uchar.VecAddPropertyStarts(u) + case UPROPS_SRC_CHAR_AND_PROPSVEC: + uchar.AddPropertyStarts(u) + uchar.VecAddPropertyStarts(u) + case UPROPS_SRC_CASE_AND_NORM: + normalizer.Nfc().AddPropertyStarts(u) + ucase.AddPropertyStarts(u) + case UPROPS_SRC_NFC: + normalizer.Nfc().AddPropertyStarts(u) + case UPROPS_SRC_NFKC: + normalizer.Nfkc().AddPropertyStarts(u) + case UPROPS_SRC_NFKC_CF: + return nil, uerror.UnsupportedError + case UPROPS_SRC_NFC_CANON_ITER: + return nil, uerror.UnsupportedError + case UPROPS_SRC_CASE: + ucase.AddPropertyStarts(u) + case UPROPS_SRC_BIDI: + ubidi.AddPropertyStarts(u) + case UPROPS_SRC_INPC, UPROPS_SRC_INSC, UPROPS_SRC_VO: + AddULayoutPropertyStarts(src, u) + default: + return nil, uerror.UnsupportedError + } + + 
inclusionsForSource[src] = u + return u, nil +} + +func getInclusionsForProperty(prop Property) (*uset.UnicodeSet, error) { + if UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT { + return getInclusionsForIntProperty(prop) + } + return getInclusionsForSource(prop.Source()) +} + +func GetInclusionsForProperty(prop Property) (*uset.UnicodeSet, error) { + inclusionsMu.Lock() + defer inclusionsMu.Unlock() + return getInclusionsForProperty(prop) +} + +func getInclusionsForBinaryProperty(prop Property) (*uset.UnicodeSet, error) { + if inc, ok := inclusionsForProperty[prop]; ok { + return inc, nil + } + + incl, err := getInclusionsForProperty(prop) + if err != nil { + return nil, err + } + set := uset.New() + + numRanges := incl.RangeCount() + startHasProperty := rune(-1) + + for i := 0; i < numRanges; i++ { + rangeEnd := incl.RangeEnd(i) + for c := incl.RangeStart(i); c <= rangeEnd; c++ { + if HasBinaryProperty(c, prop) { + if startHasProperty < 0 { + startHasProperty = c + } + } else if startHasProperty >= 0 { + set.AddRuneRange(startHasProperty, c-1) + startHasProperty = -1 + } + } + } + if startHasProperty >= 0 { + set.AddRuneRange(startHasProperty, uset.MAX_VALUE) + } + + inclusionsForProperty[prop] = set + return set, nil +} + +func getInclusionsForIntProperty(prop Property) (*uset.UnicodeSet, error) { + if inc, ok := inclusionsForProperty[prop]; ok { + return inc, nil + } + + src := prop.Source() + incl, err := getInclusionsForSource(src) + if err != nil { + return nil, err + } + + intPropIncl := uset.New() + intPropIncl.AddRune(0) + + numRanges := incl.RangeCount() + prevValue := int32(0) + + for i := 0; i < numRanges; i++ { + rangeEnd := incl.RangeEnd(i) + for c := incl.RangeStart(i); c <= rangeEnd; c++ { + value := GetIntPropertyValue(c, prop) + if value != prevValue { + intPropIncl.AddRune(c) + prevValue = value + } + } + } + + inclusionsForProperty[prop] = intPropIncl + return intPropIncl, nil +} + +func ApplyIntPropertyValue(u *uset.UnicodeSet, prop 
Property, value int32) error { + switch { + case prop == UCHAR_GENERAL_CATEGORY_MASK: + inclusions, err := GetInclusionsForProperty(prop) + if err != nil { + return err + } + u.ApplyFilter(inclusions, func(ch rune) bool { + return (U_MASK(uchar.CharType(ch)) & uint32(value)) != 0 + }) + case prop == UCHAR_SCRIPT_EXTENSIONS: + inclusions, err := GetInclusionsForProperty(prop) + if err != nil { + return err + } + u.ApplyFilter(inclusions, func(ch rune) bool { + return UScriptHasScript(ch, UScriptCode(value)) + }) + case 0 <= prop && prop < UCHAR_BINARY_LIMIT: + if value == 0 || value == 1 { + set, err := GetInclusionsForBinaryProperty(prop) + if err != nil { + return err + } + u.CopyFrom(set) + if value == 0 { + u.Complement() + } + } else { + u.Clear() + } + + case UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT: + inclusions, err := GetInclusionsForProperty(prop) + if err != nil { + return err + } + u.ApplyFilter(inclusions, func(ch rune) bool { + return GetIntPropertyValue(ch, prop) == value + }) + default: + return uerror.UnsupportedError + } + return nil +} + +func mungeCharName(charname string) string { + out := make([]byte, 0, len(charname)) + for _, ch := range []byte(charname) { + j := len(out) + if ch == ' ' && (j == 0 || out[j-1] == ' ') { + continue + } + out = append(out, ch) + } + return string(out) +} + +func ApplyPropertyPattern(u *uset.UnicodeSet, pat string) error { + if len(pat) < 5 { + return uerror.IllegalArgumentError + } + + var posix, isName, invert bool + + if isPOSIXOpen(pat) { + posix = true + pat = pattern.SkipWhitespace(pat[2:]) + if len(pat) > 0 && pat[0] == '^' { + pat = pat[1:] + invert = true + } + } else if isPerlOpen(pat) || isNameOpen(pat) { + c := pat[1] + invert = c == 'P' + isName = c == 'N' + pat = pattern.SkipWhitespace(pat[2:]) + if len(pat) == 0 || pat[0] != '{' { + return uerror.IllegalArgumentError + } + pat = pat[1:] + } else { + return uerror.IllegalArgumentError + } + + var close int + if posix { + close = 
strings.Index(pat, ":]") + } else { + close = strings.IndexByte(pat, '}') + } + if close < 0 { + return uerror.IllegalArgumentError + } + + equals := strings.IndexByte(pat, '=') + var propName, valueName string + if equals >= 0 && equals < close && !isName { + propName = pat[:equals] + valueName = pat[equals+1 : close] + } else { + propName = pat[:close] + if isName { + valueName = propName + propName = "na" + } + } + + if err := ApplyPropertyAlias(u, propName, valueName); err != nil { + return err + } + if invert { + u.Complement() + } + return nil +} + +func isPOSIXOpen(pattern string) bool { + return pattern[0] == '[' && pattern[1] == ':' +} + +func isNameOpen(pattern string) bool { + return pattern[0] == '\\' && pattern[1] == 'N' +} + +func isPerlOpen(pattern string) bool { + return pattern[0] == '\\' && (pattern[1] == 'p' || pattern[1] == 'P') +} + +func ApplyPropertyAlias(u *uset.UnicodeSet, prop, value string) error { + var p Property + var v int32 + var invert bool + + if len(value) > 0 { + p = GetPropertyEnum(prop) + if p == -1 { + return uerror.IllegalArgumentError + } + if p == UCHAR_GENERAL_CATEGORY { + p = UCHAR_GENERAL_CATEGORY_MASK + } + + if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) || + (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) || + (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT) { + v = GetPropertyValueEnum(p, value) + if v == -1 { + // Handle numeric CCC + if p == UCHAR_CANONICAL_COMBINING_CLASS || + p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS || + p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS { + val, err := strconv.ParseUint(value, 10, 8) + if err != nil { + return uerror.IllegalArgumentError + } + v = int32(val) + } else { + return uerror.IllegalArgumentError + } + } + } else { + switch p { + case UCHAR_NUMERIC_VALUE: + val, err := strconv.ParseFloat(value, 64) + if err != nil { + return uerror.IllegalArgumentError + } + incl, err := GetInclusionsForProperty(p) + if err != nil { + return err + } + u.ApplyFilter(incl, func(ch rune) 
bool { + return uchar.NumericValue(ch) == val + }) + return nil + case UCHAR_NAME: + // Must munge name, since u_charFromName() does not do + // 'loose' matching. + charName := mungeCharName(value) + ch := unames.CharForName(unames.U_EXTENDED_CHAR_NAME, charName) + if ch < 0 { + return uerror.IllegalArgumentError + } + u.Clear() + u.AddRune(ch) + return nil + case UCHAR_AGE: + // Must munge name, since u_versionFromString() does not do + // 'loose' matching. + charName := mungeCharName(value) + version := uchar.VersionFromString(charName) + incl, err := GetInclusionsForProperty(p) + if err != nil { + return err + } + u.ApplyFilter(incl, func(ch rune) bool { + return uchar.CharAge(ch) == version + }) + return nil + case UCHAR_SCRIPT_EXTENSIONS: + v = GetPropertyValueEnum(UCHAR_SCRIPT, value) + if v == -1 { + return uerror.IllegalArgumentError + } + default: + // p is a non-binary, non-enumerated property that we + // don't support (yet). + return uerror.IllegalArgumentError + } + } + } else { + // value is empty. Interpret as General Category, Script, or + // Binary property. 
+ p = UCHAR_GENERAL_CATEGORY_MASK + v = GetPropertyValueEnum(p, prop) + if v == -1 { + p = UCHAR_SCRIPT + v = GetPropertyValueEnum(p, prop) + if v == -1 { + p = GetPropertyEnum(prop) + if p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT { + v = 1 + } else if 0 == ComparePropertyNames("ANY", prop) { + u.Clear() + u.AddRuneRange(uset.MIN_VALUE, uset.MAX_VALUE) + return nil + } else if 0 == ComparePropertyNames("ASCII", prop) { + u.Clear() + u.AddRuneRange(0, 0x7F) + return nil + } else if 0 == ComparePropertyNames("Assigned", prop) { + // [:Assigned:]=[:^Cn:] + p = UCHAR_GENERAL_CATEGORY_MASK + v = int32(uchar.U_GC_CN_MASK) + invert = true + } else { + return uerror.IllegalArgumentError + } + } + } + } + + err := ApplyIntPropertyValue(u, p, v) + if err != nil { + return err + } + if invert { + u.Complement() + } + return nil +} + +func AddULayoutPropertyStarts(src PropertySource, u *uset.UnicodeSet) { + var trie *utrie.UcpTrie + switch src { + case UPROPS_SRC_INPC: + trie = ulayout.InpcTrie() + case UPROPS_SRC_INSC: + trie = ulayout.InscTrie() + case UPROPS_SRC_VO: + trie = ulayout.VoTrie() + default: + panic("unreachable") + } + + // Add the start code point of each same-value range of the trie. 
+ var start, end rune + for { + end, _ = trie.GetRange(start, utrie.UCPMAP_RANGE_NORMAL, 0, nil) + if end < 0 { + break + } + u.AddRune(start) + start = end + 1 + } +} + +func AddCategory(u *uset.UnicodeSet, mask uint32) error { + set := uset.New() + err := ApplyIntPropertyValue(set, UCHAR_GENERAL_CATEGORY_MASK, int32(mask)) + if err != nil { + return err + } + u.AddAll(set) + return nil +} + +func NewUnicodeSetFomPattern(pattern string, flags uset.USet) (*uset.UnicodeSet, error) { + u := uset.New() + if err := ApplyPropertyPattern(u, pattern); err != nil { + return nil, err + } + if flags&uset.USET_CASE_INSENSITIVE != 0 { + u.CloseOver(uset.USET_CASE_INSENSITIVE) + } + return u, nil +} + +func MustNewUnicodeSetFomPattern(pattern string, flags uset.USet) *uset.UnicodeSet { + u, err := NewUnicodeSetFomPattern(pattern, flags) + if err != nil { + panic(err) + } + return u +} diff --git a/go/mysql/icuregex/internal/uprops/uprops.go b/go/mysql/icuregex/internal/uprops/uprops.go index 2ebf957f610..f8683b1b269 100644 --- a/go/mysql/icuregex/internal/uprops/uprops.go +++ b/go/mysql/icuregex/internal/uprops/uprops.go @@ -45,7 +45,11 @@ func readData(bytes *udata.Bytes) error { ) err := bytes.ReadHeader(func(info *udata.DataInfo) bool { - return info.FormatVersion[0] == 2 + return info.DataFormat[0] == 0x70 && + info.DataFormat[1] == 0x6e && + info.DataFormat[2] == 0x61 && + info.DataFormat[3] == 0x6d && + info.FormatVersion[0] == 2 }) if err != nil { return err @@ -234,6 +238,9 @@ func GetIntPropertyValue(c rune, which Property) int32 { if which < UCHAR_INT_START { if UCHAR_BINARY_START <= which && which < UCHAR_BINARY_LIMIT { prop := binProps[which] + if prop.contains == nil { + return 0 + } if prop.contains(prop, c, which) { return 1 } diff --git a/go/mysql/icuregex/internal/uprops/uprops_binary.go b/go/mysql/icuregex/internal/uprops/uprops_binary.go index 19e3141cbc5..0d6981abde7 100644 --- a/go/mysql/icuregex/internal/uprops/uprops_binary.go +++ 
b/go/mysql/icuregex/internal/uprops/uprops_binary.go @@ -23,7 +23,9 @@ package uprops import ( "golang.org/x/exp/constraints" + "golang.org/x/exp/slices" + "vitess.io/vitess/go/mysql/icuregex/internal/normalizer" "vitess.io/vitess/go/mysql/icuregex/internal/ubidi" "vitess.io/vitess/go/mysql/icuregex/internal/ucase" "vitess.io/vitess/go/mysql/icuregex/internal/uchar" @@ -51,6 +53,8 @@ var binProps = [UCHAR_BINARY_LIMIT]*BinaryProperty{ * * Properties with mask==0 are handled in code. * For them, column is the UPropertySource value. + * + * See also https://unicode-org.github.io/icu/userguide/strings/properties.html */ {1, U_MASK(UPROPS_ALPHABETIC), defaultContains}, {1, U_MASK(UPROPS_ASCII_HEX_DIGIT), defaultContains}, @@ -89,11 +93,11 @@ var binProps = [UCHAR_BINARY_LIMIT]*BinaryProperty{ {UPROPS_SRC_CASE, 0, caseBinaryPropertyContains}, // UCHAR_CASE_SENSITIVE {1, U_MASK(UPROPS_S_TERM), defaultContains}, {1, U_MASK(UPROPS_VARIATION_SELECTOR), defaultContains}, - {UPROPS_SRC_NFC, 0, isNormInert}, // UCHAR_NFD_INERT - {UPROPS_SRC_NFKC, 0, isNormInert}, // UCHAR_NFKD_INERT - {UPROPS_SRC_NFC, 0, isNormInert}, // UCHAR_NFC_INERT - {UPROPS_SRC_NFKC, 0, isNormInert}, // UCHAR_NFKC_INERT - {UPROPS_SRC_NFC_CANON_ITER, 0, isCanonSegmentStarter}, + {UPROPS_SRC_NFC, 0, isNormInert}, // UCHAR_NFD_INERT + {UPROPS_SRC_NFKC, 0, isNormInert}, // UCHAR_NFKD_INERT + {UPROPS_SRC_NFC, 0, isNormInert}, // UCHAR_NFC_INERT + {UPROPS_SRC_NFKC, 0, isNormInert}, // UCHAR_NFKC_INERT + {UPROPS_SRC_NFC_CANON_ITER, 0, nil}, // Segment_Starter is currently unsupported {1, U_MASK(UPROPS_PATTERN_SYNTAX), defaultContains}, {1, U_MASK(UPROPS_PATTERN_WHITE_SPACE), defaultContains}, {UPROPS_SRC_CHAR_AND_PROPSVEC, 0, isPOSIX_alnum}, @@ -108,7 +112,7 @@ var binProps = [UCHAR_BINARY_LIMIT]*BinaryProperty{ {UPROPS_SRC_CASE, 0, caseBinaryPropertyContains}, // UCHAR_CHANGES_WHEN_TITLECASED {UPROPS_SRC_CASE_AND_NORM, 0, changesWhenCasefolded}, {UPROPS_SRC_CASE, 0, caseBinaryPropertyContains}, // 
UCHAR_CHANGES_WHEN_CASEMAPPED - {UPROPS_SRC_NFKC_CF, 0, changesWhenNFKC_Casefolded}, + {UPROPS_SRC_NFKC_CF, 0, nil}, // Changes_When_NFKC_Casefolded is currently unsupported {2, U_MASK(UPROPS_2_EMOJI), defaultContains}, {2, U_MASK(UPROPS_2_EMOJI_PRESENTATION), defaultContains}, {2, U_MASK(UPROPS_2_EMOJI_MODIFIER), defaultContains}, @@ -131,12 +135,17 @@ func isRegionalIndicator(prop *BinaryProperty, c rune, which Property) bool { return 0x1F1E6 <= c && c <= 0x1F1FF } -func changesWhenNFKC_Casefolded(prop *BinaryProperty, c rune, which Property) bool { - panic("TODO") -} - func changesWhenCasefolded(prop *BinaryProperty, c rune, which Property) bool { - panic("TODO") + if c < 0 { + return false + } + + nfd := normalizer.Nfc().Decompose(c) + if nfd == nil { + nfd = []rune{c} + } + folded := ucase.FoldRunes(nfd) + return !slices.Equal(nfd, folded) } func isPOSIX_xdigit(prop *BinaryProperty, c rune, which Property) bool { @@ -159,16 +168,13 @@ func isPOSIX_alnum(prop *BinaryProperty, c rune, which Property) bool { return (uchar.GetUnicodeProperties(c, 1)&U_MASK(UPROPS_ALPHABETIC)) != 0 || uchar.IsDigit(c) } -func isCanonSegmentStarter(prop *BinaryProperty, c rune, which Property) bool { - panic("TODO") -} - func isJoinControl(prop *BinaryProperty, c rune, which Property) bool { return ubidi.IsJoinControl(c) } func hasFullCompositionExclusion(prop *BinaryProperty, c rune, which Property) bool { - panic("TODO") + impl := normalizer.Nfc() + return impl.IsCompNo(impl.GetNorm16(c)) } func caseBinaryPropertyContains(prop *BinaryProperty, c rune, which Property) bool { @@ -217,7 +223,8 @@ func HasBinaryPropertyUcase(c rune, which Property) bool { } func isNormInert(prop *BinaryProperty, c rune, which Property) bool { - panic("TODO") + mode := normalizer.UNormalizationMode(int32(which) - int32(UCHAR_NFD_INERT) + int32(normalizer.UNORM_NFD)) + return normalizer.IsInert(c, mode) } func HasBinaryProperty(c rune, which Property) bool { @@ -225,5 +232,8 @@ func HasBinaryProperty(c 
rune, which Property) bool { return false } prop := binProps[which] + if prop.contains == nil { + return false + } return prop.contains(prop, c, which) } diff --git a/go/mysql/icuregex/internal/uprops/uprops_int.go b/go/mysql/icuregex/internal/uprops/uprops_int.go index c8a6795749a..9c89e260d73 100644 --- a/go/mysql/icuregex/internal/uprops/uprops_int.go +++ b/go/mysql/icuregex/internal/uprops/uprops_int.go @@ -22,8 +22,9 @@ limitations under the License. package uprops import ( + "vitess.io/vitess/go/mysql/icuregex/internal/normalizer" "vitess.io/vitess/go/mysql/icuregex/internal/ubidi" - uchar2 "vitess.io/vitess/go/mysql/icuregex/internal/uchar" + "vitess.io/vitess/go/mysql/icuregex/internal/uchar" "vitess.io/vitess/go/mysql/icuregex/internal/ulayout" ) @@ -154,7 +155,7 @@ var intProps = [UCHAR_INT_LIMIT - UCHAR_INT_START]*IntProperty{ {UPROPS_SRC_NFC, 0, 0xff, getCombiningClass}, {2, UPROPS_DT_MASK, 0, defaultGetValue}, {0, UPROPS_EA_MASK, UPROPS_EA_SHIFT, defaultGetValue}, - {UPROPS_SRC_CHAR, 0, uchar2.U_CHAR_CATEGORY_COUNT - 1, getGeneralCategory}, + {UPROPS_SRC_CHAR, 0, uchar.U_CHAR_CATEGORY_COUNT - 1, getGeneralCategory}, {UPROPS_SRC_BIDI, 0, 0, getJoiningGroup}, {UPROPS_SRC_BIDI, 0, 0, getJoiningType}, {2, UPROPS_LB_MASK, UPROPS_LB_SHIFT, defaultGetValue}, @@ -197,15 +198,15 @@ func getBiDiPairedBracketType(prop *IntProperty, c rune, which Property) int32 { } func getTrailCombiningClass(prop *IntProperty, c rune, which Property) int32 { - panic("TODO") + return int32(normalizer.Nfc().GetFCD16(c) & 0xff) } func getLeadCombiningClass(prop *IntProperty, c rune, which Property) int32 { - panic("TODO") + return int32(normalizer.Nfc().GetFCD16(c) >> 8) } func getNormQuickCheck(prop *IntProperty, c rune, which Property) int32 { - panic("TODO") + return int32(normalizer.QuickCheck(c, normalizer.UNormalizationMode(int32(which)-int32(UCHAR_NFD_QUICK_CHECK)+int32(normalizer.UNORM_NFD)))) } /* @@ -231,7 +232,7 @@ var gcbToHst = []HangunSyllableType{ func 
getHangulSyllableType(prop *IntProperty, c rune, which Property) int32 { /* see comments on gcbToHst[] above */ - gcb := (int32(uchar2.GetUnicodeProperties(c, 2)) & UPROPS_GCB_MASK) >> UPROPS_GCB_SHIFT + gcb := (int32(uchar.GetUnicodeProperties(c, 2)) & UPROPS_GCB_MASK) >> UPROPS_GCB_SHIFT if gcb < int32(len(gcbToHst)) { return int32(gcbToHst[gcb]) @@ -245,7 +246,7 @@ func getScript(_ *IntProperty, c rune, _ Property) int32 { } func getNumericType(prop *IntProperty, c rune, which Property) int32 { - ntv := uchar2.NumericTypeValue(c) + ntv := uchar.NumericTypeValue(c) return int32(ntvGetType(ntv)) } @@ -258,15 +259,15 @@ func getJoiningGroup(prop *IntProperty, c rune, which Property) int32 { } func getGeneralCategory(prop *IntProperty, c rune, which Property) int32 { - return int32(uchar2.CharType(c)) + return int32(uchar.CharType(c)) } func getCombiningClass(prop *IntProperty, c rune, which Property) int32 { - panic("TODO") + return int32(normalizer.Nfc().CombiningClass(c)) } func defaultGetValue(prop *IntProperty, c rune, which Property) int32 { - return int32(uchar2.GetUnicodeProperties(c, int(prop.column))&prop.mask) >> prop.shift + return int32(uchar.GetUnicodeProperties(c, int(prop.column))&prop.mask) >> prop.shift } func getBiDiClass(prop *IntProperty, c rune, which Property) int32 { @@ -275,11 +276,11 @@ func getBiDiClass(prop *IntProperty, c rune, which Property) int32 { func ntvGetType(ntv uint16) NumericType { switch { - case ntv == uchar2.UPROPS_NTV_NONE: + case ntv == uchar.UPROPS_NTV_NONE: return U_NT_NONE - case ntv < uchar2.UPROPS_NTV_DIGIT_START: + case ntv < uchar.UPROPS_NTV_DIGIT_START: return U_NT_DECIMAL - case ntv < uchar2.UPROPS_NTV_NUMERIC_START: + case ntv < uchar.UPROPS_NTV_NUMERIC_START: return U_NT_DIGIT default: return U_NT_NUMERIC diff --git a/go/mysql/icuregex/internal/uprops/uscript.go b/go/mysql/icuregex/internal/uprops/uscript.go index 0b80d54a69f..98b3275dd1b 100644 --- a/go/mysql/icuregex/internal/uprops/uscript.go +++ 
b/go/mysql/icuregex/internal/uprops/uscript.go @@ -21,9 +21,7 @@ limitations under the License. package uprops -import ( - "vitess.io/vitess/go/mysql/icuregex/internal/uchar" -) +import "vitess.io/vitess/go/mysql/icuregex/internal/uchar" /** * Constants for ISO 15924 script codes. diff --git a/go/mysql/icuregex/internal/uset/close.go b/go/mysql/icuregex/internal/uset/close.go index 02e1d117b52..9b59fed8bf3 100644 --- a/go/mysql/icuregex/internal/uset/close.go +++ b/go/mysql/icuregex/internal/uset/close.go @@ -21,9 +21,7 @@ limitations under the License. package uset -import ( - "vitess.io/vitess/go/mysql/icuregex/internal/ucase" -) +import "vitess.io/vitess/go/mysql/icuregex/internal/ucase" type USet uint32 @@ -82,11 +80,11 @@ func (u *UnicodeSet) CloseOver(attribute USet) { } foldSet := u.Clone() - n := u.rangeCount() + n := u.RangeCount() for i := 0; i < n; i++ { - start := u.rangeStart(i) - end := u.rangeEnd(i) + start := u.RangeStart(i) + end := u.RangeEnd(i) // full case closure for cp := start; cp <= end; cp++ { diff --git a/go/mysql/icuregex/internal/uset/frozen.go b/go/mysql/icuregex/internal/uset/frozen.go index 308b8fb6aca..2b17ae904c8 100644 --- a/go/mysql/icuregex/internal/uset/frozen.go +++ b/go/mysql/icuregex/internal/uset/frozen.go @@ -1,3 +1,24 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package uset type frozen struct { diff --git a/go/mysql/icuregex/internal/uset/pattern.go b/go/mysql/icuregex/internal/uset/pattern.go index 468e439c06e..51463b10542 100644 --- a/go/mysql/icuregex/internal/uset/pattern.go +++ b/go/mysql/icuregex/internal/uset/pattern.go @@ -46,19 +46,19 @@ func (u *UnicodeSet) ToPattern(w *strings.Builder, escapeUnprintable bool) { // } // } - count := u.rangeCount() + count := u.RangeCount() // If the set contains at least 2 intervals and includes both // MIN_VALUE and MAX_VALUE, then the inverse representation will // be more economical. - if count > 1 && u.rangeStart(0) == MIN_VALUE && u.rangeEnd(count-1) == MAX_VALUE { + if count > 1 && u.RangeStart(0) == MIN_VALUE && u.RangeEnd(count-1) == MAX_VALUE { // Emit the inverse w.WriteByte('^') for i := 1; i < count; i++ { - start := u.rangeEnd(i-1) + 1 - end := u.rangeStart(i) - 1 + start := u.RangeEnd(i-1) + 1 + end := u.RangeStart(i) - 1 u.appendToPattern(w, start, escapeUnprintable) if start != end { if (start + 1) != end { @@ -70,8 +70,8 @@ func (u *UnicodeSet) ToPattern(w *strings.Builder, escapeUnprintable bool) { } else { // Default; emit the ranges as pairs for i := 0; i < count; i++ { - start := u.rangeStart(i) - end := u.rangeEnd(i) + start := u.RangeStart(i) + end := u.RangeEnd(i) u.appendToPattern(w, start, escapeUnprintable) if start != end { if (start + 1) != end { diff --git a/go/mysql/icuregex/internal/uset/properties.go b/go/mysql/icuregex/internal/uset/properties.go deleted file mode 100644 index f9403f584ea..00000000000 --- 
a/go/mysql/icuregex/internal/uset/properties.go +++ /dev/null @@ -1,417 +0,0 @@ -/* -© 2016 and later: Unicode, Inc. and others. -Copyright (C) 2004-2015, International Business Machines Corporation and others. -Copyright 2023 The Vitess Authors. - -This file contains code derived from the Unicode Project's ICU library. -License & terms of use for the original code: http://www.unicode.org/copyright.html - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package uset - -import ( - "fmt" - "strconv" - "strings" - "sync" - - "vitess.io/vitess/go/mysql/icuregex/internal/pattern" - "vitess.io/vitess/go/mysql/icuregex/internal/ubidi" - "vitess.io/vitess/go/mysql/icuregex/internal/ucase" - uchar2 "vitess.io/vitess/go/mysql/icuregex/internal/uchar" - "vitess.io/vitess/go/mysql/icuregex/internal/uerror" - "vitess.io/vitess/go/mysql/icuregex/internal/ulayout" - "vitess.io/vitess/go/mysql/icuregex/internal/unames" - uprops2 "vitess.io/vitess/go/mysql/icuregex/internal/uprops" - "vitess.io/vitess/go/mysql/icuregex/internal/utrie" -) - -var inclusionsMu sync.Mutex -var inclusionsForSource = make(map[uprops2.PropertySource]*UnicodeSet) -var inclusionsForProperty = make(map[uprops2.Property]*UnicodeSet) - -func GetInclusionsForBinaryProperty(prop uprops2.Property) *UnicodeSet { - inclusionsMu.Lock() - defer inclusionsMu.Unlock() - return getInclusionsForBinaryProperty(prop) -} - -func getInclusionsForSource(src uprops2.PropertySource) *UnicodeSet { - if inc, ok := inclusionsForSource[src]; ok { - 
return inc - } - - u := New() - - switch src { - case uprops2.UPROPS_SRC_CHAR: - uchar2.AddPropertyStarts(u) - case uprops2.UPROPS_SRC_PROPSVEC: - uchar2.VecAddPropertyStarts(u) - case uprops2.UPROPS_SRC_CHAR_AND_PROPSVEC: - uchar2.AddPropertyStarts(u) - uchar2.VecAddPropertyStarts(u) - case uprops2.UPROPS_SRC_CASE_AND_NORM: - panic("TODO") - case uprops2.UPROPS_SRC_NFC: - panic("TODO") - case uprops2.UPROPS_SRC_NFKC: - panic("TODO") - case uprops2.UPROPS_SRC_NFKC_CF: - panic("TODO") - case uprops2.UPROPS_SRC_NFC_CANON_ITER: - panic("TODO") - case uprops2.UPROPS_SRC_CASE: - ucase.AddPropertyStarts(u) - case uprops2.UPROPS_SRC_BIDI: - ubidi.AddPropertyStarts(u) - case uprops2.UPROPS_SRC_INPC, uprops2.UPROPS_SRC_INSC, uprops2.UPROPS_SRC_VO: - AddULayoutPropertyStarts(src, u) - default: - panic(fmt.Sprintf("unsupported property source: %v", src)) - } - - inclusionsForSource[src] = u - return u -} - -func getInclusionsForProperty(prop uprops2.Property) *UnicodeSet { - if uprops2.UCHAR_INT_START <= prop && prop < uprops2.UCHAR_INT_LIMIT { - return getInclusionsForIntProperty(prop) - } - return getInclusionsForSource(prop.Source()) -} - -func GetInclusionsForProperty(prop uprops2.Property) *UnicodeSet { - inclusionsMu.Lock() - defer inclusionsMu.Unlock() - return getInclusionsForProperty(prop) -} - -func getInclusionsForBinaryProperty(prop uprops2.Property) *UnicodeSet { - if inc, ok := inclusionsForProperty[prop]; ok { - return inc - } - - incl := getInclusionsForProperty(prop) - set := New() - - numRanges := incl.rangeCount() - startHasProperty := rune(-1) - - for i := 0; i < numRanges; i++ { - rangeEnd := incl.rangeEnd(i) - for c := incl.rangeStart(i); c <= rangeEnd; c++ { - if uprops2.HasBinaryProperty(c, prop) { - if startHasProperty < 0 { - startHasProperty = c - } - } else if startHasProperty >= 0 { - set.AddRuneRange(startHasProperty, c-1) - startHasProperty = -1 - } - } - } - if startHasProperty >= 0 { - set.AddRuneRange(startHasProperty, MAX_VALUE) - } - - 
inclusionsForProperty[prop] = set - return set -} - -func getInclusionsForIntProperty(prop uprops2.Property) *UnicodeSet { - if inc, ok := inclusionsForProperty[prop]; ok { - return inc - } - - src := prop.Source() - incl := getInclusionsForSource(src) - - intPropIncl := New() - intPropIncl.AddRune(0) - - numRanges := incl.rangeCount() - prevValue := int32(0) - - for i := 0; i < numRanges; i++ { - rangeEnd := incl.rangeEnd(i) - for c := incl.rangeStart(i); c <= rangeEnd; c++ { - value := uprops2.GetIntPropertyValue(c, prop) - if value != prevValue { - intPropIncl.AddRune(c) - prevValue = value - } - } - } - - inclusionsForProperty[prop] = intPropIncl - return intPropIncl -} - -func (u *UnicodeSet) ApplyIntPropertyValue(prop uprops2.Property, value int32) { - switch { - case prop == uprops2.UCHAR_GENERAL_CATEGORY_MASK: - inclusions := GetInclusionsForProperty(prop) - u.applyFilter(inclusions, func(ch rune) bool { - return (uprops2.U_MASK(uchar2.CharType(ch)) & uint32(value)) != 0 - }) - case prop == uprops2.UCHAR_SCRIPT_EXTENSIONS: - inclusions := GetInclusionsForProperty(prop) - u.applyFilter(inclusions, func(ch rune) bool { - return uprops2.UScriptHasScript(ch, uprops2.UScriptCode(value)) - }) - case 0 <= prop && prop < uprops2.UCHAR_BINARY_LIMIT: - if value == 0 || value == 1 { - set := GetInclusionsForBinaryProperty(prop) - u.CopyFrom(set) - if value == 0 { - u.Complement() - } - } else { - u.Clear() - } - - case uprops2.UCHAR_INT_START <= prop && prop < uprops2.UCHAR_INT_LIMIT: - inclusions := GetInclusionsForProperty(prop) - u.applyFilter(inclusions, func(ch rune) bool { - return uprops2.GetIntPropertyValue(ch, prop) == value - }) - - default: - panic("invalid Property type") - } -} - -func mungeCharName(charname string) string { - out := make([]byte, 0, len(charname)) - for _, ch := range []byte(charname) { - j := len(out) - if ch == ' ' && (j == 0 || out[j-1] == ' ') { - continue - } - out = append(out, ch) - } - return string(out) -} - -func (u *UnicodeSet) 
ApplyPropertyPattern(pat string) error { - if len(pat) < 5 { - return uerror.U_ILLEGAL_ARGUMENT_ERROR - } - - var posix, isName, invert bool - - if isPOSIXOpen(pat) { - posix = true - pat = pattern.SkipWhitespace(pat[2:]) - if len(pat) > 0 && pat[0] == '^' { - pat = pat[1:] - invert = true - } - } else if isPerlOpen(pat) || isNameOpen(pat) { - c := pat[1] - invert = c == 'P' - isName = c == 'N' - pat = pattern.SkipWhitespace(pat[2:]) - if len(pat) == 0 || pat[0] != '{' { - return uerror.U_ILLEGAL_ARGUMENT_ERROR - } - pat = pat[1:] - } else { - return uerror.U_ILLEGAL_ARGUMENT_ERROR - } - - var close int - if posix { - close = strings.Index(pat, ":]") - } else { - close = strings.IndexByte(pat, '}') - } - if close < 0 { - return uerror.U_ILLEGAL_ARGUMENT_ERROR - } - - equals := strings.IndexByte(pat, '=') - var propName, valueName string - if equals >= 0 && equals < close && !isName { - propName = pat[:equals] - valueName = pat[equals+1 : close] - } else { - propName = pat[:close] - if isName { - valueName = propName - propName = "na" - } - } - - if err := u.ApplyPropertyAlias(propName, valueName); err != nil { - return err - } - if invert { - u.Complement() - } - return nil -} - -func isPOSIXOpen(pattern string) bool { - return pattern[0] == '[' && pattern[1] == ':' -} - -func isNameOpen(pattern string) bool { - return pattern[0] == '\\' && pattern[1] == 'N' -} - -func isPerlOpen(pattern string) bool { - return pattern[0] == '\\' && (pattern[1] == 'p' || pattern[1] == 'P') -} - -func (u *UnicodeSet) ApplyPropertyAlias(prop, value string) error { - var p uprops2.Property - var v int32 - var invert bool - - if len(value) > 0 { - p = uprops2.GetPropertyEnum(prop) - if p == -1 { - return uerror.U_ILLEGAL_ARGUMENT_ERROR - } - if p == uprops2.UCHAR_GENERAL_CATEGORY { - p = uprops2.UCHAR_GENERAL_CATEGORY_MASK - } - - if (p >= uprops2.UCHAR_BINARY_START && p < uprops2.UCHAR_BINARY_LIMIT) || - (p >= uprops2.UCHAR_INT_START && p < uprops2.UCHAR_INT_LIMIT) || - (p >= 
uprops2.UCHAR_MASK_START && p < uprops2.UCHAR_MASK_LIMIT) { - v = uprops2.GetPropertyValueEnum(p, value) - if v == -1 { - // Handle numeric CCC - if p == uprops2.UCHAR_CANONICAL_COMBINING_CLASS || - p == uprops2.UCHAR_TRAIL_CANONICAL_COMBINING_CLASS || - p == uprops2.UCHAR_LEAD_CANONICAL_COMBINING_CLASS { - val, err := strconv.ParseUint(value, 10, 8) - if err != nil { - return uerror.U_ILLEGAL_ARGUMENT_ERROR - } - v = int32(val) - } else { - return uerror.U_ILLEGAL_ARGUMENT_ERROR - } - } - } else { - switch p { - case uprops2.UCHAR_NUMERIC_VALUE: - val, err := strconv.ParseFloat(value, 64) - if err != nil { - return uerror.U_ILLEGAL_ARGUMENT_ERROR - } - u.applyFilter(GetInclusionsForProperty(p), func(ch rune) bool { - return uchar2.NumericValue(ch) == val - }) - return nil - case uprops2.UCHAR_NAME: - // Must munge name, since u_charFromName() does not do - // 'loose' matching. - charName := mungeCharName(value) - ch := unames.CharForName(unames.U_EXTENDED_CHAR_NAME, charName) - if ch < 0 { - return uerror.U_ILLEGAL_ARGUMENT_ERROR - } - u.Clear() - u.AddRune(ch) - return nil - case uprops2.UCHAR_AGE: - // Must munge name, since u_versionFromString() does not do - // 'loose' matching. - charName := mungeCharName(value) - version := uchar2.VersionFromString(charName) - u.applyFilter(GetInclusionsForProperty(p), func(ch rune) bool { - return uchar2.CharAge(ch) == version - }) - return nil - case uprops2.UCHAR_SCRIPT_EXTENSIONS: - v = uprops2.GetPropertyValueEnum(uprops2.UCHAR_SCRIPT, value) - if v == -1 { - return uerror.U_ILLEGAL_ARGUMENT_ERROR - } - default: - // p is a non-binary, non-enumerated property that we - // don't support (yet). - return uerror.U_ILLEGAL_ARGUMENT_ERROR - } - } - } else { - // value is empty. Interpret as General Category, Script, or - // Binary property. 
- p = uprops2.UCHAR_GENERAL_CATEGORY_MASK - v = uprops2.GetPropertyValueEnum(p, prop) - if v == -1 { - p = uprops2.UCHAR_SCRIPT - v = uprops2.GetPropertyValueEnum(p, prop) - if v == -1 { - p = uprops2.GetPropertyEnum(prop) - if p >= uprops2.UCHAR_BINARY_START && p < uprops2.UCHAR_BINARY_LIMIT { - v = 1 - } else if 0 == uprops2.ComparePropertyNames("ANY", prop) { - u.Clear() - u.AddRuneRange(MIN_VALUE, MAX_VALUE) - return nil - } else if 0 == uprops2.ComparePropertyNames("ASCII", prop) { - u.Clear() - u.AddRuneRange(0, 0x7F) - return nil - } else if 0 == uprops2.ComparePropertyNames("Assigned", prop) { - // [:Assigned:]=[:^Cn:] - p = uprops2.UCHAR_GENERAL_CATEGORY_MASK - v = int32(uchar2.U_GC_CN_MASK) - invert = true - } else { - return uerror.U_ILLEGAL_ARGUMENT_ERROR - } - } - } - } - - u.ApplyIntPropertyValue(p, v) - if invert { - u.Complement() - } - return nil -} - -func AddULayoutPropertyStarts(src uprops2.PropertySource, u *UnicodeSet) { - var trie *utrie.UcpTrie - switch src { - case uprops2.UPROPS_SRC_INPC: - trie = ulayout.InpcTrie() - case uprops2.UPROPS_SRC_INSC: - trie = ulayout.InscTrie() - case uprops2.UPROPS_SRC_VO: - trie = ulayout.VoTrie() - default: - panic("unreachable") - } - - // Add the start code point of each same-value range of the trie. - var start, end rune - for { - end, _ = trie.GetRange(start, utrie.UCPMAP_RANGE_NORMAL, 0, nil) - if end < 0 { - break - } - u.AddRune(start) - start = end + 1 - } -} diff --git a/go/mysql/icuregex/internal/uset/unicode_set.go b/go/mysql/icuregex/internal/uset/unicode_set.go index 7e1b1de20c2..db6659b1121 100644 --- a/go/mysql/icuregex/internal/uset/unicode_set.go +++ b/go/mysql/icuregex/internal/uset/unicode_set.go @@ -25,8 +25,6 @@ import ( "fmt" "golang.org/x/exp/slices" - - "vitess.io/vitess/go/mysql/icuregex/internal/uprops" ) // HIGH_VALUE > all valid values. 
110000 for codepoints @@ -68,25 +66,6 @@ func FromRunes(list []rune) *UnicodeSet { return &UnicodeSet{list: list} } -func ParsePattern(pattern string, flags USet) (*UnicodeSet, error) { - u := New() - if err := u.ApplyPropertyPattern(pattern); err != nil { - return nil, err - } - if flags&USET_CASE_INSENSITIVE != 0 { - u.CloseOver(USET_CASE_INSENSITIVE) - } - return u, nil -} - -func MustParsePattern(pattern string, flags USet) *UnicodeSet { - u, err := ParsePattern(pattern, flags) - if err != nil { - panic(err) - } - return u -} - func (u *UnicodeSet) ensureBufferCapacity(c int) { if cap(u.buffer) < c { u.buffer = make([]rune, c) @@ -506,22 +485,22 @@ func (u *UnicodeSet) Clear() { } func (u *UnicodeSet) Len() (n int) { - count := u.rangeCount() + count := u.RangeCount() for i := 0; i < count; i++ { - n += int(u.rangeEnd(i)) - int(u.rangeStart(i)) + 1 + n += int(u.RangeEnd(i)) - int(u.RangeStart(i)) + 1 } return } -func (u *UnicodeSet) rangeCount() int { +func (u *UnicodeSet) RangeCount() int { return len(u.list) / 2 } -func (u *UnicodeSet) rangeStart(idx int) rune { +func (u *UnicodeSet) RangeStart(idx int) rune { return u.list[idx*2] } -func (u *UnicodeSet) rangeEnd(idx int) rune { +func (u *UnicodeSet) RangeEnd(idx int) rune { return u.list[idx*2+1] - 1 } @@ -551,7 +530,9 @@ func (u *UnicodeSet) RuneAt(idx int) rune { func (u *UnicodeSet) ContainsRune(c rune) bool { if f := u.frozen; f != nil { - if c <= 0xff { + if c < 0 { + return false + } else if c <= 0xff { return f.latin1Contains[c] != 0 } else if c <= 0x7ff { return (f.table7FF[c&0x3f] & (uint32(1) << (c >> 6))) != 0 @@ -628,12 +609,6 @@ func (u *UnicodeSet) findCodePoint(c rune) int { return hi } -func (u *UnicodeSet) AddCategory(mask uint32) { - set := New() - set.ApplyIntPropertyValue(uprops.UCHAR_GENERAL_CATEGORY_MASK, int32(mask)) - u.AddAll(set) -} - func (u *UnicodeSet) AddString(chars string) { for _, c := range chars { u.AddRune(c) @@ -642,7 +617,7 @@ func (u *UnicodeSet) AddString(chars string) 
{ type Filter func(ch rune) bool -func (u *UnicodeSet) applyFilter(inclusions *UnicodeSet, filter Filter) { +func (u *UnicodeSet) ApplyFilter(inclusions *UnicodeSet, filter Filter) { // Logically, walk through all Unicode characters, noting the start // and end of each range for which filter.contain(c) is // true. Add each range to a set. @@ -656,12 +631,12 @@ func (u *UnicodeSet) applyFilter(inclusions *UnicodeSet, filter Filter) { u.Clear() startHasProperty := rune(-1) - limitRange := inclusions.rangeCount() + limitRange := inclusions.RangeCount() for j := 0; j < limitRange; j++ { // get current range - start := inclusions.rangeStart(j) - end := inclusions.rangeEnd(j) + start := inclusions.RangeStart(j) + end := inclusions.RangeEnd(j) // for all the code points in the range, process for ch := start; ch <= end; ch++ { diff --git a/go/mysql/icuregex/internal/utf16/helpers.go b/go/mysql/icuregex/internal/utf16/helpers.go index b87af5222fc..bdf53ae731c 100644 --- a/go/mysql/icuregex/internal/utf16/helpers.go +++ b/go/mysql/icuregex/internal/utf16/helpers.go @@ -52,17 +52,6 @@ func IsSurrogateLead(c rune) bool { return (uint32(c) & 0x400) == 0 } -/** - * Assuming c is a surrogate code point (U_IS_SURROGATE(c)), - * is it a trail surrogate? - * @param c 32-bit code point - * @return true or false - * @stable ICU 4.2 - */ -func IsSurrogateTrail(c rune) bool { - return (uint32(c) & 0x400) != 0 -} - func DecodeRune(a, b rune) rune { return utf16.DecodeRune(a, b) } diff --git a/go/mysql/icuregex/internal/utrie/ucptrie.go b/go/mysql/icuregex/internal/utrie/ucptrie.go index 05bbccd1610..f7e64107343 100644 --- a/go/mysql/icuregex/internal/utrie/ucptrie.go +++ b/go/mysql/icuregex/internal/utrie/ucptrie.go @@ -33,7 +33,7 @@ type UcpTrie struct { Data16 []uint16 Data32 []uint32 - IndexLength, DataLength int + IndexLength, DataLength int32 /** Start of the last range which ends at U+10FFFF. 
@internal */ HighStart rune Shifted12HighStart uint16 @@ -290,8 +290,8 @@ func UcpTrieFromBytes(bytes *udata.Bytes) (*UcpTrie, error) { actualValueWidth := UCPTrieValueWidth(valueWidthInt) trie := &UcpTrie{ - IndexLength: int(header.indexLength), - DataLength: int(((header.options & UCPTRIE_OPTIONS_DATA_LENGTH_MASK) << 4) | header.dataLength), + IndexLength: int32(header.indexLength), + DataLength: int32(((header.options & UCPTRIE_OPTIONS_DATA_LENGTH_MASK) << 4) | header.dataLength), Index3NullOffset: header.index3NullOffset, DataNullOffset: int32(((header.options & UCPTRIE_OPTIONS_DATA_NULL_OFFSET_MASK) << 8) | header.dataNullOffset), HighStart: rune(header.shiftedHighStart) << UCPTRIE_SHIFT_2, @@ -299,52 +299,52 @@ func UcpTrieFromBytes(bytes *udata.Bytes) (*UcpTrie, error) { ValueWidth: actualValueWidth, } nullValueOffset := trie.DataNullOffset - if nullValueOffset >= int32(trie.DataLength) { - nullValueOffset = int32(trie.DataLength) - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET + if nullValueOffset >= trie.DataLength { + nullValueOffset = trie.DataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET } trie.Shifted12HighStart = uint16((trie.HighStart + 0xfff) >> 12) trie.Index = bytes.Uint16Slice(int32(header.indexLength)) switch actualValueWidth { case UCPTRIE_VALUE_BITS_16: - trie.Data16 = trie.Index[trie.IndexLength:] - trie.NullValue = uint32(trie.Index[nullValueOffset]) + trie.Data16 = bytes.Uint16Slice(trie.DataLength) + trie.NullValue = uint32(trie.Data16[nullValueOffset]) case UCPTRIE_VALUE_BITS_32: - trie.Data32 = bytes.Uint32Slice(int32(trie.DataLength)) + trie.Data32 = bytes.Uint32Slice(trie.DataLength) trie.NullValue = trie.Data32[nullValueOffset] case UCPTRIE_VALUE_BITS_8: - trie.Data8 = bytes.Uint8Slice(int32(trie.DataLength)) + trie.Data8 = bytes.Uint8Slice(trie.DataLength) trie.NullValue = uint32(trie.Data8[nullValueOffset]) } return trie, nil } -func (trie *UcpTrie) Get(c rune) uint32 { +func (t *UcpTrie) Get(c rune) uint32 { var dataIndex int32 if c <= 0x7f { 
// linear ASCII dataIndex = c } else { var fastMax rune - if trie.Type == UCPTRIE_TYPE_FAST { + if t.Type == UCPTRIE_TYPE_FAST { fastMax = 0xffff } else { fastMax = UCPTRIE_SMALL_MAX } - dataIndex = trie.cpIndex(fastMax, c) + dataIndex = t.cpIndex(fastMax, c) } - return trie.getValue(dataIndex) + return t.getValue(dataIndex) } -func (trie *UcpTrie) getValue(dataIndex int32) uint32 { - switch trie.ValueWidth { +func (t *UcpTrie) getValue(dataIndex int32) uint32 { + switch t.ValueWidth { case UCPTRIE_VALUE_BITS_16: - return uint32(trie.Data16[dataIndex]) + return uint32(t.Data16[dataIndex]) case UCPTRIE_VALUE_BITS_32: - return trie.Data32[dataIndex] + return t.Data32[dataIndex] case UCPTRIE_VALUE_BITS_8: - return uint32(trie.Data8[dataIndex]) + return uint32(t.Data8[dataIndex]) default: // Unreachable if the trie is properly initialized. return 0xffffffff @@ -352,38 +352,38 @@ func (trie *UcpTrie) getValue(dataIndex int32) uint32 { } /** Internal trie getter for a code point below the fast limit. Returns the data index. @internal */ -func (trie *UcpTrie) fastIndex(c rune) int32 { - return int32(trie.Index[c>>UCPTRIE_FAST_SHIFT]) + (c & UCPTRIE_FAST_DATA_MASK) +func (t *UcpTrie) fastIndex(c rune) int32 { + return int32(t.Index[c>>UCPTRIE_FAST_SHIFT]) + (c & UCPTRIE_FAST_DATA_MASK) } /** Internal trie getter for a code point at or above the fast limit. Returns the data index. 
@internal */ -func (trie *UcpTrie) smallIndex(c rune) int32 { - if c >= trie.HighStart { - return int32(trie.DataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET) +func (t *UcpTrie) smallIndex(c rune) int32 { + if c >= t.HighStart { + return t.DataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET } - return trie.internalSmallIndex(c) + return t.internalSmallIndex(c) } -func (trie *UcpTrie) internalSmallIndex(c rune) int32 { +func (t *UcpTrie) internalSmallIndex(c rune) int32 { i1 := c >> UCPTRIE_SHIFT_1 - if trie.Type == UCPTRIE_TYPE_FAST { + if t.Type == UCPTRIE_TYPE_FAST { i1 += UCPTRIE_BMP_INDEX_LENGTH - UCPTRIE_OMITTED_BMP_INDEX_1_LENGTH } else { i1 += UCPTRIE_SMALL_INDEX_LENGTH } - i3Block := int32(trie.Index[int32(trie.Index[i1])+((c>>UCPTRIE_SHIFT_2)&UCPTRIE_INDEX_2_MASK)]) + i3Block := int32(t.Index[int32(t.Index[i1])+((c>>UCPTRIE_SHIFT_2)&UCPTRIE_INDEX_2_MASK)]) i3 := (c >> UCPTRIE_SHIFT_3) & UCPTRIE_INDEX_3_MASK var dataBlock int32 if (i3Block & 0x8000) == 0 { // 16-bit indexes - dataBlock = int32(trie.Index[i3Block+i3]) + dataBlock = int32(t.Index[i3Block+i3]) } else { // 18-bit indexes stored in groups of 9 entries per 8 indexes. i3Block = (i3Block & 0x7fff) + (i3 & ^7) + (i3 >> 3) i3 &= 7 - dataBlock = int32(trie.Index[i3Block]) << (2 + (2 * i3)) & 0x30000 + dataBlock = int32(t.Index[i3Block]) << (2 + (2 * i3)) & 0x30000 i3Block++ - dataBlock |= int32(trie.Index[i3Block+i3]) + dataBlock |= int32(t.Index[i3Block+i3]) } return dataBlock + (c & UCPTRIE_SMALL_DATA_MASK) } @@ -393,14 +393,14 @@ func (trie *UcpTrie) internalSmallIndex(c rune) int32 { * Returns the data index. 
* @internal */ -func (trie *UcpTrie) cpIndex(fastMax, c rune) int32 { +func (t *UcpTrie) cpIndex(fastMax, c rune) int32 { if c <= fastMax { - return trie.fastIndex(c) + return t.fastIndex(c) } if c <= 0x10ffff { - return trie.smallIndex(c) + return t.smallIndex(c) } - return int32(trie.DataLength) - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET + return t.DataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET } /** @@ -507,9 +507,9 @@ type UCPMapValueFilter func(value uint32) uint32 * @return the range end code point, or -1 if start is not a valid code point * @stable ICU 63 */ -func (trie *UcpTrie) GetRange(start rune, option UCPMapRangeOption, surrogateValue uint32, filter UCPMapValueFilter) (rune, uint32) { +func (t *UcpTrie) GetRange(start rune, option UCPMapRangeOption, surrogateValue uint32, filter UCPMapValueFilter) (rune, uint32) { if option == UCPMAP_RANGE_NORMAL { - return trie.getRange(start, filter) + return t.getRange(start, filter) } var surrEnd rune @@ -518,7 +518,7 @@ func (trie *UcpTrie) GetRange(start rune, option UCPMapRangeOption, surrogateVal } else { surrEnd = 0xdbff } - end, value := trie.getRange(start, filter) + end, value := t.getRange(start, filter) if end < 0xd7ff || start > surrEnd { return end, value } @@ -541,7 +541,7 @@ func (trie *UcpTrie) GetRange(start rune, option UCPMapRangeOption, surrogateVal } // See if the surrogateValue surrogate range can be merged with // an immediately following range. 
- end2, value2 := trie.getRange(surrEnd+1, filter) + end2, value2 := t.getRange(surrEnd+1, filter) if value2 == surrogateValue { return end2, value } @@ -550,25 +550,25 @@ func (trie *UcpTrie) GetRange(start rune, option UCPMapRangeOption, surrogateVal const MAX_UNICODE = 0x10ffff -func (trie *UcpTrie) getRange(start rune, filter UCPMapValueFilter) (rune, uint32) { +func (t *UcpTrie) getRange(start rune, filter UCPMapValueFilter) (rune, uint32) { if start > MAX_UNICODE { return -1, 0 } - if start >= trie.HighStart { - di := int32(trie.DataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET) - value := trie.getValue(di) + if start >= t.HighStart { + di := t.DataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET + value := t.getValue(di) if filter != nil { value = filter(value) } return MAX_UNICODE, value } - nullValue := trie.NullValue + nullValue := t.NullValue if filter != nil { nullValue = filter(nullValue) } - index := trie.Index + index := t.Index prevI3Block := int32(-1) prevBlock := int32(-1) @@ -578,10 +578,10 @@ func (trie *UcpTrie) getRange(start rune, filter UCPMapValueFilter) (rune, uint3 haveValue := false for { var i3Block, i3, i3BlockLength, dataBlockLength int32 - if c <= 0xffff && (trie.Type == UCPTRIE_TYPE_FAST || c <= UCPTRIE_SMALL_MAX) { + if c <= 0xffff && (t.Type == UCPTRIE_TYPE_FAST || c <= UCPTRIE_SMALL_MAX) { i3Block = 0 i3 = c >> UCPTRIE_FAST_SHIFT - if trie.Type == UCPTRIE_TYPE_FAST { + if t.Type == UCPTRIE_TYPE_FAST { i3BlockLength = UCPTRIE_BMP_INDEX_LENGTH } else { i3BlockLength = UCPTRIE_SMALL_INDEX_LENGTH @@ -590,32 +590,32 @@ func (trie *UcpTrie) getRange(start rune, filter UCPMapValueFilter) (rune, uint3 } else { // Use the multi-stage index. 
i1 := c >> UCPTRIE_SHIFT_1 - if trie.Type == UCPTRIE_TYPE_FAST { + if t.Type == UCPTRIE_TYPE_FAST { i1 += UCPTRIE_BMP_INDEX_LENGTH - UCPTRIE_OMITTED_BMP_INDEX_1_LENGTH } else { i1 += UCPTRIE_SMALL_INDEX_LENGTH } - shft := (c >> UCPTRIE_SHIFT_2) - idx := int32(trie.Index[i1]) + (shft & UCPTRIE_INDEX_2_MASK) - i3Block = int32(trie.Index[idx]) + shft := c >> UCPTRIE_SHIFT_2 + idx := int32(t.Index[i1]) + (shft & UCPTRIE_INDEX_2_MASK) + i3Block = int32(t.Index[idx]) if i3Block == prevI3Block && (c-start) >= UCPTRIE_CP_PER_INDEX_2_ENTRY { // The index-3 block is the same as the previous one, and filled with value. c += UCPTRIE_CP_PER_INDEX_2_ENTRY continue } prevI3Block = i3Block - if i3Block == int32(trie.Index3NullOffset) { + if i3Block == int32(t.Index3NullOffset) { // This is the index-3 null block. if haveValue { if nullValue != value { return c - 1, value } } else { - trieValue = trie.NullValue + trieValue = t.NullValue value = nullValue haveValue = true } - prevBlock = trie.DataNullOffset + prevBlock = t.DataNullOffset c = (c + UCPTRIE_CP_PER_INDEX_2_ENTRY) & ^(UCPTRIE_CP_PER_INDEX_2_ENTRY - 1) continue } @@ -643,31 +643,31 @@ func (trie *UcpTrie) getRange(start rune, filter UCPMapValueFilter) (rune, uint3 } else { dataMask := dataBlockLength - 1 prevBlock = block - if block == trie.DataNullOffset { + if block == t.DataNullOffset { // This is the data null block. 
if haveValue { if nullValue != value { return c - 1, value } } else { - trieValue = trie.NullValue + trieValue = t.NullValue value = nullValue haveValue = true } c = (c + dataBlockLength) & ^dataMask } else { di := block + (c & dataMask) - trieValue2 := trie.getValue(di) + trieValue2 := t.getValue(di) if haveValue { if trieValue2 != trieValue { - if filter == nil || maybeFilterValue(trieValue2, trie.NullValue, nullValue, filter) != value { + if filter == nil || maybeFilterValue(trieValue2, t.NullValue, nullValue, filter) != value { return c - 1, value } trieValue = trieValue2 // may or may not help } } else { trieValue = trieValue2 - value = maybeFilterValue(trieValue2, trie.NullValue, nullValue, filter) + value = maybeFilterValue(trieValue2, t.NullValue, nullValue, filter) haveValue = true } for { @@ -676,9 +676,9 @@ func (trie *UcpTrie) getRange(start rune, filter UCPMapValueFilter) (rune, uint3 break } di++ - trieValue2 = trie.getValue(di) + trieValue2 = t.getValue(di) if trieValue2 != trieValue { - if filter == nil || maybeFilterValue(trieValue2, trie.NullValue, nullValue, filter) != value { + if filter == nil || maybeFilterValue(trieValue2, t.NullValue, nullValue, filter) != value { return c - 1, value } trieValue = trieValue2 // may or may not help @@ -691,14 +691,14 @@ func (trie *UcpTrie) getRange(start rune, filter UCPMapValueFilter) (rune, uint3 break } } - if c >= trie.HighStart { + if c >= t.HighStart { break } } - di := int32(trie.DataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET) - highValue := trie.getValue(di) - if maybeFilterValue(highValue, trie.NullValue, nullValue, filter) != value { + di := t.DataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET + highValue := t.getValue(di) + if maybeFilterValue(highValue, t.NullValue, nullValue, filter) != value { return c - 1, value } else { return MAX_UNICODE, value diff --git a/go/mysql/icuregex/matcher.go b/go/mysql/icuregex/matcher.go index 7fc727be083..c7b89233a43 100644 --- a/go/mysql/icuregex/matcher.go +++ 
b/go/mysql/icuregex/matcher.go @@ -31,9 +31,6 @@ import ( "vitess.io/vitess/go/mysql/icuregex/internal/uprops" ) -type BreakIterator interface { -} - const TIMER_INITIAL_VALUE = 10000 const DEFAULT_TIMEOUT = 3 const DEFAULT_STACK_LIMIT = 0 @@ -88,9 +85,6 @@ type Matcher struct { // Kept separately from fTime to keep as much // code as possible out of the inline // StateSave function. - - wordBreakItr BreakIterator - gcBreakItr BreakIterator } func NewMatcher(pat *Pattern) *Matcher { @@ -129,7 +123,6 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { pat := m.pattern.compiledPat inputText := m.input - inputLength := len(inputText) litText := m.pattern.literalText sets := m.pattern.sets @@ -178,12 +171,6 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { nextOp := pat[*fp.patIdx()] // Fetch the second operand *fp.patIdx()++ stringLen := nextOp.Value() - if nextOp.Type() != URX_STRING_LEN { - panic("URX_STRING_LEN expected") - } - if stringLen < 2 { - panic("stringLen < 2, would have expected URX_ONECHAR for a single character") - } patternString := litText[stringStartIdx:] var patternStringIndex int @@ -227,23 +214,10 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // opValue+2 - the start of a capture group whose end // has not yet been reached (and might not ever be). case URX_START_CAPTURE: - if !(op.Value() >= 0 && op.Value() < m.stack.frameSize-3) { - panic("failed assertion: opValue >= 0 && opValue < fFrameSize-3") - } *fp.extra(op.Value() + 2) = *fp.inputIdx() case URX_END_CAPTURE: - if !(op.Value() >= 0 && op.Value() < m.stack.frameSize-3) { - panic("failed assertion: opValue >= 0 && opValue < fFrameSize-3") - } - if *fp.extra(op.Value() + 2) < 0 { - panic("start pos for this group must be set") - } - *fp.extra(op.Value()) = *fp.extra(op.Value() + 2) // Tentative start becomes real. 
*fp.extra(op.Value() + 1) = *fp.inputIdx() // End position - if !(*fp.extra(op.Value()) <= *fp.extra(op.Value() + 1)) { - panic("failed assertion: fp->fExtra[opValue] <= fp->fExtra[opValue+1]") - } case URX_DOLLAR: // $, test for End of line if *fp.inputIdx() < m.anchorLimit-2 { @@ -346,17 +320,10 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // Not at the start of a line. Fail. fp = m.stack.popFrame() case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode - if !(*fp.inputIdx() >= m.anchorStart) { - panic("failed assertion: *fp.inputIdx() >= m.anchorStart") - } if *fp.inputIdx() <= m.anchorStart { // We are at the start input. Success. break } - // Check whether character just before the current pos is a new-line - if !(*fp.inputIdx() <= m.anchorLimit) { - panic("failed assertion: *fp.inputIdx() <= m.anchorLimit") - } c := charAt(inputText, *fp.inputIdx()-1) if c != 0x0a { @@ -482,12 +449,8 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { success := (op.Value() & URX_NEG_SET) == URX_NEG_SET negOp := op.Value() & ^URX_NEG_SET - if !(negOp > 0 && negOp < URX_LAST_SET) { - panic("assertion failed: negOp > 0 && negOp < URX_LAST_SET") - } - c := charAt(inputText, *fp.inputIdx()) - s := staticPropertySets[op.Value()] + s := staticPropertySets[negOp] if s.ContainsRune(c) { success = !success } @@ -507,10 +470,6 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { break } - if !(op.Value() > 0 && op.Value() < URX_LAST_SET) { - panic("assertion failed: op.Value() > 0 && op.Value() < URX_LAST_SET") - } - c := charAt(inputText, *fp.inputIdx()) s := staticPropertySets[op.Value()] if !s.ContainsRune(c) { @@ -530,9 +489,6 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // There is input left. Pick up one char and test it for set membership. 
c := charAt(inputText, *fp.inputIdx()) - if !(op.Value() > 0 && op.Value() < len(m.pattern.sets)) { - panic("assertion failed: op.Value() > 0 && op.Value() < fSets->size()") - } s := sets[op.Value()] if s.ContainsRune(c) { *fp.inputIdx()++ @@ -596,60 +552,41 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { *fp.inputIdx()++ } case URX_JMP: - *fp.patIdx() = int(op.Value()) + *fp.patIdx() = op.Value() case URX_FAIL: isMatch = false goto breakFromLoop case URX_JMP_SAV: - if !(op.Value() > 0 && int(op.Value()) < len(pat)) { - panic("assertion failed: op.Value() > 0 && op.Value() < fPattern->fCompiledPat->size()") - } fp, err = m.StateSave(*fp.inputIdx(), *fp.patIdx()) // State save to loc following current if err != nil { return err } - *fp.patIdx() = int(op.Value()) // Then JMP. + *fp.patIdx() = op.Value() // Then JMP. case URX_JMP_SAV_X: // This opcode is used with (x)+, when x can match a zero length string. // Same as JMP_SAV, except conditional on the match having made forward progress. // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the // data address of the input position at the start of the loop. - if !(op.Value() > 0 && int(op.Value()) < len(pat)) { - panic("assertion failed: op.Value() > 0 && op.Value() < fPattern->fCompiledPat->size()") - } stoOp := pat[op.Value()-1] - if !(stoOp.Type() == URX_STO_INP_LOC) { - panic("assertion failed: stoOp.Type() == URX_STO_INP_LOC") - } - - frameLoc := int(stoOp.Value()) - if !(frameLoc >= 0 && frameLoc < m.stack.frameSize) { - panic("assertion failed: frameLoc >= 0 && frameLoc < fFrameSize") - } + frameLoc := stoOp.Value() prevInputIdx := *fp.extra(frameLoc) - if !(prevInputIdx <= *fp.inputIdx()) { - panic("assertion failed: prevInputIdx <= *fp.inputIdx()") - } if prevInputIdx < *fp.inputIdx() { // The match did make progress. Repeat the loop. 
fp, err = m.StateSave(*fp.inputIdx(), *fp.patIdx()) // State save to loc following current if err != nil { return err } - *fp.patIdx() = int(op.Value()) // Then JMP. + *fp.patIdx() = op.Value() // Then JMP. *fp.extra(frameLoc) = *fp.inputIdx() } // If the input position did not advance, we do nothing here, // execution will fall out of the loop. case URX_CTR_INIT: - if !(op.Value() >= 0 && int(op.Value()) < m.stack.frameSize-2) { - panic("assertion failed: op.Value() >= 0 && op.Value() < fFrameSize-2") - } *fp.extra(op.Value()) = 0 // Set the loop counter variable to zero // Pick up the three extra operands that CTR_INIT has, and @@ -661,13 +598,6 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { minCount := int(pat[instOperandLoc+1]) maxCount := int(pat[instOperandLoc+2]) - if !(minCount >= 0 && maxCount >= minCount || maxCount == -1) { - panic("assertion failed: minCount >= 0 && maxCount >= minCount || maxCount == -1") - } - if !(int(loopLoc) >= *fp.patIdx()) { - panic("assertion failed: loopLoc >= *fp.patIdx()") - } - if minCount == 0 { fp, err = m.StateSave(*fp.inputIdx(), loopLoc+1) if err != nil { @@ -681,22 +611,13 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { } case URX_CTR_LOOP: - if !(op.Value() >= 0 && op.Value() < *fp.patIdx()-2) { - panic("assertion failed: op.Value() >= 0 && op.Value() < *fp.patIdx()-2") - } initOp := pat[op.Value()] - if !(initOp.Type() == URX_CTR_INIT) { - panic("assertion failed: initOp.Type() == URX_CTR_INIT") - } opValue := initOp.Value() pCounter := fp.extra(opValue) minCount := int(pat[op.Value()+2]) maxCount := int(pat[op.Value()+3]) *pCounter++ if *pCounter >= maxCount && maxCount != -1 { - if !(*pCounter == maxCount) { - panic("assertion failed: *pCounter == maxCount") - } break } @@ -728,9 +649,6 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { *fp.patIdx() = op.Value() + 4 // Loop back. 
case URX_CTR_INIT_NG: - if !(op.Value() >= 0 && int(op.Value()) < m.stack.frameSize-2) { - panic("assertion failed: op.Value() >= 0 && op.Value() < fFrameSize-2") - } *fp.extra(op.Value()) = 0 // Set the loop counter variable to zero // Pick up the three extra operands that CTR_INIT_NG has, and @@ -741,10 +659,6 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { minCount := pat[instrOperandLoc+1].Value() maxCount := pat[instrOperandLoc+2].Value() - if !(minCount >= 0 && maxCount >= minCount || maxCount == -1) { - panic("assertion failed: minCount >= 0 && maxCount >= minCount || maxCount == -1") - } - if maxCount == -1 { *fp.extra(op.Value() + 1) = *fp.inputIdx() // Save initial input index for loop breaking. } @@ -760,13 +674,7 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { } case URX_CTR_LOOP_NG: - if !(op.Value() >= 0 && int(op.Value()) < *fp.patIdx()-2) { - panic("assertion failed: op.Value() >= 0 && op.Value() < *fp.patIdx()-2") - } initOp := pat[op.Value()] - if !(initOp.Type() == URX_CTR_INIT_NG) { - panic("assertion failed: initOp.Type() == URX_CTR_INIT_NG") - } pCounter := fp.extra(initOp.Value()) minCount := int(pat[op.Value()+2]) maxCount := int(pat[op.Value()+3]) @@ -775,9 +683,6 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // The loop has matched the maximum permitted number of times. // Break out of here with no action. Matching will // continue with the following pattern. 
- if !(*pCounter == maxCount) { - panic("assertion failed: *pCounter == maxCount") - } break } @@ -816,19 +721,10 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { } case URX_STO_SP: - if !(op.Value() >= 0 && op.Value() < m.pattern.dataSize) { - panic("assertion failed: op.Value() >= 0 && op.Value() < fPattern->fDataSize") - } m.data[op.Value()] = m.stack.len() case URX_LD_SP: - if !(op.Value() >= 0 && op.Value() < m.pattern.dataSize) { - panic("assertion failed: op.Value() >= 0 && op.Value() < fPattern->fDataSize") - } newStackSize := m.data[op.Value()] - if !(newStackSize <= m.stack.len()) { - panic("assertion failed: newStackSize <= fStack->size()") - } newFp := m.stack.offset(newStackSize) if newFp.equals(fp) { break @@ -838,17 +734,9 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { m.stack.setSize(newStackSize) case URX_BACKREF: - if !(op.Value() < m.stack.frameSize) { - panic("assertion failed: op.Value() < fFrameSize") - } - groupStartIdx := *fp.extra(op.Value()) groupEndIdx := *fp.extra(op.Value() + 1) - if !(groupStartIdx <= groupEndIdx) { - panic("assertion failed: groupStartIdx <= groupEndIdx") - } - if groupStartIdx < 0 { // This capture group has not participated in the match thus far, fp = m.stack.popFrame() // FAIL, no match. 
@@ -882,18 +770,8 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { fp = m.stack.popFrame() } case URX_BACKREF_I: - if !(op.Value() < m.stack.frameSize) { - panic("assertion failed: op.Value() < fFrameSize") - } - groupStartIdx := *fp.extra(op.Value()) groupEndIdx := *fp.extra(op.Value() + 1) - if !(groupStartIdx <= groupEndIdx) { - panic("assertion failed: groupStartIdx <= groupEndIdx") - } - if !(groupStartIdx <= groupEndIdx) { - panic("assertion failed: groupStartIdx <= groupEndIdx") - } if groupStartIdx < 0 { // This capture group has not participated in the match thus far, @@ -937,23 +815,14 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { } case URX_STO_INP_LOC: - if !(op.Value() >= 0 && op.Value() < m.stack.frameSize) { - panic("assertion failed: op.Value() >= 0 && op.Value() < fFrameSize") - } *fp.extra(op.Value()) = *fp.inputIdx() case URX_JMPX: instrOperandLoc := *fp.patIdx() *fp.patIdx()++ dataLoc := pat[instrOperandLoc].Value() - if !(dataLoc >= 0 && dataLoc < m.stack.frameSize) { - panic("assertion failed: dataLoc >= 0 && dataLoc < fFrameSize") - } saveInputIdx := *fp.extra(dataLoc) - if !(saveInputIdx <= *fp.inputIdx()) { - panic("assertion failed: saveInputIdx <= *fp.inputIdx()") - } if saveInputIdx < *fp.inputIdx() { *fp.patIdx() = op.Value() // JMP @@ -962,9 +831,6 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { } case URX_LA_START: - if !(op.Value() >= 0 && op.Value()+3 < m.pattern.dataSize) { - panic("assertion failed: op.Value() >= 0 && op.Value()+3 < fDataSize") - } m.data[op.Value()] = m.stack.len() m.data[op.Value()+1] = *fp.inputIdx() m.data[op.Value()+2] = m.activeStart @@ -973,14 +839,8 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { m.activeLimit = m.lookLimit // transparent bounds. 
case URX_LA_END: - if !(op.Value() >= 0 && op.Value()+3 < m.pattern.dataSize) { - panic("assertion failed: op.Value() >= 0 && op.Value()+3 < fDataSize") - } stackSize := m.stack.len() newStackSize := m.data[op.Value()] - if !(stackSize >= newStackSize) { - panic("assertion failed: stackSize >= newStackSize") - } if stackSize > newStackSize { // Copy the current top frame back to the new (cut back) top frame. // This makes the capture groups from within the look-ahead @@ -995,12 +855,6 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { m.activeStart = m.data[op.Value()+2] m.activeLimit = m.data[op.Value()+3] - if !(m.activeStart >= 0) { - panic("assertion failed: m.activeStart >= 0") - } - if !(m.activeLimit <= len(inputText)) { - panic("assertion failed: m.activeLimit <= len(inputText)") - } case URX_ONECHAR_I: // Case insensitive one char. The char from the pattern is already case folded. @@ -1027,9 +881,6 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { var patternStringIdx int nextOp := pat[*fp.patIdx()] *fp.patIdx()++ - if !(nextOp.Type() == URX_STRING_LEN) { - panic("assertion failed: nextOp.Type() == URX_STRING_LEN") - } patternStringLen := nextOp.Value() success := true @@ -1062,9 +913,6 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // Entering a look-behind block. // Save Stack Ptr, Input Pos and active input region. // TODO: implement transparent bounds. 
Ticket #6067 - if !(op.Value() >= 0 && op.Value()+4 < m.pattern.dataSize) { - panic("assertion failed: op.Value() >= 0 && op.Value()+4 < fDataSize") - } m.data[op.Value()] = m.stack.len() m.data[op.Value()+1] = *fp.inputIdx() // Save input string length, then reset to pin any matches to end at @@ -1085,16 +933,7 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { *fp.patIdx()++ maxML := pat[*fp.patIdx()] *fp.patIdx()++ - if !(minML <= maxML) { - panic("assertion failed: minML <= maxML") - } - if !(minML >= 0) { - panic("assertion failed: minML >= 0") - } - if !(op.Value() >= 0 && op.Value()+4 < m.pattern.dataSize) { - panic("assertion failed: op.Value() >= 0 && op.Value()+4 < fDataSize") - } lbStartIdx := &m.data[op.Value()+4] if *lbStartIdx < 0 { // First time through loop. @@ -1115,12 +954,6 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { fp = m.stack.popFrame() m.activeStart = m.data[op.Value()+2] m.activeLimit = m.data[op.Value()+3] - if !(m.activeStart >= 0) { - panic("assertion failed: fActiveStart >= 0") - } - if !(m.activeLimit <= inputLength) { - panic("assertion failed: fActiveLimit <= fInputLength") - } break } @@ -1134,9 +967,6 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { case URX_LB_END: // End of a look-behind block, after a successful match. - if !(op.Value() >= 0 && op.Value()+4 < m.pattern.dataSize) { - panic("assertion failed: op.Value() >= 0 && op.Value()+4 < fDataSize") - } if *fp.inputIdx() != m.activeLimit { // The look-behind expression matched, but the match did not // extend all the way to the point that we are looking behind from. @@ -1152,12 +982,6 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // position being looked-behind. 
m.activeStart = m.data[op.Value()+2] m.activeLimit = m.data[op.Value()+3] - if !(m.activeStart >= 0) { - panic("assertion failed: fActiveStart >= 0") - } - if !(m.activeLimit <= inputLength) { - panic("assertion failed: fActiveLimit <= fInputLength") - } case URX_LBN_CONT: // Negative Look-Behind, at top of loop checking for matches of LB expression // at all possible input starting positions. @@ -1171,21 +995,6 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { continueLoc := pat[*fp.patIdx()].Value() *fp.patIdx()++ - if !(minML <= maxML) { - panic("assertion failed: minML <= maxML") - } - if !(minML >= 0) { - panic("assertion failed: minML >= 0") - } - if !(continueLoc > *fp.patIdx()) { - panic("assertion failed: continueLoc > *fp.patIdx()") - } - - // Fetch (from data) the last input index where a match was attempted. - if !(op.Value() >= 0 && op.Value()+4 < m.pattern.dataSize) { - panic("assertion failed: op.Value() >= 0 && op.Value()+4 < fDataSize") - } - lbStartIdx := &m.data[op.Value()+4] if *lbStartIdx < 0 { @@ -1207,12 +1016,6 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // a whole has succeeded. Jump forward to the continue location m.activeStart = m.data[op.Value()+2] m.activeLimit = m.data[op.Value()+3] - if !(m.activeStart >= 0) { - panic("assertion failed: fActiveStart >= 0") - } - if !(m.activeLimit <= inputLength) { - panic("assertion failed: fActiveLimit <= fInputLength") - } *fp.patIdx() = continueLoc break } @@ -1226,9 +1029,6 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { *fp.inputIdx() = *lbStartIdx case URX_LBN_END: // End of a negative look-behind block, after a successful match. 
- if !(op.Value() >= 0 && op.Value()+4 < m.pattern.dataSize) { - panic("assertion failed: op.Value() >= 0 && op.Value()+4 < fDataSize") - } if *fp.inputIdx() != m.activeLimit { // The look-behind expression matched, but the match did not @@ -1248,22 +1048,10 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // to the position being looked-behind. m.activeStart = m.data[op.Value()+2] m.activeLimit = m.data[op.Value()+3] - if !(m.activeStart >= 0) { - panic("assertion failed: fActiveStart >= 0") - } - if !(m.activeLimit <= inputLength) { - panic("assertion failed: fActiveLimit <= fInputLength") - } // Restore original stack position, discarding any state saved // by the successful pattern match. - if !(op.Value() >= 0 && op.Value()+1 < m.pattern.dataSize) { - panic("assertion failed: op.Value() >= 0 && op.Value()+1 < fDataSize") - } newStackSize := m.data[op.Value()] - if !(m.stack.len() > newStackSize) { - panic("assertion failed: fStack.size() > newStackSize") - } m.stack.setSize(newStackSize) // FAIL, which will take control back to someplace @@ -1274,9 +1062,6 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // [some character set]* // This op scans through all matching input. // The following LOOP_C op emulates stack unwinding if the following pattern fails. - if !(op.Value() >= 0 && op.Value() < len(sets)) { - panic("assertion failed: op.Value() >= 0 && op.Value() < fSets.size()") - } s := sets[op.Value()] // Loop through input, until either the input is exhausted or @@ -1306,13 +1091,7 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // must follow. 
It's operand is the stack location // that holds the starting input index for the match of this [set]* loopcOp := pat[*fp.patIdx()] - if !(loopcOp.Type() == URX_LOOP_C) { - panic("assertion failed: loopcOp.Type() == URX_LOOP_C") - } stackLoc := loopcOp.Value() - if !(stackLoc >= 0 && stackLoc < m.stack.frameSize) { - panic("assertion failed: stackLoc >= 0 && stackLoc < fFrameSize") - } *fp.extra(stackLoc) = *fp.inputIdx() *fp.inputIdx() = ix @@ -1369,13 +1148,7 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // must follow. It's operand is the stack location // that holds the starting input index for the match of this .* loopcOp := pat[*fp.patIdx()] - if !(loopcOp.Type() == URX_LOOP_C) { - panic("assertion failed: loopcOp.Type() == URX_LOOP_C") - } stackLoc := loopcOp.Value() - if !(stackLoc >= 0 && stackLoc < m.stack.frameSize) { - panic("assertion failed: stackLoc >= 0 && stackLoc < fFrameSize") - } *fp.extra(stackLoc) = *fp.inputIdx() *fp.inputIdx() = ix @@ -1389,13 +1162,7 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { *fp.patIdx()++ case URX_LOOP_C: - if !(op.Value() >= 0 && op.Value() < m.stack.frameSize) { - panic("assertion failed: op.Value() >= 0 && op.Value() < fFrameSize") - } backSearchIndex := *fp.extra(op.Value()) - if !(backSearchIndex <= *fp.inputIdx()) { - panic("assertion failed: backSearchIndex <= *fp.inputIdx()") - } if backSearchIndex == *fp.inputIdx() { // We've backed up the input idx to the point that the loop started. @@ -1408,9 +1175,6 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // and a state save to this instruction in case the following code fails again. // (We're going backwards because this loop emulates stack unwinding, not // the initial scan forward.) 
- if !(*fp.inputIdx() > 0) { - panic("assertion failed: *fp.inputIdx() > 0") - } prevC := charAt(inputText, *fp.inputIdx()-1) *fp.inputIdx()-- @@ -1580,6 +1344,7 @@ func (m *Matcher) isHorizWS(c rune) bool { func (m *Matcher) followingGCBoundary(pos int) int { // TODO: implement + return pos /* // Note: this point will never be reached if break iteration is configured out. // Regex patterns that would require this function will fail to compile. @@ -1597,7 +1362,6 @@ func (m *Matcher) followingGCBoundary(pos int) int { result = pos; } */ - panic("TODO") } func (m *Matcher) ResetString(input string) { @@ -1655,10 +1419,6 @@ func (m *Matcher) Find() (bool, error) { return false, nil } - if !(startPos >= 0) { - panic("assertion failed: startPos >= 0") - } - switch m.pattern.startType { case START_NO_INFO: // No optimization was found. @@ -1679,10 +1439,6 @@ func (m *Matcher) Find() (bool, error) { } case START_SET: // Match may start on any char from a pre-computed set. - if !(m.pattern.minMatchLen > 0) { - panic("assertion failed: minMatchLen > 0") - } - for { pos := startPos c := charAt(m.input, startPos) @@ -1775,10 +1531,6 @@ func (m *Matcher) Find() (bool, error) { } case START_CHAR, START_STRING: // Match starts on exactly one char. 
- if !(m.pattern.minMatchLen > 0) { - panic("assertion failed: minMatchLen > 0") - } - theChar := m.pattern.initialChar for { pos := startPos diff --git a/go/mysql/icuregex/perl_test.go b/go/mysql/icuregex/perl_test.go index 8958bc4fcfd..04d0266357f 100644 --- a/go/mysql/icuregex/perl_test.go +++ b/go/mysql/icuregex/perl_test.go @@ -212,7 +212,7 @@ func TestPerl(t *testing.T) { } if expectedS != string(result) { - t.Errorf("line %d: Incorrect Perl expression results\nwant: %s\ngot: %s", lineno, expectedS, result) + t.Errorf("line %d: Incorrect Perl expression results for %s\nwant: %q\ngot: %q", lineno, pattern, expectedS, result) } } } diff --git a/go/mysql/icuregex/sets.go b/go/mysql/icuregex/sets.go index 04304f93820..3241b0c24d4 100644 --- a/go/mysql/icuregex/sets.go +++ b/go/mysql/icuregex/sets.go @@ -22,6 +22,7 @@ limitations under the License. package icuregex import ( + "vitess.io/vitess/go/mysql/icuregex/internal/uprops" "vitess.io/vitess/go/mysql/icuregex/internal/uset" ) @@ -30,32 +31,32 @@ var staticPropertySets [13]*uset.UnicodeSet func init() { staticPropertySets[URX_ISWORD_SET] = func() *uset.UnicodeSet { s := uset.New() - s.AddAll(uset.MustParsePattern(`\p{Alphabetic}`, 0)) - s.AddAll(uset.MustParsePattern(`\p{M}`, 0)) - s.AddAll(uset.MustParsePattern(`\p{Nd}`, 0)) - s.AddAll(uset.MustParsePattern(`\p{Pc}`, 0)) + s.AddAll(uprops.MustNewUnicodeSetFomPattern(`\p{Alphabetic}`, 0)) + s.AddAll(uprops.MustNewUnicodeSetFomPattern(`\p{M}`, 0)) + s.AddAll(uprops.MustNewUnicodeSetFomPattern(`\p{Nd}`, 0)) + s.AddAll(uprops.MustNewUnicodeSetFomPattern(`\p{Pc}`, 0)) s.AddRune(0x200c) s.AddRune(0x200d) return s.Freeze() }() - staticPropertySets[URX_ISSPACE_SET] = uset.MustParsePattern(`\p{Whitespace}`, 0).Freeze() + staticPropertySets[URX_ISSPACE_SET] = uprops.MustNewUnicodeSetFomPattern(`\p{Whitespace}`, 0).Freeze() - staticPropertySets[URX_GC_EXTEND] = uset.MustParsePattern(`\p{Grapheme_Extend}`, 0).Freeze() + staticPropertySets[URX_GC_EXTEND] = 
uprops.MustNewUnicodeSetFomPattern(`\p{Grapheme_Extend}`, 0).Freeze() staticPropertySets[URX_GC_CONTROL] = func() *uset.UnicodeSet { s := uset.New() - s.AddAll(uset.MustParsePattern(`[:Zl:]`, 0)) - s.AddAll(uset.MustParsePattern(`[:Zp:]`, 0)) - s.AddAll(uset.MustParsePattern(`[:Cc:]`, 0)) - s.AddAll(uset.MustParsePattern(`[:Cf:]`, 0)) - s.RemoveAll(uset.MustParsePattern(`[:Grapheme_Extend:]`, 0)) + s.AddAll(uprops.MustNewUnicodeSetFomPattern(`[:Zl:]`, 0)) + s.AddAll(uprops.MustNewUnicodeSetFomPattern(`[:Zp:]`, 0)) + s.AddAll(uprops.MustNewUnicodeSetFomPattern(`[:Cc:]`, 0)) + s.AddAll(uprops.MustNewUnicodeSetFomPattern(`[:Cf:]`, 0)) + s.RemoveAll(uprops.MustNewUnicodeSetFomPattern(`[:Grapheme_Extend:]`, 0)) return s.Freeze() }() - staticPropertySets[URX_GC_L] = uset.MustParsePattern(`\p{Hangul_Syllable_Type=L}`, 0).Freeze() - staticPropertySets[URX_GC_LV] = uset.MustParsePattern(`\p{Hangul_Syllable_Type=LV}`, 0).Freeze() - staticPropertySets[URX_GC_LVT] = uset.MustParsePattern(`\p{Hangul_Syllable_Type=LVT}`, 0).Freeze() - staticPropertySets[URX_GC_V] = uset.MustParsePattern(`\p{Hangul_Syllable_Type=V}`, 0).Freeze() - staticPropertySets[URX_GC_T] = uset.MustParsePattern(`\p{Hangul_Syllable_Type=T}`, 0).Freeze() + staticPropertySets[URX_GC_L] = uprops.MustNewUnicodeSetFomPattern(`\p{Hangul_Syllable_Type=L}`, 0).Freeze() + staticPropertySets[URX_GC_LV] = uprops.MustNewUnicodeSetFomPattern(`\p{Hangul_Syllable_Type=LV}`, 0).Freeze() + staticPropertySets[URX_GC_LVT] = uprops.MustNewUnicodeSetFomPattern(`\p{Hangul_Syllable_Type=LVT}`, 0).Freeze() + staticPropertySets[URX_GC_V] = uprops.MustNewUnicodeSetFomPattern(`\p{Hangul_Syllable_Type=V}`, 0).Freeze() + staticPropertySets[URX_GC_T] = uprops.MustNewUnicodeSetFomPattern(`\p{Hangul_Syllable_Type=T}`, 0).Freeze() staticPropertySets[URX_GC_NORMAL] = func() *uset.UnicodeSet { s := uset.New() diff --git a/go/mysql/icuregex/testdata/regextst_extended.txt b/go/mysql/icuregex/testdata/regextst_extended.txt index 
7824d8028a1..841e5e46092 100644 --- a/go/mysql/icuregex/testdata/regextst_extended.txt +++ b/go/mysql/icuregex/testdata/regextst_extended.txt @@ -74,6 +74,7 @@ "\p{Bidi_Class=LeftToRight}" "<0>Goodbye" "\p{Bidi_Class=RightToLeft}" "Goodbye" "\p{Bidi_Class=LeftToRight}" "؈" +"\p{Bidi_Paired_Bracket_Type=Open}" "Good<0>(ye" "\p{Soft_Dotted}" "Good<0>iye" @@ -82,7 +83,44 @@ "\p{Changes_When_Uppercased}" "G<0>oodbye" "\p{Changes_When_CaseMapped}" " <0>Goodbye3" "\p{Cased}" " <0>Goodbye3" +"\p{CaseIgnorable}" "foo<0>.bar" "\p{Indic_Syllabic_Category=Avagraha}" "foo<0>\u09BDbar" "\p{IndicPositionalCategory=Top_And_Left_And_Right}" "foo<0>\u0B4Cbar" -"\p{VerticalOrientation=U}" "foo<0>\uA015bar" \ No newline at end of file +"\p{VerticalOrientation=U}" "foo<0>\uA015bar" + +"\p{Canonical_Combining_Class=Nukta}" "foo<0>\u093Cbar" +"\p{Lead_Canonical_Combining_Class=Above}" "foo<0>\u0300bar" +"\p{Trail_Canonical_Combining_Class=Above}" "foo<0>\u0300bar" + +"\p{Changes_When_Casefolded}" "<0>\uFB03Goodbye" +"\p{Changes_When_Casefolded}" 2 "\uFB03<0>Goodbye" + +"\p{NFC_Inert}" "foo<0>\uFB03bar" +"\p{NFKC_Inert}" "foo<0>\uFB03bar" +"\P{NFD_Inert}" "foo<0>Àbar" +"\P{NFKD_Inert}" "foo<0>Àbar" + +"\p{NFC_Quick_Check=No}" "foo<0>\u0340bar" +"\p{NFKC_Quick_Check=No}" "foo<0>\u0340bar" +"\p{NFD_Quick_Check=No}" "foo<0>\u00C0bar" +"\p{NFKD_Quick_Check=No}" "foo<0>\u00C0bar" + +"\p{Full_Composition_Exclusion}" "foo<0>\u0374bar" + +"\p{Numeric_Type=Decimal}" "foo<0>3bar" +"\p{Joining_Type=Dual_Joining}" "foo<0>\u0626bar" +"\p{Joining_Group=African_Feh}" "foo<0>\u08BBbar" +"\p{General_Category=Close_Punctuation}" "foo[bar" +"\p{General_Category=Close_Punctuation}" "foo<0>]]bar" +"\p{General_Category=Close_Punctuation}" 2 "foo]<0>]bar" + +"\p{Hangul_Syllable_Type=Not_Applicable}" "<0>f" +"\p{Hangul_Syllable_Type=Leading_Jamo}" "foo<0>\u1100bar" + +"\p{Regional_Indicator=Yes}" "foo<0>\U0001F1E6bar" + +# Currently unsupported property classes below. 
They require +# significant additional code to support. +"\p{Changes_When_NFKC_Casefolded}" E "foo<0>\uFB03bar" +"\p{Segment_Starter}" E "<0>\uFB03Goodbye" \ No newline at end of file diff --git a/go/vt/vtgate/evalengine/fn_regexp.go b/go/vt/vtgate/evalengine/fn_regexp.go index 1128b1155d9..b554da87856 100644 --- a/go/vt/vtgate/evalengine/fn_regexp.go +++ b/go/vt/vtgate/evalengine/fn_regexp.go @@ -43,10 +43,6 @@ func evalRegexpFlags(env *ExpressionEnv, match Expr, flags icuregex.RegexpFlag) return flags, nil } -func regexpMatcher(input, pat, flags eval) (*icuregex.Matcher, error) { - -} - func (r *builtinRegexpLike) eval(env *ExpressionEnv) (eval, error) { input, err := r.Arguments[0].eval(env) if err != nil || input == nil { From c9c9b8ef90c2a759c62d12d52c17266e346e5cf2 Mon Sep 17 00:00:00 2001 From: Dirkjan Bussink Date: Thu, 29 Jun 2023 18:50:35 +0200 Subject: [PATCH 05/18] Update sizegen Signed-off-by: Dirkjan Bussink --- go/vt/vtgate/evalengine/cached_size.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/go/vt/vtgate/evalengine/cached_size.go b/go/vt/vtgate/evalengine/cached_size.go index c249bf3e86c..8dda2d3902c 100644 --- a/go/vt/vtgate/evalengine/cached_size.go +++ b/go/vt/vtgate/evalengine/cached_size.go @@ -1257,6 +1257,18 @@ func (cached *builtinRandomBytes) CachedSize(alloc bool) int64 { size += cached.CallExpr.CachedSize(false) return size } +func (cached *builtinRegexpLike) CachedSize(alloc bool) int64 { + if cached == nil { + return int64(0) + } + size := int64(0) + if alloc { + size += int64(48) + } + // field CallExpr vitess.io/vitess/go/vt/vtgate/evalengine.CallExpr + size += cached.CallExpr.CachedSize(false) + return size +} func (cached *builtinRepeat) CachedSize(alloc bool) int64 { if cached == nil { return int64(0) From 01f14315dfdabc4765ea029fde4b4aa4fb3875d3 Mon Sep 17 00:00:00 2001 From: Dirkjan Bussink Date: Fri, 30 Jun 2023 20:28:59 +0200 Subject: [PATCH 06/18] icuregex: Fix invalid slice creation Parse the structure so 
we can create buffers with the proper size and never with infinite sizes. While this was not the immediate cause of the race error, it's better to create with the right slice size also for debugging when digging into it. The real fix here is that the size of `algorithmicRange` includes the size of the struct itself, so if we want to get the remaining slice size it needs to subtract this value. Signed-off-by: Dirkjan Bussink --- go/mysql/icuregex/internal/udata/udata.go | 4 +- go/mysql/icuregex/internal/unames/unames.go | 72 ++++++++++++++------- 2 files changed, 51 insertions(+), 25 deletions(-) diff --git a/go/mysql/icuregex/internal/udata/udata.go b/go/mysql/icuregex/internal/udata/udata.go index 50b67b5f427..802f2420acf 100644 --- a/go/mysql/icuregex/internal/udata/udata.go +++ b/go/mysql/icuregex/internal/udata/udata.go @@ -135,8 +135,8 @@ func (b *Bytes) Int32() int32 { return int32(b.Uint32()) } -func (b *Bytes) Pointer() unsafe.Pointer { - return unsafe.Pointer(unsafe.SliceData(b.buf)) +func (b *Bytes) Buffer() []byte { + return b.buf } func (b *Bytes) Skip(size int32) { diff --git a/go/mysql/icuregex/internal/unames/unames.go b/go/mysql/icuregex/internal/unames/unames.go index f015a77485e..ce2af805b08 100644 --- a/go/mysql/icuregex/internal/unames/unames.go +++ b/go/mysql/icuregex/internal/unames/unames.go @@ -23,7 +23,6 @@ package unames import ( "bytes" - "math" "strconv" "strings" "sync" @@ -51,10 +50,39 @@ func loadCharNames() { }); err != nil { panic(err) } - charNames = (*UCharNames)(b.Pointer()) + charNames = &UCharNames{ + tokenStringOffset: b.Uint32() - 16, + groupsOffset: b.Uint32() - 16, + groupStringOffset: b.Uint32() - 16, + algNamesOffset: b.Uint32() - 16, + buf: b.Buffer(), + } }) } +func (names *UCharNames) tokenStrings() []uint8 { + return names.buf[names.tokenStringOffset:names.groupsOffset] +} + +func (names *UCharNames) getGroupName(group []uint16) []uint8 { + return names.buf[names.groupStringOffset+names.getGroupOffset(group) : 
names.algNamesOffset] +} + +func (names *UCharNames) getGroups() []uint16 { + buf := names.buf[names.groupsOffset:names.groupStringOffset] + return unsafe.Slice((*uint16)(unsafe.Pointer(unsafe.SliceData(buf))), len(buf)/2) +} + +func (names *UCharNames) tokens() []uint16 { + buf := names.buf[:names.tokenStringOffset] + return unsafe.Slice((*uint16)(unsafe.Pointer(unsafe.SliceData(buf))), len(buf)/2) +} + +func (names *UCharNames) algNames() []uint32 { + buf := names.buf[names.algNamesOffset:] + return unsafe.Slice((*uint32)(unsafe.Pointer(unsafe.SliceData(buf))), len(buf)/4) +} + type NameChoice int32 const ( @@ -91,11 +119,15 @@ func (ar *algorithmicRange) ptrend(offset uintptr) unsafe.Pointer { } func (ar *algorithmicRange) slice8(offset uintptr) []uint8 { - return unsafe.Slice((*uint8)(ar.ptrend(offset)), ar.size) + return unsafe.Slice((*uint8)(ar.ptrend(offset)), ar.sliceSize()) } func (ar *algorithmicRange) slice16() []uint16 { - return unsafe.Slice((*uint16)(ar.ptrend(0)), ar.size/2) + return unsafe.Slice((*uint16)(ar.ptrend(0)), ar.sliceSize()/2) +} + +func (ar *algorithmicRange) sliceSize() int { + return int(ar.size) - int(unsafe.Sizeof(algorithmicRange{})) } func (ar *algorithmicRange) findAlgName(choice NameChoice, otherName string) rune { @@ -241,15 +273,20 @@ func CharForName(nameChoice NameChoice, name string) rune { return -1 } - p := charNames.ptr32(charNames.algNamesOffset) + p := charNames.algNames() i := p[0] algRange := (*algorithmicRange)(unsafe.Pointer(unsafe.SliceData(p[1:]))) - for i > 0 { + for { if cp := algRange.findAlgName(nameChoice, upper); cp != -1 { return cp } - algRange = algRange.next() i-- + if i == 0 { + break + } + // Only move to the next if we know it's safe, or we otherwise + // create slices that are outside the buffer. 
+ algRange = algRange.next() } return charNames.enumNames(0, 0x10ffff+1, upper, nameChoice) @@ -257,6 +294,7 @@ func CharForName(nameChoice NameChoice, name string) rune { type UCharNames struct { tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset uint32 + buf []byte } const GROUP_SHIFT = 5 @@ -316,20 +354,8 @@ func (names *UCharNames) enumNames(start, limit rune, otherName string, nameChoi return -1 } -func (names *UCharNames) ptr8(offset8 uint32) []byte { - return unsafe.Slice((*uint8)(unsafe.Add(unsafe.Pointer(names), offset8)), math.MaxInt) -} - -func (names *UCharNames) ptr16(offset8 uint32) []uint16 { - return unsafe.Slice((*uint16)(unsafe.Add(unsafe.Pointer(names), offset8)), math.MaxInt/2) -} - -func (names *UCharNames) ptr32(offset8 uint32) []uint32 { - return unsafe.Slice((*uint32)(unsafe.Add(unsafe.Pointer(names), offset8)), math.MaxInt/4) -} - func (names *UCharNames) getGroup(code rune) []uint16 { - groups := names.ptr16(names.groupsOffset) + groups := names.getGroups() groupMSB := uint16(code >> GROUP_SHIFT) start := 0 @@ -357,7 +383,7 @@ func (names *UCharNames) enumGroupNames(group []uint16, start, end rune, otherNa var offsets [LINES_PER_GROUP + 2]uint16 var lengths [LINES_PER_GROUP + 2]uint16 - s := names.ptr8(names.groupStringOffset + names.getGroupOffset(group)) + s := names.getGroupName(group) s = expandGroupLengths(s, offsets[:0], lengths[:0]) for start < end { @@ -423,12 +449,12 @@ func expandGroupLengths(s []uint8, offsets []uint16, lengths []uint16) []uint8 { } func (names *UCharNames) compareName(name []byte, choice NameChoice, otherName string) bool { - tokens := names.ptr16(0)[8:] + tokens := names.tokens() tokenCount := tokens[0] tokens = tokens[1:] - tokenStrings := names.ptr8(names.tokenStringOffset) + tokenStrings := names.tokenStrings() otherNameLen := len(otherName) for len(name) > 0 && len(otherName) > 0 { From 73dd23f2ef6d24a242201c21e418ee744890b9b4 Mon Sep 17 00:00:00 2001 From: Dirkjan Bussink Date: Sat, 1 Jul 
2023 20:30:40 +0200 Subject: [PATCH 07/18] icuregex: Create valid slice length for algorithmicRange We also want to create a valid slice length for the additional data, this was too long if an offset was given and would read into the next entry. Signed-off-by: Dirkjan Bussink --- go/mysql/icuregex/internal/unames/unames.go | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/go/mysql/icuregex/internal/unames/unames.go b/go/mysql/icuregex/internal/unames/unames.go index ce2af805b08..7ca2ee299cb 100644 --- a/go/mysql/icuregex/internal/unames/unames.go +++ b/go/mysql/icuregex/internal/unames/unames.go @@ -119,7 +119,7 @@ func (ar *algorithmicRange) ptrend(offset uintptr) unsafe.Pointer { } func (ar *algorithmicRange) slice8(offset uintptr) []uint8 { - return unsafe.Slice((*uint8)(ar.ptrend(offset)), ar.sliceSize()) + return unsafe.Slice((*uint8)(ar.ptrend(offset)), ar.sliceSize()-int(offset)) } func (ar *algorithmicRange) slice16() []uint16 { @@ -229,10 +229,8 @@ func (ar *algorithmicRange) findAlgName(choice NameChoice, otherName string) run } func (ar *algorithmicRange) writeFactorSuffix0(factors []uint16, count int, s []uint8, buf *strings.Builder, elements, elementBases *[8][]byte) { - i := 0 - /* write each element */ - for { + for i := 0; i < count; i++ { (*elements)[i] = s (*elementBases)[i] = s @@ -240,17 +238,11 @@ func (ar *algorithmicRange) writeFactorSuffix0(factors []uint16, count int, s [] buf.Write(s[:nul]) s = s[nul+1:] - if i >= count { - break - } - factor := int(factors[i] - 1) for factor > 0 { s = s[bytes.IndexByte(s, 0)+1:] factor-- } - - i++ } } From b8d23b2e21eb1e2ccdf292039acab3c452636f50 Mon Sep 17 00:00:00 2001 From: Dirkjan Bussink Date: Sat, 1 Jul 2023 20:35:04 +0200 Subject: [PATCH 08/18] icuregex: Clean up more unsafe usage This reduces unsafe usage to just udata and doesn't use it anywhere outside of it. Makes it more Go idiomatic this way. 
Signed-off-by: Dirkjan Bussink --- go/mysql/icuregex/internal/udata/udata.go | 23 ++-- go/mysql/icuregex/internal/unames/unames.go | 145 ++++++++------------ go/mysql/icuregex/internal/uprops/uprops.go | 12 +- 3 files changed, 72 insertions(+), 108 deletions(-) diff --git a/go/mysql/icuregex/internal/udata/udata.go b/go/mysql/icuregex/internal/udata/udata.go index 802f2420acf..f20f8be1efa 100644 --- a/go/mysql/icuregex/internal/udata/udata.go +++ b/go/mysql/icuregex/internal/udata/udata.go @@ -23,7 +23,7 @@ package udata import ( "encoding/binary" - "fmt" + "errors" "unsafe" ) @@ -92,21 +92,26 @@ func (b *Bytes) ReadHeader(isValid func(info *DataInfo) bool) error { header := (*DataHeader)(unsafe.Pointer(data)) if header.dataHeader.magic1 != 0xda || header.dataHeader.magic2 != 0x27 { - return fmt.Errorf("invalid magic number") + return errors.New("invalid magic number") } if header.info.IsBigEndian != 0 { - return fmt.Errorf("unsupported: BigEndian data source") + return errors.New("unsupported: BigEndian data source") } if !isValid(&header.info) { - return fmt.Errorf("failed to validate data header") + return errors.New("failed to validate data header") } b.buf = b.buf[header.dataHeader.headerSize:] return nil } +func (b *Bytes) Uint8() uint8 { + u := b.buf[0] + b.buf = b.buf[1:] + return u +} func (b *Bytes) Uint16() uint16 { u := b.enc.Uint16(b.buf) b.buf = b.buf[2:] @@ -135,10 +140,6 @@ func (b *Bytes) Int32() int32 { return int32(b.Uint32()) } -func (b *Bytes) Buffer() []byte { - return b.buf -} - func (b *Bytes) Skip(size int32) { b.buf = b.buf[size:] } @@ -149,12 +150,6 @@ func (b *Bytes) Uint8Slice(n int32) []uint8 { return s } -func (b *Bytes) String(size int32) string { - s := unsafe.String(&b.buf[0], size) - b.buf = b.buf[size:] - return s -} - func (b *Bytes) Position() int32 { return int32(len(b.orig) - len(b.buf)) } diff --git a/go/mysql/icuregex/internal/unames/unames.go b/go/mysql/icuregex/internal/unames/unames.go index 7ca2ee299cb..1a7329189ac 
100644 --- a/go/mysql/icuregex/internal/unames/unames.go +++ b/go/mysql/icuregex/internal/unames/unames.go @@ -26,7 +26,6 @@ import ( "strconv" "strings" "sync" - "unsafe" "vitess.io/vitess/go/mysql/icuregex/internal/icudata" "vitess.io/vitess/go/mysql/icuregex/internal/udata" @@ -35,6 +34,14 @@ import ( var charNamesOnce sync.Once var charNames *UCharNames +type UCharNames struct { + tokens []uint16 + tokenStrings []uint8 + groups []uint16 + groupNames []uint8 + algNames []algorithmicRange +} + func loadCharNames() { charNamesOnce.Do(func() { b := udata.NewBytes(icudata.UNames) @@ -50,37 +57,43 @@ func loadCharNames() { }); err != nil { panic(err) } + + tokenStringOffset := int32(b.Uint32() - 16) + groupsOffset := int32(b.Uint32() - 16) + groupStringOffset := int32(b.Uint32() - 16) + algNamesOffset := int32(b.Uint32() - 16) charNames = &UCharNames{ - tokenStringOffset: b.Uint32() - 16, - groupsOffset: b.Uint32() - 16, - groupStringOffset: b.Uint32() - 16, - algNamesOffset: b.Uint32() - 16, - buf: b.Buffer(), + tokens: b.Uint16Slice(tokenStringOffset / 2), + tokenStrings: b.Uint8Slice(groupsOffset - tokenStringOffset), + groups: b.Uint16Slice((groupStringOffset - groupsOffset) / 2), + groupNames: b.Uint8Slice(algNamesOffset - groupStringOffset), } - }) -} -func (names *UCharNames) tokenStrings() []uint8 { - return names.buf[names.tokenStringOffset:names.groupsOffset] -} + algCount := b.Uint32() + charNames.algNames = make([]algorithmicRange, 0, algCount) -func (names *UCharNames) getGroupName(group []uint16) []uint8 { - return names.buf[names.groupStringOffset+names.getGroupOffset(group) : names.algNamesOffset] -} - -func (names *UCharNames) getGroups() []uint16 { - buf := names.buf[names.groupsOffset:names.groupStringOffset] - return unsafe.Slice((*uint16)(unsafe.Pointer(unsafe.SliceData(buf))), len(buf)/2) -} - -func (names *UCharNames) tokens() []uint16 { - buf := names.buf[:names.tokenStringOffset] - return 
unsafe.Slice((*uint16)(unsafe.Pointer(unsafe.SliceData(buf))), len(buf)/2) + for i := uint32(0); i < algCount; i++ { + ar := algorithmicRange{ + start: b.Uint32(), + end: b.Uint32(), + typ: b.Uint8(), + variant: b.Uint8(), + } + size := b.Uint16() + switch ar.typ { + case 0: + ar.s = b.Uint8Slice(int32(size) - 12) + case 1: + ar.factors = b.Uint16Slice(int32(ar.variant)) + ar.s = b.Uint8Slice(int32(size) - 12 - int32(ar.variant)*2) + } + charNames.algNames = append(charNames.algNames, ar) + } + }) } -func (names *UCharNames) algNames() []uint32 { - buf := names.buf[names.algNamesOffset:] - return unsafe.Slice((*uint32)(unsafe.Pointer(unsafe.SliceData(buf))), len(buf)/4) +func (names *UCharNames) getGroupName(group []uint16) []uint8 { + return names.groupNames[names.getGroupOffset(group):] } type NameChoice int32 @@ -105,35 +118,16 @@ const ( ) type algorithmicRange struct { - start, end uint32 - type_, variant uint8 - size uint16 + start, end uint32 + typ, variant uint8 + factors []uint16 + s []uint8 } -func (ar *algorithmicRange) next() *algorithmicRange { - return (*algorithmicRange)(unsafe.Add(unsafe.Pointer(ar), ar.size)) -} - -func (ar *algorithmicRange) ptrend(offset uintptr) unsafe.Pointer { - return unsafe.Add(unsafe.Pointer(ar), unsafe.Sizeof(algorithmicRange{})+offset) -} - -func (ar *algorithmicRange) slice8(offset uintptr) []uint8 { - return unsafe.Slice((*uint8)(ar.ptrend(offset)), ar.sliceSize()-int(offset)) -} - -func (ar *algorithmicRange) slice16() []uint16 { - return unsafe.Slice((*uint16)(ar.ptrend(0)), ar.sliceSize()/2) -} - -func (ar *algorithmicRange) sliceSize() int { - return int(ar.size) - int(unsafe.Sizeof(algorithmicRange{})) -} - -func (ar *algorithmicRange) findAlgName(choice NameChoice, otherName string) rune { - switch ar.type_ { +func (ar *algorithmicRange) findAlgName(otherName string) rune { + switch ar.typ { case 0: - s := ar.slice8(0) + s := ar.s for s[0] != 0 && len(otherName) > 0 { if s[0] != otherName[0] { @@ -161,9 +155,8 @@ 
func (ar *algorithmicRange) findAlgName(choice NameChoice, otherName string) run return code } case 1: - factors := ar.slice16() - count := int(ar.variant) - s := ar.slice8(2 * uintptr(count)) + factors := ar.factors + s := ar.s for s[0] != 0 && len(otherName) > 0 { if s[0] != otherName[0] { @@ -182,14 +175,14 @@ func (ar *algorithmicRange) findAlgName(choice NameChoice, otherName string) run var elements [8][]byte var elementBases [8][]byte - ar.writeFactorSuffix0(factors, count, s, &buf, &elements, &elementBases) + ar.writeFactorSuffix0(factors, s, &buf, &elements, &elementBases) if buf.String() == otherName { return start } for start+1 < limit { start++ - i := count + i := len(factors) for { i-- @@ -200,14 +193,14 @@ func (ar *algorithmicRange) findAlgName(choice NameChoice, otherName string) run s = s[bytes.IndexByte(s, 0)+1:] elements[i] = s break - } else { - indexes[i] = 0 - elements[i] = elementBases[i] } + + indexes[i] = 0 + elements[i] = elementBases[i] } t := otherName - for i = 0; i < count; i++ { + for i = 0; i < len(factors); i++ { s = elements[i] for s[0] != 0 && len(t) > 0 { @@ -228,9 +221,9 @@ func (ar *algorithmicRange) findAlgName(choice NameChoice, otherName string) run return -1 } -func (ar *algorithmicRange) writeFactorSuffix0(factors []uint16, count int, s []uint8, buf *strings.Builder, elements, elementBases *[8][]byte) { +func (ar *algorithmicRange) writeFactorSuffix0(factors []uint16, s []uint8, buf *strings.Builder, elements, elementBases *[8][]byte) { /* write each element */ - for i := 0; i < count; i++ { + for i := 0; i < len(factors); i++ { (*elements)[i] = s (*elementBases)[i] = s @@ -265,30 +258,15 @@ func CharForName(nameChoice NameChoice, name string) rune { return -1 } - p := charNames.algNames() - i := p[0] - algRange := (*algorithmicRange)(unsafe.Pointer(unsafe.SliceData(p[1:]))) - for { - if cp := algRange.findAlgName(nameChoice, upper); cp != -1 { + for _, ar := range charNames.algNames { + if cp := ar.findAlgName(upper); cp 
!= -1 { return cp } - i-- - if i == 0 { - break - } - // Only move to the next if we know it's safe, or we otherwise - // create slices that are outside the buffer. - algRange = algRange.next() } return charNames.enumNames(0, 0x10ffff+1, upper, nameChoice) } -type UCharNames struct { - tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset uint32 - buf []byte -} - const GROUP_SHIFT = 5 const LINES_PER_GROUP = 1 << GROUP_SHIFT const GROUP_MASK = LINES_PER_GROUP - 1 @@ -347,7 +325,7 @@ func (names *UCharNames) enumNames(start, limit rune, otherName string, nameChoi } func (names *UCharNames) getGroup(code rune) []uint16 { - groups := names.getGroups() + groups := names.groups groupMSB := uint16(code >> GROUP_SHIFT) start := 0 @@ -441,12 +419,11 @@ func expandGroupLengths(s []uint8, offsets []uint16, lengths []uint16) []uint8 { } func (names *UCharNames) compareName(name []byte, choice NameChoice, otherName string) bool { - tokens := names.tokens() + tokens := names.tokens tokenCount := tokens[0] tokens = tokens[1:] - tokenStrings := names.tokenStrings() otherNameLen := len(otherName) for len(name) > 0 && len(otherName) > 0 { @@ -483,7 +460,7 @@ func (names *UCharNames) compareName(name []byte, choice NameChoice, otherName s break } } else { - tokenString := tokenStrings[token:] + tokenString := names.tokenStrings[token:] for tokenString[0] != 0 && len(otherName) > 0 { if tokenString[0] != otherName[0] { return false diff --git a/go/mysql/icuregex/internal/uprops/uprops.go b/go/mysql/icuregex/internal/uprops/uprops.go index f8683b1b269..37608b989d4 100644 --- a/go/mysql/icuregex/internal/uprops/uprops.go +++ b/go/mysql/icuregex/internal/uprops/uprops.go @@ -31,9 +31,8 @@ import ( ) var pnames struct { - valueMaps []uint32 - byteTrie []uint8 - nameGroups string + valueMaps []uint32 + byteTrie []uint8 } func readData(bytes *udata.Bytes) error { @@ -78,13 +77,6 @@ func readData(bytes *udata.Bytes) error { numBytes := nextOffset - offset pnames.byteTrie = 
bytes.Uint8Slice(numBytes) - - offset = nextOffset - nextOffset = indexes[IX_RESERVED3_OFFSET] - numBytes = nextOffset - offset - - pnames.nameGroups = bytes.String(numBytes) - return nil } From 4348b185202f671d12856ee7959cfc9612f411ce Mon Sep 17 00:00:00 2001 From: Dirkjan Bussink Date: Mon, 3 Jul 2023 09:37:09 +0200 Subject: [PATCH 09/18] icuregex: Use more Go like naming and reduce exposed API Signed-off-by: Dirkjan Bussink --- go/mysql/icuregex/compiler.go | 1336 ++++++++--------- go/mysql/icuregex/compiler_table.go | 40 +- go/mysql/icuregex/debug.go | 126 +- go/mysql/icuregex/error.go | 44 +- go/mysql/icuregex/icu_test.go | 70 +- .../icuregex/internal/bytestrie/bytes_trie.go | 159 +- go/mysql/icuregex/internal/icudata/embed.go | 12 +- .../icuregex/internal/normalizer/constants.go | 90 +- .../internal/normalizer/normalizer.go | 258 ++-- .../icuregex/internal/pattern/unescape.go | 10 +- go/mysql/icuregex/internal/pattern/utils.go | 18 +- go/mysql/icuregex/internal/ubidi/ubidi.go | 395 +++-- go/mysql/icuregex/internal/ucase/fold.go | 73 +- go/mysql/icuregex/internal/ucase/ucase.go | 169 ++- go/mysql/icuregex/internal/uchar/constants.go | 166 +- go/mysql/icuregex/internal/uchar/uchar.go | 129 +- go/mysql/icuregex/internal/uerror/error.go | 53 +- go/mysql/icuregex/internal/ulayout/ulayout.go | 29 +- go/mysql/icuregex/internal/unames/unames.go | 101 +- .../icuregex/internal/unames/unames_test.go | 4 +- .../icuregex/internal/uprops/constants.go | 386 +++-- .../icuregex/internal/uprops/properties.go | 196 ++- go/mysql/icuregex/internal/uprops/uprops.go | 149 +- .../icuregex/internal/uprops/uprops_binary.go | 200 +-- .../icuregex/internal/uprops/uprops_int.go | 223 ++- go/mysql/icuregex/internal/uprops/uscript.go | 410 ++--- go/mysql/icuregex/internal/uset/close.go | 10 +- go/mysql/icuregex/internal/uset/frozen.go | 3 - go/mysql/icuregex/internal/uset/pattern.go | 2 +- .../icuregex/internal/uset/unicode_set.go | 81 +- go/mysql/icuregex/internal/utrie/ucptrie.go | 313 
++-- go/mysql/icuregex/internal/utrie/utrie2.go | 164 +- go/mysql/icuregex/matcher.go | 372 +++-- go/mysql/icuregex/ops.go | 266 ++-- go/mysql/icuregex/pattern.go | 26 +- go/mysql/icuregex/perl_test.go | 14 +- go/mysql/icuregex/sets.go | 38 +- go/vt/vtgate/evalengine/fn_regexp.go | 12 +- 38 files changed, 2994 insertions(+), 3153 deletions(-) diff --git a/go/mysql/icuregex/compiler.go b/go/mysql/icuregex/compiler.go index 0bee3e49b26..c1544e2bd7b 100644 --- a/go/mysql/icuregex/compiler.go +++ b/go/mysql/icuregex/compiler.go @@ -22,9 +22,7 @@ limitations under the License. package icuregex import ( - "fmt" "math" - "os" "strings" "unicode/utf8" @@ -41,7 +39,7 @@ import ( ) const BreakIteration = false -const kStackSize = 100 +const stackSize = 100 type reChar struct { char rune @@ -73,7 +71,7 @@ const ( setIntersection1 setOperation = 4<<16 | 8 // '&', single amp intersection op, for compatibility with old UnicodeSet. ) -type Compiler struct { +type compiler struct { err error out *Pattern p string @@ -89,7 +87,7 @@ type Compiler struct { peekChar rune c reChar - stack [kStackSize]uint16 + stack [stackSize]uint16 stackPtr int modeFlags RegexpFlag @@ -112,8 +110,8 @@ type Compiler struct { captureName *strings.Builder } -func NewCompiler(pat *Pattern) *Compiler { - return &Compiler{ +func newCompiler(pat *Pattern) *compiler { + return &compiler{ out: pat, scanIndex: 0, eolComments: true, @@ -128,7 +126,7 @@ func NewCompiler(pat *Pattern) *Compiler { } } -func (c *Compiler) nextCharLL() (ch rune) { +func (c *compiler) nextCharLL() (ch rune) { if c.peekChar != -1 { ch, c.peekChar = c.peekChar, -1 return @@ -152,21 +150,21 @@ func (c *Compiler) nextCharLL() (ch rune) { return } -func (c *Compiler) peekCharLL() rune { +func (c *compiler) peekCharLL() rune { if c.peekChar == -1 { c.peekChar = c.nextCharLL() } return c.peekChar } -func (c *Compiler) nextChar(ch *reChar) { +func (c *compiler) nextChar(ch *reChar) { c.scanIndex++ ch.char = c.nextCharLL() ch.quoted = false if 
c.quoteMode { ch.quoted = true - if (ch.char == chBackSlash && c.peekCharLL() == chE && ((c.modeFlags & UREGEX_LITERAL) == 0)) || + if (ch.char == chBackSlash && c.peekCharLL() == chE && ((c.modeFlags & Literal) == 0)) || ch.char == -1 { c.quoteMode = false // Exit quote mode, c.nextCharLL() // discard the E @@ -182,7 +180,7 @@ func (c *Compiler) nextChar(ch *reChar) { } else { // We are not in a \Q quoted region \E of the source. // - if (c.modeFlags & UREGEX_COMMENTS) != 0 { + if (c.modeFlags & Comments) != 0 { // // We are in free-spacing and comments mode. // Scan through any white space and comments, until we @@ -228,7 +226,7 @@ func (c *Compiler) nextChar(ch *reChar) { ch.char, c.p = pattern.UnescapeAt(beforeEscape) if ch.char < 0 { - c.error(uerror.U_REGEX_BAD_ESCAPE_SEQUENCE) + c.error(uerror.BadEscapeSequence) } c.charNum += len(beforeEscape) - len(c.p) } else if c.peekCharLL() == chDigit0 { @@ -246,7 +244,7 @@ func (c *Compiler) nextChar(ch *reChar) { if ch2 < chDigit0 || ch2 > chDigit7 { if index == 0 { // \0 is not followed by any octal digits. - c.error(uerror.U_REGEX_BAD_ESCAPE_SEQUENCE) + c.error(uerror.BadEscapeSequence) } break } @@ -305,7 +303,7 @@ const ( chDash = 0x2d // '-' ) -func (c *Compiler) compile(pat string) error { +func (c *compiler) compile(pat string) error { if c.err != nil { return c.err } @@ -320,7 +318,7 @@ func (c *Compiler) compile(pat string) error { var table []regexTableEl // UREGEX_LITERAL force entire pattern to be treated as a literal string. 
- if c.modeFlags&UREGEX_LITERAL != 0 { + if c.modeFlags&Literal != 0 { c.quoteMode = true } @@ -372,8 +370,8 @@ func (c *Compiler) compile(pat string) error { if table[0].pushState != 0 { c.stackPtr++ - if c.stackPtr >= kStackSize { - c.error(uerror.U_REGEX_INTERNAL_ERROR) + if c.stackPtr >= stackSize { + c.error(uerror.InternalError) c.stackPtr-- } c.stack[c.stackPtr] = uint16(table[0].pushState) @@ -390,7 +388,7 @@ func (c *Compiler) compile(pat string) error { c.stackPtr-- if c.stackPtr < 0 { c.stackPtr++ - c.error(uerror.U_REGEX_MISMATCHED_PAREN) + c.error(uerror.MismatchedParen) } } } @@ -399,7 +397,7 @@ func (c *Compiler) compile(pat string) error { return c.err } - c.allocateStackData(RESTACKFRAME_HDRCOUNT) + c.allocateStackData(restackframeHdrCount) c.stripNOPs() c.out.minMatchLen = c.minMatchLength(3, len(c.out.compiledPat)-1) @@ -408,13 +406,7 @@ func (c *Compiler) compile(pat string) error { return c.err } -const DebugParseActions = false - -func (c *Compiler) doParseActions(action patternParseAction) bool { - if DebugParseActions { - fmt.Fprintf(os.Stderr, "doParseActions(action=%d)\n\t%s\n", action, c.p) - } - +func (c *compiler) doParseActions(action patternParseAction) bool { switch action { case doPatStart: // Start of pattern compiles to: @@ -425,9 +417,9 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { // the start of an ( grouping. //4 NOP Resreved, will be replaced by a save if there are // OR | operators at the top level - c.appendOp(URX_STATE_SAVE, 2) - c.appendOp(URX_JMP, 3) - c.appendOp(URX_FAIL, 0) + c.appendOp(urxStateSave, 2) + c.appendOp(urxJmp, 3) + c.appendOp(urxFail, 0) // Standard open nonCapture paren action emits the two NOPs and // sets up the paren stack frame. @@ -445,11 +437,11 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { c.handleCloseParen() if len(c.parenStack) > 0 { // Missing close paren in pattern. 
- c.error(uerror.U_REGEX_MISMATCHED_PAREN) + c.error(uerror.MismatchedParen) } // add the END operation to the compiled pattern. - c.appendOp(URX_END, 0) + c.appendOp(urxEnd, 0) // Terminate the pattern compilation state machine. return false @@ -468,17 +460,17 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { savePosition, c.parenStack = stackPop(c.parenStack) op := c.out.compiledPat[savePosition] - if op.Type() != URX_NOP { + if op.typ() != urxNop { panic("expected a NOP placeholder") } - op = c.buildOp(URX_STATE_SAVE, len(c.out.compiledPat)+1) + op = c.buildOp(urxStateSave, len(c.out.compiledPat)+1) c.out.compiledPat[savePosition] = op // Append an JMP operation into the compiled pattern. The operand for // the JMP will eventually be the location following the ')' for the // group. This will be patched in later, when the ')' is encountered. - c.appendOp(URX_JMP, 0) + c.appendOp(urxJmp, 0) // Push the position of the newly added JMP op onto the parentheses stack. // This registers if for fixup when this block's close paren is encountered. @@ -487,7 +479,7 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { // Append a NOP to the compiled pattern. This is the slot reserved // for a SAVE in the event that there is yet another '|' following // this one. - c.appendOp(URX_NOP, 0) + c.appendOp(urxNop, 0) c.parenStack = append(c.parenStack, len(c.out.compiledPat)-1) case doBeginNamedCapture: @@ -499,7 +491,7 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { c.captureName.WriteRune(c.c.char) case doBadNamedCapture: - c.error(uerror.U_REGEX_INVALID_CAPTURE_GROUP_NAME) + c.error(uerror.InvalidCaptureGroupName) case doOpenCaptureParen: // Open Capturing Paren, possibly named. @@ -520,10 +512,10 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { // encountered. This will be promoted to a completed capture when (and if) the corresponding // END_CAPTURE is encountered. 
c.fixLiterals(false) - c.appendOp(URX_NOP, 0) + c.appendOp(urxNop, 0) varsLoc := c.allocateStackData(3) // Reserve three slots in match stack frame. - c.appendOp(URX_START_CAPTURE, varsLoc) - c.appendOp(URX_NOP, 0) + c.appendOp(urxStartCapture, varsLoc) + c.appendOp(urxNop, 0) // On the Parentheses stack, start a new frame and add the postions // of the two NOPs. Depending on what follows in the pattern, the @@ -547,7 +539,7 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { c.captureName = nil if _, ok := c.out.namedCaptureMap[captureName]; ok { - c.error(uerror.U_REGEX_INVALID_CAPTURE_GROUP_NAME) + c.error(uerror.InvalidCaptureGroupName) } c.out.namedCaptureMap[captureName] = groupNumber } @@ -560,8 +552,8 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { // - NOP, which may later be replaced by a save-state if there // is an '|' alternation within the parens. c.fixLiterals(false) - c.appendOp(URX_NOP, 0) - c.appendOp(URX_NOP, 0) + c.appendOp(urxNop, 0) + c.appendOp(urxNop, 0) // On the Parentheses stack, start a new frame and add the postions // of the two NOPs. @@ -579,10 +571,10 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { // - NOP, which may later be replaced by a save-state if there // is an '|' alternation within the parens. c.fixLiterals(false) - c.appendOp(URX_NOP, 0) + c.appendOp(urxNop, 0) varLoc := c.allocateData(1) // Reserve a data location for saving the state stack ptr. - c.appendOp(URX_STO_SP, varLoc) - c.appendOp(URX_NOP, 0) + c.appendOp(urxStoSp, varLoc) + c.appendOp(urxNop, 0) // On the Parentheses stack, start a new frame and add the postions // of the two NOPs. Depending on what follows in the pattern, the @@ -626,13 +618,13 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { // 3: fActiveLimit, the active bounds limit on entry. 
c.fixLiterals(false) dataLoc := c.allocateData(4) - c.appendOp(URX_LA_START, dataLoc) - c.appendOp(URX_STATE_SAVE, len(c.out.compiledPat)+2) - c.appendOp(URX_JMP, len(c.out.compiledPat)+3) - c.appendOp(URX_LA_END, dataLoc) - c.appendOp(URX_BACKTRACK, 0) - c.appendOp(URX_NOP, 0) - c.appendOp(URX_NOP, 0) + c.appendOp(urxLaStart, dataLoc) + c.appendOp(urxStateSave, len(c.out.compiledPat)+2) + c.appendOp(urxJmp, len(c.out.compiledPat)+3) + c.appendOp(urxLaEnd, dataLoc) + c.appendOp(urxBacktrack, 0) + c.appendOp(urxNop, 0) + c.appendOp(urxNop, 0) // On the Parentheses stack, start a new frame and add the postions // of the NOPs. @@ -660,9 +652,9 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { // 3: fActiveLimit, the active bounds limit on entry. c.fixLiterals(false) dataLoc := c.allocateData(4) - c.appendOp(URX_LA_START, dataLoc) - c.appendOp(URX_STATE_SAVE, 0) // dest address will be patched later. - c.appendOp(URX_NOP, 0) + c.appendOp(urxLaStart, dataLoc) + c.appendOp(urxStateSave, 0) // dest address will be patched later. + c.appendOp(urxNop, 0) // On the Parentheses stack, start a new frame and add the postions // of the StateSave and NOP. @@ -702,16 +694,16 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { dataLoc := c.allocateData(5) // Emit URX_LB_START - c.appendOp(URX_LB_START, dataLoc) + c.appendOp(urxLbStart, dataLoc) // Emit URX_LB_CONT - c.appendOp(URX_LB_CONT, dataLoc) - c.appendOp(URX_RESERVED_OP, 0) // MinMatchLength. To be filled later. - c.appendOp(URX_RESERVED_OP, 0) // MaxMatchLength. To be filled later. + c.appendOp(urxLbCont, dataLoc) + c.appendOp(urxReservedOp, 0) // MinMatchLength. To be filled later. + c.appendOp(urxReservedOp, 0) // MaxMatchLength. To be filled later. // Emit the NOPs - c.appendOp(URX_NOP, 0) - c.appendOp(URX_NOP, 0) + c.appendOp(urxNop, 0) + c.appendOp(urxNop, 0) // On the Parentheses stack, start a new frame and add the postions // of the URX_LB_CONT and the NOP. 
@@ -752,17 +744,17 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { dataLoc := c.allocateData(5) // Emit URX_LB_START - c.appendOp(URX_LB_START, dataLoc) + c.appendOp(urxLbStart, dataLoc) // Emit URX_LBN_CONT - c.appendOp(URX_LBN_CONT, dataLoc) - c.appendOp(URX_RESERVED_OP, 0) // MinMatchLength. To be filled later. - c.appendOp(URX_RESERVED_OP, 0) // MaxMatchLength. To be filled later. - c.appendOp(URX_RESERVED_OP, 0) // Continue Loc. To be filled later. + c.appendOp(urxLbnCount, dataLoc) + c.appendOp(urxReservedOp, 0) // MinMatchLength. To be filled later. + c.appendOp(urxReservedOp, 0) // MaxMatchLength. To be filled later. + c.appendOp(urxReservedOp, 0) // Continue Loc. To be filled later. // Emit the NOPs - c.appendOp(URX_NOP, 0) - c.appendOp(URX_NOP, 0) + c.appendOp(urxNop, 0) + c.appendOp(urxNop, 0) // On the Parentheses stack, start a new frame and add the postions // of the URX_LB_CONT and the NOP. @@ -776,22 +768,22 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { case doConditionalExpr, doPerlInline: // Conditionals such as (?(1)a:b) // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them. - c.error(uerror.U_REGEX_UNIMPLEMENTED) + c.error(uerror.Unimplemented) case doCloseParen: c.handleCloseParen() if len(c.parenStack) == 0 { // Extra close paren, or missing open paren. 
- c.error(uerror.U_REGEX_MISMATCHED_PAREN) + c.error(uerror.MismatchedParen) } case doNOP: case doBadOpenParenType, doRuleError: - c.error(uerror.U_REGEX_RULE_SYNTAX) + c.error(uerror.RuleSyntax) case doMismatchedParenErr: - c.error(uerror.U_REGEX_MISMATCHED_PAREN) + c.error(uerror.MismatchedParen) case doPlus: // Normal '+' compiles to @@ -816,27 +808,27 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { if topLoc == len(c.out.compiledPat)-1 { repeatedOp := c.out.compiledPat[topLoc] - if repeatedOp.Type() == URX_SETREF { + if repeatedOp.typ() == urxSetref { // Emit optimized code for [char set]+ - c.appendOp(URX_LOOP_SR_I, repeatedOp.Value()) + c.appendOp(urxLoopSrI, repeatedOp.value()) frameLoc := c.allocateStackData(1) - c.appendOp(URX_LOOP_C, frameLoc) + c.appendOp(urxLoopC, frameLoc) break } - if repeatedOp.Type() == URX_DOTANY || repeatedOp.Type() == URX_DOTANY_ALL || repeatedOp.Type() == URX_DOTANY_UNIX { + if repeatedOp.typ() == urxDotany || repeatedOp.typ() == urxDotanyAll || repeatedOp.typ() == urxDotanyUnix { // Emit Optimized code for .+ operations. - loopOpI := c.buildOp(URX_LOOP_DOT_I, 0) - if repeatedOp.Type() == URX_DOTANY_ALL { + loopOpI := c.buildOp(urxLoopDotI, 0) + if repeatedOp.typ() == urxDotanyAll { // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode. loopOpI |= 1 } - if c.modeFlags&UREGEX_UNIX_LINES != 0 { + if c.modeFlags&UnixLines != 0 { loopOpI |= 2 } c.appendIns(loopOpI) frameLoc := c.allocateStackData(1) - c.appendOp(URX_LOOP_C, frameLoc) + c.appendOp(urxLoopC, frameLoc) break } } @@ -850,13 +842,13 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { // Emit the code sequence that can handle it. 
c.insertOp(topLoc) frameLoc := c.allocateStackData(1) - op := c.buildOp(URX_STO_INP_LOC, frameLoc) + op := c.buildOp(urxStoInpLoc, frameLoc) c.out.compiledPat[topLoc] = op - c.appendOp(URX_JMP_SAV_X, topLoc+1) + c.appendOp(urxJmpSavX, topLoc+1) } else { // Simpler code when the repeated body must match something non-empty - c.appendOp(URX_JMP_SAV, topLoc) + c.appendOp(urxJmpSav, topLoc) } case doNGPlus: @@ -865,7 +857,7 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { // 2. state-save 1 // 3. ... topLoc := c.blockTopLoc(false) - c.appendOp(URX_STATE_SAVE, topLoc) + c.appendOp(urxStateSave, topLoc) case doOpt: // Normal (greedy) ? quantifier. @@ -875,7 +867,7 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { // 3. ... // Insert the state save into the compiled pattern, and we're done. saveStateLoc := c.blockTopLoc(true) - saveStateOp := c.buildOp(URX_STATE_SAVE, len(c.out.compiledPat)) + saveStateOp := c.buildOp(urxStateSave, len(c.out.compiledPat)) c.out.compiledPat[saveStateLoc] = saveStateOp case doNGOpt: @@ -891,11 +883,11 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { jmp1Loc := c.blockTopLoc(true) jmp2Loc := len(c.out.compiledPat) - jmp1Op := c.buildOp(URX_JMP, jmp2Loc+1) + jmp1Op := c.buildOp(urxJmp, jmp2Loc+1) c.out.compiledPat[jmp1Loc] = jmp1Op - c.appendOp(URX_JMP, jmp2Loc+2) - c.appendOp(URX_STATE_SAVE, jmp1Loc+1) + c.appendOp(urxJmp, jmp2Loc+2) + c.appendOp(urxStateSave, jmp1Loc+1) case doStar: // Normal (greedy) * quantifier. 
@@ -928,28 +920,28 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { if topLoc == len(c.out.compiledPat)-1 { repeatedOp := c.out.compiledPat[topLoc] - if repeatedOp.Type() == URX_SETREF { + if repeatedOp.typ() == urxSetref { // Emit optimized code for a [char set]* - loopOpI := c.buildOp(URX_LOOP_SR_I, repeatedOp.Value()) + loopOpI := c.buildOp(urxLoopSrI, repeatedOp.value()) c.out.compiledPat[topLoc] = loopOpI dataLoc := c.allocateStackData(1) - c.appendOp(URX_LOOP_C, dataLoc) + c.appendOp(urxLoopC, dataLoc) break } - if repeatedOp.Type() == URX_DOTANY || repeatedOp.Type() == URX_DOTANY_ALL || repeatedOp.Type() == URX_DOTANY_UNIX { + if repeatedOp.typ() == urxDotany || repeatedOp.typ() == urxDotanyAll || repeatedOp.typ() == urxDotanyUnix { // Emit Optimized code for .* operations. - loopOpI := c.buildOp(URX_LOOP_DOT_I, 0) - if repeatedOp.Type() == URX_DOTANY_ALL { + loopOpI := c.buildOp(urxLoopDotI, 0) + if repeatedOp.typ() == urxDotanyAll { // URX_LOOP_DOT_I operand is a flag indicating . matches any mode. loopOpI |= 1 } - if (c.modeFlags & UREGEX_UNIX_LINES) != 0 { + if (c.modeFlags & UnixLines) != 0 { loopOpI |= 2 } c.out.compiledPat[topLoc] = loopOpI dataLoc := c.allocateStackData(1) - c.appendOp(URX_LOOP_C, dataLoc) + c.appendOp(urxLoopC, dataLoc) break } } @@ -958,7 +950,7 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { // The optimizations did not apply. saveStateLoc := c.blockTopLoc(true) - jmpOp := c.buildOp(URX_JMP_SAV, saveStateLoc+1) + jmpOp := c.buildOp(urxJmpSav, saveStateLoc+1) // Check for minimum match length of zero, which requires // extra loop-breaking code. 
@@ -966,9 +958,9 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { c.insertOp(saveStateLoc) dataLoc := c.allocateStackData(1) - op := c.buildOp(URX_STO_INP_LOC, dataLoc) + op := c.buildOp(urxStoInpLoc, dataLoc) c.out.compiledPat[saveStateLoc+1] = op - jmpOp = c.buildOp(URX_JMP_SAV_X, saveStateLoc+2) + jmpOp = c.buildOp(urxJmpSavX, saveStateLoc+2) } // Locate the position in the compiled pattern where the match will continue @@ -976,7 +968,7 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { continueLoc := len(c.out.compiledPat) + 1 // Put together the save state op and store it into the compiled code. - saveStateOp := c.buildOp(URX_STATE_SAVE, continueLoc) + saveStateOp := c.buildOp(urxStateSave, continueLoc) c.out.compiledPat[saveStateLoc] = saveStateOp // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern. @@ -991,9 +983,9 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { // 4 ... jmpLoc := c.blockTopLoc(true) // loc 1. saveLoc := len(c.out.compiledPat) // loc 3. - jmpOp := c.buildOp(URX_JMP, saveLoc) + jmpOp := c.buildOp(urxJmp, saveLoc) c.out.compiledPat[jmpLoc] = jmpOp - c.appendOp(URX_STATE_SAVE, jmpLoc+1) + c.appendOp(urxStateSave, jmpLoc+1) case doIntervalInit: // The '{' opening an interval quantifier was just scanned. 
@@ -1004,10 +996,10 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { case doIntevalLowerDigit: // Scanned a digit from the lower value of an {lower,upper} interval - digitValue := u_charDigitValue(c.c.char) + digitValue := uCharDigitValue(c.c.char) val := int64(c.intervalLow)*10 + digitValue if val > math.MaxInt32 { - c.error(uerror.U_REGEX_NUMBER_TOO_BIG) + c.error(uerror.NumberTooBig) } else { c.intervalLow = int(val) } @@ -1017,10 +1009,10 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { if c.intervalUpper < 0 { c.intervalUpper = 0 } - digitValue := u_charDigitValue(c.c.char) + digitValue := uCharDigitValue(c.c.char) val := int64(c.intervalUpper)*10 + digitValue if val > math.MaxInt32 { - c.error(uerror.U_REGEX_NUMBER_TOO_BIG) + c.error(uerror.NumberTooBig) } else { c.intervalUpper = int(val) } @@ -1032,7 +1024,7 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { case doInterval: // Finished scanning a normal {lower,upper} interval. Generate the code for it. if !c.compileInlineInterval() { - c.compileInterval(URX_CTR_INIT, URX_CTR_LOOP) + c.compileInterval(urxCtrInit, utxCtrLoop) } case doPossessiveInterval: @@ -1045,7 +1037,7 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { topLoc := c.blockTopLoc(false) // Produce normal looping code. - c.compileInterval(URX_CTR_INIT, URX_CTR_LOOP) + c.compileInterval(urxCtrInit, utxCtrLoop) // Surround the just-emitted normal looping code with a STO_SP ... LD_SP // just as if the loop was inclosed in atomic parentheses. 
@@ -1054,12 +1046,12 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { c.insertOp(topLoc) varLoc := c.allocateData(1) // Reserve a data location for saving the - op := c.buildOp(URX_STO_SP, varLoc) + op := c.buildOp(urxStoSp, varLoc) c.out.compiledPat[topLoc] = op - var loopOp Instruction + var loopOp instruction loopOp, c.out.compiledPat = stackPop(c.out.compiledPat) - if loopOp.Type() != URX_CTR_LOOP || loopOp.Value() != topLoc { + if loopOp.typ() != utxCtrLoop || loopOp.value() != topLoc { panic("bad instruction at the end of compiled pattern") } @@ -1067,14 +1059,14 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { c.appendIns(loopOp) // Then the LD_SP after the end of the loop - c.appendOp(URX_LD_SP, varLoc) + c.appendOp(urxLdSp, varLoc) case doNGInterval: // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it. - c.compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG) + c.compileInterval(urxCtrInitNg, urxCtrLoopNg) case doIntervalError: - c.error(uerror.U_REGEX_BAD_INTERVAL) + c.error(uerror.BadInterval) case doLiteralChar: // We've just scanned a "normal" character from the pattern, @@ -1083,142 +1075,142 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { case doEscapedLiteralChar: // We've just scanned an backslashed escaped character with no // special meaning. It represents itself. - if (c.modeFlags&UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && ((c.c.char >= 0x41 && c.c.char <= 0x5A) || /* in [A-Z] */ (c.c.char >= 0x61 && c.c.char <= 0x7a)) { // in [a-z] - c.error(uerror.U_REGEX_BAD_ESCAPE_SEQUENCE) + if (c.modeFlags&ErrorOnUnknownEscapes) != 0 && ((c.c.char >= 0x41 && c.c.char <= 0x5A) || /* in [A-Z] */ (c.c.char >= 0x61 && c.c.char <= 0x7a)) { // in [a-z] + c.error(uerror.BadEscapeSequence) } c.literalChar(c.c.char) case doDotAny: // scanned a ".", match any single character. 
c.fixLiterals(false) - if (c.modeFlags & UREGEX_DOTALL) != 0 { - c.appendOp(URX_DOTANY_ALL, 0) - } else if (c.modeFlags & UREGEX_UNIX_LINES) != 0 { - c.appendOp(URX_DOTANY_UNIX, 0) + if (c.modeFlags & DotAll) != 0 { + c.appendOp(urxDotanyAll, 0) + } else if (c.modeFlags & UnixLines) != 0 { + c.appendOp(urxDotanyUnix, 0) } else { - c.appendOp(URX_DOTANY, 0) + c.appendOp(urxDotany, 0) } case doCaret: c.fixLiterals(false) - if (c.modeFlags&UREGEX_MULTILINE) == 0 && (c.modeFlags&UREGEX_UNIX_LINES) == 0 { - c.appendOp(URX_CARET, 0) - } else if (c.modeFlags&UREGEX_MULTILINE) != 0 && (c.modeFlags&UREGEX_UNIX_LINES) == 0 { - c.appendOp(URX_CARET_M, 0) - } else if (c.modeFlags&UREGEX_MULTILINE) == 0 && (c.modeFlags&UREGEX_UNIX_LINES) != 0 { - c.appendOp(URX_CARET, 0) // Only testing true start of input. - } else if (c.modeFlags&UREGEX_MULTILINE) != 0 && (c.modeFlags&UREGEX_UNIX_LINES) != 0 { - c.appendOp(URX_CARET_M_UNIX, 0) + if (c.modeFlags&Multiline) == 0 && (c.modeFlags&UnixLines) == 0 { + c.appendOp(urxCaret, 0) + } else if (c.modeFlags&Multiline) != 0 && (c.modeFlags&UnixLines) == 0 { + c.appendOp(urxCaretM, 0) + } else if (c.modeFlags&Multiline) == 0 && (c.modeFlags&UnixLines) != 0 { + c.appendOp(urxCaret, 0) // Only testing true start of input. 
+ } else if (c.modeFlags&Multiline) != 0 && (c.modeFlags&UnixLines) != 0 { + c.appendOp(urxCaretMUnix, 0) } case doDollar: c.fixLiterals(false) - if (c.modeFlags&UREGEX_MULTILINE) == 0 && (c.modeFlags&UREGEX_UNIX_LINES) == 0 { - c.appendOp(URX_DOLLAR, 0) - } else if (c.modeFlags&UREGEX_MULTILINE) != 0 && (c.modeFlags&UREGEX_UNIX_LINES) == 0 { - c.appendOp(URX_DOLLAR_M, 0) - } else if (c.modeFlags&UREGEX_MULTILINE) == 0 && (c.modeFlags&UREGEX_UNIX_LINES) != 0 { - c.appendOp(URX_DOLLAR_D, 0) - } else if (c.modeFlags&UREGEX_MULTILINE) != 0 && (c.modeFlags&UREGEX_UNIX_LINES) != 0 { - c.appendOp(URX_DOLLAR_MD, 0) + if (c.modeFlags&Multiline) == 0 && (c.modeFlags&UnixLines) == 0 { + c.appendOp(urxDollar, 0) + } else if (c.modeFlags&Multiline) != 0 && (c.modeFlags&UnixLines) == 0 { + c.appendOp(urxDollarM, 0) + } else if (c.modeFlags&Multiline) == 0 && (c.modeFlags&UnixLines) != 0 { + c.appendOp(urxDollarD, 0) + } else if (c.modeFlags&Multiline) != 0 && (c.modeFlags&UnixLines) != 0 { + c.appendOp(urxDollarMd, 0) } case doBackslashA: c.fixLiterals(false) - c.appendOp(URX_CARET, 0) + c.appendOp(urxCaret, 0) case doBackslashB: if !BreakIteration { - if (c.modeFlags & UREGEX_UWORD) != 0 { - c.error(uerror.U_REGEX_UNSUPPORTED_ERROR) + if (c.modeFlags & UWord) != 0 { + c.error(uerror.Unimplemented) } } c.fixLiterals(false) - if c.modeFlags&UREGEX_UWORD != 0 { - c.appendOp(URX_BACKSLASH_BU, 1) + if c.modeFlags&UWord != 0 { + c.appendOp(urxBackslashBu, 1) } else { - c.appendOp(URX_BACKSLASH_B, 1) + c.appendOp(urxBackslashB, 1) } case doBackslashb: if !BreakIteration { - if (c.modeFlags & UREGEX_UWORD) != 0 { - c.error(uerror.U_REGEX_UNSUPPORTED_ERROR) + if (c.modeFlags & UWord) != 0 { + c.error(uerror.Unimplemented) } } c.fixLiterals(false) - if c.modeFlags&UREGEX_UWORD != 0 { - c.appendOp(URX_BACKSLASH_BU, 0) + if c.modeFlags&UWord != 0 { + c.appendOp(urxBackslashBu, 0) } else { - c.appendOp(URX_BACKSLASH_B, 0) + c.appendOp(urxBackslashB, 0) } case doBackslashD: 
c.fixLiterals(false) - c.appendOp(URX_BACKSLASH_D, 1) + c.appendOp(urxBackslashD, 1) case doBackslashd: c.fixLiterals(false) - c.appendOp(URX_BACKSLASH_D, 0) + c.appendOp(urxBackslashD, 0) case doBackslashG: c.fixLiterals(false) - c.appendOp(URX_BACKSLASH_G, 0) + c.appendOp(urxBackslashG, 0) case doBackslashH: c.fixLiterals(false) - c.appendOp(URX_BACKSLASH_H, 1) + c.appendOp(urxBackslashH, 1) case doBackslashh: c.fixLiterals(false) - c.appendOp(URX_BACKSLASH_H, 0) + c.appendOp(urxBackslashH, 0) case doBackslashR: c.fixLiterals(false) - c.appendOp(URX_BACKSLASH_R, 0) + c.appendOp(urxBackslashR, 0) case doBackslashS: c.fixLiterals(false) - c.appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET) + c.appendOp(urxStatSetrefN, urxIsspaceSet) case doBackslashs: c.fixLiterals(false) - c.appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET) + c.appendOp(urxStaticSetref, urxIsspaceSet) case doBackslashV: c.fixLiterals(false) - c.appendOp(URX_BACKSLASH_V, 1) + c.appendOp(urxBackslashV, 1) case doBackslashv: c.fixLiterals(false) - c.appendOp(URX_BACKSLASH_V, 0) + c.appendOp(urxBackslashV, 0) case doBackslashW: c.fixLiterals(false) - c.appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET) + c.appendOp(urxStatSetrefN, urxIswordSet) case doBackslashw: c.fixLiterals(false) - c.appendOp(URX_STATIC_SETREF, URX_ISWORD_SET) + c.appendOp(urxStaticSetref, urxIswordSet) case doBackslashX: if !BreakIteration { // Grapheme Cluster Boundary requires ICU break iteration. 
- c.error(uerror.U_REGEX_UNSUPPORTED_ERROR) + c.error(uerror.Unimplemented) } c.fixLiterals(false) - c.appendOp(URX_BACKSLASH_X, 0) + c.appendOp(urxBackslashX, 0) case doBackslashZ: c.fixLiterals(false) - c.appendOp(URX_DOLLAR, 0) + c.appendOp(urxDollar, 0) case doBackslashz: c.fixLiterals(false) - c.appendOp(URX_BACKSLASH_Z, 0) + c.appendOp(urxBackslashZ, 0) case doEscapeError: - c.error(uerror.U_REGEX_BAD_ESCAPE_SEQUENCE) + c.error(uerror.BadEscapeSequence) case doExit: c.fixLiterals(false) @@ -1244,13 +1236,13 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { for { // Loop once per digit, for max allowed number of digits in a back reference. - digit := u_charDigitValue(ch) + digit := uCharDigitValue(ch) groupNum = groupNum*10 + digit if groupNum >= int64(numCaptureGroups) { break } ch = c.peekCharLL() - if !staticRuleSet[kRuleSetDigitChar-128].ContainsRune(ch) { + if !staticRuleSet[ruleSetDigitChar-128].ContainsRune(ch) { break } c.nextCharLL() @@ -1265,10 +1257,10 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { panic("\\0 begins an octal escape sequence, and shouldn't enter this code path at all") } c.fixLiterals(false) - if (c.modeFlags & UREGEX_CASE_INSENSITIVE) != 0 { - c.appendOp(URX_BACKREF_I, int(groupNum)) + if (c.modeFlags & CaseInsensitive) != 0 { + c.appendOp(urxBackrefI, int(groupNum)) } else { - c.appendOp(URX_BACKREF, int(groupNum)) + c.appendOp(urxBackref, int(groupNum)) } case doBeginNamedBackRef: @@ -1287,15 +1279,15 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { // Group name has not been defined. // Could be a forward reference. If we choose to support them at some // future time, extra mechanism will be required at this point. - c.error(uerror.U_REGEX_INVALID_CAPTURE_GROUP_NAME) + c.error(uerror.InvalidCaptureGroupName) } else { // Given the number, handle identically to a \n numbered back reference. 
// See comments above, under doBackRef c.fixLiterals(false) - if (c.modeFlags & UREGEX_CASE_INSENSITIVE) != 0 { - c.appendOp(URX_BACKREF_I, groupNumber) + if (c.modeFlags & CaseInsensitive) != 0 { + c.appendOp(urxBackrefI, groupNumber) } else { - c.appendOp(URX_BACKREF, groupNumber) + c.appendOp(urxBackref, groupNumber) } } c.captureName = nil @@ -1317,17 +1309,17 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { // Emit the STO_SP topLoc := c.blockTopLoc(true) stoLoc := c.allocateData(1) // Reserve the data location for storing save stack ptr. - op := c.buildOp(URX_STO_SP, stoLoc) + op := c.buildOp(urxStoSp, stoLoc) c.out.compiledPat[topLoc] = op // Emit the STATE_SAVE - c.appendOp(URX_STATE_SAVE, len(c.out.compiledPat)+2) + c.appendOp(urxStateSave, len(c.out.compiledPat)+2) // Emit the JMP - c.appendOp(URX_JMP, topLoc+1) + c.appendOp(urxJmp, topLoc+1) // Emit the LD_SP - c.appendOp(URX_LD_SP, stoLoc) + c.appendOp(urxLdSp, stoLoc) case doPossessiveStar: // Possessive *+ quantifier. @@ -1345,19 +1337,19 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { // emit STO_SP loc stoLoc := c.allocateData(1) // Reserve the data location for storing save stack ptr. - op := c.buildOp(URX_STO_SP, stoLoc) + op := c.buildOp(urxStoSp, stoLoc) c.out.compiledPat[topLoc] = op // Emit the SAVE_STATE 5 L7 := len(c.out.compiledPat) + 1 - op = c.buildOp(URX_STATE_SAVE, L7) + op = c.buildOp(urxStateSave, L7) c.out.compiledPat[topLoc+1] = op // Append the JMP operation. - c.appendOp(URX_JMP, topLoc+1) + c.appendOp(urxJmp, topLoc+1) // Emit the LD_SP loc - c.appendOp(URX_LD_SP, stoLoc) + c.appendOp(urxLdSp, stoLoc) case doPossessiveOpt: // Possessive ?+ quantifier. @@ -1374,16 +1366,16 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { // Emit the STO_SP stoLoc := c.allocateData(1) // Reserve the data location for storing save stack ptr. 
- op := c.buildOp(URX_STO_SP, stoLoc) + op := c.buildOp(urxStoSp, stoLoc) c.out.compiledPat[topLoc] = op // Emit the SAVE_STATE continueLoc := len(c.out.compiledPat) + 1 - op = c.buildOp(URX_STATE_SAVE, continueLoc) + op = c.buildOp(urxStateSave, continueLoc) c.out.compiledPat[topLoc+1] = op // Emit the LD_SP - c.appendOp(URX_LD_SP, stoLoc) + c.appendOp(urxLdSp, stoLoc) case doBeginMatchMode: c.newModeFlags = c.modeFlags @@ -1392,19 +1384,19 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { var bit RegexpFlag switch c.c.char { case 0x69: /* 'i' */ - bit = UREGEX_CASE_INSENSITIVE + bit = CaseInsensitive case 0x64: /* 'd' */ - bit = UREGEX_UNIX_LINES + bit = UnixLines case 0x6d: /* 'm' */ - bit = UREGEX_MULTILINE + bit = Multiline case 0x73: /* 's' */ - bit = UREGEX_DOTALL + bit = DotAll case 0x75: /* 'u' */ bit = 0 /* Unicode casing */ case 0x77: /* 'w' */ - bit = UREGEX_UWORD + bit = UWord case 0x78: /* 'x' */ - bit = UREGEX_COMMENTS + bit = Comments case 0x2d: /* '-' */ c.setModeFlag = false default: @@ -1438,8 +1430,8 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { // - NOP, which may later be replaced by a save-state if there // is an '|' alternation within the parens. c.fixLiterals(false) - c.appendOp(URX_NOP, 0) - c.appendOp(URX_NOP, 0) + c.appendOp(urxNop, 0) + c.appendOp(urxNop, 0) // On the Parentheses stack, start a new frame and add the postions // of the two NOPs (a normal non-capturing () frame, except for the @@ -1456,7 +1448,7 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { c.modeFlags = c.newModeFlags case doBadModeFlag: - c.error(uerror.U_REGEX_INVALID_FLAG) + c.error(uerror.InvalidFlag) case doSuppressComments: // We have just scanned a '(?'. 
We now need to prevent the character scanner from @@ -1472,53 +1464,53 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { set := c.setStack[len(c.setStack)-1] set.AddRune(chDash) - case doSetBackslash_s: + case doSetBackslashs: set := c.setStack[len(c.setStack)-1] - set.AddAll(staticPropertySets[URX_ISSPACE_SET]) + set.AddAll(staticPropertySets[urxIsspaceSet]) - case doSetBackslash_S: + case doSetBackslashS: sset := uset.New() - sset.AddAll(staticPropertySets[URX_ISSPACE_SET]) // TODO: add latin1 spaces + sset.AddAll(staticPropertySets[urxIsspaceSet]) // TODO: add latin1 spaces sset.Complement() set := c.setStack[len(c.setStack)-1] set.AddAll(sset) - case doSetBackslash_d: + case doSetBackslashd: set := c.setStack[len(c.setStack)-1] - c.err = uprops.AddCategory(set, uchar.U_GC_ND_MASK) + c.err = uprops.AddCategory(set, uchar.GcNdMask) - case doSetBackslash_D: + case doSetBackslashD: digits := uset.New() - c.err = uprops.ApplyIntPropertyValue(digits, uprops.UCHAR_GENERAL_CATEGORY_MASK, int32(uchar.U_GC_ND_MASK)) + c.err = uprops.ApplyIntPropertyValue(digits, uprops.UCharGeneralCategoryMask, int32(uchar.GcNdMask)) digits.Complement() set := c.setStack[len(c.setStack)-1] set.AddAll(digits) - case doSetBackslash_h: + case doSetBackslashh: h := uset.New() - c.err = uprops.ApplyIntPropertyValue(h, uprops.UCHAR_GENERAL_CATEGORY_MASK, int32(uchar.U_GC_ZS_MASK)) + c.err = uprops.ApplyIntPropertyValue(h, uprops.UCharGeneralCategoryMask, int32(uchar.GcZsMask)) h.AddRune(9) // Tab set := c.setStack[len(c.setStack)-1] set.AddAll(h) - case doSetBackslash_H: + case doSetBackslashH: h := uset.New() - c.err = uprops.ApplyIntPropertyValue(h, uprops.UCHAR_GENERAL_CATEGORY_MASK, int32(uchar.U_GC_ZS_MASK)) + c.err = uprops.ApplyIntPropertyValue(h, uprops.UCharGeneralCategoryMask, int32(uchar.GcZsMask)) h.AddRune(9) // Tab h.Complement() set := c.setStack[len(c.setStack)-1] set.AddAll(h) - case doSetBackslash_v: + case doSetBackslashv: set := 
c.setStack[len(c.setStack)-1] set.AddRuneRange(0x0a, 0x0d) // add range set.AddRune(0x85) set.AddRuneRange(0x2028, 0x2029) - case doSetBackslash_V: + case doSetBackslashV: v := uset.New() v.AddRuneRange(0x0a, 0x0d) // add range v.AddRune(0x85) @@ -1528,13 +1520,13 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { set := c.setStack[len(c.setStack)-1] set.AddAll(v) - case doSetBackslash_w: + case doSetBackslashw: set := c.setStack[len(c.setStack)-1] - set.AddAll(staticPropertySets[URX_ISWORD_SET]) + set.AddAll(staticPropertySets[urxIswordSet]) - case doSetBackslash_W: + case doSetBackslashW: sset := uset.New() - sset.AddAll(staticPropertySets[URX_ISWORD_SET]) + sset.AddAll(staticPropertySets[urxIswordSet]) sset.Complement() set := c.setStack[len(c.setStack)-1] @@ -1544,7 +1536,7 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { c.fixLiterals(false) c.setStack = append(c.setStack, uset.New()) c.setOpStack = append(c.setOpStack, setStart) - if (c.modeFlags & UREGEX_CASE_INSENSITIVE) != 0 { + if (c.modeFlags & CaseInsensitive) != 0 { c.setOpStack = append(c.setOpStack, setCaseClose) } @@ -1555,7 +1547,7 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { // went before once it is created. c.setPushOp(setDifference1) c.setOpStack = append(c.setOpStack, setStart) - if (c.modeFlags & UREGEX_CASE_INSENSITIVE) != 0 { + if (c.modeFlags & CaseInsensitive) != 0 { c.setOpStack = append(c.setOpStack, setCaseClose) } @@ -1564,7 +1556,7 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { // Need both the '&' operator and the open '[' operator. 
c.setPushOp(setIntersection1) c.setOpStack = append(c.setOpStack, setStart) - if (c.modeFlags & UREGEX_CASE_INSENSITIVE) != 0 { + if (c.modeFlags & CaseInsensitive) != 0 { c.setOpStack = append(c.setOpStack, setCaseClose) } @@ -1573,7 +1565,7 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { // Need to handle the union operation explicitly [[abc] | [ c.setPushOp(setUnion) c.setOpStack = append(c.setOpStack, setStart) - if (c.modeFlags & UREGEX_CASE_INSENSITIVE) != 0 { + if (c.modeFlags & CaseInsensitive) != 0 { c.setOpStack = append(c.setOpStack, setCaseClose) } @@ -1624,10 +1616,10 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { // A back-slash escaped literal character was encountered. // Processing is the same as with setLiteral, above, with the addition of // the optional check for errors on escaped ASCII letters. - if (c.modeFlags&UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && + if (c.modeFlags&ErrorOnUnknownEscapes) != 0 && ((c.c.char >= 0x41 && c.c.char <= 0x5A) || // in [A-Z] (c.c.char >= 0x61 && c.c.char <= 0x7a)) { // in [a-z] - c.error(uerror.U_REGEX_BAD_ESCAPE_SEQUENCE) + c.error(uerror.BadEscapeSequence) } c.setEval(setUnion) set := c.setStack[len(c.setStack)-1] @@ -1652,7 +1644,7 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { // and ICU UnicodeSet behavior. ch := c.scanNamedChar() if c.err == nil && (c.lastSetLiteral == -1 || c.lastSetLiteral > ch) { - c.error(uerror.U_REGEX_INVALID_RANGE) + c.error(uerror.InvalidRange) } set := c.setStack[len(c.setStack)-1] set.AddRuneRange(c.lastSetLiteral, ch) @@ -1676,10 +1668,10 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { } case doSetNoCloseError: - c.error(uerror.U_REGEX_MISSING_CLOSE_BRACKET) + c.error(uerror.MissingCloseBracket) case doSetOpError: - c.error(uerror.U_REGEX_RULE_SYNTAX) // -- or && at the end of a set. Illegal. + c.error(uerror.RuleSyntax) // -- or && at the end of a set. Illegal. 
case doSetPosixProp: if set := c.scanPosixProp(); set != nil { @@ -1700,7 +1692,7 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { // and ICU UnicodeSet behavior. if c.lastSetLiteral == -1 || c.lastSetLiteral > c.c.char { - c.error(uerror.U_REGEX_INVALID_RANGE) + c.error(uerror.InvalidRange) } c.setStack[len(c.setStack)-1].AddRuneRange(c.lastSetLiteral, c.c.char) @@ -1711,7 +1703,7 @@ func (c *Compiler) doParseActions(action patternParseAction) bool { return c.err == nil } -func u_charDigitValue(char rune) int64 { +func uCharDigitValue(char rune) int64 { if char >= '0' && char <= '9' { return int64(char - '0') } @@ -1727,7 +1719,7 @@ func stackPop[T any](stack []T) (T, []T) { return out, stack } -func (c *Compiler) error(e uerror.URegexCompileErrorCode) { +func (c *compiler) error(e uerror.CompileErrorCode) { c.err = &CompileError{ Code: e, Line: c.lineNum, @@ -1736,7 +1728,7 @@ func (c *Compiler) error(e uerror.URegexCompileErrorCode) { } } -func (c *Compiler) stripNOPs() { +func (c *compiler) stripNOPs() { if c.err != nil { return } @@ -1750,7 +1742,7 @@ func (c *Compiler) stripNOPs() { for loc = 0; loc < end; loc++ { deltas = append(deltas, d) op := c.out.compiledPat[loc] - if op.Type() == URX_NOP { + if op.typ() == urxNop { d++ } } @@ -1762,32 +1754,31 @@ func (c *Compiler) stripNOPs() { var src, dst int for src = 0; src < end; src++ { op := c.out.compiledPat[src] - opType := op.Type() + opType := op.typ() switch opType { - case URX_NOP: + case urxNop: // skip - case URX_STATE_SAVE, - URX_JMP, - URX_CTR_LOOP, - URX_CTR_LOOP_NG, - URX_RELOC_OPRND, - URX_JMPX, - URX_JMP_SAV, - URX_JMP_SAV_X: + case urxStateSave, + urxJmp, + utxCtrLoop, + urxCtrLoopNg, + urxRelocOprnd, + urxJmpx, + urxJmpSav, + urxJmpSavX: // These are instructions with operands that refer to code locations. 
- operandAddress := op.Value() - // U_ASSERT(operandAddress >= 0 && operandAddress < deltas.size()); + operandAddress := op.value() fixedOperandAddress := operandAddress - deltas[operandAddress] op = c.buildOp(opType, fixedOperandAddress) c.out.compiledPat[dst] = op dst++ - case URX_BACKREF, URX_BACKREF_I: - where := op.Value() + case urxBackref, urxBackrefI: + where := op.value() if where > len(c.out.groupMap) { - c.error(uerror.U_REGEX_INVALID_BACK_REF) + c.error(uerror.InvalidBackRef) break } @@ -1797,55 +1788,55 @@ func (c *Compiler) stripNOPs() { dst++ c.out.needsAltInput = true - case URX_RESERVED_OP, - URX_RESERVED_OP_N, - URX_BACKTRACK, - URX_END, - URX_ONECHAR, - URX_STRING, - URX_STRING_LEN, - URX_START_CAPTURE, - URX_END_CAPTURE, - URX_STATIC_SETREF, - URX_STAT_SETREF_N, - URX_SETREF, - URX_DOTANY, - URX_FAIL, - URX_BACKSLASH_B, - URX_BACKSLASH_BU, - URX_BACKSLASH_G, - URX_BACKSLASH_X, - URX_BACKSLASH_Z, - URX_DOTANY_ALL, - URX_BACKSLASH_D, - URX_CARET, - URX_DOLLAR, - URX_CTR_INIT, - URX_CTR_INIT_NG, - URX_DOTANY_UNIX, - URX_STO_SP, - URX_LD_SP, - URX_STO_INP_LOC, - URX_LA_START, - URX_LA_END, - URX_ONECHAR_I, - URX_STRING_I, - URX_DOLLAR_M, - URX_CARET_M, - URX_CARET_M_UNIX, - URX_LB_START, - URX_LB_CONT, - URX_LB_END, - URX_LBN_CONT, - URX_LBN_END, - URX_LOOP_SR_I, - URX_LOOP_DOT_I, - URX_LOOP_C, - URX_DOLLAR_D, - URX_DOLLAR_MD, - URX_BACKSLASH_H, - URX_BACKSLASH_R, - URX_BACKSLASH_V: + case urxReservedOp, + urxReservedOpN, + urxBacktrack, + urxEnd, + urxOnechar, + urxString, + urxStringLen, + urxStartCapture, + urxEndCapture, + urxStaticSetref, + urxStatSetrefN, + urxSetref, + urxDotany, + urxFail, + urxBackslashB, + urxBackslashBu, + urxBackslashG, + urxBackslashX, + urxBackslashZ, + urxDotanyAll, + urxBackslashD, + urxCaret, + urxDollar, + urxCtrInit, + urxCtrInitNg, + urxDotanyUnix, + urxStoSp, + urxLdSp, + urxStoInpLoc, + urxLaStart, + urxLaEnd, + urcOnecharI, + urxStringI, + urxDollarM, + urxCaretM, + urxCaretMUnix, + urxLbStart, + urxLbCont, + 
urxLbEnd, + urxLbnCount, + urxLbnEnd, + urxLoopSrI, + urxLoopDotI, + urxLoopC, + urxDollarD, + urxDollarMd, + urxBackslashH, + urxBackslashR, + urxBackslashV: // These instructions are unaltered by the relocation. c.out.compiledPat[dst] = op dst++ @@ -1859,7 +1850,7 @@ func (c *Compiler) stripNOPs() { c.out.compiledPat = c.out.compiledPat[:dst] } -func (c *Compiler) matchStartType() { +func (c *compiler) matchStartType() { var loc int // Location in the pattern of the current op being processed. var currentLen int32 // Minimum length of a match to this point (loc) in the pattern var numInitialStrings int // Number of strings encountered that could match at start. @@ -1881,7 +1872,7 @@ func (c *Compiler) matchStartType() { for loc = 3; loc < end; loc++ { op := c.out.compiledPat[loc] - opType := op.Type() + opType := op.typ() // The loop is advancing linearly through the pattern. // If the op we are now at was the destination of a branch in the pattern, @@ -1889,58 +1880,56 @@ func (c *Compiler) matchStartType() { // replace the current accumulated value. if forwardedLength[loc] < currentLen { currentLen = forwardedLength[loc] - // U_ASSERT(currentLen >= 0 && currentLen < INT32_MAX); } switch opType { // Ops that don't change the total length matched - case URX_RESERVED_OP, - URX_END, - URX_FAIL, - URX_STRING_LEN, - URX_NOP, - URX_START_CAPTURE, - URX_END_CAPTURE, - URX_BACKSLASH_B, - URX_BACKSLASH_BU, - URX_BACKSLASH_G, - URX_BACKSLASH_Z, - URX_DOLLAR, - URX_DOLLAR_M, - URX_DOLLAR_D, - URX_DOLLAR_MD, - URX_RELOC_OPRND, - URX_STO_INP_LOC, - URX_BACKREF, // BackRef. Must assume that it might be a zero length match - URX_BACKREF_I, - URX_STO_SP, // Setup for atomic or possessive blocks. Doesn't change what can match. 
- URX_LD_SP: + case urxReservedOp, + urxEnd, + urxFail, + urxStringLen, + urxNop, + urxStartCapture, + urxEndCapture, + urxBackslashB, + urxBackslashBu, + urxBackslashG, + urxBackslashZ, + urxDollar, + urxDollarM, + urxDollarD, + urxDollarMd, + urxRelocOprnd, + urxStoInpLoc, + urxBackref, // BackRef. Must assume that it might be a zero length match + urxBackrefI, + urxStoSp, // Setup for atomic or possessive blocks. Doesn't change what can match. + urxLdSp: // skip - case URX_CARET: + case urxCaret: if atStart { - c.out.startType = START_START + c.out.startType = startStart } - case URX_CARET_M, URX_CARET_M_UNIX: + case urxCaretM, urxCaretMUnix: if atStart { - c.out.startType = START_LINE + c.out.startType = startLine } - case URX_ONECHAR: + case urxOnechar: if currentLen == 0 { // This character could appear at the start of a match. // Add it to the set of possible starting characters. - c.out.initialChars.AddRune(op.Value32()) + c.out.initialChars.AddRune(op.value32()) numInitialStrings += 2 } currentLen = safeIncrement(currentLen, 1) atStart = false - case URX_SETREF: + case urxSetref: if currentLen == 0 { - sn := op.Value() - // U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); + sn := op.value() set := c.out.sets[sn] c.out.initialChars.AddAll(set) numInitialStrings += 2 @@ -1948,19 +1937,18 @@ func (c *Compiler) matchStartType() { currentLen = safeIncrement(currentLen, 1) atStart = false - case URX_LOOP_SR_I: + case urxLoopSrI: // [Set]*, like a SETREF, above, in what it can match, // but may not match at all, so currentLen is not incremented. if currentLen == 0 { - sn := op.Value() - // U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); + sn := op.value() set := c.out.sets[sn] c.out.initialChars.AddAll(set) numInitialStrings += 2 } atStart = false - case URX_LOOP_DOT_I: + case urxLoopDotI: if currentLen == 0 { // .* at the start of a pattern. // Any character can begin the match. 
@@ -1970,18 +1958,18 @@ func (c *Compiler) matchStartType() { } atStart = false - case URX_STATIC_SETREF: + case urxStaticSetref: if currentLen == 0 { - sn := op.Value() + sn := op.value() c.out.initialChars.AddAll(staticPropertySets[sn]) numInitialStrings += 2 } currentLen = safeIncrement(currentLen, 1) atStart = false - case URX_STAT_SETREF_N: + case urxStatSetrefN: if currentLen == 0 { - sn := op.Value() + sn := op.value() sc := uset.New() sc.AddAll(staticPropertySets[sn]) sc.Complement() @@ -1992,12 +1980,12 @@ func (c *Compiler) matchStartType() { currentLen = safeIncrement(currentLen, 1) atStart = false - case URX_BACKSLASH_D: + case urxBackslashD: // Digit Char if currentLen == 0 { s := uset.New() - c.err = uprops.ApplyIntPropertyValue(s, uprops.UCHAR_GENERAL_CATEGORY_MASK, int32(uchar.U_GC_ND_MASK)) - if op.Value() != 0 { + c.err = uprops.ApplyIntPropertyValue(s, uprops.UCharGeneralCategoryMask, int32(uchar.GcNdMask)) + if op.value() != 0 { s.Complement() } c.out.initialChars.AddAll(s) @@ -2006,13 +1994,13 @@ func (c *Compiler) matchStartType() { currentLen = safeIncrement(currentLen, 1) atStart = false - case URX_BACKSLASH_H: + case urxBackslashH: // Horiz white space if currentLen == 0 { s := uset.New() - c.err = uprops.ApplyIntPropertyValue(s, uprops.UCHAR_GENERAL_CATEGORY_MASK, int32(uchar.U_GC_ZS_MASK)) + c.err = uprops.ApplyIntPropertyValue(s, uprops.UCharGeneralCategoryMask, int32(uchar.GcZsMask)) s.AddRune(9) // Tab - if op.Value() != 0 { + if op.value() != 0 { s.Complement() } c.out.initialChars.AddAll(s) @@ -2021,14 +2009,14 @@ func (c *Compiler) matchStartType() { currentLen = safeIncrement(currentLen, 1) atStart = false - case URX_BACKSLASH_R, // Any line ending sequence - URX_BACKSLASH_V: // Any line ending code point, with optional negation + case urxBackslashR, // Any line ending sequence + urxBackslashV: // Any line ending code point, with optional negation if currentLen == 0 { s := uset.New() s.AddRuneRange(0x0a, 0x0d) // add range 
s.AddRune(0x85) s.AddRuneRange(0x2028, 0x2029) - if op.Value() != 0 { + if op.value() != 0 { // Complement option applies to URX_BACKSLASH_V only. s.Complement() } @@ -2038,14 +2026,14 @@ func (c *Compiler) matchStartType() { currentLen = safeIncrement(currentLen, 1) atStart = false - case URX_ONECHAR_I: + case urcOnecharI: // Case Insensitive Single Character. if currentLen == 0 { - ch := op.Value32() - if uprops.HasBinaryProperty(ch, uprops.UCHAR_CASE_SENSITIVE) { + ch := op.value32() + if uprops.HasBinaryProperty(ch, uprops.UCharCaseSensitive) { starters := uset.New() starters.AddRuneRange(ch, ch) - starters.CloseOver(uset.USET_CASE_INSENSITIVE) + starters.CloseOver(uset.CaseInsensitive) // findCaseInsensitiveStarters(c, &starters); // For ONECHAR_I, no need to worry about text chars that expand on folding into // strings. The expanded folding can't match the pattern. @@ -2060,10 +2048,10 @@ func (c *Compiler) matchStartType() { currentLen = safeIncrement(currentLen, 1) atStart = false - case URX_BACKSLASH_X, // Grahpeme Cluster. Minimum is 1, max unbounded. - URX_DOTANY_ALL, // . matches one or two. - URX_DOTANY, - URX_DOTANY_UNIX: + case urxBackslashX, // Grahpeme Cluster. Minimum is 1, max unbounded. + urxDotanyAll, // . matches one or two. + urxDotany, + urxDotanyUnix: if currentLen == 0 { // These constructs are all bad news when they appear at the start // of a match. Any character can begin the match. @@ -2074,41 +2062,40 @@ func (c *Compiler) matchStartType() { currentLen = safeIncrement(currentLen, 1) atStart = false - case URX_JMPX: + case urxJmpx: loc++ // Except for extra operand on URX_JMPX, same as URX_JMP. fallthrough - case URX_JMP: - jmpDest := op.Value() + case urxJmp: + jmpDest := op.value() if jmpDest < loc { // Loop of some kind. Can safely ignore, the worst that will happen // is that we understate the true minimum length currentLen = forwardedLength[loc+1] } else { // Forward jump. 
Propagate the current min length to the target loc of the jump. - // U_ASSERT(jmpDest <= end + 1); if forwardedLength[jmpDest] > currentLen { forwardedLength[jmpDest] = currentLen } } atStart = false - case URX_JMP_SAV, - URX_JMP_SAV_X: + case urxJmpSav, + urxJmpSavX: // Combo of state save to the next loc, + jmp backwards. // Net effect on min. length computation is nothing. atStart = false - case URX_BACKTRACK: + case urxBacktrack: // Fails are kind of like a branch, except that the min length was // propagated already, by the state save. currentLen = forwardedLength[loc+1] atStart = false - case URX_STATE_SAVE: + case urxStateSave: // State Save, for forward jumps, propagate the current minimum. // of the state save. - jmpDest := op.Value() + jmpDest := op.value() if jmpDest > loc { if currentLen < forwardedLength[jmpDest] { forwardedLength[jmpDest] = (currentLen) @@ -2116,16 +2103,14 @@ func (c *Compiler) matchStartType() { } atStart = false - case URX_STRING: + case urxString: loc++ stringLenOp := c.out.compiledPat[loc] - stringLen := stringLenOp.Value() - // U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); - // U_ASSERT(stringLenOp >= 2); + stringLen := stringLenOp.value() if currentLen == 0 { // Add the starting character of this string to the set of possible starting // characters for this pattern. - stringStartIdx := op.Value() + stringStartIdx := op.value() ch := c.out.literalText[stringStartIdx] c.out.initialChars.AddRune(ch) @@ -2139,19 +2124,17 @@ func (c *Compiler) matchStartType() { currentLen = safeIncrement(currentLen, stringLen) atStart = false - case URX_STRING_I: + case urxStringI: // Case-insensitive string. Unlike exact-match strings, we won't // attempt a string search for possible match positions. But we // do update the set of possible starting characters. 
loc++ stringLenOp := c.out.compiledPat[loc] - stringLen := stringLenOp.Value() - // U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); - // U_ASSERT(stringLenOp >= 2); + stringLen := stringLenOp.value() if currentLen == 0 { // Add the starting character of this string to the set of possible starting // characters for this pattern. - stringStartIdx := op.Value() + stringStartIdx := op.value() ch := c.out.literalText[stringStartIdx] s := uset.New() c.findCaseInsensitiveStarters(ch, s) @@ -2161,8 +2144,8 @@ func (c *Compiler) matchStartType() { currentLen = safeIncrement(currentLen, stringLen) atStart = false - case URX_CTR_INIT, - URX_CTR_INIT_NG: + case urxCtrInit, + urxCtrInitNg: // Loop Init Ops. These don't change the min length, but they are 4 word ops // so location must be updated accordingly. // Loop Init Ops. @@ -2170,13 +2153,12 @@ func (c *Compiler) matchStartType() { // move loc forwards to the end of the loop, skipping over the body. // If the min count is > 0, // continue normal processing of the body of the loop. - loopEndLoc := c.out.compiledPat[loc+1].Value() + loopEndLoc := c.out.compiledPat[loc+1].value() minLoopCount := int(c.out.compiledPat[loc+2]) if minLoopCount == 0 { // Min Loop Count of 0, treat like a forward branch and // move the current minimum length up to the target // (end of loop) location. - // U_ASSERT(loopEndLoc <= end + 1); if forwardedLength[loopEndLoc] > currentLen { forwardedLength[loopEndLoc] = currentLen } @@ -2184,19 +2166,19 @@ func (c *Compiler) matchStartType() { loc += 3 // Skips over operands of CTR_INIT atStart = false - case URX_CTR_LOOP, - URX_CTR_LOOP_NG: + case utxCtrLoop, + urxCtrLoopNg: // Loop ops. // The jump is conditional, backwards only. atStart = false - case URX_LOOP_C: + case urxLoopC: // More loop ops. These state-save to themselves. // don't change the minimum match atStart = false - case URX_LA_START, - URX_LB_START: + case urxLaStart, + urxLbStart: // Look-around. 
Scan forward until the matching look-ahead end, // without processing the look-around block. This is overly pessimistic. @@ -2204,7 +2186,7 @@ func (c *Compiler) matchStartType() { // lookahead contains two LA_END instructions, so count goes up by two // for each LA_START. var depth int - if opType == URX_LA_START { + if opType == urxLaStart { depth = 2 } else { depth = 1 @@ -2212,36 +2194,35 @@ func (c *Compiler) matchStartType() { for { loc++ op = c.out.compiledPat[loc] - if op.Type() == URX_LA_START { + if op.typ() == urxLaStart { depth += 2 } - if op.Type() == URX_LB_START { + if op.typ() == urxLbStart { depth++ } - if op.Type() == URX_LA_END || op.Type() == URX_LBN_END { + if op.typ() == urxLaEnd || op.typ() == urxLbnEnd { depth-- if depth == 0 { break } } - if op.Type() == URX_STATE_SAVE { + if op.typ() == urxStateSave { // Need this because neg lookahead blocks will FAIL to outside // of the block. - jmpDest := op.Value() + jmpDest := op.value() if jmpDest > loc { if currentLen < forwardedLength[jmpDest] { forwardedLength[jmpDest] = (currentLen) } } } - // U_ASSERT(loc <= end); } - case URX_LA_END, - URX_LB_CONT, - URX_LB_END, - URX_LBN_CONT, - URX_LBN_END: + case urxLaEnd, + urxLbCont, + urxLbEnd, + urxLbnCount, + urxLbnEnd: panic("should be consumed in URX_LA_START") default: @@ -2257,47 +2238,45 @@ func (c *Compiler) matchStartType() { // 4. A single literal character. // 5. A character from a set of characters. // - if c.out.startType == START_START { + if c.out.startType == startStart { // Match only at the start of an input text string. // start type is already set. We're done. } else if numInitialStrings == 1 && c.out.minMatchLen > 0 { // Match beginning only with a literal string. 
ch := c.out.literalText[c.out.initialStringIdx] - // U_ASSERT(fRXPat->fInitialChars->contains(c)); - c.out.startType = START_STRING + c.out.startType = startString c.out.initialChar = ch - } else if c.out.startType == START_LINE { + } else if c.out.startType == startLine { // Match at start of line in Multi-Line mode. // Nothing to do here; everything is already set. } else if c.out.minMatchLen == 0 { // Zero length match possible. We could start anywhere. - c.out.startType = START_NO_INFO + c.out.startType = startNoInfo } else if c.out.initialChars.Len() == 1 { // All matches begin with the same char. - c.out.startType = START_CHAR + c.out.startType = startChar c.out.initialChar = c.out.initialChars.RuneAt(0) - // U_ASSERT(fRXPat->fInitialChar != (UChar32)-1); } else if !c.out.initialChars.ContainsRuneRange(0, 0x10ffff) && c.out.minMatchLen > 0 { // Matches start with a set of character smaller than the set of all chars. - c.out.startType = START_SET + c.out.startType = startSet } else { // Matches can start with anything - c.out.startType = START_NO_INFO + c.out.startType = startNoInfo } } -func (c *Compiler) appendOp(typ Opcode, arg int) { +func (c *compiler) appendOp(typ opcode, arg int) { c.appendIns(c.buildOp(typ, arg)) } -func (c *Compiler) appendIns(ins Instruction) { +func (c *compiler) appendIns(ins instruction) { if c.err != nil { return } c.out.compiledPat = append(c.out.compiledPat, ins) } -func (c *Compiler) buildOp(typ Opcode, val int) Instruction { +func (c *compiler) buildOp(typ opcode, val int) instruction { if c.err != nil { return 0 } @@ -2305,24 +2284,24 @@ func (c *Compiler) buildOp(typ Opcode, val int) Instruction { panic("bad argument to buildOp") } if val < 0 { - if !(typ == URX_RESERVED_OP_N || typ == URX_RESERVED_OP) { + if !(typ == urxReservedOpN || typ == urxReservedOp) { panic("bad value to buildOp") } - typ = URX_RESERVED_OP_N + typ = urxReservedOpN } - return Instruction(int32(typ)<<24 | int32(val)) + return 
instruction(int32(typ)<<24 | int32(val)) } -func (c *Compiler) handleCloseParen() { +func (c *compiler) handleCloseParen() { if len(c.parenStack) == 0 { - c.error(uerror.U_REGEX_MISMATCHED_PAREN) + c.error(uerror.MismatchedParen) return } c.fixLiterals(false) var patIdx int - var patOp Instruction + var patOp instruction for { patIdx, c.parenStack = stackPop(c.parenStack) @@ -2331,10 +2310,10 @@ func (c *Compiler) handleCloseParen() { } patOp = c.out.compiledPat[patIdx] - if patOp.Value() != 0 { + if patOp.value() != 0 { panic("branch target for JMP should not be set") } - patOp |= Instruction(len(c.out.compiledPat)) + patOp |= instruction(len(c.out.compiledPat)) c.out.compiledPat[patIdx] = patOp c.matchOpenParen = patIdx } @@ -2358,58 +2337,58 @@ func (c *Compiler) handleCloseParen() { // start capture op and put it into the end-capture op. captureOp := c.out.compiledPat[c.matchOpenParen+1] - if captureOp.Type() != URX_START_CAPTURE { + if captureOp.typ() != urxStartCapture { panic("bad type in capture op (expected URX_START_CAPTURE)") } - frameVarLocation := captureOp.Value() - c.appendOp(URX_END_CAPTURE, frameVarLocation) + frameVarLocation := captureOp.value() + c.appendOp(urxEndCapture, frameVarLocation) case parenAtomic: // Atomic Parenthesis. // Insert a LD_SP operation to restore the state stack to the position // it was when the atomic parens were entered. 
stoOp := c.out.compiledPat[c.matchOpenParen+1] - if stoOp.Type() != URX_STO_SP { + if stoOp.typ() != urxStoSp { panic("bad type in capture op (expected URX_STO_SP)") } - stoLoc := stoOp.Value() - c.appendOp(URX_LD_SP, stoLoc) + stoLoc := stoOp.value() + c.appendOp(urxLdSp, stoLoc) case parenLookahead: startOp := c.out.compiledPat[c.matchOpenParen-5] - if startOp.Type() != URX_LA_START { + if startOp.typ() != urxLaStart { panic("bad type in capture op (expected URX_LA_START)") } - dataLoc := startOp.Value() - c.appendOp(URX_LA_END, dataLoc) + dataLoc := startOp.value() + c.appendOp(urxLaEnd, dataLoc) case parenNegLookahead: startOp := c.out.compiledPat[c.matchOpenParen-1] - if startOp.Type() != URX_LA_START { + if startOp.typ() != urxLaStart { panic("bad type in capture op (expected URX_LA_START)") } - dataLoc := startOp.Value() - c.appendOp(URX_LA_END, dataLoc) - c.appendOp(URX_BACKTRACK, 0) - c.appendOp(URX_LA_END, dataLoc) + dataLoc := startOp.value() + c.appendOp(urxLaEnd, dataLoc) + c.appendOp(urxBacktrack, 0) + c.appendOp(urxLaEnd, dataLoc) // Patch the URX_SAVE near the top of the block. // The destination of the SAVE is the final LA_END that was just added. 
saveOp := c.out.compiledPat[c.matchOpenParen] - if saveOp.Type() != URX_STATE_SAVE { + if saveOp.typ() != urxStateSave { panic("bad type in capture op (expected URX_STATE_SAVE)") } - saveOp = c.buildOp(URX_STATE_SAVE, len(c.out.compiledPat)-1) + saveOp = c.buildOp(urxStateSave, len(c.out.compiledPat)-1) c.out.compiledPat[c.matchOpenParen] = saveOp case parenLookBehind: startOp := c.out.compiledPat[c.matchOpenParen-4] - if startOp.Type() != URX_LB_START { + if startOp.typ() != urxLbStart { panic("bad type in capture op (expected URX_LB_START)") } - dataLoc := startOp.Value() - c.appendOp(URX_LB_END, dataLoc) - c.appendOp(URX_LA_END, dataLoc) + dataLoc := startOp.value() + c.appendOp(urxLbEnd, dataLoc) + c.appendOp(urxLaEnd, dataLoc) // Determine the min and max bounds for the length of the // string that the pattern can match. @@ -2419,7 +2398,7 @@ func (c *Compiler) handleCloseParen() { maxML := c.maxMatchLength(c.matchOpenParen, patEnd) if maxML == math.MaxInt32 { - c.error(uerror.U_REGEX_LOOK_BEHIND_LIMIT) + c.error(uerror.LookBehindLimit) break } if minML == math.MaxInt32 { @@ -2430,16 +2409,16 @@ func (c *Compiler) handleCloseParen() { minML = 0 } - c.out.compiledPat[c.matchOpenParen-2] = Instruction(minML) - c.out.compiledPat[c.matchOpenParen-1] = Instruction(maxML) + c.out.compiledPat[c.matchOpenParen-2] = instruction(minML) + c.out.compiledPat[c.matchOpenParen-1] = instruction(maxML) case parenLookBehindN: startOp := c.out.compiledPat[c.matchOpenParen-5] - if startOp.Type() != URX_LB_START { + if startOp.typ() != urxLbStart { panic("bad type in capture op (expected URX_LB_START)") } - dataLoc := startOp.Value() - c.appendOp(URX_LBN_END, dataLoc) + dataLoc := startOp.value() + c.appendOp(urxLbnEnd, dataLoc) // Determine the min and max bounds for the length of the // string that the pattern can match. 
@@ -2448,12 +2427,12 @@ func (c *Compiler) handleCloseParen() { minML := c.minMatchLength(c.matchOpenParen, patEnd) maxML := c.maxMatchLength(c.matchOpenParen, patEnd) - if Instruction(maxML).Type() != 0 { - c.error(uerror.U_REGEX_LOOK_BEHIND_LIMIT) + if instruction(maxML).typ() != 0 { + c.error(uerror.LookBehindLimit) break } if maxML == math.MaxInt32 { - c.error(uerror.U_REGEX_LOOK_BEHIND_LIMIT) + c.error(uerror.LookBehindLimit) break } if minML == math.MaxInt32 { @@ -2464,10 +2443,10 @@ func (c *Compiler) handleCloseParen() { minML = 0 } - c.out.compiledPat[c.matchOpenParen-3] = Instruction(minML) - c.out.compiledPat[c.matchOpenParen-2] = Instruction(maxML) + c.out.compiledPat[c.matchOpenParen-3] = instruction(minML) + c.out.compiledPat[c.matchOpenParen-2] = instruction(maxML) - op := c.buildOp(URX_RELOC_OPRND, len(c.out.compiledPat)) + op := c.buildOp(urxRelocOprnd, len(c.out.compiledPat)) c.out.compiledPat[c.matchOpenParen-1] = op default: @@ -2477,7 +2456,7 @@ func (c *Compiler) handleCloseParen() { c.matchCloseParen = len(c.out.compiledPat) } -func (c *Compiler) fixLiterals(split bool) { +func (c *compiler) fixLiterals(split bool) { if len(c.literalChars) == 0 { return } @@ -2497,85 +2476,85 @@ func (c *Compiler) fixLiterals(split bool) { return } - if c.modeFlags&UREGEX_CASE_INSENSITIVE != 0 { + if c.modeFlags&CaseInsensitive != 0 { c.literalChars = ucase.FoldRunes(c.literalChars) lastCodePoint = c.literalChars[len(c.literalChars)-1] } if len(c.literalChars) == 1 { - if c.modeFlags&UREGEX_CASE_INSENSITIVE != 0 && uprops.HasBinaryProperty(lastCodePoint, uprops.UCHAR_CASE_SENSITIVE) { - c.appendOp(URX_ONECHAR_I, int(lastCodePoint)) + if c.modeFlags&CaseInsensitive != 0 && uprops.HasBinaryProperty(lastCodePoint, uprops.UCharCaseSensitive) { + c.appendOp(urcOnecharI, int(lastCodePoint)) } else { - c.appendOp(URX_ONECHAR, int(lastCodePoint)) + c.appendOp(urxOnechar, int(lastCodePoint)) } } else { if len(c.literalChars) > 0x00ffffff || len(c.out.literalText) > 
0x00ffffff { - c.error(uerror.U_REGEX_PATTERN_TOO_BIG) + c.error(uerror.PatternTooBig) } - if c.modeFlags&UREGEX_CASE_INSENSITIVE != 0 { - c.appendOp(URX_STRING_I, len(c.out.literalText)) + if c.modeFlags&CaseInsensitive != 0 { + c.appendOp(urxStringI, len(c.out.literalText)) } else { - c.appendOp(URX_STRING, len(c.out.literalText)) + c.appendOp(urxString, len(c.out.literalText)) } - c.appendOp(URX_STRING_LEN, len(c.literalChars)) + c.appendOp(urxStringLen, len(c.literalChars)) c.out.literalText = append(c.out.literalText, c.literalChars...) } c.literalChars = c.literalChars[:0] } -func (c *Compiler) literalChar(point rune) { +func (c *compiler) literalChar(point rune) { c.literalChars = append(c.literalChars, point) } -func (c *Compiler) allocateData(size int) int { +func (c *compiler) allocateData(size int) int { if c.err != nil { return 0 } if size <= 0 || size > 0x100 || c.out.dataSize < 0 { - c.error(uerror.U_REGEX_INTERNAL_ERROR) + c.error(uerror.InternalError) return 0 } dataIndex := c.out.dataSize c.out.dataSize += size if c.out.dataSize >= 0x00fffff0 { - c.error(uerror.U_REGEX_INTERNAL_ERROR) + c.error(uerror.InternalError) } return dataIndex } -func (c *Compiler) allocateStackData(size int) int { +func (c *compiler) allocateStackData(size int) int { if c.err != nil { return 0 } if size <= 0 || size > 0x100 || c.out.frameSize < 0 { - c.error(uerror.U_REGEX_INTERNAL_ERROR) + c.error(uerror.InternalError) return 0 } dataIndex := c.out.frameSize c.out.frameSize += size if c.out.frameSize >= 0x00fffff0 { - c.error(uerror.U_REGEX_INTERNAL_ERROR) + c.error(uerror.InternalError) } return dataIndex } -func (c *Compiler) insertOp(where int) { +func (c *compiler) insertOp(where int) { if where < 0 || where >= len(c.out.compiledPat) { panic("insertOp: out of bounds") } - nop := c.buildOp(URX_NOP, 0) + nop := c.buildOp(urxNop, 0) c.out.compiledPat = slices.Insert(c.out.compiledPat, where, nop) // Walk through the pattern, looking for any ops with targets that // were 
moved down by the insert. Fix them. for loc, op := range c.out.compiledPat { - switch op.Type() { - case URX_JMP, URX_JMPX, URX_STATE_SAVE, URX_CTR_LOOP, URX_CTR_LOOP_NG, URX_JMP_SAV, URX_JMP_SAV_X, URX_RELOC_OPRND: - if op.Value() > where { - op = c.buildOp(op.Type(), op.Value()+1) + switch op.typ() { + case urxJmp, urxJmpx, urxStateSave, utxCtrLoop, urxCtrLoopNg, urxJmpSav, urxJmpSavX, urxRelocOprnd: + if op.value() > where { + op = c.buildOp(op.typ(), op.value()+1) c.out.compiledPat[loc] = op } } @@ -2597,7 +2576,7 @@ func (c *Compiler) insertOp(where int) { } } -func (c *Compiler) blockTopLoc(reserve bool) int { +func (c *compiler) blockTopLoc(reserve bool) int { var loc int c.fixLiterals(true) @@ -2610,20 +2589,20 @@ func (c *Compiler) blockTopLoc(reserve bool) int { // We need to make space now. loc = len(c.out.compiledPat) - 1 op := c.out.compiledPat[loc] - if op.Type() == URX_STRING_LEN { + if op.typ() == urxStringLen { // Strings take two opcode, we want the position of the first one. // We can have a string at this point if a single character case-folded to two. 
loc-- } if reserve { - nop := c.buildOp(URX_NOP, 0) + nop := c.buildOp(urxNop, 0) c.out.compiledPat = slices.Insert(c.out.compiledPat, loc, nop) } } return loc } -func (c *Compiler) compileInlineInterval() bool { +func (c *compiler) compileInlineInterval() bool { if c.intervalUpper > 10 || c.intervalUpper < c.intervalLow { return false } @@ -2660,7 +2639,7 @@ func (c *Compiler) compileInlineInterval() bool { // endOfSequenceLoc := len(c.out.compiledPat) - 1 + c.intervalUpper + (c.intervalUpper - c.intervalLow) - saveOp := c.buildOp(URX_STATE_SAVE, endOfSequenceLoc) + saveOp := c.buildOp(urxStateSave, endOfSequenceLoc) if c.intervalLow == 0 { c.insertOp(topOfBlock) c.out.compiledPat[topOfBlock] = saveOp @@ -2678,7 +2657,7 @@ func (c *Compiler) compileInlineInterval() bool { return true } -func (c *Compiler) compileInterval(init Opcode, loop Opcode) { +func (c *compiler) compileInterval(init opcode, loop opcode) { // The CTR_INIT op at the top of the block with the {n,m} quantifier takes // four slots in the compiled code. Reserve them. topOfBlock := c.blockTopLoc(true) @@ -2707,30 +2686,30 @@ func (c *Compiler) compileInterval(init Opcode, loop Opcode) { // compilation of something later on causes the code to grow and the target // position to move. loopEnd := len(c.out.compiledPat) - op = c.buildOp(URX_RELOC_OPRND, loopEnd) + op = c.buildOp(urxRelocOprnd, loopEnd) c.out.compiledPat[topOfBlock+1] = op // Followed by the min and max counts. - c.out.compiledPat[topOfBlock+2] = Instruction(c.intervalLow) - c.out.compiledPat[topOfBlock+3] = Instruction(c.intervalUpper) + c.out.compiledPat[topOfBlock+2] = instruction(c.intervalLow) + c.out.compiledPat[topOfBlock+3] = instruction(c.intervalUpper) // Append the CTR_LOOP op. The operand is the location of the CTR_INIT op. // Goes at end of the block being looped over, so just append to the code so far. 
c.appendOp(loop, topOfBlock) if (c.intervalLow&0xff000000) != 0 || (c.intervalUpper > 0 && (c.intervalUpper&0xff000000) != 0) { - c.error(uerror.U_REGEX_NUMBER_TOO_BIG) + c.error(uerror.NumberTooBig) } if c.intervalLow > c.intervalUpper && c.intervalUpper != -1 { - c.error(uerror.U_REGEX_MAX_LT_MIN) + c.error(uerror.MaxLtMin) } } -func (c *Compiler) scanNamedChar() rune { +func (c *compiler) scanNamedChar() rune { c.nextChar(&c.c) if c.c.char != chLBrace { - c.error(uerror.U_REGEX_PROPERTY_SYNTAX) + c.error(uerror.PropertySyntax) return 0 } @@ -2741,7 +2720,7 @@ func (c *Compiler) scanNamedChar() rune { break } if c.c.char == -1 { - c.error(uerror.U_REGEX_PROPERTY_SYNTAX) + c.error(uerror.PropertySyntax) return 0 } charName = append(charName, c.c.char) @@ -2751,13 +2730,13 @@ func (c *Compiler) scanNamedChar() rune { // All Unicode character names have only invariant characters. // The API to get a character, given a name, accepts only char *, forcing us to convert, // which requires this error check - c.error(uerror.U_REGEX_PROPERTY_SYNTAX) + c.error(uerror.PropertySyntax) return 0 } - theChar := unames.CharForName(unames.U_UNICODE_CHAR_NAME, string(charName)) + theChar := unames.CharForName(unames.UnicodeCharName, string(charName)) if c.err != nil { - c.error(uerror.U_REGEX_PROPERTY_SYNTAX) + c.error(uerror.PropertySyntax) } c.nextChar(&c.c) // Continue overall regex pattern processing with char after the '}' @@ -2770,7 +2749,7 @@ func isInvariantUString(name []rune) bool { * no assertions here because these functions are legitimately called * for strings with variant characters */ - if !UCHAR_IS_INVARIANT(c) { + if !ucharIsInvariant(c) { return false /* found a variant char */ } } @@ -2784,17 +2763,17 @@ var invariantChars = [...]uint32{ 0x87fffffe, /* 60..7f but not 60 7b..7e */ } -func UCHAR_IS_INVARIANT(c rune) bool { +func ucharIsInvariant(c rune) bool { return c <= 0x7f && (invariantChars[(c)>>5]&(uint32(1)<<(c&0x1f))) != 0 } -func (c *Compiler) 
setPushOp(op setOperation) { +func (c *compiler) setPushOp(op setOperation) { c.setEval(op) c.setOpStack = append(c.setOpStack, op) c.setStack = append(c.setStack, uset.New()) } -func (c *Compiler) setEval(nextOp setOperation) { +func (c *compiler) setEval(nextOp setOperation) { var rightOperand *uset.UnicodeSet var leftOperand *uset.UnicodeSet @@ -2812,7 +2791,7 @@ func (c *Compiler) setEval(nextOp setOperation) { rightOperand.Complement() case setCaseClose: - rightOperand.CloseOver(uset.USET_CASE_INSENSITIVE) + rightOperand.CloseOver(uset.CaseInsensitive) case setDifference1, setDifference2: c.setStack = c.setStack[:len(c.setStack)-1] @@ -2842,14 +2821,11 @@ func safeIncrement(val int32, delta int) int32 { return math.MaxInt32 } -func (c *Compiler) minMatchLength(start, end int) int32 { +func (c *compiler) minMatchLength(start, end int) int32 { if c.err != nil { return 0 } - // U_ASSERT(start <= end); - // U_ASSERT(end < fRXPat->fCompiledPat->size()); - var loc int var currentLen int32 @@ -2865,104 +2841,101 @@ func (c *Compiler) minMatchLength(start, end int) int32 { for loc = start; loc <= end; loc++ { op := c.out.compiledPat[loc] - opType := op.Type() + opType := op.typ() // The loop is advancing linearly through the pattern. // If the op we are now at was the destination of a branch in the pattern, // and that path has a shorter minimum length than the current accumulated value, // replace the current accumulated value. - // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some // no-match-possible cases. 
if forwardedLength[loc] < currentLen { currentLen = forwardedLength[loc] - // U_ASSERT(currentLen >= 0 && currentLen < INT32_MAX); } switch opType { // Ops that don't change the total length matched - case URX_RESERVED_OP, - URX_END, - URX_STRING_LEN, - URX_NOP, - URX_START_CAPTURE, - URX_END_CAPTURE, - URX_BACKSLASH_B, - URX_BACKSLASH_BU, - URX_BACKSLASH_G, - URX_BACKSLASH_Z, - URX_CARET, - URX_DOLLAR, - URX_DOLLAR_M, - URX_DOLLAR_D, - URX_DOLLAR_MD, - URX_RELOC_OPRND, - URX_STO_INP_LOC, - URX_CARET_M, - URX_CARET_M_UNIX, - URX_BACKREF, // BackRef. Must assume that it might be a zero length match - URX_BACKREF_I, - URX_STO_SP, // Setup for atomic or possessive blocks. Doesn't change what can match. - URX_LD_SP, - URX_JMP_SAV, - URX_JMP_SAV_X: + case urxReservedOp, + urxEnd, + urxStringLen, + urxNop, + urxStartCapture, + urxEndCapture, + urxBackslashB, + urxBackslashBu, + urxBackslashG, + urxBackslashZ, + urxCaret, + urxDollar, + urxDollarM, + urxDollarD, + urxDollarMd, + urxRelocOprnd, + urxStoInpLoc, + urxCaretM, + urxCaretMUnix, + urxBackref, // BackRef. Must assume that it might be a zero length match + urxBackrefI, + urxStoSp, // Setup for atomic or possessive blocks. Doesn't change what can match. + urxLdSp, + urxJmpSav, + urxJmpSavX: // no-op // Ops that match a minimum of one character (one or two 16 bit code units.) // - case URX_ONECHAR, - URX_STATIC_SETREF, - URX_STAT_SETREF_N, - URX_SETREF, - URX_BACKSLASH_D, - URX_BACKSLASH_H, - URX_BACKSLASH_R, - URX_BACKSLASH_V, - URX_ONECHAR_I, - URX_BACKSLASH_X, // Grahpeme Cluster. Minimum is 1, max unbounded. - URX_DOTANY_ALL, // . matches one or two. - URX_DOTANY, - URX_DOTANY_UNIX: + case urxOnechar, + urxStaticSetref, + urxStatSetrefN, + urxSetref, + urxBackslashD, + urxBackslashH, + urxBackslashR, + urxBackslashV, + urcOnecharI, + urxBackslashX, // Grahpeme Cluster. Minimum is 1, max unbounded. + urxDotanyAll, // . matches one or two. 
+ urxDotany, + urxDotanyUnix: currentLen = safeIncrement(currentLen, 1) - case URX_JMPX: + case urxJmpx: loc++ // URX_JMPX has an extra operand, ignored here, otherwise processed identically to URX_JMP. fallthrough - case URX_JMP: - jmpDest := op.Value() + case urxJmp: + jmpDest := op.value() if jmpDest < loc { // Loop of some kind. Can safely ignore, the worst that will happen // is that we understate the true minimum length currentLen = forwardedLength[loc+1] } else { // Forward jump. Propagate the current min length to the target loc of the jump. - // U_ASSERT(jmpDest <= end + 1); if forwardedLength[jmpDest] > currentLen { forwardedLength[jmpDest] = currentLen } } - case URX_BACKTRACK: + case urxBacktrack: // Back-tracks are kind of like a branch, except that the min length was // propagated already, by the state save. currentLen = forwardedLength[loc+1] - case URX_STATE_SAVE: + case urxStateSave: // State Save, for forward jumps, propagate the current minimum. // of the state save. - jmpDest := op.Value() + jmpDest := op.value() if jmpDest > loc { if currentLen < forwardedLength[jmpDest] { forwardedLength[jmpDest] = currentLen } } - case URX_STRING: + case urxString: loc++ stringLenOp := c.out.compiledPat[loc] - currentLen = safeIncrement(currentLen, stringLenOp.Value()) + currentLen = safeIncrement(currentLen, stringLenOp.value()) - case URX_STRING_I: + case urxStringI: loc++ // TODO: with full case folding, matching input text may be shorter than // the string we have here. More smarts could put some bounds on it. @@ -2971,14 +2944,14 @@ func (c *Compiler) minMatchLength(start, end int) int32 { // currentLen += URX_VAL(stringLenOp); currentLen = safeIncrement(currentLen, 1) - case URX_CTR_INIT, URX_CTR_INIT_NG: + case urxCtrInit, urxCtrInitNg: // Loop Init Ops. // If the min loop count == 0 // move loc forwards to the end of the loop, skipping over the body. // If the min count is > 0, // continue normal processing of the body of the loop. 
loopEndOp := c.out.compiledPat[loc+1] - loopEndLoc := loopEndOp.Value() + loopEndLoc := loopEndOp.value() minLoopCount := c.out.compiledPat[loc+2] if minLoopCount == 0 { loc = loopEndLoc @@ -2986,20 +2959,20 @@ func (c *Compiler) minMatchLength(start, end int) int32 { loc += 3 // Skips over operands of CTR_INIT } - case URX_CTR_LOOP, URX_CTR_LOOP_NG: + case utxCtrLoop, urxCtrLoopNg: // Loop ops. The jump is conditional, backwards only. - case URX_LOOP_SR_I, URX_LOOP_DOT_I, URX_LOOP_C: + case urxLoopSrI, urxLoopDotI, urxLoopC: // More loop ops. These state-save to themselves. don't change the minimum match - could match nothing at all. - case URX_LA_START, URX_LB_START: + case urxLaStart, urxLbStart: // Look-around. Scan forward until the matching look-ahead end, // without processing the look-around block. This is overly pessimistic for look-ahead, // it assumes that the look-ahead match might be zero-length. // TODO: Positive lookahead could recursively do the block, then continue // with the longer of the block or the value coming in. Ticket 6060 var depth int32 - if opType == URX_LA_START { + if opType == urxLaStart { depth = 2 } else { depth = 1 @@ -3008,39 +2981,38 @@ func (c *Compiler) minMatchLength(start, end int) int32 { for { loc++ op = c.out.compiledPat[loc] - if op.Type() == URX_LA_START { + if op.typ() == urxLaStart { // The boilerplate for look-ahead includes two LA_END insturctions, // Depth will be decremented by each one when it is seen. depth += 2 } - if op.Type() == URX_LB_START { + if op.typ() == urxLbStart { depth++ } - if op.Type() == URX_LA_END { + if op.typ() == urxLaEnd { depth-- if depth == 0 { break } } - if op.Type() == URX_LBN_END { + if op.typ() == urxLbnEnd { depth-- if depth == 0 { break } } - if op.Type() == URX_STATE_SAVE { + if op.typ() == urxStateSave { // Need this because neg lookahead blocks will FAIL to outside of the block. 
- jmpDest := op.Value() + jmpDest := op.value() if jmpDest > loc { if currentLen < forwardedLength[jmpDest] { forwardedLength[jmpDest] = currentLen } } } - // U_ASSERT(loc <= end); } - case URX_LA_END, URX_LB_CONT, URX_LB_END, URX_LBN_CONT, URX_LBN_END: + case urxLaEnd, urxLbCont, urxLbEnd, urxLbnCount, urxLbnEnd: // Only come here if the matching URX_LA_START or URX_LB_START was not in the // range being sized, which happens when measuring size of look-behind blocks. @@ -3053,19 +3025,15 @@ func (c *Compiler) minMatchLength(start, end int) int32 { // propagated a shorter length to location end+1. if forwardedLength[end+1] < currentLen { currentLen = forwardedLength[end+1] - // U_ASSERT(currentLen >= 0 && currentLen < INT32_MAX) } return currentLen } -func (c *Compiler) maxMatchLength(start, end int) int32 { +func (c *compiler) maxMatchLength(start, end int) int32 { if c.err != nil { return 0 } - // U_ASSERT(start <= end); - // U_ASSERT(end < fRXPat->fCompiledPat->size()); - var loc int var currentLen int32 @@ -3073,7 +3041,7 @@ func (c *Compiler) maxMatchLength(start, end int) int32 { for loc = start; loc <= end; loc++ { op := c.out.compiledPat[loc] - opType := op.Type() + opType := op.typ() // The loop is advancing linearly through the pattern. // If the op we are now at was the destination of a branch in the pattern, @@ -3085,68 +3053,68 @@ func (c *Compiler) maxMatchLength(start, end int) int32 { switch opType { // Ops that don't change the total length matched - case URX_RESERVED_OP, - URX_END, - URX_STRING_LEN, - URX_NOP, - URX_START_CAPTURE, - URX_END_CAPTURE, - URX_BACKSLASH_B, - URX_BACKSLASH_BU, - URX_BACKSLASH_G, - URX_BACKSLASH_Z, - URX_CARET, - URX_DOLLAR, - URX_DOLLAR_M, - URX_DOLLAR_D, - URX_DOLLAR_MD, - URX_RELOC_OPRND, - URX_STO_INP_LOC, - URX_CARET_M, - URX_CARET_M_UNIX, - URX_STO_SP, // Setup for atomic or possessive blocks. Doesn't change what can match. 
- URX_LD_SP, - URX_LB_END, - URX_LB_CONT, - URX_LBN_CONT, - URX_LBN_END: + case urxReservedOp, + urxEnd, + urxStringLen, + urxNop, + urxStartCapture, + urxEndCapture, + urxBackslashB, + urxBackslashBu, + urxBackslashG, + urxBackslashZ, + urxCaret, + urxDollar, + urxDollarM, + urxDollarD, + urxDollarMd, + urxRelocOprnd, + urxStoInpLoc, + urxCaretM, + urxCaretMUnix, + urxStoSp, // Setup for atomic or possessive blocks. Doesn't change what can match. + urxLdSp, + urxLbEnd, + urxLbCont, + urxLbnCount, + urxLbnEnd: // no-op // Ops that increase that cause an unbounded increase in the length // of a matched string, or that increase it a hard to characterize way. // Call the max length unbounded, and stop further checking. - case URX_BACKREF, // BackRef. Must assume that it might be a zero length match - URX_BACKREF_I, - URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. + case urxBackref, // BackRef. Must assume that it might be a zero length match + urxBackrefI, + urxBackslashX: // Grahpeme Cluster. Minimum is 1, max unbounded. currentLen = math.MaxInt32 // Ops that match a max of one character (possibly two 16 bit code units.) // - case URX_STATIC_SETREF, - URX_STAT_SETREF_N, - URX_SETREF, - URX_BACKSLASH_D, - URX_BACKSLASH_H, - URX_BACKSLASH_R, - URX_BACKSLASH_V, - URX_ONECHAR_I, - URX_DOTANY_ALL, - URX_DOTANY, - URX_DOTANY_UNIX: + case urxStaticSetref, + urxStatSetrefN, + urxSetref, + urxBackslashD, + urxBackslashH, + urxBackslashR, + urxBackslashV, + urcOnecharI, + urxDotanyAll, + urxDotany, + urxDotanyUnix: currentLen = safeIncrement(currentLen, 2) // Single literal character. Increase current max length by one or two, // depending on whether the char is in the supplementary range. - case URX_ONECHAR: + case urxOnechar: currentLen = safeIncrement(currentLen, 1) - if op.Value() > 0x10000 { + if op.value() > 0x10000 { currentLen = safeIncrement(currentLen, 1) } // Jumps. 
// - case URX_JMP, URX_JMPX, URX_JMP_SAV, URX_JMP_SAV_X: - jmpDest := op.Value() + case urxJmp, urxJmpx, urxJmpSav, urxJmpSavX: + jmpDest := op.value() if jmpDest < loc { // Loop of some kind. Max match length is unbounded. currentLen = math.MaxInt32 @@ -3158,17 +3126,17 @@ func (c *Compiler) maxMatchLength(start, end int) int32 { currentLen = 0 } - case URX_BACKTRACK: + case urxBacktrack: // back-tracks are kind of like a branch, except that the max length was // propagated already, by the state save. currentLen = forwardedLength[loc+1] - case URX_STATE_SAVE: + case urxStateSave: // State Save, for forward jumps, propagate the current minimum. // of the state save. // For backwards jumps, they create a loop, maximum // match length is unbounded. - jmpDest := op.Value() + jmpDest := op.value() if jmpDest > loc { if currentLen > forwardedLength[jmpDest] { forwardedLength[jmpDest] = currentLen @@ -3177,12 +3145,12 @@ func (c *Compiler) maxMatchLength(start, end int) int32 { currentLen = math.MaxInt32 } - case URX_STRING: + case urxString: loc++ stringLenOp := c.out.compiledPat[loc] - currentLen = safeIncrement(currentLen, stringLenOp.Value()) + currentLen = safeIncrement(currentLen, stringLenOp.value()) - case URX_STRING_I: + case urxStringI: // TODO: This code assumes that any user string that matches will be no longer // than our compiled string, with case insensitive matching. // Our compiled string has been case-folded already. @@ -3205,12 +3173,12 @@ func (c *Compiler) maxMatchLength(start, end int) int32 { // loc++ stringLenOp := c.out.compiledPat[loc] - currentLen = safeIncrement(currentLen, stringLenOp.Value()) + currentLen = safeIncrement(currentLen, stringLenOp.value()) - case URX_CTR_INIT, URX_CTR_INIT_NG: + case urxCtrInit, urxCtrInitNg: // For Loops, recursively call this function on the pattern for the loop body, // then multiply the result by the maximum loop count. 
- loopEndLoc := c.out.compiledPat[loc+1].Value() + loopEndLoc := c.out.compiledPat[loc+1].value() if loopEndLoc == loc+4 { // Loop has an empty body. No affect on max match length. // Continue processing with code after the loop end. @@ -3225,7 +3193,6 @@ func (c *Compiler) maxMatchLength(start, end int) int32 { break } - // U_ASSERT(loopEndLoc >= loc + 4); blockLen := c.maxMatchLength(loc+4, loopEndLoc-1) // Recursive call. updatedLen := int(currentLen) + int(blockLen)*maxLoopCount if updatedLen >= math.MaxInt32 { @@ -3235,29 +3202,28 @@ func (c *Compiler) maxMatchLength(start, end int) int32 { currentLen = int32(updatedLen) loc = loopEndLoc - case URX_CTR_LOOP, URX_CTR_LOOP_NG: + case utxCtrLoop, urxCtrLoopNg: panic("should not encounter this opcode") - case URX_LOOP_SR_I, URX_LOOP_DOT_I, URX_LOOP_C: + case urxLoopSrI, urxLoopDotI, urxLoopC: // For anything to do with loops, make the match length unbounded. currentLen = math.MaxInt32 - case URX_LA_START, URX_LA_END: + case urxLaStart, urxLaEnd: // Look-ahead. Just ignore, treat the look-ahead block as if // it were normal pattern. Gives a too-long match length, // but good enough for now. - case URX_LB_START: + case urxLbStart: // Look-behind. Scan forward until the matching look-around end, // without processing the look-behind block. - dataLoc := op.Value() + dataLoc := op.value() for loc = loc + 1; loc <= end; loc++ { op = c.out.compiledPat[loc] - if (op.Type() == URX_LA_END || op.Type() == URX_LBN_END) && (op.Value() == dataLoc) { + if (op.typ() == urxLaEnd || op.typ() == urxLbnEnd) && (op.value() == dataLoc) { break } } - // U_ASSERT(loc <= end); default: panic("unreachable") @@ -3280,25 +3246,25 @@ func (c *Compiler) maxMatchLength(start, end int) int32 { // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing // Machine Generated Data. Do not hand edit. 
-var RECaseFixCodePoints = [...]rune{ +var reCaseFixCodePoints = [...]rune{ 0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc, 0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565, 0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07, 0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61, 0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000} -var RECaseFixStringOffsets = [...]int16{ +var reCaseFixStringOffsets = [...]int16{ 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f, 0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43, 0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57, 0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0} -var RECaseFixCounts = [...]int16{ +var reCaseFixCounts = [...]int16{ 0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1, 0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0} -var RECaseFixData = [...]uint16{ +var reCaseFixData = [...]uint16{ 0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf, 0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3, 0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3, 0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, @@ -3309,20 +3275,20 @@ var RECaseFixData = [...]uint16{ 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2, 0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7, 0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0} -func (c *Compiler) findCaseInsensitiveStarters(ch rune, starterChars *uset.UnicodeSet) { - if uprops.HasBinaryProperty(ch, uprops.UCHAR_CASE_SENSITIVE) { +func (c *compiler) findCaseInsensitiveStarters(ch rune, starterChars *uset.UnicodeSet) { + if uprops.HasBinaryProperty(ch, 
uprops.UCharCaseSensitive) { caseFoldedC := ucase.Fold(ch) starterChars.Clear() starterChars.AddRune(caseFoldedC) var i int - for i = 0; RECaseFixCodePoints[i] < ch; i++ { + for i = 0; reCaseFixCodePoints[i] < ch; i++ { // Simple linear search through the sorted list of interesting code points. } - if RECaseFixCodePoints[i] == ch { - data := RECaseFixData[RECaseFixStringOffsets[i]:] - numCharsToAdd := RECaseFixCounts[i] + if reCaseFixCodePoints[i] == ch { + data := reCaseFixData[reCaseFixStringOffsets[i]:] + numCharsToAdd := reCaseFixCounts[i] for j := int16(0); j < numCharsToAdd; j++ { var cpToAdd rune cpToAdd, data = utf16.NextUnsafe(data) @@ -3330,7 +3296,7 @@ func (c *Compiler) findCaseInsensitiveStarters(ch rune, starterChars *uset.Unico } } - starterChars.CloseOver(uset.USET_CASE_INSENSITIVE) + starterChars.CloseOver(uset.CaseInsensitive) } else { // Not a cased character. Just return it alone. starterChars.Clear() @@ -3338,7 +3304,7 @@ func (c *Compiler) findCaseInsensitiveStarters(ch rune, starterChars *uset.Unico } } -func (c *Compiler) scanProp() *uset.UnicodeSet { +func (c *compiler) scanProp() *uset.UnicodeSet { if c.err != nil { return nil } @@ -3346,7 +3312,7 @@ func (c *Compiler) scanProp() *uset.UnicodeSet { c.nextChar(&c.c) if c.c.char != chLBrace { - c.error(uerror.U_REGEX_PROPERTY_SYNTAX) + c.error(uerror.PropertySyntax) return nil } @@ -3357,7 +3323,7 @@ func (c *Compiler) scanProp() *uset.UnicodeSet { break } if c.c.char == -1 { - c.error(uerror.U_REGEX_PROPERTY_SYNTAX) + c.error(uerror.PropertySyntax) return nil } propertyName.WriteRune(c.c.char) @@ -3368,7 +3334,7 @@ func (c *Compiler) scanProp() *uset.UnicodeSet { return ss } -func (c *Compiler) createSetForProperty(propName string, negated bool) *uset.UnicodeSet { +func (c *compiler) createSetForProperty(propName string, negated bool) *uset.UnicodeSet { if c.err != nil { return nil } @@ -3376,8 +3342,8 @@ func (c *Compiler) createSetForProperty(propName string, negated bool) *uset.Uni var 
set *uset.UnicodeSet var usetFlags uset.USet - if c.modeFlags&UREGEX_CASE_INSENSITIVE != 0 { - usetFlags |= uset.USET_CASE_INSENSITIVE + if c.modeFlags&CaseInsensitive != 0 { + usetFlags |= uset.CaseInsensitive } var err error @@ -3393,7 +3359,7 @@ func (c *Compiler) createSetForProperty(propName string, negated bool) *uset.Uni // Java accepts 'word' with mixed case. // Java accepts 'all' only in all lower case. if strings.EqualFold(propName, "word") { - set = staticPropertySets[URX_ISWORD_SET].Clone() + set = staticPropertySets[urxIswordSet].Clone() goto done } if propName == "all" { @@ -3407,7 +3373,7 @@ func (c *Compiler) createSetForProperty(propName string, negated bool) *uset.Uni if strings.HasPrefix(propName, "In") && len(propName) >= 3 { set = uset.New() if uprops.ApplyPropertyAlias(set, "Block", propName[2:]) != nil { - c.error(uerror.U_REGEX_PROPERTY_SYNTAX) + c.error(uerror.PropertySyntax) } goto done } @@ -3418,7 +3384,7 @@ func (c *Compiler) createSetForProperty(propName string, negated bool) *uset.Uni if strings.HasPrefix(propName, "Is") && len(propName) >= 3 { mPropName := propName[2:] if strings.IndexByte(mPropName, '=') >= 0 { - c.error(uerror.U_REGEX_PROPERTY_SYNTAX) + c.error(uerror.PropertySyntax) goto done } @@ -3431,9 +3397,9 @@ func (c *Compiler) createSetForProperty(propName string, negated bool) *uset.Uni set, err = uprops.NewUnicodeSetFomPattern("\\p{"+mPropName+"}", 0) if err != nil { - c.error(uerror.U_REGEX_PROPERTY_SYNTAX) - } else if !set.IsEmpty() && (usetFlags&uset.USET_CASE_INSENSITIVE) != 0 { - set.CloseOver(uset.USET_CASE_INSENSITIVE) + c.error(uerror.PropertySyntax) + } else if !set.IsEmpty() && (usetFlags&uset.CaseInsensitive) != 0 { + set.CloseOver(uset.CaseInsensitive) } goto done } @@ -3446,97 +3412,97 @@ func (c *Compiler) createSetForProperty(propName string, negated bool) *uset.Uni // These all begin with "java" // if propName == "javaDefined" { - c.err = uprops.AddCategory(set, uchar.U_GC_CN_MASK) + c.err = 
uprops.AddCategory(set, uchar.GcCnMask) set.Complement() } else if propName == "javaDigit" { - c.err = uprops.AddCategory(set, uchar.U_GC_ND_MASK) + c.err = uprops.AddCategory(set, uchar.GcNdMask) } else if propName == "javaIdentifierIgnorable" { c.err = addIdentifierIgnorable(set) } else if propName == "javaISOControl" { set.AddRuneRange(0, 0x1F) set.AddRuneRange(0x7F, 0x9F) } else if propName == "javaJavaIdentifierPart" { - c.err = uprops.AddCategory(set, uchar.U_GC_L_MASK) + c.err = uprops.AddCategory(set, uchar.GcLMask) if c.err == nil { - c.err = uprops.AddCategory(set, uchar.U_GC_SC_MASK) + c.err = uprops.AddCategory(set, uchar.GcScMask) } if c.err == nil { - c.err = uprops.AddCategory(set, uchar.U_GC_PC_MASK) + c.err = uprops.AddCategory(set, uchar.GcPcMask) } if c.err == nil { - c.err = uprops.AddCategory(set, uchar.U_GC_ND_MASK) + c.err = uprops.AddCategory(set, uchar.GcNdMask) } if c.err == nil { - c.err = uprops.AddCategory(set, uchar.U_GC_NL_MASK) + c.err = uprops.AddCategory(set, uchar.GcNlMask) } if c.err == nil { - c.err = uprops.AddCategory(set, uchar.U_GC_MC_MASK) + c.err = uprops.AddCategory(set, uchar.GcMcMask) } if c.err == nil { - c.err = uprops.AddCategory(set, uchar.U_GC_MN_MASK) + c.err = uprops.AddCategory(set, uchar.GcMnMask) } if c.err == nil { c.err = addIdentifierIgnorable(set) } } else if propName == "javaJavaIdentifierStart" { - c.err = uprops.AddCategory(set, uchar.U_GC_L_MASK) + c.err = uprops.AddCategory(set, uchar.GcLMask) if c.err == nil { - c.err = uprops.AddCategory(set, uchar.U_GC_NL_MASK) + c.err = uprops.AddCategory(set, uchar.GcNlMask) } if c.err == nil { - c.err = uprops.AddCategory(set, uchar.U_GC_SC_MASK) + c.err = uprops.AddCategory(set, uchar.GcScMask) } if c.err == nil { - c.err = uprops.AddCategory(set, uchar.U_GC_PC_MASK) + c.err = uprops.AddCategory(set, uchar.GcPcMask) } } else if propName == "javaLetter" { - c.err = uprops.AddCategory(set, uchar.U_GC_L_MASK) + c.err = uprops.AddCategory(set, uchar.GcLMask) } else 
if propName == "javaLetterOrDigit" { - c.err = uprops.AddCategory(set, uchar.U_GC_L_MASK) + c.err = uprops.AddCategory(set, uchar.GcLMask) if c.err == nil { - c.err = uprops.AddCategory(set, uchar.U_GC_ND_MASK) + c.err = uprops.AddCategory(set, uchar.GcNdMask) } } else if propName == "javaLowerCase" { - c.err = uprops.AddCategory(set, uchar.U_GC_LL_MASK) + c.err = uprops.AddCategory(set, uchar.GcLlMask) } else if propName == "javaMirrored" { - c.err = uprops.ApplyIntPropertyValue(set, uprops.UCHAR_BIDI_MIRRORED, 1) + c.err = uprops.ApplyIntPropertyValue(set, uprops.UCharBidiMirrored, 1) } else if propName == "javaSpaceChar" { - c.err = uprops.AddCategory(set, uchar.U_GC_Z_MASK) + c.err = uprops.AddCategory(set, uchar.GcZMask) } else if propName == "javaSupplementaryCodePoint" { - set.AddRuneRange(0x10000, uset.MAX_VALUE) + set.AddRuneRange(0x10000, uset.MaxValue) } else if propName == "javaTitleCase" { - c.err = uprops.AddCategory(set, uchar.U_GC_LT_MASK) + c.err = uprops.AddCategory(set, uchar.GcLtMask) } else if propName == "javaUnicodeIdentifierStart" { - c.err = uprops.AddCategory(set, uchar.U_GC_L_MASK) + c.err = uprops.AddCategory(set, uchar.GcLMask) if c.err == nil { - c.err = uprops.AddCategory(set, uchar.U_GC_NL_MASK) + c.err = uprops.AddCategory(set, uchar.GcNlMask) } } else if propName == "javaUnicodeIdentifierPart" { - c.err = uprops.AddCategory(set, uchar.U_GC_L_MASK) + c.err = uprops.AddCategory(set, uchar.GcLMask) if c.err == nil { - c.err = uprops.AddCategory(set, uchar.U_GC_PC_MASK) + c.err = uprops.AddCategory(set, uchar.GcPcMask) } if c.err == nil { - c.err = uprops.AddCategory(set, uchar.U_GC_ND_MASK) + c.err = uprops.AddCategory(set, uchar.GcNdMask) } if c.err == nil { - c.err = uprops.AddCategory(set, uchar.U_GC_NL_MASK) + c.err = uprops.AddCategory(set, uchar.GcNlMask) } if c.err == nil { - c.err = uprops.AddCategory(set, uchar.U_GC_MC_MASK) + c.err = uprops.AddCategory(set, uchar.GcMcMask) } if c.err == nil { - c.err = 
uprops.AddCategory(set, uchar.U_GC_MN_MASK) + c.err = uprops.AddCategory(set, uchar.GcMnMask) } if c.err == nil { c.err = addIdentifierIgnorable(set) } } else if propName == "javaUpperCase" { - c.err = uprops.AddCategory(set, uchar.U_GC_LU_MASK) + c.err = uprops.AddCategory(set, uchar.GcLuMask) } else if propName == "javaValidCodePoint" { - set.AddRuneRange(0, uset.MAX_VALUE) + set.AddRuneRange(0, uset.MaxValue) } else if propName == "javaWhitespace" { - c.err = uprops.AddCategory(set, uchar.U_GC_Z_MASK) + c.err = uprops.AddCategory(set, uchar.GcZMask) excl := uset.New() excl.AddRune(0x0a) excl.AddRune(0x2007) @@ -3545,18 +3511,18 @@ func (c *Compiler) createSetForProperty(propName string, negated bool) *uset.Uni set.AddRuneRange(9, 0x0d) set.AddRuneRange(0x1c, 0x1f) } else { - c.error(uerror.U_REGEX_PROPERTY_SYNTAX) + c.error(uerror.PropertySyntax) } - if c.err == nil && !set.IsEmpty() && (usetFlags&uset.USET_CASE_INSENSITIVE) != 0 { - set.CloseOver(uset.USET_CASE_INSENSITIVE) + if c.err == nil && !set.IsEmpty() && (usetFlags&uset.CaseInsensitive) != 0 { + set.CloseOver(uset.CaseInsensitive) } goto done } // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility // extensions matched it. 
- c.error(uerror.U_REGEX_PROPERTY_SYNTAX) + c.error(uerror.PropertySyntax) done: if c.err != nil { @@ -3573,10 +3539,10 @@ func addIdentifierIgnorable(set *uset.UnicodeSet) error { set.AddRuneRange(0x0e, 0x1b) set.AddRuneRange(0x7f, 0x9f) - return uprops.AddCategory(set, uchar.U_GC_CF_MASK) + return uprops.AddCategory(set, uchar.GcCfMask) } -func (c *Compiler) scanPosixProp() *uset.UnicodeSet { +func (c *compiler) scanPosixProp() *uset.UnicodeSet { var set *uset.UnicodeSet if !(c.c.char == chColon) { @@ -3647,7 +3613,7 @@ func (c *Compiler) scanPosixProp() *uset.UnicodeSet { return set } -func (c *Compiler) compileSet(set *uset.UnicodeSet) { +func (c *compiler) compileSet(set *uset.UnicodeSet) { if set == nil { return } @@ -3660,7 +3626,7 @@ func (c *Compiler) compileSet(set *uset.UnicodeSet) { switch setSize { case 0: // Set of no elements. Always fails to match. - c.appendOp(URX_BACKTRACK, 0) + c.appendOp(urxBacktrack, 0) case 1: // The set contains only a single code point. Put it into @@ -3674,6 +3640,6 @@ func (c *Compiler) compileSet(set *uset.UnicodeSet) { // theSet->freeze(); setNumber := len(c.out.sets) c.out.sets = append(c.out.sets, set) - c.appendOp(URX_SETREF, setNumber) + c.appendOp(urxSetref, setNumber) } } diff --git a/go/mysql/icuregex/compiler_table.go b/go/mysql/icuregex/compiler_table.go index 609eb3764bf..e8cfe0d5e55 100644 --- a/go/mysql/icuregex/compiler_table.go +++ b/go/mysql/icuregex/compiler_table.go @@ -24,7 +24,7 @@ package icuregex type patternParseAction uint8 const ( - doSetBackslash_D patternParseAction = iota + doSetBackslashD patternParseAction = iota doBackslashh doBackslashH doSetLiteralEscaped @@ -41,7 +41,7 @@ const ( doBackslashG doBackslashR doSetBegin - doSetBackslash_v + doSetBackslashv doPossessivePlus doPerlInline doBackslashZ @@ -58,17 +58,17 @@ const ( doOpenNonCaptureParen doExit doSetNamedChar - doSetBackslash_V + doSetBackslashV doConditionalExpr doEscapeError doBadOpenParenType doPossessiveStar doSetAddDash 
doEscapedLiteralChar - doSetBackslash_w + doSetBackslashw doIntervalUpperDigit doBackslashv - doSetBackslash_S + doSetBackslashS doSetNoCloseError doSetProp doBackslashB @@ -89,14 +89,14 @@ const ( doNamedChar doNGPlus doSetDifference2 - doSetBackslash_H + doSetBackslashH doCloseParen doDotAny doOpenCaptureParen doEnterQuoteMode doOpenAtomicParen doBadModeFlag - doSetBackslash_d + doSetBackslashd doSetFinish doProperty doBeginNamedBackRef @@ -111,7 +111,7 @@ const ( doBackslashb doSetBeginUnion doIntevalLowerDigit - doSetBackslash_h + doSetBackslashh doStar doMatchMode doBackslashA @@ -126,9 +126,9 @@ const ( doIntervalSame doNGOpt doOpenLookAhead - doSetBackslash_W + doSetBackslashW doMismatchedParenErr - doSetBackslash_s + doSetBackslashs rbbiLastAction ) @@ -341,16 +341,16 @@ var parseStateTable = []regexTableEl{ {doSetProp, 112 /* p */, 148, 0, false}, // 191 set-escape {doSetProp, 80 /* P */, 148, 0, false}, // 192 {doSetNamedChar, 78 /* N */, 141, 0, false}, // 193 - {doSetBackslash_s, 115 /* s */, 155, 0, true}, // 194 - {doSetBackslash_S, 83 /* S */, 155, 0, true}, // 195 - {doSetBackslash_w, 119 /* w */, 155, 0, true}, // 196 - {doSetBackslash_W, 87 /* W */, 155, 0, true}, // 197 - {doSetBackslash_d, 100 /* d */, 155, 0, true}, // 198 - {doSetBackslash_D, 68 /* D */, 155, 0, true}, // 199 - {doSetBackslash_h, 104 /* h */, 155, 0, true}, // 200 - {doSetBackslash_H, 72 /* H */, 155, 0, true}, // 201 - {doSetBackslash_v, 118 /* v */, 155, 0, true}, // 202 - {doSetBackslash_V, 86 /* V */, 155, 0, true}, // 203 + {doSetBackslashs, 115 /* s */, 155, 0, true}, // 194 + {doSetBackslashS, 83 /* S */, 155, 0, true}, // 195 + {doSetBackslashw, 119 /* w */, 155, 0, true}, // 196 + {doSetBackslashW, 87 /* W */, 155, 0, true}, // 197 + {doSetBackslashd, 100 /* d */, 155, 0, true}, // 198 + {doSetBackslashD, 68 /* D */, 155, 0, true}, // 199 + {doSetBackslashh, 104 /* h */, 155, 0, true}, // 200 + {doSetBackslashH, 72 /* H */, 155, 0, true}, // 201 + {doSetBackslashv, 
118 /* v */, 155, 0, true}, // 202 + {doSetBackslashV, 86 /* V */, 155, 0, true}, // 203 {doSetLiteralEscaped, 255, 141, 0, true}, // 204 {doSetFinish, 255, 14, 0, false}, // 205 set-finish {doExit, 255, 206, 0, true}, // 206 errorDeath diff --git a/go/mysql/icuregex/debug.go b/go/mysql/icuregex/debug.go index 5cacc87d007..92c43e704d7 100644 --- a/go/mysql/icuregex/debug.go +++ b/go/mysql/icuregex/debug.go @@ -30,11 +30,11 @@ func (pat *Pattern) Dump(w io.Writer) { fmt.Fprintf(w, "Original Pattern: \"%s\"\n", pat.pattern) fmt.Fprintf(w, " Min Match Length: %d\n", pat.minMatchLen) fmt.Fprintf(w, " Match Start Type: %v\n", pat.startType) - if pat.startType == START_STRING { + if pat.startType == startString { fmt.Fprintf(w, " Initial match string: \"%s\"\n", string(pat.literalText[pat.initialStringIdx:pat.initialStringIdx+pat.initialStringLen])) - } else if pat.startType == START_SET { + } else if pat.startType == startSet { fmt.Fprintf(w, " Match First Chars: %s\n", pat.initialChars.String()) - } else if pat.startType == START_CHAR { + } else if pat.startType == startChar { fmt.Fprintf(w, " First char of Match: ") if pat.initialChar > 0x20 { fmt.Fprintf(w, "'%c'\n", pat.initialChar) @@ -61,93 +61,87 @@ func (pat *Pattern) Dump(w io.Writer) { func (pat *Pattern) dumpOp(w io.Writer, index int) { op := pat.compiledPat[index] - val := op.Value() - opType := op.Type() + val := op.value() + opType := op.typ() pinnedType := opType - if int(pinnedType) >= len(UrxOpcodeNames) { + if int(pinnedType) >= len(urxOpcodeNames) { pinnedType = 0 } - fmt.Fprintf(w, "%4d %08x %-15s ", index, op, UrxOpcodeNames[pinnedType]) + fmt.Fprintf(w, "%4d %08x %-15s ", index, op, urxOpcodeNames[pinnedType]) switch opType { - case URX_NOP, - URX_DOTANY, - URX_DOTANY_ALL, - URX_FAIL, - URX_CARET, - URX_DOLLAR, - URX_BACKSLASH_G, - URX_BACKSLASH_X, - URX_END, - URX_DOLLAR_M, - URX_CARET_M: + case urxNop, + urxDotany, + urxDotanyAll, + urxFail, + urxCaret, + urxDollar, + urxBackslashG, + 
urxBackslashX, + urxEnd, + urxDollarM, + urxCaretM: // Types with no operand field of interest. - case URX_RESERVED_OP, - URX_START_CAPTURE, - URX_END_CAPTURE, - URX_STATE_SAVE, - URX_JMP, - URX_JMP_SAV, - URX_JMP_SAV_X, - URX_BACKSLASH_B, - URX_BACKSLASH_BU, - URX_BACKSLASH_D, - URX_BACKSLASH_Z, - URX_STRING_LEN, - URX_CTR_INIT, - URX_CTR_INIT_NG, - URX_CTR_LOOP, - URX_CTR_LOOP_NG, - URX_RELOC_OPRND, - URX_STO_SP, - URX_LD_SP, - URX_BACKREF, - URX_STO_INP_LOC, - URX_JMPX, - URX_LA_START, - URX_LA_END, - URX_BACKREF_I, - URX_LB_START, - URX_LB_CONT, - URX_LB_END, - URX_LBN_CONT, - URX_LBN_END, - URX_LOOP_C, - URX_LOOP_DOT_I, - URX_BACKSLASH_H, - URX_BACKSLASH_R, - URX_BACKSLASH_V: + case urxReservedOp, + urxStartCapture, + urxEndCapture, + urxStateSave, + urxJmp, + urxJmpSav, + urxJmpSavX, + urxBackslashB, + urxBackslashBu, + urxBackslashD, + urxBackslashZ, + urxStringLen, + urxCtrInit, + urxCtrInitNg, + utxCtrLoop, + urxCtrLoopNg, + urxRelocOprnd, + urxStoSp, + urxLdSp, + urxBackref, + urxStoInpLoc, + urxJmpx, + urxLaStart, + urxLaEnd, + urxBackrefI, + urxLbStart, + urxLbCont, + urxLbEnd, + urxLbnCount, + urxLbnEnd, + urxLoopC, + urxLoopDotI, + urxBackslashH, + urxBackslashR, + urxBackslashV: // types with an integer operand field. 
fmt.Fprintf(w, "%d", val) - case URX_ONECHAR, URX_ONECHAR_I: + case urxOnechar, urcOnecharI: if val < 0x20 { fmt.Fprintf(w, "%#x", val) } else { fmt.Fprintf(w, "'%c'", rune(val)) } - case URX_STRING, URX_STRING_I: + case urxString, urxStringI: lengthOp := pat.compiledPat[index+1] - // U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN); - length := lengthOp.Value() + length := lengthOp.value() fmt.Fprintf(w, "%q", string(pat.literalText[val:val+length])) - case URX_SETREF, URX_LOOP_SR_I: - // UnicodeString s; - // UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val); - //set->toPattern(s, TRUE); + case urxSetref, urxLoopSrI: fmt.Fprintf(w, "%s", pat.sets[val].String()) - case URX_STATIC_SETREF, URX_STAT_SETREF_N: - if (val & URX_NEG_SET) != 0 { + case urxStaticSetref, urxStatSetrefN: + if (val & urxNegSet) != 0 { fmt.Fprintf(w, "NOT ") - val &= ^URX_NEG_SET + val &= ^urxNegSet } - // UnicodeSet &set = RegexStaticSets::gStaticSets->fPropSets[val]; - // set.toPattern(s, TRUE); fmt.Fprintf(w, "%s", staticPropertySets[val].String()) default: diff --git a/go/mysql/icuregex/error.go b/go/mysql/icuregex/error.go index 9bb77994cea..c2cde70b8be 100644 --- a/go/mysql/icuregex/error.go +++ b/go/mysql/icuregex/error.go @@ -29,7 +29,7 @@ import ( ) type CompileError struct { - Code uerror.URegexCompileErrorCode + Code uerror.CompileErrorCode Line int Offset int Context string @@ -38,41 +38,41 @@ type CompileError struct { func (e *CompileError) Error() string { var out strings.Builder switch e.Code { - case uerror.U_REGEX_INTERNAL_ERROR: + case uerror.InternalError: out.WriteString("Internal Error") - case uerror.U_REGEX_RULE_SYNTAX: + case uerror.RuleSyntax: out.WriteString("Syntax Error") - case uerror.U_REGEX_INVALID_STATE: + case uerror.InvalidState: out.WriteString("Invalid State") - case uerror.U_REGEX_BAD_ESCAPE_SEQUENCE: + case uerror.BadEscapeSequence: out.WriteString("Bad escape sequence") - case uerror.U_REGEX_PROPERTY_SYNTAX: + case uerror.PropertySyntax: 
out.WriteString("Property syntax error") - case uerror.U_REGEX_UNIMPLEMENTED: + case uerror.Unimplemented: out.WriteString("Unimplemented") - case uerror.U_REGEX_MISMATCHED_PAREN: + case uerror.MismatchedParen: out.WriteString("Mismatched parentheses") - case uerror.U_REGEX_NUMBER_TOO_BIG: + case uerror.NumberTooBig: out.WriteString("Number too big") - case uerror.U_REGEX_BAD_INTERVAL: + case uerror.BadInterval: out.WriteString("Bad interval") - case uerror.U_REGEX_MAX_LT_MIN: + case uerror.MaxLtMin: out.WriteString("Max less than min") - case uerror.U_REGEX_INVALID_BACK_REF: + case uerror.InvalidBackRef: out.WriteString("Invalid back reference") - case uerror.U_REGEX_INVALID_FLAG: + case uerror.InvalidFlag: out.WriteString("Invalid flag") - case uerror.U_REGEX_LOOK_BEHIND_LIMIT: + case uerror.LookBehindLimit: out.WriteString("Look behind limit") - case uerror.U_REGEX_SET_CONTAINS_STRING: + case uerror.SetContainsString: out.WriteString("Set contains string") - case uerror.U_REGEX_MISSING_CLOSE_BRACKET: + case uerror.MissingCloseBracket: out.WriteString("Missing closing ]") - case uerror.U_REGEX_INVALID_RANGE: + case uerror.InvalidRange: out.WriteString("Invalid range") - case uerror.U_REGEX_PATTERN_TOO_BIG: + case uerror.PatternTooBig: out.WriteString("Pattern too big") - case uerror.U_REGEX_INVALID_CAPTURE_GROUP_NAME: + case uerror.InvalidCaptureGroupName: out.WriteString("Invalid capture group name") } _, _ = fmt.Fprintf(&out, " at line %d, column %d: `%s`", e.Line, e.Offset, e.Context) @@ -81,7 +81,7 @@ func (e *CompileError) Error() string { } type MatchError struct { - Code uerror.URegexMatchErrorCode + Code uerror.MatchErrorCode Pattern string Position int Input []rune @@ -92,9 +92,9 @@ const maxMatchInputLength = 20 func (e *MatchError) Error() string { var out strings.Builder switch e.Code { - case uerror.U_REGEX_STACK_OVERFLOW: + case uerror.StackOverflow: out.WriteString("Stack overflow") - case uerror.U_REGEX_TIME_OUT: + case uerror.TimeOut: 
out.WriteString("Timeout") } diff --git a/go/mysql/icuregex/icu_test.go b/go/mysql/icuregex/icu_test.go index 64f56637fd7..ac42cc16b3f 100644 --- a/go/mysql/icuregex/icu_test.go +++ b/go/mysql/icuregex/icu_test.go @@ -36,7 +36,6 @@ import ( "vitess.io/vitess/go/mysql/icuregex" "vitess.io/vitess/go/mysql/icuregex/internal/pattern" - "vitess.io/vitess/go/mysql/icuregex/internal/uprops" ) var ErrSkip = errors.New("ignored test") @@ -89,19 +88,19 @@ func (tp *TestPattern) parseFlags(line string) (string, error) { return line, nil case ' ', '\t': case 'i': - tp.Flags |= icuregex.UREGEX_CASE_INSENSITIVE + tp.Flags |= icuregex.CaseInsensitive case 'x': - tp.Flags |= icuregex.UREGEX_COMMENTS + tp.Flags |= icuregex.Comments case 's': - tp.Flags |= icuregex.UREGEX_DOTALL + tp.Flags |= icuregex.DotAll case 'm': - tp.Flags |= icuregex.UREGEX_MULTILINE + tp.Flags |= icuregex.Multiline case 'e': - tp.Flags |= icuregex.UREGEX_ERROR_ON_UNKNOWN_ESCAPES + tp.Flags |= icuregex.ErrorOnUnknownEscapes case 'D': - tp.Flags |= icuregex.UREGEX_UNIX_LINES + tp.Flags |= icuregex.UnixLines case 'Q': - tp.Flags |= icuregex.UREGEX_LITERAL + tp.Flags |= icuregex.Literal case '2', '3', '4', '5', '6', '7', '8', '9': tp.Options.FindCount = int(line[0] - '0') case 'G': @@ -134,11 +133,10 @@ func (tp *TestPattern) parseFlags(line string) (string, error) { return "", io.ErrUnexpectedEOF } -func (tp *TestPattern) parseMatch(input string) error { - var ok bool - input, ok = pattern.Unescape(input) +func (tp *TestPattern) parseMatch(orig string) error { + input, ok := pattern.Unescape(orig) if !ok { - return fmt.Errorf("failed to unquote input") + return fmt.Errorf("failed to unquote input: %s", orig) } var detagged []rune @@ -153,27 +151,26 @@ func (tp *TestPattern) parseMatch(input string) error { groupNum := input[g[4]:g[5]] if groupNum == "r" { return ErrSkip - } else { - num, err := strconv.Atoi(groupNum) - if err != nil { - return fmt.Errorf("bad group number %q: %w", groupNum, err) - } + } + num, 
err := strconv.Atoi(groupNum) + if err != nil { + return fmt.Errorf("bad group number %q: %w", groupNum, err) + } - if num >= len(tp.Groups) { - grp := make([]TestGroup, num+1) - for i := range grp { - grp[i].Start = -1 - grp[i].End = -1 - } - copy(grp, tp.Groups) - tp.Groups = grp + if num >= len(tp.Groups) { + grp := make([]TestGroup, num+1) + for i := range grp { + grp[i].Start = -1 + grp[i].End = -1 } + copy(grp, tp.Groups) + tp.Groups = grp + } - if closing { - tp.Groups[num].End = len(detagged) - } else { - tp.Groups[num].Start = len(detagged) - } + if closing { + tp.Groups[num].End = len(detagged) + } else { + tp.Groups[num].Start = len(detagged) } } @@ -193,7 +190,7 @@ func ParseTestFile(t testing.TB, filename string) []TestPattern { var lineno int var patterns []TestPattern - error := func(err error) { + errFunc := func(err error) { if err == ErrSkip { return } @@ -218,14 +215,14 @@ func ParseTestFile(t testing.TB, filename string) []TestPattern { tp.Pattern = line[1 : idx+1] line, err = tp.parseFlags(line[idx+2:]) if err != nil { - error(err) + errFunc(err) continue } idx = strings.IndexByte(line[1:], line[0]) err = tp.parseMatch(line[1 : idx+1]) if err != nil { - error(err) + errFunc(err) continue } @@ -386,7 +383,7 @@ func TestCornerCases(t *testing.T) { {`(abc)*+a`, "abcabcabc", 0, false}, {`(abc)*+a`, "abcabcab", 0, true}, {`a\N{LATIN SMALL LETTER B}c`, "abc", 0, true}, - {`a.b`, "a\rb", icuregex.UREGEX_UNIX_LINES, true}, + {`a.b`, "a\rb", icuregex.UnixLines, true}, {`a.b`, "a\rb", 0, false}, {`(?d)abc$`, "abc\r", 0, false}, {`[ \b]`, "b", 0, true}, @@ -423,8 +420,3 @@ func TestOne(t *testing.T) { require.NoError(t, err) t.Logf("match = %v", found) } - -func TestTrie(t *testing.T) { - p := uprops.GetPropertyEnum("Block") - t.Logf("%v", p) -} diff --git a/go/mysql/icuregex/internal/bytestrie/bytes_trie.go b/go/mysql/icuregex/internal/bytestrie/bytes_trie.go index 732fddc231d..aff80dc3e69 100644 --- a/go/mysql/icuregex/internal/bytestrie/bytes_trie.go 
+++ b/go/mysql/icuregex/internal/bytestrie/bytes_trie.go @@ -31,7 +31,7 @@ func New(pos []byte) BytesTrie { return BytesTrie{pos: pos, original: pos, remainingMatchLength: -1} } -type Result int32 +type result int32 const ( /** * The input unit(s) did not continue a matching string. @@ -40,14 +40,14 @@ const ( /** * until the trie is reset to its original state or to a saved state. * @stable ICU 4.8 */ - NO_MATCH Result = iota + noMatch result = iota /** * The input unit(s) continued a matching string * but there is no value for the string so far. * (It is a prefix of a longer string.) * @stable ICU 4.8 */ - NO_VALUE + noValue /** * The input unit(s) continued a matching string * and there is a value for the string so far. @@ -55,7 +55,7 @@ const ( /** * No further input byte/unit can continue a matching string. * @stable ICU 4.8 */ - FINAL_VALUE + finalValue /** * The input unit(s) continued a matching string * and there is a value for the string so far. @@ -63,43 +63,43 @@ const ( /** * Another input byte/unit can continue a matching string. * @stable ICU 4.8 */ - INTERMEDIATE_VALUE + intermediateValue ) const ( - kMaxBranchLinearSubNodeLength = 5 + maxBranchLinearSubNodeLength = 5 // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node. - kMinLinearMatch = 0x10 - kMaxLinearMatchLength = 0x10 + minLinearMatch = 0x10 + maxLinearMatchLength = 0x10 // 20..ff: Variable-length value node. // If odd, the value is final. (Otherwise, intermediate value or jump delta.) // Then shift-right by 1 bit. // The remaining lead byte value indicates the number of following bytes (0..4) // and contains the value's top bits. - kMinValueLead = kMinLinearMatch + kMaxLinearMatchLength // 0x20 + minValueLead = minLinearMatch + maxLinearMatchLength // 0x20 // It is a final value if bit 0 is set. - kValueIsFinal = 1 + valueIsFinal = 1 // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds. 
- kMinOneByteValueLead = kMinValueLead / 2 // 0x10 - kMaxOneByteValue = 0x40 // At least 6 bits in the first byte. + minOneByteValueLead = minValueLead / 2 // 0x10 + maxOneByteValue = 0x40 // At least 6 bits in the first byte. - kMinTwoByteValueLead = kMinOneByteValueLead + kMaxOneByteValue + 1 // 0x51 - kMaxTwoByteValue = 0x1aff - kMinThreeByteValueLead = kMinTwoByteValueLead + (kMaxTwoByteValue >> 8) + 1 // 0x6c - kFourByteValueLead = 0x7e + minTwoByteValueLead = minOneByteValueLead + maxOneByteValue + 1 // 0x51 + maxTwoByteValue = 0x1aff + minThreeByteValueLead = minTwoByteValueLead + (maxTwoByteValue >> 8) + 1 // 0x6c + fourByteValueLead = 0x7e // Compact delta integers. - kMaxOneByteDelta = 0xbf - kMinTwoByteDeltaLead = kMaxOneByteDelta + 1 // 0xc0 - kMinThreeByteDeltaLead = 0xf0 - kFourByteDeltaLead = 0xfe + maxOneByteDelta = 0xbf + minTwoByteDeltaLead = maxOneByteDelta + 1 // 0xc0 + minThreeByteDeltaLead = 0xf0 + fourByteDeltaLead = 0xfe ) func (bt *BytesTrie) ContainsName(name string) bool { - result := NO_VALUE + result := noValue for _, c := range []byte(name) { if 'A' <= c && c <= 'Z' { c += 'a' - 'A' @@ -112,13 +112,13 @@ func (bt *BytesTrie) ContainsName(name string) bool { } result = bt.next(int32(c)) } - return result >= FINAL_VALUE + return result >= finalValue } -func (bt *BytesTrie) next(inByte int32) Result { +func (bt *BytesTrie) next(inByte int32) result { pos := bt.pos if pos == nil { - return NO_MATCH + return noMatch } if inByte < 0 { inByte += 0x100 @@ -134,28 +134,27 @@ func (bt *BytesTrie) next(inByte int32) Result { bt.pos = pos if length < 0 { node := int32(pos[0]) - if node >= kMinValueLead { + if node >= minValueLead { return bt.valueResult(node) } } - return NO_VALUE - } else { - bt.stop() - return NO_MATCH + return noValue } + bt.stop() + return noMatch } return bt.nextImpl(pos, inByte) } -func (bt *BytesTrie) nextImpl(pos []byte, inByte int32) Result { +func (bt *BytesTrie) nextImpl(pos []byte, inByte int32) result { for { node := 
int32(pos[0]) pos = pos[1:] - if node < kMinLinearMatch { + if node < minLinearMatch { return bt.branchNext(pos, node, inByte) - } else if node < kMinValueLead { + } else if node < minValueLead { // Match the first of length+1 bytes. - length := node - kMinLinearMatch // Actual match length minus 1. + length := node - minLinearMatch // Actual match length minus 1. match := inByte == int32(pos[0]) pos = pos[1:] if match { @@ -164,38 +163,36 @@ func (bt *BytesTrie) nextImpl(pos []byte, inByte int32) Result { bt.pos = pos if length < 0 { node = int32(pos[0]) - if node >= kMinValueLead { + if node >= minValueLead { return bt.valueResult(node) } } - return NO_VALUE - } else { - // No match. - break + return noValue } - } else if (node & kValueIsFinal) != 0 { + // No match. + break + } else if (node & valueIsFinal) != 0 { // No further matching bytes. break } else { // Skip intermediate value. pos = bt.skipValue2(pos, node) // The next node must not also be a value node. - // U_ASSERT(*pos kMaxBranchLinearSubNodeLength { + for length > maxBranchLinearSubNodeLength { p := int32(pos[0]) pos = pos[1:] if inByte < p { @@ -222,27 +219,26 @@ func (bt *BytesTrie) branchNext(pos []byte, length int32, inByte int32) Result { p := int32(pos[0]) pos = pos[1:] if inByte == p { - var result Result + var result result node := int32(pos[0]) - // U_ASSERT(node>=kMinValueLead); - if (node & kValueIsFinal) != 0 { + if (node & valueIsFinal) != 0 { // Leave the final value for getValue() to read. - result = FINAL_VALUE + result = finalValue } else { // Use the non-final value as the jump delta. 
pos = pos[1:] // int32_t delta=readValue(pos, node>>1); node >>= 1 var delta int32 - if node < kMinTwoByteValueLead { - delta = node - kMinOneByteValueLead - } else if node < kMinThreeByteValueLead { - delta = ((node - kMinTwoByteValueLead) << 8) | int32(pos[0]) + if node < minTwoByteValueLead { + delta = node - minOneByteValueLead + } else if node < minThreeByteValueLead { + delta = ((node - minTwoByteValueLead) << 8) | int32(pos[0]) pos = pos[1:] - } else if node < kFourByteValueLead { - delta = ((node - kMinThreeByteValueLead) << 16) | (int32(pos[0]) << 8) | int32(pos[1]) + } else if node < fourByteValueLead { + delta = ((node - minThreeByteValueLead) << 16) | (int32(pos[0]) << 8) | int32(pos[1]) pos = pos[2:] - } else if node == kFourByteValueLead { + } else if node == fourByteValueLead { delta = (int32(pos[0]) << 16) | (int32(pos[1]) << 8) | int32(pos[2]) pos = pos[3:] } else { @@ -252,10 +248,10 @@ func (bt *BytesTrie) branchNext(pos []byte, length int32, inByte int32) Result { // end readValue() pos = pos[delta:] node = int32(pos[0]) - if node >= kMinValueLead { + if node >= minValueLead { result = bt.valueResult(node) } else { - result = NO_VALUE + result = noValue } } bt.pos = pos @@ -272,14 +268,13 @@ func (bt *BytesTrie) branchNext(pos []byte, length int32, inByte int32) Result { if inByte == p { bt.pos = pos node := int32(pos[0]) - if node >= kMinValueLead { + if node >= minValueLead { return bt.valueResult(node) } - return NO_VALUE - } else { - bt.stop() - return NO_MATCH + return noValue } + bt.stop() + return noMatch } func (bt *BytesTrie) skipValue1(pos []byte) []byte { @@ -288,10 +283,10 @@ func (bt *BytesTrie) skipValue1(pos []byte) []byte { } func (bt *BytesTrie) skipValue2(pos []byte, leadByte int32) []byte { - if leadByte >= (kMinTwoByteValueLead << 1) { - if leadByte < (kMinThreeByteValueLead << 1) { + if leadByte >= (minTwoByteValueLead << 1) { + if leadByte < (minThreeByteValueLead << 1) { pos = pos[1:] - } else if leadByte < 
(kFourByteValueLead << 1) { + } else if leadByte < (fourByteValueLead << 1) { pos = pos[2:] } else { pos = pos[3+((leadByte>>1)&1):] @@ -303,10 +298,10 @@ func (bt *BytesTrie) skipValue2(pos []byte, leadByte int32) []byte { func (bt *BytesTrie) skipDelta(pos []byte) []byte { delta := int32(pos[0]) pos = pos[1:] - if delta >= kMinTwoByteDeltaLead { - if delta < kMinThreeByteDeltaLead { + if delta >= minTwoByteDeltaLead { + if delta < minThreeByteDeltaLead { pos = pos[1:] - } else if delta < kFourByteDeltaLead { + } else if delta < fourByteDeltaLead { pos = pos[2:] } else { pos = pos[3+(delta&1):] @@ -318,15 +313,15 @@ func (bt *BytesTrie) skipDelta(pos []byte) []byte { func (bt *BytesTrie) jumpByDelta(pos []byte) []byte { delta := int32(pos[0]) pos = pos[1:] - if delta < kMinTwoByteDeltaLead { + if delta < minTwoByteDeltaLead { // nothing to do - } else if delta < kMinThreeByteDeltaLead { - delta = ((delta - kMinTwoByteDeltaLead) << 8) | int32(pos[0]) + } else if delta < minThreeByteDeltaLead { + delta = ((delta - minTwoByteDeltaLead) << 8) | int32(pos[0]) pos = pos[1:] - } else if delta < kFourByteDeltaLead { - delta = ((delta - kMinThreeByteDeltaLead) << 16) | (int32(pos[0]) << 8) | int32(pos[1]) + } else if delta < fourByteDeltaLead { + delta = ((delta - minThreeByteDeltaLead) << 16) | (int32(pos[0]) << 8) | int32(pos[1]) pos = pos[2:] - } else if delta == kFourByteDeltaLead { + } else if delta == fourByteDeltaLead { delta = (int32(pos[0]) << 16) | (int32(pos[1]) << 8) | int32(pos[2]) pos = pos[3:] } else { @@ -344,13 +339,13 @@ func (bt *BytesTrie) GetValue() int32 { func (bt *BytesTrie) readValue(pos []byte, leadByte int32) int32 { var value int32 - if leadByte < kMinTwoByteValueLead { - value = leadByte - kMinOneByteValueLead - } else if leadByte < kMinThreeByteValueLead { - value = ((leadByte - kMinTwoByteValueLead) << 8) | int32(pos[0]) - } else if leadByte < kFourByteValueLead { - value = ((leadByte - kMinThreeByteValueLead) << 16) | (int32(pos[0]) << 8) | 
int32(pos[1]) - } else if leadByte == kFourByteValueLead { + if leadByte < minTwoByteValueLead { + value = leadByte - minOneByteValueLead + } else if leadByte < minThreeByteValueLead { + value = ((leadByte - minTwoByteValueLead) << 8) | int32(pos[0]) + } else if leadByte < fourByteValueLead { + value = ((leadByte - minThreeByteValueLead) << 16) | (int32(pos[0]) << 8) | int32(pos[1]) + } else if leadByte == fourByteValueLead { value = (int32(pos[0]) << 16) | (int32(pos[1]) << 8) | int32(pos[2]) } else { value = (int32(pos[0]) << 24) | (int32(pos[1]) << 16) | (int32(pos[2]) << 8) | int32(pos[3]) diff --git a/go/mysql/icuregex/internal/icudata/embed.go b/go/mysql/icuregex/internal/icudata/embed.go index 3fd006496bd..2b7e3033a21 100644 --- a/go/mysql/icuregex/internal/icudata/embed.go +++ b/go/mysql/icuregex/internal/icudata/embed.go @@ -61,27 +61,27 @@ var UNames []byte //go:embed uprops.icu var UProps []byte -// NFC is the table for character normalization where canonical +// Nfc is the table for character normalization where canonical // decomposition is done followed by canonical composition. // This is used for property checks of characters about composition. // //go:embed nfc.nrm -var NFC []byte +var Nfc []byte -// NFKC is the table for character normalization where compatibility +// Nfkc is the table for character normalization where compatibility // decomposition is done followed by canonical composition. // This is used for property checks of characters about composition. // //go:embed nfkc.nrm -var NFKC []byte +var Nfkc []byte -// NFKC_CF is the table for character normalization where compatibility +// NfkcCf is the table for character normalization where compatibility // decomposition is done followed by canonical composition with // case folding. // This is used for property checks of characters about composition. 
// //go:embed nfkc_cf.nrm -var NFKC_CF []byte +var NfkcCf []byte // BrkChar is used for matching against character break // characters in regular expressions. diff --git a/go/mysql/icuregex/internal/normalizer/constants.go b/go/mysql/icuregex/internal/normalizer/constants.go index 85b19de4b82..3c2de588952 100644 --- a/go/mysql/icuregex/internal/normalizer/constants.go +++ b/go/mysql/icuregex/internal/normalizer/constants.go @@ -23,76 +23,74 @@ package normalizer const ( // Fixed norm16 values. - MIN_YES_YES_WITH_CC = 0xfe02 - JAMO_VT = 0xfe00 - MIN_NORMAL_MAYBE_YES = 0xfc00 - JAMO_L = 2 // offset=1 hasCompBoundaryAfter=false - INERT = 1 // offset=0 hasCompBoundaryAfter=true + minYesYesWithCC = 0xfe02 + jamoVt = 0xfe00 + minNormalMaybeYes = 0xfc00 + jamoL = 2 // offset=1 hasCompBoundaryAfter=false + inert = 1 // offset=0 hasCompBoundaryAfter=true // norm16 bit 0 is comp-boundary-after. - HAS_COMP_BOUNDARY_AFTER = 1 - OFFSET_SHIFT = 1 + hasCompBoundaryAfter = 1 + offsetShift = 1 // For algorithmic one-way mappings, norm16 bits 2..1 indicate the // tccc (0, 1, >1) for quick FCC boundary-after tests. 
- DELTA_TCCC_0 = 0 - DELTA_TCCC_1 = 2 - DELTA_TCCC_GT_1 = 4 - DELTA_TCCC_MASK = 6 - DELTA_SHIFT = 3 + deltaTccc0 = 0 + deltaTccc1 = 2 + deltaTcccGt1 = 4 + deltaTcccMask = 6 + deltaShift = 3 - MAX_DELTA = 0x40 + maxDelta = 0x40 ) const ( - JAMO_L_BASE rune = 0x1100 /* "lead" jamo */ - JAMO_L_END rune = 0x1112 - JAMO_V_BASE rune = 0x1161 /* "vowel" jamo */ - JAMO_V_END rune = 0x1175 - JAMO_T_BASE rune = 0x11a7 /* "trail" jamo */ - JAMO_T_END rune = 0x11c2 - - HANGUL_BASE rune = 0xac00 - HANGUL_END rune = 0xd7a3 - - JAMO_L_COUNT rune = 19 - JAMO_V_COUNT rune = 21 - JAMO_T_COUNT rune = 28 - - JAMO_VT_COUNT = JAMO_V_COUNT * JAMO_T_COUNT - - HANGUL_COUNT = JAMO_L_COUNT * JAMO_V_COUNT * JAMO_T_COUNT - HANGUL_LIMIT = HANGUL_BASE + HANGUL_COUNT + jamoLBase rune = 0x1100 /* "lead" jamo */ + jamoLEnd rune = 0x1112 + jamoVBase rune = 0x1161 /* "vowel" jamo */ + jamoVEnd rune = 0x1175 + jamoTBase rune = 0x11a7 /* "trail" jamo */ + jamoTEnd rune = 0x11c2 + + hangulBase rune = 0xac00 + hangulEnd rune = 0xd7a3 + + jamoLCount rune = 19 + jamoVCount rune = 21 + jamoTCount rune = 28 + + hangulCount = jamoLCount * jamoVCount * jamoTCount + hangulLimit = hangulBase + hangulCount ) const ( - MAPPING_HAS_CCC_LCCC_WORD = 0x80 - MAPPING_HAS_RAW_MAPPING = 0x40 + mappingHasCccLcccWord = 0x80 + mappingHasRawMapping = 0x40 // unused bit 0x20, - MAPPING_LENGTH_MASK = 0x1f + mappingLengthMask = 0x1f ) /** * Constants for normalization modes. * @deprecated ICU 56 Use unorm2.h instead. */ -type UNormalizationMode int32 +type Mode int32 const ( /** No decomposition/composition. @deprecated ICU 56 Use unorm2.h instead. */ - UNORM_NONE UNormalizationMode = 1 + NormNone Mode = 1 /** Canonical decomposition. @deprecated ICU 56 Use unorm2.h instead. */ - UNORM_NFD UNormalizationMode = 2 + NormNfd Mode = 2 /** Compatibility decomposition. @deprecated ICU 56 Use unorm2.h instead. */ - UNORM_NFKD UNormalizationMode = 3 + NormNfkd Mode = 3 /** Canonical decomposition followed by canonical composition. 
@deprecated ICU 56 Use unorm2.h instead. */ - UNORM_NFC UNormalizationMode = 4 + NormNfc Mode = 4 /** Default normalization. @deprecated ICU 56 Use unorm2.h instead. */ - UNORM_DEFAULT UNormalizationMode = UNORM_NFC + NormDefault Mode = NormNfc /** Compatibility decomposition followed by canonical composition. @deprecated ICU 56 Use unorm2.h instead. */ - UNORM_NFKC UNormalizationMode = 5 + NormNfkc Mode = 5 /** "Fast C or D" form. @deprecated ICU 56 Use unorm2.h instead. */ - UNORM_FCD UNormalizationMode = 6 + NormFcd Mode = 6 ) /** @@ -100,19 +98,19 @@ const ( * For details see http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms * @stable ICU 2.0 */ -type UNormalizationCheckResult int +type CheckResult int const ( /** * The input string is not in the normalization form. * @stable ICU 2.0 */ - UNORM_NO UNormalizationCheckResult = iota + No CheckResult = iota /** * The input string is in the normalization form. * @stable ICU 2.0 */ - UNORM_YES + Yes /** * The input string may or may not be in the normalization form. * This value is only returned for composition forms like NFC and FCC, @@ -120,5 +118,5 @@ const ( * would have to be analyzed further. * @stable ICU 2.0 */ - UNORM_MAYBE + Maybe ) diff --git a/go/mysql/icuregex/internal/normalizer/normalizer.go b/go/mysql/icuregex/internal/normalizer/normalizer.go index 87f370ea0ab..c13a4878deb 100644 --- a/go/mysql/icuregex/internal/normalizer/normalizer.go +++ b/go/mysql/icuregex/internal/normalizer/normalizer.go @@ -22,7 +22,7 @@ limitations under the License. 
package normalizer import ( - "fmt" + "errors" "sync" "vitess.io/vitess/go/mysql/icuregex/internal/icudata" @@ -32,7 +32,7 @@ import ( "vitess.io/vitess/go/mysql/icuregex/internal/utrie" ) -type normalizer struct { +type Normalizer struct { minDecompNoCP rune minCompNoMaybeCP rune minLcccCP rune @@ -55,58 +55,58 @@ type normalizer struct { smallFCD []uint8 // [0x100] one bit per 32 BMP code points, set if any FCD!=0 } -var nfc *normalizer -var nfkc *normalizer +var nfc *Normalizer +var nfkc *Normalizer var normalizerOnce sync.Once func loadNormalizer() { normalizerOnce.Do(func() { - nfc = &normalizer{} - if err := nfc.load(icudata.NFC); err != nil { + nfc = &Normalizer{} + if err := nfc.load(icudata.Nfc); err != nil { panic(err) } - nfkc = &normalizer{} - if err := nfkc.load(icudata.NFKC); err != nil { + nfkc = &Normalizer{} + if err := nfkc.load(icudata.Nfkc); err != nil { panic(err) } }) } -const IX_NORM_TRIE_OFFSET = 0 -const IX_EXTRA_DATA_OFFSET = 1 -const IX_SMALL_FCD_OFFSET = 2 -const IX_RESERVED3_OFFSET = 3 -const IX_TOTAL_SIZE = 7 +const ixNormTrieOffset = 0 +const ixExtraDataOffset = 1 +const ixSmallFcdOffset = 2 +const ixReserved3Offset = 3 +const ixTotalSize = 7 -const IX_MIN_DECOMP_NO_CP = 8 -const IX_MIN_COMP_NO_MAYBE_CP = 9 +const ixMinDecompNoCp = 8 +const ixMinCompNoMaybeCp = 9 /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */ -const IX_MIN_YES_NO = 10 +const ixMinYesNo = 10 /** Mappings are comp-normalized. */ -const IX_MIN_NO_NO = 11 -const IX_LIMIT_NO_NO = 12 -const IX_MIN_MAYBE_YES = 13 +const ixMinNoNo = 11 +const ixLimitNoNo = 12 +const ixMinMaybeYes = 13 /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */ -const IX_MIN_YES_NO_MAPPINGS_ONLY = 14 +const ixMinYesNoMappingsOnly = 14 /** Mappings are not comp-normalized but have a comp boundary before. */ -const IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE = 15 +const ixMinNoNoCompBoundaryBefore = 15 /** Mappings do not have a comp boundary before. 
*/ -const IX_MIN_NO_NO_COMP_NO_MAYBE_CC = 16 +const ixMinNoNoCompNoMaybeCc = 16 /** Mappings to the empty string. */ -const IX_MIN_NO_NO_EMPTY = 17 +const ixMinNoNoEmpty = 17 -const IX_MIN_LCCC_CP = 18 -const IX_COUNT = 20 +const ixMinLcccCp = 18 +const ixCount = 20 -func (n *normalizer) load(data []byte) error { +func (n *Normalizer) load(data []byte) error { bytes := udata.NewBytes(data) err := bytes.ReadHeader(func(info *udata.DataInfo) bool { @@ -124,8 +124,8 @@ func (n *normalizer) load(data []byte) error { } indexesLength := int32(bytes.Uint32()) / 4 - if indexesLength <= IX_MIN_LCCC_CP { - return fmt.Errorf("normalizer2 data: not enough indexes") + if indexesLength <= ixMinLcccCp { + return errors.New("normalizer2 data: not enough indexes") } indexes := make([]int32, indexesLength) indexes[0] = indexesLength * 4 @@ -133,23 +133,23 @@ func (n *normalizer) load(data []byte) error { indexes[i] = bytes.Int32() } - n.minDecompNoCP = indexes[IX_MIN_DECOMP_NO_CP] - n.minCompNoMaybeCP = indexes[IX_MIN_COMP_NO_MAYBE_CP] - n.minLcccCP = indexes[IX_MIN_LCCC_CP] + n.minDecompNoCP = indexes[ixMinDecompNoCp] + n.minCompNoMaybeCP = indexes[ixMinCompNoMaybeCp] + n.minLcccCP = indexes[ixMinLcccCp] - n.minYesNo = uint16(indexes[IX_MIN_YES_NO]) - n.minYesNoMappingsOnly = uint16(indexes[IX_MIN_YES_NO_MAPPINGS_ONLY]) - n.minNoNo = uint16(indexes[IX_MIN_NO_NO]) - n.minNoNoCompBoundaryBefore = uint16(indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]) - n.minNoNoCompNoMaybeCC = uint16(indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]) - n.minNoNoEmpty = uint16(indexes[IX_MIN_NO_NO_EMPTY]) - n.limitNoNo = uint16(indexes[IX_LIMIT_NO_NO]) - n.minMaybeYes = uint16(indexes[IX_MIN_MAYBE_YES]) + n.minYesNo = uint16(indexes[ixMinYesNo]) + n.minYesNoMappingsOnly = uint16(indexes[ixMinYesNoMappingsOnly]) + n.minNoNo = uint16(indexes[ixMinNoNo]) + n.minNoNoCompBoundaryBefore = uint16(indexes[ixMinNoNoCompBoundaryBefore]) + n.minNoNoCompNoMaybeCC = uint16(indexes[ixMinNoNoCompNoMaybeCc]) + n.minNoNoEmpty = 
uint16(indexes[ixMinNoNoEmpty]) + n.limitNoNo = uint16(indexes[ixLimitNoNo]) + n.minMaybeYes = uint16(indexes[ixMinMaybeYes]) - n.centerNoNoDelta = uint16(indexes[IX_MIN_MAYBE_YES]>>DELTA_SHIFT) - MAX_DELTA - 1 + n.centerNoNoDelta = uint16(indexes[ixMinMaybeYes]>>deltaShift) - maxDelta - 1 - offset := indexes[IX_NORM_TRIE_OFFSET] - nextOffset := indexes[IX_EXTRA_DATA_OFFSET] + offset := indexes[ixNormTrieOffset] + nextOffset := indexes[ixExtraDataOffset] triePosition := bytes.Position() n.normTrie, err = utrie.UcpTrieFromBytes(bytes) @@ -159,17 +159,17 @@ func (n *normalizer) load(data []byte) error { trieLength := bytes.Position() - triePosition if trieLength > nextOffset-offset { - return fmt.Errorf("normalizer2 data: not enough bytes for normTrie") + return errors.New("normalizer2 data: not enough bytes for normTrie") } bytes.Skip((nextOffset - offset) - trieLength) // skip padding after trie bytes // Read the composition and mapping data. offset = nextOffset - nextOffset = indexes[IX_SMALL_FCD_OFFSET] + nextOffset = indexes[ixSmallFcdOffset] numChars := (nextOffset - offset) / 2 if numChars != 0 { n.maybeYesCompositions = bytes.Uint16Slice(numChars) - n.extraData = n.maybeYesCompositions[((MIN_NORMAL_MAYBE_YES - n.minMaybeYes) >> OFFSET_SHIFT):] + n.extraData = n.maybeYesCompositions[((minNormalMaybeYes - n.minMaybeYes) >> offsetShift):] } // smallFCD: new in formatVersion 2 @@ -177,26 +177,26 @@ func (n *normalizer) load(data []byte) error { return nil } -func Nfc() *normalizer { +func Nfc() *Normalizer { loadNormalizer() return nfc } -func Nfkc() *normalizer { +func Nfkc() *Normalizer { loadNormalizer() return nfkc } -func (n *normalizer) AddPropertyStarts(u *uset.UnicodeSet) { +func (n *Normalizer) AddPropertyStarts(u *uset.UnicodeSet) { var start, end rune var value uint32 for { - end, value = nfc.normTrie.GetRange(start, utrie.UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT, nil) + end, value = nfc.normTrie.GetRange(start, utrie.UcpMapRangeFixedLeadSurrogates, 
inert, nil) if end < 0 { break } u.AddRune(start) - if start != end && n.isAlgorithmicNoNo(uint16(value)) && (value&DELTA_TCCC_MASK) > DELTA_TCCC_1 { + if start != end && n.isAlgorithmicNoNo(uint16(value)) && (value&deltaTcccMask) > deltaTccc1 { // Range of code points with same-norm16-value algorithmic decompositions. // They might have different non-zero FCD16 values. prevFCD16 := n.GetFCD16(start) @@ -216,18 +216,18 @@ func (n *normalizer) AddPropertyStarts(u *uset.UnicodeSet) { } // add Hangul LV syllables and LV+1 because of skippables - for c := HANGUL_BASE; c < HANGUL_LIMIT; c += JAMO_T_COUNT { + for c := hangulBase; c < hangulLimit; c += jamoTCount { u.AddRune(c) u.AddRune(c + 1) } - u.AddRune(HANGUL_LIMIT) + u.AddRune(hangulLimit) } -func (n *normalizer) isAlgorithmicNoNo(norm16 uint16) bool { +func (n *Normalizer) isAlgorithmicNoNo(norm16 uint16) bool { return n.limitNoNo <= norm16 && norm16 < n.minMaybeYes } -func (n *normalizer) GetFCD16(c rune) uint16 { +func (n *Normalizer) GetFCD16(c rune) uint16 { if c < n.minDecompNoCP { return 0 } else if c <= 0xffff { @@ -238,7 +238,7 @@ func (n *normalizer) GetFCD16(c rune) uint16 { return n.getFCD16FromNormData(c) } -func (n *normalizer) singleLeadMightHaveNonZeroFCD16(lead rune) bool { +func (n *Normalizer) singleLeadMightHaveNonZeroFCD16(lead rune) bool { // 0<=lead<=0xffff bits := n.smallFCD[lead>>8] if bits == 0 { @@ -247,19 +247,19 @@ func (n *normalizer) singleLeadMightHaveNonZeroFCD16(lead rune) bool { return ((bits >> ((lead >> 5) & 7)) & 1) != 0 } -func (n *normalizer) getFCD16FromNormData(c rune) uint16 { - norm16 := n.GetNorm16(c) +func (n *Normalizer) getFCD16FromNormData(c rune) uint16 { + norm16 := n.getNorm16(c) if norm16 >= n.limitNoNo { - if norm16 >= MIN_NORMAL_MAYBE_YES { + if norm16 >= minNormalMaybeYes { // combining mark norm16 = uint16(n.getCCFromNormalYesOrMaybe(norm16)) return norm16 | (norm16 << 8) } else if norm16 >= n.minMaybeYes { return 0 } else { // isDecompNoAlgorithmic(norm16) - 
deltaTrailCC := norm16 & DELTA_TCCC_MASK - if deltaTrailCC <= DELTA_TCCC_1 { - return deltaTrailCC >> OFFSET_SHIFT + deltaTrailCC := norm16 & deltaTcccMask + if deltaTrailCC <= deltaTccc1 { + return deltaTrailCC >> offsetShift } // Maps to an isCompYesAndZeroCC. c = n.mapAlgorithmic(c, norm16) @@ -274,119 +274,120 @@ func (n *normalizer) getFCD16FromNormData(c rune) uint16 { // c decomposes, get everything from the variable-length extra data mapping := n.getMapping(norm16) firstUnit := mapping[1] - if firstUnit&MAPPING_HAS_CCC_LCCC_WORD != 0 { + if firstUnit&mappingHasCccLcccWord != 0 { norm16 |= mapping[0] & 0xff00 } return norm16 } -func (n *normalizer) getMapping(norm16 uint16) []uint16 { - return n.extraData[(norm16>>OFFSET_SHIFT)-1:] +func (n *Normalizer) getMapping(norm16 uint16) []uint16 { + return n.extraData[(norm16>>offsetShift)-1:] } -func (n *normalizer) GetNorm16(c rune) uint16 { +func (n *Normalizer) getNorm16(c rune) uint16 { if utf16.IsLead(c) { - return INERT + return inert } return n.getRawNorm16(c) } -func (n *normalizer) getRawNorm16(c rune) uint16 { +func (n *Normalizer) getRawNorm16(c rune) uint16 { return uint16(n.normTrie.Get(c)) } -func (n *normalizer) getCCFromNormalYesOrMaybe(norm16 uint16) uint8 { - return uint8(norm16 >> OFFSET_SHIFT) +func (n *Normalizer) getCCFromNormalYesOrMaybe(norm16 uint16) uint8 { + return uint8(norm16 >> offsetShift) } -func (n *normalizer) mapAlgorithmic(c rune, norm16 uint16) rune { - return c + rune(norm16>>DELTA_SHIFT) - rune(n.centerNoNoDelta) +func (n *Normalizer) mapAlgorithmic(c rune, norm16 uint16) rune { + return c + rune(norm16>>deltaShift) - rune(n.centerNoNoDelta) } -func (n *normalizer) isHangulLV(norm16 uint16) bool { +func (n *Normalizer) isHangulLV(norm16 uint16) bool { return norm16 == n.minYesNo } -func (n *normalizer) isHangulLVT(norm16 uint16) bool { +func (n *Normalizer) isHangulLVT(norm16 uint16) bool { return norm16 == n.hangulLVT() } -func (n *normalizer) hangulLVT() uint16 { - return 
n.minYesNoMappingsOnly | HAS_COMP_BOUNDARY_AFTER +func (n *Normalizer) hangulLVT() uint16 { + return n.minYesNoMappingsOnly | hasCompBoundaryAfter } -func (n *normalizer) getComposeQuickCheck(c rune) UNormalizationCheckResult { - return n.getCompQuickCheck(n.GetNorm16(c)) +func (n *Normalizer) getComposeQuickCheck(c rune) CheckResult { + return n.getCompQuickCheck(n.getNorm16(c)) } -func (n *normalizer) getDecomposeQuickCheck(c rune) UNormalizationCheckResult { - if n.isDecompYes(n.GetNorm16(c)) { - return UNORM_YES +func (n *Normalizer) getDecomposeQuickCheck(c rune) CheckResult { + if n.isDecompYes(n.getNorm16(c)) { + return Yes } - return UNORM_NO + return No } -func QuickCheck(c rune, mode UNormalizationMode) UNormalizationCheckResult { - if mode <= UNORM_NONE || UNORM_FCD <= mode { - return UNORM_YES +func QuickCheck(c rune, mode Mode) CheckResult { + if mode <= NormNone || NormFcd <= mode { + return Yes } switch mode { - case UNORM_NFC: + case NormNfc: return Nfc().getComposeQuickCheck(c) - case UNORM_NFD: + case NormNfd: return Nfc().getDecomposeQuickCheck(c) - case UNORM_NFKC: + case NormNfkc: return Nfkc().getComposeQuickCheck(c) - case UNORM_NFKD: + case NormNfkd: return Nfkc().getDecomposeQuickCheck(c) default: - return UNORM_MAYBE + return Maybe } } -func IsInert(c rune, mode UNormalizationMode) bool { +func IsInert(c rune, mode Mode) bool { switch mode { - case UNORM_NFC: + case NormNfc: return Nfc().isCompInert(c) - case UNORM_NFD: + case NormNfd: return Nfc().isDecompInert(c) - case UNORM_NFKC: + case NormNfkc: return Nfkc().isCompInert(c) - case UNORM_NFKD: + case NormNfkd: return Nfkc().isDecompInert(c) default: return true } } -func (n *normalizer) isDecompYes(norm16 uint16) bool { +func (n *Normalizer) isDecompYes(norm16 uint16) bool { return norm16 < n.minYesNo || n.minMaybeYes <= norm16 } -func (n *normalizer) getCompQuickCheck(norm16 uint16) UNormalizationCheckResult { - if norm16 < n.minNoNo || MIN_YES_YES_WITH_CC <= norm16 { - return 
UNORM_YES +func (n *Normalizer) getCompQuickCheck(norm16 uint16) CheckResult { + if norm16 < n.minNoNo || minYesYesWithCC <= norm16 { + return Yes } else if n.minMaybeYes <= norm16 { - return UNORM_MAYBE + return Maybe } else { - return UNORM_NO + return No } } -func (n *normalizer) isMaybeOrNonZeroCC(norm16 uint16) bool { +func (n *Normalizer) isMaybeOrNonZeroCC(norm16 uint16) bool { return norm16 >= n.minMaybeYes } -func (n *normalizer) isDecompNoAlgorithmic(norm16 uint16) bool { +func (n *Normalizer) isDecompNoAlgorithmic(norm16 uint16) bool { return norm16 >= n.limitNoNo } -func (n *normalizer) IsCompNo(norm16 uint16) bool { +func (n *Normalizer) IsCompNo(c rune) bool { + norm16 := n.getNorm16(c) return n.minNoNo <= norm16 && norm16 < n.minMaybeYes } -func (n *normalizer) Decompose(c rune) []rune { - norm16 := n.GetNorm16(c) +func (n *Normalizer) Decompose(c rune) []rune { + norm16 := n.getNorm16(c) if c < n.minDecompNoCP || n.isMaybeOrNonZeroCC(norm16) { // c does not decompose return nil @@ -414,7 +415,7 @@ func (n *normalizer) Decompose(c rune) []rune { } // c decomposes, get everything from the variable-length extra data mapping := n.getMapping(norm16) - length := mapping[1] & MAPPING_LENGTH_MASK + length := mapping[1] & mappingLengthMask mapping = mapping[2 : 2+length] for len(mapping) > 0 { @@ -426,43 +427,43 @@ func (n *normalizer) Decompose(c rune) []rune { } func hangulDecompose(c rune) []uint16 { - c -= HANGUL_BASE - c2 := c % JAMO_T_COUNT - c /= JAMO_T_COUNT + c -= hangulBase + c2 := c % jamoTCount + c /= jamoTCount var buffer []uint16 - buffer = append(buffer, uint16(JAMO_L_BASE+c/JAMO_V_COUNT)) - buffer = append(buffer, uint16(JAMO_V_BASE+c%JAMO_V_COUNT)) + buffer = append(buffer, uint16(jamoLBase+c/jamoVCount)) + buffer = append(buffer, uint16(jamoVBase+c%jamoVCount)) if c2 != 0 { - buffer = append(buffer, uint16(JAMO_T_BASE+c2)) + buffer = append(buffer, uint16(jamoTBase+c2)) } return buffer } -func (n *normalizer) isCompInert(c rune) bool { - 
norm16 := n.GetNorm16(c) - return n.isCompYesAndZeroCC(norm16) && (norm16&HAS_COMP_BOUNDARY_AFTER) != 0 +func (n *Normalizer) isCompInert(c rune) bool { + norm16 := n.getNorm16(c) + return n.isCompYesAndZeroCC(norm16) && (norm16&hasCompBoundaryAfter) != 0 } -func (n *normalizer) isDecompInert(c rune) bool { - return n.isDecompYesAndZeroCC(n.GetNorm16(c)) +func (n *Normalizer) isDecompInert(c rune) bool { + return n.isDecompYesAndZeroCC(n.getNorm16(c)) } -func (n *normalizer) isCompYesAndZeroCC(norm16 uint16) bool { +func (n *Normalizer) isCompYesAndZeroCC(norm16 uint16) bool { return norm16 < n.minNoNo } -func (n *normalizer) isDecompYesAndZeroCC(norm16 uint16) bool { +func (n *Normalizer) isDecompYesAndZeroCC(norm16 uint16) bool { return norm16 < n.minYesNo || - norm16 == JAMO_VT || - (n.minMaybeYes <= norm16 && norm16 <= MIN_NORMAL_MAYBE_YES) + norm16 == jamoVt || + (n.minMaybeYes <= norm16 && norm16 <= minNormalMaybeYes) } -func (n *normalizer) CombiningClass(c rune) uint8 { - return n.getCC(n.GetNorm16(c)) +func (n *Normalizer) CombiningClass(c rune) uint8 { + return n.getCC(n.getNorm16(c)) } -func (n *normalizer) getCC(norm16 uint16) uint8 { - if norm16 >= MIN_NORMAL_MAYBE_YES { +func (n *Normalizer) getCC(norm16 uint16) uint8 { + if norm16 >= minNormalMaybeYes { return n.getCCFromNormalYesOrMaybe(norm16) } if norm16 < n.minNoNo || n.limitNoNo <= norm16 { @@ -472,11 +473,10 @@ func (n *normalizer) getCC(norm16 uint16) uint8 { } -func (n *normalizer) getCCFromNoNo(norm16 uint16) uint8 { +func (n *Normalizer) getCCFromNoNo(norm16 uint16) uint8 { mapping := n.getMapping(norm16) - if mapping[1]&MAPPING_HAS_CCC_LCCC_WORD != 0 { + if mapping[1]&mappingHasCccLcccWord != 0 { return uint8(mapping[0]) - } else { - return 0 } + return 0 } diff --git a/go/mysql/icuregex/internal/pattern/unescape.go b/go/mysql/icuregex/internal/pattern/unescape.go index a142983c580..bdef8ad5cb3 100644 --- a/go/mysql/icuregex/internal/pattern/unescape.go +++ 
b/go/mysql/icuregex/internal/pattern/unescape.go @@ -50,7 +50,7 @@ func _digit16(c rune) rune { return -1 } -var UNESCAPE_MAP = []byte{ +var unscapeMap = []byte{ /*" 0x22, 0x22 */ /*' 0x27, 0x27 */ /*? 0x3F, 0x3F */ @@ -179,11 +179,11 @@ func UnescapeAt(str string) (rune, string) { } if c < utf8.RuneSelf { - for i := 0; i < len(UNESCAPE_MAP); i += 2 { - if byte(c) == UNESCAPE_MAP[i] { - return rune(UNESCAPE_MAP[i+1]), str + for i := 0; i < len(unscapeMap); i += 2 { + if byte(c) == unscapeMap[i] { + return rune(unscapeMap[i+1]), str } - if byte(c) < UNESCAPE_MAP[i] { + if byte(c) < unscapeMap[i] { break } } diff --git a/go/mysql/icuregex/internal/pattern/utils.go b/go/mysql/icuregex/internal/pattern/utils.go index 2113a2cdcf3..4dcf55e9f42 100644 --- a/go/mysql/icuregex/internal/pattern/utils.go +++ b/go/mysql/icuregex/internal/pattern/utils.go @@ -86,7 +86,7 @@ func IsUnprintable(c rune) bool { } // "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -var DIGITS = [...]byte{ +var digits = [...]byte{ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, @@ -97,15 +97,15 @@ func EscapeUnprintable(w *strings.Builder, c rune) { w.WriteByte('\\') if (c & ^0xFFFF) != 0 { w.WriteByte('U') - w.WriteByte(DIGITS[0xF&(c>>28)]) - w.WriteByte(DIGITS[0xF&(c>>24)]) - w.WriteByte(DIGITS[0xF&(c>>20)]) - w.WriteByte(DIGITS[0xF&(c>>16)]) + w.WriteByte(digits[0xF&(c>>28)]) + w.WriteByte(digits[0xF&(c>>24)]) + w.WriteByte(digits[0xF&(c>>20)]) + w.WriteByte(digits[0xF&(c>>16)]) } else { w.WriteByte('u') } - w.WriteByte(DIGITS[0xF&(c>>12)]) - w.WriteByte(DIGITS[0xF&(c>>8)]) - w.WriteByte(DIGITS[0xF&(c>>4)]) - w.WriteByte(DIGITS[0xF&c]) + w.WriteByte(digits[0xF&(c>>12)]) + w.WriteByte(digits[0xF&(c>>8)]) + w.WriteByte(digits[0xF&(c>>4)]) + w.WriteByte(digits[0xF&c]) } diff --git a/go/mysql/icuregex/internal/ubidi/ubidi.go b/go/mysql/icuregex/internal/ubidi/ubidi.go index b8c67d75368..195e2b1a6dd 100644 --- 
a/go/mysql/icuregex/internal/ubidi/ubidi.go +++ b/go/mysql/icuregex/internal/ubidi/ubidi.go @@ -22,7 +22,7 @@ limitations under the License. package ubidi import ( - "fmt" + "errors" "vitess.io/vitess/go/mysql/icuregex/internal/icudata" "vitess.io/vitess/go/mysql/icuregex/internal/udata" @@ -30,18 +30,18 @@ import ( ) const ( - UBIDI_IX_INDEX_TOP = iota - UBIDI_IX_LENGTH - UBIDI_IX_TRIE_SIZE - UBIDI_IX_MIRROR_LENGTH - - UBIDI_IX_JG_START - UBIDI_IX_JG_LIMIT - UBIDI_IX_JG_START2 /* new in format version 2.2, ICU 54 */ - UBIDI_IX_JG_LIMIT2 - - UBIDI_MAX_VALUES_INDEX - UBIDI_IX_TOP + ixIndexTop = iota + ixLength + ixTrieSize + ixMirrorLength + + ixJgStart + ixJgLimit + ixJgStart2 /* new in format version 2.2, ICU 54 */ + ixJgLimit2 + + maxValuesIndex + ixTop ) var ubidi struct { @@ -65,8 +65,8 @@ func readData(bytes *udata.Bytes) error { } count := int32(bytes.Uint32()) - if count < UBIDI_IX_TOP { - return fmt.Errorf("indexes[0] too small in ucase.icu") + if count < ixTop { + return errors.New("indexes[0] too small in ucase.icu") } ubidi.indexes = make([]int32, count) @@ -81,22 +81,22 @@ func readData(bytes *udata.Bytes) error { return err } - expectedTrieLength := ubidi.indexes[UBIDI_IX_TRIE_SIZE] + expectedTrieLength := ubidi.indexes[ixTrieSize] trieLength := ubidi.trie.SerializedLength() if trieLength > expectedTrieLength { - return fmt.Errorf("ucase.icu: not enough bytes for the trie") + return errors.New("ucase.icu: not enough bytes for the trie") } bytes.Skip(expectedTrieLength - trieLength) - if n := ubidi.indexes[UBIDI_IX_MIRROR_LENGTH]; n > 0 { + if n := ubidi.indexes[ixMirrorLength]; n > 0 { ubidi.mirrors = bytes.Uint32Slice(n) } - if n := ubidi.indexes[UBIDI_IX_JG_LIMIT] - ubidi.indexes[UBIDI_IX_JG_START]; n > 0 { + if n := ubidi.indexes[ixJgLimit] - ubidi.indexes[ixJgStart]; n > 0 { ubidi.jg = bytes.Uint8Slice(n) } - if n := ubidi.indexes[UBIDI_IX_JG_LIMIT2] - ubidi.indexes[UBIDI_IX_JG_START2]; n > 0 { + if n := ubidi.indexes[ixJgLimit2] - 
ubidi.indexes[ixJgStart2]; n > 0 { ubidi.jg2 = bytes.Uint8Slice(n) } @@ -112,17 +112,14 @@ func init() { const ( /* UBIDI_CLASS_SHIFT=0, */ /* bidi class: 5 bits (4..0) */ - UBIDI_JT_SHIFT = 5 /* joining type: 3 bits (7..5) */ + jtShift = 5 /* joining type: 3 bits (7..5) */ - UBIDI_BPT_SHIFT = 8 /* Bidi_Paired_Bracket_Type(bpt): 2 bits (9..8) */ + bptShift = 8 /* Bidi_Paired_Bracket_Type(bpt): 2 bits (9..8) */ - UBIDI_JOIN_CONTROL_SHIFT = 10 - UBIDI_BIDI_CONTROL_SHIFT = 11 + joinControlShift = 10 + bidiControlShift = 11 - UBIDI_IS_MIRRORED_SHIFT = 12 /* 'is mirrored' */ - UBIDI_MIRROR_DELTA_SHIFT = 13 /* bidi mirroring delta: 3 bits (15..13) */ - - UBIDI_MAX_JG_SHIFT = 16 /* max JG value in indexes[UBIDI_MAX_VALUES_INDEX] bits 23..16 */ + isMirroredShift = 12 /* 'is mirrored' */ ) /** @@ -131,7 +128,7 @@ const ( * @see UCHAR_BIDI_PAIRED_BRACKET_TYPE * @stable ICU 52 */ -type UBidiPairedBracketType int32 +type UPairedBracketType int32 /* * Note: UBidiPairedBracketType constants are parsed by preparseucd.py. @@ -140,16 +137,16 @@ type UBidiPairedBracketType int32 */ const ( /** Not a paired bracket. @stable ICU 52 */ - U_BPT_NONE = iota + BptNone UPairedBracketType = iota /** Open paired bracket. @stable ICU 52 */ - U_BPT_OPEN + BptOpen /** Close paired bracket. @stable ICU 52 */ - U_BPT_CLOSE + BptClose ) -const UBIDI_CLASS_MASK = 0x0000001f -const UBIDI_JT_MASK = 0x000000e0 -const UBIDI_BPT_MASK = 0x00000300 +const classMask = 0x0000001f +const jtMask = 0x000000e0 +const bptMask = 0x00000300 /** * Joining Type constants. @@ -157,7 +154,7 @@ const UBIDI_BPT_MASK = 0x00000300 * @see UCHAR_JOINING_TYPE * @stable ICU 2.2 */ -type UJoiningType int32 +type JoiningType int32 /* * Note: UJoiningType constants are parsed by preparseucd.py. 
@@ -165,12 +162,12 @@ type UJoiningType int32 * U_JT_ */ const ( - U_JT_NON_JOINING UJoiningType = iota /*[U]*/ - U_JT_JOIN_CAUSING /*[C]*/ - U_JT_DUAL_JOINING /*[D]*/ - U_JT_LEFT_JOINING /*[L]*/ - U_JT_RIGHT_JOINING /*[R]*/ - U_JT_TRANSPARENT /*[T]*/ + JtNonJoining JoiningType = iota /*[U]*/ + JtJoinCausing /*[C]*/ + JtDualJoining /*[D]*/ + JtLeftJoining /*[L]*/ + JtRightJoining /*[R]*/ + JtTransparent /*[T]*/ ) /** @@ -179,7 +176,7 @@ const ( * @see UCHAR_JOINING_GROUP * @stable ICU 2.2 */ -type UJoiningGroup int32 +type JoiningGroup int32 /* * Note: UJoiningGroup constants are parsed by preparseucd.py. @@ -187,120 +184,120 @@ type UJoiningGroup int32 * U_JG_ */ const ( - U_JG_NO_JOINING_GROUP UJoiningGroup = iota - U_JG_AIN - U_JG_ALAPH - U_JG_ALEF - U_JG_BEH - U_JG_BETH - U_JG_DAL - U_JG_DALATH_RISH - U_JG_E - U_JG_FEH - U_JG_FINAL_SEMKATH - U_JG_GAF - U_JG_GAMAL - U_JG_HAH - U_JG_TEH_MARBUTA_GOAL /**< @stable ICU 4.6 */ - U_JG_HE - U_JG_HEH - U_JG_HEH_GOAL - U_JG_HETH - U_JG_KAF - U_JG_KAPH - U_JG_KNOTTED_HEH - U_JG_LAM - U_JG_LAMADH - U_JG_MEEM - U_JG_MIM - U_JG_NOON - U_JG_NUN - U_JG_PE - U_JG_QAF - U_JG_QAPH - U_JG_REH - U_JG_REVERSED_PE - U_JG_SAD - U_JG_SADHE - U_JG_SEEN - U_JG_SEMKATH - U_JG_SHIN - U_JG_SWASH_KAF - U_JG_SYRIAC_WAW - U_JG_TAH - U_JG_TAW - U_JG_TEH_MARBUTA - U_JG_TETH - U_JG_WAW - U_JG_YEH - U_JG_YEH_BARREE - U_JG_YEH_WITH_TAIL - U_JG_YUDH - U_JG_YUDH_HE - U_JG_ZAIN - U_JG_FE /**< @stable ICU 2.6 */ - U_JG_KHAPH /**< @stable ICU 2.6 */ - U_JG_ZHAIN /**< @stable ICU 2.6 */ - U_JG_BURUSHASKI_YEH_BARREE /**< @stable ICU 4.0 */ - U_JG_FARSI_YEH /**< @stable ICU 4.4 */ - U_JG_NYA /**< @stable ICU 4.4 */ - U_JG_ROHINGYA_YEH /**< @stable ICU 49 */ - U_JG_MANICHAEAN_ALEPH /**< @stable ICU 54 */ - U_JG_MANICHAEAN_AYIN /**< @stable ICU 54 */ - U_JG_MANICHAEAN_BETH /**< @stable ICU 54 */ - U_JG_MANICHAEAN_DALETH /**< @stable ICU 54 */ - U_JG_MANICHAEAN_DHAMEDH /**< @stable ICU 54 */ - U_JG_MANICHAEAN_FIVE /**< @stable ICU 54 */ - 
U_JG_MANICHAEAN_GIMEL /**< @stable ICU 54 */ - U_JG_MANICHAEAN_HETH /**< @stable ICU 54 */ - U_JG_MANICHAEAN_HUNDRED /**< @stable ICU 54 */ - U_JG_MANICHAEAN_KAPH /**< @stable ICU 54 */ - U_JG_MANICHAEAN_LAMEDH /**< @stable ICU 54 */ - U_JG_MANICHAEAN_MEM /**< @stable ICU 54 */ - U_JG_MANICHAEAN_NUN /**< @stable ICU 54 */ - U_JG_MANICHAEAN_ONE /**< @stable ICU 54 */ - U_JG_MANICHAEAN_PE /**< @stable ICU 54 */ - U_JG_MANICHAEAN_QOPH /**< @stable ICU 54 */ - U_JG_MANICHAEAN_RESH /**< @stable ICU 54 */ - U_JG_MANICHAEAN_SADHE /**< @stable ICU 54 */ - U_JG_MANICHAEAN_SAMEKH /**< @stable ICU 54 */ - U_JG_MANICHAEAN_TAW /**< @stable ICU 54 */ - U_JG_MANICHAEAN_TEN /**< @stable ICU 54 */ - U_JG_MANICHAEAN_TETH /**< @stable ICU 54 */ - U_JG_MANICHAEAN_THAMEDH /**< @stable ICU 54 */ - U_JG_MANICHAEAN_TWENTY /**< @stable ICU 54 */ - U_JG_MANICHAEAN_WAW /**< @stable ICU 54 */ - U_JG_MANICHAEAN_YODH /**< @stable ICU 54 */ - U_JG_MANICHAEAN_ZAYIN /**< @stable ICU 54 */ - U_JG_STRAIGHT_WAW /**< @stable ICU 54 */ - U_JG_AFRICAN_FEH /**< @stable ICU 58 */ - U_JG_AFRICAN_NOON /**< @stable ICU 58 */ - U_JG_AFRICAN_QAF /**< @stable ICU 58 */ - - U_JG_MALAYALAM_BHA /**< @stable ICU 60 */ - U_JG_MALAYALAM_JA /**< @stable ICU 60 */ - U_JG_MALAYALAM_LLA /**< @stable ICU 60 */ - U_JG_MALAYALAM_LLLA /**< @stable ICU 60 */ - U_JG_MALAYALAM_NGA /**< @stable ICU 60 */ - U_JG_MALAYALAM_NNA /**< @stable ICU 60 */ - U_JG_MALAYALAM_NNNA /**< @stable ICU 60 */ - U_JG_MALAYALAM_NYA /**< @stable ICU 60 */ - U_JG_MALAYALAM_RA /**< @stable ICU 60 */ - U_JG_MALAYALAM_SSA /**< @stable ICU 60 */ - U_JG_MALAYALAM_TTA /**< @stable ICU 60 */ - - U_JG_HANIFI_ROHINGYA_KINNA_YA /**< @stable ICU 62 */ - U_JG_HANIFI_ROHINGYA_PA /**< @stable ICU 62 */ - - U_JG_THIN_YEH /**< @stable ICU 70 */ - U_JG_VERTICAL_TAIL /**< @stable ICU 70 */ + JgNoJoiningGroup JoiningGroup = iota + JgAin + JgAlaph + JgAlef + JgBeh + JgBeth + JgDal + JgDalathRish + JgE + JgFeh + JgFinalSemkath + JgGaf + JgGamal + JgHah + JgTehMarbutaGoal 
/**< @stable ICU 4.6 */ + JgHe + JgHeh + JgHehGoal + JgHeth + JgKaf + JgKaph + JgKnottedHeh + JgLam + JgLamadh + JgMeem + JgMim + JgNoon + JgNun + JgPe + JgQaf + JgQaph + JgReh + JgReversedPe + JgSad + JgSadhe + JgSeen + JgSemkath + JgShin + JgSwashKaf + JgSyriacWaw + JgTah + JgTaw + JgTehMarbuta + JgTeth + JgWaw + JgYeh + JgYehBarree + JgYehWithTail + JgYudh + JgYudhHe + JgZain + JgFe /**< @stable ICU 2.6 */ + JgKhaph /**< @stable ICU 2.6 */ + JgZhain /**< @stable ICU 2.6 */ + JgBurushashkiYehBarree /**< @stable ICU 4.0 */ + JgFarsiYeh /**< @stable ICU 4.4 */ + JgNya /**< @stable ICU 4.4 */ + JgRohingyaYeh /**< @stable ICU 49 */ + JgManichaeanAleph /**< @stable ICU 54 */ + JgManichaeanAyin /**< @stable ICU 54 */ + JgManichaeanBeth /**< @stable ICU 54 */ + JgManichaeanDaleth /**< @stable ICU 54 */ + JgManichaeanDhamedh /**< @stable ICU 54 */ + JgManichaeanFive /**< @stable ICU 54 */ + JgManichaeanGimel /**< @stable ICU 54 */ + JgManichaeanHeth /**< @stable ICU 54 */ + JgManichaeanHundred /**< @stable ICU 54 */ + JgManichaeanKaph /**< @stable ICU 54 */ + JgManichaeanLamedh /**< @stable ICU 54 */ + JgManichaeanMem /**< @stable ICU 54 */ + JgManichaeanNun /**< @stable ICU 54 */ + JgManichaeanOne /**< @stable ICU 54 */ + JgManichaeanPe /**< @stable ICU 54 */ + JgManichaeanQoph /**< @stable ICU 54 */ + JgManichaeanResh /**< @stable ICU 54 */ + JgManichaeanSadhe /**< @stable ICU 54 */ + JgManichaeanSamekh /**< @stable ICU 54 */ + JgManichaeanTaw /**< @stable ICU 54 */ + JgManichaeanTen /**< @stable ICU 54 */ + JgManichaeanTeth /**< @stable ICU 54 */ + JgManichaeanThamedh /**< @stable ICU 54 */ + JgManichaeanTwenty /**< @stable ICU 54 */ + JgManichaeanWaw /**< @stable ICU 54 */ + JgManichaeanYodh /**< @stable ICU 54 */ + JgManichaeanZayin /**< @stable ICU 54 */ + JgStraightWaw /**< @stable ICU 54 */ + JgAfricanFeh /**< @stable ICU 58 */ + JgAfricanNoon /**< @stable ICU 58 */ + JgAfricanQaf /**< @stable ICU 58 */ + + JgMalayalamBha /**< @stable ICU 60 */ + JgMalayalamJa 
/**< @stable ICU 60 */ + JgMalayalamLla /**< @stable ICU 60 */ + JgMalayalamLlla /**< @stable ICU 60 */ + JgMalayalamNga /**< @stable ICU 60 */ + JgMalayalamNna /**< @stable ICU 60 */ + JgMalayalamNnna /**< @stable ICU 60 */ + JgMalayalamNya /**< @stable ICU 60 */ + JgMalayalamRa /**< @stable ICU 60 */ + JgMalayalamSsa /**< @stable ICU 60 */ + JgMalayalamTta /**< @stable ICU 60 */ + + JgHanafiRohingyaKinnaYa /**< @stable ICU 62 */ + JgHanafiRohingyaPa /**< @stable ICU 62 */ + + JgThinYeh /**< @stable ICU 70 */ + JgVerticalTail /**< @stable ICU 70 */ ) /** * This specifies the language directional property of a character set. * @stable ICU 2.0 */ -type UCharDirection int32 +type CharDirection int32 /* * Note: UCharDirection constants and their API comments are parsed by preparseucd.py. @@ -311,59 +308,59 @@ type UCharDirection int32 const ( /** L @stable ICU 2.0 */ - U_LEFT_TO_RIGHT UCharDirection = 0 + LeftToRight CharDirection = 0 /** R @stable ICU 2.0 */ - U_RIGHT_TO_LEFT UCharDirection = 1 + RightToLeft CharDirection = 1 /** EN @stable ICU 2.0 */ - U_EUROPEAN_NUMBER UCharDirection = 2 + EuropeanNumber CharDirection = 2 /** ES @stable ICU 2.0 */ - U_EUROPEAN_NUMBER_SEPARATOR UCharDirection = 3 + EuropeanNumberSeparator CharDirection = 3 /** ET @stable ICU 2.0 */ - U_EUROPEAN_NUMBER_TERMINATOR UCharDirection = 4 + EuropeanNumberTerminator CharDirection = 4 /** AN @stable ICU 2.0 */ - U_ARABIC_NUMBER UCharDirection = 5 + ArabicNumber CharDirection = 5 /** CS @stable ICU 2.0 */ - U_COMMON_NUMBER_SEPARATOR UCharDirection = 6 + CommonNumberSeparator CharDirection = 6 /** B @stable ICU 2.0 */ - U_BLOCK_SEPARATOR UCharDirection = 7 + BlockSeparator CharDirection = 7 /** S @stable ICU 2.0 */ - U_SEGMENT_SEPARATOR UCharDirection = 8 + SegmentSeparator CharDirection = 8 /** WS @stable ICU 2.0 */ - U_WHITE_SPACE_NEUTRAL UCharDirection = 9 + WhiteSpaceNeutral CharDirection = 9 /** ON @stable ICU 2.0 */ - U_OTHER_NEUTRAL UCharDirection = 10 + OtherNeutral CharDirection = 10 
/** LRE @stable ICU 2.0 */ - U_LEFT_TO_RIGHT_EMBEDDING UCharDirection = 11 + LeftToRightEmbedding CharDirection = 11 /** LRO @stable ICU 2.0 */ - U_LEFT_TO_RIGHT_OVERRIDE UCharDirection = 12 + LeftToRightOverride CharDirection = 12 /** AL @stable ICU 2.0 */ - U_RIGHT_TO_LEFT_ARABIC UCharDirection = 13 + RightToLeftArabic CharDirection = 13 /** RLE @stable ICU 2.0 */ - U_RIGHT_TO_LEFT_EMBEDDING UCharDirection = 14 + RightToLeftEmbedding CharDirection = 14 /** RLO @stable ICU 2.0 */ - U_RIGHT_TO_LEFT_OVERRIDE UCharDirection = 15 + RightToLeftOverride CharDirection = 15 /** PDF @stable ICU 2.0 */ - U_POP_DIRECTIONAL_FORMAT UCharDirection = 16 + PopDirectionalFormat CharDirection = 16 /** NSM @stable ICU 2.0 */ - U_DIR_NON_SPACING_MARK UCharDirection = 17 + DirNonSpacingMark CharDirection = 17 /** BN @stable ICU 2.0 */ - U_BOUNDARY_NEUTRAL UCharDirection = 18 + BoundaryNeutral CharDirection = 18 /** FSI @stable ICU 52 */ - U_FIRST_STRONG_ISOLATE UCharDirection = 19 + StrongIsolate CharDirection = 19 /** LRI @stable ICU 52 */ - U_LEFT_TO_RIGHT_ISOLATE UCharDirection = 20 + LeftToRightIsolate CharDirection = 20 /** RLI @stable ICU 52 */ - U_RIGHT_TO_LEFT_ISOLATE UCharDirection = 21 + RightToLeftIsolate CharDirection = 21 /** PDI @stable ICU 52 */ - U_POP_DIRECTIONAL_ISOLATE UCharDirection = 22 + PopDirectionalIsolate CharDirection = 22 ) -type PropertySet interface { +type propertySet interface { AddRune(ch rune) AddRuneRange(from rune, to rune) } -func AddPropertyStarts(sa PropertySet) { +func AddPropertyStarts(sa propertySet) { /* add the start code point of each same-value range of the trie */ ubidi.trie.Enum(nil, func(start, _ rune, _ uint32) bool { sa.AddRune(start) @@ -371,15 +368,15 @@ func AddPropertyStarts(sa PropertySet) { }) /* add the code points from the bidi mirroring table */ - length := ubidi.indexes[UBIDI_IX_MIRROR_LENGTH] + length := ubidi.indexes[ixMirrorLength] for i := int32(0); i < length; i++ { c := mirrorCodePoint(rune(ubidi.mirrors[i])) 
sa.AddRuneRange(c, c+1) } /* add the code points from the Joining_Group array where the value changes */ - start := ubidi.indexes[UBIDI_IX_JG_START] - limit := ubidi.indexes[UBIDI_IX_JG_LIMIT] + start := ubidi.indexes[ixJgStart] + limit := ubidi.indexes[ixJgLimit] jgArray := ubidi.jg[:] for { prev := uint8(0) @@ -396,10 +393,10 @@ func AddPropertyStarts(sa PropertySet) { /* add the limit code point if the last value was not 0 (it is now start==limit) */ sa.AddRune(limit) } - if limit == ubidi.indexes[UBIDI_IX_JG_LIMIT] { + if limit == ubidi.indexes[ixJgLimit] { /* switch to the second Joining_Group range */ - start = ubidi.indexes[UBIDI_IX_JG_START2] - limit = ubidi.indexes[UBIDI_IX_JG_LIMIT2] + start = ubidi.indexes[ixJgStart2] + limit = ubidi.indexes[ixJgLimit2] jgArray = ubidi.jg2[:] } else { break @@ -421,44 +418,44 @@ func mirrorCodePoint(m rune) rune { func IsJoinControl(c rune) bool { props := ubidi.trie.Get16(c) - return HasFlag(props, UBIDI_JOIN_CONTROL_SHIFT) + return HasFlag(props, joinControlShift) } -func JoiningType(c rune) UJoiningType { +func JoinType(c rune) JoiningType { props := ubidi.trie.Get16(c) - return UJoiningType((props & UBIDI_JT_MASK) >> UBIDI_JT_SHIFT) + return JoiningType((props & jtMask) >> jtShift) } -func JoiningGroup(c rune) UJoiningGroup { - start := ubidi.indexes[UBIDI_IX_JG_START] - limit := ubidi.indexes[UBIDI_IX_JG_LIMIT] +func JoinGroup(c rune) JoiningGroup { + start := ubidi.indexes[ixJgStart] + limit := ubidi.indexes[ixJgLimit] if start <= c && c < limit { - return UJoiningGroup(ubidi.jg[c-start]) + return JoiningGroup(ubidi.jg[c-start]) } - start = ubidi.indexes[UBIDI_IX_JG_START2] - limit = ubidi.indexes[UBIDI_IX_JG_LIMIT2] + start = ubidi.indexes[ixJgStart2] + limit = ubidi.indexes[ixJgLimit2] if start <= c && c < limit { - return UJoiningGroup(ubidi.jg2[c-start]) + return JoiningGroup(ubidi.jg2[c-start]) } - return U_JG_NO_JOINING_GROUP + return JgNoJoiningGroup } func IsMirrored(c rune) bool { props := 
ubidi.trie.Get16(c) - return HasFlag(props, UBIDI_IS_MIRRORED_SHIFT) + return HasFlag(props, isMirroredShift) } func IsBidiControl(c rune) bool { props := ubidi.trie.Get16(c) - return HasFlag(props, UBIDI_BIDI_CONTROL_SHIFT) + return HasFlag(props, bidiControlShift) } -func PairedBracketType(c rune) UBidiPairedBracketType { +func PairedBracketType(c rune) UPairedBracketType { props := ubidi.trie.Get16(c) - return UBidiPairedBracketType((props & UBIDI_BPT_MASK) >> UBIDI_BPT_SHIFT) + return UPairedBracketType((props & bptMask) >> bptShift) } -func Class(c rune) UCharDirection { +func Class(c rune) CharDirection { props := ubidi.trie.Get16(c) - return UCharDirection(props & UBIDI_CLASS_MASK) + return CharDirection(props & classMask) } diff --git a/go/mysql/icuregex/internal/ucase/fold.go b/go/mysql/icuregex/internal/ucase/fold.go index bb10ba8cb35..88d4f026c65 100644 --- a/go/mysql/icuregex/internal/ucase/fold.go +++ b/go/mysql/icuregex/internal/ucase/fold.go @@ -92,7 +92,7 @@ func Fold(c rune) rune { pe := getExceptions(props) excWord := pe[0] pe = pe[1:] - if (excWord & UCASE_EXC_CONDITIONAL_FOLD) != 0 { + if (excWord & excConditionalFold) != 0 { /* special case folding mappings, hardcoded */ /* default mappings */ if c == 0x49 { @@ -103,23 +103,23 @@ func Fold(c rune) rune { return c } } - if (excWord & UCASE_EXC_NO_SIMPLE_CASE_FOLDING) != 0 { + if (excWord & excNoSimpleCaseFolding) != 0 { return c } - if hasSlot(excWord, UCASE_EXC_DELTA) && isUpperOrTitle(props) { + if hasSlot(excWord, excDelta) && isUpperOrTitle(props) { var delta int32 - delta, _ = getSlotValue(excWord, UCASE_EXC_DELTA, pe) - if excWord&UCASE_EXC_DELTA_IS_NEGATIVE == 0 { + delta, _ = getSlotValue(excWord, excDelta, pe) + if excWord&excDeltaIsNegative == 0 { return c + delta } return c - delta } var idx int32 - if hasSlot(excWord, UCASE_EXC_FOLD) { - idx = UCASE_EXC_FOLD - } else if hasSlot(excWord, UCASE_EXC_LOWER) { - idx = UCASE_EXC_LOWER + if hasSlot(excWord, excFold) { + idx = excFold + } 
else if hasSlot(excWord, excLower) { + idx = excLower } else { return c } @@ -144,7 +144,7 @@ func FullFolding(c rune) (rune, []uint16) { pe = pe[1:] var idx int32 - if excWord&UCASE_EXC_CONDITIONAL_FOLD != 0 { + if excWord&excConditionalFold != 0 { /* use hardcoded conditions and mappings */ /* default mappings */ if c == 0x49 { @@ -154,14 +154,14 @@ func FullFolding(c rune) (rune, []uint16) { /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ return -1, []uint16{0x69, 0x307} } - } else if hasSlot(excWord, UCASE_EXC_FULL_MAPPINGS) { - full, pe := getSlotValue(excWord, UCASE_EXC_FULL_MAPPINGS, pe) + } else if hasSlot(excWord, excFullMappings) { + full, pe := getSlotValue(excWord, excFullMappings, pe) /* start of full case mapping strings */ pe = pe[1:] /* skip the lowercase result string */ - pe = pe[full&UCASE_FULL_LOWER:] + pe = pe[full&fullLower:] full = (full >> 4) & 0xf if full != 0 { @@ -170,20 +170,20 @@ func FullFolding(c rune) (rune, []uint16) { } } - if excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING != 0 { + if excWord&excNoSimpleCaseFolding != 0 { return result, nil } - if hasSlot(excWord, UCASE_EXC_DELTA) && isUpperOrTitle(props) { - delta, _ := getSlotValue(excWord, UCASE_EXC_DELTA, pe) - if excWord&UCASE_EXC_DELTA_IS_NEGATIVE == 0 { + if hasSlot(excWord, excDelta) && isUpperOrTitle(props) { + delta, _ := getSlotValue(excWord, excDelta, pe) + if excWord&excDeltaIsNegative == 0 { return c + delta, nil } return c - delta, nil } - if hasSlot(excWord, UCASE_EXC_FOLD) { - idx = UCASE_EXC_FOLD - } else if hasSlot(excWord, UCASE_EXC_LOWER) { - idx = UCASE_EXC_LOWER + if hasSlot(excWord, excFold) { + idx = excFold + } else if hasSlot(excWord, excLower) { + idx = excLower } else { return c, nil } @@ -192,26 +192,25 @@ func FullFolding(c rune) (rune, []uint16) { } const ( - UCASE_EXC_LOWER = iota - UCASE_EXC_FOLD - UCASE_EXC_UPPER - UCASE_EXC_TITLE - UCASE_EXC_DELTA - UCASE_EXC_5 /* reserved */ - UCASE_EXC_CLOSURE - UCASE_EXC_FULL_MAPPINGS - 
UCASE_EXC_ALL_SLOTS /* one past the last slot */ + excLower = iota + excFold + excUpper + excTitle + excDelta + exc5 /* reserved */ + excClosure + excFullMappings ) const ( /* complex/conditional mappings */ - UCASE_EXC_CONDITIONAL_SPECIAL = 0x4000 - UCASE_EXC_CONDITIONAL_FOLD = 0x8000 - UCASE_EXC_NO_SIMPLE_CASE_FOLDING = 0x200 - UCASE_EXC_DELTA_IS_NEGATIVE = 0x400 - UCASE_EXC_SENSITIVE = 0x800 + excConditionalSpecial = 0x4000 + excConditionalFold = 0x8000 + excNoSimpleCaseFolding = 0x200 + excDeltaIsNegative = 0x400 + excSensitive = 0x800 - UCASE_EXC_DOUBLE_SLOTS = 0x100 + excDoubleSlots = 0x100 ) func isUpperOrTitle(props uint16) bool { @@ -235,7 +234,7 @@ func slotOffset(flags uint16, idx int32) int { } func getSlotValue(excWord uint16, idx int32, pExc16 []uint16) (int32, []uint16) { - if excWord&UCASE_EXC_DOUBLE_SLOTS == 0 { + if excWord&excDoubleSlots == 0 { pExc16 = pExc16[slotOffset(excWord, idx):] return int32(pExc16[0]), pExc16 } diff --git a/go/mysql/icuregex/internal/ucase/ucase.go b/go/mysql/icuregex/internal/ucase/ucase.go index 1542745390d..9fb8407ea66 100644 --- a/go/mysql/icuregex/internal/ucase/ucase.go +++ b/go/mysql/icuregex/internal/ucase/ucase.go @@ -22,7 +22,7 @@ limitations under the License. 
package ucase import ( - "fmt" + "errors" "vitess.io/vitess/go/mysql/icuregex/internal/icudata" "vitess.io/vitess/go/mysql/icuregex/internal/udata" @@ -36,17 +36,17 @@ var ucase struct { unfold []uint16 } -func readData(bytes *udata.Bytes) error { - const ( - IX_INDEX_TOP = 0 - IX_LENGTH = 1 - IX_TRIE_SIZE = 2 - IX_EXC_LENGTH = 3 - IX_UNFOLD_LENGTH = 4 - IX_MAX_FULL_LENGTH = 15 - IX_TOP = 16 - ) +const ( + ixIndexTop = 0 + ixLength = 1 + ixTrieSize = 2 + ixExcLength = 3 + ixUnfoldLength = 4 + ixMaxFullLength = 15 + ixTop = 16 +) +func readData(bytes *udata.Bytes) error { err := bytes.ReadHeader(func(info *udata.DataInfo) bool { return info.DataFormat[0] == 0x63 && info.DataFormat[1] == 0x41 && @@ -59,8 +59,8 @@ func readData(bytes *udata.Bytes) error { } count := int32(bytes.Uint32()) - if count < IX_TOP { - return fmt.Errorf("indexes[0] too small in ucase.icu") + if count < ixTop { + return errors.New("indexes[0] too small in ucase.icu") } indexes := make([]int32, count) @@ -75,19 +75,19 @@ func readData(bytes *udata.Bytes) error { return err } - expectedTrieLength := indexes[IX_TRIE_SIZE] + expectedTrieLength := indexes[ixTrieSize] trieLength := ucase.trie.SerializedLength() if trieLength > expectedTrieLength { - return fmt.Errorf("ucase.icu: not enough bytes for the trie") + return errors.New("ucase.icu: not enough bytes for the trie") } bytes.Skip(expectedTrieLength - trieLength) - if n := indexes[IX_EXC_LENGTH]; n > 0 { + if n := indexes[ixExcLength]; n > 0 { ucase.exceptions = bytes.Uint16Slice(n) } - if n := indexes[IX_UNFOLD_LENGTH]; n > 0 { + if n := indexes[ixUnfoldLength]; n > 0 { ucase.unfold = bytes.Uint16Slice(n) } @@ -101,11 +101,11 @@ func init() { } } -type PropertySet interface { +type propertySet interface { AddRune(ch rune) } -func AddPropertyStarts(sa PropertySet) { +func AddPropertyStarts(sa propertySet) { /* add the start code point of each same-value range of the trie */ ucase.trie.Enum(nil, func(start, _ rune, _ uint32) bool { 
sa.AddRune(start) @@ -123,16 +123,16 @@ func AddPropertyStarts(sa PropertySet) { } const ( - UCASE_FULL_MAPPINGS_MAX_LENGTH = (4 * 0xf) - UCASE_CLOSURE_MAX_LENGTH = 0xf + fullMappingsMaxLength = (4 * 0xf) + closureMaxLength = 0xf - UCASE_FULL_LOWER = 0xf - UCASE_FULL_FOLDING = 0xf0 - UCASE_FULL_UPPER = 0xf00 - UCASE_FULL_TITLE = 0xf000 + fullLower = 0xf + fullFolding = 0xf0 + fullUpper = 0xf00 + fullTitle = 0xf000 ) -func AddCaseClosure(c rune, sa PropertySet) { +func AddCaseClosure(c rune, sa propertySet) { /* * Hardcode the case closure of i and its relatives and ignore the * data file data for these characters. @@ -164,7 +164,7 @@ func AddCaseClosure(c rune, sa PropertySet) { props := ucase.trie.Get16(c) if !hasException(props) { - if getPropsType(props) != UCASE_NONE { + if getPropsType(props) != None { /* add the one simple case mapping, no matter what type it is */ delta := getDelta(props) if delta != 0 { @@ -183,15 +183,15 @@ func AddCaseClosure(c rune, sa PropertySet) { var closure []uint16 /* add all simple case mappings */ - for idx = UCASE_EXC_LOWER; idx <= UCASE_EXC_TITLE; idx++ { + for idx = excLower; idx <= excTitle; idx++ { if hasSlot(excWord, idx) { c, _ = getSlotValue(excWord, idx, pe) sa.AddRune(c) } } - if hasSlot(excWord, UCASE_EXC_DELTA) { - delta, _ := getSlotValue(excWord, UCASE_EXC_DELTA, pe) - if excWord&UCASE_EXC_DELTA_IS_NEGATIVE == 0 { + if hasSlot(excWord, excDelta) { + delta, _ := getSlotValue(excWord, excDelta, pe) + if excWord&excDeltaIsNegative == 0 { sa.AddRune(c + delta) } else { sa.AddRune(c - delta) @@ -199,15 +199,15 @@ func AddCaseClosure(c rune, sa PropertySet) { } /* get the closure string pointer & length */ - if hasSlot(excWord, UCASE_EXC_CLOSURE) { - closureLength, pe1 := getSlotValue(excWord, UCASE_EXC_CLOSURE, pe) - closureLength &= UCASE_CLOSURE_MAX_LENGTH /* higher bits are reserved */ - closure = pe1[1 : 1+closureLength] /* behind this slot, unless there are full case mappings */ + if hasSlot(excWord, excClosure) { + 
closureLength, pe1 := getSlotValue(excWord, excClosure, pe) + closureLength &= closureMaxLength /* higher bits are reserved */ + closure = pe1[1 : 1+closureLength] /* behind this slot, unless there are full case mappings */ } /* add the full case folding */ - if hasSlot(excWord, UCASE_EXC_FULL_MAPPINGS) { - fullLength, pe1 := getSlotValue(excWord, UCASE_EXC_FULL_MAPPINGS, pe) + if hasSlot(excWord, excFullMappings) { + fullLength, pe1 := getSlotValue(excWord, excFullMappings, pe) /* start of full case mapping strings */ pe1 = pe1[1:] @@ -215,7 +215,7 @@ func AddCaseClosure(c rune, sa PropertySet) { fullLength &= 0xffff /* bits 16 and higher are reserved */ /* skip the lowercase result string */ - pe1 = pe1[fullLength&UCASE_FULL_LOWER:] + pe1 = pe1[fullLength&fullLower:] fullLength >>= 4 /* skip adding the case folding strings */ @@ -239,50 +239,49 @@ func AddCaseClosure(c rune, sa PropertySet) { } } -const UCASE_DOT_MASK = 0x60 +const dotMask = 0x60 const ( - UCASE_NO_DOT = 0 /* normal characters with cc=0 */ - UCASE_SOFT_DOTTED = 0x20 /* soft-dotted characters with cc=0 */ - UCASE_ABOVE = 0x40 /* "above" accents with cc=230 */ - UCASE_OTHER_ACCENT = 0x60 /* other accent character (0> UCASE_EXC_DOT_SHIFT) & UCASE_DOT_MASK) + return int32((pe[0] >> excDotShift) & dotMask) } func IsCaseSensitive(c rune) bool { props := ucase.trie.Get16(c) if !hasException(props) { - return (props & UCASE_SENSITIVE) != 0 - } else { - pe := getExceptions(props) - return (pe[0] & UCASE_EXC_SENSITIVE) != 0 + return (props & sensitive) != 0 } + pe := getExceptions(props) + return (pe[0] & excSensitive) != 0 } func ToFullLower(c rune) rune { @@ -298,30 +297,30 @@ func ToFullLower(c rune) rune { excWord := pe[0] pe = pe[1:] - if excWord&UCASE_EXC_CONDITIONAL_SPECIAL != 0 { + if excWord&excConditionalSpecial != 0 { /* use hardcoded conditions and mappings */ if c == 0x130 { return 2 } /* no known conditional special case mapping, use a normal mapping */ - } else if hasSlot(excWord, 
UCASE_EXC_FULL_MAPPINGS) { - full, _ := getSlotValue(excWord, UCASE_EXC_FULL_MAPPINGS, pe) - full = full & UCASE_FULL_LOWER + } else if hasSlot(excWord, excFullMappings) { + full, _ := getSlotValue(excWord, excFullMappings, pe) + full = full & fullLower if full != 0 { /* return the string length */ return full } } - if hasSlot(excWord, UCASE_EXC_DELTA) && isUpperOrTitle(props) { - delta, _ := getSlotValue(excWord, UCASE_EXC_DELTA, pe) - if (excWord & UCASE_EXC_DELTA_IS_NEGATIVE) == 0 { + if hasSlot(excWord, excDelta) && isUpperOrTitle(props) { + delta, _ := getSlotValue(excWord, excDelta, pe) + if (excWord & excDeltaIsNegative) == 0 { return c + delta } return c - delta } - if hasSlot(excWord, UCASE_EXC_LOWER) { - result, _ = getSlotValue(excWord, UCASE_EXC_LOWER, pe) + if hasSlot(excWord, excLower) { + result, _ = getSlotValue(excWord, excLower, pe) } } @@ -343,7 +342,7 @@ func toUpperOrTitle(c rune, upperNotTitle bool) rune { result := c props := ucase.trie.Get16(c) if !hasException(props) { - if getPropsType(props) == UCASE_LOWER { + if getPropsType(props) == Lower { result = c + getDelta(props) } } else { @@ -351,13 +350,13 @@ func toUpperOrTitle(c rune, upperNotTitle bool) rune { excWord := pe[0] pe = pe[1:] - if excWord&UCASE_EXC_CONDITIONAL_SPECIAL != 0 { + if excWord&excConditionalSpecial != 0 { if c == 0x0587 { return 2 } /* no known conditional special case mapping, use a normal mapping */ - } else if hasSlot(excWord, UCASE_EXC_FULL_MAPPINGS) { - full, _ := getSlotValue(excWord, UCASE_EXC_FULL_MAPPINGS, pe) + } else if hasSlot(excWord, excFullMappings) { + full, _ := getSlotValue(excWord, excFullMappings, pe) /* skip the lowercase and case-folding result strings */ full >>= 8 @@ -375,19 +374,19 @@ func toUpperOrTitle(c rune, upperNotTitle bool) rune { } } - if hasSlot(excWord, UCASE_EXC_DELTA) && getPropsType(props) == UCASE_LOWER { - delta, _ := getSlotValue(excWord, UCASE_EXC_DELTA, pe) - if (excWord & UCASE_EXC_DELTA_IS_NEGATIVE) == 0 { + if 
hasSlot(excWord, excDelta) && getPropsType(props) == Lower { + delta, _ := getSlotValue(excWord, excDelta, pe) + if (excWord & excDeltaIsNegative) == 0 { return c + delta } return c - delta } var idx int32 - if !upperNotTitle && hasSlot(excWord, UCASE_EXC_TITLE) { - idx = UCASE_EXC_TITLE - } else if hasSlot(excWord, UCASE_EXC_UPPER) { + if !upperNotTitle && hasSlot(excWord, excTitle) { + idx = excTitle + } else if hasSlot(excWord, excUpper) { /* here, titlecase is same as uppercase */ - idx = UCASE_EXC_UPPER + idx = excUpper } else { return ^c } @@ -405,22 +404,22 @@ func GetTypeOrIgnorable(c rune) int32 { return int32(props & 7) } -type UCaseType int32 +type Type int32 const ( - UCASE_NONE UCaseType = iota - UCASE_LOWER - UCASE_UPPER - UCASE_TITLE + None Type = iota + Lower + Upper + Title ) -const UCASE_TYPE_MASK = 3 +const typeMask = 3 -func GetType(c rune) UCaseType { +func GetType(c rune) Type { props := ucase.trie.Get16(c) return getPropsType(props) } -func getPropsType(props uint16) UCaseType { - return UCaseType(props & UCASE_TYPE_MASK) +func getPropsType(props uint16) Type { + return Type(props & typeMask) } diff --git a/go/mysql/icuregex/internal/uchar/constants.go b/go/mysql/icuregex/internal/uchar/constants.go index d1edd706586..1ab96751b5c 100644 --- a/go/mysql/icuregex/internal/uchar/constants.go +++ b/go/mysql/icuregex/internal/uchar/constants.go @@ -23,10 +23,12 @@ package uchar import "golang.org/x/exp/constraints" -func U_MASK[T constraints.Integer](x T) uint32 { +func uMask[T constraints.Integer](x T) uint32 { return 1 << x } +type Category int8 + const ( /* * Note: UCharCategory constants and their API comments are parsed by preparseucd.py. @@ -36,67 +38,67 @@ const ( */ /** Non-category for unassigned and non-character code points. @stable ICU 2.0 */ - U_UNASSIGNED = 0 + Unassigned Category = 0 /** Cn "Other, Not Assigned (no characters in [UnicodeData.txt] have this property)" (same as U_UNASSIGNED!) 
@stable ICU 2.0 */ - U_GENERAL_OTHER_TYPES = 0 + GeneralOtherTypes Category = iota - 1 /** Lu @stable ICU 2.0 */ - U_UPPERCASE_LETTER = 1 + UppercaseLetter /** Ll @stable ICU 2.0 */ - U_LOWERCASE_LETTER = 2 + LowercaseLetter /** Lt @stable ICU 2.0 */ - U_TITLECASE_LETTER = 3 + TitlecaseLetter /** Lm @stable ICU 2.0 */ - U_MODIFIER_LETTER = 4 + ModifierLetter /** Lo @stable ICU 2.0 */ - U_OTHER_LETTER = 5 + OtherLetter /** Mn @stable ICU 2.0 */ - U_NON_SPACING_MARK = 6 + NonSpacingMask /** Me @stable ICU 2.0 */ - U_ENCLOSING_MARK = 7 + EnclosingMark /** Mc @stable ICU 2.0 */ - U_COMBINING_SPACING_MARK = 8 + CombiningSpacingMask /** Nd @stable ICU 2.0 */ - U_DECIMAL_DIGIT_NUMBER = 9 + DecimalDigitNumber /** Nl @stable ICU 2.0 */ - U_LETTER_NUMBER = 10 + LetterNumber /** No @stable ICU 2.0 */ - U_OTHER_NUMBER = 11 + OtherNumber /** Zs @stable ICU 2.0 */ - U_SPACE_SEPARATOR = 12 + SpaceSeparator /** Zl @stable ICU 2.0 */ - U_LINE_SEPARATOR = 13 + LineSeparator /** Zp @stable ICU 2.0 */ - U_PARAGRAPH_SEPARATOR = 14 + ParagraphSeparator /** Cc @stable ICU 2.0 */ - U_CONTROL_CHAR = 15 + ControlChar /** Cf @stable ICU 2.0 */ - U_FORMAT_CHAR = 16 + FormatChar /** Co @stable ICU 2.0 */ - U_PRIVATE_USE_CHAR = 17 + PrivateUseChar /** Cs @stable ICU 2.0 */ - U_SURROGATE = 18 + Surrogate /** Pd @stable ICU 2.0 */ - U_DASH_PUNCTUATION = 19 + DashPunctuation /** Ps @stable ICU 2.0 */ - U_START_PUNCTUATION = 20 + StartPunctuation /** Pe @stable ICU 2.0 */ - U_END_PUNCTUATION = 21 + EndPunctuation /** Pc @stable ICU 2.0 */ - U_CONNECTOR_PUNCTUATION = 22 + ConnectorPunctuation /** Po @stable ICU 2.0 */ - U_OTHER_PUNCTUATION = 23 + OtherPunctuation /** Sm @stable ICU 2.0 */ - U_MATH_SYMBOL = 24 + MathSymbol /** Sc @stable ICU 2.0 */ - U_CURRENCY_SYMBOL = 25 + CurrencySymbol /** Sk @stable ICU 2.0 */ - U_MODIFIER_SYMBOL = 26 + ModifierSymbol /** So @stable ICU 2.0 */ - U_OTHER_SYMBOL = 27 + OtherSymbol /** Pi @stable ICU 2.0 */ - U_INITIAL_PUNCTUATION = 28 + InitialPunctuation /** Pf 
@stable ICU 2.0 */ - U_FINAL_PUNCTUATION = 29 + FinalPunctuation /** * One higher than the last enum UCharCategory constant. * This numeric value is stable (will not change), see @@ -104,135 +106,135 @@ const ( * * @stable ICU 2.0 */ - U_CHAR_CATEGORY_COUNT = 30 + CharCategoryCount ) var ( - U_GC_CN_MASK = U_MASK(U_GENERAL_OTHER_TYPES) + GcCnMask = uMask(GeneralOtherTypes) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_LU_MASK = U_MASK(U_UPPERCASE_LETTER) + GcLuMask = uMask(UppercaseLetter) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_LL_MASK = U_MASK(U_LOWERCASE_LETTER) + GcLlMask = uMask(LowercaseLetter) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_LT_MASK = U_MASK(U_TITLECASE_LETTER) + GcLtMask = uMask(TitlecaseLetter) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_LM_MASK = U_MASK(U_MODIFIER_LETTER) + GcLmMask = uMask(ModifierLetter) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_LO_MASK = U_MASK(U_OTHER_LETTER) + GcLoMask = uMask(OtherLetter) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_MN_MASK = U_MASK(U_NON_SPACING_MARK) + GcMnMask = uMask(NonSpacingMask) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_ME_MASK = U_MASK(U_ENCLOSING_MARK) + GcMeMask = uMask(EnclosingMark) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_MC_MASK = U_MASK(U_COMBINING_SPACING_MARK) + GcMcMask = uMask(CombiningSpacingMask) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_ND_MASK = U_MASK(U_DECIMAL_DIGIT_NUMBER) + GcNdMask = uMask(DecimalDigitNumber) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_NL_MASK = U_MASK(U_LETTER_NUMBER) + GcNlMask = uMask(LetterNumber) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_NO_MASK = U_MASK(U_OTHER_NUMBER) + GcNoMask = uMask(OtherNumber) /** Mask constant for a UCharCategory. 
@stable ICU 2.1 */ - U_GC_ZS_MASK = U_MASK(U_SPACE_SEPARATOR) + GcZsMask = uMask(SpaceSeparator) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_ZL_MASK = U_MASK(U_LINE_SEPARATOR) + GcZlMask = uMask(LineSeparator) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_ZP_MASK = U_MASK(U_PARAGRAPH_SEPARATOR) + GcZpMask = uMask(ParagraphSeparator) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_CC_MASK = U_MASK(U_CONTROL_CHAR) + GcCcMask = uMask(ControlChar) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_CF_MASK = U_MASK(U_FORMAT_CHAR) + GcCfMask = uMask(FormatChar) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_CO_MASK = U_MASK(U_PRIVATE_USE_CHAR) + GcCoMask = uMask(PrivateUseChar) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_CS_MASK = U_MASK(U_SURROGATE) + GcCsMask = uMask(Surrogate) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_PD_MASK = U_MASK(U_DASH_PUNCTUATION) + GcPdMask = uMask(DashPunctuation) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_PS_MASK = U_MASK(U_START_PUNCTUATION) + GcPsMask = uMask(StartPunctuation) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_PE_MASK = U_MASK(U_END_PUNCTUATION) + GcPeMask = uMask(EndPunctuation) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_PC_MASK = U_MASK(U_CONNECTOR_PUNCTUATION) + GcPcMask = uMask(ConnectorPunctuation) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_PO_MASK = U_MASK(U_OTHER_PUNCTUATION) + GcPoMask = uMask(OtherPunctuation) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_SM_MASK = U_MASK(U_MATH_SYMBOL) + GcSmMask = uMask(MathSymbol) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_SC_MASK = U_MASK(U_CURRENCY_SYMBOL) + GcScMask = uMask(CurrencySymbol) /** Mask constant for a UCharCategory. 
@stable ICU 2.1 */ - U_GC_SK_MASK = U_MASK(U_MODIFIER_SYMBOL) + GcSkMask = uMask(ModifierSymbol) /** Mask constant for a UCharCategory. @stable ICU 2.1 */ - U_GC_SO_MASK = U_MASK(U_OTHER_SYMBOL) + GcSoMask = uMask(OtherSymbol) /** Mask constant for multiple UCharCategory bits (L Letters). @stable ICU 2.1 */ - U_GC_L_MASK = (U_GC_LU_MASK | U_GC_LL_MASK | U_GC_LT_MASK | U_GC_LM_MASK | U_GC_LO_MASK) + GcLMask = (GcLuMask | GcLlMask | GcLtMask | GcLmMask | GcLoMask) /** Mask constant for multiple UCharCategory bits (LC Cased Letters). @stable ICU 2.1 */ - U_GC_LC_MASK = (U_GC_LU_MASK | U_GC_LL_MASK | U_GC_LT_MASK) + GcLcMask = (GcLuMask | GcLlMask | GcLtMask) /** Mask constant for multiple UCharCategory bits (M Marks). @stable ICU 2.1 */ - U_GC_M_MASK = (U_GC_MN_MASK | U_GC_ME_MASK | U_GC_MC_MASK) + GcMMask = (GcMnMask | GcMeMask | GcMcMask) /** Mask constant for multiple UCharCategory bits (N Numbers). @stable ICU 2.1 */ - U_GC_N_MASK = (U_GC_ND_MASK | U_GC_NL_MASK | U_GC_NO_MASK) + GcNMask = (GcNdMask | GcNlMask | GcNoMask) /** Mask constant for multiple UCharCategory bits (Z Separators). @stable ICU 2.1 */ - U_GC_Z_MASK = (U_GC_ZS_MASK | U_GC_ZL_MASK | U_GC_ZP_MASK) + GcZMask = (GcZsMask | GcZlMask | GcZpMask) ) -const UPROPS_AGE_SHIFT = 24 -const U_MAX_VERSION_LENGTH = 4 -const U_VERSION_DELIMITER = '.' +const upropsAgeShift = 24 +const maxVersionLength = 4 +const versionDelimiter = '.' -type UVersionInfo [U_MAX_VERSION_LENGTH]uint8 +type UVersionInfo [maxVersionLength]uint8 const ( /** No numeric value. 
*/ - UPROPS_NTV_NONE = 0 + UPropsNtvNone = 0 /** Decimal digits: nv=0..9 */ - UPROPS_NTV_DECIMAL_START = 1 + UPropsNtvDecimalStart = 1 /** Other digits: nv=0..9 */ - UPROPS_NTV_DIGIT_START = 11 + UPropsNtvDigitStart = 11 /** Small integers: nv=0..154 */ - UPROPS_NTV_NUMERIC_START = 21 + UPropsNtvNumericStart = 21 /** Fractions: ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16 */ - UPROPS_NTV_FRACTION_START = 0xb0 + UPropsNtvFractionStart = 0xb0 /** * Large integers: * ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33) * (only one significant decimal digit) */ - UPROPS_NTV_LARGE_START = 0x1e0 + UPropsNtvLargeStart = 0x1e0 /** * Sexagesimal numbers: * ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4) */ - UPROPS_NTV_BASE60_START = 0x300 + UPropsNtvBase60Start = 0x300 /** * Fraction-20 values: * frac20 = ntv-0x324 = 0..0x17 -> 1|3|5|7 / 20|40|80|160|320|640 * numerator: num = 2*(frac20&3)+1 * denominator: den = 20<<(frac20>>2) */ - UPROPS_NTV_FRACTION20_START = UPROPS_NTV_BASE60_START + 36 // 0x300+9*4=0x324 + UPropsNtvFraction20Start = UPropsNtvBase60Start + 36 // 0x300+9*4=0x324 /** * Fraction-32 values: * frac32 = ntv-0x34c = 0..15 -> 1|3|5|7 / 32|64|128|256 * numerator: num = 2*(frac32&3)+1 * denominator: den = 32<<(frac32>>2) */ - UPROPS_NTV_FRACTION32_START = UPROPS_NTV_FRACTION20_START + 24 // 0x324+6*4=0x34c + UPropsNtvFraction32Start = UPropsNtvFraction20Start + 24 // 0x324+6*4=0x34c /** No numeric value (yet). 
*/ - UPROPS_NTV_RESERVED_START = UPROPS_NTV_FRACTION32_START + 16 // 0x34c+4*4=0x35c + UPropsNtvReservedStart = UPropsNtvFraction32Start + 16 // 0x34c+4*4=0x35c - UPROPS_NTV_MAX_SMALL_INT = UPROPS_NTV_FRACTION_START - UPROPS_NTV_NUMERIC_START - 1 + UPropsNtvMaxSmallInt = UPropsNtvFractionStart - UPropsNtvNumericStart - 1 ) -const U_NO_NUMERIC_VALUE = -123456789.0 +const noNumericValue = -123456789.0 diff --git a/go/mysql/icuregex/internal/uchar/uchar.go b/go/mysql/icuregex/internal/uchar/uchar.go index 55fb6100017..a2c758ea1c0 100644 --- a/go/mysql/icuregex/internal/uchar/uchar.go +++ b/go/mysql/icuregex/internal/uchar/uchar.go @@ -22,7 +22,7 @@ limitations under the License. package uchar import ( - "fmt" + "errors" "strconv" "vitess.io/vitess/go/mysql/icuregex/internal/icudata" @@ -73,7 +73,7 @@ func readData(bytes *udata.Bytes) error { trieLength := uprops.trie.SerializedLength() if trieLength > expectedTrieLength { - return fmt.Errorf("ucase.icu: not enough bytes for the trie") + return errors.New("ucase.icu: not enough bytes for the trie") } bytes.Skip(expectedTrieLength - trieLength) @@ -89,7 +89,7 @@ func readData(bytes *udata.Bytes) error { trieLength = uprops.trie2.SerializedLength() if trieLength > expectedTrieLength { - return fmt.Errorf("ucase.icu: not enough bytes for the trie") + return errors.New("ucase.icu: not enough bytes for the trie") } bytes.Skip(expectedTrieLength - trieLength) @@ -121,23 +121,23 @@ func VecAddPropertyStarts(sa PropertySet) { }) } -func AddPropertyStarts(sa PropertySet) { - const ( - TAB = 0x0009 - LF = 0x000a - FF = 0x000c - CR = 0x000d - NBSP = 0x00a0 - CGJ = 0x034f - FIGURESP = 0x2007 - HAIRSP = 0x200a - ZWNJ = 0x200c - ZWJ = 0x200d - RLM = 0x200f - NNBSP = 0x202f - ZWNBSP = 0xfef - ) +const ( + tab = 0x0009 + lf = 0x000a + ff = 0x000c + cr = 0x000d + nbsp = 0x00a0 + cgj = 0x034f + figuresp = 0x2007 + hairsp = 0x200a + zwnj = 0x200c + zwj = 0x200d + rlm = 0x200f + nnbsp = 0x202f + zwnbsp = 0xfef +) +func 
AddPropertyStarts(sa PropertySet) { /* add the start code point of each same-value range of the main trie */ uprops.trie.Enum(nil, func(start, _ rune, _ uint32) bool { sa.AddRune(start) @@ -147,11 +147,11 @@ func AddPropertyStarts(sa PropertySet) { /* add code points with hardcoded properties, plus the ones following them */ /* add for u_isblank() */ - sa.AddRune(TAB) - sa.AddRune(TAB + 1) + sa.AddRune(tab) + sa.AddRune(tab + 1) /* add for IS_THAT_CONTROL_SPACE() */ - sa.AddRune(CR + 1) /* range TAB..CR */ + sa.AddRune(cr + 1) /* range TAB..CR */ sa.AddRune(0x1c) sa.AddRune(0x1f + 1) sa.AddRune(0x85) // NEXT LINE (NEL) @@ -159,20 +159,20 @@ func AddPropertyStarts(sa PropertySet) { /* add for u_isIDIgnorable() what was not added above */ sa.AddRune(0x7f) /* range DEL..NBSP-1, NBSP added below */ - sa.AddRune(HAIRSP) - sa.AddRune(RLM + 1) + sa.AddRune(hairsp) + sa.AddRune(rlm + 1) sa.AddRune(0x206a) // INHIBIT SYMMETRIC SWAPPING sa.AddRune(0x206f + 1) // NOMINAL DIGIT SHAPES - sa.AddRune(ZWNBSP) - sa.AddRune(ZWNBSP + 1) + sa.AddRune(zwnbsp) + sa.AddRune(zwnbsp + 1) /* add no-break spaces for u_isWhitespace() what was not added above */ - sa.AddRune(NBSP) - sa.AddRune(NBSP + 1) - sa.AddRune(FIGURESP) - sa.AddRune(FIGURESP + 1) - sa.AddRune(NNBSP) - sa.AddRune(NNBSP + 1) + sa.AddRune(nbsp) + sa.AddRune(nbsp + 1) + sa.AddRune(figuresp) + sa.AddRune(figuresp + 1) + sa.AddRune(nnbsp) + sa.AddRune(nnbsp + 1) /* add for u_digit() */ sa.AddRune('a') @@ -200,21 +200,21 @@ func AddPropertyStarts(sa PropertySet) { sa.AddRune(0xe0fff + 1) /* add for UCHAR_GRAPHEME_BASE and others */ - sa.AddRune(CGJ) - sa.AddRune(CGJ + 1) + sa.AddRune(cgj) + sa.AddRune(cgj + 1) } -func CharType(c rune) int8 { +func CharType(c rune) Category { props := uprops.trie.Get16(c) - return GET_CATEGORY(props) + return getCategory(props) } func GetProperties(c rune) uint16 { return uprops.trie.Get16(c) } -func GET_CATEGORY(props uint16) int8 { - return int8(props & 0x1f) +func getCategory(props uint16) 
Category { + return Category(props & 0x1f) } func GetUnicodeProperties(c rune, column int) uint32 { @@ -234,18 +234,18 @@ func ScriptExtensions(idx uint32) []uint16 { } func IsDigit(c rune) bool { - return CharType(c) == U_DECIMAL_DIGIT_NUMBER + return CharType(c) == DecimalDigitNumber } func IsPOSIXPrint(c rune) bool { - return CharType(c) == U_SPACE_SEPARATOR || IsGraphPOSIX(c) + return CharType(c) == SpaceSeparator || IsGraphPOSIX(c) } func IsGraphPOSIX(c rune) bool { props := uprops.trie.Get16(c) /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */ /* comparing ==0 returns FALSE for the categories mentioned */ - return U_MASK(GET_CATEGORY(props))&(U_GC_CC_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK) == 0 + return uMask(getCategory(props))&(GcCcMask|GcCsMask|GcCnMask|GcZMask) == 0 } func IsXDigit(c rune) bool { @@ -262,18 +262,18 @@ func IsBlank(c rune) bool { return c == 9 || c == 0x20 /* TAB or SPACE */ } /* Zs */ - return CharType(c) == U_SPACE_SEPARATOR + return CharType(c) == SpaceSeparator } func CharAge(c rune) UVersionInfo { - version := GetUnicodeProperties(c, 0) >> UPROPS_AGE_SHIFT + version := GetUnicodeProperties(c, 0) >> upropsAgeShift return UVersionInfo{uint8(version >> 4), uint8(version & 0xf), 0, 0} } func VersionFromString(str string) (version UVersionInfo) { part := 0 - for len(str) > 0 && part < U_MAX_VERSION_LENGTH { - if str[0] == U_VERSION_DELIMITER { + for len(str) > 0 && part < maxVersionLength { + if str[0] == versionDelimiter { str = str[1:] } str, version[part] = parseInt(str) @@ -291,13 +291,14 @@ func parseInt(str string) (string, uint8) { start := 0 end := 0 +whitespace: for i := 0; i < len(str); i++ { switch str[i] { case ' ', '\f', '\n', '\r', '\t', '\v': start++ continue default: - break + break whitespace } } str = str[start:] @@ -317,33 +318,33 @@ func parseInt(str string) (string, uint8) { return str[end:], uint8(val) } -const UPROPS_NUMERIC_TYPE_VALUE_SHIFT = 6 +const upropsNumericTypeValueShift = 6 func NumericTypeValue(c 
rune) uint16 { props := uprops.trie.Get16(c) - return props >> UPROPS_NUMERIC_TYPE_VALUE_SHIFT + return props >> upropsNumericTypeValueShift } func NumericValue(c rune) float64 { ntv := int32(NumericTypeValue(c)) - if ntv == UPROPS_NTV_NONE { - return U_NO_NUMERIC_VALUE - } else if ntv < UPROPS_NTV_DIGIT_START { + if ntv == UPropsNtvNone { + return noNumericValue + } else if ntv < UPropsNtvDigitStart { /* decimal digit */ - return float64(ntv - UPROPS_NTV_DECIMAL_START) - } else if ntv < UPROPS_NTV_NUMERIC_START { + return float64(ntv - UPropsNtvDecimalStart) + } else if ntv < UPropsNtvNumericStart { /* other digit */ - return float64(ntv - UPROPS_NTV_DIGIT_START) - } else if ntv < UPROPS_NTV_FRACTION_START { + return float64(ntv - UPropsNtvDigitStart) + } else if ntv < UPropsNtvFractionStart { /* small integer */ - return float64(ntv - UPROPS_NTV_NUMERIC_START) - } else if ntv < UPROPS_NTV_LARGE_START { + return float64(ntv - UPropsNtvNumericStart) + } else if ntv < UPropsNtvLargeStart { /* fraction */ numerator := (ntv >> 4) - 12 denominator := (ntv & 0xf) + 1 return float64(numerator) / float64(denominator) - } else if ntv < UPROPS_NTV_BASE60_START { + } else if ntv < UPropsNtvBase60Start { /* large, single-significant-digit integer */ mant := (ntv >> 5) - 14 exp := (ntv & 0x1f) + 2 @@ -366,7 +367,7 @@ func NumericValue(c rune) float64 { } return numValue - } else if ntv < UPROPS_NTV_FRACTION20_START { + } else if ntv < UPropsNtvFraction20Start { /* sexagesimal (base 60) integer */ numValue := (ntv >> 2) - 0xbf exp := (ntv & 3) + 1 @@ -385,20 +386,20 @@ func NumericValue(c rune) float64 { } return float64(numValue) - } else if ntv < UPROPS_NTV_FRACTION32_START { + } else if ntv < UPropsNtvFraction32Start { // fraction-20 e.g. 
3/80 - frac20 := ntv - UPROPS_NTV_FRACTION20_START // 0..0x17 + frac20 := ntv - UPropsNtvFraction20Start // 0..0x17 numerator := 2*(frac20&3) + 1 denominator := 20 << (frac20 >> 2) return float64(numerator) / float64(denominator) - } else if ntv < UPROPS_NTV_RESERVED_START { + } else if ntv < UPropsNtvReservedStart { // fraction-32 e.g. 3/64 - frac32 := ntv - UPROPS_NTV_FRACTION32_START // 0..15 + frac32 := ntv - UPropsNtvFraction32Start // 0..15 numerator := 2*(frac32&3) + 1 denominator := 32 << (frac32 >> 2) return float64(numerator) / float64(denominator) } else { /* reserved */ - return U_NO_NUMERIC_VALUE + return noNumericValue } } diff --git a/go/mysql/icuregex/internal/uerror/error.go b/go/mysql/icuregex/internal/uerror/error.go index 7feb86fe805..4842f1ae6eb 100644 --- a/go/mysql/icuregex/internal/uerror/error.go +++ b/go/mysql/icuregex/internal/uerror/error.go @@ -25,39 +25,38 @@ import ( "errors" ) -type UErrorCode int32 +type Code int32 -var IllegalArgumentError = errors.New("illegal argument") -var UnsupportedError = errors.New("unsupported") +var ErrIllegalArgument = errors.New("illegal argument") +var ErrUnsupported = errors.New("unsupported") -type URegexCompileErrorCode int32 +type CompileErrorCode int32 const ( - U_REGEX_ZERO_ERROR URegexCompileErrorCode = iota - U_REGEX_INTERNAL_ERROR /**< An internal error (bug) was detected. */ - U_REGEX_RULE_SYNTAX /**< Syntax error in regexp pattern. */ - U_REGEX_INVALID_STATE /**< RegexMatcher in invalid state for requested operation */ - U_REGEX_BAD_ESCAPE_SEQUENCE /**< Unrecognized backslash escape sequence in pattern */ - U_REGEX_PROPERTY_SYNTAX /**< Incorrect Unicode property */ - U_REGEX_UNIMPLEMENTED /**< Use of regexp feature that is not yet implemented. */ - U_REGEX_MISMATCHED_PAREN /**< Incorrectly nested parentheses in regexp pattern. */ - U_REGEX_NUMBER_TOO_BIG /**< Decimal number is too large. 
*/ - U_REGEX_BAD_INTERVAL /**< Error in {min,max} interval */ - U_REGEX_MAX_LT_MIN /**< In {min,max}, max is less than min. */ - U_REGEX_INVALID_BACK_REF /**< Back-reference to a non-existent capture group. */ - U_REGEX_INVALID_FLAG /**< Invalid value for match mode flags. */ - U_REGEX_LOOK_BEHIND_LIMIT /**< Look-Behind pattern matches must have a bounded maximum length. */ - U_REGEX_SET_CONTAINS_STRING /**< Regexps cannot have UnicodeSets containing strings.*/ - U_REGEX_MISSING_CLOSE_BRACKET /**< Missing closing bracket on a bracket expression. */ - U_REGEX_INVALID_RANGE /**< In a character range [x-y], x is greater than y. */ - U_REGEX_PATTERN_TOO_BIG /**< Pattern exceeds limits on size or complexity. @stable ICU 55 */ - U_REGEX_INVALID_CAPTURE_GROUP_NAME /**< Invalid capture group name. @stable ICU 55 */ - U_REGEX_UNSUPPORTED_ERROR /**< Use of an unsupported feature. @stable ICU 55 */ + ZeroError CompileErrorCode = iota + InternalError /**< An internal error (bug) was detected. */ + RuleSyntax /**< Syntax error in regexp pattern. */ + InvalidState /**< RegexMatcher in invalid state for requested operation */ + BadEscapeSequence /**< Unrecognized backslash escape sequence in pattern */ + PropertySyntax /**< Incorrect Unicode property */ + Unimplemented /**< Use of regexp feature that is not yet implemented. */ + MismatchedParen /**< Incorrectly nested parentheses in regexp pattern. */ + NumberTooBig /**< Decimal number is too large. */ + BadInterval /**< Error in {min,max} interval */ + MaxLtMin /**< In {min,max}, max is less than min. */ + InvalidBackRef /**< Back-reference to a non-existent capture group. */ + InvalidFlag /**< Invalid value for match mode flags. */ + LookBehindLimit /**< Look-Behind pattern matches must have a bounded maximum length. */ + SetContainsString /**< Regexps cannot have UnicodeSets containing strings.*/ + MissingCloseBracket /**< Missing closing bracket on a bracket expression. 
*/ + InvalidRange /**< In a character range [x-y], x is greater than y. */ + PatternTooBig /**< Pattern exceeds limits on size or complexity. @stable ICU 55 */ + InvalidCaptureGroupName /**< Invalid capture group name. @stable ICU 55 */ ) -type URegexMatchErrorCode int32 +type MatchErrorCode int32 const ( - U_REGEX_STACK_OVERFLOW URegexMatchErrorCode = iota /**< Regular expression backtrack stack overflow. */ - U_REGEX_TIME_OUT /**< Maximum allowed match time exceeded */ + StackOverflow MatchErrorCode = iota /**< Regular expression backtrack stack overflow. */ + TimeOut /**< Maximum allowed match time exceeded */ ) diff --git a/go/mysql/icuregex/internal/ulayout/ulayout.go b/go/mysql/icuregex/internal/ulayout/ulayout.go index 5e86d508895..dbf21d9460b 100644 --- a/go/mysql/icuregex/internal/ulayout/ulayout.go +++ b/go/mysql/icuregex/internal/ulayout/ulayout.go @@ -22,7 +22,7 @@ limitations under the License. package ulayout import ( - "fmt" + "errors" "sync" "vitess.io/vitess/go/mysql/icuregex/internal/icudata" @@ -35,20 +35,11 @@ var inscTrie *utrie.UcpTrie var voTrie *utrie.UcpTrie const ( - IX_INPC_TRIE_TOP = 1 - IX_INSC_TRIE_TOP = 2 - IX_VO_TRIE_TOP = 3 - IX_RESERVED_TOP = 4 + ixInpcTrieTop = 1 + ixInscTrieTop = 2 + ixVoTrieTop = 3 - IX_TRIES_TOP = 7 - - IX_MAX_VALUES = 9 - - IX_COUNT = 12 - - MAX_INPC_SHIFT = 24 - MAX_INSC_SHIFT = 16 - MAX_VO_SHIFT = 8 + ixCount = 12 ) func InpcTrie() *utrie.UcpTrie { @@ -91,8 +82,8 @@ func readData(bytes *udata.Bytes) error { startPos := bytes.Position() indexesLength := int32(bytes.Uint32()) // inIndexes[IX_INDEXES_LENGTH] - if indexesLength < IX_COUNT { - return fmt.Errorf("Text layout properties data: not enough indexes") + if indexesLength < ixCount { + return errors.New("text layout properties data: not enough indexes") } index := make([]int32, indexesLength) index[0] = indexesLength @@ -101,7 +92,7 @@ func readData(bytes *udata.Bytes) error { } offset := indexesLength * 4 - top := index[IX_INPC_TRIE_TOP] + top := 
index[ixInpcTrieTop] trieSize := top - offset if trieSize >= 16 { inpcTrie, err = utrie.UcpTrieFromBytes(bytes) @@ -113,7 +104,7 @@ func readData(bytes *udata.Bytes) error { pos := bytes.Position() - startPos bytes.Skip(top - pos) offset = top - top = index[IX_INSC_TRIE_TOP] + top = index[ixInscTrieTop] trieSize = top - offset if trieSize >= 16 { inscTrie, err = utrie.UcpTrieFromBytes(bytes) @@ -125,7 +116,7 @@ func readData(bytes *udata.Bytes) error { pos = bytes.Position() - startPos bytes.Skip(top - pos) offset = top - top = index[IX_VO_TRIE_TOP] + top = index[ixVoTrieTop] trieSize = top - offset if trieSize >= 16 { voTrie, err = utrie.UcpTrieFromBytes(bytes) diff --git a/go/mysql/icuregex/internal/unames/unames.go b/go/mysql/icuregex/internal/unames/unames.go index 1a7329189ac..45920be8292 100644 --- a/go/mysql/icuregex/internal/unames/unames.go +++ b/go/mysql/icuregex/internal/unames/unames.go @@ -32,9 +32,9 @@ import ( ) var charNamesOnce sync.Once -var charNames *UCharNames +var charNames *unames -type UCharNames struct { +type unames struct { tokens []uint16 tokenStrings []uint8 groups []uint16 @@ -62,7 +62,7 @@ func loadCharNames() { groupsOffset := int32(b.Uint32() - 16) groupStringOffset := int32(b.Uint32() - 16) algNamesOffset := int32(b.Uint32() - 16) - charNames = &UCharNames{ + charNames = &unames{ tokens: b.Uint16Slice(tokenStringOffset / 2), tokenStrings: b.Uint8Slice(groupsOffset - tokenStringOffset), groups: b.Uint16Slice((groupStringOffset - groupsOffset) / 2), @@ -92,29 +92,24 @@ func loadCharNames() { }) } -func (names *UCharNames) getGroupName(group []uint16) []uint8 { +func (names *unames) getGroupName(group []uint16) []uint8 { return names.groupNames[names.getGroupOffset(group):] } type NameChoice int32 const ( - U_UNICODE_CHAR_NAME NameChoice = iota + UnicodeCharName NameChoice = iota /** * The Unicode_1_Name property value which is of little practical value. * Beginning with ICU 49, ICU APIs return an empty string for this name choice. 
* @deprecated ICU 49 */ - U_UNICODE_10_CHAR_NAME + Unicode10CharName /** Standard or synthetic character name. @stable ICU 2.0 */ - U_EXTENDED_CHAR_NAME + ExtendedCharName /** Corrected name from NameAliases.txt. @stable ICU 4.4 */ - U_CHAR_NAME_ALIAS - /** - * One more than the highest normal UCharNameChoice value. - * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. - */ - U_CHAR_NAME_CHOICE_COUNT + CharNameAlias ) type algorithmicRange struct { @@ -246,7 +241,7 @@ func CharForName(nameChoice NameChoice, name string) rune { upper := strings.ToUpper(name) if lower[0] == '<' { - if nameChoice == U_EXTENDED_CHAR_NAME && lower[len(lower)-1] == '>' { + if nameChoice == ExtendedCharName && lower[len(lower)-1] == '>' { if limit := strings.LastIndexByte(lower, '-'); limit >= 2 { cp, err := strconv.ParseUint(lower[limit+1:len(lower)-1], 16, 32) if err != nil || cp > 0x10ffff { @@ -267,25 +262,25 @@ func CharForName(nameChoice NameChoice, name string) rune { return charNames.enumNames(0, 0x10ffff+1, upper, nameChoice) } -const GROUP_SHIFT = 5 -const LINES_PER_GROUP = 1 << GROUP_SHIFT -const GROUP_MASK = LINES_PER_GROUP - 1 +const groupShift = 5 +const linesPerGroup = 1 << groupShift +const groupMask = linesPerGroup - 1 const ( - GROUP_MSB = iota - GROUP_OFFSET_HIGH - GROUP_OFFSET_LOW - GROUP_LENGTH + groupMsb = iota + groupOffsetHigh + groupOffsetLow + groupLength ) -func (names *UCharNames) enumNames(start, limit rune, otherName string, nameChoice NameChoice) rune { - startGroupMSB := uint16(start >> GROUP_SHIFT) - endGroupMSB := uint16((limit - 1) >> GROUP_SHIFT) +func (names *unames) enumNames(start, limit rune, otherName string, nameChoice NameChoice) rune { + startGroupMSB := uint16(start >> groupShift) + endGroupMSB := uint16((limit - 1) >> groupShift) group := names.getGroup(start) - if startGroupMSB < group[GROUP_MSB] && nameChoice == U_EXTENDED_CHAR_NAME { - extLimit := rune(group[GROUP_MSB]) << GROUP_SHIFT + if startGroupMSB < 
group[groupMsb] && nameChoice == ExtendedCharName { + extLimit := rune(group[groupMsb]) << groupShift if extLimit > limit { extLimit = limit } @@ -293,40 +288,40 @@ func (names *UCharNames) enumNames(start, limit rune, otherName string, nameChoi } if startGroupMSB == endGroupMSB { - if startGroupMSB == group[GROUP_MSB] { + if startGroupMSB == group[groupMsb] { return names.enumGroupNames(group, start, limit-1, otherName, nameChoice) } } else { - if startGroupMSB == group[GROUP_MSB] { - if start&GROUP_MASK != 0 { - if cp := names.enumGroupNames(group, start, (rune(startGroupMSB)< group[GROUP_MSB] { - group = group[GROUP_LENGTH:] + } else if startGroupMSB > group[groupMsb] { + group = group[groupLength:] } - for len(group) > 0 && group[GROUP_MSB] < endGroupMSB { - start = rune(group[GROUP_MSB]) << GROUP_SHIFT - if cp := names.enumGroupNames(group, start, start+LINES_PER_GROUP-1, otherName, nameChoice); cp != -1 { + for len(group) > 0 && group[groupMsb] < endGroupMSB { + start = rune(group[groupMsb]) << groupShift + if cp := names.enumGroupNames(group, start, start+linesPerGroup-1, otherName, nameChoice); cp != -1 { return cp } - group = group[GROUP_LENGTH:] + group = group[groupLength:] } - if len(group) > 0 && group[GROUP_MSB] == endGroupMSB { - return names.enumGroupNames(group, (limit-1)&^GROUP_MASK, limit-1, otherName, nameChoice) + if len(group) > 0 && group[groupMsb] == endGroupMSB { + return names.enumGroupNames(group, (limit-1)&^groupMask, limit-1, otherName, nameChoice) } } return -1 } -func (names *UCharNames) getGroup(code rune) []uint16 { +func (names *unames) getGroup(code rune) []uint16 { groups := names.groups - groupMSB := uint16(code >> GROUP_SHIFT) + groupMSB := uint16(code >> groupShift) start := 0 groupCount := int(groups[0]) @@ -335,30 +330,30 @@ func (names *UCharNames) getGroup(code rune) []uint16 { for start < limit-1 { number := (start + limit) / 2 - if groupMSB < groups[number*GROUP_LENGTH+GROUP_MSB] { + if groupMSB < 
groups[number*groupLength+groupMsb] { limit = number } else { start = number } } - return groups[start*GROUP_LENGTH : (groupCount-start)*GROUP_LENGTH] + return groups[start*groupLength : (groupCount-start)*groupLength] } -func (names *UCharNames) getGroupOffset(group []uint16) uint32 { - return (uint32(group[GROUP_OFFSET_HIGH]) << 16) | uint32(group[GROUP_OFFSET_LOW]) +func (names *unames) getGroupOffset(group []uint16) uint32 { + return (uint32(group[groupOffsetHigh]) << 16) | uint32(group[groupOffsetLow]) } -func (names *UCharNames) enumGroupNames(group []uint16, start, end rune, otherName string, choice NameChoice) rune { - var offsets [LINES_PER_GROUP + 2]uint16 - var lengths [LINES_PER_GROUP + 2]uint16 +func (names *unames) enumGroupNames(group []uint16, start, end rune, otherName string, choice NameChoice) rune { + var offsets [linesPerGroup + 2]uint16 + var lengths [linesPerGroup + 2]uint16 s := names.getGroupName(group) s = expandGroupLengths(s, offsets[:0], lengths[:0]) for start < end { - name := s[offsets[start&GROUP_MASK]:] - nameLen := lengths[start&GROUP_MASK] + name := s[offsets[start&groupMask]:] + nameLen := lengths[start&groupMask] if names.compareName(name[:nameLen], choice, otherName) { return start } @@ -373,7 +368,7 @@ func expandGroupLengths(s []uint8, offsets []uint16, lengths []uint16) []uint8 { var lengthByte uint8 /* all 32 lengths must be read to get the offset of the first group string */ - for i < LINES_PER_GROUP { + for i < linesPerGroup { lengthByte = s[0] s = s[1:] @@ -418,7 +413,7 @@ func expandGroupLengths(s []uint8, offsets []uint16, lengths []uint16) []uint8 { return s } -func (names *UCharNames) compareName(name []byte, choice NameChoice, otherName string) bool { +func (names *unames) compareName(name []byte, choice NameChoice, otherName string) bool { tokens := names.tokens tokenCount := tokens[0] @@ -452,7 +447,7 @@ func (names *UCharNames) compareName(name []byte, choice NameChoice, otherName s } otherName = otherName[1:] } 
else { - if len(otherName) == otherNameLen && choice == U_EXTENDED_CHAR_NAME { + if len(otherName) == otherNameLen && choice == ExtendedCharName { if ';' >= tokenCount || int16(tokens[';']) == -1 { continue } diff --git a/go/mysql/icuregex/internal/unames/unames_test.go b/go/mysql/icuregex/internal/unames/unames_test.go index 941556b70d3..f15353eef8d 100644 --- a/go/mysql/icuregex/internal/unames/unames_test.go +++ b/go/mysql/icuregex/internal/unames/unames_test.go @@ -49,13 +49,13 @@ func TestCharForName(t *testing.T) { for _, tn := range TestNames { if tn.name != "" { - r := CharForName(U_UNICODE_CHAR_NAME, tn.name) + r := CharForName(UnicodeCharName, tn.name) if r != tn.code { t.Errorf("CharFromName(U_UNICODE_CHAR_NAME, %q) = '%c' (U+%d), expected %c (U+%d)", tn.name, r, r, tn.code, tn.code) } } if tn.extName != "" { - r := CharForName(U_EXTENDED_CHAR_NAME, tn.extName) + r := CharForName(ExtendedCharName, tn.extName) if r != tn.code { t.Errorf("CharFromName(U_EXTENDED_CHAR_NAME, %q) = '%c' (U+%d), expected %c (U+%d)", tn.extName, r, r, tn.code, tn.code) } diff --git a/go/mysql/icuregex/internal/uprops/constants.go b/go/mysql/icuregex/internal/uprops/constants.go index a3a6f0d3d5d..3cfe250599a 100644 --- a/go/mysql/icuregex/internal/uprops/constants.go +++ b/go/mysql/icuregex/internal/uprops/constants.go @@ -21,31 +21,6 @@ limitations under the License. 
package uprops -const ( - UPROPS_PROPS32_INDEX = iota - UPROPS_EXCEPTIONS_INDEX - UPROPS_EXCEPTIONS_TOP_INDEX - - UPROPS_ADDITIONAL_TRIE_INDEX - UPROPS_ADDITIONAL_VECTORS_INDEX - UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX - - UPROPS_SCRIPT_EXTENSIONS_INDEX - - UPROPS_RESERVED_INDEX_7 - UPROPS_RESERVED_INDEX_8 - - /* size of the data file (number of 32-bit units after the header) */ - UPROPS_DATA_TOP_INDEX - - /* maximum values for code values in vector word 0 */ - UPROPS_MAX_VALUES_INDEX = 10 - /* maximum values for code values in vector word 2 */ - UPROPS_MAX_VALUES_2_INDEX = 11 - - UPROPS_INDEX_COUNT = 16 -) - type Property int32 const ( @@ -62,169 +37,169 @@ const ( /** Binary property Alphabetic. Same as u_isUAlphabetic, different from u_isalpha. Lu+Ll+Lt+Lm+Lo+Nl+Other_Alphabetic @stable ICU 2.1 */ - UCHAR_ALPHABETIC Property = 0 + UCharAlphabetic Property = 0 /** First constant for binary Unicode properties. @stable ICU 2.1 */ - UCHAR_BINARY_START = UCHAR_ALPHABETIC + UCharBinaryStart = UCharAlphabetic /** Binary property ASCII_Hex_Digit. 0-9 A-F a-f @stable ICU 2.1 */ - UCHAR_ASCII_HEX_DIGIT Property = 1 + UCharASCIIHexDigit Property = 1 /** Binary property Bidi_Control. Format controls which have specific functions in the Bidi Algorithm. @stable ICU 2.1 */ - UCHAR_BIDI_CONTROL Property = 2 + UCharBidiControl Property = 2 /** Binary property Bidi_Mirrored. Characters that may change display in RTL text. Same as u_isMirrored. See Bidi Algorithm, UTR 9. @stable ICU 2.1 */ - UCHAR_BIDI_MIRRORED Property = 3 + UCharBidiMirrored Property = 3 /** Binary property Dash. Variations of dashes. @stable ICU 2.1 */ - UCHAR_DASH Property = 4 + UCharDash Property = 4 /** Binary property Default_Ignorable_Code_Point (new in Unicode 3.2). Ignorable in most processing. 
<2060..206F, FFF0..FFFB, E0000..E0FFF>+Other_Default_Ignorable_Code_Point+(Cf+Cc+Cs-White_Space) @stable ICU 2.1 */ - UCHAR_DEFAULT_IGNORABLE_CODE_POINT Property = 5 + UCharDefaultIgnorableCodePoint Property = 5 /** Binary property Deprecated (new in Unicode 3.2). The usage of deprecated characters is strongly discouraged. @stable ICU 2.1 */ - UCHAR_DEPRECATED Property = 6 + UCharDeprecated Property = 6 /** Binary property Diacritic. Characters that linguistically modify the meaning of another character to which they apply. @stable ICU 2.1 */ - UCHAR_DIACRITIC Property = 7 + UCharDiacritic Property = 7 /** Binary property Extender. Extend the value or shape of a preceding alphabetic character, e.g., length and iteration marks. @stable ICU 2.1 */ - UCHAR_EXTENDER Property = 8 + UCharExtender Property = 8 /** Binary property Full_Composition_Exclusion. CompositionExclusions.txt+Singleton Decompositions+ Non-Starter Decompositions. @stable ICU 2.1 */ - UCHAR_FULL_COMPOSITION_EXCLUSION Property = 9 + UCharFullCompositionExclusion Property = 9 /** Binary property Grapheme_Base (new in Unicode 3.2). For programmatic determination of grapheme cluster boundaries. [0..10FFFF]-Cc-Cf-Cs-Co-Cn-Zl-Zp-Grapheme_Link-Grapheme_Extend-CGJ @stable ICU 2.1 */ - UCHAR_GRAPHEME_BASE Property = 10 + UCharGraphemeBase Property = 10 /** Binary property Grapheme_Extend (new in Unicode 3.2). For programmatic determination of grapheme cluster boundaries. Me+Mn+Mc+Other_Grapheme_Extend-Grapheme_Link-CGJ @stable ICU 2.1 */ - UCHAR_GRAPHEME_EXTEND Property = 11 + UCharGraphemeExtend Property = 11 /** Binary property Grapheme_Link (new in Unicode 3.2). For programmatic determination of grapheme cluster boundaries. @stable ICU 2.1 */ - UCHAR_GRAPHEME_LINK Property = 12 + UCharGraphemeLink Property = 12 /** Binary property Hex_Digit. Characters commonly used for hexadecimal numbers. @stable ICU 2.1 */ - UCHAR_HEX_DIGIT Property = 13 + UCharHexDigit Property = 13 /** Binary property Hyphen. 
Dashes used to mark connections between pieces of words, plus the Katakana middle dot. @stable ICU 2.1 */ - UCHAR_HYPHEN Property = 14 + UCharHyphen Property = 14 /** Binary property ID_Continue. Characters that can continue an identifier. DerivedCoreProperties.txt also says "NOTE: Cf characters should be filtered out." ID_Start+Mn+Mc+Nd+Pc @stable ICU 2.1 */ - UCHAR_ID_CONTINUE Property = 15 + UCharIDContinue Property = 15 /** Binary property ID_Start. Characters that can start an identifier. Lu+Ll+Lt+Lm+Lo+Nl @stable ICU 2.1 */ - UCHAR_ID_START Property = 16 + UCharIDStart Property = 16 /** Binary property Ideographic. CJKV ideographs. @stable ICU 2.1 */ - UCHAR_IDEOGRAPHIC Property = 17 + UCharIdeographic Property = 17 /** Binary property IDS_Binary_Operator (new in Unicode 3.2). For programmatic determination of Ideographic Description Sequences. @stable ICU 2.1 */ - UCHAR_IDS_BINARY_OPERATOR Property = 18 + UCharIdsBinaryOperator Property = 18 /** Binary property IDS_Trinary_Operator (new in Unicode 3.2). For programmatic determination of Ideographic Description Sequences. @stable ICU 2.1 */ - UCHAR_IDS_TRINARY_OPERATOR Property = 19 + UCharIdsTrinaryOperator Property = 19 /** Binary property Join_Control. Format controls for cursive joining and ligation. @stable ICU 2.1 */ - UCHAR_JOIN_CONTROL Property = 20 + UCharJoinControl Property = 20 /** Binary property Logical_Order_Exception (new in Unicode 3.2). Characters that do not use logical order and require special handling in most processing. @stable ICU 2.1 */ - UCHAR_LOGICAL_ORDER_EXCEPTION Property = 21 + UCharLogicalOrderException Property = 21 /** Binary property Lowercase. Same as u_isULowercase, different from u_islower. Ll+Other_Lowercase @stable ICU 2.1 */ - UCHAR_LOWERCASE Property = 22 + UCharLowercase Property = 22 /** Binary property Math. Sm+Other_Math @stable ICU 2.1 */ - UCHAR_MATH Property = 23 + UCharMath Property = 23 /** Binary property Noncharacter_Code_Point. 
Code points that are explicitly defined as illegal for the encoding of characters. @stable ICU 2.1 */ - UCHAR_NONCHARACTER_CODE_POINT Property = 24 + UCharNoncharacterCodePoint Property = 24 /** Binary property Quotation_Mark. @stable ICU 2.1 */ - UCHAR_QUOTATION_MARK Property = 25 + UCharQuotationMark Property = 25 /** Binary property Radical (new in Unicode 3.2). For programmatic determination of Ideographic Description Sequences. @stable ICU 2.1 */ - UCHAR_RADICAL Property = 26 + UCharRadical Property = 26 /** Binary property Soft_Dotted (new in Unicode 3.2). Characters with a "soft dot", like i or j. An accent placed on these characters causes the dot to disappear. @stable ICU 2.1 */ - UCHAR_SOFT_DOTTED Property = 27 + UCharSoftDotted Property = 27 /** Binary property Terminal_Punctuation. Punctuation characters that generally mark the end of textual units. @stable ICU 2.1 */ - UCHAR_TERMINAL_PUNCTUATION Property = 28 + UCharTerminalPunctuation Property = 28 /** Binary property Unified_Ideograph (new in Unicode 3.2). For programmatic determination of Ideographic Description Sequences. @stable ICU 2.1 */ - UCHAR_UNIFIED_IDEOGRAPH Property = 29 + UCharUnifiedIdeograph Property = 29 /** Binary property Uppercase. Same as u_isUUppercase, different from u_isupper. Lu+Other_Uppercase @stable ICU 2.1 */ - UCHAR_UPPERCASE Property = 30 + UCharUppercase Property = 30 /** Binary property White_Space. Same as u_isUWhiteSpace, different from u_isspace and u_isWhitespace. Space characters+TAB+CR+LF-ZWSP-ZWNBSP @stable ICU 2.1 */ - UCHAR_WHITE_SPACE Property = 31 + UCharWhiteSpace Property = 31 /** Binary property XID_Continue. ID_Continue modified to allow closure under normalization forms NFKC and NFKD. @stable ICU 2.1 */ - UCHAR_XID_CONTINUE Property = 32 + UCharXidContinue Property = 32 /** Binary property XID_Start. ID_Start modified to allow closure under normalization forms NFKC and NFKD. 
@stable ICU 2.1 */ - UCHAR_XID_START Property = 33 + UCharXidStart Property = 33 /** Binary property Case_Sensitive. Either the source of a case mapping or _in_ the target of a case mapping. Not the same as the general category Cased_Letter. @stable ICU 2.6 */ - UCHAR_CASE_SENSITIVE Property = 34 + UCharCaseSensitive Property = 34 /** Binary property STerm (new in Unicode 4.0.1). Sentence Terminal. Used in UAX #29: Text Boundaries (http://www.unicode.org/reports/tr29/) @stable ICU 3.0 */ - UCHAR_S_TERM Property = 35 + UCharSTerm Property = 35 /** Binary property Variation_Selector (new in Unicode 4.0.1). Indicates all those characters that qualify as Variation Selectors. For details on the behavior of these characters, see StandardizedVariants.html and 15.6 Variation Selectors. @stable ICU 3.0 */ - UCHAR_VARIATION_SELECTOR Property = 36 + UCharVariationSelector Property = 36 /** Binary property NFD_Inert. ICU-specific property for characters that are inert under NFD, i.e., they do not interact with adjacent characters. See the documentation for the Normalizer2 class and the Normalizer2::isInert() method. @stable ICU 3.0 */ - UCHAR_NFD_INERT Property = 37 + UCharNfdInert Property = 37 /** Binary property NFKD_Inert. ICU-specific property for characters that are inert under NFKD, i.e., they do not interact with adjacent characters. See the documentation for the Normalizer2 class and the Normalizer2::isInert() method. @stable ICU 3.0 */ - UCHAR_NFKD_INERT Property = 38 + UCharNfkdInert Property = 38 /** Binary property NFC_Inert. ICU-specific property for characters that are inert under NFC, i.e., they do not interact with adjacent characters. See the documentation for the Normalizer2 class and the Normalizer2::isInert() method. @stable ICU 3.0 */ - UCHAR_NFC_INERT Property = 39 + UCharNfcInert Property = 39 /** Binary property NFKC_Inert. ICU-specific property for characters that are inert under NFKC, i.e., they do not interact with adjacent characters. 
See the documentation for the Normalizer2 class and the Normalizer2::isInert() method. @stable ICU 3.0 */ - UCHAR_NFKC_INERT Property = 40 + UCharNfkcInert Property = 40 /** Binary Property Segment_Starter. ICU-specific property for characters that are starters in terms of Unicode normalization and combining character sequences. @@ -235,196 +210,196 @@ const ( canonically equivalent strings, e.g. for canonical closure while processing collation tailoring rules. @stable ICU 3.0 */ - UCHAR_SEGMENT_STARTER Property = 41 + UCharSegmentStarter Property = 41 /** Binary property Pattern_Syntax (new in Unicode 4.1). See UAX #31 Identifier and Pattern Syntax (http://www.unicode.org/reports/tr31/) @stable ICU 3.4 */ - UCHAR_PATTERN_SYNTAX Property = 42 + UCharPatternSyntax Property = 42 /** Binary property Pattern_White_Space (new in Unicode 4.1). See UAX #31 Identifier and Pattern Syntax (http://www.unicode.org/reports/tr31/) @stable ICU 3.4 */ - UCHAR_PATTERN_WHITE_SPACE Property = 43 + UCharPatternWhiteSpace Property = 43 /** Binary property alnum (a C/POSIX character class). Implemented according to the UTS #18 Annex C Standard Recommendation. See the uchar.h file documentation. @stable ICU 3.4 */ - UCHAR_POSIX_ALNUM Property = 44 + UCharPosixAlnum Property = 44 /** Binary property blank (a C/POSIX character class). Implemented according to the UTS #18 Annex C Standard Recommendation. See the uchar.h file documentation. @stable ICU 3.4 */ - UCHAR_POSIX_BLANK Property = 45 + UCharPosixBlank Property = 45 /** Binary property graph (a C/POSIX character class). Implemented according to the UTS #18 Annex C Standard Recommendation. See the uchar.h file documentation. @stable ICU 3.4 */ - UCHAR_POSIX_GRAPH Property = 46 + UCharPosixGraph Property = 46 /** Binary property print (a C/POSIX character class). Implemented according to the UTS #18 Annex C Standard Recommendation. See the uchar.h file documentation. 
@stable ICU 3.4 */ - UCHAR_POSIX_PRINT Property = 47 + UCharPosixPrint Property = 47 /** Binary property xdigit (a C/POSIX character class). Implemented according to the UTS #18 Annex C Standard Recommendation. See the uchar.h file documentation. @stable ICU 3.4 */ - UCHAR_POSIX_XDIGIT Property = 48 + UCharPosixXdigit Property = 48 /** Binary property Cased. For Lowercase, Uppercase and Titlecase characters. @stable ICU 4.4 */ - UCHAR_CASED Property = 49 + UCharCased Property = 49 /** Binary property Case_Ignorable. Used in context-sensitive case mappings. @stable ICU 4.4 */ - UCHAR_CASE_IGNORABLE Property = 50 + UCharCaseIgnorable Property = 50 /** Binary property Changes_When_Lowercased. @stable ICU 4.4 */ - UCHAR_CHANGES_WHEN_LOWERCASED Property = 51 + UCharChangesWhenLowercased Property = 51 /** Binary property Changes_When_Uppercased. @stable ICU 4.4 */ - UCHAR_CHANGES_WHEN_UPPERCASED Property = 52 + UCharChangesWhenUppercased Property = 52 /** Binary property Changes_When_Titlecased. @stable ICU 4.4 */ - UCHAR_CHANGES_WHEN_TITLECASED Property = 53 + UCharChangesWhenTitlecased Property = 53 /** Binary property Changes_When_Casefolded. @stable ICU 4.4 */ - UCHAR_CHANGES_WHEN_CASEFOLDED Property = 54 + UCharChangesWhenCasefolded Property = 54 /** Binary property Changes_When_Casemapped. @stable ICU 4.4 */ - UCHAR_CHANGES_WHEN_CASEMAPPED Property = 55 + UCharChangesWhenCasemapped Property = 55 /** Binary property Changes_When_NFKC_Casefolded. @stable ICU 4.4 */ - UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED Property = 56 + UCharChangesWhenNfkcCasefolded Property = 56 /** * Binary property Emoji. * See http://www.unicode.org/reports/tr51/#Emoji_Properties * * @stable ICU 57 */ - UCHAR_EMOJI Property = 57 + UCharEmoji Property = 57 /** * Binary property Emoji_Presentation. * See http://www.unicode.org/reports/tr51/#Emoji_Properties * * @stable ICU 57 */ - UCHAR_EMOJI_PRESENTATION Property = 58 + UCharEmojiPresentation Property = 58 /** * Binary property Emoji_Modifier. 
* See http://www.unicode.org/reports/tr51/#Emoji_Properties * * @stable ICU 57 */ - UCHAR_EMOJI_MODIFIER Property = 59 + UCharEmojiModifier Property = 59 /** * Binary property Emoji_Modifier_Base. * See http://www.unicode.org/reports/tr51/#Emoji_Properties * * @stable ICU 57 */ - UCHAR_EMOJI_MODIFIER_BASE Property = 60 + UCharEmojiModifierBase Property = 60 /** * Binary property Emoji_Component. * See http://www.unicode.org/reports/tr51/#Emoji_Properties * * @stable ICU 60 */ - UCHAR_EMOJI_COMPONENT Property = 61 + UCharEmojiComponent Property = 61 /** * Binary property Regional_Indicator. * @stable ICU 60 */ - UCHAR_REGIONAL_INDICATOR Property = 62 + UCharRegionalIndicator Property = 62 /** * Binary property Prepended_Concatenation_Mark. * @stable ICU 60 */ - UCHAR_PREPENDED_CONCATENATION_MARK Property = 63 + UCharPrependedConcatenationMark Property = 63 /** * Binary property Extended_Pictographic. * See http://www.unicode.org/reports/tr51/#Emoji_Properties * * @stable ICU 62 */ - UCHAR_EXTENDED_PICTOGRAPHIC Property = 64 + UCharExtendedPictographic Property = 64 /** Enumerated property Bidi_Class. Same as u_charDirection, returns UCharDirection values. @stable ICU 2.2 */ - UCHAR_BIDI_CLASS Property = 0x1000 + UCharBidiClass Property = 0x1000 /** First constant for enumerated/integer Unicode properties. @stable ICU 2.2 */ - UCHAR_INT_START = UCHAR_BIDI_CLASS + UCharIntStart = UCharBidiClass /** Enumerated property Block. Same as ublock_getCode, returns UBlockCode values. @stable ICU 2.2 */ - UCHAR_BLOCK Property = 0x1001 + UCharBlock Property = 0x1001 /** Enumerated property Canonical_Combining_Class. Same as u_getCombiningClass, returns 8-bit numeric values. @stable ICU 2.2 */ - UCHAR_CANONICAL_COMBINING_CLASS Property = 0x1002 + UCharCanonicalCombiningClass Property = 0x1002 /** Enumerated property Decomposition_Type. Returns UDecompositionType values. 
@stable ICU 2.2 */ - UCHAR_DECOMPOSITION_TYPE Property = 0x1003 + UCharDecompositionType Property = 0x1003 /** Enumerated property East_Asian_Width. See http://www.unicode.org/reports/tr11/ Returns UEastAsianWidth values. @stable ICU 2.2 */ - UCHAR_EAST_ASIAN_WIDTH Property = 0x1004 + UCharEastAsianWidth Property = 0x1004 /** Enumerated property General_Category. Same as u_charType, returns UCharCategory values. @stable ICU 2.2 */ - UCHAR_GENERAL_CATEGORY Property = 0x1005 + UCharGeneralCategory Property = 0x1005 /** Enumerated property Joining_Group. Returns UJoiningGroup values. @stable ICU 2.2 */ - UCHAR_JOINING_GROUP Property = 0x1006 + UCharJoiningGroup Property = 0x1006 /** Enumerated property Joining_Type. Returns UJoiningType values. @stable ICU 2.2 */ - UCHAR_JOINING_TYPE Property = 0x1007 + UCharJoiningType Property = 0x1007 /** Enumerated property Line_Break. Returns ULineBreak values. @stable ICU 2.2 */ - UCHAR_LINE_BREAK Property = 0x1008 + UCharLineBreak Property = 0x1008 /** Enumerated property Numeric_Type. Returns UNumericType values. @stable ICU 2.2 */ - UCHAR_NUMERIC_TYPE Property = 0x1009 + UCharNumericType Property = 0x1009 /** Enumerated property Script. Same as uscript_getScript, returns UScriptCode values. @stable ICU 2.2 */ - UCHAR_SCRIPT Property = 0x100A + UCharScript Property = 0x100A /** Enumerated property Hangul_Syllable_Type, new in Unicode 4. Returns UHangulSyllableType values. @stable ICU 2.6 */ - UCHAR_HANGUL_SYLLABLE_TYPE Property = 0x100B + UCharHangulSyllableType Property = 0x100B /** Enumerated property NFD_Quick_Check. Returns UNormalizationCheckResult values. @stable ICU 3.0 */ - UCHAR_NFD_QUICK_CHECK Property = 0x100C + UCharNfdQuickCheck Property = 0x100C /** Enumerated property NFKD_Quick_Check. Returns UNormalizationCheckResult values. @stable ICU 3.0 */ - UCHAR_NFKD_QUICK_CHECK Property = 0x100D + UCharNfkdQuickCheck Property = 0x100D /** Enumerated property NFC_Quick_Check. Returns UNormalizationCheckResult values. 
@stable ICU 3.0 */ - UCHAR_NFC_QUICK_CHECK Property = 0x100E + UCharNfcQuickCheck Property = 0x100E /** Enumerated property NFKC_Quick_Check. Returns UNormalizationCheckResult values. @stable ICU 3.0 */ - UCHAR_NFKC_QUICK_CHECK Property = 0x100F + UCharNfkcQuickCheck Property = 0x100F /** Enumerated property Lead_Canonical_Combining_Class. ICU-specific property for the ccc of the first code point of the decomposition, or lccc(c)=ccc(NFD(c)[0]). Useful for checking for canonically ordered text; see UNORM_FCD and http://www.unicode.org/notes/tn5/#FCD . Returns 8-bit numeric values like UCHAR_CANONICAL_COMBINING_CLASS. @stable ICU 3.0 */ - UCHAR_LEAD_CANONICAL_COMBINING_CLASS Property = 0x1010 + UCharLeadCanonicalCombiningClass Property = 0x1010 /** Enumerated property Trail_Canonical_Combining_Class. ICU-specific property for the ccc of the last code point of the decomposition, or tccc(c)=ccc(NFD(c)[last]). Useful for checking for canonically ordered text; see UNORM_FCD and http://www.unicode.org/notes/tn5/#FCD . Returns 8-bit numeric values like UCHAR_CANONICAL_COMBINING_CLASS. @stable ICU 3.0 */ - UCHAR_TRAIL_CANONICAL_COMBINING_CLASS Property = 0x1011 + UCharTrailCanonicalCombiningClass Property = 0x1011 /** Enumerated property Grapheme_Cluster_Break (new in Unicode 4.1). Used in UAX #29: Text Boundaries (http://www.unicode.org/reports/tr29/) Returns UGraphemeClusterBreak values. @stable ICU 3.4 */ - UCHAR_GRAPHEME_CLUSTER_BREAK Property = 0x1012 + UCharGraphemeClusterBreak Property = 0x1012 /** Enumerated property Sentence_Break (new in Unicode 4.1). Used in UAX #29: Text Boundaries (http://www.unicode.org/reports/tr29/) Returns USentenceBreak values. @stable ICU 3.4 */ - UCHAR_SENTENCE_BREAK Property = 0x1013 + UCharSentenceBreak Property = 0x1013 /** Enumerated property Word_Break (new in Unicode 4.1). Used in UAX #29: Text Boundaries (http://www.unicode.org/reports/tr29/) Returns UWordBreakValues values. 
@stable ICU 3.4 */ - UCHAR_WORD_BREAK Property = 0x1014 + UCharWordBreak Property = 0x1014 /** Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3). Used in UAX #9: Unicode Bidirectional Algorithm (http://www.unicode.org/reports/tr9/) Returns UBidiPairedBracketType values. @stable ICU 52 */ - UCHAR_BIDI_PAIRED_BRACKET_TYPE Property = 0x1015 + UCharBidiPairedBracketType Property = 0x1015 /** * Enumerated property Indic_Positional_Category. * New in Unicode 6.0 as provisional property Indic_Matra_Category; @@ -432,21 +407,21 @@ const ( * See http://www.unicode.org/reports/tr44/#IndicPositionalCategory.txt * @stable ICU 63 */ - UCHAR_INDIC_POSITIONAL_CATEGORY Property = 0x1016 + UCharIndicPositionalCategory Property = 0x1016 /** * Enumerated property Indic_Syllabic_Category. * New in Unicode 6.0 as provisional; informative since Unicode 8.0. * See http://www.unicode.org/reports/tr44/#IndicSyllabicCategory.txt * @stable ICU 63 */ - UCHAR_INDIC_SYLLABIC_CATEGORY Property = 0x1017 + UCharIndicSyllableCategory Property = 0x1017 /** * Enumerated property Vertical_Orientation. * Used for UAX #50 Unicode Vertical Text Layout (https://www.unicode.org/reports/tr50/). * New as a UCD property in Unicode 10.0. * @stable ICU 63 */ - UCHAR_VERTICAL_ORIENTATION Property = 0x1018 + UCharVerticalOrientation Property = 0x1018 /** Bitmask property General_Category_Mask. This is the General_Category property returned as a bit mask. @@ -456,78 +431,78 @@ const ( a multi-bit mask is used for sets of categories like "Letters". Mask values should be cast to uint32_t. @stable ICU 2.4 */ - UCHAR_GENERAL_CATEGORY_MASK Property = 0x2000 + UCharGeneralCategoryMask Property = 0x2000 /** First constant for bit-mask Unicode properties. @stable ICU 2.4 */ - UCHAR_MASK_START = UCHAR_GENERAL_CATEGORY_MASK + UCharMaskStart = UCharGeneralCategoryMask /** Double property Numeric_Value. Corresponds to u_getNumericValue. 
@stable ICU 2.4 */ - UCHAR_NUMERIC_VALUE Property = 0x3000 + UCharNumericValue Property = 0x3000 /** First constant for double Unicode properties. @stable ICU 2.4 */ - UCHAR_DOUBLE_START = UCHAR_NUMERIC_VALUE + UCharDoubleStart = UCharNumericValue /** String property Age. Corresponds to u_charAge. @stable ICU 2.4 */ - UCHAR_AGE Property = 0x4000 + UCharAge Property = 0x4000 /** First constant for string Unicode properties. @stable ICU 2.4 */ - UCHAR_STRING_START = UCHAR_AGE + UCharStringStart = UCharAge /** String property Bidi_Mirroring_Glyph. Corresponds to u_charMirror. @stable ICU 2.4 */ - UCHAR_BIDI_MIRRORING_GLYPH Property = 0x4001 + UCharBidiMirroringGlyph Property = 0x4001 /** String property Case_Folding. Corresponds to u_strFoldCase in ustring.h. @stable ICU 2.4 */ - UCHAR_CASE_FOLDING Property = 0x4002 + UCharCaseFolding Property = 0x4002 /** String property Lowercase_Mapping. Corresponds to u_strToLower in ustring.h. @stable ICU 2.4 */ - UCHAR_LOWERCASE_MAPPING Property = 0x4004 + UCharLowercaseMapping Property = 0x4004 /** String property Name. Corresponds to u_charName. @stable ICU 2.4 */ - UCHAR_NAME Property = 0x4005 + UCharName Property = 0x4005 /** String property Simple_Case_Folding. Corresponds to u_foldCase. @stable ICU 2.4 */ - UCHAR_SIMPLE_CASE_FOLDING Property = 0x4006 + UCharSimpleCaseFolding Property = 0x4006 /** String property Simple_Lowercase_Mapping. Corresponds to u_tolower. @stable ICU 2.4 */ - UCHAR_SIMPLE_LOWERCASE_MAPPING Property = 0x4007 + UCharSimpleLowercaseMapping Property = 0x4007 /** String property Simple_Titlecase_Mapping. Corresponds to u_totitle. @stable ICU 2.4 */ - UCHAR_SIMPLE_TITLECASE_MAPPING Property = 0x4008 + UcharSimpleTitlecaseMapping Property = 0x4008 /** String property Simple_Uppercase_Mapping. Corresponds to u_toupper. @stable ICU 2.4 */ - UCHAR_SIMPLE_UPPERCASE_MAPPING Property = 0x4009 + UCharSimpleUppercaseMapping Property = 0x4009 /** String property Titlecase_Mapping. 
Corresponds to u_strToTitle in ustring.h. @stable ICU 2.4 */ - UCHAR_TITLECASE_MAPPING Property = 0x400A + UCharTitlecaseMapping Property = 0x400A /** String property Uppercase_Mapping. Corresponds to u_strToUpper in ustring.h. @stable ICU 2.4 */ - UCHAR_UPPERCASE_MAPPING Property = 0x400C + UCharUppercaseMapping Property = 0x400C /** String property Bidi_Paired_Bracket (new in Unicode 6.3). Corresponds to u_getBidiPairedBracket. @stable ICU 52 */ - UCHAR_BIDI_PAIRED_BRACKET Property = 0x400D + UCharBidiPairedBracket Property = 0x400D /** Miscellaneous property Script_Extensions (new in Unicode 6.0). Some characters are commonly used in multiple scripts. For more information, see UAX #24: http://www.unicode.org/reports/tr24/. Corresponds to uscript_hasScript and uscript_getScriptExtensions in uscript.h. @stable ICU 4.6 */ - UCHAR_SCRIPT_EXTENSIONS Property = 0x7000 + UCharScriptExtensions Property = 0x7000 /** First constant for Unicode properties with unusual value types. @stable ICU 4.6 */ - UCHAR_OTHER_PROPERTY_START = UCHAR_SCRIPT_EXTENSIONS + UCharOtherPropertyStart = UCharScriptExtensions /** Represents a nonexistent or invalid property or property value. @stable ICU 2.4 */ - UCHAR_INVALID_CODE Property = -1 + UCharInvalidCode Property = -1 ) const ( - UCHAR_BINARY_LIMIT = 65 - UCHAR_INT_LIMIT = 0x1019 - UCHAR_MASK_LIMIT = 0x2001 - UCHAR_STRING_LIMIT = 0x400E + uCharBinaryLimit = 65 + uCharIntLimit = 0x1019 + uCharMaskLimit = 0x2001 + uCharStringLimit = 0x400E ) /* * Properties in vector word 1 * Each bit encodes one binary property. 
* The following constants represent the bit number, use 1<= 0 { - set.AddRuneRange(startHasProperty, uset.MAX_VALUE) + set.AddRuneRange(startHasProperty, uset.MaxValue) } inclusionsForProperty[prop] = set @@ -141,7 +137,7 @@ func getInclusionsForIntProperty(prop Property) (*uset.UnicodeSet, error) { return inc, nil } - src := prop.Source() + src := prop.source() incl, err := getInclusionsForSource(src) if err != nil { return nil, err @@ -156,7 +152,7 @@ func getInclusionsForIntProperty(prop Property) (*uset.UnicodeSet, error) { for i := 0; i < numRanges; i++ { rangeEnd := incl.RangeEnd(i) for c := incl.RangeStart(i); c <= rangeEnd; c++ { - value := GetIntPropertyValue(c, prop) + value := getIntPropertyValue(c, prop) if value != prevValue { intPropIncl.AddRune(c) prevValue = value @@ -170,25 +166,25 @@ func getInclusionsForIntProperty(prop Property) (*uset.UnicodeSet, error) { func ApplyIntPropertyValue(u *uset.UnicodeSet, prop Property, value int32) error { switch { - case prop == UCHAR_GENERAL_CATEGORY_MASK: - inclusions, err := GetInclusionsForProperty(prop) + case prop == UCharGeneralCategoryMask: + inclusions, err := getInclusionsForProperty(prop) if err != nil { return err } u.ApplyFilter(inclusions, func(ch rune) bool { - return (U_MASK(uchar.CharType(ch)) & uint32(value)) != 0 + return (uMask(uchar.CharType(ch)) & uint32(value)) != 0 }) - case prop == UCHAR_SCRIPT_EXTENSIONS: - inclusions, err := GetInclusionsForProperty(prop) + case prop == UCharScriptExtensions: + inclusions, err := getInclusionsForProperty(prop) if err != nil { return err } u.ApplyFilter(inclusions, func(ch rune) bool { - return UScriptHasScript(ch, UScriptCode(value)) + return uscriptHasScript(ch, code(value)) }) - case 0 <= prop && prop < UCHAR_BINARY_LIMIT: + case 0 <= prop && prop < uCharBinaryLimit: if value == 0 || value == 1 { - set, err := GetInclusionsForBinaryProperty(prop) + set, err := getInclusionsForBinaryProperty(prop) if err != nil { return err } @@ -200,16 +196,16 @@ func 
ApplyIntPropertyValue(u *uset.UnicodeSet, prop Property, value int32) error u.Clear() } - case UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT: - inclusions, err := GetInclusionsForProperty(prop) + case UCharIntStart <= prop && prop < uCharIntLimit: + inclusions, err := getInclusionsForProperty(prop) if err != nil { return err } u.ApplyFilter(inclusions, func(ch rune) bool { - return GetIntPropertyValue(ch, prop) == value + return getIntPropertyValue(ch, prop) == value }) default: - return uerror.UnsupportedError + return uerror.ErrUnsupported } return nil } @@ -228,7 +224,7 @@ func mungeCharName(charname string) string { func ApplyPropertyPattern(u *uset.UnicodeSet, pat string) error { if len(pat) < 5 { - return uerror.IllegalArgumentError + return uerror.ErrIllegalArgument } var posix, isName, invert bool @@ -246,30 +242,30 @@ func ApplyPropertyPattern(u *uset.UnicodeSet, pat string) error { isName = c == 'N' pat = pattern.SkipWhitespace(pat[2:]) if len(pat) == 0 || pat[0] != '{' { - return uerror.IllegalArgumentError + return uerror.ErrIllegalArgument } pat = pat[1:] } else { - return uerror.IllegalArgumentError + return uerror.ErrIllegalArgument } - var close int + var closePos int if posix { - close = strings.Index(pat, ":]") + closePos = strings.Index(pat, ":]") } else { - close = strings.IndexByte(pat, '}') + closePos = strings.IndexByte(pat, '}') } - if close < 0 { - return uerror.IllegalArgumentError + if closePos < 0 { + return uerror.ErrIllegalArgument } equals := strings.IndexByte(pat, '=') var propName, valueName string - if equals >= 0 && equals < close && !isName { + if equals >= 0 && equals < closePos && !isName { propName = pat[:equals] - valueName = pat[equals+1 : close] + valueName = pat[equals+1 : closePos] } else { - propName = pat[:close] + propName = pat[:closePos] if isName { valueName = propName propName = "na" @@ -303,40 +299,40 @@ func ApplyPropertyAlias(u *uset.UnicodeSet, prop, value string) error { var invert bool if len(value) > 0 { 
- p = GetPropertyEnum(prop) + p = getPropertyEnum(prop) if p == -1 { - return uerror.IllegalArgumentError + return uerror.ErrIllegalArgument } - if p == UCHAR_GENERAL_CATEGORY { - p = UCHAR_GENERAL_CATEGORY_MASK + if p == UCharGeneralCategory { + p = UCharGeneralCategoryMask } - if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) || - (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) || - (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT) { - v = GetPropertyValueEnum(p, value) + if (p >= UCharBinaryStart && p < uCharBinaryLimit) || + (p >= UCharIntStart && p < uCharIntLimit) || + (p >= UCharMaskStart && p < uCharMaskLimit) { + v = getPropertyValueEnum(p, value) if v == -1 { // Handle numeric CCC - if p == UCHAR_CANONICAL_COMBINING_CLASS || - p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS || - p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS { + if p == UCharCanonicalCombiningClass || + p == UCharTrailCanonicalCombiningClass || + p == UCharLeadCanonicalCombiningClass { val, err := strconv.ParseUint(value, 10, 8) if err != nil { - return uerror.IllegalArgumentError + return uerror.ErrIllegalArgument } v = int32(val) } else { - return uerror.IllegalArgumentError + return uerror.ErrIllegalArgument } } } else { switch p { - case UCHAR_NUMERIC_VALUE: + case UCharNumericValue: val, err := strconv.ParseFloat(value, 64) if err != nil { - return uerror.IllegalArgumentError + return uerror.ErrIllegalArgument } - incl, err := GetInclusionsForProperty(p) + incl, err := getInclusionsForProperty(p) if err != nil { return err } @@ -344,23 +340,23 @@ func ApplyPropertyAlias(u *uset.UnicodeSet, prop, value string) error { return uchar.NumericValue(ch) == val }) return nil - case UCHAR_NAME: + case UCharName: // Must munge name, since u_charFromName() does not do // 'loose' matching. 
charName := mungeCharName(value) - ch := unames.CharForName(unames.U_EXTENDED_CHAR_NAME, charName) + ch := unames.CharForName(unames.ExtendedCharName, charName) if ch < 0 { - return uerror.IllegalArgumentError + return uerror.ErrIllegalArgument } u.Clear() u.AddRune(ch) return nil - case UCHAR_AGE: + case UCharAge: // Must munge name, since u_versionFromString() does not do // 'loose' matching. charName := mungeCharName(value) version := uchar.VersionFromString(charName) - incl, err := GetInclusionsForProperty(p) + incl, err := getInclusionsForProperty(p) if err != nil { return err } @@ -368,44 +364,44 @@ func ApplyPropertyAlias(u *uset.UnicodeSet, prop, value string) error { return uchar.CharAge(ch) == version }) return nil - case UCHAR_SCRIPT_EXTENSIONS: - v = GetPropertyValueEnum(UCHAR_SCRIPT, value) + case UCharScriptExtensions: + v = getPropertyValueEnum(UCharScript, value) if v == -1 { - return uerror.IllegalArgumentError + return uerror.ErrIllegalArgument } default: // p is a non-binary, non-enumerated property that we // don't support (yet). - return uerror.IllegalArgumentError + return uerror.ErrIllegalArgument } } } else { // value is empty. Interpret as General Category, Script, or // Binary property. 
- p = UCHAR_GENERAL_CATEGORY_MASK - v = GetPropertyValueEnum(p, prop) + p = UCharGeneralCategoryMask + v = getPropertyValueEnum(p, prop) if v == -1 { - p = UCHAR_SCRIPT - v = GetPropertyValueEnum(p, prop) + p = UCharScript + v = getPropertyValueEnum(p, prop) if v == -1 { - p = GetPropertyEnum(prop) - if p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT { + p = getPropertyEnum(prop) + if p >= UCharBinaryStart && p < uCharBinaryLimit { v = 1 - } else if 0 == ComparePropertyNames("ANY", prop) { + } else if 0 == comparePropertyNames("ANY", prop) { u.Clear() - u.AddRuneRange(uset.MIN_VALUE, uset.MAX_VALUE) + u.AddRuneRange(uset.MinValue, uset.MaxValue) return nil - } else if 0 == ComparePropertyNames("ASCII", prop) { + } else if 0 == comparePropertyNames("ASCII", prop) { u.Clear() u.AddRuneRange(0, 0x7F) return nil - } else if 0 == ComparePropertyNames("Assigned", prop) { + } else if 0 == comparePropertyNames("Assigned", prop) { // [:Assigned:]=[:^Cn:] - p = UCHAR_GENERAL_CATEGORY_MASK - v = int32(uchar.U_GC_CN_MASK) + p = UCharGeneralCategoryMask + v = int32(uchar.GcCnMask) invert = true } else { - return uerror.IllegalArgumentError + return uerror.ErrIllegalArgument } } } @@ -421,14 +417,14 @@ func ApplyPropertyAlias(u *uset.UnicodeSet, prop, value string) error { return nil } -func AddULayoutPropertyStarts(src PropertySource, u *uset.UnicodeSet) { +func AddULayoutPropertyStarts(src propertySource, u *uset.UnicodeSet) { var trie *utrie.UcpTrie switch src { - case UPROPS_SRC_INPC: + case srcInpc: trie = ulayout.InpcTrie() - case UPROPS_SRC_INSC: + case srcInsc: trie = ulayout.InscTrie() - case UPROPS_SRC_VO: + case srcVo: trie = ulayout.VoTrie() default: panic("unreachable") @@ -437,7 +433,7 @@ func AddULayoutPropertyStarts(src PropertySource, u *uset.UnicodeSet) { // Add the start code point of each same-value range of the trie. 
var start, end rune for { - end, _ = trie.GetRange(start, utrie.UCPMAP_RANGE_NORMAL, 0, nil) + end, _ = trie.GetRange(start, utrie.UcpMapRangeNormal, 0, nil) if end < 0 { break } @@ -448,7 +444,7 @@ func AddULayoutPropertyStarts(src PropertySource, u *uset.UnicodeSet) { func AddCategory(u *uset.UnicodeSet, mask uint32) error { set := uset.New() - err := ApplyIntPropertyValue(set, UCHAR_GENERAL_CATEGORY_MASK, int32(mask)) + err := ApplyIntPropertyValue(set, UCharGeneralCategoryMask, int32(mask)) if err != nil { return err } @@ -461,8 +457,8 @@ func NewUnicodeSetFomPattern(pattern string, flags uset.USet) (*uset.UnicodeSet, if err := ApplyPropertyPattern(u, pattern); err != nil { return nil, err } - if flags&uset.USET_CASE_INSENSITIVE != 0 { - u.CloseOver(uset.USET_CASE_INSENSITIVE) + if flags&uset.CaseInsensitive != 0 { + u.CloseOver(uset.CaseInsensitive) } return u, nil } diff --git a/go/mysql/icuregex/internal/uprops/uprops.go b/go/mysql/icuregex/internal/uprops/uprops.go index 37608b989d4..ddf0989b5d8 100644 --- a/go/mysql/icuregex/internal/uprops/uprops.go +++ b/go/mysql/icuregex/internal/uprops/uprops.go @@ -35,14 +35,14 @@ var pnames struct { byteTrie []uint8 } -func readData(bytes *udata.Bytes) error { - const ( - IX_VALUE_MAPS_OFFSET = 0 - IX_BYTE_TRIES_OFFSET = 1 - IX_NAME_GROUPS_OFFSET = 2 - IX_RESERVED3_OFFSET = 3 - ) +const ( + ixValueMapsOffset = 0 + ixByteTriesOffset = 1 + ixNameGroupsOffset = 2 + ixReserved3Offset = 3 +) +func readData(bytes *udata.Bytes) error { err := bytes.ReadHeader(func(info *udata.DataInfo) bool { return info.DataFormat[0] == 0x70 && info.DataFormat[1] == 0x6e && @@ -66,14 +66,14 @@ func readData(bytes *udata.Bytes) error { indexes[i] = bytes.Int32() } - offset := indexes[IX_VALUE_MAPS_OFFSET] - nextOffset := indexes[IX_BYTE_TRIES_OFFSET] + offset := indexes[ixValueMapsOffset] + nextOffset := indexes[ixByteTriesOffset] numInts := (nextOffset - offset) / 4 pnames.valueMaps = bytes.Uint32Slice(numInts) offset = nextOffset - 
nextOffset = indexes[IX_NAME_GROUPS_OFFSET] + nextOffset = indexes[ixNameGroupsOffset] numBytes := nextOffset - offset pnames.byteTrie = bytes.Uint8Slice(numBytes) @@ -87,74 +87,72 @@ func init() { } } -func (prop Property) Source() PropertySource { - if prop < UCHAR_BINARY_START { - return UPROPS_SRC_NONE /* undefined */ - } else if prop < UCHAR_BINARY_LIMIT { +func (prop Property) source() propertySource { + if prop < UCharBinaryStart { + return srcNone /* undefined */ + } else if prop < uCharBinaryLimit { bprop := binProps[prop] if bprop.mask != 0 { - return UPROPS_SRC_PROPSVEC - } else { - return bprop.column + return srcPropsvec } - } else if prop < UCHAR_INT_START { - return UPROPS_SRC_NONE /* undefined */ - } else if prop < UCHAR_INT_LIMIT { - iprop := intProps[prop-UCHAR_INT_START] + return bprop.column + } else if prop < UCharIntStart { + return srcNone /* undefined */ + } else if prop < uCharIntLimit { + iprop := intProps[prop-UCharIntStart] if iprop.mask != 0 { - return UPROPS_SRC_PROPSVEC - } else { - return iprop.column + return srcPropsvec } - } else if prop < UCHAR_STRING_START { + return iprop.column + } else if prop < UCharStringStart { switch prop { - case UCHAR_GENERAL_CATEGORY_MASK, - UCHAR_NUMERIC_VALUE: - return UPROPS_SRC_CHAR + case UCharGeneralCategoryMask, + UCharNumericValue: + return srcChar default: - return UPROPS_SRC_NONE + return srcNone } - } else if prop < UCHAR_STRING_LIMIT { + } else if prop < uCharStringLimit { switch prop { - case UCHAR_AGE: - return UPROPS_SRC_PROPSVEC - - case UCHAR_BIDI_MIRRORING_GLYPH: - return UPROPS_SRC_BIDI - - case UCHAR_CASE_FOLDING, - UCHAR_LOWERCASE_MAPPING, - UCHAR_SIMPLE_CASE_FOLDING, - UCHAR_SIMPLE_LOWERCASE_MAPPING, - UCHAR_SIMPLE_TITLECASE_MAPPING, - UCHAR_SIMPLE_UPPERCASE_MAPPING, - UCHAR_TITLECASE_MAPPING, - UCHAR_UPPERCASE_MAPPING: - return UPROPS_SRC_CASE + case UCharAge: + return srcPropsvec + + case UCharBidiMirroringGlyph: + return srcBidi + + case UCharCaseFolding, + 
UCharLowercaseMapping, + UCharSimpleCaseFolding, + UCharSimpleLowercaseMapping, + UcharSimpleTitlecaseMapping, + UCharSimpleUppercaseMapping, + UCharTitlecaseMapping, + UCharUppercaseMapping: + return srcCase /* UCHAR_ISO_COMMENT, UCHAR_UNICODE_1_NAME (deprecated) */ - case UCHAR_NAME: - return UPROPS_SRC_NAMES + case UCharName: + return srcNames default: - return UPROPS_SRC_NONE + return srcNone } } else { switch prop { - case UCHAR_SCRIPT_EXTENSIONS: - return UPROPS_SRC_PROPSVEC + case UCharScriptExtensions: + return srcPropsvec default: - return UPROPS_SRC_NONE /* undefined */ + return srcNone /* undefined */ } } } -func GetPropertyEnum(alias string) Property { +func getPropertyEnum(alias string) Property { return Property(getPropertyOrValueEnum(0, alias)) } -func GetPropertyValueEnum(prop Property, alias string) int32 { +func getPropertyValueEnum(prop Property, alias string) int32 { valueMapIdx := findProperty(prop) if valueMapIdx == 0 { return -1 @@ -194,7 +192,7 @@ func getPropertyOrValueEnum(offset int32, alias string) int32 { return -1 } -func ComparePropertyNames(name1, name2 string) int { +func comparePropertyNames(name1, name2 string) int { next := func(s string) (byte, string) { for len(s) > 0 && (s[0] == 0x2d || s[0] == 0x5f || s[0] == 0x20 || (0x09 <= s[0] && s[0] <= 0x0d)) { s = s[1:] @@ -226,9 +224,9 @@ func ComparePropertyNames(name1, name2 string) int { } } -func GetIntPropertyValue(c rune, which Property) int32 { - if which < UCHAR_INT_START { - if UCHAR_BINARY_START <= which && which < UCHAR_BINARY_LIMIT { +func getIntPropertyValue(c rune, which Property) int32 { + if which < UCharIntStart { + if UCharBinaryStart <= which && which < uCharBinaryLimit { prop := binProps[which] if prop.contains == nil { return 0 @@ -238,47 +236,32 @@ func GetIntPropertyValue(c rune, which Property) int32 { } return 0 } - } else if which < UCHAR_INT_LIMIT { - iprop := intProps[which-UCHAR_INT_START] + } else if which < uCharIntLimit { + iprop := 
intProps[which-UCharIntStart] return iprop.getValue(iprop, c, which) - } else if which == UCHAR_GENERAL_CATEGORY_MASK { - return int32(U_MASK(uchar.CharType(c))) + } else if which == UCharGeneralCategoryMask { + return int32(uMask(uchar.CharType(c))) } return 0 // undefined } -const ( - UPROPS_SCRIPT_X_MASK = 0x00f000ff - UPROPS_SCRIPT_X_SHIFT = 22 - - UPROPS_SCRIPT_HIGH_MASK = 0x00300000 - UPROPS_SCRIPT_HIGH_SHIFT = 12 - UPROPS_MAX_SCRIPT = 0x3ff - - UPROPS_SCRIPT_LOW_MASK = 0x000000ff - - UPROPS_SCRIPT_X_WITH_COMMON = 0x400000 - UPROPS_SCRIPT_X_WITH_INHERITED = 0x800000 - UPROPS_SCRIPT_X_WITH_OTHER = 0xc00000 -) - func mergeScriptCodeOrIndex(scriptX uint32) uint32 { - return ((scriptX & UPROPS_SCRIPT_HIGH_MASK) >> UPROPS_SCRIPT_HIGH_SHIFT) | - (scriptX & UPROPS_SCRIPT_LOW_MASK) + return ((scriptX & scriptHighMask) >> scriptHighShift) | + (scriptX & scriptLowMask) } -func GetScript(c rune) int32 { +func script(c rune) int32 { if c > 0x10ffff { return -1 } - scriptX := uchar.GetUnicodeProperties(c, 0) & UPROPS_SCRIPT_X_MASK + scriptX := uchar.GetUnicodeProperties(c, 0) & scriptXMask codeOrIndex := mergeScriptCodeOrIndex(scriptX) - if scriptX < UPROPS_SCRIPT_X_WITH_COMMON { + if scriptX < scriptXWithCommon { return int32(codeOrIndex) - } else if scriptX < UPROPS_SCRIPT_X_WITH_INHERITED { + } else if scriptX < scriptXWithInherited { return 0 - } else if scriptX < UPROPS_SCRIPT_X_WITH_OTHER { + } else if scriptX < scriptXWithOther { return 1 } else { return int32(uchar.ScriptExtension(codeOrIndex)) diff --git a/go/mysql/icuregex/internal/uprops/uprops_binary.go b/go/mysql/icuregex/internal/uprops/uprops_binary.go index 0d6981abde7..855da92b3b6 100644 --- a/go/mysql/icuregex/internal/uprops/uprops_binary.go +++ b/go/mysql/icuregex/internal/uprops/uprops_binary.go @@ -31,21 +31,21 @@ import ( "vitess.io/vitess/go/mysql/icuregex/internal/uchar" ) -type BinaryProperty struct { - column PropertySource +type binaryProperty struct { + column propertySource mask uint32 - 
contains func(prop *BinaryProperty, c rune, which Property) bool + contains func(prop *binaryProperty, c rune, which Property) bool } -func U_MASK[T constraints.Integer](x T) uint32 { +func uMask[T constraints.Integer](x T) uint32 { return 1 << x } -func defaultContains(prop *BinaryProperty, c rune, _ Property) bool { +func defaultContains(prop *binaryProperty, c rune, _ Property) bool { return (uchar.GetUnicodeProperties(c, int(prop.column)) & prop.mask) != 0 } -var binProps = [UCHAR_BINARY_LIMIT]*BinaryProperty{ +var binProps = [uCharBinaryLimit]*binaryProperty{ /* * column and mask values for binary properties from u_getUnicodeProperties(). * Must be in order of corresponding UProperty, @@ -56,86 +56,86 @@ var binProps = [UCHAR_BINARY_LIMIT]*BinaryProperty{ * * See also https://unicode-org.github.io/icu/userguide/strings/properties.html */ - {1, U_MASK(UPROPS_ALPHABETIC), defaultContains}, - {1, U_MASK(UPROPS_ASCII_HEX_DIGIT), defaultContains}, - {UPROPS_SRC_BIDI, 0, isBidiControl}, - {UPROPS_SRC_BIDI, 0, isMirrored}, - {1, U_MASK(UPROPS_DASH), defaultContains}, - {1, U_MASK(UPROPS_DEFAULT_IGNORABLE_CODE_POINT), defaultContains}, - {1, U_MASK(UPROPS_DEPRECATED), defaultContains}, - {1, U_MASK(UPROPS_DIACRITIC), defaultContains}, - {1, U_MASK(UPROPS_EXTENDER), defaultContains}, - {UPROPS_SRC_NFC, 0, hasFullCompositionExclusion}, - {1, U_MASK(UPROPS_GRAPHEME_BASE), defaultContains}, - {1, U_MASK(UPROPS_GRAPHEME_EXTEND), defaultContains}, - {1, U_MASK(UPROPS_GRAPHEME_LINK), defaultContains}, - {1, U_MASK(UPROPS_HEX_DIGIT), defaultContains}, - {1, U_MASK(UPROPS_HYPHEN), defaultContains}, - {1, U_MASK(UPROPS_ID_CONTINUE), defaultContains}, - {1, U_MASK(UPROPS_ID_START), defaultContains}, - {1, U_MASK(UPROPS_IDEOGRAPHIC), defaultContains}, - {1, U_MASK(UPROPS_IDS_BINARY_OPERATOR), defaultContains}, - {1, U_MASK(UPROPS_IDS_TRINARY_OPERATOR), defaultContains}, - {UPROPS_SRC_BIDI, 0, isJoinControl}, - {1, U_MASK(UPROPS_LOGICAL_ORDER_EXCEPTION), defaultContains}, - 
{UPROPS_SRC_CASE, 0, caseBinaryPropertyContains}, // UCHAR_LOWERCASE - {1, U_MASK(UPROPS_MATH), defaultContains}, - {1, U_MASK(UPROPS_NONCHARACTER_CODE_POINT), defaultContains}, - {1, U_MASK(UPROPS_QUOTATION_MARK), defaultContains}, - {1, U_MASK(UPROPS_RADICAL), defaultContains}, - {UPROPS_SRC_CASE, 0, caseBinaryPropertyContains}, // UCHAR_SOFT_DOTTED - {1, U_MASK(UPROPS_TERMINAL_PUNCTUATION), defaultContains}, - {1, U_MASK(UPROPS_UNIFIED_IDEOGRAPH), defaultContains}, - {UPROPS_SRC_CASE, 0, caseBinaryPropertyContains}, // UCHAR_UPPERCASE - {1, U_MASK(UPROPS_WHITE_SPACE), defaultContains}, - {1, U_MASK(UPROPS_XID_CONTINUE), defaultContains}, - {1, U_MASK(UPROPS_XID_START), defaultContains}, - {UPROPS_SRC_CASE, 0, caseBinaryPropertyContains}, // UCHAR_CASE_SENSITIVE - {1, U_MASK(UPROPS_S_TERM), defaultContains}, - {1, U_MASK(UPROPS_VARIATION_SELECTOR), defaultContains}, - {UPROPS_SRC_NFC, 0, isNormInert}, // UCHAR_NFD_INERT - {UPROPS_SRC_NFKC, 0, isNormInert}, // UCHAR_NFKD_INERT - {UPROPS_SRC_NFC, 0, isNormInert}, // UCHAR_NFC_INERT - {UPROPS_SRC_NFKC, 0, isNormInert}, // UCHAR_NFKC_INERT - {UPROPS_SRC_NFC_CANON_ITER, 0, nil}, // Segment_Starter is currently unsupported - {1, U_MASK(UPROPS_PATTERN_SYNTAX), defaultContains}, - {1, U_MASK(UPROPS_PATTERN_WHITE_SPACE), defaultContains}, - {UPROPS_SRC_CHAR_AND_PROPSVEC, 0, isPOSIX_alnum}, - {UPROPS_SRC_CHAR, 0, isPOSIX_blank}, - {UPROPS_SRC_CHAR, 0, isPOSIX_graph}, - {UPROPS_SRC_CHAR, 0, isPOSIX_print}, - {UPROPS_SRC_CHAR, 0, isPOSIX_xdigit}, - {UPROPS_SRC_CASE, 0, caseBinaryPropertyContains}, // UCHAR_CASED - {UPROPS_SRC_CASE, 0, caseBinaryPropertyContains}, // UCHAR_CASE_IGNORABLE - {UPROPS_SRC_CASE, 0, caseBinaryPropertyContains}, // UCHAR_CHANGES_WHEN_LOWERCASED - {UPROPS_SRC_CASE, 0, caseBinaryPropertyContains}, // UCHAR_CHANGES_WHEN_UPPERCASED - {UPROPS_SRC_CASE, 0, caseBinaryPropertyContains}, // UCHAR_CHANGES_WHEN_TITLECASED - {UPROPS_SRC_CASE_AND_NORM, 0, changesWhenCasefolded}, - {UPROPS_SRC_CASE, 0, 
caseBinaryPropertyContains}, // UCHAR_CHANGES_WHEN_CASEMAPPED - {UPROPS_SRC_NFKC_CF, 0, nil}, // Changes_When_NFKC_Casefolded is currently unsupported - {2, U_MASK(UPROPS_2_EMOJI), defaultContains}, - {2, U_MASK(UPROPS_2_EMOJI_PRESENTATION), defaultContains}, - {2, U_MASK(UPROPS_2_EMOJI_MODIFIER), defaultContains}, - {2, U_MASK(UPROPS_2_EMOJI_MODIFIER_BASE), defaultContains}, - {2, U_MASK(UPROPS_2_EMOJI_COMPONENT), defaultContains}, + {1, uMask(pAlphabetic), defaultContains}, + {1, uMask(pASCIIHexDigit), defaultContains}, + {srcBidi, 0, isBidiControl}, + {srcBidi, 0, isMirrored}, + {1, uMask(pDash), defaultContains}, + {1, uMask(pDefaultIgnorableCodePoint), defaultContains}, + {1, uMask(pDeprecated), defaultContains}, + {1, uMask(pDiacritic), defaultContains}, + {1, uMask(pExtender), defaultContains}, + {srcNfc, 0, hasFullCompositionExclusion}, + {1, uMask(pGraphemeBase), defaultContains}, + {1, uMask(pGraphemeExtend), defaultContains}, + {1, uMask(pGraphemeLink), defaultContains}, + {1, uMask(pHexDigit), defaultContains}, + {1, uMask(pHyphen), defaultContains}, + {1, uMask(pIDContinue), defaultContains}, + {1, uMask(pIDStart), defaultContains}, + {1, uMask(pIdeographic), defaultContains}, + {1, uMask(pIdsBinaryOperator), defaultContains}, + {1, uMask(pIdsTrinaryOperator), defaultContains}, + {srcBidi, 0, isJoinControl}, + {1, uMask(pLogicalOrderException), defaultContains}, + {srcCase, 0, caseBinaryPropertyContains}, // UCHAR_LOWERCASE + {1, uMask(pMath), defaultContains}, + {1, uMask(pNoncharacterCodePoint), defaultContains}, + {1, uMask(pQuotationMark), defaultContains}, + {1, uMask(pRadical), defaultContains}, + {srcCase, 0, caseBinaryPropertyContains}, // UCHAR_SOFT_DOTTED + {1, uMask(pTerminalPunctuation), defaultContains}, + {1, uMask(pUnifiedIdeograph), defaultContains}, + {srcCase, 0, caseBinaryPropertyContains}, // UCHAR_UPPERCASE + {1, uMask(pWhiteSpace), defaultContains}, + {1, uMask(pXidContinue), defaultContains}, + {1, uMask(pXidStart), 
defaultContains}, + {srcCase, 0, caseBinaryPropertyContains}, // UCHAR_CASE_SENSITIVE + {1, uMask(pSTerm), defaultContains}, + {1, uMask(pVariationSelector), defaultContains}, + {srcNfc, 0, isNormInert}, // UCHAR_NFD_INERT + {srcNfkc, 0, isNormInert}, // UCHAR_NFKD_INERT + {srcNfc, 0, isNormInert}, // UCHAR_NFC_INERT + {srcNfkc, 0, isNormInert}, // UCHAR_NFKC_INERT + {srcNfcCanonIter, 0, nil}, // Segment_Starter is currently unsupported + {1, uMask(pPatternSyntax), defaultContains}, + {1, uMask(pPatternWhiteSpace), defaultContains}, + {srcCharAndPropsvec, 0, isPOSIXAlnum}, + {srcChar, 0, isPOSIXBlank}, + {srcChar, 0, isPOSIXGraph}, + {srcChar, 0, isPOSIXPrint}, + {srcChar, 0, isPOSIXXdigit}, + {srcCase, 0, caseBinaryPropertyContains}, // UCHAR_CASED + {srcCase, 0, caseBinaryPropertyContains}, // UCHAR_CASE_IGNORABLE + {srcCase, 0, caseBinaryPropertyContains}, // UCHAR_CHANGES_WHEN_LOWERCASED + {srcCase, 0, caseBinaryPropertyContains}, // UCHAR_CHANGES_WHEN_UPPERCASED + {srcCase, 0, caseBinaryPropertyContains}, // UCHAR_CHANGES_WHEN_TITLECASED + {srcCaseAndNorm, 0, changesWhenCasefolded}, + {srcCase, 0, caseBinaryPropertyContains}, // UCHAR_CHANGES_WHEN_CASEMAPPED + {srcNfkcCf, 0, nil}, // Changes_When_NFKC_Casefolded is currently unsupported + {2, uMask(p2Emoji), defaultContains}, + {2, uMask(p2EmojiPresentation), defaultContains}, + {2, uMask(p2EmojiModifier), defaultContains}, + {2, uMask(p2EmojiModifierBase), defaultContains}, + {2, uMask(p2EmojiComponent), defaultContains}, {2, 0, isRegionalIndicator}, - {1, U_MASK(UPROPS_PREPENDED_CONCATENATION_MARK), defaultContains}, - {2, U_MASK(UPROPS_2_EXTENDED_PICTOGRAPHIC), defaultContains}, + {1, uMask(pPrependedConcatenationMark), defaultContains}, + {2, uMask(p2ExtendedPictographic), defaultContains}, } -func isBidiControl(prop *BinaryProperty, c rune, which Property) bool { +func isBidiControl(_ *binaryProperty, c rune, _ Property) bool { return ubidi.IsBidiControl(c) } -func isMirrored(prop *BinaryProperty, c rune, 
which Property) bool { +func isMirrored(_ *binaryProperty, c rune, _ Property) bool { return ubidi.IsMirrored(c) } -func isRegionalIndicator(prop *BinaryProperty, c rune, which Property) bool { +func isRegionalIndicator(_ *binaryProperty, c rune, _ Property) bool { return 0x1F1E6 <= c && c <= 0x1F1FF } -func changesWhenCasefolded(prop *BinaryProperty, c rune, which Property) bool { +func changesWhenCasefolded(_ *binaryProperty, c rune, _ Property) bool { if c < 0 { return false } @@ -148,53 +148,53 @@ func changesWhenCasefolded(prop *BinaryProperty, c rune, which Property) bool { return !slices.Equal(nfd, folded) } -func isPOSIX_xdigit(prop *BinaryProperty, c rune, which Property) bool { +func isPOSIXXdigit(_ *binaryProperty, c rune, _ Property) bool { return uchar.IsXDigit(c) } -func isPOSIX_print(prop *BinaryProperty, c rune, which Property) bool { +func isPOSIXPrint(_ *binaryProperty, c rune, _ Property) bool { return uchar.IsPOSIXPrint(c) } -func isPOSIX_graph(prop *BinaryProperty, c rune, which Property) bool { +func isPOSIXGraph(_ *binaryProperty, c rune, _ Property) bool { return uchar.IsGraphPOSIX(c) } -func isPOSIX_blank(prop *BinaryProperty, c rune, which Property) bool { +func isPOSIXBlank(_ *binaryProperty, c rune, _ Property) bool { return uchar.IsBlank(c) } -func isPOSIX_alnum(prop *BinaryProperty, c rune, which Property) bool { - return (uchar.GetUnicodeProperties(c, 1)&U_MASK(UPROPS_ALPHABETIC)) != 0 || uchar.IsDigit(c) +func isPOSIXAlnum(_ *binaryProperty, c rune, _ Property) bool { + return (uchar.GetUnicodeProperties(c, 1)&uMask(pAlphabetic)) != 0 || uchar.IsDigit(c) } -func isJoinControl(prop *BinaryProperty, c rune, which Property) bool { +func isJoinControl(_ *binaryProperty, c rune, _ Property) bool { return ubidi.IsJoinControl(c) } -func hasFullCompositionExclusion(prop *BinaryProperty, c rune, which Property) bool { +func hasFullCompositionExclusion(_ *binaryProperty, c rune, _ Property) bool { impl := normalizer.Nfc() - return 
impl.IsCompNo(impl.GetNorm16(c)) + return impl.IsCompNo(c) } -func caseBinaryPropertyContains(prop *BinaryProperty, c rune, which Property) bool { +func caseBinaryPropertyContains(_ *binaryProperty, c rune, which Property) bool { return HasBinaryPropertyUcase(c, which) } func HasBinaryPropertyUcase(c rune, which Property) bool { /* case mapping properties */ switch which { - case UCHAR_LOWERCASE: - return ucase.UCASE_LOWER == ucase.GetType(c) - case UCHAR_UPPERCASE: - return ucase.UCASE_UPPER == ucase.GetType(c) - case UCHAR_SOFT_DOTTED: + case UCharLowercase: + return ucase.Lower == ucase.GetType(c) + case UCharUppercase: + return ucase.Upper == ucase.GetType(c) + case UCharSoftDotted: return ucase.IsSoftDotted(c) - case UCHAR_CASE_SENSITIVE: + case UCharCaseSensitive: return ucase.IsCaseSensitive(c) - case UCHAR_CASED: - return ucase.UCASE_NONE != ucase.GetType(c) - case UCHAR_CASE_IGNORABLE: + case UCharCased: + return ucase.None != ucase.GetType(c) + case UCharCaseIgnorable: return (ucase.GetTypeOrIgnorable(c) >> 2) != 0 /* * Note: The following Changes_When_Xyz are defined as testing whether @@ -208,27 +208,27 @@ func HasBinaryPropertyUcase(c rune, which Property) bool { * and the property starts set needs to be the union of the * start sets for normalization and case mappings. 
*/ - case UCHAR_CHANGES_WHEN_LOWERCASED: + case UCharChangesWhenLowercased: return ucase.ToFullLower(c) >= 0 - case UCHAR_CHANGES_WHEN_UPPERCASED: + case UCharChangesWhenUppercased: return ucase.ToFullUpper(c) >= 0 - case UCHAR_CHANGES_WHEN_TITLECASED: + case UCharChangesWhenTitlecased: return ucase.ToFullTitle(c) >= 0 /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */ - case UCHAR_CHANGES_WHEN_CASEMAPPED: + case UCharChangesWhenCasemapped: return ucase.ToFullLower(c) >= 0 || ucase.ToFullUpper(c) >= 0 || ucase.ToFullTitle(c) >= 0 default: return false } } -func isNormInert(prop *BinaryProperty, c rune, which Property) bool { - mode := normalizer.UNormalizationMode(int32(which) - int32(UCHAR_NFD_INERT) + int32(normalizer.UNORM_NFD)) +func isNormInert(_ *binaryProperty, c rune, which Property) bool { + mode := normalizer.Mode(int32(which) - int32(UCharNfdInert) + int32(normalizer.NormNfd)) return normalizer.IsInert(c, mode) } func HasBinaryProperty(c rune, which Property) bool { - if which < UCHAR_BINARY_START || UCHAR_BINARY_LIMIT <= which { + if which < UCharBinaryStart || uCharBinaryLimit <= which { return false } prop := binProps[which] diff --git a/go/mysql/icuregex/internal/uprops/uprops_int.go b/go/mysql/icuregex/internal/uprops/uprops_int.go index 9c89e260d73..3e62d31184f 100644 --- a/go/mysql/icuregex/internal/uprops/uprops_int.go +++ b/go/mysql/icuregex/internal/uprops/uprops_int.go @@ -28,61 +28,38 @@ import ( "vitess.io/vitess/go/mysql/icuregex/internal/ulayout" ) -type IntPropertyGetValue func(prop *IntProperty, c rune, which Property) int32 +type intPropertyGetValue func(prop *intProperty, c rune, which Property) int32 -type IntProperty struct { - column PropertySource +type intProperty struct { + column propertySource mask uint32 shift int32 - getValue IntPropertyGetValue + getValue intPropertyGetValue } const ( - UPROPS_BLOCK_MASK = 0x0001ff00 - UPROPS_BLOCK_SHIFT = 8 + blockMask = 0x0001ff00 + blockShift = 8 - UPROPS_EA_MASK = 0x000e0000 - 
UPROPS_EA_SHIFT = 17 + eaMask = 0x000e0000 + eaShift = 17 - UPROPS_LB_MASK = 0x03f00000 - UPROPS_LB_SHIFT = 20 + lbMask = 0x03f00000 + lbShift = 20 - UPROPS_SB_MASK = 0x000f8000 - UPROPS_SB_SHIFT = 15 + sbMask = 0x000f8000 + sbShift = 15 - UPROPS_WB_MASK = 0x00007c00 - UPROPS_WB_SHIFT = 10 + wbMask = 0x00007c00 + wbShift = 10 - UPROPS_GCB_MASK = 0x000003e0 - UPROPS_GCB_SHIFT = 5 + gcbMask = 0x000003e0 + gcbShift = 5 - UPROPS_DT_MASK = 0x0000001f + dtMask = 0x0000001f ) -type NormalizationCheckResult int32 - -const ( - /** - * The input string is not in the normalization form. - * @stable ICU 2.0 - */ - UNORM_NO NormalizationCheckResult = iota - /** - * The input string is in the normalization form. - * @stable ICU 2.0 - */ - UNORM_YES - /** - * The input string may or may not be in the normalization form. - * This value is only returned for composition forms like NFC and FCC, - * when a backward-combining character is found for which the surrounding text - * would have to be analyzed further. - * @stable ICU 2.0 - */ - UNORM_MAYBE -) - -type NumericType int32 +type numericType int32 /** * Numeric Type constants. @@ -97,17 +74,17 @@ const ( * U_NT_ */ - U_NT_NONE NumericType = iota /*[None]*/ - U_NT_DECIMAL /*[de]*/ - U_NT_DIGIT /*[di]*/ - U_NT_NUMERIC /*[nu]*/ + ntNone numericType = iota /*[None]*/ + ntDecimal /*[de]*/ + ntDigit /*[di]*/ + ntNumeric /*[nu]*/ /** * One more than the highest normal UNumericType value. * The highest value is available via u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE). * * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. 
*/ - U_NT_COUNT + ntCount ) /** @@ -117,7 +94,7 @@ const ( * @stable ICU 2.6 */ -type HangunSyllableType int32 +type hangunSyllableType int32 const ( /* @@ -126,22 +103,22 @@ const ( * U_HST_ */ - U_HST_NOT_APPLICABLE HangunSyllableType = iota /*[NA]*/ - U_HST_LEADING_JAMO /*[L]*/ - U_HST_VOWEL_JAMO /*[V]*/ - U_HST_TRAILING_JAMO /*[T]*/ - U_HST_LV_SYLLABLE /*[LV]*/ - U_HST_LVT_SYLLABLE /*[LVT]*/ + hstNotApplicable hangunSyllableType = iota /*[NA]*/ + hstLeadingJamo /*[L]*/ + hstVowelJamo /*[V]*/ + hstTrailingJamo /*[T]*/ + hstLvSyllable /*[LV]*/ + hstLvtSyllable /*[LVT]*/ /** * One more than the highest normal UHangulSyllableType value. * The highest value is available via u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE). * * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. */ - U_HST_COUNT + hstCount ) -var intProps = [UCHAR_INT_LIMIT - UCHAR_INT_START]*IntProperty{ +var intProps = [uCharIntLimit - UCharIntStart]*intProperty{ /* * column, mask and shift values for int-value properties from u_getUnicodeProperties(). * Must be in order of corresponding UProperty, @@ -150,139 +127,139 @@ var intProps = [UCHAR_INT_LIMIT - UCHAR_INT_START]*IntProperty{ * Properties with mask==0 are handled in code. * For them, column is the UPropertySource value. 
*/ - {UPROPS_SRC_BIDI, 0, 0, getBiDiClass}, - {0, UPROPS_BLOCK_MASK, UPROPS_BLOCK_SHIFT, defaultGetValue}, - {UPROPS_SRC_NFC, 0, 0xff, getCombiningClass}, - {2, UPROPS_DT_MASK, 0, defaultGetValue}, - {0, UPROPS_EA_MASK, UPROPS_EA_SHIFT, defaultGetValue}, - {UPROPS_SRC_CHAR, 0, uchar.U_CHAR_CATEGORY_COUNT - 1, getGeneralCategory}, - {UPROPS_SRC_BIDI, 0, 0, getJoiningGroup}, - {UPROPS_SRC_BIDI, 0, 0, getJoiningType}, - {2, UPROPS_LB_MASK, UPROPS_LB_SHIFT, defaultGetValue}, - {UPROPS_SRC_CHAR, 0, int32(U_NT_COUNT - 1), getNumericType}, - {UPROPS_SRC_PROPSVEC, 0, 0, getScript}, - {UPROPS_SRC_PROPSVEC, 0, int32(U_HST_COUNT - 1), getHangulSyllableType}, + {srcBidi, 0, 0, getBiDiClass}, + {0, blockMask, blockShift, defaultGetValue}, + {srcNfc, 0, 0xff, getCombiningClass}, + {2, dtMask, 0, defaultGetValue}, + {0, eaMask, eaShift, defaultGetValue}, + {srcChar, 0, int32(uchar.CharCategoryCount - 1), getGeneralCategory}, + {srcBidi, 0, 0, getJoiningGroup}, + {srcBidi, 0, 0, getJoiningType}, + {2, lbMask, lbShift, defaultGetValue}, + {srcChar, 0, int32(ntCount - 1), getNumericType}, + {srcPropsvec, 0, 0, getScript}, + {srcPropsvec, 0, int32(hstCount - 1), getHangulSyllableType}, // UCHAR_NFD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes" - {UPROPS_SRC_NFC, 0, int32(UNORM_YES), getNormQuickCheck}, + {srcNfc, 0, int32(normalizer.Yes), getNormQuickCheck}, // UCHAR_NFKD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes" - {UPROPS_SRC_NFKC, 0, int32(UNORM_YES), getNormQuickCheck}, + {srcNfkc, 0, int32(normalizer.Yes), getNormQuickCheck}, // UCHAR_NFC_QUICK_CHECK: max=2=MAYBE - {UPROPS_SRC_NFC, 0, int32(UNORM_MAYBE), getNormQuickCheck}, + {srcNfc, 0, int32(normalizer.Maybe), getNormQuickCheck}, // UCHAR_NFKC_QUICK_CHECK: max=2=MAYBE - {UPROPS_SRC_NFKC, 0, int32(UNORM_MAYBE), getNormQuickCheck}, - {UPROPS_SRC_NFC, 0, 0xff, getLeadCombiningClass}, - {UPROPS_SRC_NFC, 0, 0xff, getTrailCombiningClass}, - {2, UPROPS_GCB_MASK, UPROPS_GCB_SHIFT, defaultGetValue}, - {2, 
UPROPS_SB_MASK, UPROPS_SB_SHIFT, defaultGetValue}, - {2, UPROPS_WB_MASK, UPROPS_WB_SHIFT, defaultGetValue}, - {UPROPS_SRC_BIDI, 0, 0, getBiDiPairedBracketType}, - {UPROPS_SRC_INPC, 0, 0, getInPC}, - {UPROPS_SRC_INSC, 0, 0, getInSC}, - {UPROPS_SRC_VO, 0, 0, getVo}, + {srcNfkc, 0, int32(normalizer.Maybe), getNormQuickCheck}, + {srcNfc, 0, 0xff, getLeadCombiningClass}, + {srcNfc, 0, 0xff, getTrailCombiningClass}, + {2, gcbMask, gcbShift, defaultGetValue}, + {2, sbMask, sbShift, defaultGetValue}, + {2, wbMask, wbShift, defaultGetValue}, + {srcBidi, 0, 0, getBiDiPairedBracketType}, + {srcInpc, 0, 0, getInPC}, + {srcInsc, 0, 0, getInSC}, + {srcVo, 0, 0, getVo}, } -func getVo(prop *IntProperty, c rune, which Property) int32 { +func getVo(_ *intProperty, c rune, _ Property) int32 { return int32(ulayout.VoTrie().Get(c)) } -func getInSC(prop *IntProperty, c rune, which Property) int32 { +func getInSC(_ *intProperty, c rune, _ Property) int32 { return int32(ulayout.InscTrie().Get(c)) } -func getInPC(prop *IntProperty, c rune, which Property) int32 { +func getInPC(_ *intProperty, c rune, _ Property) int32 { return int32(ulayout.InpcTrie().Get(c)) } -func getBiDiPairedBracketType(prop *IntProperty, c rune, which Property) int32 { +func getBiDiPairedBracketType(_ *intProperty, c rune, _ Property) int32 { return int32(ubidi.PairedBracketType(c)) } -func getTrailCombiningClass(prop *IntProperty, c rune, which Property) int32 { +func getTrailCombiningClass(_ *intProperty, c rune, _ Property) int32 { return int32(normalizer.Nfc().GetFCD16(c) & 0xff) } -func getLeadCombiningClass(prop *IntProperty, c rune, which Property) int32 { - return int32(normalizer.Nfc().GetFCD16(c) >> 8) +func getLeadCombiningClass(_ *intProperty, c rune, _ Property) int32 { + val := int32(normalizer.Nfc().GetFCD16(c) >> 8) + return val } -func getNormQuickCheck(prop *IntProperty, c rune, which Property) int32 { - return int32(normalizer.QuickCheck(c, 
normalizer.UNormalizationMode(int32(which)-int32(UCHAR_NFD_QUICK_CHECK)+int32(normalizer.UNORM_NFD)))) +func getNormQuickCheck(_ *intProperty, c rune, which Property) int32 { + return int32(normalizer.QuickCheck(c, normalizer.Mode(int32(which)-int32(UCharNfdQuickCheck)+int32(normalizer.NormNfd)))) } /* * Map some of the Grapheme Cluster Break values to Hangul Syllable Types. * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break. */ -var gcbToHst = []HangunSyllableType{ - U_HST_NOT_APPLICABLE, /* U_GCB_OTHER */ - U_HST_NOT_APPLICABLE, /* U_GCB_CONTROL */ - U_HST_NOT_APPLICABLE, /* U_GCB_CR */ - U_HST_NOT_APPLICABLE, /* U_GCB_EXTEND */ - U_HST_LEADING_JAMO, /* U_GCB_L */ - U_HST_NOT_APPLICABLE, /* U_GCB_LF */ - U_HST_LV_SYLLABLE, /* U_GCB_LV */ - U_HST_LVT_SYLLABLE, /* U_GCB_LVT */ - U_HST_TRAILING_JAMO, /* U_GCB_T */ - U_HST_VOWEL_JAMO, /* U_GCB_V */ +var gcbToHst = []hangunSyllableType{ + hstNotApplicable, /* U_GCB_OTHER */ + hstNotApplicable, /* U_GCB_CONTROL */ + hstNotApplicable, /* U_GCB_CR */ + hstNotApplicable, /* U_GCB_EXTEND */ + hstLeadingJamo, /* U_GCB_L */ + hstNotApplicable, /* U_GCB_LF */ + hstLvSyllable, /* U_GCB_LV */ + hstLvtSyllable, /* U_GCB_LVT */ + hstTrailingJamo, /* U_GCB_T */ + hstVowelJamo, /* U_GCB_V */ /* * Omit GCB values beyond what we need for hst. * The code below checks for the array length. 
*/ } -func getHangulSyllableType(prop *IntProperty, c rune, which Property) int32 { +func getHangulSyllableType(_ *intProperty, c rune, _ Property) int32 { /* see comments on gcbToHst[] above */ - gcb := (int32(uchar.GetUnicodeProperties(c, 2)) & UPROPS_GCB_MASK) >> UPROPS_GCB_SHIFT + gcb := (int32(uchar.GetUnicodeProperties(c, 2)) & gcbMask) >> gcbShift if gcb < int32(len(gcbToHst)) { return int32(gcbToHst[gcb]) - } else { - return int32(U_HST_NOT_APPLICABLE) } + return int32(hstNotApplicable) } -func getScript(_ *IntProperty, c rune, _ Property) int32 { - return GetScript(c) +func getScript(_ *intProperty, c rune, _ Property) int32 { + return script(c) } -func getNumericType(prop *IntProperty, c rune, which Property) int32 { +func getNumericType(_ *intProperty, c rune, _ Property) int32 { ntv := uchar.NumericTypeValue(c) return int32(ntvGetType(ntv)) } -func getJoiningType(prop *IntProperty, c rune, which Property) int32 { - return int32(ubidi.JoiningType(c)) +func getJoiningType(_ *intProperty, c rune, _ Property) int32 { + return int32(ubidi.JoinType(c)) } -func getJoiningGroup(prop *IntProperty, c rune, which Property) int32 { - return int32(ubidi.JoiningGroup(c)) +func getJoiningGroup(_ *intProperty, c rune, _ Property) int32 { + return int32(ubidi.JoinGroup(c)) } -func getGeneralCategory(prop *IntProperty, c rune, which Property) int32 { +func getGeneralCategory(_ *intProperty, c rune, _ Property) int32 { return int32(uchar.CharType(c)) } -func getCombiningClass(prop *IntProperty, c rune, which Property) int32 { +func getCombiningClass(_ *intProperty, c rune, _ Property) int32 { return int32(normalizer.Nfc().CombiningClass(c)) } -func defaultGetValue(prop *IntProperty, c rune, which Property) int32 { +func defaultGetValue(prop *intProperty, c rune, _ Property) int32 { return int32(uchar.GetUnicodeProperties(c, int(prop.column))&prop.mask) >> prop.shift } -func getBiDiClass(prop *IntProperty, c rune, which Property) int32 { +func getBiDiClass(_ *intProperty, 
c rune, _ Property) int32 { return int32(ubidi.Class(c)) } -func ntvGetType(ntv uint16) NumericType { +func ntvGetType(ntv uint16) numericType { switch { - case ntv == uchar.UPROPS_NTV_NONE: - return U_NT_NONE - case ntv < uchar.UPROPS_NTV_DIGIT_START: - return U_NT_DECIMAL - case ntv < uchar.UPROPS_NTV_NUMERIC_START: - return U_NT_DIGIT + case ntv == uchar.UPropsNtvNone: + return ntNone + case ntv < uchar.UPropsNtvDigitStart: + return ntDecimal + case ntv < uchar.UPropsNtvNumericStart: + return ntDigit default: - return U_NT_NUMERIC + return ntNumeric } } diff --git a/go/mysql/icuregex/internal/uprops/uscript.go b/go/mysql/icuregex/internal/uprops/uscript.go index 98b3275dd1b..8a4423849df 100644 --- a/go/mysql/icuregex/internal/uprops/uscript.go +++ b/go/mysql/icuregex/internal/uprops/uscript.go @@ -50,7 +50,7 @@ import "vitess.io/vitess/go/mysql/icuregex/internal/uchar" * * @stable ICU 2.2 */ -type UScriptCode int32 +type code int32 /* * Note: UScriptCode constants and their ISO script code comments @@ -61,436 +61,436 @@ type UScriptCode int32 const ( /** @stable ICU 2.2 */ - USCRIPT_INVALID_CODE UScriptCode = -1 + invalidCode code = -1 /** @stable ICU 2.2 */ - USCRIPT_COMMON UScriptCode = 0 /* Zyyy */ + common code = 0 /* Zyyy */ /** @stable ICU 2.2 */ - USCRIPT_INHERITED UScriptCode = 1 /* Zinh */ /* "Code for inherited script", for non-spacing combining marks; also Qaai */ + inherited code = 1 /* Zinh */ /* "Code for inherited script", for non-spacing combining marks; also Qaai */ /** @stable ICU 2.2 */ - USCRIPT_ARABIC UScriptCode = 2 /* Arab */ + arabic code = 2 /* Arab */ /** @stable ICU 2.2 */ - USCRIPT_ARMENIAN UScriptCode = 3 /* Armn */ + armenian code = 3 /* Armn */ /** @stable ICU 2.2 */ - USCRIPT_BENGALI UScriptCode = 4 /* Beng */ + bengali code = 4 /* Beng */ /** @stable ICU 2.2 */ - USCRIPT_BOPOMOFO UScriptCode = 5 /* Bopo */ + bopomofo code = 5 /* Bopo */ /** @stable ICU 2.2 */ - USCRIPT_CHEROKEE UScriptCode = 6 /* Cher */ + cherokee code = 6 /* 
Cher */ /** @stable ICU 2.2 */ - USCRIPT_COPTIC UScriptCode = 7 /* Copt */ + coptic code = 7 /* Copt */ /** @stable ICU 2.2 */ - USCRIPT_CYRILLIC UScriptCode = 8 /* Cyrl */ + cyrillic code = 8 /* Cyrl */ /** @stable ICU 2.2 */ - USCRIPT_DESERET UScriptCode = 9 /* Dsrt */ + deseret code = 9 /* Dsrt */ /** @stable ICU 2.2 */ - USCRIPT_DEVANAGARI UScriptCode = 10 /* Deva */ + devanagari code = 10 /* Deva */ /** @stable ICU 2.2 */ - USCRIPT_ETHIOPIC UScriptCode = 11 /* Ethi */ + ethiopic code = 11 /* Ethi */ /** @stable ICU 2.2 */ - USCRIPT_GEORGIAN UScriptCode = 12 /* Geor */ + georgian code = 12 /* Geor */ /** @stable ICU 2.2 */ - USCRIPT_GOTHIC UScriptCode = 13 /* Goth */ + gothic code = 13 /* Goth */ /** @stable ICU 2.2 */ - USCRIPT_GREEK UScriptCode = 14 /* Grek */ + greek code = 14 /* Grek */ /** @stable ICU 2.2 */ - USCRIPT_GUJARATI UScriptCode = 15 /* Gujr */ + gujarati code = 15 /* Gujr */ /** @stable ICU 2.2 */ - USCRIPT_GURMUKHI UScriptCode = 16 /* Guru */ + gurmukhi code = 16 /* Guru */ /** @stable ICU 2.2 */ - USCRIPT_HAN UScriptCode = 17 /* Hani */ + han code = 17 /* Hani */ /** @stable ICU 2.2 */ - USCRIPT_HANGUL UScriptCode = 18 /* Hang */ + hangul code = 18 /* Hang */ /** @stable ICU 2.2 */ - USCRIPT_HEBREW UScriptCode = 19 /* Hebr */ + hebrew code = 19 /* Hebr */ /** @stable ICU 2.2 */ - USCRIPT_HIRAGANA UScriptCode = 20 /* Hira */ + hiragana code = 20 /* Hira */ /** @stable ICU 2.2 */ - USCRIPT_KANNADA UScriptCode = 21 /* Knda */ + kannada code = 21 /* Knda */ /** @stable ICU 2.2 */ - USCRIPT_KATAKANA UScriptCode = 22 /* Kana */ + katakana code = 22 /* Kana */ /** @stable ICU 2.2 */ - USCRIPT_KHMER UScriptCode = 23 /* Khmr */ + khmer code = 23 /* Khmr */ /** @stable ICU 2.2 */ - USCRIPT_LAO UScriptCode = 24 /* Laoo */ + lao code = 24 /* Laoo */ /** @stable ICU 2.2 */ - USCRIPT_LATIN UScriptCode = 25 /* Latn */ + latin code = 25 /* Latn */ /** @stable ICU 2.2 */ - USCRIPT_MALAYALAM UScriptCode = 26 /* Mlym */ + malayalam code = 26 /* Mlym */ /** 
@stable ICU 2.2 */ - USCRIPT_MONGOLIAN UScriptCode = 27 /* Mong */ + mongolian code = 27 /* Mong */ /** @stable ICU 2.2 */ - USCRIPT_MYANMAR UScriptCode = 28 /* Mymr */ + myanmar code = 28 /* Mymr */ /** @stable ICU 2.2 */ - USCRIPT_OGHAM UScriptCode = 29 /* Ogam */ + ogham code = 29 /* Ogam */ /** @stable ICU 2.2 */ - USCRIPT_OLD_ITALIC UScriptCode = 30 /* Ital */ + oldItalic code = 30 /* Ital */ /** @stable ICU 2.2 */ - USCRIPT_ORIYA UScriptCode = 31 /* Orya */ + oriya code = 31 /* Orya */ /** @stable ICU 2.2 */ - USCRIPT_RUNIC UScriptCode = 32 /* Runr */ + runic code = 32 /* Runr */ /** @stable ICU 2.2 */ - USCRIPT_SINHALA UScriptCode = 33 /* Sinh */ + sinhala code = 33 /* Sinh */ /** @stable ICU 2.2 */ - USCRIPT_SYRIAC UScriptCode = 34 /* Syrc */ + syriac code = 34 /* Syrc */ /** @stable ICU 2.2 */ - USCRIPT_TAMIL UScriptCode = 35 /* Taml */ + tamil code = 35 /* Taml */ /** @stable ICU 2.2 */ - USCRIPT_TELUGU UScriptCode = 36 /* Telu */ + telugu code = 36 /* Telu */ /** @stable ICU 2.2 */ - USCRIPT_THAANA UScriptCode = 37 /* Thaa */ + thaana code = 37 /* Thaa */ /** @stable ICU 2.2 */ - USCRIPT_THAI UScriptCode = 38 /* Thai */ + thai code = 38 /* Thai */ /** @stable ICU 2.2 */ - USCRIPT_TIBETAN UScriptCode = 39 /* Tibt */ + tibetan code = 39 /* Tibt */ /** Canadian_Aboriginal script. @stable ICU 2.6 */ - USCRIPT_CANADIAN_ABORIGINAL UScriptCode = 40 /* Cans */ + canadianAboriginal code = 40 /* Cans */ /** Canadian_Aboriginal script (alias). 
@stable ICU 2.2 */ - USCRIPT_UCAS UScriptCode = USCRIPT_CANADIAN_ABORIGINAL + ucas code = canadianAboriginal /** @stable ICU 2.2 */ - USCRIPT_YI UScriptCode = 41 /* Yiii */ + yi code = 41 /* Yiii */ /* New scripts in Unicode 3.2 */ /** @stable ICU 2.2 */ - USCRIPT_TAGALOG UScriptCode = 42 /* Tglg */ + tagalog code = 42 /* Tglg */ /** @stable ICU 2.2 */ - USCRIPT_HANUNOO UScriptCode = 43 /* Hano */ + hanunoo code = 43 /* Hano */ /** @stable ICU 2.2 */ - USCRIPT_BUHID UScriptCode = 44 /* Buhd */ + buhid code = 44 /* Buhd */ /** @stable ICU 2.2 */ - USCRIPT_TAGBANWA UScriptCode = 45 /* Tagb */ + tagbanwa code = 45 /* Tagb */ /* New scripts in Unicode 4 */ /** @stable ICU 2.6 */ - USCRIPT_BRAILLE UScriptCode = 46 /* Brai */ + braille code = 46 /* Brai */ /** @stable ICU 2.6 */ - USCRIPT_CYPRIOT UScriptCode = 47 /* Cprt */ + cypriot code = 47 /* Cprt */ /** @stable ICU 2.6 */ - USCRIPT_LIMBU UScriptCode = 48 /* Limb */ + limbu code = 48 /* Limb */ /** @stable ICU 2.6 */ - USCRIPT_LINEAR_B UScriptCode = 49 /* Linb */ + linearB code = 49 /* Linb */ /** @stable ICU 2.6 */ - USCRIPT_OSMANYA UScriptCode = 50 /* Osma */ + osmanya code = 50 /* Osma */ /** @stable ICU 2.6 */ - USCRIPT_SHAVIAN UScriptCode = 51 /* Shaw */ + shavian code = 51 /* Shaw */ /** @stable ICU 2.6 */ - USCRIPT_TAI_LE UScriptCode = 52 /* Tale */ + taiLe code = 52 /* Tale */ /** @stable ICU 2.6 */ - USCRIPT_UGARITIC UScriptCode = 53 /* Ugar */ + ugaratic code = 53 /* Ugar */ /** New script code in Unicode 4.0.1 @stable ICU 3.0 */ - USCRIPT_KATAKANA_OR_HIRAGANA = 54 /*Hrkt */ + katakanaOrHiragana = 54 /*Hrkt */ /* New scripts in Unicode 4.1 */ /** @stable ICU 3.4 */ - USCRIPT_BUGINESE UScriptCode = 55 /* Bugi */ + buginese code = 55 /* Bugi */ /** @stable ICU 3.4 */ - USCRIPT_GLAGOLITIC UScriptCode = 56 /* Glag */ + glagolitic code = 56 /* Glag */ /** @stable ICU 3.4 */ - USCRIPT_KHAROSHTHI UScriptCode = 57 /* Khar */ + kharoshthi code = 57 /* Khar */ /** @stable ICU 3.4 */ - USCRIPT_SYLOTI_NAGRI UScriptCode 
= 58 /* Sylo */ + sylotiNagri code = 58 /* Sylo */ /** @stable ICU 3.4 */ - USCRIPT_NEW_TAI_LUE UScriptCode = 59 /* Talu */ + newTaiLue code = 59 /* Talu */ /** @stable ICU 3.4 */ - USCRIPT_TIFINAGH UScriptCode = 60 /* Tfng */ + tifinagh code = 60 /* Tfng */ /** @stable ICU 3.4 */ - USCRIPT_OLD_PERSIAN UScriptCode = 61 /* Xpeo */ + oldPersian code = 61 /* Xpeo */ /* New script codes from Unicode and ISO 15924 */ /** @stable ICU 3.6 */ - USCRIPT_BALINESE UScriptCode = 62 /* Bali */ + balinese code = 62 /* Bali */ /** @stable ICU 3.6 */ - USCRIPT_BATAK UScriptCode = 63 /* Batk */ + batak code = 63 /* Batk */ /** @stable ICU 3.6 */ - USCRIPT_BLISSYMBOLS UScriptCode = 64 /* Blis */ + blissymbols code = 64 /* Blis */ /** @stable ICU 3.6 */ - USCRIPT_BRAHMI UScriptCode = 65 /* Brah */ + brahmi code = 65 /* Brah */ /** @stable ICU 3.6 */ - USCRIPT_CHAM UScriptCode = 66 /* Cham */ + cham code = 66 /* Cham */ /** @stable ICU 3.6 */ - USCRIPT_CIRTH UScriptCode = 67 /* Cirt */ + cirth code = 67 /* Cirt */ /** @stable ICU 3.6 */ - USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC UScriptCode = 68 /* Cyrs */ + oldChurchSlavonicCyrillic code = 68 /* Cyrs */ /** @stable ICU 3.6 */ - USCRIPT_DEMOTIC_EGYPTIAN UScriptCode = 69 /* Egyd */ + demoticEgyptian code = 69 /* Egyd */ /** @stable ICU 3.6 */ - USCRIPT_HIERATIC_EGYPTIAN UScriptCode = 70 /* Egyh */ + hieraticEgyptian code = 70 /* Egyh */ /** @stable ICU 3.6 */ - USCRIPT_EGYPTIAN_HIEROGLYPHS UScriptCode = 71 /* Egyp */ + egyptianHieroglyphs code = 71 /* Egyp */ /** @stable ICU 3.6 */ - USCRIPT_KHUTSURI UScriptCode = 72 /* Geok */ + khutsuri code = 72 /* Geok */ /** @stable ICU 3.6 */ - USCRIPT_SIMPLIFIED_HAN UScriptCode = 73 /* Hans */ + simplfiedHan code = 73 /* Hans */ /** @stable ICU 3.6 */ - USCRIPT_TRADITIONAL_HAN UScriptCode = 74 /* Hant */ + traditionalHan code = 74 /* Hant */ /** @stable ICU 3.6 */ - USCRIPT_PAHAWH_HMONG UScriptCode = 75 /* Hmng */ + pahawhHmong code = 75 /* Hmng */ /** @stable ICU 3.6 */ - USCRIPT_OLD_HUNGARIAN 
UScriptCode = 76 /* Hung */ + oldHungarian code = 76 /* Hung */ /** @stable ICU 3.6 */ - USCRIPT_HARAPPAN_INDUS UScriptCode = 77 /* Inds */ + harappanIndus code = 77 /* Inds */ /** @stable ICU 3.6 */ - USCRIPT_JAVANESE UScriptCode = 78 /* Java */ + javanese code = 78 /* Java */ /** @stable ICU 3.6 */ - USCRIPT_KAYAH_LI UScriptCode = 79 /* Kali */ + kayahLi code = 79 /* Kali */ /** @stable ICU 3.6 */ - USCRIPT_LATIN_FRAKTUR UScriptCode = 80 /* Latf */ + latinFraktur code = 80 /* Latf */ /** @stable ICU 3.6 */ - USCRIPT_LATIN_GAELIC UScriptCode = 81 /* Latg */ + latinGaelic code = 81 /* Latg */ /** @stable ICU 3.6 */ - USCRIPT_LEPCHA UScriptCode = 82 /* Lepc */ + lepcha code = 82 /* Lepc */ /** @stable ICU 3.6 */ - USCRIPT_LINEAR_A UScriptCode = 83 /* Lina */ + linearA code = 83 /* Lina */ /** @stable ICU 4.6 */ - USCRIPT_MANDAIC UScriptCode = 84 /* Mand */ + mandaic code = 84 /* Mand */ /** @stable ICU 3.6 */ - USCRIPT_MANDAEAN UScriptCode = USCRIPT_MANDAIC + mandaean code = mandaic /** @stable ICU 3.6 */ - USCRIPT_MAYAN_HIEROGLYPHS UScriptCode = 85 /* Maya */ + mayanHieroglyphs code = 85 /* Maya */ /** @stable ICU 4.6 */ - USCRIPT_MEROITIC_HIEROGLYPHS UScriptCode = 86 /* Mero */ + meroiticHieroglyphs code = 86 /* Mero */ /** @stable ICU 3.6 */ - USCRIPT_MEROITIC UScriptCode = USCRIPT_MEROITIC_HIEROGLYPHS + meroitic code = meroiticHieroglyphs /** @stable ICU 3.6 */ - USCRIPT_NKO UScriptCode = 87 /* Nkoo */ + nko code = 87 /* Nkoo */ /** @stable ICU 3.6 */ - USCRIPT_ORKHON UScriptCode = 88 /* Orkh */ + orkhon code = 88 /* Orkh */ /** @stable ICU 3.6 */ - USCRIPT_OLD_PERMIC UScriptCode = 89 /* Perm */ + oldPermic code = 89 /* Perm */ /** @stable ICU 3.6 */ - USCRIPT_PHAGS_PA UScriptCode = 90 /* Phag */ + phagsPa code = 90 /* Phag */ /** @stable ICU 3.6 */ - USCRIPT_PHOENICIAN UScriptCode = 91 /* Phnx */ + phoenician code = 91 /* Phnx */ /** @stable ICU 52 */ - USCRIPT_MIAO UScriptCode = 92 /* Plrd */ + miao code = 92 /* Plrd */ /** @stable ICU 3.6 */ - 
USCRIPT_PHONETIC_POLLARD UScriptCode = USCRIPT_MIAO + phoneticPollard code = miao /** @stable ICU 3.6 */ - USCRIPT_RONGORONGO UScriptCode = 93 /* Roro */ + rongoRongo code = 93 /* Roro */ /** @stable ICU 3.6 */ - USCRIPT_SARATI UScriptCode = 94 /* Sara */ + sarati code = 94 /* Sara */ /** @stable ICU 3.6 */ - USCRIPT_ESTRANGELO_SYRIAC UScriptCode = 95 /* Syre */ + extrangeloSyriac code = 95 /* Syre */ /** @stable ICU 3.6 */ - USCRIPT_WESTERN_SYRIAC UScriptCode = 96 /* Syrj */ + westernSyriac code = 96 /* Syrj */ /** @stable ICU 3.6 */ - USCRIPT_EASTERN_SYRIAC UScriptCode = 97 /* Syrn */ + easternSyriac code = 97 /* Syrn */ /** @stable ICU 3.6 */ - USCRIPT_TENGWAR UScriptCode = 98 /* Teng */ + tengwar code = 98 /* Teng */ /** @stable ICU 3.6 */ - USCRIPT_VAI UScriptCode = 99 /* Vaii */ + vai code = 99 /* Vaii */ /** @stable ICU 3.6 */ - USCRIPT_VISIBLE_SPEECH UScriptCode = 100 /* Visp */ + visibleSpeech code = 100 /* Visp */ /** @stable ICU 3.6 */ - USCRIPT_CUNEIFORM UScriptCode = 101 /* Xsux */ + cuneiform code = 101 /* Xsux */ /** @stable ICU 3.6 */ - USCRIPT_UNWRITTEN_LANGUAGES UScriptCode = 102 /* Zxxx */ + unwrittenLanguages code = 102 /* Zxxx */ /** @stable ICU 3.6 */ - USCRIPT_UNKNOWN UScriptCode = 103 /* Zzzz */ /* Unknown="Code for uncoded script", for unassigned code points */ + unknown code = 103 /* Zzzz */ /* Unknown="Code for uncoded script", for unassigned code points */ /** @stable ICU 3.8 */ - USCRIPT_CARIAN UScriptCode = 104 /* Cari */ + carian code = 104 /* Cari */ /** @stable ICU 3.8 */ - USCRIPT_JAPANESE UScriptCode = 105 /* Jpan */ + japanese code = 105 /* Jpan */ /** @stable ICU 3.8 */ - USCRIPT_LANNA UScriptCode = 106 /* Lana */ + lanna code = 106 /* Lana */ /** @stable ICU 3.8 */ - USCRIPT_LYCIAN UScriptCode = 107 /* Lyci */ + lycian code = 107 /* Lyci */ /** @stable ICU 3.8 */ - USCRIPT_LYDIAN UScriptCode = 108 /* Lydi */ + lydian code = 108 /* Lydi */ /** @stable ICU 3.8 */ - USCRIPT_OL_CHIKI UScriptCode = 109 /* Olck */ + olChiki code = 
109 /* Olck */ /** @stable ICU 3.8 */ - USCRIPT_REJANG UScriptCode = 110 /* Rjng */ + rejang code = 110 /* Rjng */ /** @stable ICU 3.8 */ - USCRIPT_SAURASHTRA UScriptCode = 111 /* Saur */ + saurashtra code = 111 /* Saur */ /** Sutton SignWriting @stable ICU 3.8 */ - USCRIPT_SIGN_WRITING UScriptCode = 112 /* Sgnw */ + signWriting code = 112 /* Sgnw */ /** @stable ICU 3.8 */ - USCRIPT_SUNDANESE UScriptCode = 113 /* Sund */ + sundanese code = 113 /* Sund */ /** @stable ICU 3.8 */ - USCRIPT_MOON UScriptCode = 114 /* Moon */ + moon code = 114 /* Moon */ /** @stable ICU 3.8 */ - USCRIPT_MEITEI_MAYEK UScriptCode = 115 /* Mtei */ + meiteiMayek code = 115 /* Mtei */ /** @stable ICU 4.0 */ - USCRIPT_IMPERIAL_ARAMAIC UScriptCode = 116 /* Armi */ + imperialAramaic code = 116 /* Armi */ /** @stable ICU 4.0 */ - USCRIPT_AVESTAN UScriptCode = 117 /* Avst */ + avestan code = 117 /* Avst */ /** @stable ICU 4.0 */ - USCRIPT_CHAKMA UScriptCode = 118 /* Cakm */ + chakma code = 118 /* Cakm */ /** @stable ICU 4.0 */ - USCRIPT_KOREAN UScriptCode = 119 /* Kore */ + korean code = 119 /* Kore */ /** @stable ICU 4.0 */ - USCRIPT_KAITHI UScriptCode = 120 /* Kthi */ + kaithi code = 120 /* Kthi */ /** @stable ICU 4.0 */ - USCRIPT_MANICHAEAN UScriptCode = 121 /* Mani */ + manichaean code = 121 /* Mani */ /** @stable ICU 4.0 */ - USCRIPT_INSCRIPTIONAL_PAHLAVI UScriptCode = 122 /* Phli */ + inscriptionalPahlavi code = 122 /* Phli */ /** @stable ICU 4.0 */ - USCRIPT_PSALTER_PAHLAVI UScriptCode = 123 /* Phlp */ + psalterPahlavi code = 123 /* Phlp */ /** @stable ICU 4.0 */ - USCRIPT_BOOK_PAHLAVI UScriptCode = 124 /* Phlv */ + bookPahlavi code = 124 /* Phlv */ /** @stable ICU 4.0 */ - USCRIPT_INSCRIPTIONAL_PARTHIAN UScriptCode = 125 /* Prti */ + inscriptionalParthian code = 125 /* Prti */ /** @stable ICU 4.0 */ - USCRIPT_SAMARITAN UScriptCode = 126 /* Samr */ + samaritan code = 126 /* Samr */ /** @stable ICU 4.0 */ - USCRIPT_TAI_VIET UScriptCode = 127 /* Tavt */ + taiViet code = 127 /* Tavt */ /** 
@stable ICU 4.0 */ - USCRIPT_MATHEMATICAL_NOTATION UScriptCode = 128 /* Zmth */ + mathematicalNotation code = 128 /* Zmth */ /** @stable ICU 4.0 */ - USCRIPT_SYMBOLS UScriptCode = 129 /* Zsym */ + symbols code = 129 /* Zsym */ /** @stable ICU 4.4 */ - USCRIPT_BAMUM UScriptCode = 130 /* Bamu */ + bamum code = 130 /* Bamu */ /** @stable ICU 4.4 */ - USCRIPT_LISU UScriptCode = 131 /* Lisu */ + lisu code = 131 /* Lisu */ /** @stable ICU 4.4 */ - USCRIPT_NAKHI_GEBA UScriptCode = 132 /* Nkgb */ + nakhiGeba code = 132 /* Nkgb */ /** @stable ICU 4.4 */ - USCRIPT_OLD_SOUTH_ARABIAN UScriptCode = 133 /* Sarb */ + oldSouthArabian code = 133 /* Sarb */ /** @stable ICU 4.6 */ - USCRIPT_BASSA_VAH UScriptCode = 134 /* Bass */ + bassaVah code = 134 /* Bass */ /** @stable ICU 54 */ - USCRIPT_DUPLOYAN UScriptCode = 135 /* Dupl */ + duployan code = 135 /* Dupl */ /** @stable ICU 4.6 */ - USCRIPT_ELBASAN UScriptCode = 136 /* Elba */ + elbasan code = 136 /* Elba */ /** @stable ICU 4.6 */ - USCRIPT_GRANTHA UScriptCode = 137 /* Gran */ + grantha code = 137 /* Gran */ /** @stable ICU 4.6 */ - USCRIPT_KPELLE UScriptCode = 138 /* Kpel */ + kpelle code = 138 /* Kpel */ /** @stable ICU 4.6 */ - USCRIPT_LOMA UScriptCode = 139 /* Loma */ + loma code = 139 /* Loma */ /** Mende Kikakui @stable ICU 4.6 */ - USCRIPT_MENDE UScriptCode = 140 /* Mend */ + mende code = 140 /* Mend */ /** @stable ICU 4.6 */ - USCRIPT_MEROITIC_CURSIVE UScriptCode = 141 /* Merc */ + meroiticCursive code = 141 /* Merc */ /** @stable ICU 4.6 */ - USCRIPT_OLD_NORTH_ARABIAN UScriptCode = 142 /* Narb */ + oldNorthArabian code = 142 /* Narb */ /** @stable ICU 4.6 */ - USCRIPT_NABATAEAN UScriptCode = 143 /* Nbat */ + nabataean code = 143 /* Nbat */ /** @stable ICU 4.6 */ - USCRIPT_PALMYRENE UScriptCode = 144 /* Palm */ + palmyrene code = 144 /* Palm */ /** @stable ICU 54 */ - USCRIPT_KHUDAWADI UScriptCode = 145 /* Sind */ + khudawadi code = 145 /* Sind */ /** @stable ICU 4.6 */ - USCRIPT_SINDHI UScriptCode = USCRIPT_KHUDAWADI + 
sindhi code = khudawadi /** @stable ICU 4.6 */ - USCRIPT_WARANG_CITI UScriptCode = 146 /* Wara */ + warangCiti code = 146 /* Wara */ /** @stable ICU 4.8 */ - USCRIPT_AFAKA UScriptCode = 147 /* Afak */ + afaka code = 147 /* Afak */ /** @stable ICU 4.8 */ - USCRIPT_JURCHEN UScriptCode = 148 /* Jurc */ + jurchen code = 148 /* Jurc */ /** @stable ICU 4.8 */ - USCRIPT_MRO UScriptCode = 149 /* Mroo */ + mro code = 149 /* Mroo */ /** @stable ICU 4.8 */ - USCRIPT_NUSHU UScriptCode = 150 /* Nshu */ + nushu code = 150 /* Nshu */ /** @stable ICU 4.8 */ - USCRIPT_SHARADA UScriptCode = 151 /* Shrd */ + sharada code = 151 /* Shrd */ /** @stable ICU 4.8 */ - USCRIPT_SORA_SOMPENG UScriptCode = 152 /* Sora */ + soraSompeng code = 152 /* Sora */ /** @stable ICU 4.8 */ - USCRIPT_TAKRI UScriptCode = 153 /* Takr */ + takri code = 153 /* Takr */ /** @stable ICU 4.8 */ - USCRIPT_TANGUT UScriptCode = 154 /* Tang */ + tangut code = 154 /* Tang */ /** @stable ICU 4.8 */ - USCRIPT_WOLEAI UScriptCode = 155 /* Wole */ + woleai code = 155 /* Wole */ /** @stable ICU 49 */ - USCRIPT_ANATOLIAN_HIEROGLYPHS UScriptCode = 156 /* Hluw */ + anatolianHieroglyphs code = 156 /* Hluw */ /** @stable ICU 49 */ - USCRIPT_KHOJKI UScriptCode = 157 /* Khoj */ + khojki code = 157 /* Khoj */ /** @stable ICU 49 */ - USCRIPT_TIRHUTA UScriptCode = 158 /* Tirh */ + tirhuta code = 158 /* Tirh */ /** @stable ICU 52 */ - USCRIPT_CAUCASIAN_ALBANIAN UScriptCode = 159 /* Aghb */ + caucasianAlbanian code = 159 /* Aghb */ /** @stable ICU 52 */ - USCRIPT_MAHAJANI UScriptCode = 160 /* Mahj */ + mahajani code = 160 /* Mahj */ /** @stable ICU 54 */ - USCRIPT_AHOM UScriptCode = 161 /* Ahom */ + ahom code = 161 /* Ahom */ /** @stable ICU 54 */ - USCRIPT_HATRAN UScriptCode = 162 /* Hatr */ + hatran code = 162 /* Hatr */ /** @stable ICU 54 */ - USCRIPT_MODI UScriptCode = 163 /* Modi */ + modi code = 163 /* Modi */ /** @stable ICU 54 */ - USCRIPT_MULTANI UScriptCode = 164 /* Mult */ + multani code = 164 /* Mult */ /** @stable ICU 54 
*/ - USCRIPT_PAU_CIN_HAU UScriptCode = 165 /* Pauc */ + pauCinHau code = 165 /* Pauc */ /** @stable ICU 54 */ - USCRIPT_SIDDHAM UScriptCode = 166 /* Sidd */ + siddham code = 166 /* Sidd */ /** @stable ICU 58 */ - USCRIPT_ADLAM UScriptCode = 167 /* Adlm */ + adlam code = 167 /* Adlm */ /** @stable ICU 58 */ - USCRIPT_BHAIKSUKI UScriptCode = 168 /* Bhks */ + bhaiksuki code = 168 /* Bhks */ /** @stable ICU 58 */ - USCRIPT_MARCHEN UScriptCode = 169 /* Marc */ + marchen code = 169 /* Marc */ /** @stable ICU 58 */ - USCRIPT_NEWA UScriptCode = 170 /* Newa */ + newa code = 170 /* Newa */ /** @stable ICU 58 */ - USCRIPT_OSAGE UScriptCode = 171 /* Osge */ + osage code = 171 /* Osge */ /** @stable ICU 58 */ - USCRIPT_HAN_WITH_BOPOMOFO UScriptCode = 172 /* Hanb */ + hanWithBopomofo code = 172 /* Hanb */ /** @stable ICU 58 */ - USCRIPT_JAMO UScriptCode = 173 /* Jamo */ + jamo code = 173 /* Jamo */ /** @stable ICU 58 */ - USCRIPT_SYMBOLS_EMOJI UScriptCode = 174 /* Zsye */ + symbolsEmoji code = 174 /* Zsye */ /** @stable ICU 60 */ - USCRIPT_MASARAM_GONDI UScriptCode = 175 /* Gonm */ + masaramGondi code = 175 /* Gonm */ /** @stable ICU 60 */ - USCRIPT_SOYOMBO UScriptCode = 176 /* Soyo */ + soyombo code = 176 /* Soyo */ /** @stable ICU 60 */ - USCRIPT_ZANABAZAR_SQUARE UScriptCode = 177 /* Zanb */ + zanabazarSquare code = 177 /* Zanb */ /** @stable ICU 62 */ - USCRIPT_DOGRA UScriptCode = 178 /* Dogr */ + dogra code = 178 /* Dogr */ /** @stable ICU 62 */ - USCRIPT_GUNJALA_GONDI UScriptCode = 179 /* Gong */ + gunjalaGondi code = 179 /* Gong */ /** @stable ICU 62 */ - USCRIPT_MAKASAR UScriptCode = 180 /* Maka */ + makasar code = 180 /* Maka */ /** @stable ICU 62 */ - USCRIPT_MEDEFAIDRIN UScriptCode = 181 /* Medf */ + medefaidrin code = 181 /* Medf */ /** @stable ICU 62 */ - USCRIPT_HANIFI_ROHINGYA UScriptCode = 182 /* Rohg */ + hanifiRohingya code = 182 /* Rohg */ /** @stable ICU 62 */ - USCRIPT_SOGDIAN UScriptCode = 183 /* Sogd */ + sogdian code = 183 /* Sogd */ /** @stable ICU 62 */ 
- USCRIPT_OLD_SOGDIAN UScriptCode = 184 /* Sogo */ + oldSogdian code = 184 /* Sogo */ /** @stable ICU 64 */ - USCRIPT_ELYMAIC UScriptCode = 185 /* Elym */ + elymaic code = 185 /* Elym */ /** @stable ICU 64 */ - USCRIPT_NYIAKENG_PUACHUE_HMONG UScriptCode = 186 /* Hmnp */ + nyiakengPuachueHmong code = 186 /* Hmnp */ /** @stable ICU 64 */ - USCRIPT_NANDINAGARI UScriptCode = 187 /* Nand */ + nandinagari code = 187 /* Nand */ /** @stable ICU 64 */ - USCRIPT_WANCHO UScriptCode = 188 /* Wcho */ + wancho code = 188 /* Wcho */ /** @stable ICU 66 */ - USCRIPT_CHORASMIAN UScriptCode = 189 /* Chrs */ + chorasmian code = 189 /* Chrs */ /** @stable ICU 66 */ - USCRIPT_DIVES_AKURU UScriptCode = 190 /* Diak */ + divesAkuru code = 190 /* Diak */ /** @stable ICU 66 */ - USCRIPT_KHITAN_SMALL_SCRIPT UScriptCode = 191 /* Kits */ + khitanSmallScript code = 191 /* Kits */ /** @stable ICU 66 */ - USCRIPT_YEZIDI UScriptCode = 192 /* Yezi */ + yezedi code = 192 /* Yezi */ ) -func UScriptHasScript(c rune, sc UScriptCode) bool { - scriptX := uchar.GetUnicodeProperties(c, 0) & UPROPS_SCRIPT_X_MASK +func uscriptHasScript(c rune, sc code) bool { + scriptX := uchar.GetUnicodeProperties(c, 0) & scriptXMask codeOrIndex := mergeScriptCodeOrIndex(scriptX) - if scriptX < UPROPS_SCRIPT_X_WITH_COMMON { - return sc == UScriptCode(codeOrIndex) + if scriptX < scriptXWithCommon { + return sc == code(codeOrIndex) } scx := uchar.ScriptExtensions(codeOrIndex) - if scriptX >= UPROPS_SCRIPT_X_WITH_OTHER { + if scriptX >= scriptXWithOther { scx = uchar.ScriptExtensions(uint32(scx[1])) } sc32 := uint32(sc) diff --git a/go/mysql/icuregex/internal/uset/close.go b/go/mysql/icuregex/internal/uset/close.go index 9b59fed8bf3..bd3f9f0f7e3 100644 --- a/go/mysql/icuregex/internal/uset/close.go +++ b/go/mysql/icuregex/internal/uset/close.go @@ -30,7 +30,7 @@ const ( * Ignore white space within patterns unless quoted or escaped. 
* @stable ICU 2.4 */ - USET_IGNORE_SPACE USet = 1 + IgnoreSpace USet = 1 /** * Enable case insensitive matching. E.g., "[ab]" with this flag @@ -58,7 +58,7 @@ const ( * * @stable ICU 2.4 */ - USET_CASE_INSENSITIVE USet = 2 + CaseInsensitive USet = 2 /** * Enable case insensitive matching. E.g., "[ab]" with this flag @@ -68,14 +68,14 @@ const ( * of each existing element in the set. * @stable ICU 3.2 */ - USET_ADD_CASE_MAPPINGS USet = 4 + AddCaseMappings USet = 4 ) func (u *UnicodeSet) CloseOver(attribute USet) { - if attribute&USET_ADD_CASE_MAPPINGS != 0 { + if attribute&AddCaseMappings != 0 { panic("USET_ADD_CASE_MAPPINGS is unsupported") } - if (attribute & USET_CASE_INSENSITIVE) == 0 { + if (attribute & CaseInsensitive) == 0 { return } diff --git a/go/mysql/icuregex/internal/uset/frozen.go b/go/mysql/icuregex/internal/uset/frozen.go index 2b17ae904c8..2703a4f6975 100644 --- a/go/mysql/icuregex/internal/uset/frozen.go +++ b/go/mysql/icuregex/internal/uset/frozen.go @@ -130,9 +130,6 @@ func (f *frozen) findCodePoint(list []rune, c rune, lo, hi int32) int32 { } func (f *frozen) set32x64bits(table *[64]uint32, start, limit int32) { - // U_ASSERT(start < limit) - // U_ASSERT(limit <= 0x800) - lead := start >> 6 // Named for UTF-8 2-byte lead byte with upper 5 bits. trail := start & 0x3f // Named for UTF-8 2-byte trail byte with lower 6 bits. diff --git a/go/mysql/icuregex/internal/uset/pattern.go b/go/mysql/icuregex/internal/uset/pattern.go index 51463b10542..20b44da9c6d 100644 --- a/go/mysql/icuregex/internal/uset/pattern.go +++ b/go/mysql/icuregex/internal/uset/pattern.go @@ -51,7 +51,7 @@ func (u *UnicodeSet) ToPattern(w *strings.Builder, escapeUnprintable bool) { // If the set contains at least 2 intervals and includes both // MIN_VALUE and MAX_VALUE, then the inverse representation will // be more economical. 
- if count > 1 && u.RangeStart(0) == MIN_VALUE && u.RangeEnd(count-1) == MAX_VALUE { + if count > 1 && u.RangeStart(0) == MinValue && u.RangeEnd(count-1) == MaxValue { // Emit the inverse w.WriteByte('^') diff --git a/go/mysql/icuregex/internal/uset/unicode_set.go b/go/mysql/icuregex/internal/uset/unicode_set.go index db6659b1121..3dba317eab2 100644 --- a/go/mysql/icuregex/internal/uset/unicode_set.go +++ b/go/mysql/icuregex/internal/uset/unicode_set.go @@ -28,26 +28,23 @@ import ( ) // HIGH_VALUE > all valid values. 110000 for codepoints -const UNICODESET_HIGH = 0x0110000 +const unicodeSetHigh = 0x0110000 // LOW <= all valid values. ZERO for codepoints -const UNICODESET_LOW = 0x000000 - -/** Max list [0, 1, 2, ..., max code point, HIGH] */ -const MAX_LENGTH = UNICODESET_HIGH + 1 +const unicodeSetLow = 0x000000 const ( /** * Minimum value that can be stored in a UnicodeSet. * @stable ICU 2.4 */ - MIN_VALUE = 0 + MinValue = 0 /** * Maximum value that can be stored in a UnicodeSet. * @stable ICU 2.4 */ - MAX_VALUE = 0x10ffff + MaxValue = 0x10ffff ) type UnicodeSet struct { @@ -58,7 +55,7 @@ type UnicodeSet struct { func New() *UnicodeSet { buf := make([]rune, 1, 25) - buf[0] = UNICODESET_HIGH + buf[0] = unicodeSetHigh return &UnicodeSet{list: buf} } @@ -113,7 +110,7 @@ func (u *UnicodeSet) addbuffer(other []rune, polarity int8) { j++ polarity ^= 2 } else { - if a == UNICODESET_HIGH { + if a == unicodeSetHigh { goto loopEnd } if k > 0 && a <= u.buffer[k-1] { @@ -132,13 +129,13 @@ func (u *UnicodeSet) addbuffer(other []rune, polarity int8) { } case 3: if b <= a { - if a == UNICODESET_HIGH { + if a == unicodeSetHigh { goto loopEnd } u.buffer[k] = a k++ } else { - if b == UNICODESET_HIGH { + if b == unicodeSetHigh { goto loopEnd } u.buffer[k] = b @@ -162,7 +159,7 @@ func (u *UnicodeSet) addbuffer(other []rune, polarity int8) { j++ polarity ^= 2 } else { - if a == UNICODESET_HIGH { + if a == unicodeSetHigh { goto loopEnd } a = u.list[i] @@ -184,7 +181,7 @@ func (u 
*UnicodeSet) addbuffer(other []rune, polarity int8) { i++ polarity ^= 1 } else { - if a == UNICODESET_HIGH { + if a == unicodeSetHigh { goto loopEnd } a = u.list[i] @@ -198,7 +195,7 @@ func (u *UnicodeSet) addbuffer(other []rune, polarity int8) { } loopEnd: - u.buffer[k] = UNICODESET_HIGH + u.buffer[k] = unicodeSetHigh k++ u.list, u.buffer = u.buffer[:k], u.list @@ -212,10 +209,10 @@ func max(a, b rune) rune { } func pinCodePoint(c *rune) rune { - if *c < UNICODESET_LOW { - *c = UNICODESET_LOW - } else if *c > (UNICODESET_HIGH - 1) { - *c = UNICODESET_HIGH - 1 + if *c < unicodeSetLow { + *c = unicodeSetLow + } else if *c > (unicodeSetHigh - 1) { + *c = unicodeSetHigh - 1 } return *c } @@ -250,8 +247,8 @@ func (u *UnicodeSet) AddRune(c rune) { // c is before start of next range u.list[i] = c // if we touched the HIGH mark, then add a new one - if c == (UNICODESET_HIGH - 1) { - u.list = append(u.list, UNICODESET_HIGH) + if c == (unicodeSetHigh - 1) { + u.list = append(u.list, unicodeSetHigh) } if i > 0 && c == u.list[i-1] { // collapse adjacent ranges @@ -300,16 +297,16 @@ func (u *UnicodeSet) AddRuneRange(start, end rune) { if lastLimit == start { // Extend the last range. u.list[len(u.list)-2] = limit - if limit == UNICODESET_HIGH { + if limit == unicodeSetHigh { u.list = u.list[:len(u.list)-1] } } else { u.list[len(u.list)-1] = start - if limit < UNICODESET_HIGH { + if limit < unicodeSetHigh { u.list = append(u.list, limit) - u.list = append(u.list, UNICODESET_HIGH) + u.list = append(u.list, unicodeSetHigh) } else { // limit == UNICODESET_HIGH - u.list = append(u.list, UNICODESET_HIGH) + u.list = append(u.list, unicodeSetHigh) } } return @@ -317,7 +314,7 @@ func (u *UnicodeSet) AddRuneRange(start, end rune) { } // This is slow. Could be much faster using findCodePoint(start) // and modifying the list, dealing with adjacent & overlapping ranges. 
- addRange := [3]rune{start, limit, UNICODESET_HIGH} + addRange := [3]rune{start, limit, unicodeSetHigh} u.addbuffer(addRange[:], 0) } else if start == end { u.AddRune(start) @@ -334,18 +331,18 @@ func (u *UnicodeSet) Complement() { if u.frozen != nil { panic("UnicodeSet is frozen") } - if u.list[0] == UNICODESET_LOW { + if u.list[0] == unicodeSetLow { copy(u.list, u.list[1:]) u.list = u.list[:len(u.list)-1] } else { - u.list = slices.Insert(u.list, 0, UNICODESET_LOW) + u.list = slices.Insert(u.list, 0, unicodeSetLow) } } func (u *UnicodeSet) RemoveRuneRange(start, end rune) { if pinCodePoint(&start) < pinCodePoint(&end) { - range_ := [3]rune{start, end + 1, UNICODESET_HIGH} - u.retain(range_[:], 2) + r := [3]rune{start, end + 1, unicodeSetHigh} + u.retain(r[:], 2) } } @@ -385,7 +382,7 @@ func (u *UnicodeSet) retain(other []rune, polarity int8) { j++ polarity ^= 2 } else { // a == b, take one, drop other - if a == UNICODESET_HIGH { + if a == unicodeSetHigh { goto loop_end } u.buffer[k] = a @@ -411,7 +408,7 @@ func (u *UnicodeSet) retain(other []rune, polarity int8) { j++ polarity ^= 2 } else { // a == b, take one, drop other - if a == UNICODESET_HIGH { + if a == unicodeSetHigh { goto loop_end } u.buffer[k] = a @@ -435,7 +432,7 @@ func (u *UnicodeSet) retain(other []rune, polarity int8) { j++ polarity ^= 2 } else { // a == b, drop both! - if a == UNICODESET_HIGH { + if a == unicodeSetHigh { goto loop_end } a = u.list[i] @@ -457,7 +454,7 @@ func (u *UnicodeSet) retain(other []rune, polarity int8) { i++ polarity ^= 1 } else { // a == b, drop both! 
- if a == UNICODESET_HIGH { + if a == unicodeSetHigh { goto loop_end } a = u.list[i] @@ -471,7 +468,7 @@ func (u *UnicodeSet) retain(other []rune, polarity int8) { } loop_end: - u.buffer[k] = UNICODESET_HIGH // terminate + u.buffer[k] = unicodeSetHigh // terminate k++ u.list, u.buffer = u.buffer[:k], u.list } @@ -481,7 +478,7 @@ func (u *UnicodeSet) Clear() { panic("UnicodeSet is frozen") } u.list = u.list[:1] - u.list[0] = UNICODESET_HIGH + u.list[0] = unicodeSetHigh } func (u *UnicodeSet) Len() (n int) { @@ -543,21 +540,19 @@ func (u *UnicodeSet) ContainsRune(c rune) bool { // All 64 code points with the same bits 15..6 // are either in the set or not. return twoBits != 0 - } else { - // Look up the code point in its 4k block of code points. - return f.containsSlow(u.list, c, f.list4kStarts[lead], f.list4kStarts[lead+1]) } + // Look up the code point in its 4k block of code points. + return f.containsSlow(u.list, c, f.list4kStarts[lead], f.list4kStarts[lead+1]) } else if c <= 0x10ffff { // surrogate or supplementary code point return f.containsSlow(u.list, c, f.list4kStarts[0xd], f.list4kStarts[0x11]) - } else { - // Out-of-range code points get FALSE, consistent with long-standing - // behavior of UnicodeSet::contains(c). - return false } + // Out-of-range code points get FALSE, consistent with long-standing + // behavior of UnicodeSet::contains(c). + return false } - if c >= UNICODESET_HIGH { + if c >= unicodeSetHigh { return false } i := u.findCodePoint(c) diff --git a/go/mysql/icuregex/internal/utrie/ucptrie.go b/go/mysql/icuregex/internal/utrie/ucptrie.go index f7e64107343..74e4eb9b2fa 100644 --- a/go/mysql/icuregex/internal/utrie/ucptrie.go +++ b/go/mysql/icuregex/internal/utrie/ucptrie.go @@ -22,39 +22,40 @@ limitations under the License. 
package utrie import ( + "errors" "fmt" "vitess.io/vitess/go/mysql/icuregex/internal/udata" ) type UcpTrie struct { - Index []uint16 - Data8 []uint8 - Data16 []uint16 - Data32 []uint32 + index []uint16 + data8 []uint8 + data16 []uint16 + data32 []uint32 - IndexLength, DataLength int32 + indexLength, dataLength int32 /** Start of the last range which ends at U+10FFFF. @internal */ - HighStart rune - Shifted12HighStart uint16 + highStart rune + shifted12HighStart uint16 - Type UCPTrieType - ValueWidth UCPTrieValueWidth + typ ucpTrieType + valueWidth ucpTrieValueWidth /** * Internal index-3 null block offset. * Set to an impossibly high value (e.g., 0xffff) if there is no dedicated index-3 null block. * @internal */ - Index3NullOffset uint16 + index3NullOffset uint16 /** * Internal data null block offset, not shifted. * Set to an impossibly high value (e.g., 0xfffff) if there is no dedicated data null block. * @internal */ - DataNullOffset int32 + dataNullOffset int32 - NullValue uint32 + nullValue uint32 } /** @@ -66,7 +67,7 @@ type UcpTrie struct { * @see ucptrie_getType * @stable ICU 63 */ -type UCPTrieType int8 +type ucpTrieType int8 const ( /** @@ -74,17 +75,17 @@ const ( * ucptrie_getType() will return the actual type. * @stable ICU 63 */ - UCPTRIE_TYPE_ANY UCPTrieType = iota - 1 + typeAny ucpTrieType = iota - 1 /** * Fast/simple/larger BMP data structure. Use functions and "fast" macros. * @stable ICU 63 */ - UCPTRIE_TYPE_FAST + typeFast /** * Small/slower BMP data structure. Use functions and "small" macros. * @stable ICU 63 */ - UCPTRIE_TYPE_SMALL + typeSmall ) /** @@ -95,7 +96,7 @@ const ( * @see ucptrie_getValueWidth * @stable ICU 63 */ -type UCPTrieValueWidth int8 +type ucpTrieValueWidth int8 const ( /** @@ -103,133 +104,126 @@ const ( * ucptrie_getValueWidth() will return the actual data value width. 
* @stable ICU 63 */ - UCPTRIE_VALUE_BITS_ANY UCPTrieValueWidth = iota - 1 + valueBitsAny ucpTrieValueWidth = iota - 1 /** * The trie stores 16 bits per data value. * It returns them as unsigned values 0..0xffff=65535. * @stable ICU 63 */ - UCPTRIE_VALUE_BITS_16 + valueBits16 /** * The trie stores 32 bits per data value. * @stable ICU 63 */ - UCPTRIE_VALUE_BITS_32 + valueBits32 /** * The trie stores 8 bits per data value. * It returns them as unsigned values 0..0xff=255. * @stable ICU 63 */ - UCPTRIE_VALUE_BITS_8 + valueBits8 ) -const UCPTRIE_SIG = 0x54726933 -const UCPTRIE_OE_SIG = 0x33697254 +const ucpTrieSig = 0x54726933 +const ucpTrieOESig = 0x33697254 /** * Constants for use with UCPTrieHeader.options. * @internal */ const ( - UCPTRIE_OPTIONS_DATA_LENGTH_MASK = 0xf000 - UCPTRIE_OPTIONS_DATA_NULL_OFFSET_MASK = 0xf00 - UCPTRIE_OPTIONS_RESERVED_MASK = 0x38 - UCPTRIE_OPTIONS_VALUE_BITS_MASK = 7 - /** - * Value for index3NullOffset which indicates that there is no index-3 null block. - * Bit 15 is unused for this value because this bit is used if the index-3 contains - * 18-bit indexes. - */ - UCPTRIE_NO_INDEX3_NULL_OFFSET = 0x7fff - UCPTRIE_NO_DATA_NULL_OFFSET = 0xfffff + optionsDataLengthMask = 0xf000 + optionsDataNullOffsetMask = 0xf00 + optionsReservedMask = 0x38 + optionsValueBitsMask = 7 ) const ( /** @internal */ - UCPTRIE_FAST_SHIFT = 6 + fastShift = 6 /** Number of entries in a data block for code points below the fast limit. 64=0x40 @internal */ - UCPTRIE_FAST_DATA_BLOCK_LENGTH = 1 << UCPTRIE_FAST_SHIFT + fastDataBlockLength = 1 << fastShift /** Mask for getting the lower bits for the in-fast-data-block offset. @internal */ - UCPTRIE_FAST_DATA_MASK = UCPTRIE_FAST_DATA_BLOCK_LENGTH - 1 + fastDataMask = fastDataBlockLength - 1 /** @internal */ - UCPTRIE_SMALL_MAX = 0xfff + smallMax = 0xfff /** * Offset from dataLength (to be subtracted) for fetching the * value returned for out-of-range code points and ill-formed UTF-8/16. 
* @internal */ - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET = 1 + errorValueNegDataOffset = 1 /** * Offset from dataLength (to be subtracted) for fetching the * value returned for code points highStart..U+10FFFF. * @internal */ - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET = 2 + highValueNegDataOffset = 2 ) // Internal constants. const ( /** The length of the BMP index table. 1024=0x400 */ - UCPTRIE_BMP_INDEX_LENGTH = 0x10000 >> UCPTRIE_FAST_SHIFT + bmpIndexLength = 0x10000 >> fastShift - UCPTRIE_SMALL_LIMIT = 0x1000 - UCPTRIE_SMALL_INDEX_LENGTH = UCPTRIE_SMALL_LIMIT >> UCPTRIE_FAST_SHIFT + smallLimit = 0x1000 + smallIndexLength = smallLimit >> fastShift /** Shift size for getting the index-3 table offset. */ - UCPTRIE_SHIFT_3 = 4 + ucpShift3 = 4 /** Shift size for getting the index-2 table offset. */ - UCPTRIE_SHIFT_2 = 5 + UCPTRIE_SHIFT_3 + ucpShift2 = 5 + ucpShift3 /** Shift size for getting the index-1 table offset. */ - UCPTRIE_SHIFT_1 = 5 + UCPTRIE_SHIFT_2 + ucpShift1 = 5 + ucpShift2 /** * Difference between two shift sizes, * for getting an index-2 offset from an index-3 offset. 5=9-4 */ - UCPTRIE_SHIFT_2_3 = UCPTRIE_SHIFT_2 - UCPTRIE_SHIFT_3 + ucpShift2Min3 = ucpShift2 - ucpShift3 /** * Difference between two shift sizes, * for getting an index-1 offset from an index-2 offset. 5=14-9 */ - UCPTRIE_SHIFT_1_2 = UCPTRIE_SHIFT_1 - UCPTRIE_SHIFT_2 + ucpShift1Min2 = ucpShift1 - ucpShift2 /** * Number of index-1 entries for the BMP. (4) * This part of the index-1 table is omitted from the serialized form. */ - UCPTRIE_OMITTED_BMP_INDEX_1_LENGTH = 0x10000 >> UCPTRIE_SHIFT_1 + ucpOmittedBmpIndex1Length = 0x10000 >> ucpShift1 /** Number of entries in an index-2 block. 32=0x20 */ - UCPTRIE_INDEX_2_BLOCK_LENGTH = 1 << UCPTRIE_SHIFT_1_2 + ucpIndex2BlockLength = 1 << ucpShift1Min2 /** Mask for getting the lower bits for the in-index-2-block offset. 
*/ - UCPTRIE_INDEX_2_MASK = UCPTRIE_INDEX_2_BLOCK_LENGTH - 1 + ucpIndex2Mask = ucpIndex2BlockLength - 1 /** Number of code points per index-2 table entry. 512=0x200 */ - UCPTRIE_CP_PER_INDEX_2_ENTRY = 1 << UCPTRIE_SHIFT_2 + ucpCpPerIndex2Entry = 1 << ucpShift2 /** Number of entries in an index-3 block. 32=0x20 */ - UCPTRIE_INDEX_3_BLOCK_LENGTH = 1 << UCPTRIE_SHIFT_2_3 + ucpIndex3BlockLength = 1 << ucpShift2Min3 /** Mask for getting the lower bits for the in-index-3-block offset. */ - UCPTRIE_INDEX_3_MASK = UCPTRIE_INDEX_3_BLOCK_LENGTH - 1 + ucpIndex3Mask = ucpIndex3BlockLength - 1 /** Number of entries in a small data block. 16=0x10 */ - UCPTRIE_SMALL_DATA_BLOCK_LENGTH = 1 << UCPTRIE_SHIFT_3 + ucpSmallDataBlockLength = 1 << ucpShift3 /** Mask for getting the lower bits for the in-small-data-block offset. */ - UCPTRIE_SMALL_DATA_MASK = UCPTRIE_SMALL_DATA_BLOCK_LENGTH - 1 + ucpSmallDataMask = ucpSmallDataBlockLength - 1 ) func UcpTrieFromBytes(bytes *udata.Bytes) (*UcpTrie, error) { - type UcpHeader struct { + type ucpHeader struct { /** "Tri3" in big-endian US-ASCII (0x54726933) */ signature uint32 @@ -262,13 +256,13 @@ func UcpTrieFromBytes(bytes *udata.Bytes) (*UcpTrie, error) { shiftedHighStart uint16 } - var header UcpHeader + var header ucpHeader header.signature = bytes.Uint32() switch header.signature { - case UCPTRIE_SIG: - case UCPTRIE_OE_SIG: - return nil, fmt.Errorf("unsupported: BigEndian encoding") + case ucpTrieSig: + case ucpTrieOESig: + return nil, errors.New("unsupported: BigEndian encoding") default: return nil, fmt.Errorf("invalid signature for UcpTrie: 0x%08x", header.signature) } @@ -281,40 +275,40 @@ func UcpTrieFromBytes(bytes *udata.Bytes) (*UcpTrie, error) { header.shiftedHighStart = bytes.Uint16() typeInt := (header.options >> 6) & 3 - valueWidthInt := header.options & UCPTRIE_OPTIONS_VALUE_BITS_MASK - if typeInt > uint16(UCPTRIE_TYPE_SMALL) || valueWidthInt > uint16(UCPTRIE_VALUE_BITS_8) || - (header.options&UCPTRIE_OPTIONS_RESERVED_MASK) 
!= 0 { - return nil, fmt.Errorf("invalid options for serialized UcpTrie") + valueWidthInt := header.options & optionsValueBitsMask + if typeInt > uint16(typeSmall) || valueWidthInt > uint16(valueBits8) || + (header.options&optionsReservedMask) != 0 { + return nil, errors.New("invalid options for serialized UcpTrie") } - actualType := UCPTrieType(typeInt) - actualValueWidth := UCPTrieValueWidth(valueWidthInt) + actualType := ucpTrieType(typeInt) + actualValueWidth := ucpTrieValueWidth(valueWidthInt) trie := &UcpTrie{ - IndexLength: int32(header.indexLength), - DataLength: int32(((header.options & UCPTRIE_OPTIONS_DATA_LENGTH_MASK) << 4) | header.dataLength), - Index3NullOffset: header.index3NullOffset, - DataNullOffset: int32(((header.options & UCPTRIE_OPTIONS_DATA_NULL_OFFSET_MASK) << 8) | header.dataNullOffset), - HighStart: rune(header.shiftedHighStart) << UCPTRIE_SHIFT_2, - Type: actualType, - ValueWidth: actualValueWidth, + indexLength: int32(header.indexLength), + dataLength: int32(((header.options & optionsDataLengthMask) << 4) | header.dataLength), + index3NullOffset: header.index3NullOffset, + dataNullOffset: int32(((header.options & optionsDataNullOffsetMask) << 8) | header.dataNullOffset), + highStart: rune(header.shiftedHighStart) << ucpShift2, + typ: actualType, + valueWidth: actualValueWidth, } - nullValueOffset := trie.DataNullOffset - if nullValueOffset >= trie.DataLength { - nullValueOffset = trie.DataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET + nullValueOffset := trie.dataNullOffset + if nullValueOffset >= trie.dataLength { + nullValueOffset = trie.dataLength - highValueNegDataOffset } - trie.Shifted12HighStart = uint16((trie.HighStart + 0xfff) >> 12) - trie.Index = bytes.Uint16Slice(int32(header.indexLength)) + trie.shifted12HighStart = uint16((trie.highStart + 0xfff) >> 12) + trie.index = bytes.Uint16Slice(int32(header.indexLength)) switch actualValueWidth { - case UCPTRIE_VALUE_BITS_16: - trie.Data16 = bytes.Uint16Slice(trie.DataLength) - 
trie.NullValue = uint32(trie.Data16[nullValueOffset]) - case UCPTRIE_VALUE_BITS_32: - trie.Data32 = bytes.Uint32Slice(trie.DataLength) - trie.NullValue = trie.Data32[nullValueOffset] - case UCPTRIE_VALUE_BITS_8: - trie.Data8 = bytes.Uint8Slice(trie.DataLength) - trie.NullValue = uint32(trie.Data8[nullValueOffset]) + case valueBits16: + trie.data16 = bytes.Uint16Slice(trie.dataLength) + trie.nullValue = uint32(trie.data16[nullValueOffset]) + case valueBits32: + trie.data32 = bytes.Uint32Slice(trie.dataLength) + trie.nullValue = trie.data32[nullValueOffset] + case valueBits8: + trie.data8 = bytes.Uint8Slice(trie.dataLength) + trie.nullValue = uint32(trie.data8[nullValueOffset]) } return trie, nil @@ -327,10 +321,10 @@ func (t *UcpTrie) Get(c rune) uint32 { dataIndex = c } else { var fastMax rune - if t.Type == UCPTRIE_TYPE_FAST { + if t.typ == typeFast { fastMax = 0xffff } else { - fastMax = UCPTRIE_SMALL_MAX + fastMax = smallMax } dataIndex = t.cpIndex(fastMax, c) } @@ -338,13 +332,13 @@ func (t *UcpTrie) Get(c rune) uint32 { } func (t *UcpTrie) getValue(dataIndex int32) uint32 { - switch t.ValueWidth { - case UCPTRIE_VALUE_BITS_16: - return uint32(t.Data16[dataIndex]) - case UCPTRIE_VALUE_BITS_32: - return t.Data32[dataIndex] - case UCPTRIE_VALUE_BITS_8: - return uint32(t.Data8[dataIndex]) + switch t.valueWidth { + case valueBits16: + return uint32(t.data16[dataIndex]) + case valueBits32: + return t.data32[dataIndex] + case valueBits8: + return uint32(t.data8[dataIndex]) default: // Unreachable if the trie is properly initialized. return 0xffffffff @@ -353,39 +347,39 @@ func (t *UcpTrie) getValue(dataIndex int32) uint32 { /** Internal trie getter for a code point below the fast limit. Returns the data index. 
@internal */ func (t *UcpTrie) fastIndex(c rune) int32 { - return int32(t.Index[c>>UCPTRIE_FAST_SHIFT]) + (c & UCPTRIE_FAST_DATA_MASK) + return int32(t.index[c>>fastShift]) + (c & fastDataMask) } /** Internal trie getter for a code point at or above the fast limit. Returns the data index. @internal */ func (t *UcpTrie) smallIndex(c rune) int32 { - if c >= t.HighStart { - return t.DataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET + if c >= t.highStart { + return t.dataLength - highValueNegDataOffset } return t.internalSmallIndex(c) } func (t *UcpTrie) internalSmallIndex(c rune) int32 { - i1 := c >> UCPTRIE_SHIFT_1 - if t.Type == UCPTRIE_TYPE_FAST { - i1 += UCPTRIE_BMP_INDEX_LENGTH - UCPTRIE_OMITTED_BMP_INDEX_1_LENGTH + i1 := c >> ucpShift1 + if t.typ == typeFast { + i1 += bmpIndexLength - ucpOmittedBmpIndex1Length } else { - i1 += UCPTRIE_SMALL_INDEX_LENGTH + i1 += smallIndexLength } - i3Block := int32(t.Index[int32(t.Index[i1])+((c>>UCPTRIE_SHIFT_2)&UCPTRIE_INDEX_2_MASK)]) - i3 := (c >> UCPTRIE_SHIFT_3) & UCPTRIE_INDEX_3_MASK + i3Block := int32(t.index[int32(t.index[i1])+((c>>ucpShift2)&ucpIndex2Mask)]) + i3 := (c >> ucpShift3) & ucpIndex3Mask var dataBlock int32 if (i3Block & 0x8000) == 0 { // 16-bit indexes - dataBlock = int32(t.Index[i3Block+i3]) + dataBlock = int32(t.index[i3Block+i3]) } else { // 18-bit indexes stored in groups of 9 entries per 8 indexes. 
i3Block = (i3Block & 0x7fff) + (i3 & ^7) + (i3 >> 3) i3 &= 7 - dataBlock = int32(t.Index[i3Block]) << (2 + (2 * i3)) & 0x30000 + dataBlock = int32(t.index[i3Block]) << (2 + (2 * i3)) & 0x30000 i3Block++ - dataBlock |= int32(t.Index[i3Block+i3]) + dataBlock |= int32(t.index[i3Block+i3]) } - return dataBlock + (c & UCPTRIE_SMALL_DATA_MASK) + return dataBlock + (c & ucpSmallDataMask) } /** @@ -400,7 +394,7 @@ func (t *UcpTrie) cpIndex(fastMax, c rune) int32 { if c <= 0x10ffff { return t.smallIndex(c) } - return t.DataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET + return t.dataLength - errorValueNegDataOffset } /** @@ -412,7 +406,7 @@ func (t *UcpTrie) cpIndex(fastMax, c rune) int32 { * @see umutablecptrie_getRange * @stable ICU 63 */ -type UCPMapRangeOption int8 +type UcpMapRangeOption int8 const ( /** @@ -420,7 +414,7 @@ const ( * Most users should use this option. * @stable ICU 63 */ - UCPMAP_RANGE_NORMAL UCPMapRangeOption = iota + UcpMapRangeNormal UcpMapRangeOption = iota /** * ucpmap_getRange() enumerates all same-value ranges as stored in the map, * except that lead surrogates (U+D800..U+DBFF) are treated as having the @@ -436,7 +430,7 @@ const ( * but those values are not to be associated with the lead surrogate code *points*. * @stable ICU 63 */ - UCPMAP_RANGE_FIXED_LEAD_SURROGATES + UcpMapRangeFixedLeadSurrogates /** * ucpmap_getRange() enumerates all same-value ranges as stored in the map, * except that all surrogates (U+D800..U+DFFF) are treated as having the @@ -452,7 +446,7 @@ const ( * but those values are not to be associated with the lead surrogate code *points*. * @stable ICU 63 */ - UCPMAP_RANGE_FIXED_ALL_SURROGATES + UcpMapRangeFixedAllSurrogates ) /** @@ -469,7 +463,7 @@ const ( * @return the modified value * @stable ICU 63 */ -type UCPMapValueFilter func(value uint32) uint32 +type UcpMapValueFilter func(value uint32) uint32 /** * GetRange returns the last code point such that all those from start to there have the same value. 
@@ -507,13 +501,13 @@ type UCPMapValueFilter func(value uint32) uint32 * @return the range end code point, or -1 if start is not a valid code point * @stable ICU 63 */ -func (t *UcpTrie) GetRange(start rune, option UCPMapRangeOption, surrogateValue uint32, filter UCPMapValueFilter) (rune, uint32) { - if option == UCPMAP_RANGE_NORMAL { +func (t *UcpTrie) GetRange(start rune, option UcpMapRangeOption, surrogateValue uint32, filter UcpMapValueFilter) (rune, uint32) { + if option == UcpMapRangeNormal { return t.getRange(start, filter) } var surrEnd rune - if option == UCPMAP_RANGE_FIXED_ALL_SURROGATES { + if option == UcpMapRangeFixedAllSurrogates { surrEnd = 0xdfff } else { surrEnd = 0xdbff @@ -548,27 +542,27 @@ func (t *UcpTrie) GetRange(start rune, option UCPMapRangeOption, surrogateValue return surrEnd, value } -const MAX_UNICODE = 0x10ffff +const maxUnicode = 0x10ffff -func (t *UcpTrie) getRange(start rune, filter UCPMapValueFilter) (rune, uint32) { - if start > MAX_UNICODE { +func (t *UcpTrie) getRange(start rune, filter UcpMapValueFilter) (rune, uint32) { + if start > maxUnicode { return -1, 0 } - if start >= t.HighStart { - di := t.DataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET + if start >= t.highStart { + di := t.dataLength - highValueNegDataOffset value := t.getValue(di) if filter != nil { value = filter(value) } - return MAX_UNICODE, value + return maxUnicode, value } - nullValue := t.NullValue + nullValue := t.nullValue if filter != nil { nullValue = filter(nullValue) } - index := t.Index + index := t.index prevI3Block := int32(-1) prevBlock := int32(-1) @@ -578,50 +572,50 @@ func (t *UcpTrie) getRange(start rune, filter UCPMapValueFilter) (rune, uint32) haveValue := false for { var i3Block, i3, i3BlockLength, dataBlockLength int32 - if c <= 0xffff && (t.Type == UCPTRIE_TYPE_FAST || c <= UCPTRIE_SMALL_MAX) { + if c <= 0xffff && (t.typ == typeFast || c <= smallMax) { i3Block = 0 - i3 = c >> UCPTRIE_FAST_SHIFT - if t.Type == UCPTRIE_TYPE_FAST { - 
i3BlockLength = UCPTRIE_BMP_INDEX_LENGTH + i3 = c >> fastShift + if t.typ == typeFast { + i3BlockLength = bmpIndexLength } else { - i3BlockLength = UCPTRIE_SMALL_INDEX_LENGTH + i3BlockLength = smallIndexLength } - dataBlockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH + dataBlockLength = fastDataBlockLength } else { // Use the multi-stage index. - i1 := c >> UCPTRIE_SHIFT_1 - if t.Type == UCPTRIE_TYPE_FAST { - i1 += UCPTRIE_BMP_INDEX_LENGTH - UCPTRIE_OMITTED_BMP_INDEX_1_LENGTH + i1 := c >> ucpShift1 + if t.typ == typeFast { + i1 += bmpIndexLength - ucpOmittedBmpIndex1Length } else { - i1 += UCPTRIE_SMALL_INDEX_LENGTH + i1 += smallIndexLength } - shft := c >> UCPTRIE_SHIFT_2 - idx := int32(t.Index[i1]) + (shft & UCPTRIE_INDEX_2_MASK) - i3Block = int32(t.Index[idx]) - if i3Block == prevI3Block && (c-start) >= UCPTRIE_CP_PER_INDEX_2_ENTRY { + shft := c >> ucpShift2 + idx := int32(t.index[i1]) + (shft & ucpIndex2Mask) + i3Block = int32(t.index[idx]) + if i3Block == prevI3Block && (c-start) >= ucpCpPerIndex2Entry { // The index-3 block is the same as the previous one, and filled with value. - c += UCPTRIE_CP_PER_INDEX_2_ENTRY + c += ucpCpPerIndex2Entry continue } prevI3Block = i3Block - if i3Block == int32(t.Index3NullOffset) { + if i3Block == int32(t.index3NullOffset) { // This is the index-3 null block. 
if haveValue { if nullValue != value { return c - 1, value } } else { - trieValue = t.NullValue + trieValue = t.nullValue value = nullValue haveValue = true } - prevBlock = t.DataNullOffset - c = (c + UCPTRIE_CP_PER_INDEX_2_ENTRY) & ^(UCPTRIE_CP_PER_INDEX_2_ENTRY - 1) + prevBlock = t.dataNullOffset + c = (c + ucpCpPerIndex2Entry) & ^(ucpCpPerIndex2Entry - 1) continue } - i3 = (c >> UCPTRIE_SHIFT_3) & UCPTRIE_INDEX_3_MASK - i3BlockLength = UCPTRIE_INDEX_3_BLOCK_LENGTH - dataBlockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH + i3 = (c >> ucpShift3) & ucpIndex3Mask + i3BlockLength = ucpIndex3BlockLength + dataBlockLength = ucpSmallDataBlockLength } // Enumerate data blocks for one index-3 block. @@ -643,14 +637,14 @@ func (t *UcpTrie) getRange(start rune, filter UCPMapValueFilter) (rune, uint32) } else { dataMask := dataBlockLength - 1 prevBlock = block - if block == t.DataNullOffset { + if block == t.dataNullOffset { // This is the data null block. if haveValue { if nullValue != value { return c - 1, value } } else { - trieValue = t.NullValue + trieValue = t.nullValue value = nullValue haveValue = true } @@ -660,14 +654,14 @@ func (t *UcpTrie) getRange(start rune, filter UCPMapValueFilter) (rune, uint32) trieValue2 := t.getValue(di) if haveValue { if trieValue2 != trieValue { - if filter == nil || maybeFilterValue(trieValue2, t.NullValue, nullValue, filter) != value { + if filter == nil || maybeFilterValue(trieValue2, t.nullValue, nullValue, filter) != value { return c - 1, value } trieValue = trieValue2 // may or may not help } } else { trieValue = trieValue2 - value = maybeFilterValue(trieValue2, t.NullValue, nullValue, filter) + value = maybeFilterValue(trieValue2, t.nullValue, nullValue, filter) haveValue = true } for { @@ -678,7 +672,7 @@ func (t *UcpTrie) getRange(start rune, filter UCPMapValueFilter) (rune, uint32) di++ trieValue2 = t.getValue(di) if trieValue2 != trieValue { - if filter == nil || maybeFilterValue(trieValue2, t.NullValue, nullValue, filter) != 
value { + if filter == nil || maybeFilterValue(trieValue2, t.nullValue, nullValue, filter) != value { return c - 1, value } trieValue = trieValue2 // may or may not help @@ -691,21 +685,20 @@ func (t *UcpTrie) getRange(start rune, filter UCPMapValueFilter) (rune, uint32) break } } - if c >= t.HighStart { + if c >= t.highStart { break } } - di := t.DataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET + di := t.dataLength - highValueNegDataOffset highValue := t.getValue(di) - if maybeFilterValue(highValue, t.NullValue, nullValue, filter) != value { + if maybeFilterValue(highValue, t.nullValue, nullValue, filter) != value { return c - 1, value - } else { - return MAX_UNICODE, value } + return maxUnicode, value } -func maybeFilterValue(value uint32, trieNullValue uint32, nullValue uint32, filter UCPMapValueFilter) uint32 { +func maybeFilterValue(value uint32, trieNullValue uint32, nullValue uint32, filter UcpMapValueFilter) uint32 { if value == trieNullValue { value = nullValue } else if filter != nil { diff --git a/go/mysql/icuregex/internal/utrie/utrie2.go b/go/mysql/icuregex/internal/utrie/utrie2.go index 6fd2ccd7120..a2c80cf1c50 100644 --- a/go/mysql/icuregex/internal/utrie/utrie2.go +++ b/go/mysql/icuregex/internal/utrie/utrie2.go @@ -22,6 +22,7 @@ limitations under the License. 
package utrie import ( + "errors" "fmt" "vitess.io/vitess/go/mysql/icuregex/internal/udata" @@ -29,13 +30,13 @@ import ( ) type UTrie2 struct { - Index []uint16 - Data16 []uint16 - Data32 []uint32 + index []uint16 + data16 []uint16 + data32 []uint32 - IndexLength, DataLength int - Index2NullOffset uint16 - DataNullOffset uint16 + indexLength, dataLength int + index2NullOffset uint16 + dataNullOffset uint16 InitialValue uint32 ErrorValue uint32 @@ -44,33 +45,33 @@ type UTrie2 struct { } func (t *UTrie2) SerializedLength() int32 { - return 16 + int32(t.IndexLength+t.DataLength)*2 + return 16 + int32(t.indexLength+t.dataLength)*2 } func (t *UTrie2) getIndex(asciiOffset int, c rune) uint16 { - return t.Index[t.indexFromCp(asciiOffset, c)] + return t.index[t.indexFromCp(asciiOffset, c)] } func (t *UTrie2) Get16(c rune) uint16 { - return t.getIndex(t.IndexLength, c) + return t.getIndex(t.indexLength, c) } func (t *UTrie2) indexFromCp(asciiOffset int, c rune) int { switch { case c < 0xd800: - return indexRaw(0, t.Index, c) + return indexRaw(0, t.index, c) case c <= 0xffff: var offset int32 if c <= 0xdbff { - offset = UTRIE2_LSCP_INDEX_2_OFFSET - (0xd800 >> UTRIE2_SHIFT_2) + offset = lscpIndex2Offset - (0xd800 >> shift2) } - return indexRaw(offset, t.Index, c) + return indexRaw(offset, t.index, c) case c > 0x10ffff: - return asciiOffset + UTRIE2_BAD_UTF8_DATA_OFFSET + return asciiOffset + badUtf8DataOffset case c >= t.HighStart: return t.HighValueIndex default: - return indexFromSupp(t.Index, c) + return indexFromSupp(t.index, c) } } @@ -102,10 +103,10 @@ func (t *UTrie2) enumEitherTrie(start, limit rune, enumValue EnumValue, enumRang /* frozen trie */ var ( - idx = t.Index - data32 = t.Data32 - index2NullOffset = int(t.Index2NullOffset) - nullBlock = int(t.DataNullOffset) + idx = t.index + data32 = t.data32 + index2NullOffset = int(t.index2NullOffset) + nullBlock = int(t.dataNullOffset) c rune prev = start @@ -125,38 +126,38 @@ func (t *UTrie2) enumEitherTrie(start, limit 
rune, enumValue EnumValue, enumRang /* enumerate index-2 blocks */ for c = start; c < limit && c < highStart; { /* Code point limit for iterating inside this i2Block. */ - tempLimit := c + UTRIE2_CP_PER_INDEX_1_ENTRY + tempLimit := c + cpPerIndex1Entry if limit < tempLimit { tempLimit = limit } if c <= 0xffff { if !utf16.IsSurrogate(c) { - i2Block = int(c >> UTRIE2_SHIFT_2) + i2Block = int(c >> shift2) } else if utf16.IsSurrogateLead(c) { /* * Enumerate values for lead surrogate code points, not code units: * This special block has half the normal length. */ - i2Block = UTRIE2_LSCP_INDEX_2_OFFSET + i2Block = lscpIndex2Offset tempLimit = min(0xdc00, limit) } else { /* * Switch back to the normal part of the index-2 table. * Enumerate the second half of the surrogates block. */ - i2Block = 0xd800 >> UTRIE2_SHIFT_2 + i2Block = 0xd800 >> shift2 tempLimit = min(0xe000, limit) } } else { /* supplementary code points */ - i2Block = int(idx[(UTRIE2_INDEX_1_OFFSET-UTRIE2_OMITTED_BMP_INDEX_1_LENGTH)+(c>>UTRIE2_SHIFT_1)]) - if i2Block == prevI2Block && (c-prev) >= UTRIE2_CP_PER_INDEX_1_ENTRY { + i2Block = int(idx[(index1Offset-omittedBmpIndex1Length)+(c>>shift1)]) + if i2Block == prevI2Block && (c-prev) >= cpPerIndex1Entry { /* * The index-2 block is the same as the previous one, and filled with prevValue. * Only possible for supplementary code points because the linear-BMP index-2 * table creates unique i2Block values. 
*/ - c += UTRIE2_CP_PER_INDEX_1_ENTRY + c += cpPerIndex1Entry continue } } @@ -171,20 +172,20 @@ func (t *UTrie2) enumEitherTrie(start, limit rune, enumValue EnumValue, enumRang prev = c prevValue = initialValue } - c += UTRIE2_CP_PER_INDEX_1_ENTRY + c += cpPerIndex1Entry } else { /* enumerate data blocks for one index-2 block */ var i2Limit int - if (c >> UTRIE2_SHIFT_1) == (tempLimit >> UTRIE2_SHIFT_1) { - i2Limit = int(tempLimit>>UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK + if (c >> shift1) == (tempLimit >> shift1) { + i2Limit = int(tempLimit>>shift2) & index2Mask } else { - i2Limit = UTRIE2_INDEX_2_BLOCK_LENGTH + i2Limit = index2BlockLength } - for i2 := int(c>>UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK; i2 < i2Limit; i2++ { - block = int(idx[i2Block+i2] << UTRIE2_INDEX_SHIFT) - if block == prevBlock && (c-prev) >= UTRIE2_DATA_BLOCK_LENGTH { + for i2 := int(c>>shift2) & index2Mask; i2 < i2Limit; i2++ { + block = int(idx[i2Block+i2] << indexShift) + if block == prevBlock && (c-prev) >= dataBlockLength { /* the block is the same as the previous one, and filled with prevValue */ - c += UTRIE2_DATA_BLOCK_LENGTH + c += dataBlockLength continue } prevBlock = block @@ -197,9 +198,9 @@ func (t *UTrie2) enumEitherTrie(start, limit rune, enumValue EnumValue, enumRang prev = c prevValue = initialValue } - c += UTRIE2_DATA_BLOCK_LENGTH + c += dataBlockLength } else { - for j := 0; j < UTRIE2_DATA_BLOCK_LENGTH; j++ { + for j := 0; j < dataBlockLength; j++ { var value uint32 if data32 != nil { value = data32[block+j] @@ -247,47 +248,47 @@ func (t *UTrie2) enumEitherTrie(start, limit rune, enumValue EnumValue, enumRang } func indexFromSupp(index []uint16, c rune) int { - i1 := int(index[(UTRIE2_INDEX_1_OFFSET-UTRIE2_OMITTED_BMP_INDEX_1_LENGTH)+(c>>UTRIE2_SHIFT_1)]) - return (int(index[i1+int((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK)]) << UTRIE2_INDEX_SHIFT) + int(c&UTRIE2_DATA_MASK) + i1 := int(index[(index1Offset-omittedBmpIndex1Length)+(c>>shift1)]) + return 
(int(index[i1+int((c>>shift2)&index2Mask)]) << indexShift) + int(c&dataMask) } func indexRaw(offset int32, index []uint16, c rune) int { - return int(index[offset+(c>>UTRIE2_SHIFT_2)]<<UTRIE2_INDEX_SHIFT) + int(c&UTRIE2_DATA_MASK) + return int(index[offset+(c>>shift2)]<<indexShift) + int(c&dataMask) } const ( /** Shift size for getting the index-1 table offset. */ - UTRIE2_SHIFT_1 = 6 + 5 + shift1 = 6 + 5 /** Shift size for getting the index-2 table offset. */ - UTRIE2_SHIFT_2 = 5 + shift2 = 5 /** * Difference between the two shift sizes, * for getting an index-1 offset from an index-2 offset. 6=11-5 */ - UTRIE2_SHIFT_1_2 = UTRIE2_SHIFT_1 - UTRIE2_SHIFT_2 + shift1min2 = shift1 - shift2 /** * Number of index-1 entries for the BMP. 32=0x20 * This part of the index-1 table is omitted from the serialized form. */ - UTRIE2_OMITTED_BMP_INDEX_1_LENGTH = 0x10000 >> UTRIE2_SHIFT_1 + omittedBmpIndex1Length = 0x10000 >> shift1 /** Number of code points per index-1 table entry. 2048=0x800 */ - UTRIE2_CP_PER_INDEX_1_ENTRY = 1 << UTRIE2_SHIFT_1 + cpPerIndex1Entry = 1 << shift1 /** Number of entries in an index-2 block. 64=0x40 */ - UTRIE2_INDEX_2_BLOCK_LENGTH = 1 << UTRIE2_SHIFT_1_2 + index2BlockLength = 1 << shift1min2 /** Mask for getting the lower bits for the in-index-2-block offset. */ - UTRIE2_INDEX_2_MASK = UTRIE2_INDEX_2_BLOCK_LENGTH - 1 + index2Mask = index2BlockLength - 1 /** Number of entries in a data block. 32=0x20 */ - UTRIE2_DATA_BLOCK_LENGTH = 1 << UTRIE2_SHIFT_2 + dataBlockLength = 1 << shift2 /** Mask for getting the lower bits for the in-data-block offset. */ - UTRIE2_DATA_MASK = UTRIE2_DATA_BLOCK_LENGTH - 1 + dataMask = dataBlockLength - 1 /** * Shift size for shifting left the index array values. @@ -295,37 +296,31 @@ const ( * of compactability. * This requires data blocks to be aligned by UTRIE2_DATA_GRANULARITY. */ - UTRIE2_INDEX_SHIFT = 2 + indexShift = 2 /** The alignment size of a data block. Also the granularity for compaction. */ - UTRIE2_DATA_GRANULARITY = 1 << UTRIE2_INDEX_SHIFT + dataGranularity = 1 << indexShift /* Fixed layout of the first part of the index array. ------------------- */ - /** - * The BMP part of the index-2 table is fixed and linear and starts at offset 0. - * Length=2048=0x800=0x10000>>UTRIE2_SHIFT_2 - */ - UTRIE2_INDEX_2_OFFSET = 0 - /** * The part of the index-2 table for U+D800..U+DBFF stores values for * lead surrogate code _units_ not code _points_. * Values for lead surrogate code _points_ are indexed with this portion of the table. * Length=32=0x20=0x400>>UTRIE2_SHIFT_2. (There are 1024=0x400 lead surrogates.)
*/ - UTRIE2_LSCP_INDEX_2_OFFSET = 0x10000 >> UTRIE2_SHIFT_2 - UTRIE2_LSCP_INDEX_2_LENGTH = 0x400 >> UTRIE2_SHIFT_2 + lscpIndex2Offset = 0x10000 >> shift2 + lscpIndex2Length = 0x400 >> shift2 /** Count the lengths of both BMP pieces. 2080=0x820 */ - UTRIE2_INDEX_2_BMP_LENGTH = UTRIE2_LSCP_INDEX_2_OFFSET + UTRIE2_LSCP_INDEX_2_LENGTH + index2BmpLength = lscpIndex2Offset + lscpIndex2Length /** * The 2-byte UTF-8 version of the index-2 table follows at offset 2080=0x820. * Length 32=0x20 for lead bytes C0..DF, regardless of UTRIE2_SHIFT_2. */ - UTRIE2_UTF8_2B_INDEX_2_OFFSET = UTRIE2_INDEX_2_BMP_LENGTH - UTRIE2_UTF8_2B_INDEX_2_LENGTH = 0x800 >> 6 /* U+0800 is the first code point after 2-byte UTF-8 */ + utf82BIndex2Offset = index2BmpLength + utf82BIndex2Length = 0x800 >> 6 /* U+0800 is the first code point after 2-byte UTF-8 */ /** * The index-1 table, only used for supplementary code points, at offset 2112=0x840. @@ -339,8 +334,8 @@ const ( * Both the index-1 table and the following part of the index-2 table * are omitted completely if there is only BMP data. */ - UTRIE2_INDEX_1_OFFSET = UTRIE2_UTF8_2B_INDEX_2_OFFSET + UTRIE2_UTF8_2B_INDEX_2_LENGTH - UTRIE2_MAX_INDEX_1_LENGTH = 0x100000 >> UTRIE2_SHIFT_1 + index1Offset = utf82BIndex2Offset + utf82BIndex2Length + maxIndex1Length = 0x100000 >> shift1 /* * Fixed layout of the first part of the data array. ----------------------- @@ -352,14 +347,11 @@ const ( * Used with linear access for single bytes 0..0xbf for simple error handling. * Length 64=0x40, not UTRIE2_DATA_BLOCK_LENGTH. */ - UTRIE2_BAD_UTF8_DATA_OFFSET = 0x80 - - /** The start of non-linear-ASCII data blocks, at offset 192=0xc0. 
*/ - UTRIE2_DATA_START_OFFSET = 0xc0 + badUtf8DataOffset = 0x80 ) func UTrie2FromBytes(bytes *udata.Bytes) (*UTrie2, error) { - type UTrie2Header struct { + type utrie2Header struct { /** "Tri2" in big-endian US-ASCII (0x54726932) */ signature uint32 @@ -386,13 +378,13 @@ func UTrie2FromBytes(bytes *udata.Bytes) (*UTrie2, error) { shiftedHighStart uint16 } - var header UTrie2Header + var header utrie2Header header.signature = bytes.Uint32() switch header.signature { case 0x54726932: case 0x32697254: - return nil, fmt.Errorf("unsupported: BigEndian encoding") + return nil, errors.New("unsupported: BigEndian encoding") default: return nil, fmt.Errorf("invalid signature for Trie2: 0x%08x", header.signature) } @@ -411,37 +403,37 @@ func UTrie2FromBytes(bytes *udata.Bytes) (*UTrie2, error) { case 1: width = 32 default: - return nil, fmt.Errorf("invalid width for serialized UTrie2") + return nil, errors.New("invalid width for serialized UTrie2") } trie := &UTrie2{ - IndexLength: int(header.indexLength), - DataLength: int(header.shiftedDataLength) << UTRIE2_INDEX_SHIFT, - Index2NullOffset: header.index2NullOffset, - DataNullOffset: header.dataNullOffset, - HighStart: rune(header.shiftedHighStart) << UTRIE2_SHIFT_1, + indexLength: int(header.indexLength), + dataLength: int(header.shiftedDataLength) << indexShift, + index2NullOffset: header.index2NullOffset, + dataNullOffset: header.dataNullOffset, + HighStart: rune(header.shiftedHighStart) << shift1, } - trie.HighValueIndex = trie.DataLength - UTRIE2_DATA_GRANULARITY + trie.HighValueIndex = trie.dataLength - dataGranularity if width == 16 { - trie.HighValueIndex += trie.IndexLength + trie.HighValueIndex += trie.indexLength } - indexArraySize := trie.IndexLength + indexArraySize := trie.indexLength if width == 16 { - indexArraySize += trie.DataLength + indexArraySize += trie.dataLength } - trie.Index = bytes.Uint16Slice(int32(indexArraySize)) + trie.index = bytes.Uint16Slice(int32(indexArraySize)) if width == 16 { - 
trie.Data16 = trie.Index[trie.IndexLength:] - trie.InitialValue = uint32(trie.Index[trie.DataNullOffset]) - trie.ErrorValue = uint32(trie.Index[trie.IndexLength+UTRIE2_BAD_UTF8_DATA_OFFSET]) + trie.data16 = trie.index[trie.indexLength:] + trie.InitialValue = uint32(trie.index[trie.dataNullOffset]) + trie.ErrorValue = uint32(trie.index[trie.indexLength+badUtf8DataOffset]) } else { - trie.Data32 = bytes.Uint32Slice(int32(trie.DataLength)) - trie.InitialValue = trie.Data32[trie.DataNullOffset] - trie.ErrorValue = trie.Data32[UTRIE2_BAD_UTF8_DATA_OFFSET] + trie.data32 = bytes.Uint32Slice(int32(trie.dataLength)) + trie.InitialValue = trie.data32[trie.dataNullOffset] + trie.ErrorValue = trie.data32[badUtf8DataOffset] } return trie, nil diff --git a/go/mysql/icuregex/matcher.go b/go/mysql/icuregex/matcher.go index c7b89233a43..fa9e540c296 100644 --- a/go/mysql/icuregex/matcher.go +++ b/go/mysql/icuregex/matcher.go @@ -31,9 +31,9 @@ import ( "vitess.io/vitess/go/mysql/icuregex/internal/uprops" ) -const TIMER_INITIAL_VALUE = 10000 -const DEFAULT_TIMEOUT = 3 -const DEFAULT_STACK_LIMIT = 0 +const timerInitialValue = 10000 +const defaultTimeout = 3 +const defaultStackLimit = 0 type Matcher struct { pattern *Pattern @@ -70,8 +70,8 @@ type Matcher struct { requireEnd bool // True if the last match required end-of-input // (matched $ or Z) - stack Stack - frame StackFrame // After finding a match, the last active stack frame, + stack stack + frame stackFrame // After finding a match, the last active stack frame, // which will contain the capture group results. // NOT valid while match engine is running. 
@@ -91,11 +91,11 @@ func NewMatcher(pat *Pattern) *Matcher { m := &Matcher{ pattern: pat, data: make([]int, pat.dataSize), - stack: Stack{ + stack: stack{ frameSize: pat.frameSize, - stackLimit: DEFAULT_STACK_LIMIT, + stackLimit: defaultStackLimit, }, - timeLimit: DEFAULT_TIMEOUT, + timeLimit: defaultTimeout, } m.reset() return m @@ -144,33 +144,33 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { *fp.patIdx()++ - switch op.Type() { - case URX_NOP: + switch op.typ() { + case urxNop: // Nothing to do. - case URX_BACKTRACK: + case urxBacktrack: // Force a backtrack. In some circumstances, the pattern compiler // will notice that the pattern can't possibly match anything, and will // emit one of these at that point. fp = m.stack.popFrame() - case URX_ONECHAR: + case urxOnechar: if *fp.inputIdx() < m.activeLimit { c := charAt(inputText, *fp.inputIdx()) *fp.inputIdx()++ - if c == rune(op.Value()) { + if c == rune(op.value()) { break } } else { m.hitEnd = true } fp = m.stack.popFrame() - case URX_STRING: + case urxString: // Test input against a literal string. // Strings require two slots in the compiled pattern, one for the // offset to the string text, and one for the length. - stringStartIdx := op.Value() + stringStartIdx := op.value() nextOp := pat[*fp.patIdx()] // Fetch the second operand *fp.patIdx()++ - stringLen := nextOp.Value() + stringLen := nextOp.value() patternString := litText[stringStartIdx:] var patternStringIndex int @@ -192,12 +192,12 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { if !success { fp = m.stack.popFrame() } - case URX_STATE_SAVE: - fp, err = m.StateSave(*fp.inputIdx(), op.Value()) + case urxStateSave: + fp, err = m.stateSave(*fp.inputIdx(), op.value()) if err != nil { return err } - case URX_END: + case urxEnd: // The match loop will exit via this path on a successful match, // when we reach the end of the pattern. 
if toEnd && *fp.inputIdx() != m.activeLimit { @@ -213,13 +213,13 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // opValue+1 - The end of a completed capture group // opValue+2 - the start of a capture group whose end // has not yet been reached (and might not ever be). - case URX_START_CAPTURE: - *fp.extra(op.Value() + 2) = *fp.inputIdx() - case URX_END_CAPTURE: - *fp.extra(op.Value()) = *fp.extra(op.Value() + 2) // Tentative start becomes real. - *fp.extra(op.Value() + 1) = *fp.inputIdx() // End position + case urxStartCapture: + *fp.extra(op.value() + 2) = *fp.inputIdx() + case urxEndCapture: + *fp.extra(op.value()) = *fp.extra(op.value() + 2) // Tentative start becomes real. + *fp.extra(op.value() + 1) = *fp.inputIdx() // End position - case URX_DOLLAR: // $, test for End of line + case urxDollar: // $, test for End of line if *fp.inputIdx() < m.anchorLimit-2 { fp = m.stack.popFrame() break @@ -249,26 +249,25 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { } fp = m.stack.popFrame() - case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. + case urxDollarD: // $, test for End of Line, in UNIX_LINES mode. if *fp.inputIdx() >= m.anchorLimit { // Off the end of input. Success. m.hitEnd = true m.requireEnd = true break - } else { - c := charAt(inputText, *fp.inputIdx()) - *fp.inputIdx()++ - // Either at the last character of input, or off the end. - if c == 0x0a && *fp.inputIdx() == m.anchorLimit { - m.hitEnd = true - m.requireEnd = true - break - } + } + c := charAt(inputText, *fp.inputIdx()) + *fp.inputIdx()++ + // Either at the last character of input, or off the end. + if c == 0x0a && *fp.inputIdx() == m.anchorLimit { + m.hitEnd = true + m.requireEnd = true + break } // Not at end of input. Back-track out. 
fp = m.stack.popFrame() - case URX_DOLLAR_M: // $, test for End of line in multi-line mode + case urxDollarM: // $, test for End of line in multi-line mode if *fp.inputIdx() >= m.anchorLimit { // We really are at the end of input. Success. m.hitEnd = true @@ -288,7 +287,7 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { } // not at a new line. Fail. fp = m.stack.popFrame() - case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode + case urxDollarMd: // $, test for End of line in multi-line and UNIX_LINES mode if *fp.inputIdx() >= m.anchorLimit { // We really are at the end of input. Success. m.hitEnd = true @@ -300,11 +299,11 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { if charAt(inputText, *fp.inputIdx()) != 0x0a { fp = m.stack.popFrame() } - case URX_CARET: // ^, test for start of line + case urxCaret: // ^, test for start of line if *fp.inputIdx() != m.anchorStart { fp = m.stack.popFrame() } - case URX_CARET_M: // ^, test for start of line in mulit-line mode + case urxCaretM: // ^, test for start of line in mulit-line mode if *fp.inputIdx() == m.anchorStart { // We are at the start input. Success. break @@ -319,7 +318,7 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { } // Not at the start of a line. Fail. fp = m.stack.popFrame() - case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode + case urxCaretMUnix: // ^, test for start of line in mulit-line + Unix-line mode if *fp.inputIdx() <= m.anchorStart { // We are at the start input. Success. break @@ -330,19 +329,19 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // Not at the start of a line. Back-track out. 
fp = m.stack.popFrame() } - case URX_BACKSLASH_B: // Test for word boundaries + case urxBackslashB: // Test for word boundaries success := m.isWordBoundary(*fp.inputIdx()) - success = success != (op.Value() != 0) // flip sense for \B + success = success != (op.value() != 0) // flip sense for \B if !success { fp = m.stack.popFrame() } - case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style + case urxBackslashBu: // Test for word boundaries, Unicode-style success := m.isUWordBoundary(*fp.inputIdx()) - success = success != (op.Value() != 0) // flip sense for \B + success = success != (op.value() != 0) // flip sense for \B if !success { fp = m.stack.popFrame() } - case URX_BACKSLASH_D: // Test for decimal digit + case urxBackslashD: // Test for decimal digit if *fp.inputIdx() >= m.activeLimit { m.hitEnd = true fp = m.stack.popFrame() @@ -352,19 +351,19 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { c := charAt(inputText, *fp.inputIdx()) success := m.isDecimalDigit(c) - success = success != (op.Value() != 0) // flip sense for \D + success = success != (op.value() != 0) // flip sense for \D if success { *fp.inputIdx()++ } else { fp = m.stack.popFrame() } - case URX_BACKSLASH_G: // Test for position at end of previous match + case urxBackslashG: // Test for position at end of previous match if !((m.match && *fp.inputIdx() == m.matchEnd) || (!m.match && *fp.inputIdx() == m.activeStart)) { fp = m.stack.popFrame() } - case URX_BACKSLASH_H: // Test for \h, horizontal white space. + case urxBackslashH: // Test for \h, horizontal white space. 
if *fp.inputIdx() >= m.activeLimit { m.hitEnd = true fp = m.stack.popFrame() @@ -373,14 +372,14 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { c := charAt(inputText, *fp.inputIdx()) success := m.isHorizWS(c) || c == 9 - success = success != (op.Value() != 0) // flip sense for \H + success = success != (op.value() != 0) // flip sense for \H if success { *fp.inputIdx()++ } else { fp = m.stack.popFrame() } - case URX_BACKSLASH_R: // Test for \R, any line break sequence. + case urxBackslashR: // Test for \R, any line break sequence. if *fp.inputIdx() >= m.activeLimit { m.hitEnd = true fp = m.stack.popFrame() @@ -396,7 +395,7 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { fp = m.stack.popFrame() } - case URX_BACKSLASH_V: // \v, any single line ending character. + case urxBackslashV: // \v, any single line ending character. if *fp.inputIdx() >= m.activeLimit { m.hitEnd = true fp = m.stack.popFrame() @@ -404,14 +403,14 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { } c := charAt(inputText, *fp.inputIdx()) success := isLineTerminator(c) - success = success != (op.Value() != 0) // flip sense for \V + success = success != (op.value() != 0) // flip sense for \V if success { *fp.inputIdx()++ } else { fp = m.stack.popFrame() } - case URX_BACKSLASH_X: + case urxBackslashX: // Match a Grapheme, as defined by Unicode UAX 29. // Fail if at end of input @@ -427,14 +426,14 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { *fp.inputIdx() = m.activeLimit } - case URX_BACKSLASH_Z: // Test for end of Input + case urxBackslashZ: // Test for end of Input if *fp.inputIdx() < m.anchorLimit { fp = m.stack.popFrame() } else { m.hitEnd = true m.requireEnd = true } - case URX_STATIC_SETREF: + case urxStaticSetref: // Test input character against one of the predefined sets // (Word Characters, for example) // The high bit of the op value is a flag for the match polarity. 
@@ -446,8 +445,8 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { break } - success := (op.Value() & URX_NEG_SET) == URX_NEG_SET - negOp := op.Value() & ^URX_NEG_SET + success := (op.value() & urxNegSet) == urxNegSet + negOp := op.value() & ^urxNegSet c := charAt(inputText, *fp.inputIdx()) s := staticPropertySets[negOp] @@ -461,7 +460,7 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // the character wasn't in the set. fp = m.stack.popFrame() } - case URX_STAT_SETREF_N: + case urxStatSetrefN: // Test input character for NOT being a member of one of // the predefined sets (Word Characters, for example) if *fp.inputIdx() >= m.activeLimit { @@ -471,7 +470,7 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { } c := charAt(inputText, *fp.inputIdx()) - s := staticPropertySets[op.Value()] + s := staticPropertySets[op.value()] if !s.ContainsRune(c) { *fp.inputIdx()++ break @@ -479,7 +478,7 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // the character wasn't in the set. fp = m.stack.popFrame() - case URX_SETREF: + case urxSetref: if *fp.inputIdx() >= m.activeLimit { m.hitEnd = true fp = m.stack.popFrame() @@ -489,7 +488,7 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // There is input left. Pick up one char and test it for set membership. c := charAt(inputText, *fp.inputIdx()) - s := sets[op.Value()] + s := sets[op.value()] if s.ContainsRune(c) { *fp.inputIdx()++ break @@ -498,7 +497,7 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // the character wasn't in the set. fp = m.stack.popFrame() - case URX_DOTANY: + case urxDotany: // . matches anything, but stops at end-of-line. if *fp.inputIdx() >= m.activeLimit { m.hitEnd = true @@ -514,7 +513,7 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { } *fp.inputIdx()++ - case URX_DOTANY_ALL: + case urxDotanyAll: // ., in dot-matches-all (including new lines) mode if *fp.inputIdx() >= m.activeLimit { // At end of input. 
Match failed. Backtrack out. @@ -533,7 +532,7 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { } } - case URX_DOTANY_UNIX: + case urxDotanyUnix: // '.' operator, matches all, but stops at end-of-line. // UNIX_LINES mode, so 0x0a is the only recognized line ending. if *fp.inputIdx() >= m.activeLimit { @@ -551,71 +550,71 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { } else { *fp.inputIdx()++ } - case URX_JMP: - *fp.patIdx() = op.Value() + case urxJmp: + *fp.patIdx() = op.value() - case URX_FAIL: + case urxFail: isMatch = false goto breakFromLoop - case URX_JMP_SAV: - fp, err = m.StateSave(*fp.inputIdx(), *fp.patIdx()) // State save to loc following current + case urxJmpSav: + fp, err = m.stateSave(*fp.inputIdx(), *fp.patIdx()) // State save to loc following current if err != nil { return err } - *fp.patIdx() = op.Value() // Then JMP. + *fp.patIdx() = op.value() // Then JMP. - case URX_JMP_SAV_X: + case urxJmpSavX: // This opcode is used with (x)+, when x can match a zero length string. // Same as JMP_SAV, except conditional on the match having made forward progress. // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the // data address of the input position at the start of the loop. - stoOp := pat[op.Value()-1] - frameLoc := stoOp.Value() + stoOp := pat[op.value()-1] + frameLoc := stoOp.value() prevInputIdx := *fp.extra(frameLoc) if prevInputIdx < *fp.inputIdx() { // The match did make progress. Repeat the loop. - fp, err = m.StateSave(*fp.inputIdx(), *fp.patIdx()) // State save to loc following current + fp, err = m.stateSave(*fp.inputIdx(), *fp.patIdx()) // State save to loc following current if err != nil { return err } - *fp.patIdx() = op.Value() // Then JMP. + *fp.patIdx() = op.value() // Then JMP. *fp.extra(frameLoc) = *fp.inputIdx() } // If the input position did not advance, we do nothing here, // execution will fall out of the loop. 
- case URX_CTR_INIT: - *fp.extra(op.Value()) = 0 // Set the loop counter variable to zero + case urxCtrInit: + *fp.extra(op.value()) = 0 // Set the loop counter variable to zero // Pick up the three extra operands that CTR_INIT has, and // skip the pattern location counter past instOperandLoc := *fp.patIdx() *fp.patIdx() += 3 // Skip over the three operands that CTR_INIT has. - loopLoc := pat[instOperandLoc].Value() + loopLoc := pat[instOperandLoc].value() minCount := int(pat[instOperandLoc+1]) maxCount := int(pat[instOperandLoc+2]) if minCount == 0 { - fp, err = m.StateSave(*fp.inputIdx(), loopLoc+1) + fp, err = m.stateSave(*fp.inputIdx(), loopLoc+1) if err != nil { return err } } if maxCount == -1 { - *fp.extra(op.Value() + 1) = *fp.inputIdx() // For loop breaking. + *fp.extra(op.value() + 1) = *fp.inputIdx() // For loop breaking. } else if maxCount == 0 { fp = m.stack.popFrame() } - case URX_CTR_LOOP: - initOp := pat[op.Value()] - opValue := initOp.Value() + case utxCtrLoop: + initOp := pat[op.value()] + opValue := initOp.value() pCounter := fp.extra(opValue) - minCount := int(pat[op.Value()+2]) - maxCount := int(pat[op.Value()+3]) + minCount := int(pat[op.value()+2]) + maxCount := int(pat[op.value()+3]) *pCounter++ if *pCounter >= maxCount && maxCount != -1 { break @@ -628,11 +627,10 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { pLastIntputIdx := fp.extra(opValue + 1) if *pLastIntputIdx == *fp.inputIdx() { break - } else { - *pLastIntputIdx = *fp.inputIdx() } + *pLastIntputIdx = *fp.inputIdx() } - fp, err = m.StateSave(*fp.inputIdx(), *fp.patIdx()) + fp, err = m.stateSave(*fp.inputIdx(), *fp.patIdx()) if err != nil { return err } @@ -646,26 +644,26 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { } } - *fp.patIdx() = op.Value() + 4 // Loop back. + *fp.patIdx() = op.value() + 4 // Loop back. 
- case URX_CTR_INIT_NG: - *fp.extra(op.Value()) = 0 // Set the loop counter variable to zero + case urxCtrInitNg: + *fp.extra(op.value()) = 0 // Set the loop counter variable to zero // Pick up the three extra operands that CTR_INIT_NG has, and // skip the pattern location counter past instrOperandLoc := *fp.patIdx() *fp.patIdx() += 3 - loopLoc := pat[instrOperandLoc].Value() - minCount := pat[instrOperandLoc+1].Value() - maxCount := pat[instrOperandLoc+2].Value() + loopLoc := pat[instrOperandLoc].value() + minCount := pat[instrOperandLoc+1].value() + maxCount := pat[instrOperandLoc+2].value() if maxCount == -1 { - *fp.extra(op.Value() + 1) = *fp.inputIdx() // Save initial input index for loop breaking. + *fp.extra(op.value() + 1) = *fp.inputIdx() // Save initial input index for loop breaking. } if minCount == 0 { if maxCount != 0 { - fp, err = m.StateSave(*fp.inputIdx(), *fp.patIdx()) + fp, err = m.stateSave(*fp.inputIdx(), *fp.patIdx()) if err != nil { return err } @@ -673,11 +671,11 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { *fp.patIdx() = loopLoc + 1 } - case URX_CTR_LOOP_NG: - initOp := pat[op.Value()] - pCounter := fp.extra(initOp.Value()) - minCount := int(pat[op.Value()+2]) - maxCount := int(pat[op.Value()+3]) + case urxCtrLoopNg: + initOp := pat[op.value()] + pCounter := fp.extra(initOp.value()) + minCount := int(pat[op.value()+2]) + maxCount := int(pat[op.value()+3]) *pCounter++ if *pCounter >= maxCount && maxCount != -1 { // The loop has matched the maximum permitted number of times. @@ -689,7 +687,7 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { if *pCounter < minCount { // We haven't met the minimum number of matches yet. // Loop back for another one. - *fp.patIdx() = op.Value() + 4 // Loop back. + *fp.patIdx() = op.value() + 4 // Loop back. // Increment time-out counter. 
(StateSave() does it if count >= minCount) m.tickCounter-- if m.tickCounter <= 0 { @@ -703,7 +701,7 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // If there is no upper bound on the loop iterations, check that the input index // is progressing, and stop the loop if it is not. if maxCount == -1 { - lastInputIdx := fp.extra(initOp.Value() + 1) + lastInputIdx := fp.extra(initOp.value() + 1) if *fp.inputIdx() == *lastInputIdx { break } @@ -715,16 +713,16 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // (non-greedy, don't execute loop body first), but first do // a state save to the top of the loop, so that a match failure // in the following pattern will try another iteration of the loop. - fp, err = m.StateSave(*fp.inputIdx(), op.Value()+4) + fp, err = m.stateSave(*fp.inputIdx(), op.value()+4) if err != nil { return err } - case URX_STO_SP: - m.data[op.Value()] = m.stack.len() + case urxStoSp: + m.data[op.value()] = m.stack.len() - case URX_LD_SP: - newStackSize := m.data[op.Value()] + case urxLdSp: + newStackSize := m.data[op.value()] newFp := m.stack.offset(newStackSize) if newFp.equals(fp) { break @@ -733,9 +731,9 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { fp = newFp m.stack.setSize(newStackSize) - case URX_BACKREF: - groupStartIdx := *fp.extra(op.Value()) - groupEndIdx := *fp.extra(op.Value() + 1) + case urxBackref: + groupStartIdx := *fp.extra(op.value()) + groupEndIdx := *fp.extra(op.value() + 1) if groupStartIdx < 0 { // This capture group has not participated in the match thus far, @@ -769,9 +767,9 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { if !success { fp = m.stack.popFrame() } - case URX_BACKREF_I: - groupStartIdx := *fp.extra(op.Value()) - groupEndIdx := *fp.extra(op.Value() + 1) + case urxBackrefI: + groupStartIdx := *fp.extra(op.value()) + groupEndIdx := *fp.extra(op.value() + 1) if groupStartIdx < 0 { // This capture group has not participated in the match thus far, @@ -814,33 
+812,33 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { fp = m.stack.popFrame() } - case URX_STO_INP_LOC: - *fp.extra(op.Value()) = *fp.inputIdx() + case urxStoInpLoc: + *fp.extra(op.value()) = *fp.inputIdx() - case URX_JMPX: + case urxJmpx: instrOperandLoc := *fp.patIdx() *fp.patIdx()++ - dataLoc := pat[instrOperandLoc].Value() + dataLoc := pat[instrOperandLoc].value() saveInputIdx := *fp.extra(dataLoc) if saveInputIdx < *fp.inputIdx() { - *fp.patIdx() = op.Value() // JMP + *fp.patIdx() = op.value() // JMP } else { fp = m.stack.popFrame() // FAIL, no progress in loop. } - case URX_LA_START: - m.data[op.Value()] = m.stack.len() - m.data[op.Value()+1] = *fp.inputIdx() - m.data[op.Value()+2] = m.activeStart - m.data[op.Value()+3] = m.activeLimit + case urxLaStart: + m.data[op.value()] = m.stack.len() + m.data[op.value()+1] = *fp.inputIdx() + m.data[op.value()+2] = m.activeStart + m.data[op.value()+3] = m.activeLimit m.activeStart = m.lookStart // Set the match region change for m.activeLimit = m.lookLimit // transparent bounds. - case URX_LA_END: + case urxLaEnd: stackSize := m.stack.len() - newStackSize := m.data[op.Value()] + newStackSize := m.data[op.value()] if stackSize > newStackSize { // Copy the current top frame back to the new (cut back) top frame. // This makes the capture groups from within the look-ahead @@ -851,18 +849,18 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { m.stack.setSize(newStackSize) } - *fp.inputIdx() = m.data[op.Value()+1] + *fp.inputIdx() = m.data[op.value()+1] - m.activeStart = m.data[op.Value()+2] - m.activeLimit = m.data[op.Value()+3] + m.activeStart = m.data[op.value()+2] + m.activeLimit = m.data[op.value()+3] - case URX_ONECHAR_I: + case urcOnecharI: // Case insensitive one char. The char from the pattern is already case folded. // Input text is not, but case folding the input can not reduce two or more code // points to one. 
if *fp.inputIdx() < m.activeLimit { c := charAt(inputText, *fp.inputIdx()) - if ucase.Fold(c) == op.Value32() { + if ucase.Fold(c) == op.value32() { *fp.inputIdx()++ break } @@ -872,16 +870,16 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { fp = m.stack.popFrame() - case URX_STRING_I: + case urxStringI: // Case-insensitive test input against a literal string. // Strings require two slots in the compiled pattern, one for the // offset to the string text, and one for the length. // The compiled string has already been case folded. - patternString := litText[op.Value():] + patternString := litText[op.value():] var patternStringIdx int nextOp := pat[*fp.patIdx()] *fp.patIdx()++ - patternStringLen := nextOp.Value() + patternStringLen := nextOp.value() success := true @@ -909,21 +907,21 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { fp = m.stack.popFrame() } - case URX_LB_START: + case urxLbStart: // Entering a look-behind block. // Save Stack Ptr, Input Pos and active input region. // TODO: implement transparent bounds. Ticket #6067 - m.data[op.Value()] = m.stack.len() - m.data[op.Value()+1] = *fp.inputIdx() + m.data[op.value()] = m.stack.len() + m.data[op.value()+1] = *fp.inputIdx() // Save input string length, then reset to pin any matches to end at // the current position. - m.data[op.Value()+2] = m.activeStart - m.data[op.Value()+3] = m.activeLimit + m.data[op.value()+2] = m.activeStart + m.data[op.value()+3] = m.activeLimit m.activeStart = m.regionStart m.activeLimit = *fp.inputIdx() // Init the variable containing the start index for attempted matches. - m.data[op.Value()+4] = -1 - case URX_LB_CONT: + m.data[op.value()+4] = -1 + case urxLbCont: // Positive Look-Behind, at top of loop checking for matches of LB expression // at all possible input starting positions. 
@@ -934,7 +932,7 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { maxML := pat[*fp.patIdx()] *fp.patIdx()++ - lbStartIdx := &m.data[op.Value()+4] + lbStartIdx := &m.data[op.value()+4] if *lbStartIdx < 0 { // First time through loop. *lbStartIdx = *fp.inputIdx() - int(minML) @@ -952,20 +950,20 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // getting a match. Backtrack out, and out of the // Look Behind altogether. fp = m.stack.popFrame() - m.activeStart = m.data[op.Value()+2] - m.activeLimit = m.data[op.Value()+3] + m.activeStart = m.data[op.value()+2] + m.activeLimit = m.data[op.value()+3] break } // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. // (successful match will fall off the end of the loop.) - fp, err = m.StateSave(*fp.inputIdx(), *fp.patIdx()-3) + fp, err = m.stateSave(*fp.inputIdx(), *fp.patIdx()-3) if err != nil { return err } *fp.inputIdx() = *lbStartIdx - case URX_LB_END: + case urxLbEnd: // End of a look-behind block, after a successful match. if *fp.inputIdx() != m.activeLimit { // The look-behind expression matched, but the match did not @@ -980,9 +978,9 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // Look-behind match is good. Restore the orignal input string region, // which had been truncated to pin the end of the lookbehind match to the // position being looked-behind. - m.activeStart = m.data[op.Value()+2] - m.activeLimit = m.data[op.Value()+3] - case URX_LBN_CONT: + m.activeStart = m.data[op.value()+2] + m.activeLimit = m.data[op.value()+3] + case urxLbnCount: // Negative Look-Behind, at top of loop checking for matches of LB expression // at all possible input starting positions. 
@@ -992,10 +990,10 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { maxML := pat[*fp.patIdx()] *fp.patIdx()++ - continueLoc := pat[*fp.patIdx()].Value() + continueLoc := pat[*fp.patIdx()].value() *fp.patIdx()++ - lbStartIdx := &m.data[op.Value()+4] + lbStartIdx := &m.data[op.value()+4] if *lbStartIdx < 0 { // First time through loop. @@ -1014,20 +1012,20 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // We have tried all potential match starting points without // getting a match, which means that the negative lookbehind as // a whole has succeeded. Jump forward to the continue location - m.activeStart = m.data[op.Value()+2] - m.activeLimit = m.data[op.Value()+3] + m.activeStart = m.data[op.value()+2] + m.activeLimit = m.data[op.value()+3] *fp.patIdx() = continueLoc break } // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. // (successful match will cause a FAIL out of the loop altogether.) - fp, err = m.StateSave(*fp.inputIdx(), *fp.patIdx()-4) + fp, err = m.stateSave(*fp.inputIdx(), *fp.patIdx()-4) if err != nil { return err } *fp.inputIdx() = *lbStartIdx - case URX_LBN_END: + case urxLbnEnd: // End of a negative look-behind block, after a successful match. if *fp.inputIdx() != m.activeLimit { @@ -1046,23 +1044,23 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // Restore the orignal input string length, which had been truncated // inorder to pin the end of the lookbehind match // to the position being looked-behind. - m.activeStart = m.data[op.Value()+2] - m.activeLimit = m.data[op.Value()+3] + m.activeStart = m.data[op.value()+2] + m.activeLimit = m.data[op.value()+3] // Restore original stack position, discarding any state saved // by the successful pattern match. - newStackSize := m.data[op.Value()] + newStackSize := m.data[op.value()] m.stack.setSize(newStackSize) // FAIL, which will take control back to someplace // prior to entering the look-behind test. 
fp = m.stack.popFrame() - case URX_LOOP_SR_I: + case urxLoopSrI: // Loop Initialization for the optimized implementation of // [some character set]* // This op scans through all matching input. // The following LOOP_C op emulates stack unwinding if the following pattern fails. - s := sets[op.Value()] + s := sets[op.value()] // Loop through input, until either the input is exhausted or // we reach a character that is not a member of the set. @@ -1091,19 +1089,19 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // must follow. It's operand is the stack location // that holds the starting input index for the match of this [set]* loopcOp := pat[*fp.patIdx()] - stackLoc := loopcOp.Value() + stackLoc := loopcOp.value() *fp.extra(stackLoc) = *fp.inputIdx() *fp.inputIdx() = ix // Save State to the URX_LOOP_C op that follows this one, // so that match failures in the following code will return to there. // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. - fp, err = m.StateSave(*fp.inputIdx(), *fp.patIdx()) + fp, err = m.stateSave(*fp.inputIdx(), *fp.patIdx()) if err != nil { return err } *fp.patIdx()++ - case URX_LOOP_DOT_I: + case urxLoopDotI: // Loop Initialization for the optimized implementation of .* // This op scans through all remaining input. // The following LOOP_C op emulates stack unwinding if the following pattern fails. @@ -1111,7 +1109,7 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // Loop through input until the input is exhausted (we reach an end-of-line) // In DOTALL mode, we can just go straight to the end of the input. var ix int - if (op.Value() & 1) == 1 { + if (op.value() & 1) == 1 { // Dot-matches-All mode. Jump straight to the end of the string. 
ix = m.activeLimit m.hitEnd = true @@ -1127,7 +1125,7 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { c := charAt(inputText, ix) if (c & 0x7f) <= 0x29 { // Fast filter of non-new-line-s if (c == 0x0a) || // 0x0a is newline in both modes. - (((op.Value() & 2) == 0) && // IF not UNIX_LINES mode + (((op.value() & 2) == 0) && // IF not UNIX_LINES mode isLineTerminator(c)) { // char is a line ending. Exit the scanning loop. break @@ -1148,21 +1146,21 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { // must follow. It's operand is the stack location // that holds the starting input index for the match of this .* loopcOp := pat[*fp.patIdx()] - stackLoc := loopcOp.Value() + stackLoc := loopcOp.value() *fp.extra(stackLoc) = *fp.inputIdx() *fp.inputIdx() = ix // Save State to the URX_LOOP_C op that follows this one, // so that match failures in the following code will return to there. // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. - fp, err = m.StateSave(*fp.inputIdx(), *fp.patIdx()) + fp, err = m.stateSave(*fp.inputIdx(), *fp.patIdx()) if err != nil { return err } *fp.patIdx()++ - case URX_LOOP_C: - backSearchIndex := *fp.extra(op.Value()) + case urxLoopC: + backSearchIndex := *fp.extra(op.value()) if backSearchIndex == *fp.inputIdx() { // We've backed up the input idx to the point that the loop started. @@ -1184,13 +1182,13 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { *fp.inputIdx() > backSearchIndex && twoPrevC == 0x0d { prevOp := pat[*fp.patIdx()-2] - if prevOp.Type() == URX_LOOP_DOT_I { + if prevOp.typ() == urxLoopDotI { // .*, stepping back over CRLF pair. 
*fp.inputIdx()-- } } - fp, err = m.StateSave(*fp.inputIdx(), *fp.patIdx()-1) + fp, err = m.stateSave(*fp.inputIdx(), *fp.patIdx()-1) if err != nil { return err } @@ -1237,10 +1235,10 @@ func (m *Matcher) isWordBoundary(pos int) bool { m.hitEnd = true } else { c := charAt(m.input, pos) - if uprops.HasBinaryProperty(c, uprops.UCHAR_GRAPHEME_EXTEND) || uchar.CharType(c) == uchar.U_FORMAT_CHAR { + if uprops.HasBinaryProperty(c, uprops.UCharGraphemeExtend) || uchar.CharType(c) == uchar.FormatChar { return false } - cIsWord = staticPropertySets[URX_ISWORD_SET].ContainsRune(c) + cIsWord = staticPropertySets[urxIswordSet].ContainsRune(c) } prevCIsWord := false @@ -1250,8 +1248,8 @@ func (m *Matcher) isWordBoundary(pos int) bool { } prevChar := charAt(m.input, pos-1) pos-- - if !(uprops.HasBinaryProperty(prevChar, uprops.UCHAR_GRAPHEME_EXTEND) || uchar.CharType(prevChar) == uchar.U_FORMAT_CHAR) { - prevCIsWord = staticPropertySets[URX_ISWORD_SET].ContainsRune(prevChar) + if !(uprops.HasBinaryProperty(prevChar, uprops.UCharGraphemeExtend) || uchar.CharType(prevChar) == uchar.FormatChar) { + prevCIsWord = staticPropertySets[urxIswordSet].ContainsRune(prevChar) break } } @@ -1292,14 +1290,14 @@ func (m *Matcher) isUWordBoundary(pos int) bool { return false } -func (m *Matcher) resetStack() StackFrame { +func (m *Matcher) resetStack() stackFrame { m.stack.reset() frame, _ := m.stack.newFrame(0, nil, "") frame.clearExtra() return frame } -func (m *Matcher) StateSave(inputIdx, savePatIdx int) (StackFrame, error) { +func (m *Matcher) stateSave(inputIdx, savePatIdx int) (stackFrame, error) { // push storage for a new frame. 
newFP, err := m.stack.newFrame(inputIdx, m.input, m.pattern.pattern) if err != nil { @@ -1321,11 +1319,11 @@ func (m *Matcher) StateSave(inputIdx, savePatIdx int) (StackFrame, error) { } func (m *Matcher) incrementTime(inputIdx int) error { - m.tickCounter = TIMER_INITIAL_VALUE + m.tickCounter = timerInitialValue m.time++ if m.timeLimit > 0 && m.time >= m.timeLimit { return &MatchError{ - Code: uerror.U_REGEX_TIME_OUT, + Code: uerror.TimeOut, Pattern: m.pattern.pattern, Position: inputIdx, Input: m.input, @@ -1339,7 +1337,7 @@ func (m *Matcher) isDecimalDigit(c rune) bool { } func (m *Matcher) isHorizWS(c rune) bool { - return uchar.CharType(c) == uchar.U_SPACE_SEPARATOR || c == 9 + return uchar.CharType(c) == uchar.SpaceSeparator || c == 9 } func (m *Matcher) followingGCBoundary(pos int) int { @@ -1420,7 +1418,7 @@ func (m *Matcher) Find() (bool, error) { } switch m.pattern.startType { - case START_NO_INFO: + case startNoInfo: // No optimization was found. // Try a match at each input position. for { @@ -1437,7 +1435,7 @@ func (m *Matcher) Find() (bool, error) { } startPos++ } - case START_SET: + case startSet: // Match may start on any char from a pre-computed set. 
for { pos := startPos @@ -1462,7 +1460,7 @@ func (m *Matcher) Find() (bool, error) { return false, nil } } - case START_START: + case startStart: // Matches are only possible at the start of the input string // (pattern begins with ^ or \A) if startPos > m.activeStart { @@ -1471,7 +1469,7 @@ func (m *Matcher) Find() (bool, error) { } err := m.MatchAt(startPos, false) return m.match, err - case START_LINE: + case startLine: var ch rune if startPos == m.anchorStart { err := m.MatchAt(startPos, false) @@ -1487,7 +1485,7 @@ func (m *Matcher) Find() (bool, error) { ch = charAt(m.input, startPos-1) } - if m.pattern.flags&UREGEX_UNIX_LINES != 0 { + if m.pattern.flags&UnixLines != 0 { for { if ch == 0x0a { err := m.MatchAt(startPos, false) @@ -1529,7 +1527,7 @@ func (m *Matcher) Find() (bool, error) { startPos++ } } - case START_CHAR, START_STRING: + case startChar, startString: // Match starts on exactly one char. theChar := m.pattern.initialChar for { @@ -1585,7 +1583,7 @@ func (m *Matcher) resetPreserveRegion() { m.hitEnd = false m.requireEnd = false m.time = 0 - m.tickCounter = TIMER_INITIAL_VALUE + m.tickCounter = timerInitialValue } func (m *Matcher) GroupCount() int { diff --git a/go/mysql/icuregex/ops.go b/go/mysql/icuregex/ops.go index 394059d886c..e1418ff6325 100644 --- a/go/mysql/icuregex/ops.go +++ b/go/mysql/icuregex/ops.go @@ -29,41 +29,41 @@ import ( "vitess.io/vitess/go/mysql/icuregex/internal/utf16" ) -type Opcode uint8 +type opcode uint8 const ( - URX_RESERVED_OP Opcode = iota // For multi-operand ops, most non-first words. - URX_BACKTRACK // Force a backtrack, as if a match test had failed. - URX_END - URX_ONECHAR // Value field is the 21 bit unicode char to match - URX_STRING // Value field is index of string start - URX_STRING_LEN // Value field is string length (code units) - URX_STATE_SAVE // Value field is pattern position to push - URX_NOP - URX_START_CAPTURE // Value field is capture group number. 
- URX_END_CAPTURE // Value field is capture group number - URX_STATIC_SETREF // Value field is index of set in array of sets. - URX_SETREF // Value field is index of set in array of sets. - URX_DOTANY - URX_JMP // Value field is destination position in the pattern. - URX_FAIL // Stop match operation, No match. - - URX_JMP_SAV // Operand: JMP destination location - URX_BACKSLASH_B // Value field: 0: \b 1: \B - URX_BACKSLASH_G - URX_JMP_SAV_X // Conditional JMP_SAV, + urxReservedOp opcode = iota // For multi-operand ops, most non-first words. + urxBacktrack // Force a backtrack, as if a match test had failed. + urxEnd + urxOnechar // Value field is the 21 bit unicode char to match + urxString // Value field is index of string start + urxStringLen // Value field is string length (code units) + urxStateSave // Value field is pattern position to push + urxNop + urxStartCapture // Value field is capture group number. + urxEndCapture // Value field is capture group number + urxStaticSetref // Value field is index of set in array of sets. + urxSetref // Value field is index of set in array of sets. + urxDotany + urxJmp // Value field is destination position in the pattern. + urxFail // Stop match operation, No match. + + urxJmpSav // Operand: JMP destination location + urxBackslashB // Value field: 0: \b 1: \B + urxBackslashG + urxJmpSavX // Conditional JMP_SAV, // Used in (x)+, breaks loop on zero length match. // Operand: Jmp destination. - URX_BACKSLASH_X - URX_BACKSLASH_Z // \z Unconditional end of line. + urxBackslashX + urxBackslashZ // \z Unconditional end of line. - URX_DOTANY_ALL // ., in the . matches any mode. - URX_BACKSLASH_D // Value field: 0: \d 1: \D - URX_CARET // Value field: 1: multi-line mode. - URX_DOLLAR // Also for \Z + urxDotanyAll // ., in the . matches any mode. + urxBackslashD // Value field: 0: \d 1: \D + urxCaret // Value field: 1: multi-line mode. + urxDollar // Also for \Z - URX_CTR_INIT // Counter Inits for {Interval} loops. 
- URX_CTR_INIT_NG // 2 kinds, normal and non-greedy. + urxCtrInit // Counter Inits for {Interval} loops. + urxCtrInitNg // 2 kinds, normal and non-greedy. // These are 4 word opcodes. See description. // First Operand: Data loc of counter variable // 2nd Operand: Pat loc of the URX_CTR_LOOPx @@ -71,76 +71,76 @@ const ( // 3rd Operand: Minimum count. // 4th Operand: Max count, -1 for unbounded. - URX_DOTANY_UNIX // '.' operator in UNIX_LINES mode, only \n marks end of line. + urxDotanyUnix // '.' operator in UNIX_LINES mode, only \n marks end of line. - URX_CTR_LOOP // Loop Ops for {interval} loops. - URX_CTR_LOOP_NG // Also in three flavors. + utxCtrLoop // Loop Ops for {interval} loops. + urxCtrLoopNg // Also in three flavors. // Operand is loc of corresponding CTR_INIT. - URX_CARET_M_UNIX // '^' operator, test for start of line in multi-line + urxCaretMUnix // '^' operator, test for start of line in multi-line // plus UNIX_LINES mode. - URX_RELOC_OPRND // Operand value in multi-operand ops that refers + urxRelocOprnd // Operand value in multi-operand ops that refers // back into compiled pattern code, and thus must // be relocated when inserting/deleting ops in code. - URX_STO_SP // Store the stack ptr. Operand is location within + urxStoSp // Store the stack ptr. Operand is location within // matcher data (not stack data) to store it. - URX_LD_SP // Load the stack pointer. Operand is location + urxLdSp // Load the stack pointer. Operand is location // to load from. - URX_BACKREF // Back Reference. Parameter is the index of the + urxBackref // Back Reference. Parameter is the index of the // capture group variables in the state stack frame. - URX_STO_INP_LOC // Store the input location. Operand is location + urxStoInpLoc // Store the input location. Operand is location // within the matcher stack frame. - URX_JMPX // Conditional JMP. + urxJmpx // Conditional JMP. // First Operand: JMP target location. 
// Second Operand: Data location containing an // input position. If current input position == // saved input position, FAIL rather than taking // the JMP - URX_LA_START // Starting a LookAround expression. + urxLaStart // Starting a LookAround expression. // Save InputPos, SP and active region in static data. // Operand: Static data offset for the save - URX_LA_END // Ending a Lookaround expression. + urxLaEnd // Ending a Lookaround expression. // Restore InputPos and Stack to saved values. // Operand: Static data offset for saved data. - URX_ONECHAR_I // Test for case-insensitive match of a literal character. + urcOnecharI // Test for case-insensitive match of a literal character. // Operand: the literal char. - URX_STRING_I // Case insensitive string compare. + urxStringI // Case insensitive string compare. // First Operand: Index of start of string in string literals // Second Operand (next word in compiled code): // the length of the string. - URX_BACKREF_I // Case insensitive back reference. + urxBackrefI // Case insensitive back reference. // Parameter is the index of the // capture group variables in the state stack frame. - URX_DOLLAR_M // $ in multi-line mode. - URX_CARET_M // ^ in multi-line mode. - URX_LB_START // LookBehind Start. + urxDollarM // $ in multi-line mode. + urxCaretM // ^ in multi-line mode. + urxLbStart // LookBehind Start. // Parameter is data location - URX_LB_CONT // LookBehind Continue. + urxLbCont // LookBehind Continue. // Param 0: the data location // Param 1: The minimum length of the look-behind match // Param 2: The max length of the look-behind match - URX_LB_END // LookBehind End. + urxLbEnd // LookBehind End. // Parameter is the data location. // Check that match ended at the right spot, // Restore original input string len. 
- URX_LBN_CONT // Negative LookBehind Continue + urxLbnCount // Negative LookBehind Continue // Param 0: the data location // Param 1: The minimum length of the look-behind match // Param 2: The max length of the look-behind match // Param 3: The pattern loc following the look-behind block. - URX_LBN_END // Negative LookBehind end + urxLbnEnd // Negative LookBehind end // Parameter is the data location. // Check that the match ended at the right spot. - URX_STAT_SETREF_N // Reference to a prebuilt set (e.g. \w), negated + urxStatSetrefN // Reference to a prebuilt set (e.g. \w), negated // Operand is index of set in array of sets. - URX_LOOP_SR_I // Init a [set]* loop. + urxLoopSrI // Init a [set]* loop. // Operand is the sets index in array of user sets. - URX_LOOP_C // Continue a [set]* or OneChar* loop. + urxLoopC // Continue a [set]* or OneChar* loop. // Operand is a matcher static data location. // Must always immediately follow LOOP_x_I instruction. - URX_LOOP_DOT_I // .*, initialization of the optimized loop. + urxLoopDotI // .*, initialization of the optimized loop. // Operand value: // bit 0: // 0: Normal (. doesn't match new-line) mode. @@ -148,21 +148,21 @@ const ( // bit 1: controls what new-lines are recognized by this operation. // 0: All Unicode New-lines // 1: UNIX_LINES, \u000a only. - URX_BACKSLASH_BU // \b or \B in UREGEX_UWORD mode, using Unicode style + urxBackslashBu // \b or \B in UREGEX_UWORD mode, using Unicode style // word boundaries. - URX_DOLLAR_D // $ end of input test, in UNIX_LINES mode. - URX_DOLLAR_MD // $ end of input test, in MULTI_LINE and UNIX_LINES mode. - URX_BACKSLASH_H // Value field: 0: \h 1: \H - URX_BACKSLASH_R // Any line break sequence. - URX_BACKSLASH_V // Value field: 0: \v 1: \V + urxDollarD // $ end of input test, in UNIX_LINES mode. + urxDollarMd // $ end of input test, in MULTI_LINE and UNIX_LINES mode. + urxBackslashH // Value field: 0: \h 1: \H + urxBackslashR // Any line break sequence. 
+ urxBackslashV // Value field: 0: \v 1: \V - URX_RESERVED_OP_N Opcode = 255 // For multi-operand ops, negative operand values. + urxReservedOpN opcode = 255 // For multi-operand ops, negative operand values. ) // Keep this list of opcode names in sync with the above enum // // Used for debug printing only. -var UrxOpcodeNames = []string{ +var urxOpcodeNames = []string{ " ", "BACKTRACK", "END", @@ -224,156 +224,154 @@ var UrxOpcodeNames = []string{ "URX_BACKSLASH_V", } -type Instruction int32 +type instruction int32 -func (ins Instruction) Type() Opcode { - return Opcode(uint32(ins) >> 24) +func (ins instruction) typ() opcode { + return opcode(uint32(ins) >> 24) } -func (ins Instruction) Value32() int32 { +func (ins instruction) value32() int32 { return int32(ins) & 0xffffff } -func (ins Instruction) Value() int { - return int(ins.Value32()) +func (ins instruction) value() int { + return int(ins.value32()) } // Access to Unicode Sets composite character properties // // The sets are accessed by the match engine for things like \w (word boundary) const ( - URX_ISWORD_SET = 1 - URX_ISALNUM_SET = 2 - URX_ISALPHA_SET = 3 - URX_ISSPACE_SET = 4 - - URX_GC_NORMAL = iota + 1 // Sets for finding grapheme cluster boundaries. - URX_GC_EXTEND - URX_GC_CONTROL - URX_GC_L - URX_GC_LV - URX_GC_LVT - URX_GC_V - URX_GC_T - - URX_LAST_SET - - URX_NEG_SET = 0x800000 // Flag bit to reverse sense of set + urxIswordSet = 1 + urxIsalnumSet = 2 + urxIsalphaSet = 3 + urxIsspaceSet = 4 + + urxGcNormal = iota + 1 // Sets for finding grapheme cluster boundaries. + urxGcExtend + urxGcControl + urxGcL + urxGcLv + urxGcLvt + urxGcV + urxGcT + + urxNegSet = 0x800000 // Flag bit to reverse sense of set // membership test. 
) -type Stack struct { +type stack struct { ary []int frameSize int stackLimit int } -type StackFrame []int +type stackFrame []int -func (f StackFrame) inputIdx() *int { +func (f stackFrame) inputIdx() *int { return &f[0] } -func (f StackFrame) patIdx() *int { +func (f stackFrame) patIdx() *int { return &f[1] } -func (f StackFrame) extra(n int) *int { +func (f stackFrame) extra(n int) *int { return &f[2+n] } -func (f StackFrame) equals(f2 StackFrame) bool { +func (f stackFrame) equals(f2 stackFrame) bool { return &f[0] == &f2[0] } -func (stack *Stack) len() int { - return len(stack.ary) +func (s *stack) len() int { + return len(s.ary) } -func (stack *Stack) sp() int { - return len(stack.ary) - stack.frameSize +func (s *stack) sp() int { + return len(s.ary) - s.frameSize } -func (stack *Stack) newFrame(inputIdx int, input []rune, pattern string) (StackFrame, error) { - if stack.stackLimit != 0 && len(stack.ary)+stack.frameSize > stack.stackLimit { +func (s *stack) newFrame(inputIdx int, input []rune, pattern string) (stackFrame, error) { + if s.stackLimit != 0 && len(s.ary)+s.frameSize > s.stackLimit { return nil, &MatchError{ - Code: uerror.U_REGEX_STACK_OVERFLOW, + Code: uerror.StackOverflow, Pattern: pattern, Position: inputIdx, Input: input, } } - stack.ary = slices.Grow(stack.ary, stack.frameSize) + s.ary = slices.Grow(s.ary, s.frameSize) - f := stack.ary[len(stack.ary) : len(stack.ary)+stack.frameSize] - stack.ary = stack.ary[:len(stack.ary)+stack.frameSize] + f := s.ary[len(s.ary) : len(s.ary)+s.frameSize] + s.ary = s.ary[:len(s.ary)+s.frameSize] return f, nil } -func (stack *Stack) prevFromTop() StackFrame { - return stack.ary[len(stack.ary)-2*stack.frameSize:] +func (s *stack) prevFromTop() stackFrame { + return s.ary[len(s.ary)-2*s.frameSize:] } -func (stack *Stack) popFrame() StackFrame { - stack.ary = stack.ary[:len(stack.ary)-stack.frameSize] - return stack.ary[len(stack.ary)-stack.frameSize:] +func (s *stack) popFrame() stackFrame { + s.ary = 
s.ary[:len(s.ary)-s.frameSize] + return s.ary[len(s.ary)-s.frameSize:] } -func (stack *Stack) reset() { - stack.ary = stack.ary[:0] +func (s *stack) reset() { + s.ary = s.ary[:0] } -func (stack *Stack) offset(size int) StackFrame { - return stack.ary[size-stack.frameSize : size] +func (s *stack) offset(size int) stackFrame { + return s.ary[size-s.frameSize : size] } -func (stack *Stack) setSize(size int) { - stack.ary = stack.ary[:size] +func (s *stack) setSize(size int) { + s.ary = s.ary[:size] } -func (f StackFrame) clearExtra() { +func (f stackFrame) clearExtra() { for i := 2; i < len(f); i++ { f[i] = -1 } } // number of UVector elements in the header -const RESTACKFRAME_HDRCOUNT = 2 +const restackframeHdrCount = 2 // Start-Of-Match type. Used by find() to quickly scan to positions where a // // match might start before firing up the full match engine. -type StartOfMatch int8 +type startOfMatch int8 const ( - START_NO_INFO StartOfMatch = iota // No hint available. - START_CHAR // Match starts with a literal code point. - START_SET // Match starts with something matching a set. - START_START // Match starts at start of buffer only (^ or \A) - START_LINE // Match starts with ^ in multi-line mode. - START_STRING // Match starts with a literal string. + startNoInfo startOfMatch = iota // No hint available. + startChar // Match starts with a literal code point. + startSet // Match starts with something matching a set. + startStart // Match starts at start of buffer only (^ or \A) + startLine // Match starts with ^ in multi-line mode. + startString // Match starts with a literal string. 
) -func (som StartOfMatch) String() string { +func (som startOfMatch) String() string { switch som { - case START_NO_INFO: + case startNoInfo: return "START_NO_INFO" - case START_CHAR: + case startChar: return "START_CHAR" - case START_SET: + case startSet: return "START_SET" - case START_START: + case startStart: return "START_START" - case START_LINE: + case startLine: return "START_LINE" - case START_STRING: + case startString: return "START_STRING" default: panic("unknown StartOfMatch") } } -type CaseFoldIterator struct { +type caseFoldIterator struct { chars []rune index int limit int @@ -381,7 +379,7 @@ type CaseFoldIterator struct { foldChars []uint16 } -func (it *CaseFoldIterator) next() rune { +func (it *caseFoldIterator) next() rune { if len(it.foldChars) == 0 { // We are not in a string folding of an earlier character. // Start handling the next char from the input UText. @@ -404,12 +402,12 @@ func (it *CaseFoldIterator) next() rune { return res } -func (it *CaseFoldIterator) inExpansion() bool { +func (it *caseFoldIterator) inExpansion() bool { return len(it.foldChars) > 0 } -func newCaseFoldIterator(chars []rune, start, limit int) CaseFoldIterator { - return CaseFoldIterator{ +func newCaseFoldIterator(chars []rune, start, limit int) caseFoldIterator { + return caseFoldIterator{ chars: chars, index: start, limit: limit, diff --git a/go/mysql/icuregex/pattern.go b/go/mysql/icuregex/pattern.go index 6e67410bd2c..d0913afa13f 100644 --- a/go/mysql/icuregex/pattern.go +++ b/go/mysql/icuregex/pattern.go @@ -29,7 +29,7 @@ type Pattern struct { pattern string flags RegexpFlag - compiledPat []Instruction + compiledPat []instruction literalText []rune sets []*uset.UnicodeSet @@ -40,7 +40,7 @@ type Pattern struct { groupMap []int32 - startType StartOfMatch + startType startOfMatch initialStringIdx int initialStringLen int initialChars *uset.UnicodeSet @@ -69,7 +69,7 @@ func MustCompileString(in string, flags RegexpFlag) *Pattern { func CompileString(in string, 
flags RegexpFlag) (*Pattern, error) { pat := NewPattern(flags) - cmp := NewCompiler(pat) + cmp := newCompiler(pat) if err := cmp.compile(in); err != nil { return nil, err } @@ -82,22 +82,18 @@ func (p *Pattern) Match(input string) *Matcher { return m } -func (p *Pattern) Matcher() *Matcher { - return NewMatcher(p) -} - type RegexpFlag int32 const ( /** Enable case insensitive matching. @stable ICU 2.4 */ - UREGEX_CASE_INSENSITIVE RegexpFlag = 2 + CaseInsensitive RegexpFlag = 2 /** Allow white space and comments within patterns @stable ICU 2.4 */ - UREGEX_COMMENTS RegexpFlag = 4 + Comments RegexpFlag = 4 /** If set, '.' matches line terminators, otherwise '.' matching stops at line end. * @stable ICU 2.4 */ - UREGEX_DOTALL RegexpFlag = 32 + DotAll RegexpFlag = 32 /** If set, treat the entire pattern as a literal string. * Metacharacters or escape sequences in the input sequence will be given @@ -109,20 +105,20 @@ const ( * * @stable ICU 4.0 */ - UREGEX_LITERAL RegexpFlag = 16 + Literal RegexpFlag = 16 /** Control behavior of "$" and "^" * If set, recognize line terminators within string, * otherwise, match only at start and end of input string. * @stable ICU 2.4 */ - UREGEX_MULTILINE RegexpFlag = 8 + Multiline RegexpFlag = 8 /** Unix-only line endings. * When this mode is enabled, only \\u000a is recognized as a line ending * in the behavior of ., ^, and $. * @stable ICU 4.0 */ - UREGEX_UNIX_LINES RegexpFlag = 1 + UnixLines RegexpFlag = 1 /** Unicode word boundaries. * If set, \b uses the Unicode TR 29 definition of word boundaries. @@ -131,7 +127,7 @@ const ( * http://unicode.org/reports/tr29/#Word_Boundaries * @stable ICU 2.8 */ - UREGEX_UWORD RegexpFlag = 256 + UWord RegexpFlag = 256 /** Error on Unrecognized backslash escapes. * If set, fail with an error on patterns that contain @@ -140,5 +136,5 @@ const ( * escaped letters represent themselves. 
* @stable ICU 4.0 */ - UREGEX_ERROR_ON_UNKNOWN_ESCAPES RegexpFlag = 512 + ErrorOnUnknownEscapes RegexpFlag = 512 ) diff --git a/go/mysql/icuregex/perl_test.go b/go/mysql/icuregex/perl_test.go index 04d0266357f..f30ec443605 100644 --- a/go/mysql/icuregex/perl_test.go +++ b/go/mysql/icuregex/perl_test.go @@ -39,13 +39,13 @@ func TestPerl(t *testing.T) { defer f.Close() flagPat := MustCompileString(`('?)(.*)\1(.*)`, 0) - flagMat := flagPat.Matcher() + flagMat := NewMatcher(flagPat) groupsPat := MustCompileString(`\$([+\-])\[(\d+)\]`, 0) - groupsMat := groupsPat.Matcher() + groupsMat := NewMatcher(groupsPat) cgPat := MustCompileString(`\$(\d+)`, 0) - cgMat := cgPat.Matcher() + cgMat := NewMatcher(cgPat) group := func(m *Matcher, idx int) string { g, _ := m.Group(idx) @@ -85,18 +85,18 @@ func TestPerl(t *testing.T) { flagStr, _ := flagMat.Group(3) var flags RegexpFlag if strings.IndexByte(flagStr, 'i') >= 0 { - flags |= UREGEX_CASE_INSENSITIVE + flags |= CaseInsensitive } if strings.IndexByte(flagStr, 'm') >= 0 { - flags |= UREGEX_MULTILINE + flags |= Multiline } if strings.IndexByte(flagStr, 'x') >= 0 { - flags |= UREGEX_COMMENTS + flags |= Comments } testPat, err := CompileString(pattern, flags) if err != nil { - if cerr, ok := err.(*CompileError); ok && cerr.Code == uerror.U_REGEX_UNIMPLEMENTED { + if cerr, ok := err.(*CompileError); ok && cerr.Code == uerror.Unimplemented { continue } if strings.IndexByte(fields[2], 'c') == -1 && strings.IndexByte(fields[2], 'i') == -1 { diff --git a/go/mysql/icuregex/sets.go b/go/mysql/icuregex/sets.go index 3241b0c24d4..0f745b3374d 100644 --- a/go/mysql/icuregex/sets.go +++ b/go/mysql/icuregex/sets.go @@ -29,7 +29,7 @@ import ( var staticPropertySets [13]*uset.UnicodeSet func init() { - staticPropertySets[URX_ISWORD_SET] = func() *uset.UnicodeSet { + staticPropertySets[urxIswordSet] = func() *uset.UnicodeSet { s := uset.New() s.AddAll(uprops.MustNewUnicodeSetFomPattern(`\p{Alphabetic}`, 0)) 
s.AddAll(uprops.MustNewUnicodeSetFomPattern(`\p{M}`, 0)) @@ -40,10 +40,10 @@ func init() { return s.Freeze() }() - staticPropertySets[URX_ISSPACE_SET] = uprops.MustNewUnicodeSetFomPattern(`\p{Whitespace}`, 0).Freeze() + staticPropertySets[urxIsspaceSet] = uprops.MustNewUnicodeSetFomPattern(`\p{Whitespace}`, 0).Freeze() - staticPropertySets[URX_GC_EXTEND] = uprops.MustNewUnicodeSetFomPattern(`\p{Grapheme_Extend}`, 0).Freeze() - staticPropertySets[URX_GC_CONTROL] = func() *uset.UnicodeSet { + staticPropertySets[urxGcExtend] = uprops.MustNewUnicodeSetFomPattern(`\p{Grapheme_Extend}`, 0).Freeze() + staticPropertySets[urxGcControl] = func() *uset.UnicodeSet { s := uset.New() s.AddAll(uprops.MustNewUnicodeSetFomPattern(`[:Zl:]`, 0)) s.AddAll(uprops.MustNewUnicodeSetFomPattern(`[:Zp:]`, 0)) @@ -52,20 +52,20 @@ func init() { s.RemoveAll(uprops.MustNewUnicodeSetFomPattern(`[:Grapheme_Extend:]`, 0)) return s.Freeze() }() - staticPropertySets[URX_GC_L] = uprops.MustNewUnicodeSetFomPattern(`\p{Hangul_Syllable_Type=L}`, 0).Freeze() - staticPropertySets[URX_GC_LV] = uprops.MustNewUnicodeSetFomPattern(`\p{Hangul_Syllable_Type=LV}`, 0).Freeze() - staticPropertySets[URX_GC_LVT] = uprops.MustNewUnicodeSetFomPattern(`\p{Hangul_Syllable_Type=LVT}`, 0).Freeze() - staticPropertySets[URX_GC_V] = uprops.MustNewUnicodeSetFomPattern(`\p{Hangul_Syllable_Type=V}`, 0).Freeze() - staticPropertySets[URX_GC_T] = uprops.MustNewUnicodeSetFomPattern(`\p{Hangul_Syllable_Type=T}`, 0).Freeze() + staticPropertySets[urxGcL] = uprops.MustNewUnicodeSetFomPattern(`\p{Hangul_Syllable_Type=L}`, 0).Freeze() + staticPropertySets[urxGcLv] = uprops.MustNewUnicodeSetFomPattern(`\p{Hangul_Syllable_Type=LV}`, 0).Freeze() + staticPropertySets[urxGcLvt] = uprops.MustNewUnicodeSetFomPattern(`\p{Hangul_Syllable_Type=LVT}`, 0).Freeze() + staticPropertySets[urxGcV] = uprops.MustNewUnicodeSetFomPattern(`\p{Hangul_Syllable_Type=V}`, 0).Freeze() + staticPropertySets[urxGcT] = 
uprops.MustNewUnicodeSetFomPattern(`\p{Hangul_Syllable_Type=T}`, 0).Freeze() - staticPropertySets[URX_GC_NORMAL] = func() *uset.UnicodeSet { + staticPropertySets[urxGcNormal] = func() *uset.UnicodeSet { s := uset.New() s.Complement() s.RemoveRuneRange(0xac00, 0xd7a4) - s.RemoveAll(staticPropertySets[URX_GC_CONTROL]) - s.RemoveAll(staticPropertySets[URX_GC_L]) - s.RemoveAll(staticPropertySets[URX_GC_V]) - s.RemoveAll(staticPropertySets[URX_GC_T]) + s.RemoveAll(staticPropertySets[urxGcControl]) + s.RemoveAll(staticPropertySets[urxGcL]) + s.RemoveAll(staticPropertySets[urxGcV]) + s.RemoveAll(staticPropertySets[urxGcT]) return s.Freeze() }() } @@ -77,13 +77,13 @@ var staticSetUnescape = func() *uset.UnicodeSet { }() const ( - kRuleSetDigitChar = 128 - kRuleSetAsciiLetter = 129 - kRuleSetRuleChar = 130 - kRuleSetCount = 131 - 128 + ruleSetDigitChar = 128 + ruleSetASCIILetter = 129 + ruleSetRuleChar = 130 + ruleSetCount = 131 - 128 ) -var staticRuleSet = [kRuleSetCount]*uset.UnicodeSet{ +var staticRuleSet = [ruleSetCount]*uset.UnicodeSet{ func() *uset.UnicodeSet { u := uset.New() u.AddRuneRange('0', '9') diff --git a/go/vt/vtgate/evalengine/fn_regexp.go b/go/vt/vtgate/evalengine/fn_regexp.go index b554da87856..efd2d8cbffa 100644 --- a/go/vt/vtgate/evalengine/fn_regexp.go +++ b/go/vt/vtgate/evalengine/fn_regexp.go @@ -27,15 +27,15 @@ func evalRegexpFlags(env *ExpressionEnv, match Expr, flags icuregex.RegexpFlag) for _, b := range m.bytes { switch b { case 'c': - flags &= ^icuregex.UREGEX_CASE_INSENSITIVE + flags &= ^icuregex.CaseInsensitive case 'i': - flags |= icuregex.UREGEX_CASE_INSENSITIVE + flags |= icuregex.CaseInsensitive case 'm': - flags |= icuregex.UREGEX_MULTILINE + flags |= icuregex.Multiline case 'n': - flags |= icuregex.UREGEX_DOTALL + flags |= icuregex.DotAll case 'u': - flags |= icuregex.UREGEX_UNIX_LINES + flags |= icuregex.UnixLines } } } @@ -63,7 +63,7 @@ func (r *builtinRegexpLike) eval(env *ExpressionEnv) (eval, error) { var flags icuregex.RegexpFlag 
var collation = colid.Get() if strings.Contains(collation.Name(), "_ci") { - flags |= icuregex.UREGEX_CASE_INSENSITIVE + flags |= icuregex.CaseInsensitive } if len(r.Arguments) > 2 { From 0f0081b30cf8783ad647be1ebf14fb229193872f Mon Sep 17 00:00:00 2001 From: Dirkjan Bussink Date: Mon, 3 Jul 2023 16:12:22 +0200 Subject: [PATCH 10/18] regex: Implement additional regular expression functions This implements the additional MySQL regular expression functions in the evalengine. The evaluator is only implementing this for now, but the compiler is up next. Signed-off-by: Dirkjan Bussink --- go/mysql/collations/charset/convert.go | 42 ++ go/mysql/constants.go | 21 + go/mysql/icuregex/compiler.go | 101 ++-- go/mysql/icuregex/error.go | 76 ++- go/mysql/icuregex/errors/error.go | 6 + go/mysql/icuregex/internal/uerror/error.go | 62 --- .../icuregex/internal/uprops/properties.go | 34 +- go/mysql/icuregex/matcher.go | 11 +- go/mysql/icuregex/ops.go | 3 +- go/mysql/icuregex/perl_test.go | 4 +- go/mysql/sql_error.go | 20 + go/vt/vterrors/state.go | 22 + go/vt/vtgate/evalengine/compare.go | 2 +- go/vt/vtgate/evalengine/compiler_test.go | 4 + go/vt/vtgate/evalengine/expr_collate.go | 12 +- go/vt/vtgate/evalengine/expr_compare.go | 8 +- go/vt/vtgate/evalengine/fn_regexp.go | 453 +++++++++++++++++- go/vt/vtgate/evalengine/testcases/cases.go | 136 +++++- go/vt/vtgate/evalengine/translate_builtin.go | 135 ++++++ .../tabletmanager/vreplication/utils.go | 20 + 20 files changed, 981 insertions(+), 191 deletions(-) create mode 100644 go/mysql/icuregex/errors/error.go delete mode 100644 go/mysql/icuregex/internal/uerror/error.go diff --git a/go/mysql/collations/charset/convert.go b/go/mysql/collations/charset/convert.go index a54e8f4d718..6054ae33559 100644 --- a/go/mysql/collations/charset/convert.go +++ b/go/mysql/collations/charset/convert.go @@ -19,6 +19,8 @@ package charset import ( "fmt" "unicode/utf8" + + "vitess.io/vitess/go/hack" ) func failedConversionError(from, to Charset, input 
[]byte) error { @@ -158,6 +160,46 @@ func Expand(dst []rune, src []byte, srcCharset Charset) []rune { } } +func Collapse(dst []byte, src []rune, dstCharset Charset) []byte { + switch dstCharset := dstCharset.(type) { + case Charset_utf8mb3, Charset_utf8mb4: + if dst == nil { + return hack.StringBytes(string(src)) + } + return append(dst, hack.StringBytes(string(src))...) + case Charset_binary: + if dst == nil { + dst = make([]byte, 0, len(src)) + } + for _, b := range src { + dst = append(dst, byte(b)) + } + return dst + default: + nDst := 0 + if dst == nil { + dst = make([]byte, len(src)*dstCharset.MaxWidth()) + } else { + dst = dst[:cap(dst)] + } + for _, c := range src { + if len(dst)-nDst < 4 { + newDst := make([]byte, len(dst)*2) + copy(newDst, dst[:nDst]) + dst = newDst + } + w := dstCharset.EncodeRune(dst[nDst:], c) + if w < 0 { + if w = dstCharset.EncodeRune(dst[nDst:], '?'); w < 0 { + break + } + } + nDst += w + } + return dst[:nDst] + } +} + func ConvertFromUTF8(dst []byte, dstCharset Charset, src []byte) ([]byte, error) { return Convert(dst, dstCharset, src, Charset_utf8mb4{}) } diff --git a/go/mysql/constants.go b/go/mysql/constants.go index b2c9b4d49a5..f62f9373e0f 100644 --- a/go/mysql/constants.go +++ b/go/mysql/constants.go @@ -565,6 +565,27 @@ const ( ERJSONValueTooBig = ErrorCode(3150) ERJSONDocumentTooDeep = ErrorCode(3157) + ERRegexpStringNotTerminated = ErrorCode(3684) + ERRegexpBufferOverflow = ErrorCode(3684) + ERRegexpIllegalArgument = ErrorCode(3685) + ERRegexpIndexOutOfBounds = ErrorCode(3686) + ERRegexpInternal = ErrorCode(3687) + ERRegexpRuleSyntax = ErrorCode(3688) + ERRegexpBadEscapeSequence = ErrorCode(3689) + ERRegexpUnimplemented = ErrorCode(3690) + ERRegexpMismatchParen = ErrorCode(3691) + ERRegexpBadInterval = ErrorCode(3692) + ERRRegexpMaxLtMin = ErrorCode(3693) + ERRegexpInvalidBackRef = ErrorCode(3694) + ERRegexpLookBehindLimit = ErrorCode(3695) + ERRegexpMissingCloseBracket = ErrorCode(3696) + ERRegexpInvalidRange = 
ErrorCode(3697) + ERRegexpStackOverflow = ErrorCode(3698) + ERRegexpTimeOut = ErrorCode(3699) + ERRegexpPatternTooBig = ErrorCode(3700) + ERRegexpInvalidCaptureGroup = ErrorCode(3887) + ERRegexpInvalidFlag = ErrorCode(3900) + // max execution time exceeded ERQueryTimeout = ErrorCode(3024) diff --git a/go/mysql/icuregex/compiler.go b/go/mysql/icuregex/compiler.go index c1544e2bd7b..f2eac4ac9f8 100644 --- a/go/mysql/icuregex/compiler.go +++ b/go/mysql/icuregex/compiler.go @@ -31,7 +31,6 @@ import ( "vitess.io/vitess/go/mysql/icuregex/internal/pattern" "vitess.io/vitess/go/mysql/icuregex/internal/ucase" "vitess.io/vitess/go/mysql/icuregex/internal/uchar" - "vitess.io/vitess/go/mysql/icuregex/internal/uerror" "vitess.io/vitess/go/mysql/icuregex/internal/unames" "vitess.io/vitess/go/mysql/icuregex/internal/uprops" "vitess.io/vitess/go/mysql/icuregex/internal/uset" @@ -226,7 +225,7 @@ func (c *compiler) nextChar(ch *reChar) { ch.char, c.p = pattern.UnescapeAt(beforeEscape) if ch.char < 0 { - c.error(uerror.BadEscapeSequence) + c.error(BadEscapeSequence) } c.charNum += len(beforeEscape) - len(c.p) } else if c.peekCharLL() == chDigit0 { @@ -244,7 +243,7 @@ func (c *compiler) nextChar(ch *reChar) { if ch2 < chDigit0 || ch2 > chDigit7 { if index == 0 { // \0 is not followed by any octal digits. - c.error(uerror.BadEscapeSequence) + c.error(BadEscapeSequence) } break } @@ -371,7 +370,7 @@ func (c *compiler) compile(pat string) error { if table[0].pushState != 0 { c.stackPtr++ if c.stackPtr >= stackSize { - c.error(uerror.InternalError) + c.error(InternalError) c.stackPtr-- } c.stack[c.stackPtr] = uint16(table[0].pushState) @@ -388,7 +387,7 @@ func (c *compiler) compile(pat string) error { c.stackPtr-- if c.stackPtr < 0 { c.stackPtr++ - c.error(uerror.MismatchedParen) + c.error(MismatchedParen) } } } @@ -437,7 +436,7 @@ func (c *compiler) doParseActions(action patternParseAction) bool { c.handleCloseParen() if len(c.parenStack) > 0 { // Missing close paren in pattern. 
- c.error(uerror.MismatchedParen) + c.error(MismatchedParen) } // add the END operation to the compiled pattern. @@ -491,7 +490,7 @@ func (c *compiler) doParseActions(action patternParseAction) bool { c.captureName.WriteRune(c.c.char) case doBadNamedCapture: - c.error(uerror.InvalidCaptureGroupName) + c.error(InvalidCaptureGroupName) case doOpenCaptureParen: // Open Capturing Paren, possibly named. @@ -539,7 +538,7 @@ func (c *compiler) doParseActions(action patternParseAction) bool { c.captureName = nil if _, ok := c.out.namedCaptureMap[captureName]; ok { - c.error(uerror.InvalidCaptureGroupName) + c.error(InvalidCaptureGroupName) } c.out.namedCaptureMap[captureName] = groupNumber } @@ -768,22 +767,22 @@ func (c *compiler) doParseActions(action patternParseAction) bool { case doConditionalExpr, doPerlInline: // Conditionals such as (?(1)a:b) // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them. - c.error(uerror.Unimplemented) + c.error(Unimplemented) case doCloseParen: c.handleCloseParen() if len(c.parenStack) == 0 { // Extra close paren, or missing open paren. 
- c.error(uerror.MismatchedParen) + c.error(MismatchedParen) } case doNOP: case doBadOpenParenType, doRuleError: - c.error(uerror.RuleSyntax) + c.error(RuleSyntax) case doMismatchedParenErr: - c.error(uerror.MismatchedParen) + c.error(MismatchedParen) case doPlus: // Normal '+' compiles to @@ -999,7 +998,7 @@ func (c *compiler) doParseActions(action patternParseAction) bool { digitValue := uCharDigitValue(c.c.char) val := int64(c.intervalLow)*10 + digitValue if val > math.MaxInt32 { - c.error(uerror.NumberTooBig) + c.error(NumberTooBig) } else { c.intervalLow = int(val) } @@ -1012,7 +1011,7 @@ func (c *compiler) doParseActions(action patternParseAction) bool { digitValue := uCharDigitValue(c.c.char) val := int64(c.intervalUpper)*10 + digitValue if val > math.MaxInt32 { - c.error(uerror.NumberTooBig) + c.error(NumberTooBig) } else { c.intervalUpper = int(val) } @@ -1066,7 +1065,7 @@ func (c *compiler) doParseActions(action patternParseAction) bool { c.compileInterval(urxCtrInitNg, urxCtrLoopNg) case doIntervalError: - c.error(uerror.BadInterval) + c.error(BadInterval) case doLiteralChar: // We've just scanned a "normal" character from the pattern, @@ -1076,7 +1075,7 @@ func (c *compiler) doParseActions(action patternParseAction) bool { // We've just scanned an backslashed escaped character with no // special meaning. It represents itself. 
if (c.modeFlags&ErrorOnUnknownEscapes) != 0 && ((c.c.char >= 0x41 && c.c.char <= 0x5A) || /* in [A-Z] */ (c.c.char >= 0x61 && c.c.char <= 0x7a)) { // in [a-z] - c.error(uerror.BadEscapeSequence) + c.error(BadEscapeSequence) } c.literalChar(c.c.char) @@ -1122,7 +1121,7 @@ func (c *compiler) doParseActions(action patternParseAction) bool { case doBackslashB: if !BreakIteration { if (c.modeFlags & UWord) != 0 { - c.error(uerror.Unimplemented) + c.error(Unimplemented) } } c.fixLiterals(false) @@ -1135,7 +1134,7 @@ func (c *compiler) doParseActions(action patternParseAction) bool { case doBackslashb: if !BreakIteration { if (c.modeFlags & UWord) != 0 { - c.error(uerror.Unimplemented) + c.error(Unimplemented) } } c.fixLiterals(false) @@ -1196,7 +1195,7 @@ func (c *compiler) doParseActions(action patternParseAction) bool { case doBackslashX: if !BreakIteration { // Grapheme Cluster Boundary requires ICU break iteration. - c.error(uerror.Unimplemented) + c.error(Unimplemented) } c.fixLiterals(false) c.appendOp(urxBackslashX, 0) @@ -1210,7 +1209,7 @@ func (c *compiler) doParseActions(action patternParseAction) bool { c.appendOp(urxBackslashZ, 0) case doEscapeError: - c.error(uerror.BadEscapeSequence) + c.error(BadEscapeSequence) case doExit: c.fixLiterals(false) @@ -1279,7 +1278,7 @@ func (c *compiler) doParseActions(action patternParseAction) bool { // Group name has not been defined. // Could be a forward reference. If we choose to support them at some // future time, extra mechanism will be required at this point. - c.error(uerror.InvalidCaptureGroupName) + c.error(InvalidCaptureGroupName) } else { // Given the number, handle identically to a \n numbered back reference. // See comments above, under doBackRef @@ -1448,7 +1447,7 @@ func (c *compiler) doParseActions(action patternParseAction) bool { c.modeFlags = c.newModeFlags case doBadModeFlag: - c.error(uerror.InvalidFlag) + c.error(InvalidFlag) case doSuppressComments: // We have just scanned a '(?'. 
We now need to prevent the character scanner from @@ -1619,7 +1618,7 @@ func (c *compiler) doParseActions(action patternParseAction) bool { if (c.modeFlags&ErrorOnUnknownEscapes) != 0 && ((c.c.char >= 0x41 && c.c.char <= 0x5A) || // in [A-Z] (c.c.char >= 0x61 && c.c.char <= 0x7a)) { // in [a-z] - c.error(uerror.BadEscapeSequence) + c.error(BadEscapeSequence) } c.setEval(setUnion) set := c.setStack[len(c.setStack)-1] @@ -1644,7 +1643,7 @@ func (c *compiler) doParseActions(action patternParseAction) bool { // and ICU UnicodeSet behavior. ch := c.scanNamedChar() if c.err == nil && (c.lastSetLiteral == -1 || c.lastSetLiteral > ch) { - c.error(uerror.InvalidRange) + c.error(InvalidRange) } set := c.setStack[len(c.setStack)-1] set.AddRuneRange(c.lastSetLiteral, ch) @@ -1668,10 +1667,10 @@ func (c *compiler) doParseActions(action patternParseAction) bool { } case doSetNoCloseError: - c.error(uerror.MissingCloseBracket) + c.error(MissingCloseBracket) case doSetOpError: - c.error(uerror.RuleSyntax) // -- or && at the end of a set. Illegal. + c.error(RuleSyntax) // -- or && at the end of a set. Illegal. case doSetPosixProp: if set := c.scanPosixProp(); set != nil { @@ -1692,7 +1691,7 @@ func (c *compiler) doParseActions(action patternParseAction) bool { // and ICU UnicodeSet behavior. 
if c.lastSetLiteral == -1 || c.lastSetLiteral > c.c.char { - c.error(uerror.InvalidRange) + c.error(InvalidRange) } c.setStack[len(c.setStack)-1].AddRuneRange(c.lastSetLiteral, c.c.char) @@ -1719,7 +1718,7 @@ func stackPop[T any](stack []T) (T, []T) { return out, stack } -func (c *compiler) error(e uerror.CompileErrorCode) { +func (c *compiler) error(e CompileErrorCode) { c.err = &CompileError{ Code: e, Line: c.lineNum, @@ -1778,7 +1777,7 @@ func (c *compiler) stripNOPs() { case urxBackref, urxBackrefI: where := op.value() if where > len(c.out.groupMap) { - c.error(uerror.InvalidBackRef) + c.error(InvalidBackRef) break } @@ -2294,7 +2293,7 @@ func (c *compiler) buildOp(typ opcode, val int) instruction { func (c *compiler) handleCloseParen() { if len(c.parenStack) == 0 { - c.error(uerror.MismatchedParen) + c.error(MismatchedParen) return } @@ -2398,7 +2397,7 @@ func (c *compiler) handleCloseParen() { maxML := c.maxMatchLength(c.matchOpenParen, patEnd) if maxML == math.MaxInt32 { - c.error(uerror.LookBehindLimit) + c.error(LookBehindLimit) break } if minML == math.MaxInt32 { @@ -2428,11 +2427,11 @@ func (c *compiler) handleCloseParen() { maxML := c.maxMatchLength(c.matchOpenParen, patEnd) if instruction(maxML).typ() != 0 { - c.error(uerror.LookBehindLimit) + c.error(LookBehindLimit) break } if maxML == math.MaxInt32 { - c.error(uerror.LookBehindLimit) + c.error(LookBehindLimit) break } if minML == math.MaxInt32 { @@ -2489,7 +2488,7 @@ func (c *compiler) fixLiterals(split bool) { } } else { if len(c.literalChars) > 0x00ffffff || len(c.out.literalText) > 0x00ffffff { - c.error(uerror.PatternTooBig) + c.error(PatternTooBig) } if c.modeFlags&CaseInsensitive != 0 { c.appendOp(urxStringI, len(c.out.literalText)) @@ -2512,14 +2511,14 @@ func (c *compiler) allocateData(size int) int { return 0 } if size <= 0 || size > 0x100 || c.out.dataSize < 0 { - c.error(uerror.InternalError) + c.error(InternalError) return 0 } dataIndex := c.out.dataSize c.out.dataSize += size if 
c.out.dataSize >= 0x00fffff0 { - c.error(uerror.InternalError) + c.error(InternalError) } return dataIndex } @@ -2529,13 +2528,13 @@ func (c *compiler) allocateStackData(size int) int { return 0 } if size <= 0 || size > 0x100 || c.out.frameSize < 0 { - c.error(uerror.InternalError) + c.error(InternalError) return 0 } dataIndex := c.out.frameSize c.out.frameSize += size if c.out.frameSize >= 0x00fffff0 { - c.error(uerror.InternalError) + c.error(InternalError) } return dataIndex } @@ -2698,18 +2697,18 @@ func (c *compiler) compileInterval(init opcode, loop opcode) { c.appendOp(loop, topOfBlock) if (c.intervalLow&0xff000000) != 0 || (c.intervalUpper > 0 && (c.intervalUpper&0xff000000) != 0) { - c.error(uerror.NumberTooBig) + c.error(NumberTooBig) } if c.intervalLow > c.intervalUpper && c.intervalUpper != -1 { - c.error(uerror.MaxLtMin) + c.error(MaxLtMin) } } func (c *compiler) scanNamedChar() rune { c.nextChar(&c.c) if c.c.char != chLBrace { - c.error(uerror.PropertySyntax) + c.error(PropertySyntax) return 0 } @@ -2720,7 +2719,7 @@ func (c *compiler) scanNamedChar() rune { break } if c.c.char == -1 { - c.error(uerror.PropertySyntax) + c.error(PropertySyntax) return 0 } charName = append(charName, c.c.char) @@ -2730,13 +2729,13 @@ func (c *compiler) scanNamedChar() rune { // All Unicode character names have only invariant characters. 
// The API to get a character, given a name, accepts only char *, forcing us to convert, // which requires this error check - c.error(uerror.PropertySyntax) + c.error(PropertySyntax) return 0 } theChar := unames.CharForName(unames.UnicodeCharName, string(charName)) if c.err != nil { - c.error(uerror.PropertySyntax) + c.error(PropertySyntax) } c.nextChar(&c.c) // Continue overall regex pattern processing with char after the '}' @@ -3312,7 +3311,7 @@ func (c *compiler) scanProp() *uset.UnicodeSet { c.nextChar(&c.c) if c.c.char != chLBrace { - c.error(uerror.PropertySyntax) + c.error(PropertySyntax) return nil } @@ -3323,7 +3322,7 @@ func (c *compiler) scanProp() *uset.UnicodeSet { break } if c.c.char == -1 { - c.error(uerror.PropertySyntax) + c.error(PropertySyntax) return nil } propertyName.WriteRune(c.c.char) @@ -3373,7 +3372,7 @@ func (c *compiler) createSetForProperty(propName string, negated bool) *uset.Uni if strings.HasPrefix(propName, "In") && len(propName) >= 3 { set = uset.New() if uprops.ApplyPropertyAlias(set, "Block", propName[2:]) != nil { - c.error(uerror.PropertySyntax) + c.error(PropertySyntax) } goto done } @@ -3384,7 +3383,7 @@ func (c *compiler) createSetForProperty(propName string, negated bool) *uset.Uni if strings.HasPrefix(propName, "Is") && len(propName) >= 3 { mPropName := propName[2:] if strings.IndexByte(mPropName, '=') >= 0 { - c.error(uerror.PropertySyntax) + c.error(PropertySyntax) goto done } @@ -3397,7 +3396,7 @@ func (c *compiler) createSetForProperty(propName string, negated bool) *uset.Uni set, err = uprops.NewUnicodeSetFomPattern("\\p{"+mPropName+"}", 0) if err != nil { - c.error(uerror.PropertySyntax) + c.error(PropertySyntax) } else if !set.IsEmpty() && (usetFlags&uset.CaseInsensitive) != 0 { set.CloseOver(uset.CaseInsensitive) } @@ -3511,7 +3510,7 @@ func (c *compiler) createSetForProperty(propName string, negated bool) *uset.Uni set.AddRuneRange(9, 0x0d) set.AddRuneRange(0x1c, 0x1f) } else { - c.error(uerror.PropertySyntax) + 
c.error(PropertySyntax) } if c.err == nil && !set.IsEmpty() && (usetFlags&uset.CaseInsensitive) != 0 { @@ -3522,7 +3521,7 @@ func (c *compiler) createSetForProperty(propName string, negated bool) *uset.Uni // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility // extensions matched it. - c.error(uerror.PropertySyntax) + c.error(PropertySyntax) done: if c.err != nil { diff --git a/go/mysql/icuregex/error.go b/go/mysql/icuregex/error.go index c2cde70b8be..3b8c8a36312 100644 --- a/go/mysql/icuregex/error.go +++ b/go/mysql/icuregex/error.go @@ -24,12 +24,10 @@ package icuregex import ( "fmt" "strings" - - "vitess.io/vitess/go/mysql/icuregex/internal/uerror" ) type CompileError struct { - Code uerror.CompileErrorCode + Code CompileErrorCode Line int Offset int Context string @@ -38,41 +36,37 @@ type CompileError struct { func (e *CompileError) Error() string { var out strings.Builder switch e.Code { - case uerror.InternalError: + case InternalError: out.WriteString("Internal Error") - case uerror.RuleSyntax: + case RuleSyntax: out.WriteString("Syntax Error") - case uerror.InvalidState: - out.WriteString("Invalid State") - case uerror.BadEscapeSequence: + case BadEscapeSequence: out.WriteString("Bad escape sequence") - case uerror.PropertySyntax: + case PropertySyntax: out.WriteString("Property syntax error") - case uerror.Unimplemented: + case Unimplemented: out.WriteString("Unimplemented") - case uerror.MismatchedParen: + case MismatchedParen: out.WriteString("Mismatched parentheses") - case uerror.NumberTooBig: + case NumberTooBig: out.WriteString("Number too big") - case uerror.BadInterval: + case BadInterval: out.WriteString("Bad interval") - case uerror.MaxLtMin: + case MaxLtMin: out.WriteString("Max less than min") - case uerror.InvalidBackRef: + case InvalidBackRef: out.WriteString("Invalid back reference") - case uerror.InvalidFlag: + case InvalidFlag: out.WriteString("Invalid flag") - case uerror.LookBehindLimit: + case 
LookBehindLimit: out.WriteString("Look behind limit") - case uerror.SetContainsString: - out.WriteString("Set contains string") - case uerror.MissingCloseBracket: + case MissingCloseBracket: out.WriteString("Missing closing ]") - case uerror.InvalidRange: + case InvalidRange: out.WriteString("Invalid range") - case uerror.PatternTooBig: + case PatternTooBig: out.WriteString("Pattern too big") - case uerror.InvalidCaptureGroupName: + case InvalidCaptureGroupName: out.WriteString("Invalid capture group name") } _, _ = fmt.Fprintf(&out, " at line %d, column %d: `%s`", e.Line, e.Offset, e.Context) @@ -81,7 +75,7 @@ func (e *CompileError) Error() string { } type MatchError struct { - Code uerror.MatchErrorCode + Code MatchErrorCode Pattern string Position int Input []rune @@ -92,9 +86,9 @@ const maxMatchInputLength = 20 func (e *MatchError) Error() string { var out strings.Builder switch e.Code { - case uerror.StackOverflow: + case StackOverflow: out.WriteString("Stack overflow") - case uerror.TimeOut: + case TimeOut: out.WriteString("Timeout") } @@ -123,3 +117,33 @@ func (e *MatchError) Error() string { return out.String() } + +type Code int32 + +type CompileErrorCode int32 + +const ( + InternalError CompileErrorCode = iota + 1 /**< An internal error (bug) was detected. */ + RuleSyntax /**< Syntax error in regexp pattern. */ + BadEscapeSequence /**< Unrecognized backslash escape sequence in pattern */ + PropertySyntax /**< Incorrect Unicode property */ + Unimplemented /**< Use of regexp feature that is not yet implemented. */ + MismatchedParen /**< Incorrectly nested parentheses in regexp pattern. */ + NumberTooBig /**< Decimal number is too large. */ + BadInterval /**< Error in {min,max} interval */ + MaxLtMin /**< In {min,max}, max is less than min. */ + InvalidBackRef /**< Back-reference to a non-existent capture group. */ + InvalidFlag /**< Invalid value for match mode flags. */ + LookBehindLimit /**< Look-Behind pattern matches must have a bounded maximum length. 
*/ + MissingCloseBracket /**< Missing closing bracket on a bracket expression. */ + InvalidRange /**< In a character range [x-y], x is greater than y. */ + PatternTooBig /**< Pattern exceeds limits on size or complexity. @stable ICU 55 */ + InvalidCaptureGroupName /**< Invalid capture group name. @stable ICU 55 */ +) + +type MatchErrorCode int32 + +const ( + StackOverflow MatchErrorCode = iota /**< Regular expression backtrack stack overflow. */ + TimeOut /**< Maximum allowed match time exceeded */ +) diff --git a/go/mysql/icuregex/errors/error.go b/go/mysql/icuregex/errors/error.go new file mode 100644 index 00000000000..8f32f5fee1e --- /dev/null +++ b/go/mysql/icuregex/errors/error.go @@ -0,0 +1,6 @@ +package errors + +import "errors" + +var ErrIllegalArgument = errors.New("illegal argument") +var ErrUnsupported = errors.New("unsupported") diff --git a/go/mysql/icuregex/internal/uerror/error.go b/go/mysql/icuregex/internal/uerror/error.go deleted file mode 100644 index 4842f1ae6eb..00000000000 --- a/go/mysql/icuregex/internal/uerror/error.go +++ /dev/null @@ -1,62 +0,0 @@ -/* -© 2016 and later: Unicode, Inc. and others. -Copyright (C) 2004-2015, International Business Machines Corporation and others. -Copyright 2023 The Vitess Authors. - -This file contains code derived from the Unicode Project's ICU library. -License & terms of use for the original code: http://www.unicode.org/copyright.html - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package uerror - -import ( - "errors" -) - -type Code int32 - -var ErrIllegalArgument = errors.New("illegal argument") -var ErrUnsupported = errors.New("unsupported") - -type CompileErrorCode int32 - -const ( - ZeroError CompileErrorCode = iota - InternalError /**< An internal error (bug) was detected. */ - RuleSyntax /**< Syntax error in regexp pattern. */ - InvalidState /**< RegexMatcher in invalid state for requested operation */ - BadEscapeSequence /**< Unrecognized backslash escape sequence in pattern */ - PropertySyntax /**< Incorrect Unicode property */ - Unimplemented /**< Use of regexp feature that is not yet implemented. */ - MismatchedParen /**< Incorrectly nested parentheses in regexp pattern. */ - NumberTooBig /**< Decimal number is too large. */ - BadInterval /**< Error in {min,max} interval */ - MaxLtMin /**< In {min,max}, max is less than min. */ - InvalidBackRef /**< Back-reference to a non-existent capture group. */ - InvalidFlag /**< Invalid value for match mode flags. */ - LookBehindLimit /**< Look-Behind pattern matches must have a bounded maximum length. */ - SetContainsString /**< Regexps cannot have UnicodeSets containing strings.*/ - MissingCloseBracket /**< Missing closing bracket on a bracket expression. */ - InvalidRange /**< In a character range [x-y], x is greater than y. */ - PatternTooBig /**< Pattern exceeds limits on size or complexity. @stable ICU 55 */ - InvalidCaptureGroupName /**< Invalid capture group name. @stable ICU 55 */ -) - -type MatchErrorCode int32 - -const ( - StackOverflow MatchErrorCode = iota /**< Regular expression backtrack stack overflow. 
*/ - TimeOut /**< Maximum allowed match time exceeded */ -) diff --git a/go/mysql/icuregex/internal/uprops/properties.go b/go/mysql/icuregex/internal/uprops/properties.go index ecf6d39ee20..06148cbbe18 100644 --- a/go/mysql/icuregex/internal/uprops/properties.go +++ b/go/mysql/icuregex/internal/uprops/properties.go @@ -26,12 +26,12 @@ import ( "strings" "sync" + "vitess.io/vitess/go/mysql/icuregex/errors" "vitess.io/vitess/go/mysql/icuregex/internal/normalizer" "vitess.io/vitess/go/mysql/icuregex/internal/pattern" "vitess.io/vitess/go/mysql/icuregex/internal/ubidi" "vitess.io/vitess/go/mysql/icuregex/internal/ucase" "vitess.io/vitess/go/mysql/icuregex/internal/uchar" - "vitess.io/vitess/go/mysql/icuregex/internal/uerror" "vitess.io/vitess/go/mysql/icuregex/internal/ulayout" "vitess.io/vitess/go/mysql/icuregex/internal/unames" "vitess.io/vitess/go/mysql/icuregex/internal/uset" @@ -65,9 +65,9 @@ func getInclusionsForSource(src propertySource) (*uset.UnicodeSet, error) { case srcNfkc: normalizer.Nfkc().AddPropertyStarts(u) case srcNfkcCf: - return nil, uerror.ErrUnsupported + return nil, errors.ErrUnsupported case srcNfcCanonIter: - return nil, uerror.ErrUnsupported + return nil, errors.ErrUnsupported case srcCase: ucase.AddPropertyStarts(u) case srcBidi: @@ -75,7 +75,7 @@ func getInclusionsForSource(src propertySource) (*uset.UnicodeSet, error) { case srcInpc, srcInsc, srcVo: AddULayoutPropertyStarts(src, u) default: - return nil, uerror.ErrUnsupported + return nil, errors.ErrUnsupported } inclusionsForSource[src] = u @@ -205,7 +205,7 @@ func ApplyIntPropertyValue(u *uset.UnicodeSet, prop Property, value int32) error return getIntPropertyValue(ch, prop) == value }) default: - return uerror.ErrUnsupported + return errors.ErrUnsupported } return nil } @@ -224,7 +224,7 @@ func mungeCharName(charname string) string { func ApplyPropertyPattern(u *uset.UnicodeSet, pat string) error { if len(pat) < 5 { - return uerror.ErrIllegalArgument + return errors.ErrIllegalArgument } 
var posix, isName, invert bool @@ -242,11 +242,11 @@ func ApplyPropertyPattern(u *uset.UnicodeSet, pat string) error { isName = c == 'N' pat = pattern.SkipWhitespace(pat[2:]) if len(pat) == 0 || pat[0] != '{' { - return uerror.ErrIllegalArgument + return errors.ErrIllegalArgument } pat = pat[1:] } else { - return uerror.ErrIllegalArgument + return errors.ErrIllegalArgument } var closePos int @@ -256,7 +256,7 @@ func ApplyPropertyPattern(u *uset.UnicodeSet, pat string) error { closePos = strings.IndexByte(pat, '}') } if closePos < 0 { - return uerror.ErrIllegalArgument + return errors.ErrIllegalArgument } equals := strings.IndexByte(pat, '=') @@ -301,7 +301,7 @@ func ApplyPropertyAlias(u *uset.UnicodeSet, prop, value string) error { if len(value) > 0 { p = getPropertyEnum(prop) if p == -1 { - return uerror.ErrIllegalArgument + return errors.ErrIllegalArgument } if p == UCharGeneralCategory { p = UCharGeneralCategoryMask @@ -318,11 +318,11 @@ func ApplyPropertyAlias(u *uset.UnicodeSet, prop, value string) error { p == UCharLeadCanonicalCombiningClass { val, err := strconv.ParseUint(value, 10, 8) if err != nil { - return uerror.ErrIllegalArgument + return errors.ErrIllegalArgument } v = int32(val) } else { - return uerror.ErrIllegalArgument + return errors.ErrIllegalArgument } } } else { @@ -330,7 +330,7 @@ func ApplyPropertyAlias(u *uset.UnicodeSet, prop, value string) error { case UCharNumericValue: val, err := strconv.ParseFloat(value, 64) if err != nil { - return uerror.ErrIllegalArgument + return errors.ErrIllegalArgument } incl, err := getInclusionsForProperty(p) if err != nil { @@ -346,7 +346,7 @@ func ApplyPropertyAlias(u *uset.UnicodeSet, prop, value string) error { charName := mungeCharName(value) ch := unames.CharForName(unames.ExtendedCharName, charName) if ch < 0 { - return uerror.ErrIllegalArgument + return errors.ErrIllegalArgument } u.Clear() u.AddRune(ch) @@ -367,12 +367,12 @@ func ApplyPropertyAlias(u *uset.UnicodeSet, prop, value string) error { 
case UCharScriptExtensions: v = getPropertyValueEnum(UCharScript, value) if v == -1 { - return uerror.ErrIllegalArgument + return errors.ErrIllegalArgument } default: // p is a non-binary, non-enumerated property that we // don't support (yet). - return uerror.ErrIllegalArgument + return errors.ErrIllegalArgument } } } else { @@ -401,7 +401,7 @@ func ApplyPropertyAlias(u *uset.UnicodeSet, prop, value string) error { v = int32(uchar.GcCnMask) invert = true } else { - return uerror.ErrIllegalArgument + return errors.ErrIllegalArgument } } } diff --git a/go/mysql/icuregex/matcher.go b/go/mysql/icuregex/matcher.go index fa9e540c296..f04d8f531f0 100644 --- a/go/mysql/icuregex/matcher.go +++ b/go/mysql/icuregex/matcher.go @@ -27,7 +27,6 @@ import ( "vitess.io/vitess/go/mysql/icuregex/internal/ucase" "vitess.io/vitess/go/mysql/icuregex/internal/uchar" - "vitess.io/vitess/go/mysql/icuregex/internal/uerror" "vitess.io/vitess/go/mysql/icuregex/internal/uprops" ) @@ -1323,7 +1322,7 @@ func (m *Matcher) incrementTime(inputIdx int) error { m.time++ if m.timeLimit > 0 && m.time >= m.timeLimit { return &MatchError{ - Code: uerror.TimeOut, + Code: TimeOut, Pattern: m.pattern.pattern, Position: inputIdx, Input: m.input, @@ -1635,6 +1634,14 @@ func (m *Matcher) Group(i int) (string, bool) { return string(m.input[start:end]), true } +func (m *Matcher) End() int { + if !m.match { + return -1 + } + + return m.matchEnd +} + // Test for any of the Unicode line terminating characters. 
func isLineTerminator(c rune) bool { if (c & ^(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) != 0 { diff --git a/go/mysql/icuregex/ops.go b/go/mysql/icuregex/ops.go index e1418ff6325..dbb83ee3d24 100644 --- a/go/mysql/icuregex/ops.go +++ b/go/mysql/icuregex/ops.go @@ -25,7 +25,6 @@ import ( "golang.org/x/exp/slices" "vitess.io/vitess/go/mysql/icuregex/internal/ucase" - "vitess.io/vitess/go/mysql/icuregex/internal/uerror" "vitess.io/vitess/go/mysql/icuregex/internal/utf16" ) @@ -295,7 +294,7 @@ func (s *stack) sp() int { func (s *stack) newFrame(inputIdx int, input []rune, pattern string) (stackFrame, error) { if s.stackLimit != 0 && len(s.ary)+s.frameSize > s.stackLimit { return nil, &MatchError{ - Code: uerror.StackOverflow, + Code: StackOverflow, Pattern: pattern, Position: inputIdx, Input: input, diff --git a/go/mysql/icuregex/perl_test.go b/go/mysql/icuregex/perl_test.go index f30ec443605..0e7beda9fbd 100644 --- a/go/mysql/icuregex/perl_test.go +++ b/go/mysql/icuregex/perl_test.go @@ -27,8 +27,6 @@ import ( "strconv" "strings" "testing" - - "vitess.io/vitess/go/mysql/icuregex/internal/uerror" ) func TestPerl(t *testing.T) { @@ -96,7 +94,7 @@ func TestPerl(t *testing.T) { testPat, err := CompileString(pattern, flags) if err != nil { - if cerr, ok := err.(*CompileError); ok && cerr.Code == uerror.Unimplemented { + if cerr, ok := err.(*CompileError); ok && cerr.Code == Unimplemented { continue } if strings.IndexByte(fields[2], 'c') == -1 && strings.IndexByte(fields[2], 'i') == -1 { diff --git a/go/mysql/sql_error.go b/go/mysql/sql_error.go index c400de4ef9a..7230a3ce281 100644 --- a/go/mysql/sql_error.go +++ b/go/mysql/sql_error.go @@ -218,6 +218,26 @@ var stateToMysqlCode = map[vterrors.State]mysqlCode{ vterrors.WrongArguments: {num: ERWrongArguments, state: SSUnknownSQLState}, vterrors.UnknownStmtHandler: {num: ERUnknownStmtHandler, state: SSUnknownSQLState}, vterrors.UnknownTimeZone: {num: ERUnknownTimeZone, state: SSUnknownSQLState}, + 
vterrors.RegexpStringNotTerminated: {num: ERRegexpStringNotTerminated, state: SSUnknownSQLState}, + vterrors.RegexpBufferOverflow: {num: ERRegexpBufferOverflow, state: SSUnknownSQLState}, + vterrors.RegexpIllegalArgument: {num: ERRegexpIllegalArgument, state: SSUnknownSQLState}, + vterrors.RegexpIndexOutOfBounds: {num: ERRegexpIndexOutOfBounds, state: SSUnknownSQLState}, + vterrors.RegexpInternal: {num: ERRegexpInternal, state: SSUnknownSQLState}, + vterrors.RegexpRuleSyntax: {num: ERRegexpRuleSyntax, state: SSUnknownSQLState}, + vterrors.RegexpBadEscapeSequence: {num: ERRegexpBadEscapeSequence, state: SSUnknownSQLState}, + vterrors.RegexpUnimplemented: {num: ERRegexpUnimplemented, state: SSUnknownSQLState}, + vterrors.RegexpMismatchParen: {num: ERRegexpMismatchParen, state: SSUnknownSQLState}, + vterrors.RegexpBadInterval: {num: ERRegexpBadInterval, state: SSUnknownSQLState}, + vterrors.RegexpMaxLtMin: {num: ERRRegexpMaxLtMin, state: SSUnknownSQLState}, + vterrors.RegexpInvalidBackRef: {num: ERRegexpInvalidBackRef, state: SSUnknownSQLState}, + vterrors.RegexpLookBehindLimit: {num: ERRegexpLookBehindLimit, state: SSUnknownSQLState}, + vterrors.RegexpMissingCloseBracket: {num: ERRegexpMissingCloseBracket, state: SSUnknownSQLState}, + vterrors.RegexpInvalidRange: {num: ERRegexpInvalidRange, state: SSUnknownSQLState}, + vterrors.RegexpStackOverflow: {num: ERRegexpStackOverflow, state: SSUnknownSQLState}, + vterrors.RegexpTimeOut: {num: ERRegexpTimeOut, state: SSUnknownSQLState}, + vterrors.RegexpPatternTooBig: {num: ERRegexpPatternTooBig, state: SSUnknownSQLState}, + vterrors.RegexpInvalidFlag: {num: ERRegexpInvalidFlag, state: SSUnknownSQLState}, + vterrors.RegexpInvalidCaptureGroup: {num: ERRegexpInvalidCaptureGroup, state: SSUnknownSQLState}, } func getStateToMySQLState(state vterrors.State) mysqlCode { diff --git a/go/vt/vterrors/state.go b/go/vt/vterrors/state.go index d7ed04e1c7b..37da94ad92c 100644 --- a/go/vt/vterrors/state.go +++ b/go/vt/vterrors/state.go @@ 
-88,6 +88,28 @@ const ( // unknown timezone UnknownTimeZone + // regexp errors + RegexpStringNotTerminated + RegexpBufferOverflow + RegexpIllegalArgument + RegexpIndexOutOfBounds + RegexpInternal + RegexpRuleSyntax + RegexpBadEscapeSequence + RegexpUnimplemented + RegexpMismatchParen + RegexpBadInterval + RegexpMaxLtMin + RegexpInvalidBackRef + RegexpLookBehindLimit + RegexpMissingCloseBracket + RegexpInvalidRange + RegexpStackOverflow + RegexpTimeOut + RegexpPatternTooBig + RegexpInvalidCaptureGroup + RegexpInvalidFlag + // No state should be added below NumOfStates NumOfStates ) diff --git a/go/vt/vtgate/evalengine/compare.go b/go/vt/vtgate/evalengine/compare.go index f2262cf8730..deee5fdb520 100644 --- a/go/vt/vtgate/evalengine/compare.go +++ b/go/vt/vtgate/evalengine/compare.go @@ -137,7 +137,7 @@ func compareStrings(l, r eval) (int, error) { if err != nil { return 0, err } - collation := col.Get() + collation := col.Collation.Get() if collation == nil { panic("unknown collation after coercion") } diff --git a/go/vt/vtgate/evalengine/compiler_test.go b/go/vt/vtgate/evalengine/compiler_test.go index 92ef9d3d465..2c0fa71f52a 100644 --- a/go/vt/vtgate/evalengine/compiler_test.go +++ b/go/vt/vtgate/evalengine/compiler_test.go @@ -444,6 +444,10 @@ func TestCompilerSingle(t *testing.T) { expression: `INTERVAL(0, 0, 0, -1, NULL, NULL, 1)`, result: `INT64(5)`, }, + { + expression: `REGEXP_SUBSTR(_latin1'a', _latin1'A' collate latin1_general_ci)`, + result: `VARCHAR("X def ghi")`, + }, } for _, tc := range testCases { diff --git a/go/vt/vtgate/evalengine/expr_collate.go b/go/vt/vtgate/evalengine/expr_collate.go index 16fe8351880..4962d9dd2ba 100644 --- a/go/vt/vtgate/evalengine/expr_collate.go +++ b/go/vt/vtgate/evalengine/expr_collate.go @@ -152,16 +152,16 @@ func mergeCollations(c1, c2 collations.TypedCollation, t1, t2 sqltypes.Type) (co }) } -func mergeAndCoerceCollations(left, right eval) (eval, eval, collations.ID, error) { +func mergeAndCoerceCollations(left, 
right eval) (eval, eval, collations.TypedCollation, error) { lt := left.SQLType() rt := right.SQLType() mc, coerceLeft, coerceRight, err := mergeCollations(evalCollation(left), evalCollation(right), lt, rt) if err != nil { - return nil, nil, 0, err + return nil, nil, collations.TypedCollation{}, err } if coerceLeft == nil && coerceRight == nil { - return left, right, mc.Collation, nil + return left, right, mc, nil } left1 := newEvalRaw(lt, left.(*evalBytes).bytes, mc) @@ -170,16 +170,16 @@ func mergeAndCoerceCollations(left, right eval) (eval, eval, collations.ID, erro if coerceLeft != nil { left1.bytes, err = coerceLeft(nil, left1.bytes) if err != nil { - return nil, nil, 0, err + return nil, nil, collations.TypedCollation{}, err } } if coerceRight != nil { right1.bytes, err = coerceRight(nil, right1.bytes) if err != nil { - return nil, nil, 0, err + return nil, nil, collations.TypedCollation{}, err } } - return left1, right1, mc.Collation, nil + return left1, right1, mc, nil } type collationAggregation struct { diff --git a/go/vt/vtgate/evalengine/expr_compare.go b/go/vt/vtgate/evalengine/expr_compare.go index cef7493e026..3aca0cc1151 100644 --- a/go/vt/vtgate/evalengine/expr_compare.go +++ b/go/vt/vtgate/evalengine/expr_compare.go @@ -558,7 +558,7 @@ func (l *LikeExpr) eval(env *ExpressionEnv) (eval, error) { return nil, err } - var col collations.ID + var col collations.TypedCollation left, right, col, err = mergeAndCoerceCollations(left, right) if err != nil { return nil, err @@ -567,11 +567,11 @@ func (l *LikeExpr) eval(env *ExpressionEnv) (eval, error) { var matched bool switch { case typeIsTextual(left.SQLType()) && typeIsTextual(right.SQLType()): - matched = l.matchWildcard(left.(*evalBytes).bytes, right.(*evalBytes).bytes, col) + matched = l.matchWildcard(left.(*evalBytes).bytes, right.(*evalBytes).bytes, col.Collation) case typeIsTextual(right.SQLType()): - matched = l.matchWildcard(left.ToRawBytes(), right.(*evalBytes).bytes, col) + matched = 
l.matchWildcard(left.ToRawBytes(), right.(*evalBytes).bytes, col.Collation) case typeIsTextual(left.SQLType()): - matched = l.matchWildcard(left.(*evalBytes).bytes, right.ToRawBytes(), col) + matched = l.matchWildcard(left.(*evalBytes).bytes, right.ToRawBytes(), col.Collation) default: matched = l.matchWildcard(left.ToRawBytes(), right.ToRawBytes(), collations.CollationBinaryID) } diff --git a/go/vt/vtgate/evalengine/fn_regexp.go b/go/vt/vtgate/evalengine/fn_regexp.go index efd2d8cbffa..567ee61e668 100644 --- a/go/vt/vtgate/evalengine/fn_regexp.go +++ b/go/vt/vtgate/evalengine/fn_regexp.go @@ -1,21 +1,20 @@ package evalengine import ( + "errors" "strings" "vitess.io/vitess/go/hack" "vitess.io/vitess/go/mysql/collations" "vitess.io/vitess/go/mysql/collations/charset" "vitess.io/vitess/go/mysql/icuregex" + icuerrors "vitess.io/vitess/go/mysql/icuregex/errors" "vitess.io/vitess/go/sqltypes" querypb "vitess.io/vitess/go/vt/proto/query" + vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" + "vitess.io/vitess/go/vt/vterrors" ) -type builtinRegexpLike struct { - CallExpr - Negate bool -} - func evalRegexpFlags(env *ExpressionEnv, match Expr, flags icuregex.RegexpFlag) (icuregex.RegexpFlag, error) { m, err := match.eval(env) if err != nil || m == nil { @@ -36,13 +35,112 @@ func evalRegexpFlags(env *ExpressionEnv, match Expr, flags icuregex.RegexpFlag) flags |= icuregex.DotAll case 'u': flags |= icuregex.UnixLines + default: + return flags, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.WrongArguments, "Incorrect arguments to regexp_instr.") } } + default: + return flags, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.WrongArguments, "Incorrect arguments to regexp_instr.") } return flags, nil } +func evalOccurrence(env *ExpressionEnv, expr Expr) (int64, error) { + occExpr, err := expr.eval(env) + if err != nil { + return 0, err + } + return evalToInt64(occExpr).i, nil +} + +func evalReturnOption(env *ExpressionEnv, expr Expr) (int64, error) { + retExpr, 
err := expr.eval(env) + if err != nil { + return 0, err + } + returnOption := evalToInt64(retExpr).i + switch returnOption { + case 0, 1: + // Valid return options. + return returnOption, nil + } + return 0, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.WrongArguments, "Incorrect arguments to regexp_instr: return_option must be 1 or 0.") +} + +func evalPosition(env *ExpressionEnv, expr Expr, limit int64) (int64, error) { + posExpr, err := expr.eval(env) + if err != nil { + return 0, err + } + pos := evalToInt64(posExpr).i + if pos < 1 || pos > limit { + return 0, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpIndexOutOfBounds, "Index out of bounds in regular expression search.") + } + return pos, nil +} + +func compileRegex(pat eval, c collations.Charset, flags icuregex.RegexpFlag) (*icuregex.Pattern, error) { + patUtf8, err := charset.Convert(nil, &charset.Charset_utf8mb4{}, pat.ToRawBytes(), c) + if err != nil { + return nil, err + } + + regexp, err := icuregex.CompileString(hack.String(patUtf8), flags) + if err == nil { + return regexp, nil + } + + var compileErr *icuregex.CompileError + if errors.Is(err, icuerrors.ErrUnsupported) { + err = vterrors.NewErrorf(vtrpcpb.Code_UNIMPLEMENTED, vterrors.RegexpUnimplemented, err.Error()) + } else if errors.Is(err, icuerrors.ErrIllegalArgument) { + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpIllegalArgument, err.Error()) + } else if errors.As(err, &compileErr) { + switch compileErr.Code { + case icuregex.InternalError: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpInternal, compileErr.Error()) + case icuregex.RuleSyntax: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpRuleSyntax, compileErr.Error()) + case icuregex.BadEscapeSequence: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpBadEscapeSequence, compileErr.Error()) + case icuregex.PropertySyntax: + err = 
vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpRuleSyntax, compileErr.Error()) + case icuregex.Unimplemented: + err = vterrors.NewErrorf(vtrpcpb.Code_UNIMPLEMENTED, vterrors.RegexpUnimplemented, compileErr.Error()) + case icuregex.MismatchedParen: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpMismatchParen, compileErr.Error()) + case icuregex.BadInterval: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpBadInterval, compileErr.Error()) + case icuregex.MaxLtMin: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpMaxLtMin, compileErr.Error()) + case icuregex.InvalidBackRef: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpInvalidBackRef, compileErr.Error()) + case icuregex.InvalidFlag: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpInvalidFlag, compileErr.Error()) + case icuregex.LookBehindLimit: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpLookBehindLimit, compileErr.Error()) + case icuregex.MissingCloseBracket: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpMissingCloseBracket, compileErr.Error()) + case icuregex.InvalidRange: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpInvalidRange, compileErr.Error()) + case icuregex.PatternTooBig: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpPatternTooBig, compileErr.Error()) + case icuregex.InvalidCaptureGroupName: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpInvalidCaptureGroup, compileErr.Error()) + default: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpInternal, compileErr.Error()) + } + } + + return nil, err +} + +type builtinRegexpLike struct { + CallExpr + Negate bool +} + func (r *builtinRegexpLike) eval(env *ExpressionEnv) (eval, error) { input, err := r.Arguments[0].eval(env) if err != nil || input == nil { @@ -54,14 
+152,14 @@ func (r *builtinRegexpLike) eval(env *ExpressionEnv) (eval, error) { return nil, err } - var colid collations.ID - input, pat, colid, err = mergeAndCoerceCollations(input, pat) + var typedCol collations.TypedCollation + input, pat, typedCol, err = mergeAndCoerceCollations(input, pat) if err != nil { return nil, err } var flags icuregex.RegexpFlag - var collation = colid.Get() + var collation = typedCol.Collation.Get() if strings.Contains(collation.Name(), "_ci") { flags |= icuregex.CaseInsensitive } @@ -73,12 +171,7 @@ func (r *builtinRegexpLike) eval(env *ExpressionEnv) (eval, error) { } } - patUtf8, err := charset.Convert(nil, &charset.Charset_utf8mb4{}, pat.ToRawBytes(), collation.Charset()) - if err != nil { - return nil, err - } - - regexp, err := icuregex.CompileString(hack.String(patUtf8), flags) + regexp, err := compileRegex(pat, collation.Charset(), flags) if err != nil { return nil, err } @@ -100,7 +193,7 @@ func (r *builtinRegexpLike) eval(env *ExpressionEnv) (eval, error) { func (r *builtinRegexpLike) typeof(env *ExpressionEnv, fields []*querypb.Field) (sqltypes.Type, typeFlag) { _, f1 := r.Arguments[0].typeof(env, fields) _, f2 := r.Arguments[1].typeof(env, fields) - return sqltypes.Int64, f1 | f2 + return sqltypes.Int64, f1 | f2 | flagIsBoolean } func (r *builtinRegexpLike) compile(c *compiler) (ctype, error) { @@ -108,3 +201,333 @@ func (r *builtinRegexpLike) compile(c *compiler) (ctype, error) { } var _ Expr = (*builtinRegexpLike)(nil) + +type builtinRegexpInstr struct { + CallExpr +} + +func (r *builtinRegexpInstr) eval(env *ExpressionEnv) (eval, error) { + input, err := r.Arguments[0].eval(env) + if err != nil || input == nil { + return nil, err + } + + pat, err := r.Arguments[1].eval(env) + if err != nil || pat == nil { + return nil, err + } + + var typedCol collations.TypedCollation + input, pat, typedCol, err = mergeAndCoerceCollations(input, pat) + if err != nil { + return nil, err + } + + var flags icuregex.RegexpFlag + var 
collation = typedCol.Collation.Get() + if strings.Contains(collation.Name(), "_ci") { + flags |= icuregex.CaseInsensitive + } + + pos := int64(1) + occurrence := int64(1) + returnOption := int64(0) + inputRunes := charset.Expand(nil, input.ToRawBytes(), collation.Charset()) + + if len(r.Arguments) > 2 { + pos, err = evalPosition(env, r.Arguments[2], int64(len(inputRunes))) + if err != nil { + return nil, err + } + } + + if len(r.Arguments) > 3 { + occurrence, err = evalOccurrence(env, r.Arguments[3]) + if err != nil { + return nil, err + } + } + + if len(r.Arguments) > 4 { + returnOption, err = evalReturnOption(env, r.Arguments[4]) + if err != nil { + return nil, err + } + } + + if len(r.Arguments) > 5 { + flags, err = evalRegexpFlags(env, r.Arguments[5], flags) + if err != nil { + return nil, err + } + } + + regexp, err := compileRegex(pat, collation.Charset(), flags) + if err != nil { + return nil, err + } + + m := icuregex.NewMatcher(regexp) + m.Reset(inputRunes[pos-1:]) + + found := false + for i := int64(0); i < occurrence; i++ { + found, err = m.Find() + if err != nil { + return nil, err + } + if !found { + break + } + } + if !found { + return newEvalInt64(0), nil + } + if returnOption == 0 { + return newEvalInt64(int64(m.Start()) + pos), nil + } + return newEvalInt64(int64(m.End()) + pos), nil +} + +func (r *builtinRegexpInstr) typeof(env *ExpressionEnv, fields []*querypb.Field) (sqltypes.Type, typeFlag) { + _, f1 := r.Arguments[0].typeof(env, fields) + _, f2 := r.Arguments[1].typeof(env, fields) + return sqltypes.Int64, f1 | f2 +} + +func (r *builtinRegexpInstr) compile(c *compiler) (ctype, error) { + return ctype{}, c.unsupported(r) +} + +var _ Expr = (*builtinRegexpInstr)(nil) + +type builtinRegexpSubstr struct { + CallExpr +} + +func (r *builtinRegexpSubstr) eval(env *ExpressionEnv) (eval, error) { + input, err := r.Arguments[0].eval(env) + if err != nil || input == nil { + return nil, err + } + + pat, err := r.Arguments[1].eval(env) + if err != nil || 
pat == nil { + return nil, err + } + + var typedCol collations.TypedCollation + input, pat, typedCol, err = mergeAndCoerceCollations(input, pat) + if err != nil { + return nil, err + } + + var flags icuregex.RegexpFlag + var collation = typedCol.Collation.Get() + if strings.Contains(collation.Name(), "_ci") { + flags |= icuregex.CaseInsensitive + } + + pos := int64(1) + occurrence := int64(1) + inputRunes := charset.Expand(nil, input.ToRawBytes(), collation.Charset()) + + if len(r.Arguments) > 2 { + pos, err = evalPosition(env, r.Arguments[2], int64(len(inputRunes))) + if err != nil { + return nil, err + } + } + + if len(r.Arguments) > 3 { + occurrence, err = evalOccurrence(env, r.Arguments[3]) + if err != nil { + return nil, err + } + } + + if len(r.Arguments) > 4 { + flags, err = evalRegexpFlags(env, r.Arguments[4], flags) + if err != nil { + return nil, err + } + } + + regexp, err := compileRegex(pat, collation.Charset(), flags) + if err != nil { + return nil, err + } + + m := icuregex.NewMatcher(regexp) + m.Reset(inputRunes[pos-1:]) + + found := false + for i := int64(0); i < occurrence; i++ { + found, err = m.Find() + if err != nil { + return nil, err + } + if !found { + break + } + } + if !found { + return nil, nil + } + out := inputRunes[int64(m.Start())+pos-1 : int64(m.End())+pos-1] + bytes := charset.Collapse(nil, out, collation.Charset()) + return newEvalText(bytes, typedCol), nil +} + +func (r *builtinRegexpSubstr) typeof(env *ExpressionEnv, fields []*querypb.Field) (sqltypes.Type, typeFlag) { + _, f1 := r.Arguments[0].typeof(env, fields) + _, f2 := r.Arguments[1].typeof(env, fields) + return sqltypes.VarChar, f1 | f2 +} + +func (r *builtinRegexpSubstr) compile(c *compiler) (ctype, error) { + return ctype{}, c.unsupported(r) +} + +var _ Expr = (*builtinRegexpSubstr)(nil) + +type builtinRegexpReplace struct { + CallExpr +} + +func (r *builtinRegexpReplace) eval(env *ExpressionEnv) (eval, error) { + input, err := r.Arguments[0].eval(env) + if err != nil || 
input == nil { + return nil, err + } + + pat, err := r.Arguments[1].eval(env) + if err != nil || pat == nil { + return nil, err + } + + replArg, err := r.Arguments[2].eval(env) + if err != nil || pat == nil { + return nil, err + } + + var typedCol collations.TypedCollation + input, pat, typedCol, err = mergeAndCoerceCollations(input, pat) + if err != nil { + return nil, err + } + + repl, ok := replArg.(*evalBytes) + if !ok { + repl, err = evalToVarchar(replArg, typedCol.Collation, true) + if err != nil { + return nil, err + } + } + + replRunes := charset.Expand(nil, repl.ToRawBytes(), repl.col.Collation.Get().Charset()) + + var flags icuregex.RegexpFlag + var collation = typedCol.Collation.Get() + if strings.Contains(collation.Name(), "_ci") { + flags |= icuregex.CaseInsensitive + } + + pos := int64(1) + occurrence := int64(0) + inputRunes := charset.Expand(nil, input.ToRawBytes(), collation.Charset()) + + if len(r.Arguments) > 3 { + pos, err = evalPosition(env, r.Arguments[3], int64(len(inputRunes))) + if err != nil { + return nil, err + } + } + + if len(r.Arguments) > 4 { + occurrence, err = evalOccurrence(env, r.Arguments[4]) + if err != nil { + return nil, err + } + } + + if len(r.Arguments) > 5 { + flags, err = evalRegexpFlags(env, r.Arguments[5], flags) + if err != nil { + return nil, err + } + } + + regexp, err := compileRegex(pat, collation.Charset(), flags) + if err != nil { + return nil, err + } + + m := icuregex.NewMatcher(regexp) + m.Reset(inputRunes[pos-1:]) + + found := false + if occurrence > 0 { + for i := int64(0); i < occurrence; i++ { + found, err = m.Find() + if err != nil { + return nil, err + } + if !found { + break + } + } + if !found { + return newEvalRaw(sqltypes.Text, input.ToRawBytes(), typedCol), nil + } + + out := append(inputRunes[:int64(m.Start())+pos-1], replRunes...) + out = append(out, inputRunes[int64(m.End())+pos-1:]...) 
+ bytes := charset.Collapse(nil, out, collation.Charset()) + return newEvalRaw(sqltypes.Text, bytes, typedCol), nil + } + + found, err = m.Find() + if err != nil { + return nil, err + } + + if !found { + return newEvalRaw(sqltypes.Text, input.ToRawBytes(), typedCol), nil + } + + start := int64(m.Start()) + pos - 1 + out := append(inputRunes[:start], replRunes...) + end := int64(m.End()) + pos - 1 + for { + found, err = m.Find() + if err != nil { + return nil, err + } + if !found { + break + } + nextStart := int64(m.Start()) + pos - 1 + out = append(out, inputRunes[end:nextStart]...) + out = append(out, replRunes...) + end = int64(m.End()) + pos - 1 + } + + out = append(out, inputRunes[end:]...) + + bytes := charset.Collapse(nil, out, collation.Charset()) + return newEvalRaw(sqltypes.Text, bytes, typedCol), nil +} + +func (r *builtinRegexpReplace) typeof(env *ExpressionEnv, fields []*querypb.Field) (sqltypes.Type, typeFlag) { + _, f1 := r.Arguments[0].typeof(env, fields) + _, f2 := r.Arguments[1].typeof(env, fields) + _, f3 := r.Arguments[2].typeof(env, fields) + return sqltypes.Text, f1 | f2 | f3 +} + +func (r *builtinRegexpReplace) compile(c *compiler) (ctype, error) { + return ctype{}, c.unsupported(r) +} + +var _ Expr = (*builtinRegexpReplace)(nil) diff --git a/go/vt/vtgate/evalengine/testcases/cases.go b/go/vt/vtgate/evalengine/testcases/cases.go index 603b24498dd..b36f6ca0985 100644 --- a/go/vt/vtgate/evalengine/testcases/cases.go +++ b/go/vt/vtgate/evalengine/testcases/cases.go @@ -151,7 +151,10 @@ var Cases = []TestCase{ {Run: FnUUID}, {Run: FnUUIDToBin}, {Run: DateMath}, - {Run: Regexp}, + {Run: RegexpLike}, + {Run: RegexpInstr}, + {Run: RegexpSubstr}, + {Run: RegexpReplace}, } func JSONPathOperations(yield Query) { @@ -1900,9 +1903,12 @@ func DateMath(yield Query) { } } -func Regexp(yield Query) { +func RegexpLike(yield Query) { mysqlDocSamples := []string{ `'Michael!' REGEXP '.*'`, + `'Michael!' RLIKE '.*'`, + `'Michael!' NOT REGEXP '.*'`, + `'Michael!' 
NOT RLIKE '.*'`, `'new*\n*line' REGEXP 'new\\*.\\*line'`, `'a' REGEXP '^[a-d]'`, `REGEXP_LIKE('CamelCase', 'CAMELCASE')`, @@ -1938,3 +1944,129 @@ func Regexp(yield Query) { yield(q, nil) } } + +func RegexpInstr(yield Query) { + mysqlDocSamples := []string{ + `REGEXP_INSTR('Michael!', '.*')`, + `REGEXP_INSTR('new*\n*line', 'new\\*.\\*line')`, + `REGEXP_INSTR('a', '^[a-d]')`, + `REGEXP_INSTR('CamelCase', 'CAMELCASE')`, + `REGEXP_INSTR('CamelCase', 'CAMELCASE' COLLATE utf8mb4_0900_as_cs)`, + `REGEXP_INSTR('abc', 'ABC'`, + `REGEXP_INSTR('abc', 'ABC', 'c')`, + `REGEXP_INSTR(' ', '[[:blank:]]')`, + `REGEXP_INSTR('\t', '[[:blank:]]')`, + `REGEXP_INSTR(' ', '[[:space:]]')`, + `REGEXP_INSTR('\t', '[[:space:]]')`, + `REGEXP_INSTR(_latin1 0xFF, _latin1 '[[:lower:]]' COLLATE latin1_bin)`, + `REGEXP_INSTR(_koi8r 0xFF, _koi8r '[[:lower:]]' COLLATE koi8r_bin)`, + `REGEXP_INSTR(_latin1 0xFF, _latin1 '[[:upper:]]' COLLATE latin1_bin)`, + `REGEXP_INSTR(_koi8r 0xFF, _koi8r '[[:upper:]]' COLLATE koi8r_bin)`, + `REGEXP_INSTR(_latin1 0xF7, _latin1 '[[:alpha:]]')`, + `REGEXP_INSTR(_koi8r 0xF7, _koi8r '[[:alpha:]]')`, + `REGEXP_INSTR(_latin1'a', _latin1'A' collate latin1_general_ci)`, + `REGEXP_INSTR(_latin1'a', _latin1'A' collate latin1_bin)`, + `REGEXP_INSTR('a', '\\p{alphabetic}')`, + `REGEXP_INSTR('a', '\\P{alphabetic}')`, + `REGEXP_INSTR('👌🏾, '\\p{Emoji}\\p{Emoji_modifier}')`, + `REGEXP_INSTR('a', '\\p{Lowercase_letter}')`, + `REGEXP_INSTR('a', '\\p{Uppercase_letter}')`, + `REGEXP_INSTR('A', '\\p{Lowercase_letter}')`, + `REGEXP_INSTR('A', '\\p{Uppercase_letter}')`, + `REGEXP_INSTR('a', collate utf8mb4_0900_as_cs regexp '\\p{Lowercase_letter}')`, + `REGEXP_INSTR('A', collate utf8mb4_0900_as_cs regexp '\\p{Lowercase_letter}')`, + `REGEXP_INSTR('a', collate utf8mb4_0900_as_cs regexp '\\p{Uppercase_letter}')`, + `REGEXP_INSTR('A', collate utf8mb4_0900_as_cs regexp '\\p{Uppercase_letter}')`, + `REGEXP_INSTR('dog cat dog', 'dog')`, + `REGEXP_INSTR('dog cat dog', 'dog', 2)`, + 
`REGEXP_INSTR('dog cat dog', 'dog', 1, 1)`, + `REGEXP_INSTR('dog cat dog', 'dog', 1, 1, 0)`, + `REGEXP_INSTR('dog cat dog', 'dog', 1, 1, 1)`, + `REGEXP_INSTR('dog cat dog', 'DOG', 1, 1, 1, 'i')`, + `REGEXP_INSTR('dog cat dog', 'DOG', 1, 1, 1, 'c')`, + `REGEXP_INSTR('dog cat dog', 'dog', 1, 2)`, + `REGEXP_INSTR('dog cat dog', 'dog', 1, 2, 0)`, + `REGEXP_INSTR('dog cat dog', 'dog', 1, 2, 1)`, + `REGEXP_INSTR('dog cat dog', 'DOG', 1, 2, 1, 'i')`, + `REGEXP_INSTR('dog cat dog', 'DOG', 1, 2, 1, 'c')`, + `REGEXP_INSTR('aa aaa aaaa', 'a{2}')`, + `REGEXP_INSTR('aa aaa aaaa', 'a{4}')`, + `REGEXP_INSTR(123, 123)`, + } + + for _, q := range mysqlDocSamples { + yield(q, nil) + } +} + +func RegexpSubstr(yield Query) { + mysqlDocSamples := []string{ + `REGEXP_SUBSTR('Michael!', '.*')`, + `REGEXP_SUBSTR('new*\n*line', 'new\\*.\\*line')`, + `REGEXP_SUBSTR('a', '^[a-d]')`, + `REGEXP_SUBSTR('CamelCase', 'CAMELCASE')`, + `REGEXP_SUBSTR('CamelCase', 'CAMELCASE' COLLATE utf8mb4_0900_as_cs)`, + `REGEXP_SUBSTR('abc', 'ABC'`, + `REGEXP_SUBSTR(' ', '[[:blank:]]')`, + `REGEXP_SUBSTR('\t', '[[:blank:]]')`, + `REGEXP_SUBSTR(' ', '[[:space:]]')`, + `REGEXP_SUBSTR('\t', '[[:space:]]')`, + `REGEXP_SUBSTR(_latin1'a', _latin1'A' collate latin1_general_ci)`, + `REGEXP_SUBSTR(_latin1'a', _latin1'A' collate latin1_bin)`, + `REGEXP_SUBSTR('a', '\\p{alphabetic}')`, + `REGEXP_SUBSTR('a', '\\P{alphabetic}')`, + `REGEXP_SUBSTR('👌🏾, '\\p{Emoji}\\p{Emoji_modifier}')`, + `REGEXP_SUBSTR('a', '\\p{Lowercase_letter}')`, + `REGEXP_SUBSTR('a', '\\p{Uppercase_letter}')`, + `REGEXP_SUBSTR('A', '\\p{Lowercase_letter}')`, + `REGEXP_SUBSTR('A', '\\p{Uppercase_letter}')`, + `REGEXP_SUBSTR('a', collate utf8mb4_0900_as_cs regexp '\\p{Lowercase_letter}')`, + `REGEXP_SUBSTR('A', collate utf8mb4_0900_as_cs regexp '\\p{Lowercase_letter}')`, + `REGEXP_SUBSTR('a', collate utf8mb4_0900_as_cs regexp '\\p{Uppercase_letter}')`, + `REGEXP_SUBSTR('A', collate utf8mb4_0900_as_cs regexp '\\p{Uppercase_letter}')`, + `REGEXP_SUBSTR('dog 
cat dog', 'dog')`, + `REGEXP_SUBSTR('dog cat dog', 'dog', 2)`, + `REGEXP_SUBSTR('dog cat dog', 'dog', 1, 1)`, + `REGEXP_SUBSTR('dog cat dog', 'DOG', 1, 1, 'i')`, + `REGEXP_SUBSTR('dog cat dog', 'DOG', 1, 1, 'c')`, + `REGEXP_SUBSTR('dog cat dog', 'dog', 1, 2)`, + `REGEXP_SUBSTR('dog cat dog', 'DOG', 1, 2, 'i')`, + `REGEXP_SUBSTR('dog cat dog', 'DOG', 1, 2, 'c')`, + `REGEXP_SUBSTR('aa aaa aaaa', 'a{2}')`, + `REGEXP_SUBSTR('aa aaa aaaa', 'a{4}')`, + } + + for _, q := range mysqlDocSamples { + yield(q, nil) + } +} + +func RegexpReplace(yield Query) { + mysqlDocSamples := []string{ + `REGEXP_REPLACE('a b c', 'b', 'X')`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 1, 0)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 1, 1)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 1, 2)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 1, 3)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 2, 0)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 2, 1)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 2, 2)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 2, 3)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 3, 0)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 3, 1)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 3, 2)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 3, 3)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 4, 0)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 4, 1)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 4, 2)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 4, 3)`, + `REGEXP_REPLACE('a', '\\p{Lowercase_letter}', 'X')`, + `REGEXP_REPLACE('a', '\\p{Uppercase_letter}', 'X')`, + `REGEXP_REPLACE('A', '\\p{Lowercase_letter}', 'X')`, + `REGEXP_REPLACE('A', '\\p{Uppercase_letter}', 'X')`, + } + + for _, q := range mysqlDocSamples { + yield(q, nil) + } +} diff --git a/go/vt/vtgate/evalengine/translate_builtin.go b/go/vt/vtgate/evalengine/translate_builtin.go index f4a27dad704..49784973180 100644 --- a/go/vt/vtgate/evalengine/translate_builtin.go 
+++ b/go/vt/vtgate/evalengine/translate_builtin.go @@ -791,6 +791,141 @@ func (ast *astCompiler) translateCallable(call sqlparser.Callable) (Expr, error) Negate: false, }, nil + case *sqlparser.RegexpInstrExpr: + input, err := ast.translateExpr(call.Expr) + if err != nil { + return nil, err + } + + pattern, err := ast.translateExpr(call.Pattern) + if err != nil { + return nil, err + } + + args := []Expr{input, pattern} + + if call.Position != nil { + position, err := ast.translateExpr(call.Position) + if err != nil { + return nil, err + } + args = append(args, position) + } + + if call.Occurrence != nil { + occurrence, err := ast.translateExpr(call.Occurrence) + if err != nil { + return nil, err + } + args = append(args, occurrence) + } + + if call.ReturnOption != nil { + returnOption, err := ast.translateExpr(call.ReturnOption) + if err != nil { + return nil, err + } + args = append(args, returnOption) + } + + if call.MatchType != nil { + matchType, err := ast.translateExpr(call.MatchType) + if err != nil { + return nil, err + } + args = append(args, matchType) + } + + return &builtinRegexpInstr{ + CallExpr: CallExpr{Arguments: args, Method: "REGEXP_INSTR"}, + }, nil + + case *sqlparser.RegexpSubstrExpr: + input, err := ast.translateExpr(call.Expr) + if err != nil { + return nil, err + } + + pattern, err := ast.translateExpr(call.Pattern) + if err != nil { + return nil, err + } + + args := []Expr{input, pattern} + + if call.Position != nil { + position, err := ast.translateExpr(call.Position) + if err != nil { + return nil, err + } + args = append(args, position) + } + + if call.Occurrence != nil { + occurrence, err := ast.translateExpr(call.Occurrence) + if err != nil { + return nil, err + } + args = append(args, occurrence) + } + + if call.MatchType != nil { + matchType, err := ast.translateExpr(call.MatchType) + if err != nil { + return nil, err + } + args = append(args, matchType) + } + + return &builtinRegexpSubstr{ + CallExpr: CallExpr{Arguments: args, 
Method: "REGEXP_SUBSTR"}, + }, nil + + case *sqlparser.RegexpReplaceExpr: + input, err := ast.translateExpr(call.Expr) + if err != nil { + return nil, err + } + + pattern, err := ast.translateExpr(call.Pattern) + if err != nil { + return nil, err + } + + repl, err := ast.translateExpr(call.Repl) + if err != nil { + return nil, err + } + + args := []Expr{input, pattern, repl} + + if call.Position != nil { + position, err := ast.translateExpr(call.Position) + if err != nil { + return nil, err + } + args = append(args, position) + } + + if call.Occurrence != nil { + occurrence, err := ast.translateExpr(call.Occurrence) + if err != nil { + return nil, err + } + args = append(args, occurrence) + } + + if call.MatchType != nil { + matchType, err := ast.translateExpr(call.MatchType) + if err != nil { + return nil, err + } + args = append(args, matchType) + } + + return &builtinRegexpReplace{ + CallExpr: CallExpr{Arguments: args, Method: "REGEXP_REPLACE"}, + }, nil default: return nil, translateExprNotSupported(call) } diff --git a/go/vt/vttablet/tabletmanager/vreplication/utils.go b/go/vt/vttablet/tabletmanager/vreplication/utils.go index 02bcbb235be..1e26687e147 100644 --- a/go/vt/vttablet/tabletmanager/vreplication/utils.go +++ b/go/vt/vttablet/tabletmanager/vreplication/utils.go @@ -155,6 +155,26 @@ func isUnrecoverableError(err error) bool { mysql.ERInvalidJSONTextInParams, mysql.ERJSONDocumentTooDeep, mysql.ERJSONValueTooBig, + mysql.ERRegexpError, + mysql.ERRegexpStringNotTerminated, + mysql.ERRegexpIllegalArgument, + mysql.ERRegexpIndexOutOfBounds, + mysql.ERRegexpInternal, + mysql.ERRegexpRuleSyntax, + mysql.ERRegexpBadEscapeSequence, + mysql.ERRegexpUnimplemented, + mysql.ERRegexpMismatchParen, + mysql.ERRegexpBadInterval, + mysql.ERRRegexpMaxLtMin, + mysql.ERRegexpInvalidBackRef, + mysql.ERRegexpLookBehindLimit, + mysql.ERRegexpMissingCloseBracket, + mysql.ERRegexpInvalidRange, + mysql.ERRegexpStackOverflow, + mysql.ERRegexpTimeOut, + 
mysql.ERRegexpPatternTooBig, + mysql.ERRegexpInvalidCaptureGroup, + mysql.ERRegexpInvalidFlag, mysql.ERNoDefault, mysql.ERNoDefaultForField, mysql.ERNonUniq, From bd2dad824e950e1b06a304e30e3a2f801a061543 Mon Sep 17 00:00:00 2001 From: Dirkjan Bussink Date: Mon, 3 Jul 2023 16:18:35 +0200 Subject: [PATCH 11/18] regexp: Update generated data Signed-off-by: Dirkjan Bussink --- go/vt/vtgate/evalengine/cached_size.go | 36 ++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/go/vt/vtgate/evalengine/cached_size.go b/go/vt/vtgate/evalengine/cached_size.go index 8dda2d3902c..ea525e46a25 100644 --- a/go/vt/vtgate/evalengine/cached_size.go +++ b/go/vt/vtgate/evalengine/cached_size.go @@ -1257,6 +1257,18 @@ func (cached *builtinRandomBytes) CachedSize(alloc bool) int64 { size += cached.CallExpr.CachedSize(false) return size } +func (cached *builtinRegexpInstr) CachedSize(alloc bool) int64 { + if cached == nil { + return int64(0) + } + size := int64(0) + if alloc { + size += int64(48) + } + // field CallExpr vitess.io/vitess/go/vt/vtgate/evalengine.CallExpr + size += cached.CallExpr.CachedSize(false) + return size +} func (cached *builtinRegexpLike) CachedSize(alloc bool) int64 { if cached == nil { return int64(0) @@ -1269,6 +1281,30 @@ func (cached *builtinRegexpLike) CachedSize(alloc bool) int64 { size += cached.CallExpr.CachedSize(false) return size } +func (cached *builtinRegexpReplace) CachedSize(alloc bool) int64 { + if cached == nil { + return int64(0) + } + size := int64(0) + if alloc { + size += int64(48) + } + // field CallExpr vitess.io/vitess/go/vt/vtgate/evalengine.CallExpr + size += cached.CallExpr.CachedSize(false) + return size +} +func (cached *builtinRegexpSubstr) CachedSize(alloc bool) int64 { + if cached == nil { + return int64(0) + } + size := int64(0) + if alloc { + size += int64(48) + } + // field CallExpr vitess.io/vitess/go/vt/vtgate/evalengine.CallExpr + size += cached.CallExpr.CachedSize(false) + return size +} func (cached 
*builtinRepeat) CachedSize(alloc bool) int64 { if cached == nil { return int64(0) From 8fd1f6174a71bff0e70dcbe05e65b419effefa82 Mon Sep 17 00:00:00 2001 From: Dirkjan Bussink Date: Mon, 3 Jul 2023 16:36:56 +0200 Subject: [PATCH 12/18] Revert accidentally committed test Signed-off-by: Dirkjan Bussink --- go/vt/vtgate/evalengine/compiler_test.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/go/vt/vtgate/evalengine/compiler_test.go b/go/vt/vtgate/evalengine/compiler_test.go index 2c0fa71f52a..92ef9d3d465 100644 --- a/go/vt/vtgate/evalengine/compiler_test.go +++ b/go/vt/vtgate/evalengine/compiler_test.go @@ -444,10 +444,6 @@ func TestCompilerSingle(t *testing.T) { expression: `INTERVAL(0, 0, 0, -1, NULL, NULL, 1)`, result: `INT64(5)`, }, - { - expression: `REGEXP_SUBSTR(_latin1'a', _latin1'A' collate latin1_general_ci)`, - result: `VARCHAR("X def ghi")`, - }, } for _, tc := range testCases { From 88c382e9737cd70c608a21ced60964aa1426c377 Mon Sep 17 00:00:00 2001 From: Dirkjan Bussink Date: Tue, 4 Jul 2023 17:40:04 +0200 Subject: [PATCH 13/18] evalengine: Add compilation for regular expressions Also fixes a whole slew of bugs identified. 
Signed-off-by: Dirkjan Bussink --- go/mysql/collations/env.go | 9 +- go/mysql/constants.go | 2 + go/mysql/icuregex/compiler.go | 19 +- go/mysql/icuregex/error.go | 6 +- go/mysql/icuregex/icu_test.go | 9 +- .../icuregex/internal/pattern/unescape.go | 113 +++ .../internal/pattern/unescape_test.go | 10 + go/mysql/icuregex/pattern.go | 11 +- go/mysql/sql_error.go | 1 + go/vt/vterrors/state.go | 2 + go/vt/vtgate/evalengine/compiler_asm.go | 409 ++++++++- go/vt/vtgate/evalengine/compiler_test.go | 4 + go/vt/vtgate/evalengine/expr_collate.go | 6 + go/vt/vtgate/evalengine/fn_regexp.go | 810 +++++++++++++++--- .../evalengine/integration/fuzz_test.go | 3 + go/vt/vtgate/evalengine/testcases/cases.go | 33 +- go/vt/vtgate/evalengine/testcases/inputs.go | 46 + 17 files changed, 1328 insertions(+), 165 deletions(-) diff --git a/go/mysql/collations/env.go b/go/mysql/collations/env.go index 52a255b6f41..0c063e140d5 100644 --- a/go/mysql/collations/env.go +++ b/go/mysql/collations/env.go @@ -194,10 +194,11 @@ func makeEnv(version collver) *Environment { // A few interesting character set values. 
// See http://dev.mysql.com/doc/internals/en/character-set.html#packet-Protocol::CharacterSet const ( - CollationUtf8ID = 33 - CollationUtf8mb4ID = 255 - CollationBinaryID = 63 - CollationUtf8mb4BinID = 46 + CollationUtf8ID = 33 + CollationUtf8mb4ID = 255 + CollationBinaryID = 63 + CollationUtf8mb4BinID = 46 + CollationLatin1Swedish = 8 ) // Binary is the default Binary collation diff --git a/go/mysql/constants.go b/go/mysql/constants.go index f62f9373e0f..97f23a3a285 100644 --- a/go/mysql/constants.go +++ b/go/mysql/constants.go @@ -586,6 +586,8 @@ const ( ERRegexpInvalidCaptureGroup = ErrorCode(3887) ERRegexpInvalidFlag = ErrorCode(3900) + ERCharacterSetMismatch = ErrorCode(3995) + // max execution time exceeded ERQueryTimeout = ErrorCode(3024) diff --git a/go/mysql/icuregex/compiler.go b/go/mysql/icuregex/compiler.go index f2eac4ac9f8..eba297d0f21 100644 --- a/go/mysql/icuregex/compiler.go +++ b/go/mysql/icuregex/compiler.go @@ -73,7 +73,7 @@ const ( type compiler struct { err error out *Pattern - p string + p []rune scanIndex int quoteMode bool @@ -130,10 +130,13 @@ func (c *compiler) nextCharLL() (ch rune) { ch, c.peekChar = c.peekChar, -1 return } - var w int - ch, w = utf8.DecodeRuneInString(c.p) - c.p = c.p[w:] - if ch == utf8.RuneError && (w == 0 || w == 1) { + if len(c.p) == 0 { + return -1 + } + + ch = c.p[0] + c.p = c.p[1:] + if ch == utf8.RuneError { return -1 } @@ -223,7 +226,7 @@ func (c *compiler) nextChar(ch *reChar) { c.nextCharLL() // get & discard the peeked char. 
ch.quoted = true - ch.char, c.p = pattern.UnescapeAt(beforeEscape) + ch.char, c.p = pattern.UnescapeAtRunes(beforeEscape) if ch.char < 0 { c.error(BadEscapeSequence) } @@ -302,7 +305,7 @@ const ( chDash = 0x2d // '-' ) -func (c *compiler) compile(pat string) error { +func (c *compiler) compile(pat []rune) error { if c.err != nil { return c.err } @@ -310,7 +313,7 @@ func (c *compiler) compile(pat string) error { panic("cannot reuse pattern") } - c.out.pattern = pat + c.out.pattern = string(pat) c.p = pat var state uint16 = 1 diff --git a/go/mysql/icuregex/error.go b/go/mysql/icuregex/error.go index 3b8c8a36312..219ddcf602b 100644 --- a/go/mysql/icuregex/error.go +++ b/go/mysql/icuregex/error.go @@ -37,9 +37,9 @@ func (e *CompileError) Error() string { var out strings.Builder switch e.Code { case InternalError: - out.WriteString("Internal Error") + out.WriteString("Internal error") case RuleSyntax: - out.WriteString("Syntax Error") + out.WriteString("Syntax error") case BadEscapeSequence: out.WriteString("Bad escape sequence") case PropertySyntax: @@ -69,7 +69,7 @@ func (e *CompileError) Error() string { case InvalidCaptureGroupName: out.WriteString("Invalid capture group name") } - _, _ = fmt.Fprintf(&out, " at line %d, column %d: `%s`", e.Line, e.Offset, e.Context) + _, _ = fmt.Fprintf(&out, " in regular expression on line %d, character %d: `%s`", e.Line, e.Offset, e.Context) return out.String() } diff --git a/go/mysql/icuregex/icu_test.go b/go/mysql/icuregex/icu_test.go index ac42cc16b3f..25179c9a2c2 100644 --- a/go/mysql/icuregex/icu_test.go +++ b/go/mysql/icuregex/icu_test.go @@ -404,18 +404,19 @@ func TestCornerCases(t *testing.T) { func TestOne(t *testing.T) { icuregex.Dumper = os.Stderr - const Pattern = `\p{CaseIgnorable}` - const Input = "foo.bar" + pattern := []rune{55296, 56320} + input := []rune{'𐀀'} const Flags = 0 - re, err := icuregex.CompileString(Pattern, Flags) + re, err := icuregex.Compile(pattern, Flags) if err != nil { t.Fatalf("compilation 
failed: %v", err) } re.Dump(os.Stderr) - m := re.Match(Input) + m := icuregex.NewMatcher(re) + m.Reset(input) found, err := m.Find() require.NoError(t, err) t.Logf("match = %v", found) diff --git a/go/mysql/icuregex/internal/pattern/unescape.go b/go/mysql/icuregex/internal/pattern/unescape.go index bdef8ad5cb3..e4a554ff612 100644 --- a/go/mysql/icuregex/internal/pattern/unescape.go +++ b/go/mysql/icuregex/internal/pattern/unescape.go @@ -199,3 +199,116 @@ func UnescapeAt(str string) (rune, string) { return c, str } + +func UnescapeAtRunes(str []rune) (rune, []rune) { + if len(str) == 0 { + return -1, str + } + + c := str[0] + str = str[1:] + if c == utf8.RuneError { + return -1, str + } + + var minDig, maxDig, n int + var braces bool + var bitsPerDigit = 4 + var result rune + + switch c { + case 'u': + minDig = 4 + maxDig = 4 + case 'U': + minDig = 8 + maxDig = 8 + case 'x': + minDig = 1 + if len(str) > 0 && str[0] == '{' { + str = str[1:] + braces = true + maxDig = 8 + } else { + maxDig = 2 + } + default: + if dig := _digit8(c); dig >= 0 { + minDig = 1 + maxDig = 4 + n = 1 + bitsPerDigit = 3 + result = dig + } + } + + if minDig != 0 { + for n < maxDig && len(str) > 0 { + c = str[0] + if c == utf8.RuneError { + return -1, str + } + + var dig rune + if bitsPerDigit == 3 { + dig = _digit8(c) + } else { + dig = _digit16(c) + } + if dig < 0 { + break + } + result = (result << bitsPerDigit) | dig + str = str[1:] + n++ + } + if n < minDig { + return -1, str + } + if braces { + if c != '}' { + return -1, str + } + str = str[1:] + } + if result < 0 || result > utf8.MaxRune { + return -1, str + } + if len(str) > 0 && utf16.IsLead(result) { + c = str[0] + if c == utf8.RuneError { + return -1, str + } + if c == '\\' { + var str2 []rune + c, str2 = UnescapeAtRunes(str[1:]) + if utf16.IsTrail(c) { + result = utf16.DecodeRune(result, c) + str = str2 + } + } + } + return result, str + } + + if c < utf8.RuneSelf { + for i := 0; i < len(unscapeMap); i += 2 { + if byte(c) == 
unscapeMap[i] { + return rune(unscapeMap[i+1]), str + } + if byte(c) < unscapeMap[i] { + break + } + } + } + + if c == 'c' && len(str) > 0 { + c = str[0] + if c == utf8.RuneError { + return -1, str + } + return 0x1f & c, str[1:] + } + + return c, str +} diff --git a/go/mysql/icuregex/internal/pattern/unescape_test.go b/go/mysql/icuregex/internal/pattern/unescape_test.go index 8428584f8c8..0bb76c2bfdb 100644 --- a/go/mysql/icuregex/internal/pattern/unescape_test.go +++ b/go/mysql/icuregex/internal/pattern/unescape_test.go @@ -36,3 +36,13 @@ func TestUnescapeAt(t *testing.T) { assert.Equal(t, rune(0x00010000), r) assert.Equal(t, "", str) } + +func TestUnescapeAtRunes(t *testing.T) { + r, str := UnescapeAtRunes([]rune("ud800\\ud800\\udc00")) + assert.Equal(t, rune(0xd800), r) + assert.Equal(t, []rune("\\ud800\\udc00"), str) + + r, str = UnescapeAtRunes(str[1:]) + assert.Equal(t, rune(0x00010000), r) + assert.Equal(t, []rune(""), str) +} diff --git a/go/mysql/icuregex/pattern.go b/go/mysql/icuregex/pattern.go index d0913afa13f..f0823a213d4 100644 --- a/go/mysql/icuregex/pattern.go +++ b/go/mysql/icuregex/pattern.go @@ -67,7 +67,7 @@ func MustCompileString(in string, flags RegexpFlag) *Pattern { return pat } -func CompileString(in string, flags RegexpFlag) (*Pattern, error) { +func Compile(in []rune, flags RegexpFlag) (*Pattern, error) { pat := NewPattern(flags) cmp := newCompiler(pat) if err := cmp.compile(in); err != nil { @@ -76,6 +76,15 @@ func CompileString(in string, flags RegexpFlag) (*Pattern, error) { return pat, nil } +func CompileString(in string, flags RegexpFlag) (*Pattern, error) { + pat := NewPattern(flags) + cmp := newCompiler(pat) + if err := cmp.compile([]rune(in)); err != nil { + return nil, err + } + return pat, nil +} + func (p *Pattern) Match(input string) *Matcher { m := NewMatcher(p) m.ResetString(input) diff --git a/go/mysql/sql_error.go b/go/mysql/sql_error.go index 7230a3ce281..4c83fd956a0 100644 --- a/go/mysql/sql_error.go +++ 
b/go/mysql/sql_error.go @@ -238,6 +238,7 @@ var stateToMysqlCode = map[vterrors.State]mysqlCode{ vterrors.RegexpPatternTooBig: {num: ERRegexpPatternTooBig, state: SSUnknownSQLState}, vterrors.RegexpInvalidFlag: {num: ERRegexpInvalidFlag, state: SSUnknownSQLState}, vterrors.RegexpInvalidCaptureGroup: {num: ERRegexpInvalidCaptureGroup, state: SSUnknownSQLState}, + vterrors.CharacterSetMismatch: {num: ERCharacterSetMismatch, state: SSUnknownSQLState}, } func getStateToMySQLState(state vterrors.State) mysqlCode { diff --git a/go/vt/vterrors/state.go b/go/vt/vterrors/state.go index 37da94ad92c..406b535b510 100644 --- a/go/vt/vterrors/state.go +++ b/go/vt/vterrors/state.go @@ -110,6 +110,8 @@ const ( RegexpInvalidCaptureGroup RegexpInvalidFlag + CharacterSetMismatch + // No state should be added below NumOfStates NumOfStates ) diff --git a/go/vt/vtgate/evalengine/compiler_asm.go b/go/vt/vtgate/evalengine/compiler_asm.go index 870c32fd767..afef071e754 100644 --- a/go/vt/vtgate/evalengine/compiler_asm.go +++ b/go/vt/vtgate/evalengine/compiler_asm.go @@ -35,6 +35,8 @@ import ( "github.com/google/uuid" + "vitess.io/vitess/go/mysql/icuregex" + "vitess.io/vitess/go/hack" "vitess.io/vitess/go/mysql/collations" "vitess.io/vitess/go/mysql/collations/charset" @@ -3942,10 +3944,6 @@ func (asm *assembler) Fn_YEARWEEK() { }, "FN YEARWEEK DATE(SP-1)") } -func intervalStackOffset(l, i int) int { - return l - i + 1 -} - func (asm *assembler) Interval_i(l int) { asm.adjustStack(-l) asm.emit(func(env *ExpressionEnv) int { @@ -4285,3 +4283,406 @@ func (asm *assembler) Fn_DATEADD_s(unit datetime.IntervalType, sub bool, col col }, "FN DATEADD TEMPORAL(SP-2), INTERVAL(SP-1)") } + +func (asm *assembler) Fn_REGEXP_LIKE(m *icuregex.Matcher, negate bool, c charset.Charset, offset int) { + asm.adjustStack(-offset) + asm.emit(func(env *ExpressionEnv) int { + input := env.vm.stack[env.vm.sp-offset-1].(*evalBytes) + m.Reset(charset.Expand(nil, input.bytes, c)) + + ok, err := m.Find() + if err != nil 
{ + env.vm.err = err + env.vm.sp -= offset + return 1 + } + if negate { + ok = !ok + } + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalBool(ok) + env.vm.sp -= offset + return 1 + }, "FN REGEXP_LIKE VARCHAR(SP-2), VARCHAR(SP-1)") +} + +func (asm *assembler) Fn_REGEXP_LIKE_slow(negate bool, c collations.Charset, flags icuregex.RegexpFlag, offset int) { + asm.adjustStack(-offset) + asm.emit(func(env *ExpressionEnv) int { + var err error + input := env.vm.stack[env.vm.sp-offset-1].(*evalBytes) + pattern := env.vm.stack[env.vm.sp-offset].(*evalBytes) + + if offset > 1 { + fe := env.vm.stack[env.vm.sp-offset+1] + flags, err = regexpFlags(fe, flags, "regexp_like") + if err != nil { + env.vm.err = err + env.vm.sp -= offset + return 1 + } + } + + p, err := compileRegex(pattern, c, flags) + if err != nil { + env.vm.err = err + env.vm.sp -= offset + return 1 + } + + m := icuregex.NewMatcher(p) + m.Reset(charset.Expand(nil, input.bytes, c)) + + ok, err := m.Find() + if err != nil { + env.vm.err = err + env.vm.sp-- + return 1 + } + if negate { + ok = !ok + } + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalBool(ok) + env.vm.sp -= offset + return 1 + }, "FN REGEXP_LIKE_SLOW VARCHAR(SP-2), VARCHAR(SP-1)") +} + +func (asm *assembler) Fn_REGEXP_INSTR(m *icuregex.Matcher, c charset.Charset, offset int) { + asm.adjustStack(-offset) + asm.emit(func(env *ExpressionEnv) int { + input := env.vm.stack[env.vm.sp-offset-1].(*evalBytes) + runes := charset.Expand(nil, input.bytes, c) + + pos := int64(1) + if offset > 1 { + pos, env.vm.err = position(env.vm.stack[env.vm.sp-offset+1].(*evalInt64), int64(len(runes))) + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + } + + occ := int64(1) + if offset > 2 { + occ = occurrence(env.vm.stack[env.vm.sp-offset+2].(*evalInt64), occ) + } + + returnOpt := int64(0) + if offset > 3 { + returnOpt, env.vm.err = returnOption(env.vm.stack[env.vm.sp-offset+3].(*evalInt64), "regexp_instr") + if env.vm.err != nil { + env.vm.sp -= 
offset + return 1 + } + } + + m.Reset(runes[pos-1:]) + + found := false + for i := int64(0); i < occ; i++ { + found, env.vm.err = m.Find() + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + if !found { + break + } + } + if !found { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalInt64(0) + } else if returnOpt == 0 { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalInt64(int64(m.Start()) + pos) + } else { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalInt64(int64(m.End()) + pos) + } + env.vm.sp -= offset + return 1 + }, "FN REGEXP_INSTR VARCHAR(SP-2), VARCHAR(SP-1)") +} + +func (asm *assembler) Fn_REGEXP_INSTR_slow(c collations.Charset, flags icuregex.RegexpFlag, offset int) { + asm.adjustStack(-offset) + asm.emit(func(env *ExpressionEnv) int { + input := env.vm.stack[env.vm.sp-offset-1].(*evalBytes) + pattern := env.vm.stack[env.vm.sp-offset].(*evalBytes) + runes := charset.Expand(nil, input.bytes, c) + + pos := int64(1) + if offset > 1 { + pos, env.vm.err = position(env.vm.stack[env.vm.sp-offset+1].(*evalInt64), int64(len(runes))) + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + } + + occ := int64(1) + if offset > 2 { + occ = occurrence(env.vm.stack[env.vm.sp-offset+2].(*evalInt64), occ) + } + + returnOpt := int64(0) + if offset > 3 { + returnOpt, env.vm.err = returnOption(env.vm.stack[env.vm.sp-offset+3].(*evalInt64), "regexp_instr") + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + } + + if offset > 4 { + fe := env.vm.stack[env.vm.sp-offset+4] + flags, env.vm.err = regexpFlags(fe, flags, "regexp_instr") + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + } + + p, err := compileRegex(pattern, c, flags) + if err != nil { + env.vm.err = err + env.vm.sp -= offset + return 1 + } + + m := icuregex.NewMatcher(p) + m.Reset(runes[pos-1:]) + + found := false + for i := int64(0); i < occ; i++ { + found, env.vm.err = m.Find() + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + if 
!found { + break + } + } + if !found { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalInt64(0) + } else if returnOpt == 0 { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalInt64(int64(m.Start()) + pos) + } else { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalInt64(int64(m.End()) + pos) + } + env.vm.sp -= offset + return 1 + }, "FN REGEXP_INSTR_SLOW VARCHAR(SP-2), VARCHAR(SP-1)") +} + +func (asm *assembler) Fn_REGEXP_SUBSTR(m *icuregex.Matcher, merged collations.TypedCollation, offset int) { + asm.adjustStack(-offset) + asm.emit(func(env *ExpressionEnv) int { + input := env.vm.stack[env.vm.sp-offset-1].(*evalBytes) + c := merged.Collation.Get().Charset() + runes := charset.Expand(nil, input.bytes, c) + + pos := int64(1) + if offset > 1 { + pos, env.vm.err = position(env.vm.stack[env.vm.sp-offset+1].(*evalInt64), int64(len(runes))) + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + } + + occ := int64(1) + if offset > 2 { + occ = occurrence(env.vm.stack[env.vm.sp-offset+2].(*evalInt64), occ) + } + + m.Reset(runes[pos-1:]) + + found := false + for i := int64(0); i < occ; i++ { + found, env.vm.err = m.Find() + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + if !found { + break + } + } + + if !found { + env.vm.stack[env.vm.sp-offset-1] = nil + } else { + out := runes[int64(m.Start())+pos-1 : int64(m.End())+pos-1] + b := charset.Collapse(nil, out, c) + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalText(b, resultCollation(merged)) + } + env.vm.sp -= offset + return 1 + }, "FN REGEXP_SUBSTR VARCHAR(SP-2), VARCHAR(SP-1)") +} + +func (asm *assembler) Fn_REGEXP_SUBSTR_slow(merged collations.TypedCollation, flags icuregex.RegexpFlag, offset int) { + asm.adjustStack(-offset) + asm.emit(func(env *ExpressionEnv) int { + input := env.vm.stack[env.vm.sp-offset-1].(*evalBytes) + pattern := env.vm.stack[env.vm.sp-offset].(*evalBytes) + c := merged.Collation.Get().Charset() + runes := charset.Expand(nil, input.bytes, c) 
+ + pos := int64(1) + if offset > 1 { + pos, env.vm.err = position(env.vm.stack[env.vm.sp-offset+1].(*evalInt64), int64(len(runes))) + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + } + + occ := int64(1) + if offset > 2 { + occ = occurrence(env.vm.stack[env.vm.sp-offset+2].(*evalInt64), occ) + } + + if offset > 3 { + fe := env.vm.stack[env.vm.sp-offset+3] + flags, env.vm.err = regexpFlags(fe, flags, "regexp_substr") + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + } + + p, err := compileRegex(pattern, c, flags) + if err != nil { + env.vm.err = err + env.vm.sp -= offset + return 1 + } + + m := icuregex.NewMatcher(p) + m.Reset(runes[pos-1:]) + + found := false + for i := int64(0); i < occ; i++ { + found, env.vm.err = m.Find() + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + if !found { + break + } + } + + if !found { + env.vm.stack[env.vm.sp-offset-1] = nil + } else { + out := runes[int64(m.Start())+pos-1 : int64(m.End())+pos-1] + b := charset.Collapse(nil, out, c) + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalText(b, resultCollation(merged)) + } + env.vm.sp -= offset + return 1 + }, "FN REGEXP_SUBSTR_SLOW VARCHAR(SP-2), VARCHAR(SP-1)") +} + +func (asm *assembler) Fn_REGEXP_REPLACE(m *icuregex.Matcher, merged collations.TypedCollation, offset int) { + asm.adjustStack(-offset) + asm.emit(func(env *ExpressionEnv) int { + input := env.vm.stack[env.vm.sp-offset-1].(*evalBytes) + repl := env.vm.stack[env.vm.sp-offset+1].(*evalBytes) + + c := merged.Collation.Get().Charset() + inputRunes := charset.Expand(nil, input.bytes, c) + replRunes := charset.Expand(nil, repl.bytes, c) + + pos := int64(1) + if offset > 2 { + pos, env.vm.err = position(env.vm.stack[env.vm.sp-offset+2].(*evalInt64), int64(len(inputRunes))) + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + } + + occ := int64(0) + if offset > 3 { + occ = occurrence(env.vm.stack[env.vm.sp-offset+3].(*evalInt64), occ) + } + + m.Reset(inputRunes[pos-1:]) 
+ + b, replaced, err := regexpReplace(m, inputRunes, replRunes, pos, occ, merged.Collation.Get().Charset()) + if err != nil { + env.vm.err = err + env.vm.sp -= offset + return 1 + } + if !replaced { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalRaw(input.bytes, sqltypes.Text, resultCollation(merged)) + } else { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalRaw(b, sqltypes.Text, resultCollation(merged)) + } + env.vm.sp -= offset + return 1 + }, "FN REGEXP_REPLACE VARCHAR(SP-2), VARCHAR(SP-1)") +} + +func (asm *assembler) Fn_REGEXP_REPLACE_slow(merged collations.TypedCollation, flags icuregex.RegexpFlag, offset int) { + asm.adjustStack(-offset) + asm.emit(func(env *ExpressionEnv) int { + input := env.vm.stack[env.vm.sp-offset-1].(*evalBytes) + pattern := env.vm.stack[env.vm.sp-offset].(*evalBytes) + repl := env.vm.stack[env.vm.sp-offset+1].(*evalBytes) + + c := merged.Collation.Get().Charset() + inputRunes := charset.Expand(nil, input.bytes, c) + replRunes := charset.Expand(nil, repl.bytes, c) + + pos := int64(1) + if offset > 2 { + pos, env.vm.err = position(env.vm.stack[env.vm.sp-offset+2].(*evalInt64), int64(len(inputRunes))) + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + } + + occ := int64(0) + if offset > 3 { + occ = occurrence(env.vm.stack[env.vm.sp-offset+3].(*evalInt64), 0) + } + + if offset > 4 { + fe := env.vm.stack[env.vm.sp-offset+4] + flags, env.vm.err = regexpFlags(fe, flags, "regexp_replace") + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + } + + p, err := compileRegex(pattern, c, flags) + if err != nil { + env.vm.err = err + env.vm.sp -= offset + return 1 + } + + m := icuregex.NewMatcher(p) + m.Reset(inputRunes[pos-1:]) + + b, replaced, err := regexpReplace(m, inputRunes, replRunes, pos, occ, merged.Collation.Get().Charset()) + if err != nil { + env.vm.err = err + env.vm.sp -= offset + return 1 + } + if !replaced { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalRaw(input.bytes, 
sqltypes.Text, resultCollation(merged)) + } else { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalRaw(b, sqltypes.Text, resultCollation(merged)) + } + env.vm.sp -= offset + return 1 + }, "FN REGEXP_REPLACE_SLOW VARCHAR(SP-2), VARCHAR(SP-1)") +} diff --git a/go/vt/vtgate/evalengine/compiler_test.go b/go/vt/vtgate/evalengine/compiler_test.go index 92ef9d3d465..969549483bb 100644 --- a/go/vt/vtgate/evalengine/compiler_test.go +++ b/go/vt/vtgate/evalengine/compiler_test.go @@ -444,6 +444,10 @@ func TestCompilerSingle(t *testing.T) { expression: `INTERVAL(0, 0, 0, -1, NULL, NULL, 1)`, result: `INT64(5)`, }, + { + expression: `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 1, 0)`, + result: `TEXT("X X X")`, + }, } for _, tc := range testCases { diff --git a/go/vt/vtgate/evalengine/expr_collate.go b/go/vt/vtgate/evalengine/expr_collate.go index 4962d9dd2ba..2ba2e3dba61 100644 --- a/go/vt/vtgate/evalengine/expr_collate.go +++ b/go/vt/vtgate/evalengine/expr_collate.go @@ -54,6 +54,12 @@ var collationUtf8mb3 = collations.TypedCollation{ Repertoire: collations.RepertoireUnicode, } +var collationRegexpFallback = collations.TypedCollation{ + Collation: collations.CollationLatin1Swedish, + Coercibility: collations.CoerceCoercible, + Repertoire: collations.RepertoireASCII, +} + type ( CollateExpr struct { UnaryExpr diff --git a/go/vt/vtgate/evalengine/fn_regexp.go b/go/vt/vtgate/evalengine/fn_regexp.go index 567ee61e668..b83ef0b582f 100644 --- a/go/vt/vtgate/evalengine/fn_regexp.go +++ b/go/vt/vtgate/evalengine/fn_regexp.go @@ -4,7 +4,6 @@ import ( "errors" "strings" - "vitess.io/vitess/go/hack" "vitess.io/vitess/go/mysql/collations" "vitess.io/vitess/go/mysql/collations/charset" "vitess.io/vitess/go/mysql/icuregex" @@ -15,12 +14,7 @@ import ( "vitess.io/vitess/go/vt/vterrors" ) -func evalRegexpFlags(env *ExpressionEnv, match Expr, flags icuregex.RegexpFlag) (icuregex.RegexpFlag, error) { - m, err := match.eval(env) - if err != nil || m == nil { - return flags, err - } - 
+func regexpFlags(m eval, flags icuregex.RegexpFlag, f string) (icuregex.RegexpFlag, error) { switch m := m.(type) { case *evalBytes: for _, b := range m.bytes { @@ -36,57 +30,111 @@ func evalRegexpFlags(env *ExpressionEnv, match Expr, flags icuregex.RegexpFlag) case 'u': flags |= icuregex.UnixLines default: - return flags, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.WrongArguments, "Incorrect arguments to regexp_instr.") + return flags, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.WrongArguments, "Incorrect arguments to %s.", f) } } default: - return flags, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.WrongArguments, "Incorrect arguments to regexp_instr.") + return flags, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.WrongArguments, "Incorrect arguments to %s.", f) } return flags, nil } -func evalOccurrence(env *ExpressionEnv, expr Expr) (int64, error) { - occExpr, err := expr.eval(env) - if err != nil { - return 0, err +func occurrence(e *evalInt64, min int64) int64 { + if e.i < min { + return min } - return evalToInt64(occExpr).i, nil + return e.i } -func evalReturnOption(env *ExpressionEnv, expr Expr) (int64, error) { - retExpr, err := expr.eval(env) - if err != nil { - return 0, err - } - returnOption := evalToInt64(retExpr).i - switch returnOption { +func returnOption(val *evalInt64, f string) (int64, error) { + switch val.i { case 0, 1: // Valid return options. 
- return returnOption, nil + return val.i, nil } - return 0, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.WrongArguments, "Incorrect arguments to regexp_instr: return_option must be 1 or 0.") + return 0, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.WrongArguments, "Incorrect arguments to %s: return_option must be 1 or 0.", f) } -func evalPosition(env *ExpressionEnv, expr Expr, limit int64) (int64, error) { - posExpr, err := expr.eval(env) - if err != nil { - return 0, err - } - pos := evalToInt64(posExpr).i +func position(val *evalInt64, limit int64) (int64, error) { + pos := val.i if pos < 1 || pos > limit { return 0, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpIndexOutOfBounds, "Index out of bounds in regular expression search.") } return pos, nil } -func compileRegex(pat eval, c collations.Charset, flags icuregex.RegexpFlag) (*icuregex.Pattern, error) { - patUtf8, err := charset.Convert(nil, &charset.Charset_utf8mb4{}, pat.ToRawBytes(), c) +func evalRegexpCollation(input, pat eval, f string) (eval, eval, collations.TypedCollation, icuregex.RegexpFlag, error) { + var typedCol collations.TypedCollation + var err error + + if inputBytes, ok := input.(*evalBytes); ok { + if patBytes, ok := pat.(*evalBytes); ok { + inputCol := inputBytes.col.Collation + patCol := patBytes.col.Collation + if (inputCol == collations.CollationBinaryID && patCol != collations.CollationBinaryID) || + (inputCol != collations.CollationBinaryID && patCol == collations.CollationBinaryID) { + inputColName := inputCol.Get().Name() + patColName := patCol.Get().Name() + return nil, nil, typedCol, 0, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.CharacterSetMismatch, "Character set '%s' cannot be used in conjunction with '%s' in call to %s.", inputColName, patColName, f) + } + } + } + + input, pat, typedCol, err = mergeAndCoerceCollations(input, pat) if err != nil { - return nil, err + return nil, nil, collations.TypedCollation{}, 0, 
err + } + + var flags icuregex.RegexpFlag + var collation = typedCol.Collation.Get() + if strings.Contains(collation.Name(), "_ci") { + flags |= icuregex.CaseInsensitive } - regexp, err := icuregex.CompileString(hack.String(patUtf8), flags) + return input, pat, typedCol, flags, nil +} + +func compileRegexpCollation(input, pat ctype, f string) (collations.TypedCollation, icuregex.RegexpFlag, error) { + var merged collations.TypedCollation + var err error + + if input.isTextual() && pat.isTextual() { + inputCol := input.Col.Collation + patCol := pat.Col.Collation + if (inputCol == collations.CollationBinaryID && patCol != collations.CollationBinaryID) || + (inputCol != collations.CollationBinaryID && patCol == collations.CollationBinaryID) { + inputColName := inputCol.Get().Name() + patColName := patCol.Get().Name() + return input.Col, 0, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.CharacterSetMismatch, "Character set '%s' cannot be used in conjunction with '%s' in call to %s.", inputColName, patColName, f) + } + } + + if input.Col.Collation != pat.Col.Collation { + merged, _, _, err = mergeCollations(input.Col, pat.Col, input.Type, pat.Type) + } else { + merged = input.Col + } + if err != nil { + return input.Col, 0, err + } + + var flags icuregex.RegexpFlag + var collation = merged.Collation.Get() + if strings.Contains(collation.Name(), "_ci") { + flags |= icuregex.CaseInsensitive + } + return merged, flags, nil +} + +func compileRegex(pat eval, c collations.Charset, flags icuregex.RegexpFlag) (*icuregex.Pattern, error) { + patRunes := charset.Expand(nil, pat.ToRawBytes(), c) + + if len(patRunes) == 0 { + return nil, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpIllegalArgument, " Illegal argument to a regular expression.") + } + + regexp, err := icuregex.Compile(patRunes, flags) if err == nil { return regexp, nil } @@ -136,6 +184,17 @@ func compileRegex(pat eval, c collations.Charset, flags icuregex.RegexpFlag) (*i return nil, err 
} +// resultCollation returns the collation to use for the result of a regexp. +// This falls back to latin1_swedish if the input collation is binary. This +// seems to be a side effect of how MySQL also works. Probably due to how it +// is using ICU and converting there. +func resultCollation(in collations.TypedCollation) collations.TypedCollation { + if in.Collation == collationBinary.Collation { + return collationRegexpFallback + } + return in +} + type builtinRegexpLike struct { CallExpr Negate bool @@ -152,20 +211,18 @@ func (r *builtinRegexpLike) eval(env *ExpressionEnv) (eval, error) { return nil, err } - var typedCol collations.TypedCollation - input, pat, typedCol, err = mergeAndCoerceCollations(input, pat) + input, pat, typedCol, flags, err := evalRegexpCollation(input, pat, "regexp_like") if err != nil { return nil, err } - - var flags icuregex.RegexpFlag - var collation = typedCol.Collation.Get() - if strings.Contains(collation.Name(), "_ci") { - flags |= icuregex.CaseInsensitive - } + collation := typedCol.Collation.Get() if len(r.Arguments) > 2 { - flags, err = evalRegexpFlags(env, r.Arguments[2], flags) + m, err := r.Arguments[2].eval(env) + if err != nil || m == nil { + return nil, err + } + flags, err = regexpFlags(m, flags, "regexp_like") if err != nil { return nil, err } @@ -180,7 +237,7 @@ func (r *builtinRegexpLike) eval(env *ExpressionEnv) (eval, error) { m := icuregex.NewMatcher(regexp) m.Reset(inputRunes) - ok, err := m.Matches() + ok, err := m.Find() if err != nil { return nil, err } @@ -193,11 +250,95 @@ func (r *builtinRegexpLike) eval(env *ExpressionEnv) (eval, error) { func (r *builtinRegexpLike) typeof(env *ExpressionEnv, fields []*querypb.Field) (sqltypes.Type, typeFlag) { _, f1 := r.Arguments[0].typeof(env, fields) _, f2 := r.Arguments[1].typeof(env, fields) - return sqltypes.Int64, f1 | f2 | flagIsBoolean + var f3 typeFlag + if len(r.Arguments) > 2 { + _, f3 = r.Arguments[2].typeof(env, fields) + } + return sqltypes.Int64, f1 | f2 | 
// compile emits the bytecode for REGEXP_LIKE.
//
// The arguments are compiled in order; the jump returned by
// compileNullCheckArg for each of them is collected in skips and resolved
// after the final instruction. When the pattern is a string literal whose
// charset is covered by the merged collation, the optional match_type is a
// valid string literal too, and the pattern compiles, a precompiled matcher is
// embedded in Fn_REGEXP_LIKE. In every other case compileSlow emits the
// instruction that compiles the pattern at run time.
func (r *builtinRegexpLike) compile(c *compiler) (ctype, error) {
	input, err := r.Arguments[0].compile(c)
	if err != nil {
		return ctype{}, err
	}
	var skips []*jump
	skips = append(skips, c.compileNullCheckArg(input, 0))

	pat, err := r.Arguments[1].compile(c)
	if err != nil {
		return ctype{}, err
	}
	skips = append(skips, c.compileNullCheckArg(pat, 1))

	var f ctype

	if len(r.Arguments) > 2 {
		f, err = r.Arguments[2].compile(c)
		if err != nil {
			return ctype{}, err
		}
		skips = append(skips, c.compileNullCheckArg(f, 2))
	}

	merged, flags, err := compileRegexpCollation(input, pat, "regexp_like")
	if err != nil {
		return ctype{}, err
	}

	// The input sits len(r.Arguments) slots below the top of the stack; bring
	// it to the merged collation when it is not already text in it.
	if !input.isTextual() || input.Col.Collation != merged.Collation {
		c.asm.Convert_xce(len(r.Arguments), sqltypes.VarChar, merged.Collation)
	}

	// We optimize for the case where the pattern is a constant. If not,
	// we fall back to the slow path.
	pattern, ok := r.Arguments[1].(*Literal)
	if !ok {
		return r.compileSlow(c, input, pat, f, merged, flags, skips...)
	}
	inner, ok := pattern.inner.(*evalBytes)
	if !ok {
		return r.compileSlow(c, input, pat, f, merged, flags, skips...)
	}
	if !merged.Collation.Get().Charset().IsSuperset(inner.col.Collation.Get().Charset()) {
		return r.compileSlow(c, input, pat, f, merged, flags, skips...)
	}

	if len(r.Arguments) > 2 {
		// The match_type must be a constant string as well for the regexp to
		// be compiled ahead of time.
		fl, ok := r.Arguments[2].(*Literal)
		if !ok {
			return r.compileSlow(c, input, pat, f, merged, flags, skips...)
		}
		fe, ok := fl.inner.(*evalBytes)
		if !ok {
			return r.compileSlow(c, input, pat, f, merged, flags, skips...)
		}

		// An invalid match_type is not reported here: the slow path is emitted
		// instead, which parses the match_type again when it runs.
		flags, err = regexpFlags(fe, flags, "regexp_like")
		if err != nil {
			return r.compileSlow(c, input, pat, f, merged, flags, skips...)
		}
	}

	// Likewise, a pattern that fails to compile is left to the slow path.
	p, err := compileRegex(inner, merged.Collation.Get().Charset(), flags)
	if err != nil {
		return r.compileSlow(c, input, pat, f, merged, flags, skips...)
	}

	c.asm.Fn_REGEXP_LIKE(icuregex.NewMatcher(p), r.Negate, merged.Collation.Get().Charset(), len(r.Arguments)-1)
	c.asm.jumpDestination(skips...)

	return ctype{Type: sqltypes.Int64, Col: collationNumeric, Flag: input.Flag | pat.Flag | f.Flag | flagIsBoolean}, nil
}
if err != nil { + occExpr, err := r.Arguments[3].eval(env) + if err != nil || occExpr == nil { return nil, err } + occ = occurrence(evalToInt64(occExpr), occ) } if len(r.Arguments) > 4 { - returnOption, err = evalReturnOption(env, r.Arguments[4]) + retExpr, err := r.Arguments[4].eval(env) + if err != nil || retExpr == nil { + return nil, err + } + returnOpt, err = returnOption(evalToInt64(retExpr), "regexp_instr") if err != nil { return nil, err } } if len(r.Arguments) > 5 { - flags, err = evalRegexpFlags(env, r.Arguments[5], flags) + m, err := r.Arguments[5].eval(env) + if err != nil || m == nil { + return nil, err + } + flags, err = regexpFlags(m, flags, "regexp_instr") if err != nil { return nil, err } @@ -271,7 +420,7 @@ func (r *builtinRegexpInstr) eval(env *ExpressionEnv) (eval, error) { m.Reset(inputRunes[pos-1:]) found := false - for i := int64(0); i < occurrence; i++ { + for i := int64(0); i < occ; i++ { found, err = m.Find() if err != nil { return nil, err @@ -283,7 +432,7 @@ func (r *builtinRegexpInstr) eval(env *ExpressionEnv) (eval, error) { if !found { return newEvalInt64(0), nil } - if returnOption == 0 { + if returnOpt == 0 { return newEvalInt64(int64(m.Start()) + pos), nil } return newEvalInt64(int64(m.End()) + pos), nil @@ -292,11 +441,138 @@ func (r *builtinRegexpInstr) eval(env *ExpressionEnv) (eval, error) { func (r *builtinRegexpInstr) typeof(env *ExpressionEnv, fields []*querypb.Field) (sqltypes.Type, typeFlag) { _, f1 := r.Arguments[0].typeof(env, fields) _, f2 := r.Arguments[1].typeof(env, fields) - return sqltypes.Int64, f1 | f2 + var f3, f4, f5, f6 typeFlag + if len(r.Arguments) > 2 { + _, f3 = r.Arguments[2].typeof(env, fields) + } + if len(r.Arguments) > 3 { + _, f4 = r.Arguments[3].typeof(env, fields) + } + if len(r.Arguments) > 4 { + _, f5 = r.Arguments[4].typeof(env, fields) + } + if len(r.Arguments) > 5 { + _, f6 = r.Arguments[5].typeof(env, fields) + } + return sqltypes.Int64, f1 | f2 | f3 | f4 | f5 | f6 +} + +func (r 
*builtinRegexpInstr) compileSlow(c *compiler, input, pat, pos, occ, returnOption, matchType ctype, merged collations.TypedCollation, flags icuregex.RegexpFlag, skips ...*jump) (ctype, error) { + if !pat.isTextual() || pat.Col.Collation != merged.Collation { + c.asm.Convert_xce(len(r.Arguments)-1, sqltypes.VarChar, merged.Collation) + } + + c.asm.Fn_REGEXP_INSTR_slow(merged.Collation.Get().Charset(), flags, len(r.Arguments)-1) + c.asm.jumpDestination(skips...) + return ctype{Type: sqltypes.Int64, Col: collationNumeric, Flag: input.Flag | pat.Flag | pos.Flag | occ.Flag | returnOption.Flag | matchType.Flag}, nil } func (r *builtinRegexpInstr) compile(c *compiler) (ctype, error) { - return ctype{}, c.unsupported(r) + input, err := r.Arguments[0].compile(c) + if err != nil { + return ctype{}, err + } + var skips []*jump + skips = append(skips, c.compileNullCheckArg(input, 0)) + + pat, err := r.Arguments[1].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(input, 1)) + + var pos ctype + if len(r.Arguments) > 2 { + pos, err = r.Arguments[2].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(input, 2)) + _ = c.compileToInt64(pos, 1) + } + + var occ ctype + if len(r.Arguments) > 3 { + occ, err = r.Arguments[3].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(input, 3)) + _ = c.compileToInt64(pos, 1) + } + + var returnOpt ctype + if len(r.Arguments) > 4 { + returnOpt, err = r.Arguments[4].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(input, 4)) + _ = c.compileToInt64(pos, 1) + } + + var matchType ctype + if len(r.Arguments) > 5 { + matchType, err = r.Arguments[5].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(input, 5)) + switch { + case matchType.isTextual(): + default: + c.asm.Convert_xb(1, sqltypes.VarBinary, 0, 
false) + } + } + + merged, flags, err := compileRegexpCollation(input, pat, "regexp_instr") + if err != nil { + return ctype{}, err + } + + if !input.isTextual() || input.Col.Collation != merged.Collation { + c.asm.Convert_xce(len(r.Arguments), sqltypes.VarChar, merged.Collation) + } + + // We optimize for the case where the pattern is a constant. If not, + // we fall back to the slow path. + pattern, ok := r.Arguments[1].(*Literal) + if !ok { + return r.compileSlow(c, input, pat, pos, occ, returnOpt, matchType, merged, flags, skips...) + } + inner, ok := pattern.inner.(*evalBytes) + if !ok { + return r.compileSlow(c, input, pat, pos, occ, returnOpt, matchType, merged, flags, skips...) + } + if !merged.Collation.Get().Charset().IsSuperset(inner.col.Collation.Get().Charset()) { + return r.compileSlow(c, input, pat, pos, occ, returnOpt, matchType, merged, flags, skips...) + } + + if len(r.Arguments) > 5 { + fl, ok := r.Arguments[5].(*Literal) + if !ok { + return r.compileSlow(c, input, pat, pos, occ, returnOpt, matchType, merged, flags, skips...) + } + fe, ok := fl.inner.(*evalBytes) + if !ok { + return r.compileSlow(c, input, pat, pos, occ, returnOpt, matchType, merged, flags, skips...) + } + + flags, err = regexpFlags(fe, flags, "regexp_instr") + if err != nil { + return r.compileSlow(c, input, pat, pos, occ, returnOpt, matchType, merged, flags, skips...) + } + } + + p, err := compileRegex(inner, merged.Collation.Get().Charset(), flags) + if err != nil { + return r.compileSlow(c, input, pat, pos, occ, returnOpt, matchType, merged, flags, skips...) + } + + c.asm.Fn_REGEXP_INSTR(icuregex.NewMatcher(p), merged.Collation.Get().Charset(), len(r.Arguments)-1) + c.asm.jumpDestination(skips...) 
+ + return ctype{Type: sqltypes.Int64, Col: collationNumeric, Flag: input.Flag | pat.Flag | flagIsBoolean}, nil } var _ Expr = (*builtinRegexpInstr)(nil) @@ -316,38 +592,42 @@ func (r *builtinRegexpSubstr) eval(env *ExpressionEnv) (eval, error) { return nil, err } - var typedCol collations.TypedCollation - input, pat, typedCol, err = mergeAndCoerceCollations(input, pat) + input, pat, typedCol, flags, err := evalRegexpCollation(input, pat, "regexp_substr") if err != nil { return nil, err } - var flags icuregex.RegexpFlag - var collation = typedCol.Collation.Get() - if strings.Contains(collation.Name(), "_ci") { - flags |= icuregex.CaseInsensitive - } + collation := typedCol.Collation.Get() pos := int64(1) - occurrence := int64(1) + occ := int64(1) inputRunes := charset.Expand(nil, input.ToRawBytes(), collation.Charset()) if len(r.Arguments) > 2 { - pos, err = evalPosition(env, r.Arguments[2], int64(len(inputRunes))) + posExpr, err := r.Arguments[2].eval(env) + if err != nil || posExpr == nil { + return nil, err + } + pos, err = position(evalToInt64(posExpr), int64(len(inputRunes))) if err != nil { return nil, err } } if len(r.Arguments) > 3 { - occurrence, err = evalOccurrence(env, r.Arguments[3]) - if err != nil { + occExpr, err := r.Arguments[3].eval(env) + if err != nil || occExpr == nil { return nil, err } + occ = occurrence(evalToInt64(occExpr), occ) } if len(r.Arguments) > 4 { - flags, err = evalRegexpFlags(env, r.Arguments[4], flags) + m, err := r.Arguments[4].eval(env) + if err != nil || m == nil { + return nil, err + } + flags, err = regexpFlags(m, flags, "regexp_substr") if err != nil { return nil, err } @@ -362,7 +642,7 @@ func (r *builtinRegexpSubstr) eval(env *ExpressionEnv) (eval, error) { m.Reset(inputRunes[pos-1:]) found := false - for i := int64(0); i < occurrence; i++ { + for i := int64(0); i < occ; i++ { found, err = m.Find() if err != nil { return nil, err @@ -375,18 +655,132 @@ func (r *builtinRegexpSubstr) eval(env *ExpressionEnv) (eval, error) 
{ return nil, nil } out := inputRunes[int64(m.Start())+pos-1 : int64(m.End())+pos-1] - bytes := charset.Collapse(nil, out, collation.Charset()) - return newEvalText(bytes, typedCol), nil + b := charset.Collapse(nil, out, collation.Charset()) + return newEvalText(b, resultCollation(typedCol)), nil } func (r *builtinRegexpSubstr) typeof(env *ExpressionEnv, fields []*querypb.Field) (sqltypes.Type, typeFlag) { _, f1 := r.Arguments[0].typeof(env, fields) _, f2 := r.Arguments[1].typeof(env, fields) - return sqltypes.VarChar, f1 | f2 + var f3, f4, f5 typeFlag + if len(r.Arguments) > 2 { + _, f3 = r.Arguments[2].typeof(env, fields) + } + if len(r.Arguments) > 3 { + _, f4 = r.Arguments[3].typeof(env, fields) + } + if len(r.Arguments) > 4 { + _, f5 = r.Arguments[4].typeof(env, fields) + } + return sqltypes.VarChar, f1 | f2 | f3 | f4 | f5 +} + +func (r *builtinRegexpSubstr) compileSlow(c *compiler, input, pat, pos, occ, matchType ctype, merged collations.TypedCollation, flags icuregex.RegexpFlag, skips ...*jump) (ctype, error) { + if !pat.isTextual() || pat.Col.Collation != merged.Collation { + c.asm.Convert_xce(len(r.Arguments)-1, sqltypes.VarChar, merged.Collation) + } + + c.asm.Fn_REGEXP_SUBSTR_slow(merged, flags, len(r.Arguments)-1) + c.asm.jumpDestination(skips...) 
+ return ctype{Type: sqltypes.Int64, Col: collationNumeric, Flag: input.Flag | pat.Flag | pos.Flag | occ.Flag | matchType.Flag}, nil } func (r *builtinRegexpSubstr) compile(c *compiler) (ctype, error) { - return ctype{}, c.unsupported(r) + input, err := r.Arguments[0].compile(c) + if err != nil { + return ctype{}, err + } + var skips []*jump + skips = append(skips, c.compileNullCheckArg(input, 0)) + + pat, err := r.Arguments[1].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(input, 1)) + + var pos ctype + if len(r.Arguments) > 2 { + pos, err = r.Arguments[2].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(input, 2)) + _ = c.compileToInt64(pos, 1) + } + + var occ ctype + if len(r.Arguments) > 3 { + occ, err = r.Arguments[3].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(input, 3)) + _ = c.compileToInt64(pos, 1) + } + + var matchType ctype + if len(r.Arguments) > 4 { + matchType, err = r.Arguments[4].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(input, 4)) + switch { + case matchType.isTextual(): + default: + c.asm.Convert_xb(1, sqltypes.VarBinary, 0, false) + } + } + + merged, flags, err := compileRegexpCollation(input, pat, "regexp_substr") + if err != nil { + return ctype{}, err + } + + if !input.isTextual() || input.Col.Collation != merged.Collation { + c.asm.Convert_xce(len(r.Arguments), sqltypes.VarChar, merged.Collation) + } + + // We optimize for the case where the pattern is a constant. If not, + // we fall back to the slow path. + pattern, ok := r.Arguments[1].(*Literal) + if !ok { + return r.compileSlow(c, input, pat, pos, occ, matchType, merged, flags, skips...) + } + inner, ok := pattern.inner.(*evalBytes) + if !ok { + return r.compileSlow(c, input, pat, pos, occ, matchType, merged, flags, skips...) 
+ } + if !merged.Collation.Get().Charset().IsSuperset(inner.col.Collation.Get().Charset()) { + return r.compileSlow(c, input, pat, pos, occ, matchType, merged, flags, skips...) + } + + if len(r.Arguments) > 4 { + fl, ok := r.Arguments[4].(*Literal) + if !ok { + return r.compileSlow(c, input, pat, pos, occ, matchType, merged, flags, skips...) + } + fe, ok := fl.inner.(*evalBytes) + if !ok { + return r.compileSlow(c, input, pat, pos, occ, matchType, merged, flags, skips...) + } + + flags, err = regexpFlags(fe, flags, "regexp_substr") + if err != nil { + return r.compileSlow(c, input, pat, pos, occ, matchType, merged, flags, skips...) + } + } + + p, err := compileRegex(inner, merged.Collation.Get().Charset(), flags) + if err != nil { + return r.compileSlow(c, input, pat, pos, occ, matchType, merged, flags, skips...) + } + + c.asm.Fn_REGEXP_SUBSTR(icuregex.NewMatcher(p), merged, len(r.Arguments)-1) + c.asm.jumpDestination(skips...) + + return ctype{Type: sqltypes.Int64, Col: collationNumeric, Flag: input.Flag | pat.Flag | pos.Flag | occ.Flag | matchType.Flag}, nil } var _ Expr = (*builtinRegexpSubstr)(nil) @@ -395,6 +789,58 @@ type builtinRegexpReplace struct { CallExpr } +func regexpReplace(m *icuregex.Matcher, inputRunes, replRunes []rune, pos, occ int64, c collations.Charset) ([]byte, bool, error) { + var err error + found := false + if occ > 0 { + for i := int64(0); i < occ; i++ { + found, err = m.Find() + if err != nil { + return nil, false, err + } + if !found { + break + } + } + if !found { + return nil, false, nil + } + + out := append(inputRunes[:int64(m.Start())+pos-1], replRunes...) + out = append(out, inputRunes[int64(m.End())+pos-1:]...) + return charset.Collapse(nil, out, c), true, nil + } + + found, err = m.Find() + if err != nil { + return nil, false, err + } + + if !found { + return nil, false, nil + } + + start := int64(m.Start()) + pos - 1 + out := append(inputRunes[:start], replRunes...) 
+ end := int64(m.End()) + pos - 1 + for { + found, err = m.Find() + if err != nil { + return nil, false, err + } + if !found { + break + } + nextStart := int64(m.Start()) + pos - 1 + out = append(out, inputRunes[end:nextStart]...) + out = append(out, replRunes...) + end = int64(m.End()) + pos - 1 + } + + out = append(out, inputRunes[end:]...) + return charset.Collapse(nil, out, c), true, nil +} + func (r *builtinRegexpReplace) eval(env *ExpressionEnv) (eval, error) { input, err := r.Arguments[0].eval(env) if err != nil || input == nil { @@ -411,12 +857,13 @@ func (r *builtinRegexpReplace) eval(env *ExpressionEnv) (eval, error) { return nil, err } - var typedCol collations.TypedCollation - input, pat, typedCol, err = mergeAndCoerceCollations(input, pat) + input, pat, typedCol, flags, err := evalRegexpCollation(input, pat, "regexp_replace") if err != nil { return nil, err } + collation := typedCol.Collation.Get() + repl, ok := replArg.(*evalBytes) if !ok { repl, err = evalToVarchar(replArg, typedCol.Collation, true) @@ -424,35 +871,37 @@ func (r *builtinRegexpReplace) eval(env *ExpressionEnv) (eval, error) { return nil, err } } - replRunes := charset.Expand(nil, repl.ToRawBytes(), repl.col.Collation.Get().Charset()) - var flags icuregex.RegexpFlag - var collation = typedCol.Collation.Get() - if strings.Contains(collation.Name(), "_ci") { - flags |= icuregex.CaseInsensitive - } - pos := int64(1) - occurrence := int64(0) + occ := int64(0) inputRunes := charset.Expand(nil, input.ToRawBytes(), collation.Charset()) if len(r.Arguments) > 3 { - pos, err = evalPosition(env, r.Arguments[3], int64(len(inputRunes))) + posExpr, err := r.Arguments[3].eval(env) + if err != nil || posExpr == nil { + return nil, err + } + pos, err = position(evalToInt64(posExpr), int64(len(inputRunes))) if err != nil { return nil, err } } if len(r.Arguments) > 4 { - occurrence, err = evalOccurrence(env, r.Arguments[4]) - if err != nil { + occExpr, err := r.Arguments[4].eval(env) + if err != nil || 
occExpr == nil { return nil, err } + occ = occurrence(evalToInt64(occExpr), occ) } if len(r.Arguments) > 5 { - flags, err = evalRegexpFlags(env, r.Arguments[5], flags) + m, err := r.Arguments[5].eval(env) + if err != nil || m == nil { + return nil, err + } + flags, err = regexpFlags(m, flags, "regexp_replace") if err != nil { return nil, err } @@ -466,68 +915,149 @@ func (r *builtinRegexpReplace) eval(env *ExpressionEnv) (eval, error) { m := icuregex.NewMatcher(regexp) m.Reset(inputRunes[pos-1:]) - found := false - if occurrence > 0 { - for i := int64(0); i < occurrence; i++ { - found, err = m.Find() - if err != nil { - return nil, err - } - if !found { - break - } - } - if !found { - return newEvalRaw(sqltypes.Text, input.ToRawBytes(), typedCol), nil - } + bytes, replaced, err := regexpReplace(m, inputRunes, replRunes, pos, occ, collation.Charset()) + if err != nil { + return nil, err + } + if !replaced { + return newEvalRaw(sqltypes.Text, input.ToRawBytes(), resultCollation(typedCol)), nil + } + return newEvalRaw(sqltypes.Text, bytes, resultCollation(typedCol)), nil +} - out := append(inputRunes[:int64(m.Start())+pos-1], replRunes...) - out = append(out, inputRunes[int64(m.End())+pos-1:]...) 
- bytes := charset.Collapse(nil, out, collation.Charset()) - return newEvalRaw(sqltypes.Text, bytes, typedCol), nil +func (r *builtinRegexpReplace) typeof(env *ExpressionEnv, fields []*querypb.Field) (sqltypes.Type, typeFlag) { + _, f1 := r.Arguments[0].typeof(env, fields) + _, f2 := r.Arguments[1].typeof(env, fields) + _, f3 := r.Arguments[2].typeof(env, fields) + var f4, f5, f6 typeFlag + if len(r.Arguments) > 3 { + _, f4 = r.Arguments[3].typeof(env, fields) + } + if len(r.Arguments) > 4 { + _, f5 = r.Arguments[4].typeof(env, fields) + } + if len(r.Arguments) > 5 { + _, f6 = r.Arguments[5].typeof(env, fields) + } + return sqltypes.Text, f1 | f2 | f3 | f4 | f5 | f6 +} + +func (r *builtinRegexpReplace) compileSlow(c *compiler, input, pat, repl, pos, occ, matchType ctype, merged collations.TypedCollation, flags icuregex.RegexpFlag, skips ...*jump) (ctype, error) { + if !pat.isTextual() || pat.Col.Collation != merged.Collation { + c.asm.Convert_xce(len(r.Arguments)-1, sqltypes.VarChar, merged.Collation) } - found, err = m.Find() + c.asm.Fn_REGEXP_REPLACE_slow(merged, flags, len(r.Arguments)-1) + c.asm.jumpDestination(skips...) + return ctype{Type: sqltypes.Int64, Col: collationNumeric, Flag: input.Flag | pat.Flag | repl.Flag | pos.Flag | occ.Flag | matchType.Flag}, nil +} + +func (r *builtinRegexpReplace) compile(c *compiler) (ctype, error) { + input, err := r.Arguments[0].compile(c) if err != nil { - return nil, err + return ctype{}, err } + var skips []*jump + skips = append(skips, c.compileNullCheckArg(input, 0)) - if !found { - return newEvalRaw(sqltypes.Text, input.ToRawBytes(), typedCol), nil + pat, err := r.Arguments[1].compile(c) + if err != nil { + return ctype{}, err } + skips = append(skips, c.compileNullCheckArg(input, 1)) - start := int64(m.Start()) + pos - 1 - out := append(inputRunes[:start], replRunes...) 
- end := int64(m.End()) + pos - 1 - for { - found, err = m.Find() + repl, err := r.Arguments[2].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(input, 2)) + + var pos ctype + if len(r.Arguments) > 3 { + pos, err = r.Arguments[3].compile(c) if err != nil { - return nil, err + return ctype{}, err } - if !found { - break + skips = append(skips, c.compileNullCheckArg(input, 3)) + _ = c.compileToInt64(pos, 1) + } + + var occ ctype + if len(r.Arguments) > 4 { + occ, err = r.Arguments[4].compile(c) + if err != nil { + return ctype{}, err } - nextStart := int64(m.Start()) + pos - 1 - out = append(out, inputRunes[end:nextStart]...) - out = append(out, replRunes...) - end = int64(m.End()) + pos - 1 + skips = append(skips, c.compileNullCheckArg(input, 4)) + _ = c.compileToInt64(pos, 1) } - out = append(out, inputRunes[end:]...) + var matchType ctype + if len(r.Arguments) > 5 { + matchType, err = r.Arguments[5].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(input, 5)) + switch { + case matchType.isTextual(): + default: + c.asm.Convert_xb(1, sqltypes.VarBinary, 0, false) + } + } - bytes := charset.Collapse(nil, out, collation.Charset()) - return newEvalRaw(sqltypes.Text, bytes, typedCol), nil -} + merged, flags, err := compileRegexpCollation(input, pat, "regexp_replace") + if err != nil { + return ctype{}, err + } -func (r *builtinRegexpReplace) typeof(env *ExpressionEnv, fields []*querypb.Field) (sqltypes.Type, typeFlag) { - _, f1 := r.Arguments[0].typeof(env, fields) - _, f2 := r.Arguments[1].typeof(env, fields) - _, f3 := r.Arguments[2].typeof(env, fields) - return sqltypes.Text, f1 | f2 | f3 -} + if !input.isTextual() || input.Col.Collation != merged.Collation { + c.asm.Convert_xce(len(r.Arguments), sqltypes.VarChar, merged.Collation) + } -func (r *builtinRegexpReplace) compile(c *compiler) (ctype, error) { - return ctype{}, c.unsupported(r) + if !repl.isTextual() || 
repl.Col.Collation != merged.Collation { + c.asm.Convert_xce(len(r.Arguments)-2, sqltypes.VarChar, merged.Collation) + } + + // We optimize for the case where the pattern is a constant. If not, + // we fall back to the slow path. + pattern, ok := r.Arguments[1].(*Literal) + if !ok { + return r.compileSlow(c, input, pat, repl, pos, occ, matchType, merged, flags, skips...) + } + inner, ok := pattern.inner.(*evalBytes) + if !ok { + return r.compileSlow(c, input, pat, repl, pos, occ, matchType, merged, flags, skips...) + } + if !merged.Collation.Get().Charset().IsSuperset(inner.col.Collation.Get().Charset()) { + return r.compileSlow(c, input, pat, repl, pos, occ, matchType, merged, flags, skips...) + } + + if len(r.Arguments) > 5 { + fl, ok := r.Arguments[5].(*Literal) + if !ok { + return r.compileSlow(c, input, pat, repl, pos, occ, matchType, merged, flags, skips...) + } + fe, ok := fl.inner.(*evalBytes) + if !ok { + return r.compileSlow(c, input, pat, repl, pos, occ, matchType, merged, flags, skips...) + } + + flags, err = regexpFlags(fe, flags, "regexp_replace") + if err != nil { + return r.compileSlow(c, input, pat, repl, pos, occ, matchType, merged, flags, skips...) + } + } + + p, err := compileRegex(inner, merged.Collation.Get().Charset(), flags) + if err != nil { + return r.compileSlow(c, input, pat, repl, pos, occ, matchType, merged, flags, skips...) + } + + c.asm.Fn_REGEXP_REPLACE(icuregex.NewMatcher(p), merged, len(r.Arguments)-1) + c.asm.jumpDestination(skips...) 
+ + return ctype{Type: sqltypes.Int64, Col: collationNumeric, Flag: input.Flag | pat.Flag | repl.Flag | pos.Flag | occ.Flag | matchType.Flag}, nil } var _ Expr = (*builtinRegexpReplace)(nil) diff --git a/go/vt/vtgate/evalengine/integration/fuzz_test.go b/go/vt/vtgate/evalengine/integration/fuzz_test.go index 24cd2733fd4..94b3c483fce 100644 --- a/go/vt/vtgate/evalengine/integration/fuzz_test.go +++ b/go/vt/vtgate/evalengine/integration/fuzz_test.go @@ -98,6 +98,9 @@ var ( regexp.MustCompile(`Invalid JSON text in argument (\d+) to function (\w+): (.*?)`), regexp.MustCompile(`Illegal mix of collations`), regexp.MustCompile(`Incorrect (DATE|DATETIME) value`), + regexp.MustCompile(`Syntax error in regular expression`), + regexp.MustCompile(`The regular expression contains an unclosed bracket expression`), + regexp.MustCompile(`Illegal argument to a regular expression`), } ) diff --git a/go/vt/vtgate/evalengine/testcases/cases.go b/go/vt/vtgate/evalengine/testcases/cases.go index b36f6ca0985..9322031ea2f 100644 --- a/go/vt/vtgate/evalengine/testcases/cases.go +++ b/go/vt/vtgate/evalengine/testcases/cases.go @@ -1915,6 +1915,8 @@ func RegexpLike(yield Query) { `REGEXP_LIKE('CamelCase', 'CAMELCASE' COLLATE utf8mb4_0900_as_cs)`, `REGEXP_LIKE('abc', 'ABC'`, `REGEXP_LIKE('abc', 'ABC', 'c')`, + `REGEXP_LIKE(1234, 12)`, + `REGEXP_LIKE(1234, 12, 'c')`, `' ' REGEXP '[[:blank:]]'`, `'\t' REGEXP '[[:blank:]]'`, `' ' REGEXP '[[:space:]]'`, @@ -1938,11 +1940,24 @@ func RegexpLike(yield Query) { `'A' collate utf8mb4_0900_as_cs regexp '\\p{Lowercase_letter}'`, `'a' collate utf8mb4_0900_as_cs regexp '\\p{Uppercase_letter}'`, `'A' collate utf8mb4_0900_as_cs regexp '\\p{Uppercase_letter}'`, + `0xff REGEXP 0xff`, + `0xff REGEXP 0xfe`, + `cast(time '12:34:58' as json) REGEXP 0xff`, } for _, q := range mysqlDocSamples { yield(q, nil) } + + for _, i := range regexInputs { + for _, p := range regexInputs { + yield(fmt.Sprintf("%s REGEXP %s", i, p), nil) + yield(fmt.Sprintf("%s NOT REGEXP %s", 
i, p), nil) + for _, m := range regexMatchStrings { + yield(fmt.Sprintf("REGEXP_LIKE(%s, %s, %s)", i, p, m), nil) + } + } + } } func RegexpInstr(yield Query) { @@ -1954,6 +1969,7 @@ func RegexpInstr(yield Query) { `REGEXP_INSTR('CamelCase', 'CAMELCASE' COLLATE utf8mb4_0900_as_cs)`, `REGEXP_INSTR('abc', 'ABC'`, `REGEXP_INSTR('abc', 'ABC', 'c')`, + `REGEXP_INSTR('0', '0', 1, 0)`, `REGEXP_INSTR(' ', '[[:blank:]]')`, `REGEXP_INSTR('\t', '[[:blank:]]')`, `REGEXP_INSTR(' ', '[[:space:]]')`, @@ -1991,7 +2007,12 @@ func RegexpInstr(yield Query) { `REGEXP_INSTR('dog cat dog', 'DOG', 1, 2, 1, 'c')`, `REGEXP_INSTR('aa aaa aaaa', 'a{2}')`, `REGEXP_INSTR('aa aaa aaaa', 'a{4}')`, - `REGEXP_INSTR(123, 123)`, + `REGEXP_INSTR(1234, 12)`, + `REGEXP_INSTR(1234, 12, 1)`, + `REGEXP_INSTR(1234, 12, 100)`, + `REGEXP_INSTR(1234, 12, 1, 1)`, + `REGEXP_INSTR(1234, 12, 1, 1, 1)`, + `REGEXP_INSTR(1234, 12, 1, 1, 1, 'c')`, } for _, q := range mysqlDocSamples { @@ -2034,6 +2055,11 @@ func RegexpSubstr(yield Query) { `REGEXP_SUBSTR('dog cat dog', 'DOG', 1, 2, 'c')`, `REGEXP_SUBSTR('aa aaa aaaa', 'a{2}')`, `REGEXP_SUBSTR('aa aaa aaaa', 'a{4}')`, + `REGEXP_SUBSTR(1234, 12)`, + `REGEXP_SUBSTR(1234, 12, 1)`, + `REGEXP_SUBSTR(1234, 12, 100)`, + `REGEXP_SUBSTR(1234, 12, 1, 1)`, + `REGEXP_SUBSTR(1234, 12, 1, 1, 'c')`, } for _, q := range mysqlDocSamples { @@ -2064,6 +2090,11 @@ func RegexpReplace(yield Query) { `REGEXP_REPLACE('a', '\\p{Uppercase_letter}', 'X')`, `REGEXP_REPLACE('A', '\\p{Lowercase_letter}', 'X')`, `REGEXP_REPLACE('A', '\\p{Uppercase_letter}', 'X')`, + `REGEXP_REPLACE(1234, 12, 6)`, + `REGEXP_REPLACE(1234, 12, 6, 1)`, + `REGEXP_REPLACE(1234, 12, 6, 100)`, + `REGEXP_REPLACE(1234, 12, 6, 1, 1)`, + `REGEXP_REPLACE(1234, 12, 6, 1, 1, 'c')`, } for _, q := range mysqlDocSamples { diff --git a/go/vt/vtgate/evalengine/testcases/inputs.go b/go/vt/vtgate/evalengine/testcases/inputs.go index 47f50b677c5..552400f6d8e 100644 --- a/go/vt/vtgate/evalengine/testcases/inputs.go +++ 
b/go/vt/vtgate/evalengine/testcases/inputs.go @@ -133,6 +133,52 @@ var inputConversions = []string{ "cast(time '12:34:56' as json)", "cast(time '12:34:58' as json)", "cast(time '5 12:34:58' as json)", } +var regexInputs = []string{ + "0", "1", "' 0 '", `'\t1foo\t'`, + `'foobar'`, `_utf8 'foobar'`, `''`, `_binary 'foobar'`, + `0x0`, `0x1`, `0xff`, `X'00'`, `X'01'`, `X'ff'`, + "NULL", "true", "false", + "0xFF666F6F626172FF", "0x666F6F626172FF", "0xFF666F6F626172", + "JSON_OBJECT()", "JSON_ARRAY()", + "time '10:04:58'", "date '2000-01-01'", + "timestamp '2000-01-01 10:34:58'", + "cast(0 as json)", "cast(1 as json)", + "cast(true as json)", "cast(false as json)", + "cast('{}' as json)", "cast('[]' as json)", + "cast('null' as json)", "cast('true' as json)", "cast('false' as json)", + // JSON numbers + "cast(1 as json)", "cast(2 as json)", "cast(1.1 as json)", "cast(-1.1 as json)", + "cast(9223372036854775807 as json)", "cast(18446744073709551615 as json)", + "cast('1' as json)", "cast('2' as json)", "cast('1.1' as json)", "cast('-1.1' as json)", + "cast('9223372036854775807' as json)", "cast('18446744073709551615' as json)", + // JSON strings + "cast('\"foo\"' as json)", "cast('\"bar\"' as json)", + // JSON binary values + "cast(_binary' \"foo\"' as json)", "cast(_binary '\"bar\"' as json)", + "cast(0xFF666F6F626172FF as json)", "cast(0x666F6F626172FF as json)", + "cast(0b01 as json)", "cast(0b001 as json)", + // JSON arrays + "cast('[\"a\"]' as json)", "cast('[\"ab\"]' as json)", + // JSON objects + "cast('{\"a\": 1, \"b\": 2}' as json)", "cast('{\"b\": 2, \"a\": 1}' as json)", + // JSON date, datetime & time + "cast(date '2000-01-01' as json)", "cast(date '2000-01-02' as json)", + "cast(timestamp '2000-01-01 12:34:58' as json)", + "cast(time '12:34:56' as json)", "cast(time '12:34:58' as json)", "cast(time '5 12:34:58' as json)", +} + +var regexMatchStrings = []string{ + "NULL", + "'c'", "'i'", "'m'", "'n'", "'u'", "'cimnu'", "'cimnuunmic'", +} + +var regexCounters = 
[]string{ + "NULL", + "0", "1", "5", "100000", + "'0'", "'1'", "'5'", "'100000'", + "0.4", "0.5", "0x0", "0x1", +} + const inputPi = "314159265358979323846264338327950288419716939937510582097494459" var inputStrings = []string{ From 6fcbce0b5ab0d0a3efa4f45dc543a95d64a7cb61 Mon Sep 17 00:00:00 2001 From: Dirkjan Bussink Date: Wed, 5 Jul 2023 09:34:57 +0200 Subject: [PATCH 14/18] icuregex: Allow for setting explicit dumper Remove the usage of a global variable here. Signed-off-by: Dirkjan Bussink --- go/mysql/icuregex/icu_test.go | 11 +++++------ go/mysql/icuregex/matcher.go | 28 ++++++++++++++++------------ 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/go/mysql/icuregex/icu_test.go b/go/mysql/icuregex/icu_test.go index 25179c9a2c2..42c98dde5db 100644 --- a/go/mysql/icuregex/icu_test.go +++ b/go/mysql/icuregex/icu_test.go @@ -402,13 +402,11 @@ func TestCornerCases(t *testing.T) { } func TestOne(t *testing.T) { - icuregex.Dumper = os.Stderr - - pattern := []rune{55296, 56320} - input := []rune{'𐀀'} + const Pattern = `\p{CaseIgnorable}` + const Input = "foo.bar" const Flags = 0 - re, err := icuregex.Compile(pattern, Flags) + re, err := icuregex.CompileString(Pattern, Flags) if err != nil { t.Fatalf("compilation failed: %v", err) } @@ -416,7 +414,8 @@ func TestOne(t *testing.T) { re.Dump(os.Stderr) m := icuregex.NewMatcher(re) - m.Reset(input) + m.Dumper(os.Stderr) + m.ResetString(Input) found, err := m.Find() require.NoError(t, err) t.Logf("match = %v", found) diff --git a/go/mysql/icuregex/matcher.go b/go/mysql/icuregex/matcher.go index f04d8f531f0..11fbc152d73 100644 --- a/go/mysql/icuregex/matcher.go +++ b/go/mysql/icuregex/matcher.go @@ -84,6 +84,8 @@ type Matcher struct { // Kept separately from fTime to keep as much // code as possible out of the inline // StateSave function. 
+ + dumper io.Writer } func NewMatcher(pat *Pattern) *Matcher { @@ -100,8 +102,6 @@ func NewMatcher(pat *Pattern) *Matcher { return m } -var Dumper io.Writer - func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { //-------------------------------------------------------------------------------- // @@ -114,10 +114,10 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { var err error var isMatch bool // True if the we have a match. - if Dumper != nil { - fmt.Fprintf(Dumper, "MatchAt(startIdx=%d)\n", startIdx) - fmt.Fprintf(Dumper, "Original Pattern: \"%s\"\n", m.pattern.pattern) - fmt.Fprintf(Dumper, "Input String: \"%s\"\n\n", string(m.input)) + if m.dumper != nil { + fmt.Fprintf(m.dumper, "MatchAt(startIdx=%d)\n", startIdx) + fmt.Fprintf(m.dumper, "Original Pattern: \"%s\"\n", m.pattern.pattern) + fmt.Fprintf(m.dumper, "Input String: \"%s\"\n\n", string(m.input)) } pat := m.pattern.compiledPat @@ -135,10 +135,10 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { for { op := pat[*fp.patIdx()] - if Dumper != nil { - fmt.Fprintf(Dumper, "inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", *fp.inputIdx(), + if m.dumper != nil { + fmt.Fprintf(m.dumper, "inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", *fp.inputIdx(), charAt(inputText, *fp.inputIdx()), m.stack.sp(), m.activeLimit) - m.pattern.dumpOp(Dumper, *fp.patIdx()) + m.pattern.dumpOp(m.dumper, *fp.patIdx()) } *fp.patIdx()++ @@ -1206,11 +1206,11 @@ breakFromLoop: m.matchEnd = *fp.inputIdx() } - if Dumper != nil { + if m.dumper != nil { if isMatch { - fmt.Fprintf(Dumper, "Match. start=%d end=%d\n\n", m.matchStart, m.matchEnd) + fmt.Fprintf(m.dumper, "Match. start=%d end=%d\n\n", m.matchStart, m.matchEnd) } else { - fmt.Fprintf(Dumper, "No match\n\n") + fmt.Fprintf(m.dumper, "No match\n\n") } } @@ -1642,6 +1642,10 @@ func (m *Matcher) End() int { return m.matchEnd } +func (m *Matcher) Dumper(out io.Writer) { + m.dumper = out +} + // Test for any of the Unicode line terminating characters. 
func isLineTerminator(c rune) bool { if (c & ^(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) != 0 { From eaba7f8617864725c359848f2e19ed75fa8d6de4 Mon Sep 17 00:00:00 2001 From: Dirkjan Bussink Date: Wed, 5 Jul 2023 12:56:06 +0200 Subject: [PATCH 15/18] evalengine: Add a whole bunch of regex tests This adds a bunch of tests and fixes the bugs exposed through them. Signed-off-by: Dirkjan Bussink --- go/mysql/constants.go | 2 + go/mysql/sql_error.go | 1 + go/vt/vterrors/state.go | 1 + go/vt/vtgate/evalengine/compiler_asm.go | 80 +++++-- go/vt/vtgate/evalengine/compiler_test.go | 4 +- go/vt/vtgate/evalengine/fn_regexp.go | 212 +++++++++++------- .../evalengine/integration/fuzz_test.go | 2 + go/vt/vtgate/evalengine/testcases/cases.go | 79 +++++++ go/vt/vtgate/evalengine/testcases/inputs.go | 31 +-- 9 files changed, 289 insertions(+), 123 deletions(-) diff --git a/go/mysql/constants.go b/go/mysql/constants.go index 97f23a3a285..bedc9871426 100644 --- a/go/mysql/constants.go +++ b/go/mysql/constants.go @@ -588,6 +588,8 @@ const ( ERCharacterSetMismatch = ErrorCode(3995) + ERWrongParametersToNativeFct = ErrorCode(1583) + // max execution time exceeded ERQueryTimeout = ErrorCode(3024) diff --git a/go/mysql/sql_error.go b/go/mysql/sql_error.go index 4c83fd956a0..ac988033e3d 100644 --- a/go/mysql/sql_error.go +++ b/go/mysql/sql_error.go @@ -239,6 +239,7 @@ var stateToMysqlCode = map[vterrors.State]mysqlCode{ vterrors.RegexpInvalidFlag: {num: ERRegexpInvalidFlag, state: SSUnknownSQLState}, vterrors.RegexpInvalidCaptureGroup: {num: ERRegexpInvalidCaptureGroup, state: SSUnknownSQLState}, vterrors.CharacterSetMismatch: {num: ERCharacterSetMismatch, state: SSUnknownSQLState}, + vterrors.WrongParametersToNativeFct: {num: ERWrongParametersToNativeFct, state: SSUnknownSQLState}, } func getStateToMySQLState(state vterrors.State) mysqlCode { diff --git a/go/vt/vterrors/state.go b/go/vt/vterrors/state.go index 406b535b510..609ab6fbd1b 100644 --- a/go/vt/vterrors/state.go +++ 
b/go/vt/vterrors/state.go @@ -111,6 +111,7 @@ const ( RegexpInvalidFlag CharacterSetMismatch + WrongParametersToNativeFct // No state should be added below NumOfStates NumOfStates diff --git a/go/vt/vtgate/evalengine/compiler_asm.go b/go/vt/vtgate/evalengine/compiler_asm.go index afef071e754..1267eaf1d1d 100644 --- a/go/vt/vtgate/evalengine/compiler_asm.go +++ b/go/vt/vtgate/evalengine/compiler_asm.go @@ -4353,9 +4353,15 @@ func (asm *assembler) Fn_REGEXP_INSTR(m *icuregex.Matcher, c charset.Charset, of input := env.vm.stack[env.vm.sp-offset-1].(*evalBytes) runes := charset.Expand(nil, input.bytes, c) + if len(runes) == 0 { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalInt64(0) + env.vm.sp -= offset + return 1 + } + pos := int64(1) if offset > 1 { - pos, env.vm.err = position(env.vm.stack[env.vm.sp-offset+1].(*evalInt64), int64(len(runes))) + pos, env.vm.err = positionInstr(env.vm.stack[env.vm.sp-offset+1].(*evalInt64), int64(len(runes))) if env.vm.err != nil { env.vm.sp -= offset return 1 @@ -4406,11 +4412,33 @@ func (asm *assembler) Fn_REGEXP_INSTR_slow(c collations.Charset, flags icuregex. 
asm.emit(func(env *ExpressionEnv) int { input := env.vm.stack[env.vm.sp-offset-1].(*evalBytes) pattern := env.vm.stack[env.vm.sp-offset].(*evalBytes) + + if offset > 4 { + fe := env.vm.stack[env.vm.sp-offset+4] + flags, env.vm.err = regexpFlags(fe, flags, "regexp_instr") + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + } + + p, err := compileRegex(pattern, c, flags) + if err != nil { + env.vm.err = err + env.vm.sp -= offset + return 1 + } + runes := charset.Expand(nil, input.bytes, c) + if len(runes) == 0 { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalInt64(0) + env.vm.sp -= offset + return 1 + } pos := int64(1) if offset > 1 { - pos, env.vm.err = position(env.vm.stack[env.vm.sp-offset+1].(*evalInt64), int64(len(runes))) + pos, env.vm.err = positionInstr(env.vm.stack[env.vm.sp-offset+1].(*evalInt64), int64(len(runes))) if env.vm.err != nil { env.vm.sp -= offset return 1 @@ -4431,22 +4459,6 @@ func (asm *assembler) Fn_REGEXP_INSTR_slow(c collations.Charset, flags icuregex. 
} } - if offset > 4 { - fe := env.vm.stack[env.vm.sp-offset+4] - flags, env.vm.err = regexpFlags(fe, flags, "regexp_instr") - if env.vm.err != nil { - env.vm.sp -= offset - return 1 - } - } - - p, err := compileRegex(pattern, c, flags) - if err != nil { - env.vm.err = err - env.vm.sp -= offset - return 1 - } - m := icuregex.NewMatcher(p) m.Reset(runes[pos-1:]) @@ -4482,11 +4494,17 @@ func (asm *assembler) Fn_REGEXP_SUBSTR(m *icuregex.Matcher, merged collations.Ty pos := int64(1) if offset > 1 { - pos, env.vm.err = position(env.vm.stack[env.vm.sp-offset+1].(*evalInt64), int64(len(runes))) + limit := int64(len(runes)) + pos, env.vm.err = position(env.vm.stack[env.vm.sp-offset+1].(*evalInt64), limit, "regexp_substr") if env.vm.err != nil { env.vm.sp -= offset return 1 } + if pos-1 == limit { + env.vm.stack[env.vm.sp-offset-1] = nil + env.vm.sp -= offset + return 1 + } } occ := int64(1) @@ -4530,11 +4548,17 @@ func (asm *assembler) Fn_REGEXP_SUBSTR_slow(merged collations.TypedCollation, fl pos := int64(1) if offset > 1 { - pos, env.vm.err = position(env.vm.stack[env.vm.sp-offset+1].(*evalInt64), int64(len(runes))) + limit := int64(len(runes)) + pos, env.vm.err = position(env.vm.stack[env.vm.sp-offset+1].(*evalInt64), limit, "regexp_substr") if env.vm.err != nil { env.vm.sp -= offset return 1 } + if pos-1 == limit { + env.vm.stack[env.vm.sp-offset-1] = nil + env.vm.sp -= offset + return 1 + } } occ := int64(1) @@ -4597,11 +4621,17 @@ func (asm *assembler) Fn_REGEXP_REPLACE(m *icuregex.Matcher, merged collations.T pos := int64(1) if offset > 2 { - pos, env.vm.err = position(env.vm.stack[env.vm.sp-offset+2].(*evalInt64), int64(len(inputRunes))) + limit := int64(len(inputRunes)) + pos, env.vm.err = position(env.vm.stack[env.vm.sp-offset+2].(*evalInt64), limit, "regexp_replace") if env.vm.err != nil { env.vm.sp -= offset return 1 } + if pos-1 == limit { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalRaw(input.bytes, sqltypes.Text, resultCollation(merged)) + 
env.vm.sp -= offset + return 1 + } } occ := int64(0) @@ -4640,11 +4670,17 @@ func (asm *assembler) Fn_REGEXP_REPLACE_slow(merged collations.TypedCollation, f pos := int64(1) if offset > 2 { - pos, env.vm.err = position(env.vm.stack[env.vm.sp-offset+2].(*evalInt64), int64(len(inputRunes))) + limit := int64(len(inputRunes)) + pos, env.vm.err = position(env.vm.stack[env.vm.sp-offset+2].(*evalInt64), limit, "regexp_replace") if env.vm.err != nil { env.vm.sp -= offset return 1 } + if pos-1 == limit { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalRaw(input.bytes, sqltypes.Text, resultCollation(merged)) + env.vm.sp -= offset + return 1 + } } occ := int64(0) diff --git a/go/vt/vtgate/evalengine/compiler_test.go b/go/vt/vtgate/evalengine/compiler_test.go index 969549483bb..1b5ace371c9 100644 --- a/go/vt/vtgate/evalengine/compiler_test.go +++ b/go/vt/vtgate/evalengine/compiler_test.go @@ -445,8 +445,8 @@ func TestCompilerSingle(t *testing.T) { result: `INT64(5)`, }, { - expression: `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 1, 0)`, - result: `TEXT("X X X")`, + expression: `REGEXP_REPLACE(1234, 12, 6, 1)`, + result: `TEXT("634")`, }, } diff --git a/go/vt/vtgate/evalengine/fn_regexp.go b/go/vt/vtgate/evalengine/fn_regexp.go index b83ef0b582f..957e258af11 100644 --- a/go/vt/vtgate/evalengine/fn_regexp.go +++ b/go/vt/vtgate/evalengine/fn_regexp.go @@ -56,7 +56,7 @@ func returnOption(val *evalInt64, f string) (int64, error) { return 0, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.WrongArguments, "Incorrect arguments to %s: return_option must be 1 or 0.", f) } -func position(val *evalInt64, limit int64) (int64, error) { +func positionInstr(val *evalInt64, limit int64) (int64, error) { pos := val.i if pos < 1 || pos > limit { return 0, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpIndexOutOfBounds, "Index out of bounds in regular expression search.") @@ -64,6 +64,17 @@ func position(val *evalInt64, limit int64) (int64, error) { return 
pos, nil } +func position(val *evalInt64, limit int64, f string) (int64, error) { + pos := val.i + if pos < 1 { + return 0, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.WrongParametersToNativeFct, "Incorrect parameters in the call to native function '%s'", f) + } + if pos-1 > limit { + return 0, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpIndexOutOfBounds, "Index out of bounds in regular expression search.") + } + return pos, nil +} + func evalRegexpCollation(input, pat eval, f string) (eval, eval, collations.TypedCollation, icuregex.RegexpFlag, error) { var typedCol collations.TypedCollation var err error @@ -131,7 +142,7 @@ func compileRegex(pat eval, c collations.Charset, flags icuregex.RegexpFlag) (*i patRunes := charset.Expand(nil, pat.ToRawBytes(), c) if len(patRunes) == 0 { - return nil, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpIllegalArgument, " Illegal argument to a regular expression.") + return nil, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpIllegalArgument, "Illegal argument to a regular expression.") } regexp, err := icuregex.Compile(patRunes, flags) @@ -363,49 +374,46 @@ func (r *builtinRegexpInstr) eval(env *ExpressionEnv) (eval, error) { return nil, err } - collation := typedCol.Collation.Get() - - pos := int64(1) - occ := int64(1) - returnOpt := int64(0) - inputRunes := charset.Expand(nil, input.ToRawBytes(), collation.Charset()) - + var posExpr eval if len(r.Arguments) > 2 { - posExpr, err := r.Arguments[2].eval(env) + posExpr, err = r.Arguments[2].eval(env) if err != nil || posExpr == nil { return nil, err } - pos, err = position(evalToInt64(posExpr), int64(len(inputRunes))) - if err != nil { - return nil, err - } } + var occExpr eval if len(r.Arguments) > 3 { - occExpr, err := r.Arguments[3].eval(env) + occExpr, err = r.Arguments[3].eval(env) if err != nil || occExpr == nil { return nil, err } - occ = occurrence(evalToInt64(occExpr), occ) } + var retExpr eval if 
len(r.Arguments) > 4 { - retExpr, err := r.Arguments[4].eval(env) + retExpr, err = r.Arguments[4].eval(env) if err != nil || retExpr == nil { return nil, err } - returnOpt, err = returnOption(evalToInt64(retExpr), "regexp_instr") - if err != nil { - return nil, err - } } + var mtExpr eval if len(r.Arguments) > 5 { - m, err := r.Arguments[5].eval(env) - if err != nil || m == nil { + mtExpr, err = r.Arguments[5].eval(env) + if err != nil || mtExpr == nil { return nil, err } - flags, err = regexpFlags(m, flags, "regexp_instr") + } + + collation := typedCol.Collation.Get() + + pos := int64(1) + occ := int64(1) + returnOpt := int64(0) + + if mtExpr != nil { + flags, err = regexpFlags(mtExpr, flags, "regexp_instr") if err != nil { return nil, err } @@ -416,6 +424,29 @@ func (r *builtinRegexpInstr) eval(env *ExpressionEnv) (eval, error) { return nil, err } + inputRunes := charset.Expand(nil, input.ToRawBytes(), collation.Charset()) + if len(inputRunes) == 0 { + return newEvalInt64(0), nil + } + + if posExpr != nil { + pos, err = positionInstr(evalToInt64(posExpr), int64(len(inputRunes))) + if err != nil { + return nil, err + } + } + + if occExpr != nil { + occ = occurrence(evalToInt64(occExpr), occ) + } + + if retExpr != nil { + returnOpt, err = returnOption(evalToInt64(retExpr), "regexp_instr") + if err != nil { + return nil, err + } + } + m := icuregex.NewMatcher(regexp) m.Reset(inputRunes[pos-1:]) @@ -479,7 +510,7 @@ func (r *builtinRegexpInstr) compile(c *compiler) (ctype, error) { if err != nil { return ctype{}, err } - skips = append(skips, c.compileNullCheckArg(input, 1)) + skips = append(skips, c.compileNullCheckArg(pat, 1)) var pos ctype if len(r.Arguments) > 2 { @@ -487,7 +518,7 @@ func (r *builtinRegexpInstr) compile(c *compiler) (ctype, error) { if err != nil { return ctype{}, err } - skips = append(skips, c.compileNullCheckArg(input, 2)) + skips = append(skips, c.compileNullCheckArg(pos, 2)) _ = c.compileToInt64(pos, 1) } @@ -497,8 +528,8 @@ func (r 
*builtinRegexpInstr) compile(c *compiler) (ctype, error) { if err != nil { return ctype{}, err } - skips = append(skips, c.compileNullCheckArg(input, 3)) - _ = c.compileToInt64(pos, 1) + skips = append(skips, c.compileNullCheckArg(occ, 3)) + _ = c.compileToInt64(occ, 1) } var returnOpt ctype @@ -507,8 +538,8 @@ func (r *builtinRegexpInstr) compile(c *compiler) (ctype, error) { if err != nil { return ctype{}, err } - skips = append(skips, c.compileNullCheckArg(input, 4)) - _ = c.compileToInt64(pos, 1) + skips = append(skips, c.compileNullCheckArg(returnOpt, 4)) + _ = c.compileToInt64(returnOpt, 1) } var matchType ctype @@ -517,7 +548,7 @@ func (r *builtinRegexpInstr) compile(c *compiler) (ctype, error) { if err != nil { return ctype{}, err } - skips = append(skips, c.compileNullCheckArg(input, 5)) + skips = append(skips, c.compileNullCheckArg(matchType, 5)) switch { case matchType.isTextual(): default: @@ -597,37 +628,50 @@ func (r *builtinRegexpSubstr) eval(env *ExpressionEnv) (eval, error) { return nil, err } - collation := typedCol.Collation.Get() - - pos := int64(1) - occ := int64(1) - inputRunes := charset.Expand(nil, input.ToRawBytes(), collation.Charset()) - + var posExpr eval + // For some reason this gets checked before NULL checks of the other values if len(r.Arguments) > 2 { - posExpr, err := r.Arguments[2].eval(env) + posExpr, err = r.Arguments[2].eval(env) if err != nil || posExpr == nil { return nil, err } - pos, err = position(evalToInt64(posExpr), int64(len(inputRunes))) - if err != nil { - return nil, err - } } + var occExpr eval if len(r.Arguments) > 3 { - occExpr, err := r.Arguments[3].eval(env) + occExpr, err = r.Arguments[3].eval(env) if err != nil || occExpr == nil { return nil, err } - occ = occurrence(evalToInt64(occExpr), occ) } + var mtExpr eval if len(r.Arguments) > 4 { - m, err := r.Arguments[4].eval(env) - if err != nil || m == nil { + mtExpr, err = r.Arguments[4].eval(env) + if err != nil || mtExpr == nil { return nil, err } - flags, 
err = regexpFlags(m, flags, "regexp_substr") + } + + collation := typedCol.Collation.Get() + pos := int64(1) + occ := int64(1) + inputRunes := charset.Expand(nil, input.ToRawBytes(), collation.Charset()) + + if posExpr != nil { + pos, err = position(evalToInt64(posExpr), int64(len(inputRunes)), "regexp_substr") + if err != nil { + return nil, err + } + + } + + if occExpr != nil { + occ = occurrence(evalToInt64(occExpr), occ) + } + + if mtExpr != nil { + flags, err = regexpFlags(mtExpr, flags, "regexp_substr") if err != nil { return nil, err } @@ -697,7 +741,7 @@ func (r *builtinRegexpSubstr) compile(c *compiler) (ctype, error) { if err != nil { return ctype{}, err } - skips = append(skips, c.compileNullCheckArg(input, 1)) + skips = append(skips, c.compileNullCheckArg(pat, 1)) var pos ctype if len(r.Arguments) > 2 { @@ -705,7 +749,7 @@ func (r *builtinRegexpSubstr) compile(c *compiler) (ctype, error) { if err != nil { return ctype{}, err } - skips = append(skips, c.compileNullCheckArg(input, 2)) + skips = append(skips, c.compileNullCheckArg(pos, 2)) _ = c.compileToInt64(pos, 1) } @@ -715,8 +759,8 @@ func (r *builtinRegexpSubstr) compile(c *compiler) (ctype, error) { if err != nil { return ctype{}, err } - skips = append(skips, c.compileNullCheckArg(input, 3)) - _ = c.compileToInt64(pos, 1) + skips = append(skips, c.compileNullCheckArg(occ, 3)) + _ = c.compileToInt64(occ, 1) } var matchType ctype @@ -725,7 +769,7 @@ func (r *builtinRegexpSubstr) compile(c *compiler) (ctype, error) { if err != nil { return ctype{}, err } - skips = append(skips, c.compileNullCheckArg(input, 4)) + skips = append(skips, c.compileNullCheckArg(matchType, 4)) switch { case matchType.isTextual(): default: @@ -853,7 +897,7 @@ func (r *builtinRegexpReplace) eval(env *ExpressionEnv) (eval, error) { } replArg, err := r.Arguments[2].eval(env) - if err != nil || pat == nil { + if err != nil || replArg == nil { return nil, err } @@ -862,6 +906,31 @@ func (r *builtinRegexpReplace) eval(env 
*ExpressionEnv) (eval, error) { return nil, err } + var posExpr eval + // For some reason this gets checked before NULL checks of the other values + if len(r.Arguments) > 3 { + posExpr, err = r.Arguments[3].eval(env) + if err != nil || posExpr == nil { + return nil, err + } + } + + var occExpr eval + if len(r.Arguments) > 4 { + occExpr, err = r.Arguments[4].eval(env) + if err != nil || occExpr == nil { + return nil, err + } + } + + var mtExpr eval + if len(r.Arguments) > 5 { + mtExpr, err = r.Arguments[5].eval(env) + if err != nil || mtExpr == nil { + return nil, err + } + } + collation := typedCol.Collation.Get() repl, ok := replArg.(*evalBytes) @@ -871,37 +940,24 @@ func (r *builtinRegexpReplace) eval(env *ExpressionEnv) (eval, error) { return nil, err } } - replRunes := charset.Expand(nil, repl.ToRawBytes(), repl.col.Collation.Get().Charset()) - pos := int64(1) occ := int64(0) inputRunes := charset.Expand(nil, input.ToRawBytes(), collation.Charset()) + replRunes := charset.Expand(nil, repl.ToRawBytes(), repl.col.Collation.Get().Charset()) - if len(r.Arguments) > 3 { - posExpr, err := r.Arguments[3].eval(env) - if err != nil || posExpr == nil { - return nil, err - } - pos, err = position(evalToInt64(posExpr), int64(len(inputRunes))) + if posExpr != nil { + pos, err = position(evalToInt64(posExpr), int64(len(inputRunes)), "regexp_replace") if err != nil { return nil, err } } - if len(r.Arguments) > 4 { - occExpr, err := r.Arguments[4].eval(env) - if err != nil || occExpr == nil { - return nil, err - } + if occExpr != nil { occ = occurrence(evalToInt64(occExpr), occ) } - if len(r.Arguments) > 5 { - m, err := r.Arguments[5].eval(env) - if err != nil || m == nil { - return nil, err - } - flags, err = regexpFlags(m, flags, "regexp_replace") + if mtExpr != nil { + flags, err = regexpFlags(mtExpr, flags, "regexp_replace") if err != nil { return nil, err } @@ -964,13 +1020,13 @@ func (r *builtinRegexpReplace) compile(c *compiler) (ctype, error) { if err != nil { return 
ctype{}, err } - skips = append(skips, c.compileNullCheckArg(input, 1)) + skips = append(skips, c.compileNullCheckArg(pat, 1)) repl, err := r.Arguments[2].compile(c) if err != nil { return ctype{}, err } - skips = append(skips, c.compileNullCheckArg(input, 2)) + skips = append(skips, c.compileNullCheckArg(repl, 2)) var pos ctype if len(r.Arguments) > 3 { @@ -978,7 +1034,7 @@ func (r *builtinRegexpReplace) compile(c *compiler) (ctype, error) { if err != nil { return ctype{}, err } - skips = append(skips, c.compileNullCheckArg(input, 3)) + skips = append(skips, c.compileNullCheckArg(pos, 3)) _ = c.compileToInt64(pos, 1) } @@ -988,8 +1044,8 @@ func (r *builtinRegexpReplace) compile(c *compiler) (ctype, error) { if err != nil { return ctype{}, err } - skips = append(skips, c.compileNullCheckArg(input, 4)) - _ = c.compileToInt64(pos, 1) + skips = append(skips, c.compileNullCheckArg(occ, 4)) + _ = c.compileToInt64(occ, 1) } var matchType ctype @@ -998,7 +1054,7 @@ func (r *builtinRegexpReplace) compile(c *compiler) (ctype, error) { if err != nil { return ctype{}, err } - skips = append(skips, c.compileNullCheckArg(input, 5)) + skips = append(skips, c.compileNullCheckArg(matchType, 5)) switch { case matchType.isTextual(): default: diff --git a/go/vt/vtgate/evalengine/integration/fuzz_test.go b/go/vt/vtgate/evalengine/integration/fuzz_test.go index 94b3c483fce..563bb323244 100644 --- a/go/vt/vtgate/evalengine/integration/fuzz_test.go +++ b/go/vt/vtgate/evalengine/integration/fuzz_test.go @@ -101,6 +101,8 @@ var ( regexp.MustCompile(`Syntax error in regular expression`), regexp.MustCompile(`The regular expression contains an unclosed bracket expression`), regexp.MustCompile(`Illegal argument to a regular expression`), + regexp.MustCompile(`Incorrect arguments to regexp_substr`), + regexp.MustCompile(`Incorrect arguments to regexp_replace`), } ) diff --git a/go/vt/vtgate/evalengine/testcases/cases.go b/go/vt/vtgate/evalengine/testcases/cases.go index 9322031ea2f..dbcd13efff6 
100644 --- a/go/vt/vtgate/evalengine/testcases/cases.go +++ b/go/vt/vtgate/evalengine/testcases/cases.go @@ -2013,6 +2013,20 @@ func RegexpInstr(yield Query) { `REGEXP_INSTR(1234, 12, 1, 1)`, `REGEXP_INSTR(1234, 12, 1, 1, 1)`, `REGEXP_INSTR(1234, 12, 1, 1, 1, 'c')`, + `REGEXP_INSTR('', ' ', 1000)`, + `REGEXP_INSTR(' ', ' ', 1000)`, + `REGEXP_INSTR(NULL, 'DOG', 1, 2, 1, 'c')`, + `REGEXP_INSTR('dog cat dog', NULL, 1, 2, 1, 'c')`, + `REGEXP_INSTR('dog cat dog', 'DOG', NULL, 2, 1, 'c')`, + `REGEXP_INSTR('dog cat dog', 'DOG', 1, NULL, 1, 'c')`, + `REGEXP_INSTR('dog cat dog', 'DOG', 1, 2, NULL, 'c')`, + `REGEXP_INSTR('dog cat dog', 'DOG', 1, 2, 1, NULL)`, + + `REGEXP_INSTR('dog cat dog', NULL, 1, 2, 1, 'c')`, + `REGEXP_INSTR('dog cat dog', _latin1 'DOG', NULL, 2, 1, 'c')`, + `REGEXP_INSTR('dog cat dog', _latin1 'DOG', 1, NULL, 1, 'c')`, + `REGEXP_INSTR('dog cat dog', _latin1 'DOG', 1, 2, NULL, 'c')`, + `REGEXP_INSTR('dog cat dog', _latin1 'DOG', 1, 2, 1, NULL)`, } for _, q := range mysqlDocSamples { @@ -2060,6 +2074,26 @@ func RegexpSubstr(yield Query) { `REGEXP_SUBSTR(1234, 12, 100)`, `REGEXP_SUBSTR(1234, 12, 1, 1)`, `REGEXP_SUBSTR(1234, 12, 1, 1, 'c')`, + + `REGEXP_SUBSTR(NULL, 'DOG', 1, 1, 'i')`, + `REGEXP_SUBSTR('dog cat dog', NULL, 1, 1, 'i')`, + `REGEXP_SUBSTR('dog cat dog', 'DOG', NULL, 1, 'i')`, + `REGEXP_SUBSTR('dog cat dog', 'DOG', 1, NULL, 'i')`, + `REGEXP_SUBSTR('dog cat dog', 'DOG', 1, 1, NULL)`, + + `REGEXP_SUBSTR(NULL, '[', 1, 1, 'i')`, + `REGEXP_SUBSTR('dog cat dog', '[', NULL, 1, 'i')`, + `REGEXP_SUBSTR('dog cat dog', '[', 1, NULL, 'i')`, + `REGEXP_SUBSTR('dog cat dog', '[', 1, 1, NULL)`, + + `REGEXP_SUBSTR('dog cat dog', 'DOG', 0, 1, 'i')`, + `REGEXP_SUBSTR('dog cat dog', 'DOG', -1, 1, 'i')`, + `REGEXP_SUBSTR('dog cat dog', 'DOG', 100, 1, 'i')`, + `REGEXP_SUBSTR('dog cat dog', 'DOG', 1, 1, 0)`, + + `REGEXP_SUBSTR(' ', ' ', 1)`, + `REGEXP_SUBSTR(' ', ' ', 2)`, + `REGEXP_SUBSTR(' ', ' ', 3)`, } for _, q := range mysqlDocSamples { @@ -2095,6 +2129,51 @@ 
func RegexpReplace(yield Query) { `REGEXP_REPLACE(1234, 12, 6, 100)`, `REGEXP_REPLACE(1234, 12, 6, 1, 1)`, `REGEXP_REPLACE(1234, 12, 6, 1, 1, 'c')`, + + `REGEXP_REPLACE(NULL, 'DOG', 'bar', 1, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', NULL, 'bar', 1, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', 'DOG', NULL, 1, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', 'DOG', 'bar', 1, NULL, 'i')`, + `REGEXP_REPLACE('dog cat dog', 'DOG', 'bar', 1, 1, NULL)`, + `REGEXP_REPLACE('dog cat dog', 'DOG', 'bar', '1', '1', 0)`, + + `REGEXP_REPLACE(NULL, _latin1'DOG', 'bar', 1, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', _latin1'DOG', NULL, 1, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', _latin1'DOG', 'bar', 1, NULL, 'i')`, + `REGEXP_REPLACE('dog cat dog', _latin1'DOG', 'bar', 1, 1, NULL)`, + `REGEXP_REPLACE('dog cat dog', _latin1'DOG', 'bar', '1', '1', 0)`, + + `REGEXP_REPLACE(NULL, '[', 'bar', 1, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', '[', NULL, 1, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', '[', 'bar', 1, NULL, 'i')`, + `REGEXP_REPLACE('dog cat dog', '[', 'bar', 1, 1, NULL)`, + + `REGEXP_REPLACE(NULL, _latin1'[', 'bar', 1, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', _latin1'[', NULL, 1, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', _latin1'[', 'bar', 1, NULL, 'i')`, + `REGEXP_REPLACE('dog cat dog', _latin1'[', 'bar', 1, 1, NULL)`, + + `REGEXP_REPLACE('dog cat dog', 'DOG', 'bar', 0, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', 'DOG', 'bar', -1, 1, 'i')`, + `REGEXP_REPLACE('', 'DOG', 'bar', -1, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', 'DOG', 'bar', 100, 1, 'i')`, + `REGEXP_REPLACE('', 'DOG', 'bar', 100, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', 'DOG', 'bar', 1, 1, 0)`, + + `REGEXP_REPLACE('dog cat dog', _latin1'DOG', 'bar', 0, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', _latin1'DOG', 'bar', -1, 1, 'i')`, + `REGEXP_REPLACE('', _latin1'DOG', 'bar', -1, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', _latin1'DOG', 'bar', 100, 1, 'i')`, + `REGEXP_REPLACE('', _latin1'DOG', 'bar', 100, 1, 'i')`, + 
`REGEXP_REPLACE('dog cat dog', _latin1'DOG', 'bar', 1, 1, 0)`, + + `REGEXP_REPLACE(' ', ' ', 'x', 1)`, + `REGEXP_REPLACE(' ', ' ', 'x', 2)`, + `REGEXP_REPLACE(' ', ' ', 'x', 3)`, + + `REGEXP_REPLACE(' ', _latin1' ', 'x', 1)`, + `REGEXP_REPLACE(' ', _latin1' ', 'x', 2)`, + `REGEXP_REPLACE(' ', _latin1' ', 'x', 3)`, } for _, q := range mysqlDocSamples { diff --git a/go/vt/vtgate/evalengine/testcases/inputs.go b/go/vt/vtgate/evalengine/testcases/inputs.go index 552400f6d8e..5785375955f 100644 --- a/go/vt/vtgate/evalengine/testcases/inputs.go +++ b/go/vt/vtgate/evalengine/testcases/inputs.go @@ -136,35 +136,25 @@ var inputConversions = []string{ var regexInputs = []string{ "0", "1", "' 0 '", `'\t1foo\t'`, `'foobar'`, `_utf8 'foobar'`, `''`, `_binary 'foobar'`, - `0x0`, `0x1`, `0xff`, `X'00'`, `X'01'`, `X'ff'`, + `0x0`, `0x1`, `0xff`, "NULL", "true", "false", - "0xFF666F6F626172FF", "0x666F6F626172FF", "0xFF666F6F626172", - "JSON_OBJECT()", "JSON_ARRAY()", + "0xFF666F6F626172FF", "time '10:04:58'", "date '2000-01-01'", "timestamp '2000-01-01 10:34:58'", "cast(0 as json)", "cast(1 as json)", "cast(true as json)", "cast(false as json)", - "cast('{}' as json)", "cast('[]' as json)", - "cast('null' as json)", "cast('true' as json)", "cast('false' as json)", // JSON numbers - "cast(1 as json)", "cast(2 as json)", "cast(1.1 as json)", "cast(-1.1 as json)", - "cast(9223372036854775807 as json)", "cast(18446744073709551615 as json)", - "cast('1' as json)", "cast('2' as json)", "cast('1.1' as json)", "cast('-1.1' as json)", - "cast('9223372036854775807' as json)", "cast('18446744073709551615' as json)", + "cast(2 as json)", "cast(1.1 as json)", "cast(-1.1 as json)", // JSON strings - "cast('\"foo\"' as json)", "cast('\"bar\"' as json)", + "cast('\"foo\"' as json)", // JSON binary values - "cast(_binary' \"foo\"' as json)", "cast(_binary '\"bar\"' as json)", - "cast(0xFF666F6F626172FF as json)", "cast(0x666F6F626172FF as json)", - "cast(0b01 as json)", "cast(0b001 as json)", + 
"cast(_binary' \"foo\"' as json)", + "cast(0xFF666F6F626172FF as json)", + "cast(0b01 as json)", // JSON arrays - "cast('[\"a\"]' as json)", "cast('[\"ab\"]' as json)", + "cast('[\"a\"]' as json)", // JSON objects - "cast('{\"a\": 1, \"b\": 2}' as json)", "cast('{\"b\": 2, \"a\": 1}' as json)", - // JSON date, datetime & time - "cast(date '2000-01-01' as json)", "cast(date '2000-01-02' as json)", - "cast(timestamp '2000-01-01 12:34:58' as json)", - "cast(time '12:34:56' as json)", "cast(time '12:34:58' as json)", "cast(time '5 12:34:58' as json)", + "cast('{\"a\": 1, \"b\": 2}' as json)", } var regexMatchStrings = []string{ @@ -175,8 +165,7 @@ var regexMatchStrings = []string{ var regexCounters = []string{ "NULL", "0", "1", "5", "100000", - "'0'", "'1'", "'5'", "'100000'", - "0.4", "0.5", "0x0", "0x1", + "'2'", "0.4", "0.5", "0x1", } const inputPi = "314159265358979323846264338327950288419716939937510582097494459" From 5895fd7665d7f995a211ddb4eb4f9d3e7184528d Mon Sep 17 00:00:00 2001 From: Dirkjan Bussink Date: Wed, 5 Jul 2023 15:51:10 +0200 Subject: [PATCH 16/18] Fix license Signed-off-by: Dirkjan Bussink --- go/mysql/icuregex/errors/error.go | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/go/mysql/icuregex/errors/error.go b/go/mysql/icuregex/errors/error.go index 8f32f5fee1e..f03a5157acf 100644 --- a/go/mysql/icuregex/errors/error.go +++ b/go/mysql/icuregex/errors/error.go @@ -1,3 +1,24 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package errors import "errors" From 1731f8356484fcff857afd0802246cb2ed86ccce Mon Sep 17 00:00:00 2001 From: Dirkjan Bussink Date: Wed, 5 Jul 2023 15:52:47 +0200 Subject: [PATCH 17/18] More license fixes Signed-off-by: Dirkjan Bussink --- go/vt/vtgate/evalengine/fn_regexp.go | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/go/vt/vtgate/evalengine/fn_regexp.go b/go/vt/vtgate/evalengine/fn_regexp.go index 957e258af11..d839752e7ad 100644 --- a/go/vt/vtgate/evalengine/fn_regexp.go +++ b/go/vt/vtgate/evalengine/fn_regexp.go @@ -1,3 +1,19 @@ +/* +Copyright 2023 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + package evalengine import ( From faf160db951e0e4de071430d098f4fb3927ae629 Mon Sep 17 00:00:00 2001 From: Dirkjan Bussink Date: Wed, 5 Jul 2023 19:31:38 +0000 Subject: [PATCH 18/18] evalengine: Improve handling of constant expression regexps Signed-off-by: Dirkjan Bussink --- go/vt/vtgate/evalengine/fn_regexp.go | 159 ++++++--------------- go/vt/vtgate/evalengine/testcases/cases.go | 6 + 2 files changed, 49 insertions(+), 116 deletions(-) diff --git a/go/vt/vtgate/evalengine/fn_regexp.go b/go/vt/vtgate/evalengine/fn_regexp.go index d839752e7ad..5886a5c3765 100644 --- a/go/vt/vtgate/evalengine/fn_regexp.go +++ b/go/vt/vtgate/evalengine/fn_regexp.go @@ -211,6 +211,45 @@ func compileRegex(pat eval, c collations.Charset, flags icuregex.RegexpFlag) (*i return nil, err } +func compileConstantRegex(c *compiler, args TupleExpr, pat, mt int, cs collations.TypedCollation, flags icuregex.RegexpFlag, f string) (*icuregex.Pattern, error) { + pattern := args[pat] + if !pattern.constant() { + return nil, c.unsupported(pattern) + } + var err error + staticEnv := EmptyExpressionEnv() + pattern, err = simplifyExpr(staticEnv, pattern) + if err != nil { + return nil, err + } + + if len(args) > mt { + fl := args[mt] + if !fl.constant() { + return nil, c.unsupported(fl) + } + fl, err = simplifyExpr(staticEnv, fl) + if err != nil { + return nil, err + } + flags, err = regexpFlags(fl.(*Literal).inner, flags, f) + if err != nil { + return nil, err + } + } + + if pattern.(*Literal).inner == nil { + return nil, c.unsupported(pattern) + } + + innerPat, err := evalToVarchar(pattern.(*Literal).inner, cs.Collation, true) + if err != nil { + return nil, err + } + + return compileRegex(innerPat, cs.Collation.Get().Charset(), flags) +} + // resultCollation returns the collation to use for the result of a regexp. // This falls back to latin1_swedish if the input collation is binary. This // seems to be a side effect of how MySQL also works. 
Probably due to how it @@ -329,35 +368,7 @@ func (r *builtinRegexpLike) compile(c *compiler) (ctype, error) { // We optimize for the case where the pattern is a constant. If not, // we fall back to the slow path. - pattern, ok := r.Arguments[1].(*Literal) - if !ok { - return r.compileSlow(c, input, pat, f, merged, flags, skips...) - } - inner, ok := pattern.inner.(*evalBytes) - if !ok { - return r.compileSlow(c, input, pat, f, merged, flags, skips...) - } - if !merged.Collation.Get().Charset().IsSuperset(inner.col.Collation.Get().Charset()) { - return r.compileSlow(c, input, pat, f, merged, flags, skips...) - } - - if len(r.Arguments) > 2 { - fl, ok := r.Arguments[2].(*Literal) - if !ok { - return r.compileSlow(c, input, pat, f, merged, flags, skips...) - } - fe, ok := fl.inner.(*evalBytes) - if !ok { - return r.compileSlow(c, input, pat, f, merged, flags, skips...) - } - - flags, err = regexpFlags(fe, flags, "regexp_like") - if err != nil { - return r.compileSlow(c, input, pat, f, merged, flags, skips...) - } - } - - p, err := compileRegex(inner, merged.Collation.Get().Charset(), flags) + p, err := compileConstantRegex(c, r.Arguments, 1, 2, merged, flags, "regexp_like") if err != nil { return r.compileSlow(c, input, pat, f, merged, flags, skips...) } @@ -583,35 +594,7 @@ func (r *builtinRegexpInstr) compile(c *compiler) (ctype, error) { // We optimize for the case where the pattern is a constant. If not, // we fall back to the slow path. - pattern, ok := r.Arguments[1].(*Literal) - if !ok { - return r.compileSlow(c, input, pat, pos, occ, returnOpt, matchType, merged, flags, skips...) - } - inner, ok := pattern.inner.(*evalBytes) - if !ok { - return r.compileSlow(c, input, pat, pos, occ, returnOpt, matchType, merged, flags, skips...) - } - if !merged.Collation.Get().Charset().IsSuperset(inner.col.Collation.Get().Charset()) { - return r.compileSlow(c, input, pat, pos, occ, returnOpt, matchType, merged, flags, skips...) 
- } - - if len(r.Arguments) > 5 { - fl, ok := r.Arguments[5].(*Literal) - if !ok { - return r.compileSlow(c, input, pat, pos, occ, returnOpt, matchType, merged, flags, skips...) - } - fe, ok := fl.inner.(*evalBytes) - if !ok { - return r.compileSlow(c, input, pat, pos, occ, returnOpt, matchType, merged, flags, skips...) - } - - flags, err = regexpFlags(fe, flags, "regexp_instr") - if err != nil { - return r.compileSlow(c, input, pat, pos, occ, returnOpt, matchType, merged, flags, skips...) - } - } - - p, err := compileRegex(inner, merged.Collation.Get().Charset(), flags) + p, err := compileConstantRegex(c, r.Arguments, 1, 5, merged, flags, "regexp_instr") if err != nil { return r.compileSlow(c, input, pat, pos, occ, returnOpt, matchType, merged, flags, skips...) } @@ -804,35 +787,7 @@ func (r *builtinRegexpSubstr) compile(c *compiler) (ctype, error) { // We optimize for the case where the pattern is a constant. If not, // we fall back to the slow path. - pattern, ok := r.Arguments[1].(*Literal) - if !ok { - return r.compileSlow(c, input, pat, pos, occ, matchType, merged, flags, skips...) - } - inner, ok := pattern.inner.(*evalBytes) - if !ok { - return r.compileSlow(c, input, pat, pos, occ, matchType, merged, flags, skips...) - } - if !merged.Collation.Get().Charset().IsSuperset(inner.col.Collation.Get().Charset()) { - return r.compileSlow(c, input, pat, pos, occ, matchType, merged, flags, skips...) - } - - if len(r.Arguments) > 4 { - fl, ok := r.Arguments[4].(*Literal) - if !ok { - return r.compileSlow(c, input, pat, pos, occ, matchType, merged, flags, skips...) - } - fe, ok := fl.inner.(*evalBytes) - if !ok { - return r.compileSlow(c, input, pat, pos, occ, matchType, merged, flags, skips...) - } - - flags, err = regexpFlags(fe, flags, "regexp_substr") - if err != nil { - return r.compileSlow(c, input, pat, pos, occ, matchType, merged, flags, skips...) 
- } - } - - p, err := compileRegex(inner, merged.Collation.Get().Charset(), flags) + p, err := compileConstantRegex(c, r.Arguments, 1, 4, merged, flags, "regexp_substr") if err != nil { return r.compileSlow(c, input, pat, pos, occ, matchType, merged, flags, skips...) } @@ -1093,35 +1048,7 @@ func (r *builtinRegexpReplace) compile(c *compiler) (ctype, error) { // We optimize for the case where the pattern is a constant. If not, // we fall back to the slow path. - pattern, ok := r.Arguments[1].(*Literal) - if !ok { - return r.compileSlow(c, input, pat, repl, pos, occ, matchType, merged, flags, skips...) - } - inner, ok := pattern.inner.(*evalBytes) - if !ok { - return r.compileSlow(c, input, pat, repl, pos, occ, matchType, merged, flags, skips...) - } - if !merged.Collation.Get().Charset().IsSuperset(inner.col.Collation.Get().Charset()) { - return r.compileSlow(c, input, pat, repl, pos, occ, matchType, merged, flags, skips...) - } - - if len(r.Arguments) > 5 { - fl, ok := r.Arguments[5].(*Literal) - if !ok { - return r.compileSlow(c, input, pat, repl, pos, occ, matchType, merged, flags, skips...) - } - fe, ok := fl.inner.(*evalBytes) - if !ok { - return r.compileSlow(c, input, pat, repl, pos, occ, matchType, merged, flags, skips...) - } - - flags, err = regexpFlags(fe, flags, "regexp_replace") - if err != nil { - return r.compileSlow(c, input, pat, repl, pos, occ, matchType, merged, flags, skips...) - } - } - - p, err := compileRegex(inner, merged.Collation.Get().Charset(), flags) + p, err := compileConstantRegex(c, r.Arguments, 1, 5, merged, flags, "regexp_replace") if err != nil { return r.compileSlow(c, input, pat, repl, pos, occ, matchType, merged, flags, skips...) 
} diff --git a/go/vt/vtgate/evalengine/testcases/cases.go b/go/vt/vtgate/evalengine/testcases/cases.go index dbcd13efff6..d6e692b1a99 100644 --- a/go/vt/vtgate/evalengine/testcases/cases.go +++ b/go/vt/vtgate/evalengine/testcases/cases.go @@ -1929,6 +1929,12 @@ func RegexpLike(yield Query) { `_koi8r 0xF7 regexp _koi8r '[[:alpha:]]'`, `_latin1'a' regexp _latin1'A' collate latin1_general_ci`, `_latin1'a' regexp _latin1'A' collate latin1_bin`, + + `_latin1 'ÿ' regexp _utf8mb4 'ÿ'`, + `_utf8mb4 'ÿ' regexp _latin1 'ÿ'`, + `convert('ÿ' as char character set latin1) regexp _utf8mb4 'ÿ'`, + `_utf8mb4 'ÿ' regexp convert('ÿ' as char character set latin1)`, + `'a' regexp '\\p{alphabetic}'`, `'a' regexp '\\P{alphabetic}'`, `'👌🏾regexp '\\p{Emoji}\\p{Emoji_modifier}'`,