diff --git a/go/mysql/icuregex/internal/icudata/embed.go b/go/mysql/icuregex/internal/icudata/embed.go index 2b7e3033a21..bc3b62b5db6 100644 --- a/go/mysql/icuregex/internal/icudata/embed.go +++ b/go/mysql/icuregex/internal/icudata/embed.go @@ -80,17 +80,17 @@ var Nfkc []byte // case folding. // This is used for property checks of characters about composition. // -//go:embed nfkc_cf.nrm -var NfkcCf []byte +//Unused: go:embed nfkc_cf.nrm +//var NfkcCf []byte // BrkChar is used for matching against character break // characters in regular expressions. // -//go:embed char.brk -var BrkChar []byte +//Unused: go:embed char.brk +//var BrkChar []byte // BrkWord is used for matching against word break // characters in regular expressions. // -//go:embed word.brk -var BrkWord []byte +//Unused: go:embed word.brk +//var BrkWord []byte diff --git a/go/mysql/icuregex/internal/ubidi/loader.go b/go/mysql/icuregex/internal/ubidi/loader.go new file mode 100644 index 00000000000..e30ca402f81 --- /dev/null +++ b/go/mysql/icuregex/internal/ubidi/loader.go @@ -0,0 +1,125 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package ubidi + +import ( + "errors" + "sync" + + "vitess.io/vitess/go/mysql/icuregex/internal/icudata" + "vitess.io/vitess/go/mysql/icuregex/internal/udata" + "vitess.io/vitess/go/mysql/icuregex/internal/utrie" +) + +var ubidiOnce sync.Once +var ubidi struct { + indexes []int32 + trie *utrie.UTrie2 + mirrors []uint32 + jg []uint8 + jg2 []uint8 +} + +func indexes() []int32 { + loadUBidi() + return ubidi.indexes +} + +func trie() *utrie.UTrie2 { + loadUBidi() + return ubidi.trie +} + +func mirrors() []uint32 { + loadUBidi() + return ubidi.mirrors +} + +func jg() []uint8 { + loadUBidi() + return ubidi.jg +} + +func jg2() []uint8 { + loadUBidi() + return ubidi.jg2 +} + +func loadUBidi() { + ubidiOnce.Do(func() { + b := udata.NewBytes(icudata.UBidi) + if err := readData(b); err != nil { + panic(err) + } + }) +} + +func readData(bytes *udata.Bytes) error { + err := bytes.ReadHeader(func(info *udata.DataInfo) bool { + return info.DataFormat[0] == 0x42 && + info.DataFormat[1] == 0x69 && + info.DataFormat[2] == 0x44 && + info.DataFormat[3] == 0x69 && + info.FormatVersion[0] == 2 + }) + if err != nil { + return err + } + + count := int32(bytes.Uint32()) + if count < ixTop { + return errors.New("indexes[0] too small in ubidi.icu") + } + + ubidi.indexes = make([]int32, count) + ubidi.indexes[0] = count + + for i := int32(1); i < count; i++ { + ubidi.indexes[i] = int32(bytes.Uint32()) + } + + ubidi.trie, err = utrie.UTrie2FromBytes(bytes) + if err != nil { + return err + } + + expectedTrieLength := ubidi.indexes[ixTrieSize] + trieLength := ubidi.trie.SerializedLength() + + if trieLength > expectedTrieLength { + return errors.New("ubidi.icu: not enough bytes for the trie") + } + + bytes.Skip(expectedTrieLength - trieLength) + + if n := ubidi.indexes[ixMirrorLength]; n > 0 { + ubidi.mirrors = bytes.Uint32Slice(n) + } + if n := ubidi.indexes[ixJgLimit] - ubidi.indexes[ixJgStart]; n > 0 { + ubidi.jg = bytes.Uint8Slice(n) + } + if n := ubidi.indexes[ixJgLimit2] - 
ubidi.indexes[ixJgStart2]; n > 0 { + ubidi.jg2 = bytes.Uint8Slice(n) + } + + return nil +} diff --git a/go/mysql/icuregex/internal/ubidi/ubidi.go b/go/mysql/icuregex/internal/ubidi/ubidi.go index 195e2b1a6dd..79482dfbc8d 100644 --- a/go/mysql/icuregex/internal/ubidi/ubidi.go +++ b/go/mysql/icuregex/internal/ubidi/ubidi.go @@ -21,14 +21,6 @@ limitations under the License. package ubidi -import ( - "errors" - - "vitess.io/vitess/go/mysql/icuregex/internal/icudata" - "vitess.io/vitess/go/mysql/icuregex/internal/udata" - "vitess.io/vitess/go/mysql/icuregex/internal/utrie" -) - const ( ixIndexTop = iota ixLength @@ -44,72 +36,6 @@ const ( ixTop ) -var ubidi struct { - indexes []int32 - trie *utrie.UTrie2 - mirrors []uint32 - jg []uint8 - jg2 []uint8 -} - -func readData(bytes *udata.Bytes) error { - err := bytes.ReadHeader(func(info *udata.DataInfo) bool { - return info.DataFormat[0] == 0x42 && - info.DataFormat[1] == 0x69 && - info.DataFormat[2] == 0x44 && - info.DataFormat[3] == 0x69 && - info.FormatVersion[0] == 2 - }) - if err != nil { - return err - } - - count := int32(bytes.Uint32()) - if count < ixTop { - return errors.New("indexes[0] too small in ucase.icu") - } - - ubidi.indexes = make([]int32, count) - ubidi.indexes[0] = count - - for i := int32(1); i < count; i++ { - ubidi.indexes[i] = int32(bytes.Uint32()) - } - - ubidi.trie, err = utrie.UTrie2FromBytes(bytes) - if err != nil { - return err - } - - expectedTrieLength := ubidi.indexes[ixTrieSize] - trieLength := ubidi.trie.SerializedLength() - - if trieLength > expectedTrieLength { - return errors.New("ucase.icu: not enough bytes for the trie") - } - - bytes.Skip(expectedTrieLength - trieLength) - - if n := ubidi.indexes[ixMirrorLength]; n > 0 { - ubidi.mirrors = bytes.Uint32Slice(n) - } - if n := ubidi.indexes[ixJgLimit] - ubidi.indexes[ixJgStart]; n > 0 { - ubidi.jg = bytes.Uint8Slice(n) - } - if n := ubidi.indexes[ixJgLimit2] - ubidi.indexes[ixJgStart2]; n > 0 { - ubidi.jg2 = bytes.Uint8Slice(n) - } - - 
return nil -} - -func init() { - b := udata.NewBytes(icudata.UBidi) - if err := readData(b); err != nil { - panic(err) - } -} - const ( /* UBIDI_CLASS_SHIFT=0, */ /* bidi class: 5 bits (4..0) */ jtShift = 5 /* joining type: 3 bits (7..5) */ @@ -362,22 +288,24 @@ type propertySet interface { func AddPropertyStarts(sa propertySet) { /* add the start code point of each same-value range of the trie */ - ubidi.trie.Enum(nil, func(start, _ rune, _ uint32) bool { + trie().Enum(nil, func(start, _ rune, _ uint32) bool { sa.AddRune(start) return true }) + idxs := indexes() + mrs := mirrors() /* add the code points from the bidi mirroring table */ - length := ubidi.indexes[ixMirrorLength] + length := idxs[ixMirrorLength] for i := int32(0); i < length; i++ { - c := mirrorCodePoint(rune(ubidi.mirrors[i])) + c := mirrorCodePoint(rune(mrs[i])) sa.AddRuneRange(c, c+1) } /* add the code points from the Joining_Group array where the value changes */ - start := ubidi.indexes[ixJgStart] - limit := ubidi.indexes[ixJgLimit] - jgArray := ubidi.jg[:] + start := idxs[ixJgStart] + limit := idxs[ixJgLimit] + jgArray := jg() for { prev := uint8(0) for start < limit { @@ -393,11 +321,11 @@ func AddPropertyStarts(sa propertySet) { /* add the limit code point if the last value was not 0 (it is now start==limit) */ sa.AddRune(limit) } - if limit == ubidi.indexes[ixJgLimit] { + if limit == idxs[ixJgLimit] { /* switch to the second Joining_Group range */ - start = ubidi.indexes[ixJgStart2] - limit = ubidi.indexes[ixJgLimit2] - jgArray = ubidi.jg2[:] + start = idxs[ixJgStart2] + limit = idxs[ixJgLimit2] + jgArray = jg2() } else { break } @@ -417,45 +345,46 @@ func mirrorCodePoint(m rune) rune { } func IsJoinControl(c rune) bool { - props := ubidi.trie.Get16(c) + props := trie().Get16(c) return HasFlag(props, joinControlShift) } func JoinType(c rune) JoiningType { - props := ubidi.trie.Get16(c) + props := trie().Get16(c) return JoiningType((props & jtMask) >> jtShift) } func JoinGroup(c rune) 
JoiningGroup { - start := ubidi.indexes[ixJgStart] - limit := ubidi.indexes[ixJgLimit] + idxs := indexes() + start := idxs[ixJgStart] + limit := idxs[ixJgLimit] if start <= c && c < limit { - return JoiningGroup(ubidi.jg[c-start]) + return JoiningGroup(jg()[c-start]) } - start = ubidi.indexes[ixJgStart2] - limit = ubidi.indexes[ixJgLimit2] + start = idxs[ixJgStart2] + limit = idxs[ixJgLimit2] if start <= c && c < limit { - return JoiningGroup(ubidi.jg2[c-start]) + return JoiningGroup(jg2()[c-start]) } return JgNoJoiningGroup } func IsMirrored(c rune) bool { - props := ubidi.trie.Get16(c) + props := trie().Get16(c) return HasFlag(props, isMirroredShift) } func IsBidiControl(c rune) bool { - props := ubidi.trie.Get16(c) + props := trie().Get16(c) return HasFlag(props, bidiControlShift) } func PairedBracketType(c rune) UPairedBracketType { - props := ubidi.trie.Get16(c) + props := trie().Get16(c) return UPairedBracketType((props & bptMask) >> bptShift) } func Class(c rune) CharDirection { - props := ubidi.trie.Get16(c) + props := trie().Get16(c) return CharDirection(props & classMask) } diff --git a/go/mysql/icuregex/internal/ucase/fold.go b/go/mysql/icuregex/internal/ucase/fold.go index 88d4f026c65..728142042ba 100644 --- a/go/mysql/icuregex/internal/ucase/fold.go +++ b/go/mysql/icuregex/internal/ucase/fold.go @@ -83,7 +83,7 @@ func FoldRunes(str []rune) []rune { - U+0130 has no simple case folding (simple-case-folds to itself). 
*/ func Fold(c rune) rune { - props := ucase.trie.Get16(c) + props := trie().Get16(c) if !hasException(props) { if isUpperOrTitle(props) { c += getDelta(props) @@ -130,7 +130,7 @@ func Fold(c rune) rune { func FullFolding(c rune) (rune, []uint16) { result := c - props := ucase.trie.Get16(c) + props := trie().Get16(c) if !hasException(props) { if isUpperOrTitle(props) { @@ -222,7 +222,7 @@ func getDelta(props uint16) rune { } func getExceptions(props uint16) []uint16 { - return ucase.exceptions[props>>4:] + return exceptions()[props>>4:] } func hasSlot(flags uint16, idx int32) bool { diff --git a/go/mysql/icuregex/internal/ucase/loader.go b/go/mysql/icuregex/internal/ucase/loader.go new file mode 100644 index 00000000000..83a6b6c59a7 --- /dev/null +++ b/go/mysql/icuregex/internal/ucase/loader.go @@ -0,0 +1,110 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package ucase + +import ( + "errors" + "sync" + + "vitess.io/vitess/go/mysql/icuregex/internal/icudata" + "vitess.io/vitess/go/mysql/icuregex/internal/udata" + "vitess.io/vitess/go/mysql/icuregex/internal/utrie" +) + +var ucaseOnce sync.Once +var ucase struct { + trie *utrie.UTrie2 + exceptions []uint16 + unfold []uint16 +} + +func trie() *utrie.UTrie2 { + loadUCase() + return ucase.trie +} + +func exceptions() []uint16 { + loadUCase() + return ucase.exceptions +} + +func unfold() []uint16 { + loadUCase() + return ucase.unfold +} + +func loadUCase() { + ucaseOnce.Do(func() { + b := udata.NewBytes(icudata.UCase) + if err := readData(b); err != nil { + panic(err) + } + }) +} + +func readData(bytes *udata.Bytes) error { + err := bytes.ReadHeader(func(info *udata.DataInfo) bool { + return info.DataFormat[0] == 0x63 && + info.DataFormat[1] == 0x41 && + info.DataFormat[2] == 0x53 && + info.DataFormat[3] == 0x45 && + info.FormatVersion[0] == 4 + }) + if err != nil { + return err + } + + count := int32(bytes.Uint32()) + if count < ixTop { + return errors.New("indexes[0] too small in ucase.icu") + } + + indexes := make([]int32, count) + indexes[0] = count + + for i := int32(1); i < count; i++ { + indexes[i] = int32(bytes.Uint32()) + } + + ucase.trie, err = utrie.UTrie2FromBytes(bytes) + if err != nil { + return err + } + + expectedTrieLength := indexes[ixTrieSize] + trieLength := ucase.trie.SerializedLength() + + if trieLength > expectedTrieLength { + return errors.New("ucase.icu: not enough bytes for the trie") + } + + bytes.Skip(expectedTrieLength - trieLength) + + if n := indexes[ixExcLength]; n > 0 { + ucase.exceptions = bytes.Uint16Slice(n) + } + if n := indexes[ixUnfoldLength]; n > 0 { + ucase.unfold = bytes.Uint16Slice(n) + } + + return nil +} diff --git a/go/mysql/icuregex/internal/ucase/ucase.go b/go/mysql/icuregex/internal/ucase/ucase.go index 9fb8407ea66..33fac0a5cce 100644 --- a/go/mysql/icuregex/internal/ucase/ucase.go +++ 
b/go/mysql/icuregex/internal/ucase/ucase.go @@ -22,20 +22,9 @@ limitations under the License. package ucase import ( - "errors" - - "vitess.io/vitess/go/mysql/icuregex/internal/icudata" - "vitess.io/vitess/go/mysql/icuregex/internal/udata" "vitess.io/vitess/go/mysql/icuregex/internal/utf16" - "vitess.io/vitess/go/mysql/icuregex/internal/utrie" ) -var ucase struct { - trie *utrie.UTrie2 - exceptions []uint16 - unfold []uint16 -} - const ( ixIndexTop = 0 ixLength = 1 @@ -46,68 +35,13 @@ const ( ixTop = 16 ) -func readData(bytes *udata.Bytes) error { - err := bytes.ReadHeader(func(info *udata.DataInfo) bool { - return info.DataFormat[0] == 0x63 && - info.DataFormat[1] == 0x41 && - info.DataFormat[2] == 0x53 && - info.DataFormat[3] == 0x45 && - info.FormatVersion[0] == 4 - }) - if err != nil { - return err - } - - count := int32(bytes.Uint32()) - if count < ixTop { - return errors.New("indexes[0] too small in ucase.icu") - } - - indexes := make([]int32, count) - indexes[0] = count - - for i := int32(1); i < count; i++ { - indexes[i] = int32(bytes.Uint32()) - } - - ucase.trie, err = utrie.UTrie2FromBytes(bytes) - if err != nil { - return err - } - - expectedTrieLength := indexes[ixTrieSize] - trieLength := ucase.trie.SerializedLength() - - if trieLength > expectedTrieLength { - return errors.New("ucase.icu: not enough bytes for the trie") - } - - bytes.Skip(expectedTrieLength - trieLength) - - if n := indexes[ixExcLength]; n > 0 { - ucase.exceptions = bytes.Uint16Slice(n) - } - if n := indexes[ixUnfoldLength]; n > 0 { - ucase.unfold = bytes.Uint16Slice(n) - } - - return nil -} - -func init() { - b := udata.NewBytes(icudata.UCase) - if err := readData(b); err != nil { - panic(err) - } -} - type propertySet interface { AddRune(ch rune) } func AddPropertyStarts(sa propertySet) { /* add the start code point of each same-value range of the trie */ - ucase.trie.Enum(nil, func(start, _ rune, _ uint32) bool { + trie().Enum(nil, func(start, _ rune, _ uint32) bool { 
sa.AddRune(start) return true }) @@ -162,7 +96,7 @@ func AddCaseClosure(c rune, sa propertySet) { break } - props := ucase.trie.Get16(c) + props := trie().Get16(c) if !hasException(props) { if getPropsType(props) != None { /* add the one simple case mapping, no matter what type it is */ @@ -267,7 +201,7 @@ func IsSoftDotted(c rune) bool { /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */ func getDotType(c rune) int32 { - props := ucase.trie.Get16(c) + props := trie().Get16(c) if !hasException(props) { return int32(props & dotMask) } @@ -276,7 +210,7 @@ func getDotType(c rune) int32 { } func IsCaseSensitive(c rune) bool { - props := ucase.trie.Get16(c) + props := trie().Get16(c) if !hasException(props) { return (props & sensitive) != 0 } @@ -287,7 +221,7 @@ func IsCaseSensitive(c rune) bool { func ToFullLower(c rune) rune { // The sign of the result has meaning, input must be non-negative so that it can be returned as is. result := c - props := ucase.trie.Get16(c) + props := trie().Get16(c) if !hasException(props) { if isUpperOrTitle(props) { result = c + getDelta(props) @@ -340,7 +274,7 @@ func ToFullTitle(c rune) rune { func toUpperOrTitle(c rune, upperNotTitle bool) rune { result := c - props := ucase.trie.Get16(c) + props := trie().Get16(c) if !hasException(props) { if getPropsType(props) == Lower { result = c + getDelta(props) @@ -400,7 +334,7 @@ func toUpperOrTitle(c rune, upperNotTitle bool) rune { } func GetTypeOrIgnorable(c rune) int32 { - props := ucase.trie.Get16(c) + props := trie().Get16(c) return int32(props & 7) } @@ -416,7 +350,7 @@ const ( const typeMask = 3 func GetType(c rune) Type { - props := ucase.trie.Get16(c) + props := trie().Get16(c) return getPropsType(props) } diff --git a/go/mysql/icuregex/internal/uchar/loader.go b/go/mysql/icuregex/internal/uchar/loader.go new file mode 100644 index 00000000000..fab54f85e0a --- /dev/null +++ b/go/mysql/icuregex/internal/uchar/loader.go @@ -0,0 +1,139 @@ +/* +© 2016 and 
later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package uchar + +import ( + "errors" + "sync" + + "vitess.io/vitess/go/mysql/icuregex/internal/icudata" + "vitess.io/vitess/go/mysql/icuregex/internal/udata" + "vitess.io/vitess/go/mysql/icuregex/internal/utrie" +) + +var upropsOnce sync.Once +var uprops struct { + trie *utrie.UTrie2 + trie2 *utrie.UTrie2 + vectorsColumns int32 + vectors []uint32 + scriptExtensions []uint16 +} + +func trie() *utrie.UTrie2 { + loadUProps() + return uprops.trie +} + +func trie2() *utrie.UTrie2 { + loadUProps() + return uprops.trie2 +} + +func vectorsColumns() int32 { + loadUProps() + return uprops.vectorsColumns +} + +func vectors() []uint32 { + loadUProps() + return uprops.vectors +} + +func scriptExtensions() []uint16 { + loadUProps() + return uprops.scriptExtensions +} + +func loadUProps() { + upropsOnce.Do(func() { + b := udata.NewBytes(icudata.UProps) + if err := readData(b); err != nil { + panic(err) + } + }) +} + +func readData(bytes *udata.Bytes) error { + err := bytes.ReadHeader(func(info *udata.DataInfo) bool { + return info.DataFormat[0] == 0x55 && + info.DataFormat[1] == 0x50 && + info.DataFormat[2] == 0x72 && + info.DataFormat[3] == 
0x6f && + info.FormatVersion[0] == 7 + }) + if err != nil { + return err + } + + propertyOffset := bytes.Int32() + /* exceptionOffset = */ bytes.Int32() + /* caseOffset = */ bytes.Int32() + additionalOffset := bytes.Int32() + additionalVectorsOffset := bytes.Int32() + uprops.vectorsColumns = bytes.Int32() + scriptExtensionsOffset := bytes.Int32() + reservedOffset7 := bytes.Int32() + /* reservedOffset8 = */ bytes.Int32() + /* dataTopOffset = */ bytes.Int32() + _ = bytes.Int32() + _ = bytes.Int32() + bytes.Skip((16 - 12) << 2) + + uprops.trie, err = utrie.UTrie2FromBytes(bytes) + if err != nil { + return err + } + + expectedTrieLength := (propertyOffset - 16) * 4 + trieLength := uprops.trie.SerializedLength() + + if trieLength > expectedTrieLength { + return errors.New("uprops.icu: not enough bytes for the trie") + } + + bytes.Skip(expectedTrieLength - trieLength) + bytes.Skip((additionalOffset - propertyOffset) * 4) + + if uprops.vectorsColumns > 0 { + uprops.trie2, err = utrie.UTrie2FromBytes(bytes) + if err != nil { + return err + } + + expectedTrieLength = (additionalVectorsOffset - additionalOffset) * 4 + trieLength = uprops.trie2.SerializedLength() + + if trieLength > expectedTrieLength { + return errors.New("uprops.icu: not enough bytes for the trie") + } + + bytes.Skip(expectedTrieLength - trieLength) + uprops.vectors = bytes.Uint32Slice(scriptExtensionsOffset - additionalVectorsOffset) + } + + if n := (reservedOffset7 - scriptExtensionsOffset) * 2; n > 0 { + uprops.scriptExtensions = bytes.Uint16Slice(n) + } + + return nil +} diff --git a/go/mysql/icuregex/internal/uchar/uchar.go b/go/mysql/icuregex/internal/uchar/uchar.go index a2c758ea1c0..1f01b4691de 100644 --- a/go/mysql/icuregex/internal/uchar/uchar.go +++ b/go/mysql/icuregex/internal/uchar/uchar.go @@ -22,100 +22,15 @@ limitations under the License. 
package uchar import ( - "errors" "strconv" - - "vitess.io/vitess/go/mysql/icuregex/internal/icudata" - "vitess.io/vitess/go/mysql/icuregex/internal/udata" - "vitess.io/vitess/go/mysql/icuregex/internal/utrie" ) -var uprops struct { - trie *utrie.UTrie2 - trie2 *utrie.UTrie2 - vectorsColumns int32 - vectors []uint32 - scriptExtensions []uint16 -} - -func readData(bytes *udata.Bytes) error { - err := bytes.ReadHeader(func(info *udata.DataInfo) bool { - return info.DataFormat[0] == 0x55 && - info.DataFormat[1] == 0x50 && - info.DataFormat[2] == 0x72 && - info.DataFormat[3] == 0x6f && - info.FormatVersion[0] == 7 - }) - if err != nil { - return err - } - - propertyOffset := bytes.Int32() - /* exceptionOffset = */ bytes.Int32() - /* caseOffset = */ bytes.Int32() - additionalOffset := bytes.Int32() - additionalVectorsOffset := bytes.Int32() - uprops.vectorsColumns = bytes.Int32() - scriptExtensionsOffset := bytes.Int32() - reservedOffset7 := bytes.Int32() - /* reservedOffset8 = */ bytes.Int32() - /* dataTopOffset = */ bytes.Int32() - _ = bytes.Int32() - _ = bytes.Int32() - bytes.Skip((16 - 12) << 2) - - uprops.trie, err = utrie.UTrie2FromBytes(bytes) - if err != nil { - return err - } - - expectedTrieLength := (propertyOffset - 16) * 4 - trieLength := uprops.trie.SerializedLength() - - if trieLength > expectedTrieLength { - return errors.New("ucase.icu: not enough bytes for the trie") - } - - bytes.Skip(expectedTrieLength - trieLength) - bytes.Skip((additionalOffset - propertyOffset) * 4) - - if uprops.vectorsColumns > 0 { - uprops.trie2, err = utrie.UTrie2FromBytes(bytes) - if err != nil { - return err - } - - expectedTrieLength = (additionalVectorsOffset - additionalOffset) * 4 - trieLength = uprops.trie2.SerializedLength() - - if trieLength > expectedTrieLength { - return errors.New("ucase.icu: not enough bytes for the trie") - } - - bytes.Skip(expectedTrieLength - trieLength) - uprops.vectors = bytes.Uint32Slice(scriptExtensionsOffset - additionalVectorsOffset) - } 
- - if n := (reservedOffset7 - scriptExtensionsOffset) * 2; n > 0 { - uprops.scriptExtensions = bytes.Uint16Slice(n) - } - - return nil -} - -func init() { - b := udata.NewBytes(icudata.UProps) - if err := readData(b); err != nil { - panic(err) - } -} - type PropertySet interface { AddRune(ch rune) } func VecAddPropertyStarts(sa PropertySet) { - uprops.trie2.Enum(nil, func(start, _ rune, _ uint32) bool { + trie2().Enum(nil, func(start, _ rune, _ uint32) bool { sa.AddRune(start) return true }) @@ -139,7 +54,7 @@ const ( func AddPropertyStarts(sa PropertySet) { /* add the start code point of each same-value range of the main trie */ - uprops.trie.Enum(nil, func(start, _ rune, _ uint32) bool { + trie().Enum(nil, func(start, _ rune, _ uint32) bool { sa.AddRune(start) return true }) @@ -205,32 +120,28 @@ func AddPropertyStarts(sa PropertySet) { } func CharType(c rune) Category { - props := uprops.trie.Get16(c) + props := trie().Get16(c) return getCategory(props) } -func GetProperties(c rune) uint16 { - return uprops.trie.Get16(c) -} - func getCategory(props uint16) Category { return Category(props & 0x1f) } func GetUnicodeProperties(c rune, column int) uint32 { - if column >= int(uprops.vectorsColumns) { + if column >= int(vectorsColumns()) { return 0 } - vecIndex := uprops.trie2.Get16(c) - return uprops.vectors[int(vecIndex)+column] + vecIndex := trie2().Get16(c) + return vectors()[int(vecIndex)+column] } func ScriptExtension(idx uint32) uint16 { - return uprops.scriptExtensions[idx] + return scriptExtensions()[idx] } func ScriptExtensions(idx uint32) []uint16 { - return uprops.scriptExtensions[idx:] + return scriptExtensions()[idx:] } func IsDigit(c rune) bool { @@ -242,7 +153,7 @@ func IsPOSIXPrint(c rune) bool { } func IsGraphPOSIX(c rune) bool { - props := uprops.trie.Get16(c) + props := trie().Get16(c) /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */ /* comparing ==0 returns FALSE for the categories mentioned */ return 
uMask(getCategory(props))&(GcCcMask|GcCsMask|GcCnMask|GcZMask) == 0 @@ -321,7 +232,7 @@ whitespace: const upropsNumericTypeValueShift = 6 func NumericTypeValue(c rune) uint16 { - props := uprops.trie.Get16(c) + props := trie().Get16(c) return props >> upropsNumericTypeValueShift } diff --git a/go/mysql/icuregex/internal/unames/loader.go b/go/mysql/icuregex/internal/unames/loader.go new file mode 100644 index 00000000000..296670b1c66 --- /dev/null +++ b/go/mysql/icuregex/internal/unames/loader.go @@ -0,0 +1,90 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package unames + +import ( + "sync" + + "vitess.io/vitess/go/mysql/icuregex/internal/icudata" + "vitess.io/vitess/go/mysql/icuregex/internal/udata" +) + +var charNamesOnce sync.Once +var charNames *unames + +type unames struct { + tokens []uint16 + tokenStrings []uint8 + groups []uint16 + groupNames []uint8 + algNames []algorithmicRange +} + +func loadCharNames() { + charNamesOnce.Do(func() { + b := udata.NewBytes(icudata.UNames) + if err := b.ReadHeader(func(info *udata.DataInfo) bool { + return info.Size >= 20 && + info.IsBigEndian == 0 && + info.CharsetFamily == 0 && + info.DataFormat[0] == 0x75 && /* dataFormat="unam" */ + info.DataFormat[1] == 0x6e && + info.DataFormat[2] == 0x61 && + info.DataFormat[3] == 0x6d && + info.FormatVersion[0] == 1 + }); err != nil { + panic(err) + } + + tokenStringOffset := int32(b.Uint32() - 16) + groupsOffset := int32(b.Uint32() - 16) + groupStringOffset := int32(b.Uint32() - 16) + algNamesOffset := int32(b.Uint32() - 16) + charNames = &unames{ + tokens: b.Uint16Slice(tokenStringOffset / 2), + tokenStrings: b.Uint8Slice(groupsOffset - tokenStringOffset), + groups: b.Uint16Slice((groupStringOffset - groupsOffset) / 2), + groupNames: b.Uint8Slice(algNamesOffset - groupStringOffset), + } + + algCount := b.Uint32() + charNames.algNames = make([]algorithmicRange, 0, algCount) + + for i := uint32(0); i < algCount; i++ { + ar := algorithmicRange{ + start: b.Uint32(), + end: b.Uint32(), + typ: b.Uint8(), + variant: b.Uint8(), + } + size := b.Uint16() + switch ar.typ { + case 0: + ar.s = b.Uint8Slice(int32(size) - 12) + case 1: + ar.factors = b.Uint16Slice(int32(ar.variant)) + ar.s = b.Uint8Slice(int32(size) - 12 - int32(ar.variant)*2) + } + charNames.algNames = append(charNames.algNames, ar) + } + }) +} diff --git a/go/mysql/icuregex/internal/unames/unames.go b/go/mysql/icuregex/internal/unames/unames.go index 45920be8292..66e8ba15615 100644 --- a/go/mysql/icuregex/internal/unames/unames.go +++ 
b/go/mysql/icuregex/internal/unames/unames.go @@ -25,73 +25,8 @@ import ( "bytes" "strconv" "strings" - "sync" - - "vitess.io/vitess/go/mysql/icuregex/internal/icudata" - "vitess.io/vitess/go/mysql/icuregex/internal/udata" ) -var charNamesOnce sync.Once -var charNames *unames - -type unames struct { - tokens []uint16 - tokenStrings []uint8 - groups []uint16 - groupNames []uint8 - algNames []algorithmicRange -} - -func loadCharNames() { - charNamesOnce.Do(func() { - b := udata.NewBytes(icudata.UNames) - if err := b.ReadHeader(func(info *udata.DataInfo) bool { - return info.Size >= 20 && - info.IsBigEndian == 0 && - info.CharsetFamily == 0 && - info.DataFormat[0] == 0x75 && /* dataFormat="unam" */ - info.DataFormat[1] == 0x6e && - info.DataFormat[2] == 0x61 && - info.DataFormat[3] == 0x6d && - info.FormatVersion[0] == 1 - }); err != nil { - panic(err) - } - - tokenStringOffset := int32(b.Uint32() - 16) - groupsOffset := int32(b.Uint32() - 16) - groupStringOffset := int32(b.Uint32() - 16) - algNamesOffset := int32(b.Uint32() - 16) - charNames = &unames{ - tokens: b.Uint16Slice(tokenStringOffset / 2), - tokenStrings: b.Uint8Slice(groupsOffset - tokenStringOffset), - groups: b.Uint16Slice((groupStringOffset - groupsOffset) / 2), - groupNames: b.Uint8Slice(algNamesOffset - groupStringOffset), - } - - algCount := b.Uint32() - charNames.algNames = make([]algorithmicRange, 0, algCount) - - for i := uint32(0); i < algCount; i++ { - ar := algorithmicRange{ - start: b.Uint32(), - end: b.Uint32(), - typ: b.Uint8(), - variant: b.Uint8(), - } - size := b.Uint16() - switch ar.typ { - case 0: - ar.s = b.Uint8Slice(int32(size) - 12) - case 1: - ar.factors = b.Uint16Slice(int32(ar.variant)) - ar.s = b.Uint8Slice(int32(size) - 12 - int32(ar.variant)*2) - } - charNames.algNames = append(charNames.algNames, ar) - } - }) -} - func (names *unames) getGroupName(group []uint16) []uint8 { return names.groupNames[names.getGroupOffset(group):] } diff --git 
a/go/mysql/icuregex/internal/uprops/loader.go b/go/mysql/icuregex/internal/uprops/loader.go new file mode 100644 index 00000000000..3be142a6f8a --- /dev/null +++ b/go/mysql/icuregex/internal/uprops/loader.go @@ -0,0 +1,93 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package uprops + +import ( + "fmt" + "sync" + + "vitess.io/vitess/go/mysql/icuregex/internal/icudata" + "vitess.io/vitess/go/mysql/icuregex/internal/udata" +) + +var pnamesOnce sync.Once +var pnames struct { + valueMaps []uint32 + byteTrie []uint8 +} + +func valueMaps() []uint32 { + loadPNames() + return pnames.valueMaps +} + +func byteTrie() []uint8 { + loadPNames() + return pnames.byteTrie +} + +func loadPNames() { + pnamesOnce.Do(func() { + b := udata.NewBytes(icudata.PNames) + if err := readData(b); err != nil { + panic(err) + } + }) +} + +func readData(bytes *udata.Bytes) error { + err := bytes.ReadHeader(func(info *udata.DataInfo) bool { + return info.DataFormat[0] == 0x70 && + info.DataFormat[1] == 0x6e && + info.DataFormat[2] == 0x61 && + info.DataFormat[3] == 0x6d && + info.FormatVersion[0] == 2 + }) + if err != nil { + return err + } + + count := bytes.Int32() / 4 + if count < 8 { + return fmt.Errorf("indexes[0] too small in pnames.icu") + } + + indexes := make([]int32, count) + indexes[0] = count * 4 + + for i := int32(1); i < count; i++ { + indexes[i] = bytes.Int32() + } + + offset := indexes[ixValueMapsOffset] + nextOffset := indexes[ixByteTriesOffset] + numInts := (nextOffset - offset) / 4 + + pnames.valueMaps = bytes.Uint32Slice(numInts) + + offset = nextOffset + nextOffset = indexes[ixNameGroupsOffset] + numBytes := nextOffset - offset + + pnames.byteTrie = bytes.Uint8Slice(numBytes) + return nil +} diff --git a/go/mysql/icuregex/internal/uprops/uprops.go b/go/mysql/icuregex/internal/uprops/uprops.go index ddf0989b5d8..387eebc0239 100644 --- a/go/mysql/icuregex/internal/uprops/uprops.go +++ b/go/mysql/icuregex/internal/uprops/uprops.go @@ -22,19 +22,10 @@ limitations under the License. 
package uprops import ( - "fmt" - "vitess.io/vitess/go/mysql/icuregex/internal/bytestrie" - "vitess.io/vitess/go/mysql/icuregex/internal/icudata" "vitess.io/vitess/go/mysql/icuregex/internal/uchar" - "vitess.io/vitess/go/mysql/icuregex/internal/udata" ) -var pnames struct { - valueMaps []uint32 - byteTrie []uint8 -} - const ( ixValueMapsOffset = 0 ixByteTriesOffset = 1 @@ -42,51 +33,6 @@ const ( ixReserved3Offset = 3 ) -func readData(bytes *udata.Bytes) error { - err := bytes.ReadHeader(func(info *udata.DataInfo) bool { - return info.DataFormat[0] == 0x70 && - info.DataFormat[1] == 0x6e && - info.DataFormat[2] == 0x61 && - info.DataFormat[3] == 0x6d && - info.FormatVersion[0] == 2 - }) - if err != nil { - return err - } - - count := bytes.Int32() / 4 - if count < 8 { - return fmt.Errorf("indexes[0] too small in ucase.icu") - } - - indexes := make([]int32, count) - indexes[0] = count * 4 - - for i := int32(1); i < count; i++ { - indexes[i] = bytes.Int32() - } - - offset := indexes[ixValueMapsOffset] - nextOffset := indexes[ixByteTriesOffset] - numInts := (nextOffset - offset) / 4 - - pnames.valueMaps = bytes.Uint32Slice(numInts) - - offset = nextOffset - nextOffset = indexes[ixNameGroupsOffset] - numBytes := nextOffset - offset - - pnames.byteTrie = bytes.Uint8Slice(numBytes) - return nil -} - -func init() { - b := udata.NewBytes(icudata.PNames) - if err := readData(b); err != nil { - panic(err) - } -} - func (prop Property) source() propertySource { if prop < UCharBinaryStart { return srcNone /* undefined */ @@ -158,20 +104,22 @@ func getPropertyValueEnum(prop Property, alias string) int32 { return -1 } - valueMapIdx = int32(pnames.valueMaps[valueMapIdx+1]) + valueMps := valueMaps() + valueMapIdx = int32(valueMps[valueMapIdx+1]) if valueMapIdx == 0 { return -1 } // valueMapIndex is the start of the property's valueMap, // where the first word is the BytesTrie offset. 
- return getPropertyOrValueEnum(int32(pnames.valueMaps[valueMapIdx]), alias) + return getPropertyOrValueEnum(int32(valueMps[valueMapIdx]), alias) } func findProperty(prop Property) int32 { var i = int32(1) - for numRanges := int32(pnames.valueMaps[0]); numRanges > 0; numRanges-- { - start := int32(pnames.valueMaps[i]) - limit := int32(pnames.valueMaps[i+1]) + valueMps := valueMaps() + for numRanges := int32(valueMps[0]); numRanges > 0; numRanges-- { + start := int32(valueMps[i]) + limit := int32(valueMps[i+1]) i += 2 if int32(prop) < start { break @@ -185,7 +133,7 @@ func findProperty(prop Property) int32 { } func getPropertyOrValueEnum(offset int32, alias string) int32 { - trie := bytestrie.New(pnames.byteTrie[offset:]) + trie := bytestrie.New(byteTrie()[offset:]) if trie.ContainsName(alias) { return trie.GetValue() }