Skip to content

Commit 936382f

Browse files
committed
ICU collation for strings #17.
Implemented using the "golang.org/x/text/collate" package. A new configuration API, SetTextCollator(), is exposed to set a collator object, which is responsible for compiling strings into sort keys. Note that the SetTextCollator() API is not yet stable. Another caveat of using golang.org/x/text/collate is that, as far as I know, a sort key cannot be converted back into the original string.
1 parent 84a4f87 commit 936382f

File tree

5 files changed

+166
-4
lines changed

5 files changed

+166
-4
lines changed

cbor_json.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33

44
package gson
55

6-
import "strconv"
76
import "math"
7+
import "strconv"
88
import "encoding/binary"
99

1010
var nullBin = []byte("null")

collate.go

+4
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ import "encoding/json"
44
import "math/big"
55
import "bytes"
66

7+
import "golang.org/x/text/collate"
8+
79
// Collation order for supported types. Applications desiring different
810
// ordering between types can initialize these byte values before
911
// instantiating a config object.
@@ -34,6 +36,8 @@ type collateConfig struct {
3436
enc *json.Encoder
3537
buf *bytes.Buffer
3638
zf *big.Float
39+
tcltbuffer *collate.Buffer
40+
textcollator *collate.Collator
3741
}
3842

3943
// Collate abstraction for value encoded into binary-collation.

collate_test.go

+140
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,14 @@
11
package gson
22

3+
import "sort"
34
import "bytes"
5+
import "strings"
46
import "testing"
57
import "reflect"
68

9+
import "golang.org/x/text/collate"
10+
import "golang.org/x/text/language"
11+
712
func TestCollateReset(t *testing.T) {
813
config := NewDefaultConfig()
914
clt := config.NewCollate(make([]byte, 0, 1024))
@@ -49,6 +54,117 @@ func TestCollateEmpty(t *testing.T) {
4954
}()
5055
}
5156

57+
func TestAlternateSortTypes(t *testing.T) {
58+
testCases := []struct {
59+
lang string
60+
in testtxtclts
61+
want []string
62+
}{{
63+
lang: "zh,cmn,zh-Hant-u-co-pinyin,zh-HK-u-co-pinyin,zh-pinyin",
64+
in: testtxtclts{
65+
&testtxtclt{in: "爸爸"}, &testtxtclt{in: "妈妈"},
66+
&testtxtclt{in: "儿子"}, &testtxtclt{in: "女儿"},
67+
},
68+
want: []string{"爸爸", "儿子", "妈妈", "女儿"},
69+
}, {
70+
lang: "zh-Hant,zh-u-co-stroke,zh-Hant-u-co-stroke",
71+
in: testtxtclts{
72+
&testtxtclt{in: "爸爸"}, &testtxtclt{in: "妈妈"},
73+
&testtxtclt{in: "儿子"}, &testtxtclt{in: "女儿"},
74+
},
75+
want: []string{"儿子", "女儿", "妈妈", "爸爸"},
76+
}}
77+
78+
for _, tc := range testCases {
79+
for _, tag := range strings.Split(tc.lang, ",") {
80+
collator := collate.New(language.MustParse(tag))
81+
config := NewDefaultConfig().SetTextCollator(collator)
82+
for _, item := range tc.in {
83+
item.collate(config)
84+
}
85+
sort.Sort(tc.in)
86+
got := []string{}
87+
for _, item := range tc.in {
88+
got = append(got, item.in)
89+
}
90+
if !reflect.DeepEqual(got, tc.want) {
91+
t.Errorf("%v %v expected %v; got %v", tag, tc.in, tc.want, got)
92+
}
93+
}
94+
}
95+
}
96+
97+
func TestTextNocase(t *testing.T) {
98+
testCases := []struct {
99+
lang string
100+
in testtxtclts
101+
want []string
102+
}{{
103+
lang: "en",
104+
in: testtxtclts{
105+
&testtxtclt{in: "B"}, &testtxtclt{in: "b"},
106+
&testtxtclt{in: "a"}, &testtxtclt{in: "A"},
107+
},
108+
want: []string{"a", "A", "B", "b"},
109+
}}
110+
111+
for _, tc := range testCases {
112+
for _, tag := range strings.Split(tc.lang, ",") {
113+
collator := collate.New(language.MustParse(tag))
114+
config := NewDefaultConfig().SetTextCollator(collator)
115+
for _, item := range tc.in {
116+
item.collate(config)
117+
}
118+
sort.Sort(tc.in)
119+
got := []string{}
120+
for _, item := range tc.in {
121+
got = append(got, item.in)
122+
}
123+
if !reflect.DeepEqual(got, tc.want) {
124+
t.Errorf("%v %v expected %v; got %v", tag, tc.in, tc.want, got)
125+
}
126+
}
127+
}
128+
}
129+
130+
func TestTextGermanSwedish(t *testing.T) {
131+
testCases := []struct {
132+
lang string
133+
in testtxtclts
134+
want []string
135+
}{{
136+
lang: "de",
137+
in: testtxtclts{
138+
&testtxtclt{in: "a"}, &testtxtclt{in: "z"}, &testtxtclt{in: "ä"},
139+
},
140+
want: []string{"a", "ä", "z"},
141+
}, {
142+
lang: "sv",
143+
in: testtxtclts{
144+
&testtxtclt{in: "a"}, &testtxtclt{in: "z"}, &testtxtclt{in: "ä"},
145+
},
146+
want: []string{"a", "z", "ä"},
147+
}}
148+
149+
for _, tc := range testCases {
150+
for _, tag := range strings.Split(tc.lang, ",") {
151+
collator := collate.New(language.MustParse(tag))
152+
config := NewDefaultConfig().SetTextCollator(collator)
153+
for _, item := range tc.in {
154+
item.collate(config)
155+
}
156+
sort.Sort(tc.in)
157+
got := []string{}
158+
for _, item := range tc.in {
159+
got = append(got, item.in)
160+
}
161+
if !reflect.DeepEqual(got, tc.want) {
162+
t.Errorf("%v %v expected %v; got %v", tag, tc.in, tc.want, got)
163+
}
164+
}
165+
}
166+
}
167+
52168
// sort type for slice of []byte
53169

54170
type ByteSlices [][]byte
@@ -64,3 +180,27 @@ func (bs ByteSlices) Less(i, j int) bool {
64180
func (bs ByteSlices) Swap(i, j int) {
65181
bs[i], bs[j] = bs[j], bs[i]
66182
}
183+
184+
// testtxtclt pairs a test string with its binary-collated encoding, so that
// strings can be ordered by comparing the encoded bytes.
type testtxtclt struct {
	in  string // text to be collated
	clt []byte // collated form of in; populated by the collate method
}
188+
189+
func (item *testtxtclt) collate(config *Config) {
190+
val := config.NewValue(item.in)
191+
item.clt = val.Tocollate(config.NewCollate(nil)).Bytes()
192+
}
193+
194+
type testtxtclts []*testtxtclt
195+
196+
func (items testtxtclts) Len() int {
197+
return len(items)
198+
}
199+
200+
func (items testtxtclts) Less(i, j int) bool {
201+
return bytes.Compare(items[i].clt, items[j].clt) < 0
202+
}
203+
204+
func (items testtxtclts) Swap(i, j int) {
205+
items[i], items[j] = items[j], items[i]
206+
}

config.go

+13-2
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ import "fmt"
55
import "math/big"
66
import "encoding/json"
77

8+
import "golang.org/x/text/collate"
9+
810
// NumberKind how to treat numbers.
911
type NumberKind byte
1012

@@ -72,12 +74,15 @@ func NewDefaultConfig() *Config {
7274
}
7375

7476
func (config *Config) init() *Config {
77+
// collateConfig
7578
config.buf = bytes.NewBuffer(make([]byte, 0, 1024)) // start with 1K
7679
config.enc = json.NewEncoder(config.buf)
77-
a, b, c, d := config.strlen, config.numkeys, config.itemlen, config.ptrlen
78-
config.pools = newMempool(a, b, c, d)
7980
config.zf = big.NewFloat(0)
8081
config.zf.SetPrec(64)
82+
config.tcltbuffer = &collate.Buffer{}
83+
// mempools
84+
a, b, c, d := config.strlen, config.numkeys, config.itemlen, config.ptrlen
85+
config.pools = newMempool(a, b, c, d)
8186
return config
8287
}
8388

@@ -135,6 +140,12 @@ func (config Config) SetMaxkeys(n int) *Config {
135140
return config.init()
136141
}
137142

143+
// SetTextCollator for string type.
144+
func (config Config) SetTextCollator(collator *collate.Collator) *Config {
145+
config.textcollator = collator
146+
return &config
147+
}
148+
138149
// ResetPools configure a new set of pools with specified size, instead
139150
// of using the default size: MaxStringLen, MaxKeys, MaxCollateLen, and,
140151
// MaxJsonpointerLen.

util.go

+8-1
Original file line numberDiff line numberDiff line change
@@ -186,9 +186,16 @@ func collateString(str string, code []byte, config *Config) (n int) {
186186
code[0], code[1] = TypeMissing, Terminator
187187
return 2
188188
}
189+
strcode := str2bytes(str)
190+
if config.textcollator != nil {
191+
config.tcltbuffer.Reset()
192+
strcode = config.textcollator.Key(config.tcltbuffer, strcode)
193+
strcode = strcode[:len(strcode)-1] // return text is null terminated
194+
}
195+
189196
code[n] = TypeString
190197
n++
191-
n += suffixEncodeString(str2bytes(str), code[n:])
198+
n += suffixEncodeString(strcode, code[n:])
192199
code[n] = Terminator
193200
n++
194201
return n

0 commit comments

Comments
 (0)