1
2
3
4
5
6
7 package main
8
9
16 import "C"
17 import (
18 "fmt"
19 "log"
20 "unicode/utf16"
21 "unicode/utf8"
22 "unsafe"
23 )
24
25 func init() {
26 AddFactory(CollatorFactory{"icu", newUTF16,
27 "Main ICU collator, using native strings."})
28 AddFactory(CollatorFactory{"icu8", newUTF8iter,
29 "ICU collator using ICU iterators to process UTF8."})
30 AddFactory(CollatorFactory{"icu16", newUTF8conv,
31 "ICU collation by first converting UTF8 to UTF16."})
32 }
33
34 func icuCharP(s []byte) *C.char {
35 return (*C.char)(unsafe.Pointer(&s[0]))
36 }
37
38 func icuUInt8P(s []byte) *C.uint8_t {
39 return (*C.uint8_t)(unsafe.Pointer(&s[0]))
40 }
41
42 func icuUCharP(s []uint16) *C.UChar {
43 return (*C.UChar)(unsafe.Pointer(&s[0]))
44 }
45 func icuULen(s []uint16) C.int32_t {
46 return C.int32_t(len(s))
47 }
48 func icuSLen(s []byte) C.int32_t {
49 return C.int32_t(len(s))
50 }
51
52
// icuCollator holds the state shared by all ICU-backed collators in this
// file: the open ICU collator and the buffer that sort keys are written into.
type icuCollator struct {
	// loc is the C copy of the locale name, created with C.CString in init
	// and released with C.free in Close.
	loc *C.char
	// col is the collator handle returned by ucol_open and released by
	// ucol_close in Close.
	col *C.UCollator
	// keyBuf is the backing storage for sort keys. Its length marks the part
	// already handed out; the space up to its capacity is still available.
	keyBuf []byte
}
58
// growBufSize is the capacity, in bytes (10 MiB), of each sort-key buffer
// allocated by icuCollator.
const growBufSize = 10 << 20
60
61 func (c *icuCollator) init(locale string) error {
62 err := C.UErrorCode(0)
63 c.loc = C.CString(locale)
64 c.col = C.ucol_open(c.loc, &err)
65 if err > 0 {
66 return fmt.Errorf("failed opening collator for %q", locale)
67 } else if err < 0 {
68 loc := C.ucol_getLocaleByType(c.col, 0, &err)
69 fmt, ok := map[int]string{
70 -127: "warning: using default collator: %s",
71 -128: "warning: using fallback collator: %s",
72 }[int(err)]
73 if ok {
74 log.Printf(fmt, C.GoString(loc))
75 }
76 }
77 c.keyBuf = make([]byte, 0, growBufSize)
78 return nil
79 }
80
81 func (c *icuCollator) buf() (*C.uint8_t, C.int32_t) {
82 if len(c.keyBuf) == cap(c.keyBuf) {
83 c.keyBuf = make([]byte, 0, growBufSize)
84 }
85 b := c.keyBuf[len(c.keyBuf):cap(c.keyBuf)]
86 return icuUInt8P(b), icuSLen(b)
87 }
88
89 func (c *icuCollator) extendBuf(n C.int32_t) []byte {
90 end := len(c.keyBuf) + int(n)
91 if end > cap(c.keyBuf) {
92 if len(c.keyBuf) == 0 {
93 log.Fatalf("icuCollator: max string size exceeded: %v > %v", n, growBufSize)
94 }
95 c.keyBuf = make([]byte, 0, growBufSize)
96 return nil
97 }
98 b := c.keyBuf[len(c.keyBuf):end]
99 c.keyBuf = c.keyBuf[:end]
100 return b
101 }
102
103 func (c *icuCollator) Close() error {
104 C.ucol_close(c.col)
105 C.free(unsafe.Pointer(c.loc))
106 return nil
107 }
108
109
110 type icuUTF16 struct {
111 icuCollator
112 }
113
114 func newUTF16(locale string) (Collator, error) {
115 c := &icuUTF16{}
116 return c, c.init(locale)
117 }
118
119 func (c *icuUTF16) Compare(a, b Input) int {
120 return int(C.ucol_strcoll(c.col, icuUCharP(a.UTF16), icuULen(a.UTF16), icuUCharP(b.UTF16), icuULen(b.UTF16)))
121 }
122
123 func (c *icuUTF16) Key(s Input) []byte {
124 bp, bn := c.buf()
125 n := C.ucol_getSortKey(c.col, icuUCharP(s.UTF16), icuULen(s.UTF16), bp, bn)
126 if b := c.extendBuf(n); b != nil {
127 return b
128 }
129 return c.Key(s)
130 }
131
132
133
134
135 type icuUTF8iter struct {
136 icuCollator
137 a, b C.UCharIterator
138 }
139
140 func newUTF8iter(locale string) (Collator, error) {
141 c := &icuUTF8iter{}
142 return c, c.init(locale)
143 }
144
145 func (c *icuUTF8iter) Compare(a, b Input) int {
146 err := C.UErrorCode(0)
147 C.uiter_setUTF8(&c.a, icuCharP(a.UTF8), icuSLen(a.UTF8))
148 C.uiter_setUTF8(&c.b, icuCharP(b.UTF8), icuSLen(b.UTF8))
149 return int(C.ucol_strcollIter(c.col, &c.a, &c.b, &err))
150 }
151
152 func (c *icuUTF8iter) Key(s Input) []byte {
153 err := C.UErrorCode(0)
154 state := [2]C.uint32_t{}
155 C.uiter_setUTF8(&c.a, icuCharP(s.UTF8), icuSLen(s.UTF8))
156 bp, bn := c.buf()
157 n := C.ucol_nextSortKeyPart(c.col, &c.a, &(state[0]), bp, bn, &err)
158 if n >= bn {
159
160 if c.extendBuf(n+1) != nil {
161 log.Fatal("expected extension to fail")
162 }
163 return c.Key(s)
164 }
165 return c.extendBuf(n)
166 }
167
168
169
170
171 type icuUTF8conv struct {
172 icuCollator
173 }
174
175 func newUTF8conv(locale string) (Collator, error) {
176 c := &icuUTF8conv{}
177 return c, c.init(locale)
178 }
179
180 func (c *icuUTF8conv) Compare(sa, sb Input) int {
181 a := encodeUTF16(sa.UTF8)
182 b := encodeUTF16(sb.UTF8)
183 return int(C.ucol_strcoll(c.col, icuUCharP(a), icuULen(a), icuUCharP(b), icuULen(b)))
184 }
185
186 func (c *icuUTF8conv) Key(s Input) []byte {
187 a := encodeUTF16(s.UTF8)
188 bp, bn := c.buf()
189 n := C.ucol_getSortKey(c.col, icuUCharP(a), icuULen(a), bp, bn)
190 if b := c.extendBuf(n); b != nil {
191 return b
192 }
193 return c.Key(s)
194 }
195
// encodeUTF16 converts the UTF-8 bytes in b to UTF-16 code units.
//
// Runes outside the Basic Multilingual Plane become a surrogate pair; every
// other rune becomes a single code unit. Each byte that does not start a
// valid UTF-8 sequence is decoded as U+FFFD. The result is never nil, even
// for empty input.
func encodeUTF16(b []byte) []uint16 {
	units := []uint16{}
	for pos := 0; pos < len(b); {
		r, width := utf8.DecodeRune(b[pos:])
		pos += width

		hi, lo := utf16.EncodeRune(r)
		if hi == utf8.RuneError {
			// EncodeRune yields U+FFFD when r needs no surrogate pair, so r
			// itself fits in one code unit.
			units = append(units, uint16(r))
			continue
		}
		units = append(units, uint16(hi), uint16(lo))
	}
	return units
}
210
View as plain text