1
2
3
4
5
6
7
8
9
10 package main
11
12 import (
13 "archive/zip"
14 "bufio"
15 "bytes"
16 "flag"
17 "fmt"
18 "io"
19 "log"
20 "os"
21 "regexp"
22 "sort"
23 "strconv"
24 "strings"
25 "unicode/utf8"
26
27 "golang.org/x/text/collate"
28 "golang.org/x/text/collate/build"
29 "golang.org/x/text/internal/colltab"
30 "golang.org/x/text/internal/gen"
31 "golang.org/x/text/language"
32 "golang.org/x/text/unicode/cldr"
33 )
34
35 var (
36 test = flag.Bool("test", false,
37 "test existing tables; can be used to compare web data with package data.")
38 short = flag.Bool("short", false, `Use "short" alternatives, when available.`)
39 draft = flag.Bool("draft", false, `Use draft versions, when available.`)
40 tags = flag.String("tags", "", "build tags to be included after go:build directive")
41 pkg = flag.String("package", "collate",
42 "the name of the package in which the generated file is to be included")
43
44 tables = flagStringSetAllowAll("tables", "collate", "collate,chars",
45 "comma-spearated list of tables to generate.")
46 exclude = flagStringSet("exclude", "zh2", "",
47 "comma-separated list of languages to exclude.")
48 include = flagStringSet("include", "", "",
49 "comma-separated list of languages to include. Include trumps exclude.")
50
51
52 types = flagStringSetAllowAll("types", "standard,phonebook,phonetic,reformed,pinyin,stroke", "",
53 "comma-separated list of types that should be included.")
54 )
55
56
57
58 type stringSet struct {
59 s []string
60 allowed *stringSet
61 dirty bool
62 all bool
63 allowAll bool
64 }
65
66 func flagStringSet(name, def, allowed, usage string) *stringSet {
67 ss := &stringSet{}
68 if allowed != "" {
69 usage += fmt.Sprintf(" (allowed values: any of %s)", allowed)
70 ss.allowed = &stringSet{}
71 failOnError(ss.allowed.Set(allowed))
72 }
73 ss.Set(def)
74 flag.Var(ss, name, usage)
75 return ss
76 }
77
78 func flagStringSetAllowAll(name, def, allowed, usage string) *stringSet {
79 ss := &stringSet{allowAll: true}
80 if allowed == "" {
81 flag.Var(ss, name, usage+fmt.Sprintf(` Use "all" to select all.`))
82 } else {
83 ss.allowed = &stringSet{}
84 failOnError(ss.allowed.Set(allowed))
85 flag.Var(ss, name, usage+fmt.Sprintf(` (allowed values: "all" or any of %s)`, allowed))
86 }
87 ss.Set(def)
88 return ss
89 }
90
91 func (ss stringSet) Len() int {
92 return len(ss.s)
93 }
94
95 func (ss stringSet) String() string {
96 return strings.Join(ss.s, ",")
97 }
98
99 func (ss *stringSet) Set(s string) error {
100 if ss.allowAll && s == "all" {
101 ss.s = nil
102 ss.all = true
103 return nil
104 }
105 ss.s = ss.s[:0]
106 for _, s := range strings.Split(s, ",") {
107 if s := strings.TrimSpace(s); s != "" {
108 if ss.allowed != nil && !ss.allowed.contains(s) {
109 return fmt.Errorf("unsupported value %q; must be one of %s", s, ss.allowed)
110 }
111 ss.add(s)
112 }
113 }
114 ss.compact()
115 return nil
116 }
117
118 func (ss *stringSet) add(s string) {
119 ss.s = append(ss.s, s)
120 ss.dirty = true
121 }
122
123 func (ss *stringSet) values() []string {
124 ss.compact()
125 return ss.s
126 }
127
128 func (ss *stringSet) contains(s string) bool {
129 if ss.all {
130 return true
131 }
132 for _, v := range ss.s {
133 if v == s {
134 return true
135 }
136 }
137 return false
138 }
139
140 func (ss *stringSet) compact() {
141 if !ss.dirty {
142 return
143 }
144 a := ss.s
145 sort.Strings(a)
146 k := 0
147 for i := 1; i < len(a); i++ {
148 if a[k] != a[i] {
149 a[k+1] = a[i]
150 k++
151 }
152 }
153 ss.s = a[:k+1]
154 ss.dirty = false
155 }
156
157 func skipLang(l string) bool {
158 if include.Len() > 0 {
159 return !include.contains(l)
160 }
161 return exclude.contains(l)
162 }
163
164
165
166
167 func altInclude() []string {
168 l := []string{}
169 if *short {
170 l = append(l, "short")
171 }
172 l = append(l, "")
173
174 if *draft {
175 l = append(l, "proposed")
176 }
177 return l
178 }
179
// failOnError logs e and panics if e is non-nil; a nil error is a no-op.
func failOnError(e error) {
	if e == nil {
		return
	}
	log.Panic(e)
}
185
186 func openArchive() *zip.Reader {
187 f := gen.OpenCLDRCoreZip()
188 buffer, err := io.ReadAll(f)
189 f.Close()
190 failOnError(err)
191 archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
192 failOnError(err)
193 return archive
194 }
195
196
197
198
199 func parseUCA(builder *build.Builder) {
200 var r io.ReadCloser
201 var err error
202 for _, f := range openArchive().File {
203 if strings.HasSuffix(f.Name, "allkeys_CLDR.txt") {
204 r, err = f.Open()
205 }
206 }
207 if r == nil {
208 log.Fatal("File allkeys_CLDR.txt not found in archive.")
209 }
210 failOnError(err)
211 defer r.Close()
212 scanner := bufio.NewScanner(r)
213 colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
214 for i := 1; scanner.Scan(); i++ {
215 line := scanner.Text()
216 if len(line) == 0 || line[0] == '#' {
217 continue
218 }
219 if line[0] == '@' {
220
221 switch {
222 case strings.HasPrefix(line[1:], "version "):
223 a := strings.Split(line[1:], " ")
224 if a[1] != gen.UnicodeVersion() {
225 log.Fatalf("incompatible version %s; want %s", a[1], gen.UnicodeVersion())
226 }
227 case strings.HasPrefix(line[1:], "backwards "):
228 log.Fatalf("%d: unsupported option backwards", i)
229 default:
230 log.Printf("%d: unknown option %s", i, line[1:])
231 }
232 } else {
233
234 part := strings.Split(line, " ; ")
235 if len(part) != 2 {
236 log.Fatalf("%d: production rule without ';': %v", i, line)
237 }
238 lhs := []rune{}
239 for _, v := range strings.Split(part[0], " ") {
240 if v == "" {
241 continue
242 }
243 lhs = append(lhs, rune(convHex(i, v)))
244 }
245 var n int
246 var vars []int
247 rhs := [][]int{}
248 for i, m := range colelem.FindAllStringSubmatch(part[1], -1) {
249 n += len(m[0])
250 elem := []int{}
251 for _, h := range strings.Split(m[2], ".") {
252 elem = append(elem, convHex(i, h))
253 }
254 if m[1] == "*" {
255 vars = append(vars, i)
256 }
257 rhs = append(rhs, elem)
258 }
259 if len(part[1]) < n+3 || part[1][n+1] != '#' {
260 log.Fatalf("%d: expected comment; found %s", i, part[1][n:])
261 }
262 if *test {
263 testInput.add(string(lhs))
264 }
265 failOnError(builder.Add(lhs, rhs, vars))
266 }
267 }
268 if scanner.Err() != nil {
269 log.Fatal(scanner.Err())
270 }
271 }
272
// convHex parses s as a hexadecimal number that fits in 32 bits and returns
// it. On a parse error it terminates the program, prefixing the message with
// line.
func convHex(line int, s string) int {
	v, err := strconv.ParseInt(s, 16, 32)
	if err == nil {
		return int(v)
	}
	log.Fatalf("%d: %v", line, err)
	return int(v) // not reached: log.Fatalf exits
}
280
281 var testInput = stringSet{}
282
283 var charRe = regexp.MustCompile(`&#x([0-9A-F]*);`)
284 var tagRe = regexp.MustCompile(`<([a-z_]*) */>`)
285
286 var mainLocales = []string{}
287
288
// charSets holds the exemplar characters of one locale, keyed by exemplar
// type ("" for the main set).
type charSets map[string][]string

// fprint writes p to w as the body of a [exN]string composite literal. Each
// non-empty set is written as "index: value", where index is the position of
// its type in the fixed key list below and value is the set joined by spaces.
// Types that are not in the list are not written.
func (p charSets) fprint(w io.Writer) {
	keys := [...]string{"", "contractions", "punctuation", "auxiliary", "currencySymbol", "index"}

	fmt.Fprintln(w, "[exN]string{")
	for idx := 0; idx < len(keys); idx++ {
		set := p[keys[idx]]
		if len(set) == 0 {
			continue
		}
		joined := strings.Join(set, " ")
		fmt.Fprintf(w, "\t\t%d: %q,\n", idx, joined)
	}
	fmt.Fprintln(w, "\t},")
}

// localeChars maps a locale to its exemplar character sets.
var localeChars = map[string]charSets{}
302
303 const exemplarHeader = `
304 type exemplarType int
305 const (
306 exCharacters exemplarType = iota
307 exContractions
308 exPunctuation
309 exAuxiliary
310 exCurrency
311 exIndex
312 exN
313 )
314 `
315
316 func printExemplarCharacters(w io.Writer) {
317 fmt.Fprintln(w, exemplarHeader)
318 fmt.Fprintln(w, "var exemplarCharacters = map[string][exN]string{")
319 for _, loc := range mainLocales {
320 fmt.Fprintf(w, "\t%q: ", loc)
321 localeChars[loc].fprint(w)
322 }
323 fmt.Fprintln(w, "}")
324 }
325
326 func decodeCLDR(d *cldr.Decoder) *cldr.CLDR {
327 r := gen.OpenCLDRCoreZip()
328 data, err := d.DecodeZip(r)
329 failOnError(err)
330 return data
331 }
332
333
334 func parseMain() {
335 d := &cldr.Decoder{}
336 d.SetDirFilter("main")
337 d.SetSectionFilter("characters")
338 data := decodeCLDR(d)
339 for _, loc := range data.Locales() {
340 x := data.RawLDML(loc)
341 if skipLang(x.Identity.Language.Type) {
342 continue
343 }
344 if x.Characters != nil {
345 x, _ = data.LDML(loc)
346 loc = language.Make(loc).String()
347 for _, ec := range x.Characters.ExemplarCharacters {
348 if ec.Draft != "" {
349 continue
350 }
351 if _, ok := localeChars[loc]; !ok {
352 mainLocales = append(mainLocales, loc)
353 localeChars[loc] = make(charSets)
354 }
355 localeChars[loc][ec.Type] = parseCharacters(ec.Data())
356 }
357 }
358 }
359 }
360
// parseCharacters splits an exemplar character set such as "[a-c {ch} \-]"
// into its elements.
//
// Surrounding white space and one enclosing pair of square brackets are
// removed. Within the set:
//   - "{...}" is a multi-character sequence and yields one element;
//   - "x-y" is a range and yields every rune from x through y exactly once;
//   - a backslash escapes the next rune, so that `\-` is a literal '-';
//   - spaces separate elements and are otherwise ignored.
//
// An empty set yields an empty list. Malformed input (a space inside a
// sequence, a range without a start or end, a trailing backslash)
// terminates the program via log.Fatal.
func parseCharacters(chars string) []string {
	// parseSingle consumes one rune from the non-empty string s, honoring a
	// leading backslash escape, and returns the rune, the rest of s, and
	// whether the rune was escaped.
	parseSingle := func(s string) (r rune, tail string, escaped bool) {
		if s[0] == '\\' {
			if len(s) == 1 {
				log.Fatalf("dangling escape at end of %q", chars)
			}
			s, escaped = s[1:], true
		}
		// Decode a full rune so that escaped non-ASCII characters survive.
		r, sz := utf8.DecodeRuneInString(s)
		return r, s[sz:], escaped
	}

	chars = strings.TrimSpace(chars)
	if n := len(chars) - 1; n >= 1 && chars[0] == '[' && chars[n] == ']' {
		chars = chars[1:n]
	}

	list := []string{}
	var last rune // most recent single rune; 0 if a range cannot start here
	for len(chars) > 0 {
		if chars[0] == '{' {
			var seq []rune
			for chars = chars[1:]; len(chars) > 0; {
				var r rune
				r, chars, _ = parseSingle(chars)
				if r == '}' {
					break
				}
				if r == ' ' {
					log.Fatalf("space not supported in sequence %q", chars)
				}
				seq = append(seq, r)
			}
			list = append(list, string(seq))
			last = 0
			continue
		}

		var r rune
		var escaped bool
		r, chars, escaped = parseSingle(chars)
		switch {
		case r == ' ':
			// Separator.
		case r == '-' && !escaped:
			if last == 0 {
				log.Fatal("'-' should be preceded by a character")
			}
			if len(chars) == 0 {
				log.Fatal("'-' should be followed by a character")
			}
			var end rune
			end, chars, _ = parseSingle(chars)
			// The start of the range was already added when it was parsed,
			// so continue with the rune after it.
			for c := last + 1; c <= end; c++ {
				list = append(list, string(c))
			}
			last = 0
		default:
			list = append(list, string(r))
			last = r
		}
	}
	return list
}
412
var (
	// fileRe captures the base name of an XML file in a collation directory.
	fileRe = regexp.MustCompile(`.*/collation/(.*)\.xml`)

	// typeMap gives the short form of collation type names that are longer
	// than 8 characters; parseCollation uses it when setting the "co" key of
	// a language tag.
	typeMap = map[string]string{
		"phonebook":   "phonebk",
		"traditional": "trad",
	}
)
420
421
422 func parseCollation(b *build.Builder) {
423 d := &cldr.Decoder{}
424 d.SetDirFilter("collation")
425 data := decodeCLDR(d)
426 for _, loc := range data.Locales() {
427 x, err := data.LDML(loc)
428 failOnError(err)
429 if skipLang(x.Identity.Language.Type) {
430 continue
431 }
432 cs := x.Collations.Collation
433 sl := cldr.MakeSlice(&cs)
434 if len(types.s) == 0 {
435 sl.SelectAnyOf("type", x.Collations.Default())
436 } else if !types.all {
437 sl.SelectAnyOf("type", types.s...)
438 }
439 sl.SelectOnePerGroup("alt", altInclude())
440
441 for _, c := range cs {
442 id, err := language.Parse(loc)
443 if err != nil {
444 fmt.Fprintf(os.Stderr, "invalid locale: %q", err)
445 continue
446 }
447
448 d := c.Type
449 if x.Collations.DefaultCollation == nil {
450 d = x.Collations.Default()
451 } else {
452 d = x.Collations.DefaultCollation.Data()
453 }
454
455
456 if d != c.Type && c.Type != "search" {
457 typ := c.Type
458 if len(c.Type) > 8 {
459 typ = typeMap[c.Type]
460 }
461 id, err = id.SetTypeForKey("co", typ)
462 failOnError(err)
463 }
464 t := b.Tailoring(id)
465 c.Process(processor{t})
466 }
467 }
468 }
469
470 type processor struct {
471 t *build.Tailoring
472 }
473
474 func (p processor) Reset(anchor string, before int) (err error) {
475 if before != 0 {
476 err = p.t.SetAnchorBefore(anchor)
477 } else {
478 err = p.t.SetAnchor(anchor)
479 }
480 failOnError(err)
481 return nil
482 }
483
484 func (p processor) Insert(level int, str, context, extend string) error {
485 str = context + str
486 if *test {
487 testInput.add(str)
488 }
489
490 err := p.t.Insert(colltab.Level(level-1), str, context+extend)
491 failOnError(err)
492 return nil
493 }
494
495 func (p processor) Index(id string) {
496 }
497
498 func testCollator(c *collate.Collator) {
499 c0 := collate.New(language.Und)
500
501
502
503 buf := collate.Buffer{}
504
505
506 for i := rune(0); i < 0x30000; i++ {
507 testInput.add(string(i))
508 }
509 for i := rune(0xE0000); i < 0xF0000; i++ {
510 testInput.add(string(i))
511 }
512 for _, str := range testInput.values() {
513 k0 := c0.KeyFromString(&buf, str)
514 k := c.KeyFromString(&buf, str)
515 if !bytes.Equal(k0, k) {
516 failOnError(fmt.Errorf("test:%U: keys differ (%x vs %x)", []rune(str), k0, k))
517 }
518 buf.Reset()
519 }
520 fmt.Println("PASS")
521 }
522
523 func main() {
524 gen.Init()
525 b := build.NewBuilder()
526 parseUCA(b)
527 if tables.contains("chars") {
528 parseMain()
529 }
530 parseCollation(b)
531
532 c, err := b.Build()
533 failOnError(err)
534
535 if *test {
536 testCollator(collate.NewFromTable(c))
537 } else {
538 w := &bytes.Buffer{}
539
540 gen.WriteUnicodeVersion(w)
541 gen.WriteCLDRVersion(w)
542
543 if tables.contains("collate") {
544 _, err = b.Print(w)
545 failOnError(err)
546 }
547 if tables.contains("chars") {
548 printExemplarCharacters(w)
549 }
550 gen.WriteGoFile("tables.go", *pkg, w.Bytes())
551 }
552 }
553
View as plain text