1
2
3
4
5
6
7
8
9
10 package main
11
12 import (
13 "bufio"
14 "flag"
15 "fmt"
16 "io"
17 "log"
18 "math"
19 "reflect"
20 "regexp"
21 "sort"
22 "strconv"
23 "strings"
24
25 "golang.org/x/text/internal/gen"
26 "golang.org/x/text/internal/tag"
27 "golang.org/x/text/unicode/cldr"
28 )
29
// Command-line flags accepted by the generator.
var (
	// test is the -test flag; see its usage string for the intended meaning.
	test = flag.Bool("test",
		false,
		"test existing tables; can be used to compare web data with package data.")
	// outputFile is the -output flag: the file that receives the generated
	// tables. It defaults to "tables.go".
	outputFile = flag.String("output",
		"tables.go",
		"output file for generated tables")
)
38
// comment holds the documentation that is emitted next to generated
// identifiers. Every entry starts with a newline followed by the name of the
// identifier it documents; init uses that first word as the key when it
// builds commentIndex.
var comment = []string{
	`
lang holds an alphabetically sorted list of ISO-639 language identifiers.
All entries are 4 bytes. The index of the identifier (divided by 4) is the language tag.
For 2-byte language identifiers, the two successive bytes have the following meaning:
- if the first letter of the 2- and 3-letter ISO codes are the same:
the second and third letter of the 3-letter ISO code.
- otherwise: a 0 and a by 2 bits right-shifted index into altLangISO3.
For 3-byte language identifiers the 4th byte is 0.`,
	`
langNoIndex is a bit vector of all 3-letter language codes that are not used as an index
in lookup tables. The language ids for these language codes are derived directly
from the letters and are not consecutive.`,
	`
altLangISO3 holds an alphabetically sorted list of 3-letter language code alternatives
to 2-letter language codes that cannot be derived using the method described above.
Each 3-letter code is followed by its 1-byte langID.`,
	`
altLangIndex is used to convert indexes in altLangISO3 to langIDs.`,
	`
AliasMap maps langIDs to their suggested replacements.`,
	`
script is an alphabetically sorted list of ISO 15924 codes. The index
of the script in the string, divided by 4, is the internal scriptID.`,
	`
isoRegionOffset needs to be added to the index of regionISO to obtain the regionID
for 2-letter ISO codes. (The first isoRegionOffset regionIDs are reserved for
the UN.M49 codes used for groups.)`,
	`
regionISO holds a list of alphabetically sorted 2-letter ISO region codes.
Each 2-letter codes is followed by two bytes with the following meaning:
- [A-Z}{2}: the first letter of the 2-letter code plus these two
letters form the 3-letter ISO code.
- 0, n: index into altRegionISO3.`,
	`
regionTypes defines the status of a region for various standards.`,
	`
m49 maps regionIDs to UN.M49 codes. The first isoRegionOffset entries are
codes indicating collections of regions.`,
	`
m49Index gives indexes into fromM49 based on the three most significant bits
of a 10-bit UN.M49 code. To search an UN.M49 code in fromM49, search in
fromM49[m49Index[msb39(code)]:m49Index[msb3(code)+1]]
for an entry where the first 7 bits match the 7 lsb of the UN.M49 code.
The region code is stored in the 9 lsb of the indexed value.`,
	`
fromM49 contains entries to map UN.M49 codes to regions. See m49Index for details.`,
	`
altRegionISO3 holds a list of 3-letter region codes that cannot be
mapped to 2-letter codes using the default algorithm. This is a short list.`,
	`
altRegionIDs holds a list of regionIDs the positions of which match those
of the 3-letter ISO codes in altRegionISO3.`,
	`
variantNumSpecialized is the number of specialized variants in variants.`,
	`
suppressScript is an index from langID to the dominant script for that language,
if it exists. If a script is given, it should be suppressed from the language tag.`,
	`
likelyLang is a lookup table, indexed by langID, for the most likely
scripts and regions given incomplete information. If more entries exist for a
given language, region and script are the index and size respectively
of the list in likelyLangList.`,
	`
likelyLangList holds lists info associated with likelyLang.`,
	`
likelyRegion is a lookup table, indexed by regionID, for the most likely
languages and scripts given incomplete information. If more entries exist
for a given regionID, lang and script are the index and size respectively
of the list in likelyRegionList.
TODO: exclude containers and user-definable regions from the list.`,
	`
likelyRegionList holds lists info associated with likelyRegion.`,
	`
likelyScript is a lookup table, indexed by scriptID, for the most likely
languages and regions given a script.`,
	`
nRegionGroups is the number of region groups.`,
	`
regionInclusion maps region identifiers to sets of regions in regionInclusionBits,
where each set holds all groupings that are directly connected in a region
containment graph.`,
	`
regionInclusionBits is an array of bit vectors where every vector represents
a set of region groupings. These sets are used to compute the distance
between two regions for the purpose of language matching.`,
	`
regionInclusionNext marks, for each entry in regionInclusionBits, the set of
all groups that are reachable from the groups set in the respective entry.`,
}
129
130
131
132
133
// failOnError logs e and panics (via log.Panic) when e is non-nil. A nil
// error is ignored.
func failOnError(e error) {
	if e == nil {
		return
	}
	log.Panic(e)
}
139
// setType identifies how a stringSet is being used. The zero value means that
// no use has been recorded.
type setType int

const (
	// Indexed is the type requested by index and join.
	Indexed setType = 1 + iota
	// Linear is the second set type; no code in this part of the file
	// requests it.
	Linear
)

// stringSet is a collection of strings that is lazily sorted and
// deduplicated (see compact) and that can be frozen against modification.
type stringSet struct {
	s              []string
	sorted, frozen bool

	// update is a side table filled by updateLater. It associates an element
	// of the set with a string that callers look up at a later stage.
	update map[string]string
	typ    setType
}

// clone returns a copy of ss with its own element slice. The update map is
// shared with the original.
func (ss *stringSet) clone() stringSet {
	c := *ss
	c.s = append([]string(nil), c.s...)
	return c
}

// setType panics if ss has already been given a non-zero type other than t.
//
// NOTE(review): the method never stores t in ss.typ, so the check can only
// trigger if typ is assigned elsewhere. Behavior is left unchanged here;
// confirm whether assigning ss.typ was intended.
func (ss *stringSet) setType(t setType) {
	if ss.typ != t && ss.typ != 0 {
		log.Panicf("type %d cannot be assigned as it was already %d", t, ss.typ)
	}
}

// parse adds every whitespace-separated word of s to the set.
func (ss *stringSet) parse(s string) {
	scan := bufio.NewScanner(strings.NewReader(s))
	scan.Split(bufio.ScanWords)
	for scan.Scan() {
		ss.add(scan.Text())
	}
}

// assertChangeable panics if the set has been frozen.
func (ss *stringSet) assertChangeable() {
	if ss.frozen {
		log.Panic("attempt to modify a frozen stringSet")
	}
}

// add appends s to the set. It panics if the set is frozen.
func (ss *stringSet) add(s string) {
	ss.assertChangeable()
	ss.s = append(ss.s, s)
	ss.sorted = ss.frozen
}

// freeze sorts and deduplicates the set and marks it as immutable for add
// and remove.
func (ss *stringSet) freeze() {
	ss.compact()
	ss.frozen = true
}

// compact sorts the elements and removes duplicates in place. It is a no-op
// when the set is already marked as sorted, and it leaves an empty set empty.
func (ss *stringSet) compact() {
	if ss.sorted {
		return
	}
	a := ss.s
	if len(a) == 0 {
		// Nothing to sort or deduplicate. Without this guard the a[:k+1]
		// expression below slices past the end of an empty slice and panics.
		ss.sorted = ss.frozen
		return
	}
	sort.Strings(a)
	// k is the index of the last unique element kept so far.
	k := 0
	for i := 1; i < len(a); i++ {
		if a[k] != a[i] {
			a[k+1] = a[i]
			k++
		}
	}
	ss.s = a[:k+1]
	ss.sorted = ss.frozen
}

// funcSorter sorts a string slice using a caller-supplied less function.
type funcSorter struct {
	fn func(a, b string) bool
	sort.StringSlice
}

// Less reports whether element i sorts before element j according to fn.
func (s funcSorter) Less(i, j int) bool {
	return s.fn(s.StringSlice[i], s.StringSlice[j])
}

// sortFunc compacts the set and then reorders its elements according to f.
func (ss *stringSet) sortFunc(f func(a, b string) bool) {
	ss.compact()
	sort.Sort(funcSorter{f, sort.StringSlice(ss.s)})
}

// remove deletes s from the set if it is present. It panics if the set is
// frozen.
func (ss *stringSet) remove(s string) {
	ss.assertChangeable()
	if i, ok := ss.find(s); ok {
		copy(ss.s[i:], ss.s[i+1:])
		ss.s = ss.s[:len(ss.s)-1]
	}
}

// replace overwrites the element ol with nu. It panics (through index) if ol
// is not in the set.
func (ss *stringSet) replace(ol, nu string) {
	ss.s[ss.index(ol)] = nu
	ss.sorted = ss.frozen
}

// index returns the position of s in the sorted set and panics if s is not
// present.
func (ss *stringSet) index(s string) int {
	ss.setType(Indexed)
	i, ok := ss.find(s)
	if !ok {
		if i < len(ss.s) {
			log.Panicf("find: item %q is not in list. Closest match is %q.", s, ss.s[i])
		}
		log.Panicf("find: item %q is not in list", s)

	}
	return i
}

// find compacts the set and binary-searches it for s. It returns the position
// at which s is, or would be inserted, and whether s was found.
func (ss *stringSet) find(s string) (int, bool) {
	ss.compact()
	i := sort.SearchStrings(ss.s, s)
	return i, i != len(ss.s) && ss.s[i] == s
}

// slice returns the sorted, deduplicated elements. The returned slice aliases
// the set's storage.
func (ss *stringSet) slice() []string {
	ss.compact()
	return ss.s
}

// updateLater records key under v in the update side table, creating the
// table on first use.
func (ss *stringSet) updateLater(v, key string) {
	if ss.update == nil {
		ss.update = map[string]string{}
	}
	ss.update[v] = key
}

// join concatenates all elements, which must be of equal length, and appends
// a sentinel element consisting of 0xff bytes of that same length. The
// sentinel is also appended to the set itself. join panics if the elements
// differ in length or if the set is empty.
func (ss *stringSet) join() string {
	ss.setType(Indexed)
	n := len(ss.s[0])
	for _, s := range ss.s {
		if len(s) != n {
			log.Panicf("join: not all entries are of the same length: %q", s)
		}
	}
	ss.s = append(ss.s, strings.Repeat("\xff", n))
	return strings.Join(ss.s, "")
}
283
284
285
286
287
// ianaEntry is a record of the IANA language subtag registry as collected by
// parseRegistry. Each field holds the value(s) of the registry field named in
// its comment.
type ianaEntry struct {
	typ            string   // "Type:"
	description    []string // "Description:", one element per occurrence
	scope          string   // "Scope:"
	added          string   // "Added:"
	preferred      string   // "Preferred-Value:"
	deprecated     string   // "Deprecated:"
	suppressScript string   // "Suppress-Script:"
	macro          string   // "Macrolanguage:"
	prefix         []string // "Prefix:", one element per occurrence
}
299
// builder carries the input data and the intermediate state used while
// generating the tables.
type builder struct {
	w    *gen.CodeWriter       // destination of the generated code
	hw   io.Writer             // set by newBuilder to write to both w and w.Hash
	data *cldr.CLDR            // decoded CLDR data
	supp *cldr.SupplementalData // data.Supplemental()

	// Sets of identifiers collected by parseIndices.
	locale      stringSet
	lang        stringSet
	langNoIndex stringSet
	script      stringSet
	region      stringSet
	variant     stringSet

	// groups maps a region index to its group index; it is filled by
	// computeRegionGroups.
	groups map[int]index

	// registry holds the IANA registry entries keyed by subtag; it is filled
	// by parseRegistry.
	registry map[string]*ianaEntry
}
320
// index is the value type of builder.groups.
type index uint
322
323 func newBuilder(w *gen.CodeWriter) *builder {
324 r := gen.OpenCLDRCoreZip()
325 defer r.Close()
326 d := &cldr.Decoder{}
327 data, err := d.DecodeZip(r)
328 failOnError(err)
329 b := builder{
330 w: w,
331 hw: io.MultiWriter(w, w.Hash),
332 data: data,
333 supp: data.Supplemental(),
334 }
335 b.parseRegistry()
336 return &b
337 }
338
// parseRegistry reads the IANA language subtag registry and fills b.registry
// with one ianaEntry per subtag or tag. The file is processed as a stream of
// whitespace-separated words in which a word such as "Type:" is a field name
// and the word after it is the field value.
func (b *builder) parseRegistry() {
	r := gen.OpenIANAFile("assignments/language-subtag-registry")
	defer r.Close()
	b.registry = make(map[string]*ianaEntry)

	scan := bufio.NewScanner(r)
	scan.Split(bufio.ScanWords)
	// record is the entry currently being filled in; a "Type:" field starts a
	// new one.
	var record *ianaEntry
	for more := scan.Scan(); more; {
		// The current word is taken as the field name and the next word as
		// its value.
		key := scan.Text()
		more = scan.Scan()
		value := scan.Text()
		switch key {
		case "Type:":
			record = &ianaEntry{typ: value}
		case "Subtag:", "Tag:":
			// A value of the form "x..y" denotes a range; every code from x
			// up to and including y is registered with the same record.
			if s := strings.SplitN(value, "..", 2); len(s) > 1 {
				for a := s[0]; a <= s[1]; a = inc(a) {
					b.addToRegistry(a, record)
				}
			} else {
				b.addToRegistry(value, record)
			}
		case "Suppress-Script:":
			record.suppressScript = value
		case "Added:":
			record.added = value
		case "Deprecated:":
			record.deprecated = value
		case "Macrolanguage:":
			record.macro = value
		case "Preferred-Value:":
			record.preferred = value
		case "Prefix:":
			record.prefix = append(record.prefix, value)
		case "Scope:":
			record.scope = value
		case "Description:":
			// A description may span several words. Keep appending words
			// until one starts with '%' or ends with ':'; that word is left
			// as the current token and becomes the next key.
			buf := []byte(value)
			for more = scan.Scan(); more; more = scan.Scan() {
				b := scan.Bytes()
				if b[0] == '%' || b[len(b)-1] == ':' {
					break
				}
				buf = append(buf, ' ')
				buf = append(buf, b...)
			}
			record.description = append(record.description, string(buf))
			continue
		default:
			// Unknown key: do not advance, so that the word read as value is
			// examined as a key on the next iteration.
			continue
		}
		// Advance past the value to the next key.
		more = scan.Scan()
	}
	if scan.Err() != nil {
		log.Panic(scan.Err())
	}
}
397
398 func (b *builder) addToRegistry(key string, entry *ianaEntry) {
399 if info, ok := b.registry[key]; ok {
400 if info.typ != "language" || entry.typ != "extlang" {
401 log.Fatalf("parseRegistry: tag %q already exists", key)
402 }
403 } else {
404 b.registry[key] = entry
405 }
406 }
407
408 var commentIndex = make(map[string]string)
409
410 func init() {
411 for _, s := range comment {
412 key := strings.TrimSpace(strings.SplitN(s, " ", 2)[0])
413 commentIndex[key] = s
414 }
415 }
416
417 func (b *builder) comment(name string) {
418 if s := commentIndex[name]; len(s) > 0 {
419 b.w.WriteComment(s)
420 } else {
421 fmt.Fprintln(b.w)
422 }
423 }
424
425 func (b *builder) pf(f string, x ...interface{}) {
426 fmt.Fprintf(b.hw, f, x...)
427 fmt.Fprint(b.hw, "\n")
428 }
429
// p writes its arguments to b.hw in the manner of fmt.Println.
func (b *builder) p(x ...interface{}) {
	fmt.Fprintln(b.hw, x...)
}
433
// addSize adds s to the running size total of the code writer and emits a
// "// Size: ..." comment line stating s.
func (b *builder) addSize(s int) {
	b.w.Size += s
	b.pf("// Size: %d bytes", s)
}
438
// writeConst emits the documentation registered for name, if any, followed by
// a constant declaration of name with value x.
func (b *builder) writeConst(name string, x interface{}) {
	b.comment(name)
	b.w.WriteConst(name, x)
}
443
444
445
446 func (b *builder) writeConsts(f func(string) int, values ...string) {
447 b.pf("const (")
448 for _, v := range values {
449 b.pf("\t_%s = %v", v, f(v))
450 }
451 b.pf(")")
452 }
453
454
// writeType emits the documentation registered under the type name of value,
// if any, followed by the type declaration of value.
func (b *builder) writeType(value interface{}) {
	b.comment(reflect.TypeOf(value).Name())
	b.w.WriteType(value)
}
459
// writeSlice emits ss as a variable called name. It is writeSliceAddSize
// with no extra size.
func (b *builder) writeSlice(name string, ss interface{}) {
	b.writeSliceAddSize(name, 0, ss)
}
463
464 func (b *builder) writeSliceAddSize(name string, extraSize int, ss interface{}) {
465 b.comment(name)
466 b.w.Size += extraSize
467 v := reflect.ValueOf(ss)
468 t := v.Type().Elem()
469 b.pf("// Size: %d bytes, %d elements", v.Len()*int(t.Size())+extraSize, v.Len())
470
471 fmt.Fprintf(b.w, "var %s = ", name)
472 b.w.WriteArray(ss)
473 b.p()
474 }
475
// FromTo is the element type of the tables written by writeSortedMap. From
// is the index of an entry and To the index of the value it maps to.
type FromTo struct {
	From, To uint16
}
479
480 func (b *builder) writeSortedMap(name string, ss *stringSet, index func(s string) uint16) {
481 ss.sortFunc(func(a, b string) bool {
482 return index(a) < index(b)
483 })
484 m := []FromTo{}
485 for _, s := range ss.s {
486 m = append(m, FromTo{index(s), index(ss.update[s])})
487 }
488 b.writeSlice(name, m)
489 }
490
// base is the number of letters in the range 'a' to 'z'.
const base = 'z' - 'a' + 1

// strToInt interprets s as a number written in base 26 in which 'a' is the
// digit 0 and 'z' the digit 25. The empty string yields 0.
func strToInt(s string) uint {
	var n uint
	for _, c := range []byte(s) {
		n = n*base + uint(c-'a')
	}
	return n
}

// intToStr is the inverse of strToInt: it fills s, from its last byte to its
// first, with the base-26 digits of v. Digits that do not fit in s are
// dropped.
func intToStr(v uint, s []byte) {
	for pos := len(s); pos > 0; pos-- {
		s[pos-1] = 'a' + byte(v%base)
		v /= base
	}
}
510
511 func (b *builder) writeBitVector(name string, ss []string) {
512 vec := make([]uint8, int(math.Ceil(math.Pow(base, float64(len(ss[0])))/8)))
513 for _, s := range ss {
514 v := strToInt(s)
515 vec[v/8] |= 1 << (v % 8)
516 }
517 b.writeSlice(name, vec)
518 }
519
520
521 func (b *builder) writeMapFunc(name string, m map[string]string, f func(string) uint16) {
522 b.comment(name)
523 v := reflect.ValueOf(m)
524 sz := v.Len() * (2 + int(v.Type().Key().Size()))
525 for _, k := range m {
526 sz += len(k)
527 }
528 b.addSize(sz)
529 keys := []string{}
530 b.pf(`var %s = map[string]uint16{`, name)
531 for k := range m {
532 keys = append(keys, k)
533 }
534 sort.Strings(keys)
535 for _, k := range keys {
536 b.pf("\t%q: %v,", k, f(m[k]))
537 }
538 b.p("}")
539 }
540
541 func (b *builder) writeMap(name string, m interface{}) {
542 b.comment(name)
543 v := reflect.ValueOf(m)
544 sz := v.Len() * (2 + int(v.Type().Key().Size()) + int(v.Type().Elem().Size()))
545 b.addSize(sz)
546 f := strings.FieldsFunc(fmt.Sprintf("%#v", m), func(r rune) bool {
547 return strings.IndexRune("{}, ", r) != -1
548 })
549 sort.Strings(f[1:])
550 b.pf(`var %s = %s{`, name, f[0])
551 for _, kv := range f[1:] {
552 b.pf("\t%s,", kv)
553 }
554 b.p("}")
555 }
556
557 func (b *builder) langIndex(s string) uint16 {
558 if s == "und" {
559 return 0
560 }
561 if i, ok := b.lang.find(s); ok {
562 return uint16(i)
563 }
564 return uint16(strToInt(s)) + uint16(len(b.lang.s))
565 }
566
567
568 func inc(s string) string {
569 const maxTagLength = 4
570 var buf [maxTagLength]byte
571 intToStr(strToInt(strings.ToLower(s))+1, buf[:len(s)])
572 for i := 0; i < len(s); i++ {
573 if s[i] <= 'Z' {
574 buf[i] -= 'a' - 'A'
575 }
576 }
577 return string(buf[:len(s)])
578 }
579
580 func (b *builder) parseIndices() {
581 meta := b.supp.Metadata
582
583 for k, v := range b.registry {
584 var ss *stringSet
585 switch v.typ {
586 case "language":
587 if len(k) == 2 || v.suppressScript != "" || v.scope == "special" {
588 b.lang.add(k)
589 continue
590 } else {
591 ss = &b.langNoIndex
592 }
593 case "region":
594 ss = &b.region
595 case "script":
596 ss = &b.script
597 case "variant":
598 ss = &b.variant
599 default:
600 continue
601 }
602 ss.add(k)
603 }
604
605 for _, lang := range b.data.Locales() {
606 if x := b.data.RawLDML(lang); false ||
607 x.LocaleDisplayNames != nil ||
608 x.Characters != nil ||
609 x.Delimiters != nil ||
610 x.Measurement != nil ||
611 x.Dates != nil ||
612 x.Numbers != nil ||
613 x.Units != nil ||
614 x.ListPatterns != nil ||
615 x.Collations != nil ||
616 x.Segmentations != nil ||
617 x.Rbnf != nil ||
618 x.Annotations != nil ||
619 x.Metadata != nil {
620
621 from := strings.Split(lang, "_")
622 if lang := from[0]; lang != "root" {
623 b.lang.add(lang)
624 }
625 }
626 }
627
628 for _, plurals := range b.data.Supplemental().Plurals {
629 for _, rules := range plurals.PluralRules {
630 for _, lang := range strings.Split(rules.Locales, " ") {
631 if lang = strings.Split(lang, "_")[0]; lang != "root" {
632 b.lang.add(lang)
633 }
634 }
635 }
636 }
637
638 for _, m := range b.supp.LikelySubtags.LikelySubtag {
639 from := strings.Split(m.From, "_")
640 b.lang.add(from[0])
641 }
642
643 for _, a := range meta.Alias.LanguageAlias {
644 if a.Reason == "bibliographic" {
645 b.langNoIndex.add(a.Type)
646 }
647 }
648
649 for _, reg := range b.supp.Metadata.Alias.TerritoryAlias {
650 if len(reg.Type) == 2 {
651 b.region.add(reg.Type)
652 }
653 }
654
655 for _, s := range b.lang.s {
656 if len(s) == 3 {
657 b.langNoIndex.remove(s)
658 }
659 }
660 b.writeConst("NumLanguages", len(b.lang.slice())+len(b.langNoIndex.slice()))
661 b.writeConst("NumScripts", len(b.script.slice()))
662 b.writeConst("NumRegions", len(b.region.slice()))
663
664
665 b.lang.add("---")
666 b.script.add("----")
667 b.region.add("---")
668
669
670 b.locale.parse(meta.DefaultContent.Locales)
671 }
672
673
674
675 func (b *builder) computeRegionGroups() {
676 b.groups = make(map[int]index)
677
678
679 for i := 1; b.region.s[i][0] < 'A'; i++ {
680 b.groups[i] = index(len(b.groups))
681 }
682 for _, g := range b.supp.TerritoryContainment.Group {
683
684
685 if g.Type == "EZ" || g.Type == "UN" {
686 continue
687 }
688 group := b.region.index(g.Type)
689 if _, ok := b.groups[group]; !ok {
690 b.groups[group] = index(len(b.groups))
691 }
692 }
693 if len(b.groups) > 64 {
694 log.Fatalf("only 64 groups supported, found %d", len(b.groups))
695 }
696 b.writeConst("nRegionGroups", len(b.groups))
697 }
698
// langConsts lists the languages for which writeLanguage emits a constant
// (the code prefixed with an underscore) holding the language's index.
var langConsts = []string{
	"af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
	"et", "fa", "fi", "fil", "fr", "gu", "he", "hi", "hr", "hu", "hy", "id", "is",
	"it", "ja", "ka", "kk", "km", "kn", "ko", "ky", "lo", "lt", "lv", "mk", "ml",
	"mn", "mo", "mr", "ms", "mul", "my", "nb", "ne", "nl", "no", "pa", "pl", "pt",
	"ro", "ru", "sh", "si", "sk", "sl", "sq", "sr", "sv", "sw", "ta", "te", "th",
	"tl", "tn", "tr", "uk", "ur", "uz", "vi", "zh", "zu",

	// Further codes, kept as a separate group.
	"jbo", "ami", "bnn", "hak", "tlh", "lb", "nv", "pwn", "tao", "tay", "tsu",
	"nn", "sfb", "vgt", "sgg", "cmn", "nan", "hsn",
}
711
712
// writeLanguage generates the language tables: the language constants, lang,
// langNoIndexOffset, langNoIndex, altLangISO3, altLangIndex, AliasMap and
// AliasTypes.
func (b *builder) writeLanguage() {
	meta := b.supp.Metadata

	b.writeConst("nonCanonicalUnd", b.lang.index("und"))
	b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...)
	b.writeConst("langPrivateStart", b.langIndex("qaa"))
	b.writeConst("langPrivateEnd", b.langIndex("qtz"))

	// langAliasMap holds the alias codes; its update table maps each alias to
	// its replacement. aliasTypeMap records the kind of each alias.
	langAliasMap := stringSet{}
	aliasTypeMap := map[string]AliasType{}

	// altLangISO3 collects the 3-letter codes whose first letter differs from
	// that of the corresponding 2-letter code; its update table maps them
	// back to the 2-letter code.
	altLangISO3 := stringSet{}
	// Add a placeholder entry that maps to "aa".
	altLangISO3.add("---")
	altLangISO3.updateLater("---", "aa")

	// lang is a working copy of b.lang; its update table maps a language code
	// to its 3-letter form.
	lang := b.lang.clone()
	for _, a := range meta.Alias.LanguageAlias {
		if a.Replacement == "" {
			a.Replacement = "und"
		}
		// Only the part of the replacement before the first "_" is used.
		repl := strings.SplitN(a.Replacement, "_", 2)[0]
		if a.Reason == "overlong" {
			// A 3-letter alias of a 2-letter code supplies the 3-letter form
			// of that code.
			if len(a.Replacement) == 2 && len(a.Type) == 3 {
				lang.updateLater(a.Replacement, a.Type)
			}
		} else if len(a.Type) <= 3 {
			switch a.Reason {
			case "macrolanguage":
				aliasTypeMap[a.Type] = Macro
			case "deprecated":
				// Not added here; deprecated registry entries that have a
				// preferred value are added in the loop over b.registry
				// below.
				continue
			case "bibliographic", "legacy":
				// "no" is skipped; see the explicit nb -> no entry below.
				if a.Type == "no" {
					continue
				}
				aliasTypeMap[a.Type] = Legacy
			default:
				log.Fatalf("new %s alias: %s", a.Reason, a.Type)
			}
			langAliasMap.add(a.Type)
			langAliasMap.updateLater(a.Type, repl)
		}
	}

	// Hard-coded alias: nb maps to no and is typed as a macro language.
	langAliasMap.add("nb")
	langAliasMap.updateLater("nb", "no")
	aliasTypeMap["nb"] = Macro

	for k, v := range b.registry {
		// Deprecated languages that name a preferred value become aliases.
		if v.typ == "language" && v.deprecated != "" && v.preferred != "" {
			langAliasMap.add(k)
			langAliasMap.updateLater(k, v.preferred)
			aliasTypeMap[k] = Deprecated
		}
	}

	// Hard-coded 3-letter forms.
	lang.updateLater("tl", "tgl")
	lang.updateLater("sh", "hbs")
	lang.updateLater("mo", "mol")
	lang.updateLater("no", "nor")
	lang.updateLater("tw", "twi")
	lang.updateLater("nb", "nob")
	lang.updateLater("ak", "aka")
	lang.updateLater("bh", "bih")

	// Determine the 3-letter form of every language, skipping the first
	// entry. If the language itself has none, the one of its alias
	// replacement is used. Forms that start with a different letter than the
	// language code are collected in altLangISO3.
	for _, v := range lang.s[1:] {
		s, ok := lang.update[v]
		if !ok {
			if s, ok = lang.update[langAliasMap.update[v]]; !ok {
				continue
			}
			lang.update[v] = s
		}
		if v[0] != s[0] {
			altLangISO3.add(s)
			altLangISO3.updateLater(s, v)
		}
	}

	// Extend every entry with the encoding of its 3-letter form.
	lang.freeze()
	for i, v := range lang.s {
		// add is the suffix appended to the entry:
		//   - the last two letters of the 3-letter form if it starts with the
		//     same letter as v,
		//   - a zero byte followed by the index of the form in altLangISO3
		//     if it does not,
		//   - a single zero byte for 3-letter codes without a separate form.
		add := ""
		if s, ok := lang.update[v]; ok {
			if s[0] == v[0] {
				add = s[1:]
			} else {
				add = string([]byte{0, byte(altLangISO3.index(s))})
			}
		} else if len(v) == 3 {
			add = "\x00"
		} else {
			log.Panicf("no data for long form of %q", v)
		}
		lang.s[i] += add
	}
	b.writeConst("lang", tag.Index(lang.join()))

	b.writeConst("langNoIndexOffset", len(b.lang.s))

	// Languages that are not in the index are represented by a bit vector.
	b.writeBitVector("langNoIndex", b.langNoIndex.slice())

	// Append to every altLangISO3 entry the current length of altLangIndex
	// and, for all but the first entry, add the index of the corresponding
	// language to altLangIndex.
	altLangIndex := []uint16{}
	for i, s := range altLangISO3.slice() {
		altLangISO3.s[i] += string([]byte{byte(len(altLangIndex))})
		if i > 0 {
			idx := b.lang.index(altLangISO3.update[s])
			altLangIndex = append(altLangIndex, uint16(idx))
		}
	}
	b.writeConst("altLangISO3", tag.Index(altLangISO3.join()))
	b.writeSlice("altLangIndex", altLangIndex)

	// AliasTypes is written in the order that writeSortedMap left
	// langAliasMap in, so that it parallels AliasMap.
	b.writeSortedMap("AliasMap", &langAliasMap, b.langIndex)
	types := make([]AliasType, len(langAliasMap.s))
	for i, s := range langAliasMap.s {
		types[i] = aliasTypeMap[s]
	}
	b.writeSlice("AliasTypes", types)
}
846
// scriptConsts lists the scripts for which writeScript emits a constant (the
// code prefixed with an underscore) holding the script's index.
var scriptConsts = []string{
	"Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy",
	"Zzzz",
}
851
852 func (b *builder) writeScript() {
853 b.writeConsts(b.script.index, scriptConsts...)
854 b.writeConst("script", tag.Index(b.script.join()))
855
856 supp := make([]uint8, len(b.lang.slice()))
857 for i, v := range b.lang.slice()[1:] {
858 if sc := b.registry[v].suppressScript; sc != "" {
859 supp[i+1] = uint8(b.script.index(sc))
860 }
861 }
862 b.writeSlice("suppressScript", supp)
863
864
865
866 for _, a := range b.supp.Metadata.Alias.ScriptAlias {
867 if a.Type != "Qaai" {
868 log.Panicf("unexpected deprecated stript %q", a.Type)
869 }
870 }
871 }
872
873 func parseM49(s string) int16 {
874 if len(s) == 0 {
875 return 0
876 }
877 v, err := strconv.ParseUint(s, 10, 10)
878 failOnError(err)
879 return int16(v)
880 }
881
// regionConsts lists the regions for which writeRegion emits a constant (the
// code prefixed with an underscore) holding the region's index.
var regionConsts = []string{
	"001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US",
	"ZZ", "XA", "XC", "XK",
}
886
887 func (b *builder) writeRegion() {
888 b.writeConsts(b.region.index, regionConsts...)
889
890 isoOffset := b.region.index("AA")
891 m49map := make([]int16, len(b.region.slice()))
892 fromM49map := make(map[int16]int)
893 altRegionISO3 := ""
894 altRegionIDs := []uint16{}
895
896 b.writeConst("isoRegionOffset", isoOffset)
897
898
899 regionISO := b.region.clone()
900 regionISO.s = regionISO.s[isoOffset:]
901 regionISO.sorted = false
902
903 regionTypes := make([]byte, len(b.region.s))
904
905
906 for s, e := range b.registry {
907 if len(s) == 2 && s == strings.ToUpper(s) {
908 i := b.region.index(s)
909 for _, d := range e.description {
910 if strings.Contains(d, "Private use") {
911 regionTypes[i] = iso3166UserAssigned
912 }
913 }
914 regionTypes[i] |= bcp47Region
915 }
916 }
917
918
919 r := gen.OpenIANAFile("domains/root/db")
920 defer r.Close()
921
922 buf, err := io.ReadAll(r)
923 failOnError(err)
924 re := regexp.MustCompile(`"/domains/root/db/([a-z]{2}).html"`)
925 for _, m := range re.FindAllSubmatch(buf, -1) {
926 i := b.region.index(strings.ToUpper(string(m[1])))
927 regionTypes[i] |= ccTLD
928 }
929
930 b.writeSlice("regionTypes", regionTypes)
931
932 iso3Set := make(map[string]int)
933 update := func(iso2, iso3 string) {
934 i := regionISO.index(iso2)
935 if j, ok := iso3Set[iso3]; !ok && iso3[0] == iso2[0] {
936 regionISO.s[i] += iso3[1:]
937 iso3Set[iso3] = -1
938 } else {
939 if ok && j >= 0 {
940 regionISO.s[i] += string([]byte{0, byte(j)})
941 } else {
942 iso3Set[iso3] = len(altRegionISO3)
943 regionISO.s[i] += string([]byte{0, byte(len(altRegionISO3))})
944 altRegionISO3 += iso3
945 altRegionIDs = append(altRegionIDs, uint16(isoOffset+i))
946 }
947 }
948 }
949 for _, tc := range b.supp.CodeMappings.TerritoryCodes {
950 i := regionISO.index(tc.Type) + isoOffset
951 if d := m49map[i]; d != 0 {
952 log.Panicf("%s found as a duplicate UN.M49 code of %03d", tc.Numeric, d)
953 }
954 m49 := parseM49(tc.Numeric)
955 m49map[i] = m49
956 if r := fromM49map[m49]; r == 0 {
957 fromM49map[m49] = i
958 } else if r != i {
959 dep := b.registry[regionISO.s[r-isoOffset]].deprecated
960 if t := b.registry[tc.Type]; t != nil && dep != "" && (t.deprecated == "" || t.deprecated > dep) {
961 fromM49map[m49] = i
962 }
963 }
964 }
965 for _, ta := range b.supp.Metadata.Alias.TerritoryAlias {
966 if len(ta.Type) == 3 && ta.Type[0] <= '9' && len(ta.Replacement) == 2 {
967 from := parseM49(ta.Type)
968 if r := fromM49map[from]; r == 0 {
969 fromM49map[from] = regionISO.index(ta.Replacement) + isoOffset
970 }
971 }
972 }
973 for _, tc := range b.supp.CodeMappings.TerritoryCodes {
974 if len(tc.Alpha3) == 3 {
975 update(tc.Type, tc.Alpha3)
976 }
977 }
978
979
980 for _, m := range []struct{ iso2, iso3 string }{
981 {"CT", "CTE"},
982 {"DY", "DHY"},
983 {"HV", "HVO"},
984 {"JT", "JTN"},
985 {"MI", "MID"},
986 {"NH", "NHB"},
987 {"NQ", "ATN"},
988 {"PC", "PCI"},
989 {"PU", "PUS"},
990 {"PZ", "PCZ"},
991 {"RH", "RHO"},
992 {"VD", "VDR"},
993 {"WK", "WAK"},
994
995 {"FQ", "ATF"},
996 } {
997 update(m.iso2, m.iso3)
998 }
999 for i, s := range regionISO.s {
1000 if len(s) != 4 {
1001 regionISO.s[i] = s + " "
1002 }
1003 }
1004 b.writeConst("regionISO", tag.Index(regionISO.join()))
1005 b.writeConst("altRegionISO3", altRegionISO3)
1006 b.writeSlice("altRegionIDs", altRegionIDs)
1007
1008
1009
1010
1011 regionOldMap := stringSet{}
1012
1013 for _, reg := range b.supp.Metadata.Alias.TerritoryAlias {
1014 if len(reg.Type) == 2 && reg.Reason == "deprecated" && len(reg.Replacement) == 2 {
1015 regionOldMap.add(reg.Type)
1016 regionOldMap.updateLater(reg.Type, reg.Replacement)
1017 i, _ := regionISO.find(reg.Type)
1018 j, _ := regionISO.find(reg.Replacement)
1019 if k := m49map[i+isoOffset]; k == 0 {
1020 m49map[i+isoOffset] = m49map[j+isoOffset]
1021 }
1022 }
1023 }
1024 b.writeSortedMap("regionOldMap", ®ionOldMap, func(s string) uint16 {
1025 return uint16(b.region.index(s))
1026 })
1027
1028 for i := 1; i < isoOffset; i++ {
1029 m := parseM49(b.region.s[i])
1030 m49map[i] = m
1031 fromM49map[m] = i
1032 }
1033 b.writeSlice("m49", m49map)
1034
1035 const (
1036 searchBits = 7
1037 regionBits = 9
1038 )
1039 if len(m49map) >= 1<<regionBits {
1040 log.Fatalf("Maximum number of regions exceeded: %d > %d", len(m49map), 1<<regionBits)
1041 }
1042 m49Index := [9]int16{}
1043 fromM49 := []uint16{}
1044 m49 := []int{}
1045 for k, _ := range fromM49map {
1046 m49 = append(m49, int(k))
1047 }
1048 sort.Ints(m49)
1049 for _, k := range m49[1:] {
1050 val := (k & (1<<searchBits - 1)) << regionBits
1051 fromM49 = append(fromM49, uint16(val|fromM49map[int16(k)]))
1052 m49Index[1:][k>>searchBits] = int16(len(fromM49))
1053 }
1054 b.writeSlice("m49Index", m49Index)
1055 b.writeSlice("fromM49", fromM49)
1056 }
1057
// Space-separated lists of 2-letter region codes.
// NOTE(review): judging by the names these are presumably the exceptionally
// reserved and transitionally reserved ISO 3166 codes and codes deleted in
// CLDR, respectively — confirm. No code in this part of the file uses them.
const (
	iso3166Except = "AC CP DG EA EU FX IC SU TA UK"
	iso3166Trans  = "AN BU CS NT TP YU ZR"

	iso3166DelCLDR = "CT DD DY FQ HV JT MI NH NQ PC PU PZ RH VD WK YD"
)
1066
// Bit flags stored in the regionTypes table by writeRegion.
const (
	// iso3166UserAssigned is set for regions whose registry description
	// contains "Private use".
	iso3166UserAssigned = 1 << iota
	// ccTLD is set for regions whose code is listed as a 2-letter domain in
	// the IANA root zone database.
	ccTLD
	// bcp47Region is set for every 2-letter upper-case registry subtag.
	bcp47Region
)
1072
// find returns the position of the first element of list that equals s, or
// -1 if there is no such element.
func find(list []string, s string) int {
	for i := 0; i < len(list); i++ {
		if list[i] == s {
			return i
		}
	}
	return -1
}
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
// writeVariant classifies the registered variants, checks the consistency of
// their prefixes and writes the variantIndex map and the
// variantNumSpecialized constant.
//
// Variants are split into three groups:
//   - generalized: variants without any prefix,
//   - specialized: variants whose first prefix is a single subtag, or two
//     subtags of which the second is a script or region,
//   - specializedExtend: all other variants with a prefix.
func (b *builder) writeVariant() {
	generalized := stringSet{}
	specialized := stringSet{}
	specializedExtend := stringSet{}

	for _, v := range b.variant.slice() {
		e := b.registry[v]
		if len(e.prefix) == 0 {
			generalized.add(v)
			continue
		}
		// Classification only looks at the first prefix.
		c := strings.Split(e.prefix[0], "-")
		hasScriptOrRegion := false
		if len(c) > 1 {
			_, hasScriptOrRegion = b.script.find(c[1])
			if !hasScriptOrRegion {
				_, hasScriptOrRegion = b.region.find(c[1])

			}
		}
		if len(c) == 1 || len(c) == 2 && hasScriptOrRegion {
			// The prefix does not contain another variant.
			specialized.add(v)
			continue
		}
		// The prefix has further subtags after the language (and optional
		// script or region).
		specializedExtend.add(v)
		prefix := c[0] + "-"
		if hasScriptOrRegion {
			prefix += c[1]
		}
		for _, p := range e.prefix {
			// The last subtag of the prefix is looked up in the registry as
			// the preceding variant; the rest of the prefix must be one of
			// that variant's own prefixes.
			i := strings.LastIndex(p, "-")
			pred := b.registry[p[i+1:]]
			if find(pred.prefix, p[:i]) < 0 {
				log.Fatalf("prefix %q for variant %q not consistent with predecessor spec", p, v)
			}
			// All prefixes of the preceding variant must have the same
			// number of subtags as the remainder of this prefix.
			count := strings.Count(p[:i], "-")
			for _, q := range pred.prefix {
				if c := strings.Count(q, "-"); c != count {
					log.Fatalf("variant %q preceding %q has a prefix %q of size %d; want %d", p[i+1:], v, q, c, count)
				}
			}
			// Every prefix must start like the first one.
			if !strings.HasPrefix(p, prefix) {
				log.Fatalf("prefix %q of variant %q should start with %q", p, v, prefix)
			}
		}
	}

	// Order the extended variants by the length of their longest prefix
	// (counted in "-" separators), and alphabetically within the same length.
	a := specializedExtend.s
	less := func(v, w string) bool {
		// maxCount returns the largest number of "-" in any prefix of s.
		maxCount := func(s string) (max int) {
			for _, p := range b.registry[s].prefix {
				if c := strings.Count(p, "-"); c > max {
					max = c
				}
			}
			return
		}
		if cv, cw := maxCount(v), maxCount(w); cv != cw {
			return cv < cw
		}

		return v < w
	}
	sort.Sort(funcSorter{less, sort.StringSlice(a)})
	specializedExtend.frozen = true

	// Number the variants consecutively: specialized first, then the
	// extended ones, then the generalized ones.
	variantIndex := make(map[string]uint8)
	add := func(s []string) {
		for _, v := range s {
			variantIndex[v] = uint8(len(variantIndex))
		}
	}
	add(specialized.slice())
	// specializedExtend.s is used directly so that the custom order
	// established above is kept.
	add(specializedExtend.s)
	numSpecialized := len(variantIndex)
	add(generalized.slice())
	if n := len(variantIndex); n > 255 {
		log.Fatalf("maximum number of variants exceeded: was %d; want <= 255", n)
	}
	b.writeMap("variantIndex", variantIndex)
	b.writeConst("variantNumSpecialized", numSpecialized)
}
1194
// writeLanguageInfo is a placeholder; it currently does nothing.
func (b *builder) writeLanguageInfo() {
}
1197
1198
1199
1200
1201 func (b *builder) writeLikelyData() {
1202 const (
1203 isList = 1 << iota
1204 scriptInFrom
1205 regionInFrom
1206 )
1207 type (
1208 likelyScriptRegion struct {
1209 region uint16
1210 script uint16
1211 flags uint8
1212 }
1213 likelyLangScript struct {
1214 lang uint16
1215 script uint16
1216 flags uint8
1217 }
1218 likelyLangRegion struct {
1219 lang uint16
1220 region uint16
1221 }
1222
1223
1224 likelyTag struct {
1225 lang uint16
1226 region uint16
1227 script uint16
1228 }
1229 )
1230 var (
1231 likelyRegionGroup = make([]likelyTag, len(b.groups))
1232 likelyLang = make([]likelyScriptRegion, len(b.lang.s))
1233 likelyRegion = make([]likelyLangScript, len(b.region.s))
1234 likelyScript = make([]likelyLangRegion, len(b.script.s))
1235 likelyLangList = []likelyScriptRegion{}
1236 likelyRegionList = []likelyLangScript{}
1237 )
1238 type fromTo struct {
1239 from, to []string
1240 }
1241 langToOther := map[int][]fromTo{}
1242 regionToOther := map[int][]fromTo{}
1243 for _, m := range b.supp.LikelySubtags.LikelySubtag {
1244 from := strings.Split(m.From, "_")
1245 to := strings.Split(m.To, "_")
1246 if len(to) != 3 {
1247 log.Fatalf("invalid number of subtags in %q: found %d, want 3", m.To, len(to))
1248 }
1249 if len(from) > 3 {
1250 log.Fatalf("invalid number of subtags: found %d, want 1-3", len(from))
1251 }
1252 if from[0] != to[0] && from[0] != "und" {
1253 log.Fatalf("unexpected language change in expansion: %s -> %s", from, to)
1254 }
1255 if len(from) == 3 {
1256 if from[2] != to[2] {
1257 log.Fatalf("unexpected region change in expansion: %s -> %s", from, to)
1258 }
1259 if from[0] != "und" {
1260 log.Fatalf("unexpected fully specified from tag: %s -> %s", from, to)
1261 }
1262 }
1263 if len(from) == 1 || from[0] != "und" {
1264 id := 0
1265 if from[0] != "und" {
1266 id = b.lang.index(from[0])
1267 }
1268 langToOther[id] = append(langToOther[id], fromTo{from, to})
1269 } else if len(from) == 2 && len(from[1]) == 4 {
1270 sid := b.script.index(from[1])
1271 likelyScript[sid].lang = uint16(b.langIndex(to[0]))
1272 likelyScript[sid].region = uint16(b.region.index(to[2]))
1273 } else {
1274 r := b.region.index(from[len(from)-1])
1275 if id, ok := b.groups[r]; ok {
1276 if from[0] != "und" {
1277 log.Fatalf("region changed unexpectedly: %s -> %s", from, to)
1278 }
1279 likelyRegionGroup[id].lang = uint16(b.langIndex(to[0]))
1280 likelyRegionGroup[id].script = uint16(b.script.index(to[1]))
1281 likelyRegionGroup[id].region = uint16(b.region.index(to[2]))
1282 } else {
1283 regionToOther[r] = append(regionToOther[r], fromTo{from, to})
1284 }
1285 }
1286 }
1287 b.writeType(likelyLangRegion{})
1288 b.writeSlice("likelyScript", likelyScript)
1289
1290 for id := range b.lang.s {
1291 list := langToOther[id]
1292 if len(list) == 1 {
1293 likelyLang[id].region = uint16(b.region.index(list[0].to[2]))
1294 likelyLang[id].script = uint16(b.script.index(list[0].to[1]))
1295 } else if len(list) > 1 {
1296 likelyLang[id].flags = isList
1297 likelyLang[id].region = uint16(len(likelyLangList))
1298 likelyLang[id].script = uint16(len(list))
1299 for _, x := range list {
1300 flags := uint8(0)
1301 if len(x.from) > 1 {
1302 if x.from[1] == x.to[2] {
1303 flags = regionInFrom
1304 } else {
1305 flags = scriptInFrom
1306 }
1307 }
1308 likelyLangList = append(likelyLangList, likelyScriptRegion{
1309 region: uint16(b.region.index(x.to[2])),
1310 script: uint16(b.script.index(x.to[1])),
1311 flags: flags,
1312 })
1313 }
1314 }
1315 }
1316
1317 b.writeType(likelyScriptRegion{})
1318 b.writeSlice("likelyLang", likelyLang)
1319 b.writeSlice("likelyLangList", likelyLangList)
1320
1321 for id := range b.region.s {
1322 list := regionToOther[id]
1323 if len(list) == 1 {
1324 likelyRegion[id].lang = uint16(b.langIndex(list[0].to[0]))
1325 likelyRegion[id].script = uint16(b.script.index(list[0].to[1]))
1326 if len(list[0].from) > 2 {
1327 likelyRegion[id].flags = scriptInFrom
1328 }
1329 } else if len(list) > 1 {
1330 likelyRegion[id].flags = isList
1331 likelyRegion[id].lang = uint16(len(likelyRegionList))
1332 likelyRegion[id].script = uint16(len(list))
1333 for i, x := range list {
1334 if len(x.from) == 2 && i != 0 || i > 0 && len(x.from) != 3 {
1335 log.Fatalf("unspecified script must be first in list: %v at %d", x.from, i)
1336 }
1337 x := likelyLangScript{
1338 lang: uint16(b.langIndex(x.to[0])),
1339 script: uint16(b.script.index(x.to[1])),
1340 }
1341 if len(list[0].from) > 2 {
1342 x.flags = scriptInFrom
1343 }
1344 likelyRegionList = append(likelyRegionList, x)
1345 }
1346 }
1347 }
1348 b.writeType(likelyLangScript{})
1349 b.writeSlice("likelyRegion", likelyRegion)
1350 b.writeSlice("likelyRegionList", likelyRegionList)
1351
1352 b.writeType(likelyTag{})
1353 b.writeSlice("likelyRegionGroup", likelyRegionGroup)
1354 }
1355
1356 func (b *builder) writeRegionInclusionData() {
1357 var (
1358
1359 mm = make(map[int][]index)
1360
1361
1362
1363 containment = make(map[index][]index)
1364 )
1365 for _, g := range b.supp.TerritoryContainment.Group {
1366
1367
1368 if g.Type == "EZ" || g.Type == "UN" {
1369 continue
1370 }
1371 group := b.region.index(g.Type)
1372 groupIdx := b.groups[group]
1373 for _, mem := range strings.Split(g.Contains, " ") {
1374 r := b.region.index(mem)
1375 mm[r] = append(mm[r], groupIdx)
1376 if g, ok := b.groups[r]; ok {
1377 mm[group] = append(mm[group], g)
1378 containment[groupIdx] = append(containment[groupIdx], g)
1379 }
1380 }
1381 }
1382
1383 regionContainment := make([]uint64, len(b.groups))
1384 for _, g := range b.groups {
1385 l := containment[g]
1386
1387
1388 for i := 0; i < len(l); i++ {
1389 l = append(l, containment[l[i]]...)
1390 }
1391
1392
1393 regionContainment[g] = 1 << g
1394 for _, v := range l {
1395 regionContainment[g] |= 1 << v
1396 }
1397 }
1398 b.writeSlice("regionContainment", regionContainment)
1399
1400 regionInclusion := make([]uint8, len(b.region.s))
1401 bvs := make(map[uint64]index)
1402
1403 for r, i := range b.groups {
1404 bv := uint64(1 << i)
1405 for _, g := range mm[r] {
1406 bv |= 1 << g
1407 }
1408 bvs[bv] = i
1409 regionInclusion[r] = uint8(bvs[bv])
1410 }
1411 for r := 1; r < len(b.region.s); r++ {
1412 if _, ok := b.groups[r]; !ok {
1413 bv := uint64(0)
1414 for _, g := range mm[r] {
1415 bv |= 1 << g
1416 }
1417 if bv == 0 {
1418
1419 bv = 1 << b.groups[b.region.index("001")]
1420 }
1421 if _, ok := bvs[bv]; !ok {
1422 bvs[bv] = index(len(bvs))
1423 }
1424 regionInclusion[r] = uint8(bvs[bv])
1425 }
1426 }
1427 b.writeSlice("regionInclusion", regionInclusion)
1428 regionInclusionBits := make([]uint64, len(bvs))
1429 for k, v := range bvs {
1430 regionInclusionBits[v] = uint64(k)
1431 }
1432
1433 regionInclusionNext := []uint8{}
1434 for i := 0; i < len(regionInclusionBits); i++ {
1435 bits := regionInclusionBits[i]
1436 next := bits
1437 for i := uint(0); i < uint(len(b.groups)); i++ {
1438 if bits&(1<<i) != 0 {
1439 next |= regionInclusionBits[i]
1440 }
1441 }
1442 if _, ok := bvs[next]; !ok {
1443 bvs[next] = index(len(bvs))
1444 regionInclusionBits = append(regionInclusionBits, next)
1445 }
1446 regionInclusionNext = append(regionInclusionNext, uint8(bvs[next]))
1447 }
1448 b.writeSlice("regionInclusionBits", regionInclusionBits)
1449 b.writeSlice("regionInclusionNext", regionInclusionNext)
1450 }
1451
// parentRel describes one parent-locale override as assembled by
// writeParents: tags of language lang whose region is listed in fromRegion
// have the parent identified by lang, script and toRegion.
//
// The type is handed to b.writeType, so its field names and their order are
// part of the generated output.
type parentRel struct {
	// lang is the language index of the parent locale.
	lang uint16
	// script is the script index of the parent locale; it stays zero when
	// the parent locale names no script.
	script uint16
	// maxScript equals script when the parent names one and is the index of
	// "Latn" otherwise.
	maxScript uint16
	// toRegion is the region index of the parent locale.
	toRegion uint16
	// fromRegion lists the region indices of the child locales.
	fromRegion []uint16
}
1459
1460 func (b *builder) writeParents() {
1461 b.writeType(parentRel{})
1462
1463 parents := []parentRel{}
1464
1465
1466 n := 0
1467 for _, p := range b.data.Supplemental().ParentLocales.ParentLocale {
1468
1469 if p.Parent == "root" {
1470 continue
1471 }
1472
1473 sub := strings.Split(p.Parent, "_")
1474 parent := parentRel{lang: b.langIndex(sub[0])}
1475 if len(sub) == 2 {
1476
1477
1478 parent.maxScript = uint16(b.script.index("Latn"))
1479 parent.toRegion = uint16(b.region.index(sub[1]))
1480 } else {
1481 parent.script = uint16(b.script.index(sub[1]))
1482 parent.maxScript = parent.script
1483 parent.toRegion = uint16(b.region.index(sub[2]))
1484 }
1485 for _, c := range strings.Split(p.Locales, " ") {
1486 region := b.region.index(c[strings.LastIndex(c, "_")+1:])
1487 parent.fromRegion = append(parent.fromRegion, uint16(region))
1488 }
1489 parents = append(parents, parent)
1490 n += len(parent.fromRegion)
1491 }
1492 b.writeSliceAddSize("parents", n*2, parents)
1493 }
1494
1495 func main() {
1496 gen.Init()
1497
1498 gen.Repackage("gen_common.go", "common.go", "language")
1499
1500 w := gen.NewCodeWriter()
1501 defer w.WriteGoFile("tables.go", "language")
1502
1503 fmt.Fprintln(w, `import "golang.org/x/text/internal/tag"`)
1504
1505 b := newBuilder(w)
1506 gen.WriteCLDRVersion(w)
1507
1508 b.parseIndices()
1509 b.writeType(FromTo{})
1510 b.writeLanguage()
1511 b.writeScript()
1512 b.writeRegion()
1513 b.writeVariant()
1514
1515 b.computeRegionGroups()
1516 b.writeLikelyData()
1517 b.writeRegionInclusionData()
1518 b.writeParents()
1519 }
1520
View as plain text