...

Source file src/golang.org/x/text/internal/language/gen.go

Documentation: golang.org/x/text/internal/language

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build ignore
     6  
     7  // Language tag table generator.
     8  // Data read from the web.
     9  
    10  package main
    11  
    12  import (
    13  	"bufio"
    14  	"flag"
    15  	"fmt"
    16  	"io"
    17  	"log"
    18  	"math"
    19  	"reflect"
    20  	"regexp"
    21  	"sort"
    22  	"strconv"
    23  	"strings"
    24  
    25  	"golang.org/x/text/internal/gen"
    26  	"golang.org/x/text/internal/tag"
    27  	"golang.org/x/text/unicode/cldr"
    28  )
    29  
    30  var (
    31  	test = flag.Bool("test",
    32  		false,
    33  		"test existing tables; can be used to compare web data with package data.")
    34  	outputFile = flag.String("output",
    35  		"tables.go",
    36  		"output file for generated tables")
    37  )
    38  
    39  var comment = []string{
    40  	`
    41  lang holds an alphabetically sorted list of ISO-639 language identifiers.
    42  All entries are 4 bytes. The index of the identifier (divided by 4) is the language tag.
    43  For 2-byte language identifiers, the two successive bytes have the following meaning:
    44      - if the first letter of the 2- and 3-letter ISO codes are the same:
    45        the second and third letter of the 3-letter ISO code.
    46      - otherwise: a 0 and a by 2 bits right-shifted index into altLangISO3.
    47  For 3-byte language identifiers the 4th byte is 0.`,
    48  	`
    49  langNoIndex is a bit vector of all 3-letter language codes that are not used as an index
    50  in lookup tables. The language ids for these language codes are derived directly
    51  from the letters and are not consecutive.`,
    52  	`
    53  altLangISO3 holds an alphabetically sorted list of 3-letter language code alternatives
    54  to 2-letter language codes that cannot be derived using the method described above.
    55  Each 3-letter code is followed by its 1-byte langID.`,
    56  	`
    57  altLangIndex is used to convert indexes in altLangISO3 to langIDs.`,
    58  	`
    59  AliasMap maps langIDs to their suggested replacements.`,
    60  	`
    61  script is an alphabetically sorted list of ISO 15924 codes. The index
    62  of the script in the string, divided by 4, is the internal scriptID.`,
    63  	`
    64  isoRegionOffset needs to be added to the index of regionISO to obtain the regionID
    65  for 2-letter ISO codes. (The first isoRegionOffset regionIDs are reserved for
    66  the UN.M49 codes used for groups.)`,
    67  	`
    68  regionISO holds a list of alphabetically sorted 2-letter ISO region codes.
    69  Each 2-letter codes is followed by two bytes with the following meaning:
    70      - [A-Z}{2}: the first letter of the 2-letter code plus these two
    71                  letters form the 3-letter ISO code.
    72      - 0, n:     index into altRegionISO3.`,
    73  	`
    74  regionTypes defines the status of a region for various standards.`,
    75  	`
    76  m49 maps regionIDs to UN.M49 codes. The first isoRegionOffset entries are
    77  codes indicating collections of regions.`,
    78  	`
    79  m49Index gives indexes into fromM49 based on the three most significant bits
    80  of a 10-bit UN.M49 code. To search an UN.M49 code in fromM49, search in
    81     fromM49[m49Index[msb39(code)]:m49Index[msb3(code)+1]]
    82  for an entry where the first 7 bits match the 7 lsb of the UN.M49 code.
    83  The region code is stored in the 9 lsb of the indexed value.`,
    84  	`
    85  fromM49 contains entries to map UN.M49 codes to regions. See m49Index for details.`,
    86  	`
    87  altRegionISO3 holds a list of 3-letter region codes that cannot be
    88  mapped to 2-letter codes using the default algorithm. This is a short list.`,
    89  	`
    90  altRegionIDs holds a list of regionIDs the positions of which match those
    91  of the 3-letter ISO codes in altRegionISO3.`,
    92  	`
    93  variantNumSpecialized is the number of specialized variants in variants.`,
    94  	`
    95  suppressScript is an index from langID to the dominant script for that language,
    96  if it exists.  If a script is given, it should be suppressed from the language tag.`,
    97  	`
    98  likelyLang is a lookup table, indexed by langID, for the most likely
    99  scripts and regions given incomplete information. If more entries exist for a
   100  given language, region and script are the index and size respectively
   101  of the list in likelyLangList.`,
   102  	`
   103  likelyLangList holds lists info associated with likelyLang.`,
   104  	`
   105  likelyRegion is a lookup table, indexed by regionID, for the most likely
   106  languages and scripts given incomplete information. If more entries exist
   107  for a given regionID, lang and script are the index and size respectively
   108  of the list in likelyRegionList.
   109  TODO: exclude containers and user-definable regions from the list.`,
   110  	`
   111  likelyRegionList holds lists info associated with likelyRegion.`,
   112  	`
   113  likelyScript is a lookup table, indexed by scriptID, for the most likely
   114  languages and regions given a script.`,
   115  	`
   116  nRegionGroups is the number of region groups.`,
   117  	`
   118  regionInclusion maps region identifiers to sets of regions in regionInclusionBits,
   119  where each set holds all groupings that are directly connected in a region
   120  containment graph.`,
   121  	`
   122  regionInclusionBits is an array of bit vectors where every vector represents
   123  a set of region groupings.  These sets are used to compute the distance
   124  between two regions for the purpose of language matching.`,
   125  	`
   126  regionInclusionNext marks, for each entry in regionInclusionBits, the set of
   127  all groups that are reachable from the groups set in the respective entry.`,
   128  }
   129  
   130  // TODO: consider changing some of these structures to tries. This can reduce
   131  // memory, but may increase the need for memory allocations. This could be
   132  // mitigated if we can piggyback on language tags for common cases.
   133  
   134  func failOnError(e error) {
   135  	if e != nil {
   136  		log.Panic(e)
   137  	}
   138  }
   139  
   140  type setType int
   141  
   142  const (
   143  	Indexed setType = 1 + iota // all elements must be of same size
   144  	Linear
   145  )
   146  
   147  type stringSet struct {
   148  	s              []string
   149  	sorted, frozen bool
   150  
   151  	// We often need to update values after the creation of an index is completed.
   152  	// We include a convenience map for keeping track of this.
   153  	update map[string]string
   154  	typ    setType // used for checking.
   155  }
   156  
   157  func (ss *stringSet) clone() stringSet {
   158  	c := *ss
   159  	c.s = append([]string(nil), c.s...)
   160  	return c
   161  }
   162  
   163  func (ss *stringSet) setType(t setType) {
   164  	if ss.typ != t && ss.typ != 0 {
   165  		log.Panicf("type %d cannot be assigned as it was already %d", t, ss.typ)
   166  	}
   167  }
   168  
   169  // parse parses a whitespace-separated string and initializes ss with its
   170  // components.
   171  func (ss *stringSet) parse(s string) {
   172  	scan := bufio.NewScanner(strings.NewReader(s))
   173  	scan.Split(bufio.ScanWords)
   174  	for scan.Scan() {
   175  		ss.add(scan.Text())
   176  	}
   177  }
   178  
   179  func (ss *stringSet) assertChangeable() {
   180  	if ss.frozen {
   181  		log.Panic("attempt to modify a frozen stringSet")
   182  	}
   183  }
   184  
   185  func (ss *stringSet) add(s string) {
   186  	ss.assertChangeable()
   187  	ss.s = append(ss.s, s)
   188  	ss.sorted = ss.frozen
   189  }
   190  
   191  func (ss *stringSet) freeze() {
   192  	ss.compact()
   193  	ss.frozen = true
   194  }
   195  
   196  func (ss *stringSet) compact() {
   197  	if ss.sorted {
   198  		return
   199  	}
   200  	a := ss.s
   201  	sort.Strings(a)
   202  	k := 0
   203  	for i := 1; i < len(a); i++ {
   204  		if a[k] != a[i] {
   205  			a[k+1] = a[i]
   206  			k++
   207  		}
   208  	}
   209  	ss.s = a[:k+1]
   210  	ss.sorted = ss.frozen
   211  }
   212  
   213  type funcSorter struct {
   214  	fn func(a, b string) bool
   215  	sort.StringSlice
   216  }
   217  
   218  func (s funcSorter) Less(i, j int) bool {
   219  	return s.fn(s.StringSlice[i], s.StringSlice[j])
   220  }
   221  
   222  func (ss *stringSet) sortFunc(f func(a, b string) bool) {
   223  	ss.compact()
   224  	sort.Sort(funcSorter{f, sort.StringSlice(ss.s)})
   225  }
   226  
   227  func (ss *stringSet) remove(s string) {
   228  	ss.assertChangeable()
   229  	if i, ok := ss.find(s); ok {
   230  		copy(ss.s[i:], ss.s[i+1:])
   231  		ss.s = ss.s[:len(ss.s)-1]
   232  	}
   233  }
   234  
   235  func (ss *stringSet) replace(ol, nu string) {
   236  	ss.s[ss.index(ol)] = nu
   237  	ss.sorted = ss.frozen
   238  }
   239  
   240  func (ss *stringSet) index(s string) int {
   241  	ss.setType(Indexed)
   242  	i, ok := ss.find(s)
   243  	if !ok {
   244  		if i < len(ss.s) {
   245  			log.Panicf("find: item %q is not in list. Closest match is %q.", s, ss.s[i])
   246  		}
   247  		log.Panicf("find: item %q is not in list", s)
   248  
   249  	}
   250  	return i
   251  }
   252  
   253  func (ss *stringSet) find(s string) (int, bool) {
   254  	ss.compact()
   255  	i := sort.SearchStrings(ss.s, s)
   256  	return i, i != len(ss.s) && ss.s[i] == s
   257  }
   258  
   259  func (ss *stringSet) slice() []string {
   260  	ss.compact()
   261  	return ss.s
   262  }
   263  
   264  func (ss *stringSet) updateLater(v, key string) {
   265  	if ss.update == nil {
   266  		ss.update = map[string]string{}
   267  	}
   268  	ss.update[v] = key
   269  }
   270  
   271  // join joins the string and ensures that all entries are of the same length.
   272  func (ss *stringSet) join() string {
   273  	ss.setType(Indexed)
   274  	n := len(ss.s[0])
   275  	for _, s := range ss.s {
   276  		if len(s) != n {
   277  			log.Panicf("join: not all entries are of the same length: %q", s)
   278  		}
   279  	}
   280  	ss.s = append(ss.s, strings.Repeat("\xff", n))
   281  	return strings.Join(ss.s, "")
   282  }
   283  
// ianaEntry holds information for an entry in the IANA Language Subtag Repository.
// All types use the same entry.
// See http://tools.ietf.org/html/bcp47#section-5.1 for a description of the various
// fields.
//
// Each field is filled by parseRegistry from the registry key named in its
// comment.
type ianaEntry struct {
	typ            string   // "Type:"; starts a new record
	description    []string // "Description:"; one element per occurrence
	scope          string   // "Scope:"
	added          string   // "Added:"
	preferred      string   // "Preferred-Value:"
	deprecated     string   // "Deprecated:"
	suppressScript string   // "Suppress-Script:"
	macro          string   // "Macrolanguage:"
	prefix         []string // "Prefix:"; one element per occurrence
}
   299  
// builder holds the decoded CLDR data, the parsed IANA registry, the string
// indices derived from them, and the writers that receive the generated code.
type builder struct {
	w    *gen.CodeWriter
	hw   io.Writer // MultiWriter for w and w.Hash
	data *cldr.CLDR
	supp *cldr.SupplementalData

	// indices
	locale      stringSet // common locales
	lang        stringSet // canonical language ids (2 or 3 letter ISO codes) with data
	langNoIndex stringSet // 3-letter ISO codes with no associated data
	script      stringSet // 4-letter ISO codes
	region      stringSet // 2-letter ISO or 3-digit UN M49 codes
	variant     stringSet // 4-8-alphanumeric variant code.

	// Region codes that are groups with their corresponding group IDs.
	groups map[int]index

	// langInfo
	registry map[string]*ianaEntry // keyed by subtag or tag; filled by parseRegistry
}

// index is the ID assigned to a region group; see builder.groups.
type index uint
   322  
   323  func newBuilder(w *gen.CodeWriter) *builder {
   324  	r := gen.OpenCLDRCoreZip()
   325  	defer r.Close()
   326  	d := &cldr.Decoder{}
   327  	data, err := d.DecodeZip(r)
   328  	failOnError(err)
   329  	b := builder{
   330  		w:    w,
   331  		hw:   io.MultiWriter(w, w.Hash),
   332  		data: data,
   333  		supp: data.Supplemental(),
   334  	}
   335  	b.parseRegistry()
   336  	return &b
   337  }
   338  
// parseRegistry reads the IANA language subtag registry and stores one
// ianaEntry per subtag or tag in b.registry.
//
// The input is scanned word by word, not line by line. Each iteration treats
// the current word as a key and the following word as its value. For keys
// that are not recognized, the loop continues without advancing, so the word
// just read as "value" is examined as the next candidate key. This lets the
// parser slide over free text until it reaches the next known key.
func (b *builder) parseRegistry() {
	r := gen.OpenIANAFile("assignments/language-subtag-registry")
	defer r.Close()
	b.registry = make(map[string]*ianaEntry)

	scan := bufio.NewScanner(r)
	scan.Split(bufio.ScanWords)
	// record is the entry currently being filled. It is nil until the first
	// "Type:" key is seen.
	// NOTE(review): the field cases below dereference record without a nil
	// check, so they rely on "Type:" preceding every other known key.
	var record *ianaEntry
	// The loop deliberately has no post statement; each path advances the
	// scanner itself (see the function comment).
	for more := scan.Scan(); more; {
		key := scan.Text()
		more = scan.Scan()
		value := scan.Text()
		switch key {
		case "Type:":
			// A new record starts; later keys fill in its fields.
			record = &ianaEntry{typ: value}
		case "Subtag:", "Tag:":
			// A value of the form "x..y" denotes a range; register every
			// subtag from x through y, stepping with inc. All of them share
			// the same record.
			if s := strings.SplitN(value, "..", 2); len(s) > 1 {
				for a := s[0]; a <= s[1]; a = inc(a) {
					b.addToRegistry(a, record)
				}
			} else {
				b.addToRegistry(value, record)
			}
		case "Suppress-Script:":
			record.suppressScript = value
		case "Added:":
			record.added = value
		case "Deprecated:":
			record.deprecated = value
		case "Macrolanguage:":
			record.macro = value
		case "Preferred-Value:":
			record.preferred = value
		case "Prefix:":
			record.prefix = append(record.prefix, value)
		case "Scope:":
			record.scope = value
		case "Description:":
			// A description spans several words. Collect words until one
			// starts with '%' or ends with ':'; that word is left as the
			// current token and becomes the key of the next iteration.
			buf := []byte(value)
			for more = scan.Scan(); more; more = scan.Scan() {
				// Note: this b shadows the builder receiver inside the loop.
				b := scan.Bytes()
				if b[0] == '%' || b[len(b)-1] == ':' {
					break
				}
				buf = append(buf, ' ')
				buf = append(buf, b...)
			}
			record.description = append(record.description, string(buf))
			continue
		default:
			// Unknown key: re-examine the value word as a key.
			continue
		}
		// A known key/value pair was consumed; move on to the next key.
		more = scan.Scan()
	}
	if scan.Err() != nil {
		log.Panic(scan.Err())
	}
}
   397  
   398  func (b *builder) addToRegistry(key string, entry *ianaEntry) {
   399  	if info, ok := b.registry[key]; ok {
   400  		if info.typ != "language" || entry.typ != "extlang" {
   401  			log.Fatalf("parseRegistry: tag %q already exists", key)
   402  		}
   403  	} else {
   404  		b.registry[key] = entry
   405  	}
   406  }
   407  
   408  var commentIndex = make(map[string]string)
   409  
   410  func init() {
   411  	for _, s := range comment {
   412  		key := strings.TrimSpace(strings.SplitN(s, " ", 2)[0])
   413  		commentIndex[key] = s
   414  	}
   415  }
   416  
   417  func (b *builder) comment(name string) {
   418  	if s := commentIndex[name]; len(s) > 0 {
   419  		b.w.WriteComment(s)
   420  	} else {
   421  		fmt.Fprintln(b.w)
   422  	}
   423  }
   424  
   425  func (b *builder) pf(f string, x ...interface{}) {
   426  	fmt.Fprintf(b.hw, f, x...)
   427  	fmt.Fprint(b.hw, "\n")
   428  }
   429  
   430  func (b *builder) p(x ...interface{}) {
   431  	fmt.Fprintln(b.hw, x...)
   432  }
   433  
   434  func (b *builder) addSize(s int) {
   435  	b.w.Size += s
   436  	b.pf("// Size: %d bytes", s)
   437  }
   438  
   439  func (b *builder) writeConst(name string, x interface{}) {
   440  	b.comment(name)
   441  	b.w.WriteConst(name, x)
   442  }
   443  
   444  // writeConsts computes f(v) for all v in values and writes the results
   445  // as constants named _v to a single constant block.
   446  func (b *builder) writeConsts(f func(string) int, values ...string) {
   447  	b.pf("const (")
   448  	for _, v := range values {
   449  		b.pf("\t_%s = %v", v, f(v))
   450  	}
   451  	b.pf(")")
   452  }
   453  
   454  // writeType writes the type of the given value, which must be a struct.
   455  func (b *builder) writeType(value interface{}) {
   456  	b.comment(reflect.TypeOf(value).Name())
   457  	b.w.WriteType(value)
   458  }
   459  
   460  func (b *builder) writeSlice(name string, ss interface{}) {
   461  	b.writeSliceAddSize(name, 0, ss)
   462  }
   463  
   464  func (b *builder) writeSliceAddSize(name string, extraSize int, ss interface{}) {
   465  	b.comment(name)
   466  	b.w.Size += extraSize
   467  	v := reflect.ValueOf(ss)
   468  	t := v.Type().Elem()
   469  	b.pf("// Size: %d bytes, %d elements", v.Len()*int(t.Size())+extraSize, v.Len())
   470  
   471  	fmt.Fprintf(b.w, "var %s = ", name)
   472  	b.w.WriteArray(ss)
   473  	b.p()
   474  }
   475  
// FromTo is one entry of a table written by writeSortedMap: From is the
// index of an element and To is the index of the value recorded for it with
// updateLater.
type FromTo struct {
	From, To uint16
}
   479  
   480  func (b *builder) writeSortedMap(name string, ss *stringSet, index func(s string) uint16) {
   481  	ss.sortFunc(func(a, b string) bool {
   482  		return index(a) < index(b)
   483  	})
   484  	m := []FromTo{}
   485  	for _, s := range ss.s {
   486  		m = append(m, FromTo{index(s), index(ss.update[s])})
   487  	}
   488  	b.writeSlice(name, m)
   489  }
   490  
   491  const base = 'z' - 'a' + 1
   492  
   493  func strToInt(s string) uint {
   494  	v := uint(0)
   495  	for i := 0; i < len(s); i++ {
   496  		v *= base
   497  		v += uint(s[i] - 'a')
   498  	}
   499  	return v
   500  }
   501  
   502  // converts the given integer to the original ASCII string passed to strToInt.
   503  // len(s) must match the number of characters obtained.
   504  func intToStr(v uint, s []byte) {
   505  	for i := len(s) - 1; i >= 0; i-- {
   506  		s[i] = byte(v%base) + 'a'
   507  		v /= base
   508  	}
   509  }
   510  
   511  func (b *builder) writeBitVector(name string, ss []string) {
   512  	vec := make([]uint8, int(math.Ceil(math.Pow(base, float64(len(ss[0])))/8)))
   513  	for _, s := range ss {
   514  		v := strToInt(s)
   515  		vec[v/8] |= 1 << (v % 8)
   516  	}
   517  	b.writeSlice(name, vec)
   518  }
   519  
   520  // TODO: convert this type into a list or two-stage trie.
   521  func (b *builder) writeMapFunc(name string, m map[string]string, f func(string) uint16) {
   522  	b.comment(name)
   523  	v := reflect.ValueOf(m)
   524  	sz := v.Len() * (2 + int(v.Type().Key().Size()))
   525  	for _, k := range m {
   526  		sz += len(k)
   527  	}
   528  	b.addSize(sz)
   529  	keys := []string{}
   530  	b.pf(`var %s = map[string]uint16{`, name)
   531  	for k := range m {
   532  		keys = append(keys, k)
   533  	}
   534  	sort.Strings(keys)
   535  	for _, k := range keys {
   536  		b.pf("\t%q: %v,", k, f(m[k]))
   537  	}
   538  	b.p("}")
   539  }
   540  
   541  func (b *builder) writeMap(name string, m interface{}) {
   542  	b.comment(name)
   543  	v := reflect.ValueOf(m)
   544  	sz := v.Len() * (2 + int(v.Type().Key().Size()) + int(v.Type().Elem().Size()))
   545  	b.addSize(sz)
   546  	f := strings.FieldsFunc(fmt.Sprintf("%#v", m), func(r rune) bool {
   547  		return strings.IndexRune("{}, ", r) != -1
   548  	})
   549  	sort.Strings(f[1:])
   550  	b.pf(`var %s = %s{`, name, f[0])
   551  	for _, kv := range f[1:] {
   552  		b.pf("\t%s,", kv)
   553  	}
   554  	b.p("}")
   555  }
   556  
   557  func (b *builder) langIndex(s string) uint16 {
   558  	if s == "und" {
   559  		return 0
   560  	}
   561  	if i, ok := b.lang.find(s); ok {
   562  		return uint16(i)
   563  	}
   564  	return uint16(strToInt(s)) + uint16(len(b.lang.s))
   565  }
   566  
   567  // inc advances the string to its lexicographical successor.
   568  func inc(s string) string {
   569  	const maxTagLength = 4
   570  	var buf [maxTagLength]byte
   571  	intToStr(strToInt(strings.ToLower(s))+1, buf[:len(s)])
   572  	for i := 0; i < len(s); i++ {
   573  		if s[i] <= 'Z' {
   574  			buf[i] -= 'a' - 'A'
   575  		}
   576  	}
   577  	return string(buf[:len(s)])
   578  }
   579  
   580  func (b *builder) parseIndices() {
   581  	meta := b.supp.Metadata
   582  
   583  	for k, v := range b.registry {
   584  		var ss *stringSet
   585  		switch v.typ {
   586  		case "language":
   587  			if len(k) == 2 || v.suppressScript != "" || v.scope == "special" {
   588  				b.lang.add(k)
   589  				continue
   590  			} else {
   591  				ss = &b.langNoIndex
   592  			}
   593  		case "region":
   594  			ss = &b.region
   595  		case "script":
   596  			ss = &b.script
   597  		case "variant":
   598  			ss = &b.variant
   599  		default:
   600  			continue
   601  		}
   602  		ss.add(k)
   603  	}
   604  	// Include any language for which there is data.
   605  	for _, lang := range b.data.Locales() {
   606  		if x := b.data.RawLDML(lang); false ||
   607  			x.LocaleDisplayNames != nil ||
   608  			x.Characters != nil ||
   609  			x.Delimiters != nil ||
   610  			x.Measurement != nil ||
   611  			x.Dates != nil ||
   612  			x.Numbers != nil ||
   613  			x.Units != nil ||
   614  			x.ListPatterns != nil ||
   615  			x.Collations != nil ||
   616  			x.Segmentations != nil ||
   617  			x.Rbnf != nil ||
   618  			x.Annotations != nil ||
   619  			x.Metadata != nil {
   620  
   621  			from := strings.Split(lang, "_")
   622  			if lang := from[0]; lang != "root" {
   623  				b.lang.add(lang)
   624  			}
   625  		}
   626  	}
   627  	// Include locales for plural rules, which uses a different structure.
   628  	for _, plurals := range b.data.Supplemental().Plurals {
   629  		for _, rules := range plurals.PluralRules {
   630  			for _, lang := range strings.Split(rules.Locales, " ") {
   631  				if lang = strings.Split(lang, "_")[0]; lang != "root" {
   632  					b.lang.add(lang)
   633  				}
   634  			}
   635  		}
   636  	}
   637  	// Include languages in likely subtags.
   638  	for _, m := range b.supp.LikelySubtags.LikelySubtag {
   639  		from := strings.Split(m.From, "_")
   640  		b.lang.add(from[0])
   641  	}
   642  	// Include ISO-639 alpha-3 bibliographic entries.
   643  	for _, a := range meta.Alias.LanguageAlias {
   644  		if a.Reason == "bibliographic" {
   645  			b.langNoIndex.add(a.Type)
   646  		}
   647  	}
   648  	// Include regions in territoryAlias (not all are in the IANA registry!)
   649  	for _, reg := range b.supp.Metadata.Alias.TerritoryAlias {
   650  		if len(reg.Type) == 2 {
   651  			b.region.add(reg.Type)
   652  		}
   653  	}
   654  
   655  	for _, s := range b.lang.s {
   656  		if len(s) == 3 {
   657  			b.langNoIndex.remove(s)
   658  		}
   659  	}
   660  	b.writeConst("NumLanguages", len(b.lang.slice())+len(b.langNoIndex.slice()))
   661  	b.writeConst("NumScripts", len(b.script.slice()))
   662  	b.writeConst("NumRegions", len(b.region.slice()))
   663  
   664  	// Add dummy codes at the start of each list to represent "unspecified".
   665  	b.lang.add("---")
   666  	b.script.add("----")
   667  	b.region.add("---")
   668  
   669  	// common locales
   670  	b.locale.parse(meta.DefaultContent.Locales)
   671  }
   672  
   673  // TODO: region inclusion data will probably not be used in future matchers.
   674  
   675  func (b *builder) computeRegionGroups() {
   676  	b.groups = make(map[int]index)
   677  
   678  	// Create group indices.
   679  	for i := 1; b.region.s[i][0] < 'A'; i++ { // Base M49 indices on regionID.
   680  		b.groups[i] = index(len(b.groups))
   681  	}
   682  	for _, g := range b.supp.TerritoryContainment.Group {
   683  		// Skip UN and EURO zone as they are flattening the containment
   684  		// relationship.
   685  		if g.Type == "EZ" || g.Type == "UN" {
   686  			continue
   687  		}
   688  		group := b.region.index(g.Type)
   689  		if _, ok := b.groups[group]; !ok {
   690  			b.groups[group] = index(len(b.groups))
   691  		}
   692  	}
   693  	if len(b.groups) > 64 {
   694  		log.Fatalf("only 64 groups supported, found %d", len(b.groups))
   695  	}
   696  	b.writeConst("nRegionGroups", len(b.groups))
   697  }
   698  
// langConsts lists the language codes for which writeLanguage emits a
// constant named _<code> holding the code's language index.
var langConsts = []string{
	"af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
	"et", "fa", "fi", "fil", "fr", "gu", "he", "hi", "hr", "hu", "hy", "id", "is",
	"it", "ja", "ka", "kk", "km", "kn", "ko", "ky", "lo", "lt", "lv", "mk", "ml",
	"mn", "mo", "mr", "ms", "mul", "my", "nb", "ne", "nl", "no", "pa", "pl", "pt",
	"ro", "ru", "sh", "si", "sk", "sl", "sq", "sr", "sv", "sw", "ta", "te", "th",
	"tl", "tn", "tr", "uk", "ur", "uz", "vi", "zh", "zu",

	// constants for grandfathered tags (if not already defined)
	"jbo", "ami", "bnn", "hak", "tlh", "lb", "nv", "pwn", "tao", "tay", "tsu",
	"nn", "sfb", "vgt", "sgg", "cmn", "nan", "hsn",
}
   711  
// writeLanguage generates all tables needed for language canonicalization.
//
// In order, it writes: the nonCanonicalUnd constant, one constant per entry
// of langConsts, langPrivateStart and langPrivateEnd, the lang index string,
// langNoIndexOffset, the langNoIndex bit vector, altLangISO3, altLangIndex,
// AliasMap and AliasTypes.
func (b *builder) writeLanguage() {
	meta := b.supp.Metadata

	b.writeConst("nonCanonicalUnd", b.lang.index("und"))
	b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...)
	b.writeConst("langPrivateStart", b.langIndex("qaa"))
	b.writeConst("langPrivateEnd", b.langIndex("qtz"))

	// Get language codes that need to be mapped (overlong 3-letter codes,
	// deprecated 2-letter codes, legacy and grandfathered tags.)
	langAliasMap := stringSet{}
	aliasTypeMap := map[string]AliasType{}

	// altLangISO3 get the alternative ISO3 names that need to be mapped.
	altLangISO3 := stringSet{}
	// Add dummy start to avoid the use of index 0.
	altLangISO3.add("---")
	altLangISO3.updateLater("---", "aa")

	// Work on a copy so that the entries of b.lang itself stay unmodified;
	// lang.update records the 3-letter form of 2-letter codes.
	lang := b.lang.clone()
	for _, a := range meta.Alias.LanguageAlias {
		if a.Replacement == "" {
			a.Replacement = "und"
		}
		// TODO: support mapping to tags
		// Only the part of the replacement before the first "_" is used.
		repl := strings.SplitN(a.Replacement, "_", 2)[0]
		if a.Reason == "overlong" {
			// A 3-letter code that is an overlong form of a 2-letter code:
			// remember it as the 3-letter form of that 2-letter code.
			if len(a.Replacement) == 2 && len(a.Type) == 3 {
				lang.updateLater(a.Replacement, a.Type)
			}
		} else if len(a.Type) <= 3 {
			switch a.Reason {
			case "macrolanguage":
				aliasTypeMap[a.Type] = Macro
			case "deprecated":
				// handled elsewhere
				continue
			case "bibliographic", "legacy":
				if a.Type == "no" {
					continue
				}
				aliasTypeMap[a.Type] = Legacy
			default:
				// An unknown reason means this generator needs updating.
				log.Fatalf("new %s alias: %s", a.Reason, a.Type)
			}
			langAliasMap.add(a.Type)
			langAliasMap.updateLater(a.Type, repl)
		}
	}
	// Manually add the mapping of "nb" (Norwegian) to its macro language.
	// This can be removed if CLDR adopts this change.
	langAliasMap.add("nb")
	langAliasMap.updateLater("nb", "no")
	aliasTypeMap["nb"] = Macro

	for k, v := range b.registry {
		// Also add deprecated values for 3-letter ISO codes, which CLDR omits.
		if v.typ == "language" && v.deprecated != "" && v.preferred != "" {
			langAliasMap.add(k)
			langAliasMap.updateLater(k, v.preferred)
			aliasTypeMap[k] = Deprecated
		}
	}
	// Fix CLDR mappings.
	lang.updateLater("tl", "tgl")
	lang.updateLater("sh", "hbs")
	lang.updateLater("mo", "mol")
	lang.updateLater("no", "nor")
	lang.updateLater("tw", "twi")
	lang.updateLater("nb", "nob")
	lang.updateLater("ak", "aka")
	lang.updateLater("bh", "bih")

	// Ensure that each 2-letter code is matched with a 3-letter code.
	for _, v := range lang.s[1:] {
		s, ok := lang.update[v]
		if !ok {
			// No 3-letter form recorded for v itself; fall back to the one
			// recorded for v's alias replacement, if any.
			if s, ok = lang.update[langAliasMap.update[v]]; !ok {
				continue
			}
			lang.update[v] = s
		}
		// A 3-letter form with a different first letter cannot be stored
		// inline after the 2-letter code and goes into altLangISO3 instead.
		if v[0] != s[0] {
			altLangISO3.add(s)
			altLangISO3.updateLater(s, v)
		}
	}

	// Complete canonicalized language tags.
	lang.freeze()
	for i, v := range lang.s {
		// We can avoid these manual entries by using the IANA registry directly.
		// Seems easier to update the list manually, as changes are rare.
		// The panic in this loop will trigger if we miss an entry.
		add := ""
		if s, ok := lang.update[v]; ok {
			if s[0] == v[0] {
				// Same first letter: append the remaining two letters.
				add = s[1:]
			} else {
				// Otherwise append a 0 byte and the index into altLangISO3.
				add = string([]byte{0, byte(altLangISO3.index(s))})
			}
		} else if len(v) == 3 {
			// 3-letter codes are padded with a single 0 byte.
			add = "\x00"
		} else {
			log.Panicf("no data for long form of %q", v)
		}
		lang.s[i] += add
	}
	b.writeConst("lang", tag.Index(lang.join()))

	b.writeConst("langNoIndexOffset", len(b.lang.s))

	// space of all valid 3-letter language identifiers.
	b.writeBitVector("langNoIndex", b.langNoIndex.slice())

	altLangIndex := []uint16{}
	for i, s := range altLangISO3.slice() {
		// Each entry gets one extra byte: the number of altLangIndex values
		// collected before it. The dummy entry at position 0 contributes no
		// altLangIndex value.
		altLangISO3.s[i] += string([]byte{byte(len(altLangIndex))})
		if i > 0 {
			idx := b.lang.index(altLangISO3.update[s])
			altLangIndex = append(altLangIndex, uint16(idx))
		}
	}
	b.writeConst("altLangISO3", tag.Index(altLangISO3.join()))
	b.writeSlice("altLangIndex", altLangIndex)

	// writeSortedMap reorders langAliasMap.s by language index; AliasTypes is
	// built from that same order so both tables line up element by element.
	b.writeSortedMap("AliasMap", &langAliasMap, b.langIndex)
	types := make([]AliasType, len(langAliasMap.s))
	for i, s := range langAliasMap.s {
		types[i] = aliasTypeMap[s]
	}
	b.writeSlice("AliasTypes", types)
}
   846  
// scriptConsts lists the script codes for which writeScript emits a constant
// named _<code> holding the code's index in the script set.
var scriptConsts = []string{
	"Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy",
	"Zzzz",
}
   851  
   852  func (b *builder) writeScript() {
   853  	b.writeConsts(b.script.index, scriptConsts...)
   854  	b.writeConst("script", tag.Index(b.script.join()))
   855  
   856  	supp := make([]uint8, len(b.lang.slice()))
   857  	for i, v := range b.lang.slice()[1:] {
   858  		if sc := b.registry[v].suppressScript; sc != "" {
   859  			supp[i+1] = uint8(b.script.index(sc))
   860  		}
   861  	}
   862  	b.writeSlice("suppressScript", supp)
   863  
   864  	// There is only one deprecated script in CLDR. This value is hard-coded.
   865  	// We check here if the code must be updated.
   866  	for _, a := range b.supp.Metadata.Alias.ScriptAlias {
   867  		if a.Type != "Qaai" {
   868  			log.Panicf("unexpected deprecated stript %q", a.Type)
   869  		}
   870  	}
   871  }
   872  
   873  func parseM49(s string) int16 {
   874  	if len(s) == 0 {
   875  		return 0
   876  	}
   877  	v, err := strconv.ParseUint(s, 10, 10)
   878  	failOnError(err)
   879  	return int16(v)
   880  }
   881  
// regionConsts lists the region codes for which writeRegion emits a constant
// named _<code> holding the code's index in the region set.
var regionConsts = []string{
	"001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US",
	"ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo.
}
   886  
   887  func (b *builder) writeRegion() {
   888  	b.writeConsts(b.region.index, regionConsts...)
   889  
   890  	isoOffset := b.region.index("AA")
   891  	m49map := make([]int16, len(b.region.slice()))
   892  	fromM49map := make(map[int16]int)
   893  	altRegionISO3 := ""
   894  	altRegionIDs := []uint16{}
   895  
   896  	b.writeConst("isoRegionOffset", isoOffset)
   897  
   898  	// 2-letter region lookup and mapping to numeric codes.
   899  	regionISO := b.region.clone()
   900  	regionISO.s = regionISO.s[isoOffset:]
   901  	regionISO.sorted = false
   902  
   903  	regionTypes := make([]byte, len(b.region.s))
   904  
   905  	// Is the region valid BCP 47?
   906  	for s, e := range b.registry {
   907  		if len(s) == 2 && s == strings.ToUpper(s) {
   908  			i := b.region.index(s)
   909  			for _, d := range e.description {
   910  				if strings.Contains(d, "Private use") {
   911  					regionTypes[i] = iso3166UserAssigned
   912  				}
   913  			}
   914  			regionTypes[i] |= bcp47Region
   915  		}
   916  	}
   917  
   918  	// Is the region a valid ccTLD?
   919  	r := gen.OpenIANAFile("domains/root/db")
   920  	defer r.Close()
   921  
   922  	buf, err := io.ReadAll(r)
   923  	failOnError(err)
   924  	re := regexp.MustCompile(`"/domains/root/db/([a-z]{2}).html"`)
   925  	for _, m := range re.FindAllSubmatch(buf, -1) {
   926  		i := b.region.index(strings.ToUpper(string(m[1])))
   927  		regionTypes[i] |= ccTLD
   928  	}
   929  
   930  	b.writeSlice("regionTypes", regionTypes)
   931  
   932  	iso3Set := make(map[string]int)
   933  	update := func(iso2, iso3 string) {
   934  		i := regionISO.index(iso2)
   935  		if j, ok := iso3Set[iso3]; !ok && iso3[0] == iso2[0] {
   936  			regionISO.s[i] += iso3[1:]
   937  			iso3Set[iso3] = -1
   938  		} else {
   939  			if ok && j >= 0 {
   940  				regionISO.s[i] += string([]byte{0, byte(j)})
   941  			} else {
   942  				iso3Set[iso3] = len(altRegionISO3)
   943  				regionISO.s[i] += string([]byte{0, byte(len(altRegionISO3))})
   944  				altRegionISO3 += iso3
   945  				altRegionIDs = append(altRegionIDs, uint16(isoOffset+i))
   946  			}
   947  		}
   948  	}
   949  	for _, tc := range b.supp.CodeMappings.TerritoryCodes {
   950  		i := regionISO.index(tc.Type) + isoOffset
   951  		if d := m49map[i]; d != 0 {
   952  			log.Panicf("%s found as a duplicate UN.M49 code of %03d", tc.Numeric, d)
   953  		}
   954  		m49 := parseM49(tc.Numeric)
   955  		m49map[i] = m49
   956  		if r := fromM49map[m49]; r == 0 {
   957  			fromM49map[m49] = i
   958  		} else if r != i {
   959  			dep := b.registry[regionISO.s[r-isoOffset]].deprecated
   960  			if t := b.registry[tc.Type]; t != nil && dep != "" && (t.deprecated == "" || t.deprecated > dep) {
   961  				fromM49map[m49] = i
   962  			}
   963  		}
   964  	}
   965  	for _, ta := range b.supp.Metadata.Alias.TerritoryAlias {
   966  		if len(ta.Type) == 3 && ta.Type[0] <= '9' && len(ta.Replacement) == 2 {
   967  			from := parseM49(ta.Type)
   968  			if r := fromM49map[from]; r == 0 {
   969  				fromM49map[from] = regionISO.index(ta.Replacement) + isoOffset
   970  			}
   971  		}
   972  	}
   973  	for _, tc := range b.supp.CodeMappings.TerritoryCodes {
   974  		if len(tc.Alpha3) == 3 {
   975  			update(tc.Type, tc.Alpha3)
   976  		}
   977  	}
   978  	// This entries are not included in territoryCodes. Mostly 3-letter variants
   979  	// of deleted codes and an entry for QU.
   980  	for _, m := range []struct{ iso2, iso3 string }{
   981  		{"CT", "CTE"},
   982  		{"DY", "DHY"},
   983  		{"HV", "HVO"},
   984  		{"JT", "JTN"},
   985  		{"MI", "MID"},
   986  		{"NH", "NHB"},
   987  		{"NQ", "ATN"},
   988  		{"PC", "PCI"},
   989  		{"PU", "PUS"},
   990  		{"PZ", "PCZ"},
   991  		{"RH", "RHO"},
   992  		{"VD", "VDR"},
   993  		{"WK", "WAK"},
   994  		// These three-letter codes are used for others as well.
   995  		{"FQ", "ATF"},
   996  	} {
   997  		update(m.iso2, m.iso3)
   998  	}
   999  	for i, s := range regionISO.s {
  1000  		if len(s) != 4 {
  1001  			regionISO.s[i] = s + "  "
  1002  		}
  1003  	}
  1004  	b.writeConst("regionISO", tag.Index(regionISO.join()))
  1005  	b.writeConst("altRegionISO3", altRegionISO3)
  1006  	b.writeSlice("altRegionIDs", altRegionIDs)
  1007  
  1008  	// Create list of deprecated regions.
  1009  	// TODO: consider inserting SF -> FI. Not included by CLDR, but is the only
  1010  	// Transitionally-reserved mapping not included.
  1011  	regionOldMap := stringSet{}
  1012  	// Include regions in territoryAlias (not all are in the IANA registry!)
  1013  	for _, reg := range b.supp.Metadata.Alias.TerritoryAlias {
  1014  		if len(reg.Type) == 2 && reg.Reason == "deprecated" && len(reg.Replacement) == 2 {
  1015  			regionOldMap.add(reg.Type)
  1016  			regionOldMap.updateLater(reg.Type, reg.Replacement)
  1017  			i, _ := regionISO.find(reg.Type)
  1018  			j, _ := regionISO.find(reg.Replacement)
  1019  			if k := m49map[i+isoOffset]; k == 0 {
  1020  				m49map[i+isoOffset] = m49map[j+isoOffset]
  1021  			}
  1022  		}
  1023  	}
  1024  	b.writeSortedMap("regionOldMap", &regionOldMap, func(s string) uint16 {
  1025  		return uint16(b.region.index(s))
  1026  	})
  1027  	// 3-digit region lookup, groupings.
  1028  	for i := 1; i < isoOffset; i++ {
  1029  		m := parseM49(b.region.s[i])
  1030  		m49map[i] = m
  1031  		fromM49map[m] = i
  1032  	}
  1033  	b.writeSlice("m49", m49map)
  1034  
  1035  	const (
  1036  		searchBits = 7
  1037  		regionBits = 9
  1038  	)
  1039  	if len(m49map) >= 1<<regionBits {
  1040  		log.Fatalf("Maximum number of regions exceeded: %d > %d", len(m49map), 1<<regionBits)
  1041  	}
  1042  	m49Index := [9]int16{}
  1043  	fromM49 := []uint16{}
  1044  	m49 := []int{}
  1045  	for k, _ := range fromM49map {
  1046  		m49 = append(m49, int(k))
  1047  	}
  1048  	sort.Ints(m49)
  1049  	for _, k := range m49[1:] {
  1050  		val := (k & (1<<searchBits - 1)) << regionBits
  1051  		fromM49 = append(fromM49, uint16(val|fromM49map[int16(k)]))
  1052  		m49Index[1:][k>>searchBits] = int16(len(fromM49))
  1053  	}
  1054  	b.writeSlice("m49Index", m49Index)
  1055  	b.writeSlice("fromM49", fromM49)
  1056  }
  1057  
const (
	// TODO: put these lists in regionTypes as user data? Could be used for
	// various optimizations and refinements and could be exposed in the API.
	//
	// Each constant is a space-separated list of 2-letter region codes.
	// NOTE(review): none of them is referenced in this part of the file; the
	// names suggest ISO 3166 exceptionally reserved codes, transitionally
	// reserved codes and deleted codes that CLDR still uses - confirm against
	// the code that reads these constants.
	iso3166Except = "AC CP DG EA EU FX IC SU TA UK"
	iso3166Trans  = "AN BU CS NT TP YU ZR" // SF is not in our set of Regions.
	// DY and RH are actually not deleted, but indeterminately reserved.
	iso3166DelCLDR = "CT DD DY FQ HV JT MI NH NQ PC PU PZ RH VD WK YD"
)
  1066  
  1067  const (
  1068  	iso3166UserAssigned = 1 << iota
  1069  	ccTLD
  1070  	bcp47Region
  1071  )
  1072  
  1073  func find(list []string, s string) int {
  1074  	for i, t := range list {
  1075  		if t == s {
  1076  			return i
  1077  		}
  1078  	}
  1079  	return -1
  1080  }
  1081  
  1082  // writeVariant generates per-variant information and creates a map from variant
  1083  // name to index value. We assign index values such that sorting multiple
  1084  // variants by index value will result in the correct order.
  1085  // There are two types of variants: specialized and general. Specialized variants
  1086  // are only applicable to certain language or language-script pairs. Generalized
  1087  // variants apply to any language. Generalized variants always sort after
  1088  // specialized variants.  We will therefore always assign a higher index value
  1089  // to a generalized variant than any other variant. Generalized variants are
  1090  // sorted alphabetically among themselves.
  1091  // Specialized variants may also sort after other specialized variants. Such
  1092  // variants will be ordered after any of the variants they may follow.
  1093  // We assume that if a variant x is followed by a variant y, then for any prefix
  1094  // p of x, p-x is a prefix of y. This allows us to order tags based on the
  1095  // maximum of the length of any of its prefixes.
  1096  // TODO: it is possible to define a set of Prefix values on variants such that
  1097  // a total order cannot be defined to the point that this algorithm breaks.
  1098  // In other words, we cannot guarantee the same order of variants for the
  1099  // future using the same algorithm or for non-compliant combinations of
  1100  // variants. For this reason, consider using simple alphabetic sorting
  1101  // of variants and ignore Prefix restrictions altogether.
  1102  func (b *builder) writeVariant() {
  1103  	generalized := stringSet{}
  1104  	specialized := stringSet{}
  1105  	specializedExtend := stringSet{}
  1106  	// Collate the variants by type and check assumptions.
  1107  	for _, v := range b.variant.slice() {
  1108  		e := b.registry[v]
  1109  		if len(e.prefix) == 0 {
  1110  			generalized.add(v)
  1111  			continue
  1112  		}
  1113  		c := strings.Split(e.prefix[0], "-")
  1114  		hasScriptOrRegion := false
  1115  		if len(c) > 1 {
  1116  			_, hasScriptOrRegion = b.script.find(c[1])
  1117  			if !hasScriptOrRegion {
  1118  				_, hasScriptOrRegion = b.region.find(c[1])
  1119  
  1120  			}
  1121  		}
  1122  		if len(c) == 1 || len(c) == 2 && hasScriptOrRegion {
  1123  			// Variant is preceded by a language.
  1124  			specialized.add(v)
  1125  			continue
  1126  		}
  1127  		// Variant is preceded by another variant.
  1128  		specializedExtend.add(v)
  1129  		prefix := c[0] + "-"
  1130  		if hasScriptOrRegion {
  1131  			prefix += c[1]
  1132  		}
  1133  		for _, p := range e.prefix {
  1134  			// Verify that the prefix minus the last element is a prefix of the
  1135  			// predecessor element.
  1136  			i := strings.LastIndex(p, "-")
  1137  			pred := b.registry[p[i+1:]]
  1138  			if find(pred.prefix, p[:i]) < 0 {
  1139  				log.Fatalf("prefix %q for variant %q not consistent with predecessor spec", p, v)
  1140  			}
  1141  			// The sorting used below does not work in the general case. It works
  1142  			// if we assume that variants that may be followed by others only have
  1143  			// prefixes of the same length. Verify this.
  1144  			count := strings.Count(p[:i], "-")
  1145  			for _, q := range pred.prefix {
  1146  				if c := strings.Count(q, "-"); c != count {
  1147  					log.Fatalf("variant %q preceding %q has a prefix %q of size %d; want %d", p[i+1:], v, q, c, count)
  1148  				}
  1149  			}
  1150  			if !strings.HasPrefix(p, prefix) {
  1151  				log.Fatalf("prefix %q of variant %q should start with %q", p, v, prefix)
  1152  			}
  1153  		}
  1154  	}
  1155  
  1156  	// Sort extended variants.
  1157  	a := specializedExtend.s
  1158  	less := func(v, w string) bool {
  1159  		// Sort by the maximum number of elements.
  1160  		maxCount := func(s string) (max int) {
  1161  			for _, p := range b.registry[s].prefix {
  1162  				if c := strings.Count(p, "-"); c > max {
  1163  					max = c
  1164  				}
  1165  			}
  1166  			return
  1167  		}
  1168  		if cv, cw := maxCount(v), maxCount(w); cv != cw {
  1169  			return cv < cw
  1170  		}
  1171  		// Sort by name as tie breaker.
  1172  		return v < w
  1173  	}
  1174  	sort.Sort(funcSorter{less, sort.StringSlice(a)})
  1175  	specializedExtend.frozen = true
  1176  
  1177  	// Create index from variant name to index.
  1178  	variantIndex := make(map[string]uint8)
  1179  	add := func(s []string) {
  1180  		for _, v := range s {
  1181  			variantIndex[v] = uint8(len(variantIndex))
  1182  		}
  1183  	}
  1184  	add(specialized.slice())
  1185  	add(specializedExtend.s)
  1186  	numSpecialized := len(variantIndex)
  1187  	add(generalized.slice())
  1188  	if n := len(variantIndex); n > 255 {
  1189  		log.Fatalf("maximum number of variants exceeded: was %d; want <= 255", n)
  1190  	}
  1191  	b.writeMap("variantIndex", variantIndex)
  1192  	b.writeConst("variantNumSpecialized", numSpecialized)
  1193  }
  1194  
// writeLanguageInfo is a placeholder with an empty body: it writes nothing.
// The main function in this file does not call it.
func (b *builder) writeLanguageInfo() {
}
  1197  
// writeLikelyData writes tables that are used both for finding parent relations and for
// language matching.  Each entry contains additional bits to indicate the status of the
// data to know when it cannot be used for parent relations.
//
// The input is the list of CLDR likely subtags (From -> To, with "_" as the
// subtag separator). The tables written are likelyScript, likelyLang,
// likelyLangList, likelyRegion, likelyRegionList and likelyRegionGroup,
// together with the struct types of their elements. The function exits via
// log.Fatalf if an entry does not have the expected shape.
func (b *builder) writeLikelyData() {
	const (
		// isList marks an entry whose two value fields hold an offset into
		// the corresponding list table and the number of entries there.
		isList = 1 << iota
		// scriptInFrom and regionInFrom record which extra subtag was present
		// in the From tag of the likely-subtag entry.
		scriptInFrom
		regionInFrom
	)
	type ( // generated types
		// likelyScriptRegion is the element type of likelyLang (indexed by
		// language) and likelyLangList.
		likelyScriptRegion struct {
			region uint16
			script uint16
			flags  uint8
		}
		// likelyLangScript is the element type of likelyRegion (indexed by
		// region) and likelyRegionList.
		likelyLangScript struct {
			lang   uint16
			script uint16
			flags  uint8
		}
		// likelyLangRegion is the element type of likelyScript (indexed by
		// script).
		likelyLangRegion struct {
			lang   uint16
			region uint16
		}
		// likelyTag is used for getting likely tags for group regions, where
		// the likely region might be a region contained in the group.
		likelyTag struct {
			lang   uint16
			region uint16
			script uint16
		}
	)
	var ( // generated variables
		likelyRegionGroup = make([]likelyTag, len(b.groups))
		likelyLang        = make([]likelyScriptRegion, len(b.lang.s))
		likelyRegion      = make([]likelyLangScript, len(b.region.s))
		likelyScript      = make([]likelyLangRegion, len(b.script.s))
		likelyLangList    = []likelyScriptRegion{}
		likelyRegionList  = []likelyLangScript{}
	)
	// fromTo holds the split subtags of one likely-subtag entry.
	type fromTo struct {
		from, to []string
	}
	// langToOther and regionToOther collect the entries per language index
	// and per region index, in input order.
	langToOther := map[int][]fromTo{}
	regionToOther := map[int][]fromTo{}
	for _, m := range b.supp.LikelySubtags.LikelySubtag {
		from := strings.Split(m.From, "_")
		to := strings.Split(m.To, "_")
		if len(to) != 3 {
			log.Fatalf("invalid number of subtags in %q: found %d, want 3", m.To, len(to))
		}
		if len(from) > 3 {
			log.Fatalf("invalid number of subtags: found %d, want 1-3", len(from))
		}
		if from[0] != to[0] && from[0] != "und" {
			log.Fatalf("unexpected language change in expansion: %s -> %s", from, to)
		}
		if len(from) == 3 {
			if from[2] != to[2] {
				log.Fatalf("unexpected region change in expansion: %s -> %s", from, to)
			}
			if from[0] != "und" {
				log.Fatalf("unexpected fully specified from tag: %s -> %s", from, to)
			}
		}
		// Dispatch on the shape of the From tag:
		//   - a single subtag, or any tag with a language other than "und":
		//     keyed by language (index 0 is used for "und");
		//   - "und" plus a 4-letter second subtag: keyed by script;
		//   - anything else: keyed by the last subtag, taken as a region.
		if len(from) == 1 || from[0] != "und" {
			id := 0
			if from[0] != "und" {
				id = b.lang.index(from[0])
			}
			langToOther[id] = append(langToOther[id], fromTo{from, to})
		} else if len(from) == 2 && len(from[1]) == 4 {
			sid := b.script.index(from[1])
			likelyScript[sid].lang = uint16(b.langIndex(to[0]))
			likelyScript[sid].region = uint16(b.region.index(to[2]))
		} else {
			r := b.region.index(from[len(from)-1])
			// Regions that are groups get a complete tag in
			// likelyRegionGroup; all others are resolved after the loop.
			if id, ok := b.groups[r]; ok {
				// from[0] is always "und" in this branch, so this check
				// cannot fire as the code stands.
				if from[0] != "und" {
					log.Fatalf("region changed unexpectedly: %s -> %s", from, to)
				}
				likelyRegionGroup[id].lang = uint16(b.langIndex(to[0]))
				likelyRegionGroup[id].script = uint16(b.script.index(to[1]))
				likelyRegionGroup[id].region = uint16(b.region.index(to[2]))
			} else {
				regionToOther[r] = append(regionToOther[r], fromTo{from, to})
			}
		}
	}
	b.writeType(likelyLangRegion{})
	b.writeSlice("likelyScript", likelyScript)

	for id := range b.lang.s {
		list := langToOther[id]
		if len(list) == 1 {
			// A single entry is stored inline.
			likelyLang[id].region = uint16(b.region.index(list[0].to[2]))
			likelyLang[id].script = uint16(b.script.index(list[0].to[1]))
		} else if len(list) > 1 {
			// Multiple entries: region holds the offset into likelyLangList
			// and script holds the number of entries.
			likelyLang[id].flags = isList
			likelyLang[id].region = uint16(len(likelyLangList))
			likelyLang[id].script = uint16(len(list))
			for _, x := range list {
				flags := uint8(0)
				if len(x.from) > 1 {
					// The second From subtag is a region if it equals the
					// region of the To tag; otherwise it is taken to be a
					// script.
					if x.from[1] == x.to[2] {
						flags = regionInFrom
					} else {
						flags = scriptInFrom
					}
				}
				likelyLangList = append(likelyLangList, likelyScriptRegion{
					region: uint16(b.region.index(x.to[2])),
					script: uint16(b.script.index(x.to[1])),
					flags:  flags,
				})
			}
		}
	}
	// TODO: merge suppressScript data with this table.
	b.writeType(likelyScriptRegion{})
	b.writeSlice("likelyLang", likelyLang)
	b.writeSlice("likelyLangList", likelyLangList)

	for id := range b.region.s {
		list := regionToOther[id]
		if len(list) == 1 {
			// A single entry is stored inline.
			likelyRegion[id].lang = uint16(b.langIndex(list[0].to[0]))
			likelyRegion[id].script = uint16(b.script.index(list[0].to[1]))
			if len(list[0].from) > 2 {
				likelyRegion[id].flags = scriptInFrom
			}
		} else if len(list) > 1 {
			// Multiple entries: lang holds the offset into likelyRegionList
			// and script holds the number of entries.
			likelyRegion[id].flags = isList
			likelyRegion[id].lang = uint16(len(likelyRegionList))
			likelyRegion[id].script = uint16(len(list))
			for i, x := range list {
				// Every entry after the first must have three From subtags.
				if len(x.from) == 2 && i != 0 || i > 0 && len(x.from) != 3 {
					log.Fatalf("unspecified script must be first in list: %v at %d", x.from, i)
				}
				// From here on x is the table entry; it shadows the loop
				// variable.
				x := likelyLangScript{
					lang:   uint16(b.langIndex(x.to[0])),
					script: uint16(b.script.index(x.to[1])),
				}
				// NOTE(review): this inspects list[0], not the current
				// element, so every entry of the list gets the same flag.
				// If the first entry has no script in its From tag, later
				// entries that do have one are not flagged - confirm that
				// this is intended before changing it, as it alters the
				// generated table.
				if len(list[0].from) > 2 {
					x.flags = scriptInFrom
				}
				likelyRegionList = append(likelyRegionList, x)
			}
		}
	}
	b.writeType(likelyLangScript{})
	b.writeSlice("likelyRegion", likelyRegion)
	b.writeSlice("likelyRegionList", likelyRegionList)

	b.writeType(likelyTag{})
	b.writeSlice("likelyRegionGroup", likelyRegionGroup)
}
  1355  
// writeRegionInclusionData writes the tables that describe how regions and
// region groups contain each other, derived from the CLDR territory
// containment data:
//
//   - regionContainment: per group, a bitmask of the group itself and all
//     groups it contains, directly or indirectly;
//   - regionInclusion: per region, an index into regionInclusionBits;
//   - regionInclusionBits: the distinct bit vectors of groups;
//   - regionInclusionNext: per bit vector, the index of the vector obtained
//     by also including the vectors of the groups set in it.
func (b *builder) writeRegionInclusionData() {
	var (
		// mm maps a region index to the indices of the groups at distance 1:
		// the groups that directly contain the region and, if the region is
		// itself a group, the groups it directly contains.
		mm = make(map[int][]index)

		// containment maps a group index to the groups it directly contains.
		// The transitive closure is computed from it further below.
		containment = make(map[index][]index)
	)
	for _, g := range b.supp.TerritoryContainment.Group {
		// Skip UN and EURO zone as they are flattening the containment
		// relationship.
		if g.Type == "EZ" || g.Type == "UN" {
			continue
		}
		group := b.region.index(g.Type)
		groupIdx := b.groups[group]
		for _, mem := range strings.Split(g.Contains, " ") {
			r := b.region.index(mem)
			mm[r] = append(mm[r], groupIdx)
			// If the member is a group as well, record the relation in the
			// other direction too. This g shadows the outer loop variable.
			if g, ok := b.groups[r]; ok {
				mm[group] = append(mm[group], g)
				containment[groupIdx] = append(containment[groupIdx], g)
			}
		}
	}

	regionContainment := make([]uint64, len(b.groups))
	for _, g := range b.groups {
		l := containment[g]

		// Compute the transitive closure of containment.
		// l grows while it is being traversed; len(l) is re-evaluated on
		// every iteration.
		// NOTE(review): this terminates only if the containment data has no
		// cycles - presumably guaranteed by CLDR, confirm.
		for i := 0; i < len(l); i++ {
			l = append(l, containment[l[i]]...)
		}

		// Compute the bitmask.
		regionContainment[g] = 1 << g
		for _, v := range l {
			regionContainment[g] |= 1 << v
		}
	}
	b.writeSlice("regionContainment", regionContainment)

	regionInclusion := make([]uint8, len(b.region.s))
	// bvs maps each distinct bit vector to its position in
	// regionInclusionBits.
	bvs := make(map[uint64]index)
	// Make the first bitvector positions correspond with the groups.
	for r, i := range b.groups {
		// A group's vector has its own bit set plus the bits of all groups
		// at distance 1.
		bv := uint64(1 << i)
		for _, g := range mm[r] {
			bv |= 1 << g
		}
		bvs[bv] = i
		regionInclusion[r] = uint8(bvs[bv])
	}
	// Regions that are not groups share bit vectors; a new position is
	// allocated the first time a vector is seen.
	for r := 1; r < len(b.region.s); r++ {
		if _, ok := b.groups[r]; !ok {
			bv := uint64(0)
			for _, g := range mm[r] {
				bv |= 1 << g
			}
			if bv == 0 {
				// Pick the world for unspecified regions.
				bv = 1 << b.groups[b.region.index("001")]
			}
			if _, ok := bvs[bv]; !ok {
				bvs[bv] = index(len(bvs))
			}
			regionInclusion[r] = uint8(bvs[bv])
		}
	}
	b.writeSlice("regionInclusion", regionInclusion)
	// Invert bvs into a slice indexed by position.
	regionInclusionBits := make([]uint64, len(bvs))
	for k, v := range bvs {
		regionInclusionBits[v] = uint64(k)
	}
	// Add bit vectors for increasingly large distances until a fixed point is reached.
	// The outer loop also visits the vectors appended during the iteration.
	regionInclusionNext := []uint8{}
	for i := 0; i < len(regionInclusionBits); i++ {
		bits := regionInclusionBits[i]
		next := bits
		// This inner i shadows the outer one and ranges over group indices;
		// regionInclusionBits[i] is the vector of group i because the first
		// positions correspond with the groups.
		for i := uint(0); i < uint(len(b.groups)); i++ {
			if bits&(1<<i) != 0 {
				next |= regionInclusionBits[i]
			}
		}
		if _, ok := bvs[next]; !ok {
			bvs[next] = index(len(bvs))
			regionInclusionBits = append(regionInclusionBits, next)
		}
		regionInclusionNext = append(regionInclusionNext, uint8(bvs[next]))
	}
	b.writeSlice("regionInclusionBits", regionInclusionBits)
	b.writeSlice("regionInclusionNext", regionInclusionNext)
}
  1451  
// parentRel describes one parent locale override as built by writeParents.
// The type itself is also emitted into the generated file via writeType.
type parentRel struct {
	// lang is the language index of the parent locale.
	lang uint16
	// script is the script index of the parent locale, or zero if the parent
	// has no script subtag.
	script uint16
	// maxScript equals script if the parent has a script subtag and is the
	// index of "Latn" otherwise.
	maxScript uint16
	// toRegion is the region index of the parent locale.
	toRegion uint16
	// fromRegion lists the region indices of the locales that have this
	// parent, taken from the last subtag of each such locale.
	fromRegion []uint16
}
  1459  
  1460  func (b *builder) writeParents() {
  1461  	b.writeType(parentRel{})
  1462  
  1463  	parents := []parentRel{}
  1464  
  1465  	// Construct parent overrides.
  1466  	n := 0
  1467  	for _, p := range b.data.Supplemental().ParentLocales.ParentLocale {
  1468  		// Skipping non-standard scripts to root is implemented using addTags.
  1469  		if p.Parent == "root" {
  1470  			continue
  1471  		}
  1472  
  1473  		sub := strings.Split(p.Parent, "_")
  1474  		parent := parentRel{lang: b.langIndex(sub[0])}
  1475  		if len(sub) == 2 {
  1476  			// TODO: check that all undefined scripts are indeed Latn in these
  1477  			// cases.
  1478  			parent.maxScript = uint16(b.script.index("Latn"))
  1479  			parent.toRegion = uint16(b.region.index(sub[1]))
  1480  		} else {
  1481  			parent.script = uint16(b.script.index(sub[1]))
  1482  			parent.maxScript = parent.script
  1483  			parent.toRegion = uint16(b.region.index(sub[2]))
  1484  		}
  1485  		for _, c := range strings.Split(p.Locales, " ") {
  1486  			region := b.region.index(c[strings.LastIndex(c, "_")+1:])
  1487  			parent.fromRegion = append(parent.fromRegion, uint16(region))
  1488  		}
  1489  		parents = append(parents, parent)
  1490  		n += len(parent.fromRegion)
  1491  	}
  1492  	b.writeSliceAddSize("parents", n*2, parents)
  1493  }
  1494  
  1495  func main() {
  1496  	gen.Init()
  1497  
  1498  	gen.Repackage("gen_common.go", "common.go", "language")
  1499  
  1500  	w := gen.NewCodeWriter()
  1501  	defer w.WriteGoFile("tables.go", "language")
  1502  
  1503  	fmt.Fprintln(w, `import "golang.org/x/text/internal/tag"`)
  1504  
  1505  	b := newBuilder(w)
  1506  	gen.WriteCLDRVersion(w)
  1507  
  1508  	b.parseIndices()
  1509  	b.writeType(FromTo{})
  1510  	b.writeLanguage()
  1511  	b.writeScript()
  1512  	b.writeRegion()
  1513  	b.writeVariant()
  1514  	// TODO: b.writeLocale()
  1515  	b.computeRegionGroups()
  1516  	b.writeLikelyData()
  1517  	b.writeRegionInclusionData()
  1518  	b.writeParents()
  1519  }
  1520  

View as plain text