...

Source file src/golang.org/x/text/language/gen.go

Documentation: golang.org/x/text/language

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build ignore
     6  
     7  // Language tag table generator.
     8  // Data read from the web.
     9  
    10  package main
    11  
    12  import (
    13  	"flag"
    14  	"fmt"
    15  	"io"
    16  	"log"
    17  	"sort"
    18  	"strconv"
    19  	"strings"
    20  
    21  	"golang.org/x/text/internal/gen"
    22  	"golang.org/x/text/internal/language"
    23  	"golang.org/x/text/unicode/cldr"
    24  )
    25  
    26  var (
    27  	test = flag.Bool("test",
    28  		false,
    29  		"test existing tables; can be used to compare web data with package data.")
    30  	outputFile = flag.String("output",
    31  		"tables.go",
    32  		"output file for generated tables")
    33  )
    34  
    35  func main() {
    36  	gen.Init()
    37  
    38  	w := gen.NewCodeWriter()
    39  	defer w.WriteGoFile("tables.go", "language")
    40  
    41  	b := newBuilder(w)
    42  	gen.WriteCLDRVersion(w)
    43  
    44  	b.writeConstants()
    45  	b.writeMatchData()
    46  }
    47  
    48  type builder struct {
    49  	w    *gen.CodeWriter
    50  	hw   io.Writer // MultiWriter for w and w.Hash
    51  	data *cldr.CLDR
    52  	supp *cldr.SupplementalData
    53  }
    54  
    55  func (b *builder) langIndex(s string) uint16 {
    56  	return uint16(language.MustParseBase(s))
    57  }
    58  
    59  func (b *builder) regionIndex(s string) int {
    60  	return int(language.MustParseRegion(s))
    61  }
    62  
    63  func (b *builder) scriptIndex(s string) int {
    64  	return int(language.MustParseScript(s))
    65  }
    66  
    67  func newBuilder(w *gen.CodeWriter) *builder {
    68  	r := gen.OpenCLDRCoreZip()
    69  	defer r.Close()
    70  	d := &cldr.Decoder{}
    71  	data, err := d.DecodeZip(r)
    72  	if err != nil {
    73  		log.Fatal(err)
    74  	}
    75  	b := builder{
    76  		w:    w,
    77  		hw:   io.MultiWriter(w, w.Hash),
    78  		data: data,
    79  		supp: data.Supplemental(),
    80  	}
    81  	return &b
    82  }
    83  
    84  // writeConsts computes f(v) for all v in values and writes the results
    85  // as constants named _v to a single constant block.
    86  func (b *builder) writeConsts(f func(string) int, values ...string) {
    87  	fmt.Fprintln(b.w, "const (")
    88  	for _, v := range values {
    89  		fmt.Fprintf(b.w, "\t_%s = %v\n", v, f(v))
    90  	}
    91  	fmt.Fprintln(b.w, ")")
    92  }
    93  
    94  // TODO: region inclusion data will probably not be used in future matchers.
    95  
    96  var langConsts = []string{
    97  	"de", "en", "fr", "it", "mo", "no", "nb", "pt", "sh", "mul", "und",
    98  }
    99  
   100  var scriptConsts = []string{
   101  	"Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy",
   102  	"Zzzz",
   103  }
   104  
   105  var regionConsts = []string{
   106  	"001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US",
   107  	"ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo.
   108  }
   109  
   110  func (b *builder) writeConstants() {
   111  	b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...)
   112  	b.writeConsts(b.regionIndex, regionConsts...)
   113  	b.writeConsts(b.scriptIndex, scriptConsts...)
   114  }
   115  
   116  type mutualIntelligibility struct {
   117  	want, have uint16
   118  	distance   uint8
   119  	oneway     bool
   120  }
   121  
   122  type scriptIntelligibility struct {
   123  	wantLang, haveLang     uint16
   124  	wantScript, haveScript uint8
   125  	distance               uint8
   126  	// Always oneway
   127  }
   128  
   129  type regionIntelligibility struct {
   130  	lang     uint16 // compact language id
   131  	script   uint8  // 0 means any
   132  	group    uint8  // 0 means any; if bit 7 is set it means inverse
   133  	distance uint8
   134  	// Always twoway.
   135  }
   136  
   137  // writeMatchData writes tables with languages and scripts for which there is
   138  // mutual intelligibility. The data is based on CLDR's languageMatching data.
   139  // Note that we use a different algorithm than the one defined by CLDR and that
   140  // we slightly modify the data. For example, we convert scores to confidence levels.
   141  // We also drop all region-related data as we use a different algorithm to
   142  // determine region equivalence.
   143  func (b *builder) writeMatchData() {
   144  	lm := b.supp.LanguageMatching.LanguageMatches
   145  	cldr.MakeSlice(&lm).SelectAnyOf("type", "written_new")
   146  
   147  	regionHierarchy := map[string][]string{}
   148  	for _, g := range b.supp.TerritoryContainment.Group {
   149  		regions := strings.Split(g.Contains, " ")
   150  		regionHierarchy[g.Type] = append(regionHierarchy[g.Type], regions...)
   151  	}
   152  	// Regions start at 1, so the slice must be one larger than the number of
   153  	// regions.
   154  	regionToGroups := make([]uint8, language.NumRegions+1)
   155  
   156  	idToIndex := map[string]uint8{}
   157  	for i, mv := range lm[0].MatchVariable {
   158  		if i > 6 {
   159  			log.Fatalf("Too many groups: %d", i)
   160  		}
   161  		idToIndex[mv.Id] = uint8(i + 1)
   162  		// TODO: also handle '-'
   163  		for _, r := range strings.Split(mv.Value, "+") {
   164  			todo := []string{r}
   165  			for k := 0; k < len(todo); k++ {
   166  				r := todo[k]
   167  				regionToGroups[b.regionIndex(r)] |= 1 << uint8(i)
   168  				todo = append(todo, regionHierarchy[r]...)
   169  			}
   170  		}
   171  	}
   172  	b.w.WriteVar("regionToGroups", regionToGroups)
   173  
   174  	// maps language id to in- and out-of-group region.
   175  	paradigmLocales := [][3]uint16{}
   176  	locales := strings.Split(lm[0].ParadigmLocales[0].Locales, " ")
   177  	for i := 0; i < len(locales); i += 2 {
   178  		x := [3]uint16{}
   179  		for j := 0; j < 2; j++ {
   180  			pc := strings.SplitN(locales[i+j], "-", 2)
   181  			x[0] = b.langIndex(pc[0])
   182  			if len(pc) == 2 {
   183  				x[1+j] = uint16(b.regionIndex(pc[1]))
   184  			}
   185  		}
   186  		paradigmLocales = append(paradigmLocales, x)
   187  	}
   188  	b.w.WriteVar("paradigmLocales", paradigmLocales)
   189  
   190  	b.w.WriteType(mutualIntelligibility{})
   191  	b.w.WriteType(scriptIntelligibility{})
   192  	b.w.WriteType(regionIntelligibility{})
   193  
   194  	matchLang := []mutualIntelligibility{}
   195  	matchScript := []scriptIntelligibility{}
   196  	matchRegion := []regionIntelligibility{}
   197  	// Convert the languageMatch entries in lists keyed by desired language.
   198  	for _, m := range lm[0].LanguageMatch {
   199  		// Different versions of CLDR use different separators.
   200  		desired := strings.Replace(m.Desired, "-", "_", -1)
   201  		supported := strings.Replace(m.Supported, "-", "_", -1)
   202  		d := strings.Split(desired, "_")
   203  		s := strings.Split(supported, "_")
   204  		if len(d) != len(s) {
   205  			log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
   206  			continue
   207  		}
   208  		distance, _ := strconv.ParseInt(m.Distance, 10, 8)
   209  		switch len(d) {
   210  		case 2:
   211  			if desired == supported && desired == "*_*" {
   212  				continue
   213  			}
   214  			// language-script pair.
   215  			matchScript = append(matchScript, scriptIntelligibility{
   216  				wantLang:   uint16(b.langIndex(d[0])),
   217  				haveLang:   uint16(b.langIndex(s[0])),
   218  				wantScript: uint8(b.scriptIndex(d[1])),
   219  				haveScript: uint8(b.scriptIndex(s[1])),
   220  				distance:   uint8(distance),
   221  			})
   222  			if m.Oneway != "true" {
   223  				matchScript = append(matchScript, scriptIntelligibility{
   224  					wantLang:   uint16(b.langIndex(s[0])),
   225  					haveLang:   uint16(b.langIndex(d[0])),
   226  					wantScript: uint8(b.scriptIndex(s[1])),
   227  					haveScript: uint8(b.scriptIndex(d[1])),
   228  					distance:   uint8(distance),
   229  				})
   230  			}
   231  		case 1:
   232  			if desired == supported && desired == "*" {
   233  				continue
   234  			}
   235  			if distance == 1 {
   236  				// nb == no is already handled by macro mapping. Check there
   237  				// really is only this case.
   238  				if d[0] != "no" || s[0] != "nb" {
   239  					log.Fatalf("unhandled equivalence %s == %s", s[0], d[0])
   240  				}
   241  				continue
   242  			}
   243  			// TODO: consider dropping oneway field and just doubling the entry.
   244  			matchLang = append(matchLang, mutualIntelligibility{
   245  				want:     uint16(b.langIndex(d[0])),
   246  				have:     uint16(b.langIndex(s[0])),
   247  				distance: uint8(distance),
   248  				oneway:   m.Oneway == "true",
   249  			})
   250  		case 3:
   251  			if desired == supported && desired == "*_*_*" {
   252  				continue
   253  			}
   254  			if desired != supported {
   255  				// This is now supported by CLDR, but only one case, which
   256  				// should already be covered by paradigm locales. For instance,
   257  				// test case "und, en, en-GU, en-IN, en-GB ; en-ZA ; en-GB" in
   258  				// testdata/CLDRLocaleMatcherTest.txt tests this.
   259  				if supported != "en_*_GB" {
   260  					log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
   261  				}
   262  				continue
   263  			}
   264  			ri := regionIntelligibility{
   265  				lang:     b.langIndex(d[0]),
   266  				distance: uint8(distance),
   267  			}
   268  			if d[1] != "*" {
   269  				ri.script = uint8(b.scriptIndex(d[1]))
   270  			}
   271  			switch {
   272  			case d[2] == "*":
   273  				ri.group = 0x80 // not contained in anything
   274  			case strings.HasPrefix(d[2], "$!"):
   275  				ri.group = 0x80
   276  				d[2] = "$" + d[2][len("$!"):]
   277  				fallthrough
   278  			case strings.HasPrefix(d[2], "$"):
   279  				ri.group |= idToIndex[d[2]]
   280  			}
   281  			matchRegion = append(matchRegion, ri)
   282  		default:
   283  			log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
   284  		}
   285  	}
   286  	sort.SliceStable(matchLang, func(i, j int) bool {
   287  		return matchLang[i].distance < matchLang[j].distance
   288  	})
   289  	b.w.WriteComment(`
   290  		matchLang holds pairs of langIDs of base languages that are typically
   291  		mutually intelligible. Each pair is associated with a confidence and
   292  		whether the intelligibility goes one or both ways.`)
   293  	b.w.WriteVar("matchLang", matchLang)
   294  
   295  	b.w.WriteComment(`
   296  		matchScript holds pairs of scriptIDs where readers of one script
   297  		can typically also read the other. Each is associated with a confidence.`)
   298  	sort.SliceStable(matchScript, func(i, j int) bool {
   299  		return matchScript[i].distance < matchScript[j].distance
   300  	})
   301  	b.w.WriteVar("matchScript", matchScript)
   302  
   303  	sort.SliceStable(matchRegion, func(i, j int) bool {
   304  		return matchRegion[i].distance < matchRegion[j].distance
   305  	})
   306  	b.w.WriteVar("matchRegion", matchRegion)
   307  }
   308  

View as plain text