...

Source file src/golang.org/x/text/collate/maketables.go

Documentation: golang.org/x/text/collate

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build ignore
     6  
     7  // Collation table generator.
     8  // Data read from the web.
     9  
    10  package main
    11  
    12  import (
    13  	"archive/zip"
    14  	"bufio"
    15  	"bytes"
    16  	"flag"
    17  	"fmt"
    18  	"io"
    19  	"log"
    20  	"os"
    21  	"regexp"
    22  	"sort"
    23  	"strconv"
    24  	"strings"
    25  	"unicode/utf8"
    26  
    27  	"golang.org/x/text/collate"
    28  	"golang.org/x/text/collate/build"
    29  	"golang.org/x/text/internal/colltab"
    30  	"golang.org/x/text/internal/gen"
    31  	"golang.org/x/text/language"
    32  	"golang.org/x/text/unicode/cldr"
    33  )
    34  
    35  var (
    36  	test = flag.Bool("test", false,
    37  		"test existing tables; can be used to compare web data with package data.")
    38  	short = flag.Bool("short", false, `Use "short" alternatives, when available.`)
    39  	draft = flag.Bool("draft", false, `Use draft versions, when available.`)
    40  	tags  = flag.String("tags", "", "build tags to be included after go:build directive")
    41  	pkg   = flag.String("package", "collate",
    42  		"the name of the package in which the generated file is to be included")
    43  
    44  	tables = flagStringSetAllowAll("tables", "collate", "collate,chars",
    45  		"comma-spearated list of tables to generate.")
    46  	exclude = flagStringSet("exclude", "zh2", "",
    47  		"comma-separated list of languages to exclude.")
    48  	include = flagStringSet("include", "", "",
    49  		"comma-separated list of languages to include. Include trumps exclude.")
    50  	// TODO: Not included: unihan gb2312han zhuyin big5han (for size reasons)
    51  	// TODO: Not included: traditional (buggy for Bengali)
    52  	types = flagStringSetAllowAll("types", "standard,phonebook,phonetic,reformed,pinyin,stroke", "",
    53  		"comma-separated list of types that should be included.")
    54  )
    55  
// stringSet implements an ordered set based on a list.  It implements flag.Value
// to allow a set to be specified as a comma-separated list.
type stringSet struct {
	s        []string   // elements; sorted and free of duplicates once compacted
	allowed  *stringSet // if non-nil, Set rejects values not contained in it
	dirty    bool       // needs compaction if true
	all      bool       // set by Set("all"); makes contains report true for any value
	allowAll bool       // whether Set gives the value "all" its special meaning
}
    65  
    66  func flagStringSet(name, def, allowed, usage string) *stringSet {
    67  	ss := &stringSet{}
    68  	if allowed != "" {
    69  		usage += fmt.Sprintf(" (allowed values: any of %s)", allowed)
    70  		ss.allowed = &stringSet{}
    71  		failOnError(ss.allowed.Set(allowed))
    72  	}
    73  	ss.Set(def)
    74  	flag.Var(ss, name, usage)
    75  	return ss
    76  }
    77  
    78  func flagStringSetAllowAll(name, def, allowed, usage string) *stringSet {
    79  	ss := &stringSet{allowAll: true}
    80  	if allowed == "" {
    81  		flag.Var(ss, name, usage+fmt.Sprintf(` Use "all" to select all.`))
    82  	} else {
    83  		ss.allowed = &stringSet{}
    84  		failOnError(ss.allowed.Set(allowed))
    85  		flag.Var(ss, name, usage+fmt.Sprintf(` (allowed values: "all" or any of %s)`, allowed))
    86  	}
    87  	ss.Set(def)
    88  	return ss
    89  }
    90  
    91  func (ss stringSet) Len() int {
    92  	return len(ss.s)
    93  }
    94  
    95  func (ss stringSet) String() string {
    96  	return strings.Join(ss.s, ",")
    97  }
    98  
    99  func (ss *stringSet) Set(s string) error {
   100  	if ss.allowAll && s == "all" {
   101  		ss.s = nil
   102  		ss.all = true
   103  		return nil
   104  	}
   105  	ss.s = ss.s[:0]
   106  	for _, s := range strings.Split(s, ",") {
   107  		if s := strings.TrimSpace(s); s != "" {
   108  			if ss.allowed != nil && !ss.allowed.contains(s) {
   109  				return fmt.Errorf("unsupported value %q; must be one of %s", s, ss.allowed)
   110  			}
   111  			ss.add(s)
   112  		}
   113  	}
   114  	ss.compact()
   115  	return nil
   116  }
   117  
   118  func (ss *stringSet) add(s string) {
   119  	ss.s = append(ss.s, s)
   120  	ss.dirty = true
   121  }
   122  
   123  func (ss *stringSet) values() []string {
   124  	ss.compact()
   125  	return ss.s
   126  }
   127  
   128  func (ss *stringSet) contains(s string) bool {
   129  	if ss.all {
   130  		return true
   131  	}
   132  	for _, v := range ss.s {
   133  		if v == s {
   134  			return true
   135  		}
   136  	}
   137  	return false
   138  }
   139  
   140  func (ss *stringSet) compact() {
   141  	if !ss.dirty {
   142  		return
   143  	}
   144  	a := ss.s
   145  	sort.Strings(a)
   146  	k := 0
   147  	for i := 1; i < len(a); i++ {
   148  		if a[k] != a[i] {
   149  			a[k+1] = a[i]
   150  			k++
   151  		}
   152  	}
   153  	ss.s = a[:k+1]
   154  	ss.dirty = false
   155  }
   156  
   157  func skipLang(l string) bool {
   158  	if include.Len() > 0 {
   159  		return !include.contains(l)
   160  	}
   161  	return exclude.contains(l)
   162  }
   163  
   164  // altInclude returns a list of alternatives (for the LDML alt attribute)
   165  // in order of preference.  An empty string in this list indicates the
   166  // default entry.
   167  func altInclude() []string {
   168  	l := []string{}
   169  	if *short {
   170  		l = append(l, "short")
   171  	}
   172  	l = append(l, "")
   173  	// TODO: handle draft using cldr.SetDraftLevel
   174  	if *draft {
   175  		l = append(l, "proposed")
   176  	}
   177  	return l
   178  }
   179  
   180  func failOnError(e error) {
   181  	if e != nil {
   182  		log.Panic(e)
   183  	}
   184  }
   185  
   186  func openArchive() *zip.Reader {
   187  	f := gen.OpenCLDRCoreZip()
   188  	buffer, err := io.ReadAll(f)
   189  	f.Close()
   190  	failOnError(err)
   191  	archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
   192  	failOnError(err)
   193  	return archive
   194  }
   195  
   196  // parseUCA parses a Default Unicode Collation Element Table of the format
   197  // specified in https://www.unicode.org/reports/tr10/#File_Format.
   198  // It returns the variable top.
   199  func parseUCA(builder *build.Builder) {
   200  	var r io.ReadCloser
   201  	var err error
   202  	for _, f := range openArchive().File {
   203  		if strings.HasSuffix(f.Name, "allkeys_CLDR.txt") {
   204  			r, err = f.Open()
   205  		}
   206  	}
   207  	if r == nil {
   208  		log.Fatal("File allkeys_CLDR.txt not found in archive.")
   209  	}
   210  	failOnError(err)
   211  	defer r.Close()
   212  	scanner := bufio.NewScanner(r)
   213  	colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
   214  	for i := 1; scanner.Scan(); i++ {
   215  		line := scanner.Text()
   216  		if len(line) == 0 || line[0] == '#' {
   217  			continue
   218  		}
   219  		if line[0] == '@' {
   220  			// parse properties
   221  			switch {
   222  			case strings.HasPrefix(line[1:], "version "):
   223  				a := strings.Split(line[1:], " ")
   224  				if a[1] != gen.UnicodeVersion() {
   225  					log.Fatalf("incompatible version %s; want %s", a[1], gen.UnicodeVersion())
   226  				}
   227  			case strings.HasPrefix(line[1:], "backwards "):
   228  				log.Fatalf("%d: unsupported option backwards", i)
   229  			default:
   230  				log.Printf("%d: unknown option %s", i, line[1:])
   231  			}
   232  		} else {
   233  			// parse entries
   234  			part := strings.Split(line, " ; ")
   235  			if len(part) != 2 {
   236  				log.Fatalf("%d: production rule without ';': %v", i, line)
   237  			}
   238  			lhs := []rune{}
   239  			for _, v := range strings.Split(part[0], " ") {
   240  				if v == "" {
   241  					continue
   242  				}
   243  				lhs = append(lhs, rune(convHex(i, v)))
   244  			}
   245  			var n int
   246  			var vars []int
   247  			rhs := [][]int{}
   248  			for i, m := range colelem.FindAllStringSubmatch(part[1], -1) {
   249  				n += len(m[0])
   250  				elem := []int{}
   251  				for _, h := range strings.Split(m[2], ".") {
   252  					elem = append(elem, convHex(i, h))
   253  				}
   254  				if m[1] == "*" {
   255  					vars = append(vars, i)
   256  				}
   257  				rhs = append(rhs, elem)
   258  			}
   259  			if len(part[1]) < n+3 || part[1][n+1] != '#' {
   260  				log.Fatalf("%d: expected comment; found %s", i, part[1][n:])
   261  			}
   262  			if *test {
   263  				testInput.add(string(lhs))
   264  			}
   265  			failOnError(builder.Add(lhs, rhs, vars))
   266  		}
   267  	}
   268  	if scanner.Err() != nil {
   269  		log.Fatal(scanner.Err())
   270  	}
   271  }
   272  
   273  func convHex(line int, s string) int {
   274  	r, e := strconv.ParseInt(s, 16, 32)
   275  	if e != nil {
   276  		log.Fatalf("%d: %v", line, e)
   277  	}
   278  	return int(r)
   279  }
   280  
// testInput collects the strings whose sort keys testCollator compares. It
// is filled while parsing when the -test flag is set, and by testCollator
// itself.
var testInput = stringSet{}

// charRe matches a hexadecimal XML character reference such as &#x1F; and
// captures its upper-case hex digits.
var charRe = regexp.MustCompile(`&#x([0-9A-F]*);`)

// tagRe matches a self-closing tag such as <name />, capturing the name,
// which may consist of lower-case ASCII letters and underscores. At least
// one space is required before the closing "/>".
var tagRe = regexp.MustCompile(`<([a-z_]*)  */>`)

// mainLocales lists, in order of first appearance, the locales for which
// parseMain has recorded exemplar characters.
var mainLocales = []string{}
   287  
   288  // charSets holds a list of exemplar characters per category.
   289  type charSets map[string][]string
   290  
   291  func (p charSets) fprint(w io.Writer) {
   292  	fmt.Fprintln(w, "[exN]string{")
   293  	for i, k := range []string{"", "contractions", "punctuation", "auxiliary", "currencySymbol", "index"} {
   294  		if set := p[k]; len(set) != 0 {
   295  			fmt.Fprintf(w, "\t\t%d: %q,\n", i, strings.Join(set, " "))
   296  		}
   297  	}
   298  	fmt.Fprintln(w, "\t},")
   299  }
   300  
// localeChars maps a locale tag to the exemplar character sets that
// parseMain recorded for it.
var localeChars = make(map[string]charSets)

// exemplarHeader is the Go source that printExemplarCharacters emits ahead
// of the exemplar character table. It declares the exemplarType constants,
// whose order matches the category order used by charSets.fprint; exN is the
// number of categories and the length of the generated arrays.
const exemplarHeader = `
type exemplarType int
const (
	exCharacters exemplarType = iota
	exContractions
	exPunctuation
	exAuxiliary
	exCurrency
	exIndex
	exN
)
`
   315  
   316  func printExemplarCharacters(w io.Writer) {
   317  	fmt.Fprintln(w, exemplarHeader)
   318  	fmt.Fprintln(w, "var exemplarCharacters = map[string][exN]string{")
   319  	for _, loc := range mainLocales {
   320  		fmt.Fprintf(w, "\t%q: ", loc)
   321  		localeChars[loc].fprint(w)
   322  	}
   323  	fmt.Fprintln(w, "}")
   324  }
   325  
   326  func decodeCLDR(d *cldr.Decoder) *cldr.CLDR {
   327  	r := gen.OpenCLDRCoreZip()
   328  	data, err := d.DecodeZip(r)
   329  	failOnError(err)
   330  	return data
   331  }
   332  
   333  // parseMain parses XML files in the main directory of the CLDR core.zip file.
   334  func parseMain() {
   335  	d := &cldr.Decoder{}
   336  	d.SetDirFilter("main")
   337  	d.SetSectionFilter("characters")
   338  	data := decodeCLDR(d)
   339  	for _, loc := range data.Locales() {
   340  		x := data.RawLDML(loc)
   341  		if skipLang(x.Identity.Language.Type) {
   342  			continue
   343  		}
   344  		if x.Characters != nil {
   345  			x, _ = data.LDML(loc)
   346  			loc = language.Make(loc).String()
   347  			for _, ec := range x.Characters.ExemplarCharacters {
   348  				if ec.Draft != "" {
   349  					continue
   350  				}
   351  				if _, ok := localeChars[loc]; !ok {
   352  					mainLocales = append(mainLocales, loc)
   353  					localeChars[loc] = make(charSets)
   354  				}
   355  				localeChars[loc][ec.Type] = parseCharacters(ec.Data())
   356  			}
   357  		}
   358  	}
   359  }
   360  
   361  func parseCharacters(chars string) []string {
   362  	parseSingle := func(s string) (r rune, tail string, escaped bool) {
   363  		if s[0] == '\\' {
   364  			return rune(s[1]), s[2:], true
   365  		}
   366  		r, sz := utf8.DecodeRuneInString(s)
   367  		return r, s[sz:], false
   368  	}
   369  	chars = strings.TrimSpace(chars)
   370  	if n := len(chars) - 1; chars[n] == ']' && chars[0] == '[' {
   371  		chars = chars[1:n]
   372  	}
   373  	list := []string{}
   374  	var r, last, end rune
   375  	for len(chars) > 0 {
   376  		if chars[0] == '{' { // character sequence
   377  			buf := []rune{}
   378  			for chars = chars[1:]; len(chars) > 0; {
   379  				r, chars, _ = parseSingle(chars)
   380  				if r == '}' {
   381  					break
   382  				}
   383  				if r == ' ' {
   384  					log.Fatalf("space not supported in sequence %q", chars)
   385  				}
   386  				buf = append(buf, r)
   387  			}
   388  			list = append(list, string(buf))
   389  			last = 0
   390  		} else { // single character
   391  			escaped := false
   392  			r, chars, escaped = parseSingle(chars)
   393  			if r != ' ' {
   394  				if r == '-' && !escaped {
   395  					if last == 0 {
   396  						log.Fatal("'-' should be preceded by a character")
   397  					}
   398  					end, chars, _ = parseSingle(chars)
   399  					for ; last <= end; last++ {
   400  						list = append(list, string(last))
   401  					}
   402  					last = 0
   403  				} else {
   404  					list = append(list, string(r))
   405  					last = r
   406  				}
   407  			}
   408  		}
   409  	}
   410  	return list
   411  }
   412  
// fileRe matches a path that contains a collation directory component
// followed by an XML file name, and captures the part between "/collation/"
// and the ".xml" extension.
var fileRe = regexp.MustCompile(`.*/collation/(.*)\.xml`)

// typeMap translates legacy type keys to their BCP47 equivalent.
// parseCollation consults it only for collation types longer than 8 bytes.
var typeMap = map[string]string{
	"phonebook":   "phonebk",
	"traditional": "trad",
}
   420  
   421  // parseCollation parses XML files in the collation directory of the CLDR core.zip file.
   422  func parseCollation(b *build.Builder) {
   423  	d := &cldr.Decoder{}
   424  	d.SetDirFilter("collation")
   425  	data := decodeCLDR(d)
   426  	for _, loc := range data.Locales() {
   427  		x, err := data.LDML(loc)
   428  		failOnError(err)
   429  		if skipLang(x.Identity.Language.Type) {
   430  			continue
   431  		}
   432  		cs := x.Collations.Collation
   433  		sl := cldr.MakeSlice(&cs)
   434  		if len(types.s) == 0 {
   435  			sl.SelectAnyOf("type", x.Collations.Default())
   436  		} else if !types.all {
   437  			sl.SelectAnyOf("type", types.s...)
   438  		}
   439  		sl.SelectOnePerGroup("alt", altInclude())
   440  
   441  		for _, c := range cs {
   442  			id, err := language.Parse(loc)
   443  			if err != nil {
   444  				fmt.Fprintf(os.Stderr, "invalid locale: %q", err)
   445  				continue
   446  			}
   447  			// Support both old- and new-style defaults.
   448  			d := c.Type
   449  			if x.Collations.DefaultCollation == nil {
   450  				d = x.Collations.Default()
   451  			} else {
   452  				d = x.Collations.DefaultCollation.Data()
   453  			}
   454  			// We assume tables are being built either for search or collation,
   455  			// but not both. For search the default is always "search".
   456  			if d != c.Type && c.Type != "search" {
   457  				typ := c.Type
   458  				if len(c.Type) > 8 {
   459  					typ = typeMap[c.Type]
   460  				}
   461  				id, err = id.SetTypeForKey("co", typ)
   462  				failOnError(err)
   463  			}
   464  			t := b.Tailoring(id)
   465  			c.Process(processor{t})
   466  		}
   467  	}
   468  }
   469  
// processor applies the rules of one CLDR collation to a tailoring.
// parseCollation passes it to the Process method of each selected collation.
type processor struct {
	t *build.Tailoring // tailoring that receives the anchors and insertions
}
   473  
   474  func (p processor) Reset(anchor string, before int) (err error) {
   475  	if before != 0 {
   476  		err = p.t.SetAnchorBefore(anchor)
   477  	} else {
   478  		err = p.t.SetAnchor(anchor)
   479  	}
   480  	failOnError(err)
   481  	return nil
   482  }
   483  
   484  func (p processor) Insert(level int, str, context, extend string) error {
   485  	str = context + str
   486  	if *test {
   487  		testInput.add(str)
   488  	}
   489  	// TODO: mimic bug in old maketables: remove.
   490  	err := p.t.Insert(colltab.Level(level-1), str, context+extend)
   491  	failOnError(err)
   492  	return nil
   493  }
   494  
// Index does nothing; the id it receives is not used by this generator.
// NOTE(review): presumably present only to satisfy the interface that the
// collation's Process method expects — confirm against the cldr package.
func (p processor) Index(id string) {
}
   497  
   498  func testCollator(c *collate.Collator) {
   499  	c0 := collate.New(language.Und)
   500  
   501  	// iterator over all characters for all locales and check
   502  	// whether Key is equal.
   503  	buf := collate.Buffer{}
   504  
   505  	// Add all common and not too uncommon runes to the test set.
   506  	for i := rune(0); i < 0x30000; i++ {
   507  		testInput.add(string(i))
   508  	}
   509  	for i := rune(0xE0000); i < 0xF0000; i++ {
   510  		testInput.add(string(i))
   511  	}
   512  	for _, str := range testInput.values() {
   513  		k0 := c0.KeyFromString(&buf, str)
   514  		k := c.KeyFromString(&buf, str)
   515  		if !bytes.Equal(k0, k) {
   516  			failOnError(fmt.Errorf("test:%U: keys differ (%x vs %x)", []rune(str), k0, k))
   517  		}
   518  		buf.Reset()
   519  	}
   520  	fmt.Println("PASS")
   521  }
   522  
   523  func main() {
   524  	gen.Init()
   525  	b := build.NewBuilder()
   526  	parseUCA(b)
   527  	if tables.contains("chars") {
   528  		parseMain()
   529  	}
   530  	parseCollation(b)
   531  
   532  	c, err := b.Build()
   533  	failOnError(err)
   534  
   535  	if *test {
   536  		testCollator(collate.NewFromTable(c))
   537  	} else {
   538  		w := &bytes.Buffer{}
   539  
   540  		gen.WriteUnicodeVersion(w)
   541  		gen.WriteCLDRVersion(w)
   542  
   543  		if tables.contains("collate") {
   544  			_, err = b.Print(w)
   545  			failOnError(err)
   546  		}
   547  		if tables.contains("chars") {
   548  			printExemplarCharacters(w)
   549  		}
   550  		gen.WriteGoFile("tables.go", *pkg, w.Bytes())
   551  	}
   552  }
   553  

View as plain text