gen.go

Documentation: golang.org/x/text/feature/plural

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build ignore
     6  
     7  package main
     8  
     9  // This file generates data for the CLDR plural rules, as defined in
    10  //    https://unicode.org/reports/tr35/tr35-numbers.html#Language_Plural_Rules
    11  //
    12  // We assume a slightly simplified grammar:
    13  //
    14  // 		condition     = and_condition ('or' and_condition)* samples
    15  // 		and_condition = relation ('and' relation)*
    16  // 		relation      = expr ('=' | '!=') range_list
    17  // 		expr          = operand ('%' '10' '0'* )?
    18  // 		operand       = 'n' | 'i' | 'f' | 't' | 'v' | 'w'
    19  // 		range_list    = (range | value) (',' range_list)*
    20  // 		range         = value'..'value
    21  // 		value         = digit+
    22  // 		digit         = 0|1|2|3|4|5|6|7|8|9
    23  //
    24  // 		samples       = ('@integer' sampleList)?
    25  // 		                ('@decimal' sampleList)?
    26  // 		sampleList    = sampleRange (',' sampleRange)* (',' ('…'|'...'))?
    27  // 		sampleRange   = decimalValue ('~' decimalValue)?
    28  // 		decimalValue  = value ('.' value)?
    29  //
    30  //		Symbol	Value
    31  //		n	absolute value of the source number (integer and decimals).
    32  //		i	integer digits of n.
    33  //		v	number of visible fraction digits in n, with trailing zeros.
    34  //		w	number of visible fraction digits in n, without trailing zeros.
    35  //		f	visible fractional digits in n, with trailing zeros.
    36  //		t	visible fractional digits in n, without trailing zeros.
    37  //
    38  // The algorithm for which the data is generated is based on the following
    39  // observations:
    40  //
    41  //    - the number of different sets of numbers which the plural rules use to
    42  //      test inclusion is limited,
    43  //    - most numbers that are tested on are < 100
    44  //
    45  // This allows us to define a bitmap for each number < 100 where a bit i
    46  // indicates whether this number is included in some defined set i.
    47  // The function matchPlural in plural.go defines how we can subsequently use
    48  // this data to determine inclusion.
    49  //
    50  // There are a few languages for which this doesn't work: Italian and
    51  // Azerbaijani, which both test against numbers > 100 for ordinals, and Breton,
    52  // which considers whether numbers are multiples of hundreds. The model here
    53  // could be extended to handle Italian and Azerbaijani fairly easily (by
    54  // considering the numbers 100, 200, 300, ..., 800, 900 in addition to the first
    55  // 100), but for now it seems easier to just hard-code these cases.
    56  
    57  import (
    58  	"bufio"
    59  	"bytes"
    60  	"flag"
    61  	"fmt"
    62  	"log"
    63  	"strconv"
    64  	"strings"
    65  
    66  	"golang.org/x/text/internal/gen"
    67  	"golang.org/x/text/internal/language"
    68  	"golang.org/x/text/internal/language/compact"
    69  	"golang.org/x/text/unicode/cldr"
    70  )
    71  
    72  var (
    73  	test = flag.Bool("test", false,
    74  		"test existing tables; can be used to compare web data with package data.")
    75  	outputFile     = flag.String("output", "tables.go", "output file")
    76  	outputTestFile = flag.String("testoutput", "data_test.go", "output file")
    77  
    78  	draft = flag.String("draft",
    79  		"contributed",
    80  		`Minimal draft requirements (approved, contributed, provisional, unconfirmed).`)
    81  )
    82  
    83  func main() {
    84  	gen.Init()
    85  
    86  	const pkg = "plural"
    87  
    88  	gen.Repackage("gen_common.go", "common.go", pkg)
    89  	// Read the CLDR zip file.
    90  	r := gen.OpenCLDRCoreZip()
    91  	defer r.Close()
    92  
    93  	d := &cldr.Decoder{}
    94  	d.SetDirFilter("supplemental", "main")
    95  	d.SetSectionFilter("numbers", "plurals")
    96  	data, err := d.DecodeZip(r)
    97  	if err != nil {
    98  		log.Fatalf("DecodeZip: %v", err)
    99  	}
   100  
   101  	w := gen.NewCodeWriter()
   102  	defer w.WriteGoFile(*outputFile, pkg)
   103  
   104  	gen.WriteCLDRVersion(w)
   105  
   106  	genPlurals(w, data)
   107  
   108  	w = gen.NewCodeWriter()
   109  	defer w.WriteGoFile(*outputTestFile, pkg)
   110  
   111  	genPluralsTests(w, data)
   112  }
   113  
   114  type pluralTest struct {
   115  	locales string   // space-separated list of locales for this test
   116  	form    int      // Use int instead of Form to simplify generation.
   117  	integer []string // Entries of the form \d+ or \d+~\d+
   118  	decimal []string // Entries of the form \f+ or \f+ +~\f+, where f is \d+\.\d+
   119  }
   120  
   121  func genPluralsTests(w *gen.CodeWriter, data *cldr.CLDR) {
   122  	w.WriteType(pluralTest{})
   123  
   124  	for _, plurals := range data.Supplemental().Plurals {
   125  		if plurals.Type == "" {
   126  			// The empty type is reserved for plural ranges.
   127  			continue
   128  		}
   129  		tests := []pluralTest{}
   130  
   131  		for _, pRules := range plurals.PluralRules {
   132  			for _, rule := range pRules.PluralRule {
   133  				test := pluralTest{
   134  					locales: pRules.Locales,
   135  					form:    int(countMap[rule.Count]),
   136  				}
   137  				scan := bufio.NewScanner(strings.NewReader(rule.Data()))
   138  				scan.Split(splitTokens)
   139  				var p *[]string
   140  				for scan.Scan() {
   141  					switch t := scan.Text(); t {
   142  					case "@integer":
   143  						p = &test.integer
   144  					case "@decimal":
   145  						p = &test.decimal
   146  					case ",", "…":
   147  					default:
   148  						if p != nil {
   149  							*p = append(*p, t)
   150  						}
   151  					}
   152  				}
   153  				tests = append(tests, test)
   154  			}
   155  		}
   156  		w.WriteVar(plurals.Type+"Tests", tests)
   157  	}
   158  }
   159  
   160  func genPlurals(w *gen.CodeWriter, data *cldr.CLDR) {
   161  	for _, plurals := range data.Supplemental().Plurals {
   162  		if plurals.Type == "" {
   163  			continue
   164  		}
   165  		// Initialize setMap and inclusionMasks. They are already populated with
   166  		// a few entries to serve as an example and to assign nice numbers to
   167  		// common cases.
   168  
   169  		// setMap contains sets of numbers represented by boolean arrays where
   170  		// a true value for element i means that the number i is included.
   171  		setMap := map[[numN]bool]int{
   172  			// The above init func adds an entry for including all numbers.
   173  			[numN]bool{1: true}: 1, // fix {1} to a nice value
   174  			[numN]bool{2: true}: 2, // fix {2} to a nice value
   175  			[numN]bool{0: true}: 3, // fix {0} to a nice value
   176  		}
   177  
   178  		// inclusionMasks contains bit masks for every number under numN to
   179  		// indicate in which set the number is included. Bit 1 << x will be set
   180  		// if it is included in set x.
   181  		inclusionMasks := [numN]uint64{
   182  			// Note: these entries are not complete: more bits will be set along the way.
   183  			0: 1 << 3,
   184  			1: 1 << 1,
   185  			2: 1 << 2,
   186  		}
   187  
   188  		// Create set {0..99}. We will assign this set the identifier 0.
   189  		var all [numN]bool
   190  		for i := range all {
   191  			// Mark number i as being included in the set (which has identifier 0).
   192  			inclusionMasks[i] |= 1 << 0
   193  			// Mark number i as included in the set.
   194  			all[i] = true
   195  		}
   196  		// Register the identifier for the set.
   197  		setMap[all] = 0
   198  
   199  		rules := []pluralCheck{}
   200  		index := []byte{0}
   201  		langMap := map[compact.ID]byte{0: 0}
   202  
   203  		for _, pRules := range plurals.PluralRules {
   204  			// Parse the rules.
   205  			var conds []orCondition
   206  			for _, rule := range pRules.PluralRule {
   207  				form := countMap[rule.Count]
   208  				conds = parsePluralCondition(conds, rule.Data(), form)
   209  			}
   210  			// Encode the rules.
   211  			for _, c := range conds {
   212  				// If an or condition only has filters, we create an entry for
   213  				// this filter and the set that contains all values.
   214  				empty := true
   215  				for _, b := range c.used {
   216  					empty = empty && !b
   217  				}
   218  				if empty {
   219  					rules = append(rules, pluralCheck{
   220  						cat:   byte(opMod<<opShift) | byte(c.form),
   221  						setID: 0, // all values
   222  					})
   223  					continue
   224  				}
   225  				// We have some entries with values.
   226  				for i, set := range c.set {
   227  					if !c.used[i] {
   228  						continue
   229  					}
   230  					index, ok := setMap[set]
   231  					if !ok {
   232  						index = len(setMap)
   233  						setMap[set] = index
   234  						for i := range inclusionMasks {
   235  							if set[i] {
   236  								inclusionMasks[i] |= 1 << uint64(index)
   237  							}
   238  						}
   239  					}
   240  					rules = append(rules, pluralCheck{
   241  						cat:   byte(i<<opShift | andNext),
   242  						setID: byte(index),
   243  					})
   244  				}
   245  				// Now set the last entry to the plural form the rule matches.
   246  				rules[len(rules)-1].cat &^= formMask
   247  				rules[len(rules)-1].cat |= byte(c.form)
   248  			}
   249  			// Point the relevant locales to the created entries.
   250  			for _, loc := range strings.Split(pRules.Locales, " ") {
   251  				if strings.TrimSpace(loc) == "" {
   252  					continue
   253  				}
   254  				lang, ok := compact.FromTag(language.MustParse(loc))
   255  				if !ok {
   256  					log.Printf("No compact index for locale %q", loc)
   257  				}
   258  				langMap[lang] = byte(len(index) - 1)
   259  			}
   260  			index = append(index, byte(len(rules)))
   261  		}
   262  		w.WriteVar(plurals.Type+"Rules", rules)
   263  		w.WriteVar(plurals.Type+"Index", index)
   264  		// Expand the values: first by using the parent relationship.
   265  		langToIndex := make([]byte, compact.NumCompactTags)
   266  		for i := range langToIndex {
   267  			for p := compact.ID(i); ; p = p.Parent() {
   268  				if x, ok := langMap[p]; ok {
   269  					langToIndex[i] = x
   270  					break
   271  				}
   272  			}
   273  		}
   274  		// Now expand by including entries with identical languages for which
   275  		// one isn't set.
   276  		for i, v := range langToIndex {
   277  			if v == 0 {
   278  				id, _ := compact.FromTag(language.Tag{
   279  					LangID: compact.ID(i).Tag().LangID,
   280  				})
   281  				if p := langToIndex[id]; p != 0 {
   282  					langToIndex[i] = p
   283  				}
   284  			}
   285  		}
   286  		w.WriteVar(plurals.Type+"LangToIndex", langToIndex)
   287  		// Need to convert array to slice because of golang.org/issue/7651.
   288  		// This will allow tables to be dropped when unused. This is especially
   289  		// relevant for the ordinal data, which I suspect won't be used as much.
   290  		w.WriteVar(plurals.Type+"InclusionMasks", inclusionMasks[:])
   291  
   292  		if len(rules) > 0xFF {
   293  			log.Fatalf("Too many entries for rules: %#x", len(rules))
   294  		}
   295  		if len(index) > 0xFF {
   296  			log.Fatalf("Too many entries for index: %#x", len(index))
   297  		}
   298  		if len(setMap) > 64 { // maximum number of bits.
   299  			log.Fatalf("Too many entries for setMap: %d", len(setMap))
   300  		}
   301  		w.WriteComment(
   302  			"Slots used for %s: %X of 0xFF rules; %X of 0xFF indexes; %d of 64 sets",
   303  			plurals.Type, len(rules), len(index), len(setMap))
   304  		// Prevent comment from attaching to the next entry.
   305  		fmt.Fprint(w, "\n\n")
   306  	}
   307  }
   308  
   309  type orCondition struct {
   310  	original string // for debugging
   311  
   312  	form Form
   313  	used [32]bool
   314  	set  [32][numN]bool
   315  }
   316  
   317  func (o *orCondition) add(op opID, mod int, v []int) (ok bool) {
   318  	ok = true
   319  	for _, x := range v {
   320  		if x >= maxMod {
   321  			ok = false
   322  			break
   323  		}
   324  	}
   325  	for i := 0; i < numN; i++ {
   326  		m := i
   327  		if mod != 0 {
   328  			m = i % mod
   329  		}
   330  		if !intIn(m, v) {
   331  			o.set[op][i] = false
   332  		}
   333  	}
   334  	if ok {
   335  		o.used[op] = true
   336  	}
   337  	return ok
   338  }
   339  
   340  func intIn(x int, a []int) bool {
   341  	for _, y := range a {
   342  		if x == y {
   343  			return true
   344  		}
   345  	}
   346  	return false
   347  }
   348  
   349  var operandIndex = map[string]opID{
   350  	"i": opI,
   351  	"n": opN,
   352  	"f": opF,
   353  	"v": opV,
   354  	"w": opW,
   355  }
   356  
   357  // parsePluralCondition parses the condition of a single pluralRule and appends
   358  // the resulting or conditions to conds.
   359  //
   360  // Example rules:
   361  //
   362  //	// Category "one" in English: only allow 1 with no visible fraction
   363  //	i = 1 and v = 0 @integer 1
   364  //
   365  //	// Category "few" in Czech: all numbers with visible fractions
   366  //	v != 0   @decimal ...
   367  //
   368  //	// Category "zero" in Latvian: all multiples of 10 or the numbers 11-19 or
   369  //	// numbers with a fraction 11..19 and no trailing zeros.
   370  //	n % 10 = 0 or n % 100 = 11..19 or v = 2 and f % 100 = 11..19 @integer ...
   371  //
   372  // @integer and @decimal are followed by examples and are not relevant for the
   373  // rule itself. The are used here to signal the termination of the rule.
   374  func parsePluralCondition(conds []orCondition, s string, f Form) []orCondition {
   375  	scan := bufio.NewScanner(strings.NewReader(s))
   376  	scan.Split(splitTokens)
   377  	for {
   378  		cond := orCondition{original: s, form: f}
   379  		// Set all numbers to be allowed for all number classes and restrict
   380  		// from here on.
   381  		for i := range cond.set {
   382  			for j := range cond.set[i] {
   383  				cond.set[i][j] = true
   384  			}
   385  		}
   386  	andLoop:
   387  		for {
   388  			var token string
   389  			scan.Scan() // Must exist.
   390  			switch class := scan.Text(); class {
   391  			case "t":
   392  				class = "w" // equal to w for t == 0
   393  				fallthrough
   394  			case "n", "i", "f", "v", "w":
   395  				op := scanToken(scan)
   396  				opCode := operandIndex[class]
   397  				mod := 0
   398  				if op == "%" {
   399  					opCode |= opMod
   400  
   401  					switch v := scanUint(scan); v {
   402  					case 10, 100:
   403  						mod = v
   404  					case 1000:
   405  						// A more general solution would be to allow checking
   406  						// against multiples of 100 and include entries for the
   407  						// numbers 100..900 in the inclusion masks. At the
   408  						// moment this would only help Azerbaijan and Italian.
   409  
   410  						// Italian doesn't use '%', so this must be Azerbaijan.
   411  						cond.used[opAzerbaijan00s] = true
   412  						return append(conds, cond)
   413  
   414  					case 1000000:
   415  						cond.used[opBretonM] = true
   416  						return append(conds, cond)
   417  
   418  					default:
   419  						log.Fatalf("Modulo value not supported %d", v)
   420  					}
   421  					op = scanToken(scan)
   422  				}
   423  				if op != "=" && op != "!=" {
   424  					log.Fatalf("Unexpected op %q", op)
   425  				}
   426  				if op == "!=" {
   427  					opCode |= opNotEqual
   428  				}
   429  				a := []int{}
   430  				v := scanUint(scan)
   431  				if class == "w" && v != 0 {
   432  					log.Fatalf("Must compare against zero for operand type %q", class)
   433  				}
   434  				token = scanToken(scan)
   435  				for {
   436  					switch token {
   437  					case "..":
   438  						end := scanUint(scan)
   439  						for ; v <= end; v++ {
   440  							a = append(a, v)
   441  						}
   442  						token = scanToken(scan)
   443  					default: // ",", "or", "and", "@..."
   444  						a = append(a, v)
   445  					}
   446  					if token != "," {
   447  						break
   448  					}
   449  					v = scanUint(scan)
   450  					token = scanToken(scan)
   451  				}
   452  				if !cond.add(opCode, mod, a) {
   453  					// Detected large numbers. As we ruled out Azerbaijan, this
   454  					// must be the many rule for Italian ordinals.
   455  					cond.set[opItalian800] = cond.set[opN]
   456  					cond.used[opItalian800] = true
   457  				}
   458  
   459  			case "@integer", "@decimal": // "other" entry: tests only.
   460  				return conds
   461  			default:
   462  				log.Fatalf("Unexpected operand class %q (%s)", class, s)
   463  			}
   464  			switch token {
   465  			case "or":
   466  				conds = append(conds, cond)
   467  				break andLoop
   468  			case "@integer", "@decimal": // examples
   469  				// There is always an example in practice, so we always terminate here.
   470  				if err := scan.Err(); err != nil {
   471  					log.Fatal(err)
   472  				}
   473  				return append(conds, cond)
   474  			case "and":
   475  				// keep accumulating
   476  			default:
   477  				log.Fatalf("Unexpected token %q", token)
   478  			}
   479  		}
   480  	}
   481  }
   482  
   483  func scanToken(scan *bufio.Scanner) string {
   484  	scan.Scan()
   485  	return scan.Text()
   486  }
   487  
   488  func scanUint(scan *bufio.Scanner) int {
   489  	scan.Scan()
   490  	val, err := strconv.ParseUint(scan.Text(), 10, 32)
   491  	if err != nil {
   492  		log.Fatal(err)
   493  	}
   494  	return int(val)
   495  }
   496  
   497  // splitTokens can be used with bufio.Scanner to tokenize CLDR plural rules.
   498  func splitTokens(data []byte, atEOF bool) (advance int, token []byte, err error) {
   499  	condTokens := [][]byte{
   500  		[]byte(".."),
   501  		[]byte(","),
   502  		[]byte("!="),
   503  		[]byte("="),
   504  	}
   505  	advance, token, err = bufio.ScanWords(data, atEOF)
   506  	for _, t := range condTokens {
   507  		if len(t) >= len(token) {
   508  			continue
   509  		}
   510  		switch p := bytes.Index(token, t); {
   511  		case p == -1:
   512  		case p == 0:
   513  			advance = len(t)
   514  			token = token[:len(t)]
   515  			return advance - len(token) + len(t), token[:len(t)], err
   516  		case p < advance:
   517  			// Don't split when "=" overlaps "!=".
   518  			if t[0] == '=' && token[p-1] == '!' {
   519  				continue
   520  			}
   521  			advance = p
   522  			token = token[:p]
   523  		}
   524  	}
   525  	return advance, token, err
   526  }
   527
View as plain text