...

Source file src/golang.org/x/text/internal/export/unicode/gen.go

Documentation: golang.org/x/text/internal/export/unicode

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build ignore
     6  
     7  // Unicode table generator.
     8  // Data read from the web.
     9  
    10  package main
    11  
    12  import (
    13  	"flag"
    14  	"fmt"
    15  	"log"
    16  	"os"
    17  	"regexp"
    18  	"sort"
    19  	"strings"
    20  	"unicode"
    21  
    22  	"golang.org/x/text/internal/gen"
    23  	"golang.org/x/text/internal/ucd"
    24  	"golang.org/x/text/unicode/rangetable"
    25  )
    26  
    27  func main() {
    28  	flag.Parse()
    29  	setupOutput()
    30  	loadChars() // always needed
    31  	loadCasefold()
    32  	printCategories()
    33  	printScriptOrProperty(false)
    34  	printScriptOrProperty(true)
    35  	printCases()
    36  	printLatinProperties()
    37  	printCasefold()
    38  	printSizes()
    39  	flushOutput()
    40  }
    41  
    42  func defaultVersion() string {
    43  	if v := os.Getenv("UNICODE_VERSION"); v != "" {
    44  		return v
    45  	}
    46  	return unicode.Version
    47  }
    48  
    49  var tablelist = flag.String("tables",
    50  	"all",
    51  	"comma-separated list of which tables to generate; can be letter")
    52  var scriptlist = flag.String("scripts",
    53  	"all",
    54  	"comma-separated list of which script tables to generate")
    55  var proplist = flag.String("props",
    56  	"all",
    57  	"comma-separated list of which property tables to generate")
    58  var cases = flag.Bool("cases",
    59  	true,
    60  	"generate case tables")
    61  var test = flag.Bool("test",
    62  	false,
    63  	"test existing tables; can be used to compare web data with package data")
    64  
    65  var scriptRe = regexp.MustCompile(`^([0-9A-F]+)(\.\.[0-9A-F]+)? *; ([A-Za-z_]+)$`)
    66  var logger = log.New(os.Stderr, "", log.Lshortfile)
    67  
// output accumulates the generated code; printf, print and println write
// to it and flushOutput writes it out.
var output *gen.CodeWriter

// setupOutput creates the code writer. It must run before anything is printed.
func setupOutput() {
	output = gen.NewCodeWriter()
}

// flushOutput hands the accumulated code to WriteGoFile with the file name
// "tables.go" and the package name "unicode".
func flushOutput() {
	output.WriteGoFile("tables.go", "unicode")
}
    77  
// printf formats like fmt.Printf but writes to the generated-code buffer
// instead of standard output.
func printf(format string, args ...interface{}) {
	fmt.Fprintf(output, format, args...)
}
    81  
// print formats like fmt.Print but writes to the generated-code buffer.
// It shadows the builtin print within this package.
func print(args ...interface{}) {
	fmt.Fprint(output, args...)
}
    85  
// println formats like fmt.Println but writes to the generated-code buffer.
// It shadows the builtin println within this package.
func println(args ...interface{}) {
	fmt.Fprintln(output, args...)
}
    89  
// category is the set of known category names. It starts with the
// one-character merged categories; loadChars adds every two-letter
// category it encounters in the data.
var category = map[string]bool{
	// Nd Lu etc.
	// We use one-character names to identify merged categories
	"L": true, // Lu Ll Lt Lm Lo
	"P": true, // Pc Pd Ps Pe Pi Pf Po
	"M": true, // Mn Mc Me
	"N": true, // Nd Nl No
	"S": true, // Sm Sc Sk So
	"Z": true, // Zs Zl Zp
	"C": true, // Cc Cf Cs Co Cn
}
   101  
// Char describes one code point.
// This contains only the properties we're interested in.
type Char struct {
	codePoint rune   // if zero, this index is not a valid code point.
	category  string // general category name, e.g. "Lu"
	upperCase rune   // simple upper-case mapping; 0 if none was recorded
	lowerCase rune   // simple lower-case mapping; 0 if none was recorded
	titleCase rune   // simple title-case mapping; 0 if none was recorded
	foldCase  rune   // simple case folding
	caseOrbit rune   // next in simple case folding orbit
}
   112  
// MaxChar is the largest code point handled by the generator.
const MaxChar = 0x10FFFF

// chars is indexed by code point; entries never filled in by loadChars
// keep the zero Char (codePoint == 0).
var chars = make([]Char, MaxChar+1)

// scripts maps a script name to the runes read for it from Scripts.txt.
var scripts = make(map[string][]rune)

// props maps a property name to the runes read for it from PropList.txt.
var props = make(map[string][]rune) // a property looks like a script; can share the format
   118  
   119  func allCategories() []string {
   120  	a := make([]string, 0, len(category))
   121  	for k := range category {
   122  		a = append(a, k)
   123  	}
   124  	sort.Strings(a)
   125  	return a
   126  }
   127  
   128  func all(scripts map[string][]rune) []string {
   129  	a := make([]string, 0, len(scripts))
   130  	for k := range scripts {
   131  		a = append(a, k)
   132  	}
   133  	sort.Strings(a)
   134  	return a
   135  }
   136  
   137  func allCatFold(m map[string]map[rune]bool) []string {
   138  	a := make([]string, 0, len(m))
   139  	for k := range m {
   140  		a = append(a, k)
   141  	}
   142  	sort.Strings(a)
   143  	return a
   144  }
   145  
   146  func categoryOp(code rune, class uint8) bool {
   147  	category := chars[code].category
   148  	return len(category) > 0 && category[0] == class
   149  }
   150  
   151  func loadChars() {
   152  	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
   153  		c := Char{codePoint: p.Rune(0)}
   154  
   155  		getRune := func(field int) rune {
   156  			if p.String(field) == "" {
   157  				return 0
   158  			}
   159  			return p.Rune(field)
   160  		}
   161  
   162  		c.category = p.String(ucd.GeneralCategory)
   163  		category[c.category] = true
   164  		switch c.category {
   165  		case "Nd":
   166  			// Decimal digit
   167  			p.Int(ucd.NumericValue)
   168  		case "Lu":
   169  			c.upperCase = getRune(ucd.CodePoint)
   170  			c.lowerCase = getRune(ucd.SimpleLowercaseMapping)
   171  			c.titleCase = getRune(ucd.SimpleTitlecaseMapping)
   172  		case "Ll":
   173  			c.upperCase = getRune(ucd.SimpleUppercaseMapping)
   174  			c.lowerCase = getRune(ucd.CodePoint)
   175  			c.titleCase = getRune(ucd.SimpleTitlecaseMapping)
   176  		case "Lt":
   177  			c.upperCase = getRune(ucd.SimpleUppercaseMapping)
   178  			c.lowerCase = getRune(ucd.SimpleLowercaseMapping)
   179  			c.titleCase = getRune(ucd.CodePoint)
   180  		default:
   181  			c.upperCase = getRune(ucd.SimpleUppercaseMapping)
   182  			c.lowerCase = getRune(ucd.SimpleLowercaseMapping)
   183  			c.titleCase = getRune(ucd.SimpleTitlecaseMapping)
   184  		}
   185  
   186  		chars[c.codePoint] = c
   187  	})
   188  }
   189  
   190  func loadCasefold() {
   191  	ucd.Parse(gen.OpenUCDFile("CaseFolding.txt"), func(p *ucd.Parser) {
   192  		kind := p.String(1)
   193  		if kind != "C" && kind != "S" {
   194  			// Only care about 'common' and 'simple' foldings.
   195  			return
   196  		}
   197  		p1 := p.Rune(0)
   198  		p2 := p.Rune(2)
   199  		chars[p1].foldCase = rune(p2)
   200  	})
   201  }
   202  
// categoryMapping gives a human-readable description for each two-letter
// category name. printCategories adds the description to the doc comment
// of the generated variable when one is available.
var categoryMapping = map[string]string{
	"Lu": "Letter, uppercase",
	"Ll": "Letter, lowercase",
	"Lt": "Letter, titlecase",
	"Lm": "Letter, modifier",
	"Lo": "Letter, other",
	"Mn": "Mark, nonspacing",
	"Mc": "Mark, spacing combining",
	"Me": "Mark, enclosing",
	"Nd": "Number, decimal digit",
	"Nl": "Number, letter",
	"No": "Number, other",
	"Pc": "Punctuation, connector",
	"Pd": "Punctuation, dash",
	"Ps": "Punctuation, open",
	"Pe": "Punctuation, close",
	"Pi": "Punctuation, initial quote",
	"Pf": "Punctuation, final quote",
	"Po": "Punctuation, other",
	"Sm": "Symbol, math",
	"Sc": "Symbol, currency",
	"Sk": "Symbol, modifier",
	"So": "Symbol, other",
	"Zs": "Separator, space",
	"Zl": "Separator, line",
	"Zp": "Separator, paragraph",
	"Cc": "Other, control",
	"Cf": "Other, format",
	"Cs": "Other, surrogate",
	"Co": "Other, private use",
	"Cn": "Other, not assigned",
}
   235  
   236  func printCategories() {
   237  	if *tablelist == "" {
   238  		return
   239  	}
   240  	// Find out which categories to dump
   241  	list := strings.Split(*tablelist, ",")
   242  	if *tablelist == "all" {
   243  		list = allCategories()
   244  	}
   245  	if *test {
   246  		fullCategoryTest(list)
   247  		return
   248  	}
   249  
   250  	println("// Version is the Unicode edition from which the tables are derived.")
   251  	printf("const Version = %q\n\n", gen.UnicodeVersion())
   252  
   253  	if *tablelist == "all" {
   254  		println("// Categories is the set of Unicode category tables.")
   255  		println("var Categories = map[string] *RangeTable {")
   256  		for _, k := range allCategories() {
   257  			printf("\t%q: %s,\n", k, k)
   258  		}
   259  		print("}\n\n")
   260  	}
   261  
   262  	decl := make(sort.StringSlice, len(list))
   263  	ndecl := 0
   264  	for _, name := range list {
   265  		if _, ok := category[name]; !ok {
   266  			logger.Fatal("unknown category", name)
   267  		}
   268  		// We generate an UpperCase name to serve as concise documentation and an _UnderScored
   269  		// name to store the data. This stops godoc dumping all the tables but keeps them
   270  		// available to clients.
   271  		// Cases deserving special comments
   272  		varDecl := ""
   273  		switch name {
   274  		case "C":
   275  			varDecl = "\tOther = _C;	// Other/C is the set of Unicode control and special characters, category C.\n"
   276  			varDecl += "\tC = _C\n"
   277  		case "L":
   278  			varDecl = "\tLetter = _L;	// Letter/L is the set of Unicode letters, category L.\n"
   279  			varDecl += "\tL = _L\n"
   280  		case "M":
   281  			varDecl = "\tMark = _M;	// Mark/M is the set of Unicode mark characters, category M.\n"
   282  			varDecl += "\tM = _M\n"
   283  		case "N":
   284  			varDecl = "\tNumber = _N;	// Number/N is the set of Unicode number characters, category N.\n"
   285  			varDecl += "\tN = _N\n"
   286  		case "P":
   287  			varDecl = "\tPunct = _P;	// Punct/P is the set of Unicode punctuation characters, category P.\n"
   288  			varDecl += "\tP = _P\n"
   289  		case "S":
   290  			varDecl = "\tSymbol = _S;	// Symbol/S is the set of Unicode symbol characters, category S.\n"
   291  			varDecl += "\tS = _S\n"
   292  		case "Z":
   293  			varDecl = "\tSpace = _Z;	// Space/Z is the set of Unicode space characters, category Z.\n"
   294  			varDecl += "\tZ = _Z\n"
   295  		case "Nd":
   296  			varDecl = "\tDigit = _Nd;	// Digit is the set of Unicode characters with the \"decimal digit\" property.\n"
   297  		case "Lu":
   298  			varDecl = "\tUpper = _Lu;	// Upper is the set of Unicode upper case letters.\n"
   299  		case "Ll":
   300  			varDecl = "\tLower = _Ll;	// Lower is the set of Unicode lower case letters.\n"
   301  		case "Lt":
   302  			varDecl = "\tTitle = _Lt;	// Title is the set of Unicode title case letters.\n"
   303  		}
   304  		if len(name) > 1 {
   305  			desc, ok := categoryMapping[name]
   306  			if ok {
   307  				varDecl += fmt.Sprintf(
   308  					"\t%s = _%s;	// %s is the set of Unicode characters in category %s (%s).\n",
   309  					name, name, name, name, desc)
   310  			} else {
   311  				varDecl += fmt.Sprintf(
   312  					"\t%s = _%s;	// %s is the set of Unicode characters in category %s.\n",
   313  					name, name, name, name)
   314  			}
   315  		}
   316  		decl[ndecl] = varDecl
   317  		ndecl++
   318  		if len(name) == 1 { // unified categories
   319  			dumpRange(
   320  				"_"+name,
   321  				func(code rune) bool { return categoryOp(code, name[0]) })
   322  			continue
   323  		}
   324  		dumpRange("_"+name,
   325  			func(code rune) bool { return chars[code].category == name })
   326  	}
   327  	decl.Sort()
   328  	println("// These variables have type *RangeTable.")
   329  	println("var (")
   330  	for _, d := range decl {
   331  		print(d)
   332  	}
   333  	print(")\n\n")
   334  }
   335  
   336  type Op func(code rune) bool
   337  
   338  func dumpRange(name string, inCategory Op) {
   339  	runes := []rune{}
   340  	for i := range chars {
   341  		r := rune(i)
   342  		if inCategory(r) {
   343  			runes = append(runes, r)
   344  		}
   345  	}
   346  	printRangeTable(name, runes)
   347  }
   348  
   349  func printRangeTable(name string, runes []rune) {
   350  	rt := rangetable.New(runes...)
   351  	printf("var %s = &RangeTable{\n", name)
   352  	println("\tR16: []Range16{")
   353  	for _, r := range rt.R16 {
   354  		printf("\t\t{%#04x, %#04x, %d},\n", r.Lo, r.Hi, r.Stride)
   355  		range16Count++
   356  	}
   357  	println("\t},")
   358  	if len(rt.R32) > 0 {
   359  		println("\tR32: []Range32{")
   360  		for _, r := range rt.R32 {
   361  			printf("\t\t{%#x, %#x, %d},\n", r.Lo, r.Hi, r.Stride)
   362  			range32Count++
   363  		}
   364  		println("\t},")
   365  	}
   366  	if rt.LatinOffset > 0 {
   367  		printf("\tLatinOffset: %d,\n", rt.LatinOffset)
   368  	}
   369  	printf("}\n\n")
   370  }
   371  
   372  func fullCategoryTest(list []string) {
   373  	for _, name := range list {
   374  		if _, ok := category[name]; !ok {
   375  			logger.Fatal("unknown category", name)
   376  		}
   377  		r, ok := unicode.Categories[name]
   378  		if !ok && len(name) > 1 {
   379  			logger.Fatalf("unknown table %q", name)
   380  		}
   381  		if len(name) == 1 {
   382  			verifyRange(name, func(code rune) bool { return categoryOp(code, name[0]) }, r)
   383  		} else {
   384  			verifyRange(
   385  				name,
   386  				func(code rune) bool { return chars[code].category == name },
   387  				r)
   388  		}
   389  	}
   390  }
   391  
   392  func verifyRange(name string, inCategory Op, table *unicode.RangeTable) {
   393  	count := 0
   394  	for j := range chars {
   395  		i := rune(j)
   396  		web := inCategory(i)
   397  		pkg := unicode.Is(table, i)
   398  		if web != pkg {
   399  			fmt.Fprintf(os.Stderr, "%s: %U: web=%t pkg=%t\n", name, i, web, pkg)
   400  			count++
   401  			if count > 10 {
   402  				break
   403  			}
   404  		}
   405  	}
   406  }
   407  
   408  func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]rune) {
   409  	for _, name := range list {
   410  		if _, ok := scripts[name]; !ok {
   411  			logger.Fatal("unknown script", name)
   412  		}
   413  		_, ok := installed[name]
   414  		if !ok {
   415  			logger.Fatal("unknown table", name)
   416  		}
   417  		for _, r := range scripts[name] {
   418  			if !unicode.Is(installed[name], rune(r)) {
   419  				fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name)
   420  			}
   421  		}
   422  	}
   423  }
   424  
// deprecatedAliases maps a property name to an additional name under which
// printScriptOrProperty also emits the same table.
var deprecatedAliases = map[string]string{
	"Sentence_Terminal": "STerm",
}
   428  
   429  // PropList.txt has the same format as Scripts.txt so we can share its parser.
   430  func printScriptOrProperty(doProps bool) {
   431  	flaglist := *scriptlist
   432  	file := "Scripts.txt"
   433  	table := scripts
   434  	installed := unicode.Scripts
   435  	if doProps {
   436  		flaglist = *proplist
   437  		file = "PropList.txt"
   438  		table = props
   439  		installed = unicode.Properties
   440  	}
   441  	if flaglist == "" {
   442  		return
   443  	}
   444  	ucd.Parse(gen.OpenUCDFile(file), func(p *ucd.Parser) {
   445  		name := p.String(1)
   446  		table[name] = append(table[name], p.Rune(0))
   447  	})
   448  	// Find out which scripts to dump
   449  	list := strings.Split(flaglist, ",")
   450  	if flaglist == "all" {
   451  		list = all(table)
   452  	}
   453  	if *test {
   454  		fullScriptTest(list, installed, table)
   455  		return
   456  	}
   457  
   458  	if flaglist == "all" {
   459  		if doProps {
   460  			println("// Properties is the set of Unicode property tables.")
   461  			println("var Properties = map[string] *RangeTable{")
   462  		} else {
   463  			println("// Scripts is the set of Unicode script tables.")
   464  			println("var Scripts = map[string] *RangeTable{")
   465  		}
   466  		for _, k := range all(table) {
   467  			printf("\t%q: %s,\n", k, k)
   468  			if alias, ok := deprecatedAliases[k]; ok {
   469  				printf("\t%q: %s,\n", alias, k)
   470  			}
   471  		}
   472  		print("}\n\n")
   473  	}
   474  
   475  	decl := make(sort.StringSlice, len(list)+len(deprecatedAliases))
   476  	ndecl := 0
   477  	for _, name := range list {
   478  		if doProps {
   479  			decl[ndecl] = fmt.Sprintf(
   480  				"\t%s = _%s;\t// %s is the set of Unicode characters with property %s.\n",
   481  				name, name, name, name)
   482  		} else {
   483  			decl[ndecl] = fmt.Sprintf(
   484  				"\t%s = _%s;\t// %s is the set of Unicode characters in script %s.\n",
   485  				name, name, name, name)
   486  		}
   487  		ndecl++
   488  		if alias, ok := deprecatedAliases[name]; ok {
   489  			decl[ndecl] = fmt.Sprintf(
   490  				"\t%[1]s = _%[2]s;\t// %[1]s is an alias for %[2]s.\n",
   491  				alias, name)
   492  			ndecl++
   493  		}
   494  		printRangeTable("_"+name, table[name])
   495  	}
   496  	decl.Sort()
   497  	println("// These variables have type *RangeTable.")
   498  	println("var (")
   499  	for _, d := range decl {
   500  		print(d)
   501  	}
   502  	print(")\n\n")
   503  }
   504  
// Case states of a code point, stored in caseState._case.
// CaseUpper, CaseLower and CaseTitle are the distinct bits 1, 2 and 4.
const (
	CaseUpper = 1 << iota
	CaseLower
	CaseTitle
	CaseNone    = 0  // must be zero
	CaseMissing = -1 // character not present; not a valid case state
)
   512  
// caseState is the case information of one code point in the form needed
// to build case ranges: its case and the distance to each of its mappings.
type caseState struct {
	point        rune // the code point itself
	_case        int  // one of the Case* constants
	deltaToUpper rune // upper-case mapping minus point; 0 if there is no mapping
	deltaToLower rune // lower-case mapping minus point; 0 if there is no mapping
	deltaToTitle rune // title-case mapping minus point; 0 if there is no mapping
}
   520  
   521  // Is d a continuation of the state of c?
   522  func (c *caseState) adjacent(d *caseState) bool {
   523  	if d.point < c.point {
   524  		c, d = d, c
   525  	}
   526  	switch {
   527  	case d.point != c.point+1: // code points not adjacent (shouldn't happen)
   528  		return false
   529  	case d._case != c._case: // different cases
   530  		return c.upperLowerAdjacent(d)
   531  	case c._case == CaseNone:
   532  		return false
   533  	case c._case == CaseMissing:
   534  		return false
   535  	case d.deltaToUpper != c.deltaToUpper:
   536  		return false
   537  	case d.deltaToLower != c.deltaToLower:
   538  		return false
   539  	case d.deltaToTitle != c.deltaToTitle:
   540  		return false
   541  	}
   542  	return true
   543  }
   544  
   545  // Is d the same as c, but opposite in upper/lower case? this would make it
   546  // an element of an UpperLower sequence.
   547  func (c *caseState) upperLowerAdjacent(d *caseState) bool {
   548  	// check they're a matched case pair.  we know they have adjacent values
   549  	switch {
   550  	case c._case == CaseUpper && d._case != CaseLower:
   551  		return false
   552  	case c._case == CaseLower && d._case != CaseUpper:
   553  		return false
   554  	}
   555  	// matched pair (at least in upper/lower).  make the order Upper Lower
   556  	if c._case == CaseLower {
   557  		c, d = d, c
   558  	}
   559  	// for an Upper Lower sequence the deltas have to be in order
   560  	//	c: 0 1 0
   561  	//	d: -1 0 -1
   562  	switch {
   563  	case c.deltaToUpper != 0:
   564  		return false
   565  	case c.deltaToLower != 1:
   566  		return false
   567  	case c.deltaToTitle != 0:
   568  		return false
   569  	case d.deltaToUpper != -1:
   570  		return false
   571  	case d.deltaToLower != 0:
   572  		return false
   573  	case d.deltaToTitle != -1:
   574  		return false
   575  	}
   576  	return true
   577  }
   578  
   579  // Does this character start an UpperLower sequence?
   580  func (c *caseState) isUpperLower() bool {
   581  	// for an Upper Lower sequence the deltas have to be in order
   582  	//	c: 0 1 0
   583  	switch {
   584  	case c.deltaToUpper != 0:
   585  		return false
   586  	case c.deltaToLower != 1:
   587  		return false
   588  	case c.deltaToTitle != 0:
   589  		return false
   590  	}
   591  	return true
   592  }
   593  
   594  // Does this character start a LowerUpper sequence?
   595  func (c *caseState) isLowerUpper() bool {
   596  	// for an Upper Lower sequence the deltas have to be in order
   597  	//	c: -1 0 -1
   598  	switch {
   599  	case c.deltaToUpper != -1:
   600  		return false
   601  	case c.deltaToLower != 0:
   602  		return false
   603  	case c.deltaToTitle != -1:
   604  		return false
   605  	}
   606  	return true
   607  }
   608  
   609  func getCaseState(i rune) (c *caseState) {
   610  	c = &caseState{point: i, _case: CaseNone}
   611  	ch := &chars[i]
   612  	switch ch.codePoint {
   613  	case 0:
   614  		c._case = CaseMissing // Will get NUL wrong but that doesn't matter
   615  		return
   616  	case ch.upperCase:
   617  		c._case = CaseUpper
   618  	case ch.lowerCase:
   619  		c._case = CaseLower
   620  	case ch.titleCase:
   621  		c._case = CaseTitle
   622  	}
   623  	// Some things such as roman numeral U+2161 don't describe themselves
   624  	// as upper case, but have a lower case. Second-guess them.
   625  	if c._case == CaseNone && ch.lowerCase != 0 {
   626  		c._case = CaseUpper
   627  	}
   628  	// Same in the other direction.
   629  	if c._case == CaseNone && ch.upperCase != 0 {
   630  		c._case = CaseLower
   631  	}
   632  
   633  	if ch.upperCase != 0 {
   634  		c.deltaToUpper = ch.upperCase - i
   635  	}
   636  	if ch.lowerCase != 0 {
   637  		c.deltaToLower = ch.lowerCase - i
   638  	}
   639  	if ch.titleCase != 0 {
   640  		c.deltaToTitle = ch.titleCase - i
   641  	}
   642  	return
   643  }
   644  
   645  func printCases() {
   646  	if *test {
   647  		fullCaseTest()
   648  		return
   649  	}
   650  	printf(
   651  		"// CaseRanges is the table describing case mappings for all letters with\n" +
   652  			"// non-self mappings.\n" +
   653  			"var CaseRanges = _CaseRanges\n" +
   654  			"var _CaseRanges = []CaseRange {\n")
   655  
   656  	var startState *caseState    // the start of a run; nil for not active
   657  	var prevState = &caseState{} // the state of the previous character
   658  	for i := range chars {
   659  		state := getCaseState(rune(i))
   660  		if state.adjacent(prevState) {
   661  			prevState = state
   662  			continue
   663  		}
   664  		// end of run (possibly)
   665  		printCaseRange(startState, prevState)
   666  		startState = nil
   667  		if state._case != CaseMissing && state._case != CaseNone {
   668  			startState = state
   669  		}
   670  		prevState = state
   671  	}
   672  	print("}\n")
   673  }
   674  
   675  func printCaseRange(lo, hi *caseState) {
   676  	if lo == nil {
   677  		return
   678  	}
   679  	if lo.deltaToUpper == 0 && lo.deltaToLower == 0 && lo.deltaToTitle == 0 {
   680  		// character represents itself in all cases - no need to mention it
   681  		return
   682  	}
   683  	switch {
   684  	case hi.point > lo.point && lo.isUpperLower():
   685  		printf("\t{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n",
   686  			lo.point, hi.point)
   687  	case hi.point > lo.point && lo.isLowerUpper():
   688  		logger.Fatalf("LowerUpper sequence: should not happen: %U.  If it's real, need to fix To()", lo.point)
   689  		printf("\t{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n",
   690  			lo.point, hi.point)
   691  	default:
   692  		printf("\t{0x%04X, 0x%04X, d{%d, %d, %d}},\n",
   693  			lo.point, hi.point,
   694  			lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle)
   695  	}
   696  }
   697  
   698  // If the cased value in the Char is 0, it means use the rune itself.
   699  func caseIt(r, cased rune) rune {
   700  	if cased == 0 {
   701  		return r
   702  	}
   703  	return cased
   704  }
   705  
   706  func fullCaseTest() {
   707  	for j, c := range chars {
   708  		i := rune(j)
   709  		lower := unicode.ToLower(i)
   710  		want := caseIt(i, c.lowerCase)
   711  		if lower != want {
   712  			fmt.Fprintf(os.Stderr, "lower %U should be %U is %U\n", i, want, lower)
   713  		}
   714  		upper := unicode.ToUpper(i)
   715  		want = caseIt(i, c.upperCase)
   716  		if upper != want {
   717  			fmt.Fprintf(os.Stderr, "upper %U should be %U is %U\n", i, want, upper)
   718  		}
   719  		title := unicode.ToTitle(i)
   720  		want = caseIt(i, c.titleCase)
   721  		if title != want {
   722  			fmt.Fprintf(os.Stderr, "title %U should be %U is %U\n", i, want, title)
   723  		}
   724  	}
   725  }
   726  
   727  func printLatinProperties() {
   728  	if *test {
   729  		return
   730  	}
   731  	println("var properties = [MaxLatin1+1]uint8{")
   732  	for code := 0; code <= unicode.MaxLatin1; code++ {
   733  		var property string
   734  		switch chars[code].category {
   735  		case "Cc", "": // NUL has no category.
   736  			property = "pC"
   737  		case "Cf": // soft hyphen, unique category, not printable.
   738  			property = "0"
   739  		case "Ll":
   740  			property = "pLl | pp"
   741  		case "Lo":
   742  			property = "pLo | pp"
   743  		case "Lu":
   744  			property = "pLu | pp"
   745  		case "Nd", "No":
   746  			property = "pN | pp"
   747  		case "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps":
   748  			property = "pP | pp"
   749  		case "Sc", "Sk", "Sm", "So":
   750  			property = "pS | pp"
   751  		case "Zs":
   752  			property = "pZ"
   753  		default:
   754  			logger.Fatalf("%U has unknown category %q", code, chars[code].category)
   755  		}
   756  		// Special case
   757  		if code == ' ' {
   758  			property = "pZ | pp"
   759  		}
   760  		printf("\t0x%02X: %s, // %q\n", code, property, code)
   761  	}
   762  	printf("}\n\n")
   763  }
   764  
   765  func printCasefold() {
   766  	// Build list of case-folding groups attached to each canonical folded char (typically lower case).
   767  	var caseOrbit = make([][]rune, MaxChar+1)
   768  	for j := range chars {
   769  		i := rune(j)
   770  		c := &chars[i]
   771  		if c.foldCase == 0 {
   772  			continue
   773  		}
   774  		orb := caseOrbit[c.foldCase]
   775  		if orb == nil {
   776  			orb = append(orb, c.foldCase)
   777  		}
   778  		caseOrbit[c.foldCase] = append(orb, i)
   779  	}
   780  
   781  	// Insert explicit 1-element groups when assuming [lower, upper] would be wrong.
   782  	for j := range chars {
   783  		i := rune(j)
   784  		c := &chars[i]
   785  		f := c.foldCase
   786  		if f == 0 {
   787  			f = i
   788  		}
   789  		orb := caseOrbit[f]
   790  		if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) {
   791  			// Default assumption of [upper, lower] is wrong.
   792  			caseOrbit[i] = []rune{i}
   793  		}
   794  	}
   795  
   796  	// Delete the groups for which assuming [lower, upper] or [upper, lower] is right.
   797  	for i, orb := range caseOrbit {
   798  		if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] {
   799  			caseOrbit[i] = nil
   800  		}
   801  		if len(orb) == 2 && chars[orb[1]].upperCase == orb[0] && chars[orb[0]].lowerCase == orb[1] {
   802  			caseOrbit[i] = nil
   803  		}
   804  	}
   805  
   806  	// Record orbit information in chars.
   807  	for _, orb := range caseOrbit {
   808  		if orb == nil {
   809  			continue
   810  		}
   811  		sort.Slice(orb, func(i, j int) bool {
   812  			return orb[i] < orb[j]
   813  		})
   814  		c := orb[len(orb)-1]
   815  		for _, d := range orb {
   816  			chars[c].caseOrbit = d
   817  			c = d
   818  		}
   819  	}
   820  
   821  	printAsciiFold()
   822  	printCaseOrbit()
   823  
   824  	// Tables of category and script folding exceptions: code points
   825  	// that must be added when interpreting a particular category/script
   826  	// in a case-folding context.
   827  	cat := make(map[string]map[rune]bool)
   828  	for name := range category {
   829  		if x := foldExceptions(inCategory(name)); len(x) > 0 {
   830  			cat[name] = x
   831  		}
   832  	}
   833  
   834  	scr := make(map[string]map[rune]bool)
   835  	for name := range scripts {
   836  		if x := foldExceptions(scripts[name]); len(x) > 0 {
   837  			scr[name] = x
   838  		}
   839  	}
   840  
   841  	printCatFold("FoldCategory", cat)
   842  	printCatFold("FoldScript", scr)
   843  }
   844  
   845  // inCategory returns a list of all the runes in the category.
   846  func inCategory(name string) []rune {
   847  	var x []rune
   848  	for j := range chars {
   849  		i := rune(j)
   850  		c := &chars[i]
   851  		if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] {
   852  			x = append(x, i)
   853  		}
   854  	}
   855  	return x
   856  }
   857  
   858  // foldExceptions returns a list of all the runes fold-equivalent
   859  // to runes in class but not in class themselves.
   860  func foldExceptions(class []rune) map[rune]bool {
   861  	// Create map containing class and all fold-equivalent chars.
   862  	m := make(map[rune]bool)
   863  	for _, r := range class {
   864  		c := &chars[r]
   865  		if c.caseOrbit == 0 {
   866  			// Just upper and lower.
   867  			if u := c.upperCase; u != 0 {
   868  				m[u] = true
   869  			}
   870  			if l := c.lowerCase; l != 0 {
   871  				m[l] = true
   872  			}
   873  			m[r] = true
   874  			continue
   875  		}
   876  		// Otherwise walk orbit.
   877  		r0 := r
   878  		for {
   879  			m[r] = true
   880  			r = chars[r].caseOrbit
   881  			if r == r0 {
   882  				break
   883  			}
   884  		}
   885  	}
   886  
   887  	// Remove class itself.
   888  	for _, r := range class {
   889  		delete(m, r)
   890  	}
   891  
   892  	// What's left is the exceptions.
   893  	return m
   894  }
   895  
// comment holds the doc comment that printCatFold prints in front of the
// generated map of the same name.
var comment = map[string]string{
	"FoldCategory": "// FoldCategory maps a category name to a table of\n" +
		"// code points outside the category that are equivalent under\n" +
		"// simple case folding to code points inside the category.\n" +
		"// If there is no entry for a category name, there are no such points.\n",

	"FoldScript": "// FoldScript maps a script name to a table of\n" +
		"// code points outside the script that are equivalent under\n" +
		"// simple case folding to code points inside the script.\n" +
		"// If there is no entry for a script name, there are no such points.\n",
}
   907  
   908  func printAsciiFold() {
   909  	printf("var asciiFold = [MaxASCII + 1]uint16{\n")
   910  	for i := rune(0); i <= unicode.MaxASCII; i++ {
   911  		c := chars[i]
   912  		f := c.caseOrbit
   913  		if f == 0 {
   914  			if c.lowerCase != i && c.lowerCase != 0 {
   915  				f = c.lowerCase
   916  			} else if c.upperCase != i && c.upperCase != 0 {
   917  				f = c.upperCase
   918  			} else {
   919  				f = i
   920  			}
   921  		}
   922  		printf("\t0x%04X,\n", f)
   923  	}
   924  	printf("}\n\n")
   925  }
   926  
   927  func printCaseOrbit() {
   928  	if *test {
   929  		for j := range chars {
   930  			i := rune(j)
   931  			c := &chars[i]
   932  			f := c.caseOrbit
   933  			if f == 0 {
   934  				if c.lowerCase != i && c.lowerCase != 0 {
   935  					f = c.lowerCase
   936  				} else if c.upperCase != i && c.upperCase != 0 {
   937  					f = c.upperCase
   938  				} else {
   939  					f = i
   940  				}
   941  			}
   942  			if g := unicode.SimpleFold(i); g != f {
   943  				fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f)
   944  			}
   945  		}
   946  		return
   947  	}
   948  
   949  	printf("var caseOrbit = []foldPair{\n")
   950  	for i := range chars {
   951  		c := &chars[i]
   952  		if c.caseOrbit != 0 {
   953  			printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit)
   954  			foldPairCount++
   955  		}
   956  	}
   957  	printf("}\n\n")
   958  }
   959  
   960  func printCatFold(name string, m map[string]map[rune]bool) {
   961  	if *test {
   962  		var pkgMap map[string]*unicode.RangeTable
   963  		if name == "FoldCategory" {
   964  			pkgMap = unicode.FoldCategory
   965  		} else {
   966  			pkgMap = unicode.FoldScript
   967  		}
   968  		if len(pkgMap) != len(m) {
   969  			fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m))
   970  			return
   971  		}
   972  		for k, v := range m {
   973  			t, ok := pkgMap[k]
   974  			if !ok {
   975  				fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k)
   976  				continue
   977  			}
   978  			n := 0
   979  			for _, r := range t.R16 {
   980  				for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) {
   981  					if !v[c] {
   982  						fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
   983  					}
   984  					n++
   985  				}
   986  			}
   987  			for _, r := range t.R32 {
   988  				for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) {
   989  					if !v[c] {
   990  						fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
   991  					}
   992  					n++
   993  				}
   994  			}
   995  			if n != len(v) {
   996  				fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v))
   997  			}
   998  		}
   999  		return
  1000  	}
  1001  
  1002  	print(comment[name])
  1003  	printf("var %s = map[string]*RangeTable{\n", name)
  1004  	for _, name := range allCatFold(m) {
  1005  		printf("\t%q: fold%s,\n", name, name)
  1006  	}
  1007  	printf("}\n\n")
  1008  	for _, name := range allCatFold(m) {
  1009  		class := m[name]
  1010  		dumpRange("fold"+name, func(code rune) bool { return class[code] })
  1011  	}
  1012  }
  1013  
  1014  var range16Count = 0  // Number of entries in the 16-bit range tables.
  1015  var range32Count = 0  // Number of entries in the 32-bit range tables.
  1016  var foldPairCount = 0 // Number of fold pairs in the exception tables.
  1017  
  1018  func printSizes() {
  1019  	if *test {
  1020  		return
  1021  	}
  1022  	println()
  1023  	printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, range16Count+range32Count)
  1024  	range16Bytes := range16Count * 3 * 2
  1025  	range32Bytes := range32Count * 3 * 4
  1026  	printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes)
  1027  	println()
  1028  	printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2)
  1029  }
  1030  

View as plain text