gen.go

Documentation: golang.org/x/text/cases

     1  // Copyright 2014 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build ignore
     6  
     7  // This program generates the trie for casing operations. The Unicode casing
     8  // algorithm requires the lookup of various properties and mappings for each
     9  // rune. The table generated by this generator combines several of the most
    10  // frequently used of these into a single trie so that they can be accessed
    11  // with a single lookup.
    12  package main
    13  
    14  import (
    15  	"bytes"
    16  	"fmt"
    17  	"io"
    18  	"log"
    19  	"reflect"
    20  	"strconv"
    21  	"strings"
    22  	"unicode"
    23  
    24  	"golang.org/x/text/internal/gen"
    25  	"golang.org/x/text/internal/triegen"
    26  	"golang.org/x/text/internal/ucd"
    27  	"golang.org/x/text/unicode/norm"
    28  )
    29  
// main drives the generator. After initializing the gen package it emits the
// casing tables (genTables) and the accompanying test data (genTablesTest),
// and finally asks gen.Repackage to turn gen_trieval.go into trieval.go for
// package cases.
func main() {
	gen.Init()
	genTables()
	genTablesTest()
	gen.Repackage("gen_trieval.go", "trieval.go", "cases")
}
    36  
// runeInfo contains all information for a rune that we care about for casing
// operations. parseUCD fills in one runeInfo per code point from the UCD
// files; makeEntry then derives the trie value (entry) from it.
type runeInfo struct {
	// Rune is the code point this record describes. It is set by the get
	// helper in parseUCD, so it stays zero for code points no parsed file
	// mentions through that helper.
	Rune rune

	entry info // trie value for this rune.

	// CaseMode is the case type of the rune: cTitle for general category Lt,
	// cLower/cUpper for the Lowercase/Uppercase derived properties, and the
	// zero value otherwise.
	CaseMode info

	// Simple case mappings.
	Simple [1 + maxCaseMode][]rune

	// Special casing
	// HasSpecial is set when SpecialCasing.txt has an unconditional entry for
	// the rune; Special then holds its mappings. Conditional is set when the
	// file has a conditional entry, whose mappings are not recorded here.
	HasSpecial  bool
	Conditional bool
	Special     [1 + maxCaseMode][]rune

	// Folding
	// FoldSimple, FoldSpecial and FoldFull come from CaseFolding.txt entries
	// of status C/S, T and C/F respectively.
	FoldSimple  rune
	FoldSpecial rune
	FoldFull    []rune

	// TODO: FC_NFKC, or equivalent data.

	// Properties
	// NOTE(review): DecomposeGreek is not assigned by any code visible in this
	// file.
	SoftDotted     bool
	CaseIgnorable  bool
	Cased          bool
	DecomposeGreek bool
	// BreakType is the raw word break property value; BreakCat is that value
	// collapsed onto the categories needed by the casing code.
	BreakType      string
	BreakCat       breakCategory

	// We care mostly about 0, Above, and IotaSubscript.
	CCC byte
}
    72  
// breakCategory is the word break property of a rune collapsed onto the few
// categories the casing code distinguishes.
type breakCategory int

const (
	// breakBreak is the zero value and therefore the category of every rune
	// for which parseUCD does not assign one of the other two.
	breakBreak breakCategory = iota
	// breakLetter is assigned to ALetter, Hebrew_Letter, Numeric, Extend,
	// ExtendNumLet, Format and ZWJ.
	breakLetter
	// breakMid is assigned to MidLetter, MidNumLet and Single_Quote.
	breakMid
)
    80  
    81  // mapping returns the case mapping for the given case type.
    82  func (r *runeInfo) mapping(c info) string {
    83  	if r.HasSpecial {
    84  		return string(r.Special[c])
    85  	}
    86  	if len(r.Simple[c]) != 0 {
    87  		return string(r.Simple[c])
    88  	}
    89  	return string(r.Rune)
    90  }
    91  
    92  func parse(file string, f func(p *ucd.Parser)) {
    93  	ucd.Parse(gen.OpenUCDFile(file), f)
    94  }
    95  
    96  func parseUCD() []runeInfo {
    97  	chars := make([]runeInfo, unicode.MaxRune)
    98  
    99  	get := func(r rune) *runeInfo {
   100  		c := &chars[r]
   101  		c.Rune = r
   102  		return c
   103  	}
   104  
   105  	parse("UnicodeData.txt", func(p *ucd.Parser) {
   106  		ri := get(p.Rune(0))
   107  		ri.CCC = byte(p.Int(ucd.CanonicalCombiningClass))
   108  		ri.Simple[cLower] = p.Runes(ucd.SimpleLowercaseMapping)
   109  		ri.Simple[cUpper] = p.Runes(ucd.SimpleUppercaseMapping)
   110  		ri.Simple[cTitle] = p.Runes(ucd.SimpleTitlecaseMapping)
   111  		if p.String(ucd.GeneralCategory) == "Lt" {
   112  			ri.CaseMode = cTitle
   113  		}
   114  	})
   115  
   116  	// <code>; <property>
   117  	parse("PropList.txt", func(p *ucd.Parser) {
   118  		if p.String(1) == "Soft_Dotted" {
   119  			chars[p.Rune(0)].SoftDotted = true
   120  		}
   121  	})
   122  
   123  	// <code>; <word break type>
   124  	parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
   125  		ri := get(p.Rune(0))
   126  		switch p.String(1) {
   127  		case "Case_Ignorable":
   128  			ri.CaseIgnorable = true
   129  		case "Cased":
   130  			ri.Cased = true
   131  		case "Lowercase":
   132  			ri.CaseMode = cLower
   133  		case "Uppercase":
   134  			ri.CaseMode = cUpper
   135  		}
   136  	})
   137  
   138  	// <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)?
   139  	parse("SpecialCasing.txt", func(p *ucd.Parser) {
   140  		// We drop all conditional special casing and deal with them manually in
   141  		// the language-specific case mappers. Rune 0x03A3 is the only one with
   142  		// a conditional formatting that is not language-specific. However,
   143  		// dealing with this letter is tricky, especially in a streaming
   144  		// context, so we deal with it in the Caser for Greek specifically.
   145  		ri := get(p.Rune(0))
   146  		if p.String(4) == "" {
   147  			ri.HasSpecial = true
   148  			ri.Special[cLower] = p.Runes(1)
   149  			ri.Special[cTitle] = p.Runes(2)
   150  			ri.Special[cUpper] = p.Runes(3)
   151  		} else {
   152  			ri.Conditional = true
   153  		}
   154  	})
   155  
   156  	// TODO: Use text breaking according to UAX #29.
   157  	// <code>; <word break type>
   158  	parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
   159  		ri := get(p.Rune(0))
   160  		ri.BreakType = p.String(1)
   161  
   162  		// We collapse the word breaking properties onto the categories we need.
   163  		switch p.String(1) { // TODO: officially we need to canonicalize.
   164  		case "MidLetter", "MidNumLet", "Single_Quote":
   165  			ri.BreakCat = breakMid
   166  			if !ri.CaseIgnorable {
   167  				// finalSigma relies on the fact that all breakMid runes are
   168  				// also a Case_Ignorable. Revisit this code when this changes.
   169  				log.Fatalf("Rune %U, which has a break category mid, is not a case ignorable", ri)
   170  			}
   171  		case "ALetter", "Hebrew_Letter", "Numeric", "Extend", "ExtendNumLet", "Format", "ZWJ":
   172  			ri.BreakCat = breakLetter
   173  		}
   174  	})
   175  
   176  	// <code>; <type>; <mapping>
   177  	parse("CaseFolding.txt", func(p *ucd.Parser) {
   178  		ri := get(p.Rune(0))
   179  		switch p.String(1) {
   180  		case "C":
   181  			ri.FoldSimple = p.Rune(2)
   182  			ri.FoldFull = p.Runes(2)
   183  		case "S":
   184  			ri.FoldSimple = p.Rune(2)
   185  		case "T":
   186  			ri.FoldSpecial = p.Rune(2)
   187  		case "F":
   188  			ri.FoldFull = p.Runes(2)
   189  		default:
   190  			log.Fatalf("%U: unknown type: %s", p.Rune(0), p.String(1))
   191  		}
   192  	})
   193  
   194  	return chars
   195  }
   196  
// genTables builds the casing trie from the parsed UCD data and writes it,
// together with the xorData and exceptions strings, to tables.go in package
// cases.
//
// The order of the steps is significant: makeEntry appends to the
// package-level xorData and exceptionData as a side effect, so both must be
// written only after every rune has been processed.
//
// NOTE(review): verifyProperties runs before makeEntry has filled in the entry
// field, so any check in it that reads entry sees the zero value.
func genTables() {
	chars := parseUCD()
	verifyProperties(chars)

	t := triegen.NewTrie("case")
	for i := range chars {
		c := &chars[i]
		makeEntry(c)
		t.Insert(rune(i), uint64(c.entry))
	}

	w := gen.NewCodeWriter()
	// The file is written when genTables returns; log.Fatal below exits
	// without running deferred calls.
	defer w.WriteVersionedGoFile("tables.go", "cases")

	gen.WriteUnicodeVersion(w)

	// TODO: write CLDR version after adding a mechanism to detect that the
	// tables on which the manually created locale-sensitive casing code is
	// based hasn't changed.

	w.WriteVar("xorData", string(xorData))
	w.WriteVar("exceptions", string(exceptionData))

	sz, err := t.Gen(w, triegen.Compact(&sparseCompacter{}))
	if err != nil {
		log.Fatal(err)
	}
	w.Size += sz
}
   226  
// makeEntry computes the trie value for ri and stores it in ri.entry.
//
// The value combines the case type, a combining-class/break classification
// and, for cased runes, a description of the case mapping. A mapping that maps
// to a string of equal UTF-8 length is encoded as a byte-wise XOR pattern,
// either inline or as an index into xorData; everything else is delegated to
// makeException. As a side effect this may append to xorData, xorCache and
// (through makeException) exceptionData.
func makeEntry(ri *runeInfo) {
	// Case type. Case-ignorable runes get a dedicated value that also records
	// whether they are cased.
	if ri.CaseIgnorable {
		if ri.Cased {
			ri.entry = cIgnorableCased
		} else {
			ri.entry = cIgnorableUncased
		}
	} else {
		ri.entry = ri.CaseMode
	}

	// TODO: handle soft-dotted.

	ccc := cccOther
	switch ri.CCC {
	case 0: // Not_Reordered
		ccc = cccZero
	case above: // Above
		ccc = cccAbove
	}
	// A rune in the break category overrides the combining-class value; a
	// mid-word rune is flagged with a separate bit.
	switch ri.BreakCat {
	case breakBreak:
		ccc = cccBreak
	case breakMid:
		ri.entry |= isMidBit
	}

	ri.entry |= ccc

	// Uncased runes carry no mapping information.
	if ri.CaseMode == cUncased {
		return
	}

	// Need to do something special.
	if ri.CaseMode == cTitle || ri.HasSpecial || ri.mapping(cTitle) != ri.mapping(cUpper) {
		makeException(ri)
		return
	}
	// A full case folding that is neither the upper nor the lower mapping
	// cannot be derived from the XOR data and needs an exception as well.
	if f := string(ri.FoldFull); len(f) > 0 && f != ri.mapping(cUpper) && f != ri.mapping(cLower) {
		makeException(ri)
		return
	}

	// Rune is either lowercase or uppercase.

	// mapped is the mapping to the opposite case.
	orig := string(ri.Rune)
	mapped := ""
	if ri.CaseMode == cUpper {
		mapped = ri.mapping(cLower)
	} else {
		mapped = ri.mapping(cUpper)
	}

	// A byte-wise XOR only works if both encodings have the same length.
	if len(orig) != len(mapped) {
		makeException(ri)
		return
	}

	if string(ri.FoldFull) == ri.mapping(cUpper) {
		ri.entry |= inverseFoldBit
	}

	n := len(orig)

	// Create per-byte XOR mask.
	var b []byte
	for i := 0; i < n; i++ {
		b = append(b, orig[i]^mapped[i])
	}

	// Remove leading 0 bytes, but keep at least one byte.
	for ; len(b) > 1 && b[0] == 0; b = b[1:] {
	}

	// A single byte with its two top bits clear is stored inline in the entry.
	if len(b) == 1 && b[0]&0xc0 == 0 {
		ri.entry |= info(b[0]) << xorShift
		return
	}

	// Otherwise store the pattern in xorData, reusing an earlier identical
	// pattern if there is one, and record the index of its last byte.
	key := string(b)
	x, ok := xorCache[key]
	if !ok {
		xorData = append(xorData, 0) // for detecting start of sequence
		xorData = append(xorData, b...)

		x = len(xorData) - 1
		xorCache[key] = x
	}
	ri.entry |= info(x<<xorShift) | xorIndexBit
}
   317  
// xorCache maps an XOR byte pattern to the index makeEntry assigned to it in
// xorData, so that identical patterns are stored only once.
var xorCache = map[string]int{}

// xorData contains byte-wise XOR data for the least significant bytes of a
// UTF-8 encoded rune. An index points to the last byte. The sequence starts
// with a zero terminator.
var xorData = []byte{}

// See the comments in gen_trieval.go re "the exceptions slice".
// The slice starts out with a single zero byte, so the first exception
// written by makeException is at offset 1.
var exceptionData = []byte{0}
   327  
// makeException encodes case mappings that cannot be expressed in a simple
// XOR diff.
//
// It rewrites ri.entry to carry the exception bit, the case type and the
// offset of a new record appended to exceptionData. The record consists of:
//
//	byte 0: the ccc bits of the original entry or-ed with the byte length of
//	        the full case folding
//	byte 1: the byte lengths of the stored mappings, 3 bits each, pushed in
//	        the order lower, upper, title while skipping the rune's own case
//	then the full case folding, followed by the stored mappings.
//
// The program is terminated if the record cannot be represented.
func makeException(ri *runeInfo) {
	ccc := ri.entry & cccMask
	// Set exception bit and retain case type.
	ri.entry &= 0x0007
	ri.entry |= exceptionBit

	if len(exceptionData) >= 1<<numExceptionBits {
		log.Fatalf("%U:exceptionData too large %#x > %d bits", ri.Rune, len(exceptionData), numExceptionBits)
	}

	// Set the offset in the exceptionData array.
	ri.entry |= info(len(exceptionData) << exceptionShift)

	orig := string(ri.Rune)
	tc := ri.mapping(cTitle)
	uc := ri.mapping(cUpper)
	lc := ri.mapping(cLower)
	ff := string(ri.FoldFull)

	// addString sets the length of a string and adds it to the expansions array.
	// A mapping equal to the rune itself is not stored and keeps length 0,
	// except for lowercase runes, whose mappings are always stored.
	addString := func(s string, b *byte) {
		if len(s) == 0 {
			// Zero-length mappings exist, but only for conditional casing,
			// which we are representing outside of this table.
			log.Fatalf("%U: has zero-length mapping.", ri.Rune)
		}
		// Make room for the next 3-bit length field.
		*b <<= 3
		if s != orig || ri.CaseMode == cLower {
			n := len(s)
			if n > 7 {
				log.Fatalf("%U: mapping larger than 7 (%d)", ri.Rune, n)
			}
			*b |= byte(n)
			exceptionData = append(exceptionData, s...)
		}
	}

	// byte 0:
	exceptionData = append(exceptionData, byte(ccc)|byte(len(ff)))

	// byte 1:
	// p is remembered so the length byte can be updated once the mappings
	// are known.
	p := len(exceptionData)
	exceptionData = append(exceptionData, 0)

	if len(ff) > 7 { // May be zero-length.
		log.Fatalf("%U: fold string larger than 7 (%d)", ri.Rune, len(ff))
	}
	exceptionData = append(exceptionData, ff...)
	// Only the mappings to the other case types are recorded.
	ct := ri.CaseMode
	if ct != cLower {
		addString(lc, &exceptionData[p])
	}
	if ct != cUpper {
		addString(uc, &exceptionData[p])
	}
	if ct != cTitle {
		addString(tc, &exceptionData[p])
	}
}
   389  
// sparseCompacter is a trie value block Compacter. There are many cases where
// successive runes alternate between lower- and upper-case. This Compacter
// exploits this by adding a special case type where the case value is obtained
// from or-ing it with the least-significant bit of the rune, creating large
// ranges of equal case values that compress well.
type sparseCompacter struct {
	// sparseBlocks holds, for every stored block, the values as rewritten by
	// makeSparse.
	sparseBlocks  [][]uint16
	// sparseOffsets holds, for every stored block, the value of sparseCount
	// at the time the block was stored. Print appends one final entry with
	// the total count.
	sparseOffsets []uint16
	// sparseCount is the sum of the range counts of all stored blocks.
	sparseCount   int
}
   400  
// makeSparse returns the modified values of a block as well as the number of
// ranges the compact representation of that block would contain.
//
// The input values are truncated to 16 bits. A value with case type cLower or
// cUpper may be replaced by its XOR-case variant when that makes it equal to
// a neighbouring value, so that alternating lower/upper sequences collapse
// into a single range. A range is a run of equal non-zero values; zero values
// separate ranges and are not counted.
func makeSparse(vals []uint64) ([]uint16, int) {
	// Copy the values.
	values := make([]uint16, len(vals))
	for i, v := range vals {
		values[i] = uint16(v)
	}

	// alt returns the XOR-case variant of v for position i, or v itself if v
	// is neither cLower nor cUpper.
	alt := func(i int, v uint16) uint16 {
		if cm := info(v & fullCasedMask); cm == cUpper || cm == cLower {
			// Convert cLower or cUpper to cXORCase value, which has the form 11x.
			xor := v
			xor &^= 1
			xor |= uint16(i&1) ^ (v & 1)
			xor |= 0x4
			return xor
		}
		return v
	}

	var count int
	var previous uint16
	for i, v := range values {
		if v != 0 {
			// Try if the unmodified value is equal to the previous.
			if v == previous {
				continue
			}

			// Try if the xor-ed value is equal to the previous value.
			a := alt(i, v)
			if a == previous {
				values[i] = a
				continue
			}

			// This is a new value.
			count++

			// Use the xor-ed value if it will be identical to the next value.
			if p := i + 1; p < len(values) && alt(p, values[p]) == a {
				values[i] = a
				v = a
			}
		}
		// Also reached for v == 0, which ends the current range.
		previous = v
	}
	return values, count
}
   451  
   452  func (s *sparseCompacter) Size(v []uint64) (int, bool) {
   453  	_, n := makeSparse(v)
   454  
   455  	// We limit using this method to having 16 entries.
   456  	if n > 16 {
   457  		return 0, false
   458  	}
   459  
   460  	return 2 + int(reflect.TypeOf(valueRange{}).Size())*n, true
   461  }
   462  
   463  func (s *sparseCompacter) Store(v []uint64) uint32 {
   464  	h := uint32(len(s.sparseOffsets))
   465  	values, sz := makeSparse(v)
   466  	s.sparseBlocks = append(s.sparseBlocks, values)
   467  	s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
   468  	s.sparseCount += sz
   469  	return h
   470  }
   471  
   472  func (s *sparseCompacter) Handler() string {
   473  	// The sparse global variable and its lookup method is defined in gen_trieval.go.
   474  	return "sparse.lookup"
   475  }
   476  
   477  func (s *sparseCompacter) Print(w io.Writer) (retErr error) {
   478  	p := func(format string, args ...interface{}) {
   479  		_, err := fmt.Fprintf(w, format, args...)
   480  		if retErr == nil && err != nil {
   481  			retErr = err
   482  		}
   483  	}
   484  
   485  	ls := len(s.sparseBlocks)
   486  	if ls == len(s.sparseOffsets) {
   487  		s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
   488  	}
   489  	p("// sparseOffsets: %d entries, %d bytes\n", ls+1, (ls+1)*2)
   490  	p("var sparseOffsets = %#v\n\n", s.sparseOffsets)
   491  
   492  	ns := s.sparseCount
   493  	p("// sparseValues: %d entries, %d bytes\n", ns, ns*4)
   494  	p("var sparseValues = [%d]valueRange {", ns)
   495  	for i, values := range s.sparseBlocks {
   496  		p("\n// Block %#x, offset %#x", i, s.sparseOffsets[i])
   497  		var v uint16
   498  		for i, nv := range values {
   499  			if nv != v {
   500  				if v != 0 {
   501  					p(",hi:%#02x},", 0x80+i-1)
   502  				}
   503  				if nv != 0 {
   504  					p("\n{value:%#04x,lo:%#02x", nv, 0x80+i)
   505  				}
   506  			}
   507  			v = nv
   508  		}
   509  		if v != 0 {
   510  			p(",hi:%#02x},", 0x80+len(values)-1)
   511  		}
   512  	}
   513  	p("\n}\n\n")
   514  	return
   515  }
   516  
   517  // verifyProperties that properties of the runes that are relied upon in the
   518  // implementation. Each property is marked with an identifier that is referred
   519  // to in the places where it is used.
   520  func verifyProperties(chars []runeInfo) {
   521  	for i, c := range chars {
   522  		r := rune(i)
   523  
   524  		// Rune properties.
   525  
   526  		// A.1: modifier never changes on lowercase. [ltLower]
   527  		if c.CCC > 0 && unicode.ToLower(r) != r {
   528  			log.Fatalf("%U: non-starter changes when lowercased", r)
   529  		}
   530  
   531  		// A.2: properties of decompositions starting with I or J. [ltLower]
   532  		d := norm.NFD.PropertiesString(string(r)).Decomposition()
   533  		if len(d) > 0 {
   534  			if d[0] == 'I' || d[0] == 'J' {
   535  				// A.2.1: we expect at least an ASCII character and a modifier.
   536  				if len(d) < 3 {
   537  					log.Fatalf("%U: length of decomposition was %d; want >= 3", r, len(d))
   538  				}
   539  
   540  				// All subsequent runes are modifiers and all have the same CCC.
   541  				runes := []rune(string(d[1:]))
   542  				ccc := chars[runes[0]].CCC
   543  
   544  				for _, mr := range runes[1:] {
   545  					mc := chars[mr]
   546  
   547  					// A.2.2: all modifiers have a CCC of Above or less.
   548  					if ccc == 0 || ccc > above {
   549  						log.Fatalf("%U: CCC of successive rune (%U) was %d; want (0,230]", r, mr, ccc)
   550  					}
   551  
   552  					// A.2.3: a sequence of modifiers all have the same CCC.
   553  					if mc.CCC != ccc {
   554  						log.Fatalf("%U: CCC of follow-up modifier (%U) was %d; want %d", r, mr, mc.CCC, ccc)
   555  					}
   556  
   557  					// A.2.4: for each trailing r, r in [0x300, 0x311] <=> CCC == Above.
   558  					if (ccc == above) != (0x300 <= mr && mr <= 0x311) {
   559  						log.Fatalf("%U: modifier %U in [U+0300, U+0311] != ccc(%U) == 230", r, mr, mr)
   560  					}
   561  
   562  					if i += len(string(mr)); i >= len(d) {
   563  						break
   564  					}
   565  				}
   566  			}
   567  		}
   568  
   569  		// A.3: no U+0307 in decomposition of Soft-Dotted rune. [ltUpper]
   570  		if unicode.Is(unicode.Soft_Dotted, r) && strings.Contains(string(d), "\u0307") {
   571  			log.Fatalf("%U: decomposition of soft-dotted rune may not contain U+0307", r)
   572  		}
   573  
   574  		// A.4: only rune U+0345 may be of CCC Iota_Subscript. [elUpper]
   575  		if c.CCC == iotaSubscript && r != 0x0345 {
   576  			log.Fatalf("%U: only rune U+0345 may have CCC Iota_Subscript", r)
   577  		}
   578  
   579  		// A.5: soft-dotted runes do not have exceptions.
   580  		if c.SoftDotted && c.entry&exceptionBit != 0 {
   581  			log.Fatalf("%U: soft-dotted has exception", r)
   582  		}
   583  
   584  		// A.6: Greek decomposition. [elUpper]
   585  		if unicode.Is(unicode.Greek, r) {
   586  			if b := norm.NFD.PropertiesString(string(r)).Decomposition(); b != nil {
   587  				runes := []rune(string(b))
   588  				// A.6.1: If a Greek rune decomposes and the first rune of the
   589  				// decomposition is greater than U+00FF, the rune is always
   590  				// great and not a modifier.
   591  				if f := runes[0]; unicode.IsMark(f) || f > 0xFF && !unicode.Is(unicode.Greek, f) {
   592  					log.Fatalf("%U: expected first rune of Greek decomposition to be letter, found %U", r, f)
   593  				}
   594  				// A.6.2: Any follow-up rune in a Greek decomposition is a
   595  				// modifier of which the first should be gobbled in
   596  				// decomposition.
   597  				for _, m := range runes[1:] {
   598  					switch m {
   599  					case 0x0313, 0x0314, 0x0301, 0x0300, 0x0306, 0x0342, 0x0308, 0x0304, 0x345:
   600  					default:
   601  						log.Fatalf("%U: modifier %U is outside of expected Greek modifier set", r, m)
   602  					}
   603  				}
   604  			}
   605  		}
   606  
   607  		// Breaking properties.
   608  
   609  		// B.1: all runes with CCC > 0 are of break type Extend.
   610  		if c.CCC > 0 && c.BreakType != "Extend" {
   611  			log.Fatalf("%U: CCC == %d, but got break type %s; want Extend", r, c.CCC, c.BreakType)
   612  		}
   613  
   614  		// B.2: all cased runes with c.CCC == 0 are of break type ALetter.
   615  		if c.CCC == 0 && c.Cased && c.BreakType != "ALetter" {
   616  			log.Fatalf("%U: cased, but got break type %s; want ALetter", r, c.BreakType)
   617  		}
   618  
   619  		// B.3: letter category.
   620  		if c.CCC == 0 && c.BreakCat != breakBreak && !c.CaseIgnorable {
   621  			if c.BreakCat != breakLetter {
   622  				log.Fatalf("%U: check for letter break type gave %d; want %d", r, c.BreakCat, breakLetter)
   623  			}
   624  		}
   625  	}
   626  }
   627  
   628  func genTablesTest() {
   629  	w := &bytes.Buffer{}
   630  
   631  	fmt.Fprintln(w, "var (")
   632  	printProperties(w, "DerivedCoreProperties.txt", "Case_Ignorable", verifyIgnore)
   633  
   634  	// We discard the output as we know we have perfect functions. We run them
   635  	// just to verify the properties are correct.
   636  	n := printProperties(io.Discard, "DerivedCoreProperties.txt", "Cased", verifyCased)
   637  	n += printProperties(io.Discard, "DerivedCoreProperties.txt", "Lowercase", verifyLower)
   638  	n += printProperties(io.Discard, "DerivedCoreProperties.txt", "Uppercase", verifyUpper)
   639  	if n > 0 {
   640  		log.Fatalf("One of the discarded properties does not have a perfect filter.")
   641  	}
   642  
   643  	// <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)?
   644  	fmt.Fprintln(w, "\tspecial = map[rune]struct{ toLower, toTitle, toUpper string }{")
   645  	parse("SpecialCasing.txt", func(p *ucd.Parser) {
   646  		// Skip conditional entries.
   647  		if p.String(4) != "" {
   648  			return
   649  		}
   650  		r := p.Rune(0)
   651  		fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n",
   652  			r, string(p.Runes(1)), string(p.Runes(2)), string(p.Runes(3)))
   653  	})
   654  	fmt.Fprint(w, "\t}\n\n")
   655  
   656  	// <code>; <type>; <runes>
   657  	table := map[rune]struct{ simple, full, special string }{}
   658  	parse("CaseFolding.txt", func(p *ucd.Parser) {
   659  		r := p.Rune(0)
   660  		t := p.String(1)
   661  		v := string(p.Runes(2))
   662  		if t != "T" && v == string(unicode.ToLower(r)) {
   663  			return
   664  		}
   665  		x := table[r]
   666  		switch t {
   667  		case "C":
   668  			x.full = v
   669  			x.simple = v
   670  		case "S":
   671  			x.simple = v
   672  		case "F":
   673  			x.full = v
   674  		case "T":
   675  			x.special = v
   676  		}
   677  		table[r] = x
   678  	})
   679  	fmt.Fprintln(w, "\tfoldMap = map[rune]struct{ simple, full, special string }{")
   680  	for r := rune(0); r < 0x10FFFF; r++ {
   681  		x, ok := table[r]
   682  		if !ok {
   683  			continue
   684  		}
   685  		fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n", r, x.simple, x.full, x.special)
   686  	}
   687  	fmt.Fprint(w, "\t}\n\n")
   688  
   689  	// Break property
   690  	notBreak := map[rune]bool{}
   691  	parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
   692  		switch p.String(1) {
   693  		case "Extend", "Format", "MidLetter", "MidNumLet", "Single_Quote",
   694  			"ALetter", "Hebrew_Letter", "Numeric", "ExtendNumLet", "ZWJ":
   695  			notBreak[p.Rune(0)] = true
   696  		}
   697  	})
   698  
   699  	fmt.Fprintln(w, "\tbreakProp = []struct{ lo, hi rune }{")
   700  	inBreak := false
   701  	for r := rune(0); r <= lastRuneForTesting; r++ {
   702  		if isBreak := !notBreak[r]; isBreak != inBreak {
   703  			if isBreak {
   704  				fmt.Fprintf(w, "\t\t{0x%x, ", r)
   705  			} else {
   706  				fmt.Fprintf(w, "0x%x},\n", r-1)
   707  			}
   708  			inBreak = isBreak
   709  		}
   710  	}
   711  	if inBreak {
   712  		fmt.Fprintf(w, "0x%x},\n", lastRuneForTesting)
   713  	}
   714  	fmt.Fprint(w, "\t}\n\n")
   715  
   716  	// Word break test
   717  	// Filter out all samples that do not contain cased characters.
   718  	cased := map[rune]bool{}
   719  	parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
   720  		if p.String(1) == "Cased" {
   721  			cased[p.Rune(0)] = true
   722  		}
   723  	})
   724  
   725  	fmt.Fprintln(w, "\tbreakTest = []string{")
   726  	parse("auxiliary/WordBreakTest.txt", func(p *ucd.Parser) {
   727  		c := strings.Split(p.String(0), " ")
   728  
   729  		const sep = '|'
   730  		numCased := 0
   731  		test := ""
   732  		for ; len(c) >= 2; c = c[2:] {
   733  			if c[0] == "÷" && test != "" {
   734  				test += string(sep)
   735  			}
   736  			i, err := strconv.ParseUint(c[1], 16, 32)
   737  			r := rune(i)
   738  			if err != nil {
   739  				log.Fatalf("Invalid rune %q.", c[1])
   740  			}
   741  			if r == sep {
   742  				log.Fatalf("Separator %q not allowed in test data. Pick another one.", sep)
   743  			}
   744  			if cased[r] {
   745  				numCased++
   746  			}
   747  			test += string(r)
   748  		}
   749  		if numCased > 1 {
   750  			fmt.Fprintf(w, "\t\t%q,\n", test)
   751  		}
   752  	})
   753  	fmt.Fprintln(w, "\t}")
   754  
   755  	fmt.Fprintln(w, ")")
   756  
   757  	gen.WriteVersionedGoFile("tables_test.go", "cases", w.Bytes())
   758  }
   759  
   760  // These functions are just used for verification that their definition have not
   761  // changed in the Unicode Standard.
   762  
   763  func verifyCased(r rune) bool {
   764  	return verifyLower(r) || verifyUpper(r) || unicode.IsTitle(r)
   765  }
   766  
   767  func verifyLower(r rune) bool {
   768  	return unicode.IsLower(r) || unicode.Is(unicode.Other_Lowercase, r)
   769  }
   770  
   771  func verifyUpper(r rune) bool {
   772  	return unicode.IsUpper(r) || unicode.Is(unicode.Other_Uppercase, r)
   773  }
   774  
   775  // verifyIgnore is an approximation of the Case_Ignorable property using the
   776  // core unicode package. It is used to reduce the size of the test data.
   777  func verifyIgnore(r rune) bool {
   778  	props := []*unicode.RangeTable{
   779  		unicode.Mn,
   780  		unicode.Me,
   781  		unicode.Cf,
   782  		unicode.Lm,
   783  		unicode.Sk,
   784  	}
   785  	for _, p := range props {
   786  		if unicode.Is(p, r) {
   787  			return true
   788  		}
   789  	}
   790  	return false
   791  }
   792  
   793  // printProperties prints tables of rune properties from the given UCD file.
   794  // A filter func f can be given to exclude certain values. A rune r will have
   795  // the indicated property if it is in the generated table or if f(r).
   796  func printProperties(w io.Writer, file, property string, f func(r rune) bool) int {
   797  	verify := map[rune]bool{}
   798  	n := 0
   799  	varNameParts := strings.Split(property, "_")
   800  	varNameParts[0] = strings.ToLower(varNameParts[0])
   801  	fmt.Fprintf(w, "\t%s = map[rune]bool{\n", strings.Join(varNameParts, ""))
   802  	parse(file, func(p *ucd.Parser) {
   803  		if p.String(1) == property {
   804  			r := p.Rune(0)
   805  			verify[r] = true
   806  			if !f(r) {
   807  				n++
   808  				fmt.Fprintf(w, "\t\t0x%.4x: true,\n", r)
   809  			}
   810  		}
   811  	})
   812  	fmt.Fprint(w, "\t}\n\n")
   813  
   814  	// Verify that f is correct, that is, it represents a subset of the property.
   815  	for r := rune(0); r <= lastRuneForTesting; r++ {
   816  		if !verify[r] && f(r) {
   817  			log.Fatalf("Incorrect filter func for property %q.", property)
   818  		}
   819  	}
   820  	return n
   821  }
   822  
// The newCaseTrie, sparseValues and sparseOffsets definitions below are
// placeholders referred to by gen_trieval.go. The real definitions are
// generated by this program and written to tables.go.

// newCaseTrie is a stand-in that ignores its argument and returns 0.
func newCaseTrie(int) int { return 0 }

// Empty stand-ins for the generated tables of the same names.
var (
	sparseValues  [0]valueRange
	sparseOffsets [0]uint16
)
   833
View as plain text