// Copyright 2016 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. //go:build ignore // This program generates the trie for idna operations. The Unicode casing // algorithm requires the lookup of various properties and mappings for each // rune. The table generated by this generator combines several of the most // frequently used of these into a single trie so that they can be accessed // with a single lookup. package main import ( "fmt" "io" "log" "unicode" "unicode/utf8" "golang.org/x/text/internal/gen" "golang.org/x/text/internal/triegen" "golang.org/x/text/internal/ucd" "golang.org/x/text/unicode/bidi" ) func main() { gen.Init() genTables() gen.Repackage("gen_trieval.go", "trieval.go", "idna") gen.Repackage("gen_common.go", "common_test.go", "idna") } var runes = map[rune]info{} func genTables() { t := triegen.NewTrie("idna") ucd.Parse(gen.OpenUCDFile("DerivedNormalizationProps.txt"), func(p *ucd.Parser) { r := p.Rune(0) if p.String(1) == "NFC_QC" { // p.String(2) is "N" or "M" runes[r] = mayNeedNorm } }) ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) { r := p.Rune(0) const cccVirama = 9 if p.Int(ucd.CanonicalCombiningClass) == cccVirama { runes[p.Rune(0)] = viramaModifier } switch { case unicode.In(r, unicode.Mark): runes[r] |= modifier | mayNeedNorm } // TODO: by using UnicodeData.txt we don't mark undefined codepoints // that are earmarked as RTL properly. However, an undefined cp will // always fail, so there is no need to store this info. switch p, _ := bidi.LookupRune(r); p.Class() { case bidi.R, bidi.AL, bidi.AN: if x := runes[r]; x != 0 && x != mayNeedNorm { log.Fatalf("%U: rune both modifier and RTL letter/number", r) } runes[r] = rtl } }) ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) { switch v := p.String(1); v { case "L", "D", "T", "R": runes[p.Rune(0)] |= joinType[v] << joinShift } }) ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), func(p *ucd.Parser) { r := p.Rune(0) // The mappings table explicitly defines surrogates as invalid. if !utf8.ValidRune(r) { return } cat := catFromEntry(p) isMapped := cat == mapped || cat == disallowedSTD3Mapped || cat == deviation if !isMapped { // Only include additional category information for non-mapped // runes. The additional information is only used after mapping and // the bits would clash with mapping information. // TODO: it would be possible to inline this data and avoid // additional lookups. This is quite tedious, though, so let's first // see if we need this. cat |= category(runes[r]) } s := string(p.Runes(2)) if s != "" && !isMapped { log.Fatalf("%U: Mapping with non-mapping category %d", r, cat) } t.Insert(r, uint64(makeEntry(r, s))+uint64(cat)) }) w := gen.NewCodeWriter() defer w.WriteVersionedGoFile("tables.go", "idna") gen.WriteUnicodeVersion(w) w.WriteVar("mappings", string(mappings)) w.WriteVar("mappingIndex", mappingIndex) w.WriteVar("xorData", string(xorData)) sz, err := t.Gen(w, triegen.Compact(&normCompacter{})) if err != nil { log.Fatal(err) } w.Size += sz } var ( // mappings contains replacement strings for mapped runes. mappings = []byte{} // mappingIndex contains an offset in mappingBytes representing the start // of a mapping. Then next entry in mappingIndex points past the end of the // string. mappingIndex = []uint16{0} mapCache = map[string]int{} // xorData is like mappings, except that it contains XOR data. // We split these two tables so that we don't get an overflow. xorData = []byte{} xorCache = map[string]int{} ) // makeEntry creates a trie entry. func makeEntry(r rune, mapped string) info { orig := string(r) if len(orig) != len(mapped) { // Store the mapped value as is in the mappings table. index := len(mappingIndex) - 1 if x, ok := mapCache[mapped]; ok { index = x } else { mapCache[mapped] = index mappings = append(mappings, mapped...) mappingIndex = append(mappingIndex, uint16(len(mappings))) } return info(index) << indexShift } // Create per-byte XOR mask. var b []byte for i := 0; i < len(orig); i++ { b = append(b, orig[i]^mapped[i]) } // Remove leading 0 bytes, but keep at least one byte. for ; len(b) > 1 && b[0] == 0; b = b[1:] { } if len(b) == 1 { return xorBit | inlineXOR | info(b[0])<= 0 { counts[stride]++ } v = int(x) } var maxs, maxc int for stride, cnt := range counts { if cnt > maxc || (cnt == maxc && stride < maxs) { maxs, maxc = stride, cnt } } return maxs } func countSparseEntries(a []uint64) int { stride := mostFrequentStride(a) var v, count int for _, tv := range a { if int(tv)-v != stride { if tv != 0 { count++ } } v = int(tv) } return count } func (c *normCompacter) Size(v []uint64) (sz int, ok bool) { if n := countSparseEntries(v); n <= maxSparseEntries { return (n+1)*4 + 2, true } return 0, false } func (c *normCompacter) Store(v []uint64) uint32 { h := uint32(len(c.sparseOffset)) c.sparseBlocks = append(c.sparseBlocks, v) c.sparseOffset = append(c.sparseOffset, uint16(c.sparseCount)) c.sparseCount += countSparseEntries(v) + 1 return h } func (c *normCompacter) Handler() string { return "idnaSparse.lookup" } func (c *normCompacter) Print(w io.Writer) (retErr error) { p := func(f string, x ...interface{}) { if _, err := fmt.Fprintf(w, f, x...); retErr == nil && err != nil { retErr = err } } ls := len(c.sparseBlocks) p("// idnaSparseOffset: %d entries, %d bytes\n", ls, ls*2) p("var idnaSparseOffset = %#v\n\n", c.sparseOffset) ns := c.sparseCount p("// idnaSparseValues: %d entries, %d bytes\n", ns, ns*4) p("var idnaSparseValues = [%d]valueRange {", ns) for i, b := range c.sparseBlocks { p("\n// Block %#x, offset %#x", i, c.sparseOffset[i]) var v int stride := mostFrequentStride(b) n := countSparseEntries(b) p("\n{value:%#04x,lo:%#02x},", stride, uint8(n)) for i, nv := range b { if int(nv)-v != stride { if v != 0 { p(",hi:%#02x},", 0x80+i-1) } if nv != 0 { p("\n{value:%#04x,lo:%#02x", nv, 0x80+i) } } v = int(nv) } if v != 0 { p(",hi:%#02x},", 0x80+len(b)-1) } } p("\n}\n\n") return }