...

Source file src/golang.org/x/text/internal/export/idna/gen.go

Documentation: golang.org/x/text/internal/export/idna

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build ignore
     6  
     7  // This program generates the trie for idna operations. The Unicode casing
     8  // algorithm requires the lookup of various properties and mappings for each
     9  // rune. The table generated by this generator combines several of the most
    10  // frequently used of these into a single trie so that they can be accessed
    11  // with a single lookup.
    12  package main
    13  
    14  import (
    15  	"fmt"
    16  	"io"
    17  	"log"
    18  	"unicode"
    19  	"unicode/utf8"
    20  
    21  	"golang.org/x/text/internal/gen"
    22  	"golang.org/x/text/internal/triegen"
    23  	"golang.org/x/text/internal/ucd"
    24  	"golang.org/x/text/unicode/bidi"
    25  )
    26  
    27  func main() {
    28  	gen.Init()
    29  	genTables()
    30  	gen.Repackage("gen_trieval.go", "trieval.go", "idna")
    31  	gen.Repackage("gen_common.go", "common_test.go", "idna")
    32  }
    33  
    34  var runes = map[rune]info{}
    35  
    36  func genTables() {
    37  	t := triegen.NewTrie("idna")
    38  
    39  	ucd.Parse(gen.OpenUCDFile("DerivedNormalizationProps.txt"), func(p *ucd.Parser) {
    40  		r := p.Rune(0)
    41  		if p.String(1) == "NFC_QC" { // p.String(2) is "N" or "M"
    42  			runes[r] = mayNeedNorm
    43  		}
    44  	})
    45  	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
    46  		r := p.Rune(0)
    47  
    48  		const cccVirama = 9
    49  		if p.Int(ucd.CanonicalCombiningClass) == cccVirama {
    50  			runes[p.Rune(0)] = viramaModifier
    51  		}
    52  		switch {
    53  		case unicode.In(r, unicode.Mark):
    54  			runes[r] |= modifier | mayNeedNorm
    55  		}
    56  		// TODO: by using UnicodeData.txt we don't mark undefined codepoints
    57  		// that are earmarked as RTL properly. However, an undefined cp will
    58  		// always fail, so there is no need to store this info.
    59  		switch p, _ := bidi.LookupRune(r); p.Class() {
    60  		case bidi.R, bidi.AL, bidi.AN:
    61  			if x := runes[r]; x != 0 && x != mayNeedNorm {
    62  				log.Fatalf("%U: rune both modifier and RTL letter/number", r)
    63  			}
    64  			runes[r] = rtl
    65  		}
    66  	})
    67  
    68  	ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
    69  		switch v := p.String(1); v {
    70  		case "L", "D", "T", "R":
    71  			runes[p.Rune(0)] |= joinType[v] << joinShift
    72  		}
    73  	})
    74  
    75  	ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), func(p *ucd.Parser) {
    76  		r := p.Rune(0)
    77  
    78  		// The mappings table explicitly defines surrogates as invalid.
    79  		if !utf8.ValidRune(r) {
    80  			return
    81  		}
    82  
    83  		cat := catFromEntry(p)
    84  		isMapped := cat == mapped || cat == disallowedSTD3Mapped || cat == deviation
    85  		if !isMapped {
    86  			// Only include additional category information for non-mapped
    87  			// runes. The additional information is only used after mapping and
    88  			// the bits would clash with mapping information.
    89  			// TODO: it would be possible to inline this data and avoid
    90  			// additional lookups. This is quite tedious, though, so let's first
    91  			// see if we need this.
    92  			cat |= category(runes[r])
    93  		}
    94  
    95  		s := string(p.Runes(2))
    96  		if s != "" && !isMapped {
    97  			log.Fatalf("%U: Mapping with non-mapping category %d", r, cat)
    98  		}
    99  		t.Insert(r, uint64(makeEntry(r, s))+uint64(cat))
   100  	})
   101  
   102  	w := gen.NewCodeWriter()
   103  	defer w.WriteVersionedGoFile("tables.go", "idna")
   104  
   105  	gen.WriteUnicodeVersion(w)
   106  
   107  	w.WriteVar("mappings", string(mappings))
   108  	w.WriteVar("mappingIndex", mappingIndex)
   109  	w.WriteVar("xorData", string(xorData))
   110  
   111  	sz, err := t.Gen(w, triegen.Compact(&normCompacter{}))
   112  	if err != nil {
   113  		log.Fatal(err)
   114  	}
   115  	w.Size += sz
   116  }
   117  
   118  var (
   119  	// mappings contains replacement strings for mapped runes.
   120  	mappings = []byte{}
   121  
   122  	// mappingIndex contains an offset in mappingBytes representing the start
   123  	// of a mapping. Then next entry in mappingIndex points past the end of the
   124  	// string.
   125  	mappingIndex = []uint16{0}
   126  	mapCache     = map[string]int{}
   127  
   128  	// xorData is like mappings, except that it contains XOR data.
   129  	// We split these two tables so that we don't get an overflow.
   130  	xorData  = []byte{}
   131  	xorCache = map[string]int{}
   132  )
   133  
   134  // makeEntry creates a trie entry.
   135  func makeEntry(r rune, mapped string) info {
   136  	orig := string(r)
   137  
   138  	if len(orig) != len(mapped) {
   139  		// Store the mapped value as is in the mappings table.
   140  		index := len(mappingIndex) - 1
   141  		if x, ok := mapCache[mapped]; ok {
   142  			index = x
   143  		} else {
   144  			mapCache[mapped] = index
   145  			mappings = append(mappings, mapped...)
   146  			mappingIndex = append(mappingIndex, uint16(len(mappings)))
   147  		}
   148  		return info(index) << indexShift
   149  	}
   150  
   151  	// Create per-byte XOR mask.
   152  	var b []byte
   153  	for i := 0; i < len(orig); i++ {
   154  		b = append(b, orig[i]^mapped[i])
   155  	}
   156  
   157  	// Remove leading 0 bytes, but keep at least one byte.
   158  	for ; len(b) > 1 && b[0] == 0; b = b[1:] {
   159  	}
   160  
   161  	if len(b) == 1 {
   162  		return xorBit | inlineXOR | info(b[0])<<indexShift
   163  	}
   164  	mapped = string(b)
   165  
   166  	// Store the mapped value as is in the mappings table.
   167  	index := len(xorData)
   168  	if x, ok := xorCache[mapped]; ok {
   169  		index = x
   170  	} else {
   171  		xorCache[mapped] = index
   172  		xorData = append(xorData, byte(len(mapped)))
   173  		xorData = append(xorData, mapped...)
   174  	}
   175  	return xorBit | info(index)<<indexShift
   176  }
   177  
   178  // The following code implements a triegen.Compacter that was originally
   179  // designed for normalization. The IDNA table has some similarities with the
   180  // norm table. Using this compacter, together with the XOR pattern approach,
   181  // reduces the table size by roughly 100K. It can probably be compressed further
   182  // by also including elements of the compacter used by cases, but for now it is
   183  // good enough.
   184  
   185  const maxSparseEntries = 16
   186  
   187  type normCompacter struct {
   188  	sparseBlocks [][]uint64
   189  	sparseOffset []uint16
   190  	sparseCount  int
   191  }
   192  
   193  func mostFrequentStride(a []uint64) int {
   194  	counts := make(map[int]int)
   195  	var v int
   196  	for _, x := range a {
   197  		if stride := int(x) - v; v != 0 && stride >= 0 {
   198  			counts[stride]++
   199  		}
   200  		v = int(x)
   201  	}
   202  	var maxs, maxc int
   203  	for stride, cnt := range counts {
   204  		if cnt > maxc || (cnt == maxc && stride < maxs) {
   205  			maxs, maxc = stride, cnt
   206  		}
   207  	}
   208  	return maxs
   209  }
   210  
   211  func countSparseEntries(a []uint64) int {
   212  	stride := mostFrequentStride(a)
   213  	var v, count int
   214  	for _, tv := range a {
   215  		if int(tv)-v != stride {
   216  			if tv != 0 {
   217  				count++
   218  			}
   219  		}
   220  		v = int(tv)
   221  	}
   222  	return count
   223  }
   224  
   225  func (c *normCompacter) Size(v []uint64) (sz int, ok bool) {
   226  	if n := countSparseEntries(v); n <= maxSparseEntries {
   227  		return (n+1)*4 + 2, true
   228  	}
   229  	return 0, false
   230  }
   231  
   232  func (c *normCompacter) Store(v []uint64) uint32 {
   233  	h := uint32(len(c.sparseOffset))
   234  	c.sparseBlocks = append(c.sparseBlocks, v)
   235  	c.sparseOffset = append(c.sparseOffset, uint16(c.sparseCount))
   236  	c.sparseCount += countSparseEntries(v) + 1
   237  	return h
   238  }
   239  
   240  func (c *normCompacter) Handler() string {
   241  	return "idnaSparse.lookup"
   242  }
   243  
   244  func (c *normCompacter) Print(w io.Writer) (retErr error) {
   245  	p := func(f string, x ...interface{}) {
   246  		if _, err := fmt.Fprintf(w, f, x...); retErr == nil && err != nil {
   247  			retErr = err
   248  		}
   249  	}
   250  
   251  	ls := len(c.sparseBlocks)
   252  	p("// idnaSparseOffset: %d entries, %d bytes\n", ls, ls*2)
   253  	p("var idnaSparseOffset = %#v\n\n", c.sparseOffset)
   254  
   255  	ns := c.sparseCount
   256  	p("// idnaSparseValues: %d entries, %d bytes\n", ns, ns*4)
   257  	p("var idnaSparseValues = [%d]valueRange {", ns)
   258  	for i, b := range c.sparseBlocks {
   259  		p("\n// Block %#x, offset %#x", i, c.sparseOffset[i])
   260  		var v int
   261  		stride := mostFrequentStride(b)
   262  		n := countSparseEntries(b)
   263  		p("\n{value:%#04x,lo:%#02x},", stride, uint8(n))
   264  		for i, nv := range b {
   265  			if int(nv)-v != stride {
   266  				if v != 0 {
   267  					p(",hi:%#02x},", 0x80+i-1)
   268  				}
   269  				if nv != 0 {
   270  					p("\n{value:%#04x,lo:%#02x", nv, 0x80+i)
   271  				}
   272  			}
   273  			v = int(nv)
   274  		}
   275  		if v != 0 {
   276  			p(",hi:%#02x},", 0x80+len(b)-1)
   277  		}
   278  	}
   279  	p("\n}\n\n")
   280  	return
   281  }
   282  

View as plain text