1 // Copyright 2014 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package triegen implements a code generator for a trie for associating 6 // unsigned integer values with UTF-8 encoded runes. 7 // 8 // Many of the go.text packages use tries for storing per-rune information. A 9 // trie is especially useful if many of the runes have the same value. If this 10 // is the case, many blocks can be expected to be shared allowing for 11 // information on many runes to be stored in little space. 12 // 13 // As most of the lookups are done directly on []byte slices, the tries use the 14 // UTF-8 bytes directly for the lookup. This saves a conversion from UTF-8 to 15 // runes and contributes a little bit to better performance. It also naturally 16 // provides a fast path for ASCII. 17 // 18 // Space is also an issue. There are many code points defined in Unicode and as 19 // a result tables can get quite large. So every byte counts. The triegen 20 // package automatically chooses the smallest integer values to represent the 21 // tables. Compacters allow further compression of the trie by allowing for 22 // alternative representations of individual trie blocks. 23 // 24 // triegen allows generating multiple tries as a single structure. This is 25 // useful when, for example, one wants to generate tries for several languages 26 // that have a lot of values in common. Some existing libraries for 27 // internationalization store all per-language data as a dynamically loadable 28 // chunk. The go.text packages are designed with the assumption that the user 29 // typically wants to compile in support for all supported languages, in line 30 // with the approach common to Go to create a single standalone binary. The 31 // multi-root trie approach can give significant storage savings in this 32 // scenario. 33 // 34 // triegen generates both tables and code. The code is optimized to use the 35 // automatically chosen data types. The following code is generated for a Trie 36 // or multiple Tries named "foo": 37 // 38 // - type fooTrie 39 // The trie type. 40 // 41 // - func newFooTrie(x int) *fooTrie 42 // Trie constructor, where x is the index of the trie passed to Gen. 43 // 44 // - func (t *fooTrie) lookup(s []byte) (v uintX, sz int) 45 // The lookup method, where uintX is automatically chosen. 46 // 47 // - func lookupString, lookupUnsafe and lookupStringUnsafe 48 // Variants of the above. 49 // 50 // - var fooValues and fooIndex and any tables generated by Compacters. 51 // The core trie data. 52 // 53 // - var fooTrieHandles 54 // Indexes of starter blocks in case of multiple trie roots. 55 // 56 // It is recommended that users test the generated trie by checking the returned 57 // value for every rune. Such exhaustive tests are possible as the number of 58 // runes in Unicode is limited. 59 package triegen // import "golang.org/x/text/internal/triegen" 60 61 // TODO: Arguably, the internally optimized data types would not have to be 62 // exposed in the generated API. We could also investigate not generating the 63 // code, but using it through a package. We would have to investigate the impact 64 // on performance of making such change, though. For packages like unicode/norm, 65 // small changes like this could tank performance. 66 67 import ( 68 "encoding/binary" 69 "fmt" 70 "hash/crc64" 71 "io" 72 "log" 73 "unicode/utf8" 74 ) 75 76 // builder builds a set of tries for associating values with runes. The set of 77 // tries can share common index and value blocks. 78 type builder struct { 79 Name string 80 81 // ValueType is the type of the trie values looked up. 82 ValueType string 83 84 // ValueSize is the byte size of the ValueType. 85 ValueSize int 86 87 // IndexType is the type of trie index values used for all UTF-8 bytes of 88 // a rune except the last one. 89 IndexType string 90 91 // IndexSize is the byte size of the IndexType. 92 IndexSize int 93 94 // SourceType is used when generating the lookup functions. If the user 95 // requests StringSupport, all lookup functions will be generated for 96 // string input as well. 97 SourceType string 98 99 Trie []*Trie 100 101 IndexBlocks []*node 102 ValueBlocks [][]uint64 103 Compactions []compaction 104 Checksum uint64 105 106 ASCIIBlock string 107 StarterBlock string 108 109 indexBlockIdx map[uint64]int 110 valueBlockIdx map[uint64]nodeIndex 111 asciiBlockIdx map[uint64]int 112 113 // Stats are used to fill out the template. 114 Stats struct { 115 NValueEntries int 116 NValueBytes int 117 NIndexEntries int 118 NIndexBytes int 119 NHandleBytes int 120 } 121 122 err error 123 } 124 125 // A nodeIndex encodes the index of a node, which is defined by the compaction 126 // which stores it and an index within the compaction. For internal nodes, the 127 // compaction is always 0. 128 type nodeIndex struct { 129 compaction int 130 index int 131 } 132 133 // compaction keeps track of stats used for the compaction. 134 type compaction struct { 135 c Compacter 136 blocks []*node 137 maxHandle uint32 138 totalSize int 139 140 // Used by template-based generator and thus exported. 141 Cutoff uint32 142 Offset uint32 143 Handler string 144 } 145 146 func (b *builder) setError(err error) { 147 if b.err == nil { 148 b.err = err 149 } 150 } 151 152 // An Option can be passed to Gen. 153 type Option func(b *builder) error 154 155 // Compact configures the trie generator to use the given Compacter. 156 func Compact(c Compacter) Option { 157 return func(b *builder) error { 158 b.Compactions = append(b.Compactions, compaction{ 159 c: c, 160 Handler: c.Handler() + "(n, b)"}) 161 return nil 162 } 163 } 164 165 // Gen writes Go code for a shared trie lookup structure to w for the given 166 // Tries. The generated trie type will be called nameTrie. newNameTrie(x) will 167 // return the *nameTrie for tries[x]. A value can be looked up by using one of 168 // the various lookup methods defined on nameTrie. It returns the table size of 169 // the generated trie. 170 func Gen(w io.Writer, name string, tries []*Trie, opts ...Option) (sz int, err error) { 171 // The index contains two dummy blocks, followed by the zero block. The zero 172 // block is at offset 0x80, so that the offset for the zero block for 173 // continuation bytes is 0. 174 b := &builder{ 175 Name: name, 176 Trie: tries, 177 IndexBlocks: []*node{{}, {}, {}}, 178 Compactions: []compaction{{ 179 Handler: name + "Values[n<<6+uint32(b)]", 180 }}, 181 // The 0 key in indexBlockIdx and valueBlockIdx is the hash of the zero 182 // block. 183 indexBlockIdx: map[uint64]int{0: 0}, 184 valueBlockIdx: map[uint64]nodeIndex{0: {}}, 185 asciiBlockIdx: map[uint64]int{}, 186 } 187 b.Compactions[0].c = (*simpleCompacter)(b) 188 189 for _, f := range opts { 190 if err := f(b); err != nil { 191 return 0, err 192 } 193 } 194 b.build() 195 if b.err != nil { 196 return 0, b.err 197 } 198 if err = b.print(w); err != nil { 199 return 0, err 200 } 201 return b.Size(), nil 202 } 203 204 // A Trie represents a single root node of a trie. A builder may build several 205 // overlapping tries at once. 206 type Trie struct { 207 root *node 208 209 hiddenTrie 210 } 211 212 // hiddenTrie contains values we want to be visible to the template generator, 213 // but hidden from the API documentation. 214 type hiddenTrie struct { 215 Name string 216 Checksum uint64 217 ASCIIIndex int 218 StarterIndex int 219 } 220 221 // NewTrie returns a new trie root. 222 func NewTrie(name string) *Trie { 223 return &Trie{ 224 &node{ 225 children: make([]*node, blockSize), 226 values: make([]uint64, utf8.RuneSelf), 227 }, 228 hiddenTrie{Name: name}, 229 } 230 } 231 232 // Gen is a convenience wrapper around the Gen func passing t as the only trie 233 // and uses the name passed to NewTrie. It returns the size of the generated 234 // tables. 235 func (t *Trie) Gen(w io.Writer, opts ...Option) (sz int, err error) { 236 return Gen(w, t.Name, []*Trie{t}, opts...) 237 } 238 239 // node is a node of the intermediate trie structure. 240 type node struct { 241 // children holds this node's children. It is always of length 64. 242 // A child node may be nil. 243 children []*node 244 245 // values contains the values of this node. If it is non-nil, this node is 246 // either a root or leaf node: 247 // For root nodes, len(values) == 128 and it maps the bytes in [0x00, 0x7F]. 248 // For leaf nodes, len(values) == 64 and it maps the bytes in [0x80, 0xBF]. 249 values []uint64 250 251 index nodeIndex 252 } 253 254 // Insert associates value with the given rune. Insert will panic if a non-zero 255 // value is passed for an invalid rune. 256 func (t *Trie) Insert(r rune, value uint64) { 257 if value == 0 { 258 return 259 } 260 s := string(r) 261 if []rune(s)[0] != r && value != 0 { 262 // Note: The UCD tables will always assign what amounts to a zero value 263 // to a surrogate. Allowing a zero value for an illegal rune allows 264 // users to iterate over [0..MaxRune] without having to explicitly 265 // exclude surrogates, which would be tedious. 266 panic(fmt.Sprintf("triegen: non-zero value for invalid rune %U", r)) 267 } 268 if len(s) == 1 { 269 // It is a root node value (ASCII). 270 t.root.values[s[0]] = value 271 return 272 } 273 274 n := t.root 275 for ; len(s) > 1; s = s[1:] { 276 if n.children == nil { 277 n.children = make([]*node, blockSize) 278 } 279 p := s[0] % blockSize 280 c := n.children[p] 281 if c == nil { 282 c = &node{} 283 n.children[p] = c 284 } 285 if len(s) > 2 && c.values != nil { 286 log.Fatalf("triegen: insert(%U): found internal node with values", r) 287 } 288 n = c 289 } 290 if n.values == nil { 291 n.values = make([]uint64, blockSize) 292 } 293 if n.children != nil { 294 log.Fatalf("triegen: insert(%U): found leaf node that also has child nodes", r) 295 } 296 n.values[s[0]-0x80] = value 297 } 298 299 // Size returns the number of bytes the generated trie will take to store. It 300 // needs to be exported as it is used in the templates. 301 func (b *builder) Size() int { 302 // Index blocks. 303 sz := len(b.IndexBlocks) * blockSize * b.IndexSize 304 305 // Skip the first compaction, which represents the normal value blocks, as 306 // its totalSize does not account for the ASCII blocks, which are managed 307 // separately. 308 sz += len(b.ValueBlocks) * blockSize * b.ValueSize 309 for _, c := range b.Compactions[1:] { 310 sz += c.totalSize 311 } 312 313 // TODO: this computation does not account for the fixed overhead of a using 314 // a compaction, either code or data. As for data, though, the typical 315 // overhead of data is in the order of bytes (2 bytes for cases). Further, 316 // the savings of using a compaction should anyway be substantial for it to 317 // be worth it. 318 319 // For multi-root tries, we also need to account for the handles. 320 if len(b.Trie) > 1 { 321 sz += 2 * b.IndexSize * len(b.Trie) 322 } 323 return sz 324 } 325 326 func (b *builder) build() { 327 // Compute the sizes of the values. 328 var vmax uint64 329 for _, t := range b.Trie { 330 vmax = maxValue(t.root, vmax) 331 } 332 b.ValueType, b.ValueSize = getIntType(vmax) 333 334 // Compute all block allocations. 335 // TODO: first compute the ASCII blocks for all tries and then the other 336 // nodes. ASCII blocks are more restricted in placement, as they require two 337 // blocks to be placed consecutively. Processing them first may improve 338 // sharing (at least one zero block can be expected to be saved.) 339 for _, t := range b.Trie { 340 b.Checksum += b.buildTrie(t) 341 } 342 343 // Compute the offsets for all the Compacters. 344 offset := uint32(0) 345 for i := range b.Compactions { 346 c := &b.Compactions[i] 347 c.Offset = offset 348 offset += c.maxHandle + 1 349 c.Cutoff = offset 350 } 351 352 // Compute the sizes of indexes. 353 // TODO: different byte positions could have different sizes. So far we have 354 // not found a case where this is beneficial. 355 imax := uint64(b.Compactions[len(b.Compactions)-1].Cutoff) 356 for _, ib := range b.IndexBlocks { 357 if x := uint64(ib.index.index); x > imax { 358 imax = x 359 } 360 } 361 b.IndexType, b.IndexSize = getIntType(imax) 362 } 363 364 func maxValue(n *node, max uint64) uint64 { 365 if n == nil { 366 return max 367 } 368 for _, c := range n.children { 369 max = maxValue(c, max) 370 } 371 for _, v := range n.values { 372 if max < v { 373 max = v 374 } 375 } 376 return max 377 } 378 379 func getIntType(v uint64) (string, int) { 380 switch { 381 case v < 1<<8: 382 return "uint8", 1 383 case v < 1<<16: 384 return "uint16", 2 385 case v < 1<<32: 386 return "uint32", 4 387 } 388 return "uint64", 8 389 } 390 391 const ( 392 blockSize = 64 393 394 // Subtract two blocks to offset 0x80, the first continuation byte. 395 blockOffset = 2 396 397 // Subtract three blocks to offset 0xC0, the first non-ASCII starter. 398 rootBlockOffset = 3 399 ) 400 401 var crcTable = crc64.MakeTable(crc64.ISO) 402 403 func (b *builder) buildTrie(t *Trie) uint64 { 404 n := t.root 405 406 // Get the ASCII offset. For the first trie, the ASCII block will be at 407 // position 0. 408 hasher := crc64.New(crcTable) 409 binary.Write(hasher, binary.BigEndian, n.values) 410 hash := hasher.Sum64() 411 412 v, ok := b.asciiBlockIdx[hash] 413 if !ok { 414 v = len(b.ValueBlocks) 415 b.asciiBlockIdx[hash] = v 416 417 b.ValueBlocks = append(b.ValueBlocks, n.values[:blockSize], n.values[blockSize:]) 418 if v == 0 { 419 // Add the zero block at position 2 so that it will be assigned a 420 // zero reference in the lookup blocks. 421 // TODO: always do this? This would allow us to remove a check from 422 // the trie lookup, but at the expense of extra space. Analyze 423 // performance for unicode/norm. 424 b.ValueBlocks = append(b.ValueBlocks, make([]uint64, blockSize)) 425 } 426 } 427 t.ASCIIIndex = v 428 429 // Compute remaining offsets. 430 t.Checksum = b.computeOffsets(n, true) 431 // We already subtracted the normal blockOffset from the index. Subtract the 432 // difference for starter bytes. 433 t.StarterIndex = n.index.index - (rootBlockOffset - blockOffset) 434 return t.Checksum 435 } 436 437 func (b *builder) computeOffsets(n *node, root bool) uint64 { 438 // For the first trie, the root lookup block will be at position 3, which is 439 // the offset for UTF-8 non-ASCII starter bytes. 440 first := len(b.IndexBlocks) == rootBlockOffset 441 if first { 442 b.IndexBlocks = append(b.IndexBlocks, n) 443 } 444 445 // We special-case the cases where all values recursively are 0. This allows 446 // for the use of a zero block to which all such values can be directed. 447 hash := uint64(0) 448 if n.children != nil || n.values != nil { 449 hasher := crc64.New(crcTable) 450 for _, c := range n.children { 451 var v uint64 452 if c != nil { 453 v = b.computeOffsets(c, false) 454 } 455 binary.Write(hasher, binary.BigEndian, v) 456 } 457 binary.Write(hasher, binary.BigEndian, n.values) 458 hash = hasher.Sum64() 459 } 460 461 if first { 462 b.indexBlockIdx[hash] = rootBlockOffset - blockOffset 463 } 464 465 // Compacters don't apply to internal nodes. 466 if n.children != nil { 467 v, ok := b.indexBlockIdx[hash] 468 if !ok { 469 v = len(b.IndexBlocks) - blockOffset 470 b.IndexBlocks = append(b.IndexBlocks, n) 471 b.indexBlockIdx[hash] = v 472 } 473 n.index = nodeIndex{0, v} 474 } else { 475 h, ok := b.valueBlockIdx[hash] 476 if !ok { 477 bestI, bestSize := 0, blockSize*b.ValueSize 478 for i, c := range b.Compactions[1:] { 479 if sz, ok := c.c.Size(n.values); ok && bestSize > sz { 480 bestI, bestSize = i+1, sz 481 } 482 } 483 c := &b.Compactions[bestI] 484 c.totalSize += bestSize 485 v := c.c.Store(n.values) 486 if c.maxHandle < v { 487 c.maxHandle = v 488 } 489 h = nodeIndex{bestI, int(v)} 490 b.valueBlockIdx[hash] = h 491 } 492 n.index = h 493 } 494 return hash 495 } 496