...

Source file src/golang.org/x/text/collate/build/trie.go

Documentation: golang.org/x/text/collate/build

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // The trie in this file is used to associate the first full character
     6  // in a UTF-8 string to a collation element.
     7  // All but the last byte in a UTF-8 byte sequence are
     8  // used to look up offsets in the index table to be used for the next byte.
     9  // The last byte is used to index into a table of collation elements.
    10  // This file contains the code for the generation of the trie.
    11  
    12  package build
    13  
    14  import (
    15  	"fmt"
    16  	"hash/fnv"
    17  	"io"
    18  	"reflect"
    19  )
    20  
    21  const (
    22  	blockSize   = 64
    23  	blockOffset = 2 // Subtract 2 blocks to compensate for the 0x80 added to continuation bytes.
    24  )
    25  
    26  type trieHandle struct {
    27  	lookupStart uint16 // offset in table for first byte
    28  	valueStart  uint16 // offset in table for first byte
    29  }
    30  
    31  type trie struct {
    32  	index  []uint16
    33  	values []uint32
    34  }
    35  
    36  // trieNode is the intermediate trie structure used for generating a trie.
    37  type trieNode struct {
    38  	index    []*trieNode
    39  	value    []uint32
    40  	b        byte
    41  	refValue uint16
    42  	refIndex uint16
    43  }
    44  
    45  func newNode() *trieNode {
    46  	return &trieNode{
    47  		index: make([]*trieNode, 64),
    48  		value: make([]uint32, 128), // root node size is 128 instead of 64
    49  	}
    50  }
    51  
    52  func (n *trieNode) isInternal() bool {
    53  	return n.value != nil
    54  }
    55  
    56  func (n *trieNode) insert(r rune, value uint32) {
    57  	const maskx = 0x3F // mask out two most-significant bits
    58  	str := string(r)
    59  	if len(str) == 1 {
    60  		n.value[str[0]] = value
    61  		return
    62  	}
    63  	for i := 0; i < len(str)-1; i++ {
    64  		b := str[i] & maskx
    65  		if n.index == nil {
    66  			n.index = make([]*trieNode, blockSize)
    67  		}
    68  		nn := n.index[b]
    69  		if nn == nil {
    70  			nn = &trieNode{}
    71  			nn.b = b
    72  			n.index[b] = nn
    73  		}
    74  		n = nn
    75  	}
    76  	if n.value == nil {
    77  		n.value = make([]uint32, blockSize)
    78  	}
    79  	b := str[len(str)-1] & maskx
    80  	n.value[b] = value
    81  }
    82  
    83  type trieBuilder struct {
    84  	t *trie
    85  
    86  	roots []*trieHandle
    87  
    88  	lookupBlocks []*trieNode
    89  	valueBlocks  []*trieNode
    90  
    91  	lookupBlockIdx map[uint32]*trieNode
    92  	valueBlockIdx  map[uint32]*trieNode
    93  }
    94  
    95  func newTrieBuilder() *trieBuilder {
    96  	index := &trieBuilder{}
    97  	index.lookupBlocks = make([]*trieNode, 0)
    98  	index.valueBlocks = make([]*trieNode, 0)
    99  	index.lookupBlockIdx = make(map[uint32]*trieNode)
   100  	index.valueBlockIdx = make(map[uint32]*trieNode)
   101  	// The third nil is the default null block.  The other two blocks
   102  	// are used to guarantee an offset of at least 3 for each block.
   103  	index.lookupBlocks = append(index.lookupBlocks, nil, nil, nil)
   104  	index.t = &trie{}
   105  	return index
   106  }
   107  
   108  func (b *trieBuilder) computeOffsets(n *trieNode) *trieNode {
   109  	hasher := fnv.New32()
   110  	if n.index != nil {
   111  		for i, nn := range n.index {
   112  			var vi, vv uint16
   113  			if nn != nil {
   114  				nn = b.computeOffsets(nn)
   115  				n.index[i] = nn
   116  				vi = nn.refIndex
   117  				vv = nn.refValue
   118  			}
   119  			hasher.Write([]byte{byte(vi >> 8), byte(vi)})
   120  			hasher.Write([]byte{byte(vv >> 8), byte(vv)})
   121  		}
   122  		h := hasher.Sum32()
   123  		nn, ok := b.lookupBlockIdx[h]
   124  		if !ok {
   125  			n.refIndex = uint16(len(b.lookupBlocks)) - blockOffset
   126  			b.lookupBlocks = append(b.lookupBlocks, n)
   127  			b.lookupBlockIdx[h] = n
   128  		} else {
   129  			n = nn
   130  		}
   131  	} else {
   132  		for _, v := range n.value {
   133  			hasher.Write([]byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)})
   134  		}
   135  		h := hasher.Sum32()
   136  		nn, ok := b.valueBlockIdx[h]
   137  		if !ok {
   138  			n.refValue = uint16(len(b.valueBlocks)) - blockOffset
   139  			n.refIndex = n.refValue
   140  			b.valueBlocks = append(b.valueBlocks, n)
   141  			b.valueBlockIdx[h] = n
   142  		} else {
   143  			n = nn
   144  		}
   145  	}
   146  	return n
   147  }
   148  
   149  func (b *trieBuilder) addStartValueBlock(n *trieNode) uint16 {
   150  	hasher := fnv.New32()
   151  	for _, v := range n.value[:2*blockSize] {
   152  		hasher.Write([]byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)})
   153  	}
   154  	h := hasher.Sum32()
   155  	nn, ok := b.valueBlockIdx[h]
   156  	if !ok {
   157  		n.refValue = uint16(len(b.valueBlocks))
   158  		n.refIndex = n.refValue
   159  		b.valueBlocks = append(b.valueBlocks, n)
   160  		// Add a dummy block to accommodate the double block size.
   161  		b.valueBlocks = append(b.valueBlocks, nil)
   162  		b.valueBlockIdx[h] = n
   163  	} else {
   164  		n = nn
   165  	}
   166  	return n.refValue
   167  }
   168  
   169  func genValueBlock(t *trie, n *trieNode) {
   170  	if n != nil {
   171  		for _, v := range n.value {
   172  			t.values = append(t.values, v)
   173  		}
   174  	}
   175  }
   176  
   177  func genLookupBlock(t *trie, n *trieNode) {
   178  	for _, nn := range n.index {
   179  		v := uint16(0)
   180  		if nn != nil {
   181  			if n.index != nil {
   182  				v = nn.refIndex
   183  			} else {
   184  				v = nn.refValue
   185  			}
   186  		}
   187  		t.index = append(t.index, v)
   188  	}
   189  }
   190  
   191  func (b *trieBuilder) addTrie(n *trieNode) *trieHandle {
   192  	h := &trieHandle{}
   193  	b.roots = append(b.roots, h)
   194  	h.valueStart = b.addStartValueBlock(n)
   195  	if len(b.roots) == 1 {
   196  		// We insert a null block after the first start value block.
   197  		// This ensures that continuation bytes UTF-8 sequences of length
   198  		// greater than 2 will automatically hit a null block if there
   199  		// was an undefined entry.
   200  		b.valueBlocks = append(b.valueBlocks, nil)
   201  	}
   202  	n = b.computeOffsets(n)
   203  	// Offset by one extra block as the first byte starts at 0xC0 instead of 0x80.
   204  	h.lookupStart = n.refIndex - 1
   205  	return h
   206  }
   207  
   208  // generate generates and returns the trie for n.
   209  func (b *trieBuilder) generate() (t *trie, err error) {
   210  	t = b.t
   211  	if len(b.valueBlocks) >= 1<<16 {
   212  		return nil, fmt.Errorf("maximum number of value blocks exceeded (%d > %d)", len(b.valueBlocks), 1<<16)
   213  	}
   214  	if len(b.lookupBlocks) >= 1<<16 {
   215  		return nil, fmt.Errorf("maximum number of lookup blocks exceeded (%d > %d)", len(b.lookupBlocks), 1<<16)
   216  	}
   217  	genValueBlock(t, b.valueBlocks[0])
   218  	genValueBlock(t, &trieNode{value: make([]uint32, 64)})
   219  	for i := 2; i < len(b.valueBlocks); i++ {
   220  		genValueBlock(t, b.valueBlocks[i])
   221  	}
   222  	n := &trieNode{index: make([]*trieNode, 64)}
   223  	genLookupBlock(t, n)
   224  	genLookupBlock(t, n)
   225  	genLookupBlock(t, n)
   226  	for i := 3; i < len(b.lookupBlocks); i++ {
   227  		genLookupBlock(t, b.lookupBlocks[i])
   228  	}
   229  	return b.t, nil
   230  }
   231  
   232  func (t *trie) printArrays(w io.Writer, name string) (n, size int, err error) {
   233  	p := func(f string, a ...interface{}) {
   234  		nn, e := fmt.Fprintf(w, f, a...)
   235  		n += nn
   236  		if err == nil {
   237  			err = e
   238  		}
   239  	}
   240  	nv := len(t.values)
   241  	p("// %sValues: %d entries, %d bytes\n", name, nv, nv*4)
   242  	p("// Block 2 is the null block.\n")
   243  	p("var %sValues = [%d]uint32 {", name, nv)
   244  	var printnewline bool
   245  	for i, v := range t.values {
   246  		if i%blockSize == 0 {
   247  			p("\n\t// Block %#x, offset %#x", i/blockSize, i)
   248  		}
   249  		if i%4 == 0 {
   250  			printnewline = true
   251  		}
   252  		if v != 0 {
   253  			if printnewline {
   254  				p("\n\t")
   255  				printnewline = false
   256  			}
   257  			p("%#04x:%#08x, ", i, v)
   258  		}
   259  	}
   260  	p("\n}\n\n")
   261  	ni := len(t.index)
   262  	p("// %sLookup: %d entries, %d bytes\n", name, ni, ni*2)
   263  	p("// Block 0 is the null block.\n")
   264  	p("var %sLookup = [%d]uint16 {", name, ni)
   265  	printnewline = false
   266  	for i, v := range t.index {
   267  		if i%blockSize == 0 {
   268  			p("\n\t// Block %#x, offset %#x", i/blockSize, i)
   269  		}
   270  		if i%8 == 0 {
   271  			printnewline = true
   272  		}
   273  		if v != 0 {
   274  			if printnewline {
   275  				p("\n\t")
   276  				printnewline = false
   277  			}
   278  			p("%#03x:%#02x, ", i, v)
   279  		}
   280  	}
   281  	p("\n}\n\n")
   282  	return n, nv*4 + ni*2, err
   283  }
   284  
   285  func (t *trie) printStruct(w io.Writer, handle *trieHandle, name string) (n, sz int, err error) {
   286  	const msg = "trie{ %sLookup[%d:], %sValues[%d:], %sLookup[:], %sValues[:]}"
   287  	n, err = fmt.Fprintf(w, msg, name, handle.lookupStart*blockSize, name, handle.valueStart*blockSize, name, name)
   288  	sz += int(reflect.TypeOf(trie{}).Size())
   289  	return
   290  }
   291  

View as plain text