contract.go

Documentation: golang.org/x/text/collate/build

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package build
     6  
     7  import (
     8  	"fmt"
     9  	"io"
    10  	"reflect"
    11  	"sort"
    12  	"strings"
    13  
    14  	"golang.org/x/text/internal/colltab"
    15  )
    16  
    17  // This file contains code for detecting contractions and generating
    18  // the necessary tables.
    19  // Any Unicode Collation Algorithm (UCA) table entry that has more than
    20  // one rune one the left-hand side is called a contraction.
    21  // See https://www.unicode.org/reports/tr10/#Contractions for more details.
    22  //
    23  // We define the following terms:
    24  //   initial:     a rune that appears as the first rune in a contraction.
    25  //   suffix:      a sequence of runes succeeding the initial rune
    26  //                in a given contraction.
    27  //   non-initial: a rune that appears in a suffix.
    28  //
    29  // A rune may be both an initial and a non-initial and may be so in
    30  // many contractions.  An initial may typically also appear by itself.
    31  // In case of ambiguities, the UCA requires we match the longest
    32  // contraction.
    33  //
    34  // Many contraction rules share the same set of possible suffixes.
    35  // We store sets of suffixes in a trie that associates an index with
    36  // each suffix in the set.  This index can be used to look up a
    37  // collation element associated with the (starter rune, suffix) pair.
    38  //
    39  // The trie is defined on a UTF-8 byte sequence.
    40  // The overall trie is represented as an array of ctEntries.  Each node of the trie
    41  // is represented as a subsequence of ctEntries, where each entry corresponds to
    42  // a possible match of a next character in the search string.  An entry
    43  // also includes the length and offset to the next sequence of entries
    44  // to check in case of a match.
    45  
    46  const (
    47  	final   = 0
    48  	noIndex = 0xFF
    49  )
    50  
    51  // ctEntry associates to a matching byte an offset and/or next sequence of
    52  // bytes to check. A ctEntry c is called final if a match means that the
    53  // longest suffix has been found.  An entry c is final if c.N == 0.
    54  // A single final entry can match a range of characters to an offset.
    55  // A non-final entry always matches a single byte. Note that a non-final
    56  // entry might still resemble a completed suffix.
    57  // Examples:
    58  // The suffix strings "ab" and "ac" can be represented as:
    59  //
    60  //	[]ctEntry{
    61  //		{'a', 1, 1, noIndex},  // 'a' by itself does not match, so i is 0xFF.
    62  //		{'b', 'c', 0, 1},   // "ab" -> 1, "ac" -> 2
    63  //	}
    64  //
    65  // The suffix strings "ab", "abc", "abd", and "abcd" can be represented as:
    66  //
    67  //	[]ctEntry{
    68  //		{'a', 1, 1, noIndex}, // 'a' must be followed by 'b'.
    69  //		{'b', 1, 2, 1},    // "ab" -> 1, may be followed by 'c' or 'd'.
    70  //		{'d', 'd', final, 3},  // "abd" -> 3
    71  //		{'c', 4, 1, 2},    // "abc" -> 2, may be followed by 'd'.
    72  //		{'d', 'd', final, 4},  // "abcd" -> 4
    73  //	}
    74  //
    75  // See genStateTests in contract_test.go for more examples.
    76  type ctEntry struct {
    77  	L uint8 // non-final: byte value to match; final: lowest match in range.
    78  	H uint8 // non-final: relative index to next block; final: highest match in range.
    79  	N uint8 // non-final: length of next block; final: final
    80  	I uint8 // result offset. Will be noIndex if more bytes are needed to complete.
    81  }
    82  
    83  // contractTrieSet holds a set of contraction tries. The tries are stored
    84  // consecutively in the entry field.
    85  type contractTrieSet []struct{ l, h, n, i uint8 }
    86  
    87  // ctHandle is used to identify a trie in the trie set, consisting in an offset
    88  // in the array and the size of the first node.
    89  type ctHandle struct {
    90  	index, n int
    91  }
    92  
    93  // appendTrie adds a new trie for the given suffixes to the trie set and returns
    94  // a handle to it.  The handle will be invalid on error.
    95  func appendTrie(ct *colltab.ContractTrieSet, suffixes []string) (ctHandle, error) {
    96  	es := make([]stridx, len(suffixes))
    97  	for i, s := range suffixes {
    98  		es[i].str = s
    99  	}
   100  	sort.Sort(offsetSort(es))
   101  	for i := range es {
   102  		es[i].index = i + 1
   103  	}
   104  	sort.Sort(genidxSort(es))
   105  	i := len(*ct)
   106  	n, err := genStates(ct, es)
   107  	if err != nil {
   108  		*ct = (*ct)[:i]
   109  		return ctHandle{}, err
   110  	}
   111  	return ctHandle{i, n}, nil
   112  }
   113  
   114  // genStates generates ctEntries for a given suffix set and returns
   115  // the number of entries for the first node.
   116  func genStates(ct *colltab.ContractTrieSet, sis []stridx) (int, error) {
   117  	if len(sis) == 0 {
   118  		return 0, fmt.Errorf("genStates: list of suffices must be non-empty")
   119  	}
   120  	start := len(*ct)
   121  	// create entries for differing first bytes.
   122  	for _, si := range sis {
   123  		s := si.str
   124  		if len(s) == 0 {
   125  			continue
   126  		}
   127  		added := false
   128  		c := s[0]
   129  		if len(s) > 1 {
   130  			for j := len(*ct) - 1; j >= start; j-- {
   131  				if (*ct)[j].L == c {
   132  					added = true
   133  					break
   134  				}
   135  			}
   136  			if !added {
   137  				*ct = append(*ct, ctEntry{L: c, I: noIndex})
   138  			}
   139  		} else {
   140  			for j := len(*ct) - 1; j >= start; j-- {
   141  				// Update the offset for longer suffixes with the same byte.
   142  				if (*ct)[j].L == c {
   143  					(*ct)[j].I = uint8(si.index)
   144  					added = true
   145  				}
   146  				// Extend range of final ctEntry, if possible.
   147  				if (*ct)[j].H+1 == c {
   148  					(*ct)[j].H = c
   149  					added = true
   150  				}
   151  			}
   152  			if !added {
   153  				*ct = append(*ct, ctEntry{L: c, H: c, N: final, I: uint8(si.index)})
   154  			}
   155  		}
   156  	}
   157  	n := len(*ct) - start
   158  	// Append nodes for the remainder of the suffixes for each ctEntry.
   159  	sp := 0
   160  	for i, end := start, len(*ct); i < end; i++ {
   161  		fe := (*ct)[i]
   162  		if fe.H == 0 { // uninitialized non-final
   163  			ln := len(*ct) - start - n
   164  			if ln > 0xFF {
   165  				return 0, fmt.Errorf("genStates: relative block offset too large: %d > 255", ln)
   166  			}
   167  			fe.H = uint8(ln)
   168  			// Find first non-final strings with same byte as current entry.
   169  			for ; sis[sp].str[0] != fe.L; sp++ {
   170  			}
   171  			se := sp + 1
   172  			for ; se < len(sis) && len(sis[se].str) > 1 && sis[se].str[0] == fe.L; se++ {
   173  			}
   174  			sl := sis[sp:se]
   175  			sp = se
   176  			for i, si := range sl {
   177  				sl[i].str = si.str[1:]
   178  			}
   179  			nn, err := genStates(ct, sl)
   180  			if err != nil {
   181  				return 0, err
   182  			}
   183  			fe.N = uint8(nn)
   184  			(*ct)[i] = fe
   185  		}
   186  	}
   187  	sort.Sort(entrySort((*ct)[start : start+n]))
   188  	return n, nil
   189  }
   190  
   191  // There may be both a final and non-final entry for a byte if the byte
   192  // is implied in a range of matches in the final entry.
   193  // We need to ensure that the non-final entry comes first in that case.
   194  type entrySort colltab.ContractTrieSet
   195  
   196  func (fe entrySort) Len() int      { return len(fe) }
   197  func (fe entrySort) Swap(i, j int) { fe[i], fe[j] = fe[j], fe[i] }
   198  func (fe entrySort) Less(i, j int) bool {
   199  	return fe[i].L > fe[j].L
   200  }
   201  
   202  // stridx is used for sorting suffixes and their associated offsets.
   203  type stridx struct {
   204  	str   string
   205  	index int
   206  }
   207  
   208  // For computing the offsets, we first sort by size, and then by string.
   209  // This ensures that strings that only differ in the last byte by 1
   210  // are sorted consecutively in increasing order such that they can
   211  // be packed as a range in a final ctEntry.
   212  type offsetSort []stridx
   213  
   214  func (si offsetSort) Len() int      { return len(si) }
   215  func (si offsetSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] }
   216  func (si offsetSort) Less(i, j int) bool {
   217  	if len(si[i].str) != len(si[j].str) {
   218  		return len(si[i].str) > len(si[j].str)
   219  	}
   220  	return si[i].str < si[j].str
   221  }
   222  
   223  // For indexing, we want to ensure that strings are sorted in string order, where
   224  // for strings with the same prefix, we put longer strings before shorter ones.
   225  type genidxSort []stridx
   226  
   227  func (si genidxSort) Len() int      { return len(si) }
   228  func (si genidxSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] }
   229  func (si genidxSort) Less(i, j int) bool {
   230  	if strings.HasPrefix(si[j].str, si[i].str) {
   231  		return false
   232  	}
   233  	if strings.HasPrefix(si[i].str, si[j].str) {
   234  		return true
   235  	}
   236  	return si[i].str < si[j].str
   237  }
   238  
   239  // lookup matches the longest suffix in str and returns the associated offset
   240  // and the number of bytes consumed.
   241  func lookup(ct *colltab.ContractTrieSet, h ctHandle, str []byte) (index, ns int) {
   242  	states := (*ct)[h.index:]
   243  	p := 0
   244  	n := h.n
   245  	for i := 0; i < n && p < len(str); {
   246  		e := states[i]
   247  		c := str[p]
   248  		if c >= e.L {
   249  			if e.L == c {
   250  				p++
   251  				if e.I != noIndex {
   252  					index, ns = int(e.I), p
   253  				}
   254  				if e.N != final {
   255  					// set to new state
   256  					i, states, n = 0, states[int(e.H)+n:], int(e.N)
   257  				} else {
   258  					return
   259  				}
   260  				continue
   261  			} else if e.N == final && c <= e.H {
   262  				p++
   263  				return int(c-e.L) + int(e.I), p
   264  			}
   265  		}
   266  		i++
   267  	}
   268  	return
   269  }
   270  
   271  // print writes the contractTrieSet t as compilable Go code to w. It returns
   272  // the total number of bytes written and the size of the resulting data structure in bytes.
   273  func print(t *colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
   274  	update3 := func(nn, sz int, e error) {
   275  		n += nn
   276  		if err == nil {
   277  			err = e
   278  		}
   279  		size += sz
   280  	}
   281  	update2 := func(nn int, e error) { update3(nn, 0, e) }
   282  
   283  	update3(printArray(*t, w, name))
   284  	update2(fmt.Fprintf(w, "var %sContractTrieSet = ", name))
   285  	update3(printStruct(*t, w, name))
   286  	update2(fmt.Fprintln(w))
   287  	return
   288  }
   289  
   290  func printArray(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
   291  	p := func(f string, a ...interface{}) {
   292  		nn, e := fmt.Fprintf(w, f, a...)
   293  		n += nn
   294  		if err == nil {
   295  			err = e
   296  		}
   297  	}
   298  	size = len(ct) * 4
   299  	p("// %sCTEntries: %d entries, %d bytes\n", name, len(ct), size)
   300  	p("var %sCTEntries = [%d]struct{L,H,N,I uint8}{\n", name, len(ct))
   301  	for _, fe := range ct {
   302  		p("\t{0x%X, 0x%X, %d, %d},\n", fe.L, fe.H, fe.N, fe.I)
   303  	}
   304  	p("}\n")
   305  	return
   306  }
   307  
   308  func printStruct(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
   309  	n, err = fmt.Fprintf(w, "colltab.ContractTrieSet( %sCTEntries[:] )", name)
   310  	size = int(reflect.TypeOf(ct).Size())
   311  	return
   312  }
   313
View as plain text