...

Source file src/golang.org/x/text/collate/build/builder.go

Documentation: golang.org/x/text/collate/build

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package build // import "golang.org/x/text/collate/build"
     6  
     7  import (
     8  	"fmt"
     9  	"io"
    10  	"log"
    11  	"sort"
    12  	"strings"
    13  	"unicode/utf8"
    14  
    15  	"golang.org/x/text/internal/colltab"
    16  	"golang.org/x/text/language"
    17  	"golang.org/x/text/unicode/norm"
    18  )
    19  
    20  // TODO: optimizations:
    21  // - expandElem is currently 20K. By putting unique colElems in a separate
    22  //   table and having a byte array of indexes into this table, we can reduce
    23  //   the total size to about 7K. By also factoring out the length bytes, we
    24  //   can reduce this to about 6K.
    25  // - trie valueBlocks are currently 100K. There are a lot of sparse blocks
    26  //   and many consecutive values with the same stride. This can be further
    27  //   compacted.
    28  // - Compress secondary weights into 8 bits.
    29  // - Some LDML specs specify a context element. Currently we simply concatenate
    30  //   those.  Context can be implemented using the contraction trie. If Builder
    31  //   could analyze and detect when using a context makes sense, there is no
    32  //   need to expose this construct in the API.
    33  
    34  // A Builder builds a root collation table.  The user must specify the
    35  // collation elements for each entry.  A common use will be to base the weights
    36  // on those specified in the allkeys* file as provided by the UCA or CLDR.
    37  type Builder struct {
    38  	index  *trieBuilder
    39  	root   ordering
    40  	locale []*Tailoring
    41  	t      *table
    42  	err    error
    43  	built  bool
    44  
    45  	minNonVar int // lowest primary recorded for a variable
    46  	varTop    int // highest primary recorded for a non-variable
    47  
    48  	// indexes used for reusing expansions and contractions
    49  	expIndex map[string]int      // positions of expansions keyed by their string representation
    50  	ctHandle map[string]ctHandle // contraction handles keyed by a concatenation of the suffixes
    51  	ctElem   map[string]int      // contraction elements keyed by their string representation
    52  }
    53  
    54  // A Tailoring builds a collation table based on another collation table.
    55  // The table is defined by specifying tailorings to the underlying table.
    56  // See https://unicode.org/reports/tr35/ for an overview of tailoring
    57  // collation tables.  The CLDR contains pre-defined tailorings for a variety
    58  // of languages (See https://www.unicode.org/Public/cldr/<version>/core.zip.)
    59  type Tailoring struct {
    60  	id      string
    61  	builder *Builder
    62  	index   *ordering
    63  
    64  	anchor *entry
    65  	before bool
    66  }
    67  
    68  // NewBuilder returns a new Builder.
    69  func NewBuilder() *Builder {
    70  	return &Builder{
    71  		index:    newTrieBuilder(),
    72  		root:     makeRootOrdering(),
    73  		expIndex: make(map[string]int),
    74  		ctHandle: make(map[string]ctHandle),
    75  		ctElem:   make(map[string]int),
    76  	}
    77  }
    78  
    79  // Tailoring returns a Tailoring for the given locale.  One should
    80  // have completed all calls to Add before calling Tailoring.
    81  func (b *Builder) Tailoring(loc language.Tag) *Tailoring {
    82  	t := &Tailoring{
    83  		id:      loc.String(),
    84  		builder: b,
    85  		index:   b.root.clone(),
    86  	}
    87  	t.index.id = t.id
    88  	b.locale = append(b.locale, t)
    89  	return t
    90  }
    91  
    92  // Add adds an entry to the collation element table, mapping
    93  // a slice of runes to a sequence of collation elements.
    94  // A collation element is specified as list of weights: []int{primary, secondary, ...}.
    95  // The entries are typically obtained from a collation element table
    96  // as defined in https://www.unicode.org/reports/tr10/#Data_Table_Format.
    97  // Note that the collation elements specified by colelems are only used
    98  // as a guide.  The actual weights generated by Builder may differ.
    99  // The argument variables is a list of indices into colelems that should contain
   100  // a value for each colelem that is a variable. (See the reference above.)
   101  func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error {
   102  	str := string(runes)
   103  	elems := make([]rawCE, len(colelems))
   104  	for i, ce := range colelems {
   105  		if len(ce) == 0 {
   106  			break
   107  		}
   108  		elems[i] = makeRawCE(ce, 0)
   109  		if len(ce) == 1 {
   110  			elems[i].w[1] = defaultSecondary
   111  		}
   112  		if len(ce) <= 2 {
   113  			elems[i].w[2] = defaultTertiary
   114  		}
   115  		if len(ce) <= 3 {
   116  			elems[i].w[3] = ce[0]
   117  		}
   118  	}
   119  	for i, ce := range elems {
   120  		p := ce.w[0]
   121  		isvar := false
   122  		for _, j := range variables {
   123  			if i == j {
   124  				isvar = true
   125  			}
   126  		}
   127  		if isvar {
   128  			if p >= b.minNonVar && b.minNonVar > 0 {
   129  				return fmt.Errorf("primary value %X of variable is larger than the smallest non-variable %X", p, b.minNonVar)
   130  			}
   131  			if p > b.varTop {
   132  				b.varTop = p
   133  			}
   134  		} else if p > 1 { // 1 is a special primary value reserved for FFFE
   135  			if p <= b.varTop {
   136  				return fmt.Errorf("primary value %X of non-variable is smaller than the highest variable %X", p, b.varTop)
   137  			}
   138  			if b.minNonVar == 0 || p < b.minNonVar {
   139  				b.minNonVar = p
   140  			}
   141  		}
   142  	}
   143  	elems, err := convertLargeWeights(elems)
   144  	if err != nil {
   145  		return err
   146  	}
   147  	cccs := []uint8{}
   148  	nfd := norm.NFD.String(str)
   149  	for i := range nfd {
   150  		cccs = append(cccs, norm.NFD.PropertiesString(nfd[i:]).CCC())
   151  	}
   152  	if len(cccs) < len(elems) {
   153  		if len(cccs) > 2 {
   154  			return fmt.Errorf("number of decomposed characters should be greater or equal to the number of collation elements for len(colelems) > 3 (%d < %d)", len(cccs), len(elems))
   155  		}
   156  		p := len(elems) - 1
   157  		for ; p > 0 && elems[p].w[0] == 0; p-- {
   158  			elems[p].ccc = cccs[len(cccs)-1]
   159  		}
   160  		for ; p >= 0; p-- {
   161  			elems[p].ccc = cccs[0]
   162  		}
   163  	} else {
   164  		for i := range elems {
   165  			elems[i].ccc = cccs[i]
   166  		}
   167  	}
   168  	// doNorm in collate.go assumes that the following conditions hold.
   169  	if len(elems) > 1 && len(cccs) > 1 && cccs[0] != 0 && cccs[0] != cccs[len(cccs)-1] {
   170  		return fmt.Errorf("incompatible CCC values for expansion %X (%d)", runes, cccs)
   171  	}
   172  	b.root.newEntry(str, elems)
   173  	return nil
   174  }
   175  
   176  func (t *Tailoring) setAnchor(anchor string) error {
   177  	anchor = norm.NFC.String(anchor)
   178  	a := t.index.find(anchor)
   179  	if a == nil {
   180  		a = t.index.newEntry(anchor, nil)
   181  		a.implicit = true
   182  		a.modified = true
   183  		for _, r := range []rune(anchor) {
   184  			e := t.index.find(string(r))
   185  			e.lock = true
   186  		}
   187  	}
   188  	t.anchor = a
   189  	return nil
   190  }
   191  
   192  // SetAnchor sets the point after which elements passed in subsequent calls to
   193  // Insert will be inserted.  It is equivalent to the reset directive in an LDML
   194  // specification.  See Insert for an example.
   195  // SetAnchor supports the following logical reset positions:
   196  // <first_tertiary_ignorable/>, <last_teriary_ignorable/>, <first_primary_ignorable/>,
   197  // and <last_non_ignorable/>.
   198  func (t *Tailoring) SetAnchor(anchor string) error {
   199  	if err := t.setAnchor(anchor); err != nil {
   200  		return err
   201  	}
   202  	t.before = false
   203  	return nil
   204  }
   205  
   206  // SetAnchorBefore is similar to SetAnchor, except that subsequent calls to
   207  // Insert will insert entries before the anchor.
   208  func (t *Tailoring) SetAnchorBefore(anchor string) error {
   209  	if err := t.setAnchor(anchor); err != nil {
   210  		return err
   211  	}
   212  	t.before = true
   213  	return nil
   214  }
   215  
   216  // Insert sets the ordering of str relative to the entry set by the previous
   217  // call to SetAnchor or Insert.  The argument extend corresponds
   218  // to the extend elements as defined in LDML.  A non-empty value for extend
   219  // will cause the collation elements corresponding to extend to be appended
   220  // to the collation elements generated for the entry added by Insert.
   221  // This has the same net effect as sorting str after the string anchor+extend.
   222  // See https://www.unicode.org/reports/tr10/#Tailoring_Example for details
   223  // on parametric tailoring and https://unicode.org/reports/tr35/#Collation_Elements
   224  // for full details on LDML.
   225  //
   226  // Examples: create a tailoring for Swedish, where "ä" is ordered after "z"
   227  // at the primary sorting level:
   228  //
   229  //	t := b.Tailoring("se")
   230  //	t.SetAnchor("z")
   231  //	t.Insert(colltab.Primary, "ä", "")
   232  //
   233  // Order "ü" after "ue" at the secondary sorting level:
   234  //
   235  //	t.SetAnchor("ue")
   236  //	t.Insert(colltab.Secondary, "ü","")
   237  //
   238  // or
   239  //
   240  //	t.SetAnchor("u")
   241  //	t.Insert(colltab.Secondary, "ü", "e")
   242  //
   243  // Order "q" after "ab" at the secondary level and "Q" after "q"
   244  // at the tertiary level:
   245  //
   246  //	t.SetAnchor("ab")
   247  //	t.Insert(colltab.Secondary, "q", "")
   248  //	t.Insert(colltab.Tertiary, "Q", "")
   249  //
   250  // Order "b" before "a":
   251  //
   252  //	t.SetAnchorBefore("a")
   253  //	t.Insert(colltab.Primary, "b", "")
   254  //
   255  // Order "0" after the last primary ignorable:
   256  //
   257  //	t.SetAnchor("<last_primary_ignorable/>")
   258  //	t.Insert(colltab.Primary, "0", "")
   259  func (t *Tailoring) Insert(level colltab.Level, str, extend string) error {
   260  	if t.anchor == nil {
   261  		return fmt.Errorf("%s:Insert: no anchor point set for tailoring of %s", t.id, str)
   262  	}
   263  	str = norm.NFC.String(str)
   264  	e := t.index.find(str)
   265  	if e == nil {
   266  		e = t.index.newEntry(str, nil)
   267  	} else if e.logical != noAnchor {
   268  		return fmt.Errorf("%s:Insert: cannot reinsert logical reset position %q", t.id, e.str)
   269  	}
   270  	if e.lock {
   271  		return fmt.Errorf("%s:Insert: cannot reinsert element %q", t.id, e.str)
   272  	}
   273  	a := t.anchor
   274  	// Find the first element after the anchor which differs at a level smaller or
   275  	// equal to the given level.  Then insert at this position.
   276  	// See https://unicode.org/reports/tr35/#Collation_Elements, Section 5.14.5 for details.
   277  	e.before = t.before
   278  	if t.before {
   279  		t.before = false
   280  		if a.prev == nil {
   281  			a.insertBefore(e)
   282  		} else {
   283  			for a = a.prev; a.level > level; a = a.prev {
   284  			}
   285  			a.insertAfter(e)
   286  		}
   287  		e.level = level
   288  	} else {
   289  		for ; a.level > level; a = a.next {
   290  		}
   291  		e.level = a.level
   292  		if a != e {
   293  			a.insertAfter(e)
   294  			a.level = level
   295  		} else {
   296  			// We don't set a to prev itself. This has the effect of the entry
   297  			// getting new collation elements that are an increment of itself.
   298  			// This is intentional.
   299  			a.prev.level = level
   300  		}
   301  	}
   302  	e.extend = norm.NFD.String(extend)
   303  	e.exclude = false
   304  	e.modified = true
   305  	e.elems = nil
   306  	t.anchor = e
   307  	return nil
   308  }
   309  
   310  func (o *ordering) getWeight(e *entry) []rawCE {
   311  	if len(e.elems) == 0 && e.logical == noAnchor {
   312  		if e.implicit {
   313  			for _, r := range e.runes {
   314  				e.elems = append(e.elems, o.getWeight(o.find(string(r)))...)
   315  			}
   316  		} else if e.before {
   317  			count := [colltab.Identity + 1]int{}
   318  			a := e
   319  			for ; a.elems == nil && !a.implicit; a = a.next {
   320  				count[a.level]++
   321  			}
   322  			e.elems = []rawCE{makeRawCE(a.elems[0].w, a.elems[0].ccc)}
   323  			for i := colltab.Primary; i < colltab.Quaternary; i++ {
   324  				if count[i] != 0 {
   325  					e.elems[0].w[i] -= count[i]
   326  					break
   327  				}
   328  			}
   329  			if e.prev != nil {
   330  				o.verifyWeights(e.prev, e, e.prev.level)
   331  			}
   332  		} else {
   333  			prev := e.prev
   334  			e.elems = nextWeight(prev.level, o.getWeight(prev))
   335  			o.verifyWeights(e, e.next, e.level)
   336  		}
   337  	}
   338  	return e.elems
   339  }
   340  
   341  func (o *ordering) addExtension(e *entry) {
   342  	if ex := o.find(e.extend); ex != nil {
   343  		e.elems = append(e.elems, ex.elems...)
   344  	} else {
   345  		for _, r := range []rune(e.extend) {
   346  			e.elems = append(e.elems, o.find(string(r)).elems...)
   347  		}
   348  	}
   349  	e.extend = ""
   350  }
   351  
   352  func (o *ordering) verifyWeights(a, b *entry, level colltab.Level) error {
   353  	if level == colltab.Identity || b == nil || b.elems == nil || a.elems == nil {
   354  		return nil
   355  	}
   356  	for i := colltab.Primary; i < level; i++ {
   357  		if a.elems[0].w[i] < b.elems[0].w[i] {
   358  			return nil
   359  		}
   360  	}
   361  	if a.elems[0].w[level] >= b.elems[0].w[level] {
   362  		err := fmt.Errorf("%s:overflow: collation elements of %q (%X) overflows those of %q (%X) at level %d (%X >= %X)", o.id, a.str, a.runes, b.str, b.runes, level, a.elems, b.elems)
   363  		log.Println(err)
   364  		// TODO: return the error instead, or better, fix the conflicting entry by making room.
   365  	}
   366  	return nil
   367  }
   368  
   369  func (b *Builder) error(e error) {
   370  	if e != nil {
   371  		b.err = e
   372  	}
   373  }
   374  
   375  func (b *Builder) errorID(locale string, e error) {
   376  	if e != nil {
   377  		b.err = fmt.Errorf("%s:%v", locale, e)
   378  	}
   379  }
   380  
   381  // patchNorm ensures that NFC and NFD counterparts are consistent.
   382  func (o *ordering) patchNorm() {
   383  	// Insert the NFD counterparts, if necessary.
   384  	for _, e := range o.ordered {
   385  		nfd := norm.NFD.String(e.str)
   386  		if nfd != e.str {
   387  			if e0 := o.find(nfd); e0 != nil && !e0.modified {
   388  				e0.elems = e.elems
   389  			} else if e.modified && !equalCEArrays(o.genColElems(nfd), e.elems) {
   390  				e := o.newEntry(nfd, e.elems)
   391  				e.modified = true
   392  			}
   393  		}
   394  	}
   395  	// Update unchanged composed forms if one of their parts changed.
   396  	for _, e := range o.ordered {
   397  		nfd := norm.NFD.String(e.str)
   398  		if e.modified || nfd == e.str {
   399  			continue
   400  		}
   401  		if e0 := o.find(nfd); e0 != nil {
   402  			e.elems = e0.elems
   403  		} else {
   404  			e.elems = o.genColElems(nfd)
   405  			if norm.NFD.LastBoundary([]byte(nfd)) == 0 {
   406  				r := []rune(nfd)
   407  				head := string(r[0])
   408  				tail := ""
   409  				for i := 1; i < len(r); i++ {
   410  					s := norm.NFC.String(head + string(r[i]))
   411  					if e0 := o.find(s); e0 != nil && e0.modified {
   412  						head = s
   413  					} else {
   414  						tail += string(r[i])
   415  					}
   416  				}
   417  				e.elems = append(o.genColElems(head), o.genColElems(tail)...)
   418  			}
   419  		}
   420  	}
   421  	// Exclude entries for which the individual runes generate the same collation elements.
   422  	for _, e := range o.ordered {
   423  		if len(e.runes) > 1 && equalCEArrays(o.genColElems(e.str), e.elems) {
   424  			e.exclude = true
   425  		}
   426  	}
   427  }
   428  
   429  func (b *Builder) buildOrdering(o *ordering) {
   430  	for _, e := range o.ordered {
   431  		o.getWeight(e)
   432  	}
   433  	for _, e := range o.ordered {
   434  		o.addExtension(e)
   435  	}
   436  	o.patchNorm()
   437  	o.sort()
   438  	simplify(o)
   439  	b.processExpansions(o)   // requires simplify
   440  	b.processContractions(o) // requires simplify
   441  
   442  	t := newNode()
   443  	for e := o.front(); e != nil; e, _ = e.nextIndexed() {
   444  		if !e.skip() {
   445  			ce, err := e.encode()
   446  			b.errorID(o.id, err)
   447  			t.insert(e.runes[0], ce)
   448  		}
   449  	}
   450  	o.handle = b.index.addTrie(t)
   451  }
   452  
   453  func (b *Builder) build() (*table, error) {
   454  	if b.built {
   455  		return b.t, b.err
   456  	}
   457  	b.built = true
   458  	b.t = &table{
   459  		Table: colltab.Table{
   460  			MaxContractLen: utf8.UTFMax,
   461  			VariableTop:    uint32(b.varTop),
   462  		},
   463  	}
   464  
   465  	b.buildOrdering(&b.root)
   466  	b.t.root = b.root.handle
   467  	for _, t := range b.locale {
   468  		b.buildOrdering(t.index)
   469  		if b.err != nil {
   470  			break
   471  		}
   472  	}
   473  	i, err := b.index.generate()
   474  	b.t.trie = *i
   475  	b.t.Index = colltab.Trie{
   476  		Index:   i.index,
   477  		Values:  i.values,
   478  		Index0:  i.index[blockSize*b.t.root.lookupStart:],
   479  		Values0: i.values[blockSize*b.t.root.valueStart:],
   480  	}
   481  	b.error(err)
   482  	return b.t, b.err
   483  }
   484  
   485  // Build builds the root Collator.
   486  func (b *Builder) Build() (colltab.Weighter, error) {
   487  	table, err := b.build()
   488  	if err != nil {
   489  		return nil, err
   490  	}
   491  	return table, nil
   492  }
   493  
   494  // Build builds a Collator for Tailoring t.
   495  func (t *Tailoring) Build() (colltab.Weighter, error) {
   496  	// TODO: implement.
   497  	return nil, nil
   498  }
   499  
   500  // Print prints the tables for b and all its Tailorings as a Go file
   501  // that can be included in the Collate package.
   502  func (b *Builder) Print(w io.Writer) (n int, err error) {
   503  	p := func(nn int, e error) {
   504  		n += nn
   505  		if err == nil {
   506  			err = e
   507  		}
   508  	}
   509  	t, err := b.build()
   510  	if err != nil {
   511  		return 0, err
   512  	}
   513  	p(fmt.Fprintf(w, `var availableLocales = "und`))
   514  	for _, loc := range b.locale {
   515  		if loc.id != "und" {
   516  			p(fmt.Fprintf(w, ",%s", loc.id))
   517  		}
   518  	}
   519  	p(fmt.Fprint(w, "\"\n\n"))
   520  	p(fmt.Fprintf(w, "const varTop = 0x%x\n\n", b.varTop))
   521  	p(fmt.Fprintln(w, "var locales = [...]tableIndex{"))
   522  	for _, loc := range b.locale {
   523  		if loc.id == "und" {
   524  			p(t.fprintIndex(w, loc.index.handle, loc.id))
   525  		}
   526  	}
   527  	for _, loc := range b.locale {
   528  		if loc.id != "und" {
   529  			p(t.fprintIndex(w, loc.index.handle, loc.id))
   530  		}
   531  	}
   532  	p(fmt.Fprint(w, "}\n\n"))
   533  	n, _, err = t.fprint(w, "main")
   534  	return
   535  }
   536  
   537  // reproducibleFromNFKD checks whether the given expansion could be generated
   538  // from an NFKD expansion.
   539  func reproducibleFromNFKD(e *entry, exp, nfkd []rawCE) bool {
   540  	// Length must be equal.
   541  	if len(exp) != len(nfkd) {
   542  		return false
   543  	}
   544  	for i, ce := range exp {
   545  		// Primary and secondary values should be equal.
   546  		if ce.w[0] != nfkd[i].w[0] || ce.w[1] != nfkd[i].w[1] {
   547  			return false
   548  		}
   549  		// Tertiary values should be equal to maxTertiary for third element onwards.
   550  		// TODO: there seem to be a lot of cases in CLDR (e.g. ㏭ in zh.xml) that can
   551  		// simply be dropped.  Try this out by dropping the following code.
   552  		if i >= 2 && ce.w[2] != maxTertiary {
   553  			return false
   554  		}
   555  		if _, err := makeCE(ce); err != nil {
   556  			// Simply return false. The error will be caught elsewhere.
   557  			return false
   558  		}
   559  	}
   560  	return true
   561  }
   562  
   563  func simplify(o *ordering) {
   564  	// Runes that are a starter of a contraction should not be removed.
   565  	// (To date, there is only Kannada character 0CCA.)
   566  	keep := make(map[rune]bool)
   567  	for e := o.front(); e != nil; e, _ = e.nextIndexed() {
   568  		if len(e.runes) > 1 {
   569  			keep[e.runes[0]] = true
   570  		}
   571  	}
   572  	// Tag entries for which the runes NFKD decompose to identical values.
   573  	for e := o.front(); e != nil; e, _ = e.nextIndexed() {
   574  		s := e.str
   575  		nfkd := norm.NFKD.String(s)
   576  		nfd := norm.NFD.String(s)
   577  		if e.decompose || len(e.runes) > 1 || len(e.elems) == 1 || keep[e.runes[0]] || nfkd == nfd {
   578  			continue
   579  		}
   580  		if reproducibleFromNFKD(e, e.elems, o.genColElems(nfkd)) {
   581  			e.decompose = true
   582  		}
   583  	}
   584  }
   585  
   586  // appendExpansion converts the given collation sequence to
   587  // collation elements and adds them to the expansion table.
   588  // It returns an index to the expansion table.
   589  func (b *Builder) appendExpansion(e *entry) int {
   590  	t := b.t
   591  	i := len(t.ExpandElem)
   592  	ce := uint32(len(e.elems))
   593  	t.ExpandElem = append(t.ExpandElem, ce)
   594  	for _, w := range e.elems {
   595  		ce, err := makeCE(w)
   596  		if err != nil {
   597  			b.error(err)
   598  			return -1
   599  		}
   600  		t.ExpandElem = append(t.ExpandElem, ce)
   601  	}
   602  	return i
   603  }
   604  
   605  // processExpansions extracts data necessary to generate
   606  // the extraction tables.
   607  func (b *Builder) processExpansions(o *ordering) {
   608  	for e := o.front(); e != nil; e, _ = e.nextIndexed() {
   609  		if !e.expansion() {
   610  			continue
   611  		}
   612  		key := fmt.Sprintf("%v", e.elems)
   613  		i, ok := b.expIndex[key]
   614  		if !ok {
   615  			i = b.appendExpansion(e)
   616  			b.expIndex[key] = i
   617  		}
   618  		e.expansionIndex = i
   619  	}
   620  }
   621  
   622  func (b *Builder) processContractions(o *ordering) {
   623  	// Collate contractions per starter rune.
   624  	starters := []rune{}
   625  	cm := make(map[rune][]*entry)
   626  	for e := o.front(); e != nil; e, _ = e.nextIndexed() {
   627  		if e.contraction() {
   628  			if len(e.str) > b.t.MaxContractLen {
   629  				b.t.MaxContractLen = len(e.str)
   630  			}
   631  			r := e.runes[0]
   632  			if _, ok := cm[r]; !ok {
   633  				starters = append(starters, r)
   634  			}
   635  			cm[r] = append(cm[r], e)
   636  		}
   637  	}
   638  	// Add entries of single runes that are at a start of a contraction.
   639  	for e := o.front(); e != nil; e, _ = e.nextIndexed() {
   640  		if !e.contraction() {
   641  			r := e.runes[0]
   642  			if _, ok := cm[r]; ok {
   643  				cm[r] = append(cm[r], e)
   644  			}
   645  		}
   646  	}
   647  	// Build the tries for the contractions.
   648  	t := b.t
   649  	for _, r := range starters {
   650  		l := cm[r]
   651  		// Compute suffix strings. There are 31 different contraction suffix
   652  		// sets for 715 contractions and 82 contraction starter runes as of
   653  		// version 6.0.0.
   654  		sufx := []string{}
   655  		hasSingle := false
   656  		for _, e := range l {
   657  			if len(e.runes) > 1 {
   658  				sufx = append(sufx, string(e.runes[1:]))
   659  			} else {
   660  				hasSingle = true
   661  			}
   662  		}
   663  		if !hasSingle {
   664  			b.error(fmt.Errorf("no single entry for starter rune %U found", r))
   665  			continue
   666  		}
   667  		// Unique the suffix set.
   668  		sort.Strings(sufx)
   669  		key := strings.Join(sufx, "\n")
   670  		handle, ok := b.ctHandle[key]
   671  		if !ok {
   672  			var err error
   673  			handle, err = appendTrie(&t.ContractTries, sufx)
   674  			if err != nil {
   675  				b.error(err)
   676  			}
   677  			b.ctHandle[key] = handle
   678  		}
   679  		// Bucket sort entries in index order.
   680  		es := make([]*entry, len(l))
   681  		for _, e := range l {
   682  			var p, sn int
   683  			if len(e.runes) > 1 {
   684  				str := []byte(string(e.runes[1:]))
   685  				p, sn = lookup(&t.ContractTries, handle, str)
   686  				if sn != len(str) {
   687  					log.Fatalf("%s: processContractions: unexpected length for '%X'; len=%d; want %d", o.id, e.runes, sn, len(str))
   688  				}
   689  			}
   690  			if es[p] != nil {
   691  				log.Fatalf("%s: multiple contractions for position %d for rune %U", o.id, p, e.runes[0])
   692  			}
   693  			es[p] = e
   694  		}
   695  		// Create collation elements for contractions.
   696  		elems := []uint32{}
   697  		for _, e := range es {
   698  			ce, err := e.encodeBase()
   699  			b.errorID(o.id, err)
   700  			elems = append(elems, ce)
   701  		}
   702  		key = fmt.Sprintf("%v", elems)
   703  		i, ok := b.ctElem[key]
   704  		if !ok {
   705  			i = len(t.ContractElem)
   706  			b.ctElem[key] = i
   707  			t.ContractElem = append(t.ContractElem, elems...)
   708  		}
   709  		// Store info in entry for starter rune.
   710  		es[0].contractionIndex = i
   711  		es[0].contractionHandle = handle
   712  	}
   713  }
   714  

View as plain text