...

Source file src/golang.org/x/text/collate/build/order.go

Documentation: golang.org/x/text/collate/build

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package build
     6  
     7  import (
     8  	"fmt"
     9  	"log"
    10  	"sort"
    11  	"strings"
    12  	"unicode"
    13  
    14  	"golang.org/x/text/internal/colltab"
    15  	"golang.org/x/text/unicode/norm"
    16  )
    17  
    18  type logicalAnchor int
    19  
    20  const (
    21  	firstAnchor logicalAnchor = -1
    22  	noAnchor                  = 0
    23  	lastAnchor                = 1
    24  )
    25  
    26  // entry is used to keep track of a single entry in the collation element table
    27  // during building. Examples of entries can be found in the Default Unicode
    28  // Collation Element Table.
    29  // See https://www.unicode.org/Public/UCA/6.0.0/allkeys.txt.
    30  type entry struct {
    31  	str    string // same as string(runes)
    32  	runes  []rune
    33  	elems  []rawCE // the collation elements
    34  	extend string  // weights of extend to be appended to elems
    35  	before bool    // weights relative to next instead of previous.
    36  	lock   bool    // entry is used in extension and can no longer be moved.
    37  
    38  	// prev, next, and level are used to keep track of tailorings.
    39  	prev, next *entry
    40  	level      colltab.Level // next differs at this level
    41  	skipRemove bool          // do not unlink when removed
    42  
    43  	decompose bool // can use NFKD decomposition to generate elems
    44  	exclude   bool // do not include in table
    45  	implicit  bool // derived, is not included in the list
    46  	modified  bool // entry was modified in tailoring
    47  	logical   logicalAnchor
    48  
    49  	expansionIndex    int // used to store index into expansion table
    50  	contractionHandle ctHandle
    51  	contractionIndex  int // index into contraction elements
    52  }
    53  
    54  func (e *entry) String() string {
    55  	return fmt.Sprintf("%X (%q) -> %X (ch:%x; ci:%d, ei:%d)",
    56  		e.runes, e.str, e.elems, e.contractionHandle, e.contractionIndex, e.expansionIndex)
    57  }
    58  
    59  func (e *entry) skip() bool {
    60  	return e.contraction()
    61  }
    62  
    63  func (e *entry) expansion() bool {
    64  	return !e.decompose && len(e.elems) > 1
    65  }
    66  
    67  func (e *entry) contraction() bool {
    68  	return len(e.runes) > 1
    69  }
    70  
    71  func (e *entry) contractionStarter() bool {
    72  	return e.contractionHandle.n != 0
    73  }
    74  
    75  // nextIndexed gets the next entry that needs to be stored in the table.
    76  // It returns the entry and the collation level at which the next entry differs
    77  // from the current entry.
    78  // Entries that can be explicitly derived and logical reset positions are
    79  // examples of entries that will not be indexed.
    80  func (e *entry) nextIndexed() (*entry, colltab.Level) {
    81  	level := e.level
    82  	for e = e.next; e != nil && (e.exclude || len(e.elems) == 0); e = e.next {
    83  		if e.level < level {
    84  			level = e.level
    85  		}
    86  	}
    87  	return e, level
    88  }
    89  
    90  // remove unlinks entry e from the sorted chain and clears the collation
    91  // elements. e may not be at the front or end of the list. This should always
    92  // be the case, as the front and end of the list are always logical anchors,
    93  // which may not be removed.
    94  func (e *entry) remove() {
    95  	if e.logical != noAnchor {
    96  		log.Fatalf("may not remove anchor %q", e.str)
    97  	}
    98  	// TODO: need to set e.prev.level to e.level if e.level is smaller?
    99  	e.elems = nil
   100  	if !e.skipRemove {
   101  		if e.prev != nil {
   102  			e.prev.next = e.next
   103  		}
   104  		if e.next != nil {
   105  			e.next.prev = e.prev
   106  		}
   107  	}
   108  	e.skipRemove = false
   109  }
   110  
   111  // insertAfter inserts n after e.
   112  func (e *entry) insertAfter(n *entry) {
   113  	if e == n {
   114  		panic("e == anchor")
   115  	}
   116  	if e == nil {
   117  		panic("unexpected nil anchor")
   118  	}
   119  	n.remove()
   120  	n.decompose = false // redo decomposition test
   121  
   122  	n.next = e.next
   123  	n.prev = e
   124  	if e.next != nil {
   125  		e.next.prev = n
   126  	}
   127  	e.next = n
   128  }
   129  
   130  // insertBefore inserts n before e.
   131  func (e *entry) insertBefore(n *entry) {
   132  	if e == n {
   133  		panic("e == anchor")
   134  	}
   135  	if e == nil {
   136  		panic("unexpected nil anchor")
   137  	}
   138  	n.remove()
   139  	n.decompose = false // redo decomposition test
   140  
   141  	n.prev = e.prev
   142  	n.next = e
   143  	if e.prev != nil {
   144  		e.prev.next = n
   145  	}
   146  	e.prev = n
   147  }
   148  
   149  func (e *entry) encodeBase() (ce uint32, err error) {
   150  	switch {
   151  	case e.expansion():
   152  		ce, err = makeExpandIndex(e.expansionIndex)
   153  	default:
   154  		if e.decompose {
   155  			log.Fatal("decompose should be handled elsewhere")
   156  		}
   157  		ce, err = makeCE(e.elems[0])
   158  	}
   159  	return
   160  }
   161  
   162  func (e *entry) encode() (ce uint32, err error) {
   163  	if e.skip() {
   164  		log.Fatal("cannot build colElem for entry that should be skipped")
   165  	}
   166  	switch {
   167  	case e.decompose:
   168  		t1 := e.elems[0].w[2]
   169  		t2 := 0
   170  		if len(e.elems) > 1 {
   171  			t2 = e.elems[1].w[2]
   172  		}
   173  		ce, err = makeDecompose(t1, t2)
   174  	case e.contractionStarter():
   175  		ce, err = makeContractIndex(e.contractionHandle, e.contractionIndex)
   176  	default:
   177  		if len(e.runes) > 1 {
   178  			log.Fatal("colElem: contractions are handled in contraction trie")
   179  		}
   180  		ce, err = e.encodeBase()
   181  	}
   182  	return
   183  }
   184  
   185  // entryLess returns true if a sorts before b and false otherwise.
   186  func entryLess(a, b *entry) bool {
   187  	if res, _ := compareWeights(a.elems, b.elems); res != 0 {
   188  		return res == -1
   189  	}
   190  	if a.logical != noAnchor {
   191  		return a.logical == firstAnchor
   192  	}
   193  	if b.logical != noAnchor {
   194  		return b.logical == lastAnchor
   195  	}
   196  	return a.str < b.str
   197  }
   198  
   199  type sortedEntries []*entry
   200  
   201  func (s sortedEntries) Len() int {
   202  	return len(s)
   203  }
   204  
   205  func (s sortedEntries) Swap(i, j int) {
   206  	s[i], s[j] = s[j], s[i]
   207  }
   208  
   209  func (s sortedEntries) Less(i, j int) bool {
   210  	return entryLess(s[i], s[j])
   211  }
   212  
   213  type ordering struct {
   214  	id       string
   215  	entryMap map[string]*entry
   216  	ordered  []*entry
   217  	handle   *trieHandle
   218  }
   219  
   220  // insert inserts e into both entryMap and ordered.
   221  // Note that insert simply appends e to ordered.  To reattain a sorted
   222  // order, o.sort() should be called.
   223  func (o *ordering) insert(e *entry) {
   224  	if e.logical == noAnchor {
   225  		o.entryMap[e.str] = e
   226  	} else {
   227  		// Use key format as used in UCA rules.
   228  		o.entryMap[fmt.Sprintf("[%s]", e.str)] = e
   229  		// Also add index entry for XML format.
   230  		o.entryMap[fmt.Sprintf("<%s/>", strings.Replace(e.str, " ", "_", -1))] = e
   231  	}
   232  	o.ordered = append(o.ordered, e)
   233  }
   234  
   235  // newEntry creates a new entry for the given info and inserts it into
   236  // the index.
   237  func (o *ordering) newEntry(s string, ces []rawCE) *entry {
   238  	e := &entry{
   239  		runes: []rune(s),
   240  		elems: ces,
   241  		str:   s,
   242  	}
   243  	o.insert(e)
   244  	return e
   245  }
   246  
   247  // find looks up and returns the entry for the given string.
   248  // It returns nil if str is not in the index and if an implicit value
   249  // cannot be derived, that is, if str represents more than one rune.
   250  func (o *ordering) find(str string) *entry {
   251  	e := o.entryMap[str]
   252  	if e == nil {
   253  		r := []rune(str)
   254  		if len(r) == 1 {
   255  			const (
   256  				firstHangul = 0xAC00
   257  				lastHangul  = 0xD7A3
   258  			)
   259  			if r[0] >= firstHangul && r[0] <= lastHangul {
   260  				ce := []rawCE{}
   261  				nfd := norm.NFD.String(str)
   262  				for _, r := range nfd {
   263  					ce = append(ce, o.find(string(r)).elems...)
   264  				}
   265  				e = o.newEntry(nfd, ce)
   266  			} else {
   267  				e = o.newEntry(string(r[0]), []rawCE{
   268  					{w: []int{
   269  						implicitPrimary(r[0]),
   270  						defaultSecondary,
   271  						defaultTertiary,
   272  						int(r[0]),
   273  					},
   274  					},
   275  				})
   276  				e.modified = true
   277  			}
   278  			e.exclude = true // do not index implicits
   279  		}
   280  	}
   281  	return e
   282  }
   283  
   284  // makeRootOrdering returns a newly initialized ordering value and populates
   285  // it with a set of logical reset points that can be used as anchors.
   286  // The anchors first_tertiary_ignorable and __END__ will always sort at
   287  // the beginning and end, respectively. This means that prev and next are non-nil
   288  // for any indexed entry.
   289  func makeRootOrdering() ordering {
   290  	const max = unicode.MaxRune
   291  	o := ordering{
   292  		entryMap: make(map[string]*entry),
   293  	}
   294  	insert := func(typ logicalAnchor, s string, ce []int) {
   295  		e := &entry{
   296  			elems:   []rawCE{{w: ce}},
   297  			str:     s,
   298  			exclude: true,
   299  			logical: typ,
   300  		}
   301  		o.insert(e)
   302  	}
   303  	insert(firstAnchor, "first tertiary ignorable", []int{0, 0, 0, 0})
   304  	insert(lastAnchor, "last tertiary ignorable", []int{0, 0, 0, max})
   305  	insert(lastAnchor, "last primary ignorable", []int{0, defaultSecondary, defaultTertiary, max})
   306  	insert(lastAnchor, "last non ignorable", []int{maxPrimary, defaultSecondary, defaultTertiary, max})
   307  	insert(lastAnchor, "__END__", []int{1 << maxPrimaryBits, defaultSecondary, defaultTertiary, max})
   308  	return o
   309  }
   310  
   311  // patchForInsert eliminates entries from the list with more than one collation element.
   312  // The next and prev fields of the eliminated entries still point to appropriate
   313  // values in the newly created list.
   314  // It requires that sort has been called.
   315  func (o *ordering) patchForInsert() {
   316  	for i := 0; i < len(o.ordered)-1; {
   317  		e := o.ordered[i]
   318  		lev := e.level
   319  		n := e.next
   320  		for ; n != nil && len(n.elems) > 1; n = n.next {
   321  			if n.level < lev {
   322  				lev = n.level
   323  			}
   324  			n.skipRemove = true
   325  		}
   326  		for ; o.ordered[i] != n; i++ {
   327  			o.ordered[i].level = lev
   328  			o.ordered[i].next = n
   329  			o.ordered[i+1].prev = e
   330  		}
   331  	}
   332  }
   333  
   334  // clone copies all ordering of es into a new ordering value.
   335  func (o *ordering) clone() *ordering {
   336  	o.sort()
   337  	oo := ordering{
   338  		entryMap: make(map[string]*entry),
   339  	}
   340  	for _, e := range o.ordered {
   341  		ne := &entry{
   342  			runes:     e.runes,
   343  			elems:     e.elems,
   344  			str:       e.str,
   345  			decompose: e.decompose,
   346  			exclude:   e.exclude,
   347  			logical:   e.logical,
   348  		}
   349  		oo.insert(ne)
   350  	}
   351  	oo.sort() // link all ordering.
   352  	oo.patchForInsert()
   353  	return &oo
   354  }
   355  
   356  // front returns the first entry to be indexed.
   357  // It assumes that sort() has been called.
   358  func (o *ordering) front() *entry {
   359  	e := o.ordered[0]
   360  	if e.prev != nil {
   361  		log.Panicf("unexpected first entry: %v", e)
   362  	}
   363  	// The first entry is always a logical position, which should not be indexed.
   364  	e, _ = e.nextIndexed()
   365  	return e
   366  }
   367  
   368  // sort sorts all ordering based on their collation elements and initializes
   369  // the prev, next, and level fields accordingly.
   370  func (o *ordering) sort() {
   371  	sort.Sort(sortedEntries(o.ordered))
   372  	l := o.ordered
   373  	for i := 1; i < len(l); i++ {
   374  		k := i - 1
   375  		l[k].next = l[i]
   376  		_, l[k].level = compareWeights(l[k].elems, l[i].elems)
   377  		l[i].prev = l[k]
   378  	}
   379  }
   380  
   381  // genColElems generates a collation element array from the runes in str. This
   382  // assumes that all collation elements have already been added to the Builder.
   383  func (o *ordering) genColElems(str string) []rawCE {
   384  	elems := []rawCE{}
   385  	for _, r := range []rune(str) {
   386  		for _, ce := range o.find(string(r)).elems {
   387  			if ce.w[0] != 0 || ce.w[1] != 0 || ce.w[2] != 0 {
   388  				elems = append(elems, ce)
   389  			}
   390  		}
   391  	}
   392  	return elems
   393  }
   394  

View as plain text