...

Source file src/golang.org/x/text/collate/collate.go

Documentation: golang.org/x/text/collate

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // TODO: remove hard-coded versions when we have implemented fractional weights.
     6  // The current implementation is incompatible with later CLDR versions.
     7  //go:generate go run maketables.go -cldr=23 -unicode=6.2.0
     8  
     9  // Package collate contains types for comparing and sorting Unicode strings
    10  // according to a given collation order.
    11  package collate // import "golang.org/x/text/collate"
    12  
    13  import (
    14  	"bytes"
    15  	"strings"
    16  
    17  	"golang.org/x/text/internal/colltab"
    18  	"golang.org/x/text/language"
    19  )
    20  
    21  // Collator provides functionality for comparing strings for a given
    22  // collation order.
    23  type Collator struct {
    24  	options
    25  
    26  	sorter sorter
    27  
    28  	_iter [2]iter
    29  }
    30  
    31  func (c *Collator) iter(i int) *iter {
    32  	// TODO: evaluate performance for making the second iterator optional.
    33  	return &c._iter[i]
    34  }
    35  
    36  // Supported returns the list of languages for which collating differs from its parent.
    37  func Supported() []language.Tag {
    38  	// TODO: use language.Coverage instead.
    39  
    40  	t := make([]language.Tag, len(tags))
    41  	copy(t, tags)
    42  	return t
    43  }
    44  
    45  func init() {
    46  	ids := strings.Split(availableLocales, ",")
    47  	tags = make([]language.Tag, len(ids))
    48  	for i, s := range ids {
    49  		tags[i] = language.Raw.MustParse(s)
    50  	}
    51  }
    52  
    53  var tags []language.Tag
    54  
    55  // New returns a new Collator initialized for the given locale.
    56  func New(t language.Tag, o ...Option) *Collator {
    57  	index := colltab.MatchLang(t, tags)
    58  	c := newCollator(getTable(locales[index]))
    59  
    60  	// Set options from the user-supplied tag.
    61  	c.setFromTag(t)
    62  
    63  	// Set the user-supplied options.
    64  	c.setOptions(o)
    65  
    66  	c.init()
    67  	return c
    68  }
    69  
    70  // NewFromTable returns a new Collator for the given Weighter.
    71  func NewFromTable(w colltab.Weighter, o ...Option) *Collator {
    72  	c := newCollator(w)
    73  	c.setOptions(o)
    74  	c.init()
    75  	return c
    76  }
    77  
    78  func (c *Collator) init() {
    79  	if c.numeric {
    80  		c.t = colltab.NewNumericWeighter(c.t)
    81  	}
    82  	c._iter[0].init(c)
    83  	c._iter[1].init(c)
    84  }
    85  
    86  // Buffer holds keys generated by Key and KeyString.
    87  type Buffer struct {
    88  	buf [4096]byte
    89  	key []byte
    90  }
    91  
    92  func (b *Buffer) init() {
    93  	if b.key == nil {
    94  		b.key = b.buf[:0]
    95  	}
    96  }
    97  
    98  // Reset clears the buffer from previous results generated by Key and KeyString.
    99  func (b *Buffer) Reset() {
   100  	b.key = b.key[:0]
   101  }
   102  
   103  // Compare returns an integer comparing the two byte slices.
   104  // The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
   105  func (c *Collator) Compare(a, b []byte) int {
   106  	// TODO: skip identical prefixes once we have a fast way to detect if a rune is
   107  	// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
   108  	c.iter(0).SetInput(a)
   109  	c.iter(1).SetInput(b)
   110  	if res := c.compare(); res != 0 {
   111  		return res
   112  	}
   113  	if !c.ignore[colltab.Identity] {
   114  		return bytes.Compare(a, b)
   115  	}
   116  	return 0
   117  }
   118  
   119  // CompareString returns an integer comparing the two strings.
   120  // The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
   121  func (c *Collator) CompareString(a, b string) int {
   122  	// TODO: skip identical prefixes once we have a fast way to detect if a rune is
   123  	// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
   124  	c.iter(0).SetInputString(a)
   125  	c.iter(1).SetInputString(b)
   126  	if res := c.compare(); res != 0 {
   127  		return res
   128  	}
   129  	if !c.ignore[colltab.Identity] {
   130  		if a < b {
   131  			return -1
   132  		} else if a > b {
   133  			return 1
   134  		}
   135  	}
   136  	return 0
   137  }
   138  
   139  func compareLevel(f func(i *iter) int, a, b *iter) int {
   140  	a.pce = 0
   141  	b.pce = 0
   142  	for {
   143  		va := f(a)
   144  		vb := f(b)
   145  		if va != vb {
   146  			if va < vb {
   147  				return -1
   148  			}
   149  			return 1
   150  		} else if va == 0 {
   151  			break
   152  		}
   153  	}
   154  	return 0
   155  }
   156  
   157  func (c *Collator) compare() int {
   158  	ia, ib := c.iter(0), c.iter(1)
   159  	// Process primary level
   160  	if c.alternate != altShifted {
   161  		// TODO: implement script reordering
   162  		if res := compareLevel((*iter).nextPrimary, ia, ib); res != 0 {
   163  			return res
   164  		}
   165  	} else {
   166  		// TODO: handle shifted
   167  	}
   168  	if !c.ignore[colltab.Secondary] {
   169  		f := (*iter).nextSecondary
   170  		if c.backwards {
   171  			f = (*iter).prevSecondary
   172  		}
   173  		if res := compareLevel(f, ia, ib); res != 0 {
   174  			return res
   175  		}
   176  	}
   177  	// TODO: special case handling (Danish?)
   178  	if !c.ignore[colltab.Tertiary] || c.caseLevel {
   179  		if res := compareLevel((*iter).nextTertiary, ia, ib); res != 0 {
   180  			return res
   181  		}
   182  		if !c.ignore[colltab.Quaternary] {
   183  			if res := compareLevel((*iter).nextQuaternary, ia, ib); res != 0 {
   184  				return res
   185  			}
   186  		}
   187  	}
   188  	return 0
   189  }
   190  
   191  // Key returns the collation key for str.
   192  // Passing the buffer buf may avoid memory allocations.
   193  // The returned slice will point to an allocation in Buffer and will remain
   194  // valid until the next call to buf.Reset().
   195  func (c *Collator) Key(buf *Buffer, str []byte) []byte {
   196  	// See https://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
   197  	buf.init()
   198  	return c.key(buf, c.getColElems(str))
   199  }
   200  
   201  // KeyFromString returns the collation key for str.
   202  // Passing the buffer buf may avoid memory allocations.
   203  // The returned slice will point to an allocation in Buffer and will retain
   204  // valid until the next call to buf.ResetKeys().
   205  func (c *Collator) KeyFromString(buf *Buffer, str string) []byte {
   206  	// See https://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
   207  	buf.init()
   208  	return c.key(buf, c.getColElemsString(str))
   209  }
   210  
   211  func (c *Collator) key(buf *Buffer, w []colltab.Elem) []byte {
   212  	processWeights(c.alternate, c.t.Top(), w)
   213  	kn := len(buf.key)
   214  	c.keyFromElems(buf, w)
   215  	return buf.key[kn:]
   216  }
   217  
   218  func (c *Collator) getColElems(str []byte) []colltab.Elem {
   219  	i := c.iter(0)
   220  	i.SetInput(str)
   221  	for i.Next() {
   222  	}
   223  	return i.Elems
   224  }
   225  
   226  func (c *Collator) getColElemsString(str string) []colltab.Elem {
   227  	i := c.iter(0)
   228  	i.SetInputString(str)
   229  	for i.Next() {
   230  	}
   231  	return i.Elems
   232  }
   233  
   234  type iter struct {
   235  	wa [512]colltab.Elem
   236  
   237  	colltab.Iter
   238  	pce int
   239  }
   240  
   241  func (i *iter) init(c *Collator) {
   242  	i.Weighter = c.t
   243  	i.Elems = i.wa[:0]
   244  }
   245  
   246  func (i *iter) nextPrimary() int {
   247  	for {
   248  		for ; i.pce < i.N; i.pce++ {
   249  			if v := i.Elems[i.pce].Primary(); v != 0 {
   250  				i.pce++
   251  				return v
   252  			}
   253  		}
   254  		if !i.Next() {
   255  			return 0
   256  		}
   257  	}
   258  	panic("should not reach here")
   259  }
   260  
   261  func (i *iter) nextSecondary() int {
   262  	for ; i.pce < len(i.Elems); i.pce++ {
   263  		if v := i.Elems[i.pce].Secondary(); v != 0 {
   264  			i.pce++
   265  			return v
   266  		}
   267  	}
   268  	return 0
   269  }
   270  
   271  func (i *iter) prevSecondary() int {
   272  	for ; i.pce < len(i.Elems); i.pce++ {
   273  		if v := i.Elems[len(i.Elems)-i.pce-1].Secondary(); v != 0 {
   274  			i.pce++
   275  			return v
   276  		}
   277  	}
   278  	return 0
   279  }
   280  
   281  func (i *iter) nextTertiary() int {
   282  	for ; i.pce < len(i.Elems); i.pce++ {
   283  		if v := i.Elems[i.pce].Tertiary(); v != 0 {
   284  			i.pce++
   285  			return int(v)
   286  		}
   287  	}
   288  	return 0
   289  }
   290  
   291  func (i *iter) nextQuaternary() int {
   292  	for ; i.pce < len(i.Elems); i.pce++ {
   293  		if v := i.Elems[i.pce].Quaternary(); v != 0 {
   294  			i.pce++
   295  			return v
   296  		}
   297  	}
   298  	return 0
   299  }
   300  
   301  func appendPrimary(key []byte, p int) []byte {
   302  	// Convert to variable length encoding; supports up to 23 bits.
   303  	if p <= 0x7FFF {
   304  		key = append(key, uint8(p>>8), uint8(p))
   305  	} else {
   306  		key = append(key, uint8(p>>16)|0x80, uint8(p>>8), uint8(p))
   307  	}
   308  	return key
   309  }
   310  
   311  // keyFromElems converts the weights ws to a compact sequence of bytes.
   312  // The result will be appended to the byte buffer in buf.
   313  func (c *Collator) keyFromElems(buf *Buffer, ws []colltab.Elem) {
   314  	for _, v := range ws {
   315  		if w := v.Primary(); w > 0 {
   316  			buf.key = appendPrimary(buf.key, w)
   317  		}
   318  	}
   319  	if !c.ignore[colltab.Secondary] {
   320  		buf.key = append(buf.key, 0, 0)
   321  		// TODO: we can use one 0 if we can guarantee that all non-zero weights are > 0xFF.
   322  		if !c.backwards {
   323  			for _, v := range ws {
   324  				if w := v.Secondary(); w > 0 {
   325  					buf.key = append(buf.key, uint8(w>>8), uint8(w))
   326  				}
   327  			}
   328  		} else {
   329  			for i := len(ws) - 1; i >= 0; i-- {
   330  				if w := ws[i].Secondary(); w > 0 {
   331  					buf.key = append(buf.key, uint8(w>>8), uint8(w))
   332  				}
   333  			}
   334  		}
   335  	} else if c.caseLevel {
   336  		buf.key = append(buf.key, 0, 0)
   337  	}
   338  	if !c.ignore[colltab.Tertiary] || c.caseLevel {
   339  		buf.key = append(buf.key, 0, 0)
   340  		for _, v := range ws {
   341  			if w := v.Tertiary(); w > 0 {
   342  				buf.key = append(buf.key, uint8(w))
   343  			}
   344  		}
   345  		// Derive the quaternary weights from the options and other levels.
   346  		// Note that we represent MaxQuaternary as 0xFF. The first byte of the
   347  		// representation of a primary weight is always smaller than 0xFF,
   348  		// so using this single byte value will compare correctly.
   349  		if !c.ignore[colltab.Quaternary] && c.alternate >= altShifted {
   350  			if c.alternate == altShiftTrimmed {
   351  				lastNonFFFF := len(buf.key)
   352  				buf.key = append(buf.key, 0)
   353  				for _, v := range ws {
   354  					if w := v.Quaternary(); w == colltab.MaxQuaternary {
   355  						buf.key = append(buf.key, 0xFF)
   356  					} else if w > 0 {
   357  						buf.key = appendPrimary(buf.key, w)
   358  						lastNonFFFF = len(buf.key)
   359  					}
   360  				}
   361  				buf.key = buf.key[:lastNonFFFF]
   362  			} else {
   363  				buf.key = append(buf.key, 0)
   364  				for _, v := range ws {
   365  					if w := v.Quaternary(); w == colltab.MaxQuaternary {
   366  						buf.key = append(buf.key, 0xFF)
   367  					} else if w > 0 {
   368  						buf.key = appendPrimary(buf.key, w)
   369  					}
   370  				}
   371  			}
   372  		}
   373  	}
   374  }
   375  
   376  func processWeights(vw alternateHandling, top uint32, wa []colltab.Elem) {
   377  	ignore := false
   378  	vtop := int(top)
   379  	switch vw {
   380  	case altShifted, altShiftTrimmed:
   381  		for i := range wa {
   382  			if p := wa[i].Primary(); p <= vtop && p != 0 {
   383  				wa[i] = colltab.MakeQuaternary(p)
   384  				ignore = true
   385  			} else if p == 0 {
   386  				if ignore {
   387  					wa[i] = colltab.Ignore
   388  				}
   389  			} else {
   390  				ignore = false
   391  			}
   392  		}
   393  	case altBlanked:
   394  		for i := range wa {
   395  			if p := wa[i].Primary(); p <= vtop && (ignore || p != 0) {
   396  				wa[i] = colltab.Ignore
   397  				ignore = true
   398  			} else {
   399  				ignore = false
   400  			}
   401  		}
   402  	}
   403  }
   404  

View as plain text