...

Source file src/golang.org/x/text/collate/build/colelem.go

Documentation: golang.org/x/text/collate/build

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package build
     6  
     7  import (
     8  	"fmt"
     9  	"unicode"
    10  
    11  	"golang.org/x/text/internal/colltab"
    12  )
    13  
    14  const (
    15  	defaultSecondary = 0x20
    16  	defaultTertiary  = 0x2
    17  	maxTertiary      = 0x1F
    18  )
    19  
    20  type rawCE struct {
    21  	w   []int
    22  	ccc uint8
    23  }
    24  
    25  func makeRawCE(w []int, ccc uint8) rawCE {
    26  	ce := rawCE{w: make([]int, 4), ccc: ccc}
    27  	copy(ce.w, w)
    28  	return ce
    29  }
    30  
    31  // A collation element is represented as an uint32.
    32  // In the typical case, a rune maps to a single collation element. If a rune
    33  // can be the start of a contraction or expands into multiple collation elements,
    34  // then the collation element that is associated with a rune will have a special
    35  // form to represent such m to n mappings.  Such special collation elements
    36  // have a value >= 0x80000000.
    37  
    38  const (
    39  	maxPrimaryBits   = 21
    40  	maxSecondaryBits = 12
    41  	maxTertiaryBits  = 8
    42  )
    43  
    44  func makeCE(ce rawCE) (uint32, error) {
    45  	v, e := colltab.MakeElem(ce.w[0], ce.w[1], ce.w[2], ce.ccc)
    46  	return uint32(v), e
    47  }
    48  
    49  // For contractions, collation elements are of the form
    50  // 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
    51  //   - n* is the size of the first node in the contraction trie.
    52  //   - i* is the index of the first node in the contraction trie.
    53  //   - b* is the offset into the contraction collation element table.
    54  //
    55  // See contract.go for details on the contraction trie.
    56  const (
    57  	contractID            = 0xC0000000
    58  	maxNBits              = 4
    59  	maxTrieIndexBits      = 12
    60  	maxContractOffsetBits = 13
    61  )
    62  
    63  func makeContractIndex(h ctHandle, offset int) (uint32, error) {
    64  	if h.n >= 1<<maxNBits {
    65  		return 0, fmt.Errorf("size of contraction trie node too large: %d >= %d", h.n, 1<<maxNBits)
    66  	}
    67  	if h.index >= 1<<maxTrieIndexBits {
    68  		return 0, fmt.Errorf("size of contraction trie offset too large: %d >= %d", h.index, 1<<maxTrieIndexBits)
    69  	}
    70  	if offset >= 1<<maxContractOffsetBits {
    71  		return 0, fmt.Errorf("contraction offset out of bounds: %x >= %x", offset, 1<<maxContractOffsetBits)
    72  	}
    73  	ce := uint32(contractID)
    74  	ce += uint32(offset << (maxNBits + maxTrieIndexBits))
    75  	ce += uint32(h.index << maxNBits)
    76  	ce += uint32(h.n)
    77  	return ce, nil
    78  }
    79  
    80  // For expansions, collation elements are of the form
    81  // 11100000 00000000 bbbbbbbb bbbbbbbb,
    82  // where b* is the index into the expansion sequence table.
    83  const (
    84  	expandID           = 0xE0000000
    85  	maxExpandIndexBits = 16
    86  )
    87  
    88  func makeExpandIndex(index int) (uint32, error) {
    89  	if index >= 1<<maxExpandIndexBits {
    90  		return 0, fmt.Errorf("expansion index out of bounds: %x >= %x", index, 1<<maxExpandIndexBits)
    91  	}
    92  	return expandID + uint32(index), nil
    93  }
    94  
    95  // Each list of collation elements corresponding to an expansion starts with
    96  // a header indicating the length of the sequence.
    97  func makeExpansionHeader(n int) (uint32, error) {
    98  	return uint32(n), nil
    99  }
   100  
   101  // Some runes can be expanded using NFKD decomposition. Instead of storing the full
   102  // sequence of collation elements, we decompose the rune and lookup the collation
   103  // elements for each rune in the decomposition and modify the tertiary weights.
   104  // The collation element, in this case, is of the form
   105  // 11110000 00000000 wwwwwwww vvvvvvvv, where
   106  //   - v* is the replacement tertiary weight for the first rune,
   107  //   - w* is the replacement tertiary weight for the second rune.
   108  //
   109  // Tertiary weights of subsequent runes should be replaced with maxTertiary.
   110  // See https://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
   111  const (
   112  	decompID = 0xF0000000
   113  )
   114  
   115  func makeDecompose(t1, t2 int) (uint32, error) {
   116  	if t1 >= 256 || t1 < 0 {
   117  		return 0, fmt.Errorf("first tertiary weight out of bounds: %d >= 256", t1)
   118  	}
   119  	if t2 >= 256 || t2 < 0 {
   120  		return 0, fmt.Errorf("second tertiary weight out of bounds: %d >= 256", t2)
   121  	}
   122  	return uint32(t2<<8+t1) + decompID, nil
   123  }
   124  
   125  const (
   126  	// These constants were taken from https://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
   127  	minUnified       rune = 0x4E00
   128  	maxUnified            = 0x9FFF
   129  	minCompatibility      = 0xF900
   130  	maxCompatibility      = 0xFAFF
   131  	minRare               = 0x3400
   132  	maxRare               = 0x4DBF
   133  )
   134  const (
   135  	commonUnifiedOffset = 0x10000
   136  	rareUnifiedOffset   = 0x20000 // largest rune in common is U+FAFF
   137  	otherOffset         = 0x50000 // largest rune in rare is U+2FA1D
   138  	illegalOffset       = otherOffset + int(unicode.MaxRune)
   139  	maxPrimary          = illegalOffset + 1
   140  )
   141  
   142  // implicitPrimary returns the primary weight for the a rune
   143  // for which there is no entry for the rune in the collation table.
   144  // We take a different approach from the one specified in
   145  // https://unicode.org/reports/tr10/#Implicit_Weights,
   146  // but preserve the resulting relative ordering of the runes.
   147  func implicitPrimary(r rune) int {
   148  	if unicode.Is(unicode.Ideographic, r) {
   149  		if r >= minUnified && r <= maxUnified {
   150  			// The most common case for CJK.
   151  			return int(r) + commonUnifiedOffset
   152  		}
   153  		if r >= minCompatibility && r <= maxCompatibility {
   154  			// This will typically not hit. The DUCET explicitly specifies mappings
   155  			// for all characters that do not decompose.
   156  			return int(r) + commonUnifiedOffset
   157  		}
   158  		return int(r) + rareUnifiedOffset
   159  	}
   160  	return int(r) + otherOffset
   161  }
   162  
   163  // convertLargeWeights converts collation elements with large
   164  // primaries (either double primaries or for illegal runes)
   165  // to our own representation.
   166  // A CJK character C is represented in the DUCET as
   167  //
   168  //	[.FBxx.0020.0002.C][.BBBB.0000.0000.C]
   169  //
   170  // We will rewrite these characters to a single CE.
   171  // We assume the CJK values start at 0x8000.
   172  // See https://unicode.org/reports/tr10/#Implicit_Weights
   173  func convertLargeWeights(elems []rawCE) (res []rawCE, err error) {
   174  	const (
   175  		cjkPrimaryStart   = 0xFB40
   176  		rarePrimaryStart  = 0xFB80
   177  		otherPrimaryStart = 0xFBC0
   178  		illegalPrimary    = 0xFFFE
   179  		highBitsMask      = 0x3F
   180  		lowBitsMask       = 0x7FFF
   181  		lowBitsFlag       = 0x8000
   182  		shiftBits         = 15
   183  	)
   184  	for i := 0; i < len(elems); i++ {
   185  		ce := elems[i].w
   186  		p := ce[0]
   187  		if p < cjkPrimaryStart {
   188  			continue
   189  		}
   190  		if p > 0xFFFF {
   191  			return elems, fmt.Errorf("found primary weight %X; should be <= 0xFFFF", p)
   192  		}
   193  		if p >= illegalPrimary {
   194  			ce[0] = illegalOffset + p - illegalPrimary
   195  		} else {
   196  			if i+1 >= len(elems) {
   197  				return elems, fmt.Errorf("second part of double primary weight missing: %v", elems)
   198  			}
   199  			if elems[i+1].w[0]&lowBitsFlag == 0 {
   200  				return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems)
   201  			}
   202  			np := ((p & highBitsMask) << shiftBits) + elems[i+1].w[0]&lowBitsMask
   203  			switch {
   204  			case p < rarePrimaryStart:
   205  				np += commonUnifiedOffset
   206  			case p < otherPrimaryStart:
   207  				np += rareUnifiedOffset
   208  			default:
   209  				p += otherOffset
   210  			}
   211  			ce[0] = np
   212  			for j := i + 1; j+1 < len(elems); j++ {
   213  				elems[j] = elems[j+1]
   214  			}
   215  			elems = elems[:len(elems)-1]
   216  		}
   217  	}
   218  	return elems, nil
   219  }
   220  
   221  // nextWeight computes the first possible collation weights following elems
   222  // for the given level.
   223  func nextWeight(level colltab.Level, elems []rawCE) []rawCE {
   224  	if level == colltab.Identity {
   225  		next := make([]rawCE, len(elems))
   226  		copy(next, elems)
   227  		return next
   228  	}
   229  	next := []rawCE{makeRawCE(elems[0].w, elems[0].ccc)}
   230  	next[0].w[level]++
   231  	if level < colltab.Secondary {
   232  		next[0].w[colltab.Secondary] = defaultSecondary
   233  	}
   234  	if level < colltab.Tertiary {
   235  		next[0].w[colltab.Tertiary] = defaultTertiary
   236  	}
   237  	// Filter entries that cannot influence ordering.
   238  	for _, ce := range elems[1:] {
   239  		skip := true
   240  		for i := colltab.Primary; i < level; i++ {
   241  			skip = skip && ce.w[i] == 0
   242  		}
   243  		if !skip {
   244  			next = append(next, ce)
   245  		}
   246  	}
   247  	return next
   248  }
   249  
   250  func nextVal(elems []rawCE, i int, level colltab.Level) (index, value int) {
   251  	for ; i < len(elems) && elems[i].w[level] == 0; i++ {
   252  	}
   253  	if i < len(elems) {
   254  		return i, elems[i].w[level]
   255  	}
   256  	return i, 0
   257  }
   258  
   259  // compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise.
   260  // It also returns the collation level at which the difference is found.
   261  func compareWeights(a, b []rawCE) (result int, level colltab.Level) {
   262  	for level := colltab.Primary; level < colltab.Identity; level++ {
   263  		var va, vb int
   264  		for ia, ib := 0, 0; ia < len(a) || ib < len(b); ia, ib = ia+1, ib+1 {
   265  			ia, va = nextVal(a, ia, level)
   266  			ib, vb = nextVal(b, ib, level)
   267  			if va != vb {
   268  				if va < vb {
   269  					return -1, level
   270  				} else {
   271  					return 1, level
   272  				}
   273  			}
   274  		}
   275  	}
   276  	return 0, colltab.Identity
   277  }
   278  
   279  func equalCE(a, b rawCE) bool {
   280  	for i := 0; i < 3; i++ {
   281  		if b.w[i] != a.w[i] {
   282  			return false
   283  		}
   284  	}
   285  	return true
   286  }
   287  
   288  func equalCEArrays(a, b []rawCE) bool {
   289  	if len(a) != len(b) {
   290  		return false
   291  	}
   292  	for i := range a {
   293  		if !equalCE(a[i], b[i]) {
   294  			return false
   295  		}
   296  	}
   297  	return true
   298  }
   299  

View as plain text