...

Source file src/golang.org/x/text/internal/colltab/collelem.go

Documentation: golang.org/x/text/internal/colltab

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package colltab
     6  
     7  import (
     8  	"fmt"
     9  	"unicode"
    10  )
    11  
// Level identifies the collation comparison level.
// The primary level corresponds to the basic sorting of text.
// The secondary level corresponds to accents and related linguistic elements.
// The tertiary level corresponds to casing and related concepts.
// The quaternary level is derived from the other levels by the
// various algorithms for handling variable elements.
type Level int

// The levels are numbered consecutively starting at 0 for Primary.
const (
	// Primary is the first level; its value is 0.
	Primary Level = iota
	// Secondary is the second level.
	Secondary
	// Tertiary is the third level.
	Tertiary
	// Quaternary is the fourth level.
	Quaternary
	// Identity follows Quaternary. Elem.Weight has no case for it and
	// therefore reports a weight of 0 at this level.
	Identity

	// NumLevels is one past Identity, i.e. the count of levels declared above.
	NumLevels
)
    29  
const (
	// defaultSecondary is the secondary weight used when an element does not
	// store one explicitly (see makeImplicitCE and Elem.Secondary).
	defaultSecondary = 0x20
	// defaultTertiary is the tertiary weight reported by Elem.Tertiary for
	// elements of the 01-prefixed form, which have no tertiary field.
	defaultTertiary = 0x2
	// maxTertiary is the largest value of a 5-bit tertiary field; it is also
	// used as the mask that clears that field in updateTertiary.
	maxTertiary = 0x1F
	// MaxQuaternary is what Elem.Quaternary returns for elements that neither
	// carry an explicit quaternary value nor equal Ignore.
	MaxQuaternary = 0x1FFFFF // 21 bits.
)
    36  
// Elem is a representation of a collation element. This API provides ways to encode
// and decode Elems. Implementations of collation tables may use values greater
// than or equal to PrivateUse for their own purposes. However, these should never be
// returned by AppendNext.
type Elem uint32
    42  
// Value ranges of Elem. ctype uses the max* bounds to classify an element.
const (
	// maxCE is the largest value that ctype classifies as ceNormal.
	maxCE Elem = 0xAFFFFFFF
	// PrivateUse is the first value that table implementations may use for
	// their own purposes; it equals minContract.
	PrivateUse = minContract
	// minContract and maxContract delimit the contraction-index encoding.
	// ctype reports ceContractionIndex for anything above maxCE up to
	// maxContract.
	minContract = 0xC0000000
	maxContract = 0xDFFFFFFF
	// minExpand and maxExpand delimit the expansion-index encoding.
	minExpand = 0xE0000000
	maxExpand = 0xEFFFFFFF
	// minDecomp is the first value of the decomposition encoding; ctype
	// reports ceDecompose for anything above maxExpand.
	minDecomp = 0xF0000000
)
    52  
// ceType is the coarse classification of an Elem as returned by Elem.ctype.
type ceType int

const (
	ceNormal           ceType = iota // ceNormal includes implicits (ce == 0)
	ceContractionIndex               // rune can be a start of a contraction
	ceExpansionIndex                 // rune expands into a sequence of collation elements
	ceDecompose                      // rune expands using NFKD decomposition
)
    61  
    62  func (ce Elem) ctype() ceType {
    63  	if ce <= maxCE {
    64  		return ceNormal
    65  	}
    66  	if ce <= maxContract {
    67  		return ceContractionIndex
    68  	} else {
    69  		if ce <= maxExpand {
    70  			return ceExpansionIndex
    71  		}
    72  		return ceDecompose
    73  	}
    74  	panic("should not reach here")
    75  	return ceType(-1)
    76  }
    77  
// For normal collation elements, we assume that a collation element either has
// a primary or non-default secondary value, not both.
// Collation elements with a primary value are of the form
//
//	01pppppp pppppppp ppppppp0 ssssssss
//	  - p* is primary collation value
//	  - s* is the secondary collation value
//	00pppppp pppppppp ppppppps sssttttt, where
//	  - p* is primary collation value
//	  - s* offset of secondary from default value.
//	  - t* is the tertiary collation value
//	100ttttt cccccccc pppppppp pppppppp
//	  - t* is the tertiary collation value
//	  - c* is the canonical combining class
//	  - p* is the primary collation value
//
// Collation elements with a secondary value are of the form
//
//	1010cccc ccccssss ssssssss tttttttt, where
//	  - c* is the canonical combining class
//	  - s* is the secondary collation value
//	  - t* is the tertiary collation value
//	11qqqqqq qqqqqqqq qqqqqqq0 00000000
//	  - q* quaternary value
const (
	// ceTypeMask selects the two leading bits and ceTypeMaskExt the three
	// leading bits that distinguish the layouts above.
	ceTypeMask              = 0xC0000000
	ceTypeMaskExt           = 0xE0000000
	// ceIgnoreMask keeps the four leading bits and the low 20 bits, i.e. it
	// drops the c* bits of the 1010-prefixed form. Quaternary applies it
	// before comparing an element with Ignore.
	ceIgnoreMask            = 0xF00FFFFF
	// Leading-bit tags: ceType1 is the 01 prefix, ceType2 the 00 prefix,
	// ceType3or4 the 10 prefix, ceType4 the 101 prefix and ceTypeQ the 11
	// prefix.
	ceType1                 = 0x40000000
	ceType2                 = 0x00000000
	ceType3or4              = 0x80000000
	ceType4                 = 0xA0000000
	ceTypeQ                 = 0xC0000000
	// Ignore is the 101-prefixed element with all other bits zero.
	Ignore                  = ceType4
	// Primary reads the primary weight from the shifted p* field for elements
	// below firstNonPrimary, from the low 16 bits for elements from
	// firstNonPrimary up to and including lastSpecialPrimary, and reports 0
	// for anything above lastSpecialPrimary.
	firstNonPrimary         = 0x80000000
	lastSpecialPrimary      = 0xA0000000
	// secondaryMask is the leading bit. It is not referenced by the code
	// visible in this file.
	secondaryMask           = 0x80000000
	// hasTertiaryMask is the second leading bit. Tertiary treats elements
	// that have this bit clear as carrying an explicit tertiary field.
	hasTertiaryMask         = 0x40000000
	// primaryValueMask selects the 21 p* bits of the 01- and 00-prefixed
	// forms; they start at bit primaryShift.
	primaryValueMask        = 0x3FFFFE00
	// Field widths, in bits, used by MakeElem to validate its arguments and
	// to position the fields.
	maxPrimaryBits          = 21
	compactPrimaryBits      = 16
	maxSecondaryBits        = 12
	maxTertiaryBits         = 8
	maxCCCBits              = 8
	maxSecondaryCompactBits = 8
	maxSecondaryDiffBits    = 4
	maxTertiaryCompactBits  = 5
	// primaryShift is the bit position of the p* (and q*) field.
	primaryShift            = 9
	// compactSecondaryShift is the bit position of the 4-bit secondary
	// offset in the 00-prefixed form; Secondary adds minCompactSecondary to
	// that offset.
	compactSecondaryShift   = 5
	minCompactSecondary     = defaultSecondary - 4
)
   129  
   130  func makeImplicitCE(primary int) Elem {
   131  	return ceType1 | Elem(primary<<primaryShift) | defaultSecondary
   132  }
   133  
   134  // MakeElem returns an Elem for the given values.  It will return an error
   135  // if the given combination of values is invalid.
   136  func MakeElem(primary, secondary, tertiary int, ccc uint8) (Elem, error) {
   137  	if w := primary; w >= 1<<maxPrimaryBits || w < 0 {
   138  		return 0, fmt.Errorf("makeCE: primary weight out of bounds: %x >= %x", w, 1<<maxPrimaryBits)
   139  	}
   140  	if w := secondary; w >= 1<<maxSecondaryBits || w < 0 {
   141  		return 0, fmt.Errorf("makeCE: secondary weight out of bounds: %x >= %x", w, 1<<maxSecondaryBits)
   142  	}
   143  	if w := tertiary; w >= 1<<maxTertiaryBits || w < 0 {
   144  		return 0, fmt.Errorf("makeCE: tertiary weight out of bounds: %x >= %x", w, 1<<maxTertiaryBits)
   145  	}
   146  	ce := Elem(0)
   147  	if primary != 0 {
   148  		if ccc != 0 {
   149  			if primary >= 1<<compactPrimaryBits {
   150  				return 0, fmt.Errorf("makeCE: primary weight with non-zero CCC out of bounds: %x >= %x", primary, 1<<compactPrimaryBits)
   151  			}
   152  			if secondary != defaultSecondary {
   153  				return 0, fmt.Errorf("makeCE: cannot combine non-default secondary value (%x) with non-zero CCC (%x)", secondary, ccc)
   154  			}
   155  			ce = Elem(tertiary << (compactPrimaryBits + maxCCCBits))
   156  			ce |= Elem(ccc) << compactPrimaryBits
   157  			ce |= Elem(primary)
   158  			ce |= ceType3or4
   159  		} else if tertiary == defaultTertiary {
   160  			if secondary >= 1<<maxSecondaryCompactBits {
   161  				return 0, fmt.Errorf("makeCE: secondary weight with non-zero primary out of bounds: %x >= %x", secondary, 1<<maxSecondaryCompactBits)
   162  			}
   163  			ce = Elem(primary<<(maxSecondaryCompactBits+1) + secondary)
   164  			ce |= ceType1
   165  		} else {
   166  			d := secondary - defaultSecondary + maxSecondaryDiffBits
   167  			if d >= 1<<maxSecondaryDiffBits || d < 0 {
   168  				return 0, fmt.Errorf("makeCE: secondary weight diff out of bounds: %x < 0 || %x > %x", d, d, 1<<maxSecondaryDiffBits)
   169  			}
   170  			if tertiary >= 1<<maxTertiaryCompactBits {
   171  				return 0, fmt.Errorf("makeCE: tertiary weight with non-zero primary out of bounds: %x > %x", tertiary, 1<<maxTertiaryCompactBits)
   172  			}
   173  			ce = Elem(primary<<maxSecondaryDiffBits + d)
   174  			ce = ce<<maxTertiaryCompactBits + Elem(tertiary)
   175  		}
   176  	} else {
   177  		ce = Elem(secondary<<maxTertiaryBits + tertiary)
   178  		ce += Elem(ccc) << (maxSecondaryBits + maxTertiaryBits)
   179  		ce |= ceType4
   180  	}
   181  	return ce, nil
   182  }
   183  
   184  // MakeQuaternary returns an Elem with the given quaternary value.
   185  func MakeQuaternary(v int) Elem {
   186  	return ceTypeQ | Elem(v<<primaryShift)
   187  }
   188  
// Mask sets weights for any level smaller than l to 0.
// The resulting Elem can be used to test for equality with
// other Elems to which the same mask has been applied.
//
// NOTE(review): the body below is a placeholder. It ignores both ce and l and
// always returns 0, so the masking described above is not performed.
func (ce Elem) Mask(l Level) uint32 {
	return 0
}
   195  
   196  // CCC returns the canonical combining class associated with the underlying character,
   197  // if applicable, or 0 otherwise.
   198  func (ce Elem) CCC() uint8 {
   199  	if ce&ceType3or4 != 0 {
   200  		if ce&ceType4 == ceType3or4 {
   201  			return uint8(ce >> 16)
   202  		}
   203  		return uint8(ce >> 20)
   204  	}
   205  	return 0
   206  }
   207  
   208  // Primary returns the primary collation weight for ce.
   209  func (ce Elem) Primary() int {
   210  	if ce >= firstNonPrimary {
   211  		if ce > lastSpecialPrimary {
   212  			return 0
   213  		}
   214  		return int(uint16(ce))
   215  	}
   216  	return int(ce&primaryValueMask) >> primaryShift
   217  }
   218  
   219  // Secondary returns the secondary collation weight for ce.
   220  func (ce Elem) Secondary() int {
   221  	switch ce & ceTypeMask {
   222  	case ceType1:
   223  		return int(uint8(ce))
   224  	case ceType2:
   225  		return minCompactSecondary + int((ce>>compactSecondaryShift)&0xF)
   226  	case ceType3or4:
   227  		if ce < ceType4 {
   228  			return defaultSecondary
   229  		}
   230  		return int(ce>>8) & 0xFFF
   231  	case ceTypeQ:
   232  		return 0
   233  	}
   234  	panic("should not reach here")
   235  }
   236  
   237  // Tertiary returns the tertiary collation weight for ce.
   238  func (ce Elem) Tertiary() uint8 {
   239  	if ce&hasTertiaryMask == 0 {
   240  		if ce&ceType3or4 == 0 {
   241  			return uint8(ce & 0x1F)
   242  		}
   243  		if ce&ceType4 == ceType4 {
   244  			return uint8(ce)
   245  		}
   246  		return uint8(ce>>24) & 0x1F // type 2
   247  	} else if ce&ceTypeMask == ceType1 {
   248  		return defaultTertiary
   249  	}
   250  	// ce is a quaternary value.
   251  	return 0
   252  }
   253  
   254  func (ce Elem) updateTertiary(t uint8) Elem {
   255  	if ce&ceTypeMask == ceType1 {
   256  		// convert to type 4
   257  		nce := ce & primaryValueMask
   258  		nce |= Elem(uint8(ce)-minCompactSecondary) << compactSecondaryShift
   259  		ce = nce
   260  	} else if ce&ceTypeMaskExt == ceType3or4 {
   261  		ce &= ^Elem(maxTertiary << 24)
   262  		return ce | (Elem(t) << 24)
   263  	} else {
   264  		// type 2 or 4
   265  		ce &= ^Elem(maxTertiary)
   266  	}
   267  	return ce | Elem(t)
   268  }
   269  
   270  // Quaternary returns the quaternary value if explicitly specified,
   271  // 0 if ce == Ignore, or MaxQuaternary otherwise.
   272  // Quaternary values are used only for shifted variants.
   273  func (ce Elem) Quaternary() int {
   274  	if ce&ceTypeMask == ceTypeQ {
   275  		return int(ce&primaryValueMask) >> primaryShift
   276  	} else if ce&ceIgnoreMask == Ignore {
   277  		return 0
   278  	}
   279  	return MaxQuaternary
   280  }
   281  
   282  // Weight returns the collation weight for the given level.
   283  func (ce Elem) Weight(l Level) int {
   284  	switch l {
   285  	case Primary:
   286  		return ce.Primary()
   287  	case Secondary:
   288  		return ce.Secondary()
   289  	case Tertiary:
   290  		return int(ce.Tertiary())
   291  	case Quaternary:
   292  		return ce.Quaternary()
   293  	}
   294  	return 0 // return 0 (ignore) for undefined levels.
   295  }
   296  
// For contractions, collation elements are of the form
// 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
//   - n* is the size of the first node in the contraction trie.
//   - i* is the index of the first node in the contraction trie.
//   - b* is the offset into the contraction collation element table.
//
// See contract.go for details on the contraction trie.
const (
	// maxNBits is the width of the n* field (the lowest bits).
	maxNBits              = 4
	// maxTrieIndexBits is the width of the i* field, which sits above n*.
	maxTrieIndexBits      = 12
	// maxContractOffsetBits is the width of the b* field, which sits above i*.
	maxContractOffsetBits = 13
)
   309  
   310  func splitContractIndex(ce Elem) (index, n, offset int) {
   311  	n = int(ce & (1<<maxNBits - 1))
   312  	ce >>= maxNBits
   313  	index = int(ce & (1<<maxTrieIndexBits - 1))
   314  	ce >>= maxTrieIndexBits
   315  	offset = int(ce & (1<<maxContractOffsetBits - 1))
   316  	return
   317  }
   318  
   319  // For expansions, Elems are of the form 11100000 00000000 bbbbbbbb bbbbbbbb,
   320  // where b* is the index into the expansion sequence table.
   321  const maxExpandIndexBits = 16
   322  
   323  func splitExpandIndex(ce Elem) (index int) {
   324  	return int(uint16(ce))
   325  }
   326  
   327  // Some runes can be expanded using NFKD decomposition. Instead of storing the full
   328  // sequence of collation elements, we decompose the rune and lookup the collation
   329  // elements for each rune in the decomposition and modify the tertiary weights.
   330  // The Elem, in this case, is of the form 11110000 00000000 wwwwwwww vvvvvvvv, where
   331  //   - v* is the replacement tertiary weight for the first rune,
   332  //   - w* is the replacement tertiary weight for the second rune,
   333  //
   334  // Tertiary weights of subsequent runes should be replaced with maxTertiary.
   335  // See https://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
   336  func splitDecompose(ce Elem) (t1, t2 uint8) {
   337  	return uint8(ce), uint8(ce >> 8)
   338  }
   339  
// Rune ranges consulted by implicitPrimary for ideographic runes.
const (
	// These constants were taken from https://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
	minUnified       rune = 0x4E00
	maxUnified            = 0x9FFF
	minCompatibility      = 0xF900
	maxCompatibility      = 0xFAFF
	minRare               = 0x3400
	maxRare               = 0x4DBF
)

// Offsets that implicitPrimary adds to a rune to obtain its primary weight.
// They are chosen so that the three groups occupy increasing, non-overlapping
// ranges: runes using commonUnifiedOffset sort before those using
// rareUnifiedOffset, which sort before those using otherOffset.
const (
	commonUnifiedOffset = 0x10000
	rareUnifiedOffset   = 0x20000 // largest rune in common is U+FAFF
	otherOffset         = 0x50000 // largest rune in rare is U+2FA1D
	// illegalOffset is otherOffset plus the largest valid rune.
	illegalOffset       = otherOffset + int(unicode.MaxRune)
	// maxPrimary is one more than illegalOffset.
	maxPrimary          = illegalOffset + 1
)
   356  
   357  // implicitPrimary returns the primary weight for the a rune
   358  // for which there is no entry for the rune in the collation table.
   359  // We take a different approach from the one specified in
   360  // https://unicode.org/reports/tr10/#Implicit_Weights,
   361  // but preserve the resulting relative ordering of the runes.
   362  func implicitPrimary(r rune) int {
   363  	if unicode.Is(unicode.Ideographic, r) {
   364  		if r >= minUnified && r <= maxUnified {
   365  			// The most common case for CJK.
   366  			return int(r) + commonUnifiedOffset
   367  		}
   368  		if r >= minCompatibility && r <= maxCompatibility {
   369  			// This will typically not hit. The DUCET explicitly specifies mappings
   370  			// for all characters that do not decompose.
   371  			return int(r) + commonUnifiedOffset
   372  		}
   373  		return int(r) + rareUnifiedOffset
   374  	}
   375  	return int(r) + otherOffset
   376  }
   377  

View as plain text