1 // Copyright 2014 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:build ignore 6 7 package main 8 9 // This file contains definitions for interpreting the trie value of the case 10 // trie generated by "go run gen*.go". It is shared by both the generator 11 // program and the resultant package. Sharing is achieved by the generator 12 // copying gen_trieval.go to trieval.go and changing what's above this comment. 13 14 // info holds case information for a single rune. It is the value returned 15 // by a trie lookup. Most mapping information can be stored in a single 16-bit 16 // value. If not, for example when a rune is mapped to multiple runes, the value 17 // stores some basic case data and an index into an array with additional data. 18 // 19 // The per-rune values have the following format: 20 // 21 // if (exception) { 22 // 15..4 unsigned exception index 23 // } else { 24 // 15..8 XOR pattern or index to XOR pattern for case mapping 25 // Only 13..8 are used for XOR patterns. 26 // 7 inverseFold (fold to upper, not to lower) 27 // 6 index: interpret the XOR pattern as an index 28 // or isMid if case mode is cIgnorableUncased. 29 // 5..4 CCC: zero (normal or break), above or other 30 // } 31 // 3 exception: interpret this value as an exception index 32 // (TODO: is this bit necessary? Probably implied from case mode.) 33 // 2..0 case mode 34 // 35 // For the non-exceptional cases, a rune must be either uncased, lowercase or 36 // uppercase. If the rune is cased, the XOR pattern maps either a lowercase 37 // rune to uppercase or an uppercase rune to lowercase (applied to the 10 38 // least-significant bits of the rune). 39 // 40 // See the definitions below for a more detailed description of the various 41 // bits. 42 type info uint16 43 44 const ( 45 casedMask = 0x0003 46 fullCasedMask = 0x0007 47 ignorableMask = 0x0006 48 ignorableValue = 0x0004 49 50 inverseFoldBit = 1 << 7 51 isMidBit = 1 << 6 52 53 exceptionBit = 1 << 3 54 exceptionShift = 4 55 numExceptionBits = 12 56 57 xorIndexBit = 1 << 6 58 xorShift = 8 59 60 // There is no mapping if all xor bits and the exception bit are zero. 61 hasMappingMask = 0xff80 | exceptionBit 62 ) 63 64 // The case mode bits encodes the case type of a rune. This includes uncased, 65 // title, upper and lower case and case ignorable. (For a definition of these 66 // terms see Chapter 3 of The Unicode Standard Core Specification.) In some rare 67 // cases, a rune can be both cased and case-ignorable. This is encoded by 68 // cIgnorableCased. A rune of this type is always lower case. Some runes are 69 // cased while not having a mapping. 70 // 71 // A common pattern for scripts in the Unicode standard is for upper and lower 72 // case runes to alternate for increasing rune values (e.g. the accented Latin 73 // ranges starting from U+0100 and U+1E00 among others and some Cyrillic 74 // characters). We use this property by defining a cXORCase mode, where the case 75 // mode (always upper or lower case) is derived from the rune value. As the XOR 76 // pattern for case mappings is often identical for successive runes, using 77 // cXORCase can result in large series of identical trie values. This, in turn, 78 // allows us to better compress the trie blocks. 79 const ( 80 cUncased info = iota // 000 81 cTitle // 001 82 cLower // 010 83 cUpper // 011 84 cIgnorableUncased // 100 85 cIgnorableCased // 101 // lower case if mappings exist 86 cXORCase // 11x // case is cLower | ((rune&1) ^ x) 87 88 maxCaseMode = cUpper 89 ) 90 91 func (c info) isCased() bool { 92 return c&casedMask != 0 93 } 94 95 func (c info) isCaseIgnorable() bool { 96 return c&ignorableMask == ignorableValue 97 } 98 99 func (c info) isNotCasedAndNotCaseIgnorable() bool { 100 return c&fullCasedMask == 0 101 } 102 103 func (c info) isCaseIgnorableAndNotCased() bool { 104 return c&fullCasedMask == cIgnorableUncased 105 } 106 107 func (c info) isMid() bool { 108 return c&(fullCasedMask|isMidBit) == isMidBit|cIgnorableUncased 109 } 110 111 // The case mapping implementation will need to know about various Canonical 112 // Combining Class (CCC) values. We encode two of these in the trie value: 113 // cccZero (0) and cccAbove (230). If the value is cccOther, it means that 114 // CCC(r) > 0, but not 230. A value of cccBreak means that CCC(r) == 0 and that 115 // the rune also has the break category Break (see below). 116 const ( 117 cccBreak info = iota << 4 118 cccZero 119 cccAbove 120 cccOther 121 122 cccMask = cccBreak | cccZero | cccAbove | cccOther 123 ) 124 125 const ( 126 starter = 0 127 above = 230 128 iotaSubscript = 240 129 ) 130 131 // The exceptions slice holds data that does not fit in a normal info entry. 132 // The entry is pointed to by the exception index in an entry. It has the 133 // following format: 134 // 135 // Header: 136 // 137 // byte 0: 138 // 7..6 unused 139 // 5..4 CCC type (same bits as entry) 140 // 3 unused 141 // 2..0 length of fold 142 // 143 // byte 1: 144 // 7..6 unused 145 // 5..3 length of 1st mapping of case type 146 // 2..0 length of 2nd mapping of case type 147 // 148 // case 1st 2nd 149 // lower -> upper, title 150 // upper -> lower, title 151 // title -> lower, upper 152 // 153 // Lengths with the value 0x7 indicate no value and implies no change. 154 // A length of 0 indicates a mapping to zero-length string. 155 // 156 // Body bytes: 157 // 158 // case folding bytes 159 // lowercase mapping bytes 160 // uppercase mapping bytes 161 // titlecase mapping bytes 162 // closure mapping bytes (for NFKC_Casefold). (TODO) 163 // 164 // Fallbacks: 165 // 166 // missing fold -> lower 167 // missing title -> upper 168 // all missing -> original rune 169 // 170 // exceptions starts with a dummy byte to enforce that there is no zero index 171 // value. 172 const ( 173 lengthMask = 0x07 174 lengthBits = 3 175 noChange = 0 176 ) 177 178 // References to generated trie. 179 180 var trie = newCaseTrie(0) 181 182 var sparse = sparseBlocks{ 183 values: sparseValues[:], 184 offsets: sparseOffsets[:], 185 } 186 187 // Sparse block lookup code. 188 189 // valueRange is an entry in a sparse block. 190 type valueRange struct { 191 value uint16 192 lo, hi byte 193 } 194 195 type sparseBlocks struct { 196 values []valueRange 197 offsets []uint16 198 } 199 200 // lookup returns the value from values block n for byte b using binary search. 201 func (s *sparseBlocks) lookup(n uint32, b byte) uint16 { 202 lo := s.offsets[n] 203 hi := s.offsets[n+1] 204 for lo < hi { 205 m := lo + (hi-lo)/2 206 r := s.values[m] 207 if r.lo <= b && b <= r.hi { 208 return r.value 209 } 210 if b < r.lo { 211 hi = m 212 } else { 213 lo = m + 1 214 } 215 } 216 return 0 217 } 218 219 // lastRuneForTesting is the last rune used for testing. Everything after this 220 // is boring. 221 const lastRuneForTesting = rune(0x1FFFF) 222