1 // Copyright 2015 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package colltab 6 7 // An Iter incrementally converts chunks of the input text to collation 8 // elements, while ensuring that the collation elements are in normalized order 9 // (that is, they are in the order as if the input text were normalized first). 10 type Iter struct { 11 Weighter Weighter 12 Elems []Elem 13 // N is the number of elements in Elems that will not be reordered on 14 // subsequent iterations, N <= len(Elems). 15 N int 16 17 bytes []byte 18 str string 19 // Because the Elems buffer may contain collation elements that are needed 20 // for look-ahead, we need two positions in the text (bytes or str): one for 21 // the end position in the text for the current iteration and one for the 22 // start of the next call to appendNext. 23 pEnd int // end position in text corresponding to N. 24 pNext int // pEnd <= pNext. 25 } 26 27 // Reset sets the position in the current input text to p and discards any 28 // results obtained so far. 29 func (i *Iter) Reset(p int) { 30 i.Elems = i.Elems[:0] 31 i.N = 0 32 i.pEnd = p 33 i.pNext = p 34 } 35 36 // Len returns the length of the input text. 37 func (i *Iter) Len() int { 38 if i.bytes != nil { 39 return len(i.bytes) 40 } 41 return len(i.str) 42 } 43 44 // Discard removes the collation elements up to N. 45 func (i *Iter) Discard() { 46 // TODO: change this such that only modifiers following starters will have 47 // to be copied. 48 i.Elems = i.Elems[:copy(i.Elems, i.Elems[i.N:])] 49 i.N = 0 50 } 51 52 // End returns the end position of the input text for which Next has returned 53 // results. 54 func (i *Iter) End() int { 55 return i.pEnd 56 } 57 58 // SetInput resets i to input s. 59 func (i *Iter) SetInput(s []byte) { 60 i.bytes = s 61 i.str = "" 62 i.Reset(0) 63 } 64 65 // SetInputString resets i to input s. 66 func (i *Iter) SetInputString(s string) { 67 i.str = s 68 i.bytes = nil 69 i.Reset(0) 70 } 71 72 func (i *Iter) done() bool { 73 return i.pNext >= len(i.str) && i.pNext >= len(i.bytes) 74 } 75 76 func (i *Iter) appendNext() bool { 77 if i.done() { 78 return false 79 } 80 var sz int 81 if i.bytes == nil { 82 i.Elems, sz = i.Weighter.AppendNextString(i.Elems, i.str[i.pNext:]) 83 } else { 84 i.Elems, sz = i.Weighter.AppendNext(i.Elems, i.bytes[i.pNext:]) 85 } 86 if sz == 0 { 87 sz = 1 88 } 89 i.pNext += sz 90 return true 91 } 92 93 // Next appends Elems to the internal array. On each iteration, it will either 94 // add starters or modifiers. In the majority of cases, an Elem with a primary 95 // value > 0 will have a CCC of 0. The CCC values of collation elements are also 96 // used to detect if the input string was not normalized and to adjust the 97 // result accordingly. 98 func (i *Iter) Next() bool { 99 if i.N == len(i.Elems) && !i.appendNext() { 100 return false 101 } 102 103 // Check if the current segment starts with a starter. 104 prevCCC := i.Elems[len(i.Elems)-1].CCC() 105 if prevCCC == 0 { 106 i.N = len(i.Elems) 107 i.pEnd = i.pNext 108 return true 109 } else if i.Elems[i.N].CCC() == 0 { 110 // set i.N to only cover part of i.Elems for which prevCCC == 0 and 111 // use rest for the next call to next. 112 for i.N++; i.N < len(i.Elems) && i.Elems[i.N].CCC() == 0; i.N++ { 113 } 114 i.pEnd = i.pNext 115 return true 116 } 117 118 // The current (partial) segment starts with modifiers. We need to collect 119 // all successive modifiers to ensure that they are normalized. 120 for { 121 p := len(i.Elems) 122 i.pEnd = i.pNext 123 if !i.appendNext() { 124 break 125 } 126 127 if ccc := i.Elems[p].CCC(); ccc == 0 || len(i.Elems)-i.N > maxCombiningCharacters { 128 // Leave the starter for the next iteration. This ensures that we 129 // do not return sequences of collation elements that cross two 130 // segments. 131 // 132 // TODO: handle large number of combining characters by fully 133 // normalizing the input segment before iteration. This ensures 134 // results are consistent across the text repo. 135 i.N = p 136 return true 137 } else if ccc < prevCCC { 138 i.doNorm(p, ccc) // should be rare, never occurs for NFD and FCC. 139 } else { 140 prevCCC = ccc 141 } 142 } 143 144 done := len(i.Elems) != i.N 145 i.N = len(i.Elems) 146 return done 147 } 148 149 // nextNoNorm is the same as next, but does not "normalize" the collation 150 // elements. 151 func (i *Iter) nextNoNorm() bool { 152 // TODO: remove this function. Using this instead of next does not seem 153 // to improve performance in any significant way. We retain this until 154 // later for evaluation purposes. 155 if i.done() { 156 return false 157 } 158 i.appendNext() 159 i.N = len(i.Elems) 160 return true 161 } 162 163 const maxCombiningCharacters = 30 164 165 // doNorm reorders the collation elements in i.Elems. 166 // It assumes that blocks of collation elements added with appendNext 167 // either start and end with the same CCC or start with CCC == 0. 168 // This allows for a single insertion point for the entire block. 169 // The correctness of this assumption is verified in builder.go. 170 func (i *Iter) doNorm(p int, ccc uint8) { 171 n := len(i.Elems) 172 k := p 173 for p--; p > i.N && ccc < i.Elems[p-1].CCC(); p-- { 174 } 175 i.Elems = append(i.Elems, i.Elems[p:k]...) 176 copy(i.Elems[p:], i.Elems[k:]) 177 i.Elems = i.Elems[:n] 178 } 179