1
2
3
4
5
6
7
8
9
10
11
12 package main
13
14 import (
15 "bytes"
16 "fmt"
17 "io"
18 "log"
19 "reflect"
20 "strconv"
21 "strings"
22 "unicode"
23
24 "golang.org/x/text/internal/gen"
25 "golang.org/x/text/internal/triegen"
26 "golang.org/x/text/internal/ucd"
27 "golang.org/x/text/unicode/norm"
28 )
29
30 func main() {
31 gen.Init()
32 genTables()
33 genTablesTest()
34 gen.Repackage("gen_trieval.go", "trieval.go", "cases")
35 }
36
37
38
39 type runeInfo struct {
40 Rune rune
41
42 entry info
43
44 CaseMode info
45
46
47 Simple [1 + maxCaseMode][]rune
48
49
50 HasSpecial bool
51 Conditional bool
52 Special [1 + maxCaseMode][]rune
53
54
55 FoldSimple rune
56 FoldSpecial rune
57 FoldFull []rune
58
59
60
61
62 SoftDotted bool
63 CaseIgnorable bool
64 Cased bool
65 DecomposeGreek bool
66 BreakType string
67 BreakCat breakCategory
68
69
70 CCC byte
71 }
72
73 type breakCategory int
74
75 const (
76 breakBreak breakCategory = iota
77 breakLetter
78 breakMid
79 )
80
81
82 func (r *runeInfo) mapping(c info) string {
83 if r.HasSpecial {
84 return string(r.Special[c])
85 }
86 if len(r.Simple[c]) != 0 {
87 return string(r.Simple[c])
88 }
89 return string(r.Rune)
90 }
91
92 func parse(file string, f func(p *ucd.Parser)) {
93 ucd.Parse(gen.OpenUCDFile(file), f)
94 }
95
96 func parseUCD() []runeInfo {
97 chars := make([]runeInfo, unicode.MaxRune)
98
99 get := func(r rune) *runeInfo {
100 c := &chars[r]
101 c.Rune = r
102 return c
103 }
104
105 parse("UnicodeData.txt", func(p *ucd.Parser) {
106 ri := get(p.Rune(0))
107 ri.CCC = byte(p.Int(ucd.CanonicalCombiningClass))
108 ri.Simple[cLower] = p.Runes(ucd.SimpleLowercaseMapping)
109 ri.Simple[cUpper] = p.Runes(ucd.SimpleUppercaseMapping)
110 ri.Simple[cTitle] = p.Runes(ucd.SimpleTitlecaseMapping)
111 if p.String(ucd.GeneralCategory) == "Lt" {
112 ri.CaseMode = cTitle
113 }
114 })
115
116
117 parse("PropList.txt", func(p *ucd.Parser) {
118 if p.String(1) == "Soft_Dotted" {
119 chars[p.Rune(0)].SoftDotted = true
120 }
121 })
122
123
124 parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
125 ri := get(p.Rune(0))
126 switch p.String(1) {
127 case "Case_Ignorable":
128 ri.CaseIgnorable = true
129 case "Cased":
130 ri.Cased = true
131 case "Lowercase":
132 ri.CaseMode = cLower
133 case "Uppercase":
134 ri.CaseMode = cUpper
135 }
136 })
137
138
139 parse("SpecialCasing.txt", func(p *ucd.Parser) {
140
141
142
143
144
145 ri := get(p.Rune(0))
146 if p.String(4) == "" {
147 ri.HasSpecial = true
148 ri.Special[cLower] = p.Runes(1)
149 ri.Special[cTitle] = p.Runes(2)
150 ri.Special[cUpper] = p.Runes(3)
151 } else {
152 ri.Conditional = true
153 }
154 })
155
156
157
158 parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
159 ri := get(p.Rune(0))
160 ri.BreakType = p.String(1)
161
162
163 switch p.String(1) {
164 case "MidLetter", "MidNumLet", "Single_Quote":
165 ri.BreakCat = breakMid
166 if !ri.CaseIgnorable {
167
168
169 log.Fatalf("Rune %U, which has a break category mid, is not a case ignorable", ri)
170 }
171 case "ALetter", "Hebrew_Letter", "Numeric", "Extend", "ExtendNumLet", "Format", "ZWJ":
172 ri.BreakCat = breakLetter
173 }
174 })
175
176
177 parse("CaseFolding.txt", func(p *ucd.Parser) {
178 ri := get(p.Rune(0))
179 switch p.String(1) {
180 case "C":
181 ri.FoldSimple = p.Rune(2)
182 ri.FoldFull = p.Runes(2)
183 case "S":
184 ri.FoldSimple = p.Rune(2)
185 case "T":
186 ri.FoldSpecial = p.Rune(2)
187 case "F":
188 ri.FoldFull = p.Runes(2)
189 default:
190 log.Fatalf("%U: unknown type: %s", p.Rune(0), p.String(1))
191 }
192 })
193
194 return chars
195 }
196
197 func genTables() {
198 chars := parseUCD()
199 verifyProperties(chars)
200
201 t := triegen.NewTrie("case")
202 for i := range chars {
203 c := &chars[i]
204 makeEntry(c)
205 t.Insert(rune(i), uint64(c.entry))
206 }
207
208 w := gen.NewCodeWriter()
209 defer w.WriteVersionedGoFile("tables.go", "cases")
210
211 gen.WriteUnicodeVersion(w)
212
213
214
215
216
217 w.WriteVar("xorData", string(xorData))
218 w.WriteVar("exceptions", string(exceptionData))
219
220 sz, err := t.Gen(w, triegen.Compact(&sparseCompacter{}))
221 if err != nil {
222 log.Fatal(err)
223 }
224 w.Size += sz
225 }
226
227 func makeEntry(ri *runeInfo) {
228 if ri.CaseIgnorable {
229 if ri.Cased {
230 ri.entry = cIgnorableCased
231 } else {
232 ri.entry = cIgnorableUncased
233 }
234 } else {
235 ri.entry = ri.CaseMode
236 }
237
238
239
240 ccc := cccOther
241 switch ri.CCC {
242 case 0:
243 ccc = cccZero
244 case above:
245 ccc = cccAbove
246 }
247 switch ri.BreakCat {
248 case breakBreak:
249 ccc = cccBreak
250 case breakMid:
251 ri.entry |= isMidBit
252 }
253
254 ri.entry |= ccc
255
256 if ri.CaseMode == cUncased {
257 return
258 }
259
260
261 if ri.CaseMode == cTitle || ri.HasSpecial || ri.mapping(cTitle) != ri.mapping(cUpper) {
262 makeException(ri)
263 return
264 }
265 if f := string(ri.FoldFull); len(f) > 0 && f != ri.mapping(cUpper) && f != ri.mapping(cLower) {
266 makeException(ri)
267 return
268 }
269
270
271
272 orig := string(ri.Rune)
273 mapped := ""
274 if ri.CaseMode == cUpper {
275 mapped = ri.mapping(cLower)
276 } else {
277 mapped = ri.mapping(cUpper)
278 }
279
280 if len(orig) != len(mapped) {
281 makeException(ri)
282 return
283 }
284
285 if string(ri.FoldFull) == ri.mapping(cUpper) {
286 ri.entry |= inverseFoldBit
287 }
288
289 n := len(orig)
290
291
292 var b []byte
293 for i := 0; i < n; i++ {
294 b = append(b, orig[i]^mapped[i])
295 }
296
297
298 for ; len(b) > 1 && b[0] == 0; b = b[1:] {
299 }
300
301 if len(b) == 1 && b[0]&0xc0 == 0 {
302 ri.entry |= info(b[0]) << xorShift
303 return
304 }
305
306 key := string(b)
307 x, ok := xorCache[key]
308 if !ok {
309 xorData = append(xorData, 0)
310 xorData = append(xorData, b...)
311
312 x = len(xorData) - 1
313 xorCache[key] = x
314 }
315 ri.entry |= info(x<<xorShift) | xorIndexBit
316 }
317
318 var xorCache = map[string]int{}
319
320
321
322
323 var xorData = []byte{}
324
325
326 var exceptionData = []byte{0}
327
328
329
330 func makeException(ri *runeInfo) {
331 ccc := ri.entry & cccMask
332
333 ri.entry &= 0x0007
334 ri.entry |= exceptionBit
335
336 if len(exceptionData) >= 1<<numExceptionBits {
337 log.Fatalf("%U:exceptionData too large %#x > %d bits", ri.Rune, len(exceptionData), numExceptionBits)
338 }
339
340
341 ri.entry |= info(len(exceptionData) << exceptionShift)
342
343 orig := string(ri.Rune)
344 tc := ri.mapping(cTitle)
345 uc := ri.mapping(cUpper)
346 lc := ri.mapping(cLower)
347 ff := string(ri.FoldFull)
348
349
350 addString := func(s string, b *byte) {
351 if len(s) == 0 {
352
353
354 log.Fatalf("%U: has zero-length mapping.", ri.Rune)
355 }
356 *b <<= 3
357 if s != orig || ri.CaseMode == cLower {
358 n := len(s)
359 if n > 7 {
360 log.Fatalf("%U: mapping larger than 7 (%d)", ri.Rune, n)
361 }
362 *b |= byte(n)
363 exceptionData = append(exceptionData, s...)
364 }
365 }
366
367
368 exceptionData = append(exceptionData, byte(ccc)|byte(len(ff)))
369
370
371 p := len(exceptionData)
372 exceptionData = append(exceptionData, 0)
373
374 if len(ff) > 7 {
375 log.Fatalf("%U: fold string larger than 7 (%d)", ri.Rune, len(ff))
376 }
377 exceptionData = append(exceptionData, ff...)
378 ct := ri.CaseMode
379 if ct != cLower {
380 addString(lc, &exceptionData[p])
381 }
382 if ct != cUpper {
383 addString(uc, &exceptionData[p])
384 }
385 if ct != cTitle {
386 addString(tc, &exceptionData[p])
387 }
388 }
389
390
391
392
393
394
395 type sparseCompacter struct {
396 sparseBlocks [][]uint16
397 sparseOffsets []uint16
398 sparseCount int
399 }
400
401
402
403 func makeSparse(vals []uint64) ([]uint16, int) {
404
405 values := make([]uint16, len(vals))
406 for i, v := range vals {
407 values[i] = uint16(v)
408 }
409
410 alt := func(i int, v uint16) uint16 {
411 if cm := info(v & fullCasedMask); cm == cUpper || cm == cLower {
412
413 xor := v
414 xor &^= 1
415 xor |= uint16(i&1) ^ (v & 1)
416 xor |= 0x4
417 return xor
418 }
419 return v
420 }
421
422 var count int
423 var previous uint16
424 for i, v := range values {
425 if v != 0 {
426
427 if v == previous {
428 continue
429 }
430
431
432 a := alt(i, v)
433 if a == previous {
434 values[i] = a
435 continue
436 }
437
438
439 count++
440
441
442 if p := i + 1; p < len(values) && alt(p, values[p]) == a {
443 values[i] = a
444 v = a
445 }
446 }
447 previous = v
448 }
449 return values, count
450 }
451
452 func (s *sparseCompacter) Size(v []uint64) (int, bool) {
453 _, n := makeSparse(v)
454
455
456 if n > 16 {
457 return 0, false
458 }
459
460 return 2 + int(reflect.TypeOf(valueRange{}).Size())*n, true
461 }
462
463 func (s *sparseCompacter) Store(v []uint64) uint32 {
464 h := uint32(len(s.sparseOffsets))
465 values, sz := makeSparse(v)
466 s.sparseBlocks = append(s.sparseBlocks, values)
467 s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
468 s.sparseCount += sz
469 return h
470 }
471
472 func (s *sparseCompacter) Handler() string {
473
474 return "sparse.lookup"
475 }
476
477 func (s *sparseCompacter) Print(w io.Writer) (retErr error) {
478 p := func(format string, args ...interface{}) {
479 _, err := fmt.Fprintf(w, format, args...)
480 if retErr == nil && err != nil {
481 retErr = err
482 }
483 }
484
485 ls := len(s.sparseBlocks)
486 if ls == len(s.sparseOffsets) {
487 s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
488 }
489 p("// sparseOffsets: %d entries, %d bytes\n", ls+1, (ls+1)*2)
490 p("var sparseOffsets = %#v\n\n", s.sparseOffsets)
491
492 ns := s.sparseCount
493 p("// sparseValues: %d entries, %d bytes\n", ns, ns*4)
494 p("var sparseValues = [%d]valueRange {", ns)
495 for i, values := range s.sparseBlocks {
496 p("\n// Block %#x, offset %#x", i, s.sparseOffsets[i])
497 var v uint16
498 for i, nv := range values {
499 if nv != v {
500 if v != 0 {
501 p(",hi:%#02x},", 0x80+i-1)
502 }
503 if nv != 0 {
504 p("\n{value:%#04x,lo:%#02x", nv, 0x80+i)
505 }
506 }
507 v = nv
508 }
509 if v != 0 {
510 p(",hi:%#02x},", 0x80+len(values)-1)
511 }
512 }
513 p("\n}\n\n")
514 return
515 }
516
517
518
519
520 func verifyProperties(chars []runeInfo) {
521 for i, c := range chars {
522 r := rune(i)
523
524
525
526
527 if c.CCC > 0 && unicode.ToLower(r) != r {
528 log.Fatalf("%U: non-starter changes when lowercased", r)
529 }
530
531
532 d := norm.NFD.PropertiesString(string(r)).Decomposition()
533 if len(d) > 0 {
534 if d[0] == 'I' || d[0] == 'J' {
535
536 if len(d) < 3 {
537 log.Fatalf("%U: length of decomposition was %d; want >= 3", r, len(d))
538 }
539
540
541 runes := []rune(string(d[1:]))
542 ccc := chars[runes[0]].CCC
543
544 for _, mr := range runes[1:] {
545 mc := chars[mr]
546
547
548 if ccc == 0 || ccc > above {
549 log.Fatalf("%U: CCC of successive rune (%U) was %d; want (0,230]", r, mr, ccc)
550 }
551
552
553 if mc.CCC != ccc {
554 log.Fatalf("%U: CCC of follow-up modifier (%U) was %d; want %d", r, mr, mc.CCC, ccc)
555 }
556
557
558 if (ccc == above) != (0x300 <= mr && mr <= 0x311) {
559 log.Fatalf("%U: modifier %U in [U+0300, U+0311] != ccc(%U) == 230", r, mr, mr)
560 }
561
562 if i += len(string(mr)); i >= len(d) {
563 break
564 }
565 }
566 }
567 }
568
569
570 if unicode.Is(unicode.Soft_Dotted, r) && strings.Contains(string(d), "\u0307") {
571 log.Fatalf("%U: decomposition of soft-dotted rune may not contain U+0307", r)
572 }
573
574
575 if c.CCC == iotaSubscript && r != 0x0345 {
576 log.Fatalf("%U: only rune U+0345 may have CCC Iota_Subscript", r)
577 }
578
579
580 if c.SoftDotted && c.entry&exceptionBit != 0 {
581 log.Fatalf("%U: soft-dotted has exception", r)
582 }
583
584
585 if unicode.Is(unicode.Greek, r) {
586 if b := norm.NFD.PropertiesString(string(r)).Decomposition(); b != nil {
587 runes := []rune(string(b))
588
589
590
591 if f := runes[0]; unicode.IsMark(f) || f > 0xFF && !unicode.Is(unicode.Greek, f) {
592 log.Fatalf("%U: expected first rune of Greek decomposition to be letter, found %U", r, f)
593 }
594
595
596
597 for _, m := range runes[1:] {
598 switch m {
599 case 0x0313, 0x0314, 0x0301, 0x0300, 0x0306, 0x0342, 0x0308, 0x0304, 0x345:
600 default:
601 log.Fatalf("%U: modifier %U is outside of expected Greek modifier set", r, m)
602 }
603 }
604 }
605 }
606
607
608
609
610 if c.CCC > 0 && c.BreakType != "Extend" {
611 log.Fatalf("%U: CCC == %d, but got break type %s; want Extend", r, c.CCC, c.BreakType)
612 }
613
614
615 if c.CCC == 0 && c.Cased && c.BreakType != "ALetter" {
616 log.Fatalf("%U: cased, but got break type %s; want ALetter", r, c.BreakType)
617 }
618
619
620 if c.CCC == 0 && c.BreakCat != breakBreak && !c.CaseIgnorable {
621 if c.BreakCat != breakLetter {
622 log.Fatalf("%U: check for letter break type gave %d; want %d", r, c.BreakCat, breakLetter)
623 }
624 }
625 }
626 }
627
628 func genTablesTest() {
629 w := &bytes.Buffer{}
630
631 fmt.Fprintln(w, "var (")
632 printProperties(w, "DerivedCoreProperties.txt", "Case_Ignorable", verifyIgnore)
633
634
635
636 n := printProperties(io.Discard, "DerivedCoreProperties.txt", "Cased", verifyCased)
637 n += printProperties(io.Discard, "DerivedCoreProperties.txt", "Lowercase", verifyLower)
638 n += printProperties(io.Discard, "DerivedCoreProperties.txt", "Uppercase", verifyUpper)
639 if n > 0 {
640 log.Fatalf("One of the discarded properties does not have a perfect filter.")
641 }
642
643
644 fmt.Fprintln(w, "\tspecial = map[rune]struct{ toLower, toTitle, toUpper string }{")
645 parse("SpecialCasing.txt", func(p *ucd.Parser) {
646
647 if p.String(4) != "" {
648 return
649 }
650 r := p.Rune(0)
651 fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n",
652 r, string(p.Runes(1)), string(p.Runes(2)), string(p.Runes(3)))
653 })
654 fmt.Fprint(w, "\t}\n\n")
655
656
657 table := map[rune]struct{ simple, full, special string }{}
658 parse("CaseFolding.txt", func(p *ucd.Parser) {
659 r := p.Rune(0)
660 t := p.String(1)
661 v := string(p.Runes(2))
662 if t != "T" && v == string(unicode.ToLower(r)) {
663 return
664 }
665 x := table[r]
666 switch t {
667 case "C":
668 x.full = v
669 x.simple = v
670 case "S":
671 x.simple = v
672 case "F":
673 x.full = v
674 case "T":
675 x.special = v
676 }
677 table[r] = x
678 })
679 fmt.Fprintln(w, "\tfoldMap = map[rune]struct{ simple, full, special string }{")
680 for r := rune(0); r < 0x10FFFF; r++ {
681 x, ok := table[r]
682 if !ok {
683 continue
684 }
685 fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n", r, x.simple, x.full, x.special)
686 }
687 fmt.Fprint(w, "\t}\n\n")
688
689
690 notBreak := map[rune]bool{}
691 parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
692 switch p.String(1) {
693 case "Extend", "Format", "MidLetter", "MidNumLet", "Single_Quote",
694 "ALetter", "Hebrew_Letter", "Numeric", "ExtendNumLet", "ZWJ":
695 notBreak[p.Rune(0)] = true
696 }
697 })
698
699 fmt.Fprintln(w, "\tbreakProp = []struct{ lo, hi rune }{")
700 inBreak := false
701 for r := rune(0); r <= lastRuneForTesting; r++ {
702 if isBreak := !notBreak[r]; isBreak != inBreak {
703 if isBreak {
704 fmt.Fprintf(w, "\t\t{0x%x, ", r)
705 } else {
706 fmt.Fprintf(w, "0x%x},\n", r-1)
707 }
708 inBreak = isBreak
709 }
710 }
711 if inBreak {
712 fmt.Fprintf(w, "0x%x},\n", lastRuneForTesting)
713 }
714 fmt.Fprint(w, "\t}\n\n")
715
716
717
718 cased := map[rune]bool{}
719 parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
720 if p.String(1) == "Cased" {
721 cased[p.Rune(0)] = true
722 }
723 })
724
725 fmt.Fprintln(w, "\tbreakTest = []string{")
726 parse("auxiliary/WordBreakTest.txt", func(p *ucd.Parser) {
727 c := strings.Split(p.String(0), " ")
728
729 const sep = '|'
730 numCased := 0
731 test := ""
732 for ; len(c) >= 2; c = c[2:] {
733 if c[0] == "รท" && test != "" {
734 test += string(sep)
735 }
736 i, err := strconv.ParseUint(c[1], 16, 32)
737 r := rune(i)
738 if err != nil {
739 log.Fatalf("Invalid rune %q.", c[1])
740 }
741 if r == sep {
742 log.Fatalf("Separator %q not allowed in test data. Pick another one.", sep)
743 }
744 if cased[r] {
745 numCased++
746 }
747 test += string(r)
748 }
749 if numCased > 1 {
750 fmt.Fprintf(w, "\t\t%q,\n", test)
751 }
752 })
753 fmt.Fprintln(w, "\t}")
754
755 fmt.Fprintln(w, ")")
756
757 gen.WriteVersionedGoFile("tables_test.go", "cases", w.Bytes())
758 }
759
760
761
762
763 func verifyCased(r rune) bool {
764 return verifyLower(r) || verifyUpper(r) || unicode.IsTitle(r)
765 }
766
767 func verifyLower(r rune) bool {
768 return unicode.IsLower(r) || unicode.Is(unicode.Other_Lowercase, r)
769 }
770
771 func verifyUpper(r rune) bool {
772 return unicode.IsUpper(r) || unicode.Is(unicode.Other_Uppercase, r)
773 }
774
775
776
777 func verifyIgnore(r rune) bool {
778 props := []*unicode.RangeTable{
779 unicode.Mn,
780 unicode.Me,
781 unicode.Cf,
782 unicode.Lm,
783 unicode.Sk,
784 }
785 for _, p := range props {
786 if unicode.Is(p, r) {
787 return true
788 }
789 }
790 return false
791 }
792
793
794
795
796 func printProperties(w io.Writer, file, property string, f func(r rune) bool) int {
797 verify := map[rune]bool{}
798 n := 0
799 varNameParts := strings.Split(property, "_")
800 varNameParts[0] = strings.ToLower(varNameParts[0])
801 fmt.Fprintf(w, "\t%s = map[rune]bool{\n", strings.Join(varNameParts, ""))
802 parse(file, func(p *ucd.Parser) {
803 if p.String(1) == property {
804 r := p.Rune(0)
805 verify[r] = true
806 if !f(r) {
807 n++
808 fmt.Fprintf(w, "\t\t0x%.4x: true,\n", r)
809 }
810 }
811 })
812 fmt.Fprint(w, "\t}\n\n")
813
814
815 for r := rune(0); r <= lastRuneForTesting; r++ {
816 if !verify[r] && f(r) {
817 log.Fatalf("Incorrect filter func for property %q.", property)
818 }
819 }
820 return n
821 }
822
823
824
825
826
827 func newCaseTrie(int) int { return 0 }
828
829 var (
830 sparseValues [0]valueRange
831 sparseOffsets [0]uint16
832 )
833
View as plain text