1
2
3
4
5
6
7
8
9
10 package main
11
12 import (
13 "flag"
14 "fmt"
15 "log"
16 "os"
17 "regexp"
18 "sort"
19 "strings"
20 "unicode"
21
22 "golang.org/x/text/internal/gen"
23 "golang.org/x/text/internal/ucd"
24 "golang.org/x/text/unicode/rangetable"
25 )
26
27 func main() {
28 flag.Parse()
29 setupOutput()
30 loadChars()
31 loadCasefold()
32 printCategories()
33 printScriptOrProperty(false)
34 printScriptOrProperty(true)
35 printCases()
36 printLatinProperties()
37 printCasefold()
38 printSizes()
39 flushOutput()
40 }
41
// defaultVersion returns the value of the UNICODE_VERSION environment
// variable when it is non-empty, and unicode.Version otherwise.
func defaultVersion() string {
	version := os.Getenv("UNICODE_VERSION")
	if version == "" {
		version = unicode.Version
	}
	return version
}
48
// Command-line flags controlling which tables are produced.
var (
	// tablelist selects the category tables; "all" selects every known
	// category and the empty string disables category output.
	tablelist = flag.String("tables", "all",
		"comma-separated list of which tables to generate; can be letter")

	// scriptlist selects the script tables; "all" selects every script
	// found in Scripts.txt and the empty string disables script output.
	scriptlist = flag.String("scripts", "all",
		"comma-separated list of which script tables to generate")

	// proplist selects the property tables; "all" selects every property
	// found in PropList.txt and the empty string disables property output.
	proplist = flag.String("props", "all",
		"comma-separated list of which property tables to generate")

	// cases defaults to true. NOTE(review): nothing in the visible source
	// reads this flag.
	cases = flag.Bool("cases", true,
		"generate case tables")

	// test makes the print steps compare the loaded data against the
	// installed unicode package instead of generating source.
	test = flag.Bool("test", false,
		"test existing tables; can be used to compare web data with package data")
)
64
var (
	// scriptRe matches a line made of a hexadecimal code point, an optional
	// "..HEX" range end, optional spaces, "; " and a name built from
	// letters and underscores. NOTE(review): not referenced elsewhere in
	// the visible source.
	scriptRe = regexp.MustCompile(`^([0-9A-F]+)(\.\.[0-9A-F]+)? *; ([A-Za-z_]+)$`)

	// logger writes diagnostics to standard error, prefixed with the
	// file:line of the call site.
	logger = log.New(os.Stderr, "", log.Lshortfile)
)
67
// output collects all generated source text. It is created by setupOutput,
// written to through printf/print/println, and flushed by flushOutput.
var output *gen.CodeWriter
69
70 func setupOutput() {
71 output = gen.NewCodeWriter()
72 }
73
74 func flushOutput() {
75 output.WriteGoFile("tables.go", "unicode")
76 }
77
78 func printf(format string, args ...interface{}) {
79 fmt.Fprintf(output, format, args...)
80 }
81
82 func print(args ...interface{}) {
83 fmt.Fprint(output, args...)
84 }
85
86 func println(args ...interface{}) {
87 fmt.Fprintln(output, args...)
88 }
89
// category is the set of known category names. It starts with the
// one-letter major classes; loadChars adds every general category value it
// reads from UnicodeData.txt.
var category = map[string]bool{
	"C": true,
	"L": true,
	"M": true,
	"N": true,
	"P": true,
	"S": true,
	"Z": true,
}
101
102
// Char holds everything recorded about a single code point.
type Char struct {
	// codePoint is the code point read from UnicodeData.txt; it stays 0 for
	// slots that loadChars never filled.
	codePoint rune
	// category is the general category field, for example "Lu".
	category string
	// upperCase, lowerCase and titleCase are the case mappings stored by
	// loadChars; 0 means none was recorded.
	upperCase, lowerCase, titleCase rune
	// foldCase is the folding target stored by loadCasefold, or 0.
	foldCase rune
	// caseOrbit is the next code point in the folding orbit, filled in by
	// printCasefold, or 0.
	caseOrbit rune
}
112
113 const MaxChar = 0x10FFFF
114
115 var chars = make([]Char, MaxChar+1)
116 var scripts = make(map[string][]rune)
117 var props = make(map[string][]rune)
118
119 func allCategories() []string {
120 a := make([]string, 0, len(category))
121 for k := range category {
122 a = append(a, k)
123 }
124 sort.Strings(a)
125 return a
126 }
127
// all returns the keys of scripts in sorted order. The result is an empty,
// non-nil slice when the map has no entries.
func all(scripts map[string][]rune) []string {
	names := make([]string, len(scripts))
	next := 0
	for name := range scripts {
		names[next] = name
		next++
	}
	sort.Strings(names)
	return names
}
136
// allCatFold returns the keys of m in sorted order. The result is an empty,
// non-nil slice when the map has no entries.
func allCatFold(m map[string]map[rune]bool) []string {
	keys := make(sort.StringSlice, 0, len(m))
	for key := range m {
		keys = append(keys, key)
	}
	keys.Sort()
	return []string(keys)
}
145
146 func categoryOp(code rune, class uint8) bool {
147 category := chars[code].category
148 return len(category) > 0 && category[0] == class
149 }
150
151 func loadChars() {
152 ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
153 c := Char{codePoint: p.Rune(0)}
154
155 getRune := func(field int) rune {
156 if p.String(field) == "" {
157 return 0
158 }
159 return p.Rune(field)
160 }
161
162 c.category = p.String(ucd.GeneralCategory)
163 category[c.category] = true
164 switch c.category {
165 case "Nd":
166
167 p.Int(ucd.NumericValue)
168 case "Lu":
169 c.upperCase = getRune(ucd.CodePoint)
170 c.lowerCase = getRune(ucd.SimpleLowercaseMapping)
171 c.titleCase = getRune(ucd.SimpleTitlecaseMapping)
172 case "Ll":
173 c.upperCase = getRune(ucd.SimpleUppercaseMapping)
174 c.lowerCase = getRune(ucd.CodePoint)
175 c.titleCase = getRune(ucd.SimpleTitlecaseMapping)
176 case "Lt":
177 c.upperCase = getRune(ucd.SimpleUppercaseMapping)
178 c.lowerCase = getRune(ucd.SimpleLowercaseMapping)
179 c.titleCase = getRune(ucd.CodePoint)
180 default:
181 c.upperCase = getRune(ucd.SimpleUppercaseMapping)
182 c.lowerCase = getRune(ucd.SimpleLowercaseMapping)
183 c.titleCase = getRune(ucd.SimpleTitlecaseMapping)
184 }
185
186 chars[c.codePoint] = c
187 })
188 }
189
190 func loadCasefold() {
191 ucd.Parse(gen.OpenUCDFile("CaseFolding.txt"), func(p *ucd.Parser) {
192 kind := p.String(1)
193 if kind != "C" && kind != "S" {
194
195 return
196 }
197 p1 := p.Rune(0)
198 p2 := p.Rune(2)
199 chars[p1].foldCase = rune(p2)
200 })
201 }
202
// categoryMapping gives the descriptive name of each two-letter general
// category. printCategories appends it to the generated comment when an
// entry exists. Entries are listed in key order.
var categoryMapping = map[string]string{
	"Cc": "Other, control",
	"Cf": "Other, format",
	"Cn": "Other, not assigned",
	"Co": "Other, private use",
	"Cs": "Other, surrogate",
	"Ll": "Letter, lowercase",
	"Lm": "Letter, modifier",
	"Lo": "Letter, other",
	"Lt": "Letter, titlecase",
	"Lu": "Letter, uppercase",
	"Mc": "Mark, spacing combining",
	"Me": "Mark, enclosing",
	"Mn": "Mark, nonspacing",
	"Nd": "Number, decimal digit",
	"Nl": "Number, letter",
	"No": "Number, other",
	"Pc": "Punctuation, connector",
	"Pd": "Punctuation, dash",
	"Pe": "Punctuation, close",
	"Pf": "Punctuation, final quote",
	"Pi": "Punctuation, initial quote",
	"Po": "Punctuation, other",
	"Ps": "Punctuation, open",
	"Sc": "Symbol, currency",
	"Sk": "Symbol, modifier",
	"Sm": "Symbol, math",
	"So": "Symbol, other",
	"Zl": "Separator, line",
	"Zp": "Separator, paragraph",
	"Zs": "Separator, space",
}
235
236 func printCategories() {
237 if *tablelist == "" {
238 return
239 }
240
241 list := strings.Split(*tablelist, ",")
242 if *tablelist == "all" {
243 list = allCategories()
244 }
245 if *test {
246 fullCategoryTest(list)
247 return
248 }
249
250 println("// Version is the Unicode edition from which the tables are derived.")
251 printf("const Version = %q\n\n", gen.UnicodeVersion())
252
253 if *tablelist == "all" {
254 println("// Categories is the set of Unicode category tables.")
255 println("var Categories = map[string] *RangeTable {")
256 for _, k := range allCategories() {
257 printf("\t%q: %s,\n", k, k)
258 }
259 print("}\n\n")
260 }
261
262 decl := make(sort.StringSlice, len(list))
263 ndecl := 0
264 for _, name := range list {
265 if _, ok := category[name]; !ok {
266 logger.Fatal("unknown category", name)
267 }
268
269
270
271
272 varDecl := ""
273 switch name {
274 case "C":
275 varDecl = "\tOther = _C; // Other/C is the set of Unicode control and special characters, category C.\n"
276 varDecl += "\tC = _C\n"
277 case "L":
278 varDecl = "\tLetter = _L; // Letter/L is the set of Unicode letters, category L.\n"
279 varDecl += "\tL = _L\n"
280 case "M":
281 varDecl = "\tMark = _M; // Mark/M is the set of Unicode mark characters, category M.\n"
282 varDecl += "\tM = _M\n"
283 case "N":
284 varDecl = "\tNumber = _N; // Number/N is the set of Unicode number characters, category N.\n"
285 varDecl += "\tN = _N\n"
286 case "P":
287 varDecl = "\tPunct = _P; // Punct/P is the set of Unicode punctuation characters, category P.\n"
288 varDecl += "\tP = _P\n"
289 case "S":
290 varDecl = "\tSymbol = _S; // Symbol/S is the set of Unicode symbol characters, category S.\n"
291 varDecl += "\tS = _S\n"
292 case "Z":
293 varDecl = "\tSpace = _Z; // Space/Z is the set of Unicode space characters, category Z.\n"
294 varDecl += "\tZ = _Z\n"
295 case "Nd":
296 varDecl = "\tDigit = _Nd; // Digit is the set of Unicode characters with the \"decimal digit\" property.\n"
297 case "Lu":
298 varDecl = "\tUpper = _Lu; // Upper is the set of Unicode upper case letters.\n"
299 case "Ll":
300 varDecl = "\tLower = _Ll; // Lower is the set of Unicode lower case letters.\n"
301 case "Lt":
302 varDecl = "\tTitle = _Lt; // Title is the set of Unicode title case letters.\n"
303 }
304 if len(name) > 1 {
305 desc, ok := categoryMapping[name]
306 if ok {
307 varDecl += fmt.Sprintf(
308 "\t%s = _%s; // %s is the set of Unicode characters in category %s (%s).\n",
309 name, name, name, name, desc)
310 } else {
311 varDecl += fmt.Sprintf(
312 "\t%s = _%s; // %s is the set of Unicode characters in category %s.\n",
313 name, name, name, name)
314 }
315 }
316 decl[ndecl] = varDecl
317 ndecl++
318 if len(name) == 1 {
319 dumpRange(
320 "_"+name,
321 func(code rune) bool { return categoryOp(code, name[0]) })
322 continue
323 }
324 dumpRange("_"+name,
325 func(code rune) bool { return chars[code].category == name })
326 }
327 decl.Sort()
328 println("// These variables have type *RangeTable.")
329 println("var (")
330 for _, d := range decl {
331 print(d)
332 }
333 print(")\n\n")
334 }
335
// Op is a predicate over code points. dumpRange and verifyRange use it to
// decide which code points belong to a table.
type Op func(rune) bool
337
338 func dumpRange(name string, inCategory Op) {
339 runes := []rune{}
340 for i := range chars {
341 r := rune(i)
342 if inCategory(r) {
343 runes = append(runes, r)
344 }
345 }
346 printRangeTable(name, runes)
347 }
348
349 func printRangeTable(name string, runes []rune) {
350 rt := rangetable.New(runes...)
351 printf("var %s = &RangeTable{\n", name)
352 println("\tR16: []Range16{")
353 for _, r := range rt.R16 {
354 printf("\t\t{%#04x, %#04x, %d},\n", r.Lo, r.Hi, r.Stride)
355 range16Count++
356 }
357 println("\t},")
358 if len(rt.R32) > 0 {
359 println("\tR32: []Range32{")
360 for _, r := range rt.R32 {
361 printf("\t\t{%#x, %#x, %d},\n", r.Lo, r.Hi, r.Stride)
362 range32Count++
363 }
364 println("\t},")
365 }
366 if rt.LatinOffset > 0 {
367 printf("\tLatinOffset: %d,\n", rt.LatinOffset)
368 }
369 printf("}\n\n")
370 }
371
372 func fullCategoryTest(list []string) {
373 for _, name := range list {
374 if _, ok := category[name]; !ok {
375 logger.Fatal("unknown category", name)
376 }
377 r, ok := unicode.Categories[name]
378 if !ok && len(name) > 1 {
379 logger.Fatalf("unknown table %q", name)
380 }
381 if len(name) == 1 {
382 verifyRange(name, func(code rune) bool { return categoryOp(code, name[0]) }, r)
383 } else {
384 verifyRange(
385 name,
386 func(code rune) bool { return chars[code].category == name },
387 r)
388 }
389 }
390 }
391
392 func verifyRange(name string, inCategory Op, table *unicode.RangeTable) {
393 count := 0
394 for j := range chars {
395 i := rune(j)
396 web := inCategory(i)
397 pkg := unicode.Is(table, i)
398 if web != pkg {
399 fmt.Fprintf(os.Stderr, "%s: %U: web=%t pkg=%t\n", name, i, web, pkg)
400 count++
401 if count > 10 {
402 break
403 }
404 }
405 }
406 }
407
408 func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]rune) {
409 for _, name := range list {
410 if _, ok := scripts[name]; !ok {
411 logger.Fatal("unknown script", name)
412 }
413 _, ok := installed[name]
414 if !ok {
415 logger.Fatal("unknown table", name)
416 }
417 for _, r := range scripts[name] {
418 if !unicode.Is(installed[name], rune(r)) {
419 fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name)
420 }
421 }
422 }
423 }
424
// deprecatedAliases maps a table name to an additional name under which
// printScriptOrProperty also exposes that table.
var deprecatedAliases = map[string]string{
	"Sentence_Terminal": "STerm",
}
428
429
430 func printScriptOrProperty(doProps bool) {
431 flaglist := *scriptlist
432 file := "Scripts.txt"
433 table := scripts
434 installed := unicode.Scripts
435 if doProps {
436 flaglist = *proplist
437 file = "PropList.txt"
438 table = props
439 installed = unicode.Properties
440 }
441 if flaglist == "" {
442 return
443 }
444 ucd.Parse(gen.OpenUCDFile(file), func(p *ucd.Parser) {
445 name := p.String(1)
446 table[name] = append(table[name], p.Rune(0))
447 })
448
449 list := strings.Split(flaglist, ",")
450 if flaglist == "all" {
451 list = all(table)
452 }
453 if *test {
454 fullScriptTest(list, installed, table)
455 return
456 }
457
458 if flaglist == "all" {
459 if doProps {
460 println("// Properties is the set of Unicode property tables.")
461 println("var Properties = map[string] *RangeTable{")
462 } else {
463 println("// Scripts is the set of Unicode script tables.")
464 println("var Scripts = map[string] *RangeTable{")
465 }
466 for _, k := range all(table) {
467 printf("\t%q: %s,\n", k, k)
468 if alias, ok := deprecatedAliases[k]; ok {
469 printf("\t%q: %s,\n", alias, k)
470 }
471 }
472 print("}\n\n")
473 }
474
475 decl := make(sort.StringSlice, len(list)+len(deprecatedAliases))
476 ndecl := 0
477 for _, name := range list {
478 if doProps {
479 decl[ndecl] = fmt.Sprintf(
480 "\t%s = _%s;\t// %s is the set of Unicode characters with property %s.\n",
481 name, name, name, name)
482 } else {
483 decl[ndecl] = fmt.Sprintf(
484 "\t%s = _%s;\t// %s is the set of Unicode characters in script %s.\n",
485 name, name, name, name)
486 }
487 ndecl++
488 if alias, ok := deprecatedAliases[name]; ok {
489 decl[ndecl] = fmt.Sprintf(
490 "\t%[1]s = _%[2]s;\t// %[1]s is an alias for %[2]s.\n",
491 alias, name)
492 ndecl++
493 }
494 printRangeTable("_"+name, table[name])
495 }
496 decl.Sort()
497 println("// These variables have type *RangeTable.")
498 println("var (")
499 for _, d := range decl {
500 print(d)
501 }
502 print(")\n\n")
503 }
504
// Case classes assigned to a code point by getCaseState. The three cased
// values are distinct bits.
const (
	CaseUpper = 1 << iota
	CaseLower
	CaseTitle
)

// Sentinel case classes.
const (
	// CaseNone marks a loaded code point with no case class.
	CaseNone = 0
	// CaseMissing marks a slot whose recorded code point is 0.
	CaseMissing = -1
)

// caseState describes the case behaviour of one code point: its class and
// the signed distance to each of its case mappings (0 when there is none).
type caseState struct {
	point        rune
	_case        int
	deltaToUpper rune
	deltaToLower rune
	deltaToTitle rune
}

// adjacent reports whether c and d are consecutive code points that can
// share one case range. That holds when they have the same cased class and
// identical deltas, or when they have different classes and form an
// alternating upper/lower pair (see upperLowerAdjacent). The arguments may
// be given in either order.
func (c *caseState) adjacent(d *caseState) bool {
	lo, hi := c, d
	if hi.point < lo.point {
		lo, hi = hi, lo
	}
	if hi.point != lo.point+1 {
		return false
	}
	if hi._case != lo._case {
		return lo.upperLowerAdjacent(hi)
	}
	if lo._case == CaseNone || lo._case == CaseMissing {
		return false
	}
	return hi.deltaToUpper == lo.deltaToUpper &&
		hi.deltaToLower == lo.deltaToLower &&
		hi.deltaToTitle == lo.deltaToTitle
}

// upperLowerAdjacent reports whether c and d behave like neighbours in an
// alternating upper/lower sequence. An upper-case state must be paired with
// a lower-case one and vice versa. Whichever state is not the lower-case
// receiver must then have the deltas (0, +1, 0), and the other (-1, 0, -1).
func (c *caseState) upperLowerAdjacent(d *caseState) bool {
	if c._case == CaseUpper && d._case != CaseLower {
		return false
	}
	if c._case == CaseLower && d._case != CaseUpper {
		return false
	}

	// Treat the lower-case receiver as the second element of the pair.
	upper, lower := c, d
	if c._case == CaseLower {
		upper, lower = d, c
	}
	return upper.isUpperLower() && lower.isLowerUpper()
}

// isUpperLower reports whether c has the deltas of the upper-case member of
// an alternating pair: upper and title map to itself, lower is the next
// code point.
func (c *caseState) isUpperLower() bool {
	return c.deltaToUpper == 0 && c.deltaToLower == 1 && c.deltaToTitle == 0
}

// isLowerUpper reports whether c has the deltas of the lower-case member of
// an alternating pair: upper and title are the previous code point, lower
// maps to itself.
func (c *caseState) isLowerUpper() bool {
	return c.deltaToUpper == -1 && c.deltaToLower == 0 && c.deltaToTitle == -1
}
608
609 func getCaseState(i rune) (c *caseState) {
610 c = &caseState{point: i, _case: CaseNone}
611 ch := &chars[i]
612 switch ch.codePoint {
613 case 0:
614 c._case = CaseMissing
615 return
616 case ch.upperCase:
617 c._case = CaseUpper
618 case ch.lowerCase:
619 c._case = CaseLower
620 case ch.titleCase:
621 c._case = CaseTitle
622 }
623
624
625 if c._case == CaseNone && ch.lowerCase != 0 {
626 c._case = CaseUpper
627 }
628
629 if c._case == CaseNone && ch.upperCase != 0 {
630 c._case = CaseLower
631 }
632
633 if ch.upperCase != 0 {
634 c.deltaToUpper = ch.upperCase - i
635 }
636 if ch.lowerCase != 0 {
637 c.deltaToLower = ch.lowerCase - i
638 }
639 if ch.titleCase != 0 {
640 c.deltaToTitle = ch.titleCase - i
641 }
642 return
643 }
644
645 func printCases() {
646 if *test {
647 fullCaseTest()
648 return
649 }
650 printf(
651 "// CaseRanges is the table describing case mappings for all letters with\n" +
652 "// non-self mappings.\n" +
653 "var CaseRanges = _CaseRanges\n" +
654 "var _CaseRanges = []CaseRange {\n")
655
656 var startState *caseState
657 var prevState = &caseState{}
658 for i := range chars {
659 state := getCaseState(rune(i))
660 if state.adjacent(prevState) {
661 prevState = state
662 continue
663 }
664
665 printCaseRange(startState, prevState)
666 startState = nil
667 if state._case != CaseMissing && state._case != CaseNone {
668 startState = state
669 }
670 prevState = state
671 }
672 print("}\n")
673 }
674
675 func printCaseRange(lo, hi *caseState) {
676 if lo == nil {
677 return
678 }
679 if lo.deltaToUpper == 0 && lo.deltaToLower == 0 && lo.deltaToTitle == 0 {
680
681 return
682 }
683 switch {
684 case hi.point > lo.point && lo.isUpperLower():
685 printf("\t{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n",
686 lo.point, hi.point)
687 case hi.point > lo.point && lo.isLowerUpper():
688 logger.Fatalf("LowerUpper sequence: should not happen: %U. If it's real, need to fix To()", lo.point)
689 printf("\t{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n",
690 lo.point, hi.point)
691 default:
692 printf("\t{0x%04X, 0x%04X, d{%d, %d, %d}},\n",
693 lo.point, hi.point,
694 lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle)
695 }
696 }
697
698
// caseIt returns cased, or r itself when cased is 0 (no mapping recorded).
func caseIt(r, cased rune) rune {
	if cased != 0 {
		return cased
	}
	return r
}
705
706 func fullCaseTest() {
707 for j, c := range chars {
708 i := rune(j)
709 lower := unicode.ToLower(i)
710 want := caseIt(i, c.lowerCase)
711 if lower != want {
712 fmt.Fprintf(os.Stderr, "lower %U should be %U is %U\n", i, want, lower)
713 }
714 upper := unicode.ToUpper(i)
715 want = caseIt(i, c.upperCase)
716 if upper != want {
717 fmt.Fprintf(os.Stderr, "upper %U should be %U is %U\n", i, want, upper)
718 }
719 title := unicode.ToTitle(i)
720 want = caseIt(i, c.titleCase)
721 if title != want {
722 fmt.Fprintf(os.Stderr, "title %U should be %U is %U\n", i, want, title)
723 }
724 }
725 }
726
727 func printLatinProperties() {
728 if *test {
729 return
730 }
731 println("var properties = [MaxLatin1+1]uint8{")
732 for code := 0; code <= unicode.MaxLatin1; code++ {
733 var property string
734 switch chars[code].category {
735 case "Cc", "":
736 property = "pC"
737 case "Cf":
738 property = "0"
739 case "Ll":
740 property = "pLl | pp"
741 case "Lo":
742 property = "pLo | pp"
743 case "Lu":
744 property = "pLu | pp"
745 case "Nd", "No":
746 property = "pN | pp"
747 case "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps":
748 property = "pP | pp"
749 case "Sc", "Sk", "Sm", "So":
750 property = "pS | pp"
751 case "Zs":
752 property = "pZ"
753 default:
754 logger.Fatalf("%U has unknown category %q", code, chars[code].category)
755 }
756
757 if code == ' ' {
758 property = "pZ | pp"
759 }
760 printf("\t0x%02X: %s, // %q\n", code, property, code)
761 }
762 printf("}\n\n")
763 }
764
765 func printCasefold() {
766
767 var caseOrbit = make([][]rune, MaxChar+1)
768 for j := range chars {
769 i := rune(j)
770 c := &chars[i]
771 if c.foldCase == 0 {
772 continue
773 }
774 orb := caseOrbit[c.foldCase]
775 if orb == nil {
776 orb = append(orb, c.foldCase)
777 }
778 caseOrbit[c.foldCase] = append(orb, i)
779 }
780
781
782 for j := range chars {
783 i := rune(j)
784 c := &chars[i]
785 f := c.foldCase
786 if f == 0 {
787 f = i
788 }
789 orb := caseOrbit[f]
790 if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) {
791
792 caseOrbit[i] = []rune{i}
793 }
794 }
795
796
797 for i, orb := range caseOrbit {
798 if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] {
799 caseOrbit[i] = nil
800 }
801 if len(orb) == 2 && chars[orb[1]].upperCase == orb[0] && chars[orb[0]].lowerCase == orb[1] {
802 caseOrbit[i] = nil
803 }
804 }
805
806
807 for _, orb := range caseOrbit {
808 if orb == nil {
809 continue
810 }
811 sort.Slice(orb, func(i, j int) bool {
812 return orb[i] < orb[j]
813 })
814 c := orb[len(orb)-1]
815 for _, d := range orb {
816 chars[c].caseOrbit = d
817 c = d
818 }
819 }
820
821 printAsciiFold()
822 printCaseOrbit()
823
824
825
826
827 cat := make(map[string]map[rune]bool)
828 for name := range category {
829 if x := foldExceptions(inCategory(name)); len(x) > 0 {
830 cat[name] = x
831 }
832 }
833
834 scr := make(map[string]map[rune]bool)
835 for name := range scripts {
836 if x := foldExceptions(scripts[name]); len(x) > 0 {
837 scr[name] = x
838 }
839 }
840
841 printCatFold("FoldCategory", cat)
842 printCatFold("FoldScript", scr)
843 }
844
845
846 func inCategory(name string) []rune {
847 var x []rune
848 for j := range chars {
849 i := rune(j)
850 c := &chars[i]
851 if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] {
852 x = append(x, i)
853 }
854 }
855 return x
856 }
857
858
859
860 func foldExceptions(class []rune) map[rune]bool {
861
862 m := make(map[rune]bool)
863 for _, r := range class {
864 c := &chars[r]
865 if c.caseOrbit == 0 {
866
867 if u := c.upperCase; u != 0 {
868 m[u] = true
869 }
870 if l := c.lowerCase; l != 0 {
871 m[l] = true
872 }
873 m[r] = true
874 continue
875 }
876
877 r0 := r
878 for {
879 m[r] = true
880 r = chars[r].caseOrbit
881 if r == r0 {
882 break
883 }
884 }
885 }
886
887
888 for _, r := range class {
889 delete(m, r)
890 }
891
892
893 return m
894 }
895
// comment holds the doc comment that printCatFold writes ahead of each
// generated map, keyed by the map's name. Every value ends in a newline.
var comment = map[string]string{
	"FoldCategory": `// FoldCategory maps a category name to a table of
// code points outside the category that are equivalent under
// simple case folding to code points inside the category.
// If there is no entry for a category name, there are no such points.
`,

	"FoldScript": `// FoldScript maps a script name to a table of
// code points outside the script that are equivalent under
// simple case folding to code points inside the script.
// If there is no entry for a script name, there are no such points.
`,
}
907
908 func printAsciiFold() {
909 printf("var asciiFold = [MaxASCII + 1]uint16{\n")
910 for i := rune(0); i <= unicode.MaxASCII; i++ {
911 c := chars[i]
912 f := c.caseOrbit
913 if f == 0 {
914 if c.lowerCase != i && c.lowerCase != 0 {
915 f = c.lowerCase
916 } else if c.upperCase != i && c.upperCase != 0 {
917 f = c.upperCase
918 } else {
919 f = i
920 }
921 }
922 printf("\t0x%04X,\n", f)
923 }
924 printf("}\n\n")
925 }
926
927 func printCaseOrbit() {
928 if *test {
929 for j := range chars {
930 i := rune(j)
931 c := &chars[i]
932 f := c.caseOrbit
933 if f == 0 {
934 if c.lowerCase != i && c.lowerCase != 0 {
935 f = c.lowerCase
936 } else if c.upperCase != i && c.upperCase != 0 {
937 f = c.upperCase
938 } else {
939 f = i
940 }
941 }
942 if g := unicode.SimpleFold(i); g != f {
943 fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f)
944 }
945 }
946 return
947 }
948
949 printf("var caseOrbit = []foldPair{\n")
950 for i := range chars {
951 c := &chars[i]
952 if c.caseOrbit != 0 {
953 printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit)
954 foldPairCount++
955 }
956 }
957 printf("}\n\n")
958 }
959
960 func printCatFold(name string, m map[string]map[rune]bool) {
961 if *test {
962 var pkgMap map[string]*unicode.RangeTable
963 if name == "FoldCategory" {
964 pkgMap = unicode.FoldCategory
965 } else {
966 pkgMap = unicode.FoldScript
967 }
968 if len(pkgMap) != len(m) {
969 fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m))
970 return
971 }
972 for k, v := range m {
973 t, ok := pkgMap[k]
974 if !ok {
975 fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k)
976 continue
977 }
978 n := 0
979 for _, r := range t.R16 {
980 for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) {
981 if !v[c] {
982 fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
983 }
984 n++
985 }
986 }
987 for _, r := range t.R32 {
988 for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) {
989 if !v[c] {
990 fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
991 }
992 n++
993 }
994 }
995 if n != len(v) {
996 fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v))
997 }
998 }
999 return
1000 }
1001
1002 print(comment[name])
1003 printf("var %s = map[string]*RangeTable{\n", name)
1004 for _, name := range allCatFold(m) {
1005 printf("\t%q: fold%s,\n", name, name)
1006 }
1007 printf("}\n\n")
1008 for _, name := range allCatFold(m) {
1009 class := m[name]
1010 dumpRange("fold"+name, func(code rune) bool { return class[code] })
1011 }
1012 }
1013
// Counters reported by printSizes.
var (
	// range16Count is the number of Range16 entries written by
	// printRangeTable.
	range16Count int
	// range32Count is the number of Range32 entries written by
	// printRangeTable.
	range32Count int
	// foldPairCount is the number of pairs written by printCaseOrbit.
	foldPairCount int
)
1017
1018 func printSizes() {
1019 if *test {
1020 return
1021 }
1022 println()
1023 printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, range16Count+range32Count)
1024 range16Bytes := range16Count * 3 * 2
1025 range32Bytes := range32Count * 3 * 4
1026 printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes)
1027 println()
1028 printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2)
1029 }
1030
View as plain text