...
1
2
3
4
5 package main
6
7 import (
8 "math"
9 "math/rand"
10 "strings"
11 "unicode"
12 "unicode/utf16"
13 "unicode/utf8"
14
15 "golang.org/x/text/language"
16 "golang.org/x/text/unicode/norm"
17 )
18
19
20
21
// parent returns the next more general locale identifier for locale.
//
// The final "-"-separated subtag is dropped when there is one; an identifier
// without a "-" falls back to "und". The root identifier "und" has no parent,
// in which case ok is false and the returned string is empty.
func parent(locale string) (parent string, ok bool) {
	switch cut := strings.LastIndex(locale, "-"); {
	case locale == "und":
		// The root of the hierarchy: nothing more general exists.
		return "", false
	case cut != -1:
		return locale[:cut], true
	default:
		return "und", true
	}
}
31
32
33
// rewriter filters out strings it has already emitted and, when addCases is
// set, adds variants of each string whose first rune is replaced by the other
// members of its simple case-folding orbit.
type rewriter struct {
	// seen holds every string that insert has appended so far.
	seen map[string]bool
	// addCases makes rewrite emit case variants in addition to the input.
	addCases bool
}

// newRewriter returns a rewriter with an empty seen set and addCases disabled.
func newRewriter() *rewriter {
	return &rewriter{
		seen: make(map[string]bool),
	}
}

// insert appends s to a and returns the result, unless s has already been
// inserted through this rewriter, in which case a is returned unchanged.
func (r *rewriter) insert(a []string, s string) []string {
	if !r.seen[s] {
		r.seen[s] = true
		a = append(a, s)
	}
	return a
}

// rewrite returns the elements of ss that this rewriter has not emitted
// before, preserving their order. When addCases is set, each element is
// followed by the not-yet-seen variants obtained by replacing its first rune
// with the other runes of that rune's simple case-folding orbit.
//
// The seen set persists across calls, so a string returned by one call is
// never returned again by a later call on the same rewriter. The result is
// always non-nil.
func (r *rewriter) rewrite(ss []string) []string {
	ns := make([]string, 0, len(ss))
	for _, s := range ss {
		ns = r.insert(ns, s)
		if !r.addCases || s == "" {
			// An empty string has no first rune to fold; indexing it
			// would panic, so only the string itself is recorded.
			continue
		}
		rs := []rune(s)
		rn := rs[0]
		// unicode.SimpleFold steps through the folding orbit and eventually
		// wraps around to rn, which ends the loop.
		for c := unicode.SimpleFold(rn); c != rn; c = unicode.SimpleFold(c) {
			rs[0] = c
			ns = r.insert(ns, string(rs))
		}
	}
	return ns
}
70
71
72 type exemplarySet struct {
73 typ exemplarType
74 set []string
75 charIndex int
76 }
77
78 type phraseGenerator struct {
79 sets [exN]exemplarySet
80 n int
81 }
82
83 func (g *phraseGenerator) init(id string) {
84 ec := exemplarCharacters
85 loc := language.Make(id).String()
86
87 for i := range g.sets {
88 for p, ok := loc, true; ok; p, ok = parent(p) {
89 if set, ok := ec[p]; ok && set[i] != "" {
90 g.sets[i].set = strings.Split(set[i], " ")
91 break
92 }
93 }
94 }
95 r := newRewriter()
96 r.addCases = *cases
97 for i := range g.sets {
98 g.sets[i].set = r.rewrite(g.sets[i].set)
99 }
100
101 for i, set := range g.sets {
102 g.n += len(set.set)
103 g.sets[i].charIndex = g.n
104 }
105 }
106
107
108 func (g *phraseGenerator) phrase(i int) string {
109 for _, set := range g.sets {
110 if i < set.charIndex {
111 return set.set[i-(set.charIndex-len(set.set))]
112 }
113 }
114 panic("index out of range")
115 }
116
117
118
119
120
121 func (g *phraseGenerator) generate(doNorm bool) []Input {
122 const (
123 M = 1024 * 1024
124 buf8Size = 30 * M
125 buf16Size = 10 * M
126 )
127
128 if sq := int(math.Sqrt(float64(*limit))); g.n > sq {
129 g.n = sq
130 }
131 size := g.n * g.n
132 a := make([]Input, 0, size)
133 buf8 := make([]byte, 0, buf8Size)
134 buf16 := make([]uint16, 0, buf16Size)
135
136 addInput := func(str string) {
137 buf8 = buf8[len(buf8):]
138 buf16 = buf16[len(buf16):]
139 if len(str) > cap(buf8) {
140 buf8 = make([]byte, 0, buf8Size)
141 }
142 if len(str) > cap(buf16) {
143 buf16 = make([]uint16, 0, buf16Size)
144 }
145 if doNorm {
146 buf8 = norm.NFD.AppendString(buf8, str)
147 } else {
148 buf8 = append(buf8, str...)
149 }
150 buf16 = appendUTF16(buf16, buf8)
151 a = append(a, makeInput(buf8, buf16))
152 }
153 for i := 0; i < g.n; i++ {
154 p1 := g.phrase(i)
155 addInput(p1)
156 for j := 0; j < g.n; j++ {
157 p2 := g.phrase(j)
158 addInput(p1 + p2)
159 }
160 }
161
162 rnd := rand.New(rand.NewSource(int64(rand.Int())))
163 for i := range a {
164 j := i + rnd.Intn(len(a)-i)
165 a[i], a[j] = a[j], a[i]
166 a[i].index = i
167 }
168 return a
169 }
170
// appendUTF16 decodes s as UTF-8 and appends its UTF-16 encoding to buf,
// returning the extended slice. Runes that need a surrogate pair contribute
// two code units; all other runes contribute one. Invalid UTF-8 bytes are
// each decoded as U+FFFD and appended as that single code unit.
func appendUTF16(buf []uint16, s []byte) []uint16 {
	// utf16.EncodeRune reports this value for runes that do not need a
	// surrogate pair.
	const noPair = 0xFFFD

	for rest := s; len(rest) > 0; {
		r, width := utf8.DecodeRune(rest)
		rest = rest[width:]

		hi, lo := utf16.EncodeRune(r)
		if hi == noPair {
			// The rune fits in a single 16-bit code unit.
			buf = append(buf, uint16(r))
			continue
		}
		buf = append(buf, uint16(hi), uint16(lo))
	}
	return buf
}
184
View as plain text