1
2
3
4
5 package main
6
7 import (
8 "bytes"
9 "flag"
10 "fmt"
11 "io"
12 "log"
13 "os"
14 "runtime/pprof"
15 "sort"
16 "strconv"
17 "strings"
18 "text/template"
19 "time"
20
21 "golang.org/x/text/unicode/norm"
22 )
23
24 var (
25 doNorm = flag.Bool("norm", false, "normalize input strings")
26 cases = flag.Bool("case", false, "generate case variants")
27 verbose = flag.Bool("verbose", false, "print results")
28 debug = flag.Bool("debug", false, "output debug information")
29 locales = flag.String("locale", "en_US", "the locale to use. May be a comma-separated list for some commands.")
30 col = flag.String("col", "go", "collator to test")
31 gold = flag.String("gold", "go", "collator used as the gold standard")
32 usecmp = flag.Bool("usecmp", false,
33 `use comparison instead of sort keys when sorting. Must be "test", "gold" or "both"`)
34 cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")
35 exclude = flag.String("exclude", "", "exclude errors that contain any of the characters")
36 limit = flag.Int("limit", 5000000, "maximum number of samples to generate for one run")
37 )
38
39 func failOnError(err error) {
40 if err != nil {
41 log.Panic(err)
42 }
43 }
44
45
46
47 type Test struct {
48 ctxt *Context
49 Name string
50 Locale string
51 ColName string
52
53 Col Collator
54 UseCompare bool
55
56 Input []Input
57 Duration time.Duration
58
59 start time.Time
60 msg string
61 count int
62 }
63
64 func (t *Test) clear() {
65 t.Col = nil
66 t.Input = nil
67 }
68
69 const (
70 msgGeneratingInput = "generating input"
71 msgGeneratingKeys = "generating keys"
72 msgSorting = "sorting"
73 )
74
75 var lastLen = 0
76
77 func (t *Test) SetStatus(msg string) {
78 if *debug || *verbose {
79 fmt.Printf("%s: %s...\n", t.Name, msg)
80 } else if t.ctxt.out != nil {
81 fmt.Fprint(t.ctxt.out, strings.Repeat(" ", lastLen))
82 fmt.Fprint(t.ctxt.out, strings.Repeat("\b", lastLen))
83 fmt.Fprint(t.ctxt.out, msg, "...")
84 lastLen = len(msg) + 3
85 fmt.Fprint(t.ctxt.out, strings.Repeat("\b", lastLen))
86 }
87 }
88
89
90 func (t *Test) Start(msg string) {
91 t.SetStatus(msg)
92 t.count = 0
93 t.msg = msg
94 t.start = time.Now()
95 }
96
97
98 func (t *Test) Stop() (time.Duration, int) {
99 d := time.Now().Sub(t.start)
100 t.Duration += d
101 if *debug || *verbose {
102 fmt.Printf("%s: %s done. (%.3fs /%dK ops)\n", t.Name, t.msg, d.Seconds(), t.count/1000)
103 }
104 return d, t.count
105 }
106
107
108 func (t *Test) generateKeys() {
109 for i, s := range t.Input {
110 b := t.Col.Key(s)
111 t.Input[i].key = b
112 if *debug {
113 fmt.Printf("%s (%X): %X\n", string(s.UTF8), s.UTF16, b)
114 }
115 }
116 }
117
118
119
120 func (t *Test) Sort() (tkey, tsort time.Duration, nkey, nsort int) {
121 if *cpuprofile != "" {
122 f, err := os.Create(*cpuprofile)
123 failOnError(err)
124 pprof.StartCPUProfile(f)
125 defer pprof.StopCPUProfile()
126 }
127 if t.UseCompare || t.Col.Key(t.Input[0]) == nil {
128 t.Start(msgSorting)
129 sort.Sort(&testCompare{*t})
130 tsort, nsort = t.Stop()
131 } else {
132 t.Start(msgGeneratingKeys)
133 t.generateKeys()
134 t.count = len(t.Input)
135 tkey, nkey = t.Stop()
136 t.Start(msgSorting)
137 sort.Sort(t)
138 tsort, nsort = t.Stop()
139 }
140 return
141 }
142
143 func (t *Test) Swap(a, b int) {
144 t.Input[a], t.Input[b] = t.Input[b], t.Input[a]
145 }
146
147 func (t *Test) Less(a, b int) bool {
148 t.count++
149 return bytes.Compare(t.Input[a].key, t.Input[b].key) == -1
150 }
151
152 func (t Test) Len() int {
153 return len(t.Input)
154 }
155
156 type testCompare struct {
157 Test
158 }
159
160 func (t *testCompare) Less(a, b int) bool {
161 t.count++
162 return t.Col.Compare(t.Input[a], t.Input[b]) == -1
163 }
164
165 type testRestore struct {
166 Test
167 }
168
169 func (t *testRestore) Less(a, b int) bool {
170 return t.Input[a].index < t.Input[b].index
171 }
172
173
174 func (t *Test) GenerateInput() {
175 t.Input = nil
176 if t.ctxt.lastLocale != t.Locale {
177 gen := phraseGenerator{}
178 gen.init(t.Locale)
179 t.SetStatus(msgGeneratingInput)
180 t.ctxt.lastInput = nil
181 t.Input = gen.generate(*doNorm)
182 t.ctxt.lastInput = t.Input
183 t.ctxt.lastLocale = t.Locale
184 } else {
185 t.Input = t.ctxt.lastInput
186 for i := range t.Input {
187 t.Input[i].key = nil
188 }
189 sort.Sort(&testRestore{*t})
190 }
191 }
192
193
194 type Context struct {
195 test []*Test
196 last *Test
197
198 lastLocale string
199 lastInput []Input
200
201 out io.Writer
202 }
203
204 func (ts *Context) Printf(format string, a ...interface{}) {
205 ts.assertBuf()
206 fmt.Fprintf(ts.out, format, a...)
207 }
208
209 func (ts *Context) Print(a ...interface{}) {
210 ts.assertBuf()
211 fmt.Fprint(ts.out, a...)
212 }
213
214
215
216
217
218 func (ts *Context) assertBuf() {
219 if ts.out != nil {
220 return
221 }
222 if *debug || *verbose {
223 ts.out = &bytes.Buffer{}
224 } else {
225 ts.out = os.Stdout
226 }
227 }
228
229
230 func (ts *Context) flush() {
231 if ts.out != nil {
232 if _, ok := ts.out.(io.ReadCloser); !ok {
233 io.Copy(os.Stdout, ts.out.(io.Reader))
234 }
235 }
236 }
237
238
239
240 func parseTests() *Context {
241 ctxt := &Context{}
242 colls := strings.Split(*col, ",")
243 for _, loc := range strings.Split(*locales, ",") {
244 loc = strings.TrimSpace(loc)
245 for _, name := range colls {
246 name = strings.TrimSpace(name)
247 col := getCollator(name, loc)
248 ctxt.test = append(ctxt.test, &Test{
249 ctxt: ctxt,
250 Locale: loc,
251 ColName: name,
252 UseCompare: *usecmp,
253 Col: col,
254 })
255 }
256 }
257 return ctxt
258 }
259
260 func (c *Context) Len() int {
261 return len(c.test)
262 }
263
264 func (c *Context) Test(i int) *Test {
265 if c.last != nil {
266 c.last.clear()
267 }
268 c.last = c.test[i]
269 return c.last
270 }
271
272 func parseInput(args []string) []Input {
273 input := []Input{}
274 for _, s := range args {
275 rs := []rune{}
276 for len(s) > 0 {
277 var r rune
278 r, _, s, _ = strconv.UnquoteChar(s, '\'')
279 rs = append(rs, r)
280 }
281 s = string(rs)
282 if *doNorm {
283 s = norm.NFD.String(s)
284 }
285 input = append(input, makeInputString(s))
286 }
287 return input
288 }
289
290
291 type Command struct {
292 Run func(cmd *Context, args []string)
293 Usage string
294 Short string
295 Long string
296 }
297
298 func (cmd Command) Name() string {
299 return strings.SplitN(cmd.Usage, " ", 2)[0]
300 }
301
302 var commands = []*Command{
303 cmdSort,
304 cmdBench,
305 cmdRegress,
306 }
307
308 const sortHelp = `
309 Sort sorts a given list of strings. Strings are separated by whitespace.
310 `
311
312 var cmdSort = &Command{
313 Run: runSort,
314 Usage: "sort <string>*",
315 Short: "sort a given list of strings",
316 Long: sortHelp,
317 }
318
319 func runSort(ctxt *Context, args []string) {
320 input := parseInput(args)
321 if len(input) == 0 {
322 log.Fatalf("Nothing to sort.")
323 }
324 if ctxt.Len() > 1 {
325 ctxt.Print("COLL LOCALE RESULT\n")
326 }
327 for i := 0; i < ctxt.Len(); i++ {
328 t := ctxt.Test(i)
329 t.Input = append(t.Input, input...)
330 t.Sort()
331 if ctxt.Len() > 1 {
332 ctxt.Printf("%-5s %-5s ", t.ColName, t.Locale)
333 }
334 for _, s := range t.Input {
335 ctxt.Print(string(s.UTF8), " ")
336 }
337 ctxt.Print("\n")
338 }
339 }
340
341 const benchHelp = `
342 Bench runs a benchmark for the given list of collator implementations.
343 If no collator implementations are given, the go collator will be used.
344 `
345
346 var cmdBench = &Command{
347 Run: runBench,
348 Usage: "bench",
349 Short: "benchmark a given list of collator implementations",
350 Long: benchHelp,
351 }
352
353 func runBench(ctxt *Context, args []string) {
354 ctxt.Printf("%-7s %-5s %-6s %-24s %-24s %-5s %s\n", "LOCALE", "COLL", "N", "KEYS", "SORT", "AVGLN", "TOTAL")
355 for i := 0; i < ctxt.Len(); i++ {
356 t := ctxt.Test(i)
357 ctxt.Printf("%-7s %-5s ", t.Locale, t.ColName)
358 t.GenerateInput()
359 ctxt.Printf("%-6s ", fmt.Sprintf("%dK", t.Len()/1000))
360 tkey, tsort, nkey, nsort := t.Sort()
361 p := func(dur time.Duration, n int) {
362 s := ""
363 if dur > 0 {
364 s = fmt.Sprintf("%6.3fs ", dur.Seconds())
365 if n > 0 {
366 s += fmt.Sprintf("%15s", fmt.Sprintf("(%4.2f ns/op)", float64(dur)/float64(n)))
367 }
368 }
369 ctxt.Printf("%-24s ", s)
370 }
371 p(tkey, nkey)
372 p(tsort, nsort)
373
374 total := 0
375 for _, s := range t.Input {
376 total += len(s.key)
377 }
378 ctxt.Printf("%-5d ", total/t.Len())
379 ctxt.Printf("%6.3fs\n", t.Duration.Seconds())
380 if *debug {
381 for _, s := range t.Input {
382 fmt.Print(string(s.UTF8), " ")
383 }
384 fmt.Println()
385 }
386 }
387 }
388
389 const regressHelp = `
390 Regress runs a monkey test by comparing the results of randomly generated tests
391 between two implementations of a collator. The user may optionally pass a list
392 of strings to regress against instead of the default test set.
393 `
394
395 var cmdRegress = &Command{
396 Run: runRegress,
397 Usage: "regress -gold=<col> -test=<col> [string]*",
398 Short: "run a monkey test between two collators",
399 Long: regressHelp,
400 }
401
402 const failedKeyCompare = `
403 %s:%d: incorrect comparison result for input:
404 a: %q (%.4X)
405 key: %s
406 b: %q (%.4X)
407 key: %s
408 Compare(a, b) = %d; want %d.
409
410 gold keys:
411 a: %s
412 b: %s
413 `
414
415 const failedCompare = `
416 %s:%d: incorrect comparison result for input:
417 a: %q (%.4X)
418 b: %q (%.4X)
419 Compare(a, b) = %d; want %d.
420 `
421
422 func keyStr(b []byte) string {
423 buf := &bytes.Buffer{}
424 for _, v := range b {
425 fmt.Fprintf(buf, "%.2X ", v)
426 }
427 return buf.String()
428 }
429
430 func runRegress(ctxt *Context, args []string) {
431 input := parseInput(args)
432 for i := 0; i < ctxt.Len(); i++ {
433 t := ctxt.Test(i)
434 if len(input) > 0 {
435 t.Input = append(t.Input, input...)
436 } else {
437 t.GenerateInput()
438 }
439 t.Sort()
440 count := 0
441 gold := getCollator(*gold, t.Locale)
442 for i := 1; i < len(t.Input); i++ {
443 ia := t.Input[i-1]
444 ib := t.Input[i]
445 if bytes.IndexAny(ib.UTF8, *exclude) != -1 {
446 i++
447 continue
448 }
449 if bytes.IndexAny(ia.UTF8, *exclude) != -1 {
450 continue
451 }
452 goldCmp := gold.Compare(ia, ib)
453 if cmp := bytes.Compare(ia.key, ib.key); cmp != goldCmp {
454 count++
455 a := string(ia.UTF8)
456 b := string(ib.UTF8)
457 fmt.Printf(failedKeyCompare, t.Locale, i-1, a, []rune(a), keyStr(ia.key), b, []rune(b), keyStr(ib.key), cmp, goldCmp, keyStr(gold.Key(ia)), keyStr(gold.Key(ib)))
458 } else if cmp := t.Col.Compare(ia, ib); cmp != goldCmp {
459 count++
460 a := string(ia.UTF8)
461 b := string(ib.UTF8)
462 fmt.Printf(failedCompare, t.Locale, i-1, a, []rune(a), b, []rune(b), cmp, goldCmp)
463 }
464 }
465 if count > 0 {
466 ctxt.Printf("Found %d inconsistencies in %d entries.\n", count, t.Len()-1)
467 }
468 }
469 }
470
471 const helpTemplate = `
472 colcmp is a tool for testing and benchmarking collation
473
474 Usage: colcmp command [arguments]
475
476 The commands are:
477 {{range .}}
478 {{.Name | printf "%-11s"}} {{.Short}}{{end}}
479
480 Use "col help [topic]" for more information about that topic.
481 `
482
483 const detailedHelpTemplate = `
484 Usage: colcmp {{.Usage}}
485
486 {{.Long | trim}}
487 `
488
489 func runHelp(args []string) {
490 t := template.New("help")
491 t.Funcs(template.FuncMap{"trim": strings.TrimSpace})
492 if len(args) < 1 {
493 template.Must(t.Parse(helpTemplate))
494 failOnError(t.Execute(os.Stderr, &commands))
495 } else {
496 for _, cmd := range commands {
497 if cmd.Name() == args[0] {
498 template.Must(t.Parse(detailedHelpTemplate))
499 failOnError(t.Execute(os.Stderr, cmd))
500 os.Exit(0)
501 }
502 }
503 log.Fatalf("Unknown command %q. Run 'colcmp help'.", args[0])
504 }
505 os.Exit(0)
506 }
507
508 func main() {
509 flag.Parse()
510 log.SetFlags(0)
511
512 ctxt := parseTests()
513
514 if flag.NArg() < 1 {
515 runHelp(nil)
516 }
517 args := flag.Args()[1:]
518 if flag.Arg(0) == "help" {
519 runHelp(args)
520 }
521 for _, cmd := range commands {
522 if cmd.Name() == flag.Arg(0) {
523 cmd.Run(ctxt, args)
524 ctxt.flush()
525 return
526 }
527 }
528 runHelp(flag.Args())
529 }
530
View as plain text