colcmp.go

Documentation: golang.org/x/text/collate/tools/colcmp

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package main // import "golang.org/x/text/collate/tools/colcmp"
     6  
     7  import (
     8  	"bytes"
     9  	"flag"
    10  	"fmt"
    11  	"io"
    12  	"log"
    13  	"os"
    14  	"runtime/pprof"
    15  	"sort"
    16  	"strconv"
    17  	"strings"
    18  	"text/template"
    19  	"time"
    20  
    21  	"golang.org/x/text/unicode/norm"
    22  )
    23  
    24  var (
    25  	doNorm  = flag.Bool("norm", false, "normalize input strings")
    26  	cases   = flag.Bool("case", false, "generate case variants")
    27  	verbose = flag.Bool("verbose", false, "print results")
    28  	debug   = flag.Bool("debug", false, "output debug information")
    29  	locales = flag.String("locale", "en_US", "the locale to use. May be a comma-separated list for some commands.")
    30  	col     = flag.String("col", "go", "collator to test")
    31  	gold    = flag.String("gold", "go", "collator used as the gold standard")
    32  	usecmp  = flag.Bool("usecmp", false,
    33  		`use comparison instead of sort keys when sorting.  Must be "test", "gold" or "both"`)
    34  	cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")
    35  	exclude    = flag.String("exclude", "", "exclude errors that contain any of the characters")
    36  	limit      = flag.Int("limit", 5000000, "maximum number of samples to generate for one run")
    37  )
    38  
    39  func failOnError(err error) {
    40  	if err != nil {
    41  		log.Panic(err)
    42  	}
    43  }
    44  
// Test holds test data for testing a locale-collator pair.
// Test also provides functionality that is commonly used by the various commands.
type Test struct {
	ctxt    *Context // owning Context; supplies the status writer and the cached input
	Name    string   // label printed in debug/verbose status lines
	Locale  string   // locale handed to getCollator and to the phrase generator
	ColName string   // collator name as given on the command line

	Col        Collator // collator under test
	UseCompare bool     // sort with Col.Compare instead of generated sort keys

	Input    []Input       // strings to be sorted
	Duration time.Duration // total time accumulated by successive Stop calls

	start time.Time // time at which the current operation was started
	msg   string    // description of the current operation
	count int       // operations counted since the last Start
}
    63  
    64  func (t *Test) clear() {
    65  	t.Col = nil
    66  	t.Input = nil
    67  }
    68  
// Status messages passed to Test.SetStatus and Test.Start.
const (
	msgGeneratingInput = "generating input"
	msgGeneratingKeys  = "generating keys"
	msgSorting         = "sorting"
)
    74  
    75  var lastLen = 0
    76  
    77  func (t *Test) SetStatus(msg string) {
    78  	if *debug || *verbose {
    79  		fmt.Printf("%s: %s...\n", t.Name, msg)
    80  	} else if t.ctxt.out != nil {
    81  		fmt.Fprint(t.ctxt.out, strings.Repeat(" ", lastLen))
    82  		fmt.Fprint(t.ctxt.out, strings.Repeat("\b", lastLen))
    83  		fmt.Fprint(t.ctxt.out, msg, "...")
    84  		lastLen = len(msg) + 3
    85  		fmt.Fprint(t.ctxt.out, strings.Repeat("\b", lastLen))
    86  	}
    87  }
    88  
    89  // Start is used by commands to signal the start of an operation.
    90  func (t *Test) Start(msg string) {
    91  	t.SetStatus(msg)
    92  	t.count = 0
    93  	t.msg = msg
    94  	t.start = time.Now()
    95  }
    96  
    97  // Stop is used by commands to signal the end of an operation.
    98  func (t *Test) Stop() (time.Duration, int) {
    99  	d := time.Now().Sub(t.start)
   100  	t.Duration += d
   101  	if *debug || *verbose {
   102  		fmt.Printf("%s: %s done. (%.3fs /%dK ops)\n", t.Name, t.msg, d.Seconds(), t.count/1000)
   103  	}
   104  	return d, t.count
   105  }
   106  
   107  // generateKeys generates sort keys for all the inputs.
   108  func (t *Test) generateKeys() {
   109  	for i, s := range t.Input {
   110  		b := t.Col.Key(s)
   111  		t.Input[i].key = b
   112  		if *debug {
   113  			fmt.Printf("%s (%X): %X\n", string(s.UTF8), s.UTF16, b)
   114  		}
   115  	}
   116  }
   117  
   118  // Sort sorts the inputs. It generates sort keys if this is required by the
   119  // chosen sort method.
   120  func (t *Test) Sort() (tkey, tsort time.Duration, nkey, nsort int) {
   121  	if *cpuprofile != "" {
   122  		f, err := os.Create(*cpuprofile)
   123  		failOnError(err)
   124  		pprof.StartCPUProfile(f)
   125  		defer pprof.StopCPUProfile()
   126  	}
   127  	if t.UseCompare || t.Col.Key(t.Input[0]) == nil {
   128  		t.Start(msgSorting)
   129  		sort.Sort(&testCompare{*t})
   130  		tsort, nsort = t.Stop()
   131  	} else {
   132  		t.Start(msgGeneratingKeys)
   133  		t.generateKeys()
   134  		t.count = len(t.Input)
   135  		tkey, nkey = t.Stop()
   136  		t.Start(msgSorting)
   137  		sort.Sort(t)
   138  		tsort, nsort = t.Stop()
   139  	}
   140  	return
   141  }
   142  
   143  func (t *Test) Swap(a, b int) {
   144  	t.Input[a], t.Input[b] = t.Input[b], t.Input[a]
   145  }
   146  
   147  func (t *Test) Less(a, b int) bool {
   148  	t.count++
   149  	return bytes.Compare(t.Input[a].key, t.Input[b].key) == -1
   150  }
   151  
   152  func (t Test) Len() int {
   153  	return len(t.Input)
   154  }
   155  
// testCompare embeds a Test by value and overrides Less so that sorting
// uses the collator's Compare method instead of sort keys.
type testCompare struct {
	Test
}
   159  
   160  func (t *testCompare) Less(a, b int) bool {
   161  	t.count++
   162  	return t.Col.Compare(t.Input[a], t.Input[b]) == -1
   163  }
   164  
// testRestore embeds a Test by value and overrides Less so that sorting
// orders the inputs by their index field.
type testRestore struct {
	Test
}
   168  
   169  func (t *testRestore) Less(a, b int) bool {
   170  	return t.Input[a].index < t.Input[b].index
   171  }
   172  
   173  // GenerateInput generates input phrases for the locale tested by t.
   174  func (t *Test) GenerateInput() {
   175  	t.Input = nil
   176  	if t.ctxt.lastLocale != t.Locale {
   177  		gen := phraseGenerator{}
   178  		gen.init(t.Locale)
   179  		t.SetStatus(msgGeneratingInput)
   180  		t.ctxt.lastInput = nil // allow the previous value to be garbage collected.
   181  		t.Input = gen.generate(*doNorm)
   182  		t.ctxt.lastInput = t.Input
   183  		t.ctxt.lastLocale = t.Locale
   184  	} else {
   185  		t.Input = t.ctxt.lastInput
   186  		for i := range t.Input {
   187  			t.Input[i].key = nil
   188  		}
   189  		sort.Sort(&testRestore{*t})
   190  	}
   191  }
   192  
// Context holds all tests and settings translated from command line options.
type Context struct {
	test []*Test // all tests, as created by parseTests
	last *Test   // test most recently returned by the Test method

	lastLocale string  // locale for which lastInput was generated
	lastInput  []Input // input cached by Test.GenerateInput

	out io.Writer // output destination; nil until assertBuf sets it
}
   203  
   204  func (ts *Context) Printf(format string, a ...interface{}) {
   205  	ts.assertBuf()
   206  	fmt.Fprintf(ts.out, format, a...)
   207  }
   208  
   209  func (ts *Context) Print(a ...interface{}) {
   210  	ts.assertBuf()
   211  	fmt.Fprint(ts.out, a...)
   212  }
   213  
   214  // assertBuf sets up an io.Writer for output, if it doesn't already exist.
   215  // In debug and verbose mode, output is buffered so that the regular output
   216  // will not interfere with the additional output.  Otherwise, output is
   217  // written directly to stdout for a more responsive feel.
   218  func (ts *Context) assertBuf() {
   219  	if ts.out != nil {
   220  		return
   221  	}
   222  	if *debug || *verbose {
   223  		ts.out = &bytes.Buffer{}
   224  	} else {
   225  		ts.out = os.Stdout
   226  	}
   227  }
   228  
   229  // flush flushes the contents of ts.out to stdout, if it is not stdout already.
   230  func (ts *Context) flush() {
   231  	if ts.out != nil {
   232  		if _, ok := ts.out.(io.ReadCloser); !ok {
   233  			io.Copy(os.Stdout, ts.out.(io.Reader))
   234  		}
   235  	}
   236  }
   237  
   238  // parseTests creates all tests from command lines and returns
   239  // a Context to hold them.
   240  func parseTests() *Context {
   241  	ctxt := &Context{}
   242  	colls := strings.Split(*col, ",")
   243  	for _, loc := range strings.Split(*locales, ",") {
   244  		loc = strings.TrimSpace(loc)
   245  		for _, name := range colls {
   246  			name = strings.TrimSpace(name)
   247  			col := getCollator(name, loc)
   248  			ctxt.test = append(ctxt.test, &Test{
   249  				ctxt:       ctxt,
   250  				Locale:     loc,
   251  				ColName:    name,
   252  				UseCompare: *usecmp,
   253  				Col:        col,
   254  			})
   255  		}
   256  	}
   257  	return ctxt
   258  }
   259  
   260  func (c *Context) Len() int {
   261  	return len(c.test)
   262  }
   263  
   264  func (c *Context) Test(i int) *Test {
   265  	if c.last != nil {
   266  		c.last.clear()
   267  	}
   268  	c.last = c.test[i]
   269  	return c.last
   270  }
   271  
   272  func parseInput(args []string) []Input {
   273  	input := []Input{}
   274  	for _, s := range args {
   275  		rs := []rune{}
   276  		for len(s) > 0 {
   277  			var r rune
   278  			r, _, s, _ = strconv.UnquoteChar(s, '\'')
   279  			rs = append(rs, r)
   280  		}
   281  		s = string(rs)
   282  		if *doNorm {
   283  			s = norm.NFD.String(s)
   284  		}
   285  		input = append(input, makeInputString(s))
   286  	}
   287  	return input
   288  }
   289  
// A Command is an implementation of a colcmp command.
type Command struct {
	// Run executes the command with the parsed tests and the remaining
	// command-line arguments.
	Run func(cmd *Context, args []string)
	// Usage is the usage line; its first word is the command name.
	Usage string
	// Short is the one-line description shown in the command list.
	Short string
	// Long is the detailed description shown in the command's help.
	Long string
}
   297  
   298  func (cmd Command) Name() string {
   299  	return strings.SplitN(cmd.Usage, " ", 2)[0]
   300  }
   301  
// commands lists the available commands; main and runHelp look commands up
// in it by name.
var commands = []*Command{
	cmdSort,
	cmdBench,
	cmdRegress,
}
   307  
// sortHelp is the long help text of the sort command.
const sortHelp = `
Sort sorts a given list of strings.  Strings are separated by whitespace.
`

// cmdSort describes the sort command, which is implemented by runSort.
var cmdSort = &Command{
	Run:   runSort,
	Usage: "sort <string>*",
	Short: "sort a given list of strings",
	Long:  sortHelp,
}
   318  
   319  func runSort(ctxt *Context, args []string) {
   320  	input := parseInput(args)
   321  	if len(input) == 0 {
   322  		log.Fatalf("Nothing to sort.")
   323  	}
   324  	if ctxt.Len() > 1 {
   325  		ctxt.Print("COLL  LOCALE RESULT\n")
   326  	}
   327  	for i := 0; i < ctxt.Len(); i++ {
   328  		t := ctxt.Test(i)
   329  		t.Input = append(t.Input, input...)
   330  		t.Sort()
   331  		if ctxt.Len() > 1 {
   332  			ctxt.Printf("%-5s %-5s  ", t.ColName, t.Locale)
   333  		}
   334  		for _, s := range t.Input {
   335  			ctxt.Print(string(s.UTF8), " ")
   336  		}
   337  		ctxt.Print("\n")
   338  	}
   339  }
   340  
// benchHelp is the long help text of the bench command.
const benchHelp = `
Bench runs a benchmark for the given list of collator implementations.
If no collator implementations are given, the go collator will be used.
`

// cmdBench describes the bench command, which is implemented by runBench.
var cmdBench = &Command{
	Run:   runBench,
	Usage: "bench",
	Short: "benchmark a given list of collator implementations",
	Long:  benchHelp,
}
   352  
   353  func runBench(ctxt *Context, args []string) {
   354  	ctxt.Printf("%-7s %-5s %-6s %-24s %-24s %-5s %s\n", "LOCALE", "COLL", "N", "KEYS", "SORT", "AVGLN", "TOTAL")
   355  	for i := 0; i < ctxt.Len(); i++ {
   356  		t := ctxt.Test(i)
   357  		ctxt.Printf("%-7s %-5s ", t.Locale, t.ColName)
   358  		t.GenerateInput()
   359  		ctxt.Printf("%-6s ", fmt.Sprintf("%dK", t.Len()/1000))
   360  		tkey, tsort, nkey, nsort := t.Sort()
   361  		p := func(dur time.Duration, n int) {
   362  			s := ""
   363  			if dur > 0 {
   364  				s = fmt.Sprintf("%6.3fs ", dur.Seconds())
   365  				if n > 0 {
   366  					s += fmt.Sprintf("%15s", fmt.Sprintf("(%4.2f ns/op)", float64(dur)/float64(n)))
   367  				}
   368  			}
   369  			ctxt.Printf("%-24s ", s)
   370  		}
   371  		p(tkey, nkey)
   372  		p(tsort, nsort)
   373  
   374  		total := 0
   375  		for _, s := range t.Input {
   376  			total += len(s.key)
   377  		}
   378  		ctxt.Printf("%-5d ", total/t.Len())
   379  		ctxt.Printf("%6.3fs\n", t.Duration.Seconds())
   380  		if *debug {
   381  			for _, s := range t.Input {
   382  				fmt.Print(string(s.UTF8), " ")
   383  			}
   384  			fmt.Println()
   385  		}
   386  	}
   387  }
   388  
// regressHelp is the long help text of the regress command.
const regressHelp = `
Regress runs a monkey test by comparing the results of randomly generated tests
between two implementations of a collator. The user may optionally pass a list
of strings to regress against instead of the default test set.
`
   394  
   395  var cmdRegress = &Command{
   396  	Run:   runRegress,
   397  	Usage: "regress -gold=<col> -test=<col> [string]*",
   398  	Short: "run a monkey test between two collators",
   399  	Long:  regressHelp,
   400  }
   401  
// failedKeyCompare is the report format used by runRegress when comparing
// the sort keys of two inputs disagrees with the gold collator. Its
// arguments are, in order: locale, index, a, runes of a, key of a, b, runes
// of b, key of b, the result obtained, the result wanted, and the gold keys
// of a and b.
const failedKeyCompare = `
%s:%d: incorrect comparison result for input:
    a:   %q (%.4X)
    key: %s
    b:   %q (%.4X)
    key: %s
    Compare(a, b) = %d; want %d.

  gold keys:
	a:   %s
	b:   %s
`
   414  
// failedCompare is the report format used by runRegress when the tested
// collator's Compare disagrees with the gold collator. Its arguments are, in
// order: locale, index, a, runes of a, b, runes of b, the result obtained
// and the result wanted.
const failedCompare = `
%s:%d: incorrect comparison result for input:
    a:   %q (%.4X)
    b:   %q (%.4X)
    Compare(a, b) = %d; want %d.
`
   421  
   422  func keyStr(b []byte) string {
   423  	buf := &bytes.Buffer{}
   424  	for _, v := range b {
   425  		fmt.Fprintf(buf, "%.2X ", v)
   426  	}
   427  	return buf.String()
   428  }
   429  
   430  func runRegress(ctxt *Context, args []string) {
   431  	input := parseInput(args)
   432  	for i := 0; i < ctxt.Len(); i++ {
   433  		t := ctxt.Test(i)
   434  		if len(input) > 0 {
   435  			t.Input = append(t.Input, input...)
   436  		} else {
   437  			t.GenerateInput()
   438  		}
   439  		t.Sort()
   440  		count := 0
   441  		gold := getCollator(*gold, t.Locale)
   442  		for i := 1; i < len(t.Input); i++ {
   443  			ia := t.Input[i-1]
   444  			ib := t.Input[i]
   445  			if bytes.IndexAny(ib.UTF8, *exclude) != -1 {
   446  				i++
   447  				continue
   448  			}
   449  			if bytes.IndexAny(ia.UTF8, *exclude) != -1 {
   450  				continue
   451  			}
   452  			goldCmp := gold.Compare(ia, ib)
   453  			if cmp := bytes.Compare(ia.key, ib.key); cmp != goldCmp {
   454  				count++
   455  				a := string(ia.UTF8)
   456  				b := string(ib.UTF8)
   457  				fmt.Printf(failedKeyCompare, t.Locale, i-1, a, []rune(a), keyStr(ia.key), b, []rune(b), keyStr(ib.key), cmp, goldCmp, keyStr(gold.Key(ia)), keyStr(gold.Key(ib)))
   458  			} else if cmp := t.Col.Compare(ia, ib); cmp != goldCmp {
   459  				count++
   460  				a := string(ia.UTF8)
   461  				b := string(ib.UTF8)
   462  				fmt.Printf(failedCompare, t.Locale, i-1, a, []rune(a), b, []rune(b), cmp, goldCmp)
   463  			}
   464  		}
   465  		if count > 0 {
   466  			ctxt.Printf("Found %d inconsistencies in %d entries.\n", count, t.Len()-1)
   467  		}
   468  	}
   469  }
   470  
   471  const helpTemplate = `
   472  colcmp is a tool for testing and benchmarking collation
   473  
   474  Usage: colcmp command [arguments]
   475  
   476  The commands are:
   477  {{range .}}
   478      {{.Name | printf "%-11s"}} {{.Short}}{{end}}
   479  
   480  Use "col help [topic]" for more information about that topic.
   481  `
   482  
// detailedHelpTemplate is the text/template source of the help message for
// a single command. runHelp executes it with a *Command and registers the
// "trim" function it uses.
const detailedHelpTemplate = `
Usage: colcmp {{.Usage}}

{{.Long | trim}}
`
   488  
   489  func runHelp(args []string) {
   490  	t := template.New("help")
   491  	t.Funcs(template.FuncMap{"trim": strings.TrimSpace})
   492  	if len(args) < 1 {
   493  		template.Must(t.Parse(helpTemplate))
   494  		failOnError(t.Execute(os.Stderr, &commands))
   495  	} else {
   496  		for _, cmd := range commands {
   497  			if cmd.Name() == args[0] {
   498  				template.Must(t.Parse(detailedHelpTemplate))
   499  				failOnError(t.Execute(os.Stderr, cmd))
   500  				os.Exit(0)
   501  			}
   502  		}
   503  		log.Fatalf("Unknown command %q. Run 'colcmp help'.", args[0])
   504  	}
   505  	os.Exit(0)
   506  }
   507  
   508  func main() {
   509  	flag.Parse()
   510  	log.SetFlags(0)
   511  
   512  	ctxt := parseTests()
   513  
   514  	if flag.NArg() < 1 {
   515  		runHelp(nil)
   516  	}
   517  	args := flag.Args()[1:]
   518  	if flag.Arg(0) == "help" {
   519  		runHelp(args)
   520  	}
   521  	for _, cmd := range commands {
   522  		if cmd.Name() == flag.Arg(0) {
   523  			cmd.Run(ctxt, args)
   524  			ctxt.flush()
   525  			return
   526  		}
   527  	}
   528  	runHelp(flag.Args())
   529  }
   530
View as plain text