...

Source file src/golang.org/x/text/collate/tools/colcmp/gen.go

Documentation: golang.org/x/text/collate/tools/colcmp

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package main
     6  
     7  import (
     8  	"math"
     9  	"math/rand"
    10  	"strings"
    11  	"unicode"
    12  	"unicode/utf16"
    13  	"unicode/utf8"
    14  
    15  	"golang.org/x/text/language"
    16  	"golang.org/x/text/unicode/norm"
    17  )
    18  
    19  // TODO: replace with functionality in language package.
    20  // parent computes the parent language for the given language.
    21  // It returns false if the parent is already root.
    22  func parent(locale string) (parent string, ok bool) {
    23  	if locale == "und" {
    24  		return "", false
    25  	}
    26  	if i := strings.LastIndex(locale, "-"); i != -1 {
    27  		return locale[:i], true
    28  	}
    29  	return "und", true
    30  }
    31  
    32  // rewriter is used to both unique strings and create variants of strings
    33  // to add to the test set.
    34  type rewriter struct {
    35  	seen     map[string]bool
    36  	addCases bool
    37  }
    38  
    39  func newRewriter() *rewriter {
    40  	return &rewriter{
    41  		seen: make(map[string]bool),
    42  	}
    43  }
    44  
    45  func (r *rewriter) insert(a []string, s string) []string {
    46  	if !r.seen[s] {
    47  		r.seen[s] = true
    48  		a = append(a, s)
    49  	}
    50  	return a
    51  }
    52  
    53  // rewrite takes a sequence of strings in, adds variants of the these strings
    54  // based on options and removes duplicates.
    55  func (r *rewriter) rewrite(ss []string) []string {
    56  	ns := []string{}
    57  	for _, s := range ss {
    58  		ns = r.insert(ns, s)
    59  		if r.addCases {
    60  			rs := []rune(s)
    61  			rn := rs[0]
    62  			for c := unicode.SimpleFold(rn); c != rn; c = unicode.SimpleFold(c) {
    63  				rs[0] = c
    64  				ns = r.insert(ns, string(rs))
    65  			}
    66  		}
    67  	}
    68  	return ns
    69  }
    70  
    71  // exemplarySet holds a parsed set of characters from the exemplarCharacters table.
    72  type exemplarySet struct {
    73  	typ       exemplarType
    74  	set       []string
    75  	charIndex int // cumulative total of phrases, including this set
    76  }
    77  
    78  type phraseGenerator struct {
    79  	sets [exN]exemplarySet
    80  	n    int
    81  }
    82  
    83  func (g *phraseGenerator) init(id string) {
    84  	ec := exemplarCharacters
    85  	loc := language.Make(id).String()
    86  	// get sets for locale or parent locale if the set is not defined.
    87  	for i := range g.sets {
    88  		for p, ok := loc, true; ok; p, ok = parent(p) {
    89  			if set, ok := ec[p]; ok && set[i] != "" {
    90  				g.sets[i].set = strings.Split(set[i], " ")
    91  				break
    92  			}
    93  		}
    94  	}
    95  	r := newRewriter()
    96  	r.addCases = *cases
    97  	for i := range g.sets {
    98  		g.sets[i].set = r.rewrite(g.sets[i].set)
    99  	}
   100  	// compute indexes
   101  	for i, set := range g.sets {
   102  		g.n += len(set.set)
   103  		g.sets[i].charIndex = g.n
   104  	}
   105  }
   106  
   107  // phrase returns the ith phrase, where i < g.n.
   108  func (g *phraseGenerator) phrase(i int) string {
   109  	for _, set := range g.sets {
   110  		if i < set.charIndex {
   111  			return set.set[i-(set.charIndex-len(set.set))]
   112  		}
   113  	}
   114  	panic("index out of range")
   115  }
   116  
   117  // generate generates inputs by combining all pairs of examplar strings.
   118  // If doNorm is true, all input strings are normalized to NFC.
   119  // TODO: allow other variations, statistical models, and random
   120  // trailing sequences.
   121  func (g *phraseGenerator) generate(doNorm bool) []Input {
   122  	const (
   123  		M         = 1024 * 1024
   124  		buf8Size  = 30 * M
   125  		buf16Size = 10 * M
   126  	)
   127  	// TODO: use a better way to limit the input size.
   128  	if sq := int(math.Sqrt(float64(*limit))); g.n > sq {
   129  		g.n = sq
   130  	}
   131  	size := g.n * g.n
   132  	a := make([]Input, 0, size)
   133  	buf8 := make([]byte, 0, buf8Size)
   134  	buf16 := make([]uint16, 0, buf16Size)
   135  
   136  	addInput := func(str string) {
   137  		buf8 = buf8[len(buf8):]
   138  		buf16 = buf16[len(buf16):]
   139  		if len(str) > cap(buf8) {
   140  			buf8 = make([]byte, 0, buf8Size)
   141  		}
   142  		if len(str) > cap(buf16) {
   143  			buf16 = make([]uint16, 0, buf16Size)
   144  		}
   145  		if doNorm {
   146  			buf8 = norm.NFD.AppendString(buf8, str)
   147  		} else {
   148  			buf8 = append(buf8, str...)
   149  		}
   150  		buf16 = appendUTF16(buf16, buf8)
   151  		a = append(a, makeInput(buf8, buf16))
   152  	}
   153  	for i := 0; i < g.n; i++ {
   154  		p1 := g.phrase(i)
   155  		addInput(p1)
   156  		for j := 0; j < g.n; j++ {
   157  			p2 := g.phrase(j)
   158  			addInput(p1 + p2)
   159  		}
   160  	}
   161  	// permutate
   162  	rnd := rand.New(rand.NewSource(int64(rand.Int())))
   163  	for i := range a {
   164  		j := i + rnd.Intn(len(a)-i)
   165  		a[i], a[j] = a[j], a[i]
   166  		a[i].index = i // allow restoring this order if input is used multiple times.
   167  	}
   168  	return a
   169  }
   170  
   171  func appendUTF16(buf []uint16, s []byte) []uint16 {
   172  	for len(s) > 0 {
   173  		r, sz := utf8.DecodeRune(s)
   174  		s = s[sz:]
   175  		r1, r2 := utf16.EncodeRune(r)
   176  		if r1 != 0xFFFD {
   177  			buf = append(buf, uint16(r1), uint16(r2))
   178  		} else {
   179  			buf = append(buf, uint16(r))
   180  		}
   181  	}
   182  	return buf
   183  }
   184  

View as plain text