...

Source file src/golang.org/x/text/cases/map_test.go

Documentation: golang.org/x/text/cases

     1  // Copyright 2014 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package cases
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"path"
    11  	"strings"
    12  	"testing"
    13  	"unicode/utf8"
    14  
    15  	"golang.org/x/text/internal/testtext"
    16  	"golang.org/x/text/language"
    17  	"golang.org/x/text/transform"
    18  	"golang.org/x/text/unicode/norm"
    19  )
    20  
    21  type testCase struct {
    22  	lang  string
    23  	src   interface{} // string, []string, or nil to skip test
    24  	title interface{} // string, []string, or nil to skip test
    25  	lower interface{} // string, []string, or nil to skip test
    26  	upper interface{} // string, []string, or nil to skip test
    27  	opts  options
    28  }
    29  
    30  var testCases = []testCase{
    31  	0: {
    32  		lang:  "und",
    33  		src:   "abc aBc ABC abC İsıI ΕΣΆΣ",
    34  		title: "Abc Abc Abc Abc İsıi Εσάσ",
    35  		lower: "abc abc abc abc i\u0307sıi εσάσ",
    36  		upper: "ABC ABC ABC ABC İSII ΕΣΆΣ",
    37  		opts:  getOpts(HandleFinalSigma(false)),
    38  	},
    39  
    40  	1: {
    41  		lang:  "und",
    42  		src:   "abc aBc ABC abC İsıI ΕΣΆΣ Σ _Σ -Σ",
    43  		title: "Abc Abc Abc Abc İsıi Εσάς Σ _Σ -Σ",
    44  		lower: "abc abc abc abc i\u0307sıi εσάς σ _σ -σ",
    45  		upper: "ABC ABC ABC ABC İSII ΕΣΆΣ Σ _Σ -Σ",
    46  		opts:  getOpts(HandleFinalSigma(true)),
    47  	},
    48  
    49  	2: { // Title cased runes.
    50  		lang:  supported,
    51  		src:   "DžA",
    52  		title: "Dža",
    53  		lower: "dža",
    54  		upper: "DŽA",
    55  	},
    56  
    57  	3: {
    58  		// Title breaking.
    59  		lang: supported,
    60  		src: []string{
    61  			"FOO CASE TEST",
    62  			"DON'T DO THiS",
    63  			"χωΡΊΣ χωΡΊΣ^a χωΡΊΣ:a χωΡΊΣ:^a χωΡΊΣ^ όμΩΣ Σ",
    64  			"with-hyphens",
    65  			"49ers 49ers",
    66  			`"capitalize a^a -hyphen 0X _u a_u:a`,
    67  			"MidNumLet a.b\u2018c\u2019d\u2024e\ufe52f\uff07f\uff0eg",
    68  			"MidNum a,b;c\u037ed\u0589e\u060cf\u2044g\ufe50h",
    69  			"\u0345 x\u3031x x\u05d0x \u05d0x a'.a a.a a4,a",
    70  		},
    71  		title: []string{
    72  			"Foo Case Test",
    73  			"Don't Do This",
    74  			"Χωρίς Χωρίσ^A Χωρίσ:a Χωρίσ:^A Χωρίς^ Όμως Σ",
    75  			"With-Hyphens",
    76  			// Note that 49Ers is correct according to the spec.
    77  			// TODO: provide some option to the user to treat different
    78  			// characters as cased.
    79  			"49Ers 49Ers",
    80  			`"Capitalize A^A -Hyphen 0X _U A_u:a`,
    81  			"Midnumlet A.b\u2018c\u2019d\u2024e\ufe52f\uff07f\uff0eg",
    82  			"Midnum A,B;C\u037eD\u0589E\u060cF\u2044G\ufe50H",
    83  			"\u0399 X\u3031X X\u05d0x \u05d0X A'.A A.a A4,A",
    84  		},
    85  	},
    86  
	// TODO: These are known deviations from the Unicode Word Breaking
	// Algorithm.
    89  	// {
    90  	// 	"und",
    91  	// 	"x_\u3031_x a4,4a",
    92  	// 	"X_\u3031_x A4,4a", // Currently is "X_\U3031_X A4,4A".
    93  	// 	"x_\u3031_x a4,4a",
    94  	// 	"X_\u3031_X A4,4A",
    95  	// 	options{},
    96  	// },
    97  
    98  	4: {
    99  		// Tests title options
   100  		lang:  "und",
   101  		src:   "abc aBc ABC abC İsıI o'Brien",
   102  		title: "Abc ABc ABC AbC İsıI O'Brien",
   103  		opts:  getOpts(NoLower),
   104  	},
   105  
   106  	5: {
   107  		lang:  "el",
   108  		src:   "aBc ΟΔΌΣ Οδός Σο ΣΟ Σ oΣ ΟΣ σ ἕξ \u03ac",
   109  		title: "Abc Οδός Οδός Σο Σο Σ Oς Ος Σ Ἕξ \u0386",
   110  		lower: "abc οδός οδός σο σο σ oς ος σ ἕξ \u03ac",
   111  		upper: "ABC ΟΔΟΣ ΟΔΟΣ ΣΟ ΣΟ Σ OΣ ΟΣ Σ ΕΞ \u0391", // Uppercase removes accents
   112  	},
   113  
   114  	6: {
   115  		lang:  "tr az",
   116  		src:   "Isiİ İsıI I\u0307sIiİ İsıI\u0307 I\u0300\u0307",
   117  		title: "Isii İsıı I\u0307sıii İsıi I\u0300\u0307",
   118  		lower: "ısii isıı isıii isıi \u0131\u0300\u0307",
   119  		upper: "ISİİ İSII I\u0307SIİİ İSII\u0307 I\u0300\u0307",
   120  	},
   121  
   122  	7: {
   123  		lang:  "lt",
   124  		src:   "I Ï J J̈ Į Į̈ Ì Í Ĩ xi̇̈ xj̇̈ xį̇̈ xi̇̀ xi̇́ xi̇̃ XI XÏ XJ XJ̈ XĮ XĮ̈ XI̟̤",
   125  		title: "I Ï J J̈ Į Į̈ Ì Í Ĩ Xi̇̈ Xj̇̈ Xį̇̈ Xi̇̀ Xi̇́ Xi̇̃ Xi Xi̇̈ Xj Xj̇̈ Xį Xį̇̈ Xi̟̤",
   126  		lower: "i i̇̈ j j̇̈ į į̇̈ i̇̀ i̇́ i̇̃ xi̇̈ xj̇̈ xį̇̈ xi̇̀ xi̇́ xi̇̃ xi xi̇̈ xj xj̇̈ xį xį̇̈ xi̟̤",
   127  		upper: "I Ï J J̈ Į Į̈ Ì Í Ĩ XÏ XJ̈ XĮ̈ XÌ XÍ XĨ XI XÏ XJ XJ̈ XĮ XĮ̈ XI̟̤",
   128  	},
   129  
   130  	8: {
   131  		lang:  "lt",
   132  		src:   "\u012e\u0300 \u00cc i\u0307\u0300 i\u0307\u0301 i\u0307\u0303 i\u0307\u0308 i\u0300\u0307",
   133  		title: "\u012e\u0300 \u00cc \u00cc \u00cd \u0128 \u00cf I\u0300\u0307",
   134  		lower: "\u012f\u0307\u0300 i\u0307\u0300 i\u0307\u0300 i\u0307\u0301 i\u0307\u0303 i\u0307\u0308 i\u0300\u0307",
   135  		upper: "\u012e\u0300 \u00cc \u00cc \u00cd \u0128 \u00cf I\u0300\u0307",
   136  	},
   137  
   138  	9: {
   139  		lang:  "nl",
   140  		src:   "ijs IJs Ij Ijs İJ İJs aa aA 'ns 'S",
   141  		title: "IJs IJs IJ IJs İj İjs Aa Aa 'ns 's",
   142  	},
   143  
   144  	// Note: this specification is not currently part of CLDR. The same holds
   145  	// for the leading apostrophe handling for Dutch.
   146  	// See https://unicode.org/cldr/trac/ticket/7078.
   147  	10: {
   148  		lang:  "af",
   149  		src:   "wag 'n bietjie",
   150  		title: "Wag 'n Bietjie",
   151  		lower: "wag 'n bietjie",
   152  		upper: "WAG 'N BIETJIE",
   153  	},
   154  }
   155  
   156  func TestCaseMappings(t *testing.T) {
   157  	for i, tt := range testCases {
   158  		src, ok := tt.src.([]string)
   159  		if !ok {
   160  			src = strings.Split(tt.src.(string), " ")
   161  		}
   162  
   163  		for _, lang := range strings.Split(tt.lang, " ") {
   164  			tag := language.MustParse(lang)
   165  			testEntry := func(name string, mk func(language.Tag, options) transform.SpanningTransformer, gold interface{}) {
   166  				c := Caser{mk(tag, tt.opts)}
   167  				if gold != nil {
   168  					wants, ok := gold.([]string)
   169  					if !ok {
   170  						wants = strings.Split(gold.(string), " ")
   171  					}
   172  					for j, want := range wants {
   173  						if got := c.String(src[j]); got != want {
   174  							t.Errorf("%d:%s:\n%s.String(%+q):\ngot  %+q;\nwant %+q", i, lang, name, src[j], got, want)
   175  						}
   176  					}
   177  				}
   178  				dst := make([]byte, 256) // big enough to hold any result
   179  				src := []byte(strings.Join(src, " "))
   180  				v := testtext.AllocsPerRun(20, func() {
   181  					c.Transform(dst, src, true)
   182  				})
   183  				if v > 1.1 {
   184  					t.Errorf("%d:%s:\n%s: number of allocs was %f; want 0", i, lang, name, v)
   185  				}
   186  			}
   187  			testEntry("Upper", makeUpper, tt.upper)
   188  			testEntry("Lower", makeLower, tt.lower)
   189  			testEntry("Title", makeTitle, tt.title)
   190  		}
   191  	}
   192  }
   193  
   194  // TestAlloc tests that some mapping methods should not cause any allocation.
   195  func TestAlloc(t *testing.T) {
   196  	dst := make([]byte, 256) // big enough to hold any result
   197  	src := []byte(txtNonASCII)
   198  
   199  	for i, f := range []func() Caser{
   200  		func() Caser { return Upper(language.Und) },
   201  		func() Caser { return Lower(language.Und) },
   202  		func() Caser { return Lower(language.Und, HandleFinalSigma(false)) },
   203  		// TODO: use a shared copy for these casers as well, in order of
   204  		// importance, starting with the most important:
   205  		// func() Caser { return Title(language.Und) },
   206  		// func() Caser { return Title(language.Und, HandleFinalSigma(false)) },
   207  	} {
   208  		testtext.Run(t, "", func(t *testing.T) {
   209  			var c Caser
   210  			v := testtext.AllocsPerRun(10, func() {
   211  				c = f()
   212  			})
   213  			if v > 0 {
   214  				// TODO: Right now only Upper has 1 allocation. Special-case Lower
   215  				// and Title as well to have less allocations for the root locale.
   216  				t.Errorf("%d:init: number of allocs was %f; want 0", i, v)
   217  			}
   218  			v = testtext.AllocsPerRun(2, func() {
   219  				c.Transform(dst, src, true)
   220  			})
   221  			if v > 0 {
   222  				t.Errorf("%d:transform: number of allocs was %f; want 0", i, v)
   223  			}
   224  		})
   225  	}
   226  }
   227  
   228  func testHandover(t *testing.T, c Caser, src string) {
   229  	want := c.String(src)
   230  	// Find the common prefix.
   231  	pSrc := 0
   232  	for ; pSrc < len(src) && pSrc < len(want) && want[pSrc] == src[pSrc]; pSrc++ {
   233  	}
   234  
   235  	// Test handover for each substring of the prefix.
   236  	for i := 0; i < pSrc; i++ {
   237  		testtext.Run(t, fmt.Sprint("interleave/", i), func(t *testing.T) {
   238  			dst := make([]byte, 4*len(src))
   239  			c.Reset()
   240  			nSpan, _ := c.Span([]byte(src[:i]), false)
   241  			copy(dst, src[:nSpan])
   242  			nTransform, _, _ := c.Transform(dst[nSpan:], []byte(src[nSpan:]), true)
   243  			got := string(dst[:nSpan+nTransform])
   244  			if got != want {
   245  				t.Errorf("full string: got %q; want %q", got, want)
   246  			}
   247  		})
   248  	}
   249  }
   250  
   251  func TestHandover(t *testing.T) {
   252  	testCases := []struct {
   253  		desc          string
   254  		t             Caser
   255  		first, second string
   256  	}{{
   257  		"title/nosigma/single midword",
   258  		Title(language.Und, HandleFinalSigma(false)),
   259  		"A.", "a",
   260  	}, {
   261  		"title/nosigma/single midword",
   262  		Title(language.Und, HandleFinalSigma(false)),
   263  		"A", ".a",
   264  	}, {
   265  		"title/nosigma/double midword",
   266  		Title(language.Und, HandleFinalSigma(false)),
   267  		"A..", "a",
   268  	}, {
   269  		"title/nosigma/double midword",
   270  		Title(language.Und, HandleFinalSigma(false)),
   271  		"A.", ".a",
   272  	}, {
   273  		"title/nosigma/double midword",
   274  		Title(language.Und, HandleFinalSigma(false)),
   275  		"A", "..a",
   276  	}, {
   277  		"title/sigma/single midword",
   278  		Title(language.Und),
   279  		"ΟΣ.", "a",
   280  	}, {
   281  		"title/sigma/single midword",
   282  		Title(language.Und),
   283  		"ΟΣ", ".a",
   284  	}, {
   285  		"title/sigma/double midword",
   286  		Title(language.Und),
   287  		"ΟΣ..", "a",
   288  	}, {
   289  		"title/sigma/double midword",
   290  		Title(language.Und),
   291  		"ΟΣ.", ".a",
   292  	}, {
   293  		"title/sigma/double midword",
   294  		Title(language.Und),
   295  		"ΟΣ", "..a",
   296  	}, {
   297  		"title/af/leading apostrophe",
   298  		Title(language.Afrikaans),
   299  		"'", "n bietje",
   300  	}}
   301  	for _, tc := range testCases {
   302  		testtext.Run(t, tc.desc, func(t *testing.T) {
   303  			src := tc.first + tc.second
   304  			want := tc.t.String(src)
   305  			tc.t.Reset()
   306  			n, _ := tc.t.Span([]byte(tc.first), false)
   307  
   308  			dst := make([]byte, len(want))
   309  			copy(dst, tc.first[:n])
   310  
   311  			nDst, _, _ := tc.t.Transform(dst[n:], []byte(src[n:]), true)
   312  			got := string(dst[:n+nDst])
   313  			if got != want {
   314  				t.Errorf("got %q; want %q", got, want)
   315  			}
   316  		})
   317  	}
   318  }
   319  
// minBufSize is the size of the buffer by which the casing operations in
// this package are guaranteed to make progress.
const minBufSize = norm.MaxSegmentSize
   323  
   324  type bufferTest struct {
   325  	desc, src, want  string
   326  	firstErr         error
   327  	dstSize, srcSize int
   328  	t                transform.SpanningTransformer
   329  }
   330  
   331  var bufferTests []bufferTest
   332  
   333  func init() {
   334  	bufferTests = []bufferTest{{
   335  		desc:     "und/upper/short dst",
   336  		src:      "abcdefg",
   337  		want:     "ABCDEFG",
   338  		firstErr: transform.ErrShortDst,
   339  		dstSize:  3,
   340  		srcSize:  minBufSize,
   341  		t:        Upper(language.Und),
   342  	}, {
   343  		desc:     "und/upper/short src",
   344  		src:      "123é56",
   345  		want:     "123É56",
   346  		firstErr: transform.ErrShortSrc,
   347  		dstSize:  4,
   348  		srcSize:  4,
   349  		t:        Upper(language.Und),
   350  	}, {
   351  		desc:     "und/upper/no error on short",
   352  		src:      "12",
   353  		want:     "12",
   354  		firstErr: nil,
   355  		dstSize:  1,
   356  		srcSize:  1,
   357  		t:        Upper(language.Und),
   358  	}, {
   359  		desc:     "und/lower/short dst",
   360  		src:      "ABCDEFG",
   361  		want:     "abcdefg",
   362  		firstErr: transform.ErrShortDst,
   363  		dstSize:  3,
   364  		srcSize:  minBufSize,
   365  		t:        Lower(language.Und),
   366  	}, {
   367  		desc:     "und/lower/short src",
   368  		src:      "123É56",
   369  		want:     "123é56",
   370  		firstErr: transform.ErrShortSrc,
   371  		dstSize:  4,
   372  		srcSize:  4,
   373  		t:        Lower(language.Und),
   374  	}, {
   375  		desc:     "und/lower/no error on short",
   376  		src:      "12",
   377  		want:     "12",
   378  		firstErr: nil,
   379  		dstSize:  1,
   380  		srcSize:  1,
   381  		t:        Lower(language.Und),
   382  	}, {
   383  		desc:    "und/lower/simple (no final sigma)",
   384  		src:     "ΟΣ ΟΣΣ",
   385  		want:    "οσ οσσ",
   386  		dstSize: minBufSize,
   387  		srcSize: minBufSize,
   388  		t:       Lower(language.Und, HandleFinalSigma(false)),
   389  	}, {
   390  		desc:    "und/title/simple (no final sigma)",
   391  		src:     "ΟΣ ΟΣΣ",
   392  		want:    "Οσ Οσσ",
   393  		dstSize: minBufSize,
   394  		srcSize: minBufSize,
   395  		t:       Title(language.Und, HandleFinalSigma(false)),
   396  	}, {
   397  		desc:    "und/title/final sigma: no error",
   398  		src:     "ΟΣ",
   399  		want:    "Ος",
   400  		dstSize: minBufSize,
   401  		srcSize: minBufSize,
   402  		t:       Title(language.Und),
   403  	}, {
   404  		desc:     "und/title/final sigma: short source",
   405  		src:      "ΟΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣ",
   406  		want:     "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσς",
   407  		firstErr: transform.ErrShortSrc,
   408  		dstSize:  minBufSize,
   409  		srcSize:  10,
   410  		t:        Title(language.Und),
   411  	}, {
   412  		desc:     "und/title/final sigma: short destination 1",
   413  		src:      "ΟΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣ",
   414  		want:     "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσς",
   415  		firstErr: transform.ErrShortDst,
   416  		dstSize:  10,
   417  		srcSize:  minBufSize,
   418  		t:        Title(language.Und),
   419  	}, {
   420  		desc:     "und/title/final sigma: short destination 2",
   421  		src:      "ΟΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣ",
   422  		want:     "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσς",
   423  		firstErr: transform.ErrShortDst,
   424  		dstSize:  9,
   425  		srcSize:  minBufSize,
   426  		t:        Title(language.Und),
   427  	}, {
   428  		desc:     "und/title/final sigma: short destination 3",
   429  		src:      "ΟΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣ",
   430  		want:     "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσς",
   431  		firstErr: transform.ErrShortDst,
   432  		dstSize:  8,
   433  		srcSize:  minBufSize,
   434  		t:        Title(language.Und),
   435  	}, {
   436  		desc:     "und/title/clipped UTF-8 rune",
   437  		src:      "σσσσσσσσσσσ",
   438  		want:     "Σσσσσσσσσσσ",
   439  		firstErr: transform.ErrShortSrc,
   440  		dstSize:  minBufSize,
   441  		srcSize:  5,
   442  		t:        Title(language.Und),
   443  	}, {
   444  		desc:    "und/title/clipped UTF-8 rune atEOF",
   445  		src:     "σσσ" + string([]byte{0xCF}),
   446  		want:    "Σσσ" + string([]byte{0xCF}),
   447  		dstSize: minBufSize,
   448  		srcSize: minBufSize,
   449  		t:       Title(language.Und),
   450  	}, {
   451  		// Note: the choice to change the final sigma at the end in case of
   452  		// too many case ignorables is arbitrary. The main reason for this
   453  		// choice is that it results in simpler code.
   454  		desc:    "und/title/final sigma: max ignorables",
   455  		src:     "ΟΣ" + strings.Repeat(".", maxIgnorable) + "a",
   456  		want:    "Οσ" + strings.Repeat(".", maxIgnorable) + "A",
   457  		dstSize: minBufSize,
   458  		srcSize: minBufSize,
   459  		t:       Title(language.Und),
   460  	}, {
		// No sigma is involved here: a plain cased letter is followed by
		// more than maxIgnorable '.' characters, and the source chunk ends
		// right before the final letter.
   464  		desc:    "und/title/long string",
   465  		src:     "AA" + strings.Repeat(".", maxIgnorable+1) + "a",
   466  		want:    "Aa" + strings.Repeat(".", maxIgnorable+1) + "A",
   467  		dstSize: minBufSize,
   468  		srcSize: len("AA" + strings.Repeat(".", maxIgnorable+1)),
   469  		t:       Title(language.Und),
   470  	}, {
   471  		// Note: the choice to change the final sigma at the end in case of
   472  		// too many case ignorables is arbitrary. The main reason for this
   473  		// choice is that it results in simpler code.
   474  		desc:    "und/title/final sigma: too many ignorables",
   475  		src:     "ΟΣ" + strings.Repeat(".", maxIgnorable+1) + "a",
   476  		want:    "Ος" + strings.Repeat(".", maxIgnorable+1) + "A",
   477  		dstSize: minBufSize,
   478  		srcSize: len("ΟΣ" + strings.Repeat(".", maxIgnorable+1)),
   479  		t:       Title(language.Und),
   480  	}, {
   481  		desc:    "und/title/final sigma: apostrophe",
   482  		src:     "ΟΣ''a",
   483  		want:    "Οσ''A",
   484  		dstSize: minBufSize,
   485  		srcSize: minBufSize,
   486  		t:       Title(language.Und),
   487  	}, {
   488  		desc:    "el/upper/max ignorables",
   489  		src:     "ο" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0313",
   490  		want:    "Ο" + strings.Repeat("\u0321", maxIgnorable-1),
   491  		dstSize: minBufSize,
   492  		srcSize: minBufSize,
   493  		t:       Upper(language.Greek),
   494  	}, {
   495  		desc:    "el/upper/too many ignorables",
   496  		src:     "ο" + strings.Repeat("\u0321", maxIgnorable) + "\u0313",
   497  		want:    "Ο" + strings.Repeat("\u0321", maxIgnorable) + "\u0313",
   498  		dstSize: minBufSize,
   499  		srcSize: len("ο" + strings.Repeat("\u0321", maxIgnorable)),
   500  		t:       Upper(language.Greek),
   501  	}, {
   502  		desc:     "el/upper/short dst",
   503  		src:      "123ο",
   504  		want:     "123Ο",
   505  		firstErr: transform.ErrShortDst,
   506  		dstSize:  3,
   507  		srcSize:  minBufSize,
   508  		t:        Upper(language.Greek),
   509  	}, {
   510  		desc:    "lt/lower/max ignorables",
   511  		src:     "I" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0300",
   512  		want:    "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300",
   513  		dstSize: minBufSize,
   514  		srcSize: minBufSize,
   515  		t:       Lower(language.Lithuanian),
   516  	}, {
   517  		desc:    "lt/lower/too many ignorables",
   518  		src:     "I" + strings.Repeat("\u0321", maxIgnorable) + "\u0300",
   519  		want:    "i" + strings.Repeat("\u0321", maxIgnorable) + "\u0300",
   520  		dstSize: minBufSize,
   521  		srcSize: len("I" + strings.Repeat("\u0321", maxIgnorable)),
   522  		t:       Lower(language.Lithuanian),
   523  	}, {
   524  		desc:     "lt/lower/decomposition with short dst buffer 1",
   525  		src:      "aaaaa\u00cc", // U+00CC LATIN CAPITAL LETTER I GRAVE
   526  		firstErr: transform.ErrShortDst,
   527  		want:     "aaaaai\u0307\u0300",
   528  		dstSize:  5,
   529  		srcSize:  minBufSize,
   530  		t:        Lower(language.Lithuanian),
   531  	}, {
   532  		desc:     "lt/lower/decomposition with short dst buffer 2",
   533  		src:      "aaaa\u00cc", // U+00CC LATIN CAPITAL LETTER I GRAVE
   534  		firstErr: transform.ErrShortDst,
   535  		want:     "aaaai\u0307\u0300",
   536  		dstSize:  5,
   537  		srcSize:  minBufSize,
   538  		t:        Lower(language.Lithuanian),
   539  	}, {
   540  		desc:    "lt/upper/max ignorables",
   541  		src:     "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300",
   542  		want:    "I" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0300",
   543  		dstSize: minBufSize,
   544  		srcSize: minBufSize,
   545  		t:       Upper(language.Lithuanian),
   546  	}, {
   547  		desc:    "lt/upper/too many ignorables",
   548  		src:     "i" + strings.Repeat("\u0321", maxIgnorable) + "\u0307\u0300",
   549  		want:    "I" + strings.Repeat("\u0321", maxIgnorable) + "\u0307\u0300",
   550  		dstSize: minBufSize,
   551  		srcSize: len("i" + strings.Repeat("\u0321", maxIgnorable)),
   552  		t:       Upper(language.Lithuanian),
   553  	}, {
   554  		desc:     "lt/upper/short dst",
   555  		src:      "12i\u0307\u0300",
   556  		want:     "12\u00cc",
   557  		firstErr: transform.ErrShortDst,
   558  		dstSize:  3,
   559  		srcSize:  minBufSize,
   560  		t:        Upper(language.Lithuanian),
   561  	}, {
   562  		desc:    "aztr/lower/max ignorables",
   563  		src:     "I" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300",
   564  		want:    "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0300",
   565  		dstSize: minBufSize,
   566  		srcSize: minBufSize,
   567  		t:       Lower(language.Turkish),
   568  	}, {
   569  		desc:    "aztr/lower/too many ignorables",
   570  		src:     "I" + strings.Repeat("\u0321", maxIgnorable) + "\u0307\u0300",
   571  		want:    "\u0131" + strings.Repeat("\u0321", maxIgnorable) + "\u0307\u0300",
   572  		dstSize: minBufSize,
   573  		srcSize: len("I" + strings.Repeat("\u0321", maxIgnorable)),
   574  		t:       Lower(language.Turkish),
   575  	}, {
   576  		desc:     "nl/title/pre-IJ cutoff",
   577  		src:      "  ij",
   578  		want:     "  IJ",
   579  		firstErr: transform.ErrShortDst,
   580  		dstSize:  2,
   581  		srcSize:  minBufSize,
   582  		t:        Title(language.Dutch),
   583  	}, {
   584  		desc:     "nl/title/mid-IJ cutoff",
   585  		src:      "  ij",
   586  		want:     "  IJ",
   587  		firstErr: transform.ErrShortDst,
   588  		dstSize:  3,
   589  		srcSize:  minBufSize,
   590  		t:        Title(language.Dutch),
   591  	}, {
   592  		desc:     "af/title/apostrophe",
   593  		src:      "'n bietje",
   594  		want:     "'n Bietje",
   595  		firstErr: transform.ErrShortDst,
   596  		dstSize:  3,
   597  		srcSize:  minBufSize,
   598  		t:        Title(language.Afrikaans),
   599  	}}
   600  }
   601  
   602  func TestShortBuffersAndOverflow(t *testing.T) {
   603  	for i, tt := range bufferTests {
   604  		testtext.Run(t, tt.desc, func(t *testing.T) {
   605  			buf := make([]byte, tt.dstSize)
   606  			got := []byte{}
   607  			var nSrc, nDst int
   608  			var err error
   609  			for p := 0; p < len(tt.src); p += nSrc {
   610  				q := p + tt.srcSize
   611  				if q > len(tt.src) {
   612  					q = len(tt.src)
   613  				}
   614  				nDst, nSrc, err = tt.t.Transform(buf, []byte(tt.src[p:q]), q == len(tt.src))
   615  				got = append(got, buf[:nDst]...)
   616  
   617  				if p == 0 && err != tt.firstErr {
   618  					t.Errorf("%d:%s:\n error was %v; want %v", i, tt.desc, err, tt.firstErr)
   619  					break
   620  				}
   621  			}
   622  			if string(got) != tt.want {
   623  				t.Errorf("%d:%s:\ngot  %+q;\nwant %+q", i, tt.desc, got, tt.want)
   624  			}
   625  			testHandover(t, Caser{tt.t}, tt.src)
   626  		})
   627  	}
   628  }
   629  
   630  func TestSpan(t *testing.T) {
   631  	for _, tt := range []struct {
   632  		desc  string
   633  		src   string
   634  		want  string
   635  		atEOF bool
   636  		err   error
   637  		t     Caser
   638  	}{{
   639  		desc:  "und/upper/basic",
   640  		src:   "abcdefg",
   641  		want:  "",
   642  		atEOF: true,
   643  		err:   transform.ErrEndOfSpan,
   644  		t:     Upper(language.Und),
   645  	}, {
   646  		desc:  "und/upper/short src",
   647  		src:   "123É"[:4],
   648  		want:  "123",
   649  		atEOF: false,
   650  		err:   transform.ErrShortSrc,
   651  		t:     Upper(language.Und),
   652  	}, {
   653  		desc:  "und/upper/no error on short",
   654  		src:   "12",
   655  		want:  "12",
   656  		atEOF: false,
   657  		t:     Upper(language.Und),
   658  	}, {
   659  		desc:  "und/lower/basic",
   660  		src:   "ABCDEFG",
   661  		want:  "",
   662  		atEOF: true,
   663  		err:   transform.ErrEndOfSpan,
   664  		t:     Lower(language.Und),
   665  	}, {
   666  		desc:  "und/lower/short src num",
   667  		src:   "123é"[:4],
   668  		want:  "123",
   669  		atEOF: false,
   670  		err:   transform.ErrShortSrc,
   671  		t:     Lower(language.Und),
   672  	}, {
   673  		desc:  "und/lower/short src greek",
   674  		src:   "αβγé"[:7],
   675  		want:  "αβγ",
   676  		atEOF: false,
   677  		err:   transform.ErrShortSrc,
   678  		t:     Lower(language.Und),
   679  	}, {
   680  		desc:  "und/lower/no error on short",
   681  		src:   "12",
   682  		want:  "12",
   683  		atEOF: false,
   684  		t:     Lower(language.Und),
   685  	}, {
   686  		desc:  "und/lower/simple (no final sigma)",
   687  		src:   "ος οσσ",
   688  		want:  "οσ οσσ",
   689  		atEOF: true,
   690  		t:     Lower(language.Und, HandleFinalSigma(false)),
   691  	}, {
   692  		desc:  "und/title/simple (no final sigma)",
   693  		src:   "Οσ Οσσ",
   694  		want:  "Οσ Οσσ",
   695  		atEOF: true,
   696  		t:     Title(language.Und, HandleFinalSigma(false)),
   697  	}, {
   698  		desc: "und/lower/final sigma: no error",
   699  		src:  "οΣ", // Oς
   700  		want: "ο",  // Oς
   701  		err:  transform.ErrEndOfSpan,
   702  		t:    Lower(language.Und),
   703  	}, {
   704  		desc: "und/title/final sigma: no error",
   705  		src:  "ΟΣ", // Oς
   706  		want: "Ο",  // Oς
   707  		err:  transform.ErrEndOfSpan,
   708  		t:    Title(language.Und),
   709  	}, {
   710  		desc: "und/title/final sigma: no short source!",
   711  		src:  "ΟσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσΣ",
   712  		want: "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσ",
   713  		err:  transform.ErrEndOfSpan,
   714  		t:    Title(language.Und),
   715  	}, {
   716  		desc:  "und/title/clipped UTF-8 rune",
   717  		src:   "Σσ" + string([]byte{0xCF}),
   718  		want:  "Σσ",
   719  		atEOF: false,
   720  		err:   transform.ErrShortSrc,
   721  		t:     Title(language.Und),
   722  	}, {
   723  		desc:  "und/title/clipped UTF-8 rune atEOF",
   724  		src:   "Σσσ" + string([]byte{0xCF}),
   725  		want:  "Σσσ" + string([]byte{0xCF}),
   726  		atEOF: true,
   727  		t:     Title(language.Und),
   728  	}, {
   729  		// Note: the choice to change the final sigma at the end in case of
   730  		// too many case ignorables is arbitrary. The main reason for this
   731  		// choice is that it results in simpler code.
   732  		desc: "und/title/long string",
   733  		src:  "A" + strings.Repeat("a", maxIgnorable+5),
   734  		want: "A" + strings.Repeat("a", maxIgnorable+5),
   735  		t:    Title(language.Und),
   736  	}, {
   737  		// Note: the choice to change the final sigma at the end in case of
   738  		// too many case ignorables is arbitrary. The main reason for this
   739  		// choice is that it results in simpler code.
   740  		desc:  "und/title/cyrillic",
   741  		src:   "При",
   742  		want:  "При",
   743  		atEOF: true,
   744  		t:     Title(language.Und, HandleFinalSigma(false)),
   745  	}, {
   746  		// Note: the choice to change the final sigma at the end in case of
   747  		// too many case ignorables is arbitrary. The main reason for this
   748  		// choice is that it results in simpler code.
   749  		desc: "und/title/final sigma: max ignorables",
   750  		src:  "Οσ" + strings.Repeat(".", maxIgnorable) + "A",
   751  		want: "Οσ" + strings.Repeat(".", maxIgnorable) + "A",
   752  		t:    Title(language.Und),
   753  	}, {
   754  		desc: "el/upper/max ignorables - not implemented",
   755  		src:  "Ο" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0313",
   756  		want: "",
   757  		err:  transform.ErrEndOfSpan,
   758  		t:    Upper(language.Greek),
   759  	}, {
   760  		desc: "el/upper/too many ignorables - not implemented",
   761  		src:  "Ο" + strings.Repeat("\u0321", maxIgnorable) + "\u0313",
   762  		want: "",
   763  		err:  transform.ErrEndOfSpan,
   764  		t:    Upper(language.Greek),
   765  	}, {
   766  		desc: "el/upper/short dst",
   767  		src:  "123ο",
   768  		want: "",
   769  		err:  transform.ErrEndOfSpan,
   770  		t:    Upper(language.Greek),
   771  	}, {
   772  		desc: "lt/lower/max ignorables",
   773  		src:  "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300",
   774  		want: "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300",
   775  		t:    Lower(language.Lithuanian),
   776  	}, {
   777  		desc: "lt/lower/isLower",
   778  		src:  "I" + strings.Repeat("\u0321", maxIgnorable) + "\u0300",
   779  		want: "",
   780  		err:  transform.ErrEndOfSpan,
   781  		t:    Lower(language.Lithuanian),
   782  	}, {
   783  		desc: "lt/lower/not identical",
   784  		src:  "aaaaa\u00cc", // U+00CC LATIN CAPITAL LETTER I GRAVE
   785  		err:  transform.ErrEndOfSpan,
   786  		want: "aaaaa",
   787  		t:    Lower(language.Lithuanian),
   788  	}, {
   789  		desc: "lt/lower/identical",
   790  		src:  "aaaai\u0307\u0300", // U+00CC LATIN CAPITAL LETTER I GRAVE
   791  		want: "aaaai\u0307\u0300",
   792  		t:    Lower(language.Lithuanian),
   793  	}, {
   794  		desc: "lt/upper/not implemented",
   795  		src:  "I" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0300",
   796  		want: "",
   797  		err:  transform.ErrEndOfSpan,
   798  		t:    Upper(language.Lithuanian),
   799  	}, {
   800  		desc: "lt/upper/not implemented, ascii",
   801  		src:  "AB",
   802  		want: "",
   803  		err:  transform.ErrEndOfSpan,
   804  		t:    Upper(language.Lithuanian),
   805  	}, {
   806  		desc: "nl/title/pre-IJ cutoff",
   807  		src:  "  IJ",
   808  		want: "  IJ",
   809  		t:    Title(language.Dutch),
   810  	}, {
   811  		desc: "nl/title/mid-IJ cutoff",
   812  		src:  "  Ia",
   813  		want: "  Ia",
   814  		t:    Title(language.Dutch),
   815  	}, {
   816  		desc: "af/title/apostrophe",
   817  		src:  "'n Bietje",
   818  		want: "'n Bietje",
   819  		t:    Title(language.Afrikaans),
   820  	}, {
   821  		desc: "af/title/apostrophe-incorrect",
   822  		src:  "'N Bietje",
   823  		// The Single_Quote (a MidWord), needs to be retained as unspanned so
   824  		// that a successive call to Transform can detect that N should not be
   825  		// capitalized.
   826  		want: "",
   827  		err:  transform.ErrEndOfSpan,
   828  		t:    Title(language.Afrikaans),
   829  	}} {
   830  		testtext.Run(t, tt.desc, func(t *testing.T) {
   831  			for p := 0; p < len(tt.want); p += utf8.RuneLen([]rune(tt.src[p:])[0]) {
   832  				tt.t.Reset()
   833  				n, err := tt.t.Span([]byte(tt.src[:p]), false)
   834  				if err != nil && err != transform.ErrShortSrc {
   835  					t.Errorf("early failure:Span(%+q): %v (%d < %d)", tt.src[:p], err, n, len(tt.want))
   836  					break
   837  				}
   838  			}
   839  			tt.t.Reset()
   840  			n, err := tt.t.Span([]byte(tt.src), tt.atEOF)
   841  			if n != len(tt.want) || err != tt.err {
   842  				t.Errorf("Span(%+q, %v): got %d, %v; want %d, %v", tt.src, tt.atEOF, n, err, len(tt.want), tt.err)
   843  			}
   844  			testHandover(t, tt.t, tt.src)
   845  		})
   846  	}
   847  }
   848  
// txtASCII is the ASCII-only benchmark input: one pangram sentence (with a
// trailing space) repeated 50 times.
var txtASCII = strings.Repeat("The quick brown fox jumps over the lazy dog. ", 50)

// txt_vn is a Vietnamese text sample.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/vn/
const txt_vn = `Với các điều kiện sau: Ghi nhận công của tác giả.  Nếu bạn sử
dụng, chuyển đổi, hoặc xây dựng dự án từ  nội dung được chia sẻ này, bạn phải áp
dụng giấy phép này hoặc  một giấy phép khác có các điều khoản tương tự như giấy
phép này cho dự án của bạn. Hiểu rằng: Miễn — Bất kỳ các điều kiện nào trên đây
cũng có thể được miễn bỏ nếu bạn được sự cho phép của người sở hữu bản quyền.
Phạm vi công chúng — Khi tác phẩm hoặc bất kỳ chương nào của tác phẩm đã trong
vùng dành cho công chúng theo quy định của pháp luật thì tình trạng của nó không
bị ảnh hưởng bởi giấy phép trong bất kỳ trường hợp nào.`

// txt_cn is a Chinese text sample.
// Taken from http://creativecommons.org/licenses/by-sa/2.5/cn/
const txt_cn = `您可以自由: 复制、发行、展览、表演、放映、
广播或通过信息网络传播本作品 创作演绎作品
对本作品进行商业性使用 惟须遵守下列条件:
署名 — 您必须按照作者或者许可人指定的方式对作品进行署名。
相同方式共享 — 如果您改变、转换本作品或者以本作品为基础进行创作,
您只能采用与本协议相同的许可协议发布基于本作品的演绎作品。`

// txt_ru is a Russian text sample. Note that it also contains the Latin word
// "Attribution" and that its final line is Greek, not Russian.
// Taken from http://creativecommons.org/licenses/by-sa/1.0/deed.ru
const txt_ru = `При обязательном соблюдении следующих условий: Attribution — Вы
должны атрибутировать произведение (указывать автора и источник) в порядке,
предусмотренном автором или лицензиаром (но только так, чтобы никоим образом не
подразумевалось, что они поддерживают вас или использование вами данного
произведения). Υπό τις ακόλουθες προϋποθέσεις:`

// txt_gr is a Greek text sample.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/gr/
const txt_gr = `Αναφορά Δημιουργού — Θα πρέπει να κάνετε την αναφορά στο έργο με
τον τρόπο που έχει οριστεί από το δημιουργό ή το χορηγούντο την άδεια (χωρίς
όμως να εννοείται με οποιονδήποτε τρόπο ότι εγκρίνουν εσάς ή τη χρήση του έργου
από εσάς). Παρόμοια Διανομή — Εάν αλλοιώσετε, τροποποιήσετε ή δημιουργήσετε
περαιτέρω βασισμένοι στο έργο θα μπορείτε να διανέμετε το έργο που θα προκύψει
μόνο με την ίδια ή παρόμοια άδεια.`

// txtNonASCII is the non-ASCII benchmark input: the four samples above joined
// in the order vn, cn, ru, gr, with no separator inserted between them.
const txtNonASCII = txt_vn + txt_cn + txt_ru + txt_gr
   885  
   886  // TODO: Improve ASCII performance.
   887  
   888  func BenchmarkCasers(b *testing.B) {
   889  	for _, s := range []struct{ name, text string }{
   890  		{"ascii", txtASCII},
   891  		{"nonASCII", txtNonASCII},
   892  		{"short", "При"},
   893  	} {
   894  		src := []byte(s.text)
   895  		// Measure case mappings in bytes package for comparison.
   896  		for _, f := range []struct {
   897  			name string
   898  			fn   func(b []byte) []byte
   899  		}{
   900  			{"lower", bytes.ToLower},
   901  			{"title", bytes.ToTitle},
   902  			{"upper", bytes.ToUpper},
   903  		} {
   904  			testtext.Bench(b, path.Join(s.name, "bytes", f.name), func(b *testing.B) {
   905  				b.SetBytes(int64(len(src)))
   906  				for i := 0; i < b.N; i++ {
   907  					f.fn(src)
   908  				}
   909  			})
   910  		}
   911  		for _, t := range []struct {
   912  			name  string
   913  			caser transform.SpanningTransformer
   914  		}{
   915  			{"fold/default", Fold()},
   916  			{"upper/default", Upper(language.Und)},
   917  			{"lower/sigma", Lower(language.Und)},
   918  			{"lower/simple", Lower(language.Und, HandleFinalSigma(false))},
   919  			{"title/sigma", Title(language.Und)},
   920  			{"title/simple", Title(language.Und, HandleFinalSigma(false))},
   921  		} {
   922  			c := Caser{t.caser}
   923  			dst := make([]byte, len(src))
   924  			testtext.Bench(b, path.Join(s.name, t.name, "transform"), func(b *testing.B) {
   925  				b.SetBytes(int64(len(src)))
   926  				for i := 0; i < b.N; i++ {
   927  					c.Reset()
   928  					c.Transform(dst, src, true)
   929  				}
   930  			})
   931  			// No need to check span for simple cases, as they will be the same
   932  			// as sigma.
   933  			if strings.HasSuffix(t.name, "/simple") {
   934  				continue
   935  			}
   936  			spanSrc := c.Bytes(src)
   937  			testtext.Bench(b, path.Join(s.name, t.name, "span"), func(b *testing.B) {
   938  				c.Reset()
   939  				if n, _ := c.Span(spanSrc, true); n < len(spanSrc) {
   940  					b.Fatalf("spanner is not recognizing text %q as done (at %d)", spanSrc, n)
   941  				}
   942  				b.SetBytes(int64(len(spanSrc)))
   943  				for i := 0; i < b.N; i++ {
   944  					c.Reset()
   945  					c.Span(spanSrc, true)
   946  				}
   947  			})
   948  		}
   949  	}
   950  }
   951  

View as plain text