...

Source file src/golang.org/x/text/unicode/norm/normalize_test.go

Documentation: golang.org/x/text/unicode/norm

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package norm
     6  
     7  import (
     8  	"bytes"
     9  	"flag"
    10  	"fmt"
    11  	"io"
    12  	"log"
    13  	"os"
    14  	"os/exec"
    15  	"path/filepath"
    16  	"runtime"
    17  	"strings"
    18  	"testing"
    19  	"unicode/utf8"
    20  
    21  	"golang.org/x/text/internal/testtext"
    22  	"golang.org/x/text/transform"
    23  )
    24  
    25  var (
    26  	testn = flag.Int("testn", -1, "specific test number to run or -1 for all")
    27  )
    28  
    29  // pc replaces any rune r that is repeated n times, for n > 1, with r{n}.
    30  func pc(s string) []byte {
    31  	b := bytes.NewBuffer(make([]byte, 0, len(s)))
    32  	for i := 0; i < len(s); {
    33  		r, sz := utf8.DecodeRuneInString(s[i:])
    34  		n := 0
    35  		if sz == 1 {
    36  			// Special-case one-byte case to handle repetition for invalid UTF-8.
    37  			for c := s[i]; i+n < len(s) && s[i+n] == c; n++ {
    38  			}
    39  		} else {
    40  			for _, r2 := range s[i:] {
    41  				if r2 != r {
    42  					break
    43  				}
    44  				n++
    45  			}
    46  		}
    47  		b.WriteString(s[i : i+sz])
    48  		if n > 1 {
    49  			fmt.Fprintf(b, "{%d}", n)
    50  		}
    51  		i += sz * n
    52  	}
    53  	return b.Bytes()
    54  }
    55  
    56  // pidx finds the index from which two strings start to differ, plus context.
    57  // It returns the index and ellipsis if the index is greater than 0.
    58  func pidx(a, b string) (i int, prefix string) {
    59  	for ; i < len(a) && i < len(b) && a[i] == b[i]; i++ {
    60  	}
    61  	if i < 8 {
    62  		return 0, ""
    63  	}
    64  	i -= 3 // ensure taking at least one full rune before the difference.
    65  	for k := i - 7; i > k && !utf8.RuneStart(a[i]); i-- {
    66  	}
    67  	return i, "..."
    68  }
    69  
// PositionTest is one table entry for runPosTests: an input string together
// with the position and output bytes the function under test should return.
type PositionTest struct {
	input  string // text handed to the function under test
	pos    int    // expected first return value of the function under test
	buffer string // expected contents of reorderBuffer, if applicable
}
    75  
// positionFunc is the shape of a function exercised by runPosTests. It
// receives the shared reorderBuffer and the test input, and returns a
// position plus the bytes to compare against PositionTest.buffer.
type positionFunc func(rb *reorderBuffer, s string) (int, []byte)
    77  
    78  func runPosTests(t *testing.T, name string, f Form, fn positionFunc, tests []PositionTest) {
    79  	rb := reorderBuffer{}
    80  	rb.init(f, nil)
    81  	for i, test := range tests {
    82  		rb.reset()
    83  		rb.src = inputString(test.input)
    84  		rb.nsrc = len(test.input)
    85  		pos, out := fn(&rb, test.input)
    86  		if pos != test.pos {
    87  			t.Errorf("%s:%d: position is %d; want %d", name, i, pos, test.pos)
    88  		}
    89  		if outs := string(out); outs != test.buffer {
    90  			k, pfx := pidx(outs, test.buffer)
    91  			t.Errorf("%s:%d: buffer \nwas  %s%+q; \nwant %s%+q", name, i, pfx, pc(outs[k:]), pfx, pc(test.buffer[k:]))
    92  		}
    93  	}
    94  }
    95  
    96  func grave(n int) string {
    97  	return rep(0x0300, n)
    98  }
    99  
   100  func rep(r rune, n int) string {
   101  	return strings.Repeat(string(r), n)
   102  }
   103  
// segSize is a short alias for maxByteBufferSize for use in the test tables.
const segSize = maxByteBufferSize

// cgj is a short alias for GraphemeJoiner. The expected outputs in the test
// tables splice it in where a long run of combining characters is split.
var cgj = GraphemeJoiner
   107  
// decomposeSegmentTests drives TestDecomposeSegment. Each entry is
// {input, expected position returned by decomposeSegment, expected rb.out}.
var decomposeSegmentTests = []PositionTest{
	// illegal runes
	{"\xC2", 0, ""},
	{"\xC0", 1, "\xC0"},
	{"\u00E0\x80", 2, "\u0061\u0300"},
	// starter
	{"a", 1, "a"},
	{"ab", 1, "a"},
	// starter + composing
	{"a\u0300", 3, "a\u0300"},
	{"a\u0300b", 3, "a\u0300"},
	// with decomposition
	{"\u00C0", 2, "A\u0300"},
	{"\u00C0b", 2, "A\u0300"},
	// long
	{grave(31), 60, grave(30) + cgj},
	{"a" + grave(31), 61, "a" + grave(30) + cgj},

	// Stability tests: see https://www.unicode.org/review/pr-29.html.
	// U+0300 COMBINING GRAVE ACCENT;Mn;230;NSM;;;;;N;NON-SPACING GRAVE;;;;
	// U+0B47 ORIYA VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
	// U+0B3E ORIYA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
	// U+1100 HANGUL CHOSEONG KIYEOK;Lo;0;L;;;;;N;;;;;
	// U+1161 HANGUL JUNGSEONG A;Lo;0;L;;;;;N;;;;;
	{"\u0B47\u0300\u0B3E", 8, "\u0B47\u0300\u0B3E"},
	{"\u1100\u0300\u1161", 8, "\u1100\u0300\u1161"},
	{"\u0B47\u0B3E", 6, "\u0B47\u0B3E"},
	{"\u1100\u1161", 6, "\u1100\u1161"},

	// U+0D4A MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
	// Sequence of decomposing characters that are starters and modifiers.
	{"\u0d4a" + strings.Repeat("\u0d3e", 31), 90, "\u0d46" + strings.Repeat("\u0d3e", 30) + cgj},

	{grave(30), 60, grave(30)},
	// U+FF9E is a starter, but decomposes to U+3099, which is not.
	{grave(30) + "\uff9e", 60, grave(30) + cgj},
	// ends with incomplete UTF-8 encoding
	{"\xCC", 0, ""},
	{"\u0300\xCC", 2, "\u0300"},
}
   148  
   149  func decomposeSegmentF(rb *reorderBuffer, s string) (int, []byte) {
   150  	rb.initString(NFD, s)
   151  	rb.setFlusher(nil, appendFlush)
   152  	p := decomposeSegment(rb, 0, true)
   153  	return p, rb.out
   154  }
   155  
   156  func TestDecomposeSegment(t *testing.T) {
   157  	runPosTests(t, "TestDecomposeSegment", NFC, decomposeSegmentF, decomposeSegmentTests)
   158  }
   159  
// firstBoundaryTests drives TestFirstBoundary. Each entry is {input, expected
// result of FirstBoundary / FirstBoundaryInString}; the buffer field is left
// empty because the adapters return no output bytes.
var firstBoundaryTests = []PositionTest{
	// no boundary
	{"", -1, ""},
	{"\u0300", -1, ""},
	{"\x80\x80", -1, ""},
	// illegal runes
	{"\xff", 0, ""},
	{"\u0300\xff", 2, ""},
	{"\u0300\xc0\x80\x80", 2, ""},
	// boundaries
	{"a", 0, ""},
	{"\u0300a", 2, ""},
	// Hangul
	{"\u1103\u1161", 0, ""},
	{"\u110B\u1173\u11B7", 0, ""},
	{"\u1161\u110B\u1173\u11B7", 3, ""},
	{"\u1173\u11B7\u1103\u1161", 6, ""},
	// too many combining characters.
	{grave(maxNonStarters - 1), -1, ""},
	{grave(maxNonStarters), 60, ""},
	{grave(maxNonStarters + 1), 60, ""},
}
   182  
   183  func firstBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
   184  	return rb.f.form.FirstBoundary([]byte(s)), nil
   185  }
   186  
   187  func firstBoundaryStringF(rb *reorderBuffer, s string) (int, []byte) {
   188  	return rb.f.form.FirstBoundaryInString(s), nil
   189  }
   190  
   191  func TestFirstBoundary(t *testing.T) {
   192  	runPosTests(t, "TestFirstBoundary", NFC, firstBoundaryF, firstBoundaryTests)
   193  	runPosTests(t, "TestFirstBoundaryInString", NFC, firstBoundaryStringF, firstBoundaryTests)
   194  }
   195  
   196  func TestNextBoundary(t *testing.T) {
   197  	testCases := []struct {
   198  		input string
   199  		atEOF bool
   200  		want  int
   201  	}{
   202  		// no boundary
   203  		{"", true, 0},
   204  		{"", false, -1},
   205  		{"\u0300", true, 2},
   206  		{"\u0300", false, -1},
   207  		{"\x80\x80", true, 1},
   208  		{"\x80\x80", false, 1},
   209  		// illegal runes
   210  		{"\xff", false, 1},
   211  		{"\u0300\xff", false, 2},
   212  		{"\u0300\xc0\x80\x80", false, 2},
   213  		{"\xc2\x80\x80", false, 2},
   214  		{"\xc2", false, -1},
   215  		{"\xc2", true, 1},
   216  		{"a\u0300\xc2", false, -1},
   217  		{"a\u0300\xc2", true, 3},
   218  		// boundaries
   219  		{"a", true, 1},
   220  		{"a", false, -1},
   221  		{"aa", false, 1},
   222  		{"\u0300", true, 2},
   223  		{"\u0300", false, -1},
   224  		{"\u0300a", false, 2},
   225  		// Hangul
   226  		{"\u1103\u1161", true, 6},
   227  		{"\u1103\u1161", false, -1},
   228  		{"\u110B\u1173\u11B7", false, -1},
   229  		{"\u110B\u1173\u11B7\u110B\u1173\u11B7", false, 9},
   230  		{"\u1161\u110B\u1173\u11B7", false, 3},
   231  		{"\u1173\u11B7\u1103\u1161", false, 6},
   232  		// too many combining characters.
   233  		{grave(maxNonStarters - 1), false, -1},
   234  		{grave(maxNonStarters), false, 60},
   235  		{grave(maxNonStarters + 1), false, 60},
   236  	}
   237  
   238  	for _, tc := range testCases {
   239  		if got := NFC.NextBoundary([]byte(tc.input), tc.atEOF); got != tc.want {
   240  			t.Errorf("NextBoundary(%+q, %v) = %d; want %d", tc.input, tc.atEOF, got, tc.want)
   241  		}
   242  		if got := NFC.NextBoundaryInString(tc.input, tc.atEOF); got != tc.want {
   243  			t.Errorf("NextBoundaryInString(%+q, %v) = %d; want %d", tc.input, tc.atEOF, got, tc.want)
   244  		}
   245  	}
   246  }
   247  
// decomposeToLastTests drives TestDecomposeToLastBoundary. Each entry is
// {input, expected len(rb.out) after decomposeToLastBoundary, expected bytes
// returned by the subsequent rb.flush}.
var decomposeToLastTests = []PositionTest{
	// ends with inert character
	{"Hello!", 6, ""},
	{"\u0632", 2, ""},
	{"a\u0301\u0635", 5, ""},
	// ends with non-inert starter
	{"a", 0, "a"},
	{"a\u0301a", 3, "a"},
	{"a\u0301\u03B9", 3, "\u03B9"},
	{"a\u0327", 0, "a\u0327"},
	// illegal runes
	{"\xFF", 1, ""},
	{"aa\xFF", 3, ""},
	{"\xC0\x80\x80", 3, ""},
	{"\xCC\x80\x80", 3, ""},
	// ends with incomplete UTF-8 encoding
	{"a\xCC", 2, ""},
	// ends with combining characters
	{"\u0300\u0301", 0, "\u0300\u0301"},
	{"a\u0300\u0301", 0, "a\u0300\u0301"},
	{"a\u0301\u0308", 0, "a\u0301\u0308"},
	{"a\u0308\u0301", 0, "a\u0308\u0301"},
	{"aaaa\u0300\u0301", 3, "a\u0300\u0301"},
	{"\u0300a\u0300\u0301", 2, "a\u0300\u0301"},
	{"\u00C0", 0, "A\u0300"},
	{"a\u00C0", 1, "A\u0300"},
	// decomposing
	{"a\u0300\u00E0", 3, "a\u0300"},
	// multisegment decompositions (flushes leading segments)
	{"a\u0300\uFDC0", 7, "\u064A"},
	{"\uFDC0" + grave(29), 4, "\u064A" + grave(29)},
	{"\uFDC0" + grave(30), 4, "\u064A" + grave(30)},
	{"\uFDC0" + grave(31), 5, grave(30)},
	{"\uFDFA" + grave(14), 31, "\u0645" + grave(14)},
	// Overflow
	{"\u00E0" + grave(29), 0, "a" + grave(30)},
	{"\u00E0" + grave(30), 2, grave(30)},
	// Hangul
	{"a\u1103", 1, "\u1103"},
	{"a\u110B", 1, "\u110B"},
	{"a\u110B\u1173", 1, "\u110B\u1173"},
	// See comment in composition.go:compBoundaryAfter.
	{"a\u110B\u1173\u11B7", 1, "\u110B\u1173\u11B7"},
	{"a\uC73C", 1, "\u110B\u1173"},
	{"다음", 3, "\u110B\u1173\u11B7"},
	{"다", 0, "\u1103\u1161"},
	{"\u1103\u1161\u110B\u1173\u11B7", 6, "\u110B\u1173\u11B7"},
	{"\u110B\u1173\u11B7\u1103\u1161", 9, "\u1103\u1161"},
	{"다음음", 6, "\u110B\u1173\u11B7"},
	{"음다다", 6, "\u1103\u1161"},
	// maximized buffer
	{"a" + grave(30), 0, "a" + grave(30)},
	// Buffer overflow
	{"a" + grave(31), 3, grave(30)},
	// weird UTF-8
	{"a\u0300\u11B7", 0, "a\u0300\u11B7"},
}
   305  
   306  func decomposeToLast(rb *reorderBuffer, s string) (int, []byte) {
   307  	rb.setFlusher([]byte(s), appendFlush)
   308  	decomposeToLastBoundary(rb)
   309  	buf := rb.flush(nil)
   310  	return len(rb.out), buf
   311  }
   312  
   313  func TestDecomposeToLastBoundary(t *testing.T) {
   314  	runPosTests(t, "TestDecomposeToLastBoundary", NFKC, decomposeToLast, decomposeToLastTests)
   315  }
   316  
// lastBoundaryTests drives TestLastBoundary. Each entry is {input, expected
// result of LastBoundary}; the buffer field is left empty because the adapter
// returns no output bytes.
var lastBoundaryTests = []PositionTest{
	// ends with inert character
	{"Hello!", 6, ""},
	{"\u0632", 2, ""},
	// ends with non-inert starter
	{"a", 0, ""},
	// illegal runes
	{"\xff", 1, ""},
	{"aa\xff", 3, ""},
	{"a\xff\u0300", 1, ""}, // TODO: should probably be 2.
	{"\xc0\x80\x80", 3, ""},
	{"\xc0\x80\x80\u0300", 3, ""},
	// ends with incomplete UTF-8 encoding
	{"\xCC", -1, ""},
	{"\xE0\x80", -1, ""},
	{"\xF0\x80\x80", -1, ""},
	{"a\xCC", 0, ""},
	{"\x80\xCC", 1, ""},
	{"\xCC\xCC", 1, ""},
	// ends with combining characters
	{"a\u0300\u0301", 0, ""},
	{"aaaa\u0300\u0301", 3, ""},
	{"\u0300a\u0300\u0301", 2, ""},
	{"\u00C2", 0, ""},
	{"a\u00C2", 1, ""},
	// decomposition may recombine
	{"\u0226", 0, ""},
	// no boundary
	{"", -1, ""},
	{"\u0300\u0301", -1, ""},
	{"\u0300", -1, ""},
	{"\x80\x80", -1, ""},
	{"\x80\x80\u0301", -1, ""},
	// Hangul
	{"다음", 3, ""},
	{"다", 0, ""},
	{"\u1103\u1161\u110B\u1173\u11B7", 6, ""},
	{"\u110B\u1173\u11B7\u1103\u1161", 9, ""},
	// too many combining characters.
	{grave(maxNonStarters - 1), -1, ""},
	// May still be preceded with a non-starter.
	{grave(maxNonStarters), -1, ""},
	// May still need to insert a cgj after the last combiner.
	{grave(maxNonStarters + 1), 2, ""},
	{grave(maxNonStarters + 2), 4, ""},

	{"a" + grave(maxNonStarters-1), 0, ""},
	{"a" + grave(maxNonStarters), 0, ""},
	// May still need to insert a cgj after the last combiner.
	{"a" + grave(maxNonStarters+1), 3, ""},
	{"a" + grave(maxNonStarters+2), 5, ""},
}
   369  
   370  func lastBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
   371  	return rb.f.form.LastBoundary([]byte(s)), nil
   372  }
   373  
   374  func TestLastBoundary(t *testing.T) {
   375  	runPosTests(t, "TestLastBoundary", NFC, lastBoundaryF, lastBoundaryTests)
   376  }
   377  
// spanTest is one table entry for runSpanTests: the arguments passed to
// Span / SpanString and the results they are expected to return.
type spanTest struct {
	input string // text passed to Span and SpanString
	atEOF bool   // atEOF argument passed alongside input
	n     int    // expected byte count returned
	err   error  // expected error returned (compared with ==)
}
   384  
// quickSpanTests holds the Span cases that TestSpan runs for both NFD and
// NFC. Each entry is {input, atEOF, expected n, expected err}.
var quickSpanTests = []spanTest{
	{"", true, 0, nil},
	// starters
	{"a", true, 1, nil},
	{"abc", true, 3, nil},
	{"\u043Eb", true, 3, nil},
	// incomplete last rune.
	{"\xCC", true, 1, nil},
	{"\xCC", false, 0, transform.ErrShortSrc},
	{"a\xCC", true, 2, nil},
	{"a\xCC", false, 0, transform.ErrShortSrc}, // TODO: could be 1 for NFD
	// incorrectly ordered combining characters
	{"\u0300\u0316", true, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316", false, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316cd", true, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316cd", false, 0, transform.ErrEndOfSpan},
	// have a maximum number of combining characters.
	{rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"a" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"Ɵ" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"aa" + rep(0x035D, 30) + "\u035B", true, 1, transform.ErrEndOfSpan},
	{rep(0x035D, 30) + cgj + "\u035B", true, 64, nil},
	{"a" + rep(0x035D, 30) + cgj + "\u035B", true, 65, nil},
	{"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil},
	{"aa" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil},

	{"a" + rep(0x035D, 30) + cgj + "\u035B", false, 61, transform.ErrShortSrc},
	{"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc},
	{"aa" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc},
}
   415  
// quickSpanNFDTests holds the Span cases that TestSpan runs for NFD only.
// Each entry is {input, atEOF, expected n, expected err}.
var quickSpanNFDTests = []spanTest{
	// needs decomposing
	{"\u00C0", true, 0, transform.ErrEndOfSpan},
	{"abc\u00C0", true, 3, transform.ErrEndOfSpan},
	// correctly ordered combining characters
	{"\u0300", true, 2, nil},
	{"ab\u0300", true, 4, nil},
	{"ab\u0300cd", true, 6, nil},
	{"\u0300cd", true, 4, nil},
	{"\u0316\u0300", true, 4, nil},
	{"ab\u0316\u0300", true, 6, nil},
	{"ab\u0316\u0300cd", true, 8, nil},
	{"ab\u0316\u0300\u00C0", true, 6, transform.ErrEndOfSpan},
	{"\u0316\u0300cd", true, 6, nil},
	{"\u043E\u0308b", true, 5, nil},
	// incorrectly ordered combining characters
	{"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan}, // TODO: we could skip 'b' as well.
	{"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan},
	// Hangul
	{"같은", true, 0, transform.ErrEndOfSpan},
}
   437  
// quickSpanNFCTests holds the Span cases that TestSpan runs for NFC only.
// Each entry is {input, atEOF, expected n, expected err}.
var quickSpanNFCTests = []spanTest{
	// okay composed
	{"\u00C0", true, 2, nil},
	{"abc\u00C0", true, 5, nil},
	// correctly ordered combining characters
	// TODO: b may combine with modifiers, which is why this fails. We could
	// make a more precise test that actually checks whether last
	// characters combines. Probably not worth it.
	{"ab\u0300", true, 1, transform.ErrEndOfSpan},
	{"ab\u0300cd", true, 1, transform.ErrEndOfSpan},
	{"ab\u0316\u0300", true, 1, transform.ErrEndOfSpan},
	{"ab\u0316\u0300cd", true, 1, transform.ErrEndOfSpan},
	{"\u00C0\u035D", true, 4, nil},
	// we do not special case leading combining characters
	{"\u0300cd", true, 0, transform.ErrEndOfSpan},
	{"\u0300", true, 0, transform.ErrEndOfSpan},
	{"\u0316\u0300", true, 0, transform.ErrEndOfSpan},
	{"\u0316\u0300cd", true, 0, transform.ErrEndOfSpan},
	// incorrectly ordered combining characters
	{"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan},
	{"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan},
	// Hangul
	{"같은", true, 6, nil},
	{"같은", false, 3, transform.ErrShortSrc},
	// We return the start of the violating segment in case of overflow.
	{grave(30) + "\uff9e", true, 0, transform.ErrEndOfSpan},
	{grave(30), true, 0, transform.ErrEndOfSpan},
}
   466  
   467  func runSpanTests(t *testing.T, name string, f Form, testCases []spanTest) {
   468  	for i, tc := range testCases {
   469  		s := fmt.Sprintf("Bytes/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF)
   470  		ok := testtext.Run(t, s, func(t *testing.T) {
   471  			n, err := f.Span([]byte(tc.input), tc.atEOF)
   472  			if n != tc.n || err != tc.err {
   473  				t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err)
   474  			}
   475  		})
   476  		if !ok {
   477  			continue // Don't do the String variant if the Bytes variant failed.
   478  		}
   479  		s = fmt.Sprintf("String/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF)
   480  		testtext.Run(t, s, func(t *testing.T) {
   481  			n, err := f.SpanString(tc.input, tc.atEOF)
   482  			if n != tc.n || err != tc.err {
   483  				t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err)
   484  			}
   485  		})
   486  	}
   487  }
   488  
   489  func TestSpan(t *testing.T) {
   490  	runSpanTests(t, "NFD", NFD, quickSpanTests)
   491  	runSpanTests(t, "NFD", NFD, quickSpanNFDTests)
   492  	runSpanTests(t, "NFC", NFC, quickSpanTests)
   493  	runSpanTests(t, "NFC", NFC, quickSpanNFCTests)
   494  }
   495  
// isNormalTests holds the IsNormal cases run for every form. The pos field is
// 1 when the input is expected to be reported as normalized and 0 otherwise
// (see isNormalF); the buffer field is unused.
var isNormalTests = []PositionTest{
	{"", 1, ""},
	// illegal runes
	{"\xff", 1, ""},
	// starters
	{"a", 1, ""},
	{"abc", 1, ""},
	{"\u043Eb", 1, ""},
	// incorrectly ordered combining characters
	{"\u0300\u0316", 0, ""},
	{"ab\u0300\u0316", 0, ""},
	{"ab\u0300\u0316cd", 0, ""},
	{"\u0300\u0316cd", 0, ""},
}
// isNormalNFDTests holds the IsNormal cases run for the decomposing forms
// (NFD and NFKD). pos is 1 for "normalized" and 0 otherwise.
var isNormalNFDTests = []PositionTest{
	// needs decomposing
	{"\u00C0", 0, ""},
	{"abc\u00C0", 0, ""},
	// correctly ordered combining characters
	{"\u0300", 1, ""},
	{"ab\u0300", 1, ""},
	{"ab\u0300cd", 1, ""},
	{"\u0300cd", 1, ""},
	{"\u0316\u0300", 1, ""},
	{"ab\u0316\u0300", 1, ""},
	{"ab\u0316\u0300cd", 1, ""},
	{"\u0316\u0300cd", 1, ""},
	{"\u043E\u0308b", 1, ""},
	// Hangul
	{"같은", 0, ""},
}
// isNormalNFCTests holds the IsNormal cases run for the composing forms
// (NFC and NFKC). pos is 1 for "normalized" and 0 otherwise.
var isNormalNFCTests = []PositionTest{
	// okay composed
	{"\u00C0", 1, ""},
	{"abc\u00C0", 1, ""},
	// need reordering
	{"a\u0300", 0, ""},
	{"a\u0300cd", 0, ""},
	{"a\u0316\u0300", 0, ""},
	{"a\u0316\u0300cd", 0, ""},
	// correctly ordered combining characters
	{"ab\u0300", 1, ""},
	{"ab\u0300cd", 1, ""},
	{"ab\u0316\u0300", 1, ""},
	{"ab\u0316\u0300cd", 1, ""},
	{"\u00C0\u035D", 1, ""},
	{"\u0300", 1, ""},
	{"\u0316\u0300cd", 1, ""},
	// Hangul
	{"같은", 1, ""},
}
   547  
// isNormalNFKXTests holds the IsNormal cases run only for the compatibility
// forms (NFKD and NFKC). pos is 1 for "normalized" and 0 otherwise.
var isNormalNFKXTests = []PositionTest{
	// Special case.
	{"\u00BC", 0, ""},
}
   552  
   553  func isNormalF(rb *reorderBuffer, s string) (int, []byte) {
   554  	if rb.f.form.IsNormal([]byte(s)) {
   555  		return 1, nil
   556  	}
   557  	return 0, nil
   558  }
   559  
   560  func isNormalStringF(rb *reorderBuffer, s string) (int, []byte) {
   561  	if rb.f.form.IsNormalString(s) {
   562  		return 1, nil
   563  	}
   564  	return 0, nil
   565  }
   566  
   567  func TestIsNormal(t *testing.T) {
   568  	runPosTests(t, "TestIsNormalNFD1", NFD, isNormalF, isNormalTests)
   569  	runPosTests(t, "TestIsNormalNFD2", NFD, isNormalF, isNormalNFDTests)
   570  	runPosTests(t, "TestIsNormalNFC1", NFC, isNormalF, isNormalTests)
   571  	runPosTests(t, "TestIsNormalNFC2", NFC, isNormalF, isNormalNFCTests)
   572  	runPosTests(t, "TestIsNormalNFKD1", NFKD, isNormalF, isNormalTests)
   573  	runPosTests(t, "TestIsNormalNFKD2", NFKD, isNormalF, isNormalNFDTests)
   574  	runPosTests(t, "TestIsNormalNFKD3", NFKD, isNormalF, isNormalNFKXTests)
   575  	runPosTests(t, "TestIsNormalNFKC1", NFKC, isNormalF, isNormalTests)
   576  	runPosTests(t, "TestIsNormalNFKC2", NFKC, isNormalF, isNormalNFCTests)
   577  	runPosTests(t, "TestIsNormalNFKC3", NFKC, isNormalF, isNormalNFKXTests)
   578  }
   579  
   580  func TestIsNormalString(t *testing.T) {
   581  	runPosTests(t, "TestIsNormalNFD1", NFD, isNormalStringF, isNormalTests)
   582  	runPosTests(t, "TestIsNormalNFD2", NFD, isNormalStringF, isNormalNFDTests)
   583  	runPosTests(t, "TestIsNormalNFC1", NFC, isNormalStringF, isNormalTests)
   584  	runPosTests(t, "TestIsNormalNFC2", NFC, isNormalStringF, isNormalNFCTests)
   585  }
   586  
// AppendTest is one table entry for runAppendTests: right is appended to left
// by the function under test, and the result is compared with out.
type AppendTest struct {
	left  string // initial contents of the output buffer
	right string // text passed to the append function
	out   string // expected combined result
}
   592  
// appendFunc is the shape of a function exercised by runAppendTests: given a
// form, an existing output buffer and a string, it returns the resulting
// bytes.
type appendFunc func(f Form, out []byte, s string) []byte
   594  
// fstr maps a Form value, used as an index, to its name for subtest labels.
var fstr = []string{"NFC", "NFD", "NFKC", "NFKD"}
   596  
   597  func runNormTests(t *testing.T, name string, fn appendFunc) {
   598  	for f := NFC; f <= NFKD; f++ {
   599  		runAppendTests(t, name, f, fn, normTests[f])
   600  	}
   601  }
   602  
   603  func runAppendTests(t *testing.T, name string, f Form, fn appendFunc, tests []AppendTest) {
   604  	for i, test := range tests {
   605  		t.Run(fmt.Sprintf("%s/%d", fstr[f], i), func(t *testing.T) {
   606  			id := pc(test.left + test.right)
   607  			if *testn >= 0 && i != *testn {
   608  				return
   609  			}
   610  			t.Run("fn", func(t *testing.T) {
   611  				out := []byte(test.left)
   612  				have := string(fn(f, out, test.right))
   613  				if len(have) != len(test.out) {
   614  					t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(test.out), pc(have), pc(test.out))
   615  				}
   616  				if have != test.out {
   617  					k, pf := pidx(have, test.out)
   618  					t.Errorf("%+q:\nwas  %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(test.out[k:]))
   619  				}
   620  			})
   621  
   622  			// Bootstrap by normalizing input. Ensures that the various variants
   623  			// behave the same.
   624  			for g := NFC; g <= NFKD; g++ {
   625  				if f == g {
   626  					continue
   627  				}
   628  				t.Run(fstr[g], func(t *testing.T) {
   629  					want := g.String(test.left + test.right)
   630  					have := string(fn(g, g.AppendString(nil, test.left), test.right))
   631  					if len(have) != len(want) {
   632  						t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(want), pc(have), pc(want))
   633  					}
   634  					if have != want {
   635  						k, pf := pidx(have, want)
   636  						t.Errorf("%+q:\nwas  %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(want[k:]))
   637  					}
   638  				})
   639  			}
   640  		})
   641  	}
   642  }
   643  
// normTests lists the append tables in the order NFC, NFD, NFKC, NFKD.
// runNormTests indexes it by Form value.
var normTests = [][]AppendTest{
	appendTestsNFC,
	appendTestsNFD,
	appendTestsNFKC,
	appendTestsNFKD,
}
   650  
// appendTestsNFC holds the append cases for NFC. Each entry is
// {left, right, expected result of appending right to left}.
var appendTestsNFC = []AppendTest{
	{"", ascii, ascii},
	{"", txt_all, txt_all},
	{"\uff9e", grave(30), "\uff9e" + grave(29) + cgj + grave(1)},
	{grave(30), "\uff9e", grave(30) + cgj + "\uff9e"},

	// Tests designed for Iter.
	{ // ordering of non-composing combining characters
		"",
		"\u0305\u0316",
		"\u0316\u0305",
	},
	{ // segment overflow
		"",
		"a" + rep(0x0305, maxNonStarters+4) + "\u0316",
		"a" + rep(0x0305, maxNonStarters) + cgj + "\u0316" + rep(0x305, 4),
	},

	{ // Combine across non-blocking non-starters.
		// U+0327 COMBINING CEDILLA;Mn;202;NSM;;;;;N;NON-SPACING CEDILLA;;;;
		// U+0325 COMBINING RING BELOW;Mn;220;NSM;;;;;N;NON-SPACING RING BELOW;;;;
		"", "a\u0327\u0325", "\u1e01\u0327",
	},

	{ // Jamo V+T does not combine.
		"",
		"\u1161\u11a8",
		"\u1161\u11a8",
	},

	// Stability tests: see https://www.unicode.org/review/pr-29.html.
	{"", "\u0b47\u0300\u0b3e", "\u0b47\u0300\u0b3e"},
	{"", "\u1100\u0300\u1161", "\u1100\u0300\u1161"},
	{"", "\u0b47\u0b3e", "\u0b4b"},
	{"", "\u1100\u1161", "\uac00"},

	// U+0D4A MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
	{ // 0d4a starts a new segment.
		"",
		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
	},

	{ // Split combining characters.
		// TODO: don't insert CGJ before starters.
		"",
		"\u0d46" + strings.Repeat("\u0d3e", 31),
		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
	},

	{ // Split combining characters.
		"",
		"\u0d4a" + strings.Repeat("\u0d3e", 30),
		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
	},

	{ //  https://golang.org/issues/20079
		"",
		"\xeb\u0344",
		"\xeb\u0308\u0301",
	},

	{ //  https://golang.org/issues/20079
		"",
		"\uac00" + strings.Repeat("\u0300", 30),
		"\uac00" + strings.Repeat("\u0300", 29) + "\u034f\u0300",
	},

	{ //  https://golang.org/issues/20079
		"",
		"\xeb" + strings.Repeat("\u0300", 31),
		"\xeb" + strings.Repeat("\u0300", 30) + "\u034f\u0300",
	},
}
   725  
// appendTestsNFD holds the append cases for NFD. It is currently empty; the
// slot is still needed so that normTests can be indexed by Form value.
var appendTestsNFD = []AppendTest{
	// TODO: Move some of the tests here.
}
   729  
// appendTestsNFKC holds the append cases for NFKC. Each entry is
// {left, right, expected result of appending right to left}.
var appendTestsNFKC = []AppendTest{
	// empty buffers
	{"", "", ""},
	{"a", "", "a"},
	{"", "a", "a"},
	{"", "\u0041\u0307\u0304", "\u01E0"},
	// segment split across buffers
	{"", "a\u0300b", "\u00E0b"},
	{"a", "\u0300b", "\u00E0b"},
	{"a", "\u0300\u0316", "\u00E0\u0316"},
	{"a", "\u0316\u0300", "\u00E0\u0316"},
	{"a", "\u0300a\u0300", "\u00E0\u00E0"},
	{"a", "\u0300a\u0300a\u0300", "\u00E0\u00E0\u00E0"},
	{"a", "\u0300aaa\u0300aaa\u0300", "\u00E0aa\u00E0aa\u00E0"},
	{"a\u0300", "\u0327", "\u00E0\u0327"},
	{"a\u0327", "\u0300", "\u00E0\u0327"},
	{"a\u0316", "\u0300", "\u00E0\u0316"},
	{"\u0041\u0307", "\u0304", "\u01E0"},
	// Hangul
	{"", "\u110B\u1173", "\uC73C"},
	{"", "\u1103\u1161", "\uB2E4"},
	{"", "\u110B\u1173\u11B7", "\uC74C"},
	{"", "\u320E", "\x28\uAC00\x29"},
	{"", "\x28\u1100\u1161\x29", "\x28\uAC00\x29"},
	{"\u1103", "\u1161", "\uB2E4"},
	{"\u110B", "\u1173\u11B7", "\uC74C"},
	{"\u110B\u1173", "\u11B7", "\uC74C"},
	{"\uC73C", "\u11B7", "\uC74C"},
	// UTF-8 encoding split across buffers
	{"a\xCC", "\x80", "\u00E0"},
	{"a\xCC", "\x80b", "\u00E0b"},
	{"a\xCC", "\x80a\u0300", "\u00E0\u00E0"},
	{"a\xCC", "\x80\x80", "\u00E0\x80"},
	{"a\xCC", "\x80\xCC", "\u00E0\xCC"},
	{"a\u0316\xCC", "\x80a\u0316\u0300", "\u00E0\u0316\u00E0\u0316"},
	// ending in incomplete UTF-8 encoding
	{"", "\xCC", "\xCC"},
	{"a", "\xCC", "a\xCC"},
	{"a", "b\xCC", "ab\xCC"},
	{"\u0226", "\xCC", "\u0226\xCC"},
	// illegal runes
	{"", "\x80", "\x80"},
	{"", "\x80\x80\x80", "\x80\x80\x80"},
	{"", "\xCC\x80\x80\x80", "\xCC\x80\x80\x80"},
	{"", "a\x80", "a\x80"},
	{"", "a\x80\x80\x80", "a\x80\x80\x80"},
	{"", "a\x80\x80\x80\x80\x80\x80", "a\x80\x80\x80\x80\x80\x80"},
	{"a", "\x80\x80\x80", "a\x80\x80\x80"},
	// overflow
	{"", strings.Repeat("\x80", 33), strings.Repeat("\x80", 33)},
	{strings.Repeat("\x80", 33), "", strings.Repeat("\x80", 33)},
	{strings.Repeat("\x80", 33), strings.Repeat("\x80", 33), strings.Repeat("\x80", 66)},
	// overflow of combining characters
	{"", grave(34), grave(30) + cgj + grave(4)},
	{"", grave(36), grave(30) + cgj + grave(6)},
	{grave(29), grave(5), grave(30) + cgj + grave(4)},
	{grave(30), grave(4), grave(30) + cgj + grave(4)},
	{grave(30), grave(3), grave(30) + cgj + grave(3)},
	{grave(30) + "\xCC", "\x80", grave(30) + cgj + grave(1)},
	{"", "\uFDFA" + grave(14), "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645" + grave(14)},
	{"", "\uFDFA" + grave(28) + "\u0316", "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645\u0316" + grave(28)},
	// - First rune has a trailing non-starter.
	{"\u00d5", grave(30), "\u00d5" + grave(29) + cgj + grave(1)},
	// - U+FF9E decomposes into a non-starter in compatibility mode. A CGJ must be
	//   inserted even when FF9E starts a new segment.
	{"\uff9e", grave(30), "\u3099" + grave(29) + cgj + grave(1)},
	{grave(30), "\uff9e", grave(30) + cgj + "\u3099"},
	// - Many non-starter decompositions in a row causing overflow.
	{"", rep(0x340, 31), rep(0x300, 30) + cgj + "\u0300"},
	{"", rep(0xFF9E, 31), rep(0x3099, 30) + cgj + "\u3099"},

	{"", "\u0644\u0625" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + "\u0300\u0300"},
	{"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)},
	// NOTE(review): the next entry is identical to the one above.
	{"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)},

	// U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers.
	{"", "\u0f7f" + rep(0xf71, 29) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80"},
	{"", "\u0f7f" + rep(0xf71, 28) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + "\u0f80"},
	{"", "\u0f7f" + rep(0xf81, 16), "\u0f7f" + rep(0xf71, 15) + rep(0xf80, 15) + cgj + "\u0f71\u0f80"},

	// weird UTF-8
	{"\u00E0\xE1", "\x86", "\u00E0\xE1\x86"},
	{"a\u0300\u11B7", "\u0300", "\u00E0\u11B7\u0300"},
	{"a\u0300\u11B7\u0300", "\u0300", "\u00E0\u11B7\u0300\u0300"},
	{"\u0300", "\xF8\x80\x80\x80\x80\u0300", "\u0300\xF8\x80\x80\x80\x80\u0300"},
	{"\u0300", "\xFC\x80\x80\x80\x80\x80\u0300", "\u0300\xFC\x80\x80\x80\x80\x80\u0300"},
	{"\xF8\x80\x80\x80\x80\u0300", "\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
	{"\xFC\x80\x80\x80\x80\x80\u0300", "\u0300", "\xFC\x80\x80\x80\x80\x80\u0300\u0300"},
	{"\xF8\x80\x80\x80", "\x80\u0300\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},

	{"", strings.Repeat("a\u0316\u0300", 6), strings.Repeat("\u00E0\u0316", 6)},
	// large input.
	{"", strings.Repeat("a\u0300\u0316", 31), strings.Repeat("\u00E0\u0316", 31)},
	{"", strings.Repeat("a\u0300\u0316", 4000), strings.Repeat("\u00E0\u0316", 4000)},
	{"", strings.Repeat("\x80\x80", 4000), strings.Repeat("\x80\x80", 4000)},
	{"", "\u0041\u0307\u0304", "\u01E0"},
}
   827  
   828  var appendTestsNFKD = []AppendTest{
   829  	{"", "a" + grave(64), "a" + grave(30) + cgj + grave(30) + cgj + grave(4)},
   830  
   831  	{ // segment overflow on unchanged character
   832  		"",
   833  		"a" + grave(64) + "\u0316",
   834  		"a" + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(4),
   835  	},
   836  	{ // segment overflow on unchanged character + start value
   837  		"",
   838  		"a" + grave(98) + "\u0316",
   839  		"a" + grave(30) + cgj + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(8),
   840  	},
   841  	{ // segment overflow on decomposition. (U+0340 decomposes to U+0300.)
   842  		"",
   843  		"a" + grave(59) + "\u0340",
   844  		"a" + grave(30) + cgj + grave(30),
   845  	},
   846  	{ // segment overflow on non-starter decomposition
   847  		"",
   848  		"a" + grave(33) + "\u0340" + grave(30) + "\u0320",
   849  		"a" + grave(30) + cgj + grave(30) + cgj + "\u0320" + grave(4),
   850  	},
   851  	{ // start value after ASCII overflow
   852  		"",
   853  		rep('a', segSize) + grave(32) + "\u0320",
   854  		rep('a', segSize) + grave(30) + cgj + "\u0320" + grave(2),
   855  	},
   856  	{ // Jamo overflow
   857  		"",
   858  		"\u1100\u1161" + grave(30) + "\u0320" + grave(2),
   859  		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
   860  	},
   861  	{ // Hangul
   862  		"",
   863  		"\uac00",
   864  		"\u1100\u1161",
   865  	},
   866  	{ // Hangul overflow
   867  		"",
   868  		"\uac00" + grave(32) + "\u0320",
   869  		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
   870  	},
   871  	{ // Hangul overflow in Hangul mode.
   872  		"",
   873  		"\uac00\uac00" + grave(32) + "\u0320",
   874  		"\u1100\u1161\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
   875  	},
   876  	{ // Hangul overflow in Hangul mode.
   877  		"",
   878  		strings.Repeat("\uac00", 3) + grave(32) + "\u0320",
   879  		strings.Repeat("\u1100\u1161", 3) + grave(29) + cgj + "\u0320" + grave(3),
   880  	},
   881  	{ // start value after cc=0
   882  		"",
   883  		"您您" + grave(34) + "\u0320",
   884  		"您您" + grave(30) + cgj + "\u0320" + grave(4),
   885  	},
   886  	{ // start value after normalization
   887  		"",
   888  		"\u0300\u0320a" + grave(34) + "\u0320",
   889  		"\u0320\u0300a" + grave(30) + cgj + "\u0320" + grave(4),
   890  	},
   891  	{
   892  		// U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers.
   893  		"",
   894  		"a\u0f7f" + rep(0xf71, 29) + "\u0f81",
   895  		"a\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80",
   896  	},
   897  }
   898  
   899  func TestAppend(t *testing.T) {
   900  	runNormTests(t, "Append", func(f Form, out []byte, s string) []byte {
   901  		return f.Append(out, []byte(s)...)
   902  	})
   903  }
   904  
   905  func TestAppendString(t *testing.T) {
   906  	runNormTests(t, "AppendString", func(f Form, out []byte, s string) []byte {
   907  		return f.AppendString(out, s)
   908  	})
   909  }
   910  
   911  func TestBytes(t *testing.T) {
   912  	runNormTests(t, "Bytes", func(f Form, out []byte, s string) []byte {
   913  		buf := []byte{}
   914  		buf = append(buf, out...)
   915  		buf = append(buf, s...)
   916  		return f.Bytes(buf)
   917  	})
   918  }
   919  
   920  func TestString(t *testing.T) {
   921  	runNormTests(t, "String", func(f Form, out []byte, s string) []byte {
   922  		outs := string(out) + s
   923  		return []byte(f.String(outs))
   924  	})
   925  }
   926  
   927  func runNM(code string) (string, error) {
   928  	// Write the file.
   929  	tmpdir, err := os.MkdirTemp(os.TempDir(), "normalize_test")
   930  	if err != nil {
   931  		return "", fmt.Errorf("failed to create tmpdir: %v", err)
   932  	}
   933  	defer os.RemoveAll(tmpdir)
   934  	goTool := filepath.Join(runtime.GOROOT(), "bin", "go")
   935  	filename := filepath.Join(tmpdir, "main.go")
   936  	if err := os.WriteFile(filename, []byte(code), 0644); err != nil {
   937  		return "", fmt.Errorf("failed to write main.go: %v", err)
   938  	}
   939  	outputFile := filepath.Join(tmpdir, "main")
   940  
   941  	// Build the binary.
   942  	out, err := exec.Command(goTool, "build", "-o", outputFile, filename).CombinedOutput()
   943  	if err != nil {
   944  		return "", fmt.Errorf("failed to execute command: %v", err)
   945  	}
   946  
   947  	// Get the symbols.
   948  	out, err = exec.Command(goTool, "tool", "nm", outputFile).CombinedOutput()
   949  	return string(out), err
   950  }
   951  
   952  func TestLinking(t *testing.T) {
   953  	const prog = `
   954  	package main
   955  	import "fmt"
   956  	import "golang.org/x/text/unicode/norm"
   957  	func main() { fmt.Println(norm.%s) }
   958  	`
   959  
   960  	baseline, errB := runNM(fmt.Sprintf(prog, "MaxSegmentSize"))
   961  	withTables, errT := runNM(fmt.Sprintf(prog, `NFC.String("")`))
   962  	if errB != nil || errT != nil {
   963  		t.Skipf("TestLinking failed: %v and %v", errB, errT)
   964  	}
   965  
   966  	symbols := []string{"norm.formTable", "norm.nfkcValues", "norm.decomps"}
   967  	for _, symbol := range symbols {
   968  		if strings.Contains(baseline, symbol) {
   969  			t.Errorf("found: %q unexpectedly", symbol)
   970  		}
   971  		if !strings.Contains(withTables, symbol) {
   972  			t.Errorf("didn't find: %q unexpectedly", symbol)
   973  		}
   974  	}
   975  }
   976  
   977  func appendBench(f Form, in []byte) func() {
   978  	buf := make([]byte, 0, 4*len(in))
   979  	return func() {
   980  		f.Append(buf, in...)
   981  	}
   982  }
   983  
   984  func bytesBench(f Form, in []byte) func() {
   985  	return func() {
   986  		f.Bytes(in)
   987  	}
   988  }
   989  
   990  func iterBench(f Form, in []byte) func() {
   991  	iter := Iter{}
   992  	return func() {
   993  		iter.Init(f, in)
   994  		for !iter.Done() {
   995  			iter.Next()
   996  		}
   997  	}
   998  }
   999  
  1000  func transformBench(f Form, in []byte) func() {
  1001  	buf := make([]byte, 4*len(in))
  1002  	return func() {
  1003  		if _, n, err := f.Transform(buf, in, true); err != nil || len(in) != n {
  1004  			log.Panic(n, len(in), err)
  1005  		}
  1006  	}
  1007  }
  1008  
  1009  func readerBench(f Form, in []byte) func() {
  1010  	buf := make([]byte, 4*len(in))
  1011  	return func() {
  1012  		r := f.Reader(bytes.NewReader(in))
  1013  		var err error
  1014  		for err == nil {
  1015  			_, err = r.Read(buf)
  1016  		}
  1017  		if err != io.EOF {
  1018  			panic("")
  1019  		}
  1020  	}
  1021  }
  1022  
  1023  func writerBench(f Form, in []byte) func() {
  1024  	buf := make([]byte, 0, 4*len(in))
  1025  	return func() {
  1026  		r := f.Writer(bytes.NewBuffer(buf))
  1027  		if _, err := r.Write(in); err != nil {
  1028  			panic("")
  1029  		}
  1030  	}
  1031  }
  1032  
  1033  func appendBenchmarks(bm []func(), f Form, in []byte) []func() {
  1034  	bm = append(bm, appendBench(f, in))
  1035  	bm = append(bm, iterBench(f, in))
  1036  	bm = append(bm, transformBench(f, in))
  1037  	bm = append(bm, readerBench(f, in))
  1038  	bm = append(bm, writerBench(f, in))
  1039  	return bm
  1040  }
  1041  
  1042  func doFormBenchmark(b *testing.B, inf, f Form, s string) {
  1043  	b.StopTimer()
  1044  	in := inf.Bytes([]byte(s))
  1045  	bm := appendBenchmarks(nil, f, in)
  1046  	b.SetBytes(int64(len(in) * len(bm)))
  1047  	b.StartTimer()
  1048  	for i := 0; i < b.N; i++ {
  1049  		for _, fn := range bm {
  1050  			fn()
  1051  		}
  1052  	}
  1053  }
  1054  
  1055  func doSingle(b *testing.B, f func(Form, []byte) func(), s []byte) {
  1056  	b.StopTimer()
  1057  	fn := f(NFC, s)
  1058  	b.SetBytes(int64(len(s)))
  1059  	b.StartTimer()
  1060  	for i := 0; i < b.N; i++ {
  1061  		fn()
  1062  	}
  1063  }
  1064  
// Inputs for the single-operation benchmarks in this file.
//
//   - smallNoChange is a short word with a non-ASCII letter; it is the input of
//     the "NoChange" benchmarks.
//     NOTE(review): presumably already in NFC (precomposed ö) — confirm.
//   - smallChange is a similar word in which 'o' is followed by U+0308, written
//     as an escape; it is the input of the "Change" benchmarks. It also starts
//     with an upper-case letter, unlike smallNoChange.
//   - ascii is an ASCII-only sentence repeated 500 times.
var (
	smallNoChange = []byte("nörmalization")
	smallChange   = []byte("No\u0308rmalization")
	ascii         = strings.Repeat("There is nothing to change here! ", 500)
)
  1070  
  1071  func lowerBench(f Form, in []byte) func() {
  1072  	// Use package strings instead of bytes as it doesn't allocate memory
  1073  	// if there aren't any changes.
  1074  	s := string(in)
  1075  	return func() {
  1076  		strings.ToLower(s)
  1077  	}
  1078  }
  1079  
// BenchmarkLowerCaseNoChange measures strings.ToLower (via lowerBench) on
// smallNoChange, which contains no upper-case letters.
// NOTE(review): presumably a baseline to compare normalization cost against —
// confirm.
func BenchmarkLowerCaseNoChange(b *testing.B) {
	doSingle(b, lowerBench, smallNoChange)
}

// BenchmarkLowerCaseChange measures strings.ToLower (via lowerBench) on
// smallChange, which starts with an upper-case letter.
func BenchmarkLowerCaseChange(b *testing.B) {
	doSingle(b, lowerBench, smallChange)
}
  1086  
  1087  func quickSpanBench(f Form, in []byte) func() {
  1088  	return func() {
  1089  		f.QuickSpan(in)
  1090  	}
  1091  }
  1092  
// BenchmarkQuickSpanChangeNFC measures NFC.QuickSpan (via quickSpanBench and
// doSingle) on smallNoChange.
// NOTE(review): the name says "Change" but the input is smallNoChange —
// confirm whether smallChange was intended or the benchmark is misnamed.
func BenchmarkQuickSpanChangeNFC(b *testing.B) {
	doSingle(b, quickSpanBench, smallNoChange)
}
  1096  
// BenchmarkBytesNoChangeNFC measures NFC.Bytes (via bytesBench) on
// smallNoChange.
func BenchmarkBytesNoChangeNFC(b *testing.B) {
	doSingle(b, bytesBench, smallNoChange)
}

// BenchmarkBytesChangeNFC measures NFC.Bytes (via bytesBench) on smallChange.
func BenchmarkBytesChangeNFC(b *testing.B) {
	doSingle(b, bytesBench, smallChange)
}
  1103  
// BenchmarkAppendNoChangeNFC measures NFC.Append (via appendBench) on
// smallNoChange.
func BenchmarkAppendNoChangeNFC(b *testing.B) {
	doSingle(b, appendBench, smallNoChange)
}

// BenchmarkAppendChangeNFC measures NFC.Append (via appendBench) on
// smallChange.
func BenchmarkAppendChangeNFC(b *testing.B) {
	doSingle(b, appendBench, smallChange)
}

// BenchmarkAppendLargeNFC measures NFC.Append (via appendBench) on
// txt_all_bytes, the concatenation of all sample texts.
func BenchmarkAppendLargeNFC(b *testing.B) {
	doSingle(b, appendBench, txt_all_bytes)
}
  1113  
// BenchmarkIterNoChangeNFC measures iterating with an NFC Iter (via
// iterBench) over smallNoChange.
func BenchmarkIterNoChangeNFC(b *testing.B) {
	doSingle(b, iterBench, smallNoChange)
}

// BenchmarkIterChangeNFC measures iterating with an NFC Iter (via iterBench)
// over smallChange.
func BenchmarkIterChangeNFC(b *testing.B) {
	doSingle(b, iterBench, smallChange)
}

// BenchmarkIterLargeNFC measures iterating with an NFC Iter (via iterBench)
// over txt_all_bytes, the concatenation of all sample texts.
func BenchmarkIterLargeNFC(b *testing.B) {
	doSingle(b, iterBench, txt_all_bytes)
}
  1123  
// BenchmarkTransformNoChangeNFC measures NFC.Transform (via transformBench)
// on smallNoChange.
func BenchmarkTransformNoChangeNFC(b *testing.B) {
	doSingle(b, transformBench, smallNoChange)
}

// BenchmarkTransformChangeNFC measures NFC.Transform (via transformBench) on
// smallChange.
func BenchmarkTransformChangeNFC(b *testing.B) {
	doSingle(b, transformBench, smallChange)
}

// BenchmarkTransformLargeNFC measures NFC.Transform (via transformBench) on
// txt_all_bytes, the concatenation of all sample texts.
func BenchmarkTransformLargeNFC(b *testing.B) {
	doSingle(b, transformBench, txt_all_bytes)
}
  1133  
// The BenchmarkNormalizeAscii* benchmarks run the benchmark set built by
// appendBenchmarks over the ASCII-only text ascii, once per target form. The
// input is first passed through NFC.Bytes by doFormBenchmark.

// BenchmarkNormalizeAsciiNFC uses NFC as the target form.
func BenchmarkNormalizeAsciiNFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, ascii)
}

// BenchmarkNormalizeAsciiNFD uses NFD as the target form.
func BenchmarkNormalizeAsciiNFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, ascii)
}

// BenchmarkNormalizeAsciiNFKC uses NFKC as the target form.
func BenchmarkNormalizeAsciiNFKC(b *testing.B) {
	doFormBenchmark(b, NFC, NFKC, ascii)
}

// BenchmarkNormalizeAsciiNFKD uses NFKD as the target form.
func BenchmarkNormalizeAsciiNFKD(b *testing.B) {
	doFormBenchmark(b, NFC, NFKD, ascii)
}
  1146  
// The BenchmarkNormalizeX2Y benchmarks run the benchmark set built by
// appendBenchmarks over txt_all. The text is first converted to form X by
// doFormBenchmark and then processed with form Y.

// BenchmarkNormalizeNFC2NFC processes NFC input with NFC.
func BenchmarkNormalizeNFC2NFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, txt_all)
}

// BenchmarkNormalizeNFC2NFD processes NFC input with NFD.
func BenchmarkNormalizeNFC2NFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, txt_all)
}

// BenchmarkNormalizeNFD2NFC processes NFD input with NFC.
func BenchmarkNormalizeNFD2NFC(b *testing.B) {
	doFormBenchmark(b, NFD, NFC, txt_all)
}

// BenchmarkNormalizeNFD2NFD processes NFD input with NFD.
func BenchmarkNormalizeNFD2NFD(b *testing.B) {
	doFormBenchmark(b, NFD, NFD, txt_all)
}
  1159  
// Hangul is often special-cased, so we test it separately.
//
// Each benchmark below runs the benchmark set built by appendBenchmarks over
// the Korean sample text txt_kr, converted to the first form named in the
// benchmark and processed with the second.

// BenchmarkNormalizeHangulNFC2NFC processes NFC input with NFC.
func BenchmarkNormalizeHangulNFC2NFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, txt_kr)
}

// BenchmarkNormalizeHangulNFC2NFD processes NFC input with NFD.
func BenchmarkNormalizeHangulNFC2NFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, txt_kr)
}

// BenchmarkNormalizeHangulNFD2NFC processes NFD input with NFC.
func BenchmarkNormalizeHangulNFD2NFC(b *testing.B) {
	doFormBenchmark(b, NFD, NFC, txt_kr)
}

// BenchmarkNormalizeHangulNFD2NFD processes NFD input with NFD.
func BenchmarkNormalizeHangulNFD2NFD(b *testing.B) {
	doFormBenchmark(b, NFD, NFD, txt_kr)
}
  1173  
// forms lists the four normalization forms that doTextBenchmark iterates
// over, in the order in which their benchmark bodies are run.
var forms = []Form{NFC, NFD, NFKC, NFKD}
  1175  
  1176  func doTextBenchmark(b *testing.B, s string) {
  1177  	b.StopTimer()
  1178  	in := []byte(s)
  1179  	bm := []func(){}
  1180  	for _, f := range forms {
  1181  		bm = appendBenchmarks(bm, f, in)
  1182  	}
  1183  	b.SetBytes(int64(len(s) * len(bm)))
  1184  	b.StartTimer()
  1185  	for i := 0; i < b.N; i++ {
  1186  		for _, f := range bm {
  1187  			f()
  1188  		}
  1189  	}
  1190  }
  1191  
// The benchmarks below each pass one sample text to doTextBenchmark, which
// runs the full benchmark set for every form in forms.

// BenchmarkCanonicalOrdering uses txt_canon.
func BenchmarkCanonicalOrdering(b *testing.B) {
	doTextBenchmark(b, txt_canon)
}

// BenchmarkExtendedLatin uses the Vietnamese sample txt_vn.
func BenchmarkExtendedLatin(b *testing.B) {
	doTextBenchmark(b, txt_vn)
}

// BenchmarkMiscTwoByteUtf8 uses twoByteUtf8.
func BenchmarkMiscTwoByteUtf8(b *testing.B) {
	doTextBenchmark(b, twoByteUtf8)
}

// BenchmarkMiscThreeByteUtf8 uses threeByteUtf8.
func BenchmarkMiscThreeByteUtf8(b *testing.B) {
	doTextBenchmark(b, threeByteUtf8)
}

// BenchmarkHangul uses the Korean sample txt_kr.
func BenchmarkHangul(b *testing.B) {
	doTextBenchmark(b, txt_kr)
}

// BenchmarkJapanese uses the Japanese sample txt_jp.
func BenchmarkJapanese(b *testing.B) {
	doTextBenchmark(b, txt_jp)
}

// BenchmarkChinese uses the Chinese sample txt_cn.
func BenchmarkChinese(b *testing.B) {
	doTextBenchmark(b, txt_cn)
}

// BenchmarkOverflow uses overflow, a long run of repeated combining marks.
func BenchmarkOverflow(b *testing.B) {
	doTextBenchmark(b, overflow)
}
  1216  
  1217  var overflow = string(bytes.Repeat([]byte("\u035D"), 4096)) + "\u035B"
  1218  
  1219  // Tests sampled from the Canonical ordering tests (Part 2) of
  1220  // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
  1221  const txt_canon = `\u0061\u0315\u0300\u05AE\u0300\u0062 \u0061\u0300\u0315\u0300\u05AE\u0062
  1222  \u0061\u0302\u0315\u0300\u05AE\u0062 \u0061\u0307\u0315\u0300\u05AE\u0062
  1223  \u0061\u0315\u0300\u05AE\u030A\u0062 \u0061\u059A\u0316\u302A\u031C\u0062
  1224  \u0061\u032E\u059A\u0316\u302A\u0062 \u0061\u0338\u093C\u0334\u0062 
  1225  \u0061\u059A\u0316\u302A\u0339       \u0061\u0341\u0315\u0300\u05AE\u0062
  1226  \u0061\u0348\u059A\u0316\u302A\u0062 \u0061\u0361\u0345\u035D\u035C\u0062
  1227  \u0061\u0366\u0315\u0300\u05AE\u0062 \u0061\u0315\u0300\u05AE\u0486\u0062
  1228  \u0061\u05A4\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0613\u0062
  1229  \u0061\u0315\u0300\u05AE\u0615\u0062 \u0061\u0617\u0315\u0300\u05AE\u0062
  1230  \u0061\u0619\u0618\u064D\u064E\u0062 \u0061\u0315\u0300\u05AE\u0654\u0062
  1231  \u0061\u0315\u0300\u05AE\u06DC\u0062 \u0061\u0733\u0315\u0300\u05AE\u0062
  1232  \u0061\u0744\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0745\u0062
  1233  \u0061\u09CD\u05B0\u094D\u3099\u0062 \u0061\u0E38\u0E48\u0E38\u0C56\u0062
  1234  \u0061\u0EB8\u0E48\u0E38\u0E49\u0062 \u0061\u0F72\u0F71\u0EC8\u0F71\u0062
  1235  \u0061\u1039\u05B0\u094D\u3099\u0062 \u0061\u05B0\u094D\u3099\u1A60\u0062
  1236  \u0061\u3099\u093C\u0334\u1BE6\u0062 \u0061\u3099\u093C\u0334\u1C37\u0062
  1237  \u0061\u1CD9\u059A\u0316\u302A\u0062 \u0061\u2DED\u0315\u0300\u05AE\u0062
  1238  \u0061\u2DEF\u0315\u0300\u05AE\u0062 \u0061\u302D\u302E\u059A\u0316\u0062`
  1239  
// txt_vn is a Vietnamese sample text; it is the input of
// BenchmarkExtendedLatin and the first part of txt_all. Some lines end in a
// space, which is part of the data.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/vn/
const txt_vn = `Với các điều kiện sau: Ghi nhận công của tác giả. 
Nếu bạn sử dụng, chuyển đổi, hoặc xây dựng dự án từ 
nội dung được chia sẻ này, bạn phải áp dụng giấy phép này hoặc 
một giấy phép khác có các điều khoản tương tự như giấy phép này
cho dự án của bạn. Hiểu rằng: Miễn — Bất kỳ các điều kiện nào
trên đây cũng có thể được miễn bỏ nếu bạn được sự cho phép của
người sở hữu bản quyền. Phạm vi công chúng — Khi tác phẩm hoặc
bất kỳ chương nào của tác phẩm đã trong vùng dành cho công
chúng theo quy định của pháp luật thì tình trạng của nó không 
bị ảnh hưởng bởi giấy phép trong bất kỳ trường hợp nào.`
  1251  
// txt_ru is a Russian sample text and the first component of twoByteUtf8.
// NOTE(review): the final line is written in Greek rather than Russian; it
// may have been meant as the opening of txt_gr — confirm before moving it, as
// doing so would change the benchmark data.
// Taken from http://creativecommons.org/licenses/by-sa/1.0/deed.ru
const txt_ru = `При обязательном соблюдении следующих условий:
Attribution — Вы должны атрибутировать произведение (указывать
автора и источник) в порядке, предусмотренном автором или
лицензиаром (но только так, чтобы никоим образом не подразумевалось,
что они поддерживают вас или использование вами данного произведения).
Υπό τις ακόλουθες προϋποθέσεις:`
  1259  
// txt_gr is a Greek sample text and the second component of twoByteUtf8.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/gr/
const txt_gr = `Αναφορά Δημιουργού — Θα πρέπει να κάνετε την αναφορά στο έργο με τον
τρόπο που έχει οριστεί από το δημιουργό ή το χορηγούντο την άδεια
(χωρίς όμως να εννοείται με οποιονδήποτε τρόπο ότι εγκρίνουν εσάς ή
τη χρήση του έργου από εσάς). Παρόμοια Διανομή — Εάν αλλοιώσετε,
τροποποιήσετε ή δημιουργήσετε περαιτέρω βασισμένοι στο έργο θα
μπορείτε να διανέμετε το έργο που θα προκύψει μόνο με την ίδια ή
παρόμοια άδεια.`
  1268  
// txt_ar is an Arabic sample text and the third component of twoByteUtf8.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/deed.ar
const txt_ar = `بموجب الشروط التالية نسب المصنف — يجب عليك أن
تنسب العمل بالطريقة التي تحددها المؤلف أو المرخص (ولكن ليس بأي حال من
الأحوال أن توحي وتقترح بتحول أو استخدامك للعمل).
المشاركة على قدم المساواة — إذا كنت يعدل ، والتغيير ، أو الاستفادة
من هذا العمل ، قد ينتج عن توزيع العمل إلا في ظل تشابه او تطابق فى واحد
لهذا الترخيص.`
  1276  
// txt_il is a Hebrew sample text and the last component of twoByteUtf8.
// Taken from http://creativecommons.org/licenses/by-sa/1.0/il/
const txt_il = `בכפוף לתנאים הבאים: ייחוס — עליך לייחס את היצירה (לתת קרדיט) באופן
המצויין על-ידי היוצר או מעניק הרישיון (אך לא בשום אופן המרמז על כך
שהם תומכים בך או בשימוש שלך ביצירה). שיתוף זהה — אם תחליט/י לשנות,
לעבד או ליצור יצירה נגזרת בהסתמך על יצירה זו, תוכל/י להפיץ את יצירתך
החדשה רק תחת אותו הרישיון או רישיון דומה לרישיון זה.`
  1283  
// twoByteUtf8 is the input of BenchmarkMiscTwoByteUtf8: the Russian, Greek,
// Arabic and Hebrew samples concatenated in that order, with no separator
// between them.
const twoByteUtf8 = txt_ru + txt_gr + txt_ar + txt_il
  1285  
// txt_kr is a Korean sample text; it is the input of BenchmarkHangul and of
// the BenchmarkNormalizeHangul* benchmarks. The fourth line ends in a space,
// which is part of the data.
// Taken from http://creativecommons.org/licenses/by-sa/2.0/kr/
const txt_kr = `다음과 같은 조건을 따라야 합니다: 저작자표시
(Attribution) — 저작자나 이용허락자가 정한 방법으로 저작물의
원저작자를 표시하여야 합니다(그러나 원저작자가 이용자나 이용자의
이용을 보증하거나 추천한다는 의미로 표시해서는 안됩니다). 
동일조건변경허락 — 이 저작물을 이용하여 만든 이차적 저작물에는 본
라이선스와 동일한 라이선스를 적용해야 합니다.`
  1293  
// txt_th is a Thai sample text; it is used through threeByteUtf8.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/th/
const txt_th = `ภายใต้เงื่อนไข ดังต่อไปนี้ : แสดงที่มา — คุณต้องแสดงที่
มาของงานดังกล่าว ตามรูปแบบที่ผู้สร้างสรรค์หรือผู้อนุญาตกำหนด (แต่
ไม่ใช่ในลักษณะที่ว่า พวกเขาสนับสนุนคุณหรือสนับสนุนการที่
คุณนำงานไปใช้) อนุญาตแบบเดียวกัน — หากคุณดัดแปลง เปลี่ยนรูป หรื
อต่อเติมงานนี้ คุณต้องใช้สัญญาอนุญาตแบบเดียวกันหรือแบบที่เหมื
อนกับสัญญาอนุญาตที่ใช้กับงานนี้เท่านั้น`
  1301  
// threeByteUtf8 is the input of BenchmarkMiscThreeByteUtf8; it currently
// consists of the Thai sample only.
const threeByteUtf8 = txt_th
  1303  
// txt_jp is a Japanese sample text; it is the input of BenchmarkJapanese and
// part of txt_cjk.
// Taken from http://creativecommons.org/licenses/by-sa/2.0/jp/
const txt_jp = `あなたの従うべき条件は以下の通りです。
表示 — あなたは原著作者のクレジットを表示しなければなりません。
継承 — もしあなたがこの作品を改変、変形または加工した場合、
あなたはその結果生じた作品をこの作品と同一の許諾条件の下でのみ
頒布することができます。`
  1310  
// txt_cn is a Chinese sample text; it is the input of BenchmarkChinese and
// part of txt_cjk.
// Taken from http://creativecommons.org/licenses/by-sa/2.5/cn/
const txt_cn = `您可以自由: 复制、发行、展览、表演、放映、
广播或通过信息网络传播本作品 创作演绎作品
对本作品进行商业性使用 惟须遵守下列条件:
署名 — 您必须按照作者或者许可人指定的方式对作品进行署名。
相同方式共享 — 如果您改变、转换本作品或者以本作品为基础进行创作,
您只能采用与本协议相同的许可协议发布基于本作品的演绎作品。`
  1318  
// txt_cjk concatenates the Chinese, Japanese and Korean samples, in that
// order and without separators.
const txt_cjk = txt_cn + txt_jp + txt_kr

// txt_all concatenates every sample text except txt_canon: Vietnamese, the
// two-byte group, the three-byte group and the CJK group, without separators.
// It is the input of the BenchmarkNormalizeX2Y benchmarks.
const txt_all = txt_vn + twoByteUtf8 + threeByteUtf8 + txt_cjk
  1321  
// txt_all_bytes is txt_all as a byte slice, converted once at package
// initialization; it is the input of the *LargeNFC benchmarks.
var txt_all_bytes = []byte(txt_all)
  1323  

View as plain text