...

Source file src/golang.org/x/text/encoding/unicode/unicode_test.go

Documentation: golang.org/x/text/encoding/unicode

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package unicode
     6  
     7  import (
     8  	"testing"
     9  
    10  	"golang.org/x/text/encoding"
    11  	"golang.org/x/text/encoding/charmap"
    12  	"golang.org/x/text/encoding/internal/enctest"
    13  	"golang.org/x/text/transform"
    14  )
    15  
    16  func TestBasics(t *testing.T) {
    17  	testCases := []struct {
    18  		e         encoding.Encoding
    19  		encPrefix string
    20  		encSuffix string
    21  		encoded   string
    22  		utf8      string
    23  	}{{
    24  		e:       utf16BEIB,
    25  		encoded: "\x00\x57\x00\xe4\xd8\x35\xdd\x65",
    26  		utf8:    "\x57\u00e4\U0001d565",
    27  	}, {
    28  		e:         utf16BEEB,
    29  		encPrefix: "\xfe\xff",
    30  		encoded:   "\x00\x57\x00\xe4\xd8\x35\xdd\x65",
    31  		utf8:      "\x57\u00e4\U0001d565",
    32  	}, {
    33  		e:       utf16LEIB,
    34  		encoded: "\x57\x00\xe4\x00\x35\xd8\x65\xdd",
    35  		utf8:    "\x57\u00e4\U0001d565",
    36  	}, {
    37  		e:         utf16LEEB,
    38  		encPrefix: "\xff\xfe",
    39  		encoded:   "\x57\x00\xe4\x00\x35\xd8\x65\xdd",
    40  		utf8:      "\x57\u00e4\U0001d565",
    41  	}}
    42  
    43  	for _, tc := range testCases {
    44  		enctest.TestEncoding(t, tc.e, tc.encoded, tc.utf8, tc.encPrefix, tc.encSuffix)
    45  	}
    46  }
    47  
    48  func TestFiles(t *testing.T) {
    49  	enctest.TestFile(t, UTF8)
    50  	enctest.TestFile(t, utf16LEIB)
    51  }
    52  
    53  func BenchmarkEncoding(b *testing.B) {
    54  	enctest.Benchmark(b, UTF8)
    55  	enctest.Benchmark(b, utf16LEIB)
    56  }
    57  
    58  var (
    59  	utf16LEIB = UTF16(LittleEndian, IgnoreBOM) // UTF-16LE (atypical interpretation)
    60  	utf16LEUB = UTF16(LittleEndian, UseBOM)    // UTF-16, LE
    61  	utf16LEEB = UTF16(LittleEndian, ExpectBOM) // UTF-16, LE, Expect
    62  	utf16BEIB = UTF16(BigEndian, IgnoreBOM)    // UTF-16BE (atypical interpretation)
    63  	utf16BEUB = UTF16(BigEndian, UseBOM)       // UTF-16 default
    64  	utf16BEEB = UTF16(BigEndian, ExpectBOM)    // UTF-16 Expect
    65  )
    66  
    67  func TestUTF16(t *testing.T) {
    68  	testCases := []struct {
    69  		desc    string
    70  		src     string
    71  		notEOF  bool // the inverse of atEOF
    72  		sizeDst int
    73  		want    string
    74  		nSrc    int
    75  		err     error
    76  		t       transform.Transformer
    77  	}{{
    78  		desc: "utf-16 IgnoreBOM dec: empty string",
    79  		t:    utf16BEIB.NewDecoder(),
    80  	}, {
    81  		desc: "utf-16 UseBOM dec: empty string",
    82  		t:    utf16BEUB.NewDecoder(),
    83  	}, {
    84  		desc: "utf-16 ExpectBOM dec: empty string",
    85  		err:  ErrMissingBOM,
    86  		t:    utf16BEEB.NewDecoder(),
    87  	}, {
    88  		desc:    "utf-16 dec: BOM determines encoding BE (RFC 2781:3.3)",
    89  		src:     "\xFE\xFF\xD8\x08\xDF\x45\x00\x3D\x00\x52\x00\x61",
    90  		sizeDst: 100,
    91  		want:    "\U00012345=Ra",
    92  		nSrc:    12,
    93  		t:       utf16BEUB.NewDecoder(),
    94  	}, {
    95  		desc:    "utf-16 dec: BOM determines encoding LE (RFC 2781:3.3)",
    96  		src:     "\xFF\xFE\x08\xD8\x45\xDF\x3D\x00\x52\x00\x61\x00",
    97  		sizeDst: 100,
    98  		want:    "\U00012345=Ra",
    99  		nSrc:    12,
   100  		t:       utf16LEUB.NewDecoder(),
   101  	}, {
   102  		desc:    "utf-16 dec: BOM determines encoding LE, change default (RFC 2781:3.3)",
   103  		src:     "\xFF\xFE\x08\xD8\x45\xDF\x3D\x00\x52\x00\x61\x00",
   104  		sizeDst: 100,
   105  		want:    "\U00012345=Ra",
   106  		nSrc:    12,
   107  		t:       utf16BEUB.NewDecoder(),
   108  	}, {
   109  		desc:    "utf-16 dec: Fail on missing BOM when required",
   110  		src:     "\x08\xD8\x45\xDF\x3D\x00\xFF\xFE\xFE\xFF\x00\x52\x00\x61",
   111  		sizeDst: 100,
   112  		want:    "",
   113  		nSrc:    0,
   114  		err:     ErrMissingBOM,
   115  		t:       utf16BEEB.NewDecoder(),
   116  	}, {
   117  		desc:    "utf-16 dec: Fail on single byte missing BOM when required",
   118  		src:     "\x00",
   119  		sizeDst: 4,
   120  		t:       utf16BEEB.NewDecoder(),
   121  		err:     ErrMissingBOM,
   122  	}, {
   123  		desc:    "utf-16 dec: Fail on short src missing BOM when required",
   124  		src:     "\x00",
   125  		notEOF:  true,
   126  		sizeDst: 4,
   127  		t:       utf16BEEB.NewDecoder(),
   128  		err:     transform.ErrShortSrc,
   129  	}, {
   130  		desc:    "utf-16 dec: SHOULD interpret text as big-endian when BOM not present (RFC 2781:4.3)",
   131  		src:     "\xD8\x08\xDF\x45\x00\x3D\x00\x52\x00\x61",
   132  		sizeDst: 100,
   133  		want:    "\U00012345=Ra",
   134  		nSrc:    10,
   135  		t:       utf16BEUB.NewDecoder(),
   136  	}, {
   137  		desc:    "utf-16 dec: incorrect UTF-16: odd bytes",
   138  		src:     "\x00",
   139  		sizeDst: 100,
   140  		want:    "\uFFFD",
   141  		nSrc:    1,
   142  		t:       utf16BEUB.NewDecoder(),
   143  	}, {
   144  		desc:    "utf-16 dec: Fail on incorrect UTF-16: short source odd bytes",
   145  		src:     "\x00",
   146  		notEOF:  true,
   147  		sizeDst: 100,
   148  		t:       utf16BEUB.NewDecoder(),
   149  		err:     transform.ErrShortSrc,
   150  	}, {
   151  		// This is an error according to RFC 2781. But errors in RFC 2781 are
   152  		// open to interpretations, so I guess this is fine.
   153  		desc:    "utf-16le dec: incorrect BOM is an error (RFC 2781:4.1)",
   154  		src:     "\xFE\xFF\x08\xD8\x45\xDF\x3D\x00\x52\x00\x61\x00",
   155  		sizeDst: 100,
   156  		want:    "\uFFFE\U00012345=Ra",
   157  		nSrc:    12,
   158  		t:       utf16LEIB.NewDecoder(),
   159  	}, {
   160  		desc:    "utf-16 enc: SHOULD write BOM (RFC 2781:3.3)",
   161  		src:     "\U00012345=Ra",
   162  		sizeDst: 100,
   163  		want:    "\xFF\xFE\x08\xD8\x45\xDF\x3D\x00\x52\x00\x61\x00",
   164  		nSrc:    7,
   165  		t:       utf16LEUB.NewEncoder(),
   166  	}, {
   167  		desc:    "utf-16 enc: SHOULD write BOM (RFC 2781:3.3)",
   168  		src:     "\U00012345=Ra",
   169  		sizeDst: 100,
   170  		want:    "\xFE\xFF\xD8\x08\xDF\x45\x00\x3D\x00\x52\x00\x61",
   171  		nSrc:    7,
   172  		t:       utf16BEUB.NewEncoder(),
   173  	}, {
   174  		desc:    "utf-16le enc: MUST NOT write BOM (RFC 2781:3.3)",
   175  		src:     "\U00012345=Ra",
   176  		sizeDst: 100,
   177  		want:    "\x08\xD8\x45\xDF\x3D\x00\x52\x00\x61\x00",
   178  		nSrc:    7,
   179  		t:       utf16LEIB.NewEncoder(),
   180  	}, {
   181  		desc:    "utf-16be dec: incorrect UTF-16: odd bytes",
   182  		src:     "\x00",
   183  		sizeDst: 100,
   184  		want:    "\uFFFD",
   185  		nSrc:    1,
   186  		t:       utf16BEIB.NewDecoder(),
   187  	}, {
   188  		desc:    "utf-16be dec: unpaired surrogate, odd bytes",
   189  		src:     "\xD8\x45\x00",
   190  		sizeDst: 100,
   191  		want:    "\uFFFD\uFFFD",
   192  		nSrc:    3,
   193  		t:       utf16BEIB.NewDecoder(),
   194  	}, {
   195  		desc:    "utf-16be dec: unpaired low surrogate + valid text",
   196  		src:     "\xD8\x45\x00a",
   197  		sizeDst: 100,
   198  		want:    "\uFFFDa",
   199  		nSrc:    4,
   200  		t:       utf16BEIB.NewDecoder(),
   201  	}, {
   202  		desc:    "utf-16be dec: unpaired low surrogate + valid text + single byte",
   203  		src:     "\xD8\x45\x00ab",
   204  		sizeDst: 100,
   205  		want:    "\uFFFDa\uFFFD",
   206  		nSrc:    5,
   207  		t:       utf16BEIB.NewDecoder(),
   208  	}, {
   209  		desc:    "utf-16le dec: unpaired high surrogate",
   210  		src:     "\x00\x00\x00\xDC\x12\xD8",
   211  		sizeDst: 100,
   212  		want:    "\x00\uFFFD\uFFFD",
   213  		nSrc:    6,
   214  		t:       utf16LEIB.NewDecoder(),
   215  	}, {
   216  		desc:    "utf-16be dec: two unpaired low surrogates",
   217  		src:     "\xD8\x45\xD8\x12",
   218  		sizeDst: 100,
   219  		want:    "\uFFFD\uFFFD",
   220  		nSrc:    4,
   221  		t:       utf16BEIB.NewDecoder(),
   222  	}, {
   223  		desc:    "utf-16be dec: short dst",
   224  		src:     "\x00a",
   225  		sizeDst: 0,
   226  		want:    "",
   227  		nSrc:    0,
   228  		t:       utf16BEIB.NewDecoder(),
   229  		err:     transform.ErrShortDst,
   230  	}, {
   231  		desc:    "utf-16be dec: short dst surrogate",
   232  		src:     "\xD8\xF5\xDC\x12",
   233  		sizeDst: 3,
   234  		want:    "",
   235  		nSrc:    0,
   236  		t:       utf16BEIB.NewDecoder(),
   237  		err:     transform.ErrShortDst,
   238  	}, {
   239  		desc:    "utf-16be dec: short dst trailing byte",
   240  		src:     "\x00",
   241  		sizeDst: 2,
   242  		want:    "",
   243  		nSrc:    0,
   244  		t:       utf16BEIB.NewDecoder(),
   245  		err:     transform.ErrShortDst,
   246  	}, {
   247  		desc:    "utf-16be dec: short src",
   248  		src:     "\x00",
   249  		notEOF:  true,
   250  		sizeDst: 3,
   251  		want:    "",
   252  		nSrc:    0,
   253  		t:       utf16BEIB.NewDecoder(),
   254  		err:     transform.ErrShortSrc,
   255  	}, {
   256  		desc:    "utf-16 enc",
   257  		src:     "\U00012345=Ra",
   258  		sizeDst: 100,
   259  		want:    "\xFE\xFF\xD8\x08\xDF\x45\x00\x3D\x00\x52\x00\x61",
   260  		nSrc:    7,
   261  		t:       utf16BEUB.NewEncoder(),
   262  	}, {
   263  		desc:    "utf-16 enc: short dst normal",
   264  		src:     "\U00012345=Ra",
   265  		sizeDst: 9,
   266  		want:    "\xD8\x08\xDF\x45\x00\x3D\x00\x52",
   267  		nSrc:    6,
   268  		t:       utf16BEIB.NewEncoder(),
   269  		err:     transform.ErrShortDst,
   270  	}, {
   271  		desc:    "utf-16 enc: short dst surrogate",
   272  		src:     "\U00012345=Ra",
   273  		sizeDst: 3,
   274  		want:    "",
   275  		nSrc:    0,
   276  		t:       utf16BEIB.NewEncoder(),
   277  		err:     transform.ErrShortDst,
   278  	}, {
   279  		desc:    "utf-16 enc: short src",
   280  		src:     "\U00012345=Ra\xC2",
   281  		notEOF:  true,
   282  		sizeDst: 100,
   283  		want:    "\xD8\x08\xDF\x45\x00\x3D\x00\x52\x00\x61",
   284  		nSrc:    7,
   285  		t:       utf16BEIB.NewEncoder(),
   286  		err:     transform.ErrShortSrc,
   287  	}, {
   288  		desc:    "utf-16be dec: don't change byte order mid-stream",
   289  		src:     "\xFE\xFF\xD8\x08\xDF\x45\x00\x3D\xFF\xFE\x00\x52\x00\x61",
   290  		sizeDst: 100,
   291  		want:    "\U00012345=\ufffeRa",
   292  		nSrc:    14,
   293  		t:       utf16BEUB.NewDecoder(),
   294  	}, {
   295  		desc:    "utf-16le dec: don't change byte order mid-stream",
   296  		src:     "\xFF\xFE\x08\xD8\x45\xDF\x3D\x00\xFF\xFE\xFE\xFF\x52\x00\x61\x00",
   297  		sizeDst: 100,
   298  		want:    "\U00012345=\ufeff\ufffeRa",
   299  		nSrc:    16,
   300  		t:       utf16LEUB.NewDecoder(),
   301  	}}
   302  	for i, tc := range testCases {
   303  		for j := 0; j < 2; j++ {
   304  			b := make([]byte, tc.sizeDst)
   305  			nDst, nSrc, err := tc.t.Transform(b, []byte(tc.src), !tc.notEOF)
   306  			if err != tc.err {
   307  				t.Errorf("%d:%s: error was %v; want %v", i, tc.desc, err, tc.err)
   308  			}
   309  			if got := string(b[:nDst]); got != tc.want {
   310  				t.Errorf("%d:%s: result was %q: want %q", i, tc.desc, got, tc.want)
   311  			}
   312  			if nSrc != tc.nSrc {
   313  				t.Errorf("%d:%s: nSrc was %d; want %d", i, tc.desc, nSrc, tc.nSrc)
   314  			}
   315  			// Since Transform is stateful, run failures again
   316  			// to ensure that the same error occurs a second time.
   317  			if err == nil {
   318  				break
   319  			}
   320  		}
   321  	}
   322  }
   323  
   324  func TestUTF8Decoder(t *testing.T) {
   325  	testCases := []struct {
   326  		desc    string
   327  		src     string
   328  		notEOF  bool // the inverse of atEOF
   329  		sizeDst int
   330  		want    string
   331  		nSrc    int
   332  		err     error
   333  	}{{
   334  		desc: "empty string, empty dest buffer",
   335  	}, {
   336  		desc:    "empty string",
   337  		sizeDst: 8,
   338  	}, {
   339  		desc:    "empty string, streaming",
   340  		notEOF:  true,
   341  		sizeDst: 8,
   342  	}, {
   343  		desc:    "ascii",
   344  		src:     "abcde",
   345  		sizeDst: 8,
   346  		want:    "abcde",
   347  		nSrc:    5,
   348  	}, {
   349  		desc:    "ascii and error",
   350  		src:     "ab\x80de",
   351  		sizeDst: 7,
   352  		want:    "ab\ufffdde",
   353  		nSrc:    5,
   354  	}, {
   355  		desc:    "valid two-byte sequence",
   356  		src:     "a\u0300bc",
   357  		sizeDst: 7,
   358  		want:    "a\u0300bc",
   359  		nSrc:    5,
   360  	}, {
   361  		desc:    "valid three-byte sequence",
   362  		src:     "a\u0300中",
   363  		sizeDst: 7,
   364  		want:    "a\u0300中",
   365  		nSrc:    6,
   366  	}, {
   367  		desc:    "valid four-byte sequence",
   368  		src:     "a中\U00016F50",
   369  		sizeDst: 8,
   370  		want:    "a中\U00016F50",
   371  		nSrc:    8,
   372  	}, {
   373  		desc:    "short source buffer",
   374  		src:     "abc\xf0\x90",
   375  		notEOF:  true,
   376  		sizeDst: 10,
   377  		want:    "abc",
   378  		nSrc:    3,
   379  		err:     transform.ErrShortSrc,
   380  	}, {
   381  		// We don't check for the maximal subpart of an ill-formed subsequence
   382  		// at the end of an open segment.
   383  		desc:    "complete invalid that looks like short at end",
   384  		src:     "abc\xf0\x80",
   385  		notEOF:  true,
   386  		sizeDst: 10,
   387  		want:    "abc", // instead of "abc\ufffd\ufffd",
   388  		nSrc:    3,
   389  		err:     transform.ErrShortSrc,
   390  	}, {
   391  		desc:    "incomplete sequence at end",
   392  		src:     "a\x80bc\xf0\x90",
   393  		sizeDst: 9,
   394  		want:    "a\ufffdbc\ufffd",
   395  		nSrc:    6,
   396  	}, {
   397  		desc:    "invalid second byte",
   398  		src:     "abc\xf0dddd",
   399  		sizeDst: 10,
   400  		want:    "abc\ufffddddd",
   401  		nSrc:    8,
   402  	}, {
   403  		desc:    "invalid second byte at end",
   404  		src:     "abc\xf0d",
   405  		sizeDst: 10,
   406  		want:    "abc\ufffdd",
   407  		nSrc:    5,
   408  	}, {
   409  		desc:    "invalid third byte",
   410  		src:     "a\u0300bc\xf0\x90dddd",
   411  		sizeDst: 12,
   412  		want:    "a\u0300bc\ufffddddd",
   413  		nSrc:    11,
   414  	}, {
   415  		desc:    "invalid third byte at end",
   416  		src:     "a\u0300bc\xf0\x90d",
   417  		sizeDst: 12,
   418  		want:    "a\u0300bc\ufffdd",
   419  		nSrc:    8,
   420  	}, {
   421  		desc:    "invalid fourth byte, tight buffer",
   422  		src:     "a\u0300bc\xf0\x90\x80d",
   423  		sizeDst: 9,
   424  		want:    "a\u0300bc\ufffdd",
   425  		nSrc:    9,
   426  	}, {
   427  		desc:    "invalid fourth byte at end",
   428  		src:     "a\u0300bc\xf0\x90\x80",
   429  		sizeDst: 8,
   430  		want:    "a\u0300bc\ufffd",
   431  		nSrc:    8,
   432  	}, {
   433  		desc:    "invalid fourth byte and short four byte sequence",
   434  		src:     "a\u0300bc\xf0\x90\x80\xf0\x90\x80",
   435  		notEOF:  true,
   436  		sizeDst: 20,
   437  		want:    "a\u0300bc\ufffd",
   438  		nSrc:    8,
   439  		err:     transform.ErrShortSrc,
   440  	}, {
   441  		desc:    "valid four-byte sequence overflowing short buffer",
   442  		src:     "a\u0300bc\xf0\x90\x80\x80",
   443  		notEOF:  true,
   444  		sizeDst: 8,
   445  		want:    "a\u0300bc",
   446  		nSrc:    5,
   447  		err:     transform.ErrShortDst,
   448  	}, {
   449  		desc:    "invalid fourth byte at end short, but short dst",
   450  		src:     "a\u0300bc\xf0\x90\x80\xf0\x90\x80",
   451  		notEOF:  true,
   452  		sizeDst: 8,
   453  		// More bytes would fit in the buffer, but this seems to require a more
   454  		// complicated and slower algorithm.
   455  		want: "a\u0300bc", // instead of "a\u0300bc"
   456  		nSrc: 5,
   457  		err:  transform.ErrShortDst,
   458  	}, {
   459  		desc:    "short dst for error",
   460  		src:     "abc\x80",
   461  		notEOF:  true,
   462  		sizeDst: 5,
   463  		want:    "abc",
   464  		nSrc:    3,
   465  		err:     transform.ErrShortDst,
   466  	}, {
   467  		desc:    "adjusting short dst buffer",
   468  		src:     "abc\x80ef",
   469  		notEOF:  true,
   470  		sizeDst: 6,
   471  		want:    "abc\ufffd",
   472  		nSrc:    4,
   473  		err:     transform.ErrShortDst,
   474  	}}
   475  	tr := UTF8.NewDecoder()
   476  	for i, tc := range testCases {
   477  		b := make([]byte, tc.sizeDst)
   478  		nDst, nSrc, err := tr.Transform(b, []byte(tc.src), !tc.notEOF)
   479  		if err != tc.err {
   480  			t.Errorf("%d:%s: error was %v; want %v", i, tc.desc, err, tc.err)
   481  		}
   482  		if got := string(b[:nDst]); got != tc.want {
   483  			t.Errorf("%d:%s: result was %q: want %q", i, tc.desc, got, tc.want)
   484  		}
   485  		if nSrc != tc.nSrc {
   486  			t.Errorf("%d:%s: nSrc was %d; want %d", i, tc.desc, nSrc, tc.nSrc)
   487  		}
   488  	}
   489  }
   490  
   491  func TestUTF8BOMDecoder(t *testing.T) {
   492  	testCases := []struct {
   493  		desc    string
   494  		src     string
   495  		notEOF  bool // the inverse of atEOF
   496  		sizeDst int
   497  		want    string
   498  		nSrc    int
   499  		err     error
   500  		wantAll string
   501  	}{{
   502  		desc: "empty string, empty dest buffer",
   503  	}, {
   504  		desc:    "empty string",
   505  		sizeDst: 8,
   506  	}, {
   507  		desc:    "empty string, streaming",
   508  		notEOF:  true,
   509  		sizeDst: 8,
   510  	}, {
   511  		desc:    "ascii",
   512  		src:     "abcde",
   513  		sizeDst: 8,
   514  		want:    "abcde",
   515  		nSrc:    5,
   516  		wantAll: "abcde",
   517  	}, {
   518  		desc:    "ascii with bom",
   519  		src:     utf8BOM + "abcde",
   520  		sizeDst: 11,
   521  		want:    "abcde",
   522  		nSrc:    8,
   523  		wantAll: "abcde",
   524  	}, {
   525  		desc:    "error with bom",
   526  		src:     utf8BOM + "ab\x80de",
   527  		sizeDst: 11,
   528  		want:    "ab\ufffdde",
   529  		nSrc:    8,
   530  		wantAll: "ab\ufffdde",
   531  	}, {
   532  		desc:    "short bom",
   533  		src:     utf8BOM[:2],
   534  		notEOF:  true,
   535  		sizeDst: 7,
   536  		want:    "",
   537  		nSrc:    0,
   538  		wantAll: "\ufffd", // needs to be 1 replacement
   539  		err:     transform.ErrShortSrc,
   540  	}, {
   541  		desc:    "short bom at end",
   542  		src:     utf8BOM[:2],
   543  		sizeDst: 7,
   544  		want:    "\ufffd", // needs to be 1 replacement
   545  		nSrc:    2,
   546  		wantAll: "\ufffd", // needs to be 1 replacement
   547  		err:     nil,
   548  	}, {
   549  		desc:    "short source buffer",
   550  		src:     "abc\xf0\x90",
   551  		notEOF:  true,
   552  		sizeDst: 10,
   553  		want:    "abc",
   554  		nSrc:    3,
   555  		wantAll: "abc\ufffd",
   556  		err:     transform.ErrShortSrc,
   557  	}, {
   558  		desc:    "short source buffer with bom",
   559  		src:     utf8BOM + "abc\xf0\x90",
   560  		notEOF:  true,
   561  		sizeDst: 15,
   562  		want:    "abc",
   563  		nSrc:    6,
   564  		wantAll: "abc\ufffd",
   565  		err:     transform.ErrShortSrc,
   566  	}, {
   567  		desc:    "short dst for error",
   568  		src:     utf8BOM + "abc\x80",
   569  		notEOF:  true,
   570  		sizeDst: 5,
   571  		want:    "abc",
   572  		nSrc:    6,
   573  		wantAll: "abc\ufffd",
   574  		err:     transform.ErrShortDst,
   575  	}}
   576  	tr := UTF8BOM.NewDecoder()
   577  	for i, tc := range testCases {
   578  		tr.Reset()
   579  		b := make([]byte, tc.sizeDst)
   580  		nDst, nSrc, err := tr.Transform(b, []byte(tc.src), !tc.notEOF)
   581  		if err != tc.err {
   582  			t.Errorf("%d:%s: error was %v; want %v", i, tc.desc, err, tc.err)
   583  		}
   584  		if got := string(b[:nDst]); got != tc.want {
   585  			t.Errorf("%d:%s: result was %q: want %q", i, tc.desc, got, tc.want)
   586  		}
   587  		if nSrc != tc.nSrc {
   588  			t.Errorf("%d:%s: nSrc was %d; want %d", i, tc.desc, nSrc, tc.nSrc)
   589  		}
   590  		if got, _ := tr.String(tc.src); got != tc.wantAll {
   591  			t.Errorf("%d:%s: String was %s; want %s", i, tc.desc, got, tc.wantAll)
   592  		}
   593  	}
   594  }
   595  
   596  func TestUTF8SigEncoder(t *testing.T) {
   597  	testCases := []struct {
   598  		desc    string
   599  		src     string
   600  		notEOF  bool // the inverse of atEOF
   601  		sizeDst int
   602  		want    string
   603  		wantAll string // converting all bytes
   604  		nSrc    int
   605  		err     error
   606  	}{{
   607  		desc:    "empty string, empty dest buffer",
   608  		err:     transform.ErrShortDst,
   609  		wantAll: utf8BOM,
   610  	}, {
   611  		desc:    "empty string",
   612  		sizeDst: 8,
   613  		want:    utf8BOM,
   614  		wantAll: utf8BOM,
   615  	}, {
   616  		desc:    "empty string, streaming",
   617  		notEOF:  true,
   618  		sizeDst: 8,
   619  		want:    utf8BOM,
   620  		wantAll: utf8BOM,
   621  	}, {
   622  		desc:    "ascii",
   623  		src:     "abcde",
   624  		sizeDst: 8,
   625  		want:    utf8BOM + "abcde",
   626  		nSrc:    5,
   627  		wantAll: utf8BOM + "abcde",
   628  	}, {
   629  		desc:    "short bom at end",
   630  		src:     utf8BOM[:2],
   631  		sizeDst: 11,
   632  		want:    utf8BOM + "\ufffd",
   633  		nSrc:    2,
   634  		wantAll: utf8BOM + "\ufffd",
   635  	}, {
   636  		desc:    "short bom",
   637  		src:     utf8BOM[:2],
   638  		notEOF:  true,
   639  		sizeDst: 7,
   640  		want:    utf8BOM,
   641  		nSrc:    0,
   642  		err:     transform.ErrShortSrc,
   643  		wantAll: utf8BOM + "\ufffd",
   644  	}, {
   645  		desc:    "short bom at end",
   646  		src:     utf8BOM[:2],
   647  		sizeDst: 7,
   648  		want:    utf8BOM + "\ufffd", // needs to be 1 replacement
   649  		nSrc:    2,
   650  		err:     nil,
   651  		wantAll: utf8BOM + "\ufffd",
   652  	}, {
   653  		desc:    "short dst buffer 2",
   654  		src:     "ab",
   655  		sizeDst: 2,
   656  		want:    "",
   657  		nSrc:    0,
   658  		err:     transform.ErrShortDst,
   659  		wantAll: utf8BOM + "ab",
   660  	}, {
   661  		desc:    "short dst buffer 3",
   662  		src:     "ab",
   663  		sizeDst: 3,
   664  		want:    utf8BOM,
   665  		nSrc:    0,
   666  		err:     transform.ErrShortDst,
   667  		wantAll: utf8BOM + "ab",
   668  	}, {
   669  		desc:    "short dst buffer 4",
   670  		src:     "ab",
   671  		sizeDst: 4,
   672  		want:    utf8BOM + "a",
   673  		nSrc:    1,
   674  		err:     transform.ErrShortDst,
   675  		wantAll: utf8BOM + "ab",
   676  	}}
   677  	tr := UTF8BOM.NewEncoder()
   678  	for i, tc := range testCases {
   679  		tr.Reset()
   680  		b := make([]byte, tc.sizeDst)
   681  		nDst, nSrc, err := tr.Transform(b, []byte(tc.src), !tc.notEOF)
   682  		if err != tc.err {
   683  			t.Errorf("%d:%s: error was %v; want %v", i, tc.desc, err, tc.err)
   684  		}
   685  		if got := string(b[:nDst]); got != tc.want {
   686  			t.Errorf("%d:%s: result was %q: want %q", i, tc.desc, got, tc.want)
   687  		}
   688  		if nSrc != tc.nSrc {
   689  			t.Errorf("%d:%s: nSrc was %d; want %d", i, tc.desc, nSrc, tc.nSrc)
   690  		}
   691  		if got, _ := tr.String(tc.src); got != tc.wantAll {
   692  			t.Errorf("%d:%s: String was %s; want %s", i, tc.desc, got, tc.wantAll)
   693  		}
   694  	}
   695  }
   696  
   697  func TestBOMOverride(t *testing.T) {
   698  	dec := BOMOverride(charmap.CodePage437.NewDecoder())
   699  	dst := make([]byte, 100)
   700  	for i, tc := range []struct {
   701  		src   string
   702  		atEOF bool
   703  		dst   string
   704  		nSrc  int
   705  		err   error
   706  	}{
   707  		0:  {"H\x82ll\x93", true, "Héllô", 5, nil},
   708  		1:  {"\uFEFFHéllö", true, "Héllö", 10, nil},
   709  		2:  {"\xFE\xFF\x00H\x00e\x00l\x00l\x00o", true, "Hello", 12, nil},
   710  		3:  {"\xFF\xFEH\x00e\x00l\x00l\x00o\x00", true, "Hello", 12, nil},
   711  		4:  {"\uFEFF", true, "", 3, nil},
   712  		5:  {"\xFE\xFF", true, "", 2, nil},
   713  		6:  {"\xFF\xFE", true, "", 2, nil},
   714  		7:  {"\xEF\xBB", true, "\u2229\u2557", 2, nil},
   715  		8:  {"\xEF", true, "\u2229", 1, nil},
   716  		9:  {"", true, "", 0, nil},
   717  		10: {"\xFE", true, "\u25a0", 1, nil},
   718  		11: {"\xFF", true, "\u00a0", 1, nil},
   719  		12: {"\xEF\xBB", false, "", 0, transform.ErrShortSrc},
   720  		13: {"\xEF", false, "", 0, transform.ErrShortSrc},
   721  		14: {"", false, "", 0, transform.ErrShortSrc},
   722  		15: {"\xFE", false, "", 0, transform.ErrShortSrc},
   723  		16: {"\xFF", false, "", 0, transform.ErrShortSrc},
   724  		17: {"\xFF\xFE", false, "", 0, transform.ErrShortSrc},
   725  	} {
   726  		dec.Reset()
   727  		nDst, nSrc, err := dec.Transform(dst, []byte(tc.src), tc.atEOF)
   728  		got := string(dst[:nDst])
   729  		if nSrc != tc.nSrc {
   730  			t.Errorf("%d: nSrc: got %d; want %d", i, nSrc, tc.nSrc)
   731  		}
   732  		if got != tc.dst {
   733  			t.Errorf("%d: got %+q; want %+q", i, got, tc.dst)
   734  		}
   735  		if err != tc.err {
   736  			t.Errorf("%d: error: got %v; want %v", i, err, tc.err)
   737  		}
   738  	}
   739  }
   740  

View as plain text