...

Source file src/github.com/bytedance/sonic/utf8/utf8_test.go

Documentation: github.com/bytedance/sonic/utf8

     1  /*
     2   * Copyright 2022 ByteDance Inc.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package utf8
    18  
    19  import (
    20      `testing`
    21      `strings`
    22      `github.com/stretchr/testify/assert`
    23      `unicode/utf8`
    24      `bytes`
    25      `math/rand`
    26  )
    27  
    28  var (
    29      _Header_2Bytes  = string([]byte{0xC0})
    30      _Header_3Bytes  = string([]byte{0xE0})
    31      _Header_4Bytes  = string([]byte{0xF0})
    32      _Low_Surrogate  = string([]byte{0xED, 0xA0, 0x80}) // \ud800
    33      _High_Surrogate = string([]byte{0xED, 0xB0, 0x80}) // \udc00
    34      _Cont           = "\xb0"
    35  )
    36  
    37  func TestCorrectWith_InvalidUtf8(t *testing.T) {
    38      var tests = []struct {
    39          name   string
    40          input  string
    41          expect string
    42          errpos int
    43      } {
    44          {"basic", `abc`, "abc", -1},
    45          {"long", strings.Repeat("helloα,景😊", 1000), strings.Repeat("helloα,景😊", 1000), -1},
    46  
    47          // invalid utf8 - single byte
    48          {"single_Cont", _Cont, "\ufffd", 0},
    49          {"single_Header_2Bytes", _Header_2Bytes, "\ufffd", 0},
    50          {"single_Header_3Bytes", _Header_3Bytes, "\ufffd", 0},
    51          {"single_Header_4Bytes", _Header_4Bytes, "\ufffd", 0},
    52  
    53          // invalid utf8 - two bytes
    54          {"two_Header_2Bytes + _Cont", _Header_2Bytes + _Cont, "\ufffd\ufffd", 0},
    55          {`two_Header_4Bytes + _Cont+ "xx"`, _Header_4Bytes + _Cont + "xx",  "\ufffd\ufffdxx", 0},
    56          { `"xx" + three_Header_4Bytes + _Cont + _Cont`, "xx" + _Header_4Bytes + _Cont + _Cont, "xx\ufffd\ufffd\ufffd", 2},
    57  
    58          // invalid utf8 - three bytes
    59          {`three_Low_Surrogate`, _Low_Surrogate, "\ufffd\ufffd\ufffd", 0},
    60          {`three__High_Surrogate`, _High_Surrogate, "\ufffd\ufffd\ufffd", 0},
    61  
    62          // invalid utf8 - multi bytes
    63          {`_High_Surrogate + _Low_Surrogate`, _High_Surrogate + _Low_Surrogate, "\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd", 0},
    64          {`"\x80\x80\x80\x80"`, "\x80\x80\x80\x80", "\ufffd\ufffd\ufffd\ufffd", 0},
    65      }
    66      for _, test := range tests {
    67          got := CorrectWith(nil, []byte(test.input), "\ufffd")
    68          assert.Equal(t, []byte(test.expect), got, test.name)
    69          assert.Equal(t,test.errpos == -1, utf8.ValidString(test.input), test.name)
    70      }
    71  }
    72  
    73  func genRandBytes(length int) []byte {
    74      var buf bytes.Buffer
    75      for j := 0; j < length; j++ {
    76          buf.WriteByte(byte(rand.Intn(0xFF + 1)))
    77      }
    78      return buf.Bytes()
    79  }
    80  
    81  func genRandAscii(length int) []byte {
    82      var buf bytes.Buffer
    83      for j := 0; j < length; j++ {
    84          buf.WriteByte(byte(rand.Intn(0x7F + 1)))
    85      }
    86      return buf.Bytes()
    87  }
    88  
    89  func genRandRune(length int) []byte {
    90      var buf bytes.Buffer
    91      for j := 0; j < length; j++ {
    92          buf.WriteRune(rune(rand.Intn(0x10FFFF + 1)))
    93      }
    94      return buf.Bytes()
    95  }
    96  
    97  func TestValidate_Random(t *testing.T) {
    98      // compare with stdlib
    99      compare := func(t *testing.T, data []byte) {
   100          assert.Equal(t, utf8.Valid(data), Validate(data), string(data))
   101      }
   102  
   103      // random testing
   104      nums   := 1000
   105      maxLen := 1000
   106      for i := 0; i < nums; i++ {
   107          length := rand.Intn(maxLen)
   108          compare(t, genRandBytes(length))
   109          compare(t, genRandRune(length))
   110      }
   111  }
   112  
   113  func BenchmarkValidate(b *testing.B) {
   114      bench := []struct {
   115          name string
   116          data []byte
   117      } {
   118          {"ValidAscii", genRandAscii(1000)},
   119          {"ValidUTF8",  genRandRune(1000)},
   120          {"RandomBytes", genRandBytes(1000)},
   121      }
   122  
   123      for _, test := range bench {
   124          if utf8.Valid(test.data) != Validate(test.data) {
   125              b.Fatalf("sonic utf8 validate wrong for %s string: %v", test.name, test.data)
   126          }
   127          b.Run("Sonic_" + test.name, func(b *testing.B) {
   128              for i := 0; i < b.N; i++ {
   129                  Validate(test.data)
   130              }
   131          })
   132          b.Run("StdLib_" + test.name, func(b *testing.B) {
   133              for i := 0; i < b.N; i++ {
   134                  utf8.Valid(test.data)
   135              }
   136          })
   137      }
   138  }
   139  

View as plain text