1
16
17 package utf8
18
19 import (
20 `testing`
21 `strings`
22 `github.com/stretchr/testify/assert`
23 `unicode/utf8`
24 `bytes`
25 `math/rand`
26 )
27
// Raw byte sequences used to assemble malformed UTF-8 inputs in this file's tests.
var (
	// Leading-byte bit patterns of 2-, 3- and 4-byte sequences, with no
	// continuation bytes following them.
	_Header_2Bytes = "\xC0"
	_Header_3Bytes = "\xE0"
	_Header_4Bytes = "\xF0"

	// Three-byte encodings of surrogate code points, which are not valid UTF-8.
	// NOTE(review): ED A0 80 encodes U+D800 (a high/leading surrogate in Unicode
	// terms) and ED B0 80 encodes U+DC00 (a low/trailing surrogate), so these two
	// names look swapped. Confirm before renaming; the tests reference both names.
	_Low_Surrogate  = "\xED\xA0\x80"
	_High_Surrogate = "\xED\xB0\x80"

	// A lone continuation byte (bit pattern 10xxxxxx).
	_Cont = "\xb0"
)
36
37 func TestCorrectWith_InvalidUtf8(t *testing.T) {
38 var tests = []struct {
39 name string
40 input string
41 expect string
42 errpos int
43 } {
44 {"basic", `abc`, "abc", -1},
45 {"long", strings.Repeat("helloα,景😊", 1000), strings.Repeat("helloα,景😊", 1000), -1},
46
47
48 {"single_Cont", _Cont, "\ufffd", 0},
49 {"single_Header_2Bytes", _Header_2Bytes, "\ufffd", 0},
50 {"single_Header_3Bytes", _Header_3Bytes, "\ufffd", 0},
51 {"single_Header_4Bytes", _Header_4Bytes, "\ufffd", 0},
52
53
54 {"two_Header_2Bytes + _Cont", _Header_2Bytes + _Cont, "\ufffd\ufffd", 0},
55 {`two_Header_4Bytes + _Cont+ "xx"`, _Header_4Bytes + _Cont + "xx", "\ufffd\ufffdxx", 0},
56 { `"xx" + three_Header_4Bytes + _Cont + _Cont`, "xx" + _Header_4Bytes + _Cont + _Cont, "xx\ufffd\ufffd\ufffd", 2},
57
58
59 {`three_Low_Surrogate`, _Low_Surrogate, "\ufffd\ufffd\ufffd", 0},
60 {`three__High_Surrogate`, _High_Surrogate, "\ufffd\ufffd\ufffd", 0},
61
62
63 {`_High_Surrogate + _Low_Surrogate`, _High_Surrogate + _Low_Surrogate, "\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd", 0},
64 {`"\x80\x80\x80\x80"`, "\x80\x80\x80\x80", "\ufffd\ufffd\ufffd\ufffd", 0},
65 }
66 for _, test := range tests {
67 got := CorrectWith(nil, []byte(test.input), "\ufffd")
68 assert.Equal(t, []byte(test.expect), got, test.name)
69 assert.Equal(t,test.errpos == -1, utf8.ValidString(test.input), test.name)
70 }
71 }
72
// genRandBytes returns length pseudo-random bytes, each drawn from the full
// range 0x00-0xFF via math/rand's package-level source. A non-positive length
// yields a nil slice.
func genRandBytes(length int) []byte {
	var out bytes.Buffer
	if length > 0 {
		// The final size is known, so reserve it once. Grow panics on
		// negative counts, hence the guard.
		out.Grow(length)
	}
	for written := 0; written < length; written++ {
		out.WriteByte(byte(rand.Intn(0x100)))
	}
	return out.Bytes()
}
80
// genRandAscii returns length pseudo-random bytes restricted to the 7-bit
// range 0x00-0x7F, drawn via math/rand's package-level source. A non-positive
// length yields a nil slice.
func genRandAscii(length int) []byte {
	var out bytes.Buffer
	if length > 0 {
		// Every value fits in one byte, so the final size is exactly length.
		out.Grow(length)
	}
	for written := 0; written < length; written++ {
		out.WriteByte(byte(rand.Intn(0x80)))
	}
	return out.Bytes()
}
88
// genRandRune returns the UTF-8 encoding of length pseudo-random code points
// drawn from 0 through 0x10FFFF via math/rand's package-level source. Each
// value is written with bytes.Buffer.WriteRune. A non-positive length yields
// a nil slice.
func genRandRune(length int) []byte {
	// Exclusive upper bound for rand.Intn, so 0x10FFFF itself can be drawn.
	const runeLimit = 0x10FFFF + 1

	var encoded bytes.Buffer
	for remaining := length; remaining > 0; remaining-- {
		r := rune(rand.Intn(runeLimit))
		encoded.WriteRune(r)
	}
	return encoded.Bytes()
}
96
97 func TestValidate_Random(t *testing.T) {
98
99 compare := func(t *testing.T, data []byte) {
100 assert.Equal(t, utf8.Valid(data), Validate(data), string(data))
101 }
102
103
104 nums := 1000
105 maxLen := 1000
106 for i := 0; i < nums; i++ {
107 length := rand.Intn(maxLen)
108 compare(t, genRandBytes(length))
109 compare(t, genRandRune(length))
110 }
111 }
112
113 func BenchmarkValidate(b *testing.B) {
114 bench := []struct {
115 name string
116 data []byte
117 } {
118 {"ValidAscii", genRandAscii(1000)},
119 {"ValidUTF8", genRandRune(1000)},
120 {"RandomBytes", genRandBytes(1000)},
121 }
122
123 for _, test := range bench {
124 if utf8.Valid(test.data) != Validate(test.data) {
125 b.Fatalf("sonic utf8 validate wrong for %s string: %v", test.name, test.data)
126 }
127 b.Run("Sonic_" + test.name, func(b *testing.B) {
128 for i := 0; i < b.N; i++ {
129 Validate(test.data)
130 }
131 })
132 b.Run("StdLib_" + test.name, func(b *testing.B) {
133 for i := 0; i < b.N; i++ {
134 utf8.Valid(test.data)
135 }
136 })
137 }
138 }
139
View as plain text