1 package encoder
2
3 import "unicode/utf8"
4
// Inclusive bounds of a UTF-8 continuation byte (bit pattern 10xxxxxx).
const (
	locb = 0x80
	hicb = 0xBF
)

// Entry values stored in the first table, indexed by the first byte of a
// sequence. decodeRuneInString reads the low three bits as the sequence
// length in bytes and the high nibble as the selector for the range the
// second byte must fall in.
const (
	xx = 0xF1 // invalid first byte; consumed as a single byte
	as = 0xF0 // ASCII; consumed as a single byte
	s1 = 0x02 // selector 0, 2-byte sequence
	s2 = 0x13 // selector 1, 3-byte sequence
	s3 = 0x03 // selector 0, 3-byte sequence
	s4 = 0x23 // selector 2, 3-byte sequence
	s5 = 0x34 // selector 3, 4-byte sequence
	s6 = 0x04 // selector 0, 4-byte sequence
	s7 = 0x44 // selector 4, 4-byte sequence
)
24
25
26 var first = [256]uint8{
27
28 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as,
29 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as,
30 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as,
31 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as,
32 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as,
33 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as,
34 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as,
35 as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as,
36
37 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx,
38 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx,
39 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx,
40 xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx,
41 xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
42 s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
43 s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3,
44 s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx,
45 }
46
// Third byte of the two three-byte sequences that decodeRuneInString reports
// separately when the first two bytes are 0xE2 0x80:
// E2 80 A8 encodes U+2028 and E2 80 A9 encodes U+2029.
const (
	lineSep      byte = 0xA8
	paragraphSep byte = 0xA9
)
51
// decodeRuneState is the classification decodeRuneInString assigns to the
// byte sequence at the start of a string.
type decodeRuneState int

// Possible decodeRuneState values.
const (
	// validUTF8State marks a well-formed sequence that needs no special handling.
	validUTF8State = decodeRuneState(iota)
	// runeErrorState marks an invalid or truncated sequence.
	runeErrorState
	// lineSepState marks the byte sequence E2 80 A8 (U+2028).
	lineSepState
	// paragraphSepState marks the byte sequence E2 80 A9 (U+2029).
	paragraphSepState
)
60
61 func decodeRuneInString(s string) (decodeRuneState, int) {
62 n := len(s)
63 s0 := s[0]
64 x := first[s0]
65 if x >= as {
66
67
68
69 mask := rune(x) << 31 >> 31
70 if rune(s[0])&^mask|utf8.RuneError&mask == utf8.RuneError {
71 return runeErrorState, 1
72 }
73 return validUTF8State, 1
74 }
75 sz := int(x & 7)
76 if n < sz {
77 return runeErrorState, 1
78 }
79 s1 := s[1]
80 switch x >> 4 {
81 case 0:
82 if s1 < locb || hicb < s1 {
83 return runeErrorState, 1
84 }
85 case 1:
86 if s1 < 0xA0 || hicb < s1 {
87 return runeErrorState, 1
88 }
89 case 2:
90 if s1 < locb || 0x9F < s1 {
91 return runeErrorState, 1
92 }
93 case 3:
94 if s1 < 0x90 || hicb < s1 {
95 return runeErrorState, 1
96 }
97 case 4:
98 if s1 < locb || 0x8F < s1 {
99 return runeErrorState, 1
100 }
101 }
102 if sz <= 2 {
103 return validUTF8State, 2
104 }
105 s2 := s[2]
106 if s2 < locb || hicb < s2 {
107 return runeErrorState, 1
108 }
109 if sz <= 3 {
110
111 if s0 == 226 && s1 == 128 {
112 switch s2 {
113 case lineSep:
114 return lineSepState, 3
115 case paragraphSep:
116 return paragraphSepState, 3
117 }
118 }
119 return validUTF8State, 3
120 }
121 s3 := s[3]
122 if s3 < locb || hicb < s3 {
123 return runeErrorState, 1
124 }
125 return validUTF8State, 4
126 }
127
View as plain text