1
2
3
4
5
6
7
8
9
10
11
12
13 package utf32
14
15 import (
16 "errors"
17 "unicode/utf8"
18
19 "golang.org/x/text/encoding"
20 "golang.org/x/text/encoding/internal/identifier"
21 "golang.org/x/text/transform"
22 )
23
24
25 var All = []encoding.Encoding{
26 UTF32(BigEndian, UseBOM),
27 UTF32(BigEndian, IgnoreBOM),
28 UTF32(LittleEndian, IgnoreBOM),
29 }
30
31
32
// ErrMissingBOM is the error the decoder returns when its BOM policy requires
// a byte order mark and the input does not start with one.
var ErrMissingBOM = errors.New("encoding: missing byte order mark")
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66 func UTF32(e Endianness, b BOMPolicy) encoding.Encoding {
67 return utf32Encoding{config{e, b}, mibValue[e][b&bomMask]}
68 }
69
70
71
72 var mibValue = map[Endianness][numBOMValues]identifier.MIB{
73 BigEndian: [numBOMValues]identifier.MIB{
74 IgnoreBOM: identifier.UTF32BE,
75 UseBOM: identifier.UTF32,
76 },
77 LittleEndian: [numBOMValues]identifier.MIB{
78 IgnoreBOM: identifier.UTF32LE,
79 UseBOM: identifier.UTF32,
80 },
81
82 }
83
84
// BOMPolicy describes how a UTF-32 encoding treats the byte order mark (BOM).
// The exported policies are combinations of the unexported flag bits.
type BOMPolicy uint8

// Flag bits from which the exported policies are assembled.
const (
	writeBOM   BOMPolicy = 1 << iota // encoder emits a BOM before any other output
	acceptBOM                        // decoder inspects the start of the input for a BOM
	requireBOM                       // decoder reports ErrMissingBOM when no BOM is found

	// bomMask selects all of the flag bits above.
	bomMask BOMPolicy = writeBOM | acceptBOM | requireBOM
)

// numBOMValues is the length of the per-endianness arrays in mibValue, which
// are indexed by a BOMPolicy masked with bomMask.
// NOTE(review): the mask admits only 8 distinct values; the reason for the
// extra slot is not visible in this file, so keep the value as is.
const numBOMValues = 8 + 1

// Policies available to callers of UTF32.
const (
	// IgnoreBOM sets no flag: the encoder writes no BOM and the decoder does
	// not look for one.
	IgnoreBOM BOMPolicy = 0

	// UseBOM makes the encoder write a BOM and lets a BOM at the start of the
	// decoder's input override the default byte order. A missing BOM is not
	// an error.
	UseBOM BOMPolicy = writeBOM | acceptBOM

	// ExpectBOM behaves like UseBOM, except that the decoder reports
	// ErrMissingBOM when the input does not start with a BOM.
	ExpectBOM BOMPolicy = writeBOM | acceptBOM | requireBOM
)
114
115
// Endianness is the byte order of the four bytes that make up one UTF-32
// code unit.
type Endianness bool

const (
	// BigEndian places the most significant byte first. It is the zero value
	// of Endianness.
	BigEndian Endianness = false

	// LittleEndian places the least significant byte first.
	LittleEndian Endianness = !BigEndian
)
124
125 type config struct {
126 endianness Endianness
127 bomPolicy BOMPolicy
128 }
129
130 type utf32Encoding struct {
131 config
132 mib identifier.MIB
133 }
134
135 func (u utf32Encoding) NewDecoder() *encoding.Decoder {
136 return &encoding.Decoder{Transformer: &utf32Decoder{
137 initial: u.config,
138 current: u.config,
139 }}
140 }
141
142 func (u utf32Encoding) NewEncoder() *encoding.Encoder {
143 return &encoding.Encoder{Transformer: &utf32Encoder{
144 endianness: u.endianness,
145 initialBOMPolicy: u.bomPolicy,
146 currentBOMPolicy: u.bomPolicy,
147 }}
148 }
149
150 func (u utf32Encoding) ID() (mib identifier.MIB, other string) {
151 return u.mib, ""
152 }
153
154 func (u utf32Encoding) String() string {
155 e, b := "B", ""
156 if u.endianness == LittleEndian {
157 e = "L"
158 }
159 switch u.bomPolicy {
160 case ExpectBOM:
161 b = "Expect"
162 case UseBOM:
163 b = "Use"
164 case IgnoreBOM:
165 b = "Ignore"
166 }
167 return "UTF-32" + e + "E (" + b + " BOM)"
168 }
169
170 type utf32Decoder struct {
171 initial config
172 current config
173 }
174
175 func (u *utf32Decoder) Reset() {
176 u.current = u.initial
177 }
178
179 func (u *utf32Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
180 if len(src) == 0 {
181 if atEOF && u.current.bomPolicy&requireBOM != 0 {
182 return 0, 0, ErrMissingBOM
183 }
184 return 0, 0, nil
185 }
186 if u.current.bomPolicy&acceptBOM != 0 {
187 if len(src) < 4 {
188 return 0, 0, transform.ErrShortSrc
189 }
190 switch {
191 case src[0] == 0x00 && src[1] == 0x00 && src[2] == 0xfe && src[3] == 0xff:
192 u.current.endianness = BigEndian
193 nSrc = 4
194 case src[0] == 0xff && src[1] == 0xfe && src[2] == 0x00 && src[3] == 0x00:
195 u.current.endianness = LittleEndian
196 nSrc = 4
197 default:
198 if u.current.bomPolicy&requireBOM != 0 {
199 return 0, 0, ErrMissingBOM
200 }
201 }
202 u.current.bomPolicy = IgnoreBOM
203 }
204
205 var r rune
206 var dSize, sSize int
207 for nSrc < len(src) {
208 if nSrc+3 < len(src) {
209 x := uint32(src[nSrc+0])<<24 | uint32(src[nSrc+1])<<16 |
210 uint32(src[nSrc+2])<<8 | uint32(src[nSrc+3])
211 if u.current.endianness == LittleEndian {
212 x = x>>24 | (x >> 8 & 0x0000FF00) | (x << 8 & 0x00FF0000) | x<<24
213 }
214 r, sSize = rune(x), 4
215 if dSize = utf8.RuneLen(r); dSize < 0 {
216 r, dSize = utf8.RuneError, 3
217 }
218 } else if atEOF {
219
220 r, dSize, sSize = utf8.RuneError, 3, len(src)-nSrc
221 } else {
222 err = transform.ErrShortSrc
223 break
224 }
225 if nDst+dSize > len(dst) {
226 err = transform.ErrShortDst
227 break
228 }
229 nDst += utf8.EncodeRune(dst[nDst:], r)
230 nSrc += sSize
231 }
232 return nDst, nSrc, err
233 }
234
235 type utf32Encoder struct {
236 endianness Endianness
237 initialBOMPolicy BOMPolicy
238 currentBOMPolicy BOMPolicy
239 }
240
241 func (u *utf32Encoder) Reset() {
242 u.currentBOMPolicy = u.initialBOMPolicy
243 }
244
245 func (u *utf32Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
246 if u.currentBOMPolicy&writeBOM != 0 {
247 if len(dst) < 4 {
248 return 0, 0, transform.ErrShortDst
249 }
250 dst[0], dst[1], dst[2], dst[3] = 0x00, 0x00, 0xfe, 0xff
251 u.currentBOMPolicy = IgnoreBOM
252 nDst = 4
253 }
254
255 r, size := rune(0), 0
256 for nSrc < len(src) {
257 r = rune(src[nSrc])
258
259
260 if r < utf8.RuneSelf {
261 size = 1
262
263 } else {
264
265 r, size = utf8.DecodeRune(src[nSrc:])
266 if size == 1 {
267
268
269
270 if !atEOF && !utf8.FullRune(src[nSrc:]) {
271 err = transform.ErrShortSrc
272 break
273 }
274 }
275 }
276
277 if nDst+4 > len(dst) {
278 err = transform.ErrShortDst
279 break
280 }
281
282 dst[nDst+0] = uint8(r >> 24)
283 dst[nDst+1] = uint8(r >> 16)
284 dst[nDst+2] = uint8(r >> 8)
285 dst[nDst+3] = uint8(r)
286 nDst += 4
287 nSrc += size
288 }
289
290 if u.endianness == LittleEndian {
291 for i := 0; i < nDst; i += 4 {
292 dst[i], dst[i+1], dst[i+2], dst[i+3] = dst[i+3], dst[i+2], dst[i+1], dst[i]
293 }
294 }
295 return nDst, nSrc, err
296 }
297
View as plain text