1
2
3
4
5
6 package unicode
7
8 import (
9 "bytes"
10 "errors"
11 "unicode/utf16"
12 "unicode/utf8"
13
14 "golang.org/x/text/encoding"
15 "golang.org/x/text/encoding/internal"
16 "golang.org/x/text/encoding/internal/identifier"
17 "golang.org/x/text/internal/utf8internal"
18 "golang.org/x/text/runes"
19 "golang.org/x/text/transform"
20 )
21
22
23
24
25
26
27
28
29
// UTF8 is the UTF-8 encoding, exposed through the encoding.Encoding
// interface. It is backed by utf8enc, whose decoder (utf8Decoder) contains no
// byte-order-mark handling: a leading BOM is copied through like any other
// well-formed sequence. Use UTF8BOM when the BOM should be stripped or added.
var UTF8 encoding.Encoding = utf8enc
31
32
33
34
35
36
37
38
39
// UTF8BOM is a UTF-8 encoding whose decoder drops one leading byte order mark
// (when present) and whose encoder writes a byte order mark ahead of the first
// output it produces.
var UTF8BOM encoding.Encoding = utf8bomEncoding{}

// utf8bomEncoding is the stateless encoding.Encoding implementation behind
// UTF8BOM; all per-stream state lives in the transformers it creates.
type utf8bomEncoding struct{}
43
44 func (utf8bomEncoding) String() string {
45 return "UTF-8-BOM"
46 }
47
48 func (utf8bomEncoding) ID() (identifier.MIB, string) {
49 return identifier.Unofficial, "x-utf8bom"
50 }
51
52 func (utf8bomEncoding) NewEncoder() *encoding.Encoder {
53 return &encoding.Encoder{
54 Transformer: &utf8bomEncoder{t: runes.ReplaceIllFormed()},
55 }
56 }
57
58 func (utf8bomEncoding) NewDecoder() *encoding.Decoder {
59 return &encoding.Decoder{Transformer: &utf8bomDecoder{}}
60 }
61
// utf8enc is the implementation behind UTF8. It pairs utf8Decoder with
// runes.ReplaceIllFormed() and carries the name "UTF-8" and the identifier
// identifier.UTF8.
//
// NOTE(review): both literals are positional, so the values must stay in the
// field order declared by internal.SimpleEncoding and internal.Encoding
// (presumably decoder before encoder) — confirm against that package before
// reordering anything here.
var utf8enc = &internal.Encoding{
	&internal.SimpleEncoding{utf8Decoder{}, runes.ReplaceIllFormed()},
	"UTF-8",
	identifier.UTF8,
}
67
// utf8bomDecoder decodes UTF-8 and strips a single leading byte order mark.
type utf8bomDecoder struct {
	// checked records that the start of the stream has already been inspected
	// for a byte order mark, so later Transform calls skip that step.
	checked bool
}

// Reset makes the decoder look for a leading byte order mark again.
func (t *utf8bomDecoder) Reset() {
	t.checked = false
}
75
76 func (t *utf8bomDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
77 if !t.checked {
78 if !atEOF && len(src) < len(utf8BOM) {
79 if len(src) == 0 {
80 return 0, 0, nil
81 }
82 return 0, 0, transform.ErrShortSrc
83 }
84 if bytes.HasPrefix(src, []byte(utf8BOM)) {
85 nSrc += len(utf8BOM)
86 src = src[len(utf8BOM):]
87 }
88 t.checked = true
89 }
90 nDst, n, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF)
91 nSrc += n
92 return nDst, nSrc, err
93 }
94
// utf8bomEncoder emits a byte order mark before the first output and then
// passes text through utf8Decoder.
type utf8bomEncoder struct {
	// written records that the byte order mark has already been emitted.
	written bool
	// t is the transformer supplied at construction time. In this file it is
	// only touched by Reset; Transform does not route data through it.
	t transform.Transformer
}

// Reset arranges for the byte order mark to be written again and resets the
// wrapped transformer.
func (t *utf8bomEncoder) Reset() {
	t.written = false
	t.t.Reset()
}
104
105 func (t *utf8bomEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
106 if !t.written {
107 if len(dst) < len(utf8BOM) {
108 return nDst, 0, transform.ErrShortDst
109 }
110 nDst = copy(dst, utf8BOM)
111 t.written = true
112 }
113 n, nSrc, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF)
114 nDst += n
115 return nDst, nSrc, err
116 }
117
// utf8Decoder is a stateless transformer that copies well-formed UTF-8 and
// replaces ill-formed sequences with U+FFFD. It embeds transform.NopResetter
// to supply its Reset method.
type utf8Decoder struct{ transform.NopResetter }
119
120 func (utf8Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
121 var pSrc int
122 var accept utf8internal.AcceptRange
123
124
125 n := len(src)
126 if len(dst) < n {
127 err = transform.ErrShortDst
128 n = len(dst)
129 atEOF = false
130 }
131 for nSrc < n {
132 c := src[nSrc]
133 if c < utf8.RuneSelf {
134 nSrc++
135 continue
136 }
137 first := utf8internal.First[c]
138 size := int(first & utf8internal.SizeMask)
139 if first == utf8internal.FirstInvalid {
140 goto handleInvalid
141 }
142 accept = utf8internal.AcceptRanges[first>>utf8internal.AcceptShift]
143 if nSrc+size > n {
144 if !atEOF {
145
146
147
148 if err == nil {
149 err = transform.ErrShortSrc
150 }
151 break
152 }
153
154 switch {
155 case nSrc+1 >= n || src[nSrc+1] < accept.Lo || accept.Hi < src[nSrc+1]:
156 size = 1
157 case nSrc+2 >= n || src[nSrc+2] < utf8internal.LoCB || utf8internal.HiCB < src[nSrc+2]:
158 size = 2
159 default:
160 size = 3
161 }
162 goto handleInvalid
163 }
164 if c = src[nSrc+1]; c < accept.Lo || accept.Hi < c {
165 size = 1
166 goto handleInvalid
167 } else if size == 2 {
168 } else if c = src[nSrc+2]; c < utf8internal.LoCB || utf8internal.HiCB < c {
169 size = 2
170 goto handleInvalid
171 } else if size == 3 {
172 } else if c = src[nSrc+3]; c < utf8internal.LoCB || utf8internal.HiCB < c {
173 size = 3
174 goto handleInvalid
175 }
176 nSrc += size
177 continue
178
179 handleInvalid:
180
181 nDst += copy(dst[nDst:], src[pSrc:nSrc])
182
183
184 const runeError = "\ufffd"
185 if nDst+len(runeError) > len(dst) {
186 return nDst, nSrc, transform.ErrShortDst
187 }
188 nDst += copy(dst[nDst:], runeError)
189
190
191
192
193 nSrc += size
194 pSrc = nSrc
195
196
197 if sz := len(dst) - nDst; sz < len(src)-nSrc {
198 err = transform.ErrShortDst
199 n = nSrc + sz
200 atEOF = false
201 }
202 }
203 return nDst + copy(dst[nDst:], src[pSrc:nSrc]), nSrc, err
204 }
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233 func UTF16(e Endianness, b BOMPolicy) encoding.Encoding {
234 return utf16Encoding{config{e, b}, mibValue[e][b&bomMask]}
235 }
236
237
238
239
240
241 var mibValue = map[Endianness][numBOMValues]identifier.MIB{
242 BigEndian: [numBOMValues]identifier.MIB{
243 IgnoreBOM: identifier.UTF16BE,
244 UseBOM: identifier.UTF16,
245
246 },
247 LittleEndian: [numBOMValues]identifier.MIB{
248 IgnoreBOM: identifier.UTF16LE,
249 UseBOM: identifier.UTF16,
250
251 },
252
253 }
254
255
// All lists the encodings this package offers as a set: UTF8, big-endian
// UTF-16 with UseBOM, and both endiannesses of UTF-16 with IgnoreBOM.
// UTF8BOM and the ExpectBOM configurations are not included.
var All = []encoding.Encoding{
	UTF8,
	UTF16(BigEndian, UseBOM),
	UTF16(BigEndian, IgnoreBOM),
	UTF16(LittleEndian, IgnoreBOM),
}
262
263
// BOMPolicy is a UTF-16 encoding's byte order mark policy, stored as a set of
// bit flags.
type BOMPolicy uint8

const (
	// writeBOM makes the encoder emit a byte order mark before its first
	// output.
	writeBOM BOMPolicy = 0x01
	// acceptBOM makes the decoder consume a leading byte order mark and adopt
	// the endianness it indicates.
	acceptBOM BOMPolicy = 0x02
	// requireBOM makes the decoder fail with ErrMissingBOM when the input does
	// not start with a byte order mark.
	requireBOM BOMPolicy = 0x04
	// bomMask selects the three flag bits above; UTF16 applies it before
	// indexing into mibValue.
	bomMask BOMPolicy = 0x07

	// numBOMValues is the length of the per-endianness arrays in mibValue. It
	// is one larger than the 8 values bomMask can select.
	// NOTE(review): the reason for the extra slot is not visible in this file
	// — confirm before changing it back to 8.
	numBOMValues = 8 + 1

	// IgnoreBOM means no byte order mark is written, accepted or required:
	// the configured endianness is always used.
	IgnoreBOM BOMPolicy = 0

	// UseBOM means the encoder writes a byte order mark and the decoder lets
	// a leading byte order mark, if present, override the default endianness.
	UseBOM BOMPolicy = writeBOM | acceptBOM

	// ExpectBOM is like UseBOM, but decoding additionally fails with
	// ErrMissingBOM when the input does not start with a byte order mark.
	ExpectBOM BOMPolicy = writeBOM | acceptBOM | requireBOM
)
301
302
// Endianness is a UTF-16 encoding's default byte order.
type Endianness bool

const (
	// BigEndian stores the most significant byte of each 16-bit unit first.
	BigEndian Endianness = false
	// LittleEndian stores the least significant byte of each 16-bit unit
	// first.
	LittleEndian Endianness = true
)
311
312
313
// ErrMissingBOM is returned by the UTF-16 decoder when its policy requires a
// byte order mark (ExpectBOM) and the input does not start with one.
var ErrMissingBOM = errors.New("encoding: missing byte order mark")
315
// utf16Encoding is the encoding.Encoding implementation returned by UTF16.
type utf16Encoding struct {
	config
	// mib is the identifier reported by ID, looked up from mibValue in UTF16.
	mib identifier.MIB
}

// config holds the settings of a UTF-16 encoding. The decoder keeps two
// copies so that Reset can restore the initial settings after a byte order
// mark has changed the current ones.
type config struct {
	endianness Endianness
	bomPolicy  BOMPolicy
}
325
326 func (u utf16Encoding) NewDecoder() *encoding.Decoder {
327 return &encoding.Decoder{Transformer: &utf16Decoder{
328 initial: u.config,
329 current: u.config,
330 }}
331 }
332
333 func (u utf16Encoding) NewEncoder() *encoding.Encoder {
334 return &encoding.Encoder{Transformer: &utf16Encoder{
335 endianness: u.endianness,
336 initialBOMPolicy: u.bomPolicy,
337 currentBOMPolicy: u.bomPolicy,
338 }}
339 }
340
341 func (u utf16Encoding) ID() (mib identifier.MIB, other string) {
342 return u.mib, ""
343 }
344
345 func (u utf16Encoding) String() string {
346 e, b := "B", ""
347 if u.endianness == LittleEndian {
348 e = "L"
349 }
350 switch u.bomPolicy {
351 case ExpectBOM:
352 b = "Expect"
353 case UseBOM:
354 b = "Use"
355 case IgnoreBOM:
356 b = "Ignore"
357 }
358 return "UTF-16" + e + "E (" + b + " BOM)"
359 }
360
// utf16Decoder transforms UTF-16 into UTF-8.
type utf16Decoder struct {
	// initial is the configuration the decoder was created with.
	initial config
	// current is the configuration in effect; Transform updates it once the
	// start of the stream has been checked for a byte order mark.
	current config
}

// Reset restores the configuration the decoder was created with, so the next
// stream is checked for a byte order mark again.
func (u *utf16Decoder) Reset() {
	u.current = u.initial
}
369
370 func (u *utf16Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
371 if len(src) < 2 && atEOF && u.current.bomPolicy&requireBOM != 0 {
372 return 0, 0, ErrMissingBOM
373 }
374 if len(src) == 0 {
375 return 0, 0, nil
376 }
377 if len(src) >= 2 && u.current.bomPolicy&acceptBOM != 0 {
378 switch {
379 case src[0] == 0xfe && src[1] == 0xff:
380 u.current.endianness = BigEndian
381 nSrc = 2
382 case src[0] == 0xff && src[1] == 0xfe:
383 u.current.endianness = LittleEndian
384 nSrc = 2
385 default:
386 if u.current.bomPolicy&requireBOM != 0 {
387 return 0, 0, ErrMissingBOM
388 }
389 }
390 u.current.bomPolicy = IgnoreBOM
391 }
392
393 var r rune
394 var dSize, sSize int
395 for nSrc < len(src) {
396 if nSrc+1 < len(src) {
397 x := uint16(src[nSrc+0])<<8 | uint16(src[nSrc+1])
398 if u.current.endianness == LittleEndian {
399 x = x>>8 | x<<8
400 }
401 r, sSize = rune(x), 2
402 if utf16.IsSurrogate(r) {
403 if nSrc+3 < len(src) {
404 x = uint16(src[nSrc+2])<<8 | uint16(src[nSrc+3])
405 if u.current.endianness == LittleEndian {
406 x = x>>8 | x<<8
407 }
408
409 if isHighSurrogate(rune(x)) {
410 r, sSize = utf16.DecodeRune(r, rune(x)), 4
411 }
412 } else if !atEOF {
413 err = transform.ErrShortSrc
414 break
415 }
416 }
417 if dSize = utf8.RuneLen(r); dSize < 0 {
418 r, dSize = utf8.RuneError, 3
419 }
420 } else if atEOF {
421
422 r, dSize, sSize = utf8.RuneError, 3, 1
423 } else {
424 err = transform.ErrShortSrc
425 break
426 }
427 if nDst+dSize > len(dst) {
428 err = transform.ErrShortDst
429 break
430 }
431 nDst += utf8.EncodeRune(dst[nDst:], r)
432 nSrc += sSize
433 }
434 return nDst, nSrc, err
435 }
436
// isHighSurrogate reports whether r lies in the range U+DC00..U+DFFF, i.e.
// whether it can be the second unit of a UTF-16 surrogate pair.
//
// NOTE(review): Unicode calls this range the low (trailing) surrogates, so the
// name is misleading; it is kept because callers refer to it by this name.
func isHighSurrogate(r rune) bool {
	const (
		first rune = 0xDC00
		last  rune = 0xDFFF
	)
	return r >= first && r <= last
}
440
// utf16Encoder transforms UTF-8 into UTF-16.
type utf16Encoder struct {
	// endianness is the byte order of the output, including the byte order
	// mark.
	endianness Endianness
	// initialBOMPolicy is the policy the encoder was created with.
	initialBOMPolicy BOMPolicy
	// currentBOMPolicy is the policy in effect; Transform sets it to
	// IgnoreBOM once the byte order mark has been written.
	currentBOMPolicy BOMPolicy
}

// Reset restores the initial BOM policy, so a byte order mark is written
// again if that policy asks for one.
func (u *utf16Encoder) Reset() {
	u.currentBOMPolicy = u.initialBOMPolicy
}
450
451 func (u *utf16Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
452 if u.currentBOMPolicy&writeBOM != 0 {
453 if len(dst) < 2 {
454 return 0, 0, transform.ErrShortDst
455 }
456 dst[0], dst[1] = 0xfe, 0xff
457 u.currentBOMPolicy = IgnoreBOM
458 nDst = 2
459 }
460
461 r, size := rune(0), 0
462 for nSrc < len(src) {
463 r = rune(src[nSrc])
464
465
466 if r < utf8.RuneSelf {
467 size = 1
468
469 } else {
470
471 r, size = utf8.DecodeRune(src[nSrc:])
472 if size == 1 {
473
474
475
476 if !atEOF && !utf8.FullRune(src[nSrc:]) {
477 err = transform.ErrShortSrc
478 break
479 }
480 }
481 }
482
483 if r <= 0xffff {
484 if nDst+2 > len(dst) {
485 err = transform.ErrShortDst
486 break
487 }
488 dst[nDst+0] = uint8(r >> 8)
489 dst[nDst+1] = uint8(r)
490 nDst += 2
491 } else {
492 if nDst+4 > len(dst) {
493 err = transform.ErrShortDst
494 break
495 }
496 r1, r2 := utf16.EncodeRune(r)
497 dst[nDst+0] = uint8(r1 >> 8)
498 dst[nDst+1] = uint8(r1)
499 dst[nDst+2] = uint8(r2 >> 8)
500 dst[nDst+3] = uint8(r2)
501 nDst += 4
502 }
503 nSrc += size
504 }
505
506 if u.endianness == LittleEndian {
507 for i := 0; i < nDst; i += 2 {
508 dst[i], dst[i+1] = dst[i+1], dst[i]
509 }
510 }
511 return nDst, nSrc, err
512 }
513
View as plain text