utf32.go

Documentation: golang.org/x/text/encoding/unicode/utf32

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package utf32 provides the UTF-32 Unicode encoding.
     6  //
     7  // Please note that support for UTF-32 is discouraged as it is a rare and
     8  // inefficient encoding, unfit for use as an interchange format. For use
     9  // on the web, the W3C strongly discourages its use
    10  // (https://www.w3.org/TR/html5/document-metadata.html#charset)
    11  // while WHATWG directly prohibits supporting it
    12  // (https://html.spec.whatwg.org/multipage/syntax.html#character-encodings).
    13  package utf32 // import "golang.org/x/text/encoding/unicode/utf32"
    14  
    15  import (
    16  	"errors"
    17  	"unicode/utf8"
    18  
    19  	"golang.org/x/text/encoding"
    20  	"golang.org/x/text/encoding/internal/identifier"
    21  	"golang.org/x/text/transform"
    22  )
    23  
    24  // All lists a configuration for each IANA-defined UTF-32 variant.
    25  var All = []encoding.Encoding{
    26  	UTF32(BigEndian, UseBOM),
    27  	UTF32(BigEndian, IgnoreBOM),
    28  	UTF32(LittleEndian, IgnoreBOM),
    29  }
    30  
    31  // ErrMissingBOM means that decoding UTF-32 input with ExpectBOM did not
    32  // find a starting byte order mark.
    33  var ErrMissingBOM = errors.New("encoding: missing byte order mark")
    34  
    35  // UTF32 returns a UTF-32 Encoding for the given default endianness and
    36  // byte order mark (BOM) policy.
    37  //
    38  // When decoding from UTF-32 to UTF-8, if the BOMPolicy is IgnoreBOM then
    39  // neither BOMs U+FEFF nor ill-formed code units 0xFFFE0000 in the input
    40  // stream will affect the endianness used for decoding. Instead BOMs will
    41  // be output as their standard UTF-8 encoding "\xef\xbb\xbf" while
    42  // 0xFFFE0000 code units will be output as "\xef\xbf\xbd", the standard
    43  // UTF-8 encoding for the Unicode replacement character. If the BOMPolicy
    44  // is UseBOM or ExpectBOM a starting BOM is not written to the UTF-8
    45  // output. Instead, it overrides the default endianness e for the remainder
    46  // of the transformation. Any subsequent BOMs U+FEFF or ill-formed code
    47  // units 0xFFFE0000 will not affect the endianness used, and will instead
    48  // be output as their standard UTF-8 (replacement) encodings. For UseBOM,
    49  // if there is no starting BOM, it will proceed with the default
    50  // Endianness. For ExpectBOM, in that case, the transformation will return
    51  // early with an ErrMissingBOM error.
    52  //
    53  // When encoding from UTF-8 to UTF-32, a BOM will be inserted at the start
    54  // of the output if the BOMPolicy is UseBOM or ExpectBOM. Otherwise, a BOM
    55  // will not be inserted. The UTF-8 input does not need to contain a BOM.
    56  //
    57  // There is no concept of a 'native' endianness. If the UTF-32 data is
    58  // produced and consumed in a greater context that implies a certain
    59  // endianness, use IgnoreBOM. Otherwise, use ExpectBOM and always produce
    60  // and consume a BOM.
    61  //
    62  // In the language of https://www.unicode.org/faq/utf_bom.html#bom10,
    63  // IgnoreBOM corresponds to "Where the precise type of the data stream is
    64  // known... the BOM should not be used" and ExpectBOM corresponds to "A
    65  // particular protocol... may require use of the BOM".
    66  func UTF32(e Endianness, b BOMPolicy) encoding.Encoding {
    67  	return utf32Encoding{config{e, b}, mibValue[e][b&bomMask]}
    68  }
    69  
    70  // mibValue maps Endianness and BOMPolicy settings to MIB constants for UTF-32.
    71  // Note that some configurations map to the same MIB identifier.
    72  var mibValue = map[Endianness][numBOMValues]identifier.MIB{
    73  	BigEndian: [numBOMValues]identifier.MIB{
    74  		IgnoreBOM: identifier.UTF32BE,
    75  		UseBOM:    identifier.UTF32,
    76  	},
    77  	LittleEndian: [numBOMValues]identifier.MIB{
    78  		IgnoreBOM: identifier.UTF32LE,
    79  		UseBOM:    identifier.UTF32,
    80  	},
    81  	// ExpectBOM is not widely used and has no valid MIB identifier.
    82  }
    83  
    84  // BOMPolicy is a UTF-32 encodings's byte order mark policy.
    85  type BOMPolicy uint8
    86  
    87  const (
    88  	writeBOM   BOMPolicy = 0x01
    89  	acceptBOM  BOMPolicy = 0x02
    90  	requireBOM BOMPolicy = 0x04
    91  	bomMask    BOMPolicy = 0x07
    92  
    93  	// HACK: numBOMValues == 8 triggers a bug in the 1.4 compiler (cannot have a
    94  	// map of an array of length 8 of a type that is also used as a key or value
    95  	// in another map). See golang.org/issue/11354.
    96  	// TODO: consider changing this value back to 8 if the use of 1.4.* has
    97  	// been minimized.
    98  	numBOMValues = 8 + 1
    99  
   100  	// IgnoreBOM means to ignore any byte order marks.
   101  	IgnoreBOM BOMPolicy = 0
   102  	// Unicode-compliant interpretation for UTF-32BE/LE.
   103  
   104  	// UseBOM means that the UTF-32 form may start with a byte order mark,
   105  	// which will be used to override the default encoding.
   106  	UseBOM BOMPolicy = writeBOM | acceptBOM
   107  	// Unicode-compliant interpretation for UTF-32.
   108  
   109  	// ExpectBOM means that the UTF-32 form must start with a byte order mark,
   110  	// which will be used to override the default encoding.
   111  	ExpectBOM BOMPolicy = writeBOM | acceptBOM | requireBOM
   112  	// Consistent with BOMPolicy definition in golang.org/x/text/encoding/unicode
   113  )
   114  
   115  // Endianness is a UTF-32 encoding's default endianness.
   116  type Endianness bool
   117  
   118  const (
   119  	// BigEndian is UTF-32BE.
   120  	BigEndian Endianness = false
   121  	// LittleEndian is UTF-32LE.
   122  	LittleEndian Endianness = true
   123  )
   124  
   125  type config struct {
   126  	endianness Endianness
   127  	bomPolicy  BOMPolicy
   128  }
   129  
   130  type utf32Encoding struct {
   131  	config
   132  	mib identifier.MIB
   133  }
   134  
   135  func (u utf32Encoding) NewDecoder() *encoding.Decoder {
   136  	return &encoding.Decoder{Transformer: &utf32Decoder{
   137  		initial: u.config,
   138  		current: u.config,
   139  	}}
   140  }
   141  
   142  func (u utf32Encoding) NewEncoder() *encoding.Encoder {
   143  	return &encoding.Encoder{Transformer: &utf32Encoder{
   144  		endianness:       u.endianness,
   145  		initialBOMPolicy: u.bomPolicy,
   146  		currentBOMPolicy: u.bomPolicy,
   147  	}}
   148  }
   149  
   150  func (u utf32Encoding) ID() (mib identifier.MIB, other string) {
   151  	return u.mib, ""
   152  }
   153  
   154  func (u utf32Encoding) String() string {
   155  	e, b := "B", ""
   156  	if u.endianness == LittleEndian {
   157  		e = "L"
   158  	}
   159  	switch u.bomPolicy {
   160  	case ExpectBOM:
   161  		b = "Expect"
   162  	case UseBOM:
   163  		b = "Use"
   164  	case IgnoreBOM:
   165  		b = "Ignore"
   166  	}
   167  	return "UTF-32" + e + "E (" + b + " BOM)"
   168  }
   169  
   170  type utf32Decoder struct {
   171  	initial config
   172  	current config
   173  }
   174  
   175  func (u *utf32Decoder) Reset() {
   176  	u.current = u.initial
   177  }
   178  
   179  func (u *utf32Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   180  	if len(src) == 0 {
   181  		if atEOF && u.current.bomPolicy&requireBOM != 0 {
   182  			return 0, 0, ErrMissingBOM
   183  		}
   184  		return 0, 0, nil
   185  	}
   186  	if u.current.bomPolicy&acceptBOM != 0 {
   187  		if len(src) < 4 {
   188  			return 0, 0, transform.ErrShortSrc
   189  		}
   190  		switch {
   191  		case src[0] == 0x00 && src[1] == 0x00 && src[2] == 0xfe && src[3] == 0xff:
   192  			u.current.endianness = BigEndian
   193  			nSrc = 4
   194  		case src[0] == 0xff && src[1] == 0xfe && src[2] == 0x00 && src[3] == 0x00:
   195  			u.current.endianness = LittleEndian
   196  			nSrc = 4
   197  		default:
   198  			if u.current.bomPolicy&requireBOM != 0 {
   199  				return 0, 0, ErrMissingBOM
   200  			}
   201  		}
   202  		u.current.bomPolicy = IgnoreBOM
   203  	}
   204  
   205  	var r rune
   206  	var dSize, sSize int
   207  	for nSrc < len(src) {
   208  		if nSrc+3 < len(src) {
   209  			x := uint32(src[nSrc+0])<<24 | uint32(src[nSrc+1])<<16 |
   210  				uint32(src[nSrc+2])<<8 | uint32(src[nSrc+3])
   211  			if u.current.endianness == LittleEndian {
   212  				x = x>>24 | (x >> 8 & 0x0000FF00) | (x << 8 & 0x00FF0000) | x<<24
   213  			}
   214  			r, sSize = rune(x), 4
   215  			if dSize = utf8.RuneLen(r); dSize < 0 {
   216  				r, dSize = utf8.RuneError, 3
   217  			}
   218  		} else if atEOF {
   219  			// 1..3 trailing bytes.
   220  			r, dSize, sSize = utf8.RuneError, 3, len(src)-nSrc
   221  		} else {
   222  			err = transform.ErrShortSrc
   223  			break
   224  		}
   225  		if nDst+dSize > len(dst) {
   226  			err = transform.ErrShortDst
   227  			break
   228  		}
   229  		nDst += utf8.EncodeRune(dst[nDst:], r)
   230  		nSrc += sSize
   231  	}
   232  	return nDst, nSrc, err
   233  }
   234  
   235  type utf32Encoder struct {
   236  	endianness       Endianness
   237  	initialBOMPolicy BOMPolicy
   238  	currentBOMPolicy BOMPolicy
   239  }
   240  
   241  func (u *utf32Encoder) Reset() {
   242  	u.currentBOMPolicy = u.initialBOMPolicy
   243  }
   244  
   245  func (u *utf32Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   246  	if u.currentBOMPolicy&writeBOM != 0 {
   247  		if len(dst) < 4 {
   248  			return 0, 0, transform.ErrShortDst
   249  		}
   250  		dst[0], dst[1], dst[2], dst[3] = 0x00, 0x00, 0xfe, 0xff
   251  		u.currentBOMPolicy = IgnoreBOM
   252  		nDst = 4
   253  	}
   254  
   255  	r, size := rune(0), 0
   256  	for nSrc < len(src) {
   257  		r = rune(src[nSrc])
   258  
   259  		// Decode a 1-byte rune.
   260  		if r < utf8.RuneSelf {
   261  			size = 1
   262  
   263  		} else {
   264  			// Decode a multi-byte rune.
   265  			r, size = utf8.DecodeRune(src[nSrc:])
   266  			if size == 1 {
   267  				// All valid runes of size 1 (those below utf8.RuneSelf) were
   268  				// handled above. We have invalid UTF-8 or we haven't seen the
   269  				// full character yet.
   270  				if !atEOF && !utf8.FullRune(src[nSrc:]) {
   271  					err = transform.ErrShortSrc
   272  					break
   273  				}
   274  			}
   275  		}
   276  
   277  		if nDst+4 > len(dst) {
   278  			err = transform.ErrShortDst
   279  			break
   280  		}
   281  
   282  		dst[nDst+0] = uint8(r >> 24)
   283  		dst[nDst+1] = uint8(r >> 16)
   284  		dst[nDst+2] = uint8(r >> 8)
   285  		dst[nDst+3] = uint8(r)
   286  		nDst += 4
   287  		nSrc += size
   288  	}
   289  
   290  	if u.endianness == LittleEndian {
   291  		for i := 0; i < nDst; i += 4 {
   292  			dst[i], dst[i+1], dst[i+2], dst[i+3] = dst[i+3], dst[i+2], dst[i+1], dst[i]
   293  		}
   294  	}
   295  	return nDst, nSrc, err
   296  }
   297
View as plain text