...

Source file src/golang.org/x/text/encoding/encoding.go

Documentation: golang.org/x/text/encoding

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package encoding defines an interface for character encodings, such as Shift
     6  // JIS and Windows 1252, that can convert to and from UTF-8.
     7  //
     8  // Encoding implementations are provided in other packages, such as
     9  // golang.org/x/text/encoding/charmap and
    10  // golang.org/x/text/encoding/japanese.
    11  package encoding // import "golang.org/x/text/encoding"
    12  
    13  import (
    14  	"errors"
    15  	"io"
    16  	"strconv"
    17  	"unicode/utf8"
    18  
    19  	"golang.org/x/text/encoding/internal/identifier"
    20  	"golang.org/x/text/transform"
    21  )
    22  
    23  // TODO:
    24  // - There seems to be some inconsistency in when decoders return errors
    25  //   and when not. Also documentation seems to suggest they shouldn't return
    26  //   errors at all (except for UTF-16).
    27  // - Encoders seem to rely on or at least benefit from the input being in NFC
    28  //   normal form. Perhaps add an example how users could prepare their output.
    29  
// Encoding is a character set encoding that can be transformed to and from
// UTF-8.
//
// An Encoding hands out a Decoder for converting encoded bytes to UTF-8 and
// an Encoder for converting UTF-8 to the encoding.
type Encoding interface {
	// NewDecoder returns a Decoder.
	NewDecoder() *Decoder

	// NewEncoder returns an Encoder.
	NewEncoder() *Encoder
}
    39  
// A Decoder converts bytes to UTF-8. It implements transform.Transformer.
//
// Transforming source bytes that are not of that encoding will not result in an
// error per se. Each byte that cannot be transcoded will be represented in the
// output by the UTF-8 encoding of '\uFFFD', the replacement rune.
type Decoder struct {
	// Transformer performs the actual conversion; it is embedded so that a
	// Decoder can itself be used as a transform.Transformer.
	transform.Transformer

	// This forces external creators of Decoders to use names in struct
	// initializers, allowing for future extensibility without having to break
	// code.
	_ struct{}
}
    53  
    54  // Bytes converts the given encoded bytes to UTF-8. It returns the converted
    55  // bytes or nil, err if any error occurred.
    56  func (d *Decoder) Bytes(b []byte) ([]byte, error) {
    57  	b, _, err := transform.Bytes(d, b)
    58  	if err != nil {
    59  		return nil, err
    60  	}
    61  	return b, nil
    62  }
    63  
    64  // String converts the given encoded string to UTF-8. It returns the converted
    65  // string or "", err if any error occurred.
    66  func (d *Decoder) String(s string) (string, error) {
    67  	s, _, err := transform.String(d, s)
    68  	if err != nil {
    69  		return "", err
    70  	}
    71  	return s, nil
    72  }
    73  
// Reader wraps another Reader to decode its bytes. The returned Reader is
// created with transform.NewReader, using d as the transformer.
//
// The Decoder may not be used for any other operation as long as the returned
// Reader is in use.
func (d *Decoder) Reader(r io.Reader) io.Reader {
	return transform.NewReader(r, d)
}
    81  
// An Encoder converts bytes from UTF-8. It implements transform.Transformer.
//
// Each rune that cannot be transcoded will result in an error. In this case,
// the transform will consume all source bytes up to, but not including, the
// offending rune. Source bytes that are not valid UTF-8 will be replaced by
// `\uFFFD`. To return early with an error instead, use transform.Chain to
// preprocess the data with a UTF8Validator.
type Encoder struct {
	// Transformer performs the actual conversion; it is embedded so that an
	// Encoder can itself be used as a transform.Transformer.
	transform.Transformer

	// This forces external creators of Encoders to use names in struct
	// initializers, allowing for future extensibility without having to break
	// code.
	_ struct{}
}
    97  
    98  // Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if
    99  // any error occurred.
   100  func (e *Encoder) Bytes(b []byte) ([]byte, error) {
   101  	b, _, err := transform.Bytes(e, b)
   102  	if err != nil {
   103  		return nil, err
   104  	}
   105  	return b, nil
   106  }
   107  
   108  // String converts a string from UTF-8. It returns the converted string or
   109  // "", err if any error occurred.
   110  func (e *Encoder) String(s string) (string, error) {
   111  	s, _, err := transform.String(e, s)
   112  	if err != nil {
   113  		return "", err
   114  	}
   115  	return s, nil
   116  }
   117  
// Writer wraps another Writer to encode its UTF-8 output. The returned Writer
// is created with transform.NewWriter, using e as the transformer.
//
// The Encoder may not be used for any other operation as long as the returned
// Writer is in use.
func (e *Encoder) Writer(w io.Writer) io.Writer {
	return transform.NewWriter(w, e)
}
   125  
// ASCIISub is the ASCII substitute character (byte value 0x1A), as
// recommended by https://unicode.org/reports/tr36/#Text_Comparison
const ASCIISub = '\x1a'
   129  
// Nop is the nop encoding. Its transformed bytes are the same as the source
// bytes; it does not replace invalid UTF-8 sequences.
var Nop Encoding = nop{}

// nop implements Encoding by handing out transform.Nop for both directions.
type nop struct{}

// NewDecoder returns a Decoder whose Transformer is transform.Nop.
func (nop) NewDecoder() *Decoder {
	return &Decoder{Transformer: transform.Nop}
}

// NewEncoder returns an Encoder whose Transformer is transform.Nop.
func (nop) NewEncoder() *Encoder {
	return &Encoder{Transformer: transform.Nop}
}
   142  
// Replacement is the replacement encoding. Decoding from the replacement
// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
// the replacement encoding yields the same as the source bytes except that
// invalid UTF-8 is converted to '\uFFFD'.
//
// It is defined at https://encoding.spec.whatwg.org/#replacement
var Replacement Encoding = replacement{}

// replacement implements Encoding using replacementDecoder and
// replacementEncoder.
type replacement struct{}

// NewDecoder returns a Decoder backed by a replacementDecoder.
func (replacement) NewDecoder() *Decoder {
	return &Decoder{Transformer: replacementDecoder{}}
}

// NewEncoder returns an Encoder backed by a replacementEncoder.
func (replacement) NewEncoder() *Encoder {
	return &Encoder{Transformer: replacementEncoder{}}
}

// ID reports identifier.Replacement as the MIB of this encoding and an empty
// string as its other identifier.
func (replacement) ID() (mib identifier.MIB, other string) {
	return identifier.Replacement, ""
}
   164  
   165  type replacementDecoder struct{ transform.NopResetter }
   166  
   167  func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   168  	if len(dst) < 3 {
   169  		return 0, 0, transform.ErrShortDst
   170  	}
   171  	if atEOF {
   172  		const fffd = "\ufffd"
   173  		dst[0] = fffd[0]
   174  		dst[1] = fffd[1]
   175  		dst[2] = fffd[2]
   176  		nDst = 3
   177  	}
   178  	return nDst, len(src), nil
   179  }
   180  
   181  type replacementEncoder struct{ transform.NopResetter }
   182  
   183  func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   184  	r, size := rune(0), 0
   185  
   186  	for ; nSrc < len(src); nSrc += size {
   187  		r = rune(src[nSrc])
   188  
   189  		// Decode a 1-byte rune.
   190  		if r < utf8.RuneSelf {
   191  			size = 1
   192  
   193  		} else {
   194  			// Decode a multi-byte rune.
   195  			r, size = utf8.DecodeRune(src[nSrc:])
   196  			if size == 1 {
   197  				// All valid runes of size 1 (those below utf8.RuneSelf) were
   198  				// handled above. We have invalid UTF-8 or we haven't seen the
   199  				// full character yet.
   200  				if !atEOF && !utf8.FullRune(src[nSrc:]) {
   201  					err = transform.ErrShortSrc
   202  					break
   203  				}
   204  				r = '\ufffd'
   205  			}
   206  		}
   207  
   208  		if nDst+utf8.RuneLen(r) > len(dst) {
   209  			err = transform.ErrShortDst
   210  			break
   211  		}
   212  		nDst += utf8.EncodeRune(dst[nDst:], r)
   213  	}
   214  	return nDst, nSrc, err
   215  }
   216  
   217  // HTMLEscapeUnsupported wraps encoders to replace source runes outside the
   218  // repertoire of the destination encoding with HTML escape sequences.
   219  //
   220  // This wrapper exists to comply to URL and HTML forms requiring a
   221  // non-terminating legacy encoder. The produced sequences may lead to data
   222  // loss as they are indistinguishable from legitimate input. To avoid this
   223  // issue, use UTF-8 encodings whenever possible.
   224  func HTMLEscapeUnsupported(e *Encoder) *Encoder {
   225  	return &Encoder{Transformer: &errorHandler{e, errorToHTML}}
   226  }
   227  
   228  // ReplaceUnsupported wraps encoders to replace source runes outside the
   229  // repertoire of the destination encoding with an encoding-specific
   230  // replacement.
   231  //
   232  // This wrapper is only provided for backwards compatibility and legacy
   233  // handling. Its use is strongly discouraged. Use UTF-8 whenever possible.
   234  func ReplaceUnsupported(e *Encoder) *Encoder {
   235  	return &Encoder{Transformer: &errorHandler{e, errorToReplacement}}
   236  }
   237  
// errorHandler wraps an Encoder and pairs it with a handler callback. Its
// Transform method calls handler whenever the wrapped Encoder fails with an
// error that implements repertoireError.
type errorHandler struct {
	*Encoder
	// handler writes a substitute for the rune r to dst. It returns the
	// number of bytes written and true, or ok == false if the substitute
	// does not fit in dst.
	handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)
}
   242  
// repertoireError is the interface errorHandler looks for on errors returned
// by the wrapped Encoder. Replacement supplies the single byte that
// errorToReplacement writes in place of the offending rune.
//
// TODO: consider making this error public in some form.
type repertoireError interface {
	Replacement() byte
}
   247  
   248  func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   249  	nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF)
   250  	for err != nil {
   251  		rerr, ok := err.(repertoireError)
   252  		if !ok {
   253  			return nDst, nSrc, err
   254  		}
   255  		r, sz := utf8.DecodeRune(src[nSrc:])
   256  		n, ok := h.handler(dst[nDst:], r, rerr)
   257  		if !ok {
   258  			return nDst, nSrc, transform.ErrShortDst
   259  		}
   260  		err = nil
   261  		nDst += n
   262  		if nSrc += sz; nSrc < len(src) {
   263  			var dn, sn int
   264  			dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF)
   265  			nDst += dn
   266  			nSrc += sn
   267  		}
   268  	}
   269  	return nDst, nSrc, err
   270  }
   271  
   272  func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) {
   273  	buf := [8]byte{}
   274  	b := strconv.AppendUint(buf[:0], uint64(r), 10)
   275  	if n = len(b) + len("&#;"); n >= len(dst) {
   276  		return 0, false
   277  	}
   278  	dst[0] = '&'
   279  	dst[1] = '#'
   280  	dst[copy(dst[2:], b)+2] = ';'
   281  	return n, true
   282  }
   283  
   284  func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) {
   285  	if len(dst) == 0 {
   286  		return 0, false
   287  	}
   288  	dst[0] = err.Replacement()
   289  	return 1, true
   290  }
   291  
// ErrInvalidUTF8 means that a transformer encountered invalid UTF-8. It is
// the error UTF8Validator returns for such input.
var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")
   294  
// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
// input byte that is not valid UTF-8. Valid input is copied to the
// destination unchanged.
var UTF8Validator transform.Transformer = utf8Validator{}

// utf8Validator is the implementation behind UTF8Validator.
type utf8Validator struct{ transform.NopResetter }
   300  
   301  func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   302  	n := len(src)
   303  	if n > len(dst) {
   304  		n = len(dst)
   305  	}
   306  	for i := 0; i < n; {
   307  		if c := src[i]; c < utf8.RuneSelf {
   308  			dst[i] = c
   309  			i++
   310  			continue
   311  		}
   312  		_, size := utf8.DecodeRune(src[i:])
   313  		if size == 1 {
   314  			// All valid runes of size 1 (those below utf8.RuneSelf) were
   315  			// handled above. We have invalid UTF-8 or we haven't seen the
   316  			// full character yet.
   317  			err = ErrInvalidUTF8
   318  			if !atEOF && !utf8.FullRune(src[i:]) {
   319  				err = transform.ErrShortSrc
   320  			}
   321  			return i, i, err
   322  		}
   323  		if i+size > len(dst) {
   324  			return i, i, transform.ErrShortDst
   325  		}
   326  		for ; size > 0; size-- {
   327  			dst[i] = src[i]
   328  			i++
   329  		}
   330  	}
   331  	if len(src) > len(dst) {
   332  		err = transform.ErrShortDst
   333  	}
   334  	return n, n, err
   335  }
   336  

View as plain text