...

Source file src/golang.org/x/text/encoding/charmap/charmap.go

Documentation: golang.org/x/text/encoding/charmap

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:generate go run maketables.go
     6  
     7  // Package charmap provides simple character encodings such as IBM Code Page 437
     8  // and Windows 1252.
     9  package charmap // import "golang.org/x/text/encoding/charmap"
    10  
    11  import (
    12  	"unicode/utf8"
    13  
    14  	"golang.org/x/text/encoding"
    15  	"golang.org/x/text/encoding/internal"
    16  	"golang.org/x/text/encoding/internal/identifier"
    17  	"golang.org/x/text/transform"
    18  )
    19  
    20  // These encodings vary only in the way clients should interpret them. Their
    21  // coded character set is identical and a single implementation can be shared.
    22  var (
    23  	// ISO8859_6E is the ISO 8859-6E encoding.
    24  	ISO8859_6E encoding.Encoding = &iso8859_6E
    25  
    26  	// ISO8859_6I is the ISO 8859-6I encoding.
    27  	ISO8859_6I encoding.Encoding = &iso8859_6I
    28  
    29  	// ISO8859_8E is the ISO 8859-8E encoding.
    30  	ISO8859_8E encoding.Encoding = &iso8859_8E
    31  
    32  	// ISO8859_8I is the ISO 8859-8I encoding.
    33  	ISO8859_8I encoding.Encoding = &iso8859_8I
    34  
    35  	iso8859_6E = internal.Encoding{
    36  		Encoding: ISO8859_6,
    37  		Name:     "ISO-8859-6E",
    38  		MIB:      identifier.ISO88596E,
    39  	}
    40  
    41  	iso8859_6I = internal.Encoding{
    42  		Encoding: ISO8859_6,
    43  		Name:     "ISO-8859-6I",
    44  		MIB:      identifier.ISO88596I,
    45  	}
    46  
    47  	iso8859_8E = internal.Encoding{
    48  		Encoding: ISO8859_8,
    49  		Name:     "ISO-8859-8E",
    50  		MIB:      identifier.ISO88598E,
    51  	}
    52  
    53  	iso8859_8I = internal.Encoding{
    54  		Encoding: ISO8859_8,
    55  		Name:     "ISO-8859-8I",
    56  		MIB:      identifier.ISO88598I,
    57  	}
    58  )
    59  
// All is a list of all defined encodings in this package.
//
// All shares its backing array with listAll, so callers should treat the
// slice as read-only.
// NOTE(review): listAll is not declared in this file; presumably it is
// emitted by maketables.go alongside the generated tables — confirm.
var All []encoding.Encoding = listAll
    62  
    63  // TODO: implement these encodings, in order of importance.
    64  // ASCII, ISO8859_1:       Rather common. Close to Windows 1252.
    65  // ISO8859_9:              Close to Windows 1254.
    66  
// utf8Enc holds a rune's UTF-8 encoding in data[:len].
//
// The fixed three-byte buffer means only runes whose UTF-8 form is at most
// three bytes long can be stored.
type utf8Enc struct {
	// len is the number of meaningful bytes at the start of data.
	len uint8
	// data holds the UTF-8 bytes; bytes at index len and beyond are unused.
	data [3]byte
}
    72  
// Charmap is an 8-bit character set encoding.
//
// Decoding is a direct table lookup indexed by the encoded byte; encoding is
// a binary search over a table sorted by rune.
type Charmap struct {
	// name is the encoding's name.
	name string
	// mib is the encoding type of this encoder.
	mib identifier.MIB
	// asciiSuperset states whether the encoding is a superset of ASCII.
	// When true, bytes and runes below utf8.RuneSelf map to themselves
	// without consulting the tables.
	asciiSuperset bool
	// low is the lower bound of the encoded byte for a non-ASCII rune. If
	// Charmap.asciiSuperset is true then this will be 0x80, otherwise 0x00.
	// It is also the first index of encode that the binary search examines.
	low uint8
	// replacement is the encoded replacement character.
	replacement byte
	// decode is the map from encoded byte to UTF-8.
	decode [256]utf8Enc
	// encode is the map from runes to encoded bytes. Each entry is a
	// uint32: the high 8 bits are the encoded byte and the low 24 bits are
	// the rune. The table entries are sorted by ascending rune.
	encode [256]uint32
}
    93  
    94  // NewDecoder implements the encoding.Encoding interface.
    95  func (m *Charmap) NewDecoder() *encoding.Decoder {
    96  	return &encoding.Decoder{Transformer: charmapDecoder{charmap: m}}
    97  }
    98  
    99  // NewEncoder implements the encoding.Encoding interface.
   100  func (m *Charmap) NewEncoder() *encoding.Encoder {
   101  	return &encoding.Encoder{Transformer: charmapEncoder{charmap: m}}
   102  }
   103  
   104  // String returns the Charmap's name.
   105  func (m *Charmap) String() string {
   106  	return m.name
   107  }
   108  
   109  // ID implements an internal interface.
   110  func (m *Charmap) ID() (mib identifier.MIB, other string) {
   111  	return m.mib, ""
   112  }
   113  
// charmapDecoder implements transform.Transformer by decoding to UTF-8.
type charmapDecoder struct {
	// NopResetter supplies the Reset half of the Transformer interface.
	transform.NopResetter
	// charmap supplies the decode table and the asciiSuperset flag used by
	// Transform.
	charmap *Charmap
}
   119  
   120  func (m charmapDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   121  	for i, c := range src {
   122  		if m.charmap.asciiSuperset && c < utf8.RuneSelf {
   123  			if nDst >= len(dst) {
   124  				err = transform.ErrShortDst
   125  				break
   126  			}
   127  			dst[nDst] = c
   128  			nDst++
   129  			nSrc = i + 1
   130  			continue
   131  		}
   132  
   133  		decode := &m.charmap.decode[c]
   134  		n := int(decode.len)
   135  		if nDst+n > len(dst) {
   136  			err = transform.ErrShortDst
   137  			break
   138  		}
   139  		// It's 15% faster to avoid calling copy for these tiny slices.
   140  		for j := 0; j < n; j++ {
   141  			dst[nDst] = decode.data[j]
   142  			nDst++
   143  		}
   144  		nSrc = i + 1
   145  	}
   146  	return nDst, nSrc, err
   147  }
   148  
   149  // DecodeByte returns the Charmap's rune decoding of the byte b.
   150  func (m *Charmap) DecodeByte(b byte) rune {
   151  	switch x := &m.decode[b]; x.len {
   152  	case 1:
   153  		return rune(x.data[0])
   154  	case 2:
   155  		return rune(x.data[0]&0x1f)<<6 | rune(x.data[1]&0x3f)
   156  	default:
   157  		return rune(x.data[0]&0x0f)<<12 | rune(x.data[1]&0x3f)<<6 | rune(x.data[2]&0x3f)
   158  	}
   159  }
   160  
// charmapEncoder implements transform.Transformer by encoding from UTF-8.
type charmapEncoder struct {
	// NopResetter supplies the Reset half of the Transformer interface.
	transform.NopResetter
	// charmap supplies the encode table, the search lower bound, the
	// asciiSuperset flag and the replacement byte used by Transform.
	charmap *Charmap
}
   166  
   167  func (m charmapEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   168  	r, size := rune(0), 0
   169  loop:
   170  	for nSrc < len(src) {
   171  		if nDst >= len(dst) {
   172  			err = transform.ErrShortDst
   173  			break
   174  		}
   175  		r = rune(src[nSrc])
   176  
   177  		// Decode a 1-byte rune.
   178  		if r < utf8.RuneSelf {
   179  			if m.charmap.asciiSuperset {
   180  				nSrc++
   181  				dst[nDst] = uint8(r)
   182  				nDst++
   183  				continue
   184  			}
   185  			size = 1
   186  
   187  		} else {
   188  			// Decode a multi-byte rune.
   189  			r, size = utf8.DecodeRune(src[nSrc:])
   190  			if size == 1 {
   191  				// All valid runes of size 1 (those below utf8.RuneSelf) were
   192  				// handled above. We have invalid UTF-8 or we haven't seen the
   193  				// full character yet.
   194  				if !atEOF && !utf8.FullRune(src[nSrc:]) {
   195  					err = transform.ErrShortSrc
   196  				} else {
   197  					err = internal.RepertoireError(m.charmap.replacement)
   198  				}
   199  				break
   200  			}
   201  		}
   202  
   203  		// Binary search in [low, high) for that rune in the m.charmap.encode table.
   204  		for low, high := int(m.charmap.low), 0x100; ; {
   205  			if low >= high {
   206  				err = internal.RepertoireError(m.charmap.replacement)
   207  				break loop
   208  			}
   209  			mid := (low + high) / 2
   210  			got := m.charmap.encode[mid]
   211  			gotRune := rune(got & (1<<24 - 1))
   212  			if gotRune < r {
   213  				low = mid + 1
   214  			} else if gotRune > r {
   215  				high = mid
   216  			} else {
   217  				dst[nDst] = byte(got >> 24)
   218  				nDst++
   219  				break
   220  			}
   221  		}
   222  		nSrc += size
   223  	}
   224  	return nDst, nSrc, err
   225  }
   226  
   227  // EncodeRune returns the Charmap's byte encoding of the rune r. ok is whether
   228  // r is in the Charmap's repertoire. If not, b is set to the Charmap's
   229  // replacement byte. This is often the ASCII substitute character '\x1a'.
   230  func (m *Charmap) EncodeRune(r rune) (b byte, ok bool) {
   231  	if r < utf8.RuneSelf && m.asciiSuperset {
   232  		return byte(r), true
   233  	}
   234  	for low, high := int(m.low), 0x100; ; {
   235  		if low >= high {
   236  			return m.replacement, false
   237  		}
   238  		mid := (low + high) / 2
   239  		got := m.encode[mid]
   240  		gotRune := rune(got & (1<<24 - 1))
   241  		if gotRune < r {
   242  			low = mid + 1
   243  		} else if gotRune > r {
   244  			high = mid
   245  		} else {
   246  			return byte(got >> 24), true
   247  		}
   248  	}
   249  }
   250  

View as plain text