...

Source file src/golang.org/x/text/encoding/simplifiedchinese/hzgb2312.go

Documentation: golang.org/x/text/encoding/simplifiedchinese

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package simplifiedchinese
     6  
     7  import (
     8  	"unicode/utf8"
     9  
    10  	"golang.org/x/text/encoding"
    11  	"golang.org/x/text/encoding/internal"
    12  	"golang.org/x/text/encoding/internal/identifier"
    13  	"golang.org/x/text/transform"
    14  )
    15  
    16  // HZGB2312 is the HZ-GB2312 encoding.
    17  var HZGB2312 encoding.Encoding = &hzGB2312
    18  
    19  var hzGB2312 = internal.Encoding{
    20  	internal.FuncEncoding{hzGB2312NewDecoder, hzGB2312NewEncoder},
    21  	"HZ-GB2312",
    22  	identifier.HZGB2312,
    23  }
    24  
    25  func hzGB2312NewDecoder() transform.Transformer {
    26  	return new(hzGB2312Decoder)
    27  }
    28  
    29  func hzGB2312NewEncoder() transform.Transformer {
    30  	return new(hzGB2312Encoder)
    31  }
    32  
    33  const (
    34  	asciiState = iota
    35  	gbState
    36  )
    37  
    38  type hzGB2312Decoder int
    39  
    40  func (d *hzGB2312Decoder) Reset() {
    41  	*d = asciiState
    42  }
    43  
    44  func (d *hzGB2312Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
    45  	r, size := rune(0), 0
    46  loop:
    47  	for ; nSrc < len(src); nSrc += size {
    48  		c0 := src[nSrc]
    49  		if c0 >= utf8.RuneSelf {
    50  			r, size = utf8.RuneError, 1
    51  			goto write
    52  		}
    53  
    54  		if c0 == '~' {
    55  			if nSrc+1 >= len(src) {
    56  				if !atEOF {
    57  					err = transform.ErrShortSrc
    58  					break loop
    59  				}
    60  				r, size = utf8.RuneError, 1
    61  				goto write
    62  			}
    63  			size = 2
    64  			switch src[nSrc+1] {
    65  			case '{':
    66  				*d = gbState
    67  				continue
    68  			case '}':
    69  				*d = asciiState
    70  				continue
    71  			case '~':
    72  				if nDst >= len(dst) {
    73  					err = transform.ErrShortDst
    74  					break loop
    75  				}
    76  				dst[nDst] = '~'
    77  				nDst++
    78  				continue
    79  			case '\n':
    80  				continue
    81  			default:
    82  				r = utf8.RuneError
    83  				goto write
    84  			}
    85  		}
    86  
    87  		if *d == asciiState {
    88  			r, size = rune(c0), 1
    89  		} else {
    90  			if nSrc+1 >= len(src) {
    91  				if !atEOF {
    92  					err = transform.ErrShortSrc
    93  					break loop
    94  				}
    95  				r, size = utf8.RuneError, 1
    96  				goto write
    97  			}
    98  			size = 2
    99  			c1 := src[nSrc+1]
   100  			if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
   101  				// error
   102  			} else if i := int(c0-0x01)*190 + int(c1+0x3f); i < len(decode) {
   103  				r = rune(decode[i])
   104  				if r != 0 {
   105  					goto write
   106  				}
   107  			}
   108  			if c1 > utf8.RuneSelf {
   109  				// Be consistent and always treat non-ASCII as a single error.
   110  				size = 1
   111  			}
   112  			r = utf8.RuneError
   113  		}
   114  
   115  	write:
   116  		if nDst+utf8.RuneLen(r) > len(dst) {
   117  			err = transform.ErrShortDst
   118  			break loop
   119  		}
   120  		nDst += utf8.EncodeRune(dst[nDst:], r)
   121  	}
   122  	return nDst, nSrc, err
   123  }
   124  
   125  type hzGB2312Encoder int
   126  
   127  func (d *hzGB2312Encoder) Reset() {
   128  	*d = asciiState
   129  }
   130  
   131  func (e *hzGB2312Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   132  	r, size := rune(0), 0
   133  	for ; nSrc < len(src); nSrc += size {
   134  		r = rune(src[nSrc])
   135  
   136  		// Decode a 1-byte rune.
   137  		if r < utf8.RuneSelf {
   138  			size = 1
   139  			if r == '~' {
   140  				if nDst+2 > len(dst) {
   141  					err = transform.ErrShortDst
   142  					break
   143  				}
   144  				dst[nDst+0] = '~'
   145  				dst[nDst+1] = '~'
   146  				nDst += 2
   147  				continue
   148  			} else if *e != asciiState {
   149  				if nDst+3 > len(dst) {
   150  					err = transform.ErrShortDst
   151  					break
   152  				}
   153  				*e = asciiState
   154  				dst[nDst+0] = '~'
   155  				dst[nDst+1] = '}'
   156  				nDst += 2
   157  			} else if nDst >= len(dst) {
   158  				err = transform.ErrShortDst
   159  				break
   160  			}
   161  			dst[nDst] = uint8(r)
   162  			nDst += 1
   163  			continue
   164  
   165  		}
   166  
   167  		// Decode a multi-byte rune.
   168  		r, size = utf8.DecodeRune(src[nSrc:])
   169  		if size == 1 {
   170  			// All valid runes of size 1 (those below utf8.RuneSelf) were
   171  			// handled above. We have invalid UTF-8 or we haven't seen the
   172  			// full character yet.
   173  			if !atEOF && !utf8.FullRune(src[nSrc:]) {
   174  				err = transform.ErrShortSrc
   175  				break
   176  			}
   177  		}
   178  
   179  		// func init checks that the switch covers all tables.
   180  		switch {
   181  		case encode0Low <= r && r < encode0High:
   182  			if r = rune(encode0[r-encode0Low]); r != 0 {
   183  				goto writeGB
   184  			}
   185  		case encode1Low <= r && r < encode1High:
   186  			if r = rune(encode1[r-encode1Low]); r != 0 {
   187  				goto writeGB
   188  			}
   189  		case encode2Low <= r && r < encode2High:
   190  			if r = rune(encode2[r-encode2Low]); r != 0 {
   191  				goto writeGB
   192  			}
   193  		case encode3Low <= r && r < encode3High:
   194  			if r = rune(encode3[r-encode3Low]); r != 0 {
   195  				goto writeGB
   196  			}
   197  		case encode4Low <= r && r < encode4High:
   198  			if r = rune(encode4[r-encode4Low]); r != 0 {
   199  				goto writeGB
   200  			}
   201  		}
   202  
   203  	terminateInASCIIState:
   204  		// Switch back to ASCII state in case of error so that an ASCII
   205  		// replacement character can be written in the correct state.
   206  		if *e != asciiState {
   207  			if nDst+2 > len(dst) {
   208  				err = transform.ErrShortDst
   209  				break
   210  			}
   211  			dst[nDst+0] = '~'
   212  			dst[nDst+1] = '}'
   213  			nDst += 2
   214  		}
   215  		err = internal.ErrASCIIReplacement
   216  		break
   217  
   218  	writeGB:
   219  		c0 := uint8(r>>8) - 0x80
   220  		c1 := uint8(r) - 0x80
   221  		if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
   222  			goto terminateInASCIIState
   223  		}
   224  		if *e == asciiState {
   225  			if nDst+4 > len(dst) {
   226  				err = transform.ErrShortDst
   227  				break
   228  			}
   229  			*e = gbState
   230  			dst[nDst+0] = '~'
   231  			dst[nDst+1] = '{'
   232  			nDst += 2
   233  		} else if nDst+2 > len(dst) {
   234  			err = transform.ErrShortDst
   235  			break
   236  		}
   237  		dst[nDst+0] = c0
   238  		dst[nDst+1] = c1
   239  		nDst += 2
   240  		continue
   241  	}
   242  	// TODO: should one always terminate in ASCII state to make it safe to
   243  	// concatenate two HZ-GB2312-encoded strings?
   244  	return nDst, nSrc, err
   245  }
   246  

View as plain text