...

Source file src/golang.org/x/text/encoding/japanese/iso2022jp.go

Documentation: golang.org/x/text/encoding/japanese

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package japanese
     6  
     7  import (
     8  	"unicode/utf8"
     9  
    10  	"golang.org/x/text/encoding"
    11  	"golang.org/x/text/encoding/internal"
    12  	"golang.org/x/text/encoding/internal/identifier"
    13  	"golang.org/x/text/transform"
    14  )
    15  
// ISO2022JP is the ISO-2022-JP encoding.
//
// It is backed by the package-level iso2022JP value.
var ISO2022JP encoding.Encoding = &iso2022JP
    18  
    19  var iso2022JP = internal.Encoding{
    20  	internal.FuncEncoding{iso2022JPNewDecoder, iso2022JPNewEncoder},
    21  	"ISO-2022-JP",
    22  	identifier.ISO2022JP,
    23  }
    24  
    25  func iso2022JPNewDecoder() transform.Transformer {
    26  	return new(iso2022JPDecoder)
    27  }
    28  
    29  func iso2022JPNewEncoder() transform.Transformer {
    30  	return new(iso2022JPEncoder)
    31  }
    32  
// Character-set states used by the decoder and the encoder in this file.
// asciiState is zero, so it is the state of a freshly allocated decoder or
// encoder and the state that Reset restores.
const (
	// asciiState: single bytes are passed through unchanged. The decoder
	// enters it on ESC ( B or ESC ( J; the encoder announces it with ESC ( B.
	asciiState = iota
	// katakanaState: single bytes 0x21..0x5F map to U+FF61..U+FF9F. Selected
	// by ESC ( I.
	katakanaState
	// jis0208State: byte pairs index the jis0208 tables. The decoder enters
	// it on ESC $ @ or ESC $ B; the encoder announces it with ESC $ B.
	jis0208State
	// jis0212State: byte pairs index the jis0212 decode table. The decoder
	// enters it on ESC $ ( D; the encoder never selects it.
	jis0212State
)
    39  
// asciiEsc is the ESC byte (0x1b) that starts every escape sequence
// recognized by the decoder and written by the encoder.
const asciiEsc = 0x1b
    41  
// iso2022JPDecoder decodes ISO-2022-JP to UTF-8. Its value is the currently
// selected character-set state (asciiState, katakanaState, jis0208State or
// jis0212State).
type iso2022JPDecoder int
    43  
// Reset returns the decoder to asciiState, the state it starts in.
func (d *iso2022JPDecoder) Reset() {
	*d = asciiState
}
    47  
    48  func (d *iso2022JPDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
    49  	r, size := rune(0), 0
    50  	for ; nSrc < len(src); nSrc += size {
    51  		c0 := src[nSrc]
    52  		if c0 >= utf8.RuneSelf {
    53  			r, size = '\ufffd', 1
    54  			goto write
    55  		}
    56  
    57  		if c0 == asciiEsc {
    58  			if nSrc+2 >= len(src) {
    59  				if !atEOF {
    60  					return nDst, nSrc, transform.ErrShortSrc
    61  				}
    62  				// TODO: is it correct to only skip 1??
    63  				r, size = '\ufffd', 1
    64  				goto write
    65  			}
    66  			size = 3
    67  			c1 := src[nSrc+1]
    68  			c2 := src[nSrc+2]
    69  			switch {
    70  			case c1 == '$' && (c2 == '@' || c2 == 'B'): // 0x24 {0x40, 0x42}
    71  				*d = jis0208State
    72  				continue
    73  			case c1 == '$' && c2 == '(': // 0x24 0x28
    74  				if nSrc+3 >= len(src) {
    75  					if !atEOF {
    76  						return nDst, nSrc, transform.ErrShortSrc
    77  					}
    78  					r, size = '\ufffd', 1
    79  					goto write
    80  				}
    81  				size = 4
    82  				if src[nSrc+3] == 'D' {
    83  					*d = jis0212State
    84  					continue
    85  				}
    86  			case c1 == '(' && (c2 == 'B' || c2 == 'J'): // 0x28 {0x42, 0x4A}
    87  				*d = asciiState
    88  				continue
    89  			case c1 == '(' && c2 == 'I': // 0x28 0x49
    90  				*d = katakanaState
    91  				continue
    92  			}
    93  			r, size = '\ufffd', 1
    94  			goto write
    95  		}
    96  
    97  		switch *d {
    98  		case asciiState:
    99  			r, size = rune(c0), 1
   100  
   101  		case katakanaState:
   102  			if c0 < 0x21 || 0x60 <= c0 {
   103  				r, size = '\ufffd', 1
   104  				goto write
   105  			}
   106  			r, size = rune(c0)+(0xff61-0x21), 1
   107  
   108  		default:
   109  			if c0 == 0x0a {
   110  				*d = asciiState
   111  				r, size = rune(c0), 1
   112  				goto write
   113  			}
   114  			if nSrc+1 >= len(src) {
   115  				if !atEOF {
   116  					return nDst, nSrc, transform.ErrShortSrc
   117  				}
   118  				r, size = '\ufffd', 1
   119  				goto write
   120  			}
   121  			size = 2
   122  			c1 := src[nSrc+1]
   123  			i := int(c0-0x21)*94 + int(c1-0x21)
   124  			if *d == jis0208State && i < len(jis0208Decode) {
   125  				r = rune(jis0208Decode[i])
   126  			} else if *d == jis0212State && i < len(jis0212Decode) {
   127  				r = rune(jis0212Decode[i])
   128  			} else {
   129  				r = '\ufffd'
   130  				goto write
   131  			}
   132  			if r == 0 {
   133  				r = '\ufffd'
   134  			}
   135  		}
   136  
   137  	write:
   138  		if nDst+utf8.RuneLen(r) > len(dst) {
   139  			return nDst, nSrc, transform.ErrShortDst
   140  		}
   141  		nDst += utf8.EncodeRune(dst[nDst:], r)
   142  	}
   143  	return nDst, nSrc, err
   144  }
   145  
// iso2022JPEncoder encodes UTF-8 as ISO-2022-JP. Its value is the character
// set most recently announced in the output (asciiState, katakanaState or
// jis0208State).
type iso2022JPEncoder int
   147  
// Reset returns the encoder to asciiState, the state it starts in. It does
// not write anything.
func (e *iso2022JPEncoder) Reset() {
	*e = asciiState
}
   151  
   152  func (e *iso2022JPEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   153  	r, size := rune(0), 0
   154  	for ; nSrc < len(src); nSrc += size {
   155  		r = rune(src[nSrc])
   156  
   157  		// Decode a 1-byte rune.
   158  		if r < utf8.RuneSelf {
   159  			size = 1
   160  
   161  		} else {
   162  			// Decode a multi-byte rune.
   163  			r, size = utf8.DecodeRune(src[nSrc:])
   164  			if size == 1 {
   165  				// All valid runes of size 1 (those below utf8.RuneSelf) were
   166  				// handled above. We have invalid UTF-8 or we haven't seen the
   167  				// full character yet.
   168  				if !atEOF && !utf8.FullRune(src[nSrc:]) {
   169  					err = transform.ErrShortSrc
   170  					break
   171  				}
   172  			}
   173  
   174  			// func init checks that the switch covers all tables.
   175  			//
   176  			// http://encoding.spec.whatwg.org/#iso-2022-jp says that "the index jis0212
   177  			// is not used by the iso-2022-jp encoder due to lack of widespread support".
   178  			//
   179  			// TODO: do we have to special-case U+00A5 and U+203E, as per
   180  			// http://encoding.spec.whatwg.org/#iso-2022-jp
   181  			// Doing so would mean that "\u00a5" would not be preserved
   182  			// after an encode-decode round trip.
   183  			switch {
   184  			case encode0Low <= r && r < encode0High:
   185  				if r = rune(encode0[r-encode0Low]); r>>tableShift == jis0208 {
   186  					goto writeJIS
   187  				}
   188  			case encode1Low <= r && r < encode1High:
   189  				if r = rune(encode1[r-encode1Low]); r>>tableShift == jis0208 {
   190  					goto writeJIS
   191  				}
   192  			case encode2Low <= r && r < encode2High:
   193  				if r = rune(encode2[r-encode2Low]); r>>tableShift == jis0208 {
   194  					goto writeJIS
   195  				}
   196  			case encode3Low <= r && r < encode3High:
   197  				if r = rune(encode3[r-encode3Low]); r>>tableShift == jis0208 {
   198  					goto writeJIS
   199  				}
   200  			case encode4Low <= r && r < encode4High:
   201  				if r = rune(encode4[r-encode4Low]); r>>tableShift == jis0208 {
   202  					goto writeJIS
   203  				}
   204  			case encode5Low <= r && r < encode5High:
   205  				if 0xff61 <= r && r < 0xffa0 {
   206  					goto writeKatakana
   207  				}
   208  				if r = rune(encode5[r-encode5Low]); r>>tableShift == jis0208 {
   209  					goto writeJIS
   210  				}
   211  			}
   212  
   213  			// Switch back to ASCII state in case of error so that an ASCII
   214  			// replacement character can be written in the correct state.
   215  			if *e != asciiState {
   216  				if nDst+3 > len(dst) {
   217  					err = transform.ErrShortDst
   218  					break
   219  				}
   220  				*e = asciiState
   221  				dst[nDst+0] = asciiEsc
   222  				dst[nDst+1] = '('
   223  				dst[nDst+2] = 'B'
   224  				nDst += 3
   225  			}
   226  			err = internal.ErrASCIIReplacement
   227  			break
   228  		}
   229  
   230  		if *e != asciiState {
   231  			if nDst+4 > len(dst) {
   232  				err = transform.ErrShortDst
   233  				break
   234  			}
   235  			*e = asciiState
   236  			dst[nDst+0] = asciiEsc
   237  			dst[nDst+1] = '('
   238  			dst[nDst+2] = 'B'
   239  			nDst += 3
   240  		} else if nDst >= len(dst) {
   241  			err = transform.ErrShortDst
   242  			break
   243  		}
   244  		dst[nDst] = uint8(r)
   245  		nDst++
   246  		continue
   247  
   248  	writeJIS:
   249  		if *e != jis0208State {
   250  			if nDst+5 > len(dst) {
   251  				err = transform.ErrShortDst
   252  				break
   253  			}
   254  			*e = jis0208State
   255  			dst[nDst+0] = asciiEsc
   256  			dst[nDst+1] = '$'
   257  			dst[nDst+2] = 'B'
   258  			nDst += 3
   259  		} else if nDst+2 > len(dst) {
   260  			err = transform.ErrShortDst
   261  			break
   262  		}
   263  		dst[nDst+0] = 0x21 + uint8(r>>codeShift)&codeMask
   264  		dst[nDst+1] = 0x21 + uint8(r)&codeMask
   265  		nDst += 2
   266  		continue
   267  
   268  	writeKatakana:
   269  		if *e != katakanaState {
   270  			if nDst+4 > len(dst) {
   271  				err = transform.ErrShortDst
   272  				break
   273  			}
   274  			*e = katakanaState
   275  			dst[nDst+0] = asciiEsc
   276  			dst[nDst+1] = '('
   277  			dst[nDst+2] = 'I'
   278  			nDst += 3
   279  		} else if nDst >= len(dst) {
   280  			err = transform.ErrShortDst
   281  			break
   282  		}
   283  		dst[nDst] = uint8(r - (0xff61 - 0x21))
   284  		nDst++
   285  		continue
   286  	}
   287  	if atEOF && err == nil && *e != asciiState {
   288  		if nDst+3 > len(dst) {
   289  			err = transform.ErrShortDst
   290  		} else {
   291  			*e = asciiState
   292  			dst[nDst+0] = asciiEsc
   293  			dst[nDst+1] = '('
   294  			dst[nDst+2] = 'B'
   295  			nDst += 3
   296  		}
   297  	}
   298  	return nDst, nSrc, err
   299  }
   300  

View as plain text