...

Source file src/golang.org/x/net/html/charset/charset.go

Documentation: golang.org/x/net/html/charset

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package charset provides common text encodings for HTML documents.
     6  //
     7  // The mapping from encoding labels to encodings is defined at
     8  // https://encoding.spec.whatwg.org/.
     9  package charset // import "golang.org/x/net/html/charset"
    10  
    11  import (
    12  	"bytes"
    13  	"fmt"
    14  	"io"
    15  	"mime"
    16  	"strings"
    17  	"unicode/utf8"
    18  
    19  	"golang.org/x/net/html"
    20  	"golang.org/x/text/encoding"
    21  	"golang.org/x/text/encoding/charmap"
    22  	"golang.org/x/text/encoding/htmlindex"
    23  	"golang.org/x/text/transform"
    24  )
    25  
    26  // Lookup returns the encoding with the specified label, and its canonical
    27  // name. It returns nil and the empty string if label is not one of the
    28  // standard encodings for HTML. Matching is case-insensitive and ignores
    29  // leading and trailing whitespace. Encoders will use HTML escape sequences for
    30  // runes that are not supported by the character set.
    31  func Lookup(label string) (e encoding.Encoding, name string) {
    32  	e, err := htmlindex.Get(label)
    33  	if err != nil {
    34  		return nil, ""
    35  	}
    36  	name, _ = htmlindex.Name(e)
    37  	return &htmlEncoding{e}, name
    38  }
    39  
    40  type htmlEncoding struct{ encoding.Encoding }
    41  
    42  func (h *htmlEncoding) NewEncoder() *encoding.Encoder {
    43  	// HTML requires a non-terminating legacy encoder. We use HTML escapes to
    44  	// substitute unsupported code points.
    45  	return encoding.HTMLEscapeUnsupported(h.Encoding.NewEncoder())
    46  }
    47  
    48  // DetermineEncoding determines the encoding of an HTML document by examining
    49  // up to the first 1024 bytes of content and the declared Content-Type.
    50  //
    51  // See http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#determining-the-character-encoding
    52  func DetermineEncoding(content []byte, contentType string) (e encoding.Encoding, name string, certain bool) {
    53  	if len(content) > 1024 {
    54  		content = content[:1024]
    55  	}
    56  
    57  	for _, b := range boms {
    58  		if bytes.HasPrefix(content, b.bom) {
    59  			e, name = Lookup(b.enc)
    60  			return e, name, true
    61  		}
    62  	}
    63  
    64  	if _, params, err := mime.ParseMediaType(contentType); err == nil {
    65  		if cs, ok := params["charset"]; ok {
    66  			if e, name = Lookup(cs); e != nil {
    67  				return e, name, true
    68  			}
    69  		}
    70  	}
    71  
    72  	if len(content) > 0 {
    73  		e, name = prescan(content)
    74  		if e != nil {
    75  			return e, name, false
    76  		}
    77  	}
    78  
    79  	// Try to detect UTF-8.
    80  	// First eliminate any partial rune at the end.
    81  	for i := len(content) - 1; i >= 0 && i > len(content)-4; i-- {
    82  		b := content[i]
    83  		if b < 0x80 {
    84  			break
    85  		}
    86  		if utf8.RuneStart(b) {
    87  			content = content[:i]
    88  			break
    89  		}
    90  	}
    91  	hasHighBit := false
    92  	for _, c := range content {
    93  		if c >= 0x80 {
    94  			hasHighBit = true
    95  			break
    96  		}
    97  	}
    98  	if hasHighBit && utf8.Valid(content) {
    99  		return encoding.Nop, "utf-8", false
   100  	}
   101  
   102  	// TODO: change default depending on user's locale?
   103  	return charmap.Windows1252, "windows-1252", false
   104  }
   105  
   106  // NewReader returns an io.Reader that converts the content of r to UTF-8.
   107  // It calls DetermineEncoding to find out what r's encoding is.
   108  func NewReader(r io.Reader, contentType string) (io.Reader, error) {
   109  	preview := make([]byte, 1024)
   110  	n, err := io.ReadFull(r, preview)
   111  	switch {
   112  	case err == io.ErrUnexpectedEOF:
   113  		preview = preview[:n]
   114  		r = bytes.NewReader(preview)
   115  	case err != nil:
   116  		return nil, err
   117  	default:
   118  		r = io.MultiReader(bytes.NewReader(preview), r)
   119  	}
   120  
   121  	if e, _, _ := DetermineEncoding(preview, contentType); e != encoding.Nop {
   122  		r = transform.NewReader(r, e.NewDecoder())
   123  	}
   124  	return r, nil
   125  }
   126  
   127  // NewReaderLabel returns a reader that converts from the specified charset to
   128  // UTF-8. It uses Lookup to find the encoding that corresponds to label, and
   129  // returns an error if Lookup returns nil. It is suitable for use as
   130  // encoding/xml.Decoder's CharsetReader function.
   131  func NewReaderLabel(label string, input io.Reader) (io.Reader, error) {
   132  	e, _ := Lookup(label)
   133  	if e == nil {
   134  		return nil, fmt.Errorf("unsupported charset: %q", label)
   135  	}
   136  	return transform.NewReader(input, e.NewDecoder()), nil
   137  }
   138  
   139  func prescan(content []byte) (e encoding.Encoding, name string) {
   140  	z := html.NewTokenizer(bytes.NewReader(content))
   141  	for {
   142  		switch z.Next() {
   143  		case html.ErrorToken:
   144  			return nil, ""
   145  
   146  		case html.StartTagToken, html.SelfClosingTagToken:
   147  			tagName, hasAttr := z.TagName()
   148  			if !bytes.Equal(tagName, []byte("meta")) {
   149  				continue
   150  			}
   151  			attrList := make(map[string]bool)
   152  			gotPragma := false
   153  
   154  			const (
   155  				dontKnow = iota
   156  				doNeedPragma
   157  				doNotNeedPragma
   158  			)
   159  			needPragma := dontKnow
   160  
   161  			name = ""
   162  			e = nil
   163  			for hasAttr {
   164  				var key, val []byte
   165  				key, val, hasAttr = z.TagAttr()
   166  				ks := string(key)
   167  				if attrList[ks] {
   168  					continue
   169  				}
   170  				attrList[ks] = true
   171  				for i, c := range val {
   172  					if 'A' <= c && c <= 'Z' {
   173  						val[i] = c + 0x20
   174  					}
   175  				}
   176  
   177  				switch ks {
   178  				case "http-equiv":
   179  					if bytes.Equal(val, []byte("content-type")) {
   180  						gotPragma = true
   181  					}
   182  
   183  				case "content":
   184  					if e == nil {
   185  						name = fromMetaElement(string(val))
   186  						if name != "" {
   187  							e, name = Lookup(name)
   188  							if e != nil {
   189  								needPragma = doNeedPragma
   190  							}
   191  						}
   192  					}
   193  
   194  				case "charset":
   195  					e, name = Lookup(string(val))
   196  					needPragma = doNotNeedPragma
   197  				}
   198  			}
   199  
   200  			if needPragma == dontKnow || needPragma == doNeedPragma && !gotPragma {
   201  				continue
   202  			}
   203  
   204  			if strings.HasPrefix(name, "utf-16") {
   205  				name = "utf-8"
   206  				e = encoding.Nop
   207  			}
   208  
   209  			if e != nil {
   210  				return e, name
   211  			}
   212  		}
   213  	}
   214  }
   215  
   216  func fromMetaElement(s string) string {
   217  	for s != "" {
   218  		csLoc := strings.Index(s, "charset")
   219  		if csLoc == -1 {
   220  			return ""
   221  		}
   222  		s = s[csLoc+len("charset"):]
   223  		s = strings.TrimLeft(s, " \t\n\f\r")
   224  		if !strings.HasPrefix(s, "=") {
   225  			continue
   226  		}
   227  		s = s[1:]
   228  		s = strings.TrimLeft(s, " \t\n\f\r")
   229  		if s == "" {
   230  			return ""
   231  		}
   232  		if q := s[0]; q == '"' || q == '\'' {
   233  			s = s[1:]
   234  			closeQuote := strings.IndexRune(s, rune(q))
   235  			if closeQuote == -1 {
   236  				return ""
   237  			}
   238  			return s[:closeQuote]
   239  		}
   240  
   241  		end := strings.IndexAny(s, "; \t\n\f\r")
   242  		if end == -1 {
   243  			end = len(s)
   244  		}
   245  		return s[:end]
   246  	}
   247  	return ""
   248  }
   249  
   250  var boms = []struct {
   251  	bom []byte
   252  	enc string
   253  }{
   254  	{[]byte{0xfe, 0xff}, "utf-16be"},
   255  	{[]byte{0xff, 0xfe}, "utf-16le"},
   256  	{[]byte{0xef, 0xbb, 0xbf}, "utf-8"},
   257  }
   258  

View as plain text