// Copyright 2015 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. //go:build ignore package main import ( "bytes" "encoding/json" "fmt" "log" "strings" "golang.org/x/text/internal/gen" ) type group struct { Encodings []struct { Labels []string Name string } } func main() { gen.Init() r := gen.Open("https://encoding.spec.whatwg.org", "whatwg", "encodings.json") var groups []group if err := json.NewDecoder(r).Decode(&groups); err != nil { log.Fatalf("Error reading encodings.json: %v", err) } w := &bytes.Buffer{} fmt.Fprintln(w, "type htmlEncoding byte") fmt.Fprintln(w, "const (") for i, g := range groups { for _, e := range g.Encodings { key := strings.ToLower(e.Name) name := consts[key] if name == "" { log.Fatalf("No const defined for %s.", key) } if i == 0 { fmt.Fprintf(w, "%s htmlEncoding = iota\n", name) } else { fmt.Fprintf(w, "%s\n", name) } } } fmt.Fprintln(w, "numEncodings") fmt.Fprint(w, ")\n\n") fmt.Fprintln(w, "var canonical = [numEncodings]string{") for _, g := range groups { for _, e := range g.Encodings { fmt.Fprintf(w, "%q,\n", strings.ToLower(e.Name)) } } fmt.Fprint(w, "}\n\n") fmt.Fprintln(w, "var nameMap = map[string]htmlEncoding{") for _, g := range groups { for _, e := range g.Encodings { for _, l := range e.Labels { key := strings.ToLower(e.Name) name := consts[key] fmt.Fprintf(w, "%q: %s,\n", l, name) } } } fmt.Fprint(w, "}\n\n") var tags []string fmt.Fprintln(w, "var localeMap = []htmlEncoding{") for _, loc := range locales { tags = append(tags, loc.tag) fmt.Fprintf(w, "%s, // %s \n", consts[loc.name], loc.tag) } fmt.Fprint(w, "}\n\n") fmt.Fprintf(w, "const locales = %q\n", strings.Join(tags, " ")) gen.WriteGoFile("tables.go", "htmlindex", w.Bytes()) } // consts maps canonical encoding name to internal constant. 
var consts = map[string]string{
	"utf-8":  "utf8",
	"ibm866": "ibm866",

	// ISO 8859 parts.
	"iso-8859-2":   "iso8859_2",
	"iso-8859-3":   "iso8859_3",
	"iso-8859-4":   "iso8859_4",
	"iso-8859-5":   "iso8859_5",
	"iso-8859-6":   "iso8859_6",
	"iso-8859-7":   "iso8859_7",
	"iso-8859-8":   "iso8859_8",
	"iso-8859-8-i": "iso8859_8I",
	"iso-8859-10":  "iso8859_10",
	"iso-8859-13":  "iso8859_13",
	"iso-8859-14":  "iso8859_14",
	"iso-8859-15":  "iso8859_15",
	"iso-8859-16":  "iso8859_16",

	// KOI8 and Macintosh.
	"koi8-r":         "koi8r",
	"koi8-u":         "koi8u",
	"macintosh":      "macintosh",
	"x-mac-cyrillic": "macintoshCyrillic",

	// Windows code pages.
	"windows-874":  "windows874",
	"windows-1250": "windows1250",
	"windows-1251": "windows1251",
	"windows-1252": "windows1252",
	"windows-1253": "windows1253",
	"windows-1254": "windows1254",
	"windows-1255": "windows1255",
	"windows-1256": "windows1256",
	"windows-1257": "windows1257",
	"windows-1258": "windows1258",

	// Chinese, Japanese and Korean encodings.
	// "hz-gb-2312": "hzgb2312", // Was removed from WhatWG
	"gbk":         "gbk",
	"gb18030":     "gb18030",
	"big5":        "big5",
	"euc-jp":      "eucjp",
	"iso-2022-jp": "iso2022jp",
	"shift_jis":   "shiftJIS",
	"euc-kr":      "euckr",

	// Everything else.
	"replacement":    "replacement",
	"utf-16be":       "utf16be",
	"utf-16le":       "utf16le",
	"x-user-defined": "xUserDefined",
}

// locales is taken from
// https://html.spec.whatwg.org/multipage/syntax.html#encoding-sniffing-algorithm.
//
// Each entry pairs a language tag with the canonical name of an encoding; the
// name must be a key of consts. The order of entries is significant because
// the generator emits them positionally.
var locales = []struct{ tag, name string }{
	// The default value. Explicitly state latin to benefit from the exact
	// script option, while still making 1252 the default encoding for languages
	// written in Latin script.
	{tag: "und_Latn", name: "windows-1252"},

	{tag: "ar", name: "windows-1256"},
	{tag: "ba", name: "windows-1251"},
	{tag: "be", name: "windows-1251"},
	{tag: "bg", name: "windows-1251"},
	{tag: "cs", name: "windows-1250"},
	{tag: "el", name: "iso-8859-7"},
	{tag: "et", name: "windows-1257"},
	{tag: "fa", name: "windows-1256"},
	{tag: "he", name: "windows-1255"},
	{tag: "hr", name: "windows-1250"},
	{tag: "hu", name: "iso-8859-2"},
	{tag: "ja", name: "shift_jis"},
	{tag: "kk", name: "windows-1251"},
	{tag: "ko", name: "euc-kr"},
	{tag: "ku", name: "windows-1254"},
	{tag: "ky", name: "windows-1251"},
	{tag: "lt", name: "windows-1257"},
	{tag: "lv", name: "windows-1257"},
	{tag: "mk", name: "windows-1251"},
	{tag: "pl", name: "iso-8859-2"},
	{tag: "ru", name: "windows-1251"},
	{tag: "sah", name: "windows-1251"},
	{tag: "sk", name: "windows-1250"},
	{tag: "sl", name: "iso-8859-2"},
	{tag: "sr", name: "windows-1251"},
	{tag: "tg", name: "windows-1251"},
	{tag: "th", name: "windows-874"},
	{tag: "tr", name: "windows-1254"},
	{tag: "tt", name: "windows-1251"},
	{tag: "uk", name: "windows-1251"},
	{tag: "vi", name: "windows-1258"},
	{tag: "zh-hans", name: "gb18030"},
	{tag: "zh-hant", name: "big5"},
}