...

Source file src/golang.org/x/text/encoding/charmap/maketables.go

Documentation: golang.org/x/text/encoding/charmap

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build ignore
     6  
     7  package main
     8  
     9  import (
    10  	"bufio"
    11  	"fmt"
    12  	"log"
    13  	"net/http"
    14  	"sort"
    15  	"strings"
    16  	"unicode/utf8"
    17  
    18  	"golang.org/x/text/encoding"
    19  	"golang.org/x/text/internal/gen"
    20  )
    21  
// ascii is the 128-byte string holding the code points U+0000 through U+007F
// in ascending order. getWHATWG prepends it to the upper half it downloads,
// and main treats any mapping that starts with it as an ASCII superset.
const ascii = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
	"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" +
	` !"#$%&'()*+,-./0123456789:;<=>?` +
	`@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` +
	"`abcdefghijklmnopqrstuvwxyz{|}~\u007f"
    27  
// encodings lists every character map that main emits a Charmap table for.
// Tables are generated in the order the entries appear here.
var encodings = []struct {
	// name is the human-readable encoding name. main writes it into the
	// generated doc comment and into the Charmap's name field.
	name        string
	// mib is the identifier suffix; main emits it as identifier.<mib>.
	mib         string
	// comment, when non-empty, is appended to the generated doc comment as
	// an extra paragraph.
	comment     string
	// varName is a comma-separated list of identifiers. The first one names
	// the generated variable; every one is added to the generated listAll.
	// NOTE(review): the additional names are presumably declared elsewhere in
	// the charmap package — confirm.
	varName     string
	// replacement is emitted unchanged as the Charmap's replacement field.
	replacement byte
	// mapping is either a URL that main recognizes by prefix and downloads
	// (WHATWG index or ICU .ucm file), or the literal mapping string itself.
	mapping     string
}{
	{
		"IBM Code Page 037",
		"IBM037",
		"",
		"CodePage037",
		0x3f,
		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM037-2.1.2.ucm",
	},
	{
		"IBM Code Page 437",
		"PC8CodePage437",
		"",
		"CodePage437",
		encoding.ASCIISub,
		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM437-2.1.2.ucm",
	},
	{
		"IBM Code Page 850",
		"PC850Multilingual",
		"",
		"CodePage850",
		encoding.ASCIISub,
		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM850-2.1.2.ucm",
	},
	{
		"IBM Code Page 852",
		"PCp852",
		"",
		"CodePage852",
		encoding.ASCIISub,
		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM852-2.1.2.ucm",
	},
	{
		"IBM Code Page 855",
		"IBM855",
		"",
		"CodePage855",
		encoding.ASCIISub,
		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM855-2.1.2.ucm",
	},
	{
		"Windows Code Page 858", // PC latin1 with Euro
		"IBM00858",
		"",
		"CodePage858",
		encoding.ASCIISub,
		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-858-2000.ucm",
	},
	{
		"IBM Code Page 860",
		"IBM860",
		"",
		"CodePage860",
		encoding.ASCIISub,
		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM860-2.1.2.ucm",
	},
	{
		"IBM Code Page 862",
		"PC862LatinHebrew",
		"",
		"CodePage862",
		encoding.ASCIISub,
		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM862-2.1.2.ucm",
	},
	{
		"IBM Code Page 863",
		"IBM863",
		"",
		"CodePage863",
		encoding.ASCIISub,
		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM863-2.1.2.ucm",
	},
	{
		"IBM Code Page 865",
		"IBM865",
		"",
		"CodePage865",
		encoding.ASCIISub,
		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM865-2.1.2.ucm",
	},
	{
		"IBM Code Page 866",
		"IBM866",
		"",
		"CodePage866",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-ibm866.txt",
	},
	{
		"IBM Code Page 1047",
		"IBM1047",
		"",
		"CodePage1047",
		0x3f,
		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM1047-2.1.2.ucm",
	},
	{
		"IBM Code Page 1140",
		"IBM01140",
		"",
		"CodePage1140",
		0x3f,
		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/ibm-1140_P100-1997.ucm",
	},
	{
		"ISO 8859-1",
		"ISOLatin1",
		"",
		"ISO8859_1",
		encoding.ASCIISub,
		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_1-1998.ucm",
	},
	{
		"ISO 8859-2",
		"ISOLatin2",
		"",
		"ISO8859_2",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-iso-8859-2.txt",
	},
	{
		"ISO 8859-3",
		"ISOLatin3",
		"",
		"ISO8859_3",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-iso-8859-3.txt",
	},
	{
		"ISO 8859-4",
		"ISOLatin4",
		"",
		"ISO8859_4",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-iso-8859-4.txt",
	},
	{
		"ISO 8859-5",
		"ISOLatinCyrillic",
		"",
		"ISO8859_5",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-iso-8859-5.txt",
	},
	{
		"ISO 8859-6",
		"ISOLatinArabic",
		"",
		"ISO8859_6,ISO8859_6E,ISO8859_6I",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-iso-8859-6.txt",
	},
	{
		"ISO 8859-7",
		"ISOLatinGreek",
		"",
		"ISO8859_7",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-iso-8859-7.txt",
	},
	{
		"ISO 8859-8",
		"ISOLatinHebrew",
		"",
		"ISO8859_8,ISO8859_8E,ISO8859_8I",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-iso-8859-8.txt",
	},
	{
		"ISO 8859-9",
		"ISOLatin5",
		"",
		"ISO8859_9",
		encoding.ASCIISub,
		"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_9-1999.ucm",
	},
	{
		"ISO 8859-10",
		"ISOLatin6",
		"",
		"ISO8859_10",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-iso-8859-10.txt",
	},
	{
		"ISO 8859-13",
		"ISO885913",
		"",
		"ISO8859_13",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-iso-8859-13.txt",
	},
	{
		"ISO 8859-14",
		"ISO885914",
		"",
		"ISO8859_14",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-iso-8859-14.txt",
	},
	{
		"ISO 8859-15",
		"ISO885915",
		"",
		"ISO8859_15",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-iso-8859-15.txt",
	},
	{
		"ISO 8859-16",
		"ISO885916",
		"",
		"ISO8859_16",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-iso-8859-16.txt",
	},
	{
		"KOI8-R",
		"KOI8R",
		"",
		"KOI8R",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-koi8-r.txt",
	},
	{
		"KOI8-U",
		"KOI8U",
		"",
		"KOI8U",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-koi8-u.txt",
	},
	{
		"Macintosh",
		"Macintosh",
		"",
		"Macintosh",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-macintosh.txt",
	},
	{
		"Macintosh Cyrillic",
		"MacintoshCyrillic",
		"",
		"MacintoshCyrillic",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-x-mac-cyrillic.txt",
	},
	{
		"Windows 874",
		"Windows874",
		"",
		"Windows874",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-windows-874.txt",
	},
	{
		"Windows 1250",
		"Windows1250",
		"",
		"Windows1250",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-windows-1250.txt",
	},
	{
		"Windows 1251",
		"Windows1251",
		"",
		"Windows1251",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-windows-1251.txt",
	},
	{
		"Windows 1252",
		"Windows1252",
		"",
		"Windows1252",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-windows-1252.txt",
	},
	{
		"Windows 1253",
		"Windows1253",
		"",
		"Windows1253",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-windows-1253.txt",
	},
	{
		"Windows 1254",
		"Windows1254",
		"",
		"Windows1254",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-windows-1254.txt",
	},
	{
		"Windows 1255",
		"Windows1255",
		"",
		"Windows1255",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-windows-1255.txt",
	},
	{
		"Windows 1256",
		"Windows1256",
		"",
		"Windows1256",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-windows-1256.txt",
	},
	{
		"Windows 1257",
		"Windows1257",
		"",
		"Windows1257",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-windows-1257.txt",
	},
	{
		"Windows 1258",
		"Windows1258",
		"",
		"Windows1258",
		encoding.ASCIISub,
		"http://encoding.spec.whatwg.org/index-windows-1258.txt",
	},
	{
		"X-User-Defined",
		"XUserDefined",
		"It is defined at http://encoding.spec.whatwg.org/#x-user-defined",
		"XUserDefined",
		encoding.ASCIISub,
		// Literal mapping (not a URL): ASCII followed by U+F780 through
		// U+F7FF for bytes 0x80 through 0xff.
		ascii +
			"\uf780\uf781\uf782\uf783\uf784\uf785\uf786\uf787" +
			"\uf788\uf789\uf78a\uf78b\uf78c\uf78d\uf78e\uf78f" +
			"\uf790\uf791\uf792\uf793\uf794\uf795\uf796\uf797" +
			"\uf798\uf799\uf79a\uf79b\uf79c\uf79d\uf79e\uf79f" +
			"\uf7a0\uf7a1\uf7a2\uf7a3\uf7a4\uf7a5\uf7a6\uf7a7" +
			"\uf7a8\uf7a9\uf7aa\uf7ab\uf7ac\uf7ad\uf7ae\uf7af" +
			"\uf7b0\uf7b1\uf7b2\uf7b3\uf7b4\uf7b5\uf7b6\uf7b7" +
			"\uf7b8\uf7b9\uf7ba\uf7bb\uf7bc\uf7bd\uf7be\uf7bf" +
			"\uf7c0\uf7c1\uf7c2\uf7c3\uf7c4\uf7c5\uf7c6\uf7c7" +
			"\uf7c8\uf7c9\uf7ca\uf7cb\uf7cc\uf7cd\uf7ce\uf7cf" +
			"\uf7d0\uf7d1\uf7d2\uf7d3\uf7d4\uf7d5\uf7d6\uf7d7" +
			"\uf7d8\uf7d9\uf7da\uf7db\uf7dc\uf7dd\uf7de\uf7df" +
			"\uf7e0\uf7e1\uf7e2\uf7e3\uf7e4\uf7e5\uf7e6\uf7e7" +
			"\uf7e8\uf7e9\uf7ea\uf7eb\uf7ec\uf7ed\uf7ee\uf7ef" +
			"\uf7f0\uf7f1\uf7f2\uf7f3\uf7f4\uf7f5\uf7f6\uf7f7" +
			"\uf7f8\uf7f9\uf7fa\uf7fb\uf7fc\uf7fd\uf7fe\uf7ff",
	},
}
   389  
   390  func getWHATWG(url string) string {
   391  	res, err := http.Get(url)
   392  	if err != nil {
   393  		log.Fatalf("%q: Get: %v", url, err)
   394  	}
   395  	defer res.Body.Close()
   396  
   397  	mapping := make([]rune, 128)
   398  	for i := range mapping {
   399  		mapping[i] = '\ufffd'
   400  	}
   401  
   402  	scanner := bufio.NewScanner(res.Body)
   403  	for scanner.Scan() {
   404  		s := strings.TrimSpace(scanner.Text())
   405  		if s == "" || s[0] == '#' {
   406  			continue
   407  		}
   408  		x, y := 0, 0
   409  		if _, err := fmt.Sscanf(s, "%d\t0x%x", &x, &y); err != nil {
   410  			log.Fatalf("could not parse %q", s)
   411  		}
   412  		if x < 0 || 128 <= x {
   413  			log.Fatalf("code %d is out of range", x)
   414  		}
   415  		if 0x80 <= y && y < 0xa0 {
   416  			// We diverge from the WHATWG spec by mapping control characters
   417  			// in the range [0x80, 0xa0) to U+FFFD.
   418  			continue
   419  		}
   420  		mapping[x] = rune(y)
   421  	}
   422  	return ascii + string(mapping)
   423  }
   424  
   425  func getUCM(url string) string {
   426  	res, err := http.Get(url)
   427  	if err != nil {
   428  		log.Fatalf("%q: Get: %v", url, err)
   429  	}
   430  	defer res.Body.Close()
   431  
   432  	mapping := make([]rune, 256)
   433  	for i := range mapping {
   434  		mapping[i] = '\ufffd'
   435  	}
   436  
   437  	charsFound := 0
   438  	scanner := bufio.NewScanner(res.Body)
   439  	for scanner.Scan() {
   440  		s := strings.TrimSpace(scanner.Text())
   441  		if s == "" || s[0] == '#' {
   442  			continue
   443  		}
   444  		var c byte
   445  		var r rune
   446  		if _, err := fmt.Sscanf(s, `<U%x> \x%x |0`, &r, &c); err != nil {
   447  			continue
   448  		}
   449  		mapping[c] = r
   450  		charsFound++
   451  	}
   452  
   453  	if charsFound < 200 {
   454  		log.Fatalf("%q: only %d characters found (wrong page format?)", url, charsFound)
   455  	}
   456  
   457  	return string(mapping)
   458  }
   459  
   460  func main() {
   461  	mibs := map[string]bool{}
   462  	all := []string{}
   463  
   464  	w := gen.NewCodeWriter()
   465  	defer w.WriteGoFile("tables.go", "charmap")
   466  
   467  	printf := func(s string, a ...interface{}) { fmt.Fprintf(w, s, a...) }
   468  
   469  	printf("import (\n")
   470  	printf("\t\"golang.org/x/text/encoding\"\n")
   471  	printf("\t\"golang.org/x/text/encoding/internal/identifier\"\n")
   472  	printf(")\n\n")
   473  	for _, e := range encodings {
   474  		varNames := strings.Split(e.varName, ",")
   475  		all = append(all, varNames...)
   476  		varName := varNames[0]
   477  		switch {
   478  		case strings.HasPrefix(e.mapping, "http://encoding.spec.whatwg.org/"):
   479  			e.mapping = getWHATWG(e.mapping)
   480  		case strings.HasPrefix(e.mapping, "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/"):
   481  			e.mapping = getUCM(e.mapping)
   482  		}
   483  
   484  		asciiSuperset, low := strings.HasPrefix(e.mapping, ascii), 0x00
   485  		if asciiSuperset {
   486  			low = 0x80
   487  		}
   488  		lvn := 1
   489  		if strings.HasPrefix(varName, "ISO") || strings.HasPrefix(varName, "KOI") {
   490  			lvn = 3
   491  		}
   492  		lowerVarName := strings.ToLower(varName[:lvn]) + varName[lvn:]
   493  		printf("// %s is the %s encoding.\n", varName, e.name)
   494  		if e.comment != "" {
   495  			printf("//\n// %s\n", e.comment)
   496  		}
   497  		printf("var %s *Charmap = &%s\n\nvar %s = Charmap{\nname: %q,\n",
   498  			varName, lowerVarName, lowerVarName, e.name)
   499  		if mibs[e.mib] {
   500  			log.Fatalf("MIB type %q declared multiple times.", e.mib)
   501  		}
   502  		printf("mib: identifier.%s,\n", e.mib)
   503  		printf("asciiSuperset: %t,\n", asciiSuperset)
   504  		printf("low: 0x%02x,\n", low)
   505  		printf("replacement: 0x%02x,\n", e.replacement)
   506  
   507  		printf("decode: [256]utf8Enc{\n")
   508  		i, backMapping := 0, map[rune]byte{}
   509  		for _, c := range e.mapping {
   510  			if _, ok := backMapping[c]; !ok && c != utf8.RuneError {
   511  				backMapping[c] = byte(i)
   512  			}
   513  			var buf [8]byte
   514  			n := utf8.EncodeRune(buf[:], c)
   515  			if n > 3 {
   516  				panic(fmt.Sprintf("rune %q (%U) is too long", c, c))
   517  			}
   518  			printf("{%d,[3]byte{0x%02x,0x%02x,0x%02x}},", n, buf[0], buf[1], buf[2])
   519  			if i%2 == 1 {
   520  				printf("\n")
   521  			}
   522  			i++
   523  		}
   524  		printf("},\n")
   525  
   526  		printf("encode: [256]uint32{\n")
   527  		encode := make([]uint32, 0, 256)
   528  		for c, i := range backMapping {
   529  			encode = append(encode, uint32(i)<<24|uint32(c))
   530  		}
   531  		sort.Sort(byRune(encode))
   532  		for len(encode) < cap(encode) {
   533  			encode = append(encode, encode[len(encode)-1])
   534  		}
   535  		for i, enc := range encode {
   536  			printf("0x%08x,", enc)
   537  			if i%8 == 7 {
   538  				printf("\n")
   539  			}
   540  		}
   541  		printf("},\n}\n")
   542  
   543  		// Add an estimate of the size of a single Charmap{} struct value, which
   544  		// includes two 256 elem arrays of 4 bytes and some extra fields, which
   545  		// align to 3 uint64s on 64-bit architectures.
   546  		w.Size += 2*4*256 + 3*8
   547  	}
   548  	// TODO: add proper line breaking.
   549  	printf("var listAll = []encoding.Encoding{\n%s,\n}\n\n", strings.Join(all, ",\n"))
   550  }
   551  
   552  type byRune []uint32
   553  
   554  func (b byRune) Len() int           { return len(b) }
   555  func (b byRune) Less(i, j int) bool { return b[i]&0xffffff < b[j]&0xffffff }
   556  func (b byRune) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }
   557  

View as plain text