...

Source file src/golang.org/x/text/internal/language/parse_test.go

Documentation: golang.org/x/text/internal/language

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package language
     6  
     7  import (
     8  	"bytes"
     9  	"strings"
    10  	"testing"
    11  
    12  	"golang.org/x/text/internal/tag"
    13  )
    14  
    15  type scanTest struct {
    16  	ok  bool // true if scanning does not result in an error
    17  	in  string
    18  	tok []string // the expected tokens
    19  }
    20  
    21  var tests = []scanTest{
    22  	{true, "", []string{}},
    23  	{true, "1", []string{"1"}},
    24  	{true, "en", []string{"en"}},
    25  	{true, "root", []string{"root"}},
    26  	{true, "maxchars", []string{"maxchars"}},
    27  	{false, "bad/", []string{}},
    28  	{false, "morethan8", []string{}},
    29  	{false, "-", []string{}},
    30  	{false, "----", []string{}},
    31  	{false, "_", []string{}},
    32  	{true, "en-US", []string{"en", "US"}},
    33  	{true, "en_US", []string{"en", "US"}},
    34  	{false, "en-US-", []string{"en", "US"}},
    35  	{false, "en-US--", []string{"en", "US"}},
    36  	{false, "en-US---", []string{"en", "US"}},
    37  	{false, "en--US", []string{"en", "US"}},
    38  	{false, "-en-US", []string{"en", "US"}},
    39  	{false, "-en--US-", []string{"en", "US"}},
    40  	{false, "-en--US-", []string{"en", "US"}},
    41  	{false, "en-.-US", []string{"en", "US"}},
    42  	{false, ".-en--US-.", []string{"en", "US"}},
    43  	{false, "en-u.-US", []string{"en", "US"}},
    44  	{true, "en-u1-US", []string{"en", "u1", "US"}},
    45  	{true, "maxchar1_maxchar2-maxchar3", []string{"maxchar1", "maxchar2", "maxchar3"}},
    46  	{false, "moreThan8-moreThan8-e", []string{"e"}},
    47  }
    48  
    49  func TestScan(t *testing.T) {
    50  	for i, tt := range tests {
    51  		scan := makeScannerString(tt.in)
    52  		for j := 0; !scan.done; j++ {
    53  			if j >= len(tt.tok) {
    54  				t.Errorf("%d: extra token %q", i, scan.token)
    55  			} else if tag.Compare(tt.tok[j], scan.token) != 0 {
    56  				t.Errorf("%d: token %d: found %q; want %q", i, j, scan.token, tt.tok[j])
    57  				break
    58  			}
    59  			scan.scan()
    60  		}
    61  		if s := strings.Join(tt.tok, "-"); tag.Compare(s, bytes.Replace(scan.b, b("_"), b("-"), -1)) != 0 {
    62  			t.Errorf("%d: input: found %q; want %q", i, scan.b, s)
    63  		}
    64  		if (scan.err == nil) != tt.ok {
    65  			t.Errorf("%d: ok: found %v; want %v", i, scan.err == nil, tt.ok)
    66  		}
    67  	}
    68  }
    69  
    70  func TestAcceptMinSize(t *testing.T) {
    71  	for i, tt := range tests {
    72  		// count number of successive tokens with a minimum size.
    73  		for sz := 1; sz <= 8; sz++ {
    74  			scan := makeScannerString(tt.in)
    75  			scan.end, scan.next = 0, 0
    76  			end := scan.acceptMinSize(sz)
    77  			n := 0
    78  			for i := 0; i < len(tt.tok) && len(tt.tok[i]) >= sz; i++ {
    79  				n += len(tt.tok[i])
    80  				if i > 0 {
    81  					n++
    82  				}
    83  			}
    84  			if end != n {
    85  				t.Errorf("%d:%d: found len %d; want %d", i, sz, end, n)
    86  			}
    87  		}
    88  	}
    89  }
    90  
    91  type parseTest struct {
    92  	i                    int // the index of this test
    93  	in                   string
    94  	lang, script, region string
    95  	variants, ext        string
    96  	extList              []string // only used when more than one extension is present
    97  	invalid              bool
    98  	rewrite              bool // special rewrite not handled by parseTag
    99  	changed              bool // string needed to be reformatted
   100  }
   101  
   102  func parseTests() []parseTest {
   103  	tests := []parseTest{
   104  		{in: "root", lang: "und"},
   105  		{in: "und", lang: "und"},
   106  		{in: "en", lang: "en"},
   107  		{in: "xy", lang: "und", invalid: true},
   108  		{in: "en-ZY", lang: "en", invalid: true},
   109  		{in: "gsw", lang: "gsw"},
   110  		{in: "sr_Latn", lang: "sr", script: "Latn"},
   111  		{in: "af-Arab", lang: "af", script: "Arab"},
   112  		{in: "nl-BE", lang: "nl", region: "BE"},
   113  		{in: "es-419", lang: "es", region: "419"},
   114  		{in: "und-001", lang: "und", region: "001"},
   115  		{in: "de-latn-be", lang: "de", script: "Latn", region: "BE"},
   116  		// Variants
   117  		{in: "de-1901", lang: "de", variants: "1901"},
   118  		// Accept with unsuppressed script.
   119  		{in: "de-Latn-1901", lang: "de", script: "Latn", variants: "1901"},
   120  		// Specialized.
   121  		{in: "sl-rozaj", lang: "sl", variants: "rozaj"},
   122  		{in: "sl-rozaj-lipaw", lang: "sl", variants: "rozaj-lipaw"},
   123  		{in: "sl-rozaj-biske", lang: "sl", variants: "rozaj-biske"},
   124  		{in: "sl-rozaj-biske-1994", lang: "sl", variants: "rozaj-biske-1994"},
   125  		{in: "sl-rozaj-1994", lang: "sl", variants: "rozaj-1994"},
   126  		// Maximum number of variants while adhering to prefix rules.
   127  		{in: "sl-rozaj-biske-1994-alalc97-fonipa-fonupa-fonxsamp", lang: "sl", variants: "rozaj-biske-1994-alalc97-fonipa-fonupa-fonxsamp"},
   128  
   129  		// Sorting.
   130  		{in: "sl-1994-biske-rozaj", lang: "sl", variants: "rozaj-biske-1994", changed: true},
   131  		{in: "sl-rozaj-biske-1994-alalc97-fonupa-fonipa-fonxsamp", lang: "sl", variants: "rozaj-biske-1994-alalc97-fonipa-fonupa-fonxsamp", changed: true},
   132  		{in: "nl-fonxsamp-alalc97-fonipa-fonupa", lang: "nl", variants: "alalc97-fonipa-fonupa-fonxsamp", changed: true},
   133  
   134  		// Duplicates variants are removed, but not an error.
   135  		{in: "nl-fonupa-fonupa", lang: "nl", variants: "fonupa"},
   136  
   137  		// Variants that do not have correct prefixes. We still accept these.
   138  		{in: "de-Cyrl-1901", lang: "de", script: "Cyrl", variants: "1901"},
   139  		{in: "sl-rozaj-lipaw-1994", lang: "sl", variants: "rozaj-lipaw-1994"},
   140  		{in: "sl-1994-biske-rozaj-1994-biske-rozaj", lang: "sl", variants: "rozaj-biske-1994", changed: true},
   141  		{in: "de-Cyrl-1901", lang: "de", script: "Cyrl", variants: "1901"},
   142  
   143  		// Invalid variant.
   144  		{in: "de-1902", lang: "de", variants: "", invalid: true},
   145  
   146  		{in: "EN_CYRL", lang: "en", script: "Cyrl"},
   147  		// private use and extensions
   148  		{in: "x-a-b-c-d", ext: "x-a-b-c-d"},
   149  		{in: "x_A.-B-C_D", ext: "x-b-c-d", invalid: true, changed: true},
   150  		{in: "x-aa-bbbb-cccccccc-d", ext: "x-aa-bbbb-cccccccc-d"},
   151  		{in: "en-c_cc-b-bbb-a-aaa", lang: "en", changed: true, extList: []string{"a-aaa", "b-bbb", "c-cc"}},
   152  		{in: "en-x_cc-b-bbb-a-aaa", lang: "en", ext: "x-cc-b-bbb-a-aaa", changed: true},
   153  		{in: "en-c_cc-b-bbb-a-aaa-x-x", lang: "en", changed: true, extList: []string{"a-aaa", "b-bbb", "c-cc", "x-x"}},
   154  		{in: "en-v-c", lang: "en", ext: "", invalid: true},
   155  		{in: "en-v-abcdefghi", lang: "en", ext: "", invalid: true},
   156  		{in: "en-v-abc-x", lang: "en", ext: "v-abc", invalid: true},
   157  		{in: "en-v-abc-x-", lang: "en", ext: "v-abc", invalid: true},
   158  		{in: "en-v-abc-w-x-xx", lang: "en", extList: []string{"v-abc", "x-xx"}, invalid: true, changed: true},
   159  		{in: "en-v-abc-w-y-yx", lang: "en", extList: []string{"v-abc", "y-yx"}, invalid: true, changed: true},
   160  		{in: "en-v-c-abc", lang: "en", ext: "c-abc", invalid: true, changed: true},
   161  		{in: "en-v-w-abc", lang: "en", ext: "w-abc", invalid: true, changed: true},
   162  		{in: "en-v-x-abc", lang: "en", ext: "x-abc", invalid: true, changed: true},
   163  		{in: "en-v-x-a", lang: "en", ext: "x-a", invalid: true, changed: true},
   164  		{in: "en-9-aa-0-aa-z-bb-x-a", lang: "en", extList: []string{"0-aa", "9-aa", "z-bb", "x-a"}, changed: true},
   165  		{in: "en-u-c", lang: "en", ext: "", invalid: true},
   166  		{in: "en-u-co-phonebk", lang: "en", ext: "u-co-phonebk"},
   167  		{in: "en-u-co-phonebk-ca", lang: "en", ext: "u-ca-co-phonebk", changed: true},
   168  		{in: "en-u-nu-arabic-co-phonebk-ca", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", changed: true},
   169  		{in: "en-u-nu-arabic-co-phonebk-ca-x", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
   170  		{in: "en-u-nu-arabic-co-phonebk-ca-s", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
   171  		{in: "en-u-nu-arabic-co-phonebk-ca-a12345678", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
   172  		{in: "en-u-co-phonebook", lang: "en", ext: "u-co", invalid: true},
   173  		{in: "en-u-co-phonebook-cu-xau", lang: "en", ext: "u-co-cu-xau", invalid: true, changed: true},
   174  		{in: "en-Cyrl-u-co-phonebk", lang: "en", script: "Cyrl", ext: "u-co-phonebk"},
   175  		{in: "en-US-u-co-phonebk", lang: "en", region: "US", ext: "u-co-phonebk"},
   176  		{in: "en-US-u-co-phonebk-cu-xau", lang: "en", region: "US", ext: "u-co-phonebk-cu-xau"},
   177  		{in: "en-scotland-u-co-phonebk", lang: "en", variants: "scotland", ext: "u-co-phonebk"},
   178  		{in: "en-u-cu-xua-co-phonebk", lang: "en", ext: "u-co-phonebk-cu-xua", changed: true},
   179  		{in: "en-u-def-abc-cu-xua-co-phonebk", lang: "en", ext: "u-abc-def-co-phonebk-cu-xua", changed: true},
   180  		{in: "en-u-def-abc", lang: "en", ext: "u-abc-def", changed: true},
   181  		{in: "en-u-cu-xua-co-phonebk-a-cd", lang: "en", extList: []string{"a-cd", "u-co-phonebk-cu-xua"}, changed: true},
   182  		{in: "en-u-cu-co-phonebk", lang: "en", extList: []string{"u-co-phonebk-cu"}, changed: true},
   183  		{in: "en-u-cu-xau-co", lang: "en", extList: []string{"u-co-cu-xau"}, changed: true},
   184  		// LDML spec is not specific about it, but remove duplicates and return an error if the values differ.
   185  		{in: "en-u-cu-xau-co-phonebk-cu-xau", lang: "en", ext: "u-co-phonebk-cu-xau", changed: true},
   186  		// No change as the result is a substring of the original!
   187  		{in: "en-US-u-cu-xau-cu-eur", lang: "en", region: "US", ext: "u-cu-xau", invalid: true, changed: false},
   188  		{in: "en-t-en-Cyrl-NL-fonipa", lang: "en", ext: "t-en-cyrl-nl-fonipa", changed: true},
   189  		{in: "en-t-en-Cyrl-NL-fonipa-t0-abc-def", lang: "en", ext: "t-en-cyrl-nl-fonipa-t0-abc-def", changed: true},
   190  		{in: "en-t-t0-abcd", lang: "en", ext: "t-t0-abcd"},
   191  		// Not necessary to have changed here.
   192  		{in: "en-t-nl-abcd", lang: "en", ext: "t-nl", invalid: true},
   193  		{in: "en-t-nl-latn", lang: "en", ext: "t-nl-latn"},
   194  		{in: "en-t-t0-abcd-x-a", lang: "en", extList: []string{"t-t0-abcd", "x-a"}},
   195  		{in: "en_t_pt_MLt", lang: "en", ext: "t-pt-mlt", changed: true},
   196  		{in: "en-t-fr-est", lang: "en", ext: "t-fr-est", changed: false},
   197  		{in: "fr-est", lang: "et", changed: false},
   198  		{in: "fr-est-Cyrl", lang: "et", script: "Cyrl", changed: false},
   199  		// The same input here is used in both TestParse and TestParseExtensions.
   200  		// changed should be true for this input in TestParse but changed should be false for this input in TestParseExtensions
   201  		// because the entire input has been reformatted but the extension part hasn't.
   202  		// {in: "fr-est-t-fr-est", lang: "et", ext: "t-fr-est", changed: true},
   203  		// invalid
   204  		{in: "", lang: "und", invalid: true},
   205  		{in: "-", lang: "und", invalid: true},
   206  		{in: "x", lang: "und", invalid: true},
   207  		{in: "x-", lang: "und", invalid: true},
   208  		{in: "x--", lang: "und", invalid: true},
   209  		{in: "a-a-b-c-d", lang: "und", invalid: true},
   210  		{in: "en-", lang: "en", invalid: true},
   211  		{in: "enne-", lang: "und", invalid: true},
   212  		{in: "en.", lang: "und", invalid: true},
   213  		{in: "en.-latn", lang: "und", invalid: true},
   214  		{in: "en.-en", lang: "en", invalid: true},
   215  		{in: "x-a-tooManyChars-c-d", ext: "x-a-c-d", invalid: true, changed: true},
   216  		{in: "a-tooManyChars-c-d", lang: "und", invalid: true},
   217  		// TODO: check key-value validity
   218  		// { in: "en-u-cu-xd", lang: "en", ext: "u-cu-xd", invalid: true },
   219  		{in: "en-t-abcd", lang: "en", invalid: true},
   220  		{in: "en-Latn-US-en", lang: "en", script: "Latn", region: "US", invalid: true},
   221  		// rewrites (more tests in TestGrandfathered)
   222  		{in: "zh-min-nan", lang: "nan"},
   223  		{in: "zh-yue", lang: "yue"},
   224  		{in: "zh-xiang", lang: "hsn", rewrite: true},
   225  		{in: "zh-guoyu", lang: "cmn", rewrite: true},
   226  		{in: "iw", lang: "iw"},
   227  		{in: "sgn-BE-FR", lang: "sfb", rewrite: true},
   228  		{in: "i-klingon", lang: "tlh", rewrite: true},
   229  	}
   230  	for i, tt := range tests {
   231  		tests[i].i = i
   232  		if tt.extList != nil {
   233  			tests[i].ext = strings.Join(tt.extList, "-")
   234  		}
   235  		if tt.ext != "" && tt.extList == nil {
   236  			tests[i].extList = []string{tt.ext}
   237  		}
   238  	}
   239  	return tests
   240  }
   241  
   242  func TestParseExtensions(t *testing.T) {
   243  	for i, tt := range parseTests() {
   244  		if tt.ext == "" || tt.rewrite {
   245  			continue
   246  		}
   247  		scan := makeScannerString(tt.in)
   248  		if len(scan.b) > 1 && scan.b[1] != '-' {
   249  			scan.end = nextExtension(string(scan.b), 0)
   250  			scan.next = scan.end + 1
   251  			scan.scan()
   252  		}
   253  		start := scan.start
   254  		scan.toLower(start, len(scan.b))
   255  		parseExtensions(&scan)
   256  		ext := string(scan.b[start:])
   257  		if ext != tt.ext {
   258  			t.Errorf("%d(%s): ext was %v; want %v", i, tt.in, ext, tt.ext)
   259  		}
   260  		if changed := !strings.HasPrefix(tt.in[start:], ext); changed != tt.changed {
   261  			t.Errorf("%d(%s): changed was %v; want %v", i, tt.in, changed, tt.changed)
   262  		}
   263  	}
   264  }
   265  
   266  // partChecks runs checks for each part by calling the function returned by f.
   267  func partChecks(t *testing.T, f func(*testing.T, *parseTest) (Tag, bool)) {
   268  	for i, tt := range parseTests() {
   269  		t.Run(tt.in, func(t *testing.T) {
   270  			tag, skip := f(t, &tt)
   271  			if skip {
   272  				return
   273  			}
   274  			if l, _ := getLangID(b(tt.lang)); l != tag.LangID {
   275  				t.Errorf("%d: lang was %q; want %q", i, tag.LangID, l)
   276  			}
   277  			if sc, _ := getScriptID(script, b(tt.script)); sc != tag.ScriptID {
   278  				t.Errorf("%d: script was %q; want %q", i, tag.ScriptID, sc)
   279  			}
   280  			if r, _ := getRegionID(b(tt.region)); r != tag.RegionID {
   281  				t.Errorf("%d: region was %q; want %q", i, tag.RegionID, r)
   282  			}
   283  			if tag.str == "" {
   284  				return
   285  			}
   286  			p := int(tag.pVariant)
   287  			if p < int(tag.pExt) {
   288  				p++
   289  			}
   290  			if s, g := tag.str[p:tag.pExt], tt.variants; s != g {
   291  				t.Errorf("%d: variants was %q; want %q", i, s, g)
   292  			}
   293  			p = int(tag.pExt)
   294  			if p > 0 && p < len(tag.str) {
   295  				p++
   296  			}
   297  			if s, g := (tag.str)[p:], tt.ext; s != g {
   298  				t.Errorf("%d: extensions were %q; want %q", i, s, g)
   299  			}
   300  		})
   301  	}
   302  }
   303  
   304  func TestParseTag(t *testing.T) {
   305  	partChecks(t, func(t *testing.T, tt *parseTest) (id Tag, skip bool) {
   306  		if strings.HasPrefix(tt.in, "x-") || tt.rewrite {
   307  			return Tag{}, true
   308  		}
   309  		scan := makeScannerString(tt.in)
   310  		id, end := parseTag(&scan, true)
   311  		id.str = string(scan.b[:end])
   312  		tt.ext = ""
   313  		tt.extList = []string{}
   314  		return id, false
   315  	})
   316  }
   317  
   318  func TestParse(t *testing.T) {
   319  	partChecks(t, func(t *testing.T, tt *parseTest) (id Tag, skip bool) {
   320  		id, err := Parse(tt.in)
   321  		ext := ""
   322  		if id.str != "" {
   323  			if strings.HasPrefix(id.str, "x-") {
   324  				ext = id.str
   325  			} else if int(id.pExt) < len(id.str) && id.pExt > 0 {
   326  				ext = id.str[id.pExt+1:]
   327  			}
   328  		}
   329  		if tag, _ := Parse(id.String()); tag.String() != id.String() {
   330  			t.Errorf("%d:%s: reparse was %q; want %q", tt.i, tt.in, id.String(), tag.String())
   331  		}
   332  		if ext != tt.ext {
   333  			t.Errorf("%d:%s: ext was %q; want %q", tt.i, tt.in, ext, tt.ext)
   334  		}
   335  		changed := id.str != "" && !strings.HasPrefix(tt.in, id.str)
   336  		if changed != tt.changed {
   337  			t.Errorf("%d:%s: changed was %v; want %v", tt.i, tt.in, changed, tt.changed)
   338  		}
   339  		if (err != nil) != tt.invalid {
   340  			t.Errorf("%d:%s: invalid was %v; want %v. Error: %v", tt.i, tt.in, err != nil, tt.invalid, err)
   341  		}
   342  		return id, false
   343  	})
   344  }
   345  
   346  func TestErrors(t *testing.T) {
   347  	mkInvalid := func(s string) error {
   348  		return NewValueError([]byte(s))
   349  	}
   350  	tests := []struct {
   351  		in  string
   352  		out error
   353  	}{
   354  		// invalid subtags.
   355  		{"ac", mkInvalid("ac")},
   356  		{"AC", mkInvalid("ac")},
   357  		{"aa-Uuuu", mkInvalid("Uuuu")},
   358  		{"aa-AB", mkInvalid("AB")},
   359  		// ill-formed wins over invalid.
   360  		{"ac-u", ErrSyntax},
   361  		{"ac-u-ca", mkInvalid("ac")},
   362  		{"ac-u-ca-co-pinyin", mkInvalid("ac")},
   363  		{"noob", ErrSyntax},
   364  	}
   365  	for _, tt := range tests {
   366  		_, err := Parse(tt.in)
   367  		if err != tt.out {
   368  			t.Errorf("%s: was %q; want %q", tt.in, err, tt.out)
   369  		}
   370  	}
   371  }
   372  

View as plain text