...

Source file src/golang.org/x/text/collate/reg_test.go

Documentation: golang.org/x/text/collate

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package collate
     6  
     7  import (
     8  	"archive/zip"
     9  	"bufio"
    10  	"bytes"
    11  	"flag"
    12  	"io"
    13  	"log"
    14  	"path"
    15  	"regexp"
    16  	"strconv"
    17  	"strings"
    18  	"testing"
    19  	"unicode/utf8"
    20  
    21  	"golang.org/x/text/collate/build"
    22  	"golang.org/x/text/internal/gen"
    23  	"golang.org/x/text/language"
    24  )
    25  
// long is the -long command-line flag. TestCollation skips itself unless
// this flag is set or gen.IsLocal() reports true.
var long = flag.Bool("long", false,
	"run time-consuming tests, such as tests that fetch data online")
    28  
    29  // This regression test runs tests for the test files in CollationTest.zip
    30  // (taken from https://www.unicode.org/Public/UCA/<gen.UnicodeVersion()>/).
    31  //
    32  // The test files have the following form:
    33  // # header
    34  // 0009 0021;	# ('\u0009') <CHARACTER TABULATION>	[| | | 0201 025E]
    35  // 0009 003F;	# ('\u0009') <CHARACTER TABULATION>	[| | | 0201 0263]
    36  // 000A 0021;	# ('\u000A') <LINE FEED (LF)>	[| | | 0202 025E]
    37  // 000A 003F;	# ('\u000A') <LINE FEED (LF)>	[| | | 0202 0263]
    38  //
    39  // The part before the semicolon is the hex representation of a sequence
    40  // of runes. After the hash mark is a comment. The strings
    41  // represented by rune sequence are in the file in sorted order, as
    42  // defined by the DUCET.
    43  
// Test holds the entries loaded from a single file of the CollationTest
// archive. str and comment are parallel slices: comment[i] belongs to str[i].
type Test struct {
	// name is the base name of the file inside the archive.
	name    string
	// str holds the UTF-8 encoded test strings in the order they appear in
	// the file.
	str     [][]byte
	// comment holds the text following "# " on each line that was kept.
	comment []string
}
    49  
    50  var versionRe = regexp.MustCompile(`# UCA Version: (.*)\n?$`)
    51  var testRe = regexp.MustCompile(`^([\dA-F ]+);.*# (.*)\n?$`)
    52  
    53  func TestCollation(t *testing.T) {
    54  	if !gen.IsLocal() && !*long {
    55  		t.Skip("skipping test to prevent downloading; to run use -long or use -local to specify a local source")
    56  	}
    57  	t.Skip("must first update to new file format to support test")
    58  	for _, test := range loadTestData() {
    59  		doTest(t, test)
    60  	}
    61  }
    62  
    63  func Error(e error) {
    64  	if e != nil {
    65  		log.Fatal(e)
    66  	}
    67  }
    68  
    69  // parseUCA parses a Default Unicode Collation Element Table of the format
    70  // specified in https://www.unicode.org/reports/tr10/#File_Format.
    71  // It returns the variable top.
    72  func parseUCA(builder *build.Builder) {
    73  	r := gen.OpenUnicodeFile("UCA", "", "allkeys.txt")
    74  	defer r.Close()
    75  	input := bufio.NewReader(r)
    76  	colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
    77  	for i := 1; true; i++ {
    78  		l, prefix, err := input.ReadLine()
    79  		if err == io.EOF {
    80  			break
    81  		}
    82  		Error(err)
    83  		line := string(l)
    84  		if prefix {
    85  			log.Fatalf("%d: buffer overflow", i)
    86  		}
    87  		if len(line) == 0 || line[0] == '#' {
    88  			continue
    89  		}
    90  		if line[0] == '@' {
    91  			if strings.HasPrefix(line[1:], "version ") {
    92  				if v := strings.Split(line[1:], " ")[1]; v != gen.UnicodeVersion() {
    93  					log.Fatalf("incompatible version %s; want %s", v, gen.UnicodeVersion())
    94  				}
    95  			}
    96  		} else {
    97  			// parse entries
    98  			part := strings.Split(line, " ; ")
    99  			if len(part) != 2 {
   100  				log.Fatalf("%d: production rule without ';': %v", i, line)
   101  			}
   102  			lhs := []rune{}
   103  			for _, v := range strings.Split(part[0], " ") {
   104  				if v != "" {
   105  					lhs = append(lhs, rune(convHex(i, v)))
   106  				}
   107  			}
   108  			vars := []int{}
   109  			rhs := [][]int{}
   110  			for i, m := range colelem.FindAllStringSubmatch(part[1], -1) {
   111  				if m[1] == "*" {
   112  					vars = append(vars, i)
   113  				}
   114  				elem := []int{}
   115  				for _, h := range strings.Split(m[2], ".") {
   116  					elem = append(elem, convHex(i, h))
   117  				}
   118  				rhs = append(rhs, elem)
   119  			}
   120  			builder.Add(lhs, rhs, vars)
   121  		}
   122  	}
   123  }
   124  
   125  func convHex(line int, s string) int {
   126  	r, e := strconv.ParseInt(s, 16, 32)
   127  	if e != nil {
   128  		log.Fatalf("%d: %v", line, e)
   129  	}
   130  	return int(r)
   131  }
   132  
   133  func loadTestData() []Test {
   134  	f := gen.OpenUnicodeFile("UCA", "", "CollationTest.zip")
   135  	buffer, err := io.ReadAll(f)
   136  	f.Close()
   137  	Error(err)
   138  	archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
   139  	Error(err)
   140  	tests := []Test{}
   141  	for _, f := range archive.File {
   142  		// Skip the short versions, which are simply duplicates of the long versions.
   143  		if strings.Contains(f.Name, "SHORT") || f.FileInfo().IsDir() {
   144  			continue
   145  		}
   146  		ff, err := f.Open()
   147  		Error(err)
   148  		defer ff.Close()
   149  		scanner := bufio.NewScanner(ff)
   150  		test := Test{name: path.Base(f.Name)}
   151  		for scanner.Scan() {
   152  			line := scanner.Text()
   153  			if len(line) <= 1 || line[0] == '#' {
   154  				if m := versionRe.FindStringSubmatch(line); m != nil {
   155  					if m[1] != gen.UnicodeVersion() {
   156  						log.Printf("warning:%s: version is %s; want %s", f.Name, m[1], gen.UnicodeVersion())
   157  					}
   158  				}
   159  				continue
   160  			}
   161  			m := testRe.FindStringSubmatch(line)
   162  			if m == nil || len(m) < 3 {
   163  				log.Fatalf(`Failed to parse: "%s" result: %#v`, line, m)
   164  			}
   165  			str := []byte{}
   166  			// In the regression test data (unpaired) surrogates are assigned a weight
   167  			// corresponding to their code point value.  However, utf8.DecodeRune,
   168  			// which is used to compute the implicit weight, assigns FFFD to surrogates.
   169  			// We therefore skip tests with surrogates.  This skips about 35 entries
   170  			// per test.
   171  			valid := true
   172  			for _, split := range strings.Split(m[1], " ") {
   173  				r, err := strconv.ParseUint(split, 16, 64)
   174  				Error(err)
   175  				valid = valid && utf8.ValidRune(rune(r))
   176  				str = append(str, string(rune(r))...)
   177  			}
   178  			if valid {
   179  				test.str = append(test.str, str)
   180  				test.comment = append(test.comment, m[2])
   181  			}
   182  		}
   183  		if scanner.Err() != nil {
   184  			log.Fatal(scanner.Err())
   185  		}
   186  		tests = append(tests, test)
   187  	}
   188  	return tests
   189  }
   190  
// errorCount is not read or written anywhere in the visible part of this file.
// NOTE(review): presumably a leftover error counter; confirm that no other
// file in the package uses it before removing it.
var errorCount int
   192  
   193  func runes(b []byte) []rune {
   194  	return []rune(string(b))
   195  }
   196  
// shifted is the language tag doTest uses to configure the collator for
// every test whose file name does not contain "NON_IGNOR". Its -u- extension
// sets the keys ka=shifted and ks=level4.
var shifted = language.MustParse("und-u-ka-shifted-ks-level4")
   198  
   199  func doTest(t *testing.T, tc Test) {
   200  	bld := build.NewBuilder()
   201  	parseUCA(bld)
   202  	w, err := bld.Build()
   203  	Error(err)
   204  	var tag language.Tag
   205  	if !strings.Contains(tc.name, "NON_IGNOR") {
   206  		tag = shifted
   207  	}
   208  	c := NewFromTable(w, OptionsFromTag(tag))
   209  	b := &Buffer{}
   210  	prev := tc.str[0]
   211  	for i := 1; i < len(tc.str); i++ {
   212  		b.Reset()
   213  		s := tc.str[i]
   214  		ka := c.Key(b, prev)
   215  		kb := c.Key(b, s)
   216  		if r := bytes.Compare(ka, kb); r == 1 {
   217  			t.Errorf("%s:%d: Key(%.4X) < Key(%.4X) (%X < %X) == %d; want -1 or 0", tc.name, i, []rune(string(prev)), []rune(string(s)), ka, kb, r)
   218  			prev = s
   219  			continue
   220  		}
   221  		if r := c.Compare(prev, s); r == 1 {
   222  			t.Errorf("%s:%d: Compare(%.4X, %.4X) == %d; want -1 or 0", tc.name, i, runes(prev), runes(s), r)
   223  		}
   224  		if r := c.Compare(s, prev); r == -1 {
   225  			t.Errorf("%s:%d: Compare(%.4X, %.4X) == %d; want 1 or 0", tc.name, i, runes(s), runes(prev), r)
   226  		}
   227  		prev = s
   228  	}
   229  }
   230  

View as plain text