...

Source file src/golang.org/x/text/search/pattern_test.go

Documentation: golang.org/x/text/search

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package search
     6  
     7  import (
     8  	"reflect"
     9  	"strings"
    10  	"testing"
    11  
    12  	"golang.org/x/text/language"
    13  )
    14  
    15  func TestCompile(t *testing.T) {
    16  	for i, tc := range []struct {
    17  		desc    string
    18  		pattern string
    19  		options []Option
    20  		n       int
    21  	}{{
    22  		desc:    "empty",
    23  		pattern: "",
    24  		n:       0,
    25  	}, {
    26  		desc:    "single",
    27  		pattern: "a",
    28  		n:       1,
    29  	}, {
    30  		desc:    "keep modifier",
    31  		pattern: "a\u0300", // U+0300: COMBINING GRAVE ACCENT
    32  		n:       2,
    33  	}, {
    34  		desc:    "remove modifier",
    35  		pattern: "a\u0300", // U+0300: COMBINING GRAVE ACCENT
    36  		options: []Option{IgnoreDiacritics},
    37  		n:       1,
    38  	}, {
    39  		desc:    "single with double collation element",
    40  		pattern: "ä",
    41  		n:       2,
    42  	}, {
    43  		desc:    "leading variable",
    44  		pattern: " a",
    45  		n:       2,
    46  	}, {
    47  		desc:    "trailing variable",
    48  		pattern: "aa ",
    49  		n:       3,
    50  	}, {
    51  		desc:    "leading and trailing variable",
    52  		pattern: " äb ",
    53  		n:       5,
    54  	}, {
    55  		desc:    "keep interior variable",
    56  		pattern: " ä b ",
    57  		n:       6,
    58  	}, {
    59  		desc:    "keep interior variables",
    60  		pattern: " b  ä ",
    61  		n:       7,
    62  	}, {
    63  		desc:    "remove ignoreables (zero-weights across the board)",
    64  		pattern: "\u009Db\u009Dä\u009D", // U+009D: OPERATING SYSTEM COMMAND
    65  		n:       3,
    66  	}} {
    67  		m := New(language.Und, tc.options...)
    68  		p := m.CompileString(tc.pattern)
    69  		if len(p.ce) != tc.n {
    70  			t.Errorf("%d:%s: Compile(%+q): got %d; want %d", i, tc.desc, tc.pattern, len(p.ce), tc.n)
    71  		}
    72  	}
    73  }
    74  
    75  func TestNorm(t *testing.T) {
    76  	// U+0300: COMBINING GRAVE ACCENT (CCC=230)
    77  	// U+031B: COMBINING HORN (CCC=216)
    78  	for _, tc := range []struct {
    79  		desc string
    80  		a    string
    81  		b    string
    82  		want bool // a and b compile into the same pattern?
    83  	}{{
    84  		"simple",
    85  		"eee\u0300\u031b",
    86  		"eee\u031b\u0300",
    87  		true,
    88  	}, {
    89  		"large number of modifiers in pattern",
    90  		strings.Repeat("\u0300", 29) + "\u0318",
    91  		"\u0318" + strings.Repeat("\u0300", 29),
    92  		true,
    93  	}, {
    94  		"modifier overflow in pattern",
    95  		strings.Repeat("\u0300", 30) + "\u0318",
    96  		"\u0318" + strings.Repeat("\u0300", 30),
    97  		false,
    98  	}} {
    99  		m := New(language.Und)
   100  		a := m.CompileString(tc.a)
   101  		b := m.CompileString(tc.b)
   102  		if got := reflect.DeepEqual(a, b); got != tc.want {
   103  			t.Errorf("Compile(a) == Compile(b) == %v; want %v", got, tc.want)
   104  		}
   105  	}
   106  }
   107  
   108  func TestForwardSearch(t *testing.T) {
   109  	for i, tc := range []struct {
   110  		desc    string
   111  		tag     string
   112  		options []Option
   113  		pattern string
   114  		text    string
   115  		want    []int
   116  	}{{
   117  		// The semantics of an empty search is to match nothing.
   118  		// TODO: change this to be in line with strings.Index? It is quite a
   119  		// different beast, so not sure yet.
   120  
   121  		desc:    "empty pattern and text",
   122  		tag:     "und",
   123  		pattern: "",
   124  		text:    "",
   125  		want:    nil, // TODO: consider: []int{0, 0},
   126  	}, {
   127  		desc:    "non-empty pattern and empty text",
   128  		tag:     "und",
   129  		pattern: " ",
   130  		text:    "",
   131  		want:    nil,
   132  	}, {
   133  		desc:    "empty pattern and non-empty text",
   134  		tag:     "und",
   135  		pattern: "",
   136  		text:    "abc",
   137  		want:    nil, // TODO: consider: []int{0, 0, 1, 1, 2, 2, 3, 3},
   138  	}, {
   139  		// Variable-only patterns. We don't support variables at the moment,
   140  		// but verify that, given this, the behavior is indeed as expected.
   141  
   142  		desc:    "exact match of variable",
   143  		tag:     "und",
   144  		pattern: " ",
   145  		text:    " ",
   146  		want:    []int{0, 1},
   147  	}, {
   148  		desc:    "variables not handled by default",
   149  		tag:     "und",
   150  		pattern: "- ",
   151  		text:    " -",
   152  		want:    nil, // Would be (1, 2) for a median match with variable}.
   153  	}, {
   154  		desc:    "multiple subsequent identical variables",
   155  		tag:     "und",
   156  		pattern: " ",
   157  		text:    "    ",
   158  		want:    []int{0, 1, 1, 2, 2, 3, 3, 4},
   159  	}, {
   160  		desc:    "text with variables",
   161  		tag:     "und",
   162  		options: []Option{IgnoreDiacritics},
   163  		pattern: "abc",
   164  		text:    "3 abc 3",
   165  		want:    []int{2, 5},
   166  	}, {
   167  		desc:    "pattern with interior variables",
   168  		tag:     "und",
   169  		options: []Option{IgnoreDiacritics},
   170  		pattern: "a b c",
   171  		text:    "3 a b c abc a  b  c 3",
   172  		want:    []int{2, 7}, // Would have 3 matches using variable.
   173  
   174  		// TODO: Different variable handling settings.
   175  	}, {
   176  		// Options.
   177  
   178  		desc:    "match all levels",
   179  		tag:     "und",
   180  		pattern: "Abc",
   181  		text:    "abcAbcABCÁbcábc",
   182  		want:    []int{3, 6},
   183  	}, {
   184  		desc:    "ignore diacritics in text",
   185  		tag:     "und",
   186  		options: []Option{IgnoreDiacritics},
   187  		pattern: "Abc",
   188  		text:    "Ábc",
   189  		want:    []int{0, 4},
   190  	}, {
   191  		desc:    "ignore diacritics in pattern",
   192  		tag:     "und",
   193  		options: []Option{IgnoreDiacritics},
   194  		pattern: "Ábc",
   195  		text:    "Abc",
   196  		want:    []int{0, 3},
   197  	}, {
   198  		desc:    "ignore diacritics",
   199  		tag:     "und",
   200  		options: []Option{IgnoreDiacritics},
   201  		pattern: "Abc",
   202  		text:    "abcAbcABCÁbcábc",
   203  		want:    []int{3, 6, 9, 13},
   204  	}, {
   205  		desc:    "ignore case",
   206  		tag:     "und",
   207  		options: []Option{IgnoreCase},
   208  		pattern: "Abc",
   209  		text:    "abcAbcABCÁbcábc",
   210  		want:    []int{0, 3, 3, 6, 6, 9},
   211  	}, {
   212  		desc:    "ignore case and diacritics",
   213  		tag:     "und",
   214  		options: []Option{IgnoreCase, IgnoreDiacritics},
   215  		pattern: "Abc",
   216  		text:    "abcAbcABCÁbcábc",
   217  		want:    []int{0, 3, 3, 6, 6, 9, 9, 13, 13, 17},
   218  	}, {
   219  		desc:    "ignore width to fullwidth",
   220  		tag:     "und",
   221  		options: []Option{IgnoreWidth},
   222  		pattern: "abc",
   223  		text:    "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C
   224  		want:    []int{4, 13},
   225  	}, {
   226  		// TODO: distinguish between case and width.
   227  		desc:    "don't ignore width to fullwidth, ignoring only case",
   228  		tag:     "und",
   229  		options: []Option{IgnoreCase},
   230  		pattern: "abc",
   231  		text:    "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C
   232  		want:    []int{4, 13},
   233  	}, {
   234  		desc:    "ignore width to fullwidth and diacritics",
   235  		tag:     "und",
   236  		options: []Option{IgnoreWidth, IgnoreDiacritics},
   237  		pattern: "abc",
   238  		text:    "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C
   239  		want:    []int{4, 13},
   240  	}, {
   241  		desc:    "whole grapheme, single rune",
   242  		tag:     "und",
   243  		pattern: "eee",
   244  		text:    "123 eeé 123",
   245  		want:    nil,
   246  	}, {
   247  		// Note: rules on when to apply contractions may, for certain languages,
   248  		// differ between search and collation. For example, "ch" is not
   249  		// considered a contraction for the purpose of searching in Spanish.
   250  		// Therefore, be careful picking this test.
   251  		desc:    "whole grapheme, contractions",
   252  		tag:     "da",
   253  		pattern: "aba",
   254  		// Fails at the primary level, because "aa" is a contraction.
   255  		text: "123 abaa 123",
   256  		want: []int{},
   257  	}, {
   258  		desc:    "whole grapheme, trailing modifier",
   259  		tag:     "und",
   260  		pattern: "eee",
   261  		text:    "123 eee\u0300 123", // U+0300: COMBINING GRAVE ACCENT
   262  		want:    nil,
   263  	}, {
   264  		// Language-specific matching.
   265  
   266  		desc:    "",
   267  		tag:     "da",
   268  		options: []Option{IgnoreCase},
   269  		pattern: "Århus",
   270  		text:    "AarhusÅrhus  Århus  ",
   271  		want:    []int{0, 6, 6, 12, 14, 20},
   272  	}, {
   273  		desc:    "",
   274  		tag:     "da",
   275  		options: []Option{IgnoreCase},
   276  		pattern: "Aarhus",
   277  		text:    "Århus Aarhus",
   278  		want:    []int{0, 6, 7, 13},
   279  	}, {
   280  		desc:    "",
   281  		tag:     "en", // Å does not match A for English.
   282  		options: []Option{IgnoreCase},
   283  		pattern: "Aarhus",
   284  		text:    "Århus",
   285  		want:    nil,
   286  	}, {
   287  		desc:    "ignore modifier in text",
   288  		options: []Option{IgnoreDiacritics},
   289  		tag:     "und",
   290  		pattern: "eee",
   291  		text:    "123 eee\u0300 123", // U+0300: COMBINING GRAVE ACCENT
   292  		want:    []int{4, 9},         // Matches on grapheme boundary.
   293  	}, {
   294  		desc:    "ignore multiple modifiers in text",
   295  		options: []Option{IgnoreDiacritics},
   296  		tag:     "und",
   297  		pattern: "eee",
   298  		text:    "123 eee\u0300\u0300 123", // U+0300: COMBINING GRAVE ACCENT
   299  		want:    []int{4, 11},              // Matches on grapheme boundary.
   300  	}, {
   301  		desc:    "ignore modifier in pattern",
   302  		options: []Option{IgnoreDiacritics},
   303  		tag:     "und",
   304  		pattern: "eee\u0300", // U+0300: COMBINING GRAVE ACCENT
   305  		text:    "123 eee 123",
   306  		want:    []int{4, 7},
   307  	}, {
   308  		desc:    "ignore multiple modifiers in pattern",
   309  		options: []Option{IgnoreDiacritics},
   310  		tag:     "und",
   311  		pattern: "eee\u0300\u0300", // U+0300: COMBINING GRAVE ACCENT
   312  		text:    "123 eee 123",
   313  		want:    []int{4, 7},
   314  	}, {
   315  		desc: "match non-normalized pattern",
   316  		tag:  "und",
   317  		// U+0300: COMBINING GRAVE ACCENT (CCC=230)
   318  		// U+031B: COMBINING HORN (CCC=216)
   319  		pattern: "eee\u0300\u031b",
   320  		text:    "123 eee\u031b\u0300 123",
   321  		want:    []int{4, 11},
   322  	}, {
   323  		desc: "match non-normalized text",
   324  		tag:  "und",
   325  		// U+0300: COMBINING GRAVE ACCENT (CCC=230)
   326  		// U+031B: COMBINING HORN (CCC=216)
   327  		pattern: "eee\u031b\u0300",
   328  		text:    "123 eee\u0300\u031b 123",
   329  		want:    []int{4, 11},
   330  	}} {
   331  		m := New(language.MustParse(tc.tag), tc.options...)
   332  		p := m.CompileString(tc.pattern)
   333  		for j := 0; j < len(tc.text); {
   334  			start, end := p.IndexString(tc.text[j:])
   335  			if start == -1 && end == -1 {
   336  				j++
   337  				continue
   338  			}
   339  			start += j
   340  			end += j
   341  			j = end
   342  			if len(tc.want) == 0 {
   343  				t.Errorf("%d:%s: found unexpected result [%d %d]", i, tc.desc, start, end)
   344  				break
   345  			}
   346  			if tc.want[0] != start || tc.want[1] != end {
   347  				t.Errorf("%d:%s: got [%d %d]; want %v", i, tc.desc, start, end, tc.want[:2])
   348  				tc.want = tc.want[2:]
   349  				break
   350  			}
   351  			tc.want = tc.want[2:]
   352  		}
   353  		if len(tc.want) != 0 {
   354  			t.Errorf("%d:%s: %d extra results", i, tc.desc, len(tc.want)/2)
   355  		}
   356  	}
   357  }
   358  

View as plain text