search.go

Documentation: golang.org/x/text/search

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:generate go run ../collate/maketables.go -cldr=23 -unicode=6.2.0 -types=search,searchjl -package=search
     6  
     7  // Package search provides language-specific search and string matching.
     8  //
     9  // Natural language matching can be intricate. For example, Danish will insist
    10  // "Århus" and "Aarhus" are the same name and Turkish will match I to ı (note
    11  // the lack of a dot) in a case-insensitive match. This package handles such
    12  // language-specific details.
    13  //
    14  // Text passed to any of the calls in this message does not need to be
    15  // normalized.
    16  package search // import "golang.org/x/text/search"
    17  
    18  import (
    19  	"strings"
    20  
    21  	"golang.org/x/text/internal/colltab"
    22  	"golang.org/x/text/language"
    23  )
    24  
    25  // An Option configures a Matcher.
    26  type Option func(*Matcher)
    27  
    28  var (
    29  	// WholeWord restricts matches to complete words. The default is to match at
    30  	// the character level.
    31  	WholeWord Option = nil
    32  
    33  	// Exact requires that two strings are their exact equivalent. For example
    34  	// å would not match aa in Danish. It overrides any of the ignore options.
    35  	Exact Option = nil
    36  
    37  	// Loose causes case, diacritics and width to be ignored.
    38  	Loose Option = loose
    39  
    40  	// IgnoreCase enables case-insensitive search.
    41  	IgnoreCase Option = ignoreCase
    42  
    43  	// IgnoreDiacritics causes diacritics to be ignored ("ö" == "o").
    44  	IgnoreDiacritics Option = ignoreDiacritics
    45  
    46  	// IgnoreWidth equates narrow with wide variants.
    47  	IgnoreWidth Option = ignoreWidth
    48  )
    49  
    50  func ignoreDiacritics(m *Matcher) { m.ignoreDiacritics = true }
    51  func ignoreCase(m *Matcher)       { m.ignoreCase = true }
    52  func ignoreWidth(m *Matcher)      { m.ignoreWidth = true }
    53  func loose(m *Matcher) {
    54  	ignoreDiacritics(m)
    55  	ignoreCase(m)
    56  	ignoreWidth(m)
    57  }
    58  
    59  var (
    60  	// Supported lists the languages for which search differs from its parent.
    61  	Supported language.Coverage
    62  
    63  	tags []language.Tag
    64  )
    65  
    66  func init() {
    67  	ids := strings.Split(availableLocales, ",")
    68  	tags = make([]language.Tag, len(ids))
    69  	for i, s := range ids {
    70  		tags[i] = language.Raw.MustParse(s)
    71  	}
    72  	Supported = language.NewCoverage(tags)
    73  }
    74  
    75  // New returns a new Matcher for the given language and options.
    76  func New(t language.Tag, opts ...Option) *Matcher {
    77  	m := &Matcher{
    78  		w: getTable(locales[colltab.MatchLang(t, tags)]),
    79  	}
    80  	for _, f := range opts {
    81  		f(m)
    82  	}
    83  	return m
    84  }
    85  
    86  // A Matcher implements language-specific string matching.
    87  type Matcher struct {
    88  	w                colltab.Weighter
    89  	ignoreCase       bool
    90  	ignoreWidth      bool
    91  	ignoreDiacritics bool
    92  }
    93  
    94  // An IndexOption specifies how the Index methods of Pattern or Matcher should
    95  // match the input.
    96  type IndexOption byte
    97  
    98  const (
    99  	// Anchor restricts the search to the start (or end for Backwards) of the
   100  	// text.
   101  	Anchor IndexOption = 1 << iota
   102  
   103  	// Backwards starts the search from the end of the text.
   104  	Backwards
   105  
   106  	anchorBackwards = Anchor | Backwards
   107  )
   108  
   109  // Index reports the start and end position of the first occurrence of pat in b
   110  // or -1, -1 if pat is not present.
   111  func (m *Matcher) Index(b, pat []byte, opts ...IndexOption) (start, end int) {
   112  	// TODO: implement optimized version that does not use a pattern.
   113  	return m.Compile(pat).Index(b, opts...)
   114  }
   115  
   116  // IndexString reports the start and end position of the first occurrence of pat
   117  // in s or -1, -1 if pat is not present.
   118  func (m *Matcher) IndexString(s, pat string, opts ...IndexOption) (start, end int) {
   119  	// TODO: implement optimized version that does not use a pattern.
   120  	return m.CompileString(pat).IndexString(s, opts...)
   121  }
   122  
   123  // Equal reports whether a and b are equivalent.
   124  func (m *Matcher) Equal(a, b []byte) bool {
   125  	_, end := m.Index(a, b, Anchor)
   126  	return end == len(a)
   127  }
   128  
   129  // EqualString reports whether a and b are equivalent.
   130  func (m *Matcher) EqualString(a, b string) bool {
   131  	_, end := m.IndexString(a, b, Anchor)
   132  	return end == len(a)
   133  }
   134  
   135  // Compile compiles and returns a pattern that can be used for faster searching.
   136  func (m *Matcher) Compile(b []byte) *Pattern {
   137  	p := &Pattern{m: m}
   138  	iter := colltab.Iter{Weighter: m.w}
   139  	for iter.SetInput(b); iter.Next(); {
   140  	}
   141  	p.ce = iter.Elems
   142  	p.deleteEmptyElements()
   143  	return p
   144  }
   145  
   146  // CompileString compiles and returns a pattern that can be used for faster
   147  // searching.
   148  func (m *Matcher) CompileString(s string) *Pattern {
   149  	p := &Pattern{m: m}
   150  	iter := colltab.Iter{Weighter: m.w}
   151  	for iter.SetInputString(s); iter.Next(); {
   152  	}
   153  	p.ce = iter.Elems
   154  	p.deleteEmptyElements()
   155  	return p
   156  }
   157  
   158  // A Pattern is a compiled search string. It is safe for concurrent use.
   159  type Pattern struct {
   160  	m  *Matcher
   161  	ce []colltab.Elem
   162  }
   163  
   164  // Design note (TODO remove):
   165  // The cost of retrieving collation elements for each rune, which is used for
   166  // search as well, is not trivial. Also, algorithms like Boyer-Moore and
   167  // Sunday require some additional precomputing.
   168  
   169  // Index reports the start and end position of the first occurrence of p in b
   170  // or -1, -1 if p is not present.
   171  func (p *Pattern) Index(b []byte, opts ...IndexOption) (start, end int) {
   172  	// Pick a large enough buffer such that we likely do not need to allocate
   173  	// and small enough to not cause too much overhead initializing.
   174  	var buf [8]colltab.Elem
   175  
   176  	it := &colltab.Iter{
   177  		Weighter: p.m.w,
   178  		Elems:    buf[:0],
   179  	}
   180  	it.SetInput(b)
   181  
   182  	var optMask IndexOption
   183  	for _, o := range opts {
   184  		optMask |= o
   185  	}
   186  
   187  	switch optMask {
   188  	case 0:
   189  		return p.forwardSearch(it)
   190  	case Anchor:
   191  		return p.anchoredForwardSearch(it)
   192  	case Backwards, anchorBackwards:
   193  		panic("TODO: implement")
   194  	default:
   195  		panic("unrecognized option")
   196  	}
   197  }
   198  
   199  // IndexString reports the start and end position of the first occurrence of p
   200  // in s or -1, -1 if p is not present.
   201  func (p *Pattern) IndexString(s string, opts ...IndexOption) (start, end int) {
   202  	// Pick a large enough buffer such that we likely do not need to allocate
   203  	// and small enough to not cause too much overhead initializing.
   204  	var buf [8]colltab.Elem
   205  
   206  	it := &colltab.Iter{
   207  		Weighter: p.m.w,
   208  		Elems:    buf[:0],
   209  	}
   210  	it.SetInputString(s)
   211  
   212  	var optMask IndexOption
   213  	for _, o := range opts {
   214  		optMask |= o
   215  	}
   216  
   217  	switch optMask {
   218  	case 0:
   219  		return p.forwardSearch(it)
   220  	case Anchor:
   221  		return p.anchoredForwardSearch(it)
   222  	case Backwards, anchorBackwards:
   223  		panic("TODO: implement")
   224  	default:
   225  		panic("unrecognized option")
   226  	}
   227  }
   228  
   229  // TODO:
   230  // - Maybe IndexAll methods (probably not necessary).
   231  // - Some way to match patterns in a Reader (a bit tricky).
   232  // - Some fold transformer that folds text to comparable text, based on the
   233  //   search options. This is a common technique, though very different from the
   234  //   collation-based design of this package. It has a somewhat different use
   235  //   case, so probably makes sense to support both. Should probably be in a
   236  //   different package, though, as it uses completely different kind of tables
   237  //   (based on norm, cases, width and range tables.)
   238
View as plain text