...

Source file src/golang.org/x/text/internal/colltab/contract.go

Documentation: golang.org/x/text/internal/colltab

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package colltab
     6  
     7  import "unicode/utf8"
     8  
     9  // For a description of ContractTrieSet, see text/collate/build/contract.go.
    10  
    11  type ContractTrieSet []struct{ L, H, N, I uint8 }
    12  
    13  // ctScanner is used to match a trie to an input sequence.
    14  // A contraction may match a non-contiguous sequence of bytes in an input string.
    15  // For example, if there is a contraction for <a, combining_ring>, it should match
    16  // the sequence <a, combining_cedilla, combining_ring>, as combining_cedilla does
    17  // not block combining_ring.
    18  // ctScanner does not automatically skip over non-blocking non-starters, but rather
    19  // retains the state of the last match and leaves it up to the user to continue
    20  // the match at the appropriate points.
    21  type ctScanner struct {
    22  	states ContractTrieSet
    23  	s      []byte
    24  	n      int
    25  	index  int
    26  	pindex int
    27  	done   bool
    28  }
    29  
    30  type ctScannerString struct {
    31  	states ContractTrieSet
    32  	s      string
    33  	n      int
    34  	index  int
    35  	pindex int
    36  	done   bool
    37  }
    38  
    39  func (t ContractTrieSet) scanner(index, n int, b []byte) ctScanner {
    40  	return ctScanner{s: b, states: t[index:], n: n}
    41  }
    42  
    43  func (t ContractTrieSet) scannerString(index, n int, str string) ctScannerString {
    44  	return ctScannerString{s: str, states: t[index:], n: n}
    45  }
    46  
    47  // result returns the offset i and bytes consumed p so far.  If no suffix
    48  // matched, i and p will be 0.
    49  func (s *ctScanner) result() (i, p int) {
    50  	return s.index, s.pindex
    51  }
    52  
    53  func (s *ctScannerString) result() (i, p int) {
    54  	return s.index, s.pindex
    55  }
    56  
    57  const (
    58  	final   = 0
    59  	noIndex = 0xFF
    60  )
    61  
    62  // scan matches the longest suffix at the current location in the input
    63  // and returns the number of bytes consumed.
    64  func (s *ctScanner) scan(p int) int {
    65  	pr := p // the p at the rune start
    66  	str := s.s
    67  	states, n := s.states, s.n
    68  	for i := 0; i < n && p < len(str); {
    69  		e := states[i]
    70  		c := str[p]
    71  		// TODO: a significant number of contractions are of a form that
    72  		// cannot match discontiguous UTF-8 in a normalized string. We could let
    73  		// a negative value of e.n mean that we can set s.done = true and avoid
    74  		// the need for additional matches.
    75  		if c >= e.L {
    76  			if e.L == c {
    77  				p++
    78  				if e.I != noIndex {
    79  					s.index = int(e.I)
    80  					s.pindex = p
    81  				}
    82  				if e.N != final {
    83  					i, states, n = 0, states[int(e.H)+n:], int(e.N)
    84  					if p >= len(str) || utf8.RuneStart(str[p]) {
    85  						s.states, s.n, pr = states, n, p
    86  					}
    87  				} else {
    88  					s.done = true
    89  					return p
    90  				}
    91  				continue
    92  			} else if e.N == final && c <= e.H {
    93  				p++
    94  				s.done = true
    95  				s.index = int(c-e.L) + int(e.I)
    96  				s.pindex = p
    97  				return p
    98  			}
    99  		}
   100  		i++
   101  	}
   102  	return pr
   103  }
   104  
   105  // scan is a verbatim copy of ctScanner.scan.
   106  func (s *ctScannerString) scan(p int) int {
   107  	pr := p // the p at the rune start
   108  	str := s.s
   109  	states, n := s.states, s.n
   110  	for i := 0; i < n && p < len(str); {
   111  		e := states[i]
   112  		c := str[p]
   113  		// TODO: a significant number of contractions are of a form that
   114  		// cannot match discontiguous UTF-8 in a normalized string. We could let
   115  		// a negative value of e.n mean that we can set s.done = true and avoid
   116  		// the need for additional matches.
   117  		if c >= e.L {
   118  			if e.L == c {
   119  				p++
   120  				if e.I != noIndex {
   121  					s.index = int(e.I)
   122  					s.pindex = p
   123  				}
   124  				if e.N != final {
   125  					i, states, n = 0, states[int(e.H)+n:], int(e.N)
   126  					if p >= len(str) || utf8.RuneStart(str[p]) {
   127  						s.states, s.n, pr = states, n, p
   128  					}
   129  				} else {
   130  					s.done = true
   131  					return p
   132  				}
   133  				continue
   134  			} else if e.N == final && c <= e.H {
   135  				p++
   136  				s.done = true
   137  				s.index = int(c-e.L) + int(e.I)
   138  				s.pindex = p
   139  				return p
   140  			}
   141  		}
   142  		i++
   143  	}
   144  	return pr
   145  }
   146  

View as plain text