...

Source file src/golang.org/x/text/unicode/norm/iter.go

Documentation: golang.org/x/text/unicode/norm

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package norm
     6  
     7  import (
     8  	"fmt"
     9  	"unicode/utf8"
    10  )
    11  
// MaxSegmentSize is the maximum size of a byte buffer needed to consider any
// sequence of starter and non-starter runes for the purpose of normalization.
// It is the exported name for the package-internal maxByteBufferSize, which
// is also the size of the scratch buffer embedded in Iter.
const MaxSegmentSize = maxByteBufferSize
    15  
// An Iter iterates over a string or byte slice, while normalizing it
// to a given Form.
//
// An Iter must be set up with Init or InitString before Next is called.
// Segments returned by Next may alias the Iter's internal buffer, so a
// returned slice should be consumed or copied before the following call.
type Iter struct {
	rb     reorderBuffer            // holds the input source, its length and form; also used to reorder and compose runes
	buf    [maxByteBufferSize]byte  // scratch space backing segments that cannot be returned as a slice of the input
	info   Properties               // first character saved from previous iteration
	next   iterFunc                 // implementation of next depends on form
	asciiF iterFunc                 // ASCII fast path matching the input kind (nextASCIIBytes or nextASCIIString)

	p        int    // current position in input source
	multiSeg []byte // remainder of multi-segment decomposition
}
    28  
// iterFunc is one state of the iterator: it produces the next segment for the
// given Iter and may replace i.next to select the state used by the following
// call to Next.
type iterFunc func(*Iter) []byte
    30  
    31  // Init initializes i to iterate over src after normalizing it to Form f.
    32  func (i *Iter) Init(f Form, src []byte) {
    33  	i.p = 0
    34  	if len(src) == 0 {
    35  		i.setDone()
    36  		i.rb.nsrc = 0
    37  		return
    38  	}
    39  	i.multiSeg = nil
    40  	i.rb.init(f, src)
    41  	i.next = i.rb.f.nextMain
    42  	i.asciiF = nextASCIIBytes
    43  	i.info = i.rb.f.info(i.rb.src, i.p)
    44  	i.rb.ss.first(i.info)
    45  }
    46  
    47  // InitString initializes i to iterate over src after normalizing it to Form f.
    48  func (i *Iter) InitString(f Form, src string) {
    49  	i.p = 0
    50  	if len(src) == 0 {
    51  		i.setDone()
    52  		i.rb.nsrc = 0
    53  		return
    54  	}
    55  	i.multiSeg = nil
    56  	i.rb.initString(f, src)
    57  	i.next = i.rb.f.nextMain
    58  	i.asciiF = nextASCIIString
    59  	i.info = i.rb.f.info(i.rb.src, i.p)
    60  	i.rb.ss.first(i.info)
    61  }
    62  
    63  // Seek sets the segment to be returned by the next call to Next to start
    64  // at position p.  It is the responsibility of the caller to set p to the
    65  // start of a segment.
    66  func (i *Iter) Seek(offset int64, whence int) (int64, error) {
    67  	var abs int64
    68  	switch whence {
    69  	case 0:
    70  		abs = offset
    71  	case 1:
    72  		abs = int64(i.p) + offset
    73  	case 2:
    74  		abs = int64(i.rb.nsrc) + offset
    75  	default:
    76  		return 0, fmt.Errorf("norm: invalid whence")
    77  	}
    78  	if abs < 0 {
    79  		return 0, fmt.Errorf("norm: negative position")
    80  	}
    81  	if int(abs) >= i.rb.nsrc {
    82  		i.setDone()
    83  		return int64(i.p), nil
    84  	}
    85  	i.p = int(abs)
    86  	i.multiSeg = nil
    87  	i.next = i.rb.f.nextMain
    88  	i.info = i.rb.f.info(i.rb.src, i.p)
    89  	i.rb.ss.first(i.info)
    90  	return abs, nil
    91  }
    92  
    93  // returnSlice returns a slice of the underlying input type as a byte slice.
    94  // If the underlying is of type []byte, it will simply return a slice.
    95  // If the underlying is of type string, it will copy the slice to the buffer
    96  // and return that.
    97  func (i *Iter) returnSlice(a, b int) []byte {
    98  	if i.rb.src.bytes == nil {
    99  		return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
   100  	}
   101  	return i.rb.src.bytes[a:b]
   102  }
   103  
   104  // Pos returns the byte position at which the next call to Next will commence processing.
   105  func (i *Iter) Pos() int {
   106  	return i.p
   107  }
   108  
   109  func (i *Iter) setDone() {
   110  	i.next = nextDone
   111  	i.p = i.rb.nsrc
   112  }
   113  
   114  // Done returns true if there is no more input to process.
   115  func (i *Iter) Done() bool {
   116  	return i.p >= i.rb.nsrc
   117  }
   118  
   119  // Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
   120  // For any input a and b for which f(a) == f(b), subsequent calls
   121  // to Next will return the same segments.
   122  // Modifying runes are grouped together with the preceding starter, if such a starter exists.
   123  // Although not guaranteed, n will typically be the smallest possible n.
   124  func (i *Iter) Next() []byte {
   125  	return i.next(i)
   126  }
   127  
   128  func nextASCIIBytes(i *Iter) []byte {
   129  	p := i.p + 1
   130  	if p >= i.rb.nsrc {
   131  		p0 := i.p
   132  		i.setDone()
   133  		return i.rb.src.bytes[p0:p]
   134  	}
   135  	if i.rb.src.bytes[p] < utf8.RuneSelf {
   136  		p0 := i.p
   137  		i.p = p
   138  		return i.rb.src.bytes[p0:p]
   139  	}
   140  	i.info = i.rb.f.info(i.rb.src, i.p)
   141  	i.next = i.rb.f.nextMain
   142  	return i.next(i)
   143  }
   144  
   145  func nextASCIIString(i *Iter) []byte {
   146  	p := i.p + 1
   147  	if p >= i.rb.nsrc {
   148  		i.buf[0] = i.rb.src.str[i.p]
   149  		i.setDone()
   150  		return i.buf[:1]
   151  	}
   152  	if i.rb.src.str[p] < utf8.RuneSelf {
   153  		i.buf[0] = i.rb.src.str[i.p]
   154  		i.p = p
   155  		return i.buf[:1]
   156  	}
   157  	i.info = i.rb.f.info(i.rb.src, i.p)
   158  	i.next = i.rb.f.nextMain
   159  	return i.next(i)
   160  }
   161  
   162  func nextHangul(i *Iter) []byte {
   163  	p := i.p
   164  	next := p + hangulUTF8Size
   165  	if next >= i.rb.nsrc {
   166  		i.setDone()
   167  	} else if i.rb.src.hangul(next) == 0 {
   168  		i.rb.ss.next(i.info)
   169  		i.info = i.rb.f.info(i.rb.src, i.p)
   170  		i.next = i.rb.f.nextMain
   171  		return i.next(i)
   172  	}
   173  	i.p = next
   174  	return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))]
   175  }
   176  
   177  func nextDone(i *Iter) []byte {
   178  	return nil
   179  }
   180  
   181  // nextMulti is used for iterating over multi-segment decompositions
   182  // for decomposing normal forms.
   183  func nextMulti(i *Iter) []byte {
   184  	j := 0
   185  	d := i.multiSeg
   186  	// skip first rune
   187  	for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
   188  	}
   189  	for j < len(d) {
   190  		info := i.rb.f.info(input{bytes: d}, j)
   191  		if info.BoundaryBefore() {
   192  			i.multiSeg = d[j:]
   193  			return d[:j]
   194  		}
   195  		j += int(info.size)
   196  	}
   197  	// treat last segment as normal decomposition
   198  	i.next = i.rb.f.nextMain
   199  	return i.next(i)
   200  }
   201  
   202  // nextMultiNorm is used for iterating over multi-segment decompositions
   203  // for composing normal forms.
   204  func nextMultiNorm(i *Iter) []byte {
   205  	j := 0
   206  	d := i.multiSeg
   207  	for j < len(d) {
   208  		info := i.rb.f.info(input{bytes: d}, j)
   209  		if info.BoundaryBefore() {
   210  			i.rb.compose()
   211  			seg := i.buf[:i.rb.flushCopy(i.buf[:])]
   212  			i.rb.insertUnsafe(input{bytes: d}, j, info)
   213  			i.multiSeg = d[j+int(info.size):]
   214  			return seg
   215  		}
   216  		i.rb.insertUnsafe(input{bytes: d}, j, info)
   217  		j += int(info.size)
   218  	}
   219  	i.multiSeg = nil
   220  	i.next = nextComposed
   221  	return doNormComposed(i)
   222  }
   223  
// nextDecomposed is the implementation of Next for forms NFD and NFKD.
//
// It walks the input from the current position, building one output segment.
// Runes that need no change are only counted (outp) and copied lazily, so a
// segment that needs no rewriting can be returned straight from the input via
// returnSlice. As soon as a rune is found whose combining class (ccc) is lower
// than the trailing class of what precedes it, the work done so far is moved
// into the reorder buffer and doNormDecomposed finishes the segment.
func nextDecomposed(i *Iter) (next []byte) {
	// outp is the number of output bytes accounted for so far.
	outp := 0
	// [inCopyStart, i.p) is the input range that still has to be copied
	// verbatim to i.buf[outCopyStart:]. outCopyStart stays 0 for as long as
	// nothing has been written to i.buf.
	inCopyStart, outCopyStart := i.p, 0
	for {
		if sz := int(i.info.size); sz <= 1 {
			i.rb.ss = 0
			p := i.p
			i.p++ // ASCII or illegal byte.  Either way, advance by 1.
			if i.p >= i.rb.nsrc {
				i.setDone()
				return i.returnSlice(p, i.p)
			} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
				// Next byte is ASCII as well: continue on the ASCII fast path.
				i.next = i.asciiF
				return i.returnSlice(p, i.p)
			}
			outp++
		} else if d := i.info.Decomposition(); d != nil {
			// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
			// Case 1: there is a leftover to copy.  In this case the decomposition
			// must begin with a modifier and should always be appended.
			// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
			p := outp + len(d)
			if outp > 0 {
				// Flush the pending verbatim input in front of the decomposition.
				i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
				// TODO: this condition should not be possible, but we leave it
				// in for defensive purposes.
				if p > len(i.buf) {
					return i.buf[:outp]
				}
			} else if i.info.multiSegment() {
				// outp must be 0 as multi-segment decompositions always
				// start a new segment.
				if i.multiSeg == nil {
					// First visit: let nextMulti hand out the leading segments.
					i.multiSeg = d
					i.next = nextMulti
					return nextMulti(i)
				}
				// We are in the last segment.  Treat as normal decomposition.
				d = i.multiSeg
				i.multiSeg = nil
				p = len(d)
			}
			prevCC := i.info.tccc
			if i.p += sz; i.p >= i.rb.nsrc {
				i.setDone()
				i.info = Properties{} // Force BoundaryBefore to succeed.
			} else {
				i.info = i.rb.f.info(i.rb.src, i.p)
			}
			switch i.rb.ss.next(i.info) {
			case ssOverflow:
				// Too many non-starters: the next call inserts a CGJ first.
				i.next = nextCGJDecompose
				fallthrough
			case ssStarter:
				// The segment ends here. Return d directly when nothing
				// precedes it, otherwise append it to what is in i.buf.
				if outp > 0 {
					copy(i.buf[outp:], d)
					return i.buf[:p]
				}
				return d
			}
			// The segment continues: append d and restart the lazy-copy window.
			copy(i.buf[outp:], d)
			outp = p
			inCopyStart, outCopyStart = i.p, outp
			if i.info.ccc < prevCC {
				// Out-of-order combining class: reordering is required.
				goto doNorm
			}
			continue
		} else if r := i.rb.src.hangul(i.p); r != 0 {
			// Hangul syllable: decompose it directly into i.buf.
			outp = decomposeHangul(i.buf[:], r)
			i.p += hangulUTF8Size
			inCopyStart, outCopyStart = i.p, outp
			if i.p >= i.rb.nsrc {
				i.setDone()
				break
			} else if i.rb.src.hangul(i.p) != 0 {
				// Another Hangul syllable follows: use the dedicated function.
				i.next = nextHangul
				return i.buf[:outp]
			}
		} else {
			// Rune without a decomposition: account for it now, copy it later.
			p := outp + sz
			if p > len(i.buf) {
				break
			}
			outp = p
			i.p += sz
		}
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		prevCC := i.info.tccc
		i.info = i.rb.f.info(i.rb.src, i.p)
		if v := i.rb.ss.next(i.info); v == ssStarter {
			break
		} else if v == ssOverflow {
			i.next = nextCGJDecompose
			break
		}
		if i.info.ccc < prevCC {
			// Out-of-order combining class: reordering is required.
			goto doNorm
		}
	}
	if outCopyStart == 0 {
		// Nothing was written to i.buf: the segment is a plain input range.
		return i.returnSlice(inCopyStart, i.p)
	} else if inCopyStart < i.p {
		// Append the verbatim tail that has not been copied yet.
		i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	}
	return i.buf[:outp]
doNorm:
	// Insert what we have decomposed so far in the reorderBuffer.
	// As we will only reorder, there will always be enough room.
	i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	i.rb.insertDecomposed(i.buf[0:outp])
	return doNormDecomposed(i)
}
   340  
   341  func doNormDecomposed(i *Iter) []byte {
   342  	for {
   343  		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
   344  		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
   345  			i.setDone()
   346  			break
   347  		}
   348  		i.info = i.rb.f.info(i.rb.src, i.p)
   349  		if i.info.ccc == 0 {
   350  			break
   351  		}
   352  		if s := i.rb.ss.next(i.info); s == ssOverflow {
   353  			i.next = nextCGJDecompose
   354  			break
   355  		}
   356  	}
   357  	// new segment or too many combining characters: exit normalization
   358  	return i.buf[:i.rb.flushCopy(i.buf[:])]
   359  }
   360  
   361  func nextCGJDecompose(i *Iter) []byte {
   362  	i.rb.ss = 0
   363  	i.rb.insertCGJ()
   364  	i.next = nextDecomposed
   365  	i.rb.ss.first(i.info)
   366  	buf := doNormDecomposed(i)
   367  	return buf
   368  }
   369  
// nextComposed is the implementation of Next for forms NFC and NFKC.
//
// It first tries the quick path: as long as every rune of the segment passes
// isYesC and combining classes do not decrease, the segment is returned as an
// unmodified range of the input. Otherwise it rewinds to the start of the
// segment and normalizes it through the reorder buffer.
func nextComposed(i *Iter) []byte {
	// outp only tracks the would-be output size so the segment never exceeds
	// i.buf; startp remembers where this segment begins.
	outp, startp := 0, i.p
	var prevCC uint8
	for {
		if !i.info.isYesC() {
			goto doNorm
		}
		prevCC = i.info.tccc
		sz := int(i.info.size)
		if sz == 0 {
			sz = 1 // illegal rune: copy byte-by-byte
		}
		p := outp + sz
		if p > len(i.buf) {
			break
		}
		outp = p
		i.p += sz
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
			// ASCII follows: end the segment and switch to the ASCII fast path.
			i.rb.ss = 0
			i.next = i.asciiF
			break
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
		if v := i.rb.ss.next(i.info); v == ssStarter {
			break
		} else if v == ssOverflow {
			// Too many non-starters: the next call inserts a CGJ first.
			i.next = nextCGJCompose
			break
		}
		if i.info.ccc < prevCC {
			// Out-of-order combining class: reordering is required.
			goto doNorm
		}
	}
	// Quick path succeeded: the input range is returned unchanged.
	return i.returnSlice(startp, i.p)
doNorm:
	// reset to start position
	i.p = startp
	i.info = i.rb.f.info(i.rb.src, i.p)
	i.rb.ss.first(i.info)
	if i.info.multiSegment() {
		// Seed the reorder buffer with the first rune of the decomposition and
		// let nextMultiNorm work through the remainder.
		d := i.info.Decomposition()
		info := i.rb.f.info(input{bytes: d}, 0)
		i.rb.insertUnsafe(input{bytes: d}, 0, info)
		i.multiSeg = d[int(info.size):]
		i.next = nextMultiNorm
		return nextMultiNorm(i)
	}
	// NOTE(review): ss.first was already called above with the same i.info;
	// this second call looks redundant — confirm before removing.
	i.rb.ss.first(i.info)
	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
	return doNormComposed(i)
}
   426  
   427  func doNormComposed(i *Iter) []byte {
   428  	// First rune should already be inserted.
   429  	for {
   430  		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
   431  			i.setDone()
   432  			break
   433  		}
   434  		i.info = i.rb.f.info(i.rb.src, i.p)
   435  		if s := i.rb.ss.next(i.info); s == ssStarter {
   436  			break
   437  		} else if s == ssOverflow {
   438  			i.next = nextCGJCompose
   439  			break
   440  		}
   441  		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
   442  	}
   443  	i.rb.compose()
   444  	seg := i.buf[:i.rb.flushCopy(i.buf[:])]
   445  	return seg
   446  }
   447  
   448  func nextCGJCompose(i *Iter) []byte {
   449  	i.rb.ss = 0 // instead of first
   450  	i.rb.insertCGJ()
   451  	i.next = nextComposed
   452  	// Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
   453  	// even if they are not. This is particularly dubious for U+FF9E and UFF9A.
   454  	// If we ever change that, insert a check here.
   455  	i.rb.ss.first(i.info)
   456  	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
   457  	return doNormComposed(i)
   458  }
   459  

View as plain text