...

Source file src/golang.org/x/text/unicode/norm/normalize.go

Documentation: golang.org/x/text/unicode/norm

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Note: the file data_test.go that is generated should not be checked in.
     6  //go:generate go run maketables.go triegen.go
     7  //go:generate go test -tags test
     8  
     9  // Package norm contains types and functions for normalizing Unicode strings.
    10  package norm // import "golang.org/x/text/unicode/norm"
    11  
    12  import (
    13  	"unicode/utf8"
    14  
    15  	"golang.org/x/text/transform"
    16  )
    17  
    18  // A Form denotes a canonical representation of Unicode code points.
    19  // The Unicode-defined normalization and equivalence forms are:
    20  //
    21  //	NFC   Unicode Normalization Form C
    22  //	NFD   Unicode Normalization Form D
    23  //	NFKC  Unicode Normalization Form KC
    24  //	NFKD  Unicode Normalization Form KD
    25  //
    26  // For a Form f, this documentation uses the notation f(x) to mean
    27  // the bytes or string x converted to the given form.
    28  // A position n in x is called a boundary if conversion to the form can
    29  // proceed independently on both sides:
    30  //
    31  //	f(x) == append(f(x[0:n]), f(x[n:])...)
    32  //
    33  // References: https://unicode.org/reports/tr15/ and
    34  // https://unicode.org/notes/tn5/.
    35  type Form int
    36  
    37  const (
    38  	NFC Form = iota
    39  	NFD
    40  	NFKC
    41  	NFKD
    42  )
    43  
    44  // Bytes returns f(b). May return b if f(b) = b.
    45  func (f Form) Bytes(b []byte) []byte {
    46  	src := inputBytes(b)
    47  	ft := formTable[f]
    48  	n, ok := ft.quickSpan(src, 0, len(b), true)
    49  	if ok {
    50  		return b
    51  	}
    52  	out := make([]byte, n, len(b))
    53  	copy(out, b[0:n])
    54  	rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush}
    55  	return doAppendInner(&rb, n)
    56  }
    57  
    58  // String returns f(s).
    59  func (f Form) String(s string) string {
    60  	src := inputString(s)
    61  	ft := formTable[f]
    62  	n, ok := ft.quickSpan(src, 0, len(s), true)
    63  	if ok {
    64  		return s
    65  	}
    66  	out := make([]byte, n, len(s))
    67  	copy(out, s[0:n])
    68  	rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush}
    69  	return string(doAppendInner(&rb, n))
    70  }
    71  
    72  // IsNormal returns true if b == f(b).
    73  func (f Form) IsNormal(b []byte) bool {
    74  	src := inputBytes(b)
    75  	ft := formTable[f]
    76  	bp, ok := ft.quickSpan(src, 0, len(b), true)
    77  	if ok {
    78  		return true
    79  	}
    80  	rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)}
    81  	rb.setFlusher(nil, cmpNormalBytes)
    82  	for bp < len(b) {
    83  		rb.out = b[bp:]
    84  		if bp = decomposeSegment(&rb, bp, true); bp < 0 {
    85  			return false
    86  		}
    87  		bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true)
    88  	}
    89  	return true
    90  }
    91  
    92  func cmpNormalBytes(rb *reorderBuffer) bool {
    93  	b := rb.out
    94  	for i := 0; i < rb.nrune; i++ {
    95  		info := rb.rune[i]
    96  		if int(info.size) > len(b) {
    97  			return false
    98  		}
    99  		p := info.pos
   100  		pe := p + info.size
   101  		for ; p < pe; p++ {
   102  			if b[0] != rb.byte[p] {
   103  				return false
   104  			}
   105  			b = b[1:]
   106  		}
   107  	}
   108  	return true
   109  }
   110  
   111  // IsNormalString returns true if s == f(s).
   112  func (f Form) IsNormalString(s string) bool {
   113  	src := inputString(s)
   114  	ft := formTable[f]
   115  	bp, ok := ft.quickSpan(src, 0, len(s), true)
   116  	if ok {
   117  		return true
   118  	}
   119  	rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
   120  	rb.setFlusher(nil, func(rb *reorderBuffer) bool {
   121  		for i := 0; i < rb.nrune; i++ {
   122  			info := rb.rune[i]
   123  			if bp+int(info.size) > len(s) {
   124  				return false
   125  			}
   126  			p := info.pos
   127  			pe := p + info.size
   128  			for ; p < pe; p++ {
   129  				if s[bp] != rb.byte[p] {
   130  					return false
   131  				}
   132  				bp++
   133  			}
   134  		}
   135  		return true
   136  	})
   137  	for bp < len(s) {
   138  		if bp = decomposeSegment(&rb, bp, true); bp < 0 {
   139  			return false
   140  		}
   141  		bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
   142  	}
   143  	return true
   144  }
   145  
   146  // patchTail fixes a case where a rune may be incorrectly normalized
   147  // if it is followed by illegal continuation bytes. It returns the
   148  // patched buffer and whether the decomposition is still in progress.
   149  func patchTail(rb *reorderBuffer) bool {
   150  	info, p := lastRuneStart(&rb.f, rb.out)
   151  	if p == -1 || info.size == 0 {
   152  		return true
   153  	}
   154  	end := p + int(info.size)
   155  	extra := len(rb.out) - end
   156  	if extra > 0 {
   157  		// Potentially allocating memory. However, this only
   158  		// happens with ill-formed UTF-8.
   159  		x := make([]byte, 0)
   160  		x = append(x, rb.out[len(rb.out)-extra:]...)
   161  		rb.out = rb.out[:end]
   162  		decomposeToLastBoundary(rb)
   163  		rb.doFlush()
   164  		rb.out = append(rb.out, x...)
   165  		return false
   166  	}
   167  	buf := rb.out[p:]
   168  	rb.out = rb.out[:p]
   169  	decomposeToLastBoundary(rb)
   170  	if s := rb.ss.next(info); s == ssStarter {
   171  		rb.doFlush()
   172  		rb.ss.first(info)
   173  	} else if s == ssOverflow {
   174  		rb.doFlush()
   175  		rb.insertCGJ()
   176  		rb.ss = 0
   177  	}
   178  	rb.insertUnsafe(inputBytes(buf), 0, info)
   179  	return true
   180  }
   181  
   182  func appendQuick(rb *reorderBuffer, i int) int {
   183  	if rb.nsrc == i {
   184  		return i
   185  	}
   186  	end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true)
   187  	rb.out = rb.src.appendSlice(rb.out, i, end)
   188  	return end
   189  }
   190  
   191  // Append returns f(append(out, b...)).
   192  // The buffer out must be nil, empty, or equal to f(out).
   193  func (f Form) Append(out []byte, src ...byte) []byte {
   194  	return f.doAppend(out, inputBytes(src), len(src))
   195  }
   196  
   197  func (f Form) doAppend(out []byte, src input, n int) []byte {
   198  	if n == 0 {
   199  		return out
   200  	}
   201  	ft := formTable[f]
   202  	// Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
   203  	if len(out) == 0 {
   204  		p, _ := ft.quickSpan(src, 0, n, true)
   205  		out = src.appendSlice(out, 0, p)
   206  		if p == n {
   207  			return out
   208  		}
   209  		rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
   210  		return doAppendInner(&rb, p)
   211  	}
   212  	rb := reorderBuffer{f: *ft, src: src, nsrc: n}
   213  	return doAppend(&rb, out, 0)
   214  }
   215  
   216  func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
   217  	rb.setFlusher(out, appendFlush)
   218  	src, n := rb.src, rb.nsrc
   219  	doMerge := len(out) > 0
   220  	if q := src.skipContinuationBytes(p); q > p {
   221  		// Move leading non-starters to destination.
   222  		rb.out = src.appendSlice(rb.out, p, q)
   223  		p = q
   224  		doMerge = patchTail(rb)
   225  	}
   226  	fd := &rb.f
   227  	if doMerge {
   228  		var info Properties
   229  		if p < n {
   230  			info = fd.info(src, p)
   231  			if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
   232  				if p == 0 {
   233  					decomposeToLastBoundary(rb)
   234  				}
   235  				p = decomposeSegment(rb, p, true)
   236  			}
   237  		}
   238  		if info.size == 0 {
   239  			rb.doFlush()
   240  			// Append incomplete UTF-8 encoding.
   241  			return src.appendSlice(rb.out, p, n)
   242  		}
   243  		if rb.nrune > 0 {
   244  			return doAppendInner(rb, p)
   245  		}
   246  	}
   247  	p = appendQuick(rb, p)
   248  	return doAppendInner(rb, p)
   249  }
   250  
   251  func doAppendInner(rb *reorderBuffer, p int) []byte {
   252  	for n := rb.nsrc; p < n; {
   253  		p = decomposeSegment(rb, p, true)
   254  		p = appendQuick(rb, p)
   255  	}
   256  	return rb.out
   257  }
   258  
   259  // AppendString returns f(append(out, []byte(s))).
   260  // The buffer out must be nil, empty, or equal to f(out).
   261  func (f Form) AppendString(out []byte, src string) []byte {
   262  	return f.doAppend(out, inputString(src), len(src))
   263  }
   264  
   265  // QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
   266  // It is not guaranteed to return the largest such n.
   267  func (f Form) QuickSpan(b []byte) int {
   268  	n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
   269  	return n
   270  }
   271  
   272  // Span implements transform.SpanningTransformer. It returns a boundary n such
   273  // that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n.
   274  func (f Form) Span(b []byte, atEOF bool) (n int, err error) {
   275  	n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF)
   276  	if n < len(b) {
   277  		if !ok {
   278  			err = transform.ErrEndOfSpan
   279  		} else {
   280  			err = transform.ErrShortSrc
   281  		}
   282  	}
   283  	return n, err
   284  }
   285  
   286  // SpanString returns a boundary n such that s[0:n] == f(s[0:n]).
   287  // It is not guaranteed to return the largest such n.
   288  func (f Form) SpanString(s string, atEOF bool) (n int, err error) {
   289  	n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF)
   290  	if n < len(s) {
   291  		if !ok {
   292  			err = transform.ErrEndOfSpan
   293  		} else {
   294  			err = transform.ErrShortSrc
   295  		}
   296  	}
   297  	return n, err
   298  }
   299  
   300  // quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
   301  // whether any non-normalized parts were found. If atEOF is false, n will
   302  // not point past the last segment if this segment might be become
   303  // non-normalized by appending other runes.
   304  func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
   305  	var lastCC uint8
   306  	ss := streamSafe(0)
   307  	lastSegStart := i
   308  	for n = end; i < n; {
   309  		if j := src.skipASCII(i, n); i != j {
   310  			i = j
   311  			lastSegStart = i - 1
   312  			lastCC = 0
   313  			ss = 0
   314  			continue
   315  		}
   316  		info := f.info(src, i)
   317  		if info.size == 0 {
   318  			if atEOF {
   319  				// include incomplete runes
   320  				return n, true
   321  			}
   322  			return lastSegStart, true
   323  		}
   324  		// This block needs to be before the next, because it is possible to
   325  		// have an overflow for runes that are starters (e.g. with U+FF9E).
   326  		switch ss.next(info) {
   327  		case ssStarter:
   328  			lastSegStart = i
   329  		case ssOverflow:
   330  			return lastSegStart, false
   331  		case ssSuccess:
   332  			if lastCC > info.ccc {
   333  				return lastSegStart, false
   334  			}
   335  		}
   336  		if f.composing {
   337  			if !info.isYesC() {
   338  				break
   339  			}
   340  		} else {
   341  			if !info.isYesD() {
   342  				break
   343  			}
   344  		}
   345  		lastCC = info.ccc
   346  		i += int(info.size)
   347  	}
   348  	if i == n {
   349  		if !atEOF {
   350  			n = lastSegStart
   351  		}
   352  		return n, true
   353  	}
   354  	return lastSegStart, false
   355  }
   356  
   357  // QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]).
   358  // It is not guaranteed to return the largest such n.
   359  func (f Form) QuickSpanString(s string) int {
   360  	n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
   361  	return n
   362  }
   363  
   364  // FirstBoundary returns the position i of the first boundary in b
   365  // or -1 if b contains no boundary.
   366  func (f Form) FirstBoundary(b []byte) int {
   367  	return f.firstBoundary(inputBytes(b), len(b))
   368  }
   369  
   370  func (f Form) firstBoundary(src input, nsrc int) int {
   371  	i := src.skipContinuationBytes(0)
   372  	if i >= nsrc {
   373  		return -1
   374  	}
   375  	fd := formTable[f]
   376  	ss := streamSafe(0)
   377  	// We should call ss.first here, but we can't as the first rune is
   378  	// skipped already. This means FirstBoundary can't really determine
   379  	// CGJ insertion points correctly. Luckily it doesn't have to.
   380  	for {
   381  		info := fd.info(src, i)
   382  		if info.size == 0 {
   383  			return -1
   384  		}
   385  		if s := ss.next(info); s != ssSuccess {
   386  			return i
   387  		}
   388  		i += int(info.size)
   389  		if i >= nsrc {
   390  			if !info.BoundaryAfter() && !ss.isMax() {
   391  				return -1
   392  			}
   393  			return nsrc
   394  		}
   395  	}
   396  }
   397  
   398  // FirstBoundaryInString returns the position i of the first boundary in s
   399  // or -1 if s contains no boundary.
   400  func (f Form) FirstBoundaryInString(s string) int {
   401  	return f.firstBoundary(inputString(s), len(s))
   402  }
   403  
   404  // NextBoundary reports the index of the boundary between the first and next
   405  // segment in b or -1 if atEOF is false and there are not enough bytes to
   406  // determine this boundary.
   407  func (f Form) NextBoundary(b []byte, atEOF bool) int {
   408  	return f.nextBoundary(inputBytes(b), len(b), atEOF)
   409  }
   410  
   411  // NextBoundaryInString reports the index of the boundary between the first and
   412  // next segment in b or -1 if atEOF is false and there are not enough bytes to
   413  // determine this boundary.
   414  func (f Form) NextBoundaryInString(s string, atEOF bool) int {
   415  	return f.nextBoundary(inputString(s), len(s), atEOF)
   416  }
   417  
   418  func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int {
   419  	if nsrc == 0 {
   420  		if atEOF {
   421  			return 0
   422  		}
   423  		return -1
   424  	}
   425  	fd := formTable[f]
   426  	info := fd.info(src, 0)
   427  	if info.size == 0 {
   428  		if atEOF {
   429  			return 1
   430  		}
   431  		return -1
   432  	}
   433  	ss := streamSafe(0)
   434  	ss.first(info)
   435  
   436  	for i := int(info.size); i < nsrc; i += int(info.size) {
   437  		info = fd.info(src, i)
   438  		if info.size == 0 {
   439  			if atEOF {
   440  				return i
   441  			}
   442  			return -1
   443  		}
   444  		// TODO: Using streamSafe to determine the boundary isn't the same as
   445  		// using BoundaryBefore. Determine which should be used.
   446  		if s := ss.next(info); s != ssSuccess {
   447  			return i
   448  		}
   449  	}
   450  	if !atEOF && !info.BoundaryAfter() && !ss.isMax() {
   451  		return -1
   452  	}
   453  	return nsrc
   454  }
   455  
   456  // LastBoundary returns the position i of the last boundary in b
   457  // or -1 if b contains no boundary.
   458  func (f Form) LastBoundary(b []byte) int {
   459  	return lastBoundary(formTable[f], b)
   460  }
   461  
   462  func lastBoundary(fd *formInfo, b []byte) int {
   463  	i := len(b)
   464  	info, p := lastRuneStart(fd, b)
   465  	if p == -1 {
   466  		return -1
   467  	}
   468  	if info.size == 0 { // ends with incomplete rune
   469  		if p == 0 { // starts with incomplete rune
   470  			return -1
   471  		}
   472  		i = p
   473  		info, p = lastRuneStart(fd, b[:i])
   474  		if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
   475  			return i
   476  		}
   477  	}
   478  	if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
   479  		return i
   480  	}
   481  	if info.BoundaryAfter() {
   482  		return i
   483  	}
   484  	ss := streamSafe(0)
   485  	v := ss.backwards(info)
   486  	for i = p; i >= 0 && v != ssStarter; i = p {
   487  		info, p = lastRuneStart(fd, b[:i])
   488  		if v = ss.backwards(info); v == ssOverflow {
   489  			break
   490  		}
   491  		if p+int(info.size) != i {
   492  			if p == -1 { // no boundary found
   493  				return -1
   494  			}
   495  			return i // boundary after an illegal UTF-8 encoding
   496  		}
   497  	}
   498  	return i
   499  }
   500  
   501  // decomposeSegment scans the first segment in src into rb. It inserts 0x034f
   502  // (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
   503  // and returns the number of bytes consumed from src or iShortDst or iShortSrc.
   504  func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
   505  	// Force one character to be consumed.
   506  	info := rb.f.info(rb.src, sp)
   507  	if info.size == 0 {
   508  		return 0
   509  	}
   510  	if s := rb.ss.next(info); s == ssStarter {
   511  		// TODO: this could be removed if we don't support merging.
   512  		if rb.nrune > 0 {
   513  			goto end
   514  		}
   515  	} else if s == ssOverflow {
   516  		rb.insertCGJ()
   517  		goto end
   518  	}
   519  	if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
   520  		return int(err)
   521  	}
   522  	for {
   523  		sp += int(info.size)
   524  		if sp >= rb.nsrc {
   525  			if !atEOF && !info.BoundaryAfter() {
   526  				return int(iShortSrc)
   527  			}
   528  			break
   529  		}
   530  		info = rb.f.info(rb.src, sp)
   531  		if info.size == 0 {
   532  			if !atEOF {
   533  				return int(iShortSrc)
   534  			}
   535  			break
   536  		}
   537  		if s := rb.ss.next(info); s == ssStarter {
   538  			break
   539  		} else if s == ssOverflow {
   540  			rb.insertCGJ()
   541  			break
   542  		}
   543  		if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
   544  			return int(err)
   545  		}
   546  	}
   547  end:
   548  	if !rb.doFlush() {
   549  		return int(iShortDst)
   550  	}
   551  	return sp
   552  }
   553  
   554  // lastRuneStart returns the runeInfo and position of the last
   555  // rune in buf or the zero runeInfo and -1 if no rune was found.
   556  func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
   557  	p := len(buf) - 1
   558  	for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
   559  	}
   560  	if p < 0 {
   561  		return Properties{}, -1
   562  	}
   563  	return fd.info(inputBytes(buf), p), p
   564  }
   565  
   566  // decomposeToLastBoundary finds an open segment at the end of the buffer
   567  // and scans it into rb. Returns the buffer minus the last segment.
   568  func decomposeToLastBoundary(rb *reorderBuffer) {
   569  	fd := &rb.f
   570  	info, i := lastRuneStart(fd, rb.out)
   571  	if int(info.size) != len(rb.out)-i {
   572  		// illegal trailing continuation bytes
   573  		return
   574  	}
   575  	if info.BoundaryAfter() {
   576  		return
   577  	}
   578  	var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
   579  	padd := 0
   580  	ss := streamSafe(0)
   581  	p := len(rb.out)
   582  	for {
   583  		add[padd] = info
   584  		v := ss.backwards(info)
   585  		if v == ssOverflow {
   586  			// Note that if we have an overflow, it the string we are appending to
   587  			// is not correctly normalized. In this case the behavior is undefined.
   588  			break
   589  		}
   590  		padd++
   591  		p -= int(info.size)
   592  		if v == ssStarter || p < 0 {
   593  			break
   594  		}
   595  		info, i = lastRuneStart(fd, rb.out[:p])
   596  		if int(info.size) != p-i {
   597  			break
   598  		}
   599  	}
   600  	rb.ss = ss
   601  	// Copy bytes for insertion as we may need to overwrite rb.out.
   602  	var buf [maxBufferSize * utf8.UTFMax]byte
   603  	cp := buf[:copy(buf[:], rb.out[p:])]
   604  	rb.out = rb.out[:p]
   605  	for padd--; padd >= 0; padd-- {
   606  		info = add[padd]
   607  		rb.insertUnsafe(inputBytes(cp), 0, info)
   608  		cp = cp[info.size:]
   609  	}
   610  }
   611  

View as plain text