...

Source file src/golang.org/x/text/runes/runes.go

Documentation: golang.org/x/text/runes

     1  // Copyright 2014 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package runes provides transforms for UTF-8 encoded text.
     6  package runes // import "golang.org/x/text/runes"
     7  
     8  import (
     9  	"unicode"
    10  	"unicode/utf8"
    11  
    12  	"golang.org/x/text/transform"
    13  )
    14  
// A Set is a collection of runes. Membership is the only operation it
// exposes; a Set cannot be enumerated.
type Set interface {
	// Contains reports whether r is a member of the set.
	Contains(r rune) bool
}
    20  
    21  type setFunc func(rune) bool
    22  
    23  func (s setFunc) Contains(r rune) bool {
    24  	return s(r)
    25  }
    26  
    27  // Note: using funcs here instead of wrapping types results in cleaner
    28  // documentation and a smaller API.
    29  
    30  // In creates a Set with a Contains method that returns true for all runes in
    31  // the given RangeTable.
    32  func In(rt *unicode.RangeTable) Set {
    33  	return setFunc(func(r rune) bool { return unicode.Is(rt, r) })
    34  }
    35  
    36  // NotIn creates a Set with a Contains method that returns true for all runes not
    37  // in the given RangeTable.
    38  func NotIn(rt *unicode.RangeTable) Set {
    39  	return setFunc(func(r rune) bool { return !unicode.Is(rt, r) })
    40  }
    41  
    42  // Predicate creates a Set with a Contains method that returns f(r).
    43  func Predicate(f func(rune) bool) Set {
    44  	return setFunc(f)
    45  }
    46  
// Transformer implements the transform.Transformer interface.
// It is a thin wrapper whose methods forward to the wrapped
// transform.SpanningTransformer.
type Transformer struct {
	// t is the underlying transformer that all methods delegate to.
	t transform.SpanningTransformer
}
    51  
    52  func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
    53  	return t.t.Transform(dst, src, atEOF)
    54  }
    55  
    56  func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) {
    57  	return t.t.Span(b, atEOF)
    58  }
    59  
    60  func (t Transformer) Reset() { t.t.Reset() }
    61  
    62  // Bytes returns a new byte slice with the result of converting b using t.  It
    63  // calls Reset on t. It returns nil if any error was found. This can only happen
    64  // if an error-producing Transformer is passed to If.
    65  func (t Transformer) Bytes(b []byte) []byte {
    66  	b, _, err := transform.Bytes(t, b)
    67  	if err != nil {
    68  		return nil
    69  	}
    70  	return b
    71  }
    72  
    73  // String returns a string with the result of converting s using t. It calls
    74  // Reset on t. It returns the empty string if any error was found. This can only
    75  // happen if an error-producing Transformer is passed to If.
    76  func (t Transformer) String(s string) string {
    77  	s, _, err := transform.String(t, s)
    78  	if err != nil {
    79  		return ""
    80  	}
    81  	return s
    82  }
    83  
    84  // TODO:
    85  // - Copy: copying strings and bytes in whole-rune units.
    86  // - Validation (maybe)
    87  // - Well-formed-ness (maybe)
    88  
// runeErrorString is the UTF-8 encoding of utf8.RuneError. The transformers
// in this file write its three bytes to the output in place of an ill-formed
// input byte.
const runeErrorString = string(utf8.RuneError)
    90  
    91  // Remove returns a Transformer that removes runes r for which s.Contains(r).
    92  // Illegal input bytes are replaced by RuneError before being passed to f.
    93  func Remove(s Set) Transformer {
    94  	if f, ok := s.(setFunc); ok {
    95  		// This little trick cuts the running time of BenchmarkRemove for sets
    96  		// created by Predicate roughly in half.
    97  		// TODO: special-case RangeTables as well.
    98  		return Transformer{remove(f)}
    99  	}
   100  	return Transformer{remove(s.Contains)}
   101  }
   102  
   103  // TODO: remove transform.RemoveFunc.
   104  
// remove is a transformer backed by a predicate: runes for which the function
// returns true are omitted from the output.
type remove func(r rune) bool

// Reset is a no-op; remove carries no state between calls.
func (remove) Reset() {}
   108  
   109  // Span implements transform.Spanner.
   110  func (t remove) Span(src []byte, atEOF bool) (n int, err error) {
   111  	for r, size := rune(0), 0; n < len(src); {
   112  		if r = rune(src[n]); r < utf8.RuneSelf {
   113  			size = 1
   114  		} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
   115  			// Invalid rune.
   116  			if !atEOF && !utf8.FullRune(src[n:]) {
   117  				err = transform.ErrShortSrc
   118  			} else {
   119  				err = transform.ErrEndOfSpan
   120  			}
   121  			break
   122  		}
   123  		if t(r) {
   124  			err = transform.ErrEndOfSpan
   125  			break
   126  		}
   127  		n += size
   128  	}
   129  	return
   130  }
   131  
   132  // Transform implements transform.Transformer.
   133  func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   134  	for r, size := rune(0), 0; nSrc < len(src); {
   135  		if r = rune(src[nSrc]); r < utf8.RuneSelf {
   136  			size = 1
   137  		} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
   138  			// Invalid rune.
   139  			if !atEOF && !utf8.FullRune(src[nSrc:]) {
   140  				err = transform.ErrShortSrc
   141  				break
   142  			}
   143  			// We replace illegal bytes with RuneError. Not doing so might
   144  			// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
   145  			// The resulting byte sequence may subsequently contain runes
   146  			// for which t(r) is true that were passed unnoticed.
   147  			if !t(utf8.RuneError) {
   148  				if nDst+3 > len(dst) {
   149  					err = transform.ErrShortDst
   150  					break
   151  				}
   152  				dst[nDst+0] = runeErrorString[0]
   153  				dst[nDst+1] = runeErrorString[1]
   154  				dst[nDst+2] = runeErrorString[2]
   155  				nDst += 3
   156  			}
   157  			nSrc++
   158  			continue
   159  		}
   160  		if t(r) {
   161  			nSrc += size
   162  			continue
   163  		}
   164  		if nDst+size > len(dst) {
   165  			err = transform.ErrShortDst
   166  			break
   167  		}
   168  		for i := 0; i < size; i++ {
   169  			dst[nDst] = src[nSrc]
   170  			nDst++
   171  			nSrc++
   172  		}
   173  	}
   174  	return
   175  }
   176  
   177  // Map returns a Transformer that maps the runes in the input using the given
   178  // mapping. Illegal bytes in the input are converted to utf8.RuneError before
   179  // being passed to the mapping func.
   180  func Map(mapping func(rune) rune) Transformer {
   181  	return Transformer{mapper(mapping)}
   182  }
   183  
// mapper is a transformer backed by a rune-to-rune mapping function.
type mapper func(rune) rune

// Reset is a no-op; mapper carries no state between calls.
func (mapper) Reset() {}
   187  
   188  // Span implements transform.Spanner.
   189  func (t mapper) Span(src []byte, atEOF bool) (n int, err error) {
   190  	for r, size := rune(0), 0; n < len(src); n += size {
   191  		if r = rune(src[n]); r < utf8.RuneSelf {
   192  			size = 1
   193  		} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
   194  			// Invalid rune.
   195  			if !atEOF && !utf8.FullRune(src[n:]) {
   196  				err = transform.ErrShortSrc
   197  			} else {
   198  				err = transform.ErrEndOfSpan
   199  			}
   200  			break
   201  		}
   202  		if t(r) != r {
   203  			err = transform.ErrEndOfSpan
   204  			break
   205  		}
   206  	}
   207  	return n, err
   208  }
   209  
   210  // Transform implements transform.Transformer.
   211  func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   212  	var replacement rune
   213  	var b [utf8.UTFMax]byte
   214  
   215  	for r, size := rune(0), 0; nSrc < len(src); {
   216  		if r = rune(src[nSrc]); r < utf8.RuneSelf {
   217  			if replacement = t(r); replacement < utf8.RuneSelf {
   218  				if nDst == len(dst) {
   219  					err = transform.ErrShortDst
   220  					break
   221  				}
   222  				dst[nDst] = byte(replacement)
   223  				nDst++
   224  				nSrc++
   225  				continue
   226  			}
   227  			size = 1
   228  		} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
   229  			// Invalid rune.
   230  			if !atEOF && !utf8.FullRune(src[nSrc:]) {
   231  				err = transform.ErrShortSrc
   232  				break
   233  			}
   234  
   235  			if replacement = t(utf8.RuneError); replacement == utf8.RuneError {
   236  				if nDst+3 > len(dst) {
   237  					err = transform.ErrShortDst
   238  					break
   239  				}
   240  				dst[nDst+0] = runeErrorString[0]
   241  				dst[nDst+1] = runeErrorString[1]
   242  				dst[nDst+2] = runeErrorString[2]
   243  				nDst += 3
   244  				nSrc++
   245  				continue
   246  			}
   247  		} else if replacement = t(r); replacement == r {
   248  			if nDst+size > len(dst) {
   249  				err = transform.ErrShortDst
   250  				break
   251  			}
   252  			for i := 0; i < size; i++ {
   253  				dst[nDst] = src[nSrc]
   254  				nDst++
   255  				nSrc++
   256  			}
   257  			continue
   258  		}
   259  
   260  		n := utf8.EncodeRune(b[:], replacement)
   261  
   262  		if nDst+n > len(dst) {
   263  			err = transform.ErrShortDst
   264  			break
   265  		}
   266  		for i := 0; i < n; i++ {
   267  			dst[nDst] = b[i]
   268  			nDst++
   269  		}
   270  		nSrc += size
   271  	}
   272  	return
   273  }
   274  
   275  // ReplaceIllFormed returns a transformer that replaces all input bytes that are
   276  // not part of a well-formed UTF-8 code sequence with utf8.RuneError.
   277  func ReplaceIllFormed() Transformer {
   278  	return Transformer{&replaceIllFormed{}}
   279  }
   280  
// replaceIllFormed is the transformer returned by ReplaceIllFormed. It has no
// fields of its own; it only embeds transform.NopResetter.
type replaceIllFormed struct{ transform.NopResetter }
   282  
   283  func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) {
   284  	for n < len(src) {
   285  		// ASCII fast path.
   286  		if src[n] < utf8.RuneSelf {
   287  			n++
   288  			continue
   289  		}
   290  
   291  		r, size := utf8.DecodeRune(src[n:])
   292  
   293  		// Look for a valid non-ASCII rune.
   294  		if r != utf8.RuneError || size != 1 {
   295  			n += size
   296  			continue
   297  		}
   298  
   299  		// Look for short source data.
   300  		if !atEOF && !utf8.FullRune(src[n:]) {
   301  			err = transform.ErrShortSrc
   302  			break
   303  		}
   304  
   305  		// We have an invalid rune.
   306  		err = transform.ErrEndOfSpan
   307  		break
   308  	}
   309  	return n, err
   310  }
   311  
   312  func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   313  	for nSrc < len(src) {
   314  		// ASCII fast path.
   315  		if r := src[nSrc]; r < utf8.RuneSelf {
   316  			if nDst == len(dst) {
   317  				err = transform.ErrShortDst
   318  				break
   319  			}
   320  			dst[nDst] = r
   321  			nDst++
   322  			nSrc++
   323  			continue
   324  		}
   325  
   326  		// Look for a valid non-ASCII rune.
   327  		if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 {
   328  			if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {
   329  				err = transform.ErrShortDst
   330  				break
   331  			}
   332  			nDst += size
   333  			nSrc += size
   334  			continue
   335  		}
   336  
   337  		// Look for short source data.
   338  		if !atEOF && !utf8.FullRune(src[nSrc:]) {
   339  			err = transform.ErrShortSrc
   340  			break
   341  		}
   342  
   343  		// We have an invalid rune.
   344  		if nDst+3 > len(dst) {
   345  			err = transform.ErrShortDst
   346  			break
   347  		}
   348  		dst[nDst+0] = runeErrorString[0]
   349  		dst[nDst+1] = runeErrorString[1]
   350  		dst[nDst+2] = runeErrorString[2]
   351  		nDst += 3
   352  		nSrc++
   353  	}
   354  	return nDst, nSrc, err
   355  }
   356  

View as plain text