utf8.go

Documentation: github.com/pelletier/go-toml/v2/internal/characters

     1  package characters
     2  
     3  import (
     4  	"unicode/utf8"
     5  )
     6  
     7  type utf8Err struct {
     8  	Index int
     9  	Size  int
    10  }
    11  
    12  func (u utf8Err) Zero() bool {
    13  	return u.Size == 0
    14  }
    15  
    16  // Verified that a given string is only made of valid UTF-8 characters allowed
    17  // by the TOML spec:
    18  //
    19  // Any Unicode character may be used except those that must be escaped:
    20  // quotation mark, backslash, and the control characters other than tab (U+0000
    21  // to U+0008, U+000A to U+001F, U+007F).
    22  //
    23  // It is a copy of the Go 1.17 utf8.Valid implementation, tweaked to exit early
    24  // when a character is not allowed.
    25  //
    26  // The returned utf8Err is Zero() if the string is valid, or contains the byte
    27  // index and size of the invalid character.
    28  //
    29  // quotation mark => already checked
    30  // backslash => already checked
    31  // 0-0x8 => invalid
    32  // 0x9 => tab, ok
    33  // 0xA - 0x1F => invalid
    34  // 0x7F => invalid
    35  func Utf8TomlValidAlreadyEscaped(p []byte) (err utf8Err) {
    36  	// Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
    37  	offset := 0
    38  	for len(p) >= 8 {
    39  		// Combining two 32 bit loads allows the same code to be used
    40  		// for 32 and 64 bit platforms.
    41  		// The compiler can generate a 32bit load for first32 and second32
    42  		// on many platforms. See test/codegen/memcombine.go.
    43  		first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
    44  		second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
    45  		if (first32|second32)&0x80808080 != 0 {
    46  			// Found a non ASCII byte (>= RuneSelf).
    47  			break
    48  		}
    49  
    50  		for i, b := range p[:8] {
    51  			if InvalidAscii(b) {
    52  				err.Index = offset + i
    53  				err.Size = 1
    54  				return
    55  			}
    56  		}
    57  
    58  		p = p[8:]
    59  		offset += 8
    60  	}
    61  	n := len(p)
    62  	for i := 0; i < n; {
    63  		pi := p[i]
    64  		if pi < utf8.RuneSelf {
    65  			if InvalidAscii(pi) {
    66  				err.Index = offset + i
    67  				err.Size = 1
    68  				return
    69  			}
    70  			i++
    71  			continue
    72  		}
    73  		x := first[pi]
    74  		if x == xx {
    75  			// Illegal starter byte.
    76  			err.Index = offset + i
    77  			err.Size = 1
    78  			return
    79  		}
    80  		size := int(x & 7)
    81  		if i+size > n {
    82  			// Short or invalid.
    83  			err.Index = offset + i
    84  			err.Size = n - i
    85  			return
    86  		}
    87  		accept := acceptRanges[x>>4]
    88  		if c := p[i+1]; c < accept.lo || accept.hi < c {
    89  			err.Index = offset + i
    90  			err.Size = 2
    91  			return
    92  		} else if size == 2 {
    93  		} else if c := p[i+2]; c < locb || hicb < c {
    94  			err.Index = offset + i
    95  			err.Size = 3
    96  			return
    97  		} else if size == 3 {
    98  		} else if c := p[i+3]; c < locb || hicb < c {
    99  			err.Index = offset + i
   100  			err.Size = 4
   101  			return
   102  		}
   103  		i += size
   104  	}
   105  	return
   106  }
   107  
   108  // Return the size of the next rune if valid, 0 otherwise.
   109  func Utf8ValidNext(p []byte) int {
   110  	c := p[0]
   111  
   112  	if c < utf8.RuneSelf {
   113  		if InvalidAscii(c) {
   114  			return 0
   115  		}
   116  		return 1
   117  	}
   118  
   119  	x := first[c]
   120  	if x == xx {
   121  		// Illegal starter byte.
   122  		return 0
   123  	}
   124  	size := int(x & 7)
   125  	if size > len(p) {
   126  		// Short or invalid.
   127  		return 0
   128  	}
   129  	accept := acceptRanges[x>>4]
   130  	if c := p[1]; c < accept.lo || accept.hi < c {
   131  		return 0
   132  	} else if size == 2 {
   133  	} else if c := p[2]; c < locb || hicb < c {
   134  		return 0
   135  	} else if size == 3 {
   136  	} else if c := p[3]; c < locb || hicb < c {
   137  		return 0
   138  	}
   139  
   140  	return size
   141  }
   142  
   143  // acceptRange gives the range of valid values for the second byte in a UTF-8
   144  // sequence.
   145  type acceptRange struct {
   146  	lo uint8 // lowest value for second byte.
   147  	hi uint8 // highest value for second byte.
   148  }
   149  
   150  // acceptRanges has size 16 to avoid bounds checks in the code that uses it.
   151  var acceptRanges = [16]acceptRange{
   152  	0: {locb, hicb},
   153  	1: {0xA0, hicb},
   154  	2: {locb, 0x9F},
   155  	3: {0x90, hicb},
   156  	4: {locb, 0x8F},
   157  }
   158  
   159  // first is information about the first byte in a UTF-8 sequence.
   160  var first = [256]uint8{
   161  	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
   162  	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
   163  	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
   164  	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
   165  	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
   166  	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
   167  	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
   168  	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
   169  	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
   170  	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
   171  	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
   172  	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
   173  	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
   174  	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
   175  	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
   176  	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
   177  	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
   178  	s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
   179  }
   180  
   181  const (
   182  	// The default lowest and highest continuation byte.
   183  	locb = 0b10000000
   184  	hicb = 0b10111111
   185  
   186  	// These names of these constants are chosen to give nice alignment in the
   187  	// table below. The first nibble is an index into acceptRanges or F for
   188  	// special one-byte cases. The second nibble is the Rune length or the
   189  	// Status for the special one-byte case.
   190  	xx = 0xF1 // invalid: size 1
   191  	as = 0xF0 // ASCII: size 1
   192  	s1 = 0x02 // accept 0, size 2
   193  	s2 = 0x13 // accept 1, size 3
   194  	s3 = 0x03 // accept 0, size 3
   195  	s4 = 0x23 // accept 2, size 3
   196  	s5 = 0x34 // accept 3, size 4
   197  	s6 = 0x04 // accept 0, size 4
   198  	s7 = 0x44 // accept 4, size 4
   199  )
   200
View as plain text