...

Source file src/golang.org/x/text/secure/bidirule/bidirule.go

Documentation: golang.org/x/text/secure/bidirule

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package bidirule implements the Bidi Rule defined by RFC 5893.
     6  //
     7  // This package is under development. The API may change without notice and
     8  // without preserving backward compatibility.
     9  package bidirule
    10  
    11  import (
    12  	"errors"
    13  	"unicode/utf8"
    14  
    15  	"golang.org/x/text/transform"
    16  	"golang.org/x/text/unicode/bidi"
    17  )
    18  
    19  // This file contains an implementation of RFC 5893: Right-to-Left Scripts for
    20  // Internationalized Domain Names for Applications (IDNA)
    21  //
    22  // A label is an individual component of a domain name.  Labels are usually
    23  // shown separated by dots; for example, the domain name "www.example.com" is
    24  // composed of three labels: "www", "example", and "com".
    25  //
    26  // An RTL label is a label that contains at least one character of class R, AL,
    27  // or AN. An LTR label is any label that is not an RTL label.
    28  //
    29  // A "Bidi domain name" is a domain name that contains at least one RTL label.
    30  //
    31  //  The following guarantees can be made based on the above:
    32  //
    33  //  o  In a domain name consisting of only labels that satisfy the rule,
    34  //     the requirements of Section 3 are satisfied.  Note that even LTR
    35  //     labels and pure ASCII labels have to be tested.
    36  //
    37  //  o  In a domain name consisting of only LDH labels (as defined in the
    38  //     Definitions document [RFC5890]) and labels that satisfy the rule,
    39  //     the requirements of Section 3 are satisfied as long as a label
    40  //     that starts with an ASCII digit does not come after a
    41  //     right-to-left label.
    42  //
    43  //  No guarantee is given for other combinations.
    44  
    45  // ErrInvalid indicates a label is invalid according to the Bidi Rule.
    46  var ErrInvalid = errors.New("bidirule: failed Bidi Rule")
    47  
    48  type ruleState uint8
    49  
    50  const (
    51  	ruleInitial ruleState = iota
    52  	ruleLTR
    53  	ruleLTRFinal
    54  	ruleRTL
    55  	ruleRTLFinal
    56  	ruleInvalid
    57  )
    58  
    59  type ruleTransition struct {
    60  	next ruleState
    61  	mask uint16
    62  }
    63  
    64  var transitions = [...][2]ruleTransition{
    65  	// [2.1] The first character must be a character with Bidi property L, R, or
    66  	// AL. If it has the R or AL property, it is an RTL label; if it has the L
    67  	// property, it is an LTR label.
    68  	ruleInitial: {
    69  		{ruleLTRFinal, 1 << bidi.L},
    70  		{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL},
    71  	},
    72  	ruleRTL: {
    73  		// [2.3] In an RTL label, the end of the label must be a character with
    74  		// Bidi property R, AL, EN, or AN, followed by zero or more characters
    75  		// with Bidi property NSM.
    76  		{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN},
    77  
    78  		// [2.2] In an RTL label, only characters with the Bidi properties R,
    79  		// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
    80  		// We exclude the entries from [2.3]
    81  		{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
    82  	},
    83  	ruleRTLFinal: {
    84  		// [2.3] In an RTL label, the end of the label must be a character with
    85  		// Bidi property R, AL, EN, or AN, followed by zero or more characters
    86  		// with Bidi property NSM.
    87  		{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN | 1<<bidi.NSM},
    88  
    89  		// [2.2] In an RTL label, only characters with the Bidi properties R,
    90  		// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
    91  		// We exclude the entries from [2.3] and NSM.
    92  		{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
    93  	},
    94  	ruleLTR: {
    95  		// [2.6] In an LTR label, the end of the label must be a character with
    96  		// Bidi property L or EN, followed by zero or more characters with Bidi
    97  		// property NSM.
    98  		{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN},
    99  
   100  		// [2.5] In an LTR label, only characters with the Bidi properties L,
   101  		// EN, ES, CS, ET, ON, BN, or NSM are allowed.
   102  		// We exclude the entries from [2.6].
   103  		{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
   104  	},
   105  	ruleLTRFinal: {
   106  		// [2.6] In an LTR label, the end of the label must be a character with
   107  		// Bidi property L or EN, followed by zero or more characters with Bidi
   108  		// property NSM.
   109  		{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN | 1<<bidi.NSM},
   110  
   111  		// [2.5] In an LTR label, only characters with the Bidi properties L,
   112  		// EN, ES, CS, ET, ON, BN, or NSM are allowed.
   113  		// We exclude the entries from [2.6].
   114  		{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
   115  	},
   116  	ruleInvalid: {
   117  		{ruleInvalid, 0},
   118  		{ruleInvalid, 0},
   119  	},
   120  }
   121  
   122  // [2.4] In an RTL label, if an EN is present, no AN may be present, and
   123  // vice versa.
   124  const exclusiveRTL = uint16(1<<bidi.EN | 1<<bidi.AN)
   125  
   126  // From RFC 5893
   127  // An RTL label is a label that contains at least one character of type
   128  // R, AL, or AN.
   129  //
   130  // An LTR label is any label that is not an RTL label.
   131  
   132  // Direction reports the direction of the given label as defined by RFC 5893.
   133  // The Bidi Rule does not have to be applied to labels of the category
   134  // LeftToRight.
   135  func Direction(b []byte) bidi.Direction {
   136  	for i := 0; i < len(b); {
   137  		e, sz := bidi.Lookup(b[i:])
   138  		if sz == 0 {
   139  			i++
   140  		}
   141  		c := e.Class()
   142  		if c == bidi.R || c == bidi.AL || c == bidi.AN {
   143  			return bidi.RightToLeft
   144  		}
   145  		i += sz
   146  	}
   147  	return bidi.LeftToRight
   148  }
   149  
   150  // DirectionString reports the direction of the given label as defined by RFC
   151  // 5893. The Bidi Rule does not have to be applied to labels of the category
   152  // LeftToRight.
   153  func DirectionString(s string) bidi.Direction {
   154  	for i := 0; i < len(s); {
   155  		e, sz := bidi.LookupString(s[i:])
   156  		if sz == 0 {
   157  			i++
   158  			continue
   159  		}
   160  		c := e.Class()
   161  		if c == bidi.R || c == bidi.AL || c == bidi.AN {
   162  			return bidi.RightToLeft
   163  		}
   164  		i += sz
   165  	}
   166  	return bidi.LeftToRight
   167  }
   168  
   169  // Valid reports whether b conforms to the BiDi rule.
   170  func Valid(b []byte) bool {
   171  	var t Transformer
   172  	if n, ok := t.advance(b); !ok || n < len(b) {
   173  		return false
   174  	}
   175  	return t.isFinal()
   176  }
   177  
   178  // ValidString reports whether s conforms to the BiDi rule.
   179  func ValidString(s string) bool {
   180  	var t Transformer
   181  	if n, ok := t.advanceString(s); !ok || n < len(s) {
   182  		return false
   183  	}
   184  	return t.isFinal()
   185  }
   186  
   187  // New returns a Transformer that verifies that input adheres to the Bidi Rule.
   188  func New() *Transformer {
   189  	return &Transformer{}
   190  }
   191  
   192  // Transformer implements transform.Transform.
   193  type Transformer struct {
   194  	state  ruleState
   195  	hasRTL bool
   196  	seen   uint16
   197  }
   198  
   199  // A rule can only be violated for "Bidi Domain names", meaning if one of the
   200  // following categories has been observed.
   201  func (t *Transformer) isRTL() bool {
   202  	const isRTL = 1<<bidi.R | 1<<bidi.AL | 1<<bidi.AN
   203  	return t.seen&isRTL != 0
   204  }
   205  
   206  // Reset implements transform.Transformer.
   207  func (t *Transformer) Reset() { *t = Transformer{} }
   208  
   209  // Transform implements transform.Transformer. This Transformer has state and
   210  // needs to be reset between uses.
   211  func (t *Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   212  	if len(dst) < len(src) {
   213  		src = src[:len(dst)]
   214  		atEOF = false
   215  		err = transform.ErrShortDst
   216  	}
   217  	n, err1 := t.Span(src, atEOF)
   218  	copy(dst, src[:n])
   219  	if err == nil || err1 != nil && err1 != transform.ErrShortSrc {
   220  		err = err1
   221  	}
   222  	return n, n, err
   223  }
   224  
   225  // Span returns the first n bytes of src that conform to the Bidi rule.
   226  func (t *Transformer) Span(src []byte, atEOF bool) (n int, err error) {
   227  	if t.state == ruleInvalid && t.isRTL() {
   228  		return 0, ErrInvalid
   229  	}
   230  	n, ok := t.advance(src)
   231  	switch {
   232  	case !ok:
   233  		err = ErrInvalid
   234  	case n < len(src):
   235  		if !atEOF {
   236  			err = transform.ErrShortSrc
   237  			break
   238  		}
   239  		err = ErrInvalid
   240  	case !t.isFinal():
   241  		err = ErrInvalid
   242  	}
   243  	return n, err
   244  }
   245  
   246  // Precomputing the ASCII values decreases running time for the ASCII fast path
   247  // by about 30%.
   248  var asciiTable [128]bidi.Properties
   249  
   250  func init() {
   251  	for i := range asciiTable {
   252  		p, _ := bidi.LookupRune(rune(i))
   253  		asciiTable[i] = p
   254  	}
   255  }
   256  
   257  func (t *Transformer) advance(s []byte) (n int, ok bool) {
   258  	var e bidi.Properties
   259  	var sz int
   260  	for n < len(s) {
   261  		if s[n] < utf8.RuneSelf {
   262  			e, sz = asciiTable[s[n]], 1
   263  		} else {
   264  			e, sz = bidi.Lookup(s[n:])
   265  			if sz <= 1 {
   266  				if sz == 1 {
   267  					// We always consider invalid UTF-8 to be invalid, even if
   268  					// the string has not yet been determined to be RTL.
   269  					// TODO: is this correct?
   270  					return n, false
   271  				}
   272  				return n, true // incomplete UTF-8 encoding
   273  			}
   274  		}
   275  		// TODO: using CompactClass would result in noticeable speedup.
   276  		// See unicode/bidi/prop.go:Properties.CompactClass.
   277  		c := uint16(1 << e.Class())
   278  		t.seen |= c
   279  		if t.seen&exclusiveRTL == exclusiveRTL {
   280  			t.state = ruleInvalid
   281  			return n, false
   282  		}
   283  		switch tr := transitions[t.state]; {
   284  		case tr[0].mask&c != 0:
   285  			t.state = tr[0].next
   286  		case tr[1].mask&c != 0:
   287  			t.state = tr[1].next
   288  		default:
   289  			t.state = ruleInvalid
   290  			if t.isRTL() {
   291  				return n, false
   292  			}
   293  		}
   294  		n += sz
   295  	}
   296  	return n, true
   297  }
   298  
   299  func (t *Transformer) advanceString(s string) (n int, ok bool) {
   300  	var e bidi.Properties
   301  	var sz int
   302  	for n < len(s) {
   303  		if s[n] < utf8.RuneSelf {
   304  			e, sz = asciiTable[s[n]], 1
   305  		} else {
   306  			e, sz = bidi.LookupString(s[n:])
   307  			if sz <= 1 {
   308  				if sz == 1 {
   309  					return n, false // invalid UTF-8
   310  				}
   311  				return n, true // incomplete UTF-8 encoding
   312  			}
   313  		}
   314  		// TODO: using CompactClass results in noticeable speedup.
   315  		// See unicode/bidi/prop.go:Properties.CompactClass.
   316  		c := uint16(1 << e.Class())
   317  		t.seen |= c
   318  		if t.seen&exclusiveRTL == exclusiveRTL {
   319  			t.state = ruleInvalid
   320  			return n, false
   321  		}
   322  		switch tr := transitions[t.state]; {
   323  		case tr[0].mask&c != 0:
   324  			t.state = tr[0].next
   325  		case tr[1].mask&c != 0:
   326  			t.state = tr[1].next
   327  		default:
   328  			t.state = ruleInvalid
   329  			if t.isRTL() {
   330  				return n, false
   331  			}
   332  		}
   333  		n += sz
   334  	}
   335  	return n, true
   336  }
   337  

View as plain text