collate.go

Documentation: golang.org/x/text/unicode/cldr

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package cldr
     6  
     7  import (
     8  	"bufio"
     9  	"encoding/xml"
    10  	"errors"
    11  	"fmt"
    12  	"strconv"
    13  	"strings"
    14  	"unicode"
    15  	"unicode/utf8"
    16  )
    17  
    18  // RuleProcessor can be passed to Collator's Process method, which
    19  // parses the rules and calls the respective method for each rule found.
    20  type RuleProcessor interface {
    21  	Reset(anchor string, before int) error
    22  	Insert(level int, str, context, extend string) error
    23  	Index(id string)
    24  }
    25  
    26  const (
    27  	// cldrIndex is a Unicode-reserved sentinel value used to mark the start
    28  	// of a grouping within an index.
    29  	// We ignore any rule that starts with this rune.
    30  	// See https://unicode.org/reports/tr35/#Collation_Elements for details.
    31  	cldrIndex = "\uFDD0"
    32  
    33  	// specialAnchor is the format in which to represent logical reset positions,
    34  	// such as "first tertiary ignorable".
    35  	specialAnchor = "<%s/>"
    36  )
    37  
    38  // Process parses the rules for the tailorings of this collation
    39  // and calls the respective methods of p for each rule found.
    40  func (c Collation) Process(p RuleProcessor) (err error) {
    41  	if len(c.Cr) > 0 {
    42  		if len(c.Cr) > 1 {
    43  			return fmt.Errorf("multiple cr elements, want 0 or 1")
    44  		}
    45  		return processRules(p, c.Cr[0].Data())
    46  	}
    47  	if c.Rules.Any != nil {
    48  		return c.processXML(p)
    49  	}
    50  	return errors.New("no tailoring data")
    51  }
    52  
    53  // processRules parses rules in the Collation Rule Syntax defined in
    54  // https://www.unicode.org/reports/tr35/tr35-collation.html#Collation_Tailorings.
    55  func processRules(p RuleProcessor, s string) (err error) {
    56  	chk := func(s string, e error) string {
    57  		if err == nil {
    58  			err = e
    59  		}
    60  		return s
    61  	}
    62  	i := 0 // Save the line number for use after the loop.
    63  	scanner := bufio.NewScanner(strings.NewReader(s))
    64  	for ; scanner.Scan() && err == nil; i++ {
    65  		for s := skipSpace(scanner.Text()); s != "" && s[0] != '#'; s = skipSpace(s) {
    66  			level := 5
    67  			var ch byte
    68  			switch ch, s = s[0], s[1:]; ch {
    69  			case '&': // followed by <anchor> or '[' <key> ']'
    70  				if s = skipSpace(s); consume(&s, '[') {
    71  					s = chk(parseSpecialAnchor(p, s))
    72  				} else {
    73  					s = chk(parseAnchor(p, 0, s))
    74  				}
    75  			case '<': // sort relation '<'{1,4}, optionally followed by '*'.
    76  				for level = 1; consume(&s, '<'); level++ {
    77  				}
    78  				if level > 4 {
    79  					err = fmt.Errorf("level %d > 4", level)
    80  				}
    81  				fallthrough
    82  			case '=': // identity relation, optionally followed by *.
    83  				if consume(&s, '*') {
    84  					s = chk(parseSequence(p, level, s))
    85  				} else {
    86  					s = chk(parseOrder(p, level, s))
    87  				}
    88  			default:
    89  				chk("", fmt.Errorf("illegal operator %q", ch))
    90  				break
    91  			}
    92  		}
    93  	}
    94  	if chk("", scanner.Err()); err != nil {
    95  		return fmt.Errorf("%d: %v", i, err)
    96  	}
    97  	return nil
    98  }
    99  
   100  // parseSpecialAnchor parses the anchor syntax which is either of the form
   101  //
   102  //	['before' <level>] <anchor>
   103  //
   104  // or
   105  //
   106  //	[<label>]
   107  //
   108  // The starting should already be consumed.
   109  func parseSpecialAnchor(p RuleProcessor, s string) (tail string, err error) {
   110  	i := strings.IndexByte(s, ']')
   111  	if i == -1 {
   112  		return "", errors.New("unmatched bracket")
   113  	}
   114  	a := strings.TrimSpace(s[:i])
   115  	s = s[i+1:]
   116  	if strings.HasPrefix(a, "before ") {
   117  		l, err := strconv.ParseUint(skipSpace(a[len("before "):]), 10, 3)
   118  		if err != nil {
   119  			return s, err
   120  		}
   121  		return parseAnchor(p, int(l), s)
   122  	}
   123  	return s, p.Reset(fmt.Sprintf(specialAnchor, a), 0)
   124  }
   125  
   126  func parseAnchor(p RuleProcessor, level int, s string) (tail string, err error) {
   127  	anchor, s, err := scanString(s)
   128  	if err != nil {
   129  		return s, err
   130  	}
   131  	return s, p.Reset(anchor, level)
   132  }
   133  
   134  func parseOrder(p RuleProcessor, level int, s string) (tail string, err error) {
   135  	var value, context, extend string
   136  	if value, s, err = scanString(s); err != nil {
   137  		return s, err
   138  	}
   139  	if strings.HasPrefix(value, cldrIndex) {
   140  		p.Index(value[len(cldrIndex):])
   141  		return
   142  	}
   143  	if consume(&s, '|') {
   144  		if context, s, err = scanString(s); err != nil {
   145  			return s, errors.New("missing string after context")
   146  		}
   147  	}
   148  	if consume(&s, '/') {
   149  		if extend, s, err = scanString(s); err != nil {
   150  			return s, errors.New("missing string after extension")
   151  		}
   152  	}
   153  	return s, p.Insert(level, value, context, extend)
   154  }
   155  
   156  // scanString scans a single input string.
   157  func scanString(s string) (str, tail string, err error) {
   158  	if s = skipSpace(s); s == "" {
   159  		return s, s, errors.New("missing string")
   160  	}
   161  	buf := [16]byte{} // small but enough to hold most cases.
   162  	value := buf[:0]
   163  	for s != "" {
   164  		if consume(&s, '\'') {
   165  			i := strings.IndexByte(s, '\'')
   166  			if i == -1 {
   167  				return "", "", errors.New(`unmatched single quote`)
   168  			}
   169  			if i == 0 {
   170  				value = append(value, '\'')
   171  			} else {
   172  				value = append(value, s[:i]...)
   173  			}
   174  			s = s[i+1:]
   175  			continue
   176  		}
   177  		r, sz := utf8.DecodeRuneInString(s)
   178  		if unicode.IsSpace(r) || strings.ContainsRune("&<=#", r) {
   179  			break
   180  		}
   181  		value = append(value, s[:sz]...)
   182  		s = s[sz:]
   183  	}
   184  	return string(value), skipSpace(s), nil
   185  }
   186  
   187  func parseSequence(p RuleProcessor, level int, s string) (tail string, err error) {
   188  	if s = skipSpace(s); s == "" {
   189  		return s, errors.New("empty sequence")
   190  	}
   191  	last := rune(0)
   192  	for s != "" {
   193  		r, sz := utf8.DecodeRuneInString(s)
   194  		s = s[sz:]
   195  
   196  		if r == '-' {
   197  			// We have a range. The first element was already written.
   198  			if last == 0 {
   199  				return s, errors.New("range without starter value")
   200  			}
   201  			r, sz = utf8.DecodeRuneInString(s)
   202  			s = s[sz:]
   203  			if r == utf8.RuneError || r < last {
   204  				return s, fmt.Errorf("invalid range %q-%q", last, r)
   205  			}
   206  			for i := last + 1; i <= r; i++ {
   207  				if err := p.Insert(level, string(i), "", ""); err != nil {
   208  					return s, err
   209  				}
   210  			}
   211  			last = 0
   212  			continue
   213  		}
   214  
   215  		if unicode.IsSpace(r) || unicode.IsPunct(r) {
   216  			break
   217  		}
   218  
   219  		// normal case
   220  		if err := p.Insert(level, string(r), "", ""); err != nil {
   221  			return s, err
   222  		}
   223  		last = r
   224  	}
   225  	return s, nil
   226  }
   227  
   228  func skipSpace(s string) string {
   229  	return strings.TrimLeftFunc(s, unicode.IsSpace)
   230  }
   231  
   232  // consume returns whether the next byte is ch. If so, it gobbles it by
   233  // updating s.
   234  func consume(s *string, ch byte) (ok bool) {
   235  	if *s == "" || (*s)[0] != ch {
   236  		return false
   237  	}
   238  	*s = (*s)[1:]
   239  	return true
   240  }
   241  
   242  // The following code parses Collation rules of CLDR version 24 and before.
   243  
   244  var lmap = map[byte]int{
   245  	'p': 1,
   246  	's': 2,
   247  	't': 3,
   248  	'i': 5,
   249  }
   250  
   251  type rulesElem struct {
   252  	Rules struct {
   253  		Common
   254  		Any []*struct {
   255  			XMLName xml.Name
   256  			rule
   257  		} `xml:",any"`
   258  	} `xml:"rules"`
   259  }
   260  
   261  type rule struct {
   262  	Value  string `xml:",chardata"`
   263  	Before string `xml:"before,attr"`
   264  	Any    []*struct {
   265  		XMLName xml.Name
   266  		rule
   267  	} `xml:",any"`
   268  }
   269  
   270  var emptyValueError = errors.New("cldr: empty rule value")
   271  
   272  func (r *rule) value() (string, error) {
   273  	// Convert hexadecimal Unicode codepoint notation to a string.
   274  	s := charRe.ReplaceAllStringFunc(r.Value, replaceUnicode)
   275  	r.Value = s
   276  	if s == "" {
   277  		if len(r.Any) != 1 {
   278  			return "", emptyValueError
   279  		}
   280  		r.Value = fmt.Sprintf(specialAnchor, r.Any[0].XMLName.Local)
   281  		r.Any = nil
   282  	} else if len(r.Any) != 0 {
   283  		return "", fmt.Errorf("cldr: XML elements found in collation rule: %v", r.Any)
   284  	}
   285  	return r.Value, nil
   286  }
   287  
   288  func (r rule) process(p RuleProcessor, name, context, extend string) error {
   289  	v, err := r.value()
   290  	if err != nil {
   291  		return err
   292  	}
   293  	switch name {
   294  	case "p", "s", "t", "i":
   295  		if strings.HasPrefix(v, cldrIndex) {
   296  			p.Index(v[len(cldrIndex):])
   297  			return nil
   298  		}
   299  		if err := p.Insert(lmap[name[0]], v, context, extend); err != nil {
   300  			return err
   301  		}
   302  	case "pc", "sc", "tc", "ic":
   303  		level := lmap[name[0]]
   304  		for _, s := range v {
   305  			if err := p.Insert(level, string(s), context, extend); err != nil {
   306  				return err
   307  			}
   308  		}
   309  	default:
   310  		return fmt.Errorf("cldr: unsupported tag: %q", name)
   311  	}
   312  	return nil
   313  }
   314  
   315  // processXML parses the format of CLDR versions 24 and older.
   316  func (c Collation) processXML(p RuleProcessor) (err error) {
   317  	// Collation is generated and defined in xml.go.
   318  	var v string
   319  	for _, r := range c.Rules.Any {
   320  		switch r.XMLName.Local {
   321  		case "reset":
   322  			level := 0
   323  			switch r.Before {
   324  			case "primary", "1":
   325  				level = 1
   326  			case "secondary", "2":
   327  				level = 2
   328  			case "tertiary", "3":
   329  				level = 3
   330  			case "":
   331  			default:
   332  				return fmt.Errorf("cldr: unknown level %q", r.Before)
   333  			}
   334  			v, err = r.value()
   335  			if err == nil {
   336  				err = p.Reset(v, level)
   337  			}
   338  		case "x":
   339  			var context, extend string
   340  			for _, r1 := range r.Any {
   341  				v, err = r1.value()
   342  				switch r1.XMLName.Local {
   343  				case "context":
   344  					context = v
   345  				case "extend":
   346  					extend = v
   347  				}
   348  			}
   349  			for _, r1 := range r.Any {
   350  				if t := r1.XMLName.Local; t == "context" || t == "extend" {
   351  					continue
   352  				}
   353  				r1.rule.process(p, r1.XMLName.Local, context, extend)
   354  			}
   355  		default:
   356  			err = r.rule.process(p, r.XMLName.Local, "", "")
   357  		}
   358  		if err != nil {
   359  			return err
   360  		}
   361  	}
   362  	return nil
   363  }
   364
View as plain text