...

Source file src/golang.org/x/text/cases/map.go

Documentation: golang.org/x/text/cases

     1  // Copyright 2014 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package cases
     6  
     7  // This file contains the definitions of case mappings for all supported
     8  // languages. The rules for the language-specific tailorings were taken and
     9  // modified from the CLDR transform definitions in common/transforms.
    10  
    11  import (
    12  	"strings"
    13  	"unicode"
    14  	"unicode/utf8"
    15  
    16  	"golang.org/x/text/internal"
    17  	"golang.org/x/text/language"
    18  	"golang.org/x/text/transform"
    19  	"golang.org/x/text/unicode/norm"
    20  )
    21  
    22  // A mapFunc takes a context set to the current rune and writes the mapped
    23  // version to the same context. It may advance the context to the next rune. It
    24  // returns whether a checkpoint is possible: whether the pDst bytes written to
    25  // dst so far won't need changing as we see more source bytes.
    26  type mapFunc func(*context) bool
    27  
    28  // A spanFunc takes a context set to the current rune and reports whether the
    29  // span can be extended past this rune, that is, whether a checkpoint is
    30  // possible. It may advance the context to the next rune. The Span methods in
        // this file stop at the first rune for which a spanFunc returns false.
    31  type spanFunc func(*context) bool
    32  
    33  // maxIgnorable defines the maximum number of ignorables to consider for
    34  // lookahead operations.
    35  const maxIgnorable = 30
    36  
    37  // supported lists the language tags for which we have tailorings.
        // NOTE(review): the per-language tables in this file (upperFunc, lowerFunc and
        // titleInfos) list their entries in this same order and are indexed with the
        // value returned by matcher.Match, presumably the position of the matched tag
        // in this list. Keep them in sync when adding a language.
    38  const supported = "und af az el lt nl tr"
    39  
    40  func init() {
    41  	tags := []language.Tag{}
    42  	for _, s := range strings.Split(supported, " ") {
    43  		tags = append(tags, language.MustParse(s))
    44  	}
    45  	matcher = internal.NewInheritanceMatcher(tags)
    46  	Supported = language.NewCoverage(tags)
    47  }
    48  
    49  var (
    50  	matcher *internal.InheritanceMatcher
    51  
    52  	Supported language.Coverage
    53  
    54  	// We keep the following lists separate, instead of having a single per-
    55  	// language struct, to give the compiler a chance to remove unused code.
    56  
    57  	// Some uppercase mappers are stateless, so we can precompute the
    58  	// Transformers and save a bit on runtime allocations.
    59  	upperFunc = []struct {
    60  		upper mapFunc
    61  		span  spanFunc
    62  	}{
    63  		{nil, nil},                  // und
    64  		{nil, nil},                  // af
    65  		{aztrUpper(upper), isUpper}, // az
    66  		{elUpper, noSpan},           // el
    67  		{ltUpper(upper), noSpan},    // lt
    68  		{nil, nil},                  // nl
    69  		{aztrUpper(upper), isUpper}, // tr
    70  	}
    71  
    72  	undUpper            transform.SpanningTransformer = &undUpperCaser{}
    73  	undLower            transform.SpanningTransformer = &undLowerCaser{}
    74  	undLowerIgnoreSigma transform.SpanningTransformer = &undLowerIgnoreSigmaCaser{}
    75  
    76  	lowerFunc = []mapFunc{
    77  		nil,       // und
    78  		nil,       // af
    79  		aztrLower, // az
    80  		nil,       // el
    81  		ltLower,   // lt
    82  		nil,       // nl
    83  		aztrLower, // tr
    84  	}
    85  
    86  	titleInfos = []struct {
    87  		title     mapFunc
    88  		lower     mapFunc
    89  		titleSpan spanFunc
    90  		rewrite   func(*context)
    91  	}{
    92  		{title, lower, isTitle, nil},                // und
    93  		{title, lower, isTitle, afnlRewrite},        // af
    94  		{aztrUpper(title), aztrLower, isTitle, nil}, // az
    95  		{title, lower, isTitle, nil},                // el
    96  		{ltUpper(title), ltLower, noSpan, nil},      // lt
    97  		{nlTitle, lower, nlTitleSpan, afnlRewrite},  // nl
    98  		{aztrUpper(title), aztrLower, isTitle, nil}, // tr
    99  	}
   100  )
   101  
   102  func makeUpper(t language.Tag, o options) transform.SpanningTransformer {
   103  	_, i, _ := matcher.Match(t)
   104  	f := upperFunc[i].upper
   105  	if f == nil {
   106  		return undUpper
   107  	}
   108  	return &simpleCaser{f: f, span: upperFunc[i].span}
   109  }
   110  
   111  func makeLower(t language.Tag, o options) transform.SpanningTransformer {
   112  	_, i, _ := matcher.Match(t)
   113  	f := lowerFunc[i]
   114  	if f == nil {
   115  		if o.ignoreFinalSigma {
   116  			return undLowerIgnoreSigma
   117  		}
   118  		return undLower
   119  	}
   120  	if o.ignoreFinalSigma {
   121  		return &simpleCaser{f: f, span: isLower}
   122  	}
   123  	return &lowerCaser{
   124  		first:   f,
   125  		midWord: finalSigma(f),
   126  	}
   127  }
   128  
   129  func makeTitle(t language.Tag, o options) transform.SpanningTransformer {
   130  	_, i, _ := matcher.Match(t)
   131  	x := &titleInfos[i]
   132  	lower := x.lower
   133  	if o.noLower {
   134  		lower = (*context).copy
   135  	} else if !o.ignoreFinalSigma {
   136  		lower = finalSigma(lower)
   137  	}
   138  	return &titleCaser{
   139  		title:     x.title,
   140  		lower:     lower,
   141  		titleSpan: x.titleSpan,
   142  		rewrite:   x.rewrite,
   143  	}
   144  }
   145  
   146  func noSpan(c *context) bool {
        	// noSpan is the span function for mappers without a dedicated Span
        	// implementation (the el and lt entries of upperFunc and the lt entry of
        	// titleInfos). It records ErrEndOfSpan on the context and reports that no
        	// checkpoint is possible.
   147  	c.err = transform.ErrEndOfSpan
   148  	return false
   149  }
   150  
   151  // TODO: consider a similar special case for the fast majority lower case. This
   152  // is a bit more involved so will require some more precise benchmarking to
   153  // justify it.
   154  
   155  type undUpperCaser struct{ transform.NopResetter }
   156  
   157  // undUpperCaser implements the Transformer interface for doing an upper case
   158  // mapping for the root locale (und). It eliminates the need for an allocation
   159  // as it prevents escaping by not using function pointers.
   160  func (t undUpperCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   161  	c := context{dst: dst, src: src, atEOF: atEOF}
   162  	for c.next() {
   163  		upper(&c)
   164  		c.checkpoint()
   165  	}
   166  	return c.ret()
   167  }
   168  
   169  func (t undUpperCaser) Span(src []byte, atEOF bool) (n int, err error) {
   170  	c := context{src: src, atEOF: atEOF}
   171  	for c.next() && isUpper(&c) {
   172  		c.checkpoint()
   173  	}
   174  	return c.retSpan()
   175  }
   176  
   177  // undLowerIgnoreSigmaCaser implements the Transformer interface for doing
   178  // a lower case mapping for the root locale (und) ignoring final sigma
   179  // handling. This casing algorithm is used in some performance-critical packages
   180  // like secure/precis and x/net/http/idna, which warrants its special-casing.
   181  type undLowerIgnoreSigmaCaser struct{ transform.NopResetter }
   182  
   183  func (t undLowerIgnoreSigmaCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   184  	c := context{dst: dst, src: src, atEOF: atEOF}
   185  	for c.next() && lower(&c) {
   186  		c.checkpoint()
   187  	}
   188  	return c.ret()
   189  
   190  }
   191  
   192  // Span implements a generic lower-casing. This is possible as isLower works
   193  // for all lowercasing variants. All lowercase variants only vary in how they
   194  // transform a non-lowercase letter. They will never change an already lowercase
   195  // letter. In addition, there is no state.
   196  func (t undLowerIgnoreSigmaCaser) Span(src []byte, atEOF bool) (n int, err error) {
   197  	c := context{src: src, atEOF: atEOF}
   198  	for c.next() && isLower(&c) {
   199  		c.checkpoint()
   200  	}
   201  	return c.retSpan()
   202  }
   203  
   204  type simpleCaser struct {
   205  	context
   206  	f    mapFunc
   207  	span spanFunc
   208  }
   209  
   210  // simpleCaser implements the Transformer interface for doing a case operation
   211  // on a rune-by-rune basis.
   212  func (t *simpleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   213  	c := context{dst: dst, src: src, atEOF: atEOF}
   214  	for c.next() && t.f(&c) {
   215  		c.checkpoint()
   216  	}
   217  	return c.ret()
   218  }
   219  
   220  func (t *simpleCaser) Span(src []byte, atEOF bool) (n int, err error) {
   221  	c := context{src: src, atEOF: atEOF}
   222  	for c.next() && t.span(&c) {
   223  		c.checkpoint()
   224  	}
   225  	return c.retSpan()
   226  }
   227  
   228  // undLowerCaser implements the Transformer interface for doing a lower case
   229  // mapping for the root locale (und) ignoring final sigma handling. This casing
   230  // algorithm is used in some performance-critical packages like secure/precis
   231  // and x/net/http/idna, which warrants its special-casing.
   232  type undLowerCaser struct{ transform.NopResetter }
   233  
   234  func (t undLowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   235  	c := context{dst: dst, src: src, atEOF: atEOF}
   236  
   237  	for isInterWord := true; c.next(); {
   238  		if isInterWord {
   239  			if c.info.isCased() {
   240  				if !lower(&c) {
   241  					break
   242  				}
   243  				isInterWord = false
   244  			} else if !c.copy() {
   245  				break
   246  			}
   247  		} else {
   248  			if c.info.isNotCasedAndNotCaseIgnorable() {
   249  				if !c.copy() {
   250  					break
   251  				}
   252  				isInterWord = true
   253  			} else if !c.hasPrefix("Σ") {
   254  				if !lower(&c) {
   255  					break
   256  				}
   257  			} else if !finalSigmaBody(&c) {
   258  				break
   259  			}
   260  		}
   261  		c.checkpoint()
   262  	}
   263  	return c.ret()
   264  }
   265  
   266  func (t undLowerCaser) Span(src []byte, atEOF bool) (n int, err error) {
   267  	c := context{src: src, atEOF: atEOF}
   268  	for c.next() && isLower(&c) {
   269  		c.checkpoint()
   270  	}
   271  	return c.retSpan()
   272  }
   273  
   274  // lowerCaser implements the Transformer interface. The default Unicode lower
   275  // casing requires different treatment for the first and subsequent characters
   276  // of a word, most notably to handle the Greek final Sigma.
   277  type lowerCaser struct {
   278  	undLowerIgnoreSigmaCaser
   279  
   280  	context
   281  
   282  	first, midWord mapFunc
   283  }
   284  
   285  func (t *lowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   286  	t.context = context{dst: dst, src: src, atEOF: atEOF}
   287  	c := &t.context
   288  
   289  	for isInterWord := true; c.next(); {
   290  		if isInterWord {
   291  			if c.info.isCased() {
   292  				if !t.first(c) {
   293  					break
   294  				}
   295  				isInterWord = false
   296  			} else if !c.copy() {
   297  				break
   298  			}
   299  		} else {
   300  			if c.info.isNotCasedAndNotCaseIgnorable() {
   301  				if !c.copy() {
   302  					break
   303  				}
   304  				isInterWord = true
   305  			} else if !t.midWord(c) {
   306  				break
   307  			}
   308  		}
   309  		c.checkpoint()
   310  	}
   311  	return c.ret()
   312  }
   313  
   314  // titleCaser implements the Transformer interface. Title casing algorithms
   315  // distinguish between the first letter of a word and subsequent letters of the
   316  // same word. It uses state to avoid requiring a potentially infinite lookahead.
   317  type titleCaser struct {
   318  	context
   319  
   320  	// rune mappings used by the actual casing algorithms.
   321  	title     mapFunc
   322  	lower     mapFunc
   323  	titleSpan spanFunc
   324  
   325  	rewrite func(*context)
   326  }
   327  
   328  // Transform implements the standard Unicode title case algorithm as defined in
   329  // Chapter 3 of The Unicode Standard:
   330  // toTitlecase(X): Find the word boundaries in X according to Unicode Standard
   331  // Annex #29, "Unicode Text Segmentation." For each word boundary, find the
   332  // first cased character F following the word boundary. If F exists, map F to
   333  // Titlecase_Mapping(F); then map all characters C between F and the following
   334  // word boundary to Lowercase_Mapping(C).
   335  func (t *titleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   336  	t.context = context{dst: dst, src: src, atEOF: atEOF, isMidWord: t.isMidWord}
   337  	c := &t.context
   338  
   339  	if !c.next() {
   340  		return c.ret()
   341  	}
   342  
   343  	for {
   344  		p := c.info
   345  		if t.rewrite != nil {
   346  			t.rewrite(c)
   347  		}
   348  
   349  		wasMid := p.isMid()
   350  		// Break out of this loop on failure to ensure we do not modify the
   351  		// state incorrectly.
   352  		if p.isCased() {
   353  			if !c.isMidWord {
   354  				if !t.title(c) {
   355  					break
   356  				}
   357  				c.isMidWord = true
   358  			} else if !t.lower(c) {
   359  				break
   360  			}
   361  		} else if !c.copy() {
   362  			break
   363  		} else if p.isBreak() {
   364  			c.isMidWord = false
   365  		}
   366  
   367  		// As we save the state of the transformer, it is safe to call
   368  		// checkpoint after any successful write.
   369  		if !(c.isMidWord && wasMid) {
   370  			c.checkpoint()
   371  		}
   372  
   373  		if !c.next() {
   374  			break
   375  		}
   376  		if wasMid && c.info.isMid() {
   377  			c.isMidWord = false
   378  		}
   379  	}
   380  	return c.ret()
   381  }
   382  
   383  func (t *titleCaser) Span(src []byte, atEOF bool) (n int, err error) {
   384  	t.context = context{src: src, atEOF: atEOF, isMidWord: t.isMidWord}
   385  	c := &t.context
   386  
   387  	if !c.next() {
   388  		return c.retSpan()
   389  	}
   390  
   391  	for {
   392  		p := c.info
   393  		if t.rewrite != nil {
   394  			t.rewrite(c)
   395  		}
   396  
   397  		wasMid := p.isMid()
   398  		// Break out of this loop on failure to ensure we do not modify the
   399  		// state incorrectly.
   400  		if p.isCased() {
   401  			if !c.isMidWord {
   402  				if !t.titleSpan(c) {
   403  					break
   404  				}
   405  				c.isMidWord = true
   406  			} else if !isLower(c) {
   407  				break
   408  			}
   409  		} else if p.isBreak() {
   410  			c.isMidWord = false
   411  		}
   412  		// As we save the state of the transformer, it is safe to call
   413  		// checkpoint after any successful write.
   414  		if !(c.isMidWord && wasMid) {
   415  			c.checkpoint()
   416  		}
   417  
   418  		if !c.next() {
   419  			break
   420  		}
   421  		if wasMid && c.info.isMid() {
   422  			c.isMidWord = false
   423  		}
   424  	}
   425  	return c.retSpan()
   426  }
   427  
   428  // finalSigma adds Greek final Sigma handing to another casing function. It
   429  // determines whether a lowercased sigma should be σ or ς, by looking ahead for
   430  // case-ignorables and a cased letters.
   431  func finalSigma(f mapFunc) mapFunc {
   432  	return func(c *context) bool {
   433  		if !c.hasPrefix("Σ") {
   434  			return f(c)
   435  		}
   436  		return finalSigmaBody(c)
   437  	}
   438  }
   439  
   440  func finalSigmaBody(c *context) bool {
        	// finalSigmaBody lowercases a capital sigma. It first writes the final
        	// form ς and then looks ahead over case-ignorable runes, copying them to
        	// the output. If the first rune that is not case-ignorable is cased, the
        	// sigma written earlier is patched in place to the non-final form σ.
        	//
   441  	// Current rune must be Σ.
   442  
   443  	// ::NFD();
   444  	// # 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
   445  	// Σ } [:case-ignorable:]* [:cased:] → σ;
   446  	// [:cased:] [:case-ignorable:]* { Σ → ς;
   447  	// ::Any-Lower;
   448  	// ::NFC();
   449  
        	// Remember where the sigma is written so that it can be patched later.
   450  	p := c.pDst
   451  	c.writeString("ς")
   452  
   453  	// TODO: we should do this here, but right now this will never have an
   454  	// effect as this is called when the prefix is Sigma, whereas Dutch and
   455  	// Afrikaans only test for an apostrophe.
   456  	//
   457  	// if t.rewrite != nil {
   458  	// 	t.rewrite(c)
   459  	// }
   460  
   461  	// We need to do one more iteration after maxIgnorable, as a cased
   462  	// letter is not an ignorable and may modify the result.
   463  	wasMid := false
   464  	for i := 0; i < maxIgnorable+1; i++ {
   465  		if !c.next() {
        			// No further rune could be read, so the form of the sigma cannot
        			// be decided yet: report that no checkpoint is possible.
   466  			return false
   467  		}
   468  		if !c.info.isCaseIgnorable() {
   469  			// All Midword runes are also case ignorable, so we are
   470  			// guaranteed to have a letter or word break here. As we are
   471  			// unreading the run, there is no need to unset c.isMidWord;
   472  			// the title caser will handle this.
   473  			if c.info.isCased() {
   474  				// p+1 is guaranteed to be in bounds: if writing ς was
   475  				// successful, p+1 will contain the second byte of ς. If not,
   476  				// this function will have returned after c.next returned false.
   477  				c.dst[p+1]++ // ς → σ
   478  			}
   479  			c.unreadRune()
   480  			return true
   481  		}
   482  		// A case ignorable may also introduce a word break, so we may need
   483  		// to continue searching even after detecting a break.
   484  		isMid := c.info.isMid()
   485  		if (wasMid && isMid) || c.info.isBreak() {
   486  			c.isMidWord = false
   487  		}
   488  		wasMid = isMid
   489  		c.copy()
   490  	}
        	// More than maxIgnorable case-ignorables were seen: stop looking ahead
        	// and keep the final form ς that was written above.
   491  	return true
   492  }
   493  
   494  // finalSigmaSpan would be the same as isLower.
   495  
   496  // elUpper implements Greek upper casing, which entails removing a predefined
   497  // set of non-blocked modifiers. Note that these accents should not be removed
   498  // for title casing!
   499  // Example: "Οδός" -> "ΟΔΟΣ".
   500  func elUpper(c *context) bool {
   501  	// From CLDR:
   502  	// [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Above:]]*? { [\u0313\u0314\u0301\u0300\u0306\u0342\u0308\u0304] → ;
   503  	// [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*? { \u0345 → ;
   504  
        	// Decode the source rune and remember the output position before upper
        	// writes the mapped rune, so that the output can be rewritten below.
   505  	r, _ := utf8.DecodeRune(c.src[c.pSrc:])
   506  	oldPDst := c.pDst
   507  	if !upper(c) {
   508  		return false
   509  	}
        	// Only Greek runes get the modifier-stripping treatment.
   510  	if !unicode.Is(unicode.Greek, r) {
   511  		return true
   512  	}
        	// i counts the modifiers considered so far, including those that were
        	// part of the decomposition of the current rune.
   513  	i := 0
   514  	// Take the properties of the uppercased rune that is already written to the
   515  	// destination. This saves us the trouble of having to uppercase the
   516  	// decomposed rune again.
   517  	if b := norm.NFD.Properties(c.dst[oldPDst:]).Decomposition(); b != nil {
   518  		// Restore the destination position and process the decomposed rune.
   519  		r, sz := utf8.DecodeRune(b)
   520  		if r <= 0xFF { // See A.6.1
   521  			return true
   522  		}
   523  		c.pDst = oldPDst
   524  		// Insert the first rune and ignore the modifiers. See A.6.2.
   525  		c.writeBytes(b[:sz])
   526  		i = len(b[sz:]) / 2 // Greek modifiers are always of length 2.
   527  	}
   528  
        	// Look at the runes that follow in the source: drop the listed Greek
        	// modifiers, copy other non-starters, and stop at the first starter or at
        	// the first other modifier of class Above.
   529  	for ; i < maxIgnorable && c.next(); i++ {
   530  		switch r, _ := utf8.DecodeRune(c.src[c.pSrc:]); r {
   531  		// Above and Iota Subscript
   532  		case 0x0300, // U+0300 COMBINING GRAVE ACCENT
   533  			0x0301, // U+0301 COMBINING ACUTE ACCENT
   534  			0x0304, // U+0304 COMBINING MACRON
   535  			0x0306, // U+0306 COMBINING BREVE
   536  			0x0308, // U+0308 COMBINING DIAERESIS
   537  			0x0313, // U+0313 COMBINING COMMA ABOVE
   538  			0x0314, // U+0314 COMBINING REVERSED COMMA ABOVE
   539  			0x0342, // U+0342 COMBINING GREEK PERISPOMENI
   540  			0x0345: // U+0345 COMBINING GREEK YPOGEGRAMMENI
   541  			// No-op. Gobble the modifier.
   542  
   543  		default:
   544  			switch v, _ := trie.lookup(c.src[c.pSrc:]); info(v).cccType() {
   545  			case cccZero:
   546  				c.unreadRune()
   547  				return true
   548  
   549  			// We don't need to test for IotaSubscript as the only rune that
   550  			// qualifies (U+0345) was already excluded in the switch statement
   551  			// above. See A.4.
   552  
   553  			case cccAbove:
   554  				return c.copy()
   555  			default:
   556  				// Some other modifier. We're still allowed to gobble Greek
   557  				// modifiers after this.
   558  				c.copy()
   559  			}
   560  		}
   561  	}
        	// The loop ended either because the modifier limit was reached, in which
        	// case a checkpoint is possible, or because c.next returned false, in
        	// which case it is not.
   562  	return i == maxIgnorable
   563  }
   564  
   565  // TODO: implement elUpperSpan (low-priority: complex and infrequent).
   566  
   567  func ltLower(c *context) bool {
        	// ltLower implements Lithuanian lower casing: when a capital I or J
        	// (possibly as the base of a precomposed rune) is followed by a modifier
        	// of class Above, an explicit U+0307 COMBINING DOT ABOVE is inserted after
        	// the lowercased letter.
        	//
   568  	// From CLDR:
   569  	// # Introduce an explicit dot above when lowercasing capital I's and J's
   570  	// # whenever there are more accents above.
   571  	// # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
   572  	// # 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
   573  	// # 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
   574  	// # 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
   575  	// # 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
   576  	// # 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
   577  	// # 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
   578  	// ::NFD();
   579  	// I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307;
   580  	// J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307;
   581  	// I \u0328 (Į) } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0328 \u0307;
   582  	// I \u0300 (Ì) → i \u0307 \u0300;
   583  	// I \u0301 (Í) → i \u0307 \u0301;
   584  	// I \u0303 (Ĩ) → i \u0307 \u0303;
   585  	// ::Any-Lower();
   586  	// ::NFC();
   587  
        	// i counts the modifiers considered so far. It starts at 1 when the
        	// current rune already carries a modifier in its decomposition.
   588  	i := 0
   589  	if r := c.src[c.pSrc]; r < utf8.RuneSelf {
        		// ASCII: only I and J need the lookahead below.
   590  		lower(c)
   591  		if r != 'I' && r != 'J' {
   592  			return true
   593  		}
   594  	} else {
   595  		p := norm.NFD.Properties(c.src[c.pSrc:])
   596  		if d := p.Decomposition(); len(d) >= 3 && (d[0] == 'I' || d[0] == 'J') {
   597  			// UTF-8 optimization: the decomposition will only have an above
   598  			// modifier if the last rune of the decomposition is in [U+300-U+311].
   599  			// In all other cases, a decomposition starting with I is always
   600  			// an I followed by modifiers that are not cased themselves. See A.2.
   601  			if d[1] == 0xCC && d[2] <= 0x91 { // A.2.4.
   602  				if !c.writeBytes(d[:1]) {
   603  					return false
   604  				}
   605  				c.dst[c.pDst-1] += 'a' - 'A' // lower
   606  
   607  				// Assumption: modifier never changes on lowercase. See A.1.
   608  				// Assumption: all modifiers added have CCC = Above. See A.2.3.
   609  				return c.writeString("\u0307") && c.writeBytes(d[1:])
   610  			}
   611  			// In all other cases the additional modifiers will have a CCC
   612  			// that is less than 230 (Above). We will insert the U+0307, if
   613  			// needed, after these modifiers so that a string in FCD form
   614  			// will remain so. See A.2.2.
   615  			lower(c)
   616  			i = 1
   617  		} else {
        			// Not a precomposed I or J: plain lower casing.
   618  			return lower(c)
   619  		}
   620  	}
   621  
        	// Look ahead over the following modifiers: copy those that are not of
        	// class Above, insert U+0307 before the first one that is, and stop at
        	// the first starter.
   622  	for ; i < maxIgnorable && c.next(); i++ {
   623  		switch c.info.cccType() {
   624  		case cccZero:
   625  			c.unreadRune()
   626  			return true
   627  		case cccAbove:
   628  			return c.writeString("\u0307") && c.copy() // See A.1.
   629  		default:
   630  			c.copy() // See A.1.
   631  		}
   632  	}
        	// The loop ended either because the modifier limit was reached, in which
        	// case a checkpoint is possible, or because c.next returned false, in
        	// which case it is not.
   633  	return i == maxIgnorable
   634  }
   635  
   636  // ltLowerSpan would be the same as isLower.
   637  
   638  func ltUpper(f mapFunc) mapFunc {
   639  	return func(c *context) bool {
   640  		// Unicode:
   641  		// 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
   642  		//
   643  		// From CLDR:
   644  		// # Remove \u0307 following soft-dotteds (i, j, and the like), with possible
   645  		// # intervening non-230 marks.
   646  		// ::NFD();
   647  		// [:Soft_Dotted:] [^[:ccc=Not_Reordered:][:ccc=Above:]]* { \u0307 → ;
   648  		// ::Any-Upper();
   649  		// ::NFC();
   650  
   651  		// TODO: See A.5. A soft-dotted rune never has an exception. This would
   652  		// allow us to overload the exception bit and encode this property in
   653  		// info. Need to measure performance impact of this.
   654  		r, _ := utf8.DecodeRune(c.src[c.pSrc:])
   655  		oldPDst := c.pDst
   656  		if !f(c) {
   657  			return false
   658  		}
   659  		if !unicode.Is(unicode.Soft_Dotted, r) {
   660  			return true
   661  		}
   662  
   663  		// We don't need to do an NFD normalization, as a soft-dotted rune never
   664  		// contains U+0307. See A.3.
   665  
   666  		i := 0
   667  		for ; i < maxIgnorable && c.next(); i++ {
   668  			switch c.info.cccType() {
   669  			case cccZero:
   670  				c.unreadRune()
   671  				return true
   672  			case cccAbove:
   673  				if c.hasPrefix("\u0307") {
   674  					// We don't do a full NFC, but rather combine runes for
   675  					// some of the common cases. (Returning NFC or
   676  					// preserving normal form is neither a requirement nor
   677  					// a possibility anyway).
   678  					if !c.next() {
   679  						return false
   680  					}
   681  					if c.dst[oldPDst] == 'I' && c.pDst == oldPDst+1 && c.src[c.pSrc] == 0xcc {
   682  						s := ""
   683  						switch c.src[c.pSrc+1] {
   684  						case 0x80: // U+0300 COMBINING GRAVE ACCENT
   685  							s = "\u00cc" // U+00CC LATIN CAPITAL LETTER I WITH GRAVE
   686  						case 0x81: // U+0301 COMBINING ACUTE ACCENT
   687  							s = "\u00cd" // U+00CD LATIN CAPITAL LETTER I WITH ACUTE
   688  						case 0x83: // U+0303 COMBINING TILDE
   689  							s = "\u0128" // U+0128 LATIN CAPITAL LETTER I WITH TILDE
   690  						case 0x88: // U+0308 COMBINING DIAERESIS
   691  							s = "\u00cf" // U+00CF LATIN CAPITAL LETTER I WITH DIAERESIS
   692  						default:
   693  						}
   694  						if s != "" {
   695  							c.pDst = oldPDst
   696  							return c.writeString(s)
   697  						}
   698  					}
   699  				}
   700  				return c.copy()
   701  			default:
   702  				c.copy()
   703  			}
   704  		}
   705  		return i == maxIgnorable
   706  	}
   707  }
   708  
   709  // TODO: implement ltUpperSpan (low priority: complex and infrequent).
   710  
   711  func aztrUpper(f mapFunc) mapFunc {
   712  	return func(c *context) bool {
   713  		// i→İ;
   714  		if c.src[c.pSrc] == 'i' {
   715  			return c.writeString("İ")
   716  		}
   717  		return f(c)
   718  	}
   719  }
   720  
   721  func aztrLower(c *context) (done bool) {
        	// aztrLower implements Turkish and Azeri lower casing: İ becomes i, a
        	// capital I followed (possibly after other modifiers) by U+0307 becomes i
        	// with the U+0307 dropped, and any other capital I becomes a dotless ı.
        	// All other runes are mapped with lower.
        	//
   722  	// From CLDR:
   723  	// # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
   724  	// # 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
   725  	// İ→i;
   726  	// # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
   727  	// # This matches the behavior of the canonically equivalent I-dot_above
   728  	// # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
   729  	// # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
   730  	// # 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
   731  	// I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)\u0307 → i$1 ;
   732  	// I→ı ;
   733  	// ::Any-Lower();
   734  	if c.hasPrefix("\u0130") { // İ
   735  		return c.writeString("i")
   736  	}
   737  	if c.src[c.pSrc] != 'I' {
   738  		return lower(c)
   739  	}
   740  
   741  	// We ignore the lower-case I for now, but insert it later when we know
   742  	// which form we need.
   743  	start := c.pSrc + c.sz
   744  
   745  	i := 0
   746  Loop:
   747  	// We check for up to n ignorables before \u0307. As \u0307 is an
   748  	// ignorable as well, n is maxIgnorable-1.
        	// NOTE(review): the loop below is bounded by maxIgnorable, not
        	// maxIgnorable-1; confirm which one is intended.
   749  	for ; i < maxIgnorable && c.next(); i++ {
   750  		switch c.info.cccType() {
   751  		case cccAbove:
   752  			if c.hasPrefix("\u0307") {
   753  				return c.writeString("i") && c.writeBytes(c.src[start:c.pSrc]) // ignore U+0307
   754  			}
   755  			done = true
   756  			break Loop
   757  		case cccZero:
   758  			c.unreadRune()
   759  			done = true
   760  			break Loop
   761  		default:
   762  			// We'll write this rune after we know which starter to use.
   763  		}
   764  	}
   765  	if i == maxIgnorable {
   766  		done = true
   767  	}
        	// No U+0307 was found: write the dotless ı followed by the source bytes
        	// that were skipped over, up to and including the current rune. done is
        	// false only when the loop ended because c.next returned false.
   768  	return c.writeString("ı") && c.writeBytes(c.src[start:c.pSrc+c.sz]) && done
   769  }
   770  
   771  // aztrLowerSpan would be the same as isLower.
   772  
   773  func nlTitle(c *context) bool {
   774  	// From CLDR:
   775  	// # Special titlecasing for Dutch initial "ij".
   776  	// ::Any-Title();
   777  	// # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29)
   778  	// [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ;
   779  	if c.src[c.pSrc] != 'I' && c.src[c.pSrc] != 'i' {
   780  		return title(c)
   781  	}
   782  
   783  	if !c.writeString("I") || !c.next() {
   784  		return false
   785  	}
   786  	if c.src[c.pSrc] == 'j' || c.src[c.pSrc] == 'J' {
   787  		return c.writeString("J")
   788  	}
   789  	c.unreadRune()
   790  	return true
   791  }
   792  
   793  func nlTitleSpan(c *context) bool {
   794  	// From CLDR:
   795  	// # Special titlecasing for Dutch initial "ij".
   796  	// ::Any-Title();
   797  	// # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29)
   798  	// [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ;
   799  	if c.src[c.pSrc] != 'I' {
   800  		return isTitle(c)
   801  	}
   802  	if !c.next() || c.src[c.pSrc] == 'j' {
   803  		return false
   804  	}
   805  	if c.src[c.pSrc] != 'J' {
   806  		c.unreadRune()
   807  	}
   808  	return true
   809  }
   810  
   811  // Not part of CLDR, but see https://unicode.org/cldr/trac/ticket/7078.
   812  func afnlRewrite(c *context) {
   813  	if c.hasPrefix("'") || c.hasPrefix("’") {
   814  		c.isMidWord = true
   815  	}
   816  }
   817  

View as plain text