// Copyright 2016 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. //go:build icu package cases import ( "path" "strings" "testing" "golang.org/x/text/internal/testtext" "golang.org/x/text/language" "golang.org/x/text/unicode/norm" ) func TestICUConformance(t *testing.T) { // Build test set. input := []string{ "a.a a_a", "a\u05d0a", "\u05d0'a", "a\u03084a", "a\u0308a", "a3\u30a3a", "a\u303aa", "a_\u303a_a", "1_a..a", "1_a.a", "a..a.", "a--a-", "a-a-", "a\u200ba", "a\u200b\u200ba", "a\u00ad\u00ada", // Format "a\u00ada", "a''a", // SingleQuote "a'a", "a::a", // MidLetter "a:a", "a..a", // MidNumLet "a.a", "a;;a", // MidNum "a;a", "a__a", // ExtendNumlet "a_a", "ΟΣ''a", } add := func(x interface{}) { switch v := x.(type) { case string: input = append(input, v) case []string: for _, s := range v { input = append(input, s) } } } for _, tc := range testCases { add(tc.src) add(tc.lower) add(tc.upper) add(tc.title) } for _, tc := range bufferTests { add(tc.src) } for _, tc := range breakTest { add(strings.Replace(tc, "|", "", -1)) } for _, tc := range foldTestCases { add(tc) } // Compare ICU to Go. for _, c := range []string{"lower", "upper", "title", "fold"} { for _, tag := range []string{ "und", "af", "az", "el", "lt", "nl", "tr", } { for _, s := range input { if exclude(c, tag, s) { continue } testtext.Run(t, path.Join(c, tag, s), func(t *testing.T) { want := doICU(tag, c, s) got := doGo(tag, c, s) if norm.NFC.String(got) != norm.NFC.String(want) { t.Errorf("\n in %[3]q (%+[3]q)\n got %[1]q (%+[1]q)\n want %[2]q (%+[2]q)", got, want, s) } }) } } } } // exclude indicates if a string should be excluded from testing. func exclude(cm, tag, s string) bool { list := []struct{ cm, tags, pattern string }{ // TODO: Go does not handle certain esoteric breaks correctly. This will be // fixed once we have a real word break iterator. Alternatively, it // seems like we're not too far off from making it work, so we could // fix these last steps. But first verify that using a separate word // breaker does not hurt performance. {"title", "af nl", "a''a"}, {"", "", "א'a"}, // All the exclusions below seem to be issues with the ICU // implementation (at version 57) and thus are not marked as TODO. // ICU does not handle leading apostrophe for Dutch and // Afrikaans correctly. See https://unicode.org/cldr/trac/ticket/7078. {"title", "af nl", "'n"}, {"title", "af nl", "'N"}, // Go terminates the final sigma check after a fixed number of // ignorables have been found. This ensures that the algorithm can make // progress in a streaming scenario. {"lower title", "", "\u039f\u03a3...............................a"}, // This also applies to upper in Greek. // NOTE: we could fix the following two cases by adding state to elUpper // and aztrLower. However, considering a modifier to not belong to the // preceding letter after the maximum modifiers count is reached is // consistent with the behavior of unicode/norm. {"upper", "el", "\u03bf" + strings.Repeat("\u0321", 29) + "\u0313"}, {"lower", "az tr lt", "I" + strings.Repeat("\u0321", 30) + "\u0307\u0300"}, {"upper", "lt", "i" + strings.Repeat("\u0321", 30) + "\u0307\u0300"}, {"lower", "lt", "I" + strings.Repeat("\u0321", 30) + "\u0300"}, // ICU title case seems to erroneously removes \u0307 from an upper case // I unconditionally, instead of only when lowercasing. The ICU // transform algorithm transforms these cases consistently with our // implementation. {"title", "az tr", "\u0307"}, // The spec says to remove \u0307 after Soft-Dotted characters. ICU // transforms conform but ucasemap_utf8ToUpper does not. {"upper title", "lt", "i\u0307"}, {"upper title", "lt", "i" + strings.Repeat("\u0321", 29) + "\u0307\u0300"}, // Both Unicode and CLDR prescribe an extra explicit dot above after a // Soft_Dotted character if there are other modifiers. // ucasemap_utf8ToUpper does not do this; ICU transforms do. // The issue with ucasemap_utf8ToUpper seems to be that it does not // consider the modifiers that are part of composition in the evaluation // of More_Above. For instance, according to the More_Above rule for lt, // a dotted capital I (U+0130) becomes i\u0307\u0307 (an small i with // two additional dots). This seems odd, but is correct. ICU is // definitely not correct as it produces different results for different // normal forms. For instance, for an İ: // \u0130 (NFC) -> i\u0307 (incorrect) // I\u0307 (NFD) -> i\u0307\u0307 (correct) // We could argue that we should not add a \u0307 if there already is // one, but this may be hard to get correct and is not conform the // standard. {"lower title", "lt", "\u0130"}, {"lower title", "lt", "\u00cf"}, // We are conform ICU ucasemap_utf8ToUpper if we remove support for // elUpper. However, this is clearly not conform the spec. Moreover, the // ICU transforms _do_ implement this transform and produces results // consistent with our implementation. Note that we still prefer to use // ucasemap_utf8ToUpper instead of transforms as the latter have // inconsistencies in the word breaking algorithm. {"upper", "el", "\u0386"}, // GREEK CAPITAL LETTER ALPHA WITH TONOS {"upper", "el", "\u0389"}, // GREEK CAPITAL LETTER ETA WITH TONOS {"upper", "el", "\u038A"}, // GREEK CAPITAL LETTER IOTA WITH TONOS {"upper", "el", "\u0391"}, // GREEK CAPITAL LETTER ALPHA {"upper", "el", "\u0397"}, // GREEK CAPITAL LETTER ETA {"upper", "el", "\u0399"}, // GREEK CAPITAL LETTER IOTA {"upper", "el", "\u03AC"}, // GREEK SMALL LETTER ALPHA WITH TONOS {"upper", "el", "\u03AE"}, // GREEK SMALL LETTER ALPHA WITH ETA {"upper", "el", "\u03AF"}, // GREEK SMALL LETTER ALPHA WITH IOTA {"upper", "el", "\u03B1"}, // GREEK SMALL LETTER ALPHA {"upper", "el", "\u03B7"}, // GREEK SMALL LETTER ETA {"upper", "el", "\u03B9"}, // GREEK SMALL LETTER IOTA } for _, x := range list { if x.cm != "" && strings.Index(x.cm, cm) == -1 { continue } if x.tags != "" && strings.Index(x.tags, tag) == -1 { continue } if strings.Index(s, x.pattern) != -1 { return true } } return false } func doGo(tag, caser, input string) string { var c Caser t := language.MustParse(tag) switch caser { case "lower": c = Lower(t) case "upper": c = Upper(t) case "title": c = Title(t) case "fold": c = Fold() } return c.String(input) }