...

Source file src/golang.org/x/text/secure/precis/gen.go

Documentation: golang.org/x/text/secure/precis

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Unicode table generator.
     6  // Data read from the web.
     7  
     8  //go:build ignore
     9  
    10  package main
    11  
    12  import (
    13  	"flag"
    14  	"log"
    15  	"unicode"
    16  	"unicode/utf8"
    17  
    18  	"golang.org/x/text/internal/gen"
    19  	"golang.org/x/text/internal/triegen"
    20  	"golang.org/x/text/internal/ucd"
    21  	"golang.org/x/text/unicode/norm"
    22  	"golang.org/x/text/unicode/rangetable"
    23  )
    24  
    25  var outputFile = flag.String("output", "tables.go", "output file for generated tables; default tables.go")
    26  
    27  var assigned, disallowedRunes *unicode.RangeTable
    28  
    29  var runeCategory = map[rune]category{}
    30  
    31  var overrides = map[category]category{
    32  	viramaModifier: viramaJoinT,
    33  	greek:          greekJoinT,
    34  	hebrew:         hebrewJoinT,
    35  }
    36  
    37  func setCategory(r rune, cat category) {
    38  	if c, ok := runeCategory[r]; ok {
    39  		if override, ok := overrides[c]; cat == joiningT && ok {
    40  			cat = override
    41  		} else {
    42  			log.Fatalf("%U: multiple categories for rune (%v and %v)", r, c, cat)
    43  		}
    44  	}
    45  	runeCategory[r] = cat
    46  }
    47  
    48  func init() {
    49  	if numCategories > 1<<propShift {
    50  		log.Fatalf("Number of categories is %d; may at most be %d", numCategories, 1<<propShift)
    51  	}
    52  }
    53  
    54  func main() {
    55  	gen.Init()
    56  
    57  	// Load data
    58  	runes := []rune{}
    59  	// PrecisIgnorableProperties: https://tools.ietf.org/html/rfc7564#section-9.13
    60  	ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) {
    61  		if p.String(1) == "Default_Ignorable_Code_Point" {
    62  			runes = append(runes, p.Rune(0))
    63  		}
    64  	})
    65  	ucd.Parse(gen.OpenUCDFile("PropList.txt"), func(p *ucd.Parser) {
    66  		switch p.String(1) {
    67  		case "Noncharacter_Code_Point":
    68  			runes = append(runes, p.Rune(0))
    69  		}
    70  	})
    71  	// OldHangulJamo: https://tools.ietf.org/html/rfc5892#section-2.9
    72  	ucd.Parse(gen.OpenUCDFile("HangulSyllableType.txt"), func(p *ucd.Parser) {
    73  		switch p.String(1) {
    74  		case "L", "V", "T":
    75  			runes = append(runes, p.Rune(0))
    76  		}
    77  	})
    78  
    79  	disallowedRunes = rangetable.New(runes...)
    80  	assigned = rangetable.Assigned(unicode.Version)
    81  
    82  	// Load category data.
    83  	runeCategory['l'] = latinSmallL
    84  	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
    85  		const cccVirama = 9
    86  		if p.Int(ucd.CanonicalCombiningClass) == cccVirama {
    87  			setCategory(p.Rune(0), viramaModifier)
    88  		}
    89  	})
    90  	ucd.Parse(gen.OpenUCDFile("Scripts.txt"), func(p *ucd.Parser) {
    91  		switch p.String(1) {
    92  		case "Greek":
    93  			setCategory(p.Rune(0), greek)
    94  		case "Hebrew":
    95  			setCategory(p.Rune(0), hebrew)
    96  		case "Hiragana", "Katakana", "Han":
    97  			setCategory(p.Rune(0), japanese)
    98  		}
    99  	})
   100  
   101  	// Set the rule categories associated with exceptions. This overrides any
   102  	// previously set categories. The original categories are manually
   103  	// reintroduced in the categoryTransitions table.
   104  	for r, e := range exceptions {
   105  		if e.cat != 0 {
   106  			runeCategory[r] = e.cat
   107  		}
   108  	}
   109  	cat := map[string]category{
   110  		"L": joiningL,
   111  		"D": joiningD,
   112  		"T": joiningT,
   113  
   114  		"R": joiningR,
   115  	}
   116  	ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
   117  		switch v := p.String(1); v {
   118  		case "L", "D", "T", "R":
   119  			setCategory(p.Rune(0), cat[v])
   120  		}
   121  	})
   122  
   123  	writeTables()
   124  	gen.Repackage("gen_trieval.go", "trieval.go", "precis")
   125  }
   126  
   127  type exception struct {
   128  	prop property
   129  	cat  category
   130  }
   131  
   132  func init() {
   133  	// Programmatically add the Arabic and Indic digits to the exceptions map.
   134  	// See comment in the exceptions map below why these are marked disallowed.
   135  	for i := rune(0); i <= 9; i++ {
   136  		exceptions[0x0660+i] = exception{
   137  			prop: disallowed,
   138  			cat:  arabicIndicDigit,
   139  		}
   140  		exceptions[0x06F0+i] = exception{
   141  			prop: disallowed,
   142  			cat:  extendedArabicIndicDigit,
   143  		}
   144  	}
   145  }
   146  
   147  // The Exceptions class as defined in RFC 5892
   148  // https://tools.ietf.org/html/rfc5892#section-2.6
   149  var exceptions = map[rune]exception{
   150  	0x00DF: {prop: pValid},
   151  	0x03C2: {prop: pValid},
   152  	0x06FD: {prop: pValid},
   153  	0x06FE: {prop: pValid},
   154  	0x0F0B: {prop: pValid},
   155  	0x3007: {prop: pValid},
   156  
   157  	// ContextO|J rules are marked as disallowed, taking a "guilty until proven
   158  	// innocent" approach. The main reason for this is that the check for
   159  	// whether a context rule should be applied can be moved to the logic for
   160  	// handing disallowed runes, taken it off the common path. The exception to
   161  	// this rule is for katakanaMiddleDot, as the rule logic is handled without
   162  	// using a rule function.
   163  
   164  	// ContextJ (Join control)
   165  	0x200C: {prop: disallowed, cat: zeroWidthNonJoiner},
   166  	0x200D: {prop: disallowed, cat: zeroWidthJoiner},
   167  
   168  	// ContextO
   169  	0x00B7: {prop: disallowed, cat: middleDot},
   170  	0x0375: {prop: disallowed, cat: greekLowerNumeralSign},
   171  	0x05F3: {prop: disallowed, cat: hebrewPreceding}, // punctuation Geresh
   172  	0x05F4: {prop: disallowed, cat: hebrewPreceding}, // punctuation Gershayim
   173  	0x30FB: {prop: pValid, cat: katakanaMiddleDot},
   174  
   175  	// These are officially ContextO, but the implementation does not require
   176  	// special treatment of these, so we simply mark them as valid.
   177  	0x0660: {prop: pValid},
   178  	0x0661: {prop: pValid},
   179  	0x0662: {prop: pValid},
   180  	0x0663: {prop: pValid},
   181  	0x0664: {prop: pValid},
   182  	0x0665: {prop: pValid},
   183  	0x0666: {prop: pValid},
   184  	0x0667: {prop: pValid},
   185  	0x0668: {prop: pValid},
   186  	0x0669: {prop: pValid},
   187  	0x06F0: {prop: pValid},
   188  	0x06F1: {prop: pValid},
   189  	0x06F2: {prop: pValid},
   190  	0x06F3: {prop: pValid},
   191  	0x06F4: {prop: pValid},
   192  	0x06F5: {prop: pValid},
   193  	0x06F6: {prop: pValid},
   194  	0x06F7: {prop: pValid},
   195  	0x06F8: {prop: pValid},
   196  	0x06F9: {prop: pValid},
   197  
   198  	0x0640: {prop: disallowed},
   199  	0x07FA: {prop: disallowed},
   200  	0x302E: {prop: disallowed},
   201  	0x302F: {prop: disallowed},
   202  	0x3031: {prop: disallowed},
   203  	0x3032: {prop: disallowed},
   204  	0x3033: {prop: disallowed},
   205  	0x3034: {prop: disallowed},
   206  	0x3035: {prop: disallowed},
   207  	0x303B: {prop: disallowed},
   208  }
   209  
   210  // LetterDigits: https://tools.ietf.org/html/rfc5892#section-2.1
   211  // r in {Ll, Lu, Lo, Nd, Lm, Mn, Mc}.
   212  func isLetterDigits(r rune) bool {
   213  	return unicode.In(r,
   214  		unicode.Ll, unicode.Lu, unicode.Lm, unicode.Lo, // Letters
   215  		unicode.Mn, unicode.Mc, // Modifiers
   216  		unicode.Nd, // Digits
   217  	)
   218  }
   219  
   220  func isIdDisAndFreePVal(r rune) bool {
   221  	return unicode.In(r,
   222  		// OtherLetterDigits: https://tools.ietf.org/html/rfc7564#section-9.18
   223  		// r in {Lt, Nl, No, Me}
   224  		unicode.Lt, unicode.Nl, unicode.No, // Other letters / numbers
   225  		unicode.Me, // Modifiers
   226  
   227  		// Spaces: https://tools.ietf.org/html/rfc7564#section-9.14
   228  		// r in {Zs}
   229  		unicode.Zs,
   230  
   231  		// Symbols: https://tools.ietf.org/html/rfc7564#section-9.15
   232  		// r in {Sm, Sc, Sk, So}
   233  		unicode.Sm, unicode.Sc, unicode.Sk, unicode.So,
   234  
   235  		// Punctuation: https://tools.ietf.org/html/rfc7564#section-9.16
   236  		// r in {Pc, Pd, Ps, Pe, Pi, Pf, Po}
   237  		unicode.Pc, unicode.Pd, unicode.Ps, unicode.Pe,
   238  		unicode.Pi, unicode.Pf, unicode.Po,
   239  	)
   240  }
   241  
   242  // HasCompat: https://tools.ietf.org/html/rfc7564#section-9.17
   243  func hasCompat(r rune) bool {
   244  	return !norm.NFKC.IsNormalString(string(r))
   245  }
   246  
   247  // From https://tools.ietf.org/html/rfc5892:
   248  //
   249  // If .cp. .in. Exceptions Then Exceptions(cp);
   250  //   Else If .cp. .in. BackwardCompatible Then BackwardCompatible(cp);
   251  //   Else If .cp. .in. Unassigned Then UNASSIGNED;
   252  //   Else If .cp. .in. ASCII7 Then PVALID;
   253  //   Else If .cp. .in. JoinControl Then CONTEXTJ;
   254  //   Else If .cp. .in. OldHangulJamo Then DISALLOWED;
   255  //   Else If .cp. .in. PrecisIgnorableProperties Then DISALLOWED;
   256  //   Else If .cp. .in. Controls Then DISALLOWED;
   257  //   Else If .cp. .in. HasCompat Then ID_DIS or FREE_PVAL;
   258  //   Else If .cp. .in. LetterDigits Then PVALID;
   259  //   Else If .cp. .in. OtherLetterDigits Then ID_DIS or FREE_PVAL;
   260  //   Else If .cp. .in. Spaces Then ID_DIS or FREE_PVAL;
   261  //   Else If .cp. .in. Symbols Then ID_DIS or FREE_PVAL;
   262  //   Else If .cp. .in. Punctuation Then ID_DIS or FREE_PVAL;
   263  //   Else DISALLOWED;
   264  
   265  func writeTables() {
   266  	propTrie := triegen.NewTrie("derivedProperties")
   267  	w := gen.NewCodeWriter()
   268  	defer w.WriteVersionedGoFile(*outputFile, "precis")
   269  	gen.WriteUnicodeVersion(w)
   270  
   271  	// Iterate over all the runes...
   272  	for i := rune(0); i < unicode.MaxRune; i++ {
   273  		r := rune(i)
   274  
   275  		if !utf8.ValidRune(r) {
   276  			continue
   277  		}
   278  
   279  		e, ok := exceptions[i]
   280  		p := e.prop
   281  		switch {
   282  		case ok:
   283  		case !unicode.In(r, assigned):
   284  			p = unassigned
   285  		case r >= 0x0021 && r <= 0x007e: // Is ASCII 7
   286  			p = pValid
   287  		case unicode.In(r, disallowedRunes, unicode.Cc):
   288  			p = disallowed
   289  		case hasCompat(r):
   290  			p = idDisOrFreePVal
   291  		case isLetterDigits(r):
   292  			p = pValid
   293  		case isIdDisAndFreePVal(r):
   294  			p = idDisOrFreePVal
   295  		default:
   296  			p = disallowed
   297  		}
   298  		cat := runeCategory[r]
   299  		// Don't set category for runes that are disallowed.
   300  		if p == disallowed {
   301  			cat = exceptions[r].cat
   302  		}
   303  		propTrie.Insert(r, uint64(p)|uint64(cat))
   304  	}
   305  	sz, err := propTrie.Gen(w)
   306  	if err != nil {
   307  		log.Fatal(err)
   308  	}
   309  	w.Size += sz
   310  }
   311  

View as plain text