...

Source file src/golang.org/x/text/internal/cldrtree/cldrtree.go

Documentation: golang.org/x/text/internal/cldrtree

     1  // Copyright 2017 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package cldrtree builds and generates a CLDR index file, including all
     6  // inheritance.
     7  package cldrtree
     8  
     9  //go:generate go test -gen
    10  
    11  // cldrtree stores CLDR data in a tree-like structure called Tree. In the CLDR
    12  // data each branch in the tree is indicated by either an element name or an
    13  // attribute value. A Tree does not distinguish between these two cases, but
    14  // rather assumes that all branches can be accessed by an enum with a compact
    15  // range of positive integer values starting from 0.
    16  //
    17  // Each Tree consists of three parts:
    18  //    - a slice mapping compact language identifiers to an offset into a set of
    19  //      indices,
    20  //    - a set of indices, stored as a large blob of uint16 values that encode
    21  //      the actual tree structure of data, and
    22  //    - a set of buckets that each holds a collection of strings.
    23  // each of which is explained in more detail below.
    24  //
    25  //
    26  // Tree lookup
    27  // A tree lookup is done by providing a locale and a "path", which is a
    28  // sequence of enum values. The search starts with getting the index for the
    29  // given locale and then incrementally jumping into the index using the path
    30  // values. If an element cannot be found in the index, the search starts anew
    31  // for the locale's parent locale. The path may change during lookup by means
    32  // of aliasing, described below.
    33  //
    34  // Buckets
    35  // Buckets hold the actual string data of the leaf values of the CLDR tree.
    36  // This data is stored in buckets, rather than one large string, for multiple
    37  // reasons:
    38  //   - it allows representing leaf values more compactly, by storing all leaf
    39  //     values in a single bucket and then needing only a uint16 to index
    40  //     into this bucket for all leaf values,
    41  //   - (TBD) allow multiple trees to share subsets of buckets, mostly to allow
    42  //     linking in a smaller amount of data if only a subset of the buckets is
    43  //     needed,
    44  //   - to be nice to go fmt and the compiler.
    45  //
    46  // indices
    47  // An index is a slice of uint16 for which the values are interpreted in one of
    48  // two ways: as a node or a set of leaf values.
    49  // A set of leaf values has the following form:
    50  //      <max_size>, <bucket>, <offset>...
    51  // max_size indicates the maximum enum value for which an offset is defined.
    52  // An offset value of 0xFFFF (missingValue) also indicates an undefined value.
    53  // If defined, an offset is the position of the string within the given bucket.
    54  // A node value has the following form:
    55  //      <max_size>, <offset_or_alias>...
    56  // max_size indicates the maximum value for which an offset is defined.
    57  // A missing offset may also be indicated with 0. If the high bit (0x8000, or
    58  // inheritMask) is not set, the offset points to the offset within the index
    59  // for the current locale.
    60  // An offset with high bit set is an alias. In this case the uint16 has the form
    61  //       bits:
    62  //         15: 1
    63  //      14-12: negative offset into path relative to current position
    64  //       0-11: new enum value for path element.
    65  // On encountering an alias, the path is modified accordingly and the lookup is
    66  // restarted for the given locale.
    67  
    68  import (
    69  	"fmt"
    70  	"reflect"
    71  	"regexp"
    72  	"strings"
    73  	"unicode/utf8"
    74  
    75  	"golang.org/x/text/internal/gen"
    76  	"golang.org/x/text/language"
    77  	"golang.org/x/text/unicode/cldr"
    78  )
    79  
    80  // TODO:
    81  // - allow two Trees to share the same set of buckets.
    82  
// A Builder allows storing CLDR data in compact form.
//
// A Builder is created with New. Locale adds a root Index per locale, and the
// Index methods fill in the tree and the string buckets.
type Builder struct {
	// table is not referenced by the code visible in this part of the file.
	table []string

	// rootMeta is created in New and is used as the metadata of the root
	// Index of every locale.
	rootMeta    *metaData
	// locales holds one entry per call to Locale, in call order.
	locales     []locale
	// strToBucket maps a length-prefixed string (as produced by makeString)
	// to the location where it was most recently stored.
	strToBucket map[string]stringInfo
	// buckets holds the string data. Each bucket is a concatenation of
	// length-prefixed strings.
	buckets     [][]byte
	// enums is not referenced by the code visible in this part of the file.
	enums       []*enum
	// err is the first error recorded through setError.
	err         error

	// Stats
	// size is the number of bytes appended to buckets so far.
	size        int
	// sizeAll is the number of bytes of all strings passed through
	// makeString, whether or not they ended up being stored again.
	sizeAll     int
	// bucketWaste is the number of bytes stored a second time because a
	// string that already existed was needed in a different bucket.
	bucketWaste int
}
    99  
const (
	// maxBucketSize is the size at which addString stops appending to the
	// last bucket and starts a new one. addStringToBucket does not consult it.
	maxBucketSize = 8 * 1024 // 8K
	// maxStrlen is the longest string, in bytes, that makeString accepts
	// without recording an error and truncating. The length is stored in a
	// single byte in front of the string.
	maxStrlen     = 254      // allow 0xFF sentinel
)
   104  
   105  func (b *Builder) setError(err error) {
   106  	if b.err == nil {
   107  		b.err = err
   108  	}
   109  }
   110  
   111  func (b *Builder) addString(data string) stringInfo {
   112  	data = b.makeString(data)
   113  	info, ok := b.strToBucket[data]
   114  	if !ok {
   115  		b.size += len(data)
   116  		x := len(b.buckets) - 1
   117  		bucket := b.buckets[x]
   118  		if len(bucket)+len(data) < maxBucketSize {
   119  			info.bucket = uint16(x)
   120  			info.bucketPos = uint16(len(bucket))
   121  			b.buckets[x] = append(bucket, data...)
   122  		} else {
   123  			info.bucket = uint16(len(b.buckets))
   124  			info.bucketPos = 0
   125  			b.buckets = append(b.buckets, []byte(data))
   126  		}
   127  		b.strToBucket[data] = info
   128  	}
   129  	return info
   130  }
   131  
   132  func (b *Builder) addStringToBucket(data string, bucket uint16) stringInfo {
   133  	data = b.makeString(data)
   134  	info, ok := b.strToBucket[data]
   135  	if !ok || info.bucket != bucket {
   136  		if ok {
   137  			b.bucketWaste += len(data)
   138  		}
   139  		b.size += len(data)
   140  		bk := b.buckets[bucket]
   141  		info.bucket = bucket
   142  		info.bucketPos = uint16(len(bk))
   143  		b.buckets[bucket] = append(bk, data...)
   144  		b.strToBucket[data] = info
   145  	}
   146  	return info
   147  }
   148  
   149  func (b *Builder) makeString(data string) string {
   150  	if len(data) > maxStrlen {
   151  		b.setError(fmt.Errorf("string %q exceeds maximum length of %d", data, maxStrlen))
   152  		data = data[:maxStrlen]
   153  		for i := len(data) - 1; i > len(data)-4; i-- {
   154  			if utf8.RuneStart(data[i]) {
   155  				data = data[:i]
   156  				break
   157  			}
   158  		}
   159  	}
   160  	data = string([]byte{byte(len(data))}) + data
   161  	b.sizeAll += len(data)
   162  	return data
   163  }
   164  
// stringInfo records where a length-prefixed string is stored.
type stringInfo struct {
	// bufferPos is not set by the code visible in this part of the file.
	bufferPos uint32
	// bucket is the index of the bucket in Builder.buckets.
	bucket    uint16
	// bucketPos is the byte offset within that bucket at which the string's
	// length byte is located.
	bucketPos uint16
}
   170  
   171  // New creates a new Builder.
   172  func New(tableName string) *Builder {
   173  	b := &Builder{
   174  		strToBucket: map[string]stringInfo{},
   175  		buckets:     [][]byte{nil}, // initialize with first bucket.
   176  	}
   177  	b.rootMeta = &metaData{
   178  		b:        b,
   179  		typeInfo: &typeInfo{},
   180  	}
   181  	return b
   182  }
   183  
   184  // Gen writes all the tables and types for the collected data.
   185  func (b *Builder) Gen(w *gen.CodeWriter) error {
   186  	t, err := build(b)
   187  	if err != nil {
   188  		return err
   189  	}
   190  	return generate(b, t, w)
   191  }
   192  
   193  // GenTestData generates tables useful for testing data generated with Gen.
   194  func (b *Builder) GenTestData(w *gen.CodeWriter) error {
   195  	return generateTestData(b, w)
   196  }
   197  
// locale pairs a language tag with the root Index that holds its data. One
// is appended to Builder.locales for every call to Builder.Locale.
type locale struct {
	tag  language.Tag
	root *Index
}
   202  
   203  // Locale creates an index for the given locale.
   204  func (b *Builder) Locale(t language.Tag) *Index {
   205  	index := &Index{
   206  		meta: b.rootMeta,
   207  	}
   208  	b.locales = append(b.locales, locale{tag: t, root: index})
   209  	return index
   210  }
   211  
// An Index holds a map of either leaf values or other indices.
//
// The two are mutually exclusive: creating a subindex panics if values have
// been set, and SetValue panics if a subindex exists.
type Index struct {
	// meta is the metadata for this position in the tree. It also gives
	// access to the owning Builder.
	meta *metaData

	// subIndex holds the child indices, in order of creation.
	subIndex []*Index
	// values holds the leaf values, in the order in which they were set.
	values   []keyValue
}
   219  
   220  func (i *Index) setError(err error) { i.meta.b.setError(err) }
   221  
// keyValue is a single leaf entry of an Index: the enum index of the key
// together with the location of the stored string.
type keyValue struct {
	key   enumIndex
	value stringInfo
}
   226  
// Element is a CLDR XML element. The methods of Index read an element's
// name, attributes, alias and data through the value returned by GetCommon.
type Element interface {
	GetCommon() *cldr.Common
}
   231  
   232  // Index creates a subindex where the type and enum values are not shared
   233  // with siblings by default. The name is derived from the elem. If elem is
   234  // an alias reference, the alias will be resolved and linked. If elem is nil
   235  // Index returns nil.
   236  func (i *Index) Index(elem Element, opt ...Option) *Index {
   237  	if elem == nil || reflect.ValueOf(elem).IsNil() {
   238  		return nil
   239  	}
   240  	c := elem.GetCommon()
   241  	o := &options{
   242  		parent: i,
   243  		name:   c.GetCommon().Element(),
   244  	}
   245  	o.fill(opt)
   246  	o.setAlias(elem)
   247  	return i.subIndexForKey(o)
   248  }
   249  
   250  // IndexWithName is like Section but derives the name from the given name.
   251  func (i *Index) IndexWithName(name string, opt ...Option) *Index {
   252  	o := &options{parent: i, name: name}
   253  	o.fill(opt)
   254  	return i.subIndexForKey(o)
   255  }
   256  
   257  // IndexFromType creates a subindex the value of tye type attribute as key. It
   258  // will also configure the Index to share the enumeration values with all
   259  // sibling values. If elem is an alias, it will be resolved and linked.
   260  func (i *Index) IndexFromType(elem Element, opts ...Option) *Index {
   261  	o := &options{
   262  		parent: i,
   263  		name:   elem.GetCommon().Type,
   264  	}
   265  	o.fill(opts)
   266  	o.setAlias(elem)
   267  	useSharedType()(o)
   268  	return i.subIndexForKey(o)
   269  }
   270  
   271  // IndexFromAlt creates a subindex the value of tye alt attribute as key. It
   272  // will also configure the Index to share the enumeration values with all
   273  // sibling values. If elem is an alias, it will be resolved and linked.
   274  func (i *Index) IndexFromAlt(elem Element, opts ...Option) *Index {
   275  	o := &options{
   276  		parent: i,
   277  		name:   elem.GetCommon().Alt,
   278  	}
   279  	o.fill(opts)
   280  	o.setAlias(elem)
   281  	useSharedType()(o)
   282  	return i.subIndexForKey(o)
   283  }
   284  
   285  func (i *Index) subIndexForKey(opts *options) *Index {
   286  	key := opts.name
   287  	if len(i.values) > 0 {
   288  		panic(fmt.Errorf("cldrtree: adding Index for %q when value already exists", key))
   289  	}
   290  	meta := i.meta.sub(key, opts)
   291  	for _, x := range i.subIndex {
   292  		if x.meta == meta {
   293  			return x
   294  		}
   295  	}
   296  	if alias := opts.alias; alias != nil {
   297  		if a := alias.GetCommon().Alias; a != nil {
   298  			if a.Source != "locale" {
   299  				i.setError(fmt.Errorf("cldrtree: non-locale alias not supported %v", a.Path))
   300  			}
   301  			if meta.inheritOffset < 0 {
   302  				i.setError(fmt.Errorf("cldrtree: alias was already set %v", a.Path))
   303  			}
   304  			path := a.Path
   305  			for ; strings.HasPrefix(path, "../"); path = path[len("../"):] {
   306  				meta.inheritOffset--
   307  			}
   308  			m := aliasRe.FindStringSubmatch(path)
   309  			if m == nil {
   310  				i.setError(fmt.Errorf("cldrtree: could not parse alias %q", a.Path))
   311  			} else {
   312  				key := m[4]
   313  				if key == "" {
   314  					key = m[1]
   315  				}
   316  				meta.inheritIndex = key
   317  			}
   318  		}
   319  	}
   320  	x := &Index{meta: meta}
   321  	i.subIndex = append(i.subIndex, x)
   322  	return x
   323  }
   324  
// aliasRe matches the first step at the start of an alias path: an element
// name consisting of ASCII letters (submatch 1), optionally followed by an
// attribute selector of the form [@attr='value'], where attr is submatch 3
// and value is submatch 4. Attribute names and values may contain ASCII
// letters and hyphens only.
var aliasRe = regexp.MustCompile(`^([a-zA-Z]+)(\[@([a-zA-Z-]+)='([a-zA-Z-]+)'\])?`)
   326  
   327  // SetValue sets the value, the data from a CLDR XML element, for the given key.
   328  func (i *Index) SetValue(key string, value Element, opt ...Option) {
   329  	if len(i.subIndex) > 0 {
   330  		panic(fmt.Errorf("adding value for key %q when index already exists", key))
   331  	}
   332  	o := &options{parent: i}
   333  	o.fill(opt)
   334  	c := value.GetCommon()
   335  	if c.Alias != nil {
   336  		i.setError(fmt.Errorf("cldrtree: alias not supported for SetValue %v", c.Alias.Path))
   337  	}
   338  	i.setValue(key, c.Data(), o)
   339  }
   340  
   341  func (i *Index) setValue(key, data string, o *options) {
   342  	index, _ := i.meta.typeInfo.lookupSubtype(key, o)
   343  	kv := keyValue{key: index}
   344  	if len(i.values) > 0 {
   345  		// Add string to the same bucket as the other values.
   346  		bucket := i.values[0].value.bucket
   347  		kv.value = i.meta.b.addStringToBucket(data, bucket)
   348  	} else {
   349  		kv.value = i.meta.b.addString(data)
   350  	}
   351  	i.values = append(i.values, kv)
   352  }
   353  

View as plain text