...

Source file src/golang.org/x/text/width/width.go

Documentation: golang.org/x/text/width

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:generate stringer -type=Kind
     6  //go:generate go run gen.go gen_common.go gen_trieval.go
     7  
     8  // Package width provides functionality for handling different widths in text.
     9  //
    10  // Wide characters behave like ideographs; they tend to allow line breaks after
    11  // each character and remain upright in vertical text layout. Narrow characters
    12  // are kept together in words or runs that are rotated sideways in vertical text
    13  // layout.
    14  //
    15  // For more information, see https://unicode.org/reports/tr11/.
    16  package width // import "golang.org/x/text/width"
    17  
    18  import (
    19  	"unicode/utf8"
    20  
    21  	"golang.org/x/text/transform"
    22  )
    23  
    24  // TODO
    25  // 1) Reduce table size by compressing blocks.
    26  // 2) API proposition for computing display length
    27  //    (approximation, fixed pitch only).
    28  // 3) Implement display length.
    29  
// Kind indicates the type of width property as defined in https://unicode.org/reports/tr11/.
//
// The possible values are the constants declared directly below; the zero
// value is Neutral.
type Kind int
    32  
const (
	// Neutral characters do not occur in legacy East Asian character sets.
	Neutral Kind = iota

	// EastAsianAmbiguous characters can be sometimes wide and sometimes
	// narrow and require additional information not contained in the character
	// code to further resolve their width.
	EastAsianAmbiguous

	// EastAsianWide characters are wide in their usual form. They occur only
	// in the context of East Asian typography. These runes may have explicit
	// halfwidth counterparts.
	EastAsianWide

	// EastAsianNarrow characters are narrow in their usual form. They often
	// have fullwidth counterparts.
	EastAsianNarrow

	// Note: there exist Narrow runes that do not have fullwidth or wide
	// counterparts, despite what the definition says (e.g. U+27E6).

	// EastAsianFullwidth characters have a compatibility decomposition of type
	// wide that maps to a narrow counterpart.
	EastAsianFullwidth

	// EastAsianHalfwidth characters have a compatibility decomposition of type
	// narrow that maps to a wide or ambiguous counterpart, plus U+20A9 ₩ WON
	// SIGN.
	EastAsianHalfwidth

	// Note: there exist runes that have halfwidth counterparts but that are
	// classified as Ambiguous, rather than wide (e.g. U+2190).
)
    66  
    67  // TODO: the generated tries need to return size 1 for invalid runes for the
    68  // width to be computed correctly (each byte should render width 1)
    69  
// trie is the package-level table consulted by Lookup, LookupString and
// LookupRune to obtain the width value of a rune.
// NOTE(review): newWidthTrie is not defined in this file; presumably it comes
// from the generated tables (see the go:generate directives) — confirm.
var trie = newWidthTrie(0)
    71  
    72  // Lookup reports the Properties of the first rune in b and the number of bytes
    73  // of its UTF-8 encoding.
    74  func Lookup(b []byte) (p Properties, size int) {
    75  	v, sz := trie.lookup(b)
    76  	return Properties{elem(v), b[sz-1]}, sz
    77  }
    78  
    79  // LookupString reports the Properties of the first rune in s and the number of
    80  // bytes of its UTF-8 encoding.
    81  func LookupString(s string) (p Properties, size int) {
    82  	v, sz := trie.lookupString(s)
    83  	return Properties{elem(v), s[sz-1]}, sz
    84  }
    85  
    86  // LookupRune reports the Properties of rune r.
    87  func LookupRune(r rune) Properties {
    88  	var buf [4]byte
    89  	n := utf8.EncodeRune(buf[:], r)
    90  	v, _ := trie.lookup(buf[:n])
    91  	last := byte(r)
    92  	if r >= utf8.RuneSelf {
    93  		last = 0x80 + byte(r&0x3f)
    94  	}
    95  	return Properties{elem(v), last}
    96  }
    97  
// Properties provides access to width properties of a rune.
type Properties struct {
	// elem is the value obtained from the width trie for the rune.
	elem elem
	// last is the final byte of the rune's UTF-8 encoding. Folded, Narrow and
	// Wide XOR it into an inverseData entry to reconstruct the mapped rune.
	last byte
}
   103  
   104  func (e elem) kind() Kind {
   105  	return Kind(e >> typeShift)
   106  }
   107  
   108  // Kind returns the Kind of a rune as defined in Unicode TR #11.
   109  // See https://unicode.org/reports/tr11/ for more details.
   110  func (p Properties) Kind() Kind {
   111  	return p.elem.kind()
   112  }
   113  
   114  // Folded returns the folded variant of a rune or 0 if the rune is canonical.
   115  func (p Properties) Folded() rune {
   116  	if p.elem&tagNeedsFold != 0 {
   117  		buf := inverseData[byte(p.elem)]
   118  		buf[buf[0]] ^= p.last
   119  		r, _ := utf8.DecodeRune(buf[1 : 1+buf[0]])
   120  		return r
   121  	}
   122  	return 0
   123  }
   124  
   125  // Narrow returns the narrow variant of a rune or 0 if the rune is already
   126  // narrow or doesn't have a narrow variant.
   127  func (p Properties) Narrow() rune {
   128  	if k := p.elem.kind(); byte(p.elem) != 0 && (k == EastAsianFullwidth || k == EastAsianWide || k == EastAsianAmbiguous) {
   129  		buf := inverseData[byte(p.elem)]
   130  		buf[buf[0]] ^= p.last
   131  		r, _ := utf8.DecodeRune(buf[1 : 1+buf[0]])
   132  		return r
   133  	}
   134  	return 0
   135  }
   136  
   137  // Wide returns the wide variant of a rune or 0 if the rune is already
   138  // wide or doesn't have a wide variant.
   139  func (p Properties) Wide() rune {
   140  	if k := p.elem.kind(); byte(p.elem) != 0 && (k == EastAsianHalfwidth || k == EastAsianNarrow) {
   141  		buf := inverseData[byte(p.elem)]
   142  		buf[buf[0]] ^= p.last
   143  		r, _ := utf8.DecodeRune(buf[1 : 1+buf[0]])
   144  		return r
   145  	}
   146  	return 0
   147  }
   148  
   149  // TODO for Properties:
   150  // - Add Fullwidth/Halfwidth or Inverted methods for computing variants
   151  // mapping.
   152  // - Add width information (including information on non-spacing runes).
   153  
// Transformer implements the transform.Transformer interface and, through its
// Span method, the transform.SpanningTransformer interface.
type Transformer struct {
	// t is the wrapped transformer that Reset, Transform and Span delegate to.
	t transform.SpanningTransformer
}
   158  
   159  // Reset implements the transform.Transformer interface.
   160  func (t Transformer) Reset() { t.t.Reset() }
   161  
   162  // Transform implements the transform.Transformer interface.
   163  func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   164  	return t.t.Transform(dst, src, atEOF)
   165  }
   166  
   167  // Span implements the transform.SpanningTransformer interface.
   168  func (t Transformer) Span(src []byte, atEOF bool) (n int, err error) {
   169  	return t.t.Span(src, atEOF)
   170  }
   171  
   172  // Bytes returns a new byte slice with the result of applying t to b.
   173  func (t Transformer) Bytes(b []byte) []byte {
   174  	b, _, _ = transform.Bytes(t, b)
   175  	return b
   176  }
   177  
   178  // String returns a string with the result of applying t to s.
   179  func (t Transformer) String(s string) string {
   180  	s, _, _ = transform.String(t, s)
   181  	return s
   182  }
   183  
   184  var (
   185  	// Fold is a transform that maps all runes to their canonical width.
   186  	//
   187  	// Note that the NFKC and NFKD transforms in golang.org/x/text/unicode/norm
   188  	// provide a more generic folding mechanism.
   189  	Fold Transformer = Transformer{foldTransform{}}
   190  
   191  	// Widen is a transform that maps runes to their wide variant, if
   192  	// available.
   193  	Widen Transformer = Transformer{wideTransform{}}
   194  
   195  	// Narrow is a transform that maps runes to their narrow variant, if
   196  	// available.
   197  	Narrow Transformer = Transformer{narrowTransform{}}
   198  )
   199  
   200  // TODO: Consider the following options:
   201  // - Treat Ambiguous runes that have a halfwidth counterpart as wide, or some
   202  //   generalized variant of this.
   203  // - Consider a wide Won character to be the default width (or some generalized
   204  //   variant of this).
   205  // - Filter the set of characters that gets converted (the preferred approach is
   206  //   to allow applying filters to transforms).
   207  

View as plain text