...

Source file src/golang.org/x/text/collate/tools/colcmp/icu.go

Documentation: golang.org/x/text/collate/tools/colcmp

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build icu
     6  
     7  package main
     8  
     9  /*
    10  #cgo LDFLAGS: -licui18n -licuuc
    11  #include <stdlib.h>
    12  #include <unicode/ucol.h>
    13  #include <unicode/uiter.h>
    14  #include <unicode/utypes.h>
    15  */
    16  import "C"
    17  import (
    18  	"fmt"
    19  	"log"
    20  	"unicode/utf16"
    21  	"unicode/utf8"
    22  	"unsafe"
    23  )
    24  
    25  func init() {
    26  	AddFactory(CollatorFactory{"icu", newUTF16,
    27  		"Main ICU collator, using native strings."})
    28  	AddFactory(CollatorFactory{"icu8", newUTF8iter,
    29  		"ICU collator using ICU iterators to process UTF8."})
    30  	AddFactory(CollatorFactory{"icu16", newUTF8conv,
    31  		"ICU collation by first converting UTF8 to UTF16."})
    32  }
    33  
    34  func icuCharP(s []byte) *C.char {
    35  	return (*C.char)(unsafe.Pointer(&s[0]))
    36  }
    37  
    38  func icuUInt8P(s []byte) *C.uint8_t {
    39  	return (*C.uint8_t)(unsafe.Pointer(&s[0]))
    40  }
    41  
    42  func icuUCharP(s []uint16) *C.UChar {
    43  	return (*C.UChar)(unsafe.Pointer(&s[0]))
    44  }
    45  func icuULen(s []uint16) C.int32_t {
    46  	return C.int32_t(len(s))
    47  }
    48  func icuSLen(s []byte) C.int32_t {
    49  	return C.int32_t(len(s))
    50  }
    51  
// icuCollator implements a Collator based on ICU.
// It holds the state shared by the concrete ICU collators in this file,
// which embed it.
type icuCollator struct {
	// loc is the locale name as a C string allocated in init and released
	// with C.free in Close.
	loc *C.char
	// col is the ICU collator handle opened in init and closed in Close.
	col *C.UCollator
	// keyBuf is the buffer sort keys are written into. Its length marks the
	// bytes already handed out; buf exposes the unused tail and extendBuf
	// claims part of it. When it runs out, a new buffer is allocated rather
	// than the old one being reused.
	keyBuf []byte
}
    58  
// growBufSize is the capacity, in bytes, of each key buffer allocated for
// icuCollator.keyBuf. A single sort key that does not fit in an empty buffer
// of this size is reported as a fatal error by extendBuf.
const growBufSize = 10 * 1024 * 1024
    60  
    61  func (c *icuCollator) init(locale string) error {
    62  	err := C.UErrorCode(0)
    63  	c.loc = C.CString(locale)
    64  	c.col = C.ucol_open(c.loc, &err)
    65  	if err > 0 {
    66  		return fmt.Errorf("failed opening collator for %q", locale)
    67  	} else if err < 0 {
    68  		loc := C.ucol_getLocaleByType(c.col, 0, &err)
    69  		fmt, ok := map[int]string{
    70  			-127: "warning: using default collator: %s",
    71  			-128: "warning: using fallback collator: %s",
    72  		}[int(err)]
    73  		if ok {
    74  			log.Printf(fmt, C.GoString(loc))
    75  		}
    76  	}
    77  	c.keyBuf = make([]byte, 0, growBufSize)
    78  	return nil
    79  }
    80  
    81  func (c *icuCollator) buf() (*C.uint8_t, C.int32_t) {
    82  	if len(c.keyBuf) == cap(c.keyBuf) {
    83  		c.keyBuf = make([]byte, 0, growBufSize)
    84  	}
    85  	b := c.keyBuf[len(c.keyBuf):cap(c.keyBuf)]
    86  	return icuUInt8P(b), icuSLen(b)
    87  }
    88  
    89  func (c *icuCollator) extendBuf(n C.int32_t) []byte {
    90  	end := len(c.keyBuf) + int(n)
    91  	if end > cap(c.keyBuf) {
    92  		if len(c.keyBuf) == 0 {
    93  			log.Fatalf("icuCollator: max string size exceeded: %v > %v", n, growBufSize)
    94  		}
    95  		c.keyBuf = make([]byte, 0, growBufSize)
    96  		return nil
    97  	}
    98  	b := c.keyBuf[len(c.keyBuf):end]
    99  	c.keyBuf = c.keyBuf[:end]
   100  	return b
   101  }
   102  
   103  func (c *icuCollator) Close() error {
   104  	C.ucol_close(c.col)
   105  	C.free(unsafe.Pointer(c.loc))
   106  	return nil
   107  }
   108  
// icuUTF16 implements the Collator interface.
// It passes the UTF16 form of each Input directly to ICU and embeds
// icuCollator for the collator handle and key buffer.
type icuUTF16 struct {
	icuCollator
}
   113  
   114  func newUTF16(locale string) (Collator, error) {
   115  	c := &icuUTF16{}
   116  	return c, c.init(locale)
   117  }
   118  
   119  func (c *icuUTF16) Compare(a, b Input) int {
   120  	return int(C.ucol_strcoll(c.col, icuUCharP(a.UTF16), icuULen(a.UTF16), icuUCharP(b.UTF16), icuULen(b.UTF16)))
   121  }
   122  
   123  func (c *icuUTF16) Key(s Input) []byte {
   124  	bp, bn := c.buf()
   125  	n := C.ucol_getSortKey(c.col, icuUCharP(s.UTF16), icuULen(s.UTF16), bp, bn)
   126  	if b := c.extendBuf(n); b != nil {
   127  		return b
   128  	}
   129  	return c.Key(s)
   130  }
   131  
// icuUTF8iter implements the Collator interface.
// This implementation wraps the UTF8 string in an iterator
// which is passed to the collator.
type icuUTF8iter struct {
	icuCollator
	// a and b are the ICU character iterators that Compare points at its two
	// inputs; Key uses only a.
	a, b C.UCharIterator
}
   139  
   140  func newUTF8iter(locale string) (Collator, error) {
   141  	c := &icuUTF8iter{}
   142  	return c, c.init(locale)
   143  }
   144  
   145  func (c *icuUTF8iter) Compare(a, b Input) int {
   146  	err := C.UErrorCode(0)
   147  	C.uiter_setUTF8(&c.a, icuCharP(a.UTF8), icuSLen(a.UTF8))
   148  	C.uiter_setUTF8(&c.b, icuCharP(b.UTF8), icuSLen(b.UTF8))
   149  	return int(C.ucol_strcollIter(c.col, &c.a, &c.b, &err))
   150  }
   151  
   152  func (c *icuUTF8iter) Key(s Input) []byte {
   153  	err := C.UErrorCode(0)
   154  	state := [2]C.uint32_t{}
   155  	C.uiter_setUTF8(&c.a, icuCharP(s.UTF8), icuSLen(s.UTF8))
   156  	bp, bn := c.buf()
   157  	n := C.ucol_nextSortKeyPart(c.col, &c.a, &(state[0]), bp, bn, &err)
   158  	if n >= bn {
   159  		// Force failure.
   160  		if c.extendBuf(n+1) != nil {
   161  			log.Fatal("expected extension to fail")
   162  		}
   163  		return c.Key(s)
   164  	}
   165  	return c.extendBuf(n)
   166  }
   167  
// icuUTF8conv implements the Collator interface.
// This implementation first converts the given UTF8 string
// to UTF16 and then calls the main ICU collation function.
type icuUTF8conv struct {
	icuCollator
}
   174  
   175  func newUTF8conv(locale string) (Collator, error) {
   176  	c := &icuUTF8conv{}
   177  	return c, c.init(locale)
   178  }
   179  
   180  func (c *icuUTF8conv) Compare(sa, sb Input) int {
   181  	a := encodeUTF16(sa.UTF8)
   182  	b := encodeUTF16(sb.UTF8)
   183  	return int(C.ucol_strcoll(c.col, icuUCharP(a), icuULen(a), icuUCharP(b), icuULen(b)))
   184  }
   185  
   186  func (c *icuUTF8conv) Key(s Input) []byte {
   187  	a := encodeUTF16(s.UTF8)
   188  	bp, bn := c.buf()
   189  	n := C.ucol_getSortKey(c.col, icuUCharP(a), icuULen(a), bp, bn)
   190  	if b := c.extendBuf(n); b != nil {
   191  		return b
   192  	}
   193  	return c.Key(s)
   194  }
   195  
   196  func encodeUTF16(b []byte) []uint16 {
   197  	a := []uint16{}
   198  	for len(b) > 0 {
   199  		r, sz := utf8.DecodeRune(b)
   200  		b = b[sz:]
   201  		r1, r2 := utf16.EncodeRune(r)
   202  		if r1 != 0xFFFD {
   203  			a = append(a, uint16(r1), uint16(r2))
   204  		} else {
   205  			a = append(a, uint16(r))
   206  		}
   207  	}
   208  	return a
   209  }
   210  

View as plain text