...

Source file src/golang.org/x/text/unicode/bidi/bracket.go

Documentation: golang.org/x/text/unicode/bidi

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package bidi
     6  
     7  import (
     8  	"container/list"
     9  	"fmt"
    10  	"sort"
    11  )
    12  
    13  // This file contains a port of the reference implementation of the
    14  // Bidi Parentheses Algorithm:
    15  // https://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/BidiPBAReference.java
    16  //
    17  // The implementation in this file covers definitions BD14-BD16 and rule N0
    18  // of UAX#9.
    19  //
    20  // Some preprocessing is done for each rune before data is passed to this
    21  // algorithm:
    22  //  - opening and closing brackets are identified
    23  //  - a bracket pair type, like '(' and ')' is assigned a unique identifier that
    24  //    is identical for the opening and closing bracket. It is left to do these
    25  //    mappings.
    26  //  - The BPA algorithm requires that bracket characters that are canonical
    27  //    equivalents of each other be able to be substituted for each other.
    28  //    It is the responsibility of the caller to do this canonicalization.
    29  //
    30  // In implementing BD16, this implementation departs slightly from the "logical"
    31  // algorithm defined in UAX#9. In particular, the stack referenced there
    32  // supports operations that go beyond a "basic" stack. An equivalent
    33  // implementation based on a linked list is used here.
    34  
    35  // Bidi_Paired_Bracket_Type
    36  // BD14. An opening paired bracket is a character whose
    37  // Bidi_Paired_Bracket_Type property value is Open.
    38  //
    39  // BD15. A closing paired bracket is a character whose
    40  // Bidi_Paired_Bracket_Type property value is Close.
    41  type bracketType byte
    42  
    43  const (
    44  	bpNone bracketType = iota
    45  	bpOpen
    46  	bpClose
    47  )
    48  
    49  // bracketPair holds a pair of index values for opening and closing bracket
    50  // location of a bracket pair.
    51  type bracketPair struct {
    52  	opener int
    53  	closer int
    54  }
    55  
    56  func (b *bracketPair) String() string {
    57  	return fmt.Sprintf("(%v, %v)", b.opener, b.closer)
    58  }
    59  
    60  // bracketPairs is a slice of bracketPairs with a sort.Interface implementation.
    61  type bracketPairs []bracketPair
    62  
    63  func (b bracketPairs) Len() int           { return len(b) }
    64  func (b bracketPairs) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }
    65  func (b bracketPairs) Less(i, j int) bool { return b[i].opener < b[j].opener }
    66  
    67  // resolvePairedBrackets runs the paired bracket part of the UBA algorithm.
    68  //
    69  // For each rune, it takes the indexes into the original string, the class the
    70  // bracket type (in pairTypes) and the bracket identifier (pairValues). It also
    71  // takes the direction type for the start-of-sentence and the embedding level.
    72  //
    73  // The identifiers for bracket types are the rune of the canonicalized opening
    74  // bracket for brackets (open or close) or 0 for runes that are not brackets.
    75  func resolvePairedBrackets(s *isolatingRunSequence) {
    76  	p := bracketPairer{
    77  		sos:              s.sos,
    78  		openers:          list.New(),
    79  		codesIsolatedRun: s.types,
    80  		indexes:          s.indexes,
    81  	}
    82  	dirEmbed := L
    83  	if s.level&1 != 0 {
    84  		dirEmbed = R
    85  	}
    86  	p.locateBrackets(s.p.pairTypes, s.p.pairValues)
    87  	p.resolveBrackets(dirEmbed, s.p.initialTypes)
    88  }
    89  
    90  type bracketPairer struct {
    91  	sos Class // direction corresponding to start of sequence
    92  
    93  	// The following is a restatement of BD 16 using non-algorithmic language.
    94  	//
    95  	// A bracket pair is a pair of characters consisting of an opening
    96  	// paired bracket and a closing paired bracket such that the
    97  	// Bidi_Paired_Bracket property value of the former equals the latter,
    98  	// subject to the following constraints.
    99  	// - both characters of a pair occur in the same isolating run sequence
   100  	// - the closing character of a pair follows the opening character
   101  	// - any bracket character can belong at most to one pair, the earliest possible one
   102  	// - any bracket character not part of a pair is treated like an ordinary character
   103  	// - pairs may nest properly, but their spans may not overlap otherwise
   104  
   105  	// Bracket characters with canonical decompositions are supposed to be
   106  	// treated as if they had been normalized, to allow normalized and non-
   107  	// normalized text to give the same result. In this implementation that step
   108  	// is pushed out to the caller. The caller has to ensure that the pairValue
   109  	// slices contain the rune of the opening bracket after normalization for
   110  	// any opening or closing bracket.
   111  
   112  	openers *list.List // list of positions for opening brackets
   113  
   114  	// bracket pair positions sorted by location of opening bracket
   115  	pairPositions bracketPairs
   116  
   117  	codesIsolatedRun []Class // directional bidi codes for an isolated run
   118  	indexes          []int   // array of index values into the original string
   119  
   120  }
   121  
   122  // matchOpener reports whether characters at given positions form a matching
   123  // bracket pair.
   124  func (p *bracketPairer) matchOpener(pairValues []rune, opener, closer int) bool {
   125  	return pairValues[p.indexes[opener]] == pairValues[p.indexes[closer]]
   126  }
   127  
   128  const maxPairingDepth = 63
   129  
   130  // locateBrackets locates matching bracket pairs according to BD16.
   131  //
   132  // This implementation uses a linked list instead of a stack, because, while
   133  // elements are added at the front (like a push) they are not generally removed
   134  // in atomic 'pop' operations, reducing the benefit of the stack archetype.
   135  func (p *bracketPairer) locateBrackets(pairTypes []bracketType, pairValues []rune) {
   136  	// traverse the run
   137  	// do that explicitly (not in a for-each) so we can record position
   138  	for i, index := range p.indexes {
   139  
   140  		// look at the bracket type for each character
   141  		if pairTypes[index] == bpNone || p.codesIsolatedRun[i] != ON {
   142  			// continue scanning
   143  			continue
   144  		}
   145  		switch pairTypes[index] {
   146  		case bpOpen:
   147  			// check if maximum pairing depth reached
   148  			if p.openers.Len() == maxPairingDepth {
   149  				p.openers.Init()
   150  				return
   151  			}
   152  			// remember opener location, most recent first
   153  			p.openers.PushFront(i)
   154  
   155  		case bpClose:
   156  			// see if there is a match
   157  			count := 0
   158  			for elem := p.openers.Front(); elem != nil; elem = elem.Next() {
   159  				count++
   160  				opener := elem.Value.(int)
   161  				if p.matchOpener(pairValues, opener, i) {
   162  					// if the opener matches, add nested pair to the ordered list
   163  					p.pairPositions = append(p.pairPositions, bracketPair{opener, i})
   164  					// remove up to and including matched opener
   165  					for ; count > 0; count-- {
   166  						p.openers.Remove(p.openers.Front())
   167  					}
   168  					break
   169  				}
   170  			}
   171  			sort.Sort(p.pairPositions)
   172  			// if we get here, the closing bracket matched no openers
   173  			// and gets ignored
   174  		}
   175  	}
   176  }
   177  
   178  // Bracket pairs within an isolating run sequence are processed as units so
   179  // that both the opening and the closing paired bracket in a pair resolve to
   180  // the same direction.
   181  //
   182  // N0. Process bracket pairs in an isolating run sequence sequentially in
   183  // the logical order of the text positions of the opening paired brackets
   184  // using the logic given below. Within this scope, bidirectional types EN
   185  // and AN are treated as R.
   186  //
   187  // Identify the bracket pairs in the current isolating run sequence
   188  // according to BD16. For each bracket-pair element in the list of pairs of
   189  // text positions:
   190  //
   191  // a Inspect the bidirectional types of the characters enclosed within the
   192  // bracket pair.
   193  //
   194  // b If any strong type (either L or R) matching the embedding direction is
   195  // found, set the type for both brackets in the pair to match the embedding
   196  // direction.
   197  //
   198  // o [ e ] o -> o e e e o
   199  //
   200  // o [ o e ] -> o e o e e
   201  //
   202  // o [ NI e ] -> o e NI e e
   203  //
   204  // c Otherwise, if a strong type (opposite the embedding direction) is
   205  // found, test for adjacent strong types as follows: 1 First, check
   206  // backwards before the opening paired bracket until the first strong type
   207  // (L, R, or sos) is found. If that first preceding strong type is opposite
   208  // the embedding direction, then set the type for both brackets in the pair
   209  // to that type. 2 Otherwise, set the type for both brackets in the pair to
   210  // the embedding direction.
   211  //
   212  // o [ o ] e -> o o o o e
   213  //
   214  // o [ o NI ] o -> o o o NI o o
   215  //
   216  // e [ o ] o -> e e o e o
   217  //
   218  // e [ o ] e -> e e o e e
   219  //
   220  // e ( o [ o ] NI ) e -> e e o o o o NI e e
   221  //
   222  // d Otherwise, do not set the type for the current bracket pair. Note that
   223  // if the enclosed text contains no strong types the paired brackets will
   224  // both resolve to the same level when resolved individually using rules N1
   225  // and N2.
   226  //
   227  // e ( NI ) o -> e ( NI ) o
   228  
   229  // getStrongTypeN0 maps character's directional code to strong type as required
   230  // by rule N0.
   231  //
   232  // TODO: have separate type for "strong" directionality.
   233  func (p *bracketPairer) getStrongTypeN0(index int) Class {
   234  	switch p.codesIsolatedRun[index] {
   235  	// in the scope of N0, number types are treated as R
   236  	case EN, AN, AL, R:
   237  		return R
   238  	case L:
   239  		return L
   240  	default:
   241  		return ON
   242  	}
   243  }
   244  
   245  // classifyPairContent reports the strong types contained inside a Bracket Pair,
   246  // assuming the given embedding direction.
   247  //
   248  // It returns ON if no strong type is found. If a single strong type is found,
   249  // it returns this type. Otherwise it returns the embedding direction.
   250  //
   251  // TODO: use separate type for "strong" directionality.
   252  func (p *bracketPairer) classifyPairContent(loc bracketPair, dirEmbed Class) Class {
   253  	dirOpposite := ON
   254  	for i := loc.opener + 1; i < loc.closer; i++ {
   255  		dir := p.getStrongTypeN0(i)
   256  		if dir == ON {
   257  			continue
   258  		}
   259  		if dir == dirEmbed {
   260  			return dir // type matching embedding direction found
   261  		}
   262  		dirOpposite = dir
   263  	}
   264  	// return ON if no strong type found, or class opposite to dirEmbed
   265  	return dirOpposite
   266  }
   267  
   268  // classBeforePair determines which strong types are present before a Bracket
   269  // Pair. Return R or L if strong type found, otherwise ON.
   270  func (p *bracketPairer) classBeforePair(loc bracketPair) Class {
   271  	for i := loc.opener - 1; i >= 0; i-- {
   272  		if dir := p.getStrongTypeN0(i); dir != ON {
   273  			return dir
   274  		}
   275  	}
   276  	// no strong types found, return sos
   277  	return p.sos
   278  }
   279  
   280  // assignBracketType implements rule N0 for a single bracket pair.
   281  func (p *bracketPairer) assignBracketType(loc bracketPair, dirEmbed Class, initialTypes []Class) {
   282  	// rule "N0, a", inspect contents of pair
   283  	dirPair := p.classifyPairContent(loc, dirEmbed)
   284  
   285  	// dirPair is now L, R, or N (no strong type found)
   286  
   287  	// the following logical tests are performed out of order compared to
   288  	// the statement of the rules but yield the same results
   289  	if dirPair == ON {
   290  		return // case "d" - nothing to do
   291  	}
   292  
   293  	if dirPair != dirEmbed {
   294  		// case "c": strong type found, opposite - check before (c.1)
   295  		dirPair = p.classBeforePair(loc)
   296  		if dirPair == dirEmbed || dirPair == ON {
   297  			// no strong opposite type found before - use embedding (c.2)
   298  			dirPair = dirEmbed
   299  		}
   300  	}
   301  	// else: case "b", strong type found matching embedding,
   302  	// no explicit action needed, as dirPair is already set to embedding
   303  	// direction
   304  
   305  	// set the bracket types to the type found
   306  	p.setBracketsToType(loc, dirPair, initialTypes)
   307  }
   308  
   309  func (p *bracketPairer) setBracketsToType(loc bracketPair, dirPair Class, initialTypes []Class) {
   310  	p.codesIsolatedRun[loc.opener] = dirPair
   311  	p.codesIsolatedRun[loc.closer] = dirPair
   312  
   313  	for i := loc.opener + 1; i < loc.closer; i++ {
   314  		index := p.indexes[i]
   315  		if initialTypes[index] != NSM {
   316  			break
   317  		}
   318  		p.codesIsolatedRun[i] = dirPair
   319  	}
   320  
   321  	for i := loc.closer + 1; i < len(p.indexes); i++ {
   322  		index := p.indexes[i]
   323  		if initialTypes[index] != NSM {
   324  			break
   325  		}
   326  		p.codesIsolatedRun[i] = dirPair
   327  	}
   328  }
   329  
   330  // resolveBrackets implements rule N0 for a list of pairs.
   331  func (p *bracketPairer) resolveBrackets(dirEmbed Class, initialTypes []Class) {
   332  	for _, loc := range p.pairPositions {
   333  		p.assignBracketType(loc, dirEmbed, initialTypes)
   334  	}
   335  }
   336  

View as plain text