...

Source file src/golang.org/x/text/unicode/bidi/core.go

Documentation: golang.org/x/text/unicode/bidi

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package bidi
     6  
     7  import (
     8  	"fmt"
     9  	"log"
    10  )
    11  
    12  // This implementation is a port based on the reference implementation found at:
    13  // https://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/
    14  //
    15  // described in Unicode Bidirectional Algorithm (UAX #9).
    16  //
    17  // Input:
    18  // There are two levels of input to the algorithm, since clients may prefer to
    19  // supply some information from out-of-band sources rather than relying on the
    20  // default behavior.
    21  //
    22  // - Bidi class array
    23  // - Bidi class array, with externally supplied base line direction
    24  //
    25  // Output:
    26  // Output is separated into several stages:
    27  //
    28  //  - levels array over entire paragraph
    29  //  - reordering array over entire paragraph
    30  //  - levels array over line
    31  //  - reordering array over line
    32  //
    33  // Note that for conformance to the Unicode Bidirectional Algorithm,
    34  // implementations are only required to generate correct reordering and
    35  // character directionality (odd or even levels) over a line. Generating
    36  // identical level arrays over a line is not required. Bidi explicit format
    37  // codes (LRE, RLE, LRO, RLO, PDF) and BN can be assigned arbitrary levels and
    38  // positions as long as the rest of the input is properly reordered.
    39  //
    40  // As the algorithm is defined to operate on a single paragraph at a time, this
    41  // implementation is written to handle single paragraphs. Thus rule P1 is
    42  // presumed by this implementation-- the data provided to the implementation is
    43  // assumed to be a single paragraph, and either contains no 'B' codes, or a
    44  // single 'B' code at the end of the input. 'B' is allowed as input to
    45  // illustrate how the algorithm assigns it a level.
    46  //
    47  // Also note that rules L3 and L4 depend on the rendering engine that uses the
    48  // result of the bidi algorithm. This implementation assumes that the rendering
    49  // engine expects combining marks in visual order (e.g. to the left of their
    50  // base character in RTL runs) and that it adjusts the glyphs used to render
    51  // mirrored characters that are in RTL runs so that they render appropriately.
    52  
// level is the embedding level of a character. Even embedding levels indicate
// left-to-right order and odd levels indicate right-to-left order. The special
// level of -1 is reserved for undefined order.
type level int8

// implicitLevel is the "not yet determined" sentinel. A paragraph whose
// embedding level equals implicitLevel has its level derived from the text
// (rules P2, P3) before any other processing, and the level-run scan uses it
// as the "no run started yet" marker.
const implicitLevel level = -1
    59  
    60  // in returns if x is equal to any of the values in set.
    61  func (c Class) in(set ...Class) bool {
    62  	for _, s := range set {
    63  		if c == s {
    64  			return true
    65  		}
    66  	}
    67  	return false
    68  }
    69  
// A paragraph contains the state of a paragraph: the caller-supplied input,
// the working arrays the rules operate on, and the isolate matching tables.
type paragraph struct {
	// initialTypes holds the Bidi class of every character as passed to
	// newParagraph (a private copy). The rules consult it whenever the
	// original, pre-override class of a character is needed.
	initialTypes []Class

	// Arrays of properties needed for paired bracket evaluation in N0
	pairTypes  []bracketType // paired Bracket types for paragraph
	pairValues []rune        // rune for opening bracket or pbOpen and pbClose; 0 for pbNone

	// embeddingLevel is the paragraph embedding level. When it is
	// implicitLevel on entry to run, it is replaced by the level derived
	// from the text.
	embeddingLevel level

	// Working arrays at the paragraph level, one entry per character.
	// resultTypes starts as a copy of the input classes and is rewritten as
	// rules are applied; resultLevels is allocated by run.
	resultTypes  []Class
	resultLevels []level

	// Index of matching PDI for isolate initiator characters. For other
	// characters, the value of matchingPDI will be set to -1. For isolate
	// initiators with no matching PDI, matchingPDI will be set to the length of
	// the input string.
	matchingPDI []int

	// Index of matching isolate initiator for PDI characters. For other
	// characters, and for PDIs with no matching isolate initiator, the value of
	// matchingIsolateInitiator will be set to -1.
	matchingIsolateInitiator []int
}
    95  
    96  // newParagraph initializes a paragraph. The user needs to supply a few arrays
    97  // corresponding to the preprocessed text input. The types correspond to the
    98  // Unicode BiDi classes for each rune. pairTypes indicates the bracket type for
    99  // each rune. pairValues provides a unique bracket class identifier for each
   100  // rune (suggested is the rune of the open bracket for opening and matching
   101  // close brackets, after normalization). The embedding levels are optional, but
   102  // may be supplied to encode embedding levels of styled text.
   103  func newParagraph(types []Class, pairTypes []bracketType, pairValues []rune, levels level) (*paragraph, error) {
   104  	var err error
   105  	if err = validateTypes(types); err != nil {
   106  		return nil, err
   107  	}
   108  	if err = validatePbTypes(pairTypes); err != nil {
   109  		return nil, err
   110  	}
   111  	if err = validatePbValues(pairValues, pairTypes); err != nil {
   112  		return nil, err
   113  	}
   114  	if err = validateParagraphEmbeddingLevel(levels); err != nil {
   115  		return nil, err
   116  	}
   117  
   118  	p := &paragraph{
   119  		initialTypes:   append([]Class(nil), types...),
   120  		embeddingLevel: levels,
   121  
   122  		pairTypes:  pairTypes,
   123  		pairValues: pairValues,
   124  
   125  		resultTypes: append([]Class(nil), types...),
   126  	}
   127  	p.run()
   128  	return p, nil
   129  }
   130  
   131  func (p *paragraph) Len() int { return len(p.initialTypes) }
   132  
   133  // The algorithm. Does not include line-based processing (Rules L1, L2).
   134  // These are applied later in the line-based phase of the algorithm.
   135  func (p *paragraph) run() {
   136  	p.determineMatchingIsolates()
   137  
   138  	// 1) determining the paragraph level
   139  	// Rule P1 is the requirement for entering this algorithm.
   140  	// Rules P2, P3.
   141  	// If no externally supplied paragraph embedding level, use default.
   142  	if p.embeddingLevel == implicitLevel {
   143  		p.embeddingLevel = p.determineParagraphEmbeddingLevel(0, p.Len())
   144  	}
   145  
   146  	// Initialize result levels to paragraph embedding level.
   147  	p.resultLevels = make([]level, p.Len())
   148  	setLevels(p.resultLevels, p.embeddingLevel)
   149  
   150  	// 2) Explicit levels and directions
   151  	// Rules X1-X8.
   152  	p.determineExplicitEmbeddingLevels()
   153  
   154  	// Rule X9.
   155  	// We do not remove the embeddings, the overrides, the PDFs, and the BNs
   156  	// from the string explicitly. But they are not copied into isolating run
   157  	// sequences when they are created, so they are removed for all
   158  	// practical purposes.
   159  
   160  	// Rule X10.
   161  	// Run remainder of algorithm one isolating run sequence at a time
   162  	for _, seq := range p.determineIsolatingRunSequences() {
   163  		// 3) resolving weak types
   164  		// Rules W1-W7.
   165  		seq.resolveWeakTypes()
   166  
   167  		// 4a) resolving paired brackets
   168  		// Rule N0
   169  		resolvePairedBrackets(seq)
   170  
   171  		// 4b) resolving neutral types
   172  		// Rules N1-N3.
   173  		seq.resolveNeutralTypes()
   174  
   175  		// 5) resolving implicit embedding levels
   176  		// Rules I1, I2.
   177  		seq.resolveImplicitLevels()
   178  
   179  		// Apply the computed levels and types
   180  		seq.applyLevelsAndTypes()
   181  	}
   182  
   183  	// Assign appropriate levels to 'hide' LREs, RLEs, LROs, RLOs, PDFs, and
   184  	// BNs. This is for convenience, so the resulting level array will have
   185  	// a value for every character.
   186  	p.assignLevelsToCharactersRemovedByX9()
   187  }
   188  
   189  // determineMatchingIsolates determines the matching PDI for each isolate
   190  // initiator and vice versa.
   191  //
   192  // Definition BD9.
   193  //
   194  // At the end of this function:
   195  //
   196  //   - The member variable matchingPDI is set to point to the index of the
   197  //     matching PDI character for each isolate initiator character. If there is
   198  //     no matching PDI, it is set to the length of the input text. For other
   199  //     characters, it is set to -1.
   200  //   - The member variable matchingIsolateInitiator is set to point to the
   201  //     index of the matching isolate initiator character for each PDI character.
   202  //     If there is no matching isolate initiator, or the character is not a PDI,
   203  //     it is set to -1.
   204  func (p *paragraph) determineMatchingIsolates() {
   205  	p.matchingPDI = make([]int, p.Len())
   206  	p.matchingIsolateInitiator = make([]int, p.Len())
   207  
   208  	for i := range p.matchingIsolateInitiator {
   209  		p.matchingIsolateInitiator[i] = -1
   210  	}
   211  
   212  	for i := range p.matchingPDI {
   213  		p.matchingPDI[i] = -1
   214  
   215  		if t := p.resultTypes[i]; t.in(LRI, RLI, FSI) {
   216  			depthCounter := 1
   217  			for j := i + 1; j < p.Len(); j++ {
   218  				if u := p.resultTypes[j]; u.in(LRI, RLI, FSI) {
   219  					depthCounter++
   220  				} else if u == PDI {
   221  					if depthCounter--; depthCounter == 0 {
   222  						p.matchingPDI[i] = j
   223  						p.matchingIsolateInitiator[j] = i
   224  						break
   225  					}
   226  				}
   227  			}
   228  			if p.matchingPDI[i] == -1 {
   229  				p.matchingPDI[i] = p.Len()
   230  			}
   231  		}
   232  	}
   233  }
   234  
   235  // determineParagraphEmbeddingLevel reports the resolved paragraph direction of
   236  // the substring limited by the given range [start, end).
   237  //
   238  // Determines the paragraph level based on rules P2, P3. This is also used
   239  // in rule X5c to find if an FSI should resolve to LRI or RLI.
   240  func (p *paragraph) determineParagraphEmbeddingLevel(start, end int) level {
   241  	var strongType Class = unknownClass
   242  
   243  	// Rule P2.
   244  	for i := start; i < end; i++ {
   245  		if t := p.resultTypes[i]; t.in(L, AL, R) {
   246  			strongType = t
   247  			break
   248  		} else if t.in(FSI, LRI, RLI) {
   249  			i = p.matchingPDI[i] // skip over to the matching PDI
   250  			if i > end {
   251  				log.Panic("assert (i <= end)")
   252  			}
   253  		}
   254  	}
   255  	// Rule P3.
   256  	switch strongType {
   257  	case unknownClass: // none found
   258  		// default embedding level when no strong types found is 0.
   259  		return 0
   260  	case L:
   261  		return 0
   262  	default: // AL, R
   263  		return 1
   264  	}
   265  }
   266  
   267  const maxDepth = 125
   268  
   269  // This stack will store the embedding levels and override and isolated
   270  // statuses
   271  type directionalStatusStack struct {
   272  	stackCounter        int
   273  	embeddingLevelStack [maxDepth + 1]level
   274  	overrideStatusStack [maxDepth + 1]Class
   275  	isolateStatusStack  [maxDepth + 1]bool
   276  }
   277  
   278  func (s *directionalStatusStack) empty()     { s.stackCounter = 0 }
   279  func (s *directionalStatusStack) pop()       { s.stackCounter-- }
   280  func (s *directionalStatusStack) depth() int { return s.stackCounter }
   281  
   282  func (s *directionalStatusStack) push(level level, overrideStatus Class, isolateStatus bool) {
   283  	s.embeddingLevelStack[s.stackCounter] = level
   284  	s.overrideStatusStack[s.stackCounter] = overrideStatus
   285  	s.isolateStatusStack[s.stackCounter] = isolateStatus
   286  	s.stackCounter++
   287  }
   288  
   289  func (s *directionalStatusStack) lastEmbeddingLevel() level {
   290  	return s.embeddingLevelStack[s.stackCounter-1]
   291  }
   292  
   293  func (s *directionalStatusStack) lastDirectionalOverrideStatus() Class {
   294  	return s.overrideStatusStack[s.stackCounter-1]
   295  }
   296  
   297  func (s *directionalStatusStack) lastDirectionalIsolateStatus() bool {
   298  	return s.isolateStatusStack[s.stackCounter-1]
   299  }
   300  
   301  // Determine explicit levels using rules X1 - X8
   302  func (p *paragraph) determineExplicitEmbeddingLevels() {
   303  	var stack directionalStatusStack
   304  	var overflowIsolateCount, overflowEmbeddingCount, validIsolateCount int
   305  
   306  	// Rule X1.
   307  	stack.push(p.embeddingLevel, ON, false)
   308  
   309  	for i, t := range p.resultTypes {
   310  		// Rules X2, X3, X4, X5, X5a, X5b, X5c
   311  		switch t {
   312  		case RLE, LRE, RLO, LRO, RLI, LRI, FSI:
   313  			isIsolate := t.in(RLI, LRI, FSI)
   314  			isRTL := t.in(RLE, RLO, RLI)
   315  
   316  			// override if this is an FSI that resolves to RLI
   317  			if t == FSI {
   318  				isRTL = (p.determineParagraphEmbeddingLevel(i+1, p.matchingPDI[i]) == 1)
   319  			}
   320  			if isIsolate {
   321  				p.resultLevels[i] = stack.lastEmbeddingLevel()
   322  				if stack.lastDirectionalOverrideStatus() != ON {
   323  					p.resultTypes[i] = stack.lastDirectionalOverrideStatus()
   324  				}
   325  			}
   326  
   327  			var newLevel level
   328  			if isRTL {
   329  				// least greater odd
   330  				newLevel = (stack.lastEmbeddingLevel() + 1) | 1
   331  			} else {
   332  				// least greater even
   333  				newLevel = (stack.lastEmbeddingLevel() + 2) &^ 1
   334  			}
   335  
   336  			if newLevel <= maxDepth && overflowIsolateCount == 0 && overflowEmbeddingCount == 0 {
   337  				if isIsolate {
   338  					validIsolateCount++
   339  				}
   340  				// Push new embedding level, override status, and isolated
   341  				// status.
   342  				// No check for valid stack counter, since the level check
   343  				// suffices.
   344  				switch t {
   345  				case LRO:
   346  					stack.push(newLevel, L, isIsolate)
   347  				case RLO:
   348  					stack.push(newLevel, R, isIsolate)
   349  				default:
   350  					stack.push(newLevel, ON, isIsolate)
   351  				}
   352  				// Not really part of the spec
   353  				if !isIsolate {
   354  					p.resultLevels[i] = newLevel
   355  				}
   356  			} else {
   357  				// This is an invalid explicit formatting character,
   358  				// so apply the "Otherwise" part of rules X2-X5b.
   359  				if isIsolate {
   360  					overflowIsolateCount++
   361  				} else { // !isIsolate
   362  					if overflowIsolateCount == 0 {
   363  						overflowEmbeddingCount++
   364  					}
   365  				}
   366  			}
   367  
   368  		// Rule X6a
   369  		case PDI:
   370  			if overflowIsolateCount > 0 {
   371  				overflowIsolateCount--
   372  			} else if validIsolateCount == 0 {
   373  				// do nothing
   374  			} else {
   375  				overflowEmbeddingCount = 0
   376  				for !stack.lastDirectionalIsolateStatus() {
   377  					stack.pop()
   378  				}
   379  				stack.pop()
   380  				validIsolateCount--
   381  			}
   382  			p.resultLevels[i] = stack.lastEmbeddingLevel()
   383  
   384  		// Rule X7
   385  		case PDF:
   386  			// Not really part of the spec
   387  			p.resultLevels[i] = stack.lastEmbeddingLevel()
   388  
   389  			if overflowIsolateCount > 0 {
   390  				// do nothing
   391  			} else if overflowEmbeddingCount > 0 {
   392  				overflowEmbeddingCount--
   393  			} else if !stack.lastDirectionalIsolateStatus() && stack.depth() >= 2 {
   394  				stack.pop()
   395  			}
   396  
   397  		case B: // paragraph separator.
   398  			// Rule X8.
   399  
   400  			// These values are reset for clarity, in this implementation B
   401  			// can only occur as the last code in the array.
   402  			stack.empty()
   403  			overflowIsolateCount = 0
   404  			overflowEmbeddingCount = 0
   405  			validIsolateCount = 0
   406  			p.resultLevels[i] = p.embeddingLevel
   407  
   408  		default:
   409  			p.resultLevels[i] = stack.lastEmbeddingLevel()
   410  			if stack.lastDirectionalOverrideStatus() != ON {
   411  				p.resultTypes[i] = stack.lastDirectionalOverrideStatus()
   412  			}
   413  		}
   414  	}
   415  }
   416  
   417  type isolatingRunSequence struct {
   418  	p *paragraph
   419  
   420  	indexes []int // indexes to the original string
   421  
   422  	types          []Class // type of each character using the index
   423  	resolvedLevels []level // resolved levels after application of rules
   424  	level          level
   425  	sos, eos       Class
   426  }
   427  
   428  func (i *isolatingRunSequence) Len() int { return len(i.indexes) }
   429  
   430  func maxLevel(a, b level) level {
   431  	if a > b {
   432  		return a
   433  	}
   434  	return b
   435  }
   436  
   437  // Rule X10, second bullet: Determine the start-of-sequence (sos) and end-of-sequence (eos) types,
   438  // either L or R, for each isolating run sequence.
   439  func (p *paragraph) isolatingRunSequence(indexes []int) *isolatingRunSequence {
   440  	length := len(indexes)
   441  	types := make([]Class, length)
   442  	for i, x := range indexes {
   443  		types[i] = p.resultTypes[x]
   444  	}
   445  
   446  	// assign level, sos and eos
   447  	prevChar := indexes[0] - 1
   448  	for prevChar >= 0 && isRemovedByX9(p.initialTypes[prevChar]) {
   449  		prevChar--
   450  	}
   451  	prevLevel := p.embeddingLevel
   452  	if prevChar >= 0 {
   453  		prevLevel = p.resultLevels[prevChar]
   454  	}
   455  
   456  	var succLevel level
   457  	lastType := types[length-1]
   458  	if lastType.in(LRI, RLI, FSI) {
   459  		succLevel = p.embeddingLevel
   460  	} else {
   461  		// the first character after the end of run sequence
   462  		limit := indexes[length-1] + 1
   463  		for ; limit < p.Len() && isRemovedByX9(p.initialTypes[limit]); limit++ {
   464  
   465  		}
   466  		succLevel = p.embeddingLevel
   467  		if limit < p.Len() {
   468  			succLevel = p.resultLevels[limit]
   469  		}
   470  	}
   471  	level := p.resultLevels[indexes[0]]
   472  	return &isolatingRunSequence{
   473  		p:       p,
   474  		indexes: indexes,
   475  		types:   types,
   476  		level:   level,
   477  		sos:     typeForLevel(maxLevel(prevLevel, level)),
   478  		eos:     typeForLevel(maxLevel(succLevel, level)),
   479  	}
   480  }
   481  
   482  // Resolving weak types Rules W1-W7.
   483  //
   484  // Note that some weak types (EN, AN) remain after this processing is
   485  // complete.
   486  func (s *isolatingRunSequence) resolveWeakTypes() {
   487  
   488  	// on entry, only these types remain
   489  	s.assertOnly(L, R, AL, EN, ES, ET, AN, CS, B, S, WS, ON, NSM, LRI, RLI, FSI, PDI)
   490  
   491  	// Rule W1.
   492  	// Changes all NSMs.
   493  	precedingCharacterType := s.sos
   494  	for i, t := range s.types {
   495  		if t == NSM {
   496  			s.types[i] = precedingCharacterType
   497  		} else {
   498  			// if t.in(LRI, RLI, FSI, PDI) {
   499  			// 	precedingCharacterType = ON
   500  			// }
   501  			precedingCharacterType = t
   502  		}
   503  	}
   504  
   505  	// Rule W2.
   506  	// EN does not change at the start of the run, because sos != AL.
   507  	for i, t := range s.types {
   508  		if t == EN {
   509  			for j := i - 1; j >= 0; j-- {
   510  				if t := s.types[j]; t.in(L, R, AL) {
   511  					if t == AL {
   512  						s.types[i] = AN
   513  					}
   514  					break
   515  				}
   516  			}
   517  		}
   518  	}
   519  
   520  	// Rule W3.
   521  	for i, t := range s.types {
   522  		if t == AL {
   523  			s.types[i] = R
   524  		}
   525  	}
   526  
   527  	// Rule W4.
   528  	// Since there must be values on both sides for this rule to have an
   529  	// effect, the scan skips the first and last value.
   530  	//
   531  	// Although the scan proceeds left to right, and changes the type
   532  	// values in a way that would appear to affect the computations
   533  	// later in the scan, there is actually no problem. A change in the
   534  	// current value can only affect the value to its immediate right,
   535  	// and only affect it if it is ES or CS. But the current value can
   536  	// only change if the value to its right is not ES or CS. Thus
   537  	// either the current value will not change, or its change will have
   538  	// no effect on the remainder of the analysis.
   539  
   540  	for i := 1; i < s.Len()-1; i++ {
   541  		t := s.types[i]
   542  		if t == ES || t == CS {
   543  			prevSepType := s.types[i-1]
   544  			succSepType := s.types[i+1]
   545  			if prevSepType == EN && succSepType == EN {
   546  				s.types[i] = EN
   547  			} else if s.types[i] == CS && prevSepType == AN && succSepType == AN {
   548  				s.types[i] = AN
   549  			}
   550  		}
   551  	}
   552  
   553  	// Rule W5.
   554  	for i, t := range s.types {
   555  		if t == ET {
   556  			// locate end of sequence
   557  			runStart := i
   558  			runEnd := s.findRunLimit(runStart, ET)
   559  
   560  			// check values at ends of sequence
   561  			t := s.sos
   562  			if runStart > 0 {
   563  				t = s.types[runStart-1]
   564  			}
   565  			if t != EN {
   566  				t = s.eos
   567  				if runEnd < len(s.types) {
   568  					t = s.types[runEnd]
   569  				}
   570  			}
   571  			if t == EN {
   572  				setTypes(s.types[runStart:runEnd], EN)
   573  			}
   574  			// continue at end of sequence
   575  			i = runEnd
   576  		}
   577  	}
   578  
   579  	// Rule W6.
   580  	for i, t := range s.types {
   581  		if t.in(ES, ET, CS) {
   582  			s.types[i] = ON
   583  		}
   584  	}
   585  
   586  	// Rule W7.
   587  	for i, t := range s.types {
   588  		if t == EN {
   589  			// set default if we reach start of run
   590  			prevStrongType := s.sos
   591  			for j := i - 1; j >= 0; j-- {
   592  				t = s.types[j]
   593  				if t == L || t == R { // AL's have been changed to R
   594  					prevStrongType = t
   595  					break
   596  				}
   597  			}
   598  			if prevStrongType == L {
   599  				s.types[i] = L
   600  			}
   601  		}
   602  	}
   603  }
   604  
   605  // 6) resolving neutral types Rules N1-N2.
   606  func (s *isolatingRunSequence) resolveNeutralTypes() {
   607  
   608  	// on entry, only these types can be in resultTypes
   609  	s.assertOnly(L, R, EN, AN, B, S, WS, ON, RLI, LRI, FSI, PDI)
   610  
   611  	for i, t := range s.types {
   612  		switch t {
   613  		case WS, ON, B, S, RLI, LRI, FSI, PDI:
   614  			// find bounds of run of neutrals
   615  			runStart := i
   616  			runEnd := s.findRunLimit(runStart, B, S, WS, ON, RLI, LRI, FSI, PDI)
   617  
   618  			// determine effective types at ends of run
   619  			var leadType, trailType Class
   620  
   621  			// Note that the character found can only be L, R, AN, or
   622  			// EN.
   623  			if runStart == 0 {
   624  				leadType = s.sos
   625  			} else {
   626  				leadType = s.types[runStart-1]
   627  				if leadType.in(AN, EN) {
   628  					leadType = R
   629  				}
   630  			}
   631  			if runEnd == len(s.types) {
   632  				trailType = s.eos
   633  			} else {
   634  				trailType = s.types[runEnd]
   635  				if trailType.in(AN, EN) {
   636  					trailType = R
   637  				}
   638  			}
   639  
   640  			var resolvedType Class
   641  			if leadType == trailType {
   642  				// Rule N1.
   643  				resolvedType = leadType
   644  			} else {
   645  				// Rule N2.
   646  				// Notice the embedding level of the run is used, not
   647  				// the paragraph embedding level.
   648  				resolvedType = typeForLevel(s.level)
   649  			}
   650  
   651  			setTypes(s.types[runStart:runEnd], resolvedType)
   652  
   653  			// skip over run of (former) neutrals
   654  			i = runEnd
   655  		}
   656  	}
   657  }
   658  
   659  func setLevels(levels []level, newLevel level) {
   660  	for i := range levels {
   661  		levels[i] = newLevel
   662  	}
   663  }
   664  
   665  func setTypes(types []Class, newType Class) {
   666  	for i := range types {
   667  		types[i] = newType
   668  	}
   669  }
   670  
   671  // 7) resolving implicit embedding levels Rules I1, I2.
   672  func (s *isolatingRunSequence) resolveImplicitLevels() {
   673  
   674  	// on entry, only these types can be in resultTypes
   675  	s.assertOnly(L, R, EN, AN)
   676  
   677  	s.resolvedLevels = make([]level, len(s.types))
   678  	setLevels(s.resolvedLevels, s.level)
   679  
   680  	if (s.level & 1) == 0 { // even level
   681  		for i, t := range s.types {
   682  			// Rule I1.
   683  			if t == L {
   684  				// no change
   685  			} else if t == R {
   686  				s.resolvedLevels[i] += 1
   687  			} else { // t == AN || t == EN
   688  				s.resolvedLevels[i] += 2
   689  			}
   690  		}
   691  	} else { // odd level
   692  		for i, t := range s.types {
   693  			// Rule I2.
   694  			if t == R {
   695  				// no change
   696  			} else { // t == L || t == AN || t == EN
   697  				s.resolvedLevels[i] += 1
   698  			}
   699  		}
   700  	}
   701  }
   702  
   703  // Applies the levels and types resolved in rules W1-I2 to the
   704  // resultLevels array.
   705  func (s *isolatingRunSequence) applyLevelsAndTypes() {
   706  	for i, x := range s.indexes {
   707  		s.p.resultTypes[x] = s.types[i]
   708  		s.p.resultLevels[x] = s.resolvedLevels[i]
   709  	}
   710  }
   711  
   712  // Return the limit of the run consisting only of the types in validSet
   713  // starting at index. This checks the value at index, and will return
   714  // index if that value is not in validSet.
   715  func (s *isolatingRunSequence) findRunLimit(index int, validSet ...Class) int {
   716  loop:
   717  	for ; index < len(s.types); index++ {
   718  		t := s.types[index]
   719  		for _, valid := range validSet {
   720  			if t == valid {
   721  				continue loop
   722  			}
   723  		}
   724  		return index // didn't find a match in validSet
   725  	}
   726  	return len(s.types)
   727  }
   728  
   729  // Algorithm validation. Assert that all values in types are in the
   730  // provided set.
   731  func (s *isolatingRunSequence) assertOnly(codes ...Class) {
   732  loop:
   733  	for i, t := range s.types {
   734  		for _, c := range codes {
   735  			if t == c {
   736  				continue loop
   737  			}
   738  		}
   739  		log.Panicf("invalid bidi code %v present in assertOnly at position %d", t, s.indexes[i])
   740  	}
   741  }
   742  
   743  // determineLevelRuns returns an array of level runs. Each level run is
   744  // described as an array of indexes into the input string.
   745  //
   746  // Determines the level runs. Rule X9 will be applied in determining the
   747  // runs, in the way that makes sure the characters that are supposed to be
   748  // removed are not included in the runs.
   749  func (p *paragraph) determineLevelRuns() [][]int {
   750  	run := []int{}
   751  	allRuns := [][]int{}
   752  	currentLevel := implicitLevel
   753  
   754  	for i := range p.initialTypes {
   755  		if !isRemovedByX9(p.initialTypes[i]) {
   756  			if p.resultLevels[i] != currentLevel {
   757  				// we just encountered a new run; wrap up last run
   758  				if currentLevel >= 0 { // only wrap it up if there was a run
   759  					allRuns = append(allRuns, run)
   760  					run = nil
   761  				}
   762  				// Start new run
   763  				currentLevel = p.resultLevels[i]
   764  			}
   765  			run = append(run, i)
   766  		}
   767  	}
   768  	// Wrap up the final run, if any
   769  	if len(run) > 0 {
   770  		allRuns = append(allRuns, run)
   771  	}
   772  	return allRuns
   773  }
   774  
   775  // Definition BD13. Determine isolating run sequences.
   776  func (p *paragraph) determineIsolatingRunSequences() []*isolatingRunSequence {
   777  	levelRuns := p.determineLevelRuns()
   778  
   779  	// Compute the run that each character belongs to
   780  	runForCharacter := make([]int, p.Len())
   781  	for i, run := range levelRuns {
   782  		for _, index := range run {
   783  			runForCharacter[index] = i
   784  		}
   785  	}
   786  
   787  	sequences := []*isolatingRunSequence{}
   788  
   789  	var currentRunSequence []int
   790  
   791  	for _, run := range levelRuns {
   792  		first := run[0]
   793  		if p.initialTypes[first] != PDI || p.matchingIsolateInitiator[first] == -1 {
   794  			currentRunSequence = nil
   795  			// int run = i;
   796  			for {
   797  				// Copy this level run into currentRunSequence
   798  				currentRunSequence = append(currentRunSequence, run...)
   799  
   800  				last := currentRunSequence[len(currentRunSequence)-1]
   801  				lastT := p.initialTypes[last]
   802  				if lastT.in(LRI, RLI, FSI) && p.matchingPDI[last] != p.Len() {
   803  					run = levelRuns[runForCharacter[p.matchingPDI[last]]]
   804  				} else {
   805  					break
   806  				}
   807  			}
   808  			sequences = append(sequences, p.isolatingRunSequence(currentRunSequence))
   809  		}
   810  	}
   811  	return sequences
   812  }
   813  
   814  // Assign level information to characters removed by rule X9. This is for
   815  // ease of relating the level information to the original input data. Note
   816  // that the levels assigned to these codes are arbitrary, they're chosen so
   817  // as to avoid breaking level runs.
   818  func (p *paragraph) assignLevelsToCharactersRemovedByX9() {
   819  	for i, t := range p.initialTypes {
   820  		if t.in(LRE, RLE, LRO, RLO, PDF, BN) {
   821  			p.resultTypes[i] = t
   822  			p.resultLevels[i] = -1
   823  		}
   824  	}
   825  	// now propagate forward the levels information (could have
   826  	// propagated backward, the main thing is not to introduce a level
   827  	// break where one doesn't already exist).
   828  
   829  	if p.resultLevels[0] == -1 {
   830  		p.resultLevels[0] = p.embeddingLevel
   831  	}
   832  	for i := 1; i < len(p.initialTypes); i++ {
   833  		if p.resultLevels[i] == -1 {
   834  			p.resultLevels[i] = p.resultLevels[i-1]
   835  		}
   836  	}
   837  	// Embedding information is for informational purposes only so need not be
   838  	// adjusted.
   839  }
   840  
   841  //
   842  // Output
   843  //
   844  
   845  // getLevels computes levels array breaking lines at offsets in linebreaks.
   846  // Rule L1.
   847  //
   848  // The linebreaks array must include at least one value. The values must be
   849  // in strictly increasing order (no duplicates) between 1 and the length of
   850  // the text, inclusive. The last value must be the length of the text.
   851  func (p *paragraph) getLevels(linebreaks []int) []level {
   852  	// Note that since the previous processing has removed all
   853  	// P, S, and WS values from resultTypes, the values referred to
   854  	// in these rules are the initial types, before any processing
   855  	// has been applied (including processing of overrides).
   856  	//
   857  	// This example implementation has reinserted explicit format codes
   858  	// and BN, in order that the levels array correspond to the
   859  	// initial text. Their final placement is not normative.
   860  	// These codes are treated like WS in this implementation,
   861  	// so they don't interrupt sequences of WS.
   862  
   863  	validateLineBreaks(linebreaks, p.Len())
   864  
   865  	result := append([]level(nil), p.resultLevels...)
   866  
   867  	// don't worry about linebreaks since if there is a break within
   868  	// a series of WS values preceding S, the linebreak itself
   869  	// causes the reset.
   870  	for i, t := range p.initialTypes {
   871  		if t.in(B, S) {
   872  			// Rule L1, clauses one and two.
   873  			result[i] = p.embeddingLevel
   874  
   875  			// Rule L1, clause three.
   876  			for j := i - 1; j >= 0; j-- {
   877  				if isWhitespace(p.initialTypes[j]) { // including format codes
   878  					result[j] = p.embeddingLevel
   879  				} else {
   880  					break
   881  				}
   882  			}
   883  		}
   884  	}
   885  
   886  	// Rule L1, clause four.
   887  	start := 0
   888  	for _, limit := range linebreaks {
   889  		for j := limit - 1; j >= start; j-- {
   890  			if isWhitespace(p.initialTypes[j]) { // including format codes
   891  				result[j] = p.embeddingLevel
   892  			} else {
   893  				break
   894  			}
   895  		}
   896  		start = limit
   897  	}
   898  
   899  	return result
   900  }
   901  
   902  // getReordering returns the reordering of lines from a visual index to a
   903  // logical index for line breaks at the given offsets.
   904  //
   905  // Lines are concatenated from left to right. So for example, the fifth
   906  // character from the left on the third line is
   907  //
   908  //	getReordering(linebreaks)[linebreaks[1] + 4]
   909  //
   910  // (linebreaks[1] is the position after the last character of the second
   911  // line, which is also the index of the first character on the third line,
   912  // and adding four gets the fifth character from the left).
   913  //
   914  // The linebreaks array must include at least one value. The values must be
   915  // in strictly increasing order (no duplicates) between 1 and the length of
   916  // the text, inclusive. The last value must be the length of the text.
   917  func (p *paragraph) getReordering(linebreaks []int) []int {
   918  	validateLineBreaks(linebreaks, p.Len())
   919  
   920  	return computeMultilineReordering(p.getLevels(linebreaks), linebreaks)
   921  }
   922  
   923  // Return multiline reordering array for a given level array. Reordering
   924  // does not occur across a line break.
   925  func computeMultilineReordering(levels []level, linebreaks []int) []int {
   926  	result := make([]int, len(levels))
   927  
   928  	start := 0
   929  	for _, limit := range linebreaks {
   930  		tempLevels := make([]level, limit-start)
   931  		copy(tempLevels, levels[start:])
   932  
   933  		for j, order := range computeReordering(tempLevels) {
   934  			result[start+j] = order + start
   935  		}
   936  		start = limit
   937  	}
   938  	return result
   939  }
   940  
   941  // Return reordering array for a given level array. This reorders a single
   942  // line. The reordering is a visual to logical map. For example, the
   943  // leftmost char is string.charAt(order[0]). Rule L2.
   944  func computeReordering(levels []level) []int {
   945  	result := make([]int, len(levels))
   946  	// initialize order
   947  	for i := range result {
   948  		result[i] = i
   949  	}
   950  
   951  	// locate highest level found on line.
   952  	// Note the rules say text, but no reordering across line bounds is
   953  	// performed, so this is sufficient.
   954  	highestLevel := level(0)
   955  	lowestOddLevel := level(maxDepth + 2)
   956  	for _, level := range levels {
   957  		if level > highestLevel {
   958  			highestLevel = level
   959  		}
   960  		if level&1 != 0 && level < lowestOddLevel {
   961  			lowestOddLevel = level
   962  		}
   963  	}
   964  
   965  	for level := highestLevel; level >= lowestOddLevel; level-- {
   966  		for i := 0; i < len(levels); i++ {
   967  			if levels[i] >= level {
   968  				// find range of text at or above this level
   969  				start := i
   970  				limit := i + 1
   971  				for limit < len(levels) && levels[limit] >= level {
   972  					limit++
   973  				}
   974  
   975  				for j, k := start, limit-1; j < k; j, k = j+1, k-1 {
   976  					result[j], result[k] = result[k], result[j]
   977  				}
   978  				// skip to end of level run
   979  				i = limit
   980  			}
   981  		}
   982  	}
   983  
   984  	return result
   985  }
   986  
   987  // isWhitespace reports whether the type is considered a whitespace type for the
   988  // line break rules.
   989  func isWhitespace(c Class) bool {
   990  	switch c {
   991  	case LRE, RLE, LRO, RLO, PDF, LRI, RLI, FSI, PDI, BN, WS:
   992  		return true
   993  	}
   994  	return false
   995  }
   996  
   997  // isRemovedByX9 reports whether the type is one of the types removed in X9.
   998  func isRemovedByX9(c Class) bool {
   999  	switch c {
  1000  	case LRE, RLE, LRO, RLO, PDF, BN:
  1001  		return true
  1002  	}
  1003  	return false
  1004  }
  1005  
  1006  // typeForLevel reports the strong type (L or R) corresponding to the level.
  1007  func typeForLevel(level level) Class {
  1008  	if (level & 0x1) == 0 {
  1009  		return L
  1010  	}
  1011  	return R
  1012  }
  1013  
  1014  func validateTypes(types []Class) error {
  1015  	if len(types) == 0 {
  1016  		return fmt.Errorf("types is null")
  1017  	}
  1018  	for i, t := range types[:len(types)-1] {
  1019  		if t == B {
  1020  			return fmt.Errorf("B type before end of paragraph at index: %d", i)
  1021  		}
  1022  	}
  1023  	return nil
  1024  }
  1025  
  1026  func validateParagraphEmbeddingLevel(embeddingLevel level) error {
  1027  	if embeddingLevel != implicitLevel &&
  1028  		embeddingLevel != 0 &&
  1029  		embeddingLevel != 1 {
  1030  		return fmt.Errorf("illegal paragraph embedding level: %d", embeddingLevel)
  1031  	}
  1032  	return nil
  1033  }
  1034  
  1035  func validateLineBreaks(linebreaks []int, textLength int) error {
  1036  	prev := 0
  1037  	for i, next := range linebreaks {
  1038  		if next <= prev {
  1039  			return fmt.Errorf("bad linebreak: %d at index: %d", next, i)
  1040  		}
  1041  		prev = next
  1042  	}
  1043  	if prev != textLength {
  1044  		return fmt.Errorf("last linebreak was %d, want %d", prev, textLength)
  1045  	}
  1046  	return nil
  1047  }
  1048  
  1049  func validatePbTypes(pairTypes []bracketType) error {
  1050  	if len(pairTypes) == 0 {
  1051  		return fmt.Errorf("pairTypes is null")
  1052  	}
  1053  	for i, pt := range pairTypes {
  1054  		switch pt {
  1055  		case bpNone, bpOpen, bpClose:
  1056  		default:
  1057  			return fmt.Errorf("illegal pairType value at %d: %v", i, pairTypes[i])
  1058  		}
  1059  	}
  1060  	return nil
  1061  }
  1062  
  1063  func validatePbValues(pairValues []rune, pairTypes []bracketType) error {
  1064  	if pairValues == nil {
  1065  		return fmt.Errorf("pairValues is null")
  1066  	}
  1067  	if len(pairTypes) != len(pairValues) {
  1068  		return fmt.Errorf("pairTypes is different length from pairValues")
  1069  	}
  1070  	return nil
  1071  }
  1072  

View as plain text