...

Source file src/github.com/noirbizarre/gonja/tokens/lexer.go

Documentation: github.com/noirbizarre/gonja/tokens

     1  package tokens
     2  
     3  import (
     4  	"fmt"
     5  	// "encoding/json"
     6  	"regexp"
     7  	// "strconv"
     8  	"strings"
     9  	"unicode"
    10  	"unicode/utf8"
    11  
    12  	"github.com/noirbizarre/gonja/config"
    13  )
    14  
// rEOF is an arbitrary sentinel rune returned by next() when the input is
// exhausted; -1 can never collide with a rune decoded from valid input.
const rEOF = -1

// re_ENDRAW is a printf-style regexp template: a block-start marker followed
// by optional whitespace and an end-statement keyword (e.g. `{%\s*endraw`).
const re_ENDRAW = `%s\s*%s`

// escapedStrings maps backslash-escaped quote sequences to their literal
// form; used by unescape() when emitting String tokens.
var escapedStrings = map[string]string{
	`\"`: `"`,
	`\'`: `'`,
}
    23  
    24  // var pattern = regexp.MustCompile(`(?m)(?P<key>\w+):\s+(?P<value>\w+)$`)
    25  
// lexFn represents the state of the scanner as a function that returns the
// next state; a nil lexFn terminates Lexer.Run.
type lexFn func() lexFn
    29  
// Lexer holds the state of the scanner.
type Lexer struct {
	Input string // the string being scanned.
	Start int    // start position of this item.
	Pos   int    // current position in the input.
	Width int    // width of last rune read from input.
	Line  int    // current line in the input (incremented by next() on '\n').
	// Col is reset to 1 on '\n' but never incremented elsewhere in this file,
	// so Position() may report a stale column — NOTE(review): verify callers.
	Col           int
	Config        *config.Config // the lexer configuration.
	Tokens        chan *Token    // channel of scanned tokens.
	delimiters    []rune         // stack of expected closing delimiters: ')', '}', ']'.
	RawStatements rawStmt        // statements whose body is captured verbatim (raw, comment).
	rawEnd        *regexp.Regexp // closing pattern of the raw statement in progress; nil otherwise.
}

// rawStmt maps a statement keyword to the regexp that matches its closing
// statement (e.g. "raw" -> `{%\s*endraw`).
// TODO: set from env
type rawStmt map[string]*regexp.Regexp
    48  
    49  // NewLexer creates a new scanner for the input string.
    50  func NewLexer(input string) *Lexer {
    51  	cfg := config.DefaultConfig
    52  	return &Lexer{
    53  		Input:  input,
    54  		Tokens: make(chan *Token),
    55  		Config: cfg,
    56  		RawStatements: rawStmt{
    57  			"raw":     regexp.MustCompile(fmt.Sprintf(`%s\s*endraw`, cfg.BlockStartString)),
    58  			"comment": regexp.MustCompile(fmt.Sprintf(`%s\s*endcomment`, cfg.BlockStartString)),
    59  		},
    60  	}
    61  }
    62  
    63  func Lex(input string) *Stream {
    64  	l := NewLexer(input)
    65  	go l.Run()
    66  	return NewStream(l.Tokens)
    67  }
    68  
    69  // errorf returns an error token and terminates the scan
    70  // by passing back a nil pointer that will be the next
    71  // state, terminating Lexer.Run.
    72  func (l *Lexer) errorf(format string, args ...interface{}) lexFn {
    73  	l.Tokens <- &Token{
    74  		Type: Error,
    75  		Val:  fmt.Sprintf(format, args...),
    76  		Pos:  l.Pos,
    77  	}
    78  	return nil
    79  }
    80  
    81  // Position return the current position in the input
    82  func (l *Lexer) Position() *Position {
    83  	return &Position{
    84  		Offset: l.Pos,
    85  		Line:   l.Line,
    86  		Column: l.Col,
    87  	}
    88  }
    89  
// Current returns the text currently under scan: the input span between the
// start of the pending token and the scan position.
func (l *Lexer) Current() string {
	return l.Input[l.Start:l.Pos]
}
    93  
    94  // Run lexes the input by executing state functions until
    95  // the state is nil.
    96  func (l *Lexer) Run() {
    97  	for state := l.lexData; state != nil; {
    98  		state = state()
    99  	}
   100  	close(l.Tokens) // No more tokens will be delivered.
   101  }
   102  
   103  // next returns the next rune in the input.
   104  func (l *Lexer) next() (rune rune) {
   105  	if l.Pos >= len(l.Input) {
   106  		l.Width = 0
   107  		return rEOF
   108  	}
   109  	rune, l.Width = utf8.DecodeRuneInString(l.Input[l.Pos:])
   110  	l.Pos += l.Width
   111  	if rune == '\n' {
   112  		l.Line++
   113  		l.Col = 1
   114  	}
   115  	return rune
   116  }
   117  
// emit passes a Token of the given type back to the client, using the raw
// text between Start and Pos as its value.
func (l *Lexer) emit(t Type) {
	l.processAndEmit(t, nil)
}
   122  
   123  func (l *Lexer) processAndEmit(t Type, fn func(string) string) {
   124  	line, col := ReadablePosition(l.Start, l.Input)
   125  	val := l.Input[l.Start:l.Pos]
   126  	if fn != nil {
   127  		val = fn(val)
   128  	}
   129  	l.Tokens <- &Token{
   130  		Type: t,
   131  		Val:  val,
   132  		Pos:  l.Start,
   133  		Line: line,
   134  		Col:  col,
   135  	}
   136  	l.Start = l.Pos
   137  }
   138  
// ignore skips over the pending input before this point, discarding it
// without emitting a token.
func (l *Lexer) ignore() {
	l.Start = l.Pos
}
   143  
// backup steps back one rune. Can be called only once per call of next.
// NOTE(review): only Pos is rewound — Line/Col are not restored if the
// backed-up rune was a '\n'; confirm whether callers rely on them here.
func (l *Lexer) backup() {
	l.Pos -= l.Width
}
   149  
   150  // peek returns but does not consume
   151  // the next rune in the input.
   152  func (l *Lexer) peek() rune {
   153  	rune := l.next()
   154  	l.backup()
   155  	return rune
   156  }
   157  
   158  // accept consumes the next rune
   159  // if it's from the valid set.
   160  func (l *Lexer) accept(valid string) bool {
   161  	if strings.IndexRune(valid, l.next()) >= 0 {
   162  		return true
   163  	}
   164  	l.backup()
   165  	return false
   166  }
   167  
   168  // acceptRun consumes a run of runes from the valid set.
   169  func (l *Lexer) acceptRun(valid string) {
   170  	for strings.IndexRune(valid, l.next()) >= 0 {
   171  	}
   172  	l.backup()
   173  }
   174  
// pushDelimiter records r as the closing delimiter expected for the most
// recently opened paren/brace/bracket.
func (l *Lexer) pushDelimiter(r rune) {
	l.delimiters = append(l.delimiters, r)
}
   178  
// hasPrefix reports whether the unconsumed input starts with prefix.
func (l *Lexer) hasPrefix(prefix string) bool {
	return strings.HasPrefix(l.Input[l.Pos:], prefix)
}
   182  
   183  func (l *Lexer) popDelimiter(r rune) bool {
   184  	if len(l.delimiters) == 0 {
   185  		l.errorf(`Unexpected delimiter "%c"`, r)
   186  		return false
   187  	}
   188  	last := len(l.delimiters) - 1
   189  	expected := l.delimiters[last]
   190  	if r != expected {
   191  		l.errorf(`Unbalanced delimiters, expected "%c", got "%c"`, expected, r)
   192  		return false
   193  	}
   194  	// l.delimiters[last] = nil // Erase element (write zero value)
   195  	l.delimiters = l.delimiters[:last]
   196  	return true
   197  }
   198  
   199  // return whether or not we are expecting r as the next delimiter
   200  func (l *Lexer) expectDelimiter(r rune) bool {
   201  	if len(l.delimiters) == 0 {
   202  		return false
   203  	}
   204  	expected := l.delimiters[len(l.delimiters)-1]
   205  	return r == expected
   206  }
   207  
   208  func (l *Lexer) lexData() lexFn {
   209  	for {
   210  		if l.hasPrefix(l.Config.CommentStartString) {
   211  			if l.Pos > l.Start {
   212  				l.emit(Data)
   213  			}
   214  			return l.lexComment
   215  		}
   216  
   217  		if l.hasPrefix(l.Config.VariableStartString) {
   218  			if l.Pos > l.Start {
   219  				l.emit(Data)
   220  			}
   221  			return l.lexVariable
   222  		}
   223  
   224  		if l.hasPrefix(l.Config.BlockStartString) {
   225  			if l.Pos > l.Start {
   226  				l.emit(Data)
   227  			}
   228  			return l.lexBlock
   229  		}
   230  
   231  		if l.next() == rEOF {
   232  			break
   233  		}
   234  	}
   235  	// Correctly reached EOF.
   236  	if l.Pos > l.Start {
   237  		l.emit(Data)
   238  	}
   239  	l.emit(EOF) // Useful to make EOF a token.
   240  	return nil  // Stop the run loop.
   241  }
   242  
// remaining returns the portion of the input that has not been consumed yet.
func (l *Lexer) remaining() string {
	return l.Input[l.Pos:]
}
   246  
   247  func (l *Lexer) lexRaw() lexFn {
   248  	loc := l.rawEnd.FindStringIndex(l.remaining())
   249  	if loc == nil {
   250  		return l.errorf(`Unable to find raw closing statement`)
   251  	}
   252  	l.Pos += loc[0]
   253  	l.emit(Data)
   254  	l.rawEnd = nil
   255  	return l.lexBlock
   256  	// regexp.MustCompile(`(?m)(?P<key>\w+):\s+(?P<value>\w+)$`)
   257  	// idx := pattern
   258  }
   259  
   260  func (l *Lexer) lexComment() lexFn {
   261  	l.Pos += len(l.Config.CommentStartString)
   262  	l.emit(CommentBegin)
   263  	i := strings.Index(l.Input[l.Pos:], l.Config.CommentEndString)
   264  	if i < 0 {
   265  		return l.errorf("unclosed comment")
   266  	}
   267  	l.Pos += i
   268  	l.emit(Data)
   269  	l.Pos += len(l.Config.CommentEndString)
   270  	l.emit(CommentEnd)
   271  	return l.lexData
   272  }
   273  
// lexVariable scans a variable start marker (plus an optional "-"
// whitespace-control modifier) and hands off to expression lexing.
func (l *Lexer) lexVariable() lexFn {
	l.Pos += len(l.Config.VariableStartString)
	l.accept("-") // optional whitespace-trim modifier
	l.emit(VariableBegin)
	return l.lexExpression
}
   280  
// lexVariableEnd scans the variable end marker (with an optional "-" trim
// modifier) and returns to plain-data lexing.
func (l *Lexer) lexVariableEnd() lexFn {
	l.accept("-")
	l.Pos += len(l.Config.VariableEndString)
	l.emit(VariableEnd)
	return l.lexData
}
   287  
   288  func (l *Lexer) lexBlock() lexFn {
   289  	l.Pos += len(l.Config.BlockStartString)
   290  	l.accept("+-")
   291  	l.emit(BlockBegin)
   292  	for isSpace(l.peek()) {
   293  		l.next()
   294  	}
   295  	if len(l.Current()) > 0 {
   296  		l.emit(Whitespace)
   297  	}
   298  	stmt := l.nextIdentifier()
   299  	l.emit(Name)
   300  	re, exists := l.RawStatements[stmt]
   301  	if exists {
   302  		l.rawEnd = re
   303  	}
   304  	return l.lexExpression
   305  }
   306  
   307  func (l *Lexer) lexBlockEnd() lexFn {
   308  	l.accept("-")
   309  	l.Pos += len(l.Config.BlockEndString)
   310  	l.emit(BlockEnd)
   311  	if l.rawEnd != nil {
   312  		return l.lexRaw
   313  	} else {
   314  		return l.lexData
   315  	}
   316  }
   317  
   318  func (l *Lexer) lexExpression() lexFn {
   319  	for {
   320  		if !l.expectDelimiter(l.peek()) {
   321  			if l.hasPrefix(l.Config.VariableEndString) { // && l.expectDelimiter(l.peek()) {
   322  				return l.lexVariableEnd
   323  			}
   324  
   325  			// if this is the rightDelim, but we are expecting the next char as a delimiter
   326  			// then skip marking this as rightDelim.  This allows us to have, eg, '}}' as
   327  			// part of a literal inside a var block.
   328  			// if strings.HasPrefix(l.input[l.pos:], l.rightDelim) && !l.shouldExpectDelim(l.peek()) {
   329  			// 	l.pos += Pos(len(l.rightDelim))
   330  			// 	l.emitRight()
   331  			// 	return lexText
   332  			// }
   333  
   334  			if l.hasPrefix(l.Config.BlockEndString) {
   335  				return l.lexBlockEnd
   336  			}
   337  		}
   338  
   339  		r := l.next()
   340  		// remaining := l.Input[l.Pos:]
   341  		switch {
   342  		case isSpace(r):
   343  			return l.lexSpace
   344  		case isNumeric(r):
   345  			return l.lexNumber
   346  		case isAlphaNumeric(r):
   347  			return l.lexIdentifier
   348  		}
   349  
   350  		switch r {
   351  		case '"', '\'':
   352  			l.backup()
   353  			return l.lexString
   354  		case ',':
   355  			l.emit(Comma)
   356  		case '|':
   357  			l.emit(Pipe)
   358  			// if l.accept("|") {
   359  			// 	l.emit(Or)
   360  			// } else {
   361  			// }
   362  		case '+':
   363  			l.emit(Add)
   364  		case '-':
   365  			if l.hasPrefix(l.Config.BlockEndString) {
   366  				l.backup()
   367  				return l.lexBlockEnd
   368  			} else if l.hasPrefix(l.Config.VariableEndString) {
   369  				l.backup()
   370  				return l.lexVariableEnd
   371  			} else {
   372  				l.emit(Sub)
   373  			}
   374  		case '~':
   375  			l.emit(Tilde)
   376  		case ':':
   377  			l.emit(Colon)
   378  		case '.':
   379  			l.emit(Dot)
   380  		case '%':
   381  			l.emit(Mod)
   382  		case '/':
   383  			if l.accept("/") {
   384  				l.emit(Floordiv)
   385  			} else {
   386  				l.emit(Div)
   387  			}
   388  		case '<':
   389  			if l.accept("=") {
   390  				l.emit(Lteq)
   391  			} else {
   392  				l.emit(Lt)
   393  			}
   394  		case '>':
   395  			if l.accept("=") {
   396  				l.emit(Gteq)
   397  			} else {
   398  				l.emit(Gt)
   399  			}
   400  		case '*':
   401  			if l.accept("*") {
   402  				l.emit(Pow)
   403  			} else {
   404  				l.emit(Mul)
   405  			}
   406  		case '!':
   407  			if l.accept("=") {
   408  				l.emit(Ne)
   409  			} else {
   410  				// l.emit(Not)
   411  				l.errorf(`Unexpected "!"`)
   412  			}
   413  		// case '&':
   414  		// 	if l.accept("&") {
   415  		// 		l.emit(And)
   416  		// 	} else {
   417  		// 		return nil
   418  		// 	}
   419  		case '=':
   420  			if l.accept("=") {
   421  				l.emit(Eq)
   422  			} else {
   423  				l.emit(Assign)
   424  			}
   425  		case '(':
   426  			l.emit(Lparen)
   427  			l.pushDelimiter(')')
   428  		case '{':
   429  			l.emit(Lbrace)
   430  			l.pushDelimiter('}')
   431  		case '[':
   432  			l.emit(Lbracket)
   433  			l.pushDelimiter(']')
   434  		case ')':
   435  			if !l.popDelimiter(')') {
   436  				return nil
   437  			}
   438  			l.emit(Rparen)
   439  		case '}':
   440  			if !l.popDelimiter('}') {
   441  				return nil
   442  			}
   443  			l.emit(Rbrace)
   444  		case ']':
   445  			if !l.popDelimiter(']') {
   446  				return nil
   447  			}
   448  			l.emit(Rbracket)
   449  		}
   450  	}
   451  	return l.lexData
   452  }
   453  
   454  func (l *Lexer) lexSpace() lexFn {
   455  	for isSpace(l.peek()) {
   456  		l.next()
   457  	}
   458  	l.emit(Whitespace)
   459  	return l.lexExpression
   460  }
   461  
   462  func (l *Lexer) nextIdentifier() string {
   463  	for {
   464  		switch r := l.next(); {
   465  		case isAlphaNumeric(r):
   466  			// absorb.
   467  		default:
   468  			l.backup()
   469  			// l.emit(Name)
   470  			return l.Current()
   471  		}
   472  	}
   473  }
   474  
// lexIdentifier scans an identifier and emits it as a Name token, then
// resumes expression lexing.
func (l *Lexer) lexIdentifier() lexFn {
	l.nextIdentifier()
	l.emit(Name)
	return l.lexExpression
}
   480  
   481  func (l *Lexer) lexNumber() lexFn {
   482  	tokType := Integer
   483  	for {
   484  		switch r := l.next(); {
   485  		case isNumeric(r):
   486  			// abosrb
   487  		case r == '.':
   488  			if tokType != Float {
   489  				tokType = Float
   490  			} else {
   491  				l.errorf("two dots in numeric token")
   492  			}
   493  		case isAlphaNumeric(r) && tokType == Integer:
   494  			return l.lexIdentifier
   495  		default:
   496  			l.backup()
   497  			l.emit(tokType)
   498  			return l.lexExpression
   499  		}
   500  	}
   501  }
   502  
   503  func unescape(str string) string {
   504  	str = str[1 : len(str)-1]
   505  	for escaped, unescaped := range escapedStrings {
   506  		str = strings.ReplaceAll(str, escaped, unescaped)
   507  	}
   508  	return str
   509  }
   510  
   511  func (l *Lexer) lexString() lexFn {
   512  	quote := l.next() // should be either ' or "
   513  	var prev rune
   514  	for r := l.next(); r != quote || prev == '\\'; r, prev = l.next(), r {
   515  	}
   516  	l.processAndEmit(String, unescape)
   517  	return l.lexExpression
   518  }
   519  
   520  // isSpace reports whether r is a space character.
   521  func isSpace(r rune) bool {
   522  	return r == ' ' || r == '\t'
   523  }
   524  
   525  // isEndOfLine reports whether r is an end-of-line character.
   526  func isEndOfLine(r rune) bool {
   527  	return r == '\r' || r == '\n'
   528  }
   529  
   530  // isAlphaNumeric reports whether r is an alphabetic, digit, or underscore.
   531  func isAlphaNumeric(r rune) bool {
   532  	return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
   533  }
   534  
// isNumeric reports whether r is a decimal digit rune.
func isNumeric(r rune) bool {
	return unicode.IsDigit(r)
}
   538  

View as plain text