1 package tokens
2
3 import (
4 "fmt"
5
6 "regexp"
7
8 "strings"
9 "unicode"
10 "unicode/utf8"
11
12 "github.com/noirbizarre/gonja/config"
13 )
14
15
// rEOF is the sentinel rune returned by Lexer.next when the input is exhausted.
const rEOF = -1

// re_ENDRAW is the pattern template for a raw-section closing statement:
// block start delimiter, optional whitespace, then the closing keyword.
const re_ENDRAW = `%s\s*%s`
18
// escapedStrings maps the escape sequences recognized inside string literals
// to their unescaped replacements (see unescape).
var escapedStrings = map[string]string{
	`\"`: `"`,
	`\'`: `'`,
}
23
24
25
26
27
28 type lexFn func() lexFn
29
30
// Lexer splits template source into a stream of tokens delivered on Tokens.
// It is a state-function lexer: each lexFn consumes some input and returns
// the next state until a nil state terminates the run.
type Lexer struct {
	Input string // full template source being scanned
	Start int    // byte offset where the pending token starts
	Pos   int    // current byte offset in Input
	Width int    // byte width of the last rune read by next (used by backup)
	Line  int    // current line number, incremented on '\n'
	Col   int    // NOTE(review): only reset to 1 on newline, never advanced otherwise — confirm before relying on it

	Config        *config.Config // delimiter configuration in effect
	Tokens        chan *Token    // destination for emitted tokens; closed by Run
	delimiters    []rune         // stack of expected closing brackets: ')', '}', ']'
	RawStatements rawStmt        // statements whose body must be lexed verbatim
	rawEnd        *regexp.Regexp // end pattern of the raw section currently open, if any
}
45
46
47 type rawStmt map[string]*regexp.Regexp
48
49
50 func NewLexer(input string) *Lexer {
51 cfg := config.DefaultConfig
52 return &Lexer{
53 Input: input,
54 Tokens: make(chan *Token),
55 Config: cfg,
56 RawStatements: rawStmt{
57 "raw": regexp.MustCompile(fmt.Sprintf(`%s\s*endraw`, cfg.BlockStartString)),
58 "comment": regexp.MustCompile(fmt.Sprintf(`%s\s*endcomment`, cfg.BlockStartString)),
59 },
60 }
61 }
62
63 func Lex(input string) *Stream {
64 l := NewLexer(input)
65 go l.Run()
66 return NewStream(l.Tokens)
67 }
68
69
70
71
72 func (l *Lexer) errorf(format string, args ...interface{}) lexFn {
73 l.Tokens <- &Token{
74 Type: Error,
75 Val: fmt.Sprintf(format, args...),
76 Pos: l.Pos,
77 }
78 return nil
79 }
80
81
82 func (l *Lexer) Position() *Position {
83 return &Position{
84 Offset: l.Pos,
85 Line: l.Line,
86 Column: l.Col,
87 }
88 }
89
90 func (l *Lexer) Current() string {
91 return l.Input[l.Start:l.Pos]
92 }
93
94
95
96 func (l *Lexer) Run() {
97 for state := l.lexData; state != nil; {
98 state = state()
99 }
100 close(l.Tokens)
101 }
102
103
104 func (l *Lexer) next() (rune rune) {
105 if l.Pos >= len(l.Input) {
106 l.Width = 0
107 return rEOF
108 }
109 rune, l.Width = utf8.DecodeRuneInString(l.Input[l.Pos:])
110 l.Pos += l.Width
111 if rune == '\n' {
112 l.Line++
113 l.Col = 1
114 }
115 return rune
116 }
117
118
// emit sends the pending input span [Start, Pos) as a token of type t,
// without any value transformation.
func (l *Lexer) emit(t Type) {
	l.processAndEmit(t, nil)
}
122
123 func (l *Lexer) processAndEmit(t Type, fn func(string) string) {
124 line, col := ReadablePosition(l.Start, l.Input)
125 val := l.Input[l.Start:l.Pos]
126 if fn != nil {
127 val = fn(val)
128 }
129 l.Tokens <- &Token{
130 Type: t,
131 Val: val,
132 Pos: l.Start,
133 Line: line,
134 Col: col,
135 }
136 l.Start = l.Pos
137 }
138
139
// ignore discards the pending input span by moving the token start marker up
// to the current position.
func (l *Lexer) ignore() {
	l.Start = l.Pos
}
143
144
145
// backup steps back over the last rune read by next. It is only safe to call
// once per call to next, since it relies on the stored Width.
// NOTE(review): Line/Col are not reverted when backing up over a newline.
func (l *Lexer) backup() {
	l.Pos -= l.Width
}
149
150
151
152 func (l *Lexer) peek() rune {
153 rune := l.next()
154 l.backup()
155 return rune
156 }
157
158
159
160 func (l *Lexer) accept(valid string) bool {
161 if strings.IndexRune(valid, l.next()) >= 0 {
162 return true
163 }
164 l.backup()
165 return false
166 }
167
168
169 func (l *Lexer) acceptRun(valid string) {
170 for strings.IndexRune(valid, l.next()) >= 0 {
171 }
172 l.backup()
173 }
174
// pushDelimiter records r as the closing bracket expected for the innermost
// currently-open bracket pair.
func (l *Lexer) pushDelimiter(r rune) {
	l.delimiters = append(l.delimiters, r)
}
178
179 func (l *Lexer) hasPrefix(prefix string) bool {
180 return strings.HasPrefix(l.Input[l.Pos:], prefix)
181 }
182
183 func (l *Lexer) popDelimiter(r rune) bool {
184 if len(l.delimiters) == 0 {
185 l.errorf(`Unexpected delimiter "%c"`, r)
186 return false
187 }
188 last := len(l.delimiters) - 1
189 expected := l.delimiters[last]
190 if r != expected {
191 l.errorf(`Unbalanced delimiters, expected "%c", got "%c"`, expected, r)
192 return false
193 }
194
195 l.delimiters = l.delimiters[:last]
196 return true
197 }
198
199
200 func (l *Lexer) expectDelimiter(r rune) bool {
201 if len(l.delimiters) == 0 {
202 return false
203 }
204 expected := l.delimiters[len(l.delimiters)-1]
205 return r == expected
206 }
207
208 func (l *Lexer) lexData() lexFn {
209 for {
210 if l.hasPrefix(l.Config.CommentStartString) {
211 if l.Pos > l.Start {
212 l.emit(Data)
213 }
214 return l.lexComment
215 }
216
217 if l.hasPrefix(l.Config.VariableStartString) {
218 if l.Pos > l.Start {
219 l.emit(Data)
220 }
221 return l.lexVariable
222 }
223
224 if l.hasPrefix(l.Config.BlockStartString) {
225 if l.Pos > l.Start {
226 l.emit(Data)
227 }
228 return l.lexBlock
229 }
230
231 if l.next() == rEOF {
232 break
233 }
234 }
235
236 if l.Pos > l.Start {
237 l.emit(Data)
238 }
239 l.emit(EOF)
240 return nil
241 }
242
// remaining returns the yet-unscanned tail of the input.
func (l *Lexer) remaining() string {
	return l.Input[l.Pos:]
}
246
247 func (l *Lexer) lexRaw() lexFn {
248 loc := l.rawEnd.FindStringIndex(l.remaining())
249 if loc == nil {
250 return l.errorf(`Unable to find raw closing statement`)
251 }
252 l.Pos += loc[0]
253 l.emit(Data)
254 l.rawEnd = nil
255 return l.lexBlock
256
257
258 }
259
260 func (l *Lexer) lexComment() lexFn {
261 l.Pos += len(l.Config.CommentStartString)
262 l.emit(CommentBegin)
263 i := strings.Index(l.Input[l.Pos:], l.Config.CommentEndString)
264 if i < 0 {
265 return l.errorf("unclosed comment")
266 }
267 l.Pos += i
268 l.emit(Data)
269 l.Pos += len(l.Config.CommentEndString)
270 l.emit(CommentEnd)
271 return l.lexData
272 }
273
// lexVariable emits the opening variable delimiter, including an optional
// '-' whitespace-trim marker, and switches to expression lexing.
func (l *Lexer) lexVariable() lexFn {
	l.Pos += len(l.Config.VariableStartString)
	l.accept("-") // trim marker, e.g. "{{-"
	l.emit(VariableBegin)
	return l.lexExpression
}
280
// lexVariableEnd emits the closing variable delimiter, including an optional
// leading '-' whitespace-trim marker, and returns to plain-data lexing.
func (l *Lexer) lexVariableEnd() lexFn {
	l.accept("-") // trim marker, e.g. "-}}"
	l.Pos += len(l.Config.VariableEndString)
	l.emit(VariableEnd)
	return l.lexData
}
287
288 func (l *Lexer) lexBlock() lexFn {
289 l.Pos += len(l.Config.BlockStartString)
290 l.accept("+-")
291 l.emit(BlockBegin)
292 for isSpace(l.peek()) {
293 l.next()
294 }
295 if len(l.Current()) > 0 {
296 l.emit(Whitespace)
297 }
298 stmt := l.nextIdentifier()
299 l.emit(Name)
300 re, exists := l.RawStatements[stmt]
301 if exists {
302 l.rawEnd = re
303 }
304 return l.lexExpression
305 }
306
307 func (l *Lexer) lexBlockEnd() lexFn {
308 l.accept("-")
309 l.Pos += len(l.Config.BlockEndString)
310 l.emit(BlockEnd)
311 if l.rawEnd != nil {
312 return l.lexRaw
313 } else {
314 return l.lexData
315 }
316 }
317
318 func (l *Lexer) lexExpression() lexFn {
319 for {
320 if !l.expectDelimiter(l.peek()) {
321 if l.hasPrefix(l.Config.VariableEndString) {
322 return l.lexVariableEnd
323 }
324
325
326
327
328
329
330
331
332
333
334 if l.hasPrefix(l.Config.BlockEndString) {
335 return l.lexBlockEnd
336 }
337 }
338
339 r := l.next()
340
341 switch {
342 case isSpace(r):
343 return l.lexSpace
344 case isNumeric(r):
345 return l.lexNumber
346 case isAlphaNumeric(r):
347 return l.lexIdentifier
348 }
349
350 switch r {
351 case '"', '\'':
352 l.backup()
353 return l.lexString
354 case ',':
355 l.emit(Comma)
356 case '|':
357 l.emit(Pipe)
358
359
360
361
362 case '+':
363 l.emit(Add)
364 case '-':
365 if l.hasPrefix(l.Config.BlockEndString) {
366 l.backup()
367 return l.lexBlockEnd
368 } else if l.hasPrefix(l.Config.VariableEndString) {
369 l.backup()
370 return l.lexVariableEnd
371 } else {
372 l.emit(Sub)
373 }
374 case '~':
375 l.emit(Tilde)
376 case ':':
377 l.emit(Colon)
378 case '.':
379 l.emit(Dot)
380 case '%':
381 l.emit(Mod)
382 case '/':
383 if l.accept("/") {
384 l.emit(Floordiv)
385 } else {
386 l.emit(Div)
387 }
388 case '<':
389 if l.accept("=") {
390 l.emit(Lteq)
391 } else {
392 l.emit(Lt)
393 }
394 case '>':
395 if l.accept("=") {
396 l.emit(Gteq)
397 } else {
398 l.emit(Gt)
399 }
400 case '*':
401 if l.accept("*") {
402 l.emit(Pow)
403 } else {
404 l.emit(Mul)
405 }
406 case '!':
407 if l.accept("=") {
408 l.emit(Ne)
409 } else {
410
411 l.errorf(`Unexpected "!"`)
412 }
413
414
415
416
417
418
419 case '=':
420 if l.accept("=") {
421 l.emit(Eq)
422 } else {
423 l.emit(Assign)
424 }
425 case '(':
426 l.emit(Lparen)
427 l.pushDelimiter(')')
428 case '{':
429 l.emit(Lbrace)
430 l.pushDelimiter('}')
431 case '[':
432 l.emit(Lbracket)
433 l.pushDelimiter(']')
434 case ')':
435 if !l.popDelimiter(')') {
436 return nil
437 }
438 l.emit(Rparen)
439 case '}':
440 if !l.popDelimiter('}') {
441 return nil
442 }
443 l.emit(Rbrace)
444 case ']':
445 if !l.popDelimiter(']') {
446 return nil
447 }
448 l.emit(Rbracket)
449 }
450 }
451 return l.lexData
452 }
453
454 func (l *Lexer) lexSpace() lexFn {
455 for isSpace(l.peek()) {
456 l.next()
457 }
458 l.emit(Whitespace)
459 return l.lexExpression
460 }
461
462 func (l *Lexer) nextIdentifier() string {
463 for {
464 switch r := l.next(); {
465 case isAlphaNumeric(r):
466
467 default:
468 l.backup()
469
470 return l.Current()
471 }
472 }
473 }
474
// lexIdentifier consumes an identifier and emits it as a Name token.
func (l *Lexer) lexIdentifier() lexFn {
	l.nextIdentifier()
	l.emit(Name)
	return l.lexExpression
}
480
481 func (l *Lexer) lexNumber() lexFn {
482 tokType := Integer
483 for {
484 switch r := l.next(); {
485 case isNumeric(r):
486
487 case r == '.':
488 if tokType != Float {
489 tokType = Float
490 } else {
491 l.errorf("two dots in numeric token")
492 }
493 case isAlphaNumeric(r) && tokType == Integer:
494 return l.lexIdentifier
495 default:
496 l.backup()
497 l.emit(tokType)
498 return l.lexExpression
499 }
500 }
501 }
502
503 func unescape(str string) string {
504 str = str[1 : len(str)-1]
505 for escaped, unescaped := range escapedStrings {
506 str = strings.ReplaceAll(str, escaped, unescaped)
507 }
508 return str
509 }
510
511 func (l *Lexer) lexString() lexFn {
512 quote := l.next()
513 var prev rune
514 for r := l.next(); r != quote || prev == '\\'; r, prev = l.next(), r {
515 }
516 l.processAndEmit(String, unescape)
517 return l.lexExpression
518 }
519
520
// isSpace reports whether c is an intra-line whitespace rune (space or tab).
// Newlines are intentionally excluded; they are handled by next().
func isSpace(c rune) bool {
	switch c {
	case ' ', '\t':
		return true
	}
	return false
}
524
525
// isEndOfLine reports whether c terminates a line (LF or CR).
func isEndOfLine(c rune) bool {
	return c == '\n' || c == '\r'
}
529
530
// isAlphaNumeric reports whether c may appear in an identifier: a Unicode
// letter, a Unicode digit, or an underscore.
func isAlphaNumeric(c rune) bool {
	if unicode.IsLetter(c) || unicode.IsDigit(c) {
		return true
	}
	return c == '_'
}
534
// isNumeric reports whether c is a Unicode decimal digit.
func isNumeric(c rune) bool {
	return unicode.IsDigit(c)
}
538
View as plain text