1
2
3
4
5 package syntax
6
7
8
9
10 import (
11 "strconv"
12 "strings"
13 "unicode"
14 )
15
16
17 type Regexp struct {
18 Op Op
19 Flags Flags
20 Sub []*Regexp
21 Sub0 [1]*Regexp
22 Rune []rune
23 Rune0 [2]rune
24 Min, Max int
25 Cap int
26 Name string
27 }
28
29
30
31
32 type Op uint8
33
34
35
36
37
38 const (
39 OpNoMatch Op = 1 + iota
40 OpEmptyMatch
41 OpLiteral
42 OpCharClass
43 OpAnyCharNotNL
44 OpAnyChar
45 OpBeginLine
46 OpEndLine
47 OpBeginText
48 OpEndText
49 OpWordBoundary
50 OpNoWordBoundary
51 OpCapture
52 OpStar
53 OpPlus
54 OpQuest
55 OpRepeat
56 OpConcat
57 OpAlternate
58 )
59
60 const opPseudo Op = 128
61
62
63 func (x *Regexp) Equal(y *Regexp) bool {
64 if x == nil || y == nil {
65 return x == y
66 }
67 if x.Op != y.Op {
68 return false
69 }
70 switch x.Op {
71 case OpEndText:
72
73 if x.Flags&WasDollar != y.Flags&WasDollar {
74 return false
75 }
76
77 case OpLiteral, OpCharClass:
78 if len(x.Rune) != len(y.Rune) {
79 return false
80 }
81 for i, r := range x.Rune {
82 if r != y.Rune[i] {
83 return false
84 }
85 }
86
87 case OpAlternate, OpConcat:
88 if len(x.Sub) != len(y.Sub) {
89 return false
90 }
91 for i, sub := range x.Sub {
92 if !sub.Equal(y.Sub[i]) {
93 return false
94 }
95 }
96
97 case OpStar, OpPlus, OpQuest:
98 if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
99 return false
100 }
101
102 case OpRepeat:
103 if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
104 return false
105 }
106
107 case OpCapture:
108 if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
109 return false
110 }
111 }
112 return true
113 }
114
115
116 type printFlags uint8
117
118 const (
119 flagI printFlags = 1 << iota
120 flagM
121 flagS
122 flagOff
123 flagPrec
124 negShift = 5
125 )
126
127
128
129 func addSpan(start, last *Regexp, f printFlags, flags *map[*Regexp]printFlags) {
130 if *flags == nil {
131 *flags = make(map[*Regexp]printFlags)
132 }
133 (*flags)[start] = f
134 (*flags)[last] |= flagOff
135 }
136
137
138
139
140
141
142 func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must, cant printFlags) {
143 switch re.Op {
144 default:
145 return 0, 0
146
147 case OpLiteral:
148
149
150
151 for _, r := range re.Rune {
152 if minFold <= r && r <= maxFold && unicode.SimpleFold(r) != r {
153 if re.Flags&FoldCase != 0 {
154 return flagI, 0
155 } else {
156 return 0, flagI
157 }
158 }
159 }
160 return 0, 0
161
162 case OpCharClass:
163
164
165 for i := 0; i < len(re.Rune); i += 2 {
166 lo := max(minFold, re.Rune[i])
167 hi := min(maxFold, re.Rune[i+1])
168 for r := lo; r <= hi; r++ {
169 for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) {
170 if !(lo <= f && f <= hi) && !inCharClass(f, re.Rune) {
171 return 0, flagI
172 }
173 }
174 }
175 }
176 return 0, 0
177
178 case OpAnyCharNotNL:
179 return 0, flagS
180
181 case OpAnyChar:
182 return flagS, 0
183
184 case OpBeginLine, OpEndLine:
185 return flagM, 0
186
187 case OpEndText:
188 if re.Flags&WasDollar != 0 {
189 return 0, flagM
190 }
191 return 0, 0
192
193 case OpCapture, OpStar, OpPlus, OpQuest, OpRepeat:
194 return calcFlags(re.Sub[0], flags)
195
196 case OpConcat, OpAlternate:
197
198
199
200 var must, cant, allCant printFlags
201 start := 0
202 last := 0
203 did := false
204 for i, sub := range re.Sub {
205 subMust, subCant := calcFlags(sub, flags)
206 if must&subCant != 0 || subMust&cant != 0 {
207 if must != 0 {
208 addSpan(re.Sub[start], re.Sub[last], must, flags)
209 }
210 must = 0
211 cant = 0
212 start = i
213 did = true
214 }
215 must |= subMust
216 cant |= subCant
217 allCant |= subCant
218 if subMust != 0 {
219 last = i
220 }
221 if must == 0 && start == i {
222 start++
223 }
224 }
225 if !did {
226
227 return must, cant
228 }
229 if must != 0 {
230
231 addSpan(re.Sub[start], re.Sub[last], must, flags)
232 }
233 return 0, allCant
234 }
235 }
236
237
238 func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags) {
239 f |= flags[re]
240 if f&flagPrec != 0 && f&^(flagOff|flagPrec) != 0 && f&flagOff != 0 {
241
242 f &^= flagPrec
243 }
244 if f&^(flagOff|flagPrec) != 0 {
245 b.WriteString(`(?`)
246 if f&flagI != 0 {
247 b.WriteString(`i`)
248 }
249 if f&flagM != 0 {
250 b.WriteString(`m`)
251 }
252 if f&flagS != 0 {
253 b.WriteString(`s`)
254 }
255 if f&((flagM|flagS)<<negShift) != 0 {
256 b.WriteString(`-`)
257 if f&(flagM<<negShift) != 0 {
258 b.WriteString(`m`)
259 }
260 if f&(flagS<<negShift) != 0 {
261 b.WriteString(`s`)
262 }
263 }
264 b.WriteString(`:`)
265 }
266 if f&flagOff != 0 {
267 defer b.WriteString(`)`)
268 }
269 if f&flagPrec != 0 {
270 b.WriteString(`(?:`)
271 defer b.WriteString(`)`)
272 }
273
274 switch re.Op {
275 default:
276 b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
277 case OpNoMatch:
278 b.WriteString(`[^\x00-\x{10FFFF}]`)
279 case OpEmptyMatch:
280 b.WriteString(`(?:)`)
281 case OpLiteral:
282 for _, r := range re.Rune {
283 escape(b, r, false)
284 }
285 case OpCharClass:
286 if len(re.Rune)%2 != 0 {
287 b.WriteString(`[invalid char class]`)
288 break
289 }
290 b.WriteRune('[')
291 if len(re.Rune) == 0 {
292 b.WriteString(`^\x00-\x{10FFFF}`)
293 } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 {
294
295
296 b.WriteRune('^')
297 for i := 1; i < len(re.Rune)-1; i += 2 {
298 lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
299 escape(b, lo, lo == '-')
300 if lo != hi {
301 if hi != lo+1 {
302 b.WriteRune('-')
303 }
304 escape(b, hi, hi == '-')
305 }
306 }
307 } else {
308 for i := 0; i < len(re.Rune); i += 2 {
309 lo, hi := re.Rune[i], re.Rune[i+1]
310 escape(b, lo, lo == '-')
311 if lo != hi {
312 if hi != lo+1 {
313 b.WriteRune('-')
314 }
315 escape(b, hi, hi == '-')
316 }
317 }
318 }
319 b.WriteRune(']')
320 case OpAnyCharNotNL, OpAnyChar:
321 b.WriteString(`.`)
322 case OpBeginLine:
323 b.WriteString(`^`)
324 case OpEndLine:
325 b.WriteString(`$`)
326 case OpBeginText:
327 b.WriteString(`\A`)
328 case OpEndText:
329 if re.Flags&WasDollar != 0 {
330 b.WriteString(`$`)
331 } else {
332 b.WriteString(`\z`)
333 }
334 case OpWordBoundary:
335 b.WriteString(`\b`)
336 case OpNoWordBoundary:
337 b.WriteString(`\B`)
338 case OpCapture:
339 if re.Name != "" {
340 b.WriteString(`(?P<`)
341 b.WriteString(re.Name)
342 b.WriteRune('>')
343 } else {
344 b.WriteRune('(')
345 }
346 if re.Sub[0].Op != OpEmptyMatch {
347 writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags)
348 }
349 b.WriteRune(')')
350 case OpStar, OpPlus, OpQuest, OpRepeat:
351 p := printFlags(0)
352 sub := re.Sub[0]
353 if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
354 p = flagPrec
355 }
356 writeRegexp(b, sub, p, flags)
357
358 switch re.Op {
359 case OpStar:
360 b.WriteRune('*')
361 case OpPlus:
362 b.WriteRune('+')
363 case OpQuest:
364 b.WriteRune('?')
365 case OpRepeat:
366 b.WriteRune('{')
367 b.WriteString(strconv.Itoa(re.Min))
368 if re.Max != re.Min {
369 b.WriteRune(',')
370 if re.Max >= 0 {
371 b.WriteString(strconv.Itoa(re.Max))
372 }
373 }
374 b.WriteRune('}')
375 }
376 if re.Flags&NonGreedy != 0 {
377 b.WriteRune('?')
378 }
379 case OpConcat:
380 for _, sub := range re.Sub {
381 p := printFlags(0)
382 if sub.Op == OpAlternate {
383 p = flagPrec
384 }
385 writeRegexp(b, sub, p, flags)
386 }
387 case OpAlternate:
388 for i, sub := range re.Sub {
389 if i > 0 {
390 b.WriteRune('|')
391 }
392 writeRegexp(b, sub, 0, flags)
393 }
394 }
395 }
396
397 func (re *Regexp) String() string {
398 var b strings.Builder
399 var flags map[*Regexp]printFlags
400 must, cant := calcFlags(re, &flags)
401 must |= (cant &^ flagI) << negShift
402 if must != 0 {
403 must |= flagOff
404 }
405 writeRegexp(&b, re, must, flags)
406 return b.String()
407 }
408
409 const meta = `\.+*?()|[]{}^$`
410
411 func escape(b *strings.Builder, r rune, force bool) {
412 if unicode.IsPrint(r) {
413 if strings.ContainsRune(meta, r) || force {
414 b.WriteRune('\\')
415 }
416 b.WriteRune(r)
417 return
418 }
419
420 switch r {
421 case '\a':
422 b.WriteString(`\a`)
423 case '\f':
424 b.WriteString(`\f`)
425 case '\n':
426 b.WriteString(`\n`)
427 case '\r':
428 b.WriteString(`\r`)
429 case '\t':
430 b.WriteString(`\t`)
431 case '\v':
432 b.WriteString(`\v`)
433 default:
434 if r < 0x100 {
435 b.WriteString(`\x`)
436 s := strconv.FormatInt(int64(r), 16)
437 if len(s) == 1 {
438 b.WriteRune('0')
439 }
440 b.WriteString(s)
441 break
442 }
443 b.WriteString(`\x{`)
444 b.WriteString(strconv.FormatInt(int64(r), 16))
445 b.WriteString(`}`)
446 }
447 }
448
449
450 func (re *Regexp) MaxCap() int {
451 m := 0
452 if re.Op == OpCapture {
453 m = re.Cap
454 }
455 for _, sub := range re.Sub {
456 if n := sub.MaxCap(); m < n {
457 m = n
458 }
459 }
460 return m
461 }
462
463
464 func (re *Regexp) CapNames() []string {
465 names := make([]string, re.MaxCap()+1)
466 re.capNames(names)
467 return names
468 }
469
470 func (re *Regexp) capNames(names []string) {
471 if re.Op == OpCapture {
472 names[re.Cap] = re.Name
473 }
474 for _, sub := range re.Sub {
475 sub.capNames(names)
476 }
477 }
478
View as plain text