...

Source file src/regexp/syntax/simplify_test.go

Documentation: regexp/syntax

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package syntax
     6  
     7  import "testing"
     8  
     9  var simplifyTests = []struct {
    10  	Regexp string
    11  	Simple string
    12  }{
    13  	// Already-simple constructs
    14  	{`a`, `a`},
    15  	{`ab`, `ab`},
    16  	{`a|b`, `[ab]`},
    17  	{`ab|cd`, `ab|cd`},
    18  	{`(ab)*`, `(ab)*`},
    19  	{`(ab)+`, `(ab)+`},
    20  	{`(ab)?`, `(ab)?`},
    21  	{`.`, `(?s:.)`},
    22  	{`^`, `(?m:^)`},
    23  	{`$`, `(?m:$)`},
    24  	{`[ac]`, `[ac]`},
    25  	{`[^ac]`, `[^ac]`},
    26  
    27  	// Posix character classes
    28  	{`[[:alnum:]]`, `[0-9A-Za-z]`},
    29  	{`[[:alpha:]]`, `[A-Za-z]`},
    30  	{`[[:blank:]]`, `[\t ]`},
    31  	{`[[:cntrl:]]`, `[\x00-\x1f\x7f]`},
    32  	{`[[:digit:]]`, `[0-9]`},
    33  	{`[[:graph:]]`, `[!-~]`},
    34  	{`[[:lower:]]`, `[a-z]`},
    35  	{`[[:print:]]`, `[ -~]`},
    36  	{`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"},
    37  	{`[[:space:]]`, `[\t-\r ]`},
    38  	{`[[:upper:]]`, `[A-Z]`},
    39  	{`[[:xdigit:]]`, `[0-9A-Fa-f]`},
    40  
    41  	// Perl character classes
    42  	{`\d`, `[0-9]`},
    43  	{`\s`, `[\t\n\f\r ]`},
    44  	{`\w`, `[0-9A-Z_a-z]`},
    45  	{`\D`, `[^0-9]`},
    46  	{`\S`, `[^\t\n\f\r ]`},
    47  	{`\W`, `[^0-9A-Z_a-z]`},
    48  	{`[\d]`, `[0-9]`},
    49  	{`[\s]`, `[\t\n\f\r ]`},
    50  	{`[\w]`, `[0-9A-Z_a-z]`},
    51  	{`[\D]`, `[^0-9]`},
    52  	{`[\S]`, `[^\t\n\f\r ]`},
    53  	{`[\W]`, `[^0-9A-Z_a-z]`},
    54  
    55  	// Posix repetitions
    56  	{`a{1}`, `a`},
    57  	{`a{2}`, `aa`},
    58  	{`a{5}`, `aaaaa`},
    59  	{`a{0,1}`, `a?`},
    60  	// The next three are illegible because Simplify inserts (?:)
    61  	// parens instead of () parens to avoid creating extra
    62  	// captured subexpressions. The comments show a version with fewer parens.
    63  	{`(a){0,2}`, `(?:(a)(a)?)?`},                       //       (aa?)?
    64  	{`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`},       //   (a(a(aa?)?)?)?
    65  	{`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)?
    66  	{`a{0,2}`, `(?:aa?)?`},                             //       (aa?)?
    67  	{`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`},                 //   (a(a(aa?)?)?)?
    68  	{`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`},               // aa(a(a(aa?)?)?)?
    69  	{`a{0,}`, `a*`},
    70  	{`a{1,}`, `a+`},
    71  	{`a{2,}`, `aa+`},
    72  	{`a{5,}`, `aaaaa+`},
    73  
    74  	// Test that operators simplify their arguments.
    75  	{`(?:a{1,}){1,}`, `a+`},
    76  	{`(a{1,}b{1,})`, `(a+b+)`},
    77  	{`a{1,}|b{1,}`, `a+|b+`},
    78  	{`(?:a{1,})*`, `(?:a+)*`},
    79  	{`(?:a{1,})+`, `a+`},
    80  	{`(?:a{1,})?`, `(?:a+)?`},
    81  	{``, `(?:)`},
    82  	{`a{0}`, `(?:)`},
    83  
    84  	// Character class simplification
    85  	{`[ab]`, `[ab]`},
    86  	{`[abc]`, `[a-c]`},
    87  	{`[a-za-za-z]`, `[a-z]`},
    88  	{`[A-Za-zA-Za-z]`, `[A-Za-z]`},
    89  	{`[ABCDEFGH]`, `[A-H]`},
    90  	{`[AB-CD-EF-GH]`, `[A-H]`},
    91  	{`[W-ZP-XE-R]`, `[E-Z]`},
    92  	{`[a-ee-gg-m]`, `[a-m]`},
    93  	{`[a-ea-ha-m]`, `[a-m]`},
    94  	{`[a-ma-ha-e]`, `[a-m]`},
    95  	{`[a-zA-Z0-9 -~]`, `[ -~]`},
    96  
    97  	// Empty character classes
    98  	{`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`},
    99  
   100  	// Full character classes
   101  	{`[[:cntrl:][:^cntrl:]]`, `(?s:.)`},
   102  
   103  	// Unicode case folding.
   104  	{`(?i)A`, `(?i:A)`},
   105  	{`(?i)a`, `(?i:A)`},
   106  	{`(?i)[A]`, `(?i:A)`},
   107  	{`(?i)[a]`, `(?i:A)`},
   108  	{`(?i)K`, `(?i:K)`},
   109  	{`(?i)k`, `(?i:K)`},
   110  	{`(?i)\x{212a}`, "(?i:K)"},
   111  	{`(?i)[K]`, "[Kk\u212A]"},
   112  	{`(?i)[k]`, "[Kk\u212A]"},
   113  	{`(?i)[\x{212a}]`, "[Kk\u212A]"},
   114  	{`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"},
   115  	{`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"},
   116  	{`(?i)[\x00-\x{10FFFF}]`, `(?s:.)`},
   117  
   118  	// Empty string as a regular expression.
   119  	// The empty string must be preserved inside parens in order
   120  	// to make submatches work right, so these tests are less
   121  	// interesting than they might otherwise be. String inserts
   122  	// explicit (?:) in place of non-parenthesized empty strings,
   123  	// to make them easier to spot for other parsers.
   124  	{`(a|b|c|)`, `([a-c]|(?:))`},
   125  	{`(a|b|)`, `([ab]|(?:))`},
   126  	{`(|)`, `()`},
   127  	{`a()`, `a()`},
   128  	{`(()|())`, `(()|())`},
   129  	{`(a|)`, `(a|(?:))`},
   130  	{`ab()cd()`, `ab()cd()`},
   131  	{`()`, `()`},
   132  	{`()*`, `()*`},
   133  	{`()+`, `()+`},
   134  	{`()?`, `()?`},
   135  	{`(){0}`, `(?:)`},
   136  	{`(){1}`, `()`},
   137  	{`(){1,}`, `()+`},
   138  	{`(){0,2}`, `(?:()()?)?`},
   139  }
   140  
   141  func TestSimplify(t *testing.T) {
   142  	for _, tt := range simplifyTests {
   143  		re, err := Parse(tt.Regexp, MatchNL|Perl&^OneLine)
   144  		if err != nil {
   145  			t.Errorf("Parse(%#q) = error %v", tt.Regexp, err)
   146  			continue
   147  		}
   148  		s := re.Simplify().String()
   149  		if s != tt.Simple {
   150  			t.Errorf("Simplify(%#q) = %#q, want %#q", tt.Regexp, s, tt.Simple)
   151  		}
   152  	}
   153  }
   154  

View as plain text