5 package html
6
7 import (
8 "bytes"
9 "io"
10 "io/ioutil"
11 "reflect"
12 "runtime"
13 "strings"
14 "testing"
15 )
16
17
// issue58246 is a Microsoft-Outlook-style conditional comment
// (<!--[if ...]> ... <![endif]-->). It is the input for the
// "issue58246MicrosoftOutlookComment2" case in tokenTests.
const issue58246 = `<!--[if gte mso 12]>
<xml>
<o:OfficeDocumentSettings>
<o:AllowPNG/>
<o:PixelsPerInch>96</o:PixelsPerInch>
</o:OfficeDocumentSettings>
</xml>
<![endif]-->`
26
// tokenTest describes a single tokenizer test case.
type tokenTest struct {
	// desc is a short description of the test case.
	desc string
	// html is the raw HTML input to tokenize.
	html string
	// golden is the expected Token.String() output for each token,
	// joined by '$' (TestTokenizer splits on '$').
	golden string
}
35
// tokenTests drives TestTokenizer, TestMaxBufferReconstruction and
// TestPassthrough. Each golden string lists the expected tokens' String()
// forms separated by '$'; an empty golden means only an error/EOF is checked.
//
// NOTE(review): several entries look as if HTML character references were
// decoded once when this file was copied from a rendered page (e.g. the
// "entity" case has identical html and golden, and the "tricky" case contains
// an unescaped '"' inside an interpreted string, which is not valid Go).
// Verify every string against upstream golang.org/x/net/html token_test.go.
var tokenTests = []tokenTest{
	{
		"empty",
		"",
		"",
	},
	// A single text node.
	// NOTE(review): upstream uses "foo  bar" (two spaces) to check that
	// whitespace is not normalized — the double space may have been
	// collapsed by HTML rendering. TODO confirm.
	{
		"text",
		"foo bar",
		"foo bar",
	},
	// An entity.
	// NOTE(review): html and golden are identical here; upstream input is
	// "one &lt; two" — the entity appears to have been decoded in transit.
	{
		"entity",
		"one < two",
		"one < two",
	},
	// A start, a self-closing and an end tag. Mismatched start/end tags are
	// the parser's problem, not the tokenizer's.
	{
		"tags",
		"<a>b<c/>d</e>",
		"<a>$b$<c/>$d$</e>",
	},
	// Angle brackets that aren't a tag.
	// NOTE(review): several goldens below should contain escaped forms
	// (&lt;, &gt;) per Token.String(); they look entity-decoded here.
	{
		"not a tag #0",
		"<",
		"<",
	},
	{
		"not a tag #1",
		"</",
		"</",
	},
	{
		"not a tag #2",
		"</>",
		"<!---->",
	},
	{
		"not a tag #3",
		"a</>b",
		"a$<!---->$b",
	},
	{
		"not a tag #4",
		"</ >",
		"<!-- -->",
	},
	{
		"not a tag #5",
		"</.",
		"<!--.-->",
	},
	{
		"not a tag #6",
		"</.>",
		"<!--.-->",
	},
	{
		"not a tag #7",
		"a < b",
		"a < b",
	},
	{
		"not a tag #8",
		"<.>",
		"<.>",
	},
	{
		"not a tag #9",
		"a<<<b>>>c",
		"a<<$<b>$>>c",
	},
	{
		"not a tag #10",
		"if x<0 and y < 0 then x*y>0",
		"if x<0 and y < 0 then x*y>0",
	},
	{
		"not a tag #11",
		"<<p>",
		"<$<p>",
	},
	// EOF in a tag name.
	{
		"tag name eof #0",
		"<a",
		"",
	},
	{
		"tag name eof #1",
		"<a ",
		"",
	},
	{
		"tag name eof #2",
		"a<b",
		"a",
	},
	{
		"tag name eof #3",
		"<a><b",
		"<a>",
	},
	{
		"tag name eof #4",
		`<a x`,
		``,
	},
	// Some malformed tags that are missing a '>'.
	{
		"malformed tag #0",
		`<p</p>`,
		`<p< p="">`,
	},
	{
		"malformed tag #1",
		`<p </p>`,
		`<p <="" p="">`,
	},
	{
		"malformed tag #2",
		`<p id`,
		``,
	},
	{
		"malformed tag #3",
		`<p id=`,
		``,
	},
	{
		"malformed tag #4",
		`<p id=>`,
		`<p id="">`,
	},
	{
		"malformed tag #5",
		`<p id=0`,
		``,
	},
	{
		"malformed tag #6",
		`<p id=0</p>`,
		`<p id="0</p">`,
	},
	{
		"malformed tag #7",
		`<p id="0</p>`,
		``,
	},
	{
		"malformed tag #8",
		`<p id="0"</p>`,
		`<p id="0" <="" p="">`,
	},
	{
		"malformed tag #9",
		`<p></p id`,
		`<p>`,
	},
	// Raw text and RCDATA elements (script, style, textarea, title).
	{
		"basic raw text",
		"<script><a></b></script>",
		"<script>$<a></b>$</script>",
	},
	{
		"unfinished script end tag",
		"<SCRIPT>a</SCR",
		"<script>$a</SCR",
	},
	{
		"broken script end tag",
		"<SCRIPT>a</SCR ipt>",
		"<script>$a</SCR ipt>",
	},
	{
		"EOF in script end tag",
		"<SCRIPT>a</SCRipt",
		"<script>$a</SCRipt",
	},
	{
		"scriptx end tag",
		"<SCRIPT>a</SCRiptx",
		"<script>$a</SCRiptx",
	},
	{
		"' ' completes script end tag",
		"<SCRIPT>a</SCRipt ",
		"<script>$a",
	},
	{
		"'>' completes script end tag",
		"<SCRIPT>a</SCRipt>",
		"<script>$a$</script>",
	},
	{
		"self-closing script end tag",
		"<SCRIPT>a</SCRipt/>",
		"<script>$a$</script>",
	},
	{
		"nested script tag",
		"<SCRIPT>a</SCRipt<script>",
		"<script>$a</SCRipt<script>",
	},
	{
		"script end tag after unfinished",
		"<SCRIPT>a</SCRipt</script>",
		"<script>$a</SCRipt$</script>",
	},
	{
		"script/style mismatched tags",
		"<script>a</style>",
		"<script>$a</style>",
	},
	{
		"style element with entity",
		"<style>'",
		"<style>$&apos;",
	},
	{
		"textarea with tag",
		"<textarea><div></textarea>",
		"<textarea>$<div>$</textarea>",
	},
	{
		"title with tag and entity",
		"<title><b>K&R C</b></title>",
		"<title>$<b>K&R C</b>$</title>",
	},
	{
		"title with trailing '<' entity",
		"<title>foobar<</title>",
		"<title>$foobar<$</title>",
	},
	// DOCTYPE tests.
	{
		"Proper DOCTYPE",
		"<!DOCTYPE html>",
		"<!DOCTYPE html>",
	},
	{
		"DOCTYPE with no space",
		"<!doctypehtml>",
		"<!DOCTYPE html>",
	},
	// NOTE(review): the desc says "two spaces" but the input shows one;
	// upstream has "<!doctype  html>" — likely whitespace-collapsed.
	{
		"DOCTYPE with two spaces",
		"<!doctype html>",
		"<!DOCTYPE html>",
	},
	{
		"looks like DOCTYPE but isn't",
		"<!DOCUMENT html>",
		"<!--DOCUMENT html-->",
	},
	{
		"DOCTYPE at EOF",
		"<!DOCtype",
		"<!DOCTYPE >",
	},
	// XML processing instructions are tokenized as comments.
	{
		"XML processing instruction",
		"<?xml?>",
		"<!--?xml?-->",
	},
	// Comments. Numbered cases cover bogus-comment and EOF handling.
	{
		"comment0",
		"abc<b><!-- skipme --></b>def",
		"abc$<b>$<!-- skipme -->$</b>$def",
	},
	{
		"comment1",
		"a<!-->z",
		"a$<!---->$z",
	},
	{
		"comment2",
		"a<!--->z",
		"a$<!---->$z",
	},
	{
		"comment3",
		"a<!--x>-->z",
		"a$<!--x>-->$z",
	},
	{
		"comment4",
		"a<!--x->-->z",
		"a$<!--x->-->$z",
	},
	{
		"comment5",
		"a<!>z",
		"a$<!---->$z",
	},
	{
		"comment6",
		"a<!->z",
		"a$<!----->$z",
	},
	{
		"comment7",
		"a<!---<>z",
		"a$<!---<>z-->",
	},
	{
		"comment8",
		"a<!--z",
		"a$<!--z-->",
	},
	{
		"comment9",
		"a<!--z-",
		"a$<!--z-->",
	},
	{
		"comment10",
		"a<!--z--",
		"a$<!--z-->",
	},
	{
		"comment11",
		"a<!--z---",
		"a$<!--z--->",
	},
	{
		"comment12",
		"a<!--z----",
		"a$<!--z---->",
	},
	{
		"comment13",
		"a<!--x--!>z",
		"a$<!--x-->$z",
	},
	{
		"comment14",
		"a<!--!-->z",
		"a$<!--!-->$z",
	},
	{
		"comment15",
		"a<!-- !-->z",
		"a$<!-- !-->$z",
	},
	{
		"comment16",
		"a<!--i\x00j-->z",
		"a$<!--i\uFFFDj-->$z",
	},
	{
		"comment17",
		"a<!--\x00",
		"a$<!--\uFFFD-->",
	},
	{
		"comment18",
		"a<!--<!-->z",
		"a$<!--<!-->$z",
	},
	{
		"comment19",
		"a<!--<!--",
		"a$<!--<!-->",
	},
	{
		"comment20",
		"a<!--ij--kl-->z",
		"a$<!--ij--kl-->$z",
	},
	{
		"comment21",
		"a<!--ij--kl--!>z",
		"a$<!--ij--kl-->$z",
	},
	{
		"comment22",
		"a<!--!--!<--!-->z",
		"a$<!--!--!<--!-->$z",
	},
	{
		"comment23",
		"a<!-->-->z",
		"a$<!-->-->$z",
	},
	{
		"comment24",
		"a<!-->>x",
		"a$<!-->>x-->",
	},
	{
		"comment25",
		"a<!-->>",
		"a$<!-->>-->",
	},
	{
		"comment26",
		"a<!-->>-",
		"a$<!-->>-->",
	},
	{
		"comment27",
		"a<!-->>-->z",
		"a$<!-->>-->$z",
	},
	// NOTE(review): the goldens for comment28-30 look entity-decoded;
	// upstream escapes '&' in comment data ("&amp;..."). TODO confirm.
	{
		"comment28",
		"a<!--&>-->z",
		"a$<!--&>-->$z",
	},
	{
		"comment29",
		"a<!--&gt;-->z",
		"a$<!--&gt;-->$z",
	},
	{
		"comment30",
		"a<!--&nosuchentity;-->z",
		"a$<!--&nosuchentity;-->$z",
	},
	{
		"comment31",
		"a<!--i>>j-->z",
		"a$<!--i>>j-->$z",
	},
	{
		"comment32",
		"a<!--i!>>j-->z",
		"a$<!--i!>>j-->$z",
	},
	// Regression tests for https://go.dev/issue/48237 and
	// https://go.dev/issue/58246 (comment data round-tripping).
	{
		"issue48237CommentWithAmpgtsemi1",
		"a<!--<p></p><!--[video]-->-->z",
		"a$<!--<p></p><!--[video]-->-->$z",
	},
	{
		"issue48237CommentWithAmpgtsemi2",
		"a<!--<p></p><!--[video]--!>-->z",
		"a$<!--<p></p><!--[video]--!>-->$z",
	},
	{
		"issue58246MicrosoftOutlookComment1",
		"a<!--[if mso]> your code <![endif]-->z",
		"a$<!--[if mso]> your code <![endif]-->$z",
	},
	{
		"issue58246MicrosoftOutlookComment2",
		"a" + issue58246 + "z",
		"a$" + issue58246 + "$z",
	},
	// An attribute value containing a literal backslash.
	{
		"backslash",
		`<p id="a\"b">`,
		`<p id="a\" b"="">`,
	},
	// Entities, tag name and attribute key lower-casing, and whitespace
	// normalization within a tag.
	// NOTE(review): the html string below contains an unescaped '"' and is
	// NOT valid Go; upstream uses `iD=\"a&quot;B\"` — almost certainly lost
	// to entity decoding. TODO: restore from upstream.
	{
		"tricky",
		"<p \t\n iD=\"a"B\" foo=\"bar\"><EM>te<&;xt</em></p>",
		`<p id="a"B" foo="bar">$<em>$te<&;xt$</em>$</p>`,
	},
	// A nonexistent entity. Tokenizing and converting back to a string
	// should escape the "&" to become "&amp;".
	{
		"noSuchEntity",
		`<a b="c&noSuchEntity;d"><&alsoDoesntExist;&`,
		`<a b="c&noSuchEntity;d">$<&alsoDoesntExist;&`,
	},
	{
		"entity without semicolon",
		`¬it;∉<a b="q=z&=5¬ice=hello¬=world">`,
		`¬it;∉$<a b="q=z&amp=5&notice=hello¬=world">`,
	},
	{
		"entity with digits",
		"½",
		"½",
	},
	// Attribute tests: empty, unquoted, single- and double-quoted values.
	{
		"Empty attribute",
		`<input disabled FOO>`,
		`<input disabled="" foo="">`,
	},
	{
		"Empty attribute, whitespace",
		`<input disabled FOO >`,
		`<input disabled="" foo="">`,
	},
	{
		"Unquoted attribute value",
		`<input value=yes FOO=BAR>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Unquoted attribute value, spaces",
		`<input value = yes FOO = BAR>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Unquoted attribute value, trailing space",
		`<input value=yes FOO=BAR >`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Single-quoted attribute value",
		`<input value='yes' FOO='BAR'>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Single-quoted attribute value, trailing space",
		`<input value='yes' FOO='BAR' >`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Double-quoted attribute value",
		`<input value="I'm an attribute" FOO="BAR">`,
		`<input value="I'm an attribute" foo="BAR">`,
	},
	{
		"Attribute name characters",
		`<meta http-equiv="content-type">`,
		`<meta http-equiv="content-type">`,
	},
	{
		"Mixed attributes",
		`a<P V="0 1" w='2' X=3 y>z`,
		`a$<p v="0 1" w="2" x="3" y="">$z`,
	},
	{
		"Attributes with a solitary single quote",
		`<p id=can't><p id=won't>`,
		`<p id="can't">$<p id="won't">`,
	},
	// Unusual attribute-name starts: '=' and '/'.
	{
		"equals sign before attribute name",
		`<p =>`,
		`<p =="">`,
	},
	{
		"equals sign before attribute name, extra cruft",
		`<p =asd>`,
		`<p =asd="">`,
	},
	{
		"forward slash before attribute name",
		`<p/=">`,
		`<p ="="">`,
	},
	{
		"forward slash before attribute name with spaces around",
		`<p / =">`,
		`<p ="="">`,
	},
	{
		"forward slash after attribute name followed by a character",
		`<p a/ ="">`,
		`<p a="" =""="">`,
	},
}
620
621 func TestTokenizer(t *testing.T) {
622 for _, tt := range tokenTests {
623 t.Run(tt.desc, func(t *testing.T) {
624 z := NewTokenizer(strings.NewReader(tt.html))
625 if tt.golden != "" {
626 for i, s := range strings.Split(tt.golden, "$") {
627 if z.Next() == ErrorToken {
628 t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Err())
629 return
630 }
631 actual := z.Token().String()
632 if s != actual {
633 t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
634 return
635 }
636 }
637 }
638 z.Next()
639 if z.Err() != io.EOF {
640 t.Errorf("%s: want EOF got %q", tt.desc, z.Err())
641 }
642 })
643 }
644 }
645
646 func TestMaxBuffer(t *testing.T) {
647
648 z := NewTokenizer(strings.NewReader("<" + strings.Repeat("t", 10)))
649 z.SetMaxBuf(5)
650 tt := z.Next()
651 if got, want := tt, ErrorToken; got != want {
652 t.Fatalf("token type: got: %v want: %v", got, want)
653 }
654 if got, want := z.Err(), ErrBufferExceeded; got != want {
655 t.Errorf("error type: got: %v want: %v", got, want)
656 }
657 if got, want := string(z.Raw()), "<tttt"; got != want {
658 t.Fatalf("buffered before overflow: got: %q want: %q", got, want)
659 }
660 }
661
662 func TestMaxBufferReconstruction(t *testing.T) {
663
664
665 tests:
666 for _, test := range tokenTests {
667 for maxBuf := 1; ; maxBuf++ {
668 r := strings.NewReader(test.html)
669 z := NewTokenizer(r)
670 z.SetMaxBuf(maxBuf)
671 var tokenized bytes.Buffer
672 for {
673 tt := z.Next()
674 tokenized.Write(z.Raw())
675 if tt == ErrorToken {
676 if err := z.Err(); err != io.EOF && err != ErrBufferExceeded {
677 t.Errorf("%s: unexpected error: %v", test.desc, err)
678 }
679 break
680 }
681 }
682
683 assembled, err := ioutil.ReadAll(io.MultiReader(&tokenized, bytes.NewReader(z.Buffered()), r))
684 if err != nil {
685 t.Errorf("%s: ReadAll: %v", test.desc, err)
686 continue tests
687 }
688 if got, want := string(assembled), test.html; got != want {
689 t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want)
690 continue tests
691 }
692
693
694 if z.Err() == io.EOF {
695 break
696 }
697 }
698 }
699 }
700
701 func TestPassthrough(t *testing.T) {
702
703
704 for _, test := range tokenTests {
705 z := NewTokenizer(strings.NewReader(test.html))
706 var parsed bytes.Buffer
707 for {
708 tt := z.Next()
709 parsed.Write(z.Raw())
710 if tt == ErrorToken {
711 break
712 }
713 }
714 if got, want := parsed.String(), test.html; got != want {
715 t.Errorf("%s: parsed output:\n got: %q\nwant: %q", test.desc, got, want)
716 }
717 }
718 }
719
720 func TestBufAPI(t *testing.T) {
721 s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
722 z := NewTokenizer(bytes.NewBufferString(s))
723 var result bytes.Buffer
724 depth := 0
725 loop:
726 for {
727 tt := z.Next()
728 switch tt {
729 case ErrorToken:
730 if z.Err() != io.EOF {
731 t.Error(z.Err())
732 }
733 break loop
734 case TextToken:
735 if depth > 0 {
736 result.Write(z.Text())
737 }
738 case StartTagToken, EndTagToken:
739 tn, _ := z.TagName()
740 if len(tn) == 1 && tn[0] == 'a' {
741 if tt == StartTagToken {
742 depth++
743 } else {
744 depth--
745 }
746 }
747 }
748 }
749 u := "14567"
750 v := string(result.Bytes())
751 if u != v {
752 t.Errorf("TestBufAPI: want %q got %q", u, v)
753 }
754 }
755
756 func TestConvertNewlines(t *testing.T) {
757 testCases := map[string]string{
758 "Mac\rDOS\r\nUnix\n": "Mac\nDOS\nUnix\n",
759 "Unix\nMac\rDOS\r\n": "Unix\nMac\nDOS\n",
760 "DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
761 "": "",
762 "\n": "\n",
763 "\n\r": "\n\n",
764 "\r": "\n",
765 "\r\n": "\n",
766 "\r\n\n": "\n\n",
767 "\r\n\r": "\n\n",
768 "\r\n\r\n": "\n\n",
769 "\r\r": "\n\n",
770 "\r\r\n": "\n\n",
771 "\r\r\n\n": "\n\n\n",
772 "\r\r\r\n": "\n\n\n",
773 "\r \n": "\n \n",
774 "xyz": "xyz",
775 }
776 for in, want := range testCases {
777 if got := string(convertNewlines([]byte(in))); got != want {
778 t.Errorf("input %q: got %q, want %q", in, got, want)
779 }
780 }
781 }
782
783 func TestReaderEdgeCases(t *testing.T) {
784 const s = "<p>An io.Reader can return (0, nil) or (n, io.EOF).</p>"
785 testCases := []io.Reader{
786 &zeroOneByteReader{s: s},
787 &eofStringsReader{s: s},
788 &stuckReader{},
789 }
790 for i, tc := range testCases {
791 got := []TokenType{}
792 z := NewTokenizer(tc)
793 for {
794 tt := z.Next()
795 if tt == ErrorToken {
796 break
797 }
798 got = append(got, tt)
799 }
800 if err := z.Err(); err != nil && err != io.EOF {
801 if err != io.ErrNoProgress {
802 t.Errorf("i=%d: %v", i, err)
803 }
804 continue
805 }
806 want := []TokenType{
807 StartTagToken,
808 TextToken,
809 EndTagToken,
810 }
811 if !reflect.DeepEqual(got, want) {
812 t.Errorf("i=%d: got %v, want %v", i, got, want)
813 continue
814 }
815 }
816 }
817
818
819
// zeroOneByteReader yields its string one byte at a time, and returns
// (0, nil) on every other call to Read.
type zeroOneByteReader struct {
	s string
	n int // counts Read calls that passed the emptiness checks
}

// Read returns (0, nil) for an empty p and on alternating calls; otherwise
// it produces the next single byte. Once the string is exhausted it returns
// (0, io.EOF).
func (r *zeroOneByteReader) Read(p []byte) (int, error) {
	if len(p) == 0 {
		return 0, nil
	}
	if r.s == "" {
		return 0, io.EOF
	}
	r.n++
	if r.n%2 == 1 {
		return 0, nil
	}
	p[0] = r.s[0]
	r.s = r.s[1:]
	return 1, nil
}
839
840
841
// eofStringsReader returns io.EOF together with the final bytes of its
// string, rather than on a subsequent call.
type eofStringsReader struct {
	s string
}

// Read copies as much of the remaining string as fits into p. When this call
// exhausts the string, io.EOF is returned alongside the data.
func (r *eofStringsReader) Read(p []byte) (int, error) {
	n := copy(p, r.s)
	r.s = r.s[n:]
	if len(r.s) > 0 {
		return n, nil
	}
	return n, io.EOF
}
854
855
// stuckReader is an io.Reader that makes no progress: every call to Read
// returns (0, nil).
type stuckReader struct{}

// Read reports zero bytes read and no error, forever.
func (*stuckReader) Read([]byte) (int, error) {
	return 0, nil
}
861
// Benchmark levels: how much of each token benchmarkTokenizer inspects.
const (
	rawLevel = iota // only the Raw bytes
	lowLevel // tag names, attributes and text, without building Tokens
	highLevel // fully constructed Token values
)
867
868 func benchmarkTokenizer(b *testing.B, level int) {
869 buf, err := ioutil.ReadFile("testdata/go1.html")
870 if err != nil {
871 b.Fatalf("could not read testdata/go1.html: %v", err)
872 }
873 b.SetBytes(int64(len(buf)))
874 runtime.GC()
875 b.ReportAllocs()
876 b.ResetTimer()
877 for i := 0; i < b.N; i++ {
878 z := NewTokenizer(bytes.NewBuffer(buf))
879 for {
880 tt := z.Next()
881 if tt == ErrorToken {
882 if err := z.Err(); err != nil && err != io.EOF {
883 b.Fatalf("tokenizer error: %v", err)
884 }
885 break
886 }
887 switch level {
888 case rawLevel:
889
890
891 z.Raw()
892 case lowLevel:
893
894
895 switch tt {
896 case TextToken, CommentToken, DoctypeToken:
897 z.Text()
898 case StartTagToken, SelfClosingTagToken:
899 _, more := z.TagName()
900 for more {
901 _, _, more = z.TagAttr()
902 }
903 case EndTagToken:
904 z.TagName()
905 }
906 case highLevel:
907
908
909 z.Token()
910 }
911 }
912 }
913 }
914
// Benchmarks over testdata/go1.html at each inspection level.
func BenchmarkRawLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, rawLevel) }
func BenchmarkLowLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, lowLevel) }
func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }
918