5 package html
6
7 import (
8 "bytes"
9 "io"
10 "io/ioutil"
11 "reflect"
12 "runtime"
13 "strings"
14 "testing"
15 )
16
17
// issue58246 is a Microsoft-Outlook-style conditional comment
// (<!--[if ...]> ... <![endif]-->). It is the input for the
// "issue58246MicrosoftOutlookComment2" case in tokenTests.
const issue58246 = `<!--[if gte mso 12]>
<xml>
<o:OfficeDocumentSettings>
<o:AllowPNG/>
<o:PixelsPerInch>96</o:PixelsPerInch>
</o:OfficeDocumentSettings>
</xml>
<![endif]-->`
26
// tokenTest describes a single tokenizer test case.
type tokenTest struct {
	// desc is a short description of the test case.
	desc string
	// html is the raw HTML input to tokenize.
	html string
	// golden is the expected Token.String() output for each token,
	// joined by '$' (TestTokenizer splits on '$').
	golden string
}
35
// tokenTests drives TestTokenizer, TestMaxBufferReconstruction and
// TestPassthrough. Each golden string lists the expected tokens' String()
// forms separated by '$'; an empty golden means only an error/EOF is checked.
//
// NOTE(review): several entries look as if HTML character references were
// decoded once when this file was copied from a rendered page (e.g. the
// "entity" case has identical html and golden, and the "tricky" case contains
// an unescaped '"' inside an interpreted string, which is not valid Go).
// Verify every string against upstream golang.org/x/net/html token_test.go.
var tokenTests = []tokenTest{
	{
		"empty",
		"",
		"",
	},
	// A single text node.
	// NOTE(review): upstream uses "foo  bar" (two spaces) to check that
	// whitespace is not normalized — the double space may have been
	// collapsed by HTML rendering. TODO confirm.
	{
		"text",
		"foo bar",
		"foo bar",
	},
	// An entity.
	// NOTE(review): html and golden are identical here; upstream input is
	// "one &lt; two" — the entity appears to have been decoded in transit.
	{
		"entity",
		"one < two",
		"one < two",
	},
	// A start, a self-closing and an end tag. Mismatched start/end tags are
	// the parser's problem, not the tokenizer's.
	{
		"tags",
		"<a>b<c/>d</e>",
		"<a>$b$<c/>$d$</e>",
	},
	// Angle brackets that aren't a tag.
	// NOTE(review): several goldens below should contain escaped forms
	// (&lt;, &gt;) per Token.String(); they look entity-decoded here.
	{
		"not a tag #0",
		"<",
		"<",
	},
	{
		"not a tag #1",
		"</",
		"</",
	},
	{
		"not a tag #2",
		"</>",
		"<!---->",
	},
	{
		"not a tag #3",
		"a</>b",
		"a$<!---->$b",
	},
	{
		"not a tag #4",
		"</ >",
		"<!-- -->",
	},
	{
		"not a tag #5",
		"</.",
		"<!--.-->",
	},
	{
		"not a tag #6",
		"</.>",
		"<!--.-->",
	},
	{
		"not a tag #7",
		"a < b",
		"a < b",
	},
	{
		"not a tag #8",
		"<.>",
		"<.>",
	},
	{
		"not a tag #9",
		"a<<<b>>>c",
		"a<<$<b>$>>c",
	},
	{
		"not a tag #10",
		"if x<0 and y < 0 then x*y>0",
		"if x<0 and y < 0 then x*y>0",
	},
	{
		"not a tag #11",
		"<<p>",
		"<$<p>",
	},
	// EOF in a tag name.
	{
		"tag name eof #0",
		"<a",
		"",
	},
	{
		"tag name eof #1",
		"<a ",
		"",
	},
	{
		"tag name eof #2",
		"a<b",
		"a",
	},
	{
		"tag name eof #3",
		"<a><b",
		"<a>",
	},
	{
		"tag name eof #4",
		`<a x`,
		``,
	},
	// Some malformed tags that are missing a '>'.
	{
		"malformed tag #0",
		`<p</p>`,
		`<p< p="">`,
	},
	{
		"malformed tag #1",
		`<p </p>`,
		`<p <="" p="">`,
	},
	{
		"malformed tag #2",
		`<p id`,
		``,
	},
	{
		"malformed tag #3",
		`<p id=`,
		``,
	},
	{
		"malformed tag #4",
		`<p id=>`,
		`<p id="">`,
	},
	{
		"malformed tag #5",
		`<p id=0`,
		``,
	},
	{
		"malformed tag #6",
		`<p id=0</p>`,
		`<p id="0</p">`,
	},
	{
		"malformed tag #7",
		`<p id="0</p>`,
		``,
	},
	{
		"malformed tag #8",
		`<p id="0"</p>`,
		`<p id="0" <="" p="">`,
	},
	{
		"malformed tag #9",
		`<p></p id`,
		`<p>`,
	},
	// Raw text and RCDATA elements (script, style, textarea, title).
	{
		"basic raw text",
		"<script><a></b></script>",
		"<script>$<a></b>$</script>",
	},
	{
		"unfinished script end tag",
		"<SCRIPT>a</SCR",
		"<script>$a</SCR",
	},
	{
		"broken script end tag",
		"<SCRIPT>a</SCR ipt>",
		"<script>$a</SCR ipt>",
	},
	{
		"EOF in script end tag",
		"<SCRIPT>a</SCRipt",
		"<script>$a</SCRipt",
	},
	{
		"scriptx end tag",
		"<SCRIPT>a</SCRiptx",
		"<script>$a</SCRiptx",
	},
	{
		"' ' completes script end tag",
		"<SCRIPT>a</SCRipt ",
		"<script>$a",
	},
	{
		"'>' completes script end tag",
		"<SCRIPT>a</SCRipt>",
		"<script>$a$</script>",
	},
	{
		"self-closing script end tag",
		"<SCRIPT>a</SCRipt/>",
		"<script>$a$</script>",
	},
	{
		"nested script tag",
		"<SCRIPT>a</SCRipt<script>",
		"<script>$a</SCRipt<script>",
	},
	{
		"script end tag after unfinished",
		"<SCRIPT>a</SCRipt</script>",
		"<script>$a</SCRipt$</script>",
	},
	{
		"script/style mismatched tags",
		"<script>a</style>",
		"<script>$a</style>",
	},
	{
		"style element with entity",
		"<style>'",
		"<style>$&apos;",
	},
	{
		"textarea with tag",
		"<textarea><div></textarea>",
		"<textarea>$<div>$</textarea>",
	},
	{
		"title with tag and entity",
		"<title><b>K&R C</b></title>",
		"<title>$<b>K&R C</b>$</title>",
	},
	{
		"title with trailing '<' entity",
		"<title>foobar<</title>",
		"<title>$foobar<$</title>",
	},
	// DOCTYPE tests.
	{
		"Proper DOCTYPE",
		"<!DOCTYPE html>",
		"<!DOCTYPE html>",
	},
	{
		"DOCTYPE with no space",
		"<!doctypehtml>",
		"<!DOCTYPE html>",
	},
	// NOTE(review): the desc says "two spaces" but the input shows one;
	// upstream has "<!doctype  html>" — likely whitespace-collapsed.
	{
		"DOCTYPE with two spaces",
		"<!doctype html>",
		"<!DOCTYPE html>",
	},
	{
		"looks like DOCTYPE but isn't",
		"<!DOCUMENT html>",
		"<!--DOCUMENT html-->",
	},
	{
		"DOCTYPE at EOF",
		"<!DOCtype",
		"<!DOCTYPE >",
	},
	// XML processing instructions are tokenized as comments.
	{
		"XML processing instruction",
		"<?xml?>",
		"<!--?xml?-->",
	},
	// Comments. Numbered cases cover bogus-comment and EOF handling.
	{
		"comment0",
		"abc<b><!-- skipme --></b>def",
		"abc$<b>$<!-- skipme -->$</b>$def",
	},
	{
		"comment1",
		"a<!-->z",
		"a$<!---->$z",
	},
	{
		"comment2",
		"a<!--->z",
		"a$<!---->$z",
	},
	{
		"comment3",
		"a<!--x>-->z",
		"a$<!--x>-->$z",
	},
	{
		"comment4",
		"a<!--x->-->z",
		"a$<!--x->-->$z",
	},
	{
		"comment5",
		"a<!>z",
		"a$<!---->$z",
	},
	{
		"comment6",
		"a<!->z",
		"a$<!----->$z",
	},
	{
		"comment7",
		"a<!---<>z",
		"a$<!---<>z-->",
	},
	{
		"comment8",
		"a<!--z",
		"a$<!--z-->",
	},
	{
		"comment9",
		"a<!--z-",
		"a$<!--z-->",
	},
	{
		"comment10",
		"a<!--z--",
		"a$<!--z-->",
	},
	{
		"comment11",
		"a<!--z---",
		"a$<!--z--->",
	},
	{
		"comment12",
		"a<!--z----",
		"a$<!--z---->",
	},
	{
		"comment13",
		"a<!--x--!>z",
		"a$<!--x-->$z",
	},
	{
		"comment14",
		"a<!--!-->z",
		"a$<!--!-->$z",
	},
	{
		"comment15",
		"a<!-- !-->z",
		"a$<!-- !-->$z",
	},
	{
		"comment16",
		"a<!--i\x00j-->z",
		"a$<!--i\uFFFDj-->$z",
	},
	{
		"comment17",
		"a<!--\x00",
		"a$<!--\uFFFD-->",
	},
	{
		"comment18",
		"a<!--<!-->z",
		"a$<!--<!-->$z",
	},
	{
		"comment19",
		"a<!--<!--",
		"a$<!--<!-->",
	},
	{
		"comment20",
		"a<!--ij--kl-->z",
		"a$<!--ij--kl-->$z",
	},
	{
		"comment21",
		"a<!--ij--kl--!>z",
		"a$<!--ij--kl-->$z",
	},
	{
		"comment22",
		"a<!--!--!<--!-->z",
		"a$<!--!--!<--!-->$z",
	},
	{
		"comment23",
		"a<!-->-->z",
		"a$<!-->-->$z",
	},
	{
		"comment24",
		"a<!-->>x",
		"a$<!-->>x-->",
	},
	{
		"comment25",
		"a<!-->>",
		"a$<!-->>-->",
	},
	{
		"comment26",
		"a<!-->>-",
		"a$<!-->>-->",
	},
	{
		"comment27",
		"a<!-->>-->z",
		"a$<!-->>-->$z",
	},
	// NOTE(review): the goldens for comment28-30 look entity-decoded;
	// upstream escapes '&' in comment data ("&amp;..."). TODO confirm.
	{
		"comment28",
		"a<!--&>-->z",
		"a$<!--&>-->$z",
	},
	{
		"comment29",
		"a<!--&gt;-->z",
		"a$<!--&gt;-->$z",
	},
	{
		"comment30",
		"a<!--&nosuchentity;-->z",
		"a$<!--&nosuchentity;-->$z",
	},
	{
		"comment31",
		"a<!--i>>j-->z",
		"a$<!--i>>j-->$z",
	},
	{
		"comment32",
		"a<!--i!>>j-->z",
		"a$<!--i!>>j-->$z",
	},
	// Regression tests for https://go.dev/issue/48237 and
	// https://go.dev/issue/58246 (comment data round-tripping).
	{
		"issue48237CommentWithAmpgtsemi1",
		"a<!--<p></p><!--[video]-->-->z",
		"a$<!--<p></p><!--[video]-->-->$z",
	},
	{
		"issue48237CommentWithAmpgtsemi2",
		"a<!--<p></p><!--[video]--!>-->z",
		"a$<!--<p></p><!--[video]--!>-->$z",
	},
	{
		"issue58246MicrosoftOutlookComment1",
		"a<!--[if mso]> your code <![endif]-->z",
		"a$<!--[if mso]> your code <![endif]-->$z",
	},
	{
		"issue58246MicrosoftOutlookComment2",
		"a" + issue58246 + "z",
		"a$" + issue58246 + "$z",
	},
	// An attribute value containing a literal backslash.
	{
		"backslash",
		`<p id="a\"b">`,
		`<p id="a\" b"="">`,
	},
	// Entities, tag name and attribute key lower-casing, and whitespace
	// normalization within a tag.
	// NOTE(review): the html string below contains an unescaped '"' and is
	// NOT valid Go; upstream uses `iD=\"a&quot;B\"` — almost certainly lost
	// to entity decoding. TODO: restore from upstream.
	{
		"tricky",
		"<p \t\n iD=\"a"B\" foo=\"bar\"><EM>te<&;xt</em></p>",
		`<p id="a"B" foo="bar">$<em>$te<&;xt$</em>$</p>`,
	},
	// A nonexistent entity. Tokenizing and converting back to a string
	// should escape the "&" to become "&amp;".
	{
		"noSuchEntity",
		`<a b="c&noSuchEntity;d"><&alsoDoesntExist;&`,
		`<a b="c&noSuchEntity;d">$<&alsoDoesntExist;&`,
	},
	{
		"entity without semicolon",
		`¬it;∉<a b="q=z&=5¬ice=hello¬=world">`,
		`¬it;∉$<a b="q=z&amp=5&notice=hello¬=world">`,
	},
	{
		"entity with digits",
		"½",
		"½",
	},
	// Attribute tests: empty, unquoted, single- and double-quoted values.
	{
		"Empty attribute",
		`<input disabled FOO>`,
		`<input disabled="" foo="">`,
	},
	{
		"Empty attribute, whitespace",
		`<input disabled FOO >`,
		`<input disabled="" foo="">`,
	},
	{
		"Unquoted attribute value",
		`<input value=yes FOO=BAR>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Unquoted attribute value, spaces",
		`<input value = yes FOO = BAR>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Unquoted attribute value, trailing space",
		`<input value=yes FOO=BAR >`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Single-quoted attribute value",
		`<input value='yes' FOO='BAR'>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Single-quoted attribute value, trailing space",
		`<input value='yes' FOO='BAR' >`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Double-quoted attribute value",
		`<input value="I'm an attribute" FOO="BAR">`,
		`<input value="I'm an attribute" foo="BAR">`,
	},
	{
		"Attribute name characters",
		`<meta http-equiv="content-type">`,
		`<meta http-equiv="content-type">`,
	},
	{
		"Mixed attributes",
		`a<P V="0 1" w='2' X=3 y>z`,
		`a$<p v="0 1" w="2" x="3" y="">$z`,
	},
	{
		"Attributes with a solitary single quote",
		`<p id=can't><p id=won't>`,
		`<p id="can't">$<p id="won't">`,
	},
	// Unusual attribute-name starts: '=' and '/'.
	{
		"equals sign before attribute name",
		`<p =>`,
		`<p =="">`,
	},
	{
		"equals sign before attribute name, extra cruft",
		`<p =asd>`,
		`<p =asd="">`,
	},
	{
		"forward slash before attribute name",
		`<p/=">`,
		`<p ="="">`,
	},
	{
		"forward slash before attribute name with spaces around",
		`<p / =">`,
		`<p ="="">`,
	},
	{
		"forward slash after attribute name followed by a character",
		`<p a/ ="">`,
		`<p a="" =""="">`,
	},
}
620
621 func TestTokenizer(t *testing.T) {
622 for _, tt := range tokenTests {
623 t.Run(tt.desc, func(t *testing.T) {
624 z := NewTokenizer(strings.NewReader(tt.html))
625 if tt.golden != "" {
626 for i, s := range strings.Split(tt.golden, "$") {
627 if z.Next() == ErrorToken {
628 t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Err())
629 return
630 }
631 actual := z.Token().String()
632 if s != actual {
633 t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
634 return
635 }
636 }
637 }
638 z.Next()
639 if z.Err() != io.EOF {
640 t.Errorf("%s: want EOF got %q", tt.desc, z.Err())
641 }
642 })
643 }
644 }
645
646 func TestMaxBuffer(t *testing.T) {
647
648 z := NewTokenizer(strings.NewReader("<" + strings.Repeat("t", 10)))
649 z.SetMaxBuf(5)
650 tt := z.Next()
651 if got, want := tt, ErrorToken; got != want {
652 t.Fatalf("token type: got: %v want: %v", got, want)
653 }
654 if got, want := z.Err(), ErrBufferExceeded; got != want {
655 t.Errorf("error type: got: %v want: %v", got, want)
656 }
657 if got, want := string(z.Raw()), "<tttt"; got != want {
658 t.Fatalf("buffered before overflow: got: %q want: %q", got, want)
659 }
660 }
661
662 func TestMaxBufferReconstruction(t *testing.T) {
663
664
665 tests:
666 for _, test := range tokenTests {
667 for maxBuf := 1; ; maxBuf++ {
668 r := strings.NewReader(test.html)
669 z := NewTokenizer(r)
670 z.SetMaxBuf(maxBuf)
671 var tokenized bytes.Buffer
672 for {
673 tt := z.Next()
674 tokenized.Write(z.Raw())
675 if tt == ErrorToken {
676 if err := z.Err(); err != io.EOF && err != ErrBufferExceeded {
677 t.Errorf("%s: unexpected error: %v", test.desc, err)
678 }
679 break
680 }
681 }
682
683 assembled, err := ioutil.ReadAll(io.MultiReader(&tokenized, bytes.NewReader(z.Buffered()), r))
684 if err != nil {
685 t.Errorf("%s: ReadAll: %v", test.desc, err)
686 continue tests
687 }
688 if got, want := string(assembled), test.html; got != want {
689 t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want)
690 continue tests
691 }
692
693
694 if z.Err() == io.EOF {
695 break
696 }
697 }
698 }
699 }
700
701 func TestPassthrough(t *testing.T) {
702
703
704 for _, test := range tokenTests {
705 z := NewTokenizer(strings.NewReader(test.html))
706 var parsed bytes.Buffer
707 for {
708 tt := z.Next()
709 parsed.Write(z.Raw())
710 if tt == ErrorToken {
711 break
712 }
713 }
714 if got, want := parsed.String(), test.html; got != want {
715 t.Errorf("%s: parsed output:\n got: %q\nwant: %q", test.desc, got, want)
716 }
717 }
718 }
719
720 func TestBufAPI(t *testing.T) {
721 s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
722 z := NewTokenizer(bytes.NewBufferString(s))
723 var result bytes.Buffer
724 depth := 0
725 loop:
726 for {
727 tt := z.Next()
728 switch tt {
729 case ErrorToken:
730 if z.Err() != io.EOF {
731 t.Error(z.Err())
732 }
733 break loop
734 case TextToken:
735 if depth > 0 {
736 result.Write(z.Text())
737 }
738 case StartTagToken, EndTagToken:
739 tn, _ := z.TagName()
740 if len(tn) == 1 && tn[0] == 'a' {
741 if tt == StartTagToken {
742 depth++
743 } else {
744 depth--
745 }
746 }
747 }
748 }
749 u := "14567"
750 v := string(result.Bytes())
751 if u != v {
752 t.Errorf("TestBufAPI: want %q got %q", u, v)
753 }
754 }
755
756 func TestConvertNewlines(t *testing.T) {
757 testCases := map[string]string{
758 "Mac\rDOS\r\nUnix\n": "Mac\nDOS\nUnix\n",
759 "Unix\nMac\rDOS\r\n": "Unix\nMac\nDOS\n",
760 "DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
761 "": "",
762 "\n": "\n",
763 "\n\r": "\n\n",
764 "\r": "\n",
765 "\r\n": "\n",
766 "\r\n\n": "\n\n",
767 "\r\n\r": "\n\n",
768 "\r\n\r\n": "\n\n",
769 "\r\r": "\n\n",
770 "\r\r\n": "\n\n",
771 "\r\r\n\n": "\n\n\n",
772 "\r\r\r\n": "\n\n\n",
773 "\r \n": "\n \n",
774 "xyz": "xyz",
775 }
776 for in, want := range testCases {
777 if got := string(convertNewlines([]byte(in))); got != want {
778 t.Errorf("input %q: got %q, want %q", in, got, want)
779 }
780 }
781 }
782
783 func TestReaderEdgeCases(t *testing.T) {
784 const s = "<p>An io.Reader can return (0, nil) or (n, io.EOF).</p>"
785 testCases := []io.Reader{
786 &zeroOneByteReader{s: s},
787 &eofStringsReader{s: s},
788 &stuckReader{},
789 }
790 for i, tc := range testCases {
791 got := []TokenType{}
792 z := NewTokenizer(tc)
793 for {
794 tt := z.Next()
795 if tt == ErrorToken {
796 break
797 }
798 got = append(got, tt)
799 }
800 if err := z.Err(); err != nil && err != io.EOF {
801 if err != io.ErrNoProgress {
802 t.Errorf("i=%d: %v", i, err)
803 }
804 continue
805 }
806 want := []TokenType{
807 StartTagToken,
808 TextToken,
809 EndTagToken,
810 }
811 if !reflect.DeepEqual(got, want) {
812 t.Errorf("i=%d: got %v, want %v", i, got, want)
813 continue
814 }
815 }
816 }
817
818
819
// zeroOneByteReader yields its string one byte at a time, and returns
// (0, nil) on every other call to Read.
type zeroOneByteReader struct {
	s string
	n int // counts Read calls that passed the emptiness checks
}

// Read returns (0, nil) for an empty p and on alternating calls; otherwise
// it produces the next single byte. Once the string is exhausted it returns
// (0, io.EOF).
func (r *zeroOneByteReader) Read(p []byte) (int, error) {
	if len(p) == 0 {
		return 0, nil
	}
	if r.s == "" {
		return 0, io.EOF
	}
	r.n++
	if r.n%2 == 1 {
		return 0, nil
	}
	p[0] = r.s[0]
	r.s = r.s[1:]
	return 1, nil
}
839
840
841
// eofStringsReader returns io.EOF together with the final bytes of its
// string, rather than on a subsequent call.
type eofStringsReader struct {
	s string
}

// Read copies as much of the remaining string as fits into p. When this call
// exhausts the string, io.EOF is returned alongside the data.
func (r *eofStringsReader) Read(p []byte) (int, error) {
	n := copy(p, r.s)
	r.s = r.s[n:]
	if len(r.s) > 0 {
		return n, nil
	}
	return n, io.EOF
}
854
855
// stuckReader is an io.Reader that makes no progress: every call to Read
// returns (0, nil).
type stuckReader struct{}

// Read reports zero bytes read and no error, forever.
func (*stuckReader) Read([]byte) (int, error) {
	return 0, nil
}
861
// Benchmark levels: how much of each token benchmarkTokenizer inspects.
const (
	rawLevel = iota // only the Raw bytes
	lowLevel // tag names, attributes and text, without building Tokens
	highLevel // fully constructed Token values
)
867
868 func benchmarkTokenizer(b *testing.B, level int) {
869 buf, err := ioutil.ReadFile("testdata/go1.html")
870 if err != nil {
871 b.Fatalf("could not read testdata/go1.html: %v", err)
872 }
873 b.SetBytes(int64(len(buf)))
874 runtime.GC()
875 b.ReportAllocs()
876 b.ResetTimer()
877 for i := 0; i < b.N; i++ {
878 z := NewTokenizer(bytes.NewBuffer(buf))
879 for {
880 tt := z.Next()
881 if tt == ErrorToken {
882 if err := z.Err(); err != nil && err != io.EOF {
883 b.Fatalf("tokenizer error: %v", err)
884 }
885 break
886 }
887 switch level {
888 case rawLevel:
889
890
891 z.Raw()
892 case lowLevel:
893
894
895 switch tt {
896 case TextToken, CommentToken, DoctypeToken:
897 z.Text()
898 case StartTagToken, SelfClosingTagToken:
899 _, more := z.TagName()
900 for more {
901 _, _, more = z.TagAttr()
902 }
903 case EndTagToken:
904 z.TagName()
905 }
906 case highLevel:
907
908
909 z.Token()
910 }
911 }
912 }
913 }
914
// Benchmarks over testdata/go1.html at each inspection level.
func BenchmarkRawLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, rawLevel) }
func BenchmarkLowLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, lowLevel) }
func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }
918