1
2
3
4
5 package html
6
7 import (
8 "bufio"
9 "bytes"
10 "errors"
11 "fmt"
12 "io"
13 "io/ioutil"
14 "os"
15 "path/filepath"
16 "runtime"
17 "sort"
18 "strings"
19 "testing"
20
21 "golang.org/x/net/html/atom"
22 )
23
// testAttrs holds one test case as read from a test data file.
type testAttrs struct {
	// text is the content of the "#data" section, without its final newline.
	text string
	// want is the content of the "#document" section.
	want string
	// context is the trimmed line following "#document-fragment", or "" when
	// the test case has no such section.
	context string
	// scripting defaults to true and is overridden by a "#script-on" or
	// "#script-off" header.
	scripting bool
}

// readParseTest reads a single test case from r.
//
// A test case consists of a "#data" section, an "#errors" section, optional
// "#new-errors", "#script-on"/"#script-off" and "#document-fragment"
// sections, and a final "#document" section that runs up to a blank line
// (outside of a quoted text node) or the end of the input. The contents of
// the error and script sections are read and discarded.
//
// It returns the error from r (io.EOF included) when the input ends before
// the "#document" section has been reached, and a descriptive error when an
// expected section header is missing.
func readParseTest(r *bufio.Reader) (*testAttrs, error) {
	var (
		line []byte
		err  error
	)
	// nextHeader discards lines until it has read one that starts with '#',
	// which it leaves in line.
	nextHeader := func() error {
		for {
			if line, err = r.ReadSlice('\n'); err != nil {
				return err
			}
			if line[0] == '#' {
				return nil
			}
		}
	}

	tc := &testAttrs{scripting: true}
	if line, err = r.ReadSlice('\n'); err != nil {
		return nil, err
	}
	var buf []byte

	// The HTML input: every line up to the next section header.
	if string(line) != "#data\n" {
		return nil, fmt.Errorf(`got %q want "#data\n"`, line)
	}
	for {
		if line, err = r.ReadSlice('\n'); err != nil {
			return nil, err
		}
		if line[0] == '#' {
			break
		}
		buf = append(buf, line...)
	}
	tc.text = strings.TrimSuffix(string(buf), "\n")
	buf = buf[:0]

	// The expected errors are skipped.
	if string(line) != "#errors\n" {
		return nil, fmt.Errorf(`got %q want "#errors\n"`, line)
	}
	if err = nextHeader(); err != nil {
		return nil, err
	}

	// So is the optional list of new errors.
	if string(line) == "#new-errors\n" {
		if err = nextHeader(); err != nil {
			return nil, err
		}
	}

	// An optional scripting flag.
	if header := string(line); strings.HasPrefix(header, "#script-") {
		switch {
		case strings.HasSuffix(header, "-on\n"):
			tc.scripting = true
		case strings.HasSuffix(header, "-off\n"):
			tc.scripting = false
		default:
			return nil, fmt.Errorf(`got %q, want "#script-on" or "#script-off"`, line)
		}
		if err = nextHeader(); err != nil {
			return nil, err
		}
	}

	// An optional fragment context: one line naming the context element,
	// followed by the next section header.
	if string(line) == "#document-fragment\n" {
		if line, err = r.ReadSlice('\n'); err != nil {
			return nil, err
		}
		tc.context = strings.TrimSpace(string(line))
		if line, err = r.ReadSlice('\n'); err != nil {
			return nil, err
		}
	}

	// The expected dump of the parsed tree.
	if string(line) != "#document\n" {
		return nil, fmt.Errorf(`got %q want "#document\n"`, line)
	}
	inQuote := false
	for {
		line, err = r.ReadSlice('\n')
		if err != nil && err != io.EOF {
			return nil, err
		}
		trimmed := bytes.Trim(line, "| \n")
		if n := len(trimmed); n > 0 {
			onNodeLine := line[0] == '|'
			if onNodeLine && trimmed[0] == '"' {
				inQuote = true
			}
			// A lone `"` on a "|" line only opens a quote; it does not also
			// close it.
			if trimmed[n-1] == '"' && (!onNodeLine || n > 1) {
				inQuote = false
			}
		}
		// Stop at the end of the input, or at a blank line that is not part
		// of a quoted text node.
		atEnd := len(line) == 0
		blank := len(line) == 1 && line[0] == '\n'
		if atEnd || (blank && !inQuote) {
			break
		}
		buf = append(buf, line...)
	}
	tc.want = string(buf)
	return tc, nil
}
141
// dumpIndent writes the line prefix for a node at the given depth: "| "
// followed by two spaces per level. This is the indentation used by the
// html5lib tree-construction dump format, so that dumps can be compared
// verbatim with the "#document" sections of the test data.
//
// Write errors are ignored; a level of zero or less yields just "| ".
func dumpIndent(w io.Writer, level int) {
	io.WriteString(w, "| ")
	for i := 0; i < level; i++ {
		io.WriteString(w, "  ")
	}
}
148
149 type sortedAttributes []Attribute
150
151 func (a sortedAttributes) Len() int {
152 return len(a)
153 }
154
155 func (a sortedAttributes) Less(i, j int) bool {
156 if a[i].Namespace != a[j].Namespace {
157 return a[i].Namespace < a[j].Namespace
158 }
159 return a[i].Key < a[j].Key
160 }
161
162 func (a sortedAttributes) Swap(i, j int) {
163 a[i], a[j] = a[j], a[i]
164 }
165
166 func dumpLevel(w io.Writer, n *Node, level int) error {
167 dumpIndent(w, level)
168 level++
169 switch n.Type {
170 case ErrorNode:
171 return errors.New("unexpected ErrorNode")
172 case DocumentNode:
173 return errors.New("unexpected DocumentNode")
174 case ElementNode:
175 if n.Namespace != "" {
176 fmt.Fprintf(w, "<%s %s>", n.Namespace, n.Data)
177 } else {
178 fmt.Fprintf(w, "<%s>", n.Data)
179 }
180 attr := sortedAttributes(n.Attr)
181 sort.Sort(attr)
182 for _, a := range attr {
183 io.WriteString(w, "\n")
184 dumpIndent(w, level)
185 if a.Namespace != "" {
186 fmt.Fprintf(w, `%s %s="%s"`, a.Namespace, a.Key, a.Val)
187 } else {
188 fmt.Fprintf(w, `%s="%s"`, a.Key, a.Val)
189 }
190 }
191 if n.Namespace == "" && n.DataAtom == atom.Template {
192 io.WriteString(w, "\n")
193 dumpIndent(w, level)
194 level++
195 io.WriteString(w, "content")
196 }
197 case TextNode:
198 fmt.Fprintf(w, `"%s"`, n.Data)
199 case CommentNode:
200 fmt.Fprintf(w, "<!-- %s -->", n.Data)
201 case DoctypeNode:
202 fmt.Fprintf(w, "<!DOCTYPE %s", n.Data)
203 if n.Attr != nil {
204 var p, s string
205 for _, a := range n.Attr {
206 switch a.Key {
207 case "public":
208 p = a.Val
209 case "system":
210 s = a.Val
211 }
212 }
213 if p != "" || s != "" {
214 fmt.Fprintf(w, ` "%s"`, p)
215 fmt.Fprintf(w, ` "%s"`, s)
216 }
217 }
218 io.WriteString(w, ">")
219 case scopeMarkerNode:
220 return errors.New("unexpected scopeMarkerNode")
221 default:
222 return errors.New("unknown node type")
223 }
224 io.WriteString(w, "\n")
225 for c := n.FirstChild; c != nil; c = c.NextSibling {
226 if err := dumpLevel(w, c, level); err != nil {
227 return err
228 }
229 }
230 return nil
231 }
232
233 func dump(n *Node) (string, error) {
234 if n == nil || n.FirstChild == nil {
235 return "", nil
236 }
237 var b bytes.Buffer
238 for c := n.FirstChild; c != nil; c = c.NextSibling {
239 if err := dumpLevel(&b, c, 0); err != nil {
240 return "", err
241 }
242 }
243 return b.String(), nil
244 }
245
// testDataDirs lists the directories in which TestParser looks for "*.dat"
// test files. Each entry ends in a slash because the glob pattern is appended
// to it directly.
var testDataDirs = []string{"testdata/webkit/", "testdata/go/"}
247
248 func TestParser(t *testing.T) {
249 for _, testDataDir := range testDataDirs {
250 testFiles, err := filepath.Glob(testDataDir + "*.dat")
251 if err != nil {
252 t.Fatal(err)
253 }
254 for _, tf := range testFiles {
255 f, err := os.Open(tf)
256 if err != nil {
257 t.Fatal(err)
258 }
259 defer f.Close()
260 r := bufio.NewReader(f)
261
262 for i := 0; ; i++ {
263 ta, err := readParseTest(r)
264 if err == io.EOF {
265 break
266 }
267 if err != nil {
268 t.Fatal(err)
269 }
270 if parseTestBlacklist[ta.text] {
271 continue
272 }
273
274 err = testParseCase(ta.text, ta.want, ta.context, ParseOptionEnableScripting(ta.scripting))
275
276 if err != nil {
277 t.Errorf("%s test #%d %q, %s", tf, i, ta.text, err)
278 }
279 }
280 }
281 }
282 }
283
284
285 func TestParserWithoutScripting(t *testing.T) {
286 text := `<noscript><img src='https://golang.org/doc/gopher/frontpage.png' /></noscript><p><img src='https://golang.org/doc/gopher/doc.png' /></p>`
287 want := `| <html>
288 | <head>
289 | <noscript>
290 | <body>
291 | <img>
292 | src="https://golang.org/doc/gopher/frontpage.png"
293 | <p>
294 | <img>
295 | src="https://golang.org/doc/gopher/doc.png"
296 `
297
298 if err := testParseCase(text, want, "", ParseOptionEnableScripting(false)); err != nil {
299 t.Errorf("test with scripting is disabled, %q, %s", text, err)
300 }
301 }
302
303
304
305
306
307 func testParseCase(text, want, context string, opts ...ParseOption) (err error) {
308 defer func() {
309 if x := recover(); x != nil {
310 switch e := x.(type) {
311 case error:
312 err = e
313 default:
314 err = fmt.Errorf("%v", e)
315 }
316 }
317 }()
318
319 var doc *Node
320 if context == "" {
321 doc, err = ParseWithOptions(strings.NewReader(text), opts...)
322 if err != nil {
323 return err
324 }
325 } else {
326 namespace := ""
327 if i := strings.IndexByte(context, ' '); i >= 0 {
328 namespace, context = context[:i], context[i+1:]
329 }
330 contextNode := &Node{
331 Data: context,
332 DataAtom: atom.Lookup([]byte(context)),
333 Namespace: namespace,
334 Type: ElementNode,
335 }
336 nodes, err := ParseFragmentWithOptions(strings.NewReader(text), contextNode, opts...)
337 if err != nil {
338 return err
339 }
340 doc = &Node{
341 Type: DocumentNode,
342 }
343 for _, n := range nodes {
344 doc.AppendChild(n)
345 }
346 }
347
348 if err := checkTreeConsistency(doc); err != nil {
349 return err
350 }
351
352 got, err := dump(doc)
353 if err != nil {
354 return err
355 }
356
357 if got != want {
358 return fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want)
359 }
360
361 if renderTestBlacklist[text] || context != "" {
362 return nil
363 }
364
365
366 pr, pw := io.Pipe()
367 go func() {
368 pw.CloseWithError(Render(pw, doc))
369 }()
370 doc1, err := ParseWithOptions(pr, opts...)
371 if err != nil {
372 return err
373 }
374 got1, err := dump(doc1)
375 if err != nil {
376 return err
377 }
378 if got != got1 {
379 return fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1)
380 }
381
382 return nil
383 }
384
385
386
// parseTestBlacklist holds test inputs, keyed by the text of their "#data"
// section, that TestParser skips altogether instead of passing them to
// testParseCase.
// NOTE(review): the reason each input is skipped is not recorded here —
// presumably the parser does not yet produce the expected tree for them.
var parseTestBlacklist = map[string]bool{

	`<math><template><mo><template>`: true,
	`<template><svg><foo><template><foreignObject><div></template><div>`: true,
}
392
393
394
395
396
// renderTestBlacklist holds test inputs, keyed by the text of their "#data"
// section, for which testParseCase only compares the parsed tree with the
// expected dump and skips the second step of rendering the tree, parsing the
// output again and comparing the two dumps.
// NOTE(review): the reason each input is exempt is not recorded here —
// presumably rendering and re-parsing these trees does not reproduce them.
var renderTestBlacklist = map[string]bool{


	`<a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y`: true,

	`<p><table></p>`: true,

	`<a href="blah">aba<table><a href="foo">br<tr><td></td></tr>x</table>aoe`: true,
	`<a><table><a></table><p><a><div><a>`: true,
	`<a><table><td><a><table></table><a></tr><a></table><a>`: true,
	`<template><a><table><a>`: true,

	`<!DOCTYPE html><body><b><nobr>1<table><nobr></b><i><nobr>2<nobr></i>3`: true,


	`<table><plaintext><td>`: true,
	`<!doctype html><table><plaintext></plaintext>`: true,
	`<!doctype html><table><tbody><plaintext></plaintext>`: true,
	`<!doctype html><table><tbody><tr><plaintext></plaintext>`: true,

	`<!doctype html><form><table></form><form></table></form>`: true,

	`<!doctype html><script><!--<script `: true,
	`<!doctype html><script><!--<script <`: true,
	`<!doctype html><script><!--<script <a`: true,
	`<!doctype html><script><!--<script </`: true,
	`<!doctype html><script><!--<script </s`: true,
	`<!doctype html><script><!--<script </script`: true,
	`<!doctype html><script><!--<script </scripta`: true,
	`<!doctype html><script><!--<script -`: true,
	`<!doctype html><script><!--<script -a`: true,
	`<!doctype html><script><!--<script -<`: true,
	`<!doctype html><script><!--<script --`: true,
	`<!doctype html><script><!--<script --a`: true,
	`<!doctype html><script><!--<script --<`: true,
	`<script><!--<script `: true,
	`<script><!--<script <a`: true,
	`<script><!--<script </script`: true,
	`<script><!--<script </scripta`: true,
	`<script><!--<script -`: true,
	`<script><!--<script -a`: true,
	`<script><!--<script --`: true,
	`<script><!--<script --a`: true,
	`<script><!--<script <`: true,
	`<script><!--<script </`: true,
	`<script><!--<script </s`: true,


	`<!doctype html><p><a><plaintext>b`: true,
	`<table><math><select><mi><select></table>`: true,
	`<!doctype html><table><colgroup><plaintext></plaintext>`: true,
	`<!doctype html><svg><plaintext>a</plaintext>b`: true,
}
450
451 func TestNodeConsistency(t *testing.T) {
452
453 inconsistentNode := &Node{
454 Type: ElementNode,
455 DataAtom: atom.Frameset,
456 Data: "table",
457 }
458 if _, err := ParseFragment(strings.NewReader("<p>hello</p>"), inconsistentNode); err == nil {
459 t.Errorf("got nil error, want non-nil")
460 }
461 }
462
463 func TestParseFragmentWithNilContext(t *testing.T) {
464
465 ParseFragment(strings.NewReader("<p>hello</p>"), nil)
466 }
467
468 func TestParseFragmentForeignContentTemplates(t *testing.T) {
469 srcs := []string{
470 "<math><html><template><mn><template></template></template>",
471 "<math><math><head><mi><template>",
472 }
473 for _, src := range srcs {
474
475 ParseFragment(strings.NewReader(src), nil)
476 }
477 }
478
479 func BenchmarkParser(b *testing.B) {
480 buf, err := ioutil.ReadFile("testdata/go1.html")
481 if err != nil {
482 b.Fatalf("could not read testdata/go1.html: %v", err)
483 }
484 b.SetBytes(int64(len(buf)))
485 runtime.GC()
486 b.ReportAllocs()
487 b.ResetTimer()
488 for i := 0; i < b.N; i++ {
489 Parse(bytes.NewBuffer(buf))
490 }
491 }
492
View as plain text