5 package html
6
7 import (
8 "bytes"
9 "errors"
10 "io"
11 "strconv"
12 "strings"
13
14 "golang.org/x/net/html/atom"
15 )
16
17
// A TokenType is the type of a Token.
type TokenType uint32
19
const (
	// ErrorToken means that an error occurred during tokenization.
	ErrorToken TokenType = iota
	// TextToken means a text node.
	TextToken
	// A StartTagToken looks like <a>.
	StartTagToken
	// An EndTagToken looks like </a>.
	EndTagToken
	// A SelfClosingTagToken tag looks like <br/>.
	SelfClosingTagToken
	// A CommentToken looks like <!--x-->.
	CommentToken
	// A DoctypeToken looks like <!DOCTYPE x>.
	DoctypeToken
)
36
37
// ErrBufferExceeded means that the buffering limit set by SetMaxBuf was
// exceeded while tokenizing a single token.
var ErrBufferExceeded = errors.New("max buffer exceeded")
39
40
41 func (t TokenType) String() string {
42 switch t {
43 case ErrorToken:
44 return "Error"
45 case TextToken:
46 return "Text"
47 case StartTagToken:
48 return "StartTag"
49 case EndTagToken:
50 return "EndTag"
51 case SelfClosingTagToken:
52 return "SelfClosingTag"
53 case CommentToken:
54 return "Comment"
55 case DoctypeToken:
56 return "Doctype"
57 }
58 return "Invalid(" + strconv.Itoa(int(t)) + ")"
59 }
60
61
62
63
64
65
66
// An Attribute is an attribute namespace-key-value triple. Val is stored
// unescaped (it looks like "a<b" rather than "a&lt;b"); Namespace is empty
// for ordinary HTML attributes.
type Attribute struct {
	Namespace, Key, Val string
}
70
71
72
73
74
75
// A Token consists of a TokenType and some Data (tag name for start and end
// tags, content for text, comments and doctypes). A tag Token may also contain
// a slice of Attributes. Data is unescaped for all Tokens (it looks like "a<b"
// rather than "a&lt;b"). For tag Tokens, DataAtom is the atom for Data, or
// zero if Data is not a known tag name.
type Token struct {
	Type     TokenType
	DataAtom atom.Atom
	Data     string
	Attr     []Attribute
}
82
83
84 func (t Token) tagString() string {
85 if len(t.Attr) == 0 {
86 return t.Data
87 }
88 buf := bytes.NewBufferString(t.Data)
89 for _, a := range t.Attr {
90 buf.WriteByte(' ')
91 buf.WriteString(a.Key)
92 buf.WriteString(`="`)
93 escape(buf, a.Val)
94 buf.WriteByte('"')
95 }
96 return buf.String()
97 }
98
99
100 func (t Token) String() string {
101 switch t.Type {
102 case ErrorToken:
103 return ""
104 case TextToken:
105 return EscapeString(t.Data)
106 case StartTagToken:
107 return "<" + t.tagString() + ">"
108 case EndTagToken:
109 return "</" + t.tagString() + ">"
110 case SelfClosingTagToken:
111 return "<" + t.tagString() + "/>"
112 case CommentToken:
113 return "<!--" + escapeCommentString(t.Data) + "-->"
114 case DoctypeToken:
115 return "<!DOCTYPE " + EscapeString(t.Data) + ">"
116 }
117 return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
118 }
119
120
121
// A span is a range of bytes in a Tokenizer's buffer. The start is inclusive,
// the end is exclusive.
type span struct {
	start, end int
}
125
126
// A Tokenizer returns a stream of HTML Tokens.
type Tokenizer struct {
	// r is the source of the HTML text.
	r io.Reader
	// tt is the TokenType of the current token.
	tt TokenType
	// err is the first error encountered during tokenization. It is possible
	// for tt != ErrorToken && err != nil to hold: this means that Next
	// returned a valid token but the subsequent Next call will return an
	// error token (see the tail of Next, which returns a TextToken even when
	// err has been set). err is never reset once non-nil.
	err error
	// readErr is the error returned by the io.Reader r. It is separate from
	// err because it is valid for an io.Reader to return (n int, err1 error)
	// such that n > 0 && err1 != nil, and callers should always process the
	// n > 0 bytes before considering the error err1.
	readErr error
	// raw spans the unmodified text of the current token within buf
	// (z.buf[raw.start:raw.end]).
	raw span
	buf []byte
	// maxBuf limits the amount of data buffered for one token; 0 means
	// unlimited. Exceeding it makes readByte set err to ErrBufferExceeded.
	maxBuf int
	// buf[data.start:data.end] holds the raw bytes of the current token's
	// data: a text token's text, a tag token's tag name, etc.
	data span
	// pendingAttr is the attribute key and value currently being tokenized.
	// When complete, pendingAttr is pushed onto attr. nAttrReturned is
	// incremented on each call to TagAttr.
	pendingAttr   [2]span
	attr          [][2]span
	nAttrReturned int
	// rawTag is the "script" in "</script>" that closes the next token. If
	// non-empty, the subsequent call to Next will return a raw or RCDATA text
	// token: one that treats "<p>" as text instead of an element.
	// rawTag's contents are lower-cased.
	rawTag string
	// textIsRaw is whether the current text token's data is not escaped.
	textIsRaw bool
	// convertNUL is whether NUL bytes in the current token's data should
	// be converted into \ufffd replacement characters.
	convertNUL bool
	// allowCDATA is whether CDATA sections are allowed in the current context
	// (see AllowCDATA).
	allowCDATA bool
}
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
// AllowCDATA sets whether or not the tokenizer recognizes <![CDATA[foo]]> as
// the text "foo". The default value is false, which means to recognize it as
// a bogus comment "<!-- [CDATA[foo]] -->" instead.
//
// Strictly speaking, an HTML5 compliant tokenizer should allow CDATA if and
// only if tokenizing foreign content (e.g. SVG, MathML); as the tokenizer
// does not track that state, the responsibility lies with the caller.
func (z *Tokenizer) AllowCDATA(allowCDATA bool) {
	z.allowCDATA = allowCDATA
}
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
// NextIsNotRawText instructs the tokenizer that the next token should not be
// considered as 'raw text'. Some start tags (such as script and title; see
// readStartTag) normally flag the following token as raw text by setting
// rawTag; clearing rawTag here suppresses that for the next Next call.
func (z *Tokenizer) NextIsNotRawText() {
	z.rawTag = ""
}
219
220
221
222 func (z *Tokenizer) Err() error {
223 if z.tt != ErrorToken {
224 return nil
225 }
226 return z.err
227 }
228
229
230
231
232
233
// readByte returns the next byte from the input stream, doing a buffered read
// from z.r into z.buf if necessary. z.buf[z.raw.start:z.raw.end] remains a
// contiguous byte slice that holds all the bytes read so far for the current
// token. It sets z.err if the underlying reader returns an error.
// Pre-condition: z.err == nil.
func (z *Tokenizer) readByte() byte {
	if z.raw.end >= len(z.buf) {
		// Our buffer is exhausted and we have to read from z.r. Check if the
		// previous read resulted in an error.
		if z.readErr != nil {
			z.err = z.readErr
			return 0
		}
		// We copy z.buf[z.raw.start:z.raw.end] to the beginning of z.buf. If the
		// length z.raw.end - z.raw.start is more than half the capacity of z.buf,
		// then we allocate a new buffer before the copy.
		c := cap(z.buf)
		d := z.raw.end - z.raw.start
		var buf1 []byte
		if 2*d > c {
			buf1 = make([]byte, d, 2*c)
		} else {
			buf1 = z.buf[:d]
		}
		copy(buf1, z.buf[z.raw.start:z.raw.end])
		if x := z.raw.start; x != 0 {
			// Adjust the data/attr spans to refer to the same contents after
			// the copy.
			z.data.start -= x
			z.data.end -= x
			z.pendingAttr[0].start -= x
			z.pendingAttr[0].end -= x
			z.pendingAttr[1].start -= x
			z.pendingAttr[1].end -= x
			for i := range z.attr {
				z.attr[i][0].start -= x
				z.attr[i][0].end -= x
				z.attr[i][1].start -= x
				z.attr[i][1].end -= x
			}
		}
		z.raw.start, z.raw.end, z.buf = 0, d, buf1[:d]
		// Now that we have copied the live bytes to the start of the buffer,
		// we read from z.r into the rest.
		var n int
		n, z.readErr = readAtLeastOneByte(z.r, buf1[d:cap(buf1)])
		if n == 0 {
			z.err = z.readErr
			return 0
		}
		z.buf = buf1[:d+n]
	}
	x := z.buf[z.raw.end]
	z.raw.end++
	if z.maxBuf > 0 && z.raw.end-z.raw.start >= z.maxBuf {
		z.err = ErrBufferExceeded
		return 0
	}
	return x
}
288
289
// Buffered returns a slice containing data buffered but not yet tokenized.
func (z *Tokenizer) Buffered() []byte {
	return z.buf[z.raw.end:]
}
293
294
295
296
297 func readAtLeastOneByte(r io.Reader, b []byte) (int, error) {
298 for i := 0; i < 100; i++ {
299 if n, err := r.Read(b); n != 0 || err != nil {
300 return n, err
301 }
302 }
303 return 0, io.ErrNoProgress
304 }
305
306
307 func (z *Tokenizer) skipWhiteSpace() {
308 if z.err != nil {
309 return
310 }
311 for {
312 c := z.readByte()
313 if z.err != nil {
314 return
315 }
316 switch c {
317 case ' ', '\n', '\r', '\t', '\f':
318
319 default:
320 z.raw.end--
321 return
322 }
323 }
324 }
325
326
327
// readRawOrRCDATA reads until the next "</foo>", where "foo" is z.rawTag and
// is typically something like "script" or "textarea".
func (z *Tokenizer) readRawOrRCDATA() {
	if z.rawTag == "script" {
		// Script elements have their own, far more involved, state machine.
		z.readScript()
		z.textIsRaw = true
		z.rawTag = ""
		return
	}
loop:
	for {
		c := z.readByte()
		if z.err != nil {
			break loop
		}
		if c != '<' {
			continue loop
		}
		c = z.readByte()
		if z.err != nil {
			break loop
		}
		if c != '/' {
			// Un-read the byte so it is re-examined as ordinary raw text.
			z.raw.end--
			continue loop
		}
		if z.readRawEndTag() || z.err != nil {
			break loop
		}
	}
	z.data.end = z.raw.end
	// A textarea's or a title's RCDATA can contain escaped entities, so their
	// text is not raw.
	z.textIsRaw = z.rawTag != "textarea" && z.rawTag != "title"
	z.rawTag = ""
}
361
362
363
364
365
// readRawEndTag attempts to read a tag like "</foo>", where "foo" is z.rawTag
// (which is lower-cased). If it succeeds, it backs up the input position so
// the tag is re-tokenized, and returns true. Otherwise it returns false. The
// opening "</" has already been consumed.
func (z *Tokenizer) readRawEndTag() bool {
	for i := 0; i < len(z.rawTag); i++ {
		c := z.readByte()
		if z.err != nil {
			return false
		}
		// Accept either the lower-case byte or its upper-case equivalent.
		if c != z.rawTag[i] && c != z.rawTag[i]-('a'-'A') {
			z.raw.end--
			return false
		}
	}
	c := z.readByte()
	if z.err != nil {
		return false
	}
	switch c {
	case ' ', '\n', '\r', '\t', '\f', '/', '>':
		// The 3 is 2 for the leading "</" plus 1 for the trailing character;
		// together with the tag name this rewinds the whole end tag.
		z.raw.end -= 3 + len(z.rawTag)
		return true
	}
	z.raw.end--
	return false
}
390
391
392
// readScript reads until the next </script> tag, following the byzantine
// rules for escaping/hiding the closing tag. The labels below mirror the
// "script data" states of the WHATWG HTML tokenization algorithm.
func (z *Tokenizer) readScript() {
	defer func() {
		z.data.end = z.raw.end
	}()
	var c byte

scriptData:
	c = z.readByte()
	if z.err != nil {
		return
	}
	if c == '<' {
		goto scriptDataLessThanSign
	}
	goto scriptData

scriptDataLessThanSign:
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case '/':
		goto scriptDataEndTagOpen
	case '!':
		goto scriptDataEscapeStart
	}
	z.raw.end--
	goto scriptData

scriptDataEndTagOpen:
	if z.readRawEndTag() || z.err != nil {
		return
	}
	goto scriptData

scriptDataEscapeStart:
	c = z.readByte()
	if z.err != nil {
		return
	}
	if c == '-' {
		goto scriptDataEscapeStartDash
	}
	z.raw.end--
	goto scriptData

scriptDataEscapeStartDash:
	c = z.readByte()
	if z.err != nil {
		return
	}
	if c == '-' {
		goto scriptDataEscapedDashDash
	}
	z.raw.end--
	goto scriptData

scriptDataEscaped:
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case '-':
		goto scriptDataEscapedDash
	case '<':
		goto scriptDataEscapedLessThanSign
	}
	goto scriptDataEscaped

scriptDataEscapedDash:
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case '-':
		goto scriptDataEscapedDashDash
	case '<':
		goto scriptDataEscapedLessThanSign
	}
	goto scriptDataEscaped

scriptDataEscapedDashDash:
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case '-':
		goto scriptDataEscapedDashDash
	case '<':
		goto scriptDataEscapedLessThanSign
	case '>':
		goto scriptData
	}
	goto scriptDataEscaped

scriptDataEscapedLessThanSign:
	c = z.readByte()
	if z.err != nil {
		return
	}
	if c == '/' {
		goto scriptDataEscapedEndTagOpen
	}
	if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
		goto scriptDataDoubleEscapeStart
	}
	z.raw.end--
	goto scriptData

scriptDataEscapedEndTagOpen:
	if z.readRawEndTag() || z.err != nil {
		return
	}
	goto scriptDataEscaped

scriptDataDoubleEscapeStart:
	z.raw.end--
	// Look for "script" (case-insensitively) to enter double-escaped mode.
	for i := 0; i < len("script"); i++ {
		c = z.readByte()
		if z.err != nil {
			return
		}
		if c != "script"[i] && c != "SCRIPT"[i] {
			z.raw.end--
			goto scriptDataEscaped
		}
	}
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case ' ', '\n', '\r', '\t', '\f', '/', '>':
		goto scriptDataDoubleEscaped
	}
	z.raw.end--
	goto scriptDataEscaped

scriptDataDoubleEscaped:
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case '-':
		goto scriptDataDoubleEscapedDash
	case '<':
		goto scriptDataDoubleEscapedLessThanSign
	}
	goto scriptDataDoubleEscaped

scriptDataDoubleEscapedDash:
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case '-':
		goto scriptDataDoubleEscapedDashDash
	case '<':
		goto scriptDataDoubleEscapedLessThanSign
	}
	goto scriptDataDoubleEscaped

scriptDataDoubleEscapedDashDash:
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case '-':
		goto scriptDataDoubleEscapedDashDash
	case '<':
		goto scriptDataDoubleEscapedLessThanSign
	case '>':
		goto scriptData
	}
	goto scriptDataDoubleEscaped

scriptDataDoubleEscapedLessThanSign:
	c = z.readByte()
	if z.err != nil {
		return
	}
	if c == '/' {
		goto scriptDataDoubleEscapeEnd
	}
	z.raw.end--
	goto scriptDataDoubleEscaped

scriptDataDoubleEscapeEnd:
	if z.readRawEndTag() {
		// readRawEndTag rewound past the end tag; un-rewind, since in
		// double-escaped mode "</script>" is still part of the script data.
		z.raw.end += len("</script>")
		goto scriptDataEscaped
	}
	if z.err != nil {
		return
	}
	goto scriptDataDoubleEscaped
}
597
598
599
// readComment reads the next comment token starting with "<!--". The opening
// "<!--" has already been consumed.
func (z *Tokenizer) readComment() {
	z.data.start = z.raw.end
	defer func() {
		if z.data.end < z.data.start {
			// It's a comment with no data, like <!-->.
			z.data.end = z.data.start
		}
	}()

	var dashCount int
	beginning := true
	for {
		c := z.readByte()
		if z.err != nil {
			// Input ended mid-comment: trim a partial closing delimiter.
			z.data.end = z.calculateAbruptCommentDataEnd()
			return
		}
		switch c {
		case '-':
			dashCount++
			continue
		case '>':
			// "-->" always closes the comment; a bare ">" also closes it
			// right at the beginning (the abrupt "<!-->" form).
			if dashCount >= 2 || beginning {
				z.data.end = z.raw.end - len("-->")
				return
			}
		case '!':
			if dashCount >= 2 {
				// "--!>" is an incorrectly-closed comment that is still
				// accepted as a close.
				c = z.readByte()
				if z.err != nil {
					z.data.end = z.calculateAbruptCommentDataEnd()
					return
				} else if c == '>' {
					z.data.end = z.raw.end - len("--!>")
					return
				} else if c == '-' {
					// "--!-" keeps one dash pending.
					dashCount = 1
					beginning = false
					continue
				}
			}
		}
		dashCount = 0
		beginning = false
	}
}
651
652 func (z *Tokenizer) calculateAbruptCommentDataEnd() int {
653 raw := z.Raw()
654 const prefixLen = len("<!--")
655 if len(raw) >= prefixLen {
656 raw = raw[prefixLen:]
657 if hasSuffix(raw, "--!") {
658 return z.raw.end - 3
659 } else if hasSuffix(raw, "--") {
660 return z.raw.end - 2
661 } else if hasSuffix(raw, "-") {
662 return z.raw.end - 1
663 }
664 }
665 return z.raw.end
666 }
667
// hasSuffix reports whether the byte slice b ends with the given suffix.
func hasSuffix(b []byte, suffix string) bool {
	if len(b) < len(suffix) {
		return false
	}
	// The string conversion in a comparison does not allocate.
	return string(b[len(b)-len(suffix):]) == suffix
}
680
681
682 func (z *Tokenizer) readUntilCloseAngle() {
683 z.data.start = z.raw.end
684 for {
685 c := z.readByte()
686 if z.err != nil {
687 z.data.end = z.raw.end
688 return
689 }
690 if c == '>' {
691 z.data.end = z.raw.end - len(">")
692 return
693 }
694 }
695 }
696
697
698
699
// readMarkupDeclaration reads the next token starting with "<!". It might be
// a "<!--comment-->", a "<!DOCTYPE foo>", a "<![CDATA[section]]>" or a
// "<!a bogus comment". The opening "<!" has already been consumed.
func (z *Tokenizer) readMarkupDeclaration() TokenType {
	z.data.start = z.raw.end
	var c [2]byte
	for i := 0; i < 2; i++ {
		c[i] = z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return CommentToken
		}
	}
	if c[0] == '-' && c[1] == '-' {
		z.readComment()
		return CommentToken
	}
	// Not a comment: un-read the two bytes and try doctype, then CDATA.
	z.raw.end -= 2
	if z.readDoctype() {
		return DoctypeToken
	}
	if z.allowCDATA && z.readCDATA() {
		z.convertNUL = true
		return TextToken
	}
	// It's a bogus comment.
	z.readUntilCloseAngle()
	return CommentToken
}
726
727
728
// readDoctype attempts to read a doctype declaration and returns true if
// successful. The opening "<!" has already been consumed.
func (z *Tokenizer) readDoctype() bool {
	const s = "DOCTYPE"
	for i := 0; i < len(s); i++ {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return false
		}
		// Match "DOCTYPE" case-insensitively (s[i]+('a'-'A') is the
		// lower-case form).
		if c != s[i] && c != s[i]+('a'-'A') {
			// Back up so the partial match can be re-read by the caller.
			z.raw.end = z.data.start
			return false
		}
	}
	if z.skipWhiteSpace(); z.err != nil {
		z.data.start = z.raw.end
		z.data.end = z.raw.end
		return true
	}
	z.readUntilCloseAngle()
	return true
}
751
752
753
// readCDATA attempts to read a CDATA section and returns true if successful.
// The opening "<!" has already been consumed.
func (z *Tokenizer) readCDATA() bool {
	const s = "[CDATA["
	for i := 0; i < len(s); i++ {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return false
		}
		if c != s[i] {
			// Back up so the partial match can be re-read by the caller.
			z.raw.end = z.data.start
			return false
		}
	}
	z.data.start = z.raw.end
	// brackets counts consecutive ']' bytes; "]]>" closes the section.
	brackets := 0
	for {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return true
		}
		switch c {
		case ']':
			brackets++
		case '>':
			if brackets >= 2 {
				z.data.end = z.raw.end - len("]]>")
				return true
			}
			brackets = 0
		default:
			brackets = 0
		}
	}
}
790
791
792
793 func (z *Tokenizer) startTagIn(ss ...string) bool {
794 loop:
795 for _, s := range ss {
796 if z.data.end-z.data.start != len(s) {
797 continue loop
798 }
799 for i := 0; i < len(s); i++ {
800 c := z.buf[z.data.start+i]
801 if 'A' <= c && c <= 'Z' {
802 c += 'a' - 'A'
803 }
804 if c != s[i] {
805 continue loop
806 }
807 }
808 return true
809 }
810 return false
811 }
812
813
814
// readStartTag reads the next start tag token. The opening "<a" has already
// been consumed, where 'a' means anything in [A-Za-z].
func (z *Tokenizer) readStartTag() TokenType {
	z.readTag(true)
	if z.err != nil {
		return ErrorToken
	}
	// Several tags flag the tokenizer's next token as raw. Dispatch on the
	// (lower-cased) first letter before doing the full name comparisons.
	c, raw := z.buf[z.data.start], false
	if 'A' <= c && c <= 'Z' {
		c += 'a' - 'A'
	}
	switch c {
	case 'i':
		raw = z.startTagIn("iframe")
	case 'n':
		raw = z.startTagIn("noembed", "noframes", "noscript")
	case 'p':
		raw = z.startTagIn("plaintext")
	case 's':
		raw = z.startTagIn("script", "style")
	case 't':
		raw = z.startTagIn("textarea", "title")
	case 'x':
		raw = z.startTagIn("xmp")
	}
	if raw {
		z.rawTag = strings.ToLower(string(z.buf[z.data.start:z.data.end]))
	}
	// Look for a self-closing token like "<br/>": the byte before the
	// closing '>' is a '/'.
	if z.err == nil && z.buf[z.raw.end-2] == '/' {
		return SelfClosingTagToken
	}
	return StartTagToken
}
848
849
850
851
852
// readTag reads the next tag token and its attributes. If saveAttr, those
// attributes are saved in z.attr, otherwise z.attr is set to an empty slice.
// The opening "<a" or "</a" has already been consumed, where 'a' means
// anything in [A-Za-z].
func (z *Tokenizer) readTag(saveAttr bool) {
	z.attr = z.attr[:0]
	z.nAttrReturned = 0
	// Read the tag name and attribute key/value pairs.
	z.readTagName()
	if z.skipWhiteSpace(); z.err != nil {
		return
	}
	for {
		c := z.readByte()
		if z.err != nil || c == '>' {
			break
		}
		// Un-read the byte; it starts an attribute key.
		z.raw.end--
		z.readTagAttrKey()
		z.readTagAttrVal()
		// Save pendingAttr if saveAttr and that attribute has a non-empty key.
		if saveAttr && z.pendingAttr[0].start != z.pendingAttr[0].end {
			z.attr = append(z.attr, z.pendingAttr)
		}
		if z.skipWhiteSpace(); z.err != nil {
			break
		}
	}
}
878
879
880
881
// readTagName sets z.data to the "div" in "<div k=v>". The reader (z.raw.end)
// is positioned such that the first byte of the tag name (the "d" in "<div")
// has already been consumed.
func (z *Tokenizer) readTagName() {
	z.data.start = z.raw.end - 1
	for {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return
		}
		switch c {
		case ' ', '\n', '\r', '\t', '\f':
			z.data.end = z.raw.end - 1
			return
		case '/', '>':
			// Un-read the '/' or '>' so the caller sees it.
			z.raw.end--
			z.data.end = z.raw.end
			return
		}
	}
}
901
902
903
// readTagAttrKey sets z.pendingAttr[0] to the "k" in "<div k=v>".
// Precondition: z.err == nil.
func (z *Tokenizer) readTagAttrKey() {
	z.pendingAttr[0].start = z.raw.end
	for {
		c := z.readByte()
		if z.err != nil {
			z.pendingAttr[0].end = z.raw.end
			return
		}
		switch c {
		case '=':
			if z.pendingAttr[0].start+1 == z.raw.end {
				// A '=' as the very first byte of the key is treated as a
				// character in the attribute name, not a key/value separator.
				continue
			}
			fallthrough
		case ' ', '\n', '\r', '\t', '\f', '/', '>':
			// Un-read the terminator so the caller (readTagAttrVal/readTag)
			// can reconsume it; this is needed to support '/'.
			z.raw.end--
			z.pendingAttr[0].end = z.raw.end
			return
		}
	}
}
929
930
// readTagAttrVal sets z.pendingAttr[1] to the "v" in "<div k=v>". The value
// span is empty if the attribute has no value.
func (z *Tokenizer) readTagAttrVal() {
	z.pendingAttr[1].start = z.raw.end
	z.pendingAttr[1].end = z.raw.end
	if z.skipWhiteSpace(); z.err != nil {
		return
	}
	c := z.readByte()
	if z.err != nil {
		return
	}
	if c == '/' {
		// A '/' here starts a self-closing tag; the attribute has no value.
		// The '/' is deliberately left consumed.
		return
	}
	if c != '=' {
		// No value; un-read the byte for the caller.
		z.raw.end--
		return
	}
	if z.skipWhiteSpace(); z.err != nil {
		return
	}
	quote := z.readByte()
	if z.err != nil {
		return
	}
	switch quote {
	case '>':
		// "k=>" ends the tag with an empty value.
		z.raw.end--
		return

	case '\'', '"':
		// Quoted value: read until the matching quote.
		z.pendingAttr[1].start = z.raw.end
		for {
			c := z.readByte()
			if z.err != nil {
				z.pendingAttr[1].end = z.raw.end
				return
			}
			if c == quote {
				z.pendingAttr[1].end = z.raw.end - 1
				return
			}
		}

	default:
		// Unquoted value: read until whitespace or '>'.
		z.pendingAttr[1].start = z.raw.end - 1
		for {
			c := z.readByte()
			if z.err != nil {
				z.pendingAttr[1].end = z.raw.end
				return
			}
			switch c {
			case ' ', '\n', '\r', '\t', '\f':
				z.pendingAttr[1].end = z.raw.end - 1
				return
			case '>':
				// Un-read the '>' so the caller sees the tag end.
				z.raw.end--
				z.pendingAttr[1].end = z.raw.end
				return
			}
		}
	}
}
996
997
// Next scans the next token and returns its type.
func (z *Tokenizer) Next() TokenType {
	z.raw.start = z.raw.end
	z.data.start = z.raw.end
	z.data.end = z.raw.end
	if z.err != nil {
		z.tt = ErrorToken
		return z.tt
	}
	if z.rawTag != "" {
		if z.rawTag == "plaintext" {
			// Everything up to EOF is raw text.
			for z.err == nil {
				z.readByte()
			}
			z.data.end = z.raw.end
			z.textIsRaw = true
		} else {
			z.readRawOrRCDATA()
		}
		if z.data.end > z.data.start {
			z.tt = TextToken
			z.convertNUL = true
			return z.tt
		}
	}
	z.textIsRaw = false
	z.convertNUL = false

loop:
	for {
		c := z.readByte()
		if z.err != nil {
			break loop
		}
		if c != '<' {
			continue loop
		}

		// Check if the '<' we have just read is part of a tag, comment
		// or doctype. If not, it's part of the accumulated text token.
		c = z.readByte()
		if z.err != nil {
			break loop
		}
		var tokenType TokenType
		switch {
		case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
			tokenType = StartTagToken
		case c == '/':
			tokenType = EndTagToken
		case c == '!' || c == '?':
			// CommentToken here means any of "<!--actual comments-->",
			// "<!DOCTYPE declarations>" and "<?xml processing instructions?>".
			tokenType = CommentToken
		default:
			// Un-read the current byte; the '<' stays plain text.
			z.raw.end--
			continue
		}

		// We have a non-text token, but we might have accumulated some text
		// before that. If so, we return the text first, and return the non-
		// text token on the subsequent call to Next.
		if x := z.raw.end - len("<a"); z.raw.start < x {
			z.raw.end = x
			z.data.end = x
			z.tt = TextToken
			return z.tt
		}
		switch tokenType {
		case StartTagToken:
			z.tt = z.readStartTag()
			return z.tt
		case EndTagToken:
			c = z.readByte()
			if z.err != nil {
				break loop
			}
			if c == '>' {
				// "</>" does not generate a token at all. Generate an empty
				// comment to allow passthrough clients to pick up the data
				// using Raw.
				z.tt = CommentToken
				return z.tt
			}
			if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
				z.readTag(false)
				if z.err != nil {
					z.tt = ErrorToken
				} else {
					z.tt = EndTagToken
				}
				return z.tt
			}
			// "</" followed by anything else is a bogus comment.
			z.raw.end--
			z.readUntilCloseAngle()
			z.tt = CommentToken
			return z.tt
		case CommentToken:
			if c == '!' {
				z.tt = z.readMarkupDeclaration()
				return z.tt
			}
			// "<?" processing instruction: treat as a bogus comment.
			z.raw.end--
			z.readUntilCloseAngle()
			z.tt = CommentToken
			return z.tt
		}
	}
	// Loop exited on error; flush any accumulated text before reporting it.
	if z.raw.start < z.raw.end {
		z.data.end = z.raw.end
		z.tt = TextToken
		return z.tt
	}
	z.tt = ErrorToken
	return z.tt
}
1115
1116
1117
1118
1119
1120
1121
1122
// Raw returns the unmodified text of the current token. Calling Next, Token,
// Text, TagName or TagAttr may change the contents of the returned slice.
func (z *Tokenizer) Raw() []byte {
	return z.buf[z.raw.start:z.raw.end]
}
1126
1127
1128
// convertNewlines converts "\r" and "\r\n" in s to "\n", in place, and
// returns the (possibly shortened) slice.
func convertNewlines(s []byte) []byte {
	for i, c := range s {
		if c != '\r' {
			continue
		}
		if i+1 >= len(s) || s[i+1] != '\n' {
			// Lone \r: rewrite in place, no compaction needed.
			s[i] = '\n'
			continue
		}
		// First "\r\n" found: fold the remainder of s down over it.
		dst, src := i, i+1
		for src < len(s) {
			b := s[src]
			if b == '\r' {
				if src+1 < len(s) && s[src+1] == '\n' {
					src++
				}
				b = '\n'
			}
			s[dst] = b
			src++
			dst++
		}
		return s[:dst]
	}
	return s
}
1158
// nul and replacement support rewriting NUL bytes in token text as the
// Unicode replacement character (see Text).
var (
	nul         = []byte("\x00")
	replacement = []byte("\ufffd")
)
1163
1164
1165
// Text returns the unescaped text of a text, comment or doctype token. The
// contents of the returned slice may change on the next call to Next.
func (z *Tokenizer) Text() []byte {
	switch z.tt {
	case TextToken, CommentToken, DoctypeToken:
		s := z.buf[z.data.start:z.data.end]
		// Consume the data span so a second call returns nothing new.
		z.data.start = z.raw.end
		z.data.end = z.raw.end
		s = convertNewlines(s)
		if (z.convertNUL || z.tt == CommentToken) && bytes.Contains(s, nul) {
			s = bytes.Replace(s, nul, replacement, -1)
		}
		if !z.textIsRaw {
			s = unescape(s, false)
		}
		return s
	}
	return nil
}
1183
1184
1185
1186
1187 func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
1188 if z.data.start < z.data.end {
1189 switch z.tt {
1190 case StartTagToken, EndTagToken, SelfClosingTagToken:
1191 s := z.buf[z.data.start:z.data.end]
1192 z.data.start = z.raw.end
1193 z.data.end = z.raw.end
1194 return lower(s), z.nAttrReturned < len(z.attr)
1195 }
1196 }
1197 return nil, false
1198 }
1199
1200
1201
1202
1203 func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
1204 if z.nAttrReturned < len(z.attr) {
1205 switch z.tt {
1206 case StartTagToken, SelfClosingTagToken:
1207 x := z.attr[z.nAttrReturned]
1208 z.nAttrReturned++
1209 key = z.buf[x[0].start:x[0].end]
1210 val = z.buf[x[1].start:x[1].end]
1211 return lower(key), unescape(convertNewlines(val), true), z.nAttrReturned < len(z.attr)
1212 }
1213 }
1214 return nil, nil, false
1215 }
1216
1217
1218
1219 func (z *Tokenizer) Token() Token {
1220 t := Token{Type: z.tt}
1221 switch z.tt {
1222 case TextToken, CommentToken, DoctypeToken:
1223 t.Data = string(z.Text())
1224 case StartTagToken, SelfClosingTagToken, EndTagToken:
1225 name, moreAttr := z.TagName()
1226 for moreAttr {
1227 var key, val []byte
1228 key, val, moreAttr = z.TagAttr()
1229 t.Attr = append(t.Attr, Attribute{"", atom.String(key), string(val)})
1230 }
1231 if a := atom.Lookup(name); a != 0 {
1232 t.DataAtom, t.Data = a, a.String()
1233 } else {
1234 t.DataAtom, t.Data = 0, string(name)
1235 }
1236 }
1237 return t
1238 }
1239
1240
1241
// SetMaxBuf sets a limit on the amount of data buffered while tokenizing a
// token. A value of 0 means unlimited (the default). If the limit is
// exceeded, the tokenizer reports ErrBufferExceeded.
func (z *Tokenizer) SetMaxBuf(n int) {
	z.maxBuf = n
}
1245
1246
1247
// NewTokenizer returns a new HTML Tokenizer for the given Reader.
func NewTokenizer(r io.Reader) *Tokenizer {
	return NewTokenizerFragment(r, "")
}
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260 func NewTokenizerFragment(r io.Reader, contextTag string) *Tokenizer {
1261 z := &Tokenizer{
1262 r: r,
1263 buf: make([]byte, 0, 4096),
1264 }
1265 if contextTag != "" {
1266 switch s := strings.ToLower(contextTag); s {
1267 case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "title", "textarea", "xmp":
1268 z.rawTag = s
1269 }
1270 }
1271 return z
1272 }
1273