1
2
3
4
5
6
7 package main
8
9
10
11
12
13
14
15
16
17
18
19
20
21 import (
22 "bufio"
23 "bytes"
24 "encoding/binary"
25 "flag"
26 "fmt"
27 "go/format"
28 "io"
29 "io/ioutil"
30 "net/http"
31 "os"
32 "regexp"
33 "sort"
34 "strings"
35
36 "golang.org/x/net/idna"
37 )
38
39 const (
40
41
42 nodesBits = 40
43
44
45 nodesBitsChildren = 10
46 nodesBitsICANN = 1
47 nodesBitsTextOffset = 16
48 nodesBitsTextLength = 6
49
50
51 childrenBitsWildcard = 1
52 childrenBitsNodeType = 2
53 childrenBitsHi = 14
54 childrenBitsLo = 14
55 )
56
57 var (
58 combinedText string
59 maxChildren int
60 maxTextOffset int
61 maxTextLength int
62 maxHi uint32
63 maxLo uint32
64 )
65
66 func max(a, b int) int {
67 if a < b {
68 return b
69 }
70 return a
71 }
72
73 func u32max(a, b uint32) uint32 {
74 if a < b {
75 return b
76 }
77 return a
78 }
79
80 const (
81 nodeTypeNormal = 0
82 nodeTypeException = 1
83 nodeTypeParentOnly = 2
84 numNodeType = 3
85 )
86
87 func nodeTypeStr(n int) string {
88 switch n {
89 case nodeTypeNormal:
90 return "+"
91 case nodeTypeException:
92 return "!"
93 case nodeTypeParentOnly:
94 return "o"
95 }
96 panic("unreachable")
97 }
98
99 const (
100 defaultURL = "https://publicsuffix.org/list/effective_tld_names.dat"
101 gitCommitURL = "https://api.github.com/repos/publicsuffix/list/commits?path=public_suffix_list.dat"
102 )
103
104 var (
105 labelEncoding = map[string]uint64{}
106 labelsList = []string{}
107 labelsMap = map[string]bool{}
108 rules = []string{}
109 numICANNRules = 0
110
111
112
113
114 validSuffixRE = regexp.MustCompile(`^[a-z0-9_\!\*\-\.]+$`)
115
116 shaRE = regexp.MustCompile(`"sha":"([^"]+)"`)
117 dateRE = regexp.MustCompile(`"committer":{[^{]+"date":"([^"]+)"`)
118
119 subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
120 url = flag.String("url", defaultURL, "URL of the publicsuffix.org list. If empty, stdin is read instead")
121 v = flag.Bool("v", false, "verbose output (to stderr)")
122 version = flag.String("version", "", "the effective_tld_names.dat version")
123 )
124
125 func main() {
126 if err := main1(); err != nil {
127 fmt.Fprintln(os.Stderr, err)
128 os.Exit(1)
129 }
130 }
131
132 func main1() error {
133 flag.Parse()
134 if nodesBits > 64 {
135 return fmt.Errorf("nodesBits is too large")
136 }
137 if nodesBits%8 != 0 {
138 return fmt.Errorf("nodesBits must be a multiple of 8")
139 }
140 if nodesBitsTextLength+nodesBitsTextOffset+nodesBitsICANN+nodesBitsChildren > nodesBits {
141 return fmt.Errorf("not enough bits to encode the nodes table")
142 }
143 if childrenBitsLo+childrenBitsHi+childrenBitsNodeType+childrenBitsWildcard > 32 {
144 return fmt.Errorf("not enough bits to encode the children table")
145 }
146 if *version == "" {
147 if *url != defaultURL {
148 return fmt.Errorf("-version was not specified, and the -url is not the default one")
149 }
150 sha, date, err := gitCommit()
151 if err != nil {
152 return err
153 }
154 *version = fmt.Sprintf("publicsuffix.org's public_suffix_list.dat, git revision %s (%s)", sha, date)
155 }
156 var r io.Reader = os.Stdin
157 if *url != "" {
158 res, err := http.Get(*url)
159 if err != nil {
160 return err
161 }
162 if res.StatusCode != http.StatusOK {
163 return fmt.Errorf("bad GET status for %s: %s", *url, res.Status)
164 }
165 r = res.Body
166 defer res.Body.Close()
167 }
168
169 var root node
170 icann := false
171 br := bufio.NewReader(r)
172 for {
173 s, err := br.ReadString('\n')
174 if err != nil {
175 if err == io.EOF {
176 break
177 }
178 return err
179 }
180 s = strings.TrimSpace(s)
181 if strings.Contains(s, "BEGIN ICANN DOMAINS") {
182 if len(rules) != 0 {
183 return fmt.Errorf(`expected no rules before "BEGIN ICANN DOMAINS"`)
184 }
185 icann = true
186 continue
187 }
188 if strings.Contains(s, "END ICANN DOMAINS") {
189 icann, numICANNRules = false, len(rules)
190 continue
191 }
192 if s == "" || strings.HasPrefix(s, "//") {
193 continue
194 }
195 s, err = idna.ToASCII(s)
196 if err != nil {
197 return err
198 }
199 if !validSuffixRE.MatchString(s) {
200 return fmt.Errorf("bad publicsuffix.org list data: %q", s)
201 }
202
203 if *subset {
204 switch {
205 case s == "ac.jp" || strings.HasSuffix(s, ".ac.jp"):
206 case s == "ak.us" || strings.HasSuffix(s, ".ak.us"):
207 case s == "ao" || strings.HasSuffix(s, ".ao"):
208 case s == "ar" || strings.HasSuffix(s, ".ar"):
209 case s == "arpa" || strings.HasSuffix(s, ".arpa"):
210 case s == "cy" || strings.HasSuffix(s, ".cy"):
211 case s == "dyndns.org" || strings.HasSuffix(s, ".dyndns.org"):
212 case s == "jp":
213 case s == "kobe.jp" || strings.HasSuffix(s, ".kobe.jp"):
214 case s == "kyoto.jp" || strings.HasSuffix(s, ".kyoto.jp"):
215 case s == "om" || strings.HasSuffix(s, ".om"):
216 case s == "uk" || strings.HasSuffix(s, ".uk"):
217 case s == "uk.com" || strings.HasSuffix(s, ".uk.com"):
218 case s == "tw" || strings.HasSuffix(s, ".tw"):
219 case s == "zw" || strings.HasSuffix(s, ".zw"):
220 case s == "xn--p1ai" || strings.HasSuffix(s, ".xn--p1ai"):
221
222 default:
223 continue
224 }
225 }
226
227 rules = append(rules, s)
228
229 nt, wildcard := nodeTypeNormal, false
230 switch {
231 case strings.HasPrefix(s, "*."):
232 s, nt = s[2:], nodeTypeParentOnly
233 wildcard = true
234 case strings.HasPrefix(s, "!"):
235 s, nt = s[1:], nodeTypeException
236 }
237 labels := strings.Split(s, ".")
238 for n, i := &root, len(labels)-1; i >= 0; i-- {
239 label := labels[i]
240 n = n.child(label)
241 if i == 0 {
242 if nt != nodeTypeParentOnly && n.nodeType == nodeTypeParentOnly {
243 n.nodeType = nt
244 }
245 n.icann = n.icann && icann
246 n.wildcard = n.wildcard || wildcard
247 }
248 labelsMap[label] = true
249 }
250 }
251 labelsList = make([]string, 0, len(labelsMap))
252 for label := range labelsMap {
253 labelsList = append(labelsList, label)
254 }
255 sort.Strings(labelsList)
256
257 combinedText = combineText(labelsList)
258 if combinedText == "" {
259 return fmt.Errorf("internal error: combineText returned no text")
260 }
261 for _, label := range labelsList {
262 offset, length := strings.Index(combinedText, label), len(label)
263 if offset < 0 {
264 return fmt.Errorf("internal error: could not find %q in text %q", label, combinedText)
265 }
266 maxTextOffset, maxTextLength = max(maxTextOffset, offset), max(maxTextLength, length)
267 if offset >= 1<<nodesBitsTextOffset {
268 return fmt.Errorf("text offset %d is too large, or nodeBitsTextOffset is too small", offset)
269 }
270 if length >= 1<<nodesBitsTextLength {
271 return fmt.Errorf("text length %d is too large, or nodeBitsTextLength is too small", length)
272 }
273 labelEncoding[label] = uint64(offset)<<nodesBitsTextLength | uint64(length)
274 }
275
276 if err := root.walk(assignIndexes); err != nil {
277 return err
278 }
279
280 if err := generate(printMetadata, &root, "table.go"); err != nil {
281 return err
282 }
283 if err := generateBinaryData(&root, combinedText); err != nil {
284 return err
285 }
286 if err := generate(printTest, &root, "table_test.go"); err != nil {
287 return err
288 }
289 return nil
290 }
291
292 func generate(p func(io.Writer, *node) error, root *node, filename string) error {
293 buf := new(bytes.Buffer)
294 if err := p(buf, root); err != nil {
295 return err
296 }
297 b, err := format.Source(buf.Bytes())
298 if err != nil {
299 return err
300 }
301 return ioutil.WriteFile(filename, b, 0644)
302 }
303
304 func gitCommit() (sha, date string, retErr error) {
305 res, err := http.Get(gitCommitURL)
306 if err != nil {
307 return "", "", err
308 }
309 if res.StatusCode != http.StatusOK {
310 return "", "", fmt.Errorf("bad GET status for %s: %s", gitCommitURL, res.Status)
311 }
312 defer res.Body.Close()
313 b, err := ioutil.ReadAll(res.Body)
314 if err != nil {
315 return "", "", err
316 }
317 if m := shaRE.FindSubmatch(b); m != nil {
318 sha = string(m[1])
319 }
320 if m := dateRE.FindSubmatch(b); m != nil {
321 date = string(m[1])
322 }
323 if sha == "" || date == "" {
324 retErr = fmt.Errorf("could not find commit SHA and date in %s", gitCommitURL)
325 }
326 return sha, date, retErr
327 }
328
329 func printTest(w io.Writer, n *node) error {
330 fmt.Fprintf(w, "// generated by go run gen.go; DO NOT EDIT\n\n")
331 fmt.Fprintf(w, "package publicsuffix\n\nconst numICANNRules = %d\n\nvar rules = [...]string{\n", numICANNRules)
332 for _, rule := range rules {
333 fmt.Fprintf(w, "%q,\n", rule)
334 }
335 fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n")
336 if err := n.walk(func(n *node) error {
337 return printNodeLabel(w, n)
338 }); err != nil {
339 return err
340 }
341 fmt.Fprintf(w, "}\n")
342 return nil
343 }
344
345 func generateBinaryData(root *node, combinedText string) error {
346 if err := os.WriteFile("data/text", []byte(combinedText), 0666); err != nil {
347 return err
348 }
349
350 var nodes []byte
351 if err := root.walk(func(n *node) error {
352 for _, c := range n.children {
353 nodes = appendNodeEncoding(nodes, c)
354 }
355 return nil
356 }); err != nil {
357 return err
358 }
359 if err := os.WriteFile("data/nodes", nodes, 0666); err != nil {
360 return err
361 }
362
363 var children []byte
364 for _, c := range childrenEncoding {
365 children = binary.BigEndian.AppendUint32(children, c)
366 }
367 if err := os.WriteFile("data/children", children, 0666); err != nil {
368 return err
369 }
370
371 return nil
372 }
373
374 func appendNodeEncoding(b []byte, n *node) []byte {
375 encoding := labelEncoding[n.label]
376 if n.icann {
377 encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset)
378 }
379 encoding |= uint64(n.childrenIndex) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN)
380 for i := nodesBits - 8; i >= 0; i -= 8 {
381 b = append(b, byte((encoding>>i)&0xff))
382 }
383 return b
384 }
385
386 func printMetadata(w io.Writer, n *node) error {
387 const header = `// generated by go run gen.go; DO NOT EDIT
388
389 package publicsuffix
390
391 import _ "embed"
392
393 const version = %q
394
395 const (
396 nodesBits = %d
397 nodesBitsChildren = %d
398 nodesBitsICANN = %d
399 nodesBitsTextOffset = %d
400 nodesBitsTextLength = %d
401
402 childrenBitsWildcard = %d
403 childrenBitsNodeType = %d
404 childrenBitsHi = %d
405 childrenBitsLo = %d
406 )
407
408 const (
409 nodeTypeNormal = %d
410 nodeTypeException = %d
411 nodeTypeParentOnly = %d
412 )
413
414 // numTLD is the number of top level domains.
415 const numTLD = %d
416
417 // text is the combined text of all labels.
418 //
419 //go:embed data/text
420 var text string
421
422 `
423 fmt.Fprintf(w, header, *version,
424 nodesBits,
425 nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength,
426 childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo,
427 nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children))
428 fmt.Fprintf(w, `
429 // nodes is the list of nodes. Each node is represented as a %v-bit integer,
430 // which encodes the node's children, wildcard bit and node type (as an index
431 // into the children array), ICANN bit and text.
432 //
433 // The layout within the node, from MSB to LSB, is:
434 // [%2d bits] unused
435 // [%2d bits] children index
436 // [%2d bits] ICANN bit
437 // [%2d bits] text index
438 // [%2d bits] text length
439 //
440 //go:embed data/nodes
441 var nodes uint40String
442 `,
443 nodesBits,
444 nodesBits-nodesBitsChildren-nodesBitsICANN-nodesBitsTextOffset-nodesBitsTextLength,
445 nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength)
446 fmt.Fprintf(w, `
447 // children is the list of nodes' children, the parent's wildcard bit and the
448 // parent's node type. If a node has no children then their children index
449 // will be in the range [0, 6), depending on the wildcard bit and node type.
450 //
451 // The layout within the uint32, from MSB to LSB, is:
452 // [%2d bits] unused
453 // [%2d bits] wildcard bit
454 // [%2d bits] node type
455 // [%2d bits] high nodes index (exclusive) of children
456 // [%2d bits] low nodes index (inclusive) of children
457 //
458 //go:embed data/children
459 var children uint32String
460 `,
461 32-childrenBitsWildcard-childrenBitsNodeType-childrenBitsHi-childrenBitsLo,
462 childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo)
463
464 fmt.Fprintf(w, "// max children %d (capacity %d)\n", maxChildren, 1<<nodesBitsChildren-1)
465 fmt.Fprintf(w, "// max text offset %d (capacity %d)\n", maxTextOffset, 1<<nodesBitsTextOffset-1)
466 fmt.Fprintf(w, "// max text length %d (capacity %d)\n", maxTextLength, 1<<nodesBitsTextLength-1)
467 fmt.Fprintf(w, "// max hi %d (capacity %d)\n", maxHi, 1<<childrenBitsHi-1)
468 fmt.Fprintf(w, "// max lo %d (capacity %d)\n", maxLo, 1<<childrenBitsLo-1)
469 return nil
470 }
471
472 type node struct {
473 label string
474 nodeType int
475 icann bool
476 wildcard bool
477
478
479 nodesIndex, childrenIndex int
480
481
482 firstChild int
483
484 children []*node
485 }
486
487 func (n *node) walk(f func(*node) error) error {
488 if err := f(n); err != nil {
489 return err
490 }
491 for _, c := range n.children {
492 if err := c.walk(f); err != nil {
493 return err
494 }
495 }
496 return nil
497 }
498
499
500
501 func (n *node) child(label string) *node {
502 for _, c := range n.children {
503 if c.label == label {
504 return c
505 }
506 }
507 c := &node{
508 label: label,
509 nodeType: nodeTypeParentOnly,
510 icann: true,
511 }
512 n.children = append(n.children, c)
513 sort.Sort(byLabel(n.children))
514 return c
515 }
516
517 type byLabel []*node
518
519 func (b byLabel) Len() int { return len(b) }
520 func (b byLabel) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
521 func (b byLabel) Less(i, j int) bool { return b[i].label < b[j].label }
522
523 var nextNodesIndex int
524
525
526
527 var childrenEncoding = []uint32{
528 0 << (childrenBitsLo + childrenBitsHi),
529 1 << (childrenBitsLo + childrenBitsHi),
530 2 << (childrenBitsLo + childrenBitsHi),
531 4 << (childrenBitsLo + childrenBitsHi),
532 5 << (childrenBitsLo + childrenBitsHi),
533 6 << (childrenBitsLo + childrenBitsHi),
534 }
535
536 var firstCallToAssignIndexes = true
537
538 func assignIndexes(n *node) error {
539 if len(n.children) != 0 {
540
541 n.firstChild = nextNodesIndex
542 for _, c := range n.children {
543 c.nodesIndex = nextNodesIndex
544 nextNodesIndex++
545 }
546
547
548 if firstCallToAssignIndexes {
549 firstCallToAssignIndexes = false
550 return nil
551 }
552
553
554 maxChildren = max(maxChildren, len(childrenEncoding))
555 if len(childrenEncoding) >= 1<<nodesBitsChildren {
556 return fmt.Errorf("children table size %d is too large, or nodeBitsChildren is too small", len(childrenEncoding))
557 }
558 n.childrenIndex = len(childrenEncoding)
559 lo := uint32(n.firstChild)
560 hi := lo + uint32(len(n.children))
561 maxLo, maxHi = u32max(maxLo, lo), u32max(maxHi, hi)
562 if lo >= 1<<childrenBitsLo {
563 return fmt.Errorf("children lo %d is too large, or childrenBitsLo is too small", lo)
564 }
565 if hi >= 1<<childrenBitsHi {
566 return fmt.Errorf("children hi %d is too large, or childrenBitsHi is too small", hi)
567 }
568 enc := hi<<childrenBitsLo | lo
569 enc |= uint32(n.nodeType) << (childrenBitsLo + childrenBitsHi)
570 if n.wildcard {
571 enc |= 1 << (childrenBitsLo + childrenBitsHi + childrenBitsNodeType)
572 }
573 childrenEncoding = append(childrenEncoding, enc)
574 } else {
575 n.childrenIndex = n.nodeType
576 if n.wildcard {
577 n.childrenIndex += numNodeType
578 }
579 }
580 return nil
581 }
582
583 func printNodeLabel(w io.Writer, n *node) error {
584 for _, c := range n.children {
585 fmt.Fprintf(w, "%q,\n", c.label)
586 }
587 return nil
588 }
589
590 func icannStr(icann bool) string {
591 if icann {
592 return "I"
593 }
594 return " "
595 }
596
597 func wildcardStr(wildcard bool) string {
598 if wildcard {
599 return "*"
600 }
601 return " "
602 }
603
604
605
606
607 func combineText(labelsList []string) string {
608 beforeLength := 0
609 for _, s := range labelsList {
610 beforeLength += len(s)
611 }
612
613 text := crush(removeSubstrings(labelsList))
614 if *v {
615 fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text))
616 }
617 return text
618 }
619
620 type byLength []string
621
622 func (s byLength) Len() int { return len(s) }
623 func (s byLength) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
624 func (s byLength) Less(i, j int) bool { return len(s[i]) < len(s[j]) }
625
626
627
628 func removeSubstrings(input []string) []string {
629
630 ss := append(make([]string, 0, len(input)), input...)
631 sort.Sort(byLength(ss))
632
633 for i, shortString := range ss {
634
635
636 for _, longString := range ss[i+1:] {
637 if strings.Contains(longString, shortString) {
638 ss[i] = ""
639 break
640 }
641 }
642 }
643
644
645 sort.Strings(ss)
646 for len(ss) > 0 && ss[0] == "" {
647 ss = ss[1:]
648 }
649 return ss
650 }
651
652
653
654 func crush(ss []string) string {
655 maxLabelLen := 0
656 for _, s := range ss {
657 if maxLabelLen < len(s) {
658 maxLabelLen = len(s)
659 }
660 }
661
662 for prefixLen := maxLabelLen; prefixLen > 0; prefixLen-- {
663 prefixes := makePrefixMap(ss, prefixLen)
664 for i, s := range ss {
665 if len(s) <= prefixLen {
666 continue
667 }
668 mergeLabel(ss, i, prefixLen, prefixes)
669 }
670 }
671
672 return strings.Join(ss, "")
673 }
674
675
676
677
678
679
680 func mergeLabel(ss []string, i, prefixLen int, prefixes prefixMap) {
681 s := ss[i]
682 suffix := s[len(s)-prefixLen:]
683 for _, j := range prefixes[suffix] {
684
685 if ss[j] == "" || i == j {
686 continue
687 }
688 if *v {
689 fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d): %q and %q share %q\n",
690 prefixLen, i, j, ss[i], ss[j], suffix)
691 }
692 ss[i] += ss[j][prefixLen:]
693 ss[j] = ""
694
695
696
697
698
699
700
701
702
703 mergeLabel(ss, i, prefixLen, prefixes)
704 return
705 }
706 }
707
708
709
710
711 type prefixMap map[string][]int
712
713
714 func makePrefixMap(ss []string, prefixLen int) prefixMap {
715 prefixes := make(prefixMap)
716 for i, s := range ss {
717
718
719
720 if prefixLen < len(s) {
721 prefix := s[:prefixLen]
722 prefixes[prefix] = append(prefixes[prefix], i)
723 }
724 }
725
726 return prefixes
727 }
728
View as plain text