...

Source file src/golang.org/x/net/publicsuffix/list.go

Documentation: golang.org/x/net/publicsuffix

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:generate go run gen.go
     6  
     7  // Package publicsuffix provides a public suffix list based on data from
     8  // https://publicsuffix.org/
     9  //
    10  // A public suffix is one under which Internet users can directly register
    11  // names. It is related to, but different from, a TLD (top level domain).
    12  //
    13  // "com" is a TLD (top level domain). Top level means it has no dots.
    14  //
    15  // "com" is also a public suffix. Amazon and Google have registered different
    16  // siblings under that domain: "amazon.com" and "google.com".
    17  //
    18  // "au" is another TLD, again because it has no dots. But it's not "amazon.au".
    19  // Instead, it's "amazon.com.au".
    20  //
    21  // "com.au" isn't an actual TLD, because it's not at the top level (it has
    22  // dots). But it is an eTLD (effective TLD), because that's the branching point
    23  // for domain name registrars.
    24  //
    25  // Another name for "an eTLD" is "a public suffix". Often, what's more of
    26  // interest is the eTLD+1, or one more label than the public suffix. For
    27  // example, browsers partition read/write access to HTTP cookies according to
    28  // the eTLD+1. Web pages served from "amazon.com.au" can't read cookies from
    29  // "google.com.au", but web pages served from "maps.google.com" can share
    30  // cookies from "www.google.com", so you don't have to sign into Google Maps
    31  // separately from signing into Google Web Search. Note that all four of those
    32  // domains have 3 labels and 2 dots. The first two domains are each an eTLD+1,
    33  // the last two are not (but share the same eTLD+1: "google.com").
    34  //
    35  // All of these domains have the same eTLD+1:
    36  //   - "www.books.amazon.co.uk"
    37  //   - "books.amazon.co.uk"
    38  //   - "amazon.co.uk"
    39  //
    40  // Specifically, the eTLD+1 is "amazon.co.uk", because the eTLD is "co.uk".
    41  //
    42  // There is no closed form algorithm to calculate the eTLD of a domain.
    43  // Instead, the calculation is data driven. This package provides a
    44  // pre-compiled snapshot of Mozilla's PSL (Public Suffix List) data at
    45  // https://publicsuffix.org/
    46  package publicsuffix // import "golang.org/x/net/publicsuffix"
    47  
    48  // TODO: specify case sensitivity and leading/trailing dot behavior for
    49  // func PublicSuffix and func EffectiveTLDPlusOne.
    50  
    51  import (
    52  	"fmt"
    53  	"net/http/cookiejar"
    54  	"strings"
    55  )
    56  
    57  // List implements the cookiejar.PublicSuffixList interface by calling the
    58  // PublicSuffix function.
    59  var List cookiejar.PublicSuffixList = list{}
    60  
    61  type list struct{}
    62  
    63  func (list) PublicSuffix(domain string) string {
    64  	ps, _ := PublicSuffix(domain)
    65  	return ps
    66  }
    67  
    68  func (list) String() string {
    69  	return version
    70  }
    71  
    72  // PublicSuffix returns the public suffix of the domain using a copy of the
    73  // publicsuffix.org database compiled into the library.
    74  //
    75  // icann is whether the public suffix is managed by the Internet Corporation
    76  // for Assigned Names and Numbers. If not, the public suffix is either a
    77  // privately managed domain (and in practice, not a top level domain) or an
    78  // unmanaged top level domain (and not explicitly mentioned in the
    79  // publicsuffix.org list). For example, "foo.org" and "foo.co.uk" are ICANN
    80  // domains, "foo.dyndns.org" and "foo.blogspot.co.uk" are private domains and
    81  // "cromulent" is an unmanaged top level domain.
    82  //
    83  // Use cases for distinguishing ICANN domains like "foo.com" from private
    84  // domains like "foo.appspot.com" can be found at
    85  // https://wiki.mozilla.org/Public_Suffix_List/Use_Cases
    86  func PublicSuffix(domain string) (publicSuffix string, icann bool) {
    87  	lo, hi := uint32(0), uint32(numTLD)
    88  	s, suffix, icannNode, wildcard := domain, len(domain), false, false
    89  loop:
    90  	for {
    91  		dot := strings.LastIndex(s, ".")
    92  		if wildcard {
    93  			icann = icannNode
    94  			suffix = 1 + dot
    95  		}
    96  		if lo == hi {
    97  			break
    98  		}
    99  		f := find(s[1+dot:], lo, hi)
   100  		if f == notFound {
   101  			break
   102  		}
   103  
   104  		u := uint32(nodes.get(f) >> (nodesBitsTextOffset + nodesBitsTextLength))
   105  		icannNode = u&(1<<nodesBitsICANN-1) != 0
   106  		u >>= nodesBitsICANN
   107  		u = children.get(u & (1<<nodesBitsChildren - 1))
   108  		lo = u & (1<<childrenBitsLo - 1)
   109  		u >>= childrenBitsLo
   110  		hi = u & (1<<childrenBitsHi - 1)
   111  		u >>= childrenBitsHi
   112  		switch u & (1<<childrenBitsNodeType - 1) {
   113  		case nodeTypeNormal:
   114  			suffix = 1 + dot
   115  		case nodeTypeException:
   116  			suffix = 1 + len(s)
   117  			break loop
   118  		}
   119  		u >>= childrenBitsNodeType
   120  		wildcard = u&(1<<childrenBitsWildcard-1) != 0
   121  		if !wildcard {
   122  			icann = icannNode
   123  		}
   124  
   125  		if dot == -1 {
   126  			break
   127  		}
   128  		s = s[:dot]
   129  	}
   130  	if suffix == len(domain) {
   131  		// If no rules match, the prevailing rule is "*".
   132  		return domain[1+strings.LastIndex(domain, "."):], icann
   133  	}
   134  	return domain[suffix:], icann
   135  }
   136  
   137  const notFound uint32 = 1<<32 - 1
   138  
   139  // find returns the index of the node in the range [lo, hi) whose label equals
   140  // label, or notFound if there is no such node. The range is assumed to be in
   141  // strictly increasing node label order.
   142  func find(label string, lo, hi uint32) uint32 {
   143  	for lo < hi {
   144  		mid := lo + (hi-lo)/2
   145  		s := nodeLabel(mid)
   146  		if s < label {
   147  			lo = mid + 1
   148  		} else if s == label {
   149  			return mid
   150  		} else {
   151  			hi = mid
   152  		}
   153  	}
   154  	return notFound
   155  }
   156  
   157  // nodeLabel returns the label for the i'th node.
   158  func nodeLabel(i uint32) string {
   159  	x := nodes.get(i)
   160  	length := x & (1<<nodesBitsTextLength - 1)
   161  	x >>= nodesBitsTextLength
   162  	offset := x & (1<<nodesBitsTextOffset - 1)
   163  	return text[offset : offset+length]
   164  }
   165  
   166  // EffectiveTLDPlusOne returns the effective top level domain plus one more
   167  // label. For example, the eTLD+1 for "foo.bar.golang.org" is "golang.org".
   168  func EffectiveTLDPlusOne(domain string) (string, error) {
   169  	if strings.HasPrefix(domain, ".") || strings.HasSuffix(domain, ".") || strings.Contains(domain, "..") {
   170  		return "", fmt.Errorf("publicsuffix: empty label in domain %q", domain)
   171  	}
   172  
   173  	suffix, _ := PublicSuffix(domain)
   174  	if len(domain) <= len(suffix) {
   175  		return "", fmt.Errorf("publicsuffix: cannot derive eTLD+1 for domain %q", domain)
   176  	}
   177  	i := len(domain) - len(suffix) - 1
   178  	if domain[i] != '.' {
   179  		return "", fmt.Errorf("publicsuffix: invalid public suffix %q for domain %q", suffix, domain)
   180  	}
   181  	return domain[1+strings.LastIndex(domain[:i], "."):], nil
   182  }
   183  
   184  type uint32String string
   185  
   186  func (u uint32String) get(i uint32) uint32 {
   187  	off := i * 4
   188  	return (uint32(u[off])<<24 |
   189  		uint32(u[off+1])<<16 |
   190  		uint32(u[off+2])<<8 |
   191  		uint32(u[off+3]))
   192  }
   193  
   194  type uint40String string
   195  
   196  func (u uint40String) get(i uint32) uint64 {
   197  	off := uint64(i * (nodesBits / 8))
   198  	return uint64(u[off])<<32 |
   199  		uint64(u[off+1])<<24 |
   200  		uint64(u[off+2])<<16 |
   201  		uint64(u[off+3])<<8 |
   202  		uint64(u[off+4])
   203  }
   204  

View as plain text