...

Source file src/golang.org/x/net/publicsuffix/gen.go

Documentation: golang.org/x/net/publicsuffix

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build ignore
     6  
     7  package main
     8  
     9  // This program generates table.go and table_test.go based on the authoritative
    10  // public suffix list at https://publicsuffix.org/list/effective_tld_names.dat
    11  //
    12  // The version is derived from
    13  // https://api.github.com/repos/publicsuffix/list/commits?path=public_suffix_list.dat
    14  // and a human-readable form is at
    15  // https://github.com/publicsuffix/list/commits/master/public_suffix_list.dat
    16  //
    17  // To fetch a particular git revision, such as 5c70ccd250, pass
    18  // -url "https://raw.githubusercontent.com/publicsuffix/list/5c70ccd250/public_suffix_list.dat"
    19  // and -version "an explicit version string".
    20  
    21  import (
    22  	"bufio"
    23  	"bytes"
    24  	"encoding/binary"
    25  	"flag"
    26  	"fmt"
    27  	"go/format"
    28  	"io"
    29  	"io/ioutil"
    30  	"net/http"
    31  	"os"
    32  	"regexp"
    33  	"sort"
    34  	"strings"
    35  
    36  	"golang.org/x/net/idna"
    37  )
    38  
    39  const (
    40  	// This must be a multiple of 8 and no greater than 64.
    41  	// Update nodeValue in list.go if this changes.
    42  	nodesBits = 40
    43  
    44  	// These sum of these four values must be no greater than nodesBits.
    45  	nodesBitsChildren   = 10
    46  	nodesBitsICANN      = 1
    47  	nodesBitsTextOffset = 16
    48  	nodesBitsTextLength = 6
    49  
    50  	// These sum of these four values must be no greater than 32.
    51  	childrenBitsWildcard = 1
    52  	childrenBitsNodeType = 2
    53  	childrenBitsHi       = 14
    54  	childrenBitsLo       = 14
    55  )
    56  
    57  var (
    58  	combinedText  string
    59  	maxChildren   int
    60  	maxTextOffset int
    61  	maxTextLength int
    62  	maxHi         uint32
    63  	maxLo         uint32
    64  )
    65  
    66  func max(a, b int) int {
    67  	if a < b {
    68  		return b
    69  	}
    70  	return a
    71  }
    72  
    73  func u32max(a, b uint32) uint32 {
    74  	if a < b {
    75  		return b
    76  	}
    77  	return a
    78  }
    79  
    80  const (
    81  	nodeTypeNormal     = 0
    82  	nodeTypeException  = 1
    83  	nodeTypeParentOnly = 2
    84  	numNodeType        = 3
    85  )
    86  
    87  func nodeTypeStr(n int) string {
    88  	switch n {
    89  	case nodeTypeNormal:
    90  		return "+"
    91  	case nodeTypeException:
    92  		return "!"
    93  	case nodeTypeParentOnly:
    94  		return "o"
    95  	}
    96  	panic("unreachable")
    97  }
    98  
    99  const (
   100  	defaultURL   = "https://publicsuffix.org/list/effective_tld_names.dat"
   101  	gitCommitURL = "https://api.github.com/repos/publicsuffix/list/commits?path=public_suffix_list.dat"
   102  )
   103  
   104  var (
   105  	labelEncoding = map[string]uint64{}
   106  	labelsList    = []string{}
   107  	labelsMap     = map[string]bool{}
   108  	rules         = []string{}
   109  	numICANNRules = 0
   110  
   111  	// validSuffixRE is used to check that the entries in the public suffix
   112  	// list are in canonical form (after Punycode encoding). Specifically,
   113  	// capital letters are not allowed.
   114  	validSuffixRE = regexp.MustCompile(`^[a-z0-9_\!\*\-\.]+$`)
   115  
   116  	shaRE  = regexp.MustCompile(`"sha":"([^"]+)"`)
   117  	dateRE = regexp.MustCompile(`"committer":{[^{]+"date":"([^"]+)"`)
   118  
   119  	subset  = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
   120  	url     = flag.String("url", defaultURL, "URL of the publicsuffix.org list. If empty, stdin is read instead")
   121  	v       = flag.Bool("v", false, "verbose output (to stderr)")
   122  	version = flag.String("version", "", "the effective_tld_names.dat version")
   123  )
   124  
   125  func main() {
   126  	if err := main1(); err != nil {
   127  		fmt.Fprintln(os.Stderr, err)
   128  		os.Exit(1)
   129  	}
   130  }
   131  
   132  func main1() error {
   133  	flag.Parse()
   134  	if nodesBits > 64 {
   135  		return fmt.Errorf("nodesBits is too large")
   136  	}
   137  	if nodesBits%8 != 0 {
   138  		return fmt.Errorf("nodesBits must be a multiple of 8")
   139  	}
   140  	if nodesBitsTextLength+nodesBitsTextOffset+nodesBitsICANN+nodesBitsChildren > nodesBits {
   141  		return fmt.Errorf("not enough bits to encode the nodes table")
   142  	}
   143  	if childrenBitsLo+childrenBitsHi+childrenBitsNodeType+childrenBitsWildcard > 32 {
   144  		return fmt.Errorf("not enough bits to encode the children table")
   145  	}
   146  	if *version == "" {
   147  		if *url != defaultURL {
   148  			return fmt.Errorf("-version was not specified, and the -url is not the default one")
   149  		}
   150  		sha, date, err := gitCommit()
   151  		if err != nil {
   152  			return err
   153  		}
   154  		*version = fmt.Sprintf("publicsuffix.org's public_suffix_list.dat, git revision %s (%s)", sha, date)
   155  	}
   156  	var r io.Reader = os.Stdin
   157  	if *url != "" {
   158  		res, err := http.Get(*url)
   159  		if err != nil {
   160  			return err
   161  		}
   162  		if res.StatusCode != http.StatusOK {
   163  			return fmt.Errorf("bad GET status for %s: %s", *url, res.Status)
   164  		}
   165  		r = res.Body
   166  		defer res.Body.Close()
   167  	}
   168  
   169  	var root node
   170  	icann := false
   171  	br := bufio.NewReader(r)
   172  	for {
   173  		s, err := br.ReadString('\n')
   174  		if err != nil {
   175  			if err == io.EOF {
   176  				break
   177  			}
   178  			return err
   179  		}
   180  		s = strings.TrimSpace(s)
   181  		if strings.Contains(s, "BEGIN ICANN DOMAINS") {
   182  			if len(rules) != 0 {
   183  				return fmt.Errorf(`expected no rules before "BEGIN ICANN DOMAINS"`)
   184  			}
   185  			icann = true
   186  			continue
   187  		}
   188  		if strings.Contains(s, "END ICANN DOMAINS") {
   189  			icann, numICANNRules = false, len(rules)
   190  			continue
   191  		}
   192  		if s == "" || strings.HasPrefix(s, "//") {
   193  			continue
   194  		}
   195  		s, err = idna.ToASCII(s)
   196  		if err != nil {
   197  			return err
   198  		}
   199  		if !validSuffixRE.MatchString(s) {
   200  			return fmt.Errorf("bad publicsuffix.org list data: %q", s)
   201  		}
   202  
   203  		if *subset {
   204  			switch {
   205  			case s == "ac.jp" || strings.HasSuffix(s, ".ac.jp"):
   206  			case s == "ak.us" || strings.HasSuffix(s, ".ak.us"):
   207  			case s == "ao" || strings.HasSuffix(s, ".ao"):
   208  			case s == "ar" || strings.HasSuffix(s, ".ar"):
   209  			case s == "arpa" || strings.HasSuffix(s, ".arpa"):
   210  			case s == "cy" || strings.HasSuffix(s, ".cy"):
   211  			case s == "dyndns.org" || strings.HasSuffix(s, ".dyndns.org"):
   212  			case s == "jp":
   213  			case s == "kobe.jp" || strings.HasSuffix(s, ".kobe.jp"):
   214  			case s == "kyoto.jp" || strings.HasSuffix(s, ".kyoto.jp"):
   215  			case s == "om" || strings.HasSuffix(s, ".om"):
   216  			case s == "uk" || strings.HasSuffix(s, ".uk"):
   217  			case s == "uk.com" || strings.HasSuffix(s, ".uk.com"):
   218  			case s == "tw" || strings.HasSuffix(s, ".tw"):
   219  			case s == "zw" || strings.HasSuffix(s, ".zw"):
   220  			case s == "xn--p1ai" || strings.HasSuffix(s, ".xn--p1ai"):
   221  				// xn--p1ai is Russian-Cyrillic "рф".
   222  			default:
   223  				continue
   224  			}
   225  		}
   226  
   227  		rules = append(rules, s)
   228  
   229  		nt, wildcard := nodeTypeNormal, false
   230  		switch {
   231  		case strings.HasPrefix(s, "*."):
   232  			s, nt = s[2:], nodeTypeParentOnly
   233  			wildcard = true
   234  		case strings.HasPrefix(s, "!"):
   235  			s, nt = s[1:], nodeTypeException
   236  		}
   237  		labels := strings.Split(s, ".")
   238  		for n, i := &root, len(labels)-1; i >= 0; i-- {
   239  			label := labels[i]
   240  			n = n.child(label)
   241  			if i == 0 {
   242  				if nt != nodeTypeParentOnly && n.nodeType == nodeTypeParentOnly {
   243  					n.nodeType = nt
   244  				}
   245  				n.icann = n.icann && icann
   246  				n.wildcard = n.wildcard || wildcard
   247  			}
   248  			labelsMap[label] = true
   249  		}
   250  	}
   251  	labelsList = make([]string, 0, len(labelsMap))
   252  	for label := range labelsMap {
   253  		labelsList = append(labelsList, label)
   254  	}
   255  	sort.Strings(labelsList)
   256  
   257  	combinedText = combineText(labelsList)
   258  	if combinedText == "" {
   259  		return fmt.Errorf("internal error: combineText returned no text")
   260  	}
   261  	for _, label := range labelsList {
   262  		offset, length := strings.Index(combinedText, label), len(label)
   263  		if offset < 0 {
   264  			return fmt.Errorf("internal error: could not find %q in text %q", label, combinedText)
   265  		}
   266  		maxTextOffset, maxTextLength = max(maxTextOffset, offset), max(maxTextLength, length)
   267  		if offset >= 1<<nodesBitsTextOffset {
   268  			return fmt.Errorf("text offset %d is too large, or nodeBitsTextOffset is too small", offset)
   269  		}
   270  		if length >= 1<<nodesBitsTextLength {
   271  			return fmt.Errorf("text length %d is too large, or nodeBitsTextLength is too small", length)
   272  		}
   273  		labelEncoding[label] = uint64(offset)<<nodesBitsTextLength | uint64(length)
   274  	}
   275  
   276  	if err := root.walk(assignIndexes); err != nil {
   277  		return err
   278  	}
   279  
   280  	if err := generate(printMetadata, &root, "table.go"); err != nil {
   281  		return err
   282  	}
   283  	if err := generateBinaryData(&root, combinedText); err != nil {
   284  		return err
   285  	}
   286  	if err := generate(printTest, &root, "table_test.go"); err != nil {
   287  		return err
   288  	}
   289  	return nil
   290  }
   291  
   292  func generate(p func(io.Writer, *node) error, root *node, filename string) error {
   293  	buf := new(bytes.Buffer)
   294  	if err := p(buf, root); err != nil {
   295  		return err
   296  	}
   297  	b, err := format.Source(buf.Bytes())
   298  	if err != nil {
   299  		return err
   300  	}
   301  	return ioutil.WriteFile(filename, b, 0644)
   302  }
   303  
   304  func gitCommit() (sha, date string, retErr error) {
   305  	res, err := http.Get(gitCommitURL)
   306  	if err != nil {
   307  		return "", "", err
   308  	}
   309  	if res.StatusCode != http.StatusOK {
   310  		return "", "", fmt.Errorf("bad GET status for %s: %s", gitCommitURL, res.Status)
   311  	}
   312  	defer res.Body.Close()
   313  	b, err := ioutil.ReadAll(res.Body)
   314  	if err != nil {
   315  		return "", "", err
   316  	}
   317  	if m := shaRE.FindSubmatch(b); m != nil {
   318  		sha = string(m[1])
   319  	}
   320  	if m := dateRE.FindSubmatch(b); m != nil {
   321  		date = string(m[1])
   322  	}
   323  	if sha == "" || date == "" {
   324  		retErr = fmt.Errorf("could not find commit SHA and date in %s", gitCommitURL)
   325  	}
   326  	return sha, date, retErr
   327  }
   328  
   329  func printTest(w io.Writer, n *node) error {
   330  	fmt.Fprintf(w, "// generated by go run gen.go; DO NOT EDIT\n\n")
   331  	fmt.Fprintf(w, "package publicsuffix\n\nconst numICANNRules = %d\n\nvar rules = [...]string{\n", numICANNRules)
   332  	for _, rule := range rules {
   333  		fmt.Fprintf(w, "%q,\n", rule)
   334  	}
   335  	fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n")
   336  	if err := n.walk(func(n *node) error {
   337  		return printNodeLabel(w, n)
   338  	}); err != nil {
   339  		return err
   340  	}
   341  	fmt.Fprintf(w, "}\n")
   342  	return nil
   343  }
   344  
   345  func generateBinaryData(root *node, combinedText string) error {
   346  	if err := os.WriteFile("data/text", []byte(combinedText), 0666); err != nil {
   347  		return err
   348  	}
   349  
   350  	var nodes []byte
   351  	if err := root.walk(func(n *node) error {
   352  		for _, c := range n.children {
   353  			nodes = appendNodeEncoding(nodes, c)
   354  		}
   355  		return nil
   356  	}); err != nil {
   357  		return err
   358  	}
   359  	if err := os.WriteFile("data/nodes", nodes, 0666); err != nil {
   360  		return err
   361  	}
   362  
   363  	var children []byte
   364  	for _, c := range childrenEncoding {
   365  		children = binary.BigEndian.AppendUint32(children, c)
   366  	}
   367  	if err := os.WriteFile("data/children", children, 0666); err != nil {
   368  		return err
   369  	}
   370  
   371  	return nil
   372  }
   373  
   374  func appendNodeEncoding(b []byte, n *node) []byte {
   375  	encoding := labelEncoding[n.label]
   376  	if n.icann {
   377  		encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset)
   378  	}
   379  	encoding |= uint64(n.childrenIndex) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN)
   380  	for i := nodesBits - 8; i >= 0; i -= 8 {
   381  		b = append(b, byte((encoding>>i)&0xff))
   382  	}
   383  	return b
   384  }
   385  
   386  func printMetadata(w io.Writer, n *node) error {
   387  	const header = `// generated by go run gen.go; DO NOT EDIT
   388  
   389  package publicsuffix
   390  
   391  import _ "embed"
   392  
   393  const version = %q
   394  
   395  const (
   396  	nodesBits           = %d
   397  	nodesBitsChildren   = %d
   398  	nodesBitsICANN      = %d
   399  	nodesBitsTextOffset = %d
   400  	nodesBitsTextLength = %d
   401  
   402  	childrenBitsWildcard = %d
   403  	childrenBitsNodeType = %d
   404  	childrenBitsHi       = %d
   405  	childrenBitsLo       = %d
   406  )
   407  
   408  const (
   409  	nodeTypeNormal     = %d
   410  	nodeTypeException  = %d
   411  	nodeTypeParentOnly = %d
   412  )
   413  
   414  // numTLD is the number of top level domains.
   415  const numTLD = %d
   416  
   417  // text is the combined text of all labels.
   418  //
   419  //go:embed data/text
   420  var text string
   421  
   422  `
   423  	fmt.Fprintf(w, header, *version,
   424  		nodesBits,
   425  		nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength,
   426  		childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo,
   427  		nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children))
   428  	fmt.Fprintf(w, `
   429  // nodes is the list of nodes. Each node is represented as a %v-bit integer,
   430  // which encodes the node's children, wildcard bit and node type (as an index
   431  // into the children array), ICANN bit and text.
   432  //
   433  // The layout within the node, from MSB to LSB, is:
   434  //	[%2d bits] unused
   435  //	[%2d bits] children index
   436  //	[%2d bits] ICANN bit
   437  //	[%2d bits] text index
   438  //	[%2d bits] text length
   439  //
   440  //go:embed data/nodes
   441  var nodes uint40String
   442  `,
   443  		nodesBits,
   444  		nodesBits-nodesBitsChildren-nodesBitsICANN-nodesBitsTextOffset-nodesBitsTextLength,
   445  		nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength)
   446  	fmt.Fprintf(w, `
   447  // children is the list of nodes' children, the parent's wildcard bit and the
   448  // parent's node type. If a node has no children then their children index
   449  // will be in the range [0, 6), depending on the wildcard bit and node type.
   450  //
   451  // The layout within the uint32, from MSB to LSB, is:
   452  //	[%2d bits] unused
   453  //	[%2d bits] wildcard bit
   454  //	[%2d bits] node type
   455  //	[%2d bits] high nodes index (exclusive) of children
   456  //	[%2d bits] low nodes index (inclusive) of children
   457  //
   458  //go:embed data/children
   459  var children uint32String
   460  `,
   461  		32-childrenBitsWildcard-childrenBitsNodeType-childrenBitsHi-childrenBitsLo,
   462  		childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo)
   463  
   464  	fmt.Fprintf(w, "// max children %d (capacity %d)\n", maxChildren, 1<<nodesBitsChildren-1)
   465  	fmt.Fprintf(w, "// max text offset %d (capacity %d)\n", maxTextOffset, 1<<nodesBitsTextOffset-1)
   466  	fmt.Fprintf(w, "// max text length %d (capacity %d)\n", maxTextLength, 1<<nodesBitsTextLength-1)
   467  	fmt.Fprintf(w, "// max hi %d (capacity %d)\n", maxHi, 1<<childrenBitsHi-1)
   468  	fmt.Fprintf(w, "// max lo %d (capacity %d)\n", maxLo, 1<<childrenBitsLo-1)
   469  	return nil
   470  }
   471  
   472  type node struct {
   473  	label    string
   474  	nodeType int
   475  	icann    bool
   476  	wildcard bool
   477  	// nodesIndex and childrenIndex are the index of this node in the nodes
   478  	// and the index of its children offset/length in the children arrays.
   479  	nodesIndex, childrenIndex int
   480  	// firstChild is the index of this node's first child, or zero if this
   481  	// node has no children.
   482  	firstChild int
   483  	// children are the node's children, in strictly increasing node label order.
   484  	children []*node
   485  }
   486  
   487  func (n *node) walk(f func(*node) error) error {
   488  	if err := f(n); err != nil {
   489  		return err
   490  	}
   491  	for _, c := range n.children {
   492  		if err := c.walk(f); err != nil {
   493  			return err
   494  		}
   495  	}
   496  	return nil
   497  }
   498  
   499  // child returns the child of n with the given label. The child is created if
   500  // it did not exist beforehand.
   501  func (n *node) child(label string) *node {
   502  	for _, c := range n.children {
   503  		if c.label == label {
   504  			return c
   505  		}
   506  	}
   507  	c := &node{
   508  		label:    label,
   509  		nodeType: nodeTypeParentOnly,
   510  		icann:    true,
   511  	}
   512  	n.children = append(n.children, c)
   513  	sort.Sort(byLabel(n.children))
   514  	return c
   515  }
   516  
   517  type byLabel []*node
   518  
   519  func (b byLabel) Len() int           { return len(b) }
   520  func (b byLabel) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }
   521  func (b byLabel) Less(i, j int) bool { return b[i].label < b[j].label }
   522  
   523  var nextNodesIndex int
   524  
   525  // childrenEncoding are the encoded entries in the generated children array.
   526  // All these pre-defined entries have no children.
   527  var childrenEncoding = []uint32{
   528  	0 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeNormal.
   529  	1 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeException.
   530  	2 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeParentOnly.
   531  	4 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeNormal.
   532  	5 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeException.
   533  	6 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeParentOnly.
   534  }
   535  
   536  var firstCallToAssignIndexes = true
   537  
   538  func assignIndexes(n *node) error {
   539  	if len(n.children) != 0 {
   540  		// Assign nodesIndex.
   541  		n.firstChild = nextNodesIndex
   542  		for _, c := range n.children {
   543  			c.nodesIndex = nextNodesIndex
   544  			nextNodesIndex++
   545  		}
   546  
   547  		// The root node's children is implicit.
   548  		if firstCallToAssignIndexes {
   549  			firstCallToAssignIndexes = false
   550  			return nil
   551  		}
   552  
   553  		// Assign childrenIndex.
   554  		maxChildren = max(maxChildren, len(childrenEncoding))
   555  		if len(childrenEncoding) >= 1<<nodesBitsChildren {
   556  			return fmt.Errorf("children table size %d is too large, or nodeBitsChildren is too small", len(childrenEncoding))
   557  		}
   558  		n.childrenIndex = len(childrenEncoding)
   559  		lo := uint32(n.firstChild)
   560  		hi := lo + uint32(len(n.children))
   561  		maxLo, maxHi = u32max(maxLo, lo), u32max(maxHi, hi)
   562  		if lo >= 1<<childrenBitsLo {
   563  			return fmt.Errorf("children lo %d is too large, or childrenBitsLo is too small", lo)
   564  		}
   565  		if hi >= 1<<childrenBitsHi {
   566  			return fmt.Errorf("children hi %d is too large, or childrenBitsHi is too small", hi)
   567  		}
   568  		enc := hi<<childrenBitsLo | lo
   569  		enc |= uint32(n.nodeType) << (childrenBitsLo + childrenBitsHi)
   570  		if n.wildcard {
   571  			enc |= 1 << (childrenBitsLo + childrenBitsHi + childrenBitsNodeType)
   572  		}
   573  		childrenEncoding = append(childrenEncoding, enc)
   574  	} else {
   575  		n.childrenIndex = n.nodeType
   576  		if n.wildcard {
   577  			n.childrenIndex += numNodeType
   578  		}
   579  	}
   580  	return nil
   581  }
   582  
   583  func printNodeLabel(w io.Writer, n *node) error {
   584  	for _, c := range n.children {
   585  		fmt.Fprintf(w, "%q,\n", c.label)
   586  	}
   587  	return nil
   588  }
   589  
   590  func icannStr(icann bool) string {
   591  	if icann {
   592  		return "I"
   593  	}
   594  	return " "
   595  }
   596  
   597  func wildcardStr(wildcard bool) string {
   598  	if wildcard {
   599  		return "*"
   600  	}
   601  	return " "
   602  }
   603  
   604  // combineText combines all the strings in labelsList to form one giant string.
   605  // Overlapping strings will be merged: "arpa" and "parliament" could yield
   606  // "arparliament".
   607  func combineText(labelsList []string) string {
   608  	beforeLength := 0
   609  	for _, s := range labelsList {
   610  		beforeLength += len(s)
   611  	}
   612  
   613  	text := crush(removeSubstrings(labelsList))
   614  	if *v {
   615  		fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text))
   616  	}
   617  	return text
   618  }
   619  
   620  type byLength []string
   621  
   622  func (s byLength) Len() int           { return len(s) }
   623  func (s byLength) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
   624  func (s byLength) Less(i, j int) bool { return len(s[i]) < len(s[j]) }
   625  
   626  // removeSubstrings returns a copy of its input with any strings removed
   627  // that are substrings of other provided strings.
   628  func removeSubstrings(input []string) []string {
   629  	// Make a copy of input.
   630  	ss := append(make([]string, 0, len(input)), input...)
   631  	sort.Sort(byLength(ss))
   632  
   633  	for i, shortString := range ss {
   634  		// For each string, only consider strings higher than it in sort order, i.e.
   635  		// of equal length or greater.
   636  		for _, longString := range ss[i+1:] {
   637  			if strings.Contains(longString, shortString) {
   638  				ss[i] = ""
   639  				break
   640  			}
   641  		}
   642  	}
   643  
   644  	// Remove the empty strings.
   645  	sort.Strings(ss)
   646  	for len(ss) > 0 && ss[0] == "" {
   647  		ss = ss[1:]
   648  	}
   649  	return ss
   650  }
   651  
   652  // crush combines a list of strings, taking advantage of overlaps. It returns a
   653  // single string that contains each input string as a substring.
   654  func crush(ss []string) string {
   655  	maxLabelLen := 0
   656  	for _, s := range ss {
   657  		if maxLabelLen < len(s) {
   658  			maxLabelLen = len(s)
   659  		}
   660  	}
   661  
   662  	for prefixLen := maxLabelLen; prefixLen > 0; prefixLen-- {
   663  		prefixes := makePrefixMap(ss, prefixLen)
   664  		for i, s := range ss {
   665  			if len(s) <= prefixLen {
   666  				continue
   667  			}
   668  			mergeLabel(ss, i, prefixLen, prefixes)
   669  		}
   670  	}
   671  
   672  	return strings.Join(ss, "")
   673  }
   674  
   675  // mergeLabel merges the label at ss[i] with the first available matching label
   676  // in prefixMap, where the last "prefixLen" characters in ss[i] match the first
   677  // "prefixLen" characters in the matching label.
   678  // It will merge ss[i] repeatedly until no more matches are available.
   679  // All matching labels merged into ss[i] are replaced by "".
   680  func mergeLabel(ss []string, i, prefixLen int, prefixes prefixMap) {
   681  	s := ss[i]
   682  	suffix := s[len(s)-prefixLen:]
   683  	for _, j := range prefixes[suffix] {
   684  		// Empty strings mean "already used." Also avoid merging with self.
   685  		if ss[j] == "" || i == j {
   686  			continue
   687  		}
   688  		if *v {
   689  			fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d): %q and %q share %q\n",
   690  				prefixLen, i, j, ss[i], ss[j], suffix)
   691  		}
   692  		ss[i] += ss[j][prefixLen:]
   693  		ss[j] = ""
   694  		// ss[i] has a new suffix, so merge again if possible.
   695  		// Note: we only have to merge again at the same prefix length. Shorter
   696  		// prefix lengths will be handled in the next iteration of crush's for loop.
   697  		// Can there be matches for longer prefix lengths, introduced by the merge?
   698  		// I believe that any such matches would by necessity have been eliminated
   699  		// during substring removal or merged at a higher prefix length. For
   700  		// instance, in crush("abc", "cde", "bcdef"), combining "abc" and "cde"
   701  		// would yield "abcde", which could be merged with "bcdef." However, in
   702  		// practice "cde" would already have been elimintated by removeSubstrings.
   703  		mergeLabel(ss, i, prefixLen, prefixes)
   704  		return
   705  	}
   706  }
   707  
   708  // prefixMap maps from a prefix to a list of strings containing that prefix. The
   709  // list of strings is represented as indexes into a slice of strings stored
   710  // elsewhere.
   711  type prefixMap map[string][]int
   712  
   713  // makePrefixMap constructs a prefixMap from a slice of strings.
   714  func makePrefixMap(ss []string, prefixLen int) prefixMap {
   715  	prefixes := make(prefixMap)
   716  	for i, s := range ss {
   717  		// We use < rather than <= because if a label matches on a prefix equal to
   718  		// its full length, that's actually a substring match handled by
   719  		// removeSubstrings.
   720  		if prefixLen < len(s) {
   721  			prefix := s[:prefixLen]
   722  			prefixes[prefix] = append(prefixes[prefix], i)
   723  		}
   724  	}
   725  
   726  	return prefixes
   727  }
   728  

View as plain text