parse.go

Documentation: golang.org/x/arch/x86/x86spec

     1  // Copyright 2016 The Go Authors.  All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package main
     6  
     7  import (
     8  	"bytes"
     9  	"errors"
    10  	"fmt"
    11  	"io"
    12  	"log"
    13  	"math"
    14  	"os"
    15  	"reflect"
    16  	"regexp"
    17  	"sort"
    18  	"strconv"
    19  	"strings"
    20  	"time"
    21  
    22  	"rsc.io/pdf"
    23  )
    24  
// listing holds information about one or more parsed manual pages
// concerning a single instruction listing.
//
// parsePage produces one listing per PDF page; merge folds the tables and
// compatibility text of a continuation page into the listing that opened
// the instruction.
type listing struct {
	pageNum   int          // page number passed to parsePage for this listing
	name      string       // instruction heading
	mtables   [][][]string // mnemonic tables (at most one per page)
	enctables [][][]string // encoding tables (at most one per page)
	compat    string       // text gathered by findCompat; empty if none was found
}
    34  
    35  type logReaderAt struct {
    36  	f io.ReaderAt
    37  }
    38  
    39  func (l *logReaderAt) ReadAt(x []byte, off int64) (int, error) {
    40  	log.Printf("read %d @ %d", len(x), off)
    41  	return l.f.ReadAt(x, off)
    42  }
    43  
    44  const (
    45  	cacheBlockSize = 64 * 1024
    46  	numCacheBlock  = 16
    47  )
    48  
    49  type cachedReaderAt struct {
    50  	r     io.ReaderAt
    51  	cache *cacheBlock
    52  }
    53  
    54  type cacheBlock struct {
    55  	next   *cacheBlock
    56  	buf    []byte
    57  	offset int64
    58  	err    error
    59  }
    60  
    61  func newCachedReaderAt(r io.ReaderAt) *cachedReaderAt {
    62  	c := &cachedReaderAt{
    63  		r: r,
    64  	}
    65  	for i := 0; i < numCacheBlock; i++ {
    66  		c.cache = &cacheBlock{next: c.cache}
    67  	}
    68  	return c
    69  }
    70  
    71  func (c *cachedReaderAt) ReadAt(p []byte, offset int64) (n int, err error) {
    72  	// Assume large reads indicate a caller that doesn't need caching.
    73  	if len(p) >= cacheBlockSize {
    74  		return c.r.ReadAt(p, offset)
    75  	}
    76  
    77  	for n < len(p) {
    78  		o := offset + int64(n)
    79  		f := o & (cacheBlockSize - 1)
    80  		b := c.readBlock(o - f)
    81  		n += copy(p[n:], b.buf[f:])
    82  		if n < len(p) && b.err != nil {
    83  			return n, b.err
    84  		}
    85  	}
    86  	return n, nil
    87  }
    88  
    89  var errShortRead = errors.New("short read")
    90  
    91  func (c *cachedReaderAt) readBlock(offset int64) *cacheBlock {
    92  	if offset&(cacheBlockSize-1) != 0 {
    93  		panic("misuse of cachedReaderAt.readBlock")
    94  	}
    95  
    96  	// Look in cache.
    97  	var b, prev *cacheBlock
    98  	for b = c.cache; ; prev, b = b, b.next {
    99  		if b.buf != nil && b.offset == offset {
   100  			// Move to front.
   101  			if prev != nil {
   102  				prev.next = b.next
   103  				b.next = c.cache
   104  				c.cache = b
   105  			}
   106  			return b
   107  		}
   108  		if b.next == nil {
   109  			break
   110  		}
   111  	}
   112  
   113  	// Otherwise b is LRU block in cache, prev points at b.
   114  	if b.buf == nil {
   115  		b.buf = make([]byte, cacheBlockSize)
   116  	}
   117  	b.offset = offset
   118  	n, err := c.r.ReadAt(b.buf[:cacheBlockSize], offset)
   119  	b.buf = b.buf[:n]
   120  	b.err = err
   121  	if n > 0 {
   122  		// Move to front.
   123  		prev.next = nil
   124  		b.next = c.cache
   125  		c.cache = b
   126  	}
   127  	return b
   128  }
   129  
   130  func pdfOpen(name string) (*pdf.Reader, error) {
   131  	f, err := os.Open(name)
   132  	if err != nil {
   133  		return nil, err
   134  	}
   135  	fi, err := f.Stat()
   136  	if err != nil {
   137  		f.Close()
   138  		return nil, err
   139  	}
   140  	return pdf.NewReader(newCachedReaderAt(f), fi.Size())
   141  }
   142  
   143  func parse() []*instruction {
   144  	var insts []*instruction
   145  
   146  	f, err := pdfOpen(*flagFile)
   147  	if err != nil {
   148  		log.Fatal(err)
   149  	}
   150  
   151  	// Find instruction set reference in outline, to build instruction list.
   152  	instList := instHeadings(f.Outline())
   153  	if len(instList) < 200 {
   154  		log.Fatalf("only found %d instructions in table of contents", len(instList))
   155  	}
   156  
   157  	// Scan document looking for instructions.
   158  	// Must find exactly the ones in the outline.
   159  	n := f.NumPage()
   160  	var current *listing
   161  	finishInstruction := func() {
   162  		if current == nil {
   163  			return
   164  		}
   165  		if len(current.mtables) == 0 || len(current.mtables[0]) <= 1 {
   166  			fmt.Fprintf(os.Stderr, "p.%d: no mnemonics for instruction %q\n", current.pageNum, current.name)
   167  		}
   168  		processListing(current, &insts)
   169  		current = nil
   170  	}
   171  
   172  	for pageNum := 1; pageNum <= n; pageNum++ {
   173  		if onlySomePages && !isDebugPage(pageNum) {
   174  			continue
   175  		}
   176  		p := f.Page(pageNum)
   177  		parsed := parsePage(p, pageNum)
   178  		if parsed.name != "" {
   179  			finishInstruction()
   180  			for j, headline := range instList {
   181  				if parsed.name == headline {
   182  					instList[j] = ""
   183  					current = parsed
   184  					break
   185  				}
   186  			}
   187  			if current == nil {
   188  				fmt.Fprintf(os.Stderr, "p.%d: unexpected instruction %q\n", pageNum, parsed.name)
   189  			}
   190  			continue
   191  		}
   192  		if current != nil {
   193  			merge(current, parsed)
   194  			continue
   195  		}
   196  		if parsed.mtables != nil {
   197  			fmt.Fprintf(os.Stderr, "p.%d: unexpected mnemonic table\n", pageNum)
   198  		}
   199  		if parsed.enctables != nil {
   200  			fmt.Fprintf(os.Stderr, "p.%d: unexpected encoding table\n", pageNum)
   201  		}
   202  		if parsed.compat != "" {
   203  			fmt.Fprintf(os.Stderr, "p.%d: unexpected compatibility statement\n", pageNum)
   204  		}
   205  	}
   206  	finishInstruction()
   207  
   208  	if !onlySomePages {
   209  		for _, headline := range instList {
   210  			if headline != "" {
   211  				fmt.Fprintf(os.Stderr, "missing instruction %q\n", headline)
   212  			}
   213  		}
   214  	}
   215  
   216  	return insts
   217  }
   218  
   219  // isDebugPage reports whether the -debugpage flag mentions page n.
   220  // The argument is a comma-separated list of pages.
   221  // Maybe some day it will support ranges.
   222  func isDebugPage(n int) bool {
   223  	s := *flagDebugPage
   224  	var k int
   225  	for i := 0; ; i++ {
   226  		if i == len(s) || s[i] == ',' {
   227  			if n == k {
   228  				return true
   229  			}
   230  			k = 0
   231  		}
   232  		if i == len(s) {
   233  			break
   234  		}
   235  		if '0' <= s[i] && s[i] <= '9' {
   236  			k = k*10 + int(s[i]) - '0'
   237  		}
   238  	}
   239  	return false
   240  }
   241  
   242  // merge merges the content of y into the running collection in x.
   243  func merge(x, y *listing) {
   244  	if y.name != "" {
   245  		fmt.Fprintf(os.Stderr, "p.%d: merging page incorrectly\n", y.pageNum)
   246  		return
   247  	}
   248  
   249  	x.mtables = append(x.mtables, y.mtables...)
   250  	x.enctables = append(x.enctables, y.enctables...)
   251  	x.compat += y.compat
   252  }
   253  
   254  // instHeadings returns the list of instruction headings from the table of contents.
   255  // When we parse the pages we expect to find every one of these.
   256  func instHeadings(outline pdf.Outline) []string {
   257  	return appendInstHeadings(outline, nil)
   258  }
   259  
// instRE matches the titles of outline entries whose direct children are
// treated as instruction headings by appendInstHeadings.
var instRE = regexp.MustCompile(`\d Instructions \([A-Z]-[A-Z]\)|VMX Instructions|Instruction SET Reference|SHA Extensions Reference`)
   261  
// The headings are inconsistent about dash and superscript usage. Normalize.
//
// fixDash is applied both to outline titles and to page headlines so that
// the two can be compared as plain strings. A strings.Replacer compares old
// strings in argument order, so the specific "Compute ..." phrases and the
// space-padded dash forms must stay ahead of the bare dashes.
var fixDash = strings.NewReplacer(
	"Compute 2 –1", "Compute 2^x-1",
	"Compute 2x-1", "Compute 2^x-1",
	"Compute 2x–1", "Compute 2^x-1",
	"/ FUCOMI", "/FUCOMI",
	"Compute y ∗ log x", "Compute y * log₂x",
	"Compute y * log2x", "Compute y * log₂x",
	"Compute y * log2(x +1)", "Compute y * log₂(x+1)",
	"Compute y ∗ log (x +1)", "Compute y * log₂(x+1)",
	" — ", "-",
	"— ", "-",
	" —", "-",
	"—", "-",
	" – ", "-",
	" –", "-",
	"– ", "-",
	"–", "-",
	" - ", "-",
	"- ", "-",
	" -", "-",
)
   284  
   285  func appendInstHeadings(outline pdf.Outline, list []string) []string {
   286  	if instRE.MatchString(outline.Title) {
   287  		for _, child := range outline.Child {
   288  			list = append(list, fixDash.Replace(child.Title))
   289  		}
   290  	}
   291  	for _, child := range outline.Child {
   292  		list = appendInstHeadings(child, list)
   293  	}
   294  	return list
   295  }
   296  
// dateRE matches an English month name followed by a four-digit year
// starting with 19 or 20. parsePage uses it to pull the manual's
// publication date out of the first page.
var dateRE = regexp.MustCompile(`\b(January|February|March|April|May|June|July|August|September|October|November|December) ((19|20)[0-9][0-9])\b`)
   298  
   299  // parsePage parses a single PDF page and returns the content it found.
   300  func parsePage(p pdf.Page, pageNum int) *listing {
   301  	if debugging {
   302  		fmt.Fprintf(os.Stderr, "DEBUG: parsing page %d\n", pageNum)
   303  	}
   304  
   305  	parsed := new(listing)
   306  	parsed.pageNum = pageNum
   307  
   308  	content := p.Content()
   309  
   310  	for i, t := range content.Text {
   311  		if match(t, "Symbol", 11, "≠") {
   312  			t.Font = "NeoSansIntel"
   313  			t.FontSize = 9
   314  			content.Text[i] = t
   315  		}
   316  		if t.S == "*" || t.S == "**" || t.S == "***" || t.S == "," && t.Font == "Arial" && t.FontSize < 9 || t.S == "1" && t.Font == "Arial" {
   317  			t.Font = "NeoSansIntel"
   318  			t.FontSize = 9
   319  			if i+1 < len(content.Text) {
   320  				t.Y = content.Text[i+1].Y
   321  			}
   322  			content.Text[i] = t
   323  		}
   324  	}
   325  
   326  	text := findWords(content.Text)
   327  
   328  	for i, t := range text {
   329  		if match(t, "NeoSansIntel", 8, ".WIG") || match(t, "NeoSansIntel", 8, "AVX2") {
   330  			t.FontSize = 9
   331  			text[i] = t
   332  		}
   333  		if t.Font == "NeoSansIntel-Medium" {
   334  			t.Font = "NeoSansIntelMedium"
   335  			text[i] = t
   336  		}
   337  		if t.Font == "NeoSansIntel-Italic" {
   338  			t.Font = "NeoSansIntel,Italic"
   339  			text[i] = t
   340  		}
   341  	}
   342  
   343  	if debugging {
   344  		for _, t := range text {
   345  			fmt.Println(t)
   346  		}
   347  	}
   348  
   349  	if pageNum == 1 {
   350  		var buf bytes.Buffer
   351  		for _, t := range text {
   352  			buf.WriteString(t.S + "\n")
   353  		}
   354  		all := buf.String()
   355  		m := regexp.MustCompile(`Order Number: ([\w-\-]+)`).FindStringSubmatch(all)
   356  		num := "???"
   357  		if m != nil {
   358  			num = m[1]
   359  		}
   360  		date := dateRE.FindString(all)
   361  		if date == "" {
   362  			date = "???"
   363  		}
   364  
   365  		fmt.Printf("# x86 instruction set description version %s, %s\n",
   366  			specFormatVersion, time.Now().Format("2006-01-02"))
   367  		fmt.Printf("# Based on Intel Instruction Set Reference #%s, %s.\n", num, date)
   368  		fmt.Printf("# https://golang.org/x/arch/x86/x86spec\n")
   369  	}
   370  
   371  	// Remove text we should ignore.
   372  	out := text[:0]
   373  	for _, t := range text {
   374  		if shouldIgnore(t) {
   375  			continue
   376  		}
   377  		out = append(out, t)
   378  	}
   379  	text = out
   380  
   381  	// Page header must say instruction set reference.
   382  	if len(text) == 0 {
   383  		return parsed
   384  	}
   385  	if (!match(text[0], "NeoSansIntel", 9, "INSTRUCTION") || !match(text[0], "NeoSansIntel", 9, "REFERENCE")) &&
   386  		!match(text[0], "NeoSansIntel", 9, "EXTENSIONS") {
   387  		return parsed
   388  	}
   389  	text = text[1:]
   390  
   391  	enctable := findEncodingTable(text)
   392  	if enctable != nil {
   393  		parsed.enctables = append(parsed.enctables, enctable)
   394  	}
   395  
   396  	parsed.compat = findCompat(text)
   397  
   398  	// Narrow scope for finding mnemonic table.
   399  	// Must be last, since it trims text.
   400  	// Next line is headline. Can wrap to multiple lines.
   401  	if len(text) == 0 || !match(text[0], "NeoSansIntelMedium", 12, "") || !isInstHeadline(text[0].S) {
   402  		if debugging {
   403  			fmt.Fprintf(os.Stderr, "non-inst-headline: %v\n", text[0])
   404  		}
   405  	} else {
   406  		parsed.name = text[0].S
   407  		text = text[1:]
   408  		for len(text) > 0 && match(text[0], "NeoSansIntelMedium", 12, "") {
   409  			parsed.name += " " + text[0].S
   410  			text = text[1:]
   411  		}
   412  		parsed.name = fixDash.Replace(parsed.name)
   413  	}
   414  
   415  	// Table follows; heading is NeoSansIntelMedium and rows are NeoSansIntel.
   416  	i := 0
   417  	for i < len(text) && match(text[i], "NeoSansIntelMedium", 9, "") {
   418  		i++
   419  	}
   420  	for i < len(text) && match(text[i], "NeoSansIntel", 9, "") && text[i].S != "NOTES:" {
   421  		i++
   422  	}
   423  
   424  	mtable := findMnemonicTable(text[:i])
   425  	if mtable != nil {
   426  		parsed.mtables = append(parsed.mtables, mtable)
   427  	}
   428  
   429  	return parsed
   430  }
   431  
   432  func match(t pdf.Text, font string, size float64, substr string) bool {
   433  	return t.Font == font && math.Abs(t.FontSize-size) < 0.1 && strings.Contains(t.S, substr)
   434  }
   435  
   436  func shouldIgnore(t pdf.Text) bool {
   437  	// Ignore footnote stars, which are in Arial.
   438  	// Also, the page describing MOVS has a tiny 2pt Arial backslash.
   439  	if (t.S == "*" || t.S == "\\") && strings.HasPrefix(t.Font, "Arial") {
   440  		return true
   441  	}
   442  
   443  	// Ignore superscript numbers, superscript ST(0), and superscript x.
   444  	if len(t.S) == 1 && '1' <= t.S[0] && t.S[0] <= '9' || t.S == "ST(0)" || t.S == "x" {
   445  		if match(t, "NeoSansIntel", 7.2, "") || match(t, "NeoSansIntel", 5.6, "") || match(t, "NeoSansIntelMedium", 8, "") || match(t, "NeoSansIntelMedium", 9.6, "") {
   446  			return true
   447  		}
   448  	}
   449  
   450  	return false
   451  }
   452  
   453  func isInstHeadline(s string) bool {
   454  	return strings.Contains(s, "—") ||
   455  		strings.Contains(s, " - ") ||
   456  		strings.Contains(s, "PTEST- Logical Compare")
   457  }
   458  
// findWords groups the individual text items of a page into words and
// short phrases.
//
// chars is sorted and its Y coordinates are adjusted in place. Items are
// first grouped into lines by (snapped) Y coordinate; within a line,
// neighboring items are joined when sameFont accepts their fonts, their
// font sizes are equal, and the horizontal gap is small: directly if the gap
// is at most FontSize/6, with a single space if it is at most 2/3 of
// FontSize. Each resulting phrase is returned as one pdf.Text carrying the
// font (with any Italic suffix removed), size and position of its first
// item.
func findWords(chars []pdf.Text) (words []pdf.Text) {
	// Sort by Y coordinate and normalize.
	// Items whose Y differs from the current row's Y by less than nudge
	// are snapped onto that row.
	const nudge = 1
	sort.Sort(pdf.TextVertical(chars))
	old := -100000.0
	for i, c := range chars {
		if c.Y != old && math.Abs(old-c.Y) < nudge {
			chars[i].Y = old
		} else {
			old = c.Y
		}
	}

	// Sort by Y coordinate, breaking ties with X.
	// This will bring letters in a single word together.
	sort.Sort(pdf.TextVertical(chars))

	// Loop over chars.
	for i := 0; i < len(chars); {
		// Find all chars on line.
		j := i + 1
		for j < len(chars) && chars[j].Y == chars[i].Y {
			j++
		}
		var end float64
		// Split line into words (really, phrases).
		for k := i; k < j; {
			ck := &chars[k]
			s := ck.S
			end = ck.X + ck.W
			// Gap thresholds are derived from the first item's font size.
			charSpace := ck.FontSize / 6
			wordSpace := ck.FontSize * 2 / 3
			l := k + 1
			for l < j {
				// Grow word.
				cl := &chars[l]
				if sameFont(cl.Font, ck.Font) && cl.FontSize == ck.FontSize && cl.X <= end+charSpace {
					s += cl.S
					end = cl.X + cl.W
					l++
					continue
				}
				// Add space to phrase before next word.
				if sameFont(cl.Font, ck.Font) && cl.FontSize == ck.FontSize && cl.X <= end+wordSpace {
					s += " " + cl.S
					end = cl.X + cl.W
					l++
					continue
				}
				break
			}
			f := ck.Font
			f = strings.TrimSuffix(f, ",Italic")
			f = strings.TrimSuffix(f, "-Italic")
			// NOTE(review): the literal is positional; the fifth field
			// (presumably W, the width) receives end, the phrase's right
			// edge, rather than end-ck.X — confirm against pdf.Text and the
			// callers that compute t.X + t.W/2.
			words = append(words, pdf.Text{f, ck.FontSize, ck.X, ck.Y, end, s})
			k = l
		}
		i = j
	}

	return words
}
   521  
   522  func sameFont(f1, f2 string) bool {
   523  	f1 = strings.TrimSuffix(f1, ",Italic")
   524  	f1 = strings.TrimSuffix(f1, "-Italic")
   525  	f2 = strings.TrimSuffix(f1, ",Italic")
   526  	f2 = strings.TrimSuffix(f1, "-Italic")
   527  	return strings.TrimSuffix(f1, ",Italic") == strings.TrimSuffix(f2, ",Italic") || f1 == "Symbol" || f2 == "Symbol" || f1 == "TimesNewRoman" || f2 == "TimesNewRoman"
   528  }
   529  
   530  func findMnemonicTable(text []pdf.Text) [][]string {
   531  	sort.Sort(pdf.TextHorizontal(text))
   532  
   533  	const nudge = 1
   534  
   535  	old := -100000.0
   536  	var col []float64
   537  	for i, t := range text {
   538  		if t.Font != "NeoSansIntelMedium" { // only headings count
   539  			continue
   540  		}
   541  		if t.X != old && math.Abs(old-t.X) < nudge {
   542  			text[i].X = old
   543  		} else if t.X != old {
   544  			old = t.X
   545  			col = append(col, old)
   546  		}
   547  	}
   548  	sort.Sort(pdf.TextVertical(text))
   549  
   550  	if len(col) == 0 {
   551  		return nil
   552  	}
   553  
   554  	y := -100000.0
   555  	var table [][]string
   556  	var line []string
   557  	bold := -1
   558  	for _, t := range text {
   559  		if t.Y != y {
   560  			table = append(table, make([]string, len(col)))
   561  			line = table[len(table)-1]
   562  			y = t.Y
   563  			if t.Font == "NeoSansIntelMedium" {
   564  				bold = len(table) - 1
   565  			}
   566  		}
   567  		i := 0
   568  		for i+1 < len(col) && col[i+1] <= t.X+nudge {
   569  			i++
   570  		}
   571  		if line[i] != "" {
   572  			line[i] += " "
   573  		}
   574  		line[i] += t.S
   575  	}
   576  
   577  	var mtable [][]string
   578  	for i, t := range table {
   579  		if 0 < i && i <= bold || bold < i && halfMissing(t) {
   580  			// merge with earlier line
   581  			last := mtable[len(mtable)-1]
   582  			for j, s := range t {
   583  				if s != "" {
   584  					last[j] += "\n" + s
   585  				}
   586  			}
   587  		} else {
   588  			mtable = append(mtable, t)
   589  		}
   590  	}
   591  
   592  	if bold >= 0 {
   593  		heading := mtable[0]
   594  		for i, x := range heading {
   595  			heading[i] = fixHeading.Replace(x)
   596  		}
   597  	}
   598  
   599  	return mtable
   600  }
   601  
// fixHeading rewrites the many line-wrapped and inconsistently spelled
// variants of the mnemonic table column headings to the canonical names
// that processListing switches on ("64/32-Bit Mode", "64-Bit Mode",
// "Op/En", "Opcode", "/Instruction", "CPUID Feature Flag",
// "Compat/Leg Mode"). The "\n" in the patterns is the separator that
// findMnemonicTable inserts when it folds wrapped heading lines together.
// A strings.Replacer compares old strings in argument order, so longer
// variants (for example "Opcode***") are listed before their prefixes.
var fixHeading = strings.NewReplacer(
	"64/32-\nbit\nMode", "64/32-Bit Mode",
	"64/32-\nbit Mode", "64/32-Bit Mode",
	"64/32-bit\nMode", "64/32-Bit Mode",
	"64/3\n2-bit\nMode", "64/32-Bit Mode",
	"64/32 bit\nMode\nSupport", "64/32-Bit Mode",
	"64/32bit\nMode\nSupport", "64/32-Bit Mode",
	"64/32\n-bit\nMode", "64/32-Bit Mode",
	"64/32\nbit Mode\nSupport", "64/32-Bit Mode",
	"64-Bit\nMode", "64-Bit Mode",
	"64-bit\nMode", "64-Bit Mode",

	"Op/ En", "Op/En",
	"Op/\nEn", "Op/En",
	"Op/\nEN", "Op/En",
	"Op /\nEn", "Op/En",
	"Opcode***", "Opcode",
	"Opcode**", "Opcode",
	"Opcode*", "Opcode",
	"/\nInstruction", "/Instruction",

	"CPUID Fea-\nture Flag", "CPUID Feature Flag",
	"CPUID\nFeature\nFlag", "CPUID Feature Flag",
	"CPUID\nFeature Flag", "CPUID Feature Flag",
	"CPUIDFeature\nFlag", "CPUID Feature Flag",

	"Compat/\nLeg Mode*", "Compat/Leg Mode",
	"Compat/\nLeg Mode", "Compat/Leg Mode",
	"Compat/ *\nLeg Mode", "Compat/Leg Mode",
)
   632  
   633  func halfMissing(x []string) bool {
   634  	n := 0
   635  	for _, s := range x {
   636  		if s == "" {
   637  			n++
   638  		}
   639  	}
   640  	return n >= len(x)/2
   641  }
   642  
   643  func findEncodingTable(text []pdf.Text) [][]string {
   644  	// Look for operand encoding table.
   645  	sort.Sort(pdf.TextVertical(text))
   646  	var col []float64
   647  	sawTitle := false
   648  
   649  	center := func(t pdf.Text) float64 {
   650  		return t.X + t.W/2
   651  	}
   652  
   653  	start := 0
   654  	end := len(text)
   655  	for i, t := range text {
   656  		if match(t, "NeoSansIntelMedium", 10, "Instruction Operand Encoding") {
   657  			sawTitle = true
   658  			start = i + 1
   659  			continue
   660  		}
   661  		if !sawTitle {
   662  			continue
   663  		}
   664  		if match(t, "NeoSansIntel", 9, "Op/En") || match(t, "NeoSansIntel", 9, "Operand") {
   665  			if debugging {
   666  				fmt.Printf("column %d at %.2f: %v\n", len(col), center(t), t)
   667  			}
   668  			col = append(col, center(t))
   669  		}
   670  		if match(t, "NeoSansIntelMedium", 10, "Description") {
   671  			end = i
   672  			break
   673  		}
   674  	}
   675  	text = text[start:end]
   676  
   677  	if len(col) == 0 {
   678  		return nil
   679  	}
   680  
   681  	const nudge = 20
   682  
   683  	y := -100000.0
   684  	var table [][]string
   685  	var line []string
   686  	for _, t := range text {
   687  		if t.Y != y {
   688  			table = append(table, make([]string, len(col)))
   689  			line = table[len(table)-1]
   690  			y = t.Y
   691  		}
   692  		i := 0
   693  		x := center(t)
   694  		for i+1 < len(col) && col[i+1] <= x+nudge {
   695  			i++
   696  		}
   697  		if debugging {
   698  			fmt.Printf("text at %.2f: %v => %d\n", x, t, i)
   699  		}
   700  		if line[i] != "" {
   701  			line[i] += " "
   702  		}
   703  		line[i] += t.S
   704  	}
   705  
   706  	out := table[:0]
   707  	for _, line := range table {
   708  		if strings.HasPrefix(line[len(line)-1], "Vol. 2") { // page footer
   709  			continue
   710  		}
   711  		if line[0] == "" && len(out) > 0 {
   712  			last := out[len(out)-1]
   713  			for i, col := range line {
   714  				if col != "" {
   715  					last[i] += " " + col
   716  				}
   717  			}
   718  			continue
   719  		}
   720  		out = append(out, line)
   721  	}
   722  	table = out
   723  
   724  	return table
   725  }
   726  
   727  func findCompat(text []pdf.Text) string {
   728  	sort.Sort(pdf.TextVertical(text))
   729  
   730  	inCompat := false
   731  	out := ""
   732  	for _, t := range text {
   733  		if match(t, "NeoSansIntelMedium", 10, "") {
   734  			inCompat = strings.Contains(t.S, "Architecture Compatibility")
   735  			if inCompat {
   736  				out += t.S + "\n"
   737  			}
   738  		}
   739  		if inCompat && match(t, "Verdana", 9, "") || strings.Contains(t.S, "were introduced") {
   740  			out += t.S + "\n"
   741  		}
   742  	}
   743  	return out
   744  }
   745  
// processListing converts the tables collected for one instruction listing
// into instruction values and appends them to *insts.
//
// Every non-heading row of every mnemonic table becomes one instruction.
// The table heading decides how each cell is interpreted; operand
// encodings are looked up by their Op/En key in the listing's encoding
// tables. A series of textual fix-ups then repairs known typos and
// formatting inconsistencies in the opcode, syntax and CPUID fields.
// Instructions whose syntax is in instBlacklist are dropped.
//
// If a table has an unknown heading or a malformed cell, a diagnostic and
// a dump of all mnemonic tables are written to standard error and
// processing of the whole listing stops; instructions appended before the
// bad row are kept.
//
// With debugging enabled the raw tables are printed first, and with the
// -compat flag the listing's compatibility text is printed as a comment.
func processListing(p *listing, insts *[]*instruction) {
	if debugging {
		for _, table := range p.mtables {
			fmt.Printf("table:\n")
			for _, row := range table {
				fmt.Printf("%q\n", row)
			}
		}
		fmt.Printf("enctable:\n")
		for _, table := range p.enctables {
			for _, row := range table {
				fmt.Printf("%q\n", row)
			}
		}
		fmt.Printf("compat:\n%s", p.compat)
	}

	if *flagCompat && p.compat != "" {
		fmt.Printf("# p.%d: %s\n#\t%s\n", p.pageNum, p.name, strings.Replace(p.compat, "\n", "\n#\t", -1))
	}

	// Index the operand encodings by their Op/En key (first column),
	// skipping each table's heading row and trimming trailing filler cells.
	encs := make(map[string][]string)
	for _, table := range p.enctables {
		for _, row := range table[1:] {
			for len(row) > 1 && (row[len(row)-1] == "NA" || row[len(row)-1] == "" || row[len(row)-1] == " source") {
				row = row[:len(row)-1]
			}
			encs[row[0]] = row[1:]
		}
	}

	// wrong carries the diagnostic for the BadTable exit below.
	var wrong string
	for _, table := range p.mtables {
		heading := table[0]
		for _, row := range table[1:] {
			// Skip rows that merely repeat the heading.
			if row[0] == heading[0] && reflect.DeepEqual(row, heading) {
				continue
			}
			// Known bad row: the two mode cells were merged into one.
			if len(row) >= 5 && row[1] == "CMOVG r64, r/m64" && row[3] == "V/N.E." && row[4] == "NA" {
				row[3] = "V"
				row[4] = "N.E."
			}
			inst := new(instruction)
			inst.page = p.pageNum
			inst.compat = strings.Join(strings.Fields(p.compat), " ")
			for i, hdr := range heading {
				// x is the cell with wrapped lines joined by spaces; the
				// Opcode/Instruction case goes back to the raw cell because
				// it needs the line breaks.
				x := row[i]
				x = strings.Replace(x, "\n", " ", -1)
				switch strings.TrimSpace(hdr) {
				default:
					wrong = "unexpected header: " + strconv.Quote(hdr)
					goto BadTable
				case "Opcode/Instruction":
					// Combined cell: opcode on the first line, syntax on the
					// rest. First undo line breaks that fall inside the
					// opcode; the repaired text is written back to the row.
					x = row[i]
					if strings.HasPrefix(x, "\nVEX") {
						x = x[1:]
						row[i] = x
					}
					if strings.Contains(x, "\n/r ") {
						x = strings.Replace(x, "\n/r ", " /r ", -1)
						row[i] = x
					}
					if strings.Contains(x, ",\nimm") {
						x = strings.Replace(x, ",\nimm", ", imm", -1)
						row[i] = x
					}
					if strings.Count(x, "\n") < 1 {
						wrong = "bad Opcode/Instruction pairing: " + strconv.Quote(x)
						goto BadTable
					}
					// Note: this i shadows the column index within the case.
					i := strings.Index(x, "\n")
					inst.opcode = x[:i]
					inst.syntax = strings.Replace(x[i+1:], "\n", " ", -1)

				case "Opcode":
					inst.opcode = x

				case "Instruction":
					inst.syntax = x

				case "Op/En":
					inst.args = encs[x]
					// Fall back to the sole encoding "A" when the key is
					// not found in the encoding table.
					if inst.args == nil && len(encs) == 1 && encs["A"] != nil {
						inst.args = encs["A"]
					}
					// In the December 2015 manual, PREFETCHW says
					// encoding A but the table gives encoding M.
					if inst.args == nil && inst.syntax == "PREFETCHW m8" && x == "A" && len(encs) == 1 && encs["M"] != nil {
						inst.args = encs["M"]
					}

				case "64-Bit Mode":
					// On failure parseMode returns its input, so x still
					// holds the offending cell text for the message.
					x, ok := parseMode(x)
					if !ok {
						wrong = "unexpected value for 64-Bit Mode column: " + x
						goto BadTable
					}
					inst.valid64 = x

				case "Compat/Leg Mode":
					x, ok := parseMode(x)
					if !ok {
						wrong = "unexpected value for Compat/Leg Mode column: " + x
						goto BadTable
					}
					inst.valid32 = x

				case "64/32-Bit Mode":
					// Combined cell of the form "<64-bit>/<32-bit>".
					i := strings.Index(x, "/")
					if i < 0 {
						wrong = "unexpected value for 64/32-Bit Mode column: " + x
						goto BadTable
					}
					x1, ok1 := parseMode(x[:i])
					x2, ok2 := parseMode(x[i+1:])
					if !ok1 || !ok2 {
						wrong = "unexpected value for 64/32-Bit Mode column: " + x
						goto BadTable
					}
					inst.valid64 = x1
					inst.valid32 = x2

				case "CPUID Feature Flag":
					inst.cpuid = x

				case "Description":
					// Multiple Description columns are concatenated.
					if inst.desc != "" {
						inst.desc += " "
					}
					inst.desc += x
				}
			}

			// Fixup various typos or bugs in opcode descriptions.
			// The replacements are applied in sequence and some depend on
			// the result of earlier ones, so their order matters.
			if inst.opcode == "VEX.128.66.0F.W0 6E /" {
				inst.opcode += "r"
			}
			fix := func(old, new string) {
				inst.opcode = strings.Replace(inst.opcode, old, new, -1)
			}
			fix(" imm8", " ib")
			fix("REX.w", "REX.W")
			fix("REX.W+", "REX.W +")
			fix(" 0f ", " 0F ")
			fix(". 0F38", ".0F38")
			fix("0F .WIG", "0F.WIG")
			fix("0F38 .WIG", "0F38.WIG")
			fix("NDS .LZ", "NDS.LZ")
			fix("58+ r", "58+r")
			fix("B0+ ", "B0+")
			fix("B8+ ", "B8+")
			fix("40+ ", "40+")
			fix("*", "")
			fix(",", " ")
			fix("/", " /")
			fix("REX.W +", "REX.W")
			fix("REX +", "REX")
			fix("REX 0F BE", "REX.W 0F BE")
			fix("REX 0F B2", "REX.W 0F B2")
			fix("REX 0F B4", "REX.W 0F B4")
			fix("REX 0F B5", "REX.W 0F B5")
			fix("0F38.0", "0F38.W0")
			fix(".660F.", ".66.0F.")
			fix("VEX128", "VEX.128")
			fix("0F3A.W0.1D", "0F3A.W0 1D")

			// Collapse runs of whitespace left behind by the fix-ups.
			inst.opcode = strings.Join(strings.Fields(inst.opcode), " ")

			// Same idea for the assembly syntax.
			fix = func(old, new string) {
				inst.syntax = strings.Replace(inst.syntax, old, new, -1)
			}
			fix("xmm1 xmm2", "xmm1, xmm2")
			fix("r16/m16", "r/m16")
			fix("r32/m161", "r32/m16") // really r32/m16¹ (footnote)
			fix("r32/m32", "r/m32")
			fix("r64/m64", "r/m64")
			fix("\u2013", "-")
			fix("mm3 /m", "mm3/m")
			fix("mm3/.m", "mm3/m")
			// Round-trip through split/join to normalize argument spacing.
			inst.syntax = joinSyntax(splitSyntax(inst.syntax))

			// And for the CPUID feature flag.
			fix = func(old, new string) {
				inst.cpuid = strings.Replace(inst.cpuid, old, new, -1)
			}
			fix("PCLMUL- QDQ", "PCLMULQDQ")
			fix("PCL- MULQDQ", "PCLMULQDQ")
			fix("Both PCLMULQDQ and AVX flags", "PCLMULQDQ+AVX")

			if !instBlacklist[inst.syntax] {
				*insts = append(*insts, inst)
			}
		}
	}
	return

	// BadTable reports the problem recorded in wrong and dumps every
	// mnemonic table of the listing to help locate it.
BadTable:
	fmt.Fprintf(os.Stderr, "p.%d: reading %v: %v\n", p.pageNum, p.name, wrong)
	for _, table := range p.mtables {
		for _, t := range table {
			fmt.Fprintf(os.Stderr, "\t%q\n", t)
		}
	}
	fmt.Fprintf(os.Stderr, "\n")
}
   950  
   951  func parseMode(s string) (string, bool) {
   952  	switch strings.TrimSpace(s) {
   953  	case "Invalid", "Invalid*", "Inv.", "I", "i":
   954  		return "I", true
   955  	case "Valid", "Valid*", "V":
   956  		return "V", true
   957  	case "N.E.", "NE", "N. E.":
   958  		return "N.E.", true
   959  	case "N.P.", "N. P.":
   960  		return "N.P.", true
   961  	case "N.S.", "N. S.":
   962  		return "N.S.", true
   963  	case "N.I.", "N. I.":
   964  		return "N.I.", true
   965  	}
   966  	return s, false
   967  }
   968  
   969  func splitSyntax(syntax string) (op string, args []string) {
   970  	i := strings.Index(syntax, " ")
   971  	if i < 0 {
   972  		return syntax, nil
   973  	}
   974  	op, syntax = syntax[:i], syntax[i+1:]
   975  	args = strings.Split(syntax, ",")
   976  	for i, arg := range args {
   977  		arg = strings.TrimSpace(arg)
   978  		arg = strings.TrimRight(arg, "*")
   979  		args[i] = arg
   980  	}
   981  	return
   982  }
   983  
   984  func joinSyntax(op string, args []string) string {
   985  	if len(args) == 0 {
   986  		return op
   987  	}
   988  	return op + " " + strings.Join(args, ", ")
   989  }
   990
View as plain text