spec.go

Documentation: golang.org/x/arch/arm/armspec

     1  // Copyright 2014 The Go Authors.  All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Armspec reads the “ARM Architecture Reference Manual”
     6  // to collect instruction encoding details and writes those details to standard output
     7  // in JSON format.
     8  //
     9  // # Warning Warning Warning
    10  //
    11  // This program is unfinished. It is being published in this incomplete form
    12  // for interested readers, but do not expect it to be runnable or useful.
    13  package main
    14  
    15  import (
    16  	"bufio"
    17  	"bytes"
    18  	"encoding/json"
    19  	"fmt"
    20  	"log"
    21  	"math"
    22  	"os"
    23  	"regexp"
    24  	"sort"
    25  	"strconv"
    26  	"strings"
    27  
    28  	"rsc.io/pdf"
    29  )
    30  
    31  type Inst struct {
    32  	Name   string
    33  	ID     string
    34  	Bits   string
    35  	Arch   string
    36  	Syntax []string
    37  	Code   string
    38  }
    39  
// debugPage selects a single page to process while debugging.
// When it is greater than zero, main skips every other page, parsePage and
// readBitBox print extra diagnostics to standard output, and main does not
// report missing instructions. Zero processes the whole document.
const debugPage = 0

// stdout is the buffered writer around os.Stdout that main uses for the JSON output.
var stdout *bufio.Writer
    43  
    44  func main() {
    45  	log.SetFlags(0)
    46  	log.SetPrefix("armspec: ")
    47  
    48  	if len(os.Args) != 2 {
    49  		fmt.Fprintf(os.Stderr, "usage: armspec file.pdf\n")
    50  		os.Exit(2)
    51  	}
    52  
    53  	f, err := pdf.Open(os.Args[1])
    54  	if err != nil {
    55  		log.Fatal(err)
    56  	}
    57  
    58  	// Find instruction set reference in outline, to build instruction list.
    59  	instList := instHeadings(f.Outline())
    60  	if len(instList) < 200 {
    61  		log.Fatalf("only found %d instructions in table of contents", len(instList))
    62  	}
    63  
    64  	stdout = bufio.NewWriter(os.Stdout)
    65  	fmt.Fprintf(stdout, "[")
    66  	numTable := 0
    67  	defer stdout.Flush()
    68  
    69  	// Scan document looking for instructions.
    70  	// Must find exactly the ones in the outline.
    71  	n := f.NumPage()
    72  PageLoop:
    73  	for pageNum := 1; pageNum <= n; pageNum++ {
    74  		if debugPage > 0 && pageNum != debugPage {
    75  			continue
    76  		}
    77  		if pageNum > 1127 {
    78  			break
    79  		}
    80  		p := f.Page(pageNum)
    81  		name, table := parsePage(pageNum, p)
    82  		if name == "" {
    83  			continue
    84  		}
    85  		if len(table) < 1 {
    86  			if false {
    87  				fmt.Fprintf(os.Stderr, "no encodings for instruction %q (page %d)\n", name, pageNum)
    88  			}
    89  			continue
    90  		}
    91  		for _, inst := range table {
    92  			if numTable > 0 {
    93  				fmt.Fprintf(stdout, ",")
    94  			}
    95  			numTable++
    96  			js, _ := json.Marshal(inst)
    97  			fmt.Fprintf(stdout, "\n%s", jsFix.Replace(string(js)))
    98  		}
    99  		for j, headline := range instList {
   100  			if name == headline {
   101  				instList[j] = ""
   102  				continue PageLoop
   103  			}
   104  		}
   105  		fmt.Fprintf(os.Stderr, "unexpected instruction %q (page %d)\n", name, pageNum)
   106  	}
   107  
   108  	fmt.Fprintf(stdout, "\n]\n")
   109  	stdout.Flush()
   110  
   111  	if debugPage == 0 {
   112  		for _, headline := range instList {
   113  			if headline != "" {
   114  				switch headline {
   115  				default:
   116  					fmt.Fprintf(os.Stderr, "missing instruction %q\n", headline)
   117  				case "CHKA": // ThumbEE
   118  				case "CPS": // system instruction
   119  				case "CPY": // synonym for MOV
   120  				case "ENTERX": // ThumbEE
   121  				case "F* (former VFP instruction mnemonics)": // synonyms
   122  				case "HB, HBL, HBLP, HBP": // ThumbEE
   123  				case "LEAVEX": // ThumbEE
   124  				case "MOV (shifted register)": // pseudo instruction for ASR, LSL, LSR, ROR, and RRX
   125  				case "NEG": // synonym for RSB
   126  				case "RFE": // system instruction
   127  				case "SMC (previously SMI)": // system instruction
   128  				case "SRS": // system instruction
   129  				case "SUBS PC, LR and related instructions": // system instruction
   130  				case "VAND (immediate)": // pseudo instruction
   131  				case "VCLE (register)": // pseudo instruction
   132  				case "VCLT (register)": // pseudo instruction
   133  				case "VORN (immediate)": // pseudo instruction
   134  				}
   135  			}
   136  		}
   137  	}
   138  }
   139  
   140  func instHeadings(outline pdf.Outline) []string {
   141  	return appendInstHeadings(outline, nil)
   142  }
   143  
   144  var instRE = regexp.MustCompile(`A[\d.]+ Alphabetical list of instructions`)
   145  var childRE = regexp.MustCompile(`A[\d.]+ (.+)`)
   146  var sectionRE = regexp.MustCompile(`^A[\d.]+$`)
   147  var bitRE = regexp.MustCompile(`^( |[01]|\([01]\))*$`)
   148  
   149  func appendInstHeadings(outline pdf.Outline, list []string) []string {
   150  	if instRE.MatchString(outline.Title) {
   151  		for _, child := range outline.Child {
   152  			m := childRE.FindStringSubmatch(child.Title)
   153  			if m == nil {
   154  				fmt.Fprintf(os.Stderr, "cannot parse section title: %s\n", child.Title)
   155  				continue
   156  			}
   157  			list = append(list, m[1])
   158  		}
   159  	}
   160  	for _, child := range outline.Child {
   161  		list = appendInstHeadings(child, list)
   162  	}
   163  	return list
   164  }
   165  
// inch is the length of one inch in the coordinate units that parsePage
// compares text positions in (72 units per inch).
const inch = 72.0
   167  
   168  func parsePage(num int, p pdf.Page) (name string, table []Inst) {
   169  	content := p.Content()
   170  
   171  	var text []pdf.Text
   172  	for _, t := range content.Text {
   173  		if match(t, "Times-Roman", 7.2, "") {
   174  			t.FontSize = 9
   175  		}
   176  		if match(t, "Times-Roman", 6.72, "") && '0' <= t.S[0] && t.S[0] <= '9' {
   177  			t.S = string([]rune("⁰¹²³⁴⁵⁶⁷⁸⁹")[t.S[0]-'0'])
   178  			t.FontSize = 9
   179  			t.Y -= 2.28
   180  		}
   181  		if t.Font == "Gen_Arial" {
   182  			continue
   183  		}
   184  		text = append(text, t)
   185  	}
   186  
   187  	text = findWords(text)
   188  
   189  	for i, t := range text {
   190  		if t.Font == "Times" {
   191  			t.Font = "Times-Roman"
   192  			text[i] = t
   193  		}
   194  	}
   195  
   196  	if debugPage > 0 {
   197  		for _, t := range text {
   198  			fmt.Println(t)
   199  		}
   200  		for _, r := range content.Rect {
   201  			fmt.Println(r)
   202  		}
   203  	}
   204  
   205  	// Remove text we should ignore.
   206  	out := text[:0]
   207  	skip := false
   208  	for _, t := range text {
   209  		// skip page footer
   210  		if match(t, "Helvetica", 8, "A") || match(t, "Helvetica", 8, "ARM DDI") || match(t, "Helvetica-Oblique", 8, "Copyright") {
   211  			continue
   212  		}
   213  		// skip section header and body text
   214  		if match(t, "Helvetica-Bold", 12, "") && (sectionRE.MatchString(t.S) || t.S == "Alphabetical list of instructions") {
   215  			skip = true
   216  			continue
   217  		}
   218  		if skip && match(t, "Times-Roman", 9, "") {
   219  			continue
   220  		}
   221  		skip = false
   222  		out = append(out, t)
   223  	}
   224  	text = out
   225  
   226  	// Page header must say Instruction Details.
   227  	if len(text) == 0 || !match(text[0], "Helvetica-Oblique", 8, "Instruction Details") && !match(text[0], "Times-Roman", 9, "Instruction Details") {
   228  		return "", nil
   229  	}
   230  	text = text[1:]
   231  
   232  	isSection := func(text []pdf.Text, i int) int {
   233  		if i+2 <= len(text) && match(text[i], "Helvetica-Bold", 10, "") && sectionRE.MatchString(text[i].S) && match(text[i+1], "Helvetica-Bold", 10, "") {
   234  			return 2
   235  		}
   236  		if i+1 <= len(text) && match(text[i], "Helvetica-Bold", 10, "") && childRE.MatchString(text[i].S) {
   237  			return 1
   238  		}
   239  		return 0
   240  	}
   241  
   242  	// Skip dummy headlines and sections.
   243  	for d := isSection(text, 0); d != 0; d = isSection(text, 0) {
   244  		i := d
   245  		for i < len(text) && !match(text[i], "Helvetica-Bold", 9, "Encoding") && !match(text[i], "Helvetica-Bold", 10, "") {
   246  			i++
   247  		}
   248  		if isSection(text, i) == 0 {
   249  			break
   250  		}
   251  		text = text[i:]
   252  	}
   253  
   254  	// Next line is headline. Can wrap to multiple lines.
   255  	d := isSection(text, 0)
   256  	if d == 0 {
   257  		if debugPage > 0 {
   258  			fmt.Printf("non-inst-headline: %v\n", text[0])
   259  		}
   260  		checkNoEncodings(num, text)
   261  		return "", nil
   262  	}
   263  	if d == 2 {
   264  		name = text[1].S
   265  		text = text[2:]
   266  	} else if d == 1 {
   267  		m := childRE.FindStringSubmatch(text[0].S)
   268  		name = m[1]
   269  		text = text[1:]
   270  	}
   271  	for len(text) > 0 && match(text[0], "Helvetica-Bold", 10, "") {
   272  		name += " " + text[0].S
   273  		text = text[1:]
   274  	}
   275  
   276  	// Skip description.
   277  	for len(text) > 0 && (match(text[0], "Times-Roman", 9, "") || match(text[0], "LucidaSansTypewriteX", 6.48, "") || match(text[0], "Times-Bold", 10, "Note")) {
   278  		text = text[1:]
   279  	}
   280  
   281  	// Encodings follow.
   282  	warned := false
   283  	for i := 0; i < len(text); {
   284  		if match(text[i], "Helvetica-Bold", 10, "Assembler syntax") ||
   285  			match(text[i], "Helvetica-Bold", 9, "Modified operation in ThumbEE") ||
   286  			match(text[i], "Helvetica-Bold", 9, "Unallocated memory hints") ||
   287  			match(text[i], "Helvetica-Bold", 9, "Related encodings") ||
   288  			match(text[i], "Times-Roman", 9, "Figure A") ||
   289  			match(text[i], "Helvetica-Bold", 9, "Table A") ||
   290  			match(text[i], "Helvetica-Bold", 9, "VFP Instructions") ||
   291  			match(text[i], "Helvetica-Bold", 9, "VFP instructions") ||
   292  			match(text[i], "Helvetica-Bold", 9, "VFP vectors") ||
   293  			match(text[i], "Helvetica-Bold", 9, "FLDMX") ||
   294  			match(text[i], "Helvetica-Bold", 9, "FSTMX") ||
   295  			match(text[i], "Helvetica-Bold", 9, "Advanced SIMD and VFP") {
   296  			checkNoEncodings(num, text[i:])
   297  			break
   298  		}
   299  		if match(text[i], "Helvetica-Bold", 9, "Figure A") {
   300  			y := text[i].Y
   301  			i++
   302  			for i < len(text) && math.Abs(text[i].Y-y) < 2 {
   303  				i++
   304  			}
   305  			continue
   306  		}
   307  		if !match(text[i], "Helvetica-Bold", 9, "Encoding") {
   308  			if !warned {
   309  				warned = true
   310  				fmt.Fprintln(os.Stderr, "page", num, ": unexpected:", text[i])
   311  			}
   312  			i++
   313  			continue
   314  		}
   315  		inst := Inst{
   316  			Name: name,
   317  		}
   318  		enc := text[i].S
   319  		x := text[i].X
   320  		i++
   321  		// Possible subarchitecture notes.
   322  		for i < len(text) && text[i].X > x+36 {
   323  			if inst.Arch != "" {
   324  				inst.Arch += " "
   325  			}
   326  			inst.Arch += text[i].S
   327  			i++
   328  		}
   329  		// Encoding syntaxes.
   330  		for i < len(text) && (match(text[i], "LucidaSansTypewriteX", 6.48, "") || text[i].X > x+36) {
   331  			if text[i].X < x+0.25*inch {
   332  				inst.Syntax = append(inst.Syntax, text[i].S)
   333  			} else {
   334  				s := inst.Syntax[len(inst.Syntax)-1]
   335  				if !strings.Contains(s, "\t") {
   336  					s += "\t"
   337  				} else {
   338  					s += " "
   339  				}
   340  				s += text[i].S
   341  				inst.Syntax[len(inst.Syntax)-1] = s
   342  			}
   343  			i++
   344  		}
   345  
   346  		var bits, abits, aenc string
   347  		bits, i = readBitBox(inst.Name, inst.Syntax, content, text, i)
   348  		if strings.Contains(enc, " / ") {
   349  			if i < len(text) && match(text[i], "Times-Roman", 8, "") {
   350  				abits, i = readBitBox(inst.Name, inst.Syntax, content, text, i)
   351  			} else {
   352  				abits = bits
   353  			}
   354  			slash := strings.Index(enc, " / ")
   355  			aenc = "Encoding " + enc[slash+len(" / "):]
   356  			enc = enc[:slash]
   357  		}
   358  
   359  		// pseudocode
   360  		y0 := -1 * inch
   361  		tab := 0.0
   362  		for i < len(text) && match(text[i], "LucidaSansTypewriteX", 6.48, "") {
   363  			t := text[i]
   364  			i++
   365  			if math.Abs(t.Y-y0) < 3 {
   366  				// same line as last fragment, probably just two spaces
   367  				inst.Code += " " + t.S
   368  				continue
   369  			}
   370  			if inst.Code != "" {
   371  				inst.Code += "\n"
   372  			}
   373  			if t.X > x+0.1*inch {
   374  				if tab == 0 {
   375  					tab = t.X - x
   376  				}
   377  				inst.Code += strings.Repeat("\t", int((t.X-x)/tab+0.5))
   378  			} else {
   379  				tab = 0
   380  			}
   381  			inst.Code += t.S
   382  			y0 = t.Y
   383  		}
   384  
   385  		inst.ID = strings.TrimPrefix(enc, "Encoding ")
   386  		inst.Bits = bits
   387  		table = append(table, inst)
   388  		if abits != "" {
   389  			inst.ID = strings.TrimPrefix(aenc, "Encoding ")
   390  			inst.Bits = abits
   391  			table = append(table, inst)
   392  		}
   393  
   394  	}
   395  	return name, table
   396  }
   397  
   398  func readBitBox(name string, syntax []string, content pdf.Content, text []pdf.Text, i int) (string, int) {
   399  	// bit headings
   400  	y2 := 0.0
   401  	x1 := 0.0
   402  	x2 := 0.0
   403  	for i < len(text) && match(text[i], "Times-Roman", 8, "") {
   404  		if y2 == 0 {
   405  			y2 = text[i].Y
   406  		}
   407  		if x1 == 0 {
   408  			x1 = text[i].X
   409  		}
   410  		i++
   411  	}
   412  	// bit fields in box
   413  	y1 := 0.0
   414  	dy1 := 0.0
   415  	for i < len(text) && match(text[i], "Times-Roman", 9, "") {
   416  		if x2 < text[i].X+text[i].W {
   417  			x2 = text[i].X + text[i].W
   418  		}
   419  		y1 = text[i].Y
   420  		dy1 = text[i].FontSize
   421  		i++
   422  	}
   423  
   424  	if debugPage > 0 {
   425  		fmt.Println("encoding box", x1, y1, x2, y2)
   426  	}
   427  
   428  	// Find lines (thin rectangles) separating bit fields.
   429  	var bottom, top pdf.Rect
   430  	const (
   431  		yMargin = 0.25 * 72
   432  		xMargin = 2 * 72
   433  	)
   434  	for _, r := range content.Rect {
   435  		if r.Max.Y-r.Min.Y < 2 && x1-xMargin < r.Min.X && r.Min.X < x1 && x2 < r.Max.X && r.Max.X < x2+xMargin {
   436  			if y1-yMargin < r.Min.Y && r.Min.Y < y1 {
   437  				bottom = r
   438  			}
   439  			if y1+dy1 < r.Min.Y && r.Min.Y < y2 {
   440  				top = r
   441  			}
   442  		}
   443  	}
   444  
   445  	if debugPage > 0 {
   446  		fmt.Println("top", top, "bottom", bottom)
   447  	}
   448  
   449  	const ε = 0.1 * 72
   450  	var bars []pdf.Rect
   451  	for _, r := range content.Rect {
   452  		if r.Max.X-r.Min.X < 2 && math.Abs(r.Min.Y-bottom.Min.Y) < ε && math.Abs(r.Max.Y-top.Min.Y) < ε {
   453  			bars = append(bars, r)
   454  		}
   455  	}
   456  	sort.Sort(RectHorizontal(bars))
   457  
   458  	// There are 16-bit and 32-bit encodings.
   459  	// In practice, they are about 2.65 and 5.3 inches wide, respectively.
   460  	// Use 4 inches as a cutoff.
   461  	nbit := 32
   462  	dx := top.Max.X - top.Min.X
   463  	if top.Max.X-top.Min.X < 4*72 {
   464  		nbit = 16
   465  	}
   466  
   467  	total := 0
   468  	var buf bytes.Buffer
   469  	for i := 0; i < len(bars)-1; i++ {
   470  		if i > 0 {
   471  			fmt.Fprintf(&buf, "|")
   472  		}
   473  		var sub []pdf.Text
   474  		x1, x2 := bars[i].Min.X, bars[i+1].Min.X
   475  		for _, t := range content.Text {
   476  			tx := t.X + t.W/2
   477  			ty := t.Y + t.FontSize/2
   478  			if x1 < tx && tx < x2 && y1 < ty && ty < y2 {
   479  				sub = append(sub, t)
   480  			}
   481  		}
   482  		var str []string
   483  		for _, t := range findWords(sub) {
   484  			str = append(str, t.S)
   485  		}
   486  		s := strings.Join(str, " ")
   487  		s = strings.Replace(s, ")(", ") (", -1)
   488  		n := len(strings.Fields(s))
   489  		b := int(float64(nbit)*(x2-x1)/dx + 0.5)
   490  		if n == b {
   491  			for j, f := range strings.Fields(s) {
   492  				if j > 0 {
   493  					fmt.Fprintf(&buf, "|")
   494  				}
   495  				fmt.Fprintf(&buf, "%s", f)
   496  			}
   497  		} else {
   498  			if n != 1 {
   499  				fmt.Fprintf(os.Stderr, "%s - %s - multi-field %d-bit encoding: %s\n", name, syntax, n, s)
   500  			}
   501  			fmt.Fprintf(&buf, "%s:%d", s, b)
   502  		}
   503  		total += b
   504  	}
   505  
   506  	if total != nbit || total == 0 {
   507  		fmt.Fprintf(os.Stderr, "%s - %s - %d-bit encoding\n", name, syntax, total)
   508  	}
   509  	return buf.String(), i
   510  }
   511  
   512  type RectHorizontal []pdf.Rect
   513  
   514  func (x RectHorizontal) Swap(i, j int)      { x[i], x[j] = x[j], x[i] }
   515  func (x RectHorizontal) Less(i, j int) bool { return x[i].Min.X < x[j].Min.X }
   516  func (x RectHorizontal) Len() int           { return len(x) }
   517  
   518  func checkNoEncodings(num int, text []pdf.Text) {
   519  	for _, t := range text {
   520  		if match(t, "Helvetica-Bold", 9, "Encoding") {
   521  			fmt.Fprintf(os.Stderr, "page %d: unexpected encoding: %s\n", num, t.S)
   522  		}
   523  	}
   524  }
   525  
   526  func match(t pdf.Text, font string, size float64, substr string) bool {
   527  	return t.Font == font && math.Abs(t.FontSize-size) < 0.1 && strings.Contains(t.S, substr)
   528  }
   529  
   530  func findWords(chars []pdf.Text) (words []pdf.Text) {
   531  	// Sort by Y coordinate and normalize.
   532  	const nudge = 1
   533  	sort.Sort(pdf.TextVertical(chars))
   534  	old := -100000.0
   535  	for i, c := range chars {
   536  		if c.Y != old && math.Abs(old-c.Y) < nudge {
   537  			chars[i].Y = old
   538  		} else {
   539  			old = c.Y
   540  		}
   541  	}
   542  
   543  	// Sort by Y coordinate, breaking ties with X.
   544  	// This will bring letters in a single word together.
   545  	sort.Sort(pdf.TextVertical(chars))
   546  
   547  	// Loop over chars.
   548  	for i := 0; i < len(chars); {
   549  		// Find all chars on line.
   550  		j := i + 1
   551  		for j < len(chars) && chars[j].Y == chars[i].Y {
   552  			j++
   553  		}
   554  		var end float64
   555  		// Split line into words (really, phrases).
   556  		for k := i; k < j; {
   557  			ck := &chars[k]
   558  			s := ck.S
   559  			end = ck.X + ck.W
   560  			charSpace := ck.FontSize / 6
   561  			wordSpace := ck.FontSize * 2 / 3
   562  			l := k + 1
   563  			for l < j {
   564  				// Grow word.
   565  				cl := &chars[l]
   566  				if sameFont(cl.Font, ck.Font) && math.Abs(cl.FontSize-ck.FontSize) < 0.1 && cl.X <= end+charSpace {
   567  					s += cl.S
   568  					end = cl.X + cl.W
   569  					l++
   570  					continue
   571  				}
   572  				// Add space to phrase before next word.
   573  				if sameFont(cl.Font, ck.Font) && math.Abs(cl.FontSize-ck.FontSize) < 0.1 && cl.X <= end+wordSpace {
   574  					s += " " + cl.S
   575  					end = cl.X + cl.W
   576  					l++
   577  					continue
   578  				}
   579  				break
   580  			}
   581  			f := ck.Font
   582  			f = strings.TrimSuffix(f, ",Italic")
   583  			f = strings.TrimSuffix(f, "-Italic")
   584  			words = append(words, pdf.Text{f, ck.FontSize, ck.X, ck.Y, end - ck.X, s})
   585  			k = l
   586  		}
   587  		i = j
   588  	}
   589  
   590  	return words
   591  }
   592  
   593  func sameFont(f1, f2 string) bool {
   594  	f1 = strings.TrimSuffix(f1, ",Italic")
   595  	f1 = strings.TrimSuffix(f1, "-Italic")
   596  	f2 = strings.TrimSuffix(f1, ",Italic")
   597  	f2 = strings.TrimSuffix(f1, "-Italic")
   598  	return strings.TrimSuffix(f1, ",Italic") == strings.TrimSuffix(f2, ",Italic") || f1 == "Symbol" || f2 == "Symbol" || f1 == "TimesNewRoman" || f2 == "TimesNewRoman"
   599  }
   600  
// jsFix is applied by main to each marshaled JSON object before it is written.
// Every replacement pair below is commented out, so the Replacer is built
// with no pairs. The disabled pairs would rewrite the listed \uXXXX escape
// sequences into the shorter forms shown next to them.
var jsFix = strings.NewReplacer(
// `\u003c`, `<`,
// `\u003e`, `>`,
// `\u0026`, `&`,
// `\u0009`, `\t`,
)
   607  
// printTable is an unimplemented stub: it ignores name and table.
// Its only statement references strconv.Atoi, which is the sole use of the
// strconv import in this file.
func printTable(name string, table []Inst) {
	_ = strconv.Atoi
}
   611
View as plain text