spec.go

Documentation: golang.org/x/arch/x86/x86spec

     1  // Copyright 2016 The Go Authors.  All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // X86spec reads the “Intel® 64 and IA-32 Architectures Software Developer's Manual”
     6  // to collect instruction encoding details and writes those details to standard output
     7  // in CSV format.
     8  //
     9  // Usage:
    10  //
    11  //	x86spec [-f file] [-u url] >x86.csv
    12  //
    13  // The -f flag specifies the input file (default x86manual.pdf), the Intel instruction
    14  // set reference manual in PDF form.
    15  // If the input file does not exist, it will be created by downloading the manual.
    16  //
    17  // The -u flag specifies the URL from which to download the manual
    18  // (default https://golang.org/s/x86manual, which redirects to Intel's site).
    19  // The URL is downloaded only when the file named by the -f flag is missing.
    20  //
    21  // There are additional debugging flags, not shown. Run x86spec -help for the list.
    22  //
    23  // # File Format
    24  //
    25  // TODO: Mention comments at top of file.
    26  // TODO: Mention that this is version 0.2 of the file.
    27  // TODO: Mention that file format will change incompatibly until version 1.0.
    28  //
    29  // Each CSV line contains these fields:
    30  //
    31  // 1. The Intel manual instruction mnemonic. For example, "SHR r/m32, imm8".
    32  //
    33  // 2. The Go assembler instruction mnemonic. For example, "SHRL imm8, r/m32".
    34  //
    35  // 3. The GNU binutils instruction mnemonic. For example, "shrl imm8, r/m32".
    36  //
    37  // 4. The instruction encoding. For example, "C1 /5 ib".
    38  //
    39  // 5. The validity of the instruction in 32-bit (aka compatibility, legacy) mode.
    40  //
    41  // 6. The validity of the instruction in 64-bit mode.
    42  //
    43  // 7. The CPUID feature flags that signal support for the instruction.
    44  //
    45  // 8. Additional comma-separated tags containing hints about the instruction.
    46  //
    47  // 9. The read/write actions of the instruction on the arguments used in
    48  // the Intel mnemonic. For example, "rw,r" to denote that "SHR r/m32, imm8"
    49  // reads and writes its first argument but only reads its second argument.
    50  //
    51  // 10. Whether the opcode used in the Intel mnemonic has encoding forms
    52  // distinguished only by operand size, like most arithmetic instructions.
    53  // The string "Y" indicates yes, the string "" indicates no.
    54  //
    55  // 11. The data size of the operation in bits. In general this is the size corresponding
    56  // to the Go and GNU assembler opcode suffix.
    57  //
    58  // The complete line used for the above examples is:
    59  //
    60  //	"SHR r/m32, imm8","SHRL imm8, r/m32","shrl imm8, r/m32","C1 /5 ib","V","V","","operand32","rw,r","Y","32"
    61  //
    62  // # Mnemonics
    63  //
    64  // The instruction mnemonics are as used in the Intel manual, with a few exceptions.
    65  //
    66  // Mnemonics claiming general memory forms but that really require fixed addressing modes
    67  // are omitted in favor of their equivalents with implicit arguments.
    68  // For example, "CMPS m16, m16" (really CMPS [SI], [DI]) is omitted in favor of "CMPSW".
    69  //
    70  // Instruction forms with an explicit REP, REPE, or REPNE prefix are also omitted.
    71  // Encoders and decoders are expected to handle those prefixes separately.
    72  //
    73  // Perhaps most significantly, the argument syntaxes used in the mnemonic indicate
    74  // exactly how to derive the argument from the instruction encoding, or vice versa.
    75  //
    76  // Immediate values: imm8, imm8u, imm16, imm16u, imm32, imm64.
    77  // Immediates are signed by default; the u suffix indicates an unsigned value.
    78  //
    79  // Memory operands. The forms m, m128, m14/28byte, m16, m16&16, m16&32, m16&64, m16:16, m16:32,
    80  // m16:64, m16int, m256, m2byte, m32, m32&32, m32fp, m32int, m512byte, m64, m64fp, m64int,
    81  // m8, m80bcd, m80dec, m80fp, m94/108byte. These operands always correspond to the
    82  // memory address specified by the r/m half of the modrm encoding.
    83  //
    84  // Integer registers.
    85  // The forms r8, r16, r32, r64 indicate a register selected by the modrm reg encoding.
    86  // The forms rmr16, rmr32, rmr64 indicate a register (never memory) selected by the modrm r/m encoding.
    87  // The forms r/m8, r/m16, r/m32, and r/m64 indicate a register or memory selected by the modrm r/m encoding.
    88  // Forms with two sizes, like r32/m16, also indicate a register or memory selected by the modrm r/m encoding,
    89  // but the size for a register argument differs from the size of a memory argument.
    90  // The forms r8V, r16V, r32V, r64V indicate a register selected by the VEX.vvvv bits.
    91  //
    92  // Multimedia registers.
    93  // The forms mm1, xmm1, and ymm1 indicate a multimedia register selected by the
    94  // modrm reg encoding.
    95  // The forms mm2, xmm2, and ymm2 indicate a register (never memory) selected by
    96  // the modrm r/m encoding.
    97  // The forms mm2/m64, xmm2/m128, and so on indicate a register or memory
    98  // selected by the modrm r/m encoding.
    99  // The forms xmmV and ymmV indicate a register selected by the VEX.vvvv bits.
   100  // The forms xmmI and ymmI indicate a register selected by the top four bits of an /is4 immediate byte.
   101  //
   102  // Bound registers.
   103  // The form bnd1 indicates a bound register selected by the modrm reg encoding.
   104  // The form bnd2 indicates a bound register (never memory) selected by the modrm r/m encoding.
   105  // The forms bnd2/m64 and bnd2/m128 indicate a register or memory selected by the modrm r/m encoding.
   106  // TODO: Describe mib.
   107  //
   108  // One-of-a-kind operands: rel8, rel16, rel32, ptr16:16, ptr16:32,
   109  // moffs8, moffs16, moffs32, moffs64, vm32x, vm32y, vm64x, and vm64y
   110  // are all as in the Intel manual.
   111  //
   112  // # Encodings
   113  //
   114  // The encodings are also as used in the Intel manual, with automated corrections.
   115  // For example, the Intel manual sometimes omits the modrm /r indicator or other trailing bytes,
   116  // and it also contains typographical errors.
   117  // These problems are corrected so that the CSV data may be used to generate
   118  // tools for processing x86 machine code.
   119  // See https://golang.org/x/arch/x86/x86map for one such generator.
   120  //
   121  // # Valid32 and Valid64
   122  //
   123  // These columns hold validity abbreviations as defined in the Intel manual:
   124  // V, I, N.E., N.P., N.S., or N.I.
   125  // Tools processing the data are typically only concerned with whether the
   126  // column is "V" (valid) or not.
   127  // This data is also corrected compared to the manual.
   128  // For example, the manual lists many instruction forms using REX bytes
   129  // with an incorrect "V" in the Valid32 column.
   130  //
   131  // # CPUID Feature Flags
   132  //
   133  // This column specifies CPUID feature flags that must be present in order
   134  // to use the instruction. If multiple flags are required,
   135  // they are listed separated by plus signs, as in PCLMULQDQ+AVX.
   136  // The column can also list one of the values 486, Pentium, PentiumII, and P6,
   137  // indicating that the instruction was introduced on that architecture version.
   138  //
   139  // # Tags
   140  //
   141  // The tag column does not correspond to a traditional column in the Intel manual tables.
   142  // Instead, it is itself a comma-separated list of tags or hints derived by analysis
   143  // of the instruction set or the instruction encodings.
   144  //
   145  // The tags address16, address32, and address64 indicate that the instruction form
   146  // applies when using the specified addressing size. It may therefore be necessary to use an
   147  // address size prefix byte to access the instruction.
   148  // If two address tags are listed, the instruction can be used with either of those
   149  // address sizes. An instruction will never list all three address sizes.
   150  // (In fact, today, no instruction lists two address sizes, but that may change.)
   151  //
   152  // The tags operand16, operand32, and operand64 indicate that the instruction form
   153  // applies when using the specified operand size. It may therefore be necessary to use an
   154  // operand size prefix byte to access the instruction.
   155  // If two operand tags are listed, the instruction can be used with either of those
   156  // operand sizes. An instruction will never list all three operand sizes.
   157  //
   158  // The tags modrm_regonly or modrm_memonly indicate that the modrm byte's
   159  // r/m encoding must specify a register or memory, respectively.
   160  // Especially in newer instructions, the modrm constraint may be the only way
   161  // to distinguish two instruction forms. For example the MOVHLPS and MOVLPS
   162  // instructions share the same encoding, except that the former requires the
   163  // modrm byte's r/m to indicate a register, while the latter requires it to indicate memory.
   164  //
   165  // The tags pseudo and pseudo64 indicate that this instruction form is redundant
   166  // with others listed in the table and should be ignored when generating disassembly
   167  // or instruction scanning programs. The pseudo64 tag is reserved for the case where
   168  // the manual lists an instruction twice, once with the optional 64-bit mode REX byte.
   169  // Since most decoders will handle the REX byte separately, the form with the
   170  // unnecessary REX is tagged pseudo64.
   171  //
   172  // # Corrections and Additions
   173  //
   174  // The x86spec program makes various corrections to the Intel manual data
   175  // as part of extracting the information. Those corrections are described above.
   176  //
   177  // The x86spec program also adds a few well-known undocumented instructions,
   178  // such as UD1 and FFREEP.
   179  //
   180  // # Examples
   181  //
   182  // The latest version of the CSV file is available in this Git repository and also
   183  // online at https://golang.org/s/x86.csv. It is meant to be human-readable for
   184  // quick reference and also to be input for generating tools that operate on
   185  // x86 machine code.
   186  //
   187  // To print instruction syntaxes introduced by the Pentium II and P6,
   188  // using https://rsc.io/csv2tsv to prepare the table for processing by awk:
   189  //
   190  //	csv2tsv x86.csv | awk -F'\t' '$7 == "PentiumII" || $7 == "P6" { print $1 }'
   191  //
   192  // The x86map program (https://golang.org/x/arch/x86/x86map)
   193  // reads the CSV file and generates an x86 instruction decoder in the form
   194  // of a simple byte-code program. This decoder is the core of the disassembler
   195  // in the x86asm package (https://golang.org/x/arch/x86/x86asm).
   196  package main
   197  
   198  import (
   199  	"bufio"
   200  	"flag"
   201  	"fmt"
   202  	"io"
   203  	"log"
   204  	"net/http"
   205  	"os"
   206  	"sort"
   207  	"strings"
   208  )
   209  
   210  const (
   211  	specFormatVersion = "0.2"
   212  )
   213  
   214  var (
   215  	flagDebugPage = flag.String("debugpage", "", "debug page `n` of the manual (can be comma-separated list)")
   216  	flagURL       = flag.String("u", "https://golang.org/s/x86manual", "use `url` for download if needed")
   217  	flagFile      = flag.String("f", "x86manual.pdf", "read manual from `file`, downloading if necessary")
   218  	flagCompat    = flag.Bool("compat", false, "print compatibility statements")
   219  
   220  	debugging     bool
   221  	onlySomePages bool
   222  )
   223  
// An instruction holds the details collected for one instruction form.
// Most fields correspond to a column of the CSV output produced by write;
// the CSV field numbers below follow the list in the package comment.
type instruction struct {
	page      int      // NOTE(review): presumably the manual page the form was read from; confirm in parse
	opcode    string   // instruction encoding (CSV field 4)
	syntax    string   // Intel manual mnemonic (CSV field 1)
	valid64   string   // validity in 64-bit mode (CSV field 6)
	valid32   string   // validity in 32-bit mode (CSV field 5)
	cpuid     string   // CPUID feature flags (CSV field 7)
	desc      string   // not written by write; presumably the manual's description text (confirm)
	tags      []string // tags, joined with commas into CSV field 8
	args      []string // NOTE(review): presumably the arguments parsed from syntax; confirm
	seq       int      // for use by cleanup
	compat    string   // NOTE(review): likely used with the -compat flag; confirm
	action    string   // read/write actions of the arguments (CSV field 9)
	multisize string   // multiple-size indicator, "Y" or "" (CSV field 10)
	datasize  int      // data size in bits (CSV field 11); zero is written as an empty field
	gnuSyntax string   // GNU binutils mnemonic (CSV field 3)
	goSyntax  string   // Go assembler mnemonic (CSV field 2)
}
   242  
   243  func main() {
   244  	log.SetFlags(0)
   245  	log.SetPrefix("x86spec: ")
   246  	flags()
   247  	download()
   248  	insts := parse()
   249  	insts = cleanup(insts)
   250  	format(insts)
   251  	sort.Sort(bySyntax(insts))
   252  	write(os.Stdout, insts)
   253  }
   254  
   255  func flags() {
   256  	flag.Usage = func() {
   257  		fmt.Fprintf(os.Stderr, "usage: x86spec [options]\n")
   258  		flag.PrintDefaults()
   259  		os.Exit(2)
   260  	}
   261  	flag.Parse()
   262  	if flag.NArg() != 0 {
   263  		flag.Usage()
   264  	}
   265  	debugging = *flagDebugPage != ""
   266  	onlySomePages = *flagDebugPage != ""
   267  }
   268  
   269  func download() {
   270  	_, err := os.Stat(*flagFile)
   271  	if !os.IsNotExist(err) {
   272  		return
   273  	}
   274  
   275  	// Try downloading.
   276  	log.Printf("downloading manual to %s", *flagFile)
   277  	resp, err := http.Get(*flagURL)
   278  	if err != nil {
   279  		log.Fatal(err)
   280  	}
   281  	if resp.StatusCode != 200 {
   282  		log.Fatal(resp.Status)
   283  	}
   284  	f, err := os.Create(*flagFile)
   285  	if err != nil {
   286  		log.Fatal(err)
   287  	}
   288  	_, err = io.Copy(f, resp.Body)
   289  	if err != nil {
   290  		log.Fatal(err)
   291  	}
   292  	if err := f.Close(); err != nil {
   293  		log.Fatal(err)
   294  	}
   295  }
   296  
   297  func write(w io.Writer, insts []*instruction) {
   298  	bw := bufio.NewWriter(w)
   299  	defer bw.Flush()
   300  	for _, inst := range insts {
   301  		datasize := ""
   302  		if inst.datasize != 0 {
   303  			datasize = fmt.Sprint(inst.datasize)
   304  		}
   305  		writeCSV(bw, inst.syntax, inst.goSyntax, inst.gnuSyntax, inst.opcode, inst.valid32, inst.valid64, inst.cpuid, strings.Join(inst.tags, ","), inst.action, inst.multisize, datasize)
   306  	}
   307  }
   308  
   309  // Note: not using encoding/csv because we want the CSV to use quotes always,
   310  // so that it is a little easier to process with non-CSV tools like grep,
   311  // but the encoding/csv package does not have an "always quote" writing mode.
   312  func writeCSV(w io.Writer, args ...string) {
   313  	for i, arg := range args {
   314  		if i > 0 {
   315  			fmt.Fprintf(w, ",")
   316  		}
   317  		fmt.Fprintf(w, `"%s"`, strings.Replace(arg, `"`, `""`, -1))
   318  	}
   319  	fmt.Fprintf(w, "\n")
   320  }
   321
View as plain text