1 // Copyright 2016 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // X86spec reads the “Intel® 64 and IA-32 Architectures Software Developer's Manual” 6 // to collect instruction encoding details and writes those details to standard output 7 // in CSV format. 8 // 9 // Usage: 10 // 11 // x86spec [-f file] [-u url] >x86.csv 12 // 13 // The -f flag specifies the input file (default x86manual.pdf), the Intel instruction 14 // set reference manual in PDF form. 15 // If the input file does not exist, it will be created by downloading the manual. 16 // 17 // The -u flag specifies the URL from which to download the manual 18 // (default https://golang.org/s/x86manual, which redirects to Intel's site). 19 // The URL is downloaded only when the file named by the -f flag is missing. 20 // 21 // There are additional debugging flags, not shown. Run x86spec -help for the list. 22 // 23 // # File Format 24 // 25 // TODO: Mention comments at top of file. 26 // TODO: Mention that this is version 0.2 of the file. 27 // TODO: Mention that file format will change incompatibly until version 1.0. 28 // 29 // Each CSV line contains these fields: 30 // 31 // 1. The Intel manual instruction mnemonic. For example, "SHR r/m32, imm8". 32 // 33 // 2. The Go assembler instruction mnemonic. For example, "SHRL imm8, r/m32". 34 // 35 // 3. The GNU binutils instruction mnemonic. For example, "shrl imm8, r/m32". 36 // 37 // 4. The instruction encoding. For example, "C1 /5 ib". 38 // 39 // 5. The validity of the instruction in 32-bit (aka compatibility, legacy) mode. 40 // 41 // 6. The validity of the instruction in 64-bit mode. 42 // 43 // 7. The CPUID feature flags that signal support for the instruction. 44 // 45 // 8. Additional comma-separated tags containing hints about the instruction. 46 // 47 // 9. The read/write actions of the instruction on the arguments used in 48 // the Intel mnemonic. 
For example, "rw,r" to denote that "SHR r/m32, imm8" 49 // reads and writes its first argument but only reads its second argument. 50 // 51 // 10. Whether the opcode used in the Intel mnemonic has encoding forms 52 // distinguished only by operand size, like most arithmetic instructions. 53 // The string "Y" indicates yes, the string "" indicates no. 54 // 55 // 11. The data size of the operation in bits. In general this is the size corresponding 56 // to the Go and GNU assembler opcode suffix. 57 // 58 // The complete line used for the above examples is: 59 // 60 // "SHR r/m32, imm8","SHRL imm8, r/m32","shrl imm8, r/m32","C1 /5 ib","V","V","","operand32","rw,r","Y","32" 61 // 62 // # Mnemonics 63 // 64 // The instruction mnemonics are as used in the Intel manual, with a few exceptions. 65 // 66 // Mnemonics claiming general memory forms but that really require fixed addressing modes 67 // are omitted in favor of their equivalents with implicit arguments. 68 // For example, "CMPS m16, m16" (really CMPS [SI], [DI]) is omitted in favor of "CMPSW". 69 // 70 // Instruction forms with an explicit REP, REPE, or REPNE prefix are also omitted. 71 // Encoders and decoders are expected to handle those prefixes separately. 72 // 73 // Perhaps most significantly, the argument syntaxes used in the mnemonic indicate 74 // exactly how to derive the argument from the instruction encoding, or vice versa. 75 // 76 // Immediate values: imm8, imm8u, imm16, imm16u, imm32, imm64. 77 // Immediates are signed by default; the u suffix indicates an unsigned value. 78 // 79 // Memory operands. The forms m, m128, m14/28byte, m16, m16&16, m16&32, m16&64, m16:16, m16:32, 80 // m16:64, m16int, m256, m2byte, m32, m32&32, m32fp, m32int, m512byte, m64, m64fp, m64int, 81 // m8, m80bcd, m80dec, m80fp, m94/108byte. These operands always correspond to the 82 // memory address specified by the r/m half of the modrm encoding. 83 // 84 // Integer registers. 
85 // The forms r8, r16, r32, r64 indicate a register selected by the modrm reg encoding. 86 // The forms rmr16, rmr32, rmr64 indicate a register (never memory) selected by the modrm r/m encoding. 87 // The forms r/m8, r/m16, r/m32, and r/m64 indicate a register or memory selected by the modrm r/m encoding. 88 // Forms with two sizes, like r32/m16, also indicate a register or memory selected by the modrm r/m encoding, 89 // but the size for a register argument differs from the size of a memory argument. 90 // The forms r8V, r16V, r32V, r64V indicate a register selected by the VEX.vvvv bits. 91 // 92 // Multimedia registers. 93 // The forms mm1, xmm1, and ymm1 indicate a multimedia register selected by the 94 // modrm reg encoding. 95 // The forms mm2, xmm2, and ymm2 indicate a register (never memory) selected by 96 // the modrm r/m encoding. 97 // The forms mm2/m64, xmm2/m128, and so on indicate a register or memory 98 // selected by the modrm r/m encoding. 99 // The forms xmmV and ymmV indicate a register selected by the VEX.vvvv bits. 100 // The forms xmmI and ymmI indicate a register selected by the top four bits of an /is4 immediate byte. 101 // 102 // Bound registers. 103 // The form bnd1 indicates a bound register selected by the modrm reg encoding. 104 // The form bnd2 indicates a bound register (never memory) selected by the modrm r/m encoding. 105 // The forms bnd2/m64 and bnd2/m128 indicate a register or memory selected by the modrm r/m encoding. 106 // TODO: Describe mib. 107 // 108 // One-of-a-kind operands: rel8, rel16, rel32, ptr16:16, ptr16:32, 109 // moffs8, moffs16, moffs32, moffs64, vm32x, vm32y, vm64x, and vm64y 110 // are all as in the Intel manual. 111 // 112 // # Encodings 113 // 114 // The encodings are also as used in the Intel manual, with automated corrections. 115 // For example, the Intel manual sometimes omits the modrm /r indicator or other trailing bytes, 116 // and it also contains typographical errors. 
117 // These problems are corrected so that the CSV data may be used to generate 118 // tools for processing x86 machine code. 119 // See https://golang.org/x/arch/x86/x86map for one such generator. 120 // 121 // # Valid32 and Valid64 122 // 123 // These columns hold validity abbreviations as defined in the Intel manual: 124 // V, I, N.E., N.P., N.S., or N.I. 125 // Tools processing the data are typically only concerned with whether the 126 // column is "V" (valid) or not. 127 // This data is also corrected compared to the manual. 128 // For example, the manual lists many instruction forms using REX bytes 129 // with an incorrect "V" in the Valid32 column. 130 // 131 // # CPUID Feature Flags 132 // 133 // This column specifies CPUID feature flags that must be present in order 134 // to use the instruction. If multiple flags are required, 135 // they are listed separated by plus signs, as in PCLMULQDQ+AVX. 136 // The column can also list one of the values 486, Pentium, PentiumII, and P6, 137 // indicating that the instruction was introduced on that architecture version. 138 // 139 // # Tags 140 // 141 // The tag column does not correspond to a traditional column in the Intel manual tables. 142 // Instead, it is itself a comma-separated list of tags or hints derived by analysis 143 // of the instruction set or the instruction encodings. 144 // 145 // The tags address16, address32, and address64 indicate that the instruction form 146 // applies when using the specified addressing size. It may therefore be necessary to use an 147 // address size prefix byte to access the instruction. 148 // If two address tags are listed, the instruction can be used with either of those 149 // address sizes. An instruction will never list all three address sizes. 150 // (In fact, today, no instruction lists two address sizes, but that may change.) 
151 // 152 // The tags operand16, operand32, and operand64 indicate that the instruction form 153 // applies when using the specified operand size. It may therefore be necessary to use an 154 // operand size prefix byte to access the instruction. 155 // If two operand tags are listed, the instruction can be used with either of those 156 // operand sizes. An instruction will never list all three operand sizes. 157 // 158 // The tags modrm_regonly or modrm_memonly indicate that the modrm byte's 159 // r/m encoding must specify a register or memory, respectively. 160 // Especially in newer instructions, the modrm constraint may be the only way 161 // to distinguish two instruction forms. For example the MOVHLPS and MOVLPS 162 // instructions share the same encoding, except that the former requires the 163 // modrm byte's r/m to indicate a register, while the latter requires it to indicate memory. 164 // 165 // The tags pseudo and pseudo64 indicate that this instruction form is redundant 166 // with others listed in the table and should be ignored when generating disassembly 167 // or instruction scanning programs. The pseudo64 tag is reserved for the case where 168 // the manual lists an instruction twice, once with the optional 64-bit mode REX byte. 169 // Since most decoders will handle the REX byte separately, the form with the 170 // unnecessary REX is tagged pseudo64. 171 // 172 // # Corrections and Additions 173 // 174 // The x86spec program makes various corrections to the Intel manual data 175 // as part of extracting the information. Those corrections are described above. 176 // 177 // The x86spec program also adds a few well-known undocumented instructions, 178 // such as UD1 and FFREEP. 179 // 180 // # Examples 181 // 182 // The latest version of the CSV file is available in this Git repository and also 183 // online at https://golang.org/s/x86.csv. 
It is meant to be human-readable for 184 // quick reference and also to be input for generating tools that operate on 185 // x86 machine code. 186 // 187 // To print instruction syntaxes introduced by the Pentium II and P6, 188 // using https://rsc.io/csv2tsv to prepare the table for processing by awk: 189 // 190 // csv2tsv x86.csv | awk -F'\t' '$5 == "PentiumII" || $5 == "P6" { print $1 }' 191 // 192 // The x86map program (https://golang.org/x/arch/x86/x86map) 193 // reads the CSV file and generates an x86 instruction decoder in the form 194 // of a simple byte-code program. This decoder is the core of the disassembler 195 // in the x86asm package (https://golang.org/x/arch/x86/x86asm). 196 package main 197 198 import ( 199 "bufio" 200 "flag" 201 "fmt" 202 "io" 203 "log" 204 "net/http" 205 "os" 206 "sort" 207 "strings" 208 ) 209 210 const ( 211 specFormatVersion = "0.2" 212 ) 213 214 var ( 215 flagDebugPage = flag.String("debugpage", "", "debug page `n` of the manual (can be comma-separated list)") 216 flagURL = flag.String("u", "https://golang.org/s/x86manual", "use `url` for download if needed") 217 flagFile = flag.String("f", "x86manual.pdf", "read manual from `file`, downloading if necessary") 218 flagCompat = flag.Bool("compat", false, "print compatibility statements") 219 220 debugging bool 221 onlySomePages bool 222 ) 223 224 type instruction struct { 225 page int 226 opcode string 227 syntax string 228 valid64 string 229 valid32 string 230 cpuid string 231 desc string 232 tags []string 233 args []string 234 seq int // for use by cleanup 235 compat string 236 action string 237 multisize string 238 datasize int 239 gnuSyntax string 240 goSyntax string 241 } 242 243 func main() { 244 log.SetFlags(0) 245 log.SetPrefix("x86spec: ") 246 flags() 247 download() 248 insts := parse() 249 insts = cleanup(insts) 250 format(insts) 251 sort.Sort(bySyntax(insts)) 252 write(os.Stdout, insts) 253 } 254 255 func flags() { 256 flag.Usage = func() { 257 fmt.Fprintf(os.Stderr, 
"usage: x86spec [options]\n") 258 flag.PrintDefaults() 259 os.Exit(2) 260 } 261 flag.Parse() 262 if flag.NArg() != 0 { 263 flag.Usage() 264 } 265 debugging = *flagDebugPage != "" 266 onlySomePages = *flagDebugPage != "" 267 } 268 269 func download() { 270 _, err := os.Stat(*flagFile) 271 if !os.IsNotExist(err) { 272 return 273 } 274 275 // Try downloading. 276 log.Printf("downloading manual to %s", *flagFile) 277 resp, err := http.Get(*flagURL) 278 if err != nil { 279 log.Fatal(err) 280 } 281 if resp.StatusCode != 200 { 282 log.Fatal(resp.Status) 283 } 284 f, err := os.Create(*flagFile) 285 if err != nil { 286 log.Fatal(err) 287 } 288 _, err = io.Copy(f, resp.Body) 289 if err != nil { 290 log.Fatal(err) 291 } 292 if err := f.Close(); err != nil { 293 log.Fatal(err) 294 } 295 } 296 297 func write(w io.Writer, insts []*instruction) { 298 bw := bufio.NewWriter(w) 299 defer bw.Flush() 300 for _, inst := range insts { 301 datasize := "" 302 if inst.datasize != 0 { 303 datasize = fmt.Sprint(inst.datasize) 304 } 305 writeCSV(bw, inst.syntax, inst.goSyntax, inst.gnuSyntax, inst.opcode, inst.valid32, inst.valid64, inst.cpuid, strings.Join(inst.tags, ","), inst.action, inst.multisize, datasize) 306 } 307 } 308 309 // Note: not using encoding/csv because we want the CSV to use quotes always, 310 // so that it is a little easier to process with non-CSV tools like grep, 311 // but the encoding/csv package does not have an "always quote" writing mode. 312 func writeCSV(w io.Writer, args ...string) { 313 for i, arg := range args { 314 if i > 0 { 315 fmt.Fprintf(w, ",") 316 } 317 fmt.Fprintf(w, `"%s"`, strings.Replace(arg, `"`, `""`, -1)) 318 } 319 fmt.Fprintf(w, "\n") 320 } 321