// Copyright 2017 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // arm64spec reads the ``ARMv8-A Reference Manual'' // to collect instruction encoding details and writes those // details to standard output in JSON format. // usage: arm64spec file.pdf package main import ( "bufio" "bytes" "encoding/json" "fmt" "log" "math" "os" "regexp" "sort" "strconv" "strings" "rsc.io/pdf" ) type Inst struct { Name string Bits string Arch string Syntax string Code string Alias string } const debugPage = 0 var stdout *bufio.Writer func check(e error) { if e != nil { panic(e) } } func main() { log.SetFlags(0) log.SetPrefix("arm64spec: ") if len(os.Args) != 2 { fmt.Fprintf(os.Stderr, "usage: arm64spec file.pdf\n") os.Exit(2) } f, err := pdf.Open(os.Args[1]) if err != nil { log.Fatal(err) } // Find instruction set reference in outline, to build instruction list. instList := instHeadings(f.Outline()) if debugPage == 0 { fmt.Println("the number of instructions:", len(instList)) } if len(instList) < 200 { log.Fatalf("only found %d instructions in table of contents", len(instList)) } file, err := os.Create("inst.json") check(err) w := bufio.NewWriter(file) _, err = w.WriteString("[") check(err) numTable := 0 defer w.Flush() defer file.Close() // Scan document looking for instructions. // Must find exactly the ones in the outline. n := f.NumPage() PageLoop: for pageNum := 435; pageNum <= n; pageNum++ { if debugPage > 0 && pageNum != debugPage { continue } if pageNum == 770 { continue } if pageNum > 1495 { break } p := f.Page(pageNum) name, table := parsePage(pageNum, p, f) if name == "" { continue } if len(table) < 1 { if false { fmt.Fprintf(os.Stderr, "no encodings for instruction %q (page %d)\n", name, pageNum) } continue } for _, inst := range table { if numTable > 0 { _, err = w.WriteString(jsFix.Replace(",")) check(err) _, err = w.WriteString("\n") check(err) } numTable++ js, _ := json.Marshal(inst) _, err = w.WriteString(jsFix.Replace(string(js))) check(err) } for j, headline := range instList { if name == headline { instList[j] = "" continue PageLoop } } fmt.Fprintf(os.Stderr, "unexpected instruction %q (page %d)\n", name, pageNum) } _, err = w.WriteString("\n]\n") check(err) w.Flush() if debugPage == 0 { for _, headline := range instList { if headline != "" { fmt.Fprintf(os.Stderr, "missing instruction %q\n", headline) } } } } func instHeadings(outline pdf.Outline) []string { return appendInstHeadings(outline, nil) } var instRE = regexp.MustCompile(`C[\d.]+ Alphabetical list of A64 base instructions`) var instRE_A = regexp.MustCompile(`C[\d.]+ Alphabetical list of A64 floating-point and Advanced SIMD instructions`) var childRE = regexp.MustCompile(`C[\d.]+ (.+)`) var sectionRE = regexp.MustCompile(`^C[\d.]+$`) var bitRE = regexp.MustCompile(`^( |[01]|\([01]\))*$`) var IMMRE = regexp.MustCompile(`^imm[\d]+$`) func appendInstHeadings(outline pdf.Outline, list []string) []string { if instRE.MatchString(outline.Title) || instRE_A.MatchString(outline.Title) { for _, child := range outline.Child { m := childRE.FindStringSubmatch(child.Title) if m == nil { fmt.Fprintf(os.Stderr, "cannot parse section title: %s\n", child.Title) continue } list = append(list, m[1]) } } for _, child := range outline.Child { list = appendInstHeadings(child, list) } return list } const inch = 72.0 func parsePage(num int, p pdf.Page, f *pdf.Reader) (name string, table []Inst) { content := p.Content() var text []pdf.Text CrossTwoPage := true for _, t := range content.Text { text = append(text, t) } text = findWords(text) if !(instRE.MatchString(text[1].S) || instRE_A.MatchString(text[1].S)) || len(text) == 0 || !sectionRE.MatchString(text[2].S) { return "", nil } // Check whether the content crosses the page. for _, t := range text { if match(t, "Arial,Bold", 10, "Assembler symbols") { CrossTwoPage = false break } } // Deal with cross page issue. To the next page content. var Ncontent pdf.Content Npagebox := false CrossThreePage := false Noffset := "" if CrossTwoPage == true { Np := f.Page(num + 1) Ncontent = Np.Content() var Ntext []pdf.Text for _, t := range Ncontent.Text { Ntext = append(Ntext, t) } Ntext = findWords(Ntext) if len(Ntext) == 0 || sectionRE.MatchString(Ntext[2].S) { Ntext = text[:0] } else { for _, t := range Ntext { if match(t, "Arial,Bold", 10, "offset") { Noffset = t.S Npagebox = true } // This istruction cross three pages. if match(t, "Arial,Bold", 10, "Assembler symbols") { CrossThreePage = false } else { CrossThreePage = true } text = append(text, t) } } } if CrossThreePage == true { NNp := f.Page(num + 2) NNcontent := NNp.Content() var NNtext []pdf.Text for _, t := range NNcontent.Text { NNtext = append(NNtext, t) } NNtext = findWords(NNtext) if len(NNtext) == 0 || sectionRE.MatchString(NNtext[2].S) { NNtext = text[:0] } else { for _, t := range NNtext { text = append(text, t) } } } // Get alias and remove text we should ignore. out := text[:0] alias := "" for _, t := range text { if strings.Contains(t.S, "instruction is used by the alias") || strings.Contains(t.S, "instruction is an alias of") { alias_t := strings.SplitAfter(t.S, ".") alias = alias_t[0] } // Skip page footer if match(t, "Arial-ItalicMT", 8, "") || match(t, "ArialMT", 8, "") { if debugPage > 0 { fmt.Println("==the skip page footer is:==", t) } continue } // Skip the body text if match(t, "TimesNewRoman", 9, "") || match(t, "TimesNewRomanPS-ItalicMT", 9, "") { if debugPage > 0 { fmt.Println("==the skip body text is:==", t) } continue } out = append(out, t) } text = out // Page header must be child title. if len(text) == 0 || !sectionRE.MatchString(text[0].S) { return "", nil } name = text[1].S inst := Inst{ Name: name, Alias: alias, } text = text[2:] // Skip body text before bits. OffsetMark := false k := 0 for k = 0; k < len(text); { if !match(text[k], "Arial", 8, "31") { k++ } else { break } } // Check offset. if k > 0 && match(text[k-1], "Arial,Bold", 10, "") { OffsetMark = true text = text[k-1:] } else { text = text[k:] } // Encodings follow. BitMark := false bits := "" // Find bits. for i := 0; i < len(text); { inst.Bits = "" offset := "" abits := "" // Read bits only one time. if OffsetMark == true { for i < len(text) && !match(text[i], "Arial", 8, "") { i++ } if i < len(text) { offset = text[i-1].S BitMark = false bits = "" } else { break } } if BitMark == false { if Npagebox == true && Noffset == offset { bits, i = readBitBox(name, Ncontent, text, i) } else { bits, i = readBitBox(name, content, text, i) } BitMark = true // Every time, get "then SEE" after get bits. enc := false if i < len(text)-1 { m := i for m < len(text)-1 && !match(text[m], "Arial-BoldItalicMT", 9, "encoding") { m++ } if match(text[m], "Arial-BoldItalicMT", 9, "encoding") && m < len(text) { enc = true m = m + 1 } if enc == true { for m < len(text) && !match(text[m], "Arial,Bold", 10, "") && match(text[m], "LucidaSansTypewriteX", 6.48, "") { if strings.Contains(text[m].S, "then SEE") { inst.Code = text[m].S break } else { m++ } } } } } // Possible subarchitecture notes. ArchLoop: for i < len(text) { if !match(text[i], "Arial-BoldItalicMT", 9, "variant") || match(text[i], "Arial-BoldItalicMT", 9, "encoding") { i++ continue } inst.Arch = "" inst.Arch += offset inst.Arch += " " inst.Arch += text[i].S inst.Arch = strings.TrimSpace(inst.Arch) i++ // Encoding syntaxes. sign := "" SynMark := false for i < len(text) && match(text[i], "LucidaSansTypewriteX", 6.48, "") && SynMark == false { if (strings.Contains(text[i].S, "==") || strings.Contains(text[i].S, "!=")) && SynMark == false { sign = text[i].S i++ continue } // Avoid "equivalent to" another syntax. if SynMark == false { SynMark = true inst.Syntax = "" inst.Syntax = text[i].S i++ } } abits = bits // Analyse and replace some bits value.eg, sf==1 if strings.Contains(sign, "&&") { split := strings.Split(sign, "&&") for k := 0; k < len(split); { if strings.Contains(split[k], "==") && !strings.Contains(split[k], "!") { tmp := strings.Split(split[k], "==") prefix := strings.TrimSpace(tmp[0]) value := strings.TrimSpace(tmp[1]) if strings.Contains(bits, prefix) && !strings.Contains(value, "x") { abits = strings.Replace(abits, prefix, value, -1) } } k++ } } else if strings.Contains(sign, "==") && !strings.Contains(sign, "!") { split := strings.Split(sign, "==") prefix := strings.TrimSpace(split[0]) value := strings.TrimSpace(split[1]) if strings.Contains(bits, prefix) && !strings.Contains(value, "x") { abits = strings.Replace(abits, prefix, value, -1) } } // Deal with syntax contains {2} if strings.Contains(inst.Syntax, "{2}") { if !strings.Contains(abits, "Q") { fmt.Fprintf(os.Stderr, "instruction%s - syntax%s: is wrong!!\n", name, inst.Syntax) } syn := inst.Syntax bits := abits for i := 0; i < 2; { if i == 0 { inst.Bits = strings.Replace(bits, "Q", "0", -1) inst.Syntax = strings.Replace(syn, "{2}", "", -1) table = append(table, inst) } if i == 1 { inst.Bits = strings.Replace(bits, "Q", "1", -1) inst.Syntax = strings.Replace(syn, "{2}", "2", -1) table = append(table, inst) } i++ } } else { inst.Bits = abits table = append(table, inst) } if OffsetMark == true && i < len(text) && match(text[i], "Arial-BoldItalicMT", 9, "variant") && !match(text[i], "Arial-BoldItalicMT", 9, "encoding") { continue ArchLoop } else { break } } } return name, table } func readBitBox(name string, content pdf.Content, text []pdf.Text, i int) (string, int) { // Bits headings y3 := 0.0 x1 := 0.0 for i < len(text) && match(text[i], "Arial", 8, "") { if y3 == 0 { y3 = text[i].Y } if x1 == 0 { x1 = text[i].X } if text[i].Y != y3 { break } i++ } // Bits fields in box x2 := 0.0 y2 := 0.0 dy1 := 0.0 for i < len(text) && match(text[i], "Arial", 8, "") { if x2 < text[i].X+text[i].W { x2 = text[i].X + text[i].W } if y2 == 0 { y2 = text[i].Y } if text[i].Y != y2 { break } dy1 = text[i].FontSize i++ } // Bits fields below box x3 := 0.0 y1 := 0.0 for i < len(text) && match(text[i], "Arial", 8, "") { if x3 < text[i].X+text[i].W { x3 = text[i].X + text[i].W } y1 = text[i].Y if text[i].Y != y1 { break } i++ } //no bits fields below box below_flag := true if y1 == 0.0 { below_flag = false y1 = y2 } // Encoding box if debugPage > 0 { fmt.Println("encoding box", x1, y3, x2, y1) } // Find lines (thin rectangles) separating bit fields. var bottom, top pdf.Rect const ( yMargin = 0.25 * 72 xMargin = 2 * 72 ) cont := 0 if below_flag == true { for _, r := range content.Rect { cont = cont + 1 if x1-xMargin < r.Min.X && r.Min.X < x1 && x2 < r.Max.X && r.Max.X < x2+xMargin { if y1-yMargin < r.Min.Y && r.Min.Y < y2-dy1 { bottom = r } if y2+dy1 < r.Min.Y && r.Min.Y < y3+yMargin { top = r } } } } else { for _, r := range content.Rect { cont = cont + 1 if x1-xMargin < r.Min.X && r.Min.X < x1 && x2 < r.Max.X && r.Max.X < x2+xMargin { if y1-yMargin-dy1 < r.Min.Y && r.Min.Y < y3-dy1 { bottom = r } if y2+dy1 < r.Min.Y && r.Min.Y < y3+yMargin { top = r } } } } if debugPage > 0 { fmt.Println("top", top, "bottom", bottom, "content.Rect number", cont) } const ε = 0.5 * 72 cont_1 := 0 var bars []pdf.Rect for _, r := range content.Rect { if math.Abs(r.Min.X-r.Max.X) < bottom.Max.X-bottom.Min.X-(ε/2) && math.Abs(r.Min.Y-bottom.Min.Y) < ε && math.Abs(r.Max.Y-top.Min.Y) < ε { cont_1 = cont_1 + 1 bars = append(bars, r) } } sort.Sort(RectHorizontal(bars)) if debugPage > 0 { fmt.Println("==bars number==", cont_1) } // There are 16-bit and 32-bit encodings. // In practice, they are about 2.65 and 5.3 inches wide, respectively. // Use 4 inches as a cutoff. nbit := 32 dx := top.Max.X - top.Min.X if top.Max.X-top.Min.X < 4*72 { nbit = 16 } total := 0 var buf bytes.Buffer for i := 0; i < len(bars); i++ { if i > 0 { fmt.Fprintf(&buf, "|") } var sub []pdf.Text x1, x2 := bars[i].Min.X, bars[i].Max.X for _, t := range content.Text { tx := t.X + t.W/2 ty := t.Y if x1 < tx && tx < x2 && y2-dy1 < ty && ty < y2+dy1 { sub = append(sub, t) } } var str []string for _, t := range findWords(sub) { str = append(str, t.S) } s := strings.Join(str, " ") s = strings.Replace(s, ")(", ") (", -1) // If bits contain "!" or "x", be replaced by the bits below it. if strings.Contains(s, "!") || strings.Contains(s, "x") { var sub1 []pdf.Text for _, t := range content.Text { tx := t.X + t.W/2 ty := t.Y if x1 < tx && tx < x2 && y1-dy1 < ty && ty < y1+dy1 { sub1 = append(sub1, t) } } var str1 []string for _, t := range findWords(sub1) { str1 = append(str1, t.S) } s = strings.Join(str1, " ") s = strings.Replace(s, ")(", ") (", -1) } n := len(strings.Fields(s)) var b int if IMMRE.MatchString(s) { bitNum := strings.TrimPrefix(s, "imm") b, _ = strconv.Atoi(bitNum) } else if s == "immhi" { b = 19 } else { b = int(float64(nbit)*(x2-x1)/dx + 0.5) } if n == b { for k, f := range strings.Fields(s) { if k > 0 { fmt.Fprintf(&buf, "|") } fmt.Fprintf(&buf, "%s", f) } } else { if n != 1 { fmt.Fprintf(os.Stderr, "%s - multi-field %d-bit encoding: %s\n", name, n, s) } fmt.Fprintf(&buf, "%s:%d", s, b) } total += b } if total != nbit || total == 0 { fmt.Fprintf(os.Stderr, "%s - %d-bit encoding\n", name, total) } return buf.String(), i } type RectHorizontal []pdf.Rect func (x RectHorizontal) Swap(i, j int) { x[i], x[j] = x[j], x[i] } func (x RectHorizontal) Less(i, j int) bool { return x[i].Min.X < x[j].Min.X } func (x RectHorizontal) Len() int { return len(x) } func checkNoEncodings(num int, text []pdf.Text) { for _, t := range text { if match(t, "Helvetica-Bold", 9, "Encoding") { fmt.Fprintf(os.Stderr, "page %d: unexpected encoding: %s\n", num, t.S) } } } func match(t pdf.Text, font string, size float64, substr string) bool { return t.Font == font && math.Abs(t.FontSize-size) < 0.1 && strings.Contains(t.S, substr) } func findWords(chars []pdf.Text) (words []pdf.Text) { // Sort by Y coordinate and normalize. const nudge = 1 sort.Sort(pdf.TextVertical(chars)) old := -100000.0 for i, c := range chars { if c.Y != old && math.Abs(old-c.Y) < nudge { chars[i].Y = old } else { old = c.Y } } // Sort by Y coordinate, breaking ties with X. // This will bring letters in a single word together. sort.Sort(pdf.TextVertical(chars)) // Loop over chars. for i := 0; i < len(chars); { // Find all chars on line. j := i + 1 for j < len(chars) && chars[j].Y == chars[i].Y { j++ } var end float64 // Split line into words (really, phrases). for k := i; k < j; { ck := &chars[k] s := ck.S end = ck.X + ck.W charSpace := ck.FontSize / 6 wordSpace := ck.FontSize * 2 / 3 l := k + 1 for l < j { // Grow word. cl := &chars[l] if sameFont(cl.Font, ck.Font) && math.Abs(cl.FontSize-ck.FontSize) < 0.1 && cl.X <= end+charSpace { s += cl.S end = cl.X + cl.W l++ continue } // Add space to phrase before next word. if sameFont(cl.Font, ck.Font) && math.Abs(cl.FontSize-ck.FontSize) < 0.1 && cl.X <= end+wordSpace { s += " " + cl.S end = cl.X + cl.W l++ continue } break } f := ck.Font f = strings.TrimSuffix(f, ",Italic") f = strings.TrimSuffix(f, "-Italic") words = append(words, pdf.Text{f, ck.FontSize, ck.X, ck.Y, end - ck.X, s}) k = l } i = j } return words } func sameFont(f1, f2 string) bool { f1 = strings.TrimSuffix(f1, ",Italic") f1 = strings.TrimSuffix(f1, "-Italic") f2 = strings.TrimSuffix(f1, ",Italic") f2 = strings.TrimSuffix(f1, "-Italic") return strings.TrimSuffix(f1, ",Italic") == strings.TrimSuffix(f2, ",Italic") || f1 == "Symbol" || f2 == "Symbol" || f1 == "TimesNewRoman" || f2 == "TimesNewRoman" } var jsFix = strings.NewReplacer( `\u003c`, `<`, `\u003e`, `>`, `\u0026`, `&`, `\u0009`, `\t`, ) func printTable(name string, table []Inst) { _ = strconv.Atoi }