1
2
3
4
5
6
7
8
9
10
11
12
13 package main
14
15 import (
16 "bufio"
17 "bytes"
18 "encoding/json"
19 "fmt"
20 "log"
21 "math"
22 "os"
23 "regexp"
24 "sort"
25 "strconv"
26 "strings"
27
28 "rsc.io/pdf"
29 )
30
// Inst is one instruction encoding extracted from the manual. main marshals
// each value to JSON, so the field names below are the keys of the output.
//
//   - Name is the instruction heading of the page the encoding was found on.
//   - ID is the encoding label with its "Encoding " prefix removed.
//   - Bits is the bit-box layout produced by readBitBox, fields separated by "|".
//   - Arch is the text found to the right of the encoding label, joined with
//     spaces (presumably the architecture variants; confirm against the manual).
//   - Syntax holds the assembler syntax lines; the first continuation piece of
//     a line is appended after a tab and later pieces after a space.
//   - Code is the monospaced text that follows the bit box, one row per line,
//     with leading tabs standing in for indentation.
type Inst struct {
	Name, ID, Bits, Arch string

	Syntax []string
	Code   string
}
39
// debugPage, when greater than zero, restricts processing to that one page
// number and turns on the extra diagnostic printing; 0 processes every page.
const debugPage = 0

// stdout is the buffered writer main sends the JSON output through.
var stdout *bufio.Writer
43
44 func main() {
45 log.SetFlags(0)
46 log.SetPrefix("armspec: ")
47
48 if len(os.Args) != 2 {
49 fmt.Fprintf(os.Stderr, "usage: armspec file.pdf\n")
50 os.Exit(2)
51 }
52
53 f, err := pdf.Open(os.Args[1])
54 if err != nil {
55 log.Fatal(err)
56 }
57
58
59 instList := instHeadings(f.Outline())
60 if len(instList) < 200 {
61 log.Fatalf("only found %d instructions in table of contents", len(instList))
62 }
63
64 stdout = bufio.NewWriter(os.Stdout)
65 fmt.Fprintf(stdout, "[")
66 numTable := 0
67 defer stdout.Flush()
68
69
70
71 n := f.NumPage()
72 PageLoop:
73 for pageNum := 1; pageNum <= n; pageNum++ {
74 if debugPage > 0 && pageNum != debugPage {
75 continue
76 }
77 if pageNum > 1127 {
78 break
79 }
80 p := f.Page(pageNum)
81 name, table := parsePage(pageNum, p)
82 if name == "" {
83 continue
84 }
85 if len(table) < 1 {
86 if false {
87 fmt.Fprintf(os.Stderr, "no encodings for instruction %q (page %d)\n", name, pageNum)
88 }
89 continue
90 }
91 for _, inst := range table {
92 if numTable > 0 {
93 fmt.Fprintf(stdout, ",")
94 }
95 numTable++
96 js, _ := json.Marshal(inst)
97 fmt.Fprintf(stdout, "\n%s", jsFix.Replace(string(js)))
98 }
99 for j, headline := range instList {
100 if name == headline {
101 instList[j] = ""
102 continue PageLoop
103 }
104 }
105 fmt.Fprintf(os.Stderr, "unexpected instruction %q (page %d)\n", name, pageNum)
106 }
107
108 fmt.Fprintf(stdout, "\n]\n")
109 stdout.Flush()
110
111 if debugPage == 0 {
112 for _, headline := range instList {
113 if headline != "" {
114 switch headline {
115 default:
116 fmt.Fprintf(os.Stderr, "missing instruction %q\n", headline)
117 case "CHKA":
118 case "CPS":
119 case "CPY":
120 case "ENTERX":
121 case "F* (former VFP instruction mnemonics)":
122 case "HB, HBL, HBLP, HBP":
123 case "LEAVEX":
124 case "MOV (shifted register)":
125 case "NEG":
126 case "RFE":
127 case "SMC (previously SMI)":
128 case "SRS":
129 case "SUBS PC, LR and related instructions":
130 case "VAND (immediate)":
131 case "VCLE (register)":
132 case "VCLT (register)":
133 case "VORN (immediate)":
134 }
135 }
136 }
137 }
138 }
139
140 func instHeadings(outline pdf.Outline) []string {
141 return appendInstHeadings(outline, nil)
142 }
143
// Patterns used to recognize outline titles and page text.
var (
	// instRE matches the title of an outline node whose children are the
	// per-instruction sections.
	instRE = regexp.MustCompile(`A[\d.]+ Alphabetical list of instructions`)

	// childRE matches a numbered section title; submatch 1 is the text after
	// the section number.
	childRE = regexp.MustCompile(`A[\d.]+ (.+)`)

	// sectionRE matches a string that is a section number and nothing else.
	sectionRE = regexp.MustCompile(`^A[\d.]+$`)

	// bitRE matches a string made up only of spaces, 0, 1, (0) and (1).
	bitRE = regexp.MustCompile(`^( |[01]|\([01]\))*$`)
)
148
149 func appendInstHeadings(outline pdf.Outline, list []string) []string {
150 if instRE.MatchString(outline.Title) {
151 for _, child := range outline.Child {
152 m := childRE.FindStringSubmatch(child.Title)
153 if m == nil {
154 fmt.Fprintf(os.Stderr, "cannot parse section title: %s\n", child.Title)
155 continue
156 }
157 list = append(list, m[1])
158 }
159 }
160 for _, child := range outline.Child {
161 list = appendInstHeadings(child, list)
162 }
163 return list
164 }
165
// inch is the number of PDF points in one inch; page coordinates are compared
// against multiples of it.
const inch = 72.0
167
168 func parsePage(num int, p pdf.Page) (name string, table []Inst) {
169 content := p.Content()
170
171 var text []pdf.Text
172 for _, t := range content.Text {
173 if match(t, "Times-Roman", 7.2, "") {
174 t.FontSize = 9
175 }
176 if match(t, "Times-Roman", 6.72, "") && '0' <= t.S[0] && t.S[0] <= '9' {
177 t.S = string([]rune("⁰¹²³⁴⁵⁶⁷⁸⁹")[t.S[0]-'0'])
178 t.FontSize = 9
179 t.Y -= 2.28
180 }
181 if t.Font == "Gen_Arial" {
182 continue
183 }
184 text = append(text, t)
185 }
186
187 text = findWords(text)
188
189 for i, t := range text {
190 if t.Font == "Times" {
191 t.Font = "Times-Roman"
192 text[i] = t
193 }
194 }
195
196 if debugPage > 0 {
197 for _, t := range text {
198 fmt.Println(t)
199 }
200 for _, r := range content.Rect {
201 fmt.Println(r)
202 }
203 }
204
205
206 out := text[:0]
207 skip := false
208 for _, t := range text {
209
210 if match(t, "Helvetica", 8, "A") || match(t, "Helvetica", 8, "ARM DDI") || match(t, "Helvetica-Oblique", 8, "Copyright") {
211 continue
212 }
213
214 if match(t, "Helvetica-Bold", 12, "") && (sectionRE.MatchString(t.S) || t.S == "Alphabetical list of instructions") {
215 skip = true
216 continue
217 }
218 if skip && match(t, "Times-Roman", 9, "") {
219 continue
220 }
221 skip = false
222 out = append(out, t)
223 }
224 text = out
225
226
227 if len(text) == 0 || !match(text[0], "Helvetica-Oblique", 8, "Instruction Details") && !match(text[0], "Times-Roman", 9, "Instruction Details") {
228 return "", nil
229 }
230 text = text[1:]
231
232 isSection := func(text []pdf.Text, i int) int {
233 if i+2 <= len(text) && match(text[i], "Helvetica-Bold", 10, "") && sectionRE.MatchString(text[i].S) && match(text[i+1], "Helvetica-Bold", 10, "") {
234 return 2
235 }
236 if i+1 <= len(text) && match(text[i], "Helvetica-Bold", 10, "") && childRE.MatchString(text[i].S) {
237 return 1
238 }
239 return 0
240 }
241
242
243 for d := isSection(text, 0); d != 0; d = isSection(text, 0) {
244 i := d
245 for i < len(text) && !match(text[i], "Helvetica-Bold", 9, "Encoding") && !match(text[i], "Helvetica-Bold", 10, "") {
246 i++
247 }
248 if isSection(text, i) == 0 {
249 break
250 }
251 text = text[i:]
252 }
253
254
255 d := isSection(text, 0)
256 if d == 0 {
257 if debugPage > 0 {
258 fmt.Printf("non-inst-headline: %v\n", text[0])
259 }
260 checkNoEncodings(num, text)
261 return "", nil
262 }
263 if d == 2 {
264 name = text[1].S
265 text = text[2:]
266 } else if d == 1 {
267 m := childRE.FindStringSubmatch(text[0].S)
268 name = m[1]
269 text = text[1:]
270 }
271 for len(text) > 0 && match(text[0], "Helvetica-Bold", 10, "") {
272 name += " " + text[0].S
273 text = text[1:]
274 }
275
276
277 for len(text) > 0 && (match(text[0], "Times-Roman", 9, "") || match(text[0], "LucidaSansTypewriteX", 6.48, "") || match(text[0], "Times-Bold", 10, "Note")) {
278 text = text[1:]
279 }
280
281
282 warned := false
283 for i := 0; i < len(text); {
284 if match(text[i], "Helvetica-Bold", 10, "Assembler syntax") ||
285 match(text[i], "Helvetica-Bold", 9, "Modified operation in ThumbEE") ||
286 match(text[i], "Helvetica-Bold", 9, "Unallocated memory hints") ||
287 match(text[i], "Helvetica-Bold", 9, "Related encodings") ||
288 match(text[i], "Times-Roman", 9, "Figure A") ||
289 match(text[i], "Helvetica-Bold", 9, "Table A") ||
290 match(text[i], "Helvetica-Bold", 9, "VFP Instructions") ||
291 match(text[i], "Helvetica-Bold", 9, "VFP instructions") ||
292 match(text[i], "Helvetica-Bold", 9, "VFP vectors") ||
293 match(text[i], "Helvetica-Bold", 9, "FLDMX") ||
294 match(text[i], "Helvetica-Bold", 9, "FSTMX") ||
295 match(text[i], "Helvetica-Bold", 9, "Advanced SIMD and VFP") {
296 checkNoEncodings(num, text[i:])
297 break
298 }
299 if match(text[i], "Helvetica-Bold", 9, "Figure A") {
300 y := text[i].Y
301 i++
302 for i < len(text) && math.Abs(text[i].Y-y) < 2 {
303 i++
304 }
305 continue
306 }
307 if !match(text[i], "Helvetica-Bold", 9, "Encoding") {
308 if !warned {
309 warned = true
310 fmt.Fprintln(os.Stderr, "page", num, ": unexpected:", text[i])
311 }
312 i++
313 continue
314 }
315 inst := Inst{
316 Name: name,
317 }
318 enc := text[i].S
319 x := text[i].X
320 i++
321
322 for i < len(text) && text[i].X > x+36 {
323 if inst.Arch != "" {
324 inst.Arch += " "
325 }
326 inst.Arch += text[i].S
327 i++
328 }
329
330 for i < len(text) && (match(text[i], "LucidaSansTypewriteX", 6.48, "") || text[i].X > x+36) {
331 if text[i].X < x+0.25*inch {
332 inst.Syntax = append(inst.Syntax, text[i].S)
333 } else {
334 s := inst.Syntax[len(inst.Syntax)-1]
335 if !strings.Contains(s, "\t") {
336 s += "\t"
337 } else {
338 s += " "
339 }
340 s += text[i].S
341 inst.Syntax[len(inst.Syntax)-1] = s
342 }
343 i++
344 }
345
346 var bits, abits, aenc string
347 bits, i = readBitBox(inst.Name, inst.Syntax, content, text, i)
348 if strings.Contains(enc, " / ") {
349 if i < len(text) && match(text[i], "Times-Roman", 8, "") {
350 abits, i = readBitBox(inst.Name, inst.Syntax, content, text, i)
351 } else {
352 abits = bits
353 }
354 slash := strings.Index(enc, " / ")
355 aenc = "Encoding " + enc[slash+len(" / "):]
356 enc = enc[:slash]
357 }
358
359
360 y0 := -1 * inch
361 tab := 0.0
362 for i < len(text) && match(text[i], "LucidaSansTypewriteX", 6.48, "") {
363 t := text[i]
364 i++
365 if math.Abs(t.Y-y0) < 3 {
366
367 inst.Code += " " + t.S
368 continue
369 }
370 if inst.Code != "" {
371 inst.Code += "\n"
372 }
373 if t.X > x+0.1*inch {
374 if tab == 0 {
375 tab = t.X - x
376 }
377 inst.Code += strings.Repeat("\t", int((t.X-x)/tab+0.5))
378 } else {
379 tab = 0
380 }
381 inst.Code += t.S
382 y0 = t.Y
383 }
384
385 inst.ID = strings.TrimPrefix(enc, "Encoding ")
386 inst.Bits = bits
387 table = append(table, inst)
388 if abits != "" {
389 inst.ID = strings.TrimPrefix(aenc, "Encoding ")
390 inst.Bits = abits
391 table = append(table, inst)
392 }
393
394 }
395 return name, table
396 }
397
398 func readBitBox(name string, syntax []string, content pdf.Content, text []pdf.Text, i int) (string, int) {
399
400 y2 := 0.0
401 x1 := 0.0
402 x2 := 0.0
403 for i < len(text) && match(text[i], "Times-Roman", 8, "") {
404 if y2 == 0 {
405 y2 = text[i].Y
406 }
407 if x1 == 0 {
408 x1 = text[i].X
409 }
410 i++
411 }
412
413 y1 := 0.0
414 dy1 := 0.0
415 for i < len(text) && match(text[i], "Times-Roman", 9, "") {
416 if x2 < text[i].X+text[i].W {
417 x2 = text[i].X + text[i].W
418 }
419 y1 = text[i].Y
420 dy1 = text[i].FontSize
421 i++
422 }
423
424 if debugPage > 0 {
425 fmt.Println("encoding box", x1, y1, x2, y2)
426 }
427
428
429 var bottom, top pdf.Rect
430 const (
431 yMargin = 0.25 * 72
432 xMargin = 2 * 72
433 )
434 for _, r := range content.Rect {
435 if r.Max.Y-r.Min.Y < 2 && x1-xMargin < r.Min.X && r.Min.X < x1 && x2 < r.Max.X && r.Max.X < x2+xMargin {
436 if y1-yMargin < r.Min.Y && r.Min.Y < y1 {
437 bottom = r
438 }
439 if y1+dy1 < r.Min.Y && r.Min.Y < y2 {
440 top = r
441 }
442 }
443 }
444
445 if debugPage > 0 {
446 fmt.Println("top", top, "bottom", bottom)
447 }
448
449 const ε = 0.1 * 72
450 var bars []pdf.Rect
451 for _, r := range content.Rect {
452 if r.Max.X-r.Min.X < 2 && math.Abs(r.Min.Y-bottom.Min.Y) < ε && math.Abs(r.Max.Y-top.Min.Y) < ε {
453 bars = append(bars, r)
454 }
455 }
456 sort.Sort(RectHorizontal(bars))
457
458
459
460
461 nbit := 32
462 dx := top.Max.X - top.Min.X
463 if top.Max.X-top.Min.X < 4*72 {
464 nbit = 16
465 }
466
467 total := 0
468 var buf bytes.Buffer
469 for i := 0; i < len(bars)-1; i++ {
470 if i > 0 {
471 fmt.Fprintf(&buf, "|")
472 }
473 var sub []pdf.Text
474 x1, x2 := bars[i].Min.X, bars[i+1].Min.X
475 for _, t := range content.Text {
476 tx := t.X + t.W/2
477 ty := t.Y + t.FontSize/2
478 if x1 < tx && tx < x2 && y1 < ty && ty < y2 {
479 sub = append(sub, t)
480 }
481 }
482 var str []string
483 for _, t := range findWords(sub) {
484 str = append(str, t.S)
485 }
486 s := strings.Join(str, " ")
487 s = strings.Replace(s, ")(", ") (", -1)
488 n := len(strings.Fields(s))
489 b := int(float64(nbit)*(x2-x1)/dx + 0.5)
490 if n == b {
491 for j, f := range strings.Fields(s) {
492 if j > 0 {
493 fmt.Fprintf(&buf, "|")
494 }
495 fmt.Fprintf(&buf, "%s", f)
496 }
497 } else {
498 if n != 1 {
499 fmt.Fprintf(os.Stderr, "%s - %s - multi-field %d-bit encoding: %s\n", name, syntax, n, s)
500 }
501 fmt.Fprintf(&buf, "%s:%d", s, b)
502 }
503 total += b
504 }
505
506 if total != nbit || total == 0 {
507 fmt.Fprintf(os.Stderr, "%s - %s - %d-bit encoding\n", name, syntax, total)
508 }
509 return buf.String(), i
510 }
511
512 type RectHorizontal []pdf.Rect
513
514 func (x RectHorizontal) Swap(i, j int) { x[i], x[j] = x[j], x[i] }
515 func (x RectHorizontal) Less(i, j int) bool { return x[i].Min.X < x[j].Min.X }
516 func (x RectHorizontal) Len() int { return len(x) }
517
518 func checkNoEncodings(num int, text []pdf.Text) {
519 for _, t := range text {
520 if match(t, "Helvetica-Bold", 9, "Encoding") {
521 fmt.Fprintf(os.Stderr, "page %d: unexpected encoding: %s\n", num, t.S)
522 }
523 }
524 }
525
526 func match(t pdf.Text, font string, size float64, substr string) bool {
527 return t.Font == font && math.Abs(t.FontSize-size) < 0.1 && strings.Contains(t.S, substr)
528 }
529
530 func findWords(chars []pdf.Text) (words []pdf.Text) {
531
532 const nudge = 1
533 sort.Sort(pdf.TextVertical(chars))
534 old := -100000.0
535 for i, c := range chars {
536 if c.Y != old && math.Abs(old-c.Y) < nudge {
537 chars[i].Y = old
538 } else {
539 old = c.Y
540 }
541 }
542
543
544
545 sort.Sort(pdf.TextVertical(chars))
546
547
548 for i := 0; i < len(chars); {
549
550 j := i + 1
551 for j < len(chars) && chars[j].Y == chars[i].Y {
552 j++
553 }
554 var end float64
555
556 for k := i; k < j; {
557 ck := &chars[k]
558 s := ck.S
559 end = ck.X + ck.W
560 charSpace := ck.FontSize / 6
561 wordSpace := ck.FontSize * 2 / 3
562 l := k + 1
563 for l < j {
564
565 cl := &chars[l]
566 if sameFont(cl.Font, ck.Font) && math.Abs(cl.FontSize-ck.FontSize) < 0.1 && cl.X <= end+charSpace {
567 s += cl.S
568 end = cl.X + cl.W
569 l++
570 continue
571 }
572
573 if sameFont(cl.Font, ck.Font) && math.Abs(cl.FontSize-ck.FontSize) < 0.1 && cl.X <= end+wordSpace {
574 s += " " + cl.S
575 end = cl.X + cl.W
576 l++
577 continue
578 }
579 break
580 }
581 f := ck.Font
582 f = strings.TrimSuffix(f, ",Italic")
583 f = strings.TrimSuffix(f, "-Italic")
584 words = append(words, pdf.Text{f, ck.FontSize, ck.X, ck.Y, end - ck.X, s})
585 k = l
586 }
587 i = j
588 }
589
590 return words
591 }
592
// sameFont reports whether text in fonts f1 and f2 may be joined into one
// word. Two fonts are the same when their names are equal once a trailing
// ",Italic" or "-Italic" is removed. "Symbol" and "TimesNewRoman" are treated
// as compatible with every font.
//
// Both names are normalized independently; deriving f2 from f1 would make
// every pair of fonts compare equal.
func sameFont(f1, f2 string) bool {
	f1 = strings.TrimSuffix(f1, ",Italic")
	f1 = strings.TrimSuffix(f1, "-Italic")
	f2 = strings.TrimSuffix(f2, ",Italic")
	f2 = strings.TrimSuffix(f2, "-Italic")
	return f1 == f2 || f1 == "Symbol" || f2 == "Symbol" || f1 == "TimesNewRoman" || f2 == "TimesNewRoman"
}
600
// jsFix is applied by main to each marshaled JSON object before it is
// written. It is built with no replacement pairs, so it currently returns its
// input unchanged.
var jsFix = strings.NewReplacer()
607
608 func printTable(name string, table []Inst) {
609 _ = strconv.Atoi
610 }
611
View as plain text