1
2
3
4
5
6
7
8
9
10 package main
11
12 import (
13 "bufio"
14 "bytes"
15 "encoding/json"
16 "fmt"
17 "log"
18 "math"
19 "os"
20 "regexp"
21 "sort"
22 "strconv"
23 "strings"
24
25 "rsc.io/pdf"
26 )
27
// Inst is one instruction encoding record, as filled in by parsePage and
// serialized to JSON by main.
//
//   - Name is the instruction heading text.
//   - Bits is the encoding bit pattern string built by readBitBox, after any
//     field substitutions applied by parsePage.
//   - Arch is the variant label, prefixed by the offset heading when present.
//   - Syntax is the assembler syntax line.
//   - Code is a decode line containing "then SEE", when one is found.
//   - Alias is the first sentence of the text relating the instruction to
//     its aliases.
type Inst struct {
	Name, Bits, Arch, Syntax, Code, Alias string
}
36
// debugPage, when positive, restricts processing to that single page number
// and enables extra diagnostic printing. The value 0 processes every page in
// the configured range and enables the instruction-count and
// missing-instruction reports in main.
const debugPage = 0

// stdout is declared here but is not referenced anywhere in the visible
// source.
var stdout *bufio.Writer
40
// check panics with e when e is non-nil and does nothing otherwise.
func check(e error) {
	if e == nil {
		return
	}
	panic(e)
}
46
47 func main() {
48 log.SetFlags(0)
49 log.SetPrefix("arm64spec: ")
50
51 if len(os.Args) != 2 {
52 fmt.Fprintf(os.Stderr, "usage: arm64spec file.pdf\n")
53 os.Exit(2)
54 }
55 f, err := pdf.Open(os.Args[1])
56 if err != nil {
57 log.Fatal(err)
58 }
59
60
61 instList := instHeadings(f.Outline())
62 if debugPage == 0 {
63 fmt.Println("the number of instructions:", len(instList))
64 }
65 if len(instList) < 200 {
66 log.Fatalf("only found %d instructions in table of contents", len(instList))
67 }
68
69 file, err := os.Create("inst.json")
70 check(err)
71 w := bufio.NewWriter(file)
72 _, err = w.WriteString("[")
73 check(err)
74 numTable := 0
75 defer w.Flush()
76 defer file.Close()
77
78
79
80 n := f.NumPage()
81 PageLoop:
82 for pageNum := 435; pageNum <= n; pageNum++ {
83 if debugPage > 0 && pageNum != debugPage {
84 continue
85 }
86 if pageNum == 770 {
87 continue
88 }
89 if pageNum > 1495 {
90 break
91 }
92 p := f.Page(pageNum)
93 name, table := parsePage(pageNum, p, f)
94 if name == "" {
95 continue
96 }
97 if len(table) < 1 {
98 if false {
99 fmt.Fprintf(os.Stderr, "no encodings for instruction %q (page %d)\n", name, pageNum)
100 }
101 continue
102 }
103 for _, inst := range table {
104 if numTable > 0 {
105 _, err = w.WriteString(jsFix.Replace(","))
106 check(err)
107 _, err = w.WriteString("\n")
108 check(err)
109 }
110 numTable++
111 js, _ := json.Marshal(inst)
112 _, err = w.WriteString(jsFix.Replace(string(js)))
113 check(err)
114 }
115 for j, headline := range instList {
116 if name == headline {
117 instList[j] = ""
118 continue PageLoop
119 }
120 }
121 fmt.Fprintf(os.Stderr, "unexpected instruction %q (page %d)\n", name, pageNum)
122 }
123
124 _, err = w.WriteString("\n]\n")
125 check(err)
126 w.Flush()
127
128 if debugPage == 0 {
129 for _, headline := range instList {
130 if headline != "" {
131 fmt.Fprintf(os.Stderr, "missing instruction %q\n", headline)
132 }
133 }
134 }
135 }
136
137 func instHeadings(outline pdf.Outline) []string {
138 return appendInstHeadings(outline, nil)
139 }
140
// Patterns used to recognise outline titles and page text.
var (
	// instRE matches the title of the base-instruction list section.
	instRE = regexp.MustCompile(`C[\d.]+ Alphabetical list of A64 base instructions`)
	// instRE_A matches the title of the floating-point and Advanced SIMD
	// instruction list section.
	instRE_A = regexp.MustCompile(`C[\d.]+ Alphabetical list of A64 floating-point and Advanced SIMD instructions`)
	// childRE captures the text following a "C<digits and dots> " prefix.
	childRE = regexp.MustCompile(`C[\d.]+ (.+)`)
	// sectionRE matches a string that is nothing but "C" and digits/dots.
	sectionRE = regexp.MustCompile(`^C[\d.]+$`)
	// bitRE matches a sequence made only of spaces, 0, 1, (0) and (1).
	bitRE = regexp.MustCompile(`^( |[01]|\([01]\))*$`)
	// IMMRE matches "imm" followed by one or more digits.
	IMMRE = regexp.MustCompile(`^imm[\d]+$`)
)
147
148 func appendInstHeadings(outline pdf.Outline, list []string) []string {
149 if instRE.MatchString(outline.Title) || instRE_A.MatchString(outline.Title) {
150 for _, child := range outline.Child {
151 m := childRE.FindStringSubmatch(child.Title)
152 if m == nil {
153 fmt.Fprintf(os.Stderr, "cannot parse section title: %s\n", child.Title)
154 continue
155 }
156 list = append(list, m[1])
157 }
158 }
159 for _, child := range outline.Child {
160 list = appendInstHeadings(child, list)
161 }
162 return list
163 }
164
// inch is 72.0, presumably the number of PDF points per inch (TODO confirm).
// It is not referenced anywhere in the visible source; the geometry code
// spells out the literal 72 instead.
const inch = 72.0
166
167 func parsePage(num int, p pdf.Page, f *pdf.Reader) (name string, table []Inst) {
168 content := p.Content()
169 var text []pdf.Text
170 CrossTwoPage := true
171 for _, t := range content.Text {
172 text = append(text, t)
173 }
174 text = findWords(text)
175 if !(instRE.MatchString(text[1].S) || instRE_A.MatchString(text[1].S)) || len(text) == 0 || !sectionRE.MatchString(text[2].S) {
176 return "", nil
177 }
178
179 for _, t := range text {
180 if match(t, "Arial,Bold", 10, "Assembler symbols") {
181 CrossTwoPage = false
182 break
183 }
184 }
185
186 var Ncontent pdf.Content
187 Npagebox := false
188 CrossThreePage := false
189 Noffset := ""
190 if CrossTwoPage == true {
191 Np := f.Page(num + 1)
192 Ncontent = Np.Content()
193 var Ntext []pdf.Text
194 for _, t := range Ncontent.Text {
195 Ntext = append(Ntext, t)
196 }
197 Ntext = findWords(Ntext)
198 if len(Ntext) == 0 || sectionRE.MatchString(Ntext[2].S) {
199 Ntext = text[:0]
200 } else {
201 for _, t := range Ntext {
202 if match(t, "Arial,Bold", 10, "offset") {
203 Noffset = t.S
204 Npagebox = true
205 }
206
207 if match(t, "Arial,Bold", 10, "Assembler symbols") {
208 CrossThreePage = false
209 } else {
210 CrossThreePage = true
211 }
212 text = append(text, t)
213 }
214 }
215 }
216 if CrossThreePage == true {
217 NNp := f.Page(num + 2)
218 NNcontent := NNp.Content()
219 var NNtext []pdf.Text
220 for _, t := range NNcontent.Text {
221 NNtext = append(NNtext, t)
222 }
223 NNtext = findWords(NNtext)
224 if len(NNtext) == 0 || sectionRE.MatchString(NNtext[2].S) {
225 NNtext = text[:0]
226 } else {
227 for _, t := range NNtext {
228 text = append(text, t)
229 }
230 }
231 }
232
233 out := text[:0]
234 alias := ""
235 for _, t := range text {
236 if strings.Contains(t.S, "instruction is used by the alias") || strings.Contains(t.S, "instruction is an alias of") {
237 alias_t := strings.SplitAfter(t.S, ".")
238 alias = alias_t[0]
239 }
240
241 if match(t, "Arial-ItalicMT", 8, "") || match(t, "ArialMT", 8, "") {
242 if debugPage > 0 {
243 fmt.Println("==the skip page footer is:==", t)
244 }
245 continue
246 }
247
248 if match(t, "TimesNewRoman", 9, "") || match(t, "TimesNewRomanPS-ItalicMT", 9, "") {
249 if debugPage > 0 {
250 fmt.Println("==the skip body text is:==", t)
251 }
252 continue
253 }
254 out = append(out, t)
255 }
256 text = out
257
258 if len(text) == 0 || !sectionRE.MatchString(text[0].S) {
259 return "", nil
260 }
261
262 name = text[1].S
263 inst := Inst{
264 Name: name,
265 Alias: alias,
266 }
267 text = text[2:]
268
269 OffsetMark := false
270 k := 0
271 for k = 0; k < len(text); {
272 if !match(text[k], "Arial", 8, "31") {
273 k++
274 } else {
275 break
276 }
277 }
278
279 if k > 0 && match(text[k-1], "Arial,Bold", 10, "") {
280 OffsetMark = true
281 text = text[k-1:]
282 } else {
283 text = text[k:]
284 }
285
286 BitMark := false
287 bits := ""
288
289 for i := 0; i < len(text); {
290 inst.Bits = ""
291 offset := ""
292 abits := ""
293
294 if OffsetMark == true {
295 for i < len(text) && !match(text[i], "Arial", 8, "") {
296 i++
297 }
298 if i < len(text) {
299 offset = text[i-1].S
300 BitMark = false
301 bits = ""
302 } else {
303 break
304 }
305 }
306 if BitMark == false {
307 if Npagebox == true && Noffset == offset {
308 bits, i = readBitBox(name, Ncontent, text, i)
309 } else {
310 bits, i = readBitBox(name, content, text, i)
311 }
312 BitMark = true
313
314 enc := false
315 if i < len(text)-1 {
316 m := i
317 for m < len(text)-1 && !match(text[m], "Arial-BoldItalicMT", 9, "encoding") {
318 m++
319 }
320 if match(text[m], "Arial-BoldItalicMT", 9, "encoding") && m < len(text) {
321 enc = true
322 m = m + 1
323 }
324 if enc == true {
325 for m < len(text) && !match(text[m], "Arial,Bold", 10, "") && match(text[m], "LucidaSansTypewriteX", 6.48, "") {
326 if strings.Contains(text[m].S, "then SEE") {
327 inst.Code = text[m].S
328 break
329 } else {
330 m++
331 }
332 }
333 }
334 }
335 }
336
337
338 ArchLoop:
339 for i < len(text) {
340 if !match(text[i], "Arial-BoldItalicMT", 9, "variant") || match(text[i], "Arial-BoldItalicMT", 9, "encoding") {
341 i++
342 continue
343 }
344 inst.Arch = ""
345 inst.Arch += offset
346 inst.Arch += " "
347 inst.Arch += text[i].S
348 inst.Arch = strings.TrimSpace(inst.Arch)
349 i++
350
351 sign := ""
352 SynMark := false
353 for i < len(text) && match(text[i], "LucidaSansTypewriteX", 6.48, "") && SynMark == false {
354 if (strings.Contains(text[i].S, "==") || strings.Contains(text[i].S, "!=")) && SynMark == false {
355 sign = text[i].S
356 i++
357 continue
358 }
359
360 if SynMark == false {
361 SynMark = true
362 inst.Syntax = ""
363 inst.Syntax = text[i].S
364 i++
365 }
366 }
367 abits = bits
368
369 if strings.Contains(sign, "&&") {
370 split := strings.Split(sign, "&&")
371 for k := 0; k < len(split); {
372 if strings.Contains(split[k], "==") && !strings.Contains(split[k], "!") {
373 tmp := strings.Split(split[k], "==")
374 prefix := strings.TrimSpace(tmp[0])
375 value := strings.TrimSpace(tmp[1])
376 if strings.Contains(bits, prefix) && !strings.Contains(value, "x") {
377 abits = strings.Replace(abits, prefix, value, -1)
378 }
379 }
380 k++
381 }
382 } else if strings.Contains(sign, "==") && !strings.Contains(sign, "!") {
383 split := strings.Split(sign, "==")
384 prefix := strings.TrimSpace(split[0])
385 value := strings.TrimSpace(split[1])
386 if strings.Contains(bits, prefix) && !strings.Contains(value, "x") {
387 abits = strings.Replace(abits, prefix, value, -1)
388 }
389 }
390
391 if strings.Contains(inst.Syntax, "{2}") {
392 if !strings.Contains(abits, "Q") {
393 fmt.Fprintf(os.Stderr, "instruction%s - syntax%s: is wrong!!\n", name, inst.Syntax)
394 }
395 syn := inst.Syntax
396 bits := abits
397 for i := 0; i < 2; {
398 if i == 0 {
399 inst.Bits = strings.Replace(bits, "Q", "0", -1)
400 inst.Syntax = strings.Replace(syn, "{2}", "", -1)
401 table = append(table, inst)
402 }
403 if i == 1 {
404 inst.Bits = strings.Replace(bits, "Q", "1", -1)
405 inst.Syntax = strings.Replace(syn, "{2}", "2", -1)
406 table = append(table, inst)
407 }
408 i++
409 }
410 } else {
411 inst.Bits = abits
412 table = append(table, inst)
413 }
414
415 if OffsetMark == true && i < len(text) && match(text[i], "Arial-BoldItalicMT", 9, "variant") && !match(text[i], "Arial-BoldItalicMT", 9, "encoding") {
416 continue ArchLoop
417 } else {
418 break
419 }
420 }
421 }
422 return name, table
423 }
424
// readBitBox reads an encoding diagram whose labels start at text[i] and
// returns the encoding as a string together with the index of the first
// element of text it did not consume.
//
// name is used only in diagnostics. content supplies the rectangles and raw
// text of the page on which the diagram is drawn. text is the word list being
// scanned and i the position of the first 8pt Arial label.
//
// In the returned string, fields are separated by "|". A field whose label
// splits into exactly as many space-separated tokens as its computed bit
// width is written token by token; any other field is written as
// "label:width". Diagnostics go to standard error when the widths do not add
// up to the expected total.
func readBitBox(name string, content pdf.Content, text []pdf.Text, i int) (string, int) {

	// First run of 8pt Arial words sharing one Y coordinate (presumably the
	// bit-number row, since the caller positions i at the word containing
	// "31"). y3 is that row's Y and x1 the X of its first word. A value of 0
	// is used as the "not yet set" marker.
	y3 := 0.0
	x1 := 0.0
	for i < len(text) && match(text[i], "Arial", 8, "") {
		if y3 == 0 {
			y3 = text[i].Y
		}
		if x1 == 0 {
			x1 = text[i].X
		}
		if text[i].Y != y3 {
			break
		}
		i++
	}

	// Second run of 8pt Arial words: y2 is its Y, x2 the rightmost edge seen,
	// and dy1 the font size of the words on it.
	x2 := 0.0
	y2 := 0.0
	dy1 := 0.0
	for i < len(text) && match(text[i], "Arial", 8, "") {
		if x2 < text[i].X+text[i].W {
			x2 = text[i].X + text[i].W
		}
		if y2 == 0 {
			y2 = text[i].Y
		}
		if text[i].Y != y2 {
			break
		}
		dy1 = text[i].FontSize
		i++
	}

	// Remaining 8pt Arial words.
	// NOTE(review): y1 is reassigned immediately before it is compared, so
	// the break below is never taken for ordinary values and this loop
	// consumes every following 8pt Arial word; y1 ends up as the Y of the
	// last one. Confirm whether a "set once" check like the two loops above
	// was intended.
	x3 := 0.0
	y1 := 0.0
	for i < len(text) && match(text[i], "Arial", 8, "") {
		if x3 < text[i].X+text[i].W {
			x3 = text[i].X + text[i].W
		}
		y1 = text[i].Y
		if text[i].Y != y1 {
			break
		}
		i++
	}

	// below_flag records whether the third loop saw any words; if not, y1
	// falls back to y2.
	below_flag := true
	if y1 == 0.0 {
		below_flag = false
		y1 = y2
	}

	if debugPage > 0 {
		fmt.Println("encoding box", x1, y3, x2, y1)
	}

	// Find the rectangles bounding the diagram: their horizontal extent must
	// enclose [x1, x2] with at most xMargin of slack on each side. The
	// vertical window used for the bottom rectangle depends on below_flag;
	// the window for the top rectangle is the same in both branches. When
	// several rectangles qualify, the last one in content.Rect wins.
	var bottom, top pdf.Rect
	const (
		yMargin = 0.25 * 72
		xMargin = 2 * 72
	)
	cont := 0 // number of rectangles examined, for debug output only
	if below_flag == true {
		for _, r := range content.Rect {
			cont = cont + 1
			if x1-xMargin < r.Min.X && r.Min.X < x1 && x2 < r.Max.X && r.Max.X < x2+xMargin {
				if y1-yMargin < r.Min.Y && r.Min.Y < y2-dy1 {
					bottom = r
				}
				if y2+dy1 < r.Min.Y && r.Min.Y < y3+yMargin {
					top = r
				}
			}
		}
	} else {
		for _, r := range content.Rect {
			cont = cont + 1
			if x1-xMargin < r.Min.X && r.Min.X < x1 && x2 < r.Max.X && r.Max.X < x2+xMargin {
				if y1-yMargin-dy1 < r.Min.Y && r.Min.Y < y3-dy1 {
					bottom = r
				}
				if y2+dy1 < r.Min.Y && r.Min.Y < y3+yMargin {
					top = r
				}
			}
		}
	}

	if debugPage > 0 {
		fmt.Println("top", top, "bottom", bottom, "content.Rect number", cont)
	}

	// Collect the field cells: rectangles narrower than the bottom rectangle
	// (by more than ε/2) whose Min.Y lies within ε of bottom.Min.Y and whose
	// Max.Y lies within ε of top.Min.Y, ordered left to right.
	const ε = 0.5 * 72
	cont_1 := 0 // number of cells found, for debug output only
	var bars []pdf.Rect
	for _, r := range content.Rect {
		if math.Abs(r.Min.X-r.Max.X) < bottom.Max.X-bottom.Min.X-(ε/2) && math.Abs(r.Min.Y-bottom.Min.Y) < ε && math.Abs(r.Max.Y-top.Min.Y) < ε {
			cont_1 = cont_1 + 1
			bars = append(bars, r)
		}
	}
	sort.Sort(RectHorizontal(bars))
	if debugPage > 0 {
		fmt.Println("==bars number==", cont_1)
	}

	// The diagram is taken to be 32 bits wide, or 16 when the top rectangle
	// is narrower than 4*72 units. dx is the top rectangle's width and is
	// used to convert a cell width into a bit count.
	nbit := 32
	dx := top.Max.X - top.Min.X
	if top.Max.X-top.Min.X < 4*72 {
		nbit = 16
	}

	total := 0 // running sum of field widths, in bits
	var buf bytes.Buffer
	// Note: i, x1 and x2 are shadowed inside this loop; the i returned at the
	// end of the function is the text index computed above.
	for i := 0; i < len(bars); i++ {
		if i > 0 {
			fmt.Fprintf(&buf, "|")
		}
		// Gather the raw page text whose horizontal centre falls inside this
		// cell and whose Y is within dy1 of the y2 row.
		var sub []pdf.Text
		x1, x2 := bars[i].Min.X, bars[i].Max.X
		for _, t := range content.Text {
			tx := t.X + t.W/2
			ty := t.Y
			if x1 < tx && tx < x2 && y2-dy1 < ty && ty < y2+dy1 {
				sub = append(sub, t)
			}
		}
		var str []string
		for _, t := range findWords(sub) {
			str = append(str, t.S)
		}
		s := strings.Join(str, " ")
		s = strings.Replace(s, ")(", ") (", -1)

		// If the label contains "!" or "x", discard it and read the label
		// from the y1 row instead.
		if strings.Contains(s, "!") || strings.Contains(s, "x") {
			var sub1 []pdf.Text
			for _, t := range content.Text {
				tx := t.X + t.W/2
				ty := t.Y
				if x1 < tx && tx < x2 && y1-dy1 < ty && ty < y1+dy1 {
					sub1 = append(sub1, t)
				}

			}
			var str1 []string
			for _, t := range findWords(sub1) {
				str1 = append(str1, t.S)
			}
			s = strings.Join(str1, " ")
			s = strings.Replace(s, ")(", ") (", -1)
		}

		// n is the number of space-separated tokens in the label.
		n := len(strings.Fields(s))

		// b is the field width in bits: taken from the name for "imm<N>",
		// fixed at 19 for "immhi", and otherwise derived from the cell's
		// share of the diagram width, rounded to nearest. The Atoi error is
		// ignored; IMMRE guarantees the suffix is all digits.
		var b int
		if IMMRE.MatchString(s) {
			bitNum := strings.TrimPrefix(s, "imm")
			b, _ = strconv.Atoi(bitNum)
		} else if s == "immhi" {
			b = 19
		} else {
			b = int(float64(nbit)*(x2-x1)/dx + 0.5)
		}
		if n == b {
			// One token per bit: write the tokens individually.
			for k, f := range strings.Fields(s) {
				if k > 0 {
					fmt.Fprintf(&buf, "|")
				}
				fmt.Fprintf(&buf, "%s", f)
			}
		} else {
			if n != 1 {
				fmt.Fprintf(os.Stderr, "%s - multi-field %d-bit encoding: %s\n", name, n, s)
			}
			fmt.Fprintf(&buf, "%s:%d", s, b)
		}
		total += b
	}

	// Report diagrams whose field widths do not add up.
	if total != nbit || total == 0 {
		fmt.Fprintf(os.Stderr, "%s - %d-bit encoding\n", name, total)
	}
	return buf.String(), i
}
615
616 type RectHorizontal []pdf.Rect
617
618 func (x RectHorizontal) Swap(i, j int) { x[i], x[j] = x[j], x[i] }
619 func (x RectHorizontal) Less(i, j int) bool { return x[i].Min.X < x[j].Min.X }
620 func (x RectHorizontal) Len() int { return len(x) }
621
622 func checkNoEncodings(num int, text []pdf.Text) {
623 for _, t := range text {
624 if match(t, "Helvetica-Bold", 9, "Encoding") {
625 fmt.Fprintf(os.Stderr, "page %d: unexpected encoding: %s\n", num, t.S)
626 }
627 }
628 }
629
630 func match(t pdf.Text, font string, size float64, substr string) bool {
631 return t.Font == font && math.Abs(t.FontSize-size) < 0.1 && strings.Contains(t.S, substr)
632 }
633
634 func findWords(chars []pdf.Text) (words []pdf.Text) {
635
636 const nudge = 1
637 sort.Sort(pdf.TextVertical(chars))
638 old := -100000.0
639 for i, c := range chars {
640 if c.Y != old && math.Abs(old-c.Y) < nudge {
641 chars[i].Y = old
642 } else {
643 old = c.Y
644 }
645 }
646
647
648
649 sort.Sort(pdf.TextVertical(chars))
650
651
652 for i := 0; i < len(chars); {
653
654 j := i + 1
655 for j < len(chars) && chars[j].Y == chars[i].Y {
656 j++
657 }
658 var end float64
659
660 for k := i; k < j; {
661 ck := &chars[k]
662 s := ck.S
663 end = ck.X + ck.W
664 charSpace := ck.FontSize / 6
665 wordSpace := ck.FontSize * 2 / 3
666 l := k + 1
667 for l < j {
668
669 cl := &chars[l]
670 if sameFont(cl.Font, ck.Font) && math.Abs(cl.FontSize-ck.FontSize) < 0.1 && cl.X <= end+charSpace {
671 s += cl.S
672 end = cl.X + cl.W
673 l++
674 continue
675 }
676
677 if sameFont(cl.Font, ck.Font) && math.Abs(cl.FontSize-ck.FontSize) < 0.1 && cl.X <= end+wordSpace {
678 s += " " + cl.S
679 end = cl.X + cl.W
680 l++
681 continue
682 }
683 break
684 }
685 f := ck.Font
686 f = strings.TrimSuffix(f, ",Italic")
687 f = strings.TrimSuffix(f, "-Italic")
688 words = append(words, pdf.Text{f, ck.FontSize, ck.X, ck.Y, end - ck.X, s})
689 k = l
690 }
691 i = j
692 }
693
694 return words
695 }
696
// sameFont reports whether two font names should be treated as the same font
// when joining text fragments into words.
//
// Trailing ",Italic" and "-Italic" suffixes are ignored on both names. The
// fonts "Symbol" and "TimesNewRoman" are treated as compatible with every
// font.
//
// The previous version derived f2 from f1 by mistake, which made the
// comparison always succeed; f2 is now normalized from its own value.
func sameFont(f1, f2 string) bool {
	f1 = strings.TrimSuffix(f1, ",Italic")
	f1 = strings.TrimSuffix(f1, "-Italic")
	f2 = strings.TrimSuffix(f2, ",Italic")
	f2 = strings.TrimSuffix(f2, "-Italic")
	return strings.TrimSuffix(f1, ",Italic") == strings.TrimSuffix(f2, ",Italic") || f1 == "Symbol" || f2 == "Symbol" || f1 == "TimesNewRoman" || f2 == "TimesNewRoman"
}
704
// jsFix rewrites the \u003c, \u003e, \u0026 and \u0009 escape sequences in a
// JSON string to "<", ">", "&" and the two-character escape `\t`. main
// applies it to the output of json.Marshal before writing.
var jsFix = strings.NewReplacer(
	`\u003c`, `<`,
	`\u003e`, `>`,
	`\u0026`, `&`,
	`\u0009`, `\t`,
)
711
// printTable is a stub: it ignores both arguments and produces no output.
// Its only statement references strconv.Atoi without calling it.
func printTable(name string, table []Inst) {
	_ = strconv.Atoi
}
715
View as plain text