Text file
src/math/big/arith_ppc64x.s
1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build !math_big_pure_go && (ppc64 || ppc64le)
6
7#include "textflag.h"
8
9// This file provides fast assembly versions for the elementary
10// arithmetic operations on vectors implemented in arith.go.
11
12// func addVV(z, x, y []Word) (c Word)
13// z[i] = x[i] + y[i] for all i, carrying
// The sums are written to z for z_len words and the carry is chained from
// word to word in the CA bit. The carry out of the last word is returned in c
// (0 or 1). Only z_len is read; the lengths of x and y are not checked here.
14TEXT ·addVV(SB), NOSPLIT, $0
15 MOVD z_len+8(FP), R7 // R7 = z_len
16 MOVD x+24(FP), R8 // R8 = x[]
17 MOVD y+48(FP), R9 // R9 = y[]
18 MOVD z+0(FP), R10 // R10 = z[]
19
20 // If z_len = 0, we are done
21 CMP R0, R7
22 MOVD R0, R4 // R4 = c = 0 (R0 is used as the constant zero in this file)
23 BEQ done
24
25 // Process the first iteration out of the loop so we can
26 // use MOVDU and avoid 3 index registers updates.
27 MOVD 0(R8), R11 // R11 = x[0]
28 MOVD 0(R9), R12 // R12 = y[0]
29 ADD $-1, R7 // R7 = z_len - 1 = words still to process
30 ADDC R12, R11, R15 // R15 = x[0] + y[0], set CA (starts the carry chain)
31 CMP R0, R7
32 MOVD R15, 0(R10) // z[0]
33 BEQ final // If z_len was 1, we are done
34
35 SRD $2, R7, R5 // R5 = (z_len-1)/4 = number of 4-word loop iterations
36 CMP R0, R5
37 MOVD R5, CTR // Set up loop counter
38 BEQ tail // If R5 = 0, we can't use the loop
39
40 // Process 4 elements per iteration. Unrolling this loop
41 // means a performance trade-off: we will lose performance
42 // for small values of z_len (0.90x in the worst case), but
43 // gain significant performance as z_len increases (up to
44 // 1.45x).
45
46 PCALIGN $16
47loop:
48 MOVD 8(R8), R11 // R11 = x[i]
49 MOVD 16(R8), R12 // R12 = x[i+1]
50 MOVD 24(R8), R14 // R14 = x[i+2]
51 MOVDU 32(R8), R15 // R15 = x[i+3], and R8 advances by 32 bytes
52 MOVD 8(R9), R16 // R16 = y[i]
53 MOVD 16(R9), R17 // R17 = y[i+1]
54 MOVD 24(R9), R18 // R18 = y[i+2]
55 MOVDU 32(R9), R19 // R19 = y[i+3], and R9 advances by 32 bytes
56 ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
57 ADDE R12, R17, R21 // R21 = x[i+1] + y[i+1] + CA
58 ADDE R14, R18, R22 // R22 = x[i+2] + y[i+2] + CA
59 ADDE R15, R19, R23 // R23 = x[i+3] + y[i+3] + CA
60 MOVD R20, 8(R10) // z[i]
61 MOVD R21, 16(R10) // z[i+1]
62 MOVD R22, 24(R10) // z[i+2]
63 MOVDU R23, 32(R10) // z[i+3], and R10 advances by 32 bytes
64 ADD $-4, R7 // R7 -= 4 words remaining; plain ADD is used so CA stays intact
65 BC 16, 0, loop // bdnz: decrement CTR and loop while it is non-zero
66
67 // We may have more elements to read
68 CMP R0, R7
69 BEQ final
70
71 // Process the remaining elements, one at a time
72 // (at most three words are left when we get here).
72tail:
73 MOVDU 8(R8), R11 // R11 = x[i]
74 MOVDU 8(R9), R16 // R16 = y[i]
75 ADD $-1, R7 // R7 = remaining words - 1
76 ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
77 CMP R0, R7
78 MOVDU R20, 8(R10) // z[i]
79 BEQ final // If R7 = 0, we are done
80
81 MOVDU 8(R8), R11 // second leftover word: R11 = x[i]
82 MOVDU 8(R9), R16 // R16 = y[i]
83 ADD $-1, R7
84 ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
85 CMP R0, R7
86 MOVDU R20, 8(R10) // z[i]
87 BEQ final
88
89 MOVD 8(R8), R11 // third and last leftover word; no pointer update needed
90 MOVD 8(R9), R16
91 ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
92 MOVD R20, 8(R10) // z[i]
93
94final:
95 ADDZE R4 // Capture CA: R4 = 0 + CA
96
97done:
98 MOVD R4, c+72(FP) // return c
99 RET
100
101// func subVV(z, x, y []Word) (c Word)
102// z[i] = x[i] - y[i] for all i, carrying
// The differences are written to z for z_len words and the borrow is chained
// from word to word through the CA bit (CA clear means "borrow"). The borrow
// out of the last word is returned in c (0 or 1). Only z_len is read; the
// lengths of x and y are not checked here.
103TEXT ·subVV(SB), NOSPLIT, $0
104 MOVD z_len+8(FP), R7 // R7 = z_len
105 MOVD x+24(FP), R8 // R8 = x[]
106 MOVD y+48(FP), R9 // R9 = y[]
107 MOVD z+0(FP), R10 // R10 = z[]
108
109 // If z_len = 0, we are done
110 CMP R0, R7
111 MOVD R0, R4 // R4 = c = 0 (R0 is used as the constant zero in this file)
112 BEQ done
113
114 // Process the first iteration out of the loop so we can
115 // use MOVDU and avoid 3 index registers updates.
116 MOVD 0(R8), R11 // R11 = x[0]
117 MOVD 0(R9), R12 // R12 = y[0]
118 ADD $-1, R7 // R7 = z_len - 1 = words still to process
119 SUBC R12, R11, R15 // R15 = x[0] - y[0], set CA (CA = 0 when a borrow occurred)
120 CMP R0, R7
121 MOVD R15, 0(R10) // z[0]
122 BEQ final // If z_len was 1, we are done
123
124 SRD $2, R7, R5 // R5 = (z_len-1)/4 = number of 4-word loop iterations
125 CMP R0, R5
126 MOVD R5, CTR // Set up loop counter
127 BEQ tail // If R5 = 0, we can't use the loop
128
129 // Process 4 elements per iteration. Unrolling this loop
130 // means a performance trade-off: we will lose performance
131 // for small values of z_len (0.92x in the worst case), but
132 // gain significant performance as z_len increases (up to
133 // 1.45x).
134
135 PCALIGN $16
136loop:
137 MOVD 8(R8), R11 // R11 = x[i]
138 MOVD 16(R8), R12 // R12 = x[i+1]
139 MOVD 24(R8), R14 // R14 = x[i+2]
140 MOVDU 32(R8), R15 // R15 = x[i+3], and R8 advances by 32 bytes
141 MOVD 8(R9), R16 // R16 = y[i]
142 MOVD 16(R9), R17 // R17 = y[i+1]
143 MOVD 24(R9), R18 // R18 = y[i+2]
144 MOVDU 32(R9), R19 // R19 = y[i+3], and R9 advances by 32 bytes
145 SUBE R16, R11, R20 // R20 = x[i] - y[i] - borrow, where borrow = !CA
146 SUBE R17, R12, R21 // R21 = x[i+1] - y[i+1] - borrow
147 SUBE R18, R14, R22 // R22 = x[i+2] - y[i+2] - borrow
148 SUBE R19, R15, R23 // R23 = x[i+3] - y[i+3] - borrow
149 MOVD R20, 8(R10) // z[i]
150 MOVD R21, 16(R10) // z[i+1]
151 MOVD R22, 24(R10) // z[i+2]
152 MOVDU R23, 32(R10) // z[i+3], and R10 advances by 32 bytes
153 ADD $-4, R7 // R7 -= 4 words remaining; plain ADD is used so CA stays intact
154 BC 16, 0, loop // bdnz: decrement CTR and loop while it is non-zero
155
156 // We may have more elements to read
157 CMP R0, R7
158 BEQ final
159
160 // Process the remaining elements, one at a time
161 // (at most three words are left when we get here).
161tail:
162 MOVDU 8(R8), R11 // R11 = x[i]
163 MOVDU 8(R9), R16 // R16 = y[i]
164 ADD $-1, R7 // R7 = remaining words - 1
165 SUBE R16, R11, R20 // R20 = x[i] - y[i] - borrow
166 CMP R0, R7
167 MOVDU R20, 8(R10) // z[i]
168 BEQ final // If R7 = 0, we are done
169
170 MOVDU 8(R8), R11 // second leftover word: R11 = x[i]
171 MOVDU 8(R9), R16 // R16 = y[i]
172 ADD $-1, R7
173 SUBE R16, R11, R20 // R20 = x[i] - y[i] - borrow
174 CMP R0, R7
175 MOVDU R20, 8(R10) // z[i]
176 BEQ final
177
178 MOVD 8(R8), R11 // third and last leftover word; no pointer update needed
179 MOVD 8(R9), R16
180 SUBE R16, R11, R20 // R20 = x[i] - y[i] - borrow
181 MOVD R20, 8(R10) // z[i]
182
183final:
184 ADDZE R4 // R4 = 0 + CA
185 XOR $1, R4 // c = CA ^ 1: convert "no borrow" carry into the borrow flag
186
187done:
188 MOVD R4, c+72(FP) // return c
189 RET
190
191// func addVW(z, x []Word, y Word) (c Word)
// Adds the single word y to the vector x: z[0] = x[0] + y, and the carry
// is then rippled through the remaining z_len-1 words of x into z. The
// final carry is returned in c. When z_len is 0 nothing is written and
// c is y itself.
192TEXT ·addVW(SB), NOSPLIT, $0
193 MOVD z+0(FP), R10 // R10 = z[]
194 MOVD x+24(FP), R8 // R8 = x[]
195 MOVD y+48(FP), R4 // R4 = y = c
196 MOVD z_len+8(FP), R11 // R11 = z_len
197
198 CMP R0, R11 // If z_len is zero, return
199 BEQ done
200
201 // We will process the first iteration out of the loop so we capture
202 // the value of c. In the subsequent iterations, we will rely on the
203 // value of CA set here.
204 MOVD 0(R8), R20 // R20 = x[i]
205 ADD $-1, R11 // R11 = z_len - 1
206 ADDC R20, R4, R6 // R6 = x[i] + c
207 CMP R0, R11 // If z_len was 1, we are done
208 MOVD R6, 0(R10) // z[i]
209 BEQ final
210
211 // We will read 4 elements per iteration
212 SRD $2, R11, R9 // R9 = (z_len-1)/4 = number of 4-word loop iterations
213 DCBT (R8) // cache-touch hint for the data at x
214 CMP R0, R9
215 MOVD R9, CTR // Set up the loop counter
216 BEQ tail // If R9 = 0, we can't use the loop
217 PCALIGN $16
218
219loop:
220 MOVD 8(R8), R20 // R20 = x[i]
221 MOVD 16(R8), R21 // R21 = x[i+1]
222 MOVD 24(R8), R22 // R22 = x[i+2]
223 MOVDU 32(R8), R23 // R23 = x[i+3]
224 ADDZE R20, R24 // R24 = x[i] + CA
225 ADDZE R21, R25 // R25 = x[i+1] + CA
226 ADDZE R22, R26 // R26 = x[i+2] + CA
227 ADDZE R23, R27 // R27 = x[i+3] + CA
228 MOVD R24, 8(R10) // z[i]
229 MOVD R25, 16(R10) // z[i+1]
230 MOVD R26, 24(R10) // z[i+2]
231 MOVDU R27, 32(R10) // z[i+3]
232 ADD $-4, R11 // R11 -= 4 words remaining; plain ADD is used so CA stays intact
233 BC 16, 0, loop // bdnz
234
235 // We may have some elements to read
236 CMP R0, R11
237 BEQ final
238
239tail:
240 MOVDU 8(R8), R20 // first leftover word: R20 = x[i]
241 ADDZE R20, R24 // R24 = x[i] + CA
242 ADD $-1, R11
243 MOVDU R24, 8(R10) // z[i]
244 CMP R0, R11
245 BEQ final
246
247 MOVDU 8(R8), R20 // second leftover word
248 ADDZE R20, R24 // R24 = x[i] + CA
249 ADD $-1, R11
250 MOVDU R24, 8(R10) // z[i]
251 CMP R0, R11
252 BEQ final
253
254 MOVD 8(R8), R20 // third and last leftover word
255 ADDZE R20, R24 // R24 = x[i] + CA
256 MOVD R24, 8(R10) // z[i]
257
258final:
259 ADDZE R0, R4 // c = CA
260done:
261 MOVD R4, c+56(FP) // return c (still y if z_len was 0)
262 RET
263
264// func subVW(z, x []Word, y Word) (c Word)
// Subtracts the single word y from the vector x: z[0] = x[0] - y, and the
// borrow is then rippled through the remaining z_len-1 words of x into z.
// The final borrow is returned in c. When z_len is 0 nothing is written
// and c is y itself.
265TEXT ·subVW(SB), NOSPLIT, $0
266 MOVD z+0(FP), R10 // R10 = z[]
267 MOVD x+24(FP), R8 // R8 = x[]
268 MOVD y+48(FP), R4 // R4 = y = c
269 MOVD z_len+8(FP), R11 // R11 = z_len
270
271 CMP R0, R11 // If z_len is zero, return
272 BEQ done
273
274 // We will process the first iteration out of the loop so we capture
275 // the value of c. In the subsequent iterations, we will rely on the
276 // value of CA set here.
277 MOVD 0(R8), R20 // R20 = x[i]
278 ADD $-1, R11 // R11 = z_len - 1
279 SUBC R4, R20, R6 // R6 = x[i] - c
280 CMP R0, R11 // If z_len was 1, we are done
281 MOVD R6, 0(R10) // z[i]
282 BEQ final
283
284 // We will read 4 elements per iteration
285 SRD $2, R11, R9 // R9 = (z_len-1)/4 = number of 4-word loop iterations
286 DCBT (R8) // cache-touch hint for the data at x
287 CMP R0, R9
288 MOVD R9, CTR // Set up the loop counter
289 BEQ tail // If R9 = 0, we can't use the loop
290
291 // The loop here is almost the same as the one used in s390x, but
292 // we don't need to capture CA every iteration because we've already
293 // done that above.
294
295 PCALIGN $16
296loop:
297 MOVD 8(R8), R20 // R20 = x[i]
298 MOVD 16(R8), R21 // R21 = x[i+1]
299 MOVD 24(R8), R22 // R22 = x[i+2]
300 MOVDU 32(R8), R23 // R23 = x[i+3], and R8 advances by 32 bytes
301 SUBE R0, R20 // R20 = x[i] - 0 - borrow, where borrow = !CA
302 SUBE R0, R21 // R21 = x[i+1] - borrow
303 SUBE R0, R22 // R22 = x[i+2] - borrow
304 SUBE R0, R23 // R23 = x[i+3] - borrow
305 MOVD R20, 8(R10) // z[i]
306 MOVD R21, 16(R10) // z[i+1]
307 MOVD R22, 24(R10) // z[i+2]
308 MOVDU R23, 32(R10) // z[i+3], and R10 advances by 32 bytes
309 ADD $-4, R11 // R11 -= 4 words remaining; plain ADD is used so CA stays intact
310 BC 16, 0, loop // bdnz
311
312 // We may have some elements to read
313 CMP R0, R11
314 BEQ final
315
316tail:
317 MOVDU 8(R8), R20 // first leftover word: R20 = x[i]
318 SUBE R0, R20 // R20 = x[i] - borrow
319 ADD $-1, R11
320 MOVDU R20, 8(R10) // z[i]
321 CMP R0, R11
322 BEQ final
323
324 MOVDU 8(R8), R20 // second leftover word
325 SUBE R0, R20 // R20 = x[i] - borrow
326 ADD $-1, R11
327 MOVDU R20, 8(R10) // z[i]
328 CMP R0, R11
329 BEQ final
330
331 MOVD 8(R8), R20 // third and last leftover word
332 SUBE R0, R20 // R20 = x[i] - borrow
333 MOVD R20, 8(R10) // z[i]
334
335final:
336 // Capture CA
337 SUBE R4, R4 // R4 = R4 - R4 - borrow = CA - 1: 0 if no borrow, -1 if borrow
338 NEG R4, R4 // c = 0 or 1
339
340done:
341 MOVD R4, c+56(FP) // return c (still y if z_len was 0)
342 RET
343
344//func shlVU(z, x []Word, s uint) (c Word)
// Shifts the vector x left by s bits into z, over len(z) words, working
// from the most significant word down. c receives the bits shifted out of
// the top word, x[len(z)-1]>>(64-s). When s == 0 the function instead copies
// min(len(x), len(z)) words from x to z (choosing a backward copy when the
// destination overlaps the source from above) and returns c = 0.
// The s == 0 path returns without touching memory when there is nothing to
// copy; the copy loops below always move at least one word once entered.
345TEXT ·shlVU(SB), NOSPLIT, $0
346 MOVD z+0(FP), R3 // R3 = z[]
347 MOVD x+24(FP), R6 // R6 = x[]
348 MOVD s+48(FP), R9 // R9 = s
349 MOVD z_len+8(FP), R4 // R4 = z_len
350 MOVD x_len+32(FP), R7 // R7 = x_len (only consulted on the zeroshift path)
351 CMP R9, R0 // s==0 copy(z,x)
352 BEQ zeroshift
353 CMP R4, R0 // len(z)==0 return
354 BEQ done
355
356 ADD $-1, R4, R5 // len(z)-1
357 SUBC R9, $64, R4 // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
358 SLD $3, R5, R7 // R7 = byte offset of element len(z)-1
359 ADD R6, R7, R15 // save starting address &x[len(z)-1]
360 ADD R3, R7, R16 // save starting address &z[len(z)-1]
361 MOVD (R6)(R7), R14 // R14 = x[len(z)-1]
362 SRD R4, R14, R7 // compute x[len(z)-1]>>ŝ into R7
363 CMP R5, R0 // iterate from i=len(z)-1 to 0
364 BEQ loopexit // Already at end?
365 MOVD 0(R15),R10 // x[i]
366 PCALIGN $16
367shloop:
368 SLD R9, R10, R10 // x[i]<<s
369 MOVDU -8(R15), R14 // R14 = x[i-1], and R15 steps back one word
370 SRD R4, R14, R11 // x[i-1]>>ŝ
371 OR R11, R10, R10
372 MOVD R10, 0(R16) // z[i]=x[i]<<s | x[i-1]>>ŝ
373 MOVD R14, R10 // reuse x[i-1] for next iteration
374 ADD $-8, R16 // i--
375 CMP R15, R6 // &x[i-1]>&x[0]?
376 BGT shloop
377loopexit:
378 MOVD 0(R6), R4
379 SLD R9, R4, R4
380 MOVD R4, 0(R3) // z[0]=x[0]<<s
381 MOVD R7, c+56(FP) // store pre-computed x[len(z)-1]>>ŝ into c
382 RET
383
384zeroshift:
385 CMP R6, R0 // x is null, nothing to copy
386 BEQ done
387 CMP R6, R3 // if x is same as z, nothing to copy
388 BEQ done
389 CMP R7, R4
390 ISEL $0, R7, R4, R7 // Take the lower bound of lengths of x,z
 CMP R7, R0 // nothing to copy when min(len(x), len(z)) == 0;
 BEQ done // without this the copy loop would still move one word
391 SLD $3, R7, R7 // R7 = number of bytes to copy
392 SUB R6, R3, R11 // dest - src
393 CMPU R11, R7, CR2 // < len?
394 BLT CR2, backward // there is overlap, copy backwards
395 MOVD $0, R14 // R14 = byte offset, counting up from 0
396 // shlVU processes backwards, but added a forward copy option
397 // since its faster on POWER
398repeat:
399 MOVD (R6)(R14), R15 // Copy 8 bytes at a time
400 MOVD R15, (R3)(R14)
401 ADD $8, R14
402 CMP R14, R7 // More 8 bytes left?
403 BLT repeat
404 BR done
405backward:
406 ADD $-8,R7, R14 // R14 = byte offset of the last word to copy
407repeatback:
408 MOVD (R6)(R14), R15 // copy x into z backwards
409 MOVD R15, (R3)(R14) // copy 8 bytes at a time
410 SUB $8, R14
411 CMP R14, $-8 // More 8 bytes left?
412 BGT repeatback
413
414done:
415 MOVD R0, c+56(FP) // c=0
416 RET
417
418//func shrVU(z, x []Word, s uint) (c Word)
// Shifts the vector x right by s bits into z, over len(z) words, working
// from the least significant word up. c receives the bits shifted out of
// the bottom word, x[0]<<(64-s). Two words per iteration are handled with
// vector instructions when len(z) >= 3. When s == 0 the function instead
// copies min(len(x), len(z)) words from x to z (forwards) and returns c = 0.
// The s == 0 path returns without touching memory when there is nothing to
// copy; the copy loop below always moves at least one word once entered.
419TEXT ·shrVU(SB), NOSPLIT, $0
420 MOVD z+0(FP), R3 // R3 = z[]
421 MOVD x+24(FP), R6 // R6 = x[]
422 MOVD s+48(FP), R9 // R9 = s
423 MOVD z_len+8(FP), R4 // R4 = z_len
424 MOVD x_len+32(FP), R7 // R7 = x_len (only consulted on the zeroshift path)
425
426 CMP R9, R0 // s==0, copy(z,x)
427 BEQ zeroshift
428 CMP R4, R0 // len(z)==0 return
429 BEQ done
430 SUBC R9, $64, R5 // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
431
432 MOVD 0(R6), R7 // R7 = x[0]
433 SLD R5, R7, R7 // compute x[0]<<ŝ
434 MOVD $1, R8 // iterate from i=1 to i<len(z)
435 CMP R8, R4
436 BGE loopexit // Already at end?
437
438 // vectorize if len(z) is >=3, else jump to the scalar step
439 CMP R4, $3
440 BLT scalar
441 MTVSRD R9, VS38 // s
442 VSPLTB $7, V6, V4 // V4 = shift count s in every byte (V6 is the register written as VS38)
443 MTVSRD R5, VS39 // ŝ
444 VSPLTB $7, V7, V2 // V2 = shift count ŝ in every byte (V7 is the register written as VS39)
445 ADD $-2, R4, R16 // R16 = len(z)-2, last i at which a full pair still fits
446 PCALIGN $16
447loopback:
448 ADD $-1, R8, R10
449 SLD $3, R10 // R10 = byte offset of element i-1
450 LXVD2X (R6)(R10), VS32 // load x[i-1], x[i]
451 SLD $3, R8, R12 // R12 = byte offset of element i
452 LXVD2X (R6)(R12), VS33 // load x[i], x[i+1]
453
454 VSRD V0, V4, V3 // x[i-1]>>s, x[i]>>s
455 VSLD V1, V2, V5 // x[i]<<ŝ, x[i+1]<<ŝ
456 VOR V3, V5, V5 // Or(|) the two registers together
457 STXVD2X VS37, (R3)(R10) // store into z[i-1] and z[i]
458 ADD $2, R8 // Done processing 2 entries, i and i+1
459 CMP R8, R16 // Are there at least a couple of more entries left?
460 BLE loopback
461 CMP R8, R4 // Are we at the last element?
462 BEQ loopexit
463scalar:
464 ADD $-1, R8, R10
465 SLD $3, R10 // R10 = byte offset of element i-1 (= len(z)-2 here)
466 MOVD (R6)(R10),R11
467 SRD R9, R11, R11 // x[len(z)-2] >> s
468 SLD $3, R8, R12
469 MOVD (R6)(R12), R12
470 SLD R5, R12, R12 // x[len(z)-1]<<ŝ
471 OR R12, R11, R11 // x[len(z)-2]>>s | x[len(z)-1]<<ŝ
472 MOVD R11, (R3)(R10) // z[len(z)-2]=x[len(z)-2]>>s | x[len(z)-1]<<ŝ
473loopexit:
474 ADD $-1, R4
475 SLD $3, R4 // R4 = byte offset of element len(z)-1
476 MOVD (R6)(R4), R5
477 SRD R9, R5, R5 // x[len(z)-1]>>s
478 MOVD R5, (R3)(R4) // z[len(z)-1]=x[len(z)-1]>>s
479 MOVD R7, c+56(FP) // store pre-computed x[0]<<ŝ into c
480 RET
481
482zeroshift:
483 CMP R6, R0 // x is null, nothing to copy
484 BEQ done
485 CMP R6, R3 // if x is same as z, nothing to copy
486 BEQ done
487 CMP R7, R4
488 ISEL $0, R7, R4, R7 // Take the lower bounds of lengths of x, z
 CMP R7, R0 // nothing to copy when min(len(x), len(z)) == 0;
 BEQ done // without this the copy loop would still move one word
489 SLD $3, R7, R7 // R7 = number of bytes to copy
490 MOVD $0, R14 // R14 = byte offset, counting up from 0
491repeat:
492 MOVD (R6)(R14), R15 // copy 8 bytes at a time
493 MOVD R15, (R3)(R14) // shrVU processes bytes only forwards
494 ADD $8, R14
495 CMP R14, R7 // More 8 bytes left?
496 BLT repeat
497done:
498 MOVD R0, c+56(FP) // c=0
499 RET
500
501// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// For each of the z_len words: the 128-bit product x[i]*y plus the running
// carry (initially r) is formed; its low word is stored in z[i] and its high
// word becomes the carry into the next word. The final carry is returned in
// c. When z_len is 0 nothing is written and c is r itself.
502TEXT ·mulAddVWW(SB), NOSPLIT, $0
503 MOVD z+0(FP), R10 // R10 = z[]
504 MOVD x+24(FP), R8 // R8 = x[]
505 MOVD y+48(FP), R9 // R9 = y
506 MOVD r+56(FP), R4 // R4 = r = c
507 MOVD z_len+8(FP), R11 // R11 = z_len
508
509 CMP R0, R11 // If z_len is zero, return c = r
510 BEQ done
511
512 MOVD 0(R8), R20 // R20 = x[0]; first word is handled outside the loop
513 ADD $-1, R11 // R11 = z_len - 1
514 MULLD R9, R20, R6 // R6 = z0 = Low-order(x[i]*y)
515 MULHDU R9, R20, R7 // R7 = z1 = High-order(x[i]*y)
516 ADDC R4, R6 // R6 = z0 + r
517 ADDZE R7 // R7 = z1 + CA
518 CMP R0, R11
519 MOVD R7, R4 // R4 = c
520 MOVD R6, 0(R10) // z[i]
521 BEQ done // If z_len was 1, we are done
522
523 // We will read 4 elements per iteration
524 SRD $2, R11, R14 // R14 = (z_len-1)/4 = number of 4-word loop iterations
525 DCBT (R8) // cache-touch hint for the data at x
526 CMP R0, R14
527 MOVD R14, CTR // Set up the loop counter
528 BEQ tail // If R14 = 0, we can't use the loop
529 PCALIGN $16
530
531loop:
532 MOVD 8(R8), R20 // R20 = x[i]
533 MOVD 16(R8), R21 // R21 = x[i+1]
534 MOVD 24(R8), R22 // R22 = x[i+2]
535 MOVDU 32(R8), R23 // R23 = x[i+3]
536 MULLD R9, R20, R24 // R24 = z0[i]
537 MULHDU R9, R20, R20 // R20 = z1[i]
538 ADDC R4, R24 // R24 = z0[i] + c
539 ADDZE R20 // R20 = z1[i] + CA
540 MULLD R9, R21, R25 // R25 = z0[i+1]
541 MULHDU R9, R21, R21 // R21 = z1[i+1]
542 ADDC R20, R25 // R25 = z0[i+1] + z1[i]
543 ADDZE R21 // R21 = z1[i+1] + CA
544 MULLD R9, R22, R26 // R26 = z0[i+2]
545 MULHDU R9, R22, R22 // R22 = z1[i+2]
546 MULLD R9, R23, R27 // R27 = z0[i+3]
547 MULHDU R9, R23, R23 // R23 = z1[i+3]
548 ADDC R21, R26 // R26 = z0[i+2] + z1[i+1]
549 ADDZE R22 // R22 = z1[i+2] + CA
550 MOVD R24, 8(R10) // z[i]
551 MOVD R25, 16(R10) // z[i+1]
552 ADDC R22, R27 // R27 = z0[i+3] + z1[i+2]
553 ADDZE R23,R4 // update carry: R4 = z1[i+3] + CA
554 MOVD R26, 24(R10) // z[i+2]
555 MOVDU R27, 32(R10) // z[i+3]
556 ADD $-4, R11 // R11 -= 4 words remaining
557 BC 16, 0, loop // bdnz
558
559 // We may have some elements to read
560 CMP R0, R11
561 BEQ done
562
563 // Process the remaining elements, one at a time
564tail:
565 MOVDU 8(R8), R20 // R20 = x[i]
566 MULLD R9, R20, R24 // R24 = z0[i]
567 MULHDU R9, R20, R25 // R25 = z1[i]
568 ADD $-1, R11 // R11 = remaining words - 1
569 ADDC R4, R24 // R24 = z0[i] + c
570 ADDZE R25 // R25 = z1[i] + CA
571 MOVDU R24, 8(R10) // z[i]
572 CMP R0, R11
573 MOVD R25, R4 // R4 = c
574 BEQ done // If R11 = 0, we are done
575
576 MOVDU 8(R8), R20 // second leftover word: R20 = x[i]
577 MULLD R9, R20, R24 // R24 = z0[i]
578 MULHDU R9, R20, R25 // R25 = z1[i]
579 ADD $-1, R11
580 ADDC R4, R24 // R24 = z0[i] + c
581 ADDZE R25 // R25 = z1[i] + CA
582 MOVDU R24, 8(R10) // z[i]
583 CMP R0, R11
584 MOVD R25, R4 // R4 = c
585 BEQ done
586
587 MOVD 8(R8), R20 // third and last leftover word: R20 = x[i]
588 MULLD R9, R20, R24 // R24 = z0[i]
589 MULHDU R9, R20, R25 // R25 = z1[i]
590 ADD $-1, R11 // R11 is not read again in this function after this point
591 ADDC R4, R24 // R24 = z0[i] + c
592 ADDZE R25 // R25 = z1[i] + CA
593 MOVD R24, 8(R10) // z[i]
594 MOVD R25, R4 // R4 = c
595
596done:
597 MOVD R4, c+64(FP) // return c (still r if z_len was 0)
598 RET
599
600// func addMulVVW(z, x []Word, y Word) (c Word)
// For each of the z_len words: z[i] + x[i]*y + c is formed (c starts at 0);
// the low word is stored back into z[i] and the high word becomes the carry
// into the next word. The final carry is returned in c.
601TEXT ·addMulVVW(SB), NOSPLIT, $0
602 MOVD z+0(FP), R10 // R10 = z[]
603 MOVD x+24(FP), R8 // R8 = x[]
604 MOVD y+48(FP), R9 // R9 = y
605 MOVD z_len+8(FP), R22 // R22 = z_len
606
607 MOVD R0, R3 // R3 will be the index register
608 CMP R0, R22 // z_len == 0? (result is consumed by the BEQ below)
609 MOVD R0, R4 // R4 = c = 0
610 MOVD R22, CTR // Initialize loop counter
611 BEQ done
612 PCALIGN $16
613
614loop:
615 MOVD (R8)(R3), R20 // Load x[i]
616 MOVD (R10)(R3), R21 // Load z[i]
617 MULLD R9, R20, R6 // R6 = Low-order(x[i]*y)
618 MULHDU R9, R20, R7 // R7 = High-order(x[i]*y)
619 ADDC R21, R6 // R6 = z0 = Low-order(x[i]*y) + z[i]
620 ADDZE R7 // R7 = z1 = High-order(x[i]*y) + CA
621 ADDC R4, R6 // R6 = z0 + c + 0
622 ADDZE R7, R4 // c = z1 + CA
623 MOVD R6, (R10)(R3) // Store z[i]
624 ADD $8, R3 // advance the byte index to the next word
625 BC 16, 0, loop // bdnz
626
627done:
628 MOVD R4, c+56(FP) // return c
629 RET
630
631
View as plain text