1 // Copyright 2014 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // This file implements multi-precision floating-point numbers. 6 // Like in the GNU MPFR library (https://www.mpfr.org/), operands 7 // can be of mixed precision. Unlike MPFR, the rounding mode is 8 // not specified with each operation, but with each operand. The 9 // rounding mode of the result operand determines the rounding 10 // mode of an operation. This is a from-scratch implementation. 11 12 package big 13 14 import ( 15 "fmt" 16 "math" 17 "math/bits" 18 ) 19 20 const debugFloat = false // enable for debugging 21 22 // A nonzero finite Float represents a multi-precision floating point number 23 // 24 // sign × mantissa × 2**exponent 25 // 26 // with 0.5 <= mantissa < 1.0, and MinExp <= exponent <= MaxExp. 27 // A Float may also be zero (+0, -0) or infinite (+Inf, -Inf). 28 // All Floats are ordered, and the ordering of two Floats x and y 29 // is defined by x.Cmp(y). 30 // 31 // Each Float value also has a precision, rounding mode, and accuracy. 32 // The precision is the maximum number of mantissa bits available to 33 // represent the value. The rounding mode specifies how a result should 34 // be rounded to fit into the mantissa bits, and accuracy describes the 35 // rounding error with respect to the exact result. 36 // 37 // Unless specified otherwise, all operations (including setters) that 38 // specify a *Float variable for the result (usually via the receiver 39 // with the exception of [Float.MantExp]), round the numeric result according 40 // to the precision and rounding mode of the result variable. 41 // 42 // If the provided result precision is 0 (see below), it is set to the 43 // precision of the argument with the largest precision value before any 44 // rounding takes place, and the rounding mode remains unchanged. Thus, 45 // uninitialized Floats provided as result arguments will have their 46 // precision set to a reasonable value determined by the operands, and 47 // their mode is the zero value for RoundingMode (ToNearestEven). 48 // 49 // By setting the desired precision to 24 or 53 and using matching rounding 50 // mode (typically [ToNearestEven]), Float operations produce the same results 51 // as the corresponding float32 or float64 IEEE-754 arithmetic for operands 52 // that correspond to normal (i.e., not denormal) float32 or float64 numbers. 53 // Exponent underflow and overflow lead to a 0 or an Infinity for different 54 // values than IEEE-754 because Float exponents have a much larger range. 55 // 56 // The zero (uninitialized) value for a Float is ready to use and represents 57 // the number +0.0 exactly, with precision 0 and rounding mode [ToNearestEven]. 58 // 59 // Operations always take pointer arguments (*Float) rather 60 // than Float values, and each unique Float value requires 61 // its own unique *Float pointer. To "copy" a Float value, 62 // an existing (or newly allocated) Float must be set to 63 // a new value using the [Float.Set] method; shallow copies 64 // of Floats are not supported and may lead to errors. 65 type Float struct { 66 prec uint32 67 mode RoundingMode 68 acc Accuracy 69 form form 70 neg bool 71 mant nat 72 exp int32 73 } 74 75 // An ErrNaN panic is raised by a [Float] operation that would lead to 76 // a NaN under IEEE-754 rules. An ErrNaN implements the error interface. 77 type ErrNaN struct { 78 msg string 79 } 80 81 func (err ErrNaN) Error() string { 82 return err.msg 83 } 84 85 // NewFloat allocates and returns a new [Float] set to x, 86 // with precision 53 and rounding mode [ToNearestEven]. 87 // NewFloat panics with [ErrNaN] if x is a NaN. 88 func NewFloat(x float64) *Float { 89 if math.IsNaN(x) { 90 panic(ErrNaN{"NewFloat(NaN)"}) 91 } 92 return new(Float).SetFloat64(x) 93 } 94 95 // Exponent and precision limits. 96 const ( 97 MaxExp = math.MaxInt32 // largest supported exponent 98 MinExp = math.MinInt32 // smallest supported exponent 99 MaxPrec = math.MaxUint32 // largest (theoretically) supported precision; likely memory-limited 100 ) 101 102 // Internal representation: The mantissa bits x.mant of a nonzero finite 103 // Float x are stored in a nat slice long enough to hold up to x.prec bits; 104 // the slice may (but doesn't have to) be shorter if the mantissa contains 105 // trailing 0 bits. x.mant is normalized if the msb of x.mant == 1 (i.e., 106 // the msb is shifted all the way "to the left"). Thus, if the mantissa has 107 // trailing 0 bits or x.prec is not a multiple of the Word size _W, 108 // x.mant[0] has trailing zero bits. The msb of the mantissa corresponds 109 // to the value 0.5; the exponent x.exp shifts the binary point as needed. 110 // 111 // A zero or non-finite Float x ignores x.mant and x.exp. 112 // 113 // x form neg mant exp 114 // ---------------------------------------------------------- 115 // ±0 zero sign - - 116 // 0 < |x| < +Inf finite sign mantissa exponent 117 // ±Inf inf sign - - 118 119 // A form value describes the internal representation. 120 type form byte 121 122 // The form value order is relevant - do not change! 123 const ( 124 zero form = iota 125 finite 126 inf 127 ) 128 129 // RoundingMode determines how a [Float] value is rounded to the 130 // desired precision. Rounding may change the [Float] value; the 131 // rounding error is described by the [Float]'s [Accuracy]. 132 type RoundingMode byte 133 134 // These constants define supported rounding modes. 135 const ( 136 ToNearestEven RoundingMode = iota // == IEEE 754-2008 roundTiesToEven 137 ToNearestAway // == IEEE 754-2008 roundTiesToAway 138 ToZero // == IEEE 754-2008 roundTowardZero 139 AwayFromZero // no IEEE 754-2008 equivalent 140 ToNegativeInf // == IEEE 754-2008 roundTowardNegative 141 ToPositiveInf // == IEEE 754-2008 roundTowardPositive 142 ) 143 144 //go:generate stringer -type=RoundingMode 145 146 // Accuracy describes the rounding error produced by the most recent 147 // operation that generated a [Float] value, relative to the exact value. 148 type Accuracy int8 149 150 // Constants describing the [Accuracy] of a [Float]. 151 const ( 152 Below Accuracy = -1 153 Exact Accuracy = 0 154 Above Accuracy = +1 155 ) 156 157 //go:generate stringer -type=Accuracy 158 159 // SetPrec sets z's precision to prec and returns the (possibly) rounded 160 // value of z. Rounding occurs according to z's rounding mode if the mantissa 161 // cannot be represented in prec bits without loss of precision. 162 // SetPrec(0) maps all finite values to ±0; infinite values remain unchanged. 163 // If prec > [MaxPrec], it is set to [MaxPrec]. 164 func (z *Float) SetPrec(prec uint) *Float { 165 z.acc = Exact // optimistically assume no rounding is needed 166 167 // special case 168 if prec == 0 { 169 z.prec = 0 170 if z.form == finite { 171 // truncate z to 0 172 z.acc = makeAcc(z.neg) 173 z.form = zero 174 } 175 return z 176 } 177 178 // general case 179 if prec > MaxPrec { 180 prec = MaxPrec 181 } 182 old := z.prec 183 z.prec = uint32(prec) 184 if z.prec < old { 185 z.round(0) 186 } 187 return z 188 } 189 190 func makeAcc(above bool) Accuracy { 191 if above { 192 return Above 193 } 194 return Below 195 } 196 197 // SetMode sets z's rounding mode to mode and returns an exact z. 198 // z remains unchanged otherwise. 199 // z.SetMode(z.Mode()) is a cheap way to set z's accuracy to [Exact]. 200 func (z *Float) SetMode(mode RoundingMode) *Float { 201 z.mode = mode 202 z.acc = Exact 203 return z 204 } 205 206 // Prec returns the mantissa precision of x in bits. 207 // The result may be 0 for |x| == 0 and |x| == Inf. 208 func (x *Float) Prec() uint { 209 return uint(x.prec) 210 } 211 212 // MinPrec returns the minimum precision required to represent x exactly 213 // (i.e., the smallest prec before x.SetPrec(prec) would start rounding x). 214 // The result is 0 for |x| == 0 and |x| == Inf. 215 func (x *Float) MinPrec() uint { 216 if x.form != finite { 217 return 0 218 } 219 return uint(len(x.mant))*_W - x.mant.trailingZeroBits() 220 } 221 222 // Mode returns the rounding mode of x. 223 func (x *Float) Mode() RoundingMode { 224 return x.mode 225 } 226 227 // Acc returns the accuracy of x produced by the most recent 228 // operation, unless explicitly documented otherwise by that 229 // operation. 230 func (x *Float) Acc() Accuracy { 231 return x.acc 232 } 233 234 // Sign returns: 235 // 236 // -1 if x < 0 237 // 0 if x is ±0 238 // +1 if x > 0 239 func (x *Float) Sign() int { 240 if debugFloat { 241 x.validate() 242 } 243 if x.form == zero { 244 return 0 245 } 246 if x.neg { 247 return -1 248 } 249 return 1 250 } 251 252 // MantExp breaks x into its mantissa and exponent components 253 // and returns the exponent. If a non-nil mant argument is 254 // provided its value is set to the mantissa of x, with the 255 // same precision and rounding mode as x. The components 256 // satisfy x == mant × 2**exp, with 0.5 <= |mant| < 1.0. 257 // Calling MantExp with a nil argument is an efficient way to 258 // get the exponent of the receiver. 259 // 260 // Special cases are: 261 // 262 // ( ±0).MantExp(mant) = 0, with mant set to ±0 263 // (±Inf).MantExp(mant) = 0, with mant set to ±Inf 264 // 265 // x and mant may be the same in which case x is set to its 266 // mantissa value. 267 func (x *Float) MantExp(mant *Float) (exp int) { 268 if debugFloat { 269 x.validate() 270 } 271 if x.form == finite { 272 exp = int(x.exp) 273 } 274 if mant != nil { 275 mant.Copy(x) 276 if mant.form == finite { 277 mant.exp = 0 278 } 279 } 280 return 281 } 282 283 func (z *Float) setExpAndRound(exp int64, sbit uint) { 284 if exp < MinExp { 285 // underflow 286 z.acc = makeAcc(z.neg) 287 z.form = zero 288 return 289 } 290 291 if exp > MaxExp { 292 // overflow 293 z.acc = makeAcc(!z.neg) 294 z.form = inf 295 return 296 } 297 298 z.form = finite 299 z.exp = int32(exp) 300 z.round(sbit) 301 } 302 303 // SetMantExp sets z to mant × 2**exp and returns z. 304 // The result z has the same precision and rounding mode 305 // as mant. SetMantExp is an inverse of [Float.MantExp] but does 306 // not require 0.5 <= |mant| < 1.0. Specifically, for a 307 // given x of type *[Float], SetMantExp relates to [Float.MantExp] 308 // as follows: 309 // 310 // mant := new(Float) 311 // new(Float).SetMantExp(mant, x.MantExp(mant)).Cmp(x) == 0 312 // 313 // Special cases are: 314 // 315 // z.SetMantExp( ±0, exp) = ±0 316 // z.SetMantExp(±Inf, exp) = ±Inf 317 // 318 // z and mant may be the same in which case z's exponent 319 // is set to exp. 320 func (z *Float) SetMantExp(mant *Float, exp int) *Float { 321 if debugFloat { 322 z.validate() 323 mant.validate() 324 } 325 z.Copy(mant) 326 327 if z.form == finite { 328 // 0 < |mant| < +Inf 329 z.setExpAndRound(int64(z.exp)+int64(exp), 0) 330 } 331 return z 332 } 333 334 // Signbit reports whether x is negative or negative zero. 335 func (x *Float) Signbit() bool { 336 return x.neg 337 } 338 339 // IsInf reports whether x is +Inf or -Inf. 340 func (x *Float) IsInf() bool { 341 return x.form == inf 342 } 343 344 // IsInt reports whether x is an integer. 345 // ±Inf values are not integers. 346 func (x *Float) IsInt() bool { 347 if debugFloat { 348 x.validate() 349 } 350 // special cases 351 if x.form != finite { 352 return x.form == zero 353 } 354 // x.form == finite 355 if x.exp <= 0 { 356 return false 357 } 358 // x.exp > 0 359 return x.prec <= uint32(x.exp) || x.MinPrec() <= uint(x.exp) // not enough bits for fractional mantissa 360 } 361 362 // debugging support 363 func (x *Float) validate() { 364 if !debugFloat { 365 // avoid performance bugs 366 panic("validate called but debugFloat is not set") 367 } 368 if msg := x.validate0(); msg != "" { 369 panic(msg) 370 } 371 } 372 373 func (x *Float) validate0() string { 374 if x.form != finite { 375 return "" 376 } 377 m := len(x.mant) 378 if m == 0 { 379 return "nonzero finite number with empty mantissa" 380 } 381 const msb = 1 << (_W - 1) 382 if x.mant[m-1]&msb == 0 { 383 return fmt.Sprintf("msb not set in last word %#x of %s", x.mant[m-1], x.Text('p', 0)) 384 } 385 if x.prec == 0 { 386 return "zero precision finite number" 387 } 388 return "" 389 } 390 391 // round rounds z according to z.mode to z.prec bits and sets z.acc accordingly. 392 // sbit must be 0 or 1 and summarizes any "sticky bit" information one might 393 // have before calling round. z's mantissa must be normalized (with the msb set) 394 // or empty. 395 // 396 // CAUTION: The rounding modes ToNegativeInf, ToPositiveInf are affected by the 397 // sign of z. For correct rounding, the sign of z must be set correctly before 398 // calling round. 399 func (z *Float) round(sbit uint) { 400 if debugFloat { 401 z.validate() 402 } 403 404 z.acc = Exact 405 if z.form != finite { 406 // ±0 or ±Inf => nothing left to do 407 return 408 } 409 // z.form == finite && len(z.mant) > 0 410 // m > 0 implies z.prec > 0 (checked by validate) 411 412 m := uint32(len(z.mant)) // present mantissa length in words 413 bits := m * _W // present mantissa bits; bits > 0 414 if bits <= z.prec { 415 // mantissa fits => nothing to do 416 return 417 } 418 // bits > z.prec 419 420 // Rounding is based on two bits: the rounding bit (rbit) and the 421 // sticky bit (sbit). The rbit is the bit immediately before the 422 // z.prec leading mantissa bits (the "0.5"). The sbit is set if any 423 // of the bits before the rbit are set (the "0.25", "0.125", etc.): 424 // 425 // rbit sbit => "fractional part" 426 // 427 // 0 0 == 0 428 // 0 1 > 0 , < 0.5 429 // 1 0 == 0.5 430 // 1 1 > 0.5, < 1.0 431 432 // bits > z.prec: mantissa too large => round 433 r := uint(bits - z.prec - 1) // rounding bit position; r >= 0 434 rbit := z.mant.bit(r) & 1 // rounding bit; be safe and ensure it's a single bit 435 // The sticky bit is only needed for rounding ToNearestEven 436 // or when the rounding bit is zero. Avoid computation otherwise. 437 if sbit == 0 && (rbit == 0 || z.mode == ToNearestEven) { 438 sbit = z.mant.sticky(r) 439 } 440 sbit &= 1 // be safe and ensure it's a single bit 441 442 // cut off extra words 443 n := (z.prec + (_W - 1)) / _W // mantissa length in words for desired precision 444 if m > n { 445 copy(z.mant, z.mant[m-n:]) // move n last words to front 446 z.mant = z.mant[:n] 447 } 448 449 // determine number of trailing zero bits (ntz) and compute lsb mask of mantissa's least-significant word 450 ntz := n*_W - z.prec // 0 <= ntz < _W 451 lsb := Word(1) << ntz 452 453 // round if result is inexact 454 if rbit|sbit != 0 { 455 // Make rounding decision: The result mantissa is truncated ("rounded down") 456 // by default. Decide if we need to increment, or "round up", the (unsigned) 457 // mantissa. 458 inc := false 459 switch z.mode { 460 case ToNegativeInf: 461 inc = z.neg 462 case ToZero: 463 // nothing to do 464 case ToNearestEven: 465 inc = rbit != 0 && (sbit != 0 || z.mant[0]&lsb != 0) 466 case ToNearestAway: 467 inc = rbit != 0 468 case AwayFromZero: 469 inc = true 470 case ToPositiveInf: 471 inc = !z.neg 472 default: 473 panic("unreachable") 474 } 475 476 // A positive result (!z.neg) is Above the exact result if we increment, 477 // and it's Below if we truncate (Exact results require no rounding). 478 // For a negative result (z.neg) it is exactly the opposite. 479 z.acc = makeAcc(inc != z.neg) 480 481 if inc { 482 // add 1 to mantissa 483 if addVW(z.mant, z.mant, lsb) != 0 { 484 // mantissa overflow => adjust exponent 485 if z.exp >= MaxExp { 486 // exponent overflow 487 z.form = inf 488 return 489 } 490 z.exp++ 491 // adjust mantissa: divide by 2 to compensate for exponent adjustment 492 shrVU(z.mant, z.mant, 1) 493 // set msb == carry == 1 from the mantissa overflow above 494 const msb = 1 << (_W - 1) 495 z.mant[n-1] |= msb 496 } 497 } 498 } 499 500 // zero out trailing bits in least-significant word 501 z.mant[0] &^= lsb - 1 502 503 if debugFloat { 504 z.validate() 505 } 506 } 507 508 func (z *Float) setBits64(neg bool, x uint64) *Float { 509 if z.prec == 0 { 510 z.prec = 64 511 } 512 z.acc = Exact 513 z.neg = neg 514 if x == 0 { 515 z.form = zero 516 return z 517 } 518 // x != 0 519 z.form = finite 520 s := bits.LeadingZeros64(x) 521 z.mant = z.mant.setUint64(x << uint(s)) 522 z.exp = int32(64 - s) // always fits 523 if z.prec < 64 { 524 z.round(0) 525 } 526 return z 527 } 528 529 // SetUint64 sets z to the (possibly rounded) value of x and returns z. 530 // If z's precision is 0, it is changed to 64 (and rounding will have 531 // no effect). 532 func (z *Float) SetUint64(x uint64) *Float { 533 return z.setBits64(false, x) 534 } 535 536 // SetInt64 sets z to the (possibly rounded) value of x and returns z. 537 // If z's precision is 0, it is changed to 64 (and rounding will have 538 // no effect). 539 func (z *Float) SetInt64(x int64) *Float { 540 u := x 541 if u < 0 { 542 u = -u 543 } 544 // We cannot simply call z.SetUint64(uint64(u)) and change 545 // the sign afterwards because the sign affects rounding. 546 return z.setBits64(x < 0, uint64(u)) 547 } 548 549 // SetFloat64 sets z to the (possibly rounded) value of x and returns z. 550 // If z's precision is 0, it is changed to 53 (and rounding will have 551 // no effect). SetFloat64 panics with [ErrNaN] if x is a NaN. 552 func (z *Float) SetFloat64(x float64) *Float { 553 if z.prec == 0 { 554 z.prec = 53 555 } 556 if math.IsNaN(x) { 557 panic(ErrNaN{"Float.SetFloat64(NaN)"}) 558 } 559 z.acc = Exact 560 z.neg = math.Signbit(x) // handle -0, -Inf correctly 561 if x == 0 { 562 z.form = zero 563 return z 564 } 565 if math.IsInf(x, 0) { 566 z.form = inf 567 return z 568 } 569 // normalized x != 0 570 z.form = finite 571 fmant, exp := math.Frexp(x) // get normalized mantissa 572 z.mant = z.mant.setUint64(1<<63 | math.Float64bits(fmant)<<11) 573 z.exp = int32(exp) // always fits 574 if z.prec < 53 { 575 z.round(0) 576 } 577 return z 578 } 579 580 // fnorm normalizes mantissa m by shifting it to the left 581 // such that the msb of the most-significant word (msw) is 1. 582 // It returns the shift amount. It assumes that len(m) != 0. 583 func fnorm(m nat) int64 { 584 if debugFloat && (len(m) == 0 || m[len(m)-1] == 0) { 585 panic("msw of mantissa is 0") 586 } 587 s := nlz(m[len(m)-1]) 588 if s > 0 { 589 c := shlVU(m, m, s) 590 if debugFloat && c != 0 { 591 panic("nlz or shlVU incorrect") 592 } 593 } 594 return int64(s) 595 } 596 597 // SetInt sets z to the (possibly rounded) value of x and returns z. 598 // If z's precision is 0, it is changed to the larger of x.BitLen() 599 // or 64 (and rounding will have no effect). 600 func (z *Float) SetInt(x *Int) *Float { 601 // TODO(gri) can be more efficient if z.prec > 0 602 // but small compared to the size of x, or if there 603 // are many trailing 0's. 604 bits := uint32(x.BitLen()) 605 if z.prec == 0 { 606 z.prec = umax32(bits, 64) 607 } 608 z.acc = Exact 609 z.neg = x.neg 610 if len(x.abs) == 0 { 611 z.form = zero 612 return z 613 } 614 // x != 0 615 z.mant = z.mant.set(x.abs) 616 fnorm(z.mant) 617 z.setExpAndRound(int64(bits), 0) 618 return z 619 } 620 621 // SetRat sets z to the (possibly rounded) value of x and returns z. 622 // If z's precision is 0, it is changed to the largest of a.BitLen(), 623 // b.BitLen(), or 64; with x = a/b. 624 func (z *Float) SetRat(x *Rat) *Float { 625 if x.IsInt() { 626 return z.SetInt(x.Num()) 627 } 628 var a, b Float 629 a.SetInt(x.Num()) 630 b.SetInt(x.Denom()) 631 if z.prec == 0 { 632 z.prec = umax32(a.prec, b.prec) 633 } 634 return z.Quo(&a, &b) 635 } 636 637 // SetInf sets z to the infinite Float -Inf if signbit is 638 // set, or +Inf if signbit is not set, and returns z. The 639 // precision of z is unchanged and the result is always 640 // [Exact]. 641 func (z *Float) SetInf(signbit bool) *Float { 642 z.acc = Exact 643 z.form = inf 644 z.neg = signbit 645 return z 646 } 647 648 // Set sets z to the (possibly rounded) value of x and returns z. 649 // If z's precision is 0, it is changed to the precision of x 650 // before setting z (and rounding will have no effect). 651 // Rounding is performed according to z's precision and rounding 652 // mode; and z's accuracy reports the result error relative to the 653 // exact (not rounded) result. 654 func (z *Float) Set(x *Float) *Float { 655 if debugFloat { 656 x.validate() 657 } 658 z.acc = Exact 659 if z != x { 660 z.form = x.form 661 z.neg = x.neg 662 if x.form == finite { 663 z.exp = x.exp 664 z.mant = z.mant.set(x.mant) 665 } 666 if z.prec == 0 { 667 z.prec = x.prec 668 } else if z.prec < x.prec { 669 z.round(0) 670 } 671 } 672 return z 673 } 674 675 // Copy sets z to x, with the same precision, rounding mode, and 676 // accuracy as x, and returns z. x is not changed even if z and 677 // x are the same. 678 func (z *Float) Copy(x *Float) *Float { 679 if debugFloat { 680 x.validate() 681 } 682 if z != x { 683 z.prec = x.prec 684 z.mode = x.mode 685 z.acc = x.acc 686 z.form = x.form 687 z.neg = x.neg 688 if z.form == finite { 689 z.mant = z.mant.set(x.mant) 690 z.exp = x.exp 691 } 692 } 693 return z 694 } 695 696 // msb32 returns the 32 most significant bits of x. 697 func msb32(x nat) uint32 { 698 i := len(x) - 1 699 if i < 0 { 700 return 0 701 } 702 if debugFloat && x[i]&(1<<(_W-1)) == 0 { 703 panic("x not normalized") 704 } 705 switch _W { 706 case 32: 707 return uint32(x[i]) 708 case 64: 709 return uint32(x[i] >> 32) 710 } 711 panic("unreachable") 712 } 713 714 // msb64 returns the 64 most significant bits of x. 715 func msb64(x nat) uint64 { 716 i := len(x) - 1 717 if i < 0 { 718 return 0 719 } 720 if debugFloat && x[i]&(1<<(_W-1)) == 0 { 721 panic("x not normalized") 722 } 723 switch _W { 724 case 32: 725 v := uint64(x[i]) << 32 726 if i > 0 { 727 v |= uint64(x[i-1]) 728 } 729 return v 730 case 64: 731 return uint64(x[i]) 732 } 733 panic("unreachable") 734 } 735 736 // Uint64 returns the unsigned integer resulting from truncating x 737 // towards zero. If 0 <= x <= math.MaxUint64, the result is [Exact] 738 // if x is an integer and [Below] otherwise. 739 // The result is (0, [Above]) for x < 0, and ([math.MaxUint64], [Below]) 740 // for x > [math.MaxUint64]. 741 func (x *Float) Uint64() (uint64, Accuracy) { 742 if debugFloat { 743 x.validate() 744 } 745 746 switch x.form { 747 case finite: 748 if x.neg { 749 return 0, Above 750 } 751 // 0 < x < +Inf 752 if x.exp <= 0 { 753 // 0 < x < 1 754 return 0, Below 755 } 756 // 1 <= x < Inf 757 if x.exp <= 64 { 758 // u = trunc(x) fits into a uint64 759 u := msb64(x.mant) >> (64 - uint32(x.exp)) 760 if x.MinPrec() <= 64 { 761 return u, Exact 762 } 763 return u, Below // x truncated 764 } 765 // x too large 766 return math.MaxUint64, Below 767 768 case zero: 769 return 0, Exact 770 771 case inf: 772 if x.neg { 773 return 0, Above 774 } 775 return math.MaxUint64, Below 776 } 777 778 panic("unreachable") 779 } 780 781 // Int64 returns the integer resulting from truncating x towards zero. 782 // If [math.MinInt64] <= x <= [math.MaxInt64], the result is [Exact] if x is 783 // an integer, and [Above] (x < 0) or [Below] (x > 0) otherwise. 784 // The result is ([math.MinInt64], [Above]) for x < [math.MinInt64], 785 // and ([math.MaxInt64], [Below]) for x > [math.MaxInt64]. 786 func (x *Float) Int64() (int64, Accuracy) { 787 if debugFloat { 788 x.validate() 789 } 790 791 switch x.form { 792 case finite: 793 // 0 < |x| < +Inf 794 acc := makeAcc(x.neg) 795 if x.exp <= 0 { 796 // 0 < |x| < 1 797 return 0, acc 798 } 799 // x.exp > 0 800 801 // 1 <= |x| < +Inf 802 if x.exp <= 63 { 803 // i = trunc(x) fits into an int64 (excluding math.MinInt64) 804 i := int64(msb64(x.mant) >> (64 - uint32(x.exp))) 805 if x.neg { 806 i = -i 807 } 808 if x.MinPrec() <= uint(x.exp) { 809 return i, Exact 810 } 811 return i, acc // x truncated 812 } 813 if x.neg { 814 // check for special case x == math.MinInt64 (i.e., x == -(0.5 << 64)) 815 if x.exp == 64 && x.MinPrec() == 1 { 816 acc = Exact 817 } 818 return math.MinInt64, acc 819 } 820 // x too large 821 return math.MaxInt64, Below 822 823 case zero: 824 return 0, Exact 825 826 case inf: 827 if x.neg { 828 return math.MinInt64, Above 829 } 830 return math.MaxInt64, Below 831 } 832 833 panic("unreachable") 834 } 835 836 // Float32 returns the float32 value nearest to x. If x is too small to be 837 // represented by a float32 (|x| < [math.SmallestNonzeroFloat32]), the result 838 // is (0, [Below]) or (-0, [Above]), respectively, depending on the sign of x. 839 // If x is too large to be represented by a float32 (|x| > [math.MaxFloat32]), 840 // the result is (+Inf, [Above]) or (-Inf, [Below]), depending on the sign of x. 841 func (x *Float) Float32() (float32, Accuracy) { 842 if debugFloat { 843 x.validate() 844 } 845 846 switch x.form { 847 case finite: 848 // 0 < |x| < +Inf 849 850 const ( 851 fbits = 32 // float size 852 mbits = 23 // mantissa size (excluding implicit msb) 853 ebits = fbits - mbits - 1 // 8 exponent size 854 bias = 1<<(ebits-1) - 1 // 127 exponent bias 855 dmin = 1 - bias - mbits // -149 smallest unbiased exponent (denormal) 856 emin = 1 - bias // -126 smallest unbiased exponent (normal) 857 emax = bias // 127 largest unbiased exponent (normal) 858 ) 859 860 // Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float32 mantissa. 861 e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0 862 863 // Compute precision p for float32 mantissa. 864 // If the exponent is too small, we have a denormal number before 865 // rounding and fewer than p mantissa bits of precision available 866 // (the exponent remains fixed but the mantissa gets shifted right). 867 p := mbits + 1 // precision of normal float 868 if e < emin { 869 // recompute precision 870 p = mbits + 1 - emin + int(e) 871 // If p == 0, the mantissa of x is shifted so much to the right 872 // that its msb falls immediately to the right of the float32 873 // mantissa space. In other words, if the smallest denormal is 874 // considered "1.0", for p == 0, the mantissa value m is >= 0.5. 875 // If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal. 876 // If m == 0.5, it is rounded down to even, i.e., 0.0. 877 // If p < 0, the mantissa value m is <= "0.25" which is never rounded up. 878 if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ { 879 // underflow to ±0 880 if x.neg { 881 var z float32 882 return -z, Above 883 } 884 return 0.0, Below 885 } 886 // otherwise, round up 887 // We handle p == 0 explicitly because it's easy and because 888 // Float.round doesn't support rounding to 0 bits of precision. 889 if p == 0 { 890 if x.neg { 891 return -math.SmallestNonzeroFloat32, Below 892 } 893 return math.SmallestNonzeroFloat32, Above 894 } 895 } 896 // p > 0 897 898 // round 899 var r Float 900 r.prec = uint32(p) 901 r.Set(x) 902 e = r.exp - 1 903 904 // Rounding may have caused r to overflow to ±Inf 905 // (rounding never causes underflows to 0). 906 // If the exponent is too large, also overflow to ±Inf. 907 if r.form == inf || e > emax { 908 // overflow 909 if x.neg { 910 return float32(math.Inf(-1)), Below 911 } 912 return float32(math.Inf(+1)), Above 913 } 914 // e <= emax 915 916 // Determine sign, biased exponent, and mantissa. 917 var sign, bexp, mant uint32 918 if x.neg { 919 sign = 1 << (fbits - 1) 920 } 921 922 // Rounding may have caused a denormal number to 923 // become normal. Check again. 924 if e < emin { 925 // denormal number: recompute precision 926 // Since rounding may have at best increased precision 927 // and we have eliminated p <= 0 early, we know p > 0. 928 // bexp == 0 for denormals 929 p = mbits + 1 - emin + int(e) 930 mant = msb32(r.mant) >> uint(fbits-p) 931 } else { 932 // normal number: emin <= e <= emax 933 bexp = uint32(e+bias) << mbits 934 mant = msb32(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit) 935 } 936 937 return math.Float32frombits(sign | bexp | mant), r.acc 938 939 case zero: 940 if x.neg { 941 var z float32 942 return -z, Exact 943 } 944 return 0.0, Exact 945 946 case inf: 947 if x.neg { 948 return float32(math.Inf(-1)), Exact 949 } 950 return float32(math.Inf(+1)), Exact 951 } 952 953 panic("unreachable") 954 } 955 956 // Float64 returns the float64 value nearest to x. If x is too small to be 957 // represented by a float64 (|x| < [math.SmallestNonzeroFloat64]), the result 958 // is (0, [Below]) or (-0, [Above]), respectively, depending on the sign of x. 959 // If x is too large to be represented by a float64 (|x| > [math.MaxFloat64]), 960 // the result is (+Inf, [Above]) or (-Inf, [Below]), depending on the sign of x. 961 func (x *Float) Float64() (float64, Accuracy) { 962 if debugFloat { 963 x.validate() 964 } 965 966 switch x.form { 967 case finite: 968 // 0 < |x| < +Inf 969 970 const ( 971 fbits = 64 // float size 972 mbits = 52 // mantissa size (excluding implicit msb) 973 ebits = fbits - mbits - 1 // 11 exponent size 974 bias = 1<<(ebits-1) - 1 // 1023 exponent bias 975 dmin = 1 - bias - mbits // -1074 smallest unbiased exponent (denormal) 976 emin = 1 - bias // -1022 smallest unbiased exponent (normal) 977 emax = bias // 1023 largest unbiased exponent (normal) 978 ) 979 980 // Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float64 mantissa. 981 e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0 982 983 // Compute precision p for float64 mantissa. 984 // If the exponent is too small, we have a denormal number before 985 // rounding and fewer than p mantissa bits of precision available 986 // (the exponent remains fixed but the mantissa gets shifted right). 987 p := mbits + 1 // precision of normal float 988 if e < emin { 989 // recompute precision 990 p = mbits + 1 - emin + int(e) 991 // If p == 0, the mantissa of x is shifted so much to the right 992 // that its msb falls immediately to the right of the float64 993 // mantissa space. In other words, if the smallest denormal is 994 // considered "1.0", for p == 0, the mantissa value m is >= 0.5. 995 // If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal. 996 // If m == 0.5, it is rounded down to even, i.e., 0.0. 997 // If p < 0, the mantissa value m is <= "0.25" which is never rounded up. 998 if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ { 999 // underflow to ±0 1000 if x.neg { 1001 var z float64 1002 return -z, Above 1003 } 1004 return 0.0, Below 1005 } 1006 // otherwise, round up 1007 // We handle p == 0 explicitly because it's easy and because 1008 // Float.round doesn't support rounding to 0 bits of precision. 1009 if p == 0 { 1010 if x.neg { 1011 return -math.SmallestNonzeroFloat64, Below 1012 } 1013 return math.SmallestNonzeroFloat64, Above 1014 } 1015 } 1016 // p > 0 1017 1018 // round 1019 var r Float 1020 r.prec = uint32(p) 1021 r.Set(x) 1022 e = r.exp - 1 1023 1024 // Rounding may have caused r to overflow to ±Inf 1025 // (rounding never causes underflows to 0). 1026 // If the exponent is too large, also overflow to ±Inf. 1027 if r.form == inf || e > emax { 1028 // overflow 1029 if x.neg { 1030 return math.Inf(-1), Below 1031 } 1032 return math.Inf(+1), Above 1033 } 1034 // e <= emax 1035 1036 // Determine sign, biased exponent, and mantissa. 1037 var sign, bexp, mant uint64 1038 if x.neg { 1039 sign = 1 << (fbits - 1) 1040 } 1041 1042 // Rounding may have caused a denormal number to 1043 // become normal. Check again. 1044 if e < emin { 1045 // denormal number: recompute precision 1046 // Since rounding may have at best increased precision 1047 // and we have eliminated p <= 0 early, we know p > 0. 1048 // bexp == 0 for denormals 1049 p = mbits + 1 - emin + int(e) 1050 mant = msb64(r.mant) >> uint(fbits-p) 1051 } else { 1052 // normal number: emin <= e <= emax 1053 bexp = uint64(e+bias) << mbits 1054 mant = msb64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit) 1055 } 1056 1057 return math.Float64frombits(sign | bexp | mant), r.acc 1058 1059 case zero: 1060 if x.neg { 1061 var z float64 1062 return -z, Exact 1063 } 1064 return 0.0, Exact 1065 1066 case inf: 1067 if x.neg { 1068 return math.Inf(-1), Exact 1069 } 1070 return math.Inf(+1), Exact 1071 } 1072 1073 panic("unreachable") 1074 } 1075 1076 // Int returns the result of truncating x towards zero; 1077 // or nil if x is an infinity. 1078 // The result is [Exact] if x.IsInt(); otherwise it is [Below] 1079 // for x > 0, and [Above] for x < 0. 1080 // If a non-nil *[Int] argument z is provided, [Int] stores 1081 // the result in z instead of allocating a new [Int]. 1082 func (x *Float) Int(z *Int) (*Int, Accuracy) { 1083 if debugFloat { 1084 x.validate() 1085 } 1086 1087 if z == nil && x.form <= finite { 1088 z = new(Int) 1089 } 1090 1091 switch x.form { 1092 case finite: 1093 // 0 < |x| < +Inf 1094 acc := makeAcc(x.neg) 1095 if x.exp <= 0 { 1096 // 0 < |x| < 1 1097 return z.SetInt64(0), acc 1098 } 1099 // x.exp > 0 1100 1101 // 1 <= |x| < +Inf 1102 // determine minimum required precision for x 1103 allBits := uint(len(x.mant)) * _W 1104 exp := uint(x.exp) 1105 if x.MinPrec() <= exp { 1106 acc = Exact 1107 } 1108 // shift mantissa as needed 1109 if z == nil { 1110 z = new(Int) 1111 } 1112 z.neg = x.neg 1113 switch { 1114 case exp > allBits: 1115 z.abs = z.abs.shl(x.mant, exp-allBits) 1116 default: 1117 z.abs = z.abs.set(x.mant) 1118 case exp < allBits: 1119 z.abs = z.abs.shr(x.mant, allBits-exp) 1120 } 1121 return z, acc 1122 1123 case zero: 1124 return z.SetInt64(0), Exact 1125 1126 case inf: 1127 return nil, makeAcc(x.neg) 1128 } 1129 1130 panic("unreachable") 1131 } 1132 1133 // Rat returns the rational number corresponding to x; 1134 // or nil if x is an infinity. 1135 // The result is [Exact] if x is not an Inf. 1136 // If a non-nil *[Rat] argument z is provided, [Rat] stores 1137 // the result in z instead of allocating a new [Rat]. 1138 func (x *Float) Rat(z *Rat) (*Rat, Accuracy) { 1139 if debugFloat { 1140 x.validate() 1141 } 1142 1143 if z == nil && x.form <= finite { 1144 z = new(Rat) 1145 } 1146 1147 switch x.form { 1148 case finite: 1149 // 0 < |x| < +Inf 1150 allBits := int32(len(x.mant)) * _W 1151 // build up numerator and denominator 1152 z.a.neg = x.neg 1153 switch { 1154 case x.exp > allBits: 1155 z.a.abs = z.a.abs.shl(x.mant, uint(x.exp-allBits)) 1156 z.b.abs = z.b.abs[:0] // == 1 (see Rat) 1157 // z already in normal form 1158 default: 1159 z.a.abs = z.a.abs.set(x.mant) 1160 z.b.abs = z.b.abs[:0] // == 1 (see Rat) 1161 // z already in normal form 1162 case x.exp < allBits: 1163 z.a.abs = z.a.abs.set(x.mant) 1164 t := z.b.abs.setUint64(1) 1165 z.b.abs = t.shl(t, uint(allBits-x.exp)) 1166 z.norm() 1167 } 1168 return z, Exact 1169 1170 case zero: 1171 return z.SetInt64(0), Exact 1172 1173 case inf: 1174 return nil, makeAcc(x.neg) 1175 } 1176 1177 panic("unreachable") 1178 } 1179 1180 // Abs sets z to the (possibly rounded) value |x| (the absolute value of x) 1181 // and returns z. 1182 func (z *Float) Abs(x *Float) *Float { 1183 z.Set(x) 1184 z.neg = false 1185 return z 1186 } 1187 1188 // Neg sets z to the (possibly rounded) value of x with its sign negated, 1189 // and returns z. 1190 func (z *Float) Neg(x *Float) *Float { 1191 z.Set(x) 1192 z.neg = !z.neg 1193 return z 1194 } 1195 1196 func validateBinaryOperands(x, y *Float) { 1197 if !debugFloat { 1198 // avoid performance bugs 1199 panic("validateBinaryOperands called but debugFloat is not set") 1200 } 1201 if len(x.mant) == 0 { 1202 panic("empty mantissa for x") 1203 } 1204 if len(y.mant) == 0 { 1205 panic("empty mantissa for y") 1206 } 1207 } 1208 1209 // z = x + y, ignoring signs of x and y for the addition 1210 // but using the sign of z for rounding the result. 1211 // x and y must have a non-empty mantissa and valid exponent. 1212 func (z *Float) uadd(x, y *Float) { 1213 // Note: This implementation requires 2 shifts most of the 1214 // time. It is also inefficient if exponents or precisions 1215 // differ by wide margins. The following article describes 1216 // an efficient (but much more complicated) implementation 1217 // compatible with the internal representation used here: 1218 // 1219 // Vincent Lefèvre: "The Generic Multiple-Precision Floating- 1220 // Point Addition With Exact Rounding (as in the MPFR Library)" 1221 // http://www.vinc17.net/research/papers/rnc6.pdf 1222 1223 if debugFloat { 1224 validateBinaryOperands(x, y) 1225 } 1226 1227 // compute exponents ex, ey for mantissa with "binary point" 1228 // on the right (mantissa.0) - use int64 to avoid overflow 1229 ex := int64(x.exp) - int64(len(x.mant))*_W 1230 ey := int64(y.exp) - int64(len(y.mant))*_W 1231 1232 al := alias(z.mant, x.mant) || alias(z.mant, y.mant) 1233 1234 // TODO(gri) having a combined add-and-shift primitive 1235 // could make this code significantly faster 1236 switch { 1237 case ex < ey: 1238 if al { 1239 t := nat(nil).shl(y.mant, uint(ey-ex)) 1240 z.mant = z.mant.add(x.mant, t) 1241 } else { 1242 z.mant = z.mant.shl(y.mant, uint(ey-ex)) 1243 z.mant = z.mant.add(x.mant, z.mant) 1244 } 1245 default: 1246 // ex == ey, no shift needed 1247 z.mant = z.mant.add(x.mant, y.mant) 1248 case ex > ey: 1249 if al { 1250 t := nat(nil).shl(x.mant, uint(ex-ey)) 1251 z.mant = z.mant.add(t, y.mant) 1252 } else { 1253 z.mant = z.mant.shl(x.mant, uint(ex-ey)) 1254 z.mant = z.mant.add(z.mant, y.mant) 1255 } 1256 ex = ey 1257 } 1258 // len(z.mant) > 0 1259 1260 z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0) 1261 } 1262 1263 // z = x - y for |x| > |y|, ignoring signs of x and y for the subtraction 1264 // but using the sign of z for rounding the result. 1265 // x and y must have a non-empty mantissa and valid exponent. 1266 func (z *Float) usub(x, y *Float) { 1267 // This code is symmetric to uadd. 1268 // We have not factored the common code out because 1269 // eventually uadd (and usub) should be optimized 1270 // by special-casing, and the code will diverge. 1271 1272 if debugFloat { 1273 validateBinaryOperands(x, y) 1274 } 1275 1276 ex := int64(x.exp) - int64(len(x.mant))*_W 1277 ey := int64(y.exp) - int64(len(y.mant))*_W 1278 1279 al := alias(z.mant, x.mant) || alias(z.mant, y.mant) 1280 1281 switch { 1282 case ex < ey: 1283 if al { 1284 t := nat(nil).shl(y.mant, uint(ey-ex)) 1285 z.mant = t.sub(x.mant, t) 1286 } else { 1287 z.mant = z.mant.shl(y.mant, uint(ey-ex)) 1288 z.mant = z.mant.sub(x.mant, z.mant) 1289 } 1290 default: 1291 // ex == ey, no shift needed 1292 z.mant = z.mant.sub(x.mant, y.mant) 1293 case ex > ey: 1294 if al { 1295 t := nat(nil).shl(x.mant, uint(ex-ey)) 1296 z.mant = t.sub(t, y.mant) 1297 } else { 1298 z.mant = z.mant.shl(x.mant, uint(ex-ey)) 1299 z.mant = z.mant.sub(z.mant, y.mant) 1300 } 1301 ex = ey 1302 } 1303 1304 // operands may have canceled each other out 1305 if len(z.mant) == 0 { 1306 z.acc = Exact 1307 z.form = zero 1308 z.neg = false 1309 return 1310 } 1311 // len(z.mant) > 0 1312 1313 z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0) 1314 } 1315 1316 // z = x * y, ignoring signs of x and y for the multiplication 1317 // but using the sign of z for rounding the result. 1318 // x and y must have a non-empty mantissa and valid exponent. 1319 func (z *Float) umul(x, y *Float) { 1320 if debugFloat { 1321 validateBinaryOperands(x, y) 1322 } 1323 1324 // Note: This is doing too much work if the precision 1325 // of z is less than the sum of the precisions of x 1326 // and y which is often the case (e.g., if all floats 1327 // have the same precision). 1328 // TODO(gri) Optimize this for the common case. 1329 1330 e := int64(x.exp) + int64(y.exp) 1331 if x == y { 1332 z.mant = z.mant.sqr(x.mant) 1333 } else { 1334 z.mant = z.mant.mul(x.mant, y.mant) 1335 } 1336 z.setExpAndRound(e-fnorm(z.mant), 0) 1337 } 1338 1339 // z = x / y, ignoring signs of x and y for the division 1340 // but using the sign of z for rounding the result. 1341 // x and y must have a non-empty mantissa and valid exponent. 1342 func (z *Float) uquo(x, y *Float) { 1343 if debugFloat { 1344 validateBinaryOperands(x, y) 1345 } 1346 1347 // mantissa length in words for desired result precision + 1 1348 // (at least one extra bit so we get the rounding bit after 1349 // the division) 1350 n := int(z.prec/_W) + 1 1351 1352 // compute adjusted x.mant such that we get enough result precision 1353 xadj := x.mant 1354 if d := n - len(x.mant) + len(y.mant); d > 0 { 1355 // d extra words needed => add d "0 digits" to x 1356 xadj = make(nat, len(x.mant)+d) 1357 copy(xadj[d:], x.mant) 1358 } 1359 // TODO(gri): If we have too many digits (d < 0), we should be able 1360 // to shorten x for faster division. But we must be extra careful 1361 // with rounding in that case. 1362 1363 // Compute d before division since there may be aliasing of x.mant 1364 // (via xadj) or y.mant with z.mant. 1365 d := len(xadj) - len(y.mant) 1366 1367 // divide 1368 var r nat 1369 z.mant, r = z.mant.div(nil, xadj, y.mant) 1370 e := int64(x.exp) - int64(y.exp) - int64(d-len(z.mant))*_W 1371 1372 // The result is long enough to include (at least) the rounding bit. 1373 // If there's a non-zero remainder, the corresponding fractional part 1374 // (if it were computed), would have a non-zero sticky bit (if it were 1375 // zero, it couldn't have a non-zero remainder). 1376 var sbit uint 1377 if len(r) > 0 { 1378 sbit = 1 1379 } 1380 1381 z.setExpAndRound(e-fnorm(z.mant), sbit) 1382 } 1383 1384 // ucmp returns -1, 0, or +1, depending on whether 1385 // |x| < |y|, |x| == |y|, or |x| > |y|. 1386 // x and y must have a non-empty mantissa and valid exponent. 1387 func (x *Float) ucmp(y *Float) int { 1388 if debugFloat { 1389 validateBinaryOperands(x, y) 1390 } 1391 1392 switch { 1393 case x.exp < y.exp: 1394 return -1 1395 case x.exp > y.exp: 1396 return +1 1397 } 1398 // x.exp == y.exp 1399 1400 // compare mantissas 1401 i := len(x.mant) 1402 j := len(y.mant) 1403 for i > 0 || j > 0 { 1404 var xm, ym Word 1405 if i > 0 { 1406 i-- 1407 xm = x.mant[i] 1408 } 1409 if j > 0 { 1410 j-- 1411 ym = y.mant[j] 1412 } 1413 switch { 1414 case xm < ym: 1415 return -1 1416 case xm > ym: 1417 return +1 1418 } 1419 } 1420 1421 return 0 1422 } 1423 1424 // Handling of sign bit as defined by IEEE 754-2008, section 6.3: 1425 // 1426 // When neither the inputs nor result are NaN, the sign of a product or 1427 // quotient is the exclusive OR of the operands’ signs; the sign of a sum, 1428 // or of a difference x−y regarded as a sum x+(−y), differs from at most 1429 // one of the addends’ signs; and the sign of the result of conversions, 1430 // the quantize operation, the roundToIntegral operations, and the 1431 // roundToIntegralExact (see 5.3.1) is the sign of the first or only operand. 1432 // These rules shall apply even when operands or results are zero or infinite. 1433 // 1434 // When the sum of two operands with opposite signs (or the difference of 1435 // two operands with like signs) is exactly zero, the sign of that sum (or 1436 // difference) shall be +0 in all rounding-direction attributes except 1437 // roundTowardNegative; under that attribute, the sign of an exact zero 1438 // sum (or difference) shall be −0. However, x+x = x−(−x) retains the same 1439 // sign as x even when x is zero. 1440 // 1441 // See also: https://play.golang.org/p/RtH3UCt5IH 1442 1443 // Add sets z to the rounded sum x+y and returns z. If z's precision is 0, 1444 // it is changed to the larger of x's or y's precision before the operation. 1445 // Rounding is performed according to z's precision and rounding mode; and 1446 // z's accuracy reports the result error relative to the exact (not rounded) 1447 // result. Add panics with [ErrNaN] if x and y are infinities with opposite 1448 // signs. The value of z is undefined in that case. 1449 func (z *Float) Add(x, y *Float) *Float { 1450 if debugFloat { 1451 x.validate() 1452 y.validate() 1453 } 1454 1455 if z.prec == 0 { 1456 z.prec = umax32(x.prec, y.prec) 1457 } 1458 1459 if x.form == finite && y.form == finite { 1460 // x + y (common case) 1461 1462 // Below we set z.neg = x.neg, and when z aliases y this will 1463 // change the y operand's sign. This is fine, because if an 1464 // operand aliases the receiver it'll be overwritten, but we still 1465 // want the original x.neg and y.neg values when we evaluate 1466 // x.neg != y.neg, so we need to save y.neg before setting z.neg. 1467 yneg := y.neg 1468 1469 z.neg = x.neg 1470 if x.neg == yneg { 1471 // x + y == x + y 1472 // (-x) + (-y) == -(x + y) 1473 z.uadd(x, y) 1474 } else { 1475 // x + (-y) == x - y == -(y - x) 1476 // (-x) + y == y - x == -(x - y) 1477 if x.ucmp(y) > 0 { 1478 z.usub(x, y) 1479 } else { 1480 z.neg = !z.neg 1481 z.usub(y, x) 1482 } 1483 } 1484 if z.form == zero && z.mode == ToNegativeInf && z.acc == Exact { 1485 z.neg = true 1486 } 1487 return z 1488 } 1489 1490 if x.form == inf && y.form == inf && x.neg != y.neg { 1491 // +Inf + -Inf 1492 // -Inf + +Inf 1493 // value of z is undefined but make sure it's valid 1494 z.acc = Exact 1495 z.form = zero 1496 z.neg = false 1497 panic(ErrNaN{"addition of infinities with opposite signs"}) 1498 } 1499 1500 if x.form == zero && y.form == zero { 1501 // ±0 + ±0 1502 z.acc = Exact 1503 z.form = zero 1504 z.neg = x.neg && y.neg // -0 + -0 == -0 1505 return z 1506 } 1507 1508 if x.form == inf || y.form == zero { 1509 // ±Inf + y 1510 // x + ±0 1511 return z.Set(x) 1512 } 1513 1514 // ±0 + y 1515 // x + ±Inf 1516 return z.Set(y) 1517 } 1518 1519 // Sub sets z to the rounded difference x-y and returns z. 1520 // Precision, rounding, and accuracy reporting are as for [Float.Add]. 1521 // Sub panics with [ErrNaN] if x and y are infinities with equal 1522 // signs. The value of z is undefined in that case. 1523 func (z *Float) Sub(x, y *Float) *Float { 1524 if debugFloat { 1525 x.validate() 1526 y.validate() 1527 } 1528 1529 if z.prec == 0 { 1530 z.prec = umax32(x.prec, y.prec) 1531 } 1532 1533 if x.form == finite && y.form == finite { 1534 // x - y (common case) 1535 yneg := y.neg 1536 z.neg = x.neg 1537 if x.neg != yneg { 1538 // x - (-y) == x + y 1539 // (-x) - y == -(x + y) 1540 z.uadd(x, y) 1541 } else { 1542 // x - y == x - y == -(y - x) 1543 // (-x) - (-y) == y - x == -(x - y) 1544 if x.ucmp(y) > 0 { 1545 z.usub(x, y) 1546 } else { 1547 z.neg = !z.neg 1548 z.usub(y, x) 1549 } 1550 } 1551 if z.form == zero && z.mode == ToNegativeInf && z.acc == Exact { 1552 z.neg = true 1553 } 1554 return z 1555 } 1556 1557 if x.form == inf && y.form == inf && x.neg == y.neg { 1558 // +Inf - +Inf 1559 // -Inf - -Inf 1560 // value of z is undefined but make sure it's valid 1561 z.acc = Exact 1562 z.form = zero 1563 z.neg = false 1564 panic(ErrNaN{"subtraction of infinities with equal signs"}) 1565 } 1566 1567 if x.form == zero && y.form == zero { 1568 // ±0 - ±0 1569 z.acc = Exact 1570 z.form = zero 1571 z.neg = x.neg && !y.neg // -0 - +0 == -0 1572 return z 1573 } 1574 1575 if x.form == inf || y.form == zero { 1576 // ±Inf - y 1577 // x - ±0 1578 return z.Set(x) 1579 } 1580 1581 // ±0 - y 1582 // x - ±Inf 1583 return z.Neg(y) 1584 } 1585 1586 // Mul sets z to the rounded product x*y and returns z. 1587 // Precision, rounding, and accuracy reporting are as for [Float.Add]. 1588 // Mul panics with [ErrNaN] if one operand is zero and the other 1589 // operand an infinity. The value of z is undefined in that case. 1590 func (z *Float) Mul(x, y *Float) *Float { 1591 if debugFloat { 1592 x.validate() 1593 y.validate() 1594 } 1595 1596 if z.prec == 0 { 1597 z.prec = umax32(x.prec, y.prec) 1598 } 1599 1600 z.neg = x.neg != y.neg 1601 1602 if x.form == finite && y.form == finite { 1603 // x * y (common case) 1604 z.umul(x, y) 1605 return z 1606 } 1607 1608 z.acc = Exact 1609 if x.form == zero && y.form == inf || x.form == inf && y.form == zero { 1610 // ±0 * ±Inf 1611 // ±Inf * ±0 1612 // value of z is undefined but make sure it's valid 1613 z.form = zero 1614 z.neg = false 1615 panic(ErrNaN{"multiplication of zero with infinity"}) 1616 } 1617 1618 if x.form == inf || y.form == inf { 1619 // ±Inf * y 1620 // x * ±Inf 1621 z.form = inf 1622 return z 1623 } 1624 1625 // ±0 * y 1626 // x * ±0 1627 z.form = zero 1628 return z 1629 } 1630 1631 // Quo sets z to the rounded quotient x/y and returns z. 1632 // Precision, rounding, and accuracy reporting are as for [Float.Add]. 1633 // Quo panics with [ErrNaN] if both operands are zero or infinities. 1634 // The value of z is undefined in that case. 1635 func (z *Float) Quo(x, y *Float) *Float { 1636 if debugFloat { 1637 x.validate() 1638 y.validate() 1639 } 1640 1641 if z.prec == 0 { 1642 z.prec = umax32(x.prec, y.prec) 1643 } 1644 1645 z.neg = x.neg != y.neg 1646 1647 if x.form == finite && y.form == finite { 1648 // x / y (common case) 1649 z.uquo(x, y) 1650 return z 1651 } 1652 1653 z.acc = Exact 1654 if x.form == zero && y.form == zero || x.form == inf && y.form == inf { 1655 // ±0 / ±0 1656 // ±Inf / ±Inf 1657 // value of z is undefined but make sure it's valid 1658 z.form = zero 1659 z.neg = false 1660 panic(ErrNaN{"division of zero by zero or infinity by infinity"}) 1661 } 1662 1663 if x.form == zero || y.form == inf { 1664 // ±0 / y 1665 // x / ±Inf 1666 z.form = zero 1667 return z 1668 } 1669 1670 // x / ±0 1671 // ±Inf / y 1672 z.form = inf 1673 return z 1674 } 1675 1676 // Cmp compares x and y and returns: 1677 // 1678 // -1 if x < y 1679 // 0 if x == y (incl. -0 == 0, -Inf == -Inf, and +Inf == +Inf) 1680 // +1 if x > y 1681 func (x *Float) Cmp(y *Float) int { 1682 if debugFloat { 1683 x.validate() 1684 y.validate() 1685 } 1686 1687 mx := x.ord() 1688 my := y.ord() 1689 switch { 1690 case mx < my: 1691 return -1 1692 case mx > my: 1693 return +1 1694 } 1695 // mx == my 1696 1697 // only if |mx| == 1 we have to compare the mantissae 1698 switch mx { 1699 case -1: 1700 return y.ucmp(x) 1701 case +1: 1702 return x.ucmp(y) 1703 } 1704 1705 return 0 1706 } 1707 1708 // ord classifies x and returns: 1709 // 1710 // -2 if -Inf == x 1711 // -1 if -Inf < x < 0 1712 // 0 if x == 0 (signed or unsigned) 1713 // +1 if 0 < x < +Inf 1714 // +2 if x == +Inf 1715 func (x *Float) ord() int { 1716 var m int 1717 switch x.form { 1718 case finite: 1719 m = 1 1720 case zero: 1721 return 0 1722 case inf: 1723 m = 2 1724 } 1725 if x.neg { 1726 m = -m 1727 } 1728 return m 1729 } 1730 1731 func umax32(x, y uint32) uint32 { 1732 if x > y { 1733 return x 1734 } 1735 return y 1736 } 1737