...

Source file src/math/big/float.go

Documentation: math/big

     1  // Copyright 2014 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This file implements multi-precision floating-point numbers.
     6  // Like in the GNU MPFR library (https://www.mpfr.org/), operands
     7  // can be of mixed precision. Unlike MPFR, the rounding mode is
     8  // not specified with each operation, but with each operand. The
     9  // rounding mode of the result operand determines the rounding
    10  // mode of an operation. This is a from-scratch implementation.
    11  
    12  package big
    13  
    14  import (
    15  	"fmt"
    16  	"math"
    17  	"math/bits"
    18  )
    19  
    20  const debugFloat = false // enable for debugging
    21  
    22  // A nonzero finite Float represents a multi-precision floating point number
    23  //
    24  //	sign × mantissa × 2**exponent
    25  //
    26  // with 0.5 <= mantissa < 1.0, and MinExp <= exponent <= MaxExp.
    27  // A Float may also be zero (+0, -0) or infinite (+Inf, -Inf).
    28  // All Floats are ordered, and the ordering of two Floats x and y
    29  // is defined by x.Cmp(y).
    30  //
    31  // Each Float value also has a precision, rounding mode, and accuracy.
    32  // The precision is the maximum number of mantissa bits available to
    33  // represent the value. The rounding mode specifies how a result should
    34  // be rounded to fit into the mantissa bits, and accuracy describes the
    35  // rounding error with respect to the exact result.
    36  //
    37  // Unless specified otherwise, all operations (including setters) that
    38  // specify a *Float variable for the result (usually via the receiver
    39  // with the exception of [Float.MantExp]), round the numeric result according
    40  // to the precision and rounding mode of the result variable.
    41  //
    42  // If the provided result precision is 0 (see below), it is set to the
    43  // precision of the argument with the largest precision value before any
    44  // rounding takes place, and the rounding mode remains unchanged. Thus,
    45  // uninitialized Floats provided as result arguments will have their
    46  // precision set to a reasonable value determined by the operands, and
    47  // their mode is the zero value for RoundingMode (ToNearestEven).
    48  //
    49  // By setting the desired precision to 24 or 53 and using matching rounding
    50  // mode (typically [ToNearestEven]), Float operations produce the same results
    51  // as the corresponding float32 or float64 IEEE-754 arithmetic for operands
    52  // that correspond to normal (i.e., not denormal) float32 or float64 numbers.
    53  // Exponent underflow and overflow lead to a 0 or an Infinity for different
    54  // values than IEEE-754 because Float exponents have a much larger range.
    55  //
    56  // The zero (uninitialized) value for a Float is ready to use and represents
    57  // the number +0.0 exactly, with precision 0 and rounding mode [ToNearestEven].
    58  //
    59  // Operations always take pointer arguments (*Float) rather
    60  // than Float values, and each unique Float value requires
    61  // its own unique *Float pointer. To "copy" a Float value,
    62  // an existing (or newly allocated) Float must be set to
    63  // a new value using the [Float.Set] method; shallow copies
    64  // of Floats are not supported and may lead to errors.
    65  type Float struct {
    66  	prec uint32
    67  	mode RoundingMode
    68  	acc  Accuracy
    69  	form form
    70  	neg  bool
    71  	mant nat
    72  	exp  int32
    73  }
    74  
    75  // An ErrNaN panic is raised by a [Float] operation that would lead to
    76  // a NaN under IEEE-754 rules. An ErrNaN implements the error interface.
    77  type ErrNaN struct {
    78  	msg string
    79  }
    80  
    81  func (err ErrNaN) Error() string {
    82  	return err.msg
    83  }
    84  
    85  // NewFloat allocates and returns a new [Float] set to x,
    86  // with precision 53 and rounding mode [ToNearestEven].
    87  // NewFloat panics with [ErrNaN] if x is a NaN.
    88  func NewFloat(x float64) *Float {
    89  	if math.IsNaN(x) {
    90  		panic(ErrNaN{"NewFloat(NaN)"})
    91  	}
    92  	return new(Float).SetFloat64(x)
    93  }
    94  
    95  // Exponent and precision limits.
    96  const (
    97  	MaxExp  = math.MaxInt32  // largest supported exponent
    98  	MinExp  = math.MinInt32  // smallest supported exponent
    99  	MaxPrec = math.MaxUint32 // largest (theoretically) supported precision; likely memory-limited
   100  )
   101  
   102  // Internal representation: The mantissa bits x.mant of a nonzero finite
   103  // Float x are stored in a nat slice long enough to hold up to x.prec bits;
   104  // the slice may (but doesn't have to) be shorter if the mantissa contains
   105  // trailing 0 bits. x.mant is normalized if the msb of x.mant == 1 (i.e.,
   106  // the msb is shifted all the way "to the left"). Thus, if the mantissa has
   107  // trailing 0 bits or x.prec is not a multiple of the Word size _W,
   108  // x.mant[0] has trailing zero bits. The msb of the mantissa corresponds
   109  // to the value 0.5; the exponent x.exp shifts the binary point as needed.
   110  //
   111  // A zero or non-finite Float x ignores x.mant and x.exp.
   112  //
   113  // x                 form      neg      mant         exp
   114  // ----------------------------------------------------------
   115  // ±0                zero      sign     -            -
   116  // 0 < |x| < +Inf    finite    sign     mantissa     exponent
   117  // ±Inf              inf       sign     -            -
   118  
   119  // A form value describes the internal representation.
   120  type form byte
   121  
   122  // The form value order is relevant - do not change!
   123  const (
   124  	zero form = iota
   125  	finite
   126  	inf
   127  )
   128  
   129  // RoundingMode determines how a [Float] value is rounded to the
   130  // desired precision. Rounding may change the [Float] value; the
   131  // rounding error is described by the [Float]'s [Accuracy].
   132  type RoundingMode byte
   133  
   134  // These constants define supported rounding modes.
   135  const (
   136  	ToNearestEven RoundingMode = iota // == IEEE 754-2008 roundTiesToEven
   137  	ToNearestAway                     // == IEEE 754-2008 roundTiesToAway
   138  	ToZero                            // == IEEE 754-2008 roundTowardZero
   139  	AwayFromZero                      // no IEEE 754-2008 equivalent
   140  	ToNegativeInf                     // == IEEE 754-2008 roundTowardNegative
   141  	ToPositiveInf                     // == IEEE 754-2008 roundTowardPositive
   142  )
   143  
   144  //go:generate stringer -type=RoundingMode
   145  
   146  // Accuracy describes the rounding error produced by the most recent
   147  // operation that generated a [Float] value, relative to the exact value.
   148  type Accuracy int8
   149  
   150  // Constants describing the [Accuracy] of a [Float].
   151  const (
   152  	Below Accuracy = -1
   153  	Exact Accuracy = 0
   154  	Above Accuracy = +1
   155  )
   156  
   157  //go:generate stringer -type=Accuracy
   158  
   159  // SetPrec sets z's precision to prec and returns the (possibly) rounded
   160  // value of z. Rounding occurs according to z's rounding mode if the mantissa
   161  // cannot be represented in prec bits without loss of precision.
   162  // SetPrec(0) maps all finite values to ±0; infinite values remain unchanged.
   163  // If prec > [MaxPrec], it is set to [MaxPrec].
   164  func (z *Float) SetPrec(prec uint) *Float {
   165  	z.acc = Exact // optimistically assume no rounding is needed
   166  
   167  	// special case
   168  	if prec == 0 {
   169  		z.prec = 0
   170  		if z.form == finite {
   171  			// truncate z to 0
   172  			z.acc = makeAcc(z.neg)
   173  			z.form = zero
   174  		}
   175  		return z
   176  	}
   177  
   178  	// general case
   179  	if prec > MaxPrec {
   180  		prec = MaxPrec
   181  	}
   182  	old := z.prec
   183  	z.prec = uint32(prec)
   184  	if z.prec < old {
   185  		z.round(0)
   186  	}
   187  	return z
   188  }
   189  
   190  func makeAcc(above bool) Accuracy {
   191  	if above {
   192  		return Above
   193  	}
   194  	return Below
   195  }
   196  
   197  // SetMode sets z's rounding mode to mode and returns an exact z.
   198  // z remains unchanged otherwise.
   199  // z.SetMode(z.Mode()) is a cheap way to set z's accuracy to [Exact].
   200  func (z *Float) SetMode(mode RoundingMode) *Float {
   201  	z.mode = mode
   202  	z.acc = Exact
   203  	return z
   204  }
   205  
   206  // Prec returns the mantissa precision of x in bits.
   207  // The result may be 0 for |x| == 0 and |x| == Inf.
   208  func (x *Float) Prec() uint {
   209  	return uint(x.prec)
   210  }
   211  
   212  // MinPrec returns the minimum precision required to represent x exactly
   213  // (i.e., the smallest prec before x.SetPrec(prec) would start rounding x).
   214  // The result is 0 for |x| == 0 and |x| == Inf.
   215  func (x *Float) MinPrec() uint {
   216  	if x.form != finite {
   217  		return 0
   218  	}
   219  	return uint(len(x.mant))*_W - x.mant.trailingZeroBits()
   220  }
   221  
   222  // Mode returns the rounding mode of x.
   223  func (x *Float) Mode() RoundingMode {
   224  	return x.mode
   225  }
   226  
   227  // Acc returns the accuracy of x produced by the most recent
   228  // operation, unless explicitly documented otherwise by that
   229  // operation.
   230  func (x *Float) Acc() Accuracy {
   231  	return x.acc
   232  }
   233  
   234  // Sign returns:
   235  //
   236  //	-1 if x <   0
   237  //	 0 if x is ±0
   238  //	+1 if x >   0
   239  func (x *Float) Sign() int {
   240  	if debugFloat {
   241  		x.validate()
   242  	}
   243  	if x.form == zero {
   244  		return 0
   245  	}
   246  	if x.neg {
   247  		return -1
   248  	}
   249  	return 1
   250  }
   251  
   252  // MantExp breaks x into its mantissa and exponent components
   253  // and returns the exponent. If a non-nil mant argument is
   254  // provided its value is set to the mantissa of x, with the
   255  // same precision and rounding mode as x. The components
   256  // satisfy x == mant × 2**exp, with 0.5 <= |mant| < 1.0.
   257  // Calling MantExp with a nil argument is an efficient way to
   258  // get the exponent of the receiver.
   259  //
   260  // Special cases are:
   261  //
   262  //	(  ±0).MantExp(mant) = 0, with mant set to   ±0
   263  //	(±Inf).MantExp(mant) = 0, with mant set to ±Inf
   264  //
   265  // x and mant may be the same in which case x is set to its
   266  // mantissa value.
   267  func (x *Float) MantExp(mant *Float) (exp int) {
   268  	if debugFloat {
   269  		x.validate()
   270  	}
   271  	if x.form == finite {
   272  		exp = int(x.exp)
   273  	}
   274  	if mant != nil {
   275  		mant.Copy(x)
   276  		if mant.form == finite {
   277  			mant.exp = 0
   278  		}
   279  	}
   280  	return
   281  }
   282  
   283  func (z *Float) setExpAndRound(exp int64, sbit uint) {
   284  	if exp < MinExp {
   285  		// underflow
   286  		z.acc = makeAcc(z.neg)
   287  		z.form = zero
   288  		return
   289  	}
   290  
   291  	if exp > MaxExp {
   292  		// overflow
   293  		z.acc = makeAcc(!z.neg)
   294  		z.form = inf
   295  		return
   296  	}
   297  
   298  	z.form = finite
   299  	z.exp = int32(exp)
   300  	z.round(sbit)
   301  }
   302  
   303  // SetMantExp sets z to mant × 2**exp and returns z.
   304  // The result z has the same precision and rounding mode
   305  // as mant. SetMantExp is an inverse of [Float.MantExp] but does
   306  // not require 0.5 <= |mant| < 1.0. Specifically, for a
   307  // given x of type *[Float], SetMantExp relates to [Float.MantExp]
   308  // as follows:
   309  //
   310  //	mant := new(Float)
   311  //	new(Float).SetMantExp(mant, x.MantExp(mant)).Cmp(x) == 0
   312  //
   313  // Special cases are:
   314  //
   315  //	z.SetMantExp(  ±0, exp) =   ±0
   316  //	z.SetMantExp(±Inf, exp) = ±Inf
   317  //
   318  // z and mant may be the same in which case z's exponent
   319  // is set to exp.
   320  func (z *Float) SetMantExp(mant *Float, exp int) *Float {
   321  	if debugFloat {
   322  		z.validate()
   323  		mant.validate()
   324  	}
   325  	z.Copy(mant)
   326  
   327  	if z.form == finite {
   328  		// 0 < |mant| < +Inf
   329  		z.setExpAndRound(int64(z.exp)+int64(exp), 0)
   330  	}
   331  	return z
   332  }
   333  
   334  // Signbit reports whether x is negative or negative zero.
   335  func (x *Float) Signbit() bool {
   336  	return x.neg
   337  }
   338  
   339  // IsInf reports whether x is +Inf or -Inf.
   340  func (x *Float) IsInf() bool {
   341  	return x.form == inf
   342  }
   343  
   344  // IsInt reports whether x is an integer.
   345  // ±Inf values are not integers.
   346  func (x *Float) IsInt() bool {
   347  	if debugFloat {
   348  		x.validate()
   349  	}
   350  	// special cases
   351  	if x.form != finite {
   352  		return x.form == zero
   353  	}
   354  	// x.form == finite
   355  	if x.exp <= 0 {
   356  		return false
   357  	}
   358  	// x.exp > 0
   359  	return x.prec <= uint32(x.exp) || x.MinPrec() <= uint(x.exp) // not enough bits for fractional mantissa
   360  }
   361  
   362  // debugging support
   363  func (x *Float) validate() {
   364  	if !debugFloat {
   365  		// avoid performance bugs
   366  		panic("validate called but debugFloat is not set")
   367  	}
   368  	if msg := x.validate0(); msg != "" {
   369  		panic(msg)
   370  	}
   371  }
   372  
   373  func (x *Float) validate0() string {
   374  	if x.form != finite {
   375  		return ""
   376  	}
   377  	m := len(x.mant)
   378  	if m == 0 {
   379  		return "nonzero finite number with empty mantissa"
   380  	}
   381  	const msb = 1 << (_W - 1)
   382  	if x.mant[m-1]&msb == 0 {
   383  		return fmt.Sprintf("msb not set in last word %#x of %s", x.mant[m-1], x.Text('p', 0))
   384  	}
   385  	if x.prec == 0 {
   386  		return "zero precision finite number"
   387  	}
   388  	return ""
   389  }
   390  
   391  // round rounds z according to z.mode to z.prec bits and sets z.acc accordingly.
   392  // sbit must be 0 or 1 and summarizes any "sticky bit" information one might
   393  // have before calling round. z's mantissa must be normalized (with the msb set)
   394  // or empty.
   395  //
   396  // CAUTION: The rounding modes ToNegativeInf, ToPositiveInf are affected by the
   397  // sign of z. For correct rounding, the sign of z must be set correctly before
   398  // calling round.
   399  func (z *Float) round(sbit uint) {
   400  	if debugFloat {
   401  		z.validate()
   402  	}
   403  
   404  	z.acc = Exact
   405  	if z.form != finite {
   406  		// ±0 or ±Inf => nothing left to do
   407  		return
   408  	}
   409  	// z.form == finite && len(z.mant) > 0
   410  	// m > 0 implies z.prec > 0 (checked by validate)
   411  
   412  	m := uint32(len(z.mant)) // present mantissa length in words
   413  	bits := m * _W           // present mantissa bits; bits > 0
   414  	if bits <= z.prec {
   415  		// mantissa fits => nothing to do
   416  		return
   417  	}
   418  	// bits > z.prec
   419  
   420  	// Rounding is based on two bits: the rounding bit (rbit) and the
   421  	// sticky bit (sbit). The rbit is the bit immediately before the
   422  	// z.prec leading mantissa bits (the "0.5"). The sbit is set if any
   423  	// of the bits before the rbit are set (the "0.25", "0.125", etc.):
   424  	//
   425  	//   rbit  sbit  => "fractional part"
   426  	//
   427  	//   0     0        == 0
   428  	//   0     1        >  0  , < 0.5
   429  	//   1     0        == 0.5
   430  	//   1     1        >  0.5, < 1.0
   431  
   432  	// bits > z.prec: mantissa too large => round
   433  	r := uint(bits - z.prec - 1) // rounding bit position; r >= 0
   434  	rbit := z.mant.bit(r) & 1    // rounding bit; be safe and ensure it's a single bit
   435  	// The sticky bit is only needed for rounding ToNearestEven
   436  	// or when the rounding bit is zero. Avoid computation otherwise.
   437  	if sbit == 0 && (rbit == 0 || z.mode == ToNearestEven) {
   438  		sbit = z.mant.sticky(r)
   439  	}
   440  	sbit &= 1 // be safe and ensure it's a single bit
   441  
   442  	// cut off extra words
   443  	n := (z.prec + (_W - 1)) / _W // mantissa length in words for desired precision
   444  	if m > n {
   445  		copy(z.mant, z.mant[m-n:]) // move n last words to front
   446  		z.mant = z.mant[:n]
   447  	}
   448  
   449  	// determine number of trailing zero bits (ntz) and compute lsb mask of mantissa's least-significant word
   450  	ntz := n*_W - z.prec // 0 <= ntz < _W
   451  	lsb := Word(1) << ntz
   452  
   453  	// round if result is inexact
   454  	if rbit|sbit != 0 {
   455  		// Make rounding decision: The result mantissa is truncated ("rounded down")
   456  		// by default. Decide if we need to increment, or "round up", the (unsigned)
   457  		// mantissa.
   458  		inc := false
   459  		switch z.mode {
   460  		case ToNegativeInf:
   461  			inc = z.neg
   462  		case ToZero:
   463  			// nothing to do
   464  		case ToNearestEven:
   465  			inc = rbit != 0 && (sbit != 0 || z.mant[0]&lsb != 0)
   466  		case ToNearestAway:
   467  			inc = rbit != 0
   468  		case AwayFromZero:
   469  			inc = true
   470  		case ToPositiveInf:
   471  			inc = !z.neg
   472  		default:
   473  			panic("unreachable")
   474  		}
   475  
   476  		// A positive result (!z.neg) is Above the exact result if we increment,
   477  		// and it's Below if we truncate (Exact results require no rounding).
   478  		// For a negative result (z.neg) it is exactly the opposite.
   479  		z.acc = makeAcc(inc != z.neg)
   480  
   481  		if inc {
   482  			// add 1 to mantissa
   483  			if addVW(z.mant, z.mant, lsb) != 0 {
   484  				// mantissa overflow => adjust exponent
   485  				if z.exp >= MaxExp {
   486  					// exponent overflow
   487  					z.form = inf
   488  					return
   489  				}
   490  				z.exp++
   491  				// adjust mantissa: divide by 2 to compensate for exponent adjustment
   492  				shrVU(z.mant, z.mant, 1)
   493  				// set msb == carry == 1 from the mantissa overflow above
   494  				const msb = 1 << (_W - 1)
   495  				z.mant[n-1] |= msb
   496  			}
   497  		}
   498  	}
   499  
   500  	// zero out trailing bits in least-significant word
   501  	z.mant[0] &^= lsb - 1
   502  
   503  	if debugFloat {
   504  		z.validate()
   505  	}
   506  }
   507  
   508  func (z *Float) setBits64(neg bool, x uint64) *Float {
   509  	if z.prec == 0 {
   510  		z.prec = 64
   511  	}
   512  	z.acc = Exact
   513  	z.neg = neg
   514  	if x == 0 {
   515  		z.form = zero
   516  		return z
   517  	}
   518  	// x != 0
   519  	z.form = finite
   520  	s := bits.LeadingZeros64(x)
   521  	z.mant = z.mant.setUint64(x << uint(s))
   522  	z.exp = int32(64 - s) // always fits
   523  	if z.prec < 64 {
   524  		z.round(0)
   525  	}
   526  	return z
   527  }
   528  
   529  // SetUint64 sets z to the (possibly rounded) value of x and returns z.
   530  // If z's precision is 0, it is changed to 64 (and rounding will have
   531  // no effect).
   532  func (z *Float) SetUint64(x uint64) *Float {
   533  	return z.setBits64(false, x)
   534  }
   535  
   536  // SetInt64 sets z to the (possibly rounded) value of x and returns z.
   537  // If z's precision is 0, it is changed to 64 (and rounding will have
   538  // no effect).
   539  func (z *Float) SetInt64(x int64) *Float {
   540  	u := x
   541  	if u < 0 {
   542  		u = -u
   543  	}
   544  	// We cannot simply call z.SetUint64(uint64(u)) and change
   545  	// the sign afterwards because the sign affects rounding.
   546  	return z.setBits64(x < 0, uint64(u))
   547  }
   548  
   549  // SetFloat64 sets z to the (possibly rounded) value of x and returns z.
   550  // If z's precision is 0, it is changed to 53 (and rounding will have
   551  // no effect). SetFloat64 panics with [ErrNaN] if x is a NaN.
   552  func (z *Float) SetFloat64(x float64) *Float {
   553  	if z.prec == 0 {
   554  		z.prec = 53
   555  	}
   556  	if math.IsNaN(x) {
   557  		panic(ErrNaN{"Float.SetFloat64(NaN)"})
   558  	}
   559  	z.acc = Exact
   560  	z.neg = math.Signbit(x) // handle -0, -Inf correctly
   561  	if x == 0 {
   562  		z.form = zero
   563  		return z
   564  	}
   565  	if math.IsInf(x, 0) {
   566  		z.form = inf
   567  		return z
   568  	}
   569  	// normalized x != 0
   570  	z.form = finite
   571  	fmant, exp := math.Frexp(x) // get normalized mantissa
   572  	z.mant = z.mant.setUint64(1<<63 | math.Float64bits(fmant)<<11)
   573  	z.exp = int32(exp) // always fits
   574  	if z.prec < 53 {
   575  		z.round(0)
   576  	}
   577  	return z
   578  }
   579  
   580  // fnorm normalizes mantissa m by shifting it to the left
   581  // such that the msb of the most-significant word (msw) is 1.
   582  // It returns the shift amount. It assumes that len(m) != 0.
   583  func fnorm(m nat) int64 {
   584  	if debugFloat && (len(m) == 0 || m[len(m)-1] == 0) {
   585  		panic("msw of mantissa is 0")
   586  	}
   587  	s := nlz(m[len(m)-1])
   588  	if s > 0 {
   589  		c := shlVU(m, m, s)
   590  		if debugFloat && c != 0 {
   591  			panic("nlz or shlVU incorrect")
   592  		}
   593  	}
   594  	return int64(s)
   595  }
   596  
   597  // SetInt sets z to the (possibly rounded) value of x and returns z.
   598  // If z's precision is 0, it is changed to the larger of x.BitLen()
   599  // or 64 (and rounding will have no effect).
   600  func (z *Float) SetInt(x *Int) *Float {
   601  	// TODO(gri) can be more efficient if z.prec > 0
   602  	// but small compared to the size of x, or if there
   603  	// are many trailing 0's.
   604  	bits := uint32(x.BitLen())
   605  	if z.prec == 0 {
   606  		z.prec = umax32(bits, 64)
   607  	}
   608  	z.acc = Exact
   609  	z.neg = x.neg
   610  	if len(x.abs) == 0 {
   611  		z.form = zero
   612  		return z
   613  	}
   614  	// x != 0
   615  	z.mant = z.mant.set(x.abs)
   616  	fnorm(z.mant)
   617  	z.setExpAndRound(int64(bits), 0)
   618  	return z
   619  }
   620  
   621  // SetRat sets z to the (possibly rounded) value of x and returns z.
   622  // If z's precision is 0, it is changed to the largest of a.BitLen(),
   623  // b.BitLen(), or 64; with x = a/b.
   624  func (z *Float) SetRat(x *Rat) *Float {
   625  	if x.IsInt() {
   626  		return z.SetInt(x.Num())
   627  	}
   628  	var a, b Float
   629  	a.SetInt(x.Num())
   630  	b.SetInt(x.Denom())
   631  	if z.prec == 0 {
   632  		z.prec = umax32(a.prec, b.prec)
   633  	}
   634  	return z.Quo(&a, &b)
   635  }
   636  
   637  // SetInf sets z to the infinite Float -Inf if signbit is
   638  // set, or +Inf if signbit is not set, and returns z. The
   639  // precision of z is unchanged and the result is always
   640  // [Exact].
   641  func (z *Float) SetInf(signbit bool) *Float {
   642  	z.acc = Exact
   643  	z.form = inf
   644  	z.neg = signbit
   645  	return z
   646  }
   647  
   648  // Set sets z to the (possibly rounded) value of x and returns z.
   649  // If z's precision is 0, it is changed to the precision of x
   650  // before setting z (and rounding will have no effect).
   651  // Rounding is performed according to z's precision and rounding
   652  // mode; and z's accuracy reports the result error relative to the
   653  // exact (not rounded) result.
   654  func (z *Float) Set(x *Float) *Float {
   655  	if debugFloat {
   656  		x.validate()
   657  	}
   658  	z.acc = Exact
   659  	if z != x {
   660  		z.form = x.form
   661  		z.neg = x.neg
   662  		if x.form == finite {
   663  			z.exp = x.exp
   664  			z.mant = z.mant.set(x.mant)
   665  		}
   666  		if z.prec == 0 {
   667  			z.prec = x.prec
   668  		} else if z.prec < x.prec {
   669  			z.round(0)
   670  		}
   671  	}
   672  	return z
   673  }
   674  
   675  // Copy sets z to x, with the same precision, rounding mode, and
   676  // accuracy as x, and returns z. x is not changed even if z and
   677  // x are the same.
   678  func (z *Float) Copy(x *Float) *Float {
   679  	if debugFloat {
   680  		x.validate()
   681  	}
   682  	if z != x {
   683  		z.prec = x.prec
   684  		z.mode = x.mode
   685  		z.acc = x.acc
   686  		z.form = x.form
   687  		z.neg = x.neg
   688  		if z.form == finite {
   689  			z.mant = z.mant.set(x.mant)
   690  			z.exp = x.exp
   691  		}
   692  	}
   693  	return z
   694  }
   695  
   696  // msb32 returns the 32 most significant bits of x.
   697  func msb32(x nat) uint32 {
   698  	i := len(x) - 1
   699  	if i < 0 {
   700  		return 0
   701  	}
   702  	if debugFloat && x[i]&(1<<(_W-1)) == 0 {
   703  		panic("x not normalized")
   704  	}
   705  	switch _W {
   706  	case 32:
   707  		return uint32(x[i])
   708  	case 64:
   709  		return uint32(x[i] >> 32)
   710  	}
   711  	panic("unreachable")
   712  }
   713  
   714  // msb64 returns the 64 most significant bits of x.
   715  func msb64(x nat) uint64 {
   716  	i := len(x) - 1
   717  	if i < 0 {
   718  		return 0
   719  	}
   720  	if debugFloat && x[i]&(1<<(_W-1)) == 0 {
   721  		panic("x not normalized")
   722  	}
   723  	switch _W {
   724  	case 32:
   725  		v := uint64(x[i]) << 32
   726  		if i > 0 {
   727  			v |= uint64(x[i-1])
   728  		}
   729  		return v
   730  	case 64:
   731  		return uint64(x[i])
   732  	}
   733  	panic("unreachable")
   734  }
   735  
   736  // Uint64 returns the unsigned integer resulting from truncating x
   737  // towards zero. If 0 <= x <= math.MaxUint64, the result is [Exact]
   738  // if x is an integer and [Below] otherwise.
   739  // The result is (0, [Above]) for x < 0, and ([math.MaxUint64], [Below])
   740  // for x > [math.MaxUint64].
   741  func (x *Float) Uint64() (uint64, Accuracy) {
   742  	if debugFloat {
   743  		x.validate()
   744  	}
   745  
   746  	switch x.form {
   747  	case finite:
   748  		if x.neg {
   749  			return 0, Above
   750  		}
   751  		// 0 < x < +Inf
   752  		if x.exp <= 0 {
   753  			// 0 < x < 1
   754  			return 0, Below
   755  		}
   756  		// 1 <= x < Inf
   757  		if x.exp <= 64 {
   758  			// u = trunc(x) fits into a uint64
   759  			u := msb64(x.mant) >> (64 - uint32(x.exp))
   760  			if x.MinPrec() <= 64 {
   761  				return u, Exact
   762  			}
   763  			return u, Below // x truncated
   764  		}
   765  		// x too large
   766  		return math.MaxUint64, Below
   767  
   768  	case zero:
   769  		return 0, Exact
   770  
   771  	case inf:
   772  		if x.neg {
   773  			return 0, Above
   774  		}
   775  		return math.MaxUint64, Below
   776  	}
   777  
   778  	panic("unreachable")
   779  }
   780  
   781  // Int64 returns the integer resulting from truncating x towards zero.
   782  // If [math.MinInt64] <= x <= [math.MaxInt64], the result is [Exact] if x is
   783  // an integer, and [Above] (x < 0) or [Below] (x > 0) otherwise.
   784  // The result is ([math.MinInt64], [Above]) for x < [math.MinInt64],
   785  // and ([math.MaxInt64], [Below]) for x > [math.MaxInt64].
   786  func (x *Float) Int64() (int64, Accuracy) {
   787  	if debugFloat {
   788  		x.validate()
   789  	}
   790  
   791  	switch x.form {
   792  	case finite:
   793  		// 0 < |x| < +Inf
   794  		acc := makeAcc(x.neg)
   795  		if x.exp <= 0 {
   796  			// 0 < |x| < 1
   797  			return 0, acc
   798  		}
   799  		// x.exp > 0
   800  
   801  		// 1 <= |x| < +Inf
   802  		if x.exp <= 63 {
   803  			// i = trunc(x) fits into an int64 (excluding math.MinInt64)
   804  			i := int64(msb64(x.mant) >> (64 - uint32(x.exp)))
   805  			if x.neg {
   806  				i = -i
   807  			}
   808  			if x.MinPrec() <= uint(x.exp) {
   809  				return i, Exact
   810  			}
   811  			return i, acc // x truncated
   812  		}
   813  		if x.neg {
   814  			// check for special case x == math.MinInt64 (i.e., x == -(0.5 << 64))
   815  			if x.exp == 64 && x.MinPrec() == 1 {
   816  				acc = Exact
   817  			}
   818  			return math.MinInt64, acc
   819  		}
   820  		// x too large
   821  		return math.MaxInt64, Below
   822  
   823  	case zero:
   824  		return 0, Exact
   825  
   826  	case inf:
   827  		if x.neg {
   828  			return math.MinInt64, Above
   829  		}
   830  		return math.MaxInt64, Below
   831  	}
   832  
   833  	panic("unreachable")
   834  }
   835  
   836  // Float32 returns the float32 value nearest to x. If x is too small to be
   837  // represented by a float32 (|x| < [math.SmallestNonzeroFloat32]), the result
   838  // is (0, [Below]) or (-0, [Above]), respectively, depending on the sign of x.
   839  // If x is too large to be represented by a float32 (|x| > [math.MaxFloat32]),
   840  // the result is (+Inf, [Above]) or (-Inf, [Below]), depending on the sign of x.
   841  func (x *Float) Float32() (float32, Accuracy) {
   842  	if debugFloat {
   843  		x.validate()
   844  	}
   845  
   846  	switch x.form {
   847  	case finite:
   848  		// 0 < |x| < +Inf
   849  
   850  		const (
   851  			fbits = 32                //        float size
   852  			mbits = 23                //        mantissa size (excluding implicit msb)
   853  			ebits = fbits - mbits - 1 //     8  exponent size
   854  			bias  = 1<<(ebits-1) - 1  //   127  exponent bias
   855  			dmin  = 1 - bias - mbits  //  -149  smallest unbiased exponent (denormal)
   856  			emin  = 1 - bias          //  -126  smallest unbiased exponent (normal)
   857  			emax  = bias              //   127  largest unbiased exponent (normal)
   858  		)
   859  
   860  		// Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float32 mantissa.
   861  		e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0
   862  
   863  		// Compute precision p for float32 mantissa.
   864  		// If the exponent is too small, we have a denormal number before
   865  		// rounding and fewer than p mantissa bits of precision available
   866  		// (the exponent remains fixed but the mantissa gets shifted right).
   867  		p := mbits + 1 // precision of normal float
   868  		if e < emin {
   869  			// recompute precision
   870  			p = mbits + 1 - emin + int(e)
   871  			// If p == 0, the mantissa of x is shifted so much to the right
   872  			// that its msb falls immediately to the right of the float32
   873  			// mantissa space. In other words, if the smallest denormal is
   874  			// considered "1.0", for p == 0, the mantissa value m is >= 0.5.
   875  			// If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal.
   876  			// If m == 0.5, it is rounded down to even, i.e., 0.0.
   877  			// If p < 0, the mantissa value m is <= "0.25" which is never rounded up.
   878  			if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ {
   879  				// underflow to ±0
   880  				if x.neg {
   881  					var z float32
   882  					return -z, Above
   883  				}
   884  				return 0.0, Below
   885  			}
   886  			// otherwise, round up
   887  			// We handle p == 0 explicitly because it's easy and because
   888  			// Float.round doesn't support rounding to 0 bits of precision.
   889  			if p == 0 {
   890  				if x.neg {
   891  					return -math.SmallestNonzeroFloat32, Below
   892  				}
   893  				return math.SmallestNonzeroFloat32, Above
   894  			}
   895  		}
   896  		// p > 0
   897  
   898  		// round
   899  		var r Float
   900  		r.prec = uint32(p)
   901  		r.Set(x)
   902  		e = r.exp - 1
   903  
   904  		// Rounding may have caused r to overflow to ±Inf
   905  		// (rounding never causes underflows to 0).
   906  		// If the exponent is too large, also overflow to ±Inf.
   907  		if r.form == inf || e > emax {
   908  			// overflow
   909  			if x.neg {
   910  				return float32(math.Inf(-1)), Below
   911  			}
   912  			return float32(math.Inf(+1)), Above
   913  		}
   914  		// e <= emax
   915  
   916  		// Determine sign, biased exponent, and mantissa.
   917  		var sign, bexp, mant uint32
   918  		if x.neg {
   919  			sign = 1 << (fbits - 1)
   920  		}
   921  
   922  		// Rounding may have caused a denormal number to
   923  		// become normal. Check again.
   924  		if e < emin {
   925  			// denormal number: recompute precision
   926  			// Since rounding may have at best increased precision
   927  			// and we have eliminated p <= 0 early, we know p > 0.
   928  			// bexp == 0 for denormals
   929  			p = mbits + 1 - emin + int(e)
   930  			mant = msb32(r.mant) >> uint(fbits-p)
   931  		} else {
   932  			// normal number: emin <= e <= emax
   933  			bexp = uint32(e+bias) << mbits
   934  			mant = msb32(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
   935  		}
   936  
   937  		return math.Float32frombits(sign | bexp | mant), r.acc
   938  
   939  	case zero:
   940  		if x.neg {
   941  			var z float32
   942  			return -z, Exact
   943  		}
   944  		return 0.0, Exact
   945  
   946  	case inf:
   947  		if x.neg {
   948  			return float32(math.Inf(-1)), Exact
   949  		}
   950  		return float32(math.Inf(+1)), Exact
   951  	}
   952  
   953  	panic("unreachable")
   954  }
   955  
   956  // Float64 returns the float64 value nearest to x. If x is too small to be
   957  // represented by a float64 (|x| < [math.SmallestNonzeroFloat64]), the result
   958  // is (0, [Below]) or (-0, [Above]), respectively, depending on the sign of x.
   959  // If x is too large to be represented by a float64 (|x| > [math.MaxFloat64]),
   960  // the result is (+Inf, [Above]) or (-Inf, [Below]), depending on the sign of x.
   961  func (x *Float) Float64() (float64, Accuracy) {
   962  	if debugFloat {
   963  		x.validate()
   964  	}
   965  
   966  	switch x.form {
   967  	case finite:
   968  		// 0 < |x| < +Inf
   969  
   970  		const (
   971  			fbits = 64                //        float size
   972  			mbits = 52                //        mantissa size (excluding implicit msb)
   973  			ebits = fbits - mbits - 1 //    11  exponent size
   974  			bias  = 1<<(ebits-1) - 1  //  1023  exponent bias
   975  			dmin  = 1 - bias - mbits  // -1074  smallest unbiased exponent (denormal)
   976  			emin  = 1 - bias          // -1022  smallest unbiased exponent (normal)
   977  			emax  = bias              //  1023  largest unbiased exponent (normal)
   978  		)
   979  
   980  		// Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float64 mantissa.
   981  		e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0
   982  
   983  		// Compute precision p for float64 mantissa.
   984  		// If the exponent is too small, we have a denormal number before
   985  		// rounding and fewer than p mantissa bits of precision available
   986  		// (the exponent remains fixed but the mantissa gets shifted right).
   987  		p := mbits + 1 // precision of normal float
   988  		if e < emin {
   989  			// recompute precision
   990  			p = mbits + 1 - emin + int(e)
   991  			// If p == 0, the mantissa of x is shifted so much to the right
   992  			// that its msb falls immediately to the right of the float64
   993  			// mantissa space. In other words, if the smallest denormal is
   994  			// considered "1.0", for p == 0, the mantissa value m is >= 0.5.
   995  			// If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal.
   996  			// If m == 0.5, it is rounded down to even, i.e., 0.0.
   997  			// If p < 0, the mantissa value m is <= "0.25" which is never rounded up.
   998  			if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ {
   999  				// underflow to ±0
  1000  				if x.neg {
  1001  					var z float64
  1002  					return -z, Above
  1003  				}
  1004  				return 0.0, Below
  1005  			}
  1006  			// otherwise, round up
  1007  			// We handle p == 0 explicitly because it's easy and because
  1008  			// Float.round doesn't support rounding to 0 bits of precision.
  1009  			if p == 0 {
  1010  				if x.neg {
  1011  					return -math.SmallestNonzeroFloat64, Below
  1012  				}
  1013  				return math.SmallestNonzeroFloat64, Above
  1014  			}
  1015  		}
  1016  		// p > 0
  1017  
  1018  		// round
  1019  		var r Float
  1020  		r.prec = uint32(p)
  1021  		r.Set(x)
  1022  		e = r.exp - 1
  1023  
  1024  		// Rounding may have caused r to overflow to ±Inf
  1025  		// (rounding never causes underflows to 0).
  1026  		// If the exponent is too large, also overflow to ±Inf.
  1027  		if r.form == inf || e > emax {
  1028  			// overflow
  1029  			if x.neg {
  1030  				return math.Inf(-1), Below
  1031  			}
  1032  			return math.Inf(+1), Above
  1033  		}
  1034  		// e <= emax
  1035  
  1036  		// Determine sign, biased exponent, and mantissa.
  1037  		var sign, bexp, mant uint64
  1038  		if x.neg {
  1039  			sign = 1 << (fbits - 1)
  1040  		}
  1041  
  1042  		// Rounding may have caused a denormal number to
  1043  		// become normal. Check again.
  1044  		if e < emin {
  1045  			// denormal number: recompute precision
  1046  			// Since rounding may have at best increased precision
  1047  			// and we have eliminated p <= 0 early, we know p > 0.
  1048  			// bexp == 0 for denormals
  1049  			p = mbits + 1 - emin + int(e)
  1050  			mant = msb64(r.mant) >> uint(fbits-p)
  1051  		} else {
  1052  			// normal number: emin <= e <= emax
  1053  			bexp = uint64(e+bias) << mbits
  1054  			mant = msb64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
  1055  		}
  1056  
  1057  		return math.Float64frombits(sign | bexp | mant), r.acc
  1058  
  1059  	case zero:
  1060  		if x.neg {
  1061  			var z float64
  1062  			return -z, Exact
  1063  		}
  1064  		return 0.0, Exact
  1065  
  1066  	case inf:
  1067  		if x.neg {
  1068  			return math.Inf(-1), Exact
  1069  		}
  1070  		return math.Inf(+1), Exact
  1071  	}
  1072  
  1073  	panic("unreachable")
  1074  }
  1075  
  1076  // Int returns the result of truncating x towards zero;
  1077  // or nil if x is an infinity.
  1078  // The result is [Exact] if x.IsInt(); otherwise it is [Below]
  1079  // for x > 0, and [Above] for x < 0.
  1080  // If a non-nil *[Int] argument z is provided, [Int] stores
  1081  // the result in z instead of allocating a new [Int].
  1082  func (x *Float) Int(z *Int) (*Int, Accuracy) {
  1083  	if debugFloat {
  1084  		x.validate()
  1085  	}
  1086  
  1087  	if z == nil && x.form <= finite {
  1088  		z = new(Int)
  1089  	}
  1090  
  1091  	switch x.form {
  1092  	case finite:
  1093  		// 0 < |x| < +Inf
  1094  		acc := makeAcc(x.neg)
  1095  		if x.exp <= 0 {
  1096  			// 0 < |x| < 1
  1097  			return z.SetInt64(0), acc
  1098  		}
  1099  		// x.exp > 0
  1100  
  1101  		// 1 <= |x| < +Inf
  1102  		// determine minimum required precision for x
  1103  		allBits := uint(len(x.mant)) * _W
  1104  		exp := uint(x.exp)
  1105  		if x.MinPrec() <= exp {
  1106  			acc = Exact
  1107  		}
  1108  		// shift mantissa as needed
  1109  		if z == nil {
  1110  			z = new(Int)
  1111  		}
  1112  		z.neg = x.neg
  1113  		switch {
  1114  		case exp > allBits:
  1115  			z.abs = z.abs.shl(x.mant, exp-allBits)
  1116  		default:
  1117  			z.abs = z.abs.set(x.mant)
  1118  		case exp < allBits:
  1119  			z.abs = z.abs.shr(x.mant, allBits-exp)
  1120  		}
  1121  		return z, acc
  1122  
  1123  	case zero:
  1124  		return z.SetInt64(0), Exact
  1125  
  1126  	case inf:
  1127  		return nil, makeAcc(x.neg)
  1128  	}
  1129  
  1130  	panic("unreachable")
  1131  }
  1132  
  1133  // Rat returns the rational number corresponding to x;
  1134  // or nil if x is an infinity.
  1135  // The result is [Exact] if x is not an Inf.
  1136  // If a non-nil *[Rat] argument z is provided, [Rat] stores
  1137  // the result in z instead of allocating a new [Rat].
  1138  func (x *Float) Rat(z *Rat) (*Rat, Accuracy) {
  1139  	if debugFloat {
  1140  		x.validate()
  1141  	}
  1142  
  1143  	if z == nil && x.form <= finite {
  1144  		z = new(Rat)
  1145  	}
  1146  
  1147  	switch x.form {
  1148  	case finite:
  1149  		// 0 < |x| < +Inf
  1150  		allBits := int32(len(x.mant)) * _W
  1151  		// build up numerator and denominator
  1152  		z.a.neg = x.neg
  1153  		switch {
  1154  		case x.exp > allBits:
  1155  			z.a.abs = z.a.abs.shl(x.mant, uint(x.exp-allBits))
  1156  			z.b.abs = z.b.abs[:0] // == 1 (see Rat)
  1157  			// z already in normal form
  1158  		default:
  1159  			z.a.abs = z.a.abs.set(x.mant)
  1160  			z.b.abs = z.b.abs[:0] // == 1 (see Rat)
  1161  			// z already in normal form
  1162  		case x.exp < allBits:
  1163  			z.a.abs = z.a.abs.set(x.mant)
  1164  			t := z.b.abs.setUint64(1)
  1165  			z.b.abs = t.shl(t, uint(allBits-x.exp))
  1166  			z.norm()
  1167  		}
  1168  		return z, Exact
  1169  
  1170  	case zero:
  1171  		return z.SetInt64(0), Exact
  1172  
  1173  	case inf:
  1174  		return nil, makeAcc(x.neg)
  1175  	}
  1176  
  1177  	panic("unreachable")
  1178  }
  1179  
  1180  // Abs sets z to the (possibly rounded) value |x| (the absolute value of x)
  1181  // and returns z.
  1182  func (z *Float) Abs(x *Float) *Float {
  1183  	z.Set(x)
  1184  	z.neg = false
  1185  	return z
  1186  }
  1187  
  1188  // Neg sets z to the (possibly rounded) value of x with its sign negated,
  1189  // and returns z.
  1190  func (z *Float) Neg(x *Float) *Float {
  1191  	z.Set(x)
  1192  	z.neg = !z.neg
  1193  	return z
  1194  }
  1195  
  1196  func validateBinaryOperands(x, y *Float) {
  1197  	if !debugFloat {
  1198  		// avoid performance bugs
  1199  		panic("validateBinaryOperands called but debugFloat is not set")
  1200  	}
  1201  	if len(x.mant) == 0 {
  1202  		panic("empty mantissa for x")
  1203  	}
  1204  	if len(y.mant) == 0 {
  1205  		panic("empty mantissa for y")
  1206  	}
  1207  }
  1208  
  1209  // z = x + y, ignoring signs of x and y for the addition
  1210  // but using the sign of z for rounding the result.
  1211  // x and y must have a non-empty mantissa and valid exponent.
  1212  func (z *Float) uadd(x, y *Float) {
  1213  	// Note: This implementation requires 2 shifts most of the
  1214  	// time. It is also inefficient if exponents or precisions
  1215  	// differ by wide margins. The following article describes
  1216  	// an efficient (but much more complicated) implementation
  1217  	// compatible with the internal representation used here:
  1218  	//
  1219  	// Vincent Lefèvre: "The Generic Multiple-Precision Floating-
  1220  	// Point Addition With Exact Rounding (as in the MPFR Library)"
  1221  	// http://www.vinc17.net/research/papers/rnc6.pdf
  1222  
  1223  	if debugFloat {
  1224  		validateBinaryOperands(x, y)
  1225  	}
  1226  
  1227  	// compute exponents ex, ey for mantissa with "binary point"
  1228  	// on the right (mantissa.0) - use int64 to avoid overflow
  1229  	ex := int64(x.exp) - int64(len(x.mant))*_W
  1230  	ey := int64(y.exp) - int64(len(y.mant))*_W
  1231  
  1232  	al := alias(z.mant, x.mant) || alias(z.mant, y.mant)
  1233  
  1234  	// TODO(gri) having a combined add-and-shift primitive
  1235  	//           could make this code significantly faster
  1236  	switch {
  1237  	case ex < ey:
  1238  		if al {
  1239  			t := nat(nil).shl(y.mant, uint(ey-ex))
  1240  			z.mant = z.mant.add(x.mant, t)
  1241  		} else {
  1242  			z.mant = z.mant.shl(y.mant, uint(ey-ex))
  1243  			z.mant = z.mant.add(x.mant, z.mant)
  1244  		}
  1245  	default:
  1246  		// ex == ey, no shift needed
  1247  		z.mant = z.mant.add(x.mant, y.mant)
  1248  	case ex > ey:
  1249  		if al {
  1250  			t := nat(nil).shl(x.mant, uint(ex-ey))
  1251  			z.mant = z.mant.add(t, y.mant)
  1252  		} else {
  1253  			z.mant = z.mant.shl(x.mant, uint(ex-ey))
  1254  			z.mant = z.mant.add(z.mant, y.mant)
  1255  		}
  1256  		ex = ey
  1257  	}
  1258  	// len(z.mant) > 0
  1259  
  1260  	z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0)
  1261  }
  1262  
  1263  // z = x - y for |x| > |y|, ignoring signs of x and y for the subtraction
  1264  // but using the sign of z for rounding the result.
  1265  // x and y must have a non-empty mantissa and valid exponent.
  1266  func (z *Float) usub(x, y *Float) {
  1267  	// This code is symmetric to uadd.
  1268  	// We have not factored the common code out because
  1269  	// eventually uadd (and usub) should be optimized
  1270  	// by special-casing, and the code will diverge.
  1271  
  1272  	if debugFloat {
  1273  		validateBinaryOperands(x, y)
  1274  	}
  1275  
  1276  	ex := int64(x.exp) - int64(len(x.mant))*_W
  1277  	ey := int64(y.exp) - int64(len(y.mant))*_W
  1278  
  1279  	al := alias(z.mant, x.mant) || alias(z.mant, y.mant)
  1280  
  1281  	switch {
  1282  	case ex < ey:
  1283  		if al {
  1284  			t := nat(nil).shl(y.mant, uint(ey-ex))
  1285  			z.mant = t.sub(x.mant, t)
  1286  		} else {
  1287  			z.mant = z.mant.shl(y.mant, uint(ey-ex))
  1288  			z.mant = z.mant.sub(x.mant, z.mant)
  1289  		}
  1290  	default:
  1291  		// ex == ey, no shift needed
  1292  		z.mant = z.mant.sub(x.mant, y.mant)
  1293  	case ex > ey:
  1294  		if al {
  1295  			t := nat(nil).shl(x.mant, uint(ex-ey))
  1296  			z.mant = t.sub(t, y.mant)
  1297  		} else {
  1298  			z.mant = z.mant.shl(x.mant, uint(ex-ey))
  1299  			z.mant = z.mant.sub(z.mant, y.mant)
  1300  		}
  1301  		ex = ey
  1302  	}
  1303  
  1304  	// operands may have canceled each other out
  1305  	if len(z.mant) == 0 {
  1306  		z.acc = Exact
  1307  		z.form = zero
  1308  		z.neg = false
  1309  		return
  1310  	}
  1311  	// len(z.mant) > 0
  1312  
  1313  	z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0)
  1314  }
  1315  
  1316  // z = x * y, ignoring signs of x and y for the multiplication
  1317  // but using the sign of z for rounding the result.
  1318  // x and y must have a non-empty mantissa and valid exponent.
  1319  func (z *Float) umul(x, y *Float) {
  1320  	if debugFloat {
  1321  		validateBinaryOperands(x, y)
  1322  	}
  1323  
  1324  	// Note: This is doing too much work if the precision
  1325  	// of z is less than the sum of the precisions of x
  1326  	// and y which is often the case (e.g., if all floats
  1327  	// have the same precision).
  1328  	// TODO(gri) Optimize this for the common case.
  1329  
  1330  	e := int64(x.exp) + int64(y.exp)
  1331  	if x == y {
  1332  		z.mant = z.mant.sqr(x.mant)
  1333  	} else {
  1334  		z.mant = z.mant.mul(x.mant, y.mant)
  1335  	}
  1336  	z.setExpAndRound(e-fnorm(z.mant), 0)
  1337  }
  1338  
  1339  // z = x / y, ignoring signs of x and y for the division
  1340  // but using the sign of z for rounding the result.
  1341  // x and y must have a non-empty mantissa and valid exponent.
  1342  func (z *Float) uquo(x, y *Float) {
  1343  	if debugFloat {
  1344  		validateBinaryOperands(x, y)
  1345  	}
  1346  
  1347  	// mantissa length in words for desired result precision + 1
  1348  	// (at least one extra bit so we get the rounding bit after
  1349  	// the division)
  1350  	n := int(z.prec/_W) + 1
  1351  
  1352  	// compute adjusted x.mant such that we get enough result precision
  1353  	xadj := x.mant
  1354  	if d := n - len(x.mant) + len(y.mant); d > 0 {
  1355  		// d extra words needed => add d "0 digits" to x
  1356  		xadj = make(nat, len(x.mant)+d)
  1357  		copy(xadj[d:], x.mant)
  1358  	}
  1359  	// TODO(gri): If we have too many digits (d < 0), we should be able
  1360  	// to shorten x for faster division. But we must be extra careful
  1361  	// with rounding in that case.
  1362  
  1363  	// Compute d before division since there may be aliasing of x.mant
  1364  	// (via xadj) or y.mant with z.mant.
  1365  	d := len(xadj) - len(y.mant)
  1366  
  1367  	// divide
  1368  	var r nat
  1369  	z.mant, r = z.mant.div(nil, xadj, y.mant)
  1370  	e := int64(x.exp) - int64(y.exp) - int64(d-len(z.mant))*_W
  1371  
  1372  	// The result is long enough to include (at least) the rounding bit.
  1373  	// If there's a non-zero remainder, the corresponding fractional part
  1374  	// (if it were computed), would have a non-zero sticky bit (if it were
  1375  	// zero, it couldn't have a non-zero remainder).
  1376  	var sbit uint
  1377  	if len(r) > 0 {
  1378  		sbit = 1
  1379  	}
  1380  
  1381  	z.setExpAndRound(e-fnorm(z.mant), sbit)
  1382  }
  1383  
  1384  // ucmp returns -1, 0, or +1, depending on whether
  1385  // |x| < |y|, |x| == |y|, or |x| > |y|.
  1386  // x and y must have a non-empty mantissa and valid exponent.
  1387  func (x *Float) ucmp(y *Float) int {
  1388  	if debugFloat {
  1389  		validateBinaryOperands(x, y)
  1390  	}
  1391  
  1392  	switch {
  1393  	case x.exp < y.exp:
  1394  		return -1
  1395  	case x.exp > y.exp:
  1396  		return +1
  1397  	}
  1398  	// x.exp == y.exp
  1399  
  1400  	// compare mantissas
  1401  	i := len(x.mant)
  1402  	j := len(y.mant)
  1403  	for i > 0 || j > 0 {
  1404  		var xm, ym Word
  1405  		if i > 0 {
  1406  			i--
  1407  			xm = x.mant[i]
  1408  		}
  1409  		if j > 0 {
  1410  			j--
  1411  			ym = y.mant[j]
  1412  		}
  1413  		switch {
  1414  		case xm < ym:
  1415  			return -1
  1416  		case xm > ym:
  1417  			return +1
  1418  		}
  1419  	}
  1420  
  1421  	return 0
  1422  }
  1423  
  1424  // Handling of sign bit as defined by IEEE 754-2008, section 6.3:
  1425  //
  1426  // When neither the inputs nor result are NaN, the sign of a product or
  1427  // quotient is the exclusive OR of the operands’ signs; the sign of a sum,
  1428  // or of a difference x−y regarded as a sum x+(−y), differs from at most
  1429  // one of the addends’ signs; and the sign of the result of conversions,
  1430  // the quantize operation, the roundToIntegral operations, and the
  1431  // roundToIntegralExact (see 5.3.1) is the sign of the first or only operand.
  1432  // These rules shall apply even when operands or results are zero or infinite.
  1433  //
  1434  // When the sum of two operands with opposite signs (or the difference of
  1435  // two operands with like signs) is exactly zero, the sign of that sum (or
  1436  // difference) shall be +0 in all rounding-direction attributes except
  1437  // roundTowardNegative; under that attribute, the sign of an exact zero
  1438  // sum (or difference) shall be −0. However, x+x = x−(−x) retains the same
  1439  // sign as x even when x is zero.
  1440  //
  1441  // See also: https://play.golang.org/p/RtH3UCt5IH
  1442  
  1443  // Add sets z to the rounded sum x+y and returns z. If z's precision is 0,
  1444  // it is changed to the larger of x's or y's precision before the operation.
  1445  // Rounding is performed according to z's precision and rounding mode; and
  1446  // z's accuracy reports the result error relative to the exact (not rounded)
  1447  // result. Add panics with [ErrNaN] if x and y are infinities with opposite
  1448  // signs. The value of z is undefined in that case.
  1449  func (z *Float) Add(x, y *Float) *Float {
  1450  	if debugFloat {
  1451  		x.validate()
  1452  		y.validate()
  1453  	}
  1454  
  1455  	if z.prec == 0 {
  1456  		z.prec = umax32(x.prec, y.prec)
  1457  	}
  1458  
  1459  	if x.form == finite && y.form == finite {
  1460  		// x + y (common case)
  1461  
  1462  		// Below we set z.neg = x.neg, and when z aliases y this will
  1463  		// change the y operand's sign. This is fine, because if an
  1464  		// operand aliases the receiver it'll be overwritten, but we still
  1465  		// want the original x.neg and y.neg values when we evaluate
  1466  		// x.neg != y.neg, so we need to save y.neg before setting z.neg.
  1467  		yneg := y.neg
  1468  
  1469  		z.neg = x.neg
  1470  		if x.neg == yneg {
  1471  			// x + y == x + y
  1472  			// (-x) + (-y) == -(x + y)
  1473  			z.uadd(x, y)
  1474  		} else {
  1475  			// x + (-y) == x - y == -(y - x)
  1476  			// (-x) + y == y - x == -(x - y)
  1477  			if x.ucmp(y) > 0 {
  1478  				z.usub(x, y)
  1479  			} else {
  1480  				z.neg = !z.neg
  1481  				z.usub(y, x)
  1482  			}
  1483  		}
  1484  		if z.form == zero && z.mode == ToNegativeInf && z.acc == Exact {
  1485  			z.neg = true
  1486  		}
  1487  		return z
  1488  	}
  1489  
  1490  	if x.form == inf && y.form == inf && x.neg != y.neg {
  1491  		// +Inf + -Inf
  1492  		// -Inf + +Inf
  1493  		// value of z is undefined but make sure it's valid
  1494  		z.acc = Exact
  1495  		z.form = zero
  1496  		z.neg = false
  1497  		panic(ErrNaN{"addition of infinities with opposite signs"})
  1498  	}
  1499  
  1500  	if x.form == zero && y.form == zero {
  1501  		// ±0 + ±0
  1502  		z.acc = Exact
  1503  		z.form = zero
  1504  		z.neg = x.neg && y.neg // -0 + -0 == -0
  1505  		return z
  1506  	}
  1507  
  1508  	if x.form == inf || y.form == zero {
  1509  		// ±Inf + y
  1510  		// x + ±0
  1511  		return z.Set(x)
  1512  	}
  1513  
  1514  	// ±0 + y
  1515  	// x + ±Inf
  1516  	return z.Set(y)
  1517  }
  1518  
  1519  // Sub sets z to the rounded difference x-y and returns z.
  1520  // Precision, rounding, and accuracy reporting are as for [Float.Add].
  1521  // Sub panics with [ErrNaN] if x and y are infinities with equal
  1522  // signs. The value of z is undefined in that case.
  1523  func (z *Float) Sub(x, y *Float) *Float {
  1524  	if debugFloat {
  1525  		x.validate()
  1526  		y.validate()
  1527  	}
  1528  
  1529  	if z.prec == 0 {
  1530  		z.prec = umax32(x.prec, y.prec)
  1531  	}
  1532  
  1533  	if x.form == finite && y.form == finite {
  1534  		// x - y (common case)
  1535  		yneg := y.neg
  1536  		z.neg = x.neg
  1537  		if x.neg != yneg {
  1538  			// x - (-y) == x + y
  1539  			// (-x) - y == -(x + y)
  1540  			z.uadd(x, y)
  1541  		} else {
  1542  			// x - y == x - y == -(y - x)
  1543  			// (-x) - (-y) == y - x == -(x - y)
  1544  			if x.ucmp(y) > 0 {
  1545  				z.usub(x, y)
  1546  			} else {
  1547  				z.neg = !z.neg
  1548  				z.usub(y, x)
  1549  			}
  1550  		}
  1551  		if z.form == zero && z.mode == ToNegativeInf && z.acc == Exact {
  1552  			z.neg = true
  1553  		}
  1554  		return z
  1555  	}
  1556  
  1557  	if x.form == inf && y.form == inf && x.neg == y.neg {
  1558  		// +Inf - +Inf
  1559  		// -Inf - -Inf
  1560  		// value of z is undefined but make sure it's valid
  1561  		z.acc = Exact
  1562  		z.form = zero
  1563  		z.neg = false
  1564  		panic(ErrNaN{"subtraction of infinities with equal signs"})
  1565  	}
  1566  
  1567  	if x.form == zero && y.form == zero {
  1568  		// ±0 - ±0
  1569  		z.acc = Exact
  1570  		z.form = zero
  1571  		z.neg = x.neg && !y.neg // -0 - +0 == -0
  1572  		return z
  1573  	}
  1574  
  1575  	if x.form == inf || y.form == zero {
  1576  		// ±Inf - y
  1577  		// x - ±0
  1578  		return z.Set(x)
  1579  	}
  1580  
  1581  	// ±0 - y
  1582  	// x - ±Inf
  1583  	return z.Neg(y)
  1584  }
  1585  
  1586  // Mul sets z to the rounded product x*y and returns z.
  1587  // Precision, rounding, and accuracy reporting are as for [Float.Add].
  1588  // Mul panics with [ErrNaN] if one operand is zero and the other
  1589  // operand an infinity. The value of z is undefined in that case.
  1590  func (z *Float) Mul(x, y *Float) *Float {
  1591  	if debugFloat {
  1592  		x.validate()
  1593  		y.validate()
  1594  	}
  1595  
  1596  	if z.prec == 0 {
  1597  		z.prec = umax32(x.prec, y.prec)
  1598  	}
  1599  
  1600  	z.neg = x.neg != y.neg
  1601  
  1602  	if x.form == finite && y.form == finite {
  1603  		// x * y (common case)
  1604  		z.umul(x, y)
  1605  		return z
  1606  	}
  1607  
  1608  	z.acc = Exact
  1609  	if x.form == zero && y.form == inf || x.form == inf && y.form == zero {
  1610  		// ±0 * ±Inf
  1611  		// ±Inf * ±0
  1612  		// value of z is undefined but make sure it's valid
  1613  		z.form = zero
  1614  		z.neg = false
  1615  		panic(ErrNaN{"multiplication of zero with infinity"})
  1616  	}
  1617  
  1618  	if x.form == inf || y.form == inf {
  1619  		// ±Inf * y
  1620  		// x * ±Inf
  1621  		z.form = inf
  1622  		return z
  1623  	}
  1624  
  1625  	// ±0 * y
  1626  	// x * ±0
  1627  	z.form = zero
  1628  	return z
  1629  }
  1630  
  1631  // Quo sets z to the rounded quotient x/y and returns z.
  1632  // Precision, rounding, and accuracy reporting are as for [Float.Add].
  1633  // Quo panics with [ErrNaN] if both operands are zero or infinities.
  1634  // The value of z is undefined in that case.
  1635  func (z *Float) Quo(x, y *Float) *Float {
  1636  	if debugFloat {
  1637  		x.validate()
  1638  		y.validate()
  1639  	}
  1640  
  1641  	if z.prec == 0 {
  1642  		z.prec = umax32(x.prec, y.prec)
  1643  	}
  1644  
  1645  	z.neg = x.neg != y.neg
  1646  
  1647  	if x.form == finite && y.form == finite {
  1648  		// x / y (common case)
  1649  		z.uquo(x, y)
  1650  		return z
  1651  	}
  1652  
  1653  	z.acc = Exact
  1654  	if x.form == zero && y.form == zero || x.form == inf && y.form == inf {
  1655  		// ±0 / ±0
  1656  		// ±Inf / ±Inf
  1657  		// value of z is undefined but make sure it's valid
  1658  		z.form = zero
  1659  		z.neg = false
  1660  		panic(ErrNaN{"division of zero by zero or infinity by infinity"})
  1661  	}
  1662  
  1663  	if x.form == zero || y.form == inf {
  1664  		// ±0 / y
  1665  		// x / ±Inf
  1666  		z.form = zero
  1667  		return z
  1668  	}
  1669  
  1670  	// x / ±0
  1671  	// ±Inf / y
  1672  	z.form = inf
  1673  	return z
  1674  }
  1675  
  1676  // Cmp compares x and y and returns:
  1677  //
  1678  //	-1 if x <  y
  1679  //	 0 if x == y (incl. -0 == 0, -Inf == -Inf, and +Inf == +Inf)
  1680  //	+1 if x >  y
  1681  func (x *Float) Cmp(y *Float) int {
  1682  	if debugFloat {
  1683  		x.validate()
  1684  		y.validate()
  1685  	}
  1686  
  1687  	mx := x.ord()
  1688  	my := y.ord()
  1689  	switch {
  1690  	case mx < my:
  1691  		return -1
  1692  	case mx > my:
  1693  		return +1
  1694  	}
  1695  	// mx == my
  1696  
  1697  	// only if |mx| == 1 we have to compare the mantissae
  1698  	switch mx {
  1699  	case -1:
  1700  		return y.ucmp(x)
  1701  	case +1:
  1702  		return x.ucmp(y)
  1703  	}
  1704  
  1705  	return 0
  1706  }
  1707  
  1708  // ord classifies x and returns:
  1709  //
  1710  //	-2 if -Inf == x
  1711  //	-1 if -Inf < x < 0
  1712  //	 0 if x == 0 (signed or unsigned)
  1713  //	+1 if 0 < x < +Inf
  1714  //	+2 if x == +Inf
  1715  func (x *Float) ord() int {
  1716  	var m int
  1717  	switch x.form {
  1718  	case finite:
  1719  		m = 1
  1720  	case zero:
  1721  		return 0
  1722  	case inf:
  1723  		m = 2
  1724  	}
  1725  	if x.neg {
  1726  		m = -m
  1727  	}
  1728  	return m
  1729  }
  1730  
  1731  func umax32(x, y uint32) uint32 {
  1732  	if x > y {
  1733  		return x
  1734  	}
  1735  	return y
  1736  }
  1737  

View as plain text