From 622cd6d894f183a334eedfcfb293c6202be73f52 Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith@gmail.com>
Date: Sat, 23 Oct 2021 23:05:26 -0400
Subject: [PATCH 01/12] put fma in the right spot

---
 base/floatfuncs.jl | 53 +++++++++++++++++++++++++++++++++++++++-------
 test/math.jl       | 13 ++++++++++++
 2 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/base/floatfuncs.jl b/base/floatfuncs.jl
index bae27d642e7c7..9720eb4e1b8d4 100644
--- a/base/floatfuncs.jl
+++ b/base/floatfuncs.jl
@@ -342,30 +342,67 @@ significantly more expensive than `x*y+z`. `fma` is used to improve accuracy in
 algorithms. See [`muladd`](@ref).
 """
 function fma end
+function fma_emulated(a::Float32, b::Float32, c::Float32)
+    return Float32(Float64(a) * Float64(b) + Float64(c))
+end
+
+@inline function splitbits(x::Float64)
+    xhi = reinterpret(Float64, reinterpret(UInt64, x) & 0xfffffffffc000000)
+    xlo = x-xhi
+    return xhi,xlo
+end
+
+@inline function twomul(a::Float64, b::Float64)
+    ahi, alo = splitbits(a)
+    bhi, blo = splitbits(b)
+    abhi = ahi*bhi
+    ablo = alo*blo + ahi*blo + alo*bhi
+    return abhi, ablo
+end
 
-fma_libm(x::Float32, y::Float32, z::Float32) =
-    ccall(("fmaf", libm_name), Float32, (Float32,Float32,Float32), x, y, z)
-fma_libm(x::Float64, y::Float64, z::Float64) =
-    ccall(("fma", libm_name), Float64, (Float64,Float64,Float64), x, y, z)
+function fma_emulated(a::Float64, b::Float64,c::Float64)
+    abhi, ablo = twomul(a,b)
+    abhitemp = abhi+ablo
+    signab = sign(abhi)
+    if abhitemp == signab*Inf #rethink our life choices
+        if sign(c) == signab  || abhi == 0.0
+            return signab * Inf
+        else #compute a*b+c = a*(b+c/a)
+            inva = inv(a)
+            aochi = c * inva
+            aoclo = fma_emulated(-aochi, a, c) * inva # Yes this is recursive. Bite me
+            aocpbhi = b + aochi
+            aocpblo = (b-aocpbhi+aochi+aoclo)
+            reshi,reslo = twomul(aocpbhi, a)
+            return reshi + (reslo+a*aocpblo)
+        end
+    end
+    ablo = abhitemp - abhi
+    r = abhi+c
+    s = (abs(abhi) > abs(c)) ? (abhi-r+c+ablo) : (c-r+abhi+ablo)
+    zh = r+s
+    return zh
+end
 fma_llvm(x::Float32, y::Float32, z::Float32) = fma_float(x, y, z)
 fma_llvm(x::Float64, y::Float64, z::Float64) = fma_float(x, y, z)
 # Disable LLVM's fma if it is incorrect, e.g. because LLVM falls back
-# onto a broken system libm; if so, use openlibm's fma instead
+# onto a broken system libm; if so, use a software emulated fma
 # 1.0000305f0 = 1 + 1/2^15
 # 1.0000000009313226 = 1 + 1/2^30
 # If fma_llvm() clobbers the rounding mode, the result of 0.1 + 0.2 will be 0.3
 # instead of the properly-rounded 0.30000000000000004; check after calling fma
+# TODO actually detect fma in hardware and switch on that.
 if (Sys.ARCH !== :i686 && fma_llvm(1.0000305f0, 1.0000305f0, -1.0f0) == 6.103609f-5 &&
     (fma_llvm(1.0000000009313226, 1.0000000009313226, -1.0) ==
      1.8626451500983188e-9) && 0.1 + 0.2 == 0.30000000000000004)
     fma(x::Float32, y::Float32, z::Float32) = fma_llvm(x,y,z)
     fma(x::Float64, y::Float64, z::Float64) = fma_llvm(x,y,z)
 else
-    fma(x::Float32, y::Float32, z::Float32) = fma_libm(x,y,z)
-    fma(x::Float64, y::Float64, z::Float64) = fma_libm(x,y,z)
+    fma(x::Float32, y::Float32, z::Float32) = fma_emulated(x,y,z)
+    fma(x::Float64, y::Float64, z::Float64) = fma_emulated(x,y,z)
 end
 function fma(a::Float16, b::Float16, c::Float16)
-    Float16(fma(Float32(a), Float32(b), Float32(c)))
+    Float16(muladd(Float32(a), Float32(b), Float32(c))) #don't use fma if the hardware doesn't have it.
 end
 
 # This is necessary at least on 32-bit Intel Linux, since fma_llvm may
diff --git a/test/math.jl b/test/math.jl
index cdcc1c5c6a47d..78dc950a6aeee 100644
--- a/test/math.jl
+++ b/test/math.jl
@@ -1286,3 +1286,16 @@ end
         @test_throws MethodError f(x)
     end
 end
+
+@testset "fma" begin
+    @test fma(nextfloat(1.),nextfloat(1.),-1.0) === 4.440892098500626e-16
+    @test fma(nextfloat(1f0),nextfloat(1f0),-1f0) === 2.3841858f-7
+    for T in (Float32, Float64)
+        @test fma(floatmax(T), T(2), -floatmax(T)) === floatmax(T)
+        @test fma(floatmax(T), T(1), eps(floatmax((T)))) === T(Inf)
+        @test isnan_type(T, fma(T(Inf), T(1), -T(Inf)))
+        @test isnan_type(T, fma(T(Inf), T(0), -T(0)))
+    end
+    @test fma(floatmax(Float64), nextfloat(1.0), -floatmax(Float64)) === 3.991680619069439e292
+    @test fma(floatmax(Float32), nextfloat(1f0), -floatmax(Float32)) === 4.0564817f31
+end
\ No newline at end of file

From 3b2d796efb3c5d4f81d37db3d065150761dcdc6d Mon Sep 17 00:00:00 2001
From: oscarddssmith <oscar.smith@juliacomputing.com>
Date: Mon, 25 Oct 2021 13:15:14 -0400
Subject: [PATCH 02/12] working most of the time

---
 base/floatfuncs.jl | 24 ++++++++++--------------
 test/math.jl       | 30 ++++++++++++++++++++----------
 2 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/base/floatfuncs.jl b/base/floatfuncs.jl
index 9720eb4e1b8d4..9d5f69ccfcd1b 100644
--- a/base/floatfuncs.jl
+++ b/base/floatfuncs.jl
@@ -346,25 +346,22 @@ function fma_emulated(a::Float32, b::Float32, c::Float32)
     return Float32(Float64(a) * Float64(b) + Float64(c))
 end
 
-@inline function splitbits(x::Float64)
-    xhi = reinterpret(Float64, reinterpret(UInt64, x) & 0xfffffffffc000000)
-    xlo = x-xhi
-    return xhi,xlo
-end
-
 @inline function twomul(a::Float64, b::Float64)
-    ahi, alo = splitbits(a)
-    bhi, blo = splitbits(b)
-    abhi = ahi*bhi
-    ablo = alo*blo + ahi*blo + alo*bhi
-    return abhi, ablo
+    ahi = reinterpret(Float64, reinterpret(UInt64, a) & 0xffff_ffff_f800_0000)
+    alo = a-ahi
+    bhi = reinterpret(Float64, reinterpret(UInt64, b) & 0xffff_ffff_f800_0000)
+    blo = b-bhi
+    abhi = a*b
+    blohi = reinterpret(Float64, reinterpret(UInt64, blo) & 0xffff_ffff_f800_0000)
+    blolo = blo-blohi
+    ablo = alo*blohi - (((abhi - ahi*bhi) - alo*bhi) - ahi*blo) + blolo*alo
+    return abhi, ifelse(isfinite(abhi), ablo, abhi)
 end
 
 function fma_emulated(a::Float64, b::Float64,c::Float64)
     abhi, ablo = twomul(a,b)
-    abhitemp = abhi+ablo
     signab = sign(abhi)
-    if abhitemp == signab*Inf #rethink our life choices
+    if abhi == signab*Inf #rethink our life choices
         if sign(c) == signab  || abhi == 0.0
             return signab * Inf
         else #compute a*b+c = a*(b+c/a)
@@ -377,7 +374,6 @@ function fma_emulated(a::Float64, b::Float64,c::Float64)
             return reshi + (reslo+a*aocpblo)
         end
     end
-    ablo = abhitemp - abhi
     r = abhi+c
     s = (abs(abhi) > abs(c)) ? (abhi-r+c+ablo) : (c-r+abhi+ablo)
     zh = r+s
diff --git a/test/math.jl b/test/math.jl
index 78dc950a6aeee..f0e39e8a2cd67 100644
--- a/test/math.jl
+++ b/test/math.jl
@@ -1288,14 +1288,24 @@ end
 end
 
 @testset "fma" begin
-    @test fma(nextfloat(1.),nextfloat(1.),-1.0) === 4.440892098500626e-16
-    @test fma(nextfloat(1f0),nextfloat(1f0),-1f0) === 2.3841858f-7
-    for T in (Float32, Float64)
-        @test fma(floatmax(T), T(2), -floatmax(T)) === floatmax(T)
-        @test fma(floatmax(T), T(1), eps(floatmax((T)))) === T(Inf)
-        @test isnan_type(T, fma(T(Inf), T(1), -T(Inf)))
-        @test isnan_type(T, fma(T(Inf), T(0), -T(0)))
+    for fma in (fma, Base.fma_emulated)
+        @test fma(nextfloat(1.),nextfloat(1.),-1.0) === 4.440892098500626e-16
+        @test fma(nextfloat(1f0),nextfloat(1f0),-1f0) === 2.3841858f-7
+        for T in (Float32, Float64)
+            @test fma(floatmax(T), T(2), -floatmax(T)) === floatmax(T)
+            @test fma(floatmax(T), T(1), eps(floatmax((T)))) === T(Inf)
+            @test fma(T(Inf), T(Inf), T(Inf)) === T(Inf)
+            @test isnan_type(T, fma(T(Inf), T(1), -T(Inf)))
+            @test isnan_type(T, fma(T(Inf), T(0), -T(0)))
+
+        end
+        @test fma(floatmax(Float64), nextfloat(1.0), -floatmax(Float64)) === 3.991680619069439e292
+        @test fma(floatmax(Float32), nextfloat(1f0), -floatmax(Float32)) === 4.0564817f31
+        @test fma(1.6341681540852291e308, -2., floatmax(Float64)) == -1.4706431733081426e308 # case where inv(a)*c*a == Inf
+        @test fma(-2., 1.6341681540852291e308, floatmax(Float64)) == -1.4706431733081426e308 # case where inv(b)*c*b == Inf
+        for _ in 1:2^18
+            a, b, c = reinterpret.(Float64, rand(-4503599627370497:9218868437227405311, 3))
+            @test Base.fma_emulated(a, b, c) === fma(a, b, c) || a,b,c
+        end
     end
-    @test fma(floatmax(Float64), nextfloat(1.0), -floatmax(Float64)) === 3.991680619069439e292
-    @test fma(floatmax(Float32), nextfloat(1f0), -floatmax(Float32)) === 4.0564817f31
-end
\ No newline at end of file
+end

From c82ef92d29c47d208648d099556e17fb79488b68 Mon Sep 17 00:00:00 2001
From: oscarddssmith <oscar.smith@juliacomputing.com>
Date: Mon, 25 Oct 2021 17:14:35 -0400
Subject: [PATCH 03/12] stuff is broken

---
 base/floatfuncs.jl | 31 +++++++++++++++----------------
 test/math.jl       |  4 ++--
 2 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/base/floatfuncs.jl b/base/floatfuncs.jl
index 9d5f69ccfcd1b..bd734b98c8724 100644
--- a/base/floatfuncs.jl
+++ b/base/floatfuncs.jl
@@ -359,25 +359,24 @@ end
 end
 
 function fma_emulated(a::Float64, b::Float64,c::Float64)
-    abhi, ablo = twomul(a,b)
-    signab = sign(abhi)
-    if abhi == signab*Inf #rethink our life choices
-        if sign(c) == signab  || abhi == 0.0
-            return signab * Inf
-        else #compute a*b+c = a*(b+c/a)
-            inva = inv(a)
-            aochi = c * inva
-            aoclo = fma_emulated(-aochi, a, c) * inva # Yes this is recursive. Bite me
-            aocpbhi = b + aochi
-            aocpblo = (b-aocpbhi+aochi+aoclo)
-            reshi,reslo = twomul(aocpbhi, a)
-            return reshi + (reslo+a*aocpblo)
-        end
+    absab = abs(a*b)
+    if !isfinite(absab) || issubnormal(absab) || issubnormal(a) || issubnormal(b) || iszero(a)
+        !(isfinite(a) && isfinite(b) && isfinite(c)) && return a*b+c
+        iszero(a) || iszero(b) && return a*b+c
+        a_norm = reinterpret(Float64, reinterpret(UInt64, a) & 0xc00fffffffffffff)
+        b_norm = reinterpret(Float64, reinterpret(UInt64, b) & 0xc00fffffffffffff)
+        abhi, ablo = twomul(a_norm,b_norm)
+        bias = exponent(a) + exponent(b)
+        c_denorm = ldexp(c, bias)
+        isfinite(c_denorm) && return ldexp(fma_emulated(a_norm, b_norm, c_denorm), -bias)
+        c_denorm = ldexp(c, bias-54)
+        isfinite(c_denorm) && return ldexp(fma_emulated(ldexp(a_norm, -54), b_norm, c_denorm), bias+54)
+        return a*b+c
     end
+    abhi, ablo = twomul(a,b)
     r = abhi+c
     s = (abs(abhi) > abs(c)) ? (abhi-r+c+ablo) : (c-r+abhi+ablo)
-    zh = r+s
-    return zh
+    return r+s
 end
 fma_llvm(x::Float32, y::Float32, z::Float32) = fma_float(x, y, z)
 fma_llvm(x::Float64, y::Float64, z::Float64) = fma_float(x, y, z)
diff --git a/test/math.jl b/test/math.jl
index f0e39e8a2cd67..c1f7b3ced0a77 100644
--- a/test/math.jl
+++ b/test/math.jl
@@ -1297,7 +1297,7 @@ end
             @test fma(T(Inf), T(Inf), T(Inf)) === T(Inf)
             @test isnan_type(T, fma(T(Inf), T(1), -T(Inf)))
             @test isnan_type(T, fma(T(Inf), T(0), -T(0)))
-
+            @test fma(-zero(T), zero(T), -zero(T)) === -zero(T)
         end
         @test fma(floatmax(Float64), nextfloat(1.0), -floatmax(Float64)) === 3.991680619069439e292
         @test fma(floatmax(Float32), nextfloat(1f0), -floatmax(Float32)) === 4.0564817f31
@@ -1305,7 +1305,7 @@ end
         @test fma(-2., 1.6341681540852291e308, floatmax(Float64)) == -1.4706431733081426e308 # case where inv(b)*c*b == Inf
         for _ in 1:2^18
             a, b, c = reinterpret.(Float64, rand(-4503599627370497:9218868437227405311, 3))
-            @test Base.fma_emulated(a, b, c) === fma(a, b, c) || a,b,c
+            @test Base.fma_emulated(a, b, c) === fma(a, b, c) || (a,b,c)
         end
     end
 end

From c762c9c125fd76f055d8dcfeaf55be1993345410 Mon Sep 17 00:00:00 2001
From: oscarddssmith <oscar.smith@juliacomputing.com>
Date: Tue, 26 Oct 2021 17:45:13 -0400
Subject: [PATCH 04/12] fix Float64, found bug in Float32

---
 base/floatfuncs.jl | 65 +++++++++++++++++++++++++++++++---------------
 test/math.jl       | 10 +++----
 2 files changed, 49 insertions(+), 26 deletions(-)

diff --git a/base/floatfuncs.jl b/base/floatfuncs.jl
index bd734b98c8724..70d13d40e9678 100644
--- a/base/floatfuncs.jl
+++ b/base/floatfuncs.jl
@@ -343,37 +343,60 @@ algorithms. See [`muladd`](@ref).
 """
 function fma end
 function fma_emulated(a::Float32, b::Float32, c::Float32)
-    return Float32(Float64(a) * Float64(b) + Float64(c))
+    ab = Float64(a) * b
+    result = ab+c
+    # If converting to Float32 doesn't cause rounding problems
+    (reinterpret(UInt64, result)&0x0000_0000_1fff_ffff != 0x1000_0000) && isequal(ab, result-c)  && return Float32(result)
+    return result #TODO fixed.
+end
+
+""" Splits a Float64 into a hi bit and a low bit where the high bit has 27 trailing 0s and the low bit has 26 trailing 0s"""
+@inline function splitbits(x::Float64)
+    hi = reinterpret(Float64, reinterpret(UInt64, x) & 0xffff_ffff_f800_0000)
+    return hi, x-hi
 end
 
 @inline function twomul(a::Float64, b::Float64)
-    ahi = reinterpret(Float64, reinterpret(UInt64, a) & 0xffff_ffff_f800_0000)
-    alo = a-ahi
-    bhi = reinterpret(Float64, reinterpret(UInt64, b) & 0xffff_ffff_f800_0000)
-    blo = b-bhi
+    ahi, alo = splitbits(a)
+    bhi, blo = splitbits(b)
     abhi = a*b
-    blohi = reinterpret(Float64, reinterpret(UInt64, blo) & 0xffff_ffff_f800_0000)
-    blolo = blo-blohi
+    blohi, blolo = splitbits(blo)
     ablo = alo*blohi - (((abhi - ahi*bhi) - alo*bhi) - ahi*blo) + blolo*alo
-    return abhi, ifelse(isfinite(abhi), ablo, abhi)
+    return abhi, ablo
 end
 
 function fma_emulated(a::Float64, b::Float64,c::Float64)
-    absab = abs(a*b)
-    if !isfinite(absab) || issubnormal(absab) || issubnormal(a) || issubnormal(b) || iszero(a)
-        !(isfinite(a) && isfinite(b) && isfinite(c)) && return a*b+c
-        iszero(a) || iszero(b) && return a*b+c
-        a_norm = reinterpret(Float64, reinterpret(UInt64, a) & 0xc00fffffffffffff)
-        b_norm = reinterpret(Float64, reinterpret(UInt64, b) & 0xc00fffffffffffff)
-        abhi, ablo = twomul(a_norm,b_norm)
+    abhi, ablo = twomul(a,b)
+    if !isfinite(abhi+c) || issubnormal(ablo) || issubnormal(a) || issubnormal(b)
+        !(isfinite(abhi+c)) && return abhi+c
+        (iszero(a) || iszero(b)) && return abhi+c
         bias = exponent(a) + exponent(b)
-        c_denorm = ldexp(c, bias)
-        isfinite(c_denorm) && return ldexp(fma_emulated(a_norm, b_norm, c_denorm), -bias)
-        c_denorm = ldexp(c, bias-54)
-        isfinite(c_denorm) && return ldexp(fma_emulated(ldexp(a_norm, -54), b_norm, c_denorm), bias+54)
-        return a*b+c
+        c_denorm = ldexp(c, -bias)
+        if isfinite(c_denorm)
+            # rescale a and b to [1,2), equivalent to ldexp(a, -exponent(a))
+            issubnormal(a) && (a *= 0x1p52)
+            issubnormal(b) && (b *= 0x1p52)
+            a = reinterpret(Float64, (reinterpret(UInt64, a) & 0x800fffffffffffff) | 0x3ff0000000000000)
+            b = reinterpret(Float64, (reinterpret(UInt64, b) & 0x800fffffffffffff) | 0x3ff0000000000000)
+            c = c_denorm
+            abhi, ablo = twomul(a,b)
+            r = abhi+c
+            s = (abs(abhi) > abs(c)) ? (abhi-r+c+ablo) : (c-r+abhi+ablo)
+            sumhi = r+s
+            if issubnormal(ldexp(sumhi, bias)) #black magic don't worry about it.
+                sumlo = sumhi-r-s
+                bits_lost = -bias-exponent(sumhi)-1022
+                sumhiInt = reinterpret(UInt64, sumhi)
+                if sumlo != 0 && ((bits_lost != 1) ⊻ (sumhiInt&1 == 1))
+                    sumhi = nextfloat(sumhi, ifelse(signbit(sumhi)==signbit(sumlo), -1, 1))
+                end
+            end
+            return ldexp(sumhi, bias)
+        end
+        isinf(abhi) && signbit(c) == signbit(a*b) && return abhi
+        # fall through
     end
-    abhi, ablo = twomul(a,b)
+
     r = abhi+c
     s = (abs(abhi) > abs(c)) ? (abhi-r+c+ablo) : (c-r+abhi+ablo)
     return r+s
diff --git a/test/math.jl b/test/math.jl
index c1f7b3ced0a77..3e4f8e7d53a2e 100644
--- a/test/math.jl
+++ b/test/math.jl
@@ -1291,21 +1291,21 @@ end
     for fma in (fma, Base.fma_emulated)
         @test fma(nextfloat(1.),nextfloat(1.),-1.0) === 4.440892098500626e-16
         @test fma(nextfloat(1f0),nextfloat(1f0),-1f0) === 2.3841858f-7
-        for T in (Float32, Float64)
+        @testset "$T" for T in (Float32, Float64)
             @test fma(floatmax(T), T(2), -floatmax(T)) === floatmax(T)
             @test fma(floatmax(T), T(1), eps(floatmax((T)))) === T(Inf)
             @test fma(T(Inf), T(Inf), T(Inf)) === T(Inf)
             @test isnan_type(T, fma(T(Inf), T(1), -T(Inf)))
             @test isnan_type(T, fma(T(Inf), T(0), -T(0)))
             @test fma(-zero(T), zero(T), -zero(T)) === -zero(T)
+            for _ in 1:2^18
+                a, b, c = reinterpret.(T, rand(uinttype(T), 3))
+                @test isequal(Base.fma_emulated(a, b, c), fma(a, b, c)) || (a,b,c)
+            end
         end
         @test fma(floatmax(Float64), nextfloat(1.0), -floatmax(Float64)) === 3.991680619069439e292
         @test fma(floatmax(Float32), nextfloat(1f0), -floatmax(Float32)) === 4.0564817f31
         @test fma(1.6341681540852291e308, -2., floatmax(Float64)) == -1.4706431733081426e308 # case where inv(a)*c*a == Inf
         @test fma(-2., 1.6341681540852291e308, floatmax(Float64)) == -1.4706431733081426e308 # case where inv(b)*c*b == Inf
-        for _ in 1:2^18
-            a, b, c = reinterpret.(Float64, rand(-4503599627370497:9218868437227405311, 3))
-            @test Base.fma_emulated(a, b, c) === fma(a, b, c) || (a,b,c)
-        end
     end
 end

From 5091397e81da8ecf14b359eb92b543de32a39f8f Mon Sep 17 00:00:00 2001
From: oscarddssmith <oscar.smith@juliacomputing.com>
Date: Wed, 27 Oct 2021 10:57:45 -0400
Subject: [PATCH 05/12] fixed float32

---
 base/floatfuncs.jl | 12 +++++++-----
 test/math.jl       |  1 +
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/base/floatfuncs.jl b/base/floatfuncs.jl
index 70d13d40e9678..a3e0ab9c83cd6 100644
--- a/base/floatfuncs.jl
+++ b/base/floatfuncs.jl
@@ -342,12 +342,14 @@ significantly more expensive than `x*y+z`. `fma` is used to improve accuracy in
 algorithms. See [`muladd`](@ref).
 """
 function fma end
-function fma_emulated(a::Float32, b::Float32, c::Float32)
+function fma_emulated(a::Float32, b::Float32, c::Float32)::Float32
     ab = Float64(a) * b
-    result = ab+c
-    # If converting to Float32 doesn't cause rounding problems
-    (reinterpret(UInt64, result)&0x0000_0000_1fff_ffff != 0x1000_0000) && isequal(ab, result-c)  && return Float32(result)
-    return result #TODO fixed.
+    res = ab+c
+    reinterpret(UInt64, res)&0x1fff_ffff!=0x1000_0000 && return res
+    # yes error compensation is necessary. It sucks
+    reslo = abs(c)>abs(ab) ? ab-(res - c) : c-(res - ab)
+    res = iszero(reslo) ? res : (signbit(reslo) ? prevfloat(res) : nextfloat(res))
+    return res
 end
 
 """ Splits a Float64 into a hi bit and a low bit where the high bit has 27 trailing 0s and the low bit has 26 trailing 0s"""
diff --git a/test/math.jl b/test/math.jl
index 3e4f8e7d53a2e..e0c913e904aa6 100644
--- a/test/math.jl
+++ b/test/math.jl
@@ -1307,5 +1307,6 @@ end
         @test fma(floatmax(Float32), nextfloat(1f0), -floatmax(Float32)) === 4.0564817f31
         @test fma(1.6341681540852291e308, -2., floatmax(Float64)) == -1.4706431733081426e308 # case where inv(a)*c*a == Inf
         @test fma(-2., 1.6341681540852291e308, floatmax(Float64)) == -1.4706431733081426e308 # case where inv(b)*c*b == Inf
+        @test fma(-1.9369631f13, 2.1513551f-7, -1.7354427f-24) == -4.1670958f6
     end
 end

From cd104897e8c6dc5ac91b5f0a3ebcb2fd9e06b2c7 Mon Sep 17 00:00:00 2001
From: oscarddssmith <oscar.smith@juliacomputing.com>
Date: Wed, 27 Oct 2021 11:32:30 -0400
Subject: [PATCH 06/12] fix float64

---
 base/floatfuncs.jl | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/base/floatfuncs.jl b/base/floatfuncs.jl
index a3e0ab9c83cd6..eb7e6ee8b7edd 100644
--- a/base/floatfuncs.jl
+++ b/base/floatfuncs.jl
@@ -369,11 +369,13 @@ end
 
 function fma_emulated(a::Float64, b::Float64,c::Float64)
     abhi, ablo = twomul(a,b)
-    if !isfinite(abhi+c) || issubnormal(ablo) || issubnormal(a) || issubnormal(b)
-        !(isfinite(abhi+c)) && return abhi+c
+    #display(twomul(a,b))
+    if !isfinite(abhi+c) || isless(abs(abhi), nextfloat(0x1p-969)) || issubnormal(a) || issubnormal(b)
+        isfinite(abhi+c) || return abhi+c
         (iszero(a) || iszero(b)) && return abhi+c
         bias = exponent(a) + exponent(b)
         c_denorm = ldexp(c, -bias)
+        #display((a,b,c_denorm))
         if isfinite(c_denorm)
             # rescale a and b to [1,2), equivalent to ldexp(a, -exponent(a))
             issubnormal(a) && (a *= 0x1p52)
@@ -381,6 +383,7 @@ function fma_emulated(a::Float64, b::Float64,c::Float64)
             a = reinterpret(Float64, (reinterpret(UInt64, a) & 0x800fffffffffffff) | 0x3ff0000000000000)
             b = reinterpret(Float64, (reinterpret(UInt64, b) & 0x800fffffffffffff) | 0x3ff0000000000000)
             c = c_denorm
+            #display((a,b,c))
             abhi, ablo = twomul(a,b)
             r = abhi+c
             s = (abs(abhi) > abs(c)) ? (abhi-r+c+ablo) : (c-r+abhi+ablo)
@@ -398,7 +401,6 @@ function fma_emulated(a::Float64, b::Float64,c::Float64)
         isinf(abhi) && signbit(c) == signbit(a*b) && return abhi
         # fall through
     end
-
     r = abhi+c
     s = (abs(abhi) > abs(c)) ? (abhi-r+c+ablo) : (c-r+abhi+ablo)
     return r+s

From 28dc996a81a6c51ce8b3737ed1b27d5b31eed82c Mon Sep 17 00:00:00 2001
From: oscarddssmith <oscar.smith@juliacomputing.com>
Date: Thu, 28 Oct 2021 12:19:53 -0400
Subject: [PATCH 07/12] fix test

---
 test/math.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/math.jl b/test/math.jl
index e0c913e904aa6..501e270e9445a 100644
--- a/test/math.jl
+++ b/test/math.jl
@@ -1299,7 +1299,7 @@ end
             @test isnan_type(T, fma(T(Inf), T(0), -T(0)))
             @test fma(-zero(T), zero(T), -zero(T)) === -zero(T)
             for _ in 1:2^18
-                a, b, c = reinterpret.(T, rand(uinttype(T), 3))
+                a, b, c = reinterpret.(T, rand(Base.uinttype(T), 3))
                 @test isequal(Base.fma_emulated(a, b, c), fma(a, b, c)) || (a,b,c)
             end
         end

From 5a2f20f2c3779cd57f6e2e23693348ec8989f307 Mon Sep 17 00:00:00 2001
From: oscarddssmith <oscar.smith@juliacomputing.com>
Date: Tue, 2 Nov 2021 17:43:43 -0400
Subject: [PATCH 08/12] just before I give up

---
 base/floatfuncs.jl | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/base/floatfuncs.jl b/base/floatfuncs.jl
index eb7e6ee8b7edd..b6d011b7d81a6 100644
--- a/base/floatfuncs.jl
+++ b/base/floatfuncs.jl
@@ -369,13 +369,11 @@ end
 
 function fma_emulated(a::Float64, b::Float64,c::Float64)
     abhi, ablo = twomul(a,b)
-    #display(twomul(a,b))
     if !isfinite(abhi+c) || isless(abs(abhi), nextfloat(0x1p-969)) || issubnormal(a) || issubnormal(b)
-        isfinite(abhi+c) || return abhi+c
+        (isfinite(a) && isfinite(b) && isfinite(c)) || return abhi+c
         (iszero(a) || iszero(b)) && return abhi+c
         bias = exponent(a) + exponent(b)
         c_denorm = ldexp(c, -bias)
-        #display((a,b,c_denorm))
         if isfinite(c_denorm)
             # rescale a and b to [1,2), equivalent to ldexp(a, -exponent(a))
             issubnormal(a) && (a *= 0x1p52)
@@ -383,17 +381,18 @@ function fma_emulated(a::Float64, b::Float64,c::Float64)
             a = reinterpret(Float64, (reinterpret(UInt64, a) & 0x800fffffffffffff) | 0x3ff0000000000000)
             b = reinterpret(Float64, (reinterpret(UInt64, b) & 0x800fffffffffffff) | 0x3ff0000000000000)
             c = c_denorm
-            #display((a,b,c))
+            display((a,b,c))
             abhi, ablo = twomul(a,b)
             r = abhi+c
             s = (abs(abhi) > abs(c)) ? (abhi-r+c+ablo) : (c-r+abhi+ablo)
             sumhi = r+s
             if issubnormal(ldexp(sumhi, bias)) #black magic don't worry about it.
-                sumlo = sumhi-r-s
+                sumlo = r-sumhi+s
                 bits_lost = -bias-exponent(sumhi)-1022
                 sumhiInt = reinterpret(UInt64, sumhi)
+                display((sumhi, sumlo, bits_lost, sumhiInt, bias))
                 if sumlo != 0 && ((bits_lost != 1) ⊻ (sumhiInt&1 == 1))
-                    sumhi = nextfloat(sumhi, ifelse(signbit(sumhi)==signbit(sumlo), -1, 1))
+                    sumhi = nextfloat(sumhi, ifelse(signbit(sumhi)==signbit(sumlo), 1, -1))
                 end
             end
             return ldexp(sumhi, bias)

From 367e920a78c25d5eb9430751d984aafcb2a1f213 Mon Sep 17 00:00:00 2001
From: oscarddssmith <oscar.smith@juliacomputing.com>
Date: Tue, 2 Nov 2021 17:48:54 -0400
Subject: [PATCH 09/12] I gave up

---
 base/floatfuncs.jl | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/base/floatfuncs.jl b/base/floatfuncs.jl
index b6d011b7d81a6..0bba3dc2dd684 100644
--- a/base/floatfuncs.jl
+++ b/base/floatfuncs.jl
@@ -372,6 +372,9 @@ function fma_emulated(a::Float64, b::Float64,c::Float64)
     if !isfinite(abhi+c) || isless(abs(abhi), nextfloat(0x1p-969)) || issubnormal(a) || issubnormal(b)
         (isfinite(a) && isfinite(b) && isfinite(c)) || return abhi+c
         (iszero(a) || iszero(b)) && return abhi+c
+        if issubnormal(abhi+c) #todo optimize
+            return Float64(big(a)*b+c)
+        end
         bias = exponent(a) + exponent(b)
         c_denorm = ldexp(c, -bias)
         if isfinite(c_denorm)
@@ -381,20 +384,10 @@ function fma_emulated(a::Float64, b::Float64,c::Float64)
             a = reinterpret(Float64, (reinterpret(UInt64, a) & 0x800fffffffffffff) | 0x3ff0000000000000)
             b = reinterpret(Float64, (reinterpret(UInt64, b) & 0x800fffffffffffff) | 0x3ff0000000000000)
             c = c_denorm
-            display((a,b,c))
             abhi, ablo = twomul(a,b)
             r = abhi+c
             s = (abs(abhi) > abs(c)) ? (abhi-r+c+ablo) : (c-r+abhi+ablo)
             sumhi = r+s
-            if issubnormal(ldexp(sumhi, bias)) #black magic don't worry about it.
-                sumlo = r-sumhi+s
-                bits_lost = -bias-exponent(sumhi)-1022
-                sumhiInt = reinterpret(UInt64, sumhi)
-                display((sumhi, sumlo, bits_lost, sumhiInt, bias))
-                if sumlo != 0 && ((bits_lost != 1) ⊻ (sumhiInt&1 == 1))
-                    sumhi = nextfloat(sumhi, ifelse(signbit(sumhi)==signbit(sumlo), 1, -1))
-                end
-            end
             return ldexp(sumhi, bias)
         end
         isinf(abhi) && signbit(c) == signbit(a*b) && return abhi

From c7a3192a7d7674b695e7299b69f545c01093e8ed Mon Sep 17 00:00:00 2001
From: oscarddssmith <oscar.smith@juliacomputing.com>
Date: Wed, 3 Nov 2021 10:20:09 -0400
Subject: [PATCH 10/12] everything works no bigfloat hacks needed!

---
 base/floatfuncs.jl | 11 ++++++++---
 test/math.jl       | 30 +++++++++++++++---------------
 2 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/base/floatfuncs.jl b/base/floatfuncs.jl
index 0bba3dc2dd684..824285a4cdfe3 100644
--- a/base/floatfuncs.jl
+++ b/base/floatfuncs.jl
@@ -372,9 +372,6 @@ function fma_emulated(a::Float64, b::Float64,c::Float64)
     if !isfinite(abhi+c) || isless(abs(abhi), nextfloat(0x1p-969)) || issubnormal(a) || issubnormal(b)
         (isfinite(a) && isfinite(b) && isfinite(c)) || return abhi+c
         (iszero(a) || iszero(b)) && return abhi+c
-        if issubnormal(abhi+c) #todo optimize
-            return Float64(big(a)*b+c)
-        end
         bias = exponent(a) + exponent(b)
         c_denorm = ldexp(c, -bias)
         if isfinite(c_denorm)
@@ -388,6 +385,14 @@ function fma_emulated(a::Float64, b::Float64,c::Float64)
             r = abhi+c
             s = (abs(abhi) > abs(c)) ? (abhi-r+c+ablo) : (c-r+abhi+ablo)
             sumhi = r+s
+            if issubnormal(ldexp(sumhi, bias)) #black magic don't worry about it.
+                sumlo = r-sumhi+s
+                bits_lost = -bias-exponent(sumhi)-1022
+                sumhiInt = reinterpret(UInt64, sumhi)
+                if (bits_lost != 1) ⊻ (sumhiInt&1 == 1)
+                    sumhi = nextfloat(sumhi, cmp(sumlo,0))
+                end
+            end
             return ldexp(sumhi, bias)
         end
         isinf(abhi) && signbit(c) == signbit(a*b) && return abhi
diff --git a/test/math.jl b/test/math.jl
index 501e270e9445a..f935156a029f5 100644
--- a/test/math.jl
+++ b/test/math.jl
@@ -1288,25 +1288,25 @@ end
 end
 
 @testset "fma" begin
-    for fma in (fma, Base.fma_emulated)
-        @test fma(nextfloat(1.),nextfloat(1.),-1.0) === 4.440892098500626e-16
-        @test fma(nextfloat(1f0),nextfloat(1f0),-1f0) === 2.3841858f-7
+    for func in (fma, Base.fma_emulated)
+        @test func(nextfloat(1.),nextfloat(1.),-1.0) === 4.440892098500626e-16
+        @test func(nextfloat(1f0),nextfloat(1f0),-1f0) === 2.3841858f-7
         @testset "$T" for T in (Float32, Float64)
-            @test fma(floatmax(T), T(2), -floatmax(T)) === floatmax(T)
-            @test fma(floatmax(T), T(1), eps(floatmax((T)))) === T(Inf)
-            @test fma(T(Inf), T(Inf), T(Inf)) === T(Inf)
-            @test isnan_type(T, fma(T(Inf), T(1), -T(Inf)))
-            @test isnan_type(T, fma(T(Inf), T(0), -T(0)))
-            @test fma(-zero(T), zero(T), -zero(T)) === -zero(T)
+            @test func(floatmax(T), T(2), -floatmax(T)) === floatmax(T)
+            @test func(floatmax(T), T(1), eps(floatmax((T)))) === T(Inf)
+            @test func(T(Inf), T(Inf), T(Inf)) === T(Inf)
+            @test isnan_type(T, func(T(Inf), T(1), -T(Inf)))
+            @test isnan_type(T, func(T(Inf), T(0), -T(0)))
+            @test func(-zero(T), zero(T), -zero(T)) === -zero(T)
             for _ in 1:2^18
                 a, b, c = reinterpret.(T, rand(Base.uinttype(T), 3))
-                @test isequal(Base.fma_emulated(a, b, c), fma(a, b, c)) || (a,b,c)
+                @test isequal(func(a, b, c), fma(a, b, c)) || (a,b,c)
             end
         end
-        @test fma(floatmax(Float64), nextfloat(1.0), -floatmax(Float64)) === 3.991680619069439e292
-        @test fma(floatmax(Float32), nextfloat(1f0), -floatmax(Float32)) === 4.0564817f31
-        @test fma(1.6341681540852291e308, -2., floatmax(Float64)) == -1.4706431733081426e308 # case where inv(a)*c*a == Inf
-        @test fma(-2., 1.6341681540852291e308, floatmax(Float64)) == -1.4706431733081426e308 # case where inv(b)*c*b == Inf
-        @test fma(-1.9369631f13, 2.1513551f-7, -1.7354427f-24) == -4.1670958f6
+        @test fmafuncfloatmax(Float64), nextfloat(1.0), -floatmax(Float64)) === 3.991680619069439e292
+        @test func(floatmax(Float32), nextfloat(1f0), -floatmax(Float32)) === 4.0564817f31
+        @test func(1.6341681540852291e308, -2., floatmax(Float64)) == -1.4706431733081426e308 # case where inv(a)*c*a == Inf
+        @test func(-2., 1.6341681540852291e308, floatmax(Float64)) == -1.4706431733081426e308 # case where inv(b)*c*b == Inf
+        @test func(-1.9369631f13, 2.1513551f-7, -1.7354427f-24) == -4.1670958f6
     end
 end

From b82ef4adbd9a5a7676d4787e3203581ab4a12135 Mon Sep 17 00:00:00 2001
From: oscarddssmith <oscar.smith@juliacomputing.com>
Date: Wed, 3 Nov 2021 10:47:22 -0400
Subject: [PATCH 11/12] oops fixed tests

---
 test/math.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/math.jl b/test/math.jl
index f935156a029f5..da027fa8919f4 100644
--- a/test/math.jl
+++ b/test/math.jl
@@ -1303,7 +1303,7 @@ end
                 @test isequal(func(a, b, c), fma(a, b, c)) || (a,b,c)
             end
         end
-        @test fmafuncfloatmax(Float64), nextfloat(1.0), -floatmax(Float64)) === 3.991680619069439e292
+        @test func(floatmax(Float64), nextfloat(1.0), -floatmax(Float64)) === 3.991680619069439e292
         @test func(floatmax(Float32), nextfloat(1f0), -floatmax(Float32)) === 4.0564817f31
         @test func(1.6341681540852291e308, -2., floatmax(Float64)) == -1.4706431733081426e308 # case where inv(a)*c*a == Inf
         @test func(-2., 1.6341681540852291e308, floatmax(Float64)) == -1.4706431733081426e308 # case where inv(b)*c*b == Inf

From 892c1639b3b57b1da1d14735d719fa257303f818 Mon Sep 17 00:00:00 2001
From: oscarddssmith <oscar.smith@juliacomputing.com>
Date: Wed, 3 Nov 2021 13:11:32 -0400
Subject: [PATCH 12/12] make comment more explanatory

---
 base/floatfuncs.jl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/base/floatfuncs.jl b/base/floatfuncs.jl
index 824285a4cdfe3..4c2f03d304f81 100644
--- a/base/floatfuncs.jl
+++ b/base/floatfuncs.jl
@@ -385,7 +385,9 @@ function fma_emulated(a::Float64, b::Float64,c::Float64)
             r = abhi+c
             s = (abs(abhi) > abs(c)) ? (abhi-r+c+ablo) : (c-r+abhi+ablo)
             sumhi = r+s
-            if issubnormal(ldexp(sumhi, bias)) #black magic don't worry about it.
+            # If result is subnormal, ldexp will cause double rounding because subnormals have fewer mantisa bits.
+            # As such, we need to check whether round to even would lead to double rounding and manually round sumhi to avoid it.
+            if issubnormal(ldexp(sumhi, bias))
                 sumlo = r-sumhi+s
                 bits_lost = -bias-exponent(sumhi)-1022
                 sumhiInt = reinterpret(UInt64, sumhi)