Akhil Indurti has uploaded this change for review.
math: add guaranteed-precision FMA intrinsic
Currently, the precision of the float64 multiply-add operation
(x * y) + z varies across architectures. While generated code for
ppc64, s390x, and arm64 deterministically uses instructions satisfying
IEEE 754, code for a GOARCH with a recently introduced or no FMA
instruction set may perform intermediate rounding. Consequently,
applications cannot rely on results being identical across
GOARCH-dependent codepaths.
This CL introduces an intrinsic that performs an IEEE 754
double-precision fused-multiply-add operation. Assembly implementations
are given for architectures that support them. Otherwise, a software
fallback is given as the default implementation.
Specifically,
- arm64, ppc64, s390x: Uses the FMA instruction provided by all
these ISAs.
- mips[64][le]: Falls back to the software implementation. Only
release 6 of the ISA includes a strict FMA instruction with
MADDF.D (not implementation defined). Because the number of R6
processors in the wild is scarce, the assembly implementation
is left as a future optimization.
- x86: Guards the use of VFMADD213SD by checking cpu.X86.HasFMA.
- arm: Guards the use of VFMA by checking cpu.ARM.HasVFPv4.
- software fallback: A translation of MUSL's implementation with
mostly fixed-point arithmetic. This was chosen as an alternative to
BSD's after observing a nearly 50% speedup with MUSL's. It also aims
to portably raise exceptions without a function like feraiseexcept,
and doesn't rely on detecting the FPU's rounding mode.
DO NOT SUBMIT. Depends on
https://go-review.googlesource.com/c/go/+/126315 and
https://go-review.googlesource.com/c/go/+/123157 being merged in first.
Fixes #25819.
Change-Id: Iadadff2219638bacc9fec78d3ab885393fea4a08
---
M src/math/all_test.go
A src/math/fma.go
A src/math/fma_arm.s
A src/math/fma_arm64.s
A src/math/fma_ppc64x.s
A src/math/fma_s390x.s
A src/math/fma_x86.s
M src/math/stubs_mips64x.s
M src/math/stubs_mipsx.s
M src/math/unsafe.go
10 files changed, 596 insertions(+), 1 deletion(-)
diff --git a/src/math/all_test.go b/src/math/all_test.go
index 261d209..2a2242d 100644
--- a/src/math/all_test.go
+++ b/src/math/all_test.go
@@ -1994,6 +1994,270 @@
1023,
}
+// FMA test cases generated Berkeley TestFloat-3e/testfloat_gen, a BSD 3-Clause licensed program.
+// http://www.jhauser.us/arithmetic/TestFloat.html
+// The following are a random sampling of 256 from at least 180,795,884,544.
+//
+// ./testfloat_gen -seed 24227 -level 2 f64_mulAdd | head -n 256
+var fmaC = []struct{ x, y, z, want uint64 }{
+ {0x3A600000FFFFFFF0, 0x1FB7D1145C02D065, 0x3FE000010000007E, 0x3FE000010000007E},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000},
+ {0xC0122FFD19D13A1F, 0x41E00FFFC0000000, 0x400E00000000003F, 0xC202422CCE0D16F2},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000000001, 0x0000000000000001},
+ {0xC00FFE1000000000, 0xBFDFFFFFC000000F, 0x40300005FFFFFFFF, 0x4031FFE6FC003E00},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000000002, 0x0000000000000002},
+ {0x37EFFFF7FE000000, 0x37F0000100000003, 0xC7FFFFFFFFDFEFFE, 0xC7FFFFFFFFDFEFFE},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000000004, 0x0000000000000004},
+ {0x680007FFFFFFF7FF, 0xFD6A6E84A1E4D0C8, 0xAD836F812C4529E5, 0xFFF0000000000000},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000000008, 0x0000000000000008},
+ {0x2E50000120000000, 0x46012B30A716C9EF, 0xC3CFFFBF7FFFFFFE, 0xC3CFFFBF7FFFFFFE},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000000010, 0x0000000000000010},
+ {0xBFCFFFFBFFFFFF80, 0x000010000000FFFE, 0x3FB087FFFFFFFFFF, 0x3FB087FFFFFFFFFF},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000000020, 0x0000000000000020},
+ {0xBF20001FFFFFFFDF, 0x41E7FFFFFFFFDFFE, 0xFFF2813FAC2600CC, 0xFFFA813FAC2600CC},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000000040, 0x0000000000000040},
+ {0x7FEA80AFBF1335BB, 0xC3400000000001FE, 0xC80000FBFFFFFFFE, 0xFFF0000000000000},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000000080, 0x0000000000000080},
+ {0x409FBF0000000000, 0xB80B2F730A498E98, 0xC7FFFFFFE001FFFF, 0xC7FFFFFFE001FFFF},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000000100, 0x0000000000000100},
+ {0xBFF1FFFFFFFFFDFE, 0xC2417FFFFFFFFFFF, 0xBE001FFFFFBFFFFF, 0x4243AFFFFFFFFDCD},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000000200, 0x0000000000000200},
+ {0xB420000000200002, 0xC7EFFFFFFFC0001F, 0x40200000000083FE, 0x40200000000083FE},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000000400, 0x0000000000000400},
+ {0xC80FFFFFFF80FFFF, 0x41D3FFFFFFFFF7FF, 0x3CA007F7FFFFFFFE, 0xC9F3FFFFFFB097FE},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000000800, 0x0000000000000800},
+ {0xC7E58435956B2D20, 0xFFE00000010FFFFE, 0x7FEF800000000001, 0x7FF0000000000000},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000001000, 0x0000000000001000},
+ {0x380559E9B91B4F12, 0xF110000000FFFFFB, 0xC1D000004000007F, 0xE92559E9BA70EDA7},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000002000, 0x0000000000002000},
+ {0xC3C07FFFFFFFFEFF, 0x2FC6AEDE98522524, 0xB81007FFFFFFFF7F, 0xB81007FFFFFFFF7F},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000004000, 0x0000000000004000},
+ {0x4EC00010007FFFFF, 0xC1DBBB282F030256, 0x41CFFFF000003FFF, 0xD0ABBB43EB090A99},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000008000, 0x0000000000008000},
+ {0x381100003FFFFFFF, 0x6CE000200003FFFE, 0xB9207F7FFFFFFFFE, 0x650100224004BFFD},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000010000, 0x0000000000010000},
+ {0xC054DB1F5904AF61, 0x3D20200000000002, 0x41CFFF7FFFFFFEFF, 0x41CFFF7FFFFFFEFF},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000020000, 0x0000000000020000},
+ {0x37E8621B1475919C, 0xBA20000000002001, 0x37EF800008000000, 0x37EF800008000000},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000040000, 0x0000000000040000},
+ {0xBBBFFFFF5FFFFFFE, 0x41C000004007FFFF, 0xC15C901DFE41CF2C, 0xC15C901DFE41CF2C},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000080000, 0x0000000000080000},
+ {0x43D000000047FFFF, 0x380FFFFFFFFBFFEF, 0x3F9FFFFF0000003E, 0x3F9FFFFF0000003E},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000100000, 0x0000000000100000},
+ {0xB947A1CFDB22C999, 0xB806433140232624, 0xBFD3FFFFEFFFFFFF, 0xBFD3FFFFEFFFFFFF},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000200000, 0x0000000000200000},
+ {0x4B244985F34D547A, 0x7FD1FFFFFFFFEFFE, 0x41E830544F8065F7, 0x7FF0000000000000},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000400000, 0x0000000000400000},
+ {0x0020000040001FFE, 0x403FFFFFFFF5FFFE, 0xDE400000000000DF, 0xDE400000000000DF},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000000800000, 0x0000000000800000},
+ {0x3F0BF47621F86C18, 0xBFFFFFFE0000FFFE, 0xC1C10003FFFFFFFE, 0xC1C100040000037D},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000001000000, 0x0000000001000000},
+ {0xD960000000410000, 0x4010121F2C69547A, 0x419FFFFFFFFFF001, 0xD980121F2CAA9E19},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000002000000, 0x0000000002000000},
+ {0x902000000003FFF6, 0x800000007FFFFFEE, 0xC03E0000000001FF, 0xC03E0000000001FF},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000004000000, 0x0000000004000000},
+ {0xEEDFFFFFFF7FFFFF, 0xC10E000002000000, 0xC4B1007FFFFFFFFF, 0x6FFE00000187FFFF},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000008000000, 0x0000000008000000},
+ {0x4060000FFFFFFE00, 0xC084CD34529AF257, 0x52CDEEFB4866E094, 0x52CDEEFB4866E094},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000010000000, 0x0000000010000000},
+ {0xC7FA82550B3F06E6, 0x7F900000FF000000, 0xC1D000000001FFFE, 0xFFF0000000000000},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000020000000, 0x0000000020000000},
+ {0x4B2FFFFFFBFFEFFF, 0x3E60007FFFF7FFFF, 0x524FFFBFFFFFFFF6, 0x524FFFBFFFFFFFF6},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000040000000, 0x0000000040000000},
+ {0xB7EFFC0FFFFFFFFF, 0x43EE2234EC52B0F8, 0xCA800000000007F8, 0xCA800000000007F8},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000080000000, 0x0000000080000000},
+ {0x0FF000000FFF7FFE, 0x381022D7907EBDC1, 0x3C4FFC0000000100, 0x3C4FFC0000000100},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000100000000, 0x0000000100000000},
+ {0x37F290D8DAD31C97, 0xC00FF7FFFFFFFEFE, 0x3800080000000000, 0xB80510694938CE74},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000200000000, 0x0000000200000000},
+ {0xBFBFFF4000000000, 0xC1DFFFFFFFFFFFFF, 0xC02FFFFFFE007FFF, 0x41AFFF3FE0000001},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000400000000, 0x0000000400000000},
+ {0x000000000000021E, 0xC0108000000001FF, 0x401F800000000200, 0x401F800000000200},
+ {0x0000000000000000, 0x0000000000000000, 0x0000000800000000, 0x0000000800000000},
+ {0x244FFFFF7FFFFBFE, 0x47F0001FFFFFFBFF, 0x802001003FFFFFFE, 0x2C50001FBFFF79FE},
+ {0x0000000000000000, 0x0000000000000000, 0x0000001000000000, 0x0000001000000000},
+ {0x43E092F440CC8231, 0xBFEFFFFBFFFFFFFA, 0x6DB8E34F4F48B9F7, 0x6DB8E34F4F48B9F7},
+ {0x0000000000000000, 0x0000000000000000, 0x0000002000000000, 0x0000002000000000},
+ {0xC1CFFFFFFFFFEF7E, 0x418FFFFFFC000FFF, 0xBFE00080FFFFFFFF, 0xC36FFFFFFBFFFF7D},
+ {0x0000000000000000, 0x0000000000000000, 0x0000004000000000, 0x0000004000000000},
+ {0xB80FFFFFFFE00400, 0xB815C7BB8EEB803F, 0x3FCC000080000000, 0x3FCC000080000000},
+ {0x0000000000000000, 0x0000000000000000, 0x0000008000000000, 0x0000008000000000},
+ {0xC4340D3E00FABDB5, 0x1A6FE08000000000, 0xC1F000000000FFFB, 0xC1F000000000FFFB},
+ {0x0000000000000000, 0x0000000000000000, 0x0000010000000000, 0x0000010000000000},
+ {0x41D20003FFFFFFFF, 0x41DFFBFFFFFFFFEE, 0x43F0000000000000, 0x43F23FB87FEFFFFF},
+ {0x0000000000000000, 0x0000000000000000, 0x0000020000000000, 0x0000020000000000},
+ {0x43C000100FFFFFFF, 0x3FCEE9704EC2A692, 0x405FFC0000001FFF, 0x439EE98F571C65A3},
+ {0x0000000000000000, 0x0000000000000000, 0x0000040000000000, 0x0000040000000000},
+ {0x3F2FFBF7FFFFFFFF, 0xC4777F08C2C4B070, 0x3FE0FFFEFFFFFFFF, 0xC3B77C1301EA2728},
+ {0x0000000000000000, 0x0000000000000000, 0x0000080000000000, 0x0000080000000000},
+ {0x41FFFFFFFFFFFFED, 0x401000000FFFFFF7, 0xB801FFFEFFFFFFFF, 0x422000000FFFFFED},
+ {0x0000000000000000, 0x0000000000000000, 0x0000100000000000, 0x0000100000000000},
+ {0xC020000000001EFF, 0x4644000001000000, 0x705003FFEFFFFFFE, 0x705003FFEFFFFFFE},
+ {0x0000000000000000, 0x0000000000000000, 0x0000200000000000, 0x0000200000000000},
+ {0x0010000001FFFF7F, 0xC3DFDFFFFFFFDFFF, 0xC1E0007EFFFFFFFF, 0xC1E0007EFFFFFFFF},
+ {0x0000000000000000, 0x0000000000000000, 0x0000400000000000, 0x0000400000000000},
+ {0xC0EFFFEFFFF7FFFF, 0x9ABE6ECE440B95FA, 0xBFB1FFFFFBFFFFFE, 0xBFB1FFFFFBFFFFFE},
+ {0x0000000000000000, 0x0000000000000000, 0x0000800000000000, 0x0000800000000000},
+ {0xC01000001000007E, 0xC3E1051533EA6A42, 0xFFF00000003BFFFF, 0xFFF80000003BFFFF},
+ {0x0000000000000000, 0x0000000000000000, 0x0001000000000000, 0x0001000000000000},
+ {0x47F0800000001FFF, 0x41EFFFFFFFBFF7FF, 0x403FFFDFFFFC0000, 0x49F07FFFFFDF1BDE},
+ {0x0000000000000000, 0x0000000000000000, 0x0002000000000000, 0x0002000000000000},
+ {0x3D4B936A1CE81C52, 0x3F9E4D820D12ED90, 0x00000200001FFFFE, 0x3CFA1CFE4492713F},
+ {0x0000000000000000, 0x0000000000000000, 0x0004000000000000, 0x0004000000000000},
+ {0x43ED0B6F0B7202F7, 0xF0B0080000000004, 0xA4EFDFFFFFDFFFFE, 0xF4AD19F4C2F7BC00},
+ {0x0000000000000000, 0x0000000000000000, 0x0008000000000000, 0x0008000000000000},
+ {0xC1F008001FFFFFFE, 0xC0DFFFFFFFFFF7F6, 0xB7C0001004000000, 0x42E008001FFFFBF7},
+ {0x0000000000000000, 0x0000000000000000, 0x000C000000000000, 0x000C000000000000},
+ {0x40207FFFFFFFFFFF, 0x6DAFF80003FFFFFF, 0xA440040000001FFF, 0x6DE07BE0020FFFFE},
+ {0x0000000000000000, 0x0000000000000000, 0x000E000000000000, 0x000E000000000000},
+ {0xBFC0007800000000, 0xC1E4B615059952A4, 0xC03ACE1E170E22CA, 0x41B4B6B04068DE8B},
+ {0x0000000000000000, 0x0000000000000000, 0x000F000000000000, 0x000F000000000000},
+ {0xC01A91F459B3B16D, 0x0AB8EAC4F3D7A2D6, 0x3E44C50931E0038A, 0x3E44C50931E0038A},
+ {0x0000000000000000, 0x0000000000000000, 0x000F800000000000, 0x000F800000000000},
+ {0x3FE0010000000FFF, 0xC3E0000002000010, 0xB817DBD0ADD317E3, 0xC3D001000200300F},
+ {0x0000000000000000, 0x0000000000000000, 0x000FC00000000000, 0x000FC00000000000},
+ {0x3CB0000003FFFFFF, 0xC34001FFF8000000, 0x381FFFFFFFFFD7FF, 0xC00001FFFC007FFD},
+ {0x0000000000000000, 0x0000000000000000, 0x000FE00000000000, 0x000FE00000000000},
+ {0x40000000207FFFFF, 0x7FFFFFFFFFFFBF7F, 0x37E0000020007FFF, 0x7FFFFFFFFFFFBF7F},
+ {0x0000000000000000, 0x0000000000000000, 0x000FF00000000000, 0x000FF00000000000},
+ {0xC480000000408000, 0xA09FFDFFFFFFFFFF, 0xFFF00000007FFFFE, 0xFFF80000007FFFFE},
+ {0x0000000000000000, 0x0000000000000000, 0x000FF80000000000, 0x000FF80000000000},
+ {0xBE1FFFFFFF000FFF, 0x7FEFFF8007FFFFFF, 0x4687625EB01B3B85, 0xFE1FFF80070013FE},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFC0000000000, 0x000FFC0000000000},
+ {0xC1FFDFFFFFFDFFFF, 0x3DAF00003FFFFFFF, 0xC3C0007FFFFFFBFF, 0xC3C0007FFFFFFBFF},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFE0000000000, 0x000FFE0000000000},
+ {0x3EAFFFFFFFFFF802, 0x326000200000FFFF, 0x3ECDF8007752B787, 0x3ECDF8007752B787},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFF0000000000, 0x000FFF0000000000},
+ {0x41D000008000000E, 0xC3FFFFFFFFDFFFF7, 0xDEF00000800000FE, 0xDEF00000800000FE},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFF8000000000, 0x000FFF8000000000},
+ {0xDADFFFFFDC000000, 0x43E1FFFB5E85A588, 0x41E00200000FFFFF, 0xDED1FFFB4A45AABE},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFC000000000, 0x000FFFC000000000},
+ {0x800F22F515F2D593, 0x3B00007FFFFFFFF7, 0x3E0B5496C93F1DB6, 0x3E0B5496C93F1DB6},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFE000000000, 0x000FFFE000000000},
+ {0x3FD000001FFF7FFE, 0xFFDFFFFFFF1FFFFE, 0x37F000FFFFFFFFFC, 0xFFC000001F8F7FFC},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFF000000000, 0x000FFFF000000000},
+ {0x39FFFFFFFFFFE800, 0xC01FFFFFFFFFF7BF, 0x406FFFFFFFFF8006, 0x406FFFFFFFFF8006},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFF800000000, 0x000FFFF800000000},
+ {0xC2001FF800000000, 0x7FFA8ED51F1206E0, 0xC1FC000001FFFFFF, 0x7FFA8ED51F1206E0},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFC00000000, 0x000FFFFC00000000},
+ {0xC3C4258B518163B8, 0xBFC00000000007F7, 0xBB500000000000DF, 0x4394258B51816DBF},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFE00000000, 0x000FFFFE00000000},
+ {0xBF707FFFFFFFFFF0, 0xB93000041FFFFFFE, 0xC1C7592437EE97E2, 0xC1C7592437EE97E2},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFF00000000, 0x000FFFFF00000000},
+ {0x0DF0000000FFFF7F, 0x41D000007FFFFF7E, 0xBFCC6E8D659E4D60, 0xBFCC6E8D659E4D60},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFF80000000, 0x000FFFFF80000000},
+ {0xBF605DD50836E397, 0x424FFFFC0000001F, 0x3FF39CB7D3DBCC1C, 0xC1C05DD2FBDF5CE1},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFC0000000, 0x000FFFFFC0000000},
+ {0xC5900000040FFFFF, 0x3D1B81385EF84A47, 0x43D0FFFFFFFFFF7F, 0x43D0FFF91FB1E602},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFE0000000, 0x000FFFFFE0000000},
+ {0xC39023FFFFFFFFFF, 0xC1D7FC5D4638372A, 0x0008FA15004C1955, 0x45783255181635A5},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFF0000000, 0x000FFFFFF0000000},
+ {0x404F9FFFFFFFFFFF, 0x403BA1A785BFC1CA, 0x3FF00021FFFFFFFE, 0x409B52C297AE8284},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFF8000000, 0x000FFFFFF8000000},
+ {0xC0200000FFDFFFFF, 0xC7F26F008BD60EB9, 0xBF500000000011FE, 0x48226F01B2A13974},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFC000000, 0x000FFFFFFC000000},
+ {0x40DCFB7EE68DFCBF, 0x436FFFFFFF8001FF, 0xB43A10E0857B778E, 0x445CFB7EE61A1092},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFE000000, 0x000FFFFFFE000000},
+ {0xC00FDFFFFEFFFFFE, 0xC019AC104FC6F761, 0x14D000005FFFFFFF, 0x403992643EA9CFE6},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFF000000, 0x000FFFFFFF000000},
+ {0x43F00000007FF000, 0xFFE000007F000000, 0x41DFFFFFFEFFFFFF, 0xFFF0000000000000},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFF800000, 0x000FFFFFFF800000},
+ {0x2B4B6F99EB8DC68F, 0xC397D7F26039B041, 0xBF9003FFFFFFFFFB, 0xBF9003FFFFFFFFFB},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFC00000, 0x000FFFFFFFC00000},
+ {0x7E60000040000FFF, 0x4029DA5AD4EFCB6D, 0x37F000020000001E, 0x7E99DA5B3C595099},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFE00000, 0x000FFFFFFFE00000},
+ {0xFFD917E731FD2616, 0x3FCFFFFFFEFFFFDF, 0x4010000080000FFF, 0xFFB917E7313466C3},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFF00000, 0x000FFFFFFFF00000},
+ {0xC060000000DFFFFF, 0x40000000001F7FFE, 0xBE60400000008000, 0xC070000001079FFD},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFF80000, 0x000FFFFFFFF80000},
+ {0x40780000007FFFFF, 0xC3FFFFE1FFFFFFFF, 0xC01FFFFFF0020000, 0xC487FFE9807FFF86},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFC0000, 0x000FFFFFFFFC0000},
+ {0x38198DFE22D46B33, 0xBFF3FFFFF7FFFFFE, 0xBFE00001FFFFFFBF, 0xBFE00001FFFFFFBF},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFE0000, 0x000FFFFFFFFE0000},
+ {0xB94D312B64D81765, 0x3F91134546D1F7B4, 0x41DFFFFFFFEFFC00, 0x41DFFFFFFFEFFC00},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFF0000, 0x000FFFFFFFFF0000},
+ {0x382000001FFFFFC0, 0x3A4000200000003E, 0xBF28172AC8ABE532, 0xBF28172AC8ABE532},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFF8000, 0x000FFFFFFFFF8000},
+ {0xFF7F372E320C99D2, 0x801DFFFFFFFFDFFF, 0x47F480CD1AC016E4, 0x47F480CD1AC016E4},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFC000, 0x000FFFFFFFFFC000},
+ {0x3FE413E4F1DAEB3B, 0x275FF80000200000, 0xBFECEE526E30E56D, 0xBFECEE526E30E56D},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFE000, 0x000FFFFFFFFFE000},
+ {0xBFDFFFFFFC000001, 0xA5C00000000001F7, 0xFFEFFFFFBFFFFC00, 0xFFEFFFFFBFFFFC00},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFF000, 0x000FFFFFFFFFF000},
+ {0x3FD020007FFFFFFF, 0x3F90FDE6200C1C5F, 0xBFF073298B792452, 0xBFF06209A904E8EC},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFF800, 0x000FFFFFFFFFF800},
+ {0xC100000008FFFFFF, 0xBF80000400000020, 0x3CDFFFFF9FFFFFFF, 0x409000040900025F},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFFC00, 0x000FFFFFFFFFFC00},
+ {0x360245A7AE7E955B, 0x3A5007FFEFFFFFFF, 0x480020E2618CEE28, 0x480020E2618CEE28},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFFE00, 0x000FFFFFFFFFFE00},
+ {0x38100FFFFFFFFFEF, 0xCE30000000FFFFFB, 0x001BFFFFFFFFFFFF, 0xC65010000100FFEA},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFFF00, 0x000FFFFFFFFFFF00},
+ {0x40474C8EE93534BA, 0x43FFFDFFFFFFFDFE, 0x763C00000007FFFF, 0x763C00000007FFFF},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFFF80, 0x000FFFFFFFFFFF80},
+ {0x8020000400002000, 0x40252257ED7E42FC, 0x43FFF3A2D1376478, 0x43FFF3A2D1376478},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFFFC0, 0x000FFFFFFFFFFFC0},
+ {0xB1ADDB4ADF7E1695, 0xBF1C70AD5A8F4BF1, 0xC020000004000003, 0xC020000004000003},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFFFE0, 0x000FFFFFFFFFFFE0},
+ {0x3FF21F3F197F9A48, 0x411FFFFC0000003E, 0x41F6922E2764B9B2, 0x41F692BF214B6670},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFFFF0, 0x000FFFFFFFFFFFF0},
+ {0x403BFFFFFFFFFFE0, 0x41F1FFFFFFF7FFFE, 0xBFD9F83A22C6F0F4, 0x423F7FFFFFF197F8},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFFFF8, 0x000FFFFFFFFFFFF8},
+ {0xC3473E10747ADC99, 0x3FFFFFFFFFFF8006, 0xBFC7FFFBFFFFFFFE, 0xC3573E10747A7FA5},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFFFFC, 0x000FFFFFFFFFFFFC},
+ {0xC1C98279403ED336, 0x43E0000000F80000, 0xBF3338A7797A485F, 0xC5B9827941CA398D},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFFFFE, 0x000FFFFFFFFFFFFE},
+ {0x400FFFFFF000001E, 0xC0068FDF91795918, 0xC3E8000000000000, 0xC3E8000000000000},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFFFFF, 0x000FFFFFFFFFFFFF},
+ {0xBF6FFFFFC000001E, 0xBFB000000008000F, 0x403C100000000000, 0x403C100FFFFFE008},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFFFFD, 0x000FFFFFFFFFFFFD},
+ {0xC01000000FFBFFFE, 0x380E000000001FFF, 0x40200FFFFFFFFFFC, 0x40200FFFFFFFFFFC},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFFFFB, 0x000FFFFFFFFFFFFB},
+ {0x3F30F27923F1F9D4, 0x22A0000002001FFF, 0xC7F28591E5A9DC81, 0xC7F28591E5A9DC81},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFFFF7, 0x000FFFFFFFFFFFF7},
+ {0xB7E0000200000000, 0xBF90000020800000, 0xB7EF800002000000, 0xB7EF3FFFF97DFFF0},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFFFEF, 0x000FFFFFFFFFFFEF},
+ {0x521FFE0000002000, 0x3FFFFF8000001FFF, 0xC7E0000000000000, 0x522FFD8008003FFD},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFFFDF, 0x000FFFFFFFFFFFDF},
+ {0xD5AFFFDFFFBFFFFE, 0x40307FFFFFFFFFDE, 0xC77FFFFFFFFFFFFE, 0xD5F07FEF7FDEFFDD},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFFFBF, 0x000FFFFFFFFFFFBF},
+ {0x80010000000000FF, 0x4010000000001000, 0x37EFF9FFFFFFFFFF, 0x37EFF9FFFFFFFFFF},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFFF7F, 0x000FFFFFFFFFFF7F},
+ {0xF1F0800000000001, 0xBFEFFFFFF8000000, 0xBFEFF77FFFFFFFFF, 0x71F07FFFFBE00001},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFFEFF, 0x000FFFFFFFFFFEFF},
+ {0x37E0000000000020, 0x8F740000000FFFFF, 0xBF0FFFFFFFFFEFFC, 0xBF0FFFFFFFFFEFFC},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFFDFF, 0x000FFFFFFFFFFDFF},
+ {0xC0E0000000000024, 0x3F80000003EFFFFE, 0x43400103FFFFFFFE, 0x43400103FFFFFF7E},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFFBFF, 0x000FFFFFFFFFFBFF},
+ {0xC0B00080000000FE, 0xC007FFFF7FFFFFFF, 0x0F2FFF8000000003, 0x40C800BF7FFC017C},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFF7FF, 0x000FFFFFFFFFF7FF},
+ {0x3E93FFFFFF800000, 0x3FEFBFFFFFFC0000, 0xC1B0080000000010, 0xC1B008000000000B},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFEFFF, 0x000FFFFFFFFFEFFF},
+ {0xB81F8CEEB216EB17, 0x40000FFBFFFFFFFE, 0xB6B3FFFC00000000, 0xB82FAC73E58D4D78},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFDFFF, 0x000FFFFFFFFFDFFF},
+ {0xBE3FF800000FFFFE, 0xC801FFFFFFFFFFF7, 0xBE0000000010FFFF, 0x4651FB800008FFF6},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFFBFFF, 0x000FFFFFFFFFBFFF},
+ {0x43E7FFBFFFFFFFFE, 0xFFF0F109870AB7C3, 0xC01CAD7BDCE24C12, 0xFFF8F109870AB7C3},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFF7FFF, 0x000FFFFFFFFF7FFF},
+ {0x3FBFFFFFE00007FE, 0x3FF0200000400000, 0x30900007FFFF0000, 0x3FC01FFFF0200407},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFEFFFF, 0x000FFFFFFFFEFFFF},
+ {0xB80001FFFFFFFFFF, 0x480FFFFFFDFFFFFF, 0x3FFE9E7BA4C50CC5, 0xC0185C6114CE7CCC},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFDFFFF, 0x000FFFFFFFFDFFFF},
+ {0x400018F23CBD52B3, 0xBC3020000000001E, 0x3FC000010000000E, 0x3FC000010000000E},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFFBFFFF, 0x000FFFFFFFFBFFFF},
+ {0x1EB00000400001FF, 0x80B0000010020000, 0xC0191F114A87369D, 0xC0191F114A87369D},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFF7FFFF, 0x000FFFFFFFF7FFFF},
+ {0x0A1FDFFFFFFFF800, 0x3FDFDFFFFFF7FFFF, 0xC21FFFFFFFD80000, 0xC21FFFFFFFD80000},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFEFFFFF, 0x000FFFFFFFEFFFFF},
+ {0x380FFFFF7FFFFFEE, 0x40A0000000010FFE, 0x3FD0002000200000, 0x3FD0002000200000},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFDFFFFF, 0x000FFFFFFFDFFFFF},
+ {0x514F3A3A6351CB07, 0xA98FFFF0000FFFFF, 0x8000007FFDFFFFFF, 0xBAEF3A2AC644367A},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFFBFFFFF, 0x000FFFFFFFBFFFFF},
+ {0xB5800000000021FE, 0xC01000007FFFFFDF, 0x32DFD643F3D43430, 0x35A00000800022DC},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFF7FFFFF, 0x000FFFFFFF7FFFFF},
+ {0x47CF807FFFFFFFFF, 0xC034515F96FEEADD, 0x3805BB98AEBD2687, 0xC814006B5E214B2D},
+ {0x0000000000000000, 0x0000000000000000, 0x000FFFFFFEFFFFFF, 0x000FFFFFFEFFFFFF},
+}
+
func tolerance(a, b, e float64) bool {
// Multiplying by e here can underflow denormal values to zero.
// Check a==b so that at least if a and b are small and identical
@@ -2970,6 +3234,19 @@
}
}
+func TestFma(t *testing.T) {
+ for _, c := range fmaC {
+ x := Float64frombits(c.x)
+ y := Float64frombits(c.y)
+ z := Float64frombits(c.z)
+ want := Float64frombits(c.want)
+ got := Fma(x, y, z)
+ if !alike(got, want) {
+ t.Errorf("Fma(%v,%v,%v) == %v; want %v", x, y, z, got, want)
+ }
+ }
+}
+
// Check that math functions of high angle values
// return accurate results. [Since (vf[i] + large) - large != vf[i],
// testing for Trig(vf[i] + large) == Trig(vf[i]), where large is
@@ -3627,3 +3904,11 @@
}
GlobalF = x
}
+
+func BenchmarkFMA(b *testing.B) {
+ x := 0.0
+ for i := 0; i < b.N; i++ {
+ x = Fma(math.E, math.Pi, math.Phi)
+ }
+ GlobalF = x
+}
diff --git a/src/math/fma.go b/src/math/fma.go
new file mode 100644
index 0000000..762b95a
--- /dev/null
+++ b/src/math/fma.go
@@ -0,0 +1,230 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package math
+
+// The original C code is written for MUSL and can be found at
+// http://git.musl-libc.org/cgit/musl/tree/src/math/fma.c.
+// The Go code is a simplified version of the original C.
+// MUSL is licensed as follows:
+//
+// Copyright (c) 2005-2014 Rich Felker, et al. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to
+// the following conditions:
+//
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+import "math/bits"
+
+// Fma returns (x * y) + z, computed as if to infinite
+// precision with one rounding into a float64.
+func Fma(x, y, z float64) float64
+
+type num struct {
+ m uint64
+ e int32
+ sign int32
+}
+
+func normalNum(x float64) num {
+ ix := Float64bits(x)
+ e := int32(ix >> 52)
+ sign := e & 0x800
+ e &= 0x7ff
+ if e == 0 {
+ ix = Float64bits(x * (1 << 63))
+ e = int32(ix >> 52 & 0x7ff)
+ if e == 0 {
+ e = 0x800
+ } else {
+ e -= 63
+ }
+ }
+ ix &= (1 << 52) - 1
+ ix |= 1 << 52
+ ix <<= 1
+ e -= 0x3ff + 52 + 1
+ return num{ix, e, sign}
+}
+
+func fma(x, y, z float64) float64 {
+ // normalize so top 10 bits and last bit are 0
+ nx := normalNum(x)
+ ny := normalNum(y)
+ nz := normalNum(z)
+
+ const ZEROINFNAN = 0x7ff - 0x3ff - 52 - 1
+ if nx.e >= ZEROINFNAN || ny.e >= ZEROINFNAN {
+ return x*y + z
+ }
+ if nz.e >= ZEROINFNAN {
+ if nz.e > ZEROINFNAN {
+ return x*y + z
+ }
+ return z
+ }
+
+ // r = x * y
+ rhi, rlo := bits.Mul64(nx.m, ny.m)
+ // align exponents
+ e := nx.e + ny.e
+ d := nz.e - e
+ var zlo, zhi uint64
+ // shift bits z<<=kz, r>>=kr, so kz+kr == d, set e = e+kr (== ez-kz)
+ if d > 0 {
+ if d < 64 {
+ zlo = nz.m << uint(d)
+ zhi = nz.m >> (64 - uint(d))
+ } else {
+ zhi = nz.m
+ e = nz.e - 64
+ d -= 64
+ if d < 64 {
+ rlo = rhi<<(64-uint(d)) | rlo>>uint(d)
+ if (rlo << (64 - uint(d))) != 0 {
+ rlo |= 1
+ }
+ rhi = rhi >> uint(d)
+ } else if d != 0 {
+ rlo = 1
+ rhi = 0
+ }
+ }
+ } else {
+ d = -d
+ if d == 0 {
+ zlo = nz.m
+ } else if d < 64 {
+ zlo = nz.m >> uint(d)
+ if (nz.m << (64 - uint(d))) != 0 {
+ zlo |= 1
+ }
+ } else {
+ zlo = 1
+ }
+ }
+
+ // add
+ sign32 := (nx.sign ^ ny.sign)
+ sign := sign32 != 0
+ samesign := (sign32 ^ nz.sign) == 0
+ nonzero := true
+ if samesign {
+ rlo += zlo
+ rhi += zhi
+ if rlo < zlo {
+ rhi++
+ }
+ } else {
+ t := rlo
+ rlo -= zlo
+ rhi = rhi - zhi
+ if t < rlo {
+ rhi--
+ }
+ if (rhi >> 63) != 0 {
+ rlo = -rlo
+ rhi = -rhi
+ if rlo != 0 {
+ rhi--
+ }
+ sign = !sign
+ }
+ nonzero = rhi != 0
+ }
+
+ // set rhi to top 63bit of the result (last bit is sticky)
+ if nonzero {
+ e += 64
+ d = int32(bits.LeadingZeros64(rhi) - 1)
+ rhi = rhi<<uint(d) | rlo>>(64-uint(d))
+ if (rlo << uint(d)) != 0 {
+ rhi |= 1
+ }
+ } else if rlo != 0 {
+ d = int32(bits.LeadingZeros64(rlo) - 1)
+ if d < 0 {
+ rhi = rlo>>1 | (rlo & 1)
+ } else {
+ rhi = rlo << uint(d)
+ }
+ } else {
+ // exact +-0
+ return x*y + z
+ }
+ e -= d
+
+ // convert to float64
+ // i is in [1<<62,(1<<63)-1]
+ i := int64(rhi)
+ if sign {
+ i = -i
+ }
+ // |r| is in [0x1p62,0x1p63]
+ r := float64(i)
+
+ if e < -1022-62 {
+ // result is subnormal before rounding
+ if e == -1022-63 {
+ const FLT_MIN = 1.1754943508222875079687365372222456778186655567720875e-38
+ const DBL_MIN = 2.225073858507201383090232717332404064219215980462331e-308
+ c := float64(1 << 63)
+ if sign {
+ c = -c
+ }
+ if r == c {
+ // min normal after rounding, underflow depends
+ // on arch behaviour which can be imitated by
+ // a double to float conversion
+ fltmin := float32((0.268435448 * (1.0 / (1 << 63))) * FLT_MIN * r)
+ return float64(DBL_MIN / FLT_MIN * fltmin)
+ }
+ // one bit is lost when scaled, add another top bit to
+ // only round once at conversion if it is inexact
+ if (rhi << 53) != 0 {
+ i = int64(rhi>>1 | (rhi & 1) | 1<<62)
+ if sign {
+ i = -i
+ }
+ r = float64(i)
+ // remove top bit
+ r = 2*r - c
+ {
+ // raise underflow portably, such that it
+ // cannot be optimized away
+ tiny := DBL_MIN / FLT_MIN * r
+ r += (tiny * tiny) * (r - r)
+ }
+ }
+ } else {
+ // only round once when scaled
+ d = 10
+ ui := rhi >> uint(d)
+ if (rhi << (64 - uint(d))) != 0 {
+ ui |= 1
+ }
+ i = int64(ui << uint(d))
+ if sign {
+ i = -i
+ }
+ r = float64(i)
+ }
+ }
+ return Ldexp(r, int(e))
+}
diff --git a/src/math/fma_arm.s b/src/math/fma_arm.s
new file mode 100644
index 0000000..70e7481
--- /dev/null
+++ b/src/math/fma_arm.s
@@ -0,0 +1,16 @@
+#include "textflag.h"
+
+// func Fma(x, y, z float64) float64
+TEXT ·Fma(SB),NOSPLIT,$0
+ MOVBU ·internal∕cpu·ARM+const_arm_HasVFPv4(SB), R0
+ CMP $0, R0
+ BEQ soft
+ // hardware supports fma.
+ MOVD x+0(FP), F0
+ MOVD y+8(FP), F1
+ MOVD z+16(FP), F2
+ WORD $0xEEA12B00 // VFMA F0, F1, F2
+ MOVD F2, ret+24(FP)
+ RET
+soft:
+ B ·fma(SB)
diff --git a/src/math/fma_arm64.s b/src/math/fma_arm64.s
new file mode 100644
index 0000000..b6f5d0b
--- /dev/null
+++ b/src/math/fma_arm64.s
@@ -0,0 +1,10 @@
+#include "textflag.h"
+
+// func Fma(x, y, z float64) float64
+TEXT ·Fma(SB),NOSPLIT,$0
+ FMOVD x+0(FP), F0
+ FMOVD y+8(FP), F1
+ FMOVD z+16(FP), F2
+ FMADDD F0, F2, F1, F0
+ FMOVD F0, ret+24(FP)
+ RET
diff --git a/src/math/fma_ppc64x.s b/src/math/fma_ppc64x.s
new file mode 100644
index 0000000..5709148
--- /dev/null
+++ b/src/math/fma_ppc64x.s
@@ -0,0 +1,12 @@
+// +build ppc64 ppc64le
+
+#include "textflag.h"
+
+// func Fma(x, y, z float64) float64
+TEXT ·Fma(SB),NOSPLIT,$0
+ FMOVD x+0(FP), F0
+ FMOVD y+8(FP), F1
+ FMOVD z+16(FP), F2
+ FMADD F0, F2, F1, F0
+ FMOVD F0, ret+24(FP)
+ RET
diff --git a/src/math/fma_s390x.s b/src/math/fma_s390x.s
new file mode 100644
index 0000000..1771b8b
--- /dev/null
+++ b/src/math/fma_s390x.s
@@ -0,0 +1,10 @@
+#include "textflag.h"
+
+// func Fma(x, y, z float64) float64
+TEXT ·Fma(SB),NOSPLIT,$0
+ FMOVD x+0(FP), F0
+ FMOVD y+8(FP), F1
+ FMOVD z+16(FP), F2
+ FMADD F0, F1, F2
+ FMOVD F2, ret+24(FP)
+ RET
diff --git a/src/math/fma_x86.s b/src/math/fma_x86.s
new file mode 100644
index 0000000..89d2b0a
--- /dev/null
+++ b/src/math/fma_x86.s
@@ -0,0 +1,19 @@
+// +build 386 amd64 amd64p32
+
+#include "textflag.h"
+
+// func Fma(x, y, z float64) float64
+TEXT ·Fma(SB),NOSPLIT,$0
+ MOVB ·internal∕cpu·X86+const_x86_HasFMA(SB), AL
+ CMPB AL, $0
+ JE soft
+ // hardware supports FMA3
+ MOVLPD x+0(FP), X0
+ MOVLPD y+8(FP), X1
+ MOVLPD z+16(FP), X2
+ // X0 = X0 * X1 + X2
+ VFMADD213SD X2, X1, X0
+ MOVLPD X0, ret+24(FP)
+ RET
+soft:
+ JMP ·fma(SB)
diff --git a/src/math/stubs_mips64x.s b/src/math/stubs_mips64x.s
index b3ffa5b..389741d 100644
--- a/src/math/stubs_mips64x.s
+++ b/src/math/stubs_mips64x.s
@@ -113,3 +113,6 @@
TEXT ·Pow(SB),NOSPLIT,$0
JMP ·pow(SB)
+
+TEXT ·FMA(SB),NOSPLIT,$0
+ JMP ·fma(SB)
diff --git a/src/math/stubs_mipsx.s b/src/math/stubs_mipsx.s
index 129898e..d44bd93 100644
--- a/src/math/stubs_mipsx.s
+++ b/src/math/stubs_mipsx.s
@@ -111,3 +111,5 @@
TEXT ·Pow(SB),NOSPLIT,$0
JMP ·pow(SB)
+TEXT ·FMA(SB),NOSPLIT,$0
+ JMP ·fma(SB)
diff --git a/src/math/unsafe.go b/src/math/unsafe.go
index 5ae6742..fdd71ff 100644
--- a/src/math/unsafe.go
+++ b/src/math/unsafe.go
@@ -4,7 +4,15 @@
package math
-import "unsafe"
+import (
+ "internal/cpu"
+ "unsafe"
+)
+
+var (
+ arm_HasVFPv4 = unsafe.Offsetof(cpu.ARM.HasVFPv4)
+ x86_HasFMA = unsafe.Offsetof(cpu.X86.HasFMA)
+)
// Float32bits returns the IEEE 754 binary representation of f.
func Float32bits(f float32) uint32 { return *(*uint32)(unsafe.Pointer(&f)) }
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
All new files need the copyright header.
Akhil Indurti uploaded patch set #2 to this change.
10 files changed, 616 insertions(+), 1 deletion(-)
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Patch Set 1:
All new files need the copyright header.
Done.
Akhil Indurti uploaded patch set #3 to this change.
math: add guaranteed-precision FMA intrinsic
Currently, the precision of the float64 multiply-add operation
(x * y) + z varies across architectures. While generated code for
ppc64, s390x, and arm64 deterministically uses instructions satisfying
IEEE 754, code for a GOARCH with or without a recently introduced FMA
instruction set may perform intermediate rounding. Consequently,
applications cannot rely on results being identical across
GOARCH-dependent codepaths.
This CL introduces an intrinsic that performs an IEEE 754
double-precision fused-multiply-add operation. Assembly implementations
are given for architectures that support them. Otherwise, a software
fallback is given as the default implementation.
Specifically,
- arm64, ppc64, s390x: Uses the FMA instruction provided by all
of these ISAs.
- mips[64][le]: Falls back to the software implementation. Only
release 6 of the ISA includes a strict FMA instruction with
MADDF.D (not implementation defined). Because the number of R6
processors in the wild is scarce, the assembly implementation
is left as a future optimization.
- x86: Guards the use of VFMADD213SD by checking cpu.X86.HasFMA.
- arm: Guards the use of VFMA by checking cpu.ARM.HasVFPv4.
- software fallback: Translates MUSL's implementation with mostly
fixed-point arithmetic. This was chosen as an alternative to
BSD's after observing a nearly 50% speedup with MUSL's. It also aims
to portably raise exceptions without a function like feraiseexcept,
and doesn't rely on detecting the FPU's rounding mode.
DO NOT SUBMIT. Depends on
https://go-review.googlesource.com/c/go/+/126315 and
https://go-review.googlesource.com/c/go/+/123157 being merged in first.
Fixes #25819.
Change-Id: Iadadff2219638bacc9fec78d3ab885393fea4a08
---
M src/math/all_test.go
A src/math/fma.go
A src/math/fma_arm.s
A src/math/fma_arm64.s
A src/math/fma_ppc64x.s
A src/math/fma_s390x.s
A src/math/fma_x86.s
M src/math/stubs_mips64x.s
M src/math/stubs_mipsx.s
M src/math/unsafe.go
10 files changed, 616 insertions(+), 1 deletion(-)
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
I want to mention that I now have three competing software implementations for float64 FMA.
FreeBSD: https://github.com/smasher164/fma/blob/master/fma_bsd.go
MUSL: https://github.com/smasher164/fma/blob/master/fma_musl.go
Bellard: https://github.com/smasher164/fma/blob/master/fma_bellard.go
Benchmarking reveals
goos: darwin
goarch: amd64
pkg: github.com/smasher164/fma
BenchmarkBSD-4 20000000 58.9 ns/op
BenchmarkMUSL-4 50000000 33.8 ns/op
BenchmarkBellard-4 50000000 33.4 ns/op
PASS
ok github.com/smasher164/fma 4.677s
All of these implementations should be taken into account when reviewing this CL.
For arm64, ppc64{,le} and s390x I'd be tempted to make math.FMA a compiler intrinsic and not bother adding assembly at all. You should just need to add a couple of new cases to cmd/compile/internal/gc/ssa.go and some new tests in tests/codegen.
Akhil Indurti uploaded patch set #4 to this change.
math: add guaranteed-precision FMA intrinsic
Currently, the precision of the float64 multiply-add operation
(x * y) + z varies across architectures. While generated code for
ppc64, s390x, and arm64 deterministically uses instructions satisfying
IEEE 754, code for a GOARCH with or without a recently introduced FMA
instruction set may perform intermediate rounding. Consequently,
applications cannot rely on results being identical across
GOARCH-dependent codepaths.
This CL introduces an intrinsic that performs an IEEE 754
double-precision fused-multiply-add operation. Assembly implementations
are given for architectures that support them. Otherwise, a software
fallback is given as the default implementation.
Specifically,
- arm64, ppc64, s390x: Uses the FMA instruction provided by all
of these ISAs.
- mips[64][le]: Falls back to the software implementation. Only
release 6 of the ISA includes a strict FMA instruction with
MADDF.D (not implementation defined). Because the number of R6
processors in the wild is scarce, the assembly implementation
is left as a future optimization.
- x86: Guards the use of VFMADD213SD by checking cpu.X86.HasFMA.
- arm: Guards the use of VFMA by checking cpu.ARM.HasVFPv4.
- software fallback: Translates MUSL's implementation with mostly
fixed-point arithmetic. This was chosen as an alternative to
BSD's after observing a nearly 50% speedup with MUSL's. It also aims
to portably raise exceptions without a function like feraiseexcept.
DO NOT SUBMIT. Depends on
https://go-review.googlesource.com/c/go/+/126315 and
https://go-review.googlesource.com/c/go/+/123157 being merged in first.
Fixes #25819.
Change-Id: Iadadff2219638bacc9fec78d3ab885393fea4a08
---
M src/cmd/compile/internal/gc/ssa.go
M src/math/all_test.go
A src/math/fma.go
A src/math/fma_arm.s
A src/math/fma_x86.s
M src/math/stubs_mips64x.s
M src/math/stubs_mipsx.s
M src/math/unsafe.go
M test/codegen/math.go
9 files changed, 595 insertions(+), 1 deletion(-)
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Patch Set 3:
For arm64, ppc64{,le} and s390x I'd be tempted to make math.FMA a compiler intrinsic and not bother adding assembly at all. You should just need to add a couple of new cases to cmd/compile/internal/gc/ssa.go and some new tests in tests/codegen.
Patch Set 4 hopefully addresses this. This is my first time working in cmd/compile, so I appreciate any feedback! :)
2 comments:
File src/cmd/compile/internal/gc/ssa.go:
addF("math", "Fma",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue3(ssa.OpARM64FMADDD, types.Types[TFLOAT64], args[0], args[1], args[2])
},
sys.ARM64)
addF("math", "Fma",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue3(ssa.OpPPC64FMADD, types.Types[TFLOAT64], args[0], args[1], args[2])
},
sys.PPC64)
addF("math", "Fma",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue3(ssa.OpS390XFMADD, types.Types[TFLOAT64], args[0], args[1], args[2])
},
sys.S390X)
We should not directly generate machine-specific Ops here. We should introduce a generic Op (in cmd/compile/internal/ssa/gen/genericOps.go), generate that Op here, and lower it to the machine-specific Ops in the lower pass (using rewriting rules in cmd/compile/internal/ssa/*.rules).
Also, I think the compiler change should be a separate CL.
I'm probably late to the game, but I think Fma doesn't sound like a good name. The proposal says FMA, which it better, I think. Or maybe a more descriptive name like FusedMultiplyAdd, or FusedMulAdd?
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
1 comment:
Patch Set #4, Line 11: MOVB ·internal∕cpu·X86+const_x86_HasFMA(SB), AL
Instead of using a function here could we just make this an intrinsic that emits the guard in the compiler? See support_POPCNT in the runtime and compiler code. Probably best to add the generic code in math first and then intrinsics for each architecture each in an extra CL with codegen tests.
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Akhil Indurti uploaded patch set #5 to this change.
math: add guaranteed-precision FMA intrinsic
Currently, the precision of the float64 multiply-add operation
(x * y) + z varies across architectures. While generated code for
ppc64, s390x, and arm64 deterministically uses instructions satisfying
IEEE 754, code for a GOARCH with or without a recently introduced FMA
instruction set may perform intermediate rounding. Consequently,
applications cannot rely on results being identical across
GOARCH-dependent codepaths.
This CL introduces an intrinsic that performs an IEEE 754
double-precision fused-multiply-add operation. Assembly implementations
are given for architectures that support them. Otherwise, a software
fallback is given as the default implementation.
Specifically,
- arm64, ppc64, s390x: Uses the FMA instruction provided by all
of these ISAs.
- mips[64][le]: Falls back to the software implementation. Only
release 6 of the ISA includes a strict FMA instruction with
MADDF.D (not implementation defined). Because the number of R6
processors in the wild is scarce, the assembly implementation
is left as a future optimization.
- x86: Guards the use of VFMADD213SD by checking cpu.X86.HasFMA.
- arm: Guards the use of VFMA by checking cpu.ARM.HasVFPv4.
- software fallback: Translates MUSL's implementation with mostly
fixed-point arithmetic. This was chosen as an alternative to
BSD's after observing a nearly 50% speedup with MUSL's. It also aims
to portably raise exceptions without a function like feraiseexcept.
DO NOT SUBMIT. Depends on
https://go-review.googlesource.com/c/go/+/126315 and
https://go-review.googlesource.com/c/go/+/123157 being merged in first.
Fixes #25819.
Change-Id: Iadadff2219638bacc9fec78d3ab885393fea4a08
---
M src/math/all_test.go
A src/math/fma.go
A src/math/fma_arm.s
A src/math/fma_x86.s
M src/math/stubs_mips64x.s
M src/math/stubs_mipsx.s
M src/math/unsafe.go
7 files changed, 572 insertions(+), 1 deletion(-)
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
2 comments:
makeRoundAMD64 := func(op ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), supportSSE41, s.sb)
v := s.load(types.Types[TBOOL], addr)
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
bTrue := s.f.NewBlock(ssa.BlockPlain)
bFalse := s.f.NewBlock(ssa.BlockPlain)
bEnd := s.f.NewBlock(ssa.BlockPlain)
b.AddEdgeTo(bTrue)
b.AddEdgeTo(bFalse)
b.Likely = ssa.BranchLikely // most machines have sse4.1 nowadays
We should not directly generate machine-specific Ops here. […]
I have submitted https://go-review.googlesource.com/c/go/+/131959/1 as a separate CL.
I'm probably late to the game, but I think Fma doesn't sound like a good name. […]
“Fma” came from the convention of taking a <cmath> function and capitalizing its first letter. However, given that the name is an acronym, using the fully capitalized “FMA” makes logical sense.
If we are departing from the cmath convention, I’m a fan of “FusedMulAdd” for its brevity.
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
1 comment:
Patch Set #4, Line 11: MOVB ·internal∕cpu·X86+const_x86_HasFMA(SB), AL
Instead of using a function here could we just make this an intrinsic that emits the guard in the co […]
I assume the CL would involve
- preinitializing "support_fma" to cpu.X86.HasFMA.
- constructing an *ssa.Value that checks the value "support_fma" and either
- uses the ssa opcode directly or
- calls the normal function
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Akhil Indurti uploaded patch set #7 to this change.
math: add guaranteed-precision FMA intrinsic
Currently, the precision of the float64 multiply-add operation
(x * y) + z varies across architectures. While generated code for
ppc64, s390x, and arm64 can guarantee that there is no intermediate
rounding on those platforms, other architectures like x86, mips, and arm
will exhibit different behavior depending on available instruction set. Consequently, applications cannot rely on results being identical across
GOARCH-dependent codepaths.
This CL introduces an intrinsic that performs an IEEE 754
double-precision fused-multiply-add operation. Hardware implementations
are used when available. Otherwise, a software fallback is given as
the default implementation.
Specifically,
- arm64, ppc64, s390x: Uses the FMA instruction provided by all
of these ISAs.
- mips[64][le]: Falls back to the software implementation. Only
release 6 of the ISA includes a strict FMA instruction with
MADDF.D (not implementation defined). Because the number of R6
processors in the wild is scarce, the assembly implementation
is left as a future optimization.
- x86: Guards the use of VFMADD213SD by checking cpu.X86.HasFMA.
- arm: Guards the use of VFMA by checking cpu.ARM.HasVFPv4.
- software fallback: Translates MUSL's implementation with mostly
fixed-point arithmetic. This was chosen as an alternative to
BSD's after observing a nearly 50% speedup with MUSL's. It also aims
to portably raise exceptions without a function like feraiseexcept.
DO NOT SUBMIT. Depends on
https://go-review.googlesource.com/c/go/+/126315 being merged first.
Fixes #25819.
Change-Id: Iadadff2219638bacc9fec78d3ab885393fea4a08
---
M src/math/all_test.go
A src/math/fma.go
A src/math/fma_arm.s
M src/math/stubs_mips64x.s
M src/math/stubs_mipsx.s
M src/math/unsafe.go
6 files changed, 546 insertions(+), 1 deletion(-)
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
1 comment:
I assume the CL would involve […]
I attempt to address this in https://go-review.googlesource.com/c/go/+/137156/1
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Akhil Indurti uploaded patch set #8 to this change.
math: add guaranteed-precision FMA intrinsic
Currently, the precision of the float64 multiply-add operation
(x * y) + z varies across architectures. While generated code for
ppc64, s390x, and arm64 can guarantee that there is no intermediate
rounding on those platforms, other architectures like x86, mips, and
arm will exhibit different behavior depending on available instruction
set. Consequently, applications cannot rely on results being identical
across GOARCH-dependent codepaths.
This CL introduces an intrinsic that performs an IEEE 754
double-precision fused-multiply-add operation. Hardware implementations
are used when available. Otherwise, a software fallback is given as
the default implementation.
Specifically,
- arm64, ppc64, s390x: Uses the FMA instruction provided by all
of these ISAs.
- mips[64][le]: Falls back to the software implementation. Only
release 6 of the ISA includes a strict FMA instruction with
MADDF.D (not implementation defined). Because the number of R6
processors in the wild is scarce, the assembly implementation
is left as a future optimization.
- x86: Guards the use of VFMADD213SD by checking cpu.X86.HasFMA.
- arm: Guards the use of VFMA by checking cpu.ARM.HasVFPv4.
- software fallback: Translates MUSL's implementation with mostly
fixed-point arithmetic. This was chosen as an alternative to
BSD's after observing a nearly 50% speedup with MUSL's. It also aims
to portably raise exceptions without a function like feraiseexcept.
DO NOT SUBMIT. Depends on
https://go-review.googlesource.com/c/go/+/126315 being merged first.
Fixes #25819.
Change-Id: Iadadff2219638bacc9fec78d3ab885393fea4a08
---
M src/math/all_test.go
A src/math/fma.go
A src/math/fma_arm.s
M src/math/stubs_mips64x.s
M src/math/stubs_mipsx.s
M src/math/unsafe.go
6 files changed, 544 insertions(+), 1 deletion(-)
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
1 comment:
Patch Set #8, Line 9: MOVBU ·internal∕cpu·ARM+const_arm_HasVFPv4(SB), R0
I plan to make the arm implementation an intrinsic as well. The introduction of the amd64 intrinsic broke the ability to link the arm assembly to the function declaration. Full intrinsification should address this issue.
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
3 comments:
Patch Set #8, Line 17: This CL introduces an intrinsic
I dont think it does anymore but adds a public function
Patch Set #8, Line 9: MOVBU ·internal∕cpu·ARM+const_arm_HasVFPv4(SB), R0
I plan to make the arm implementation an intrinsic as well. […]
I would leave the arm optimization out here to make this CL simpler and to not need to delete this code again later with the planed CL that introduces an intrinsic optimization also for arm in the compiler.
File src/math/stubs_mips64x.s:
TEXT ·FMA(SB),NOSPLIT,$0
JMP ·fma(SB)
I think these stubs are obsolete as Fma seems to be
the new function added to math with a software implementation
for in pure go and hardware implementations
are implemented through intrinsics.
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
1 comment:
Patch Set #8, Line 9: MOVBU ·internal∕cpu·ARM+const_arm_HasVFPv4(SB), R0
I would leave the arm optimization out here to make this CL simpler and to not need to delete this c […]
Sounds good to me. When I do introduce the ARM CL, will I have to modify golang/arch? Currently, the VFMA instruction isn't allowed in the assembler, so would adding it be a precursor to creating an intrinsic?
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Akhil Indurti uploaded patch set #9 to this change.
math: add guaranteed-precision FMA implementation
Currently, the precision of the float64 multiply-add operation
(x * y) + z varies across architectures. While generated code for
ppc64, s390x, and arm64 can guarantee that there is no intermediate
rounding on those platforms, other architectures like x86, mips, and
arm will exhibit different behavior depending on available instruction
set. Consequently, applications cannot rely on results being identical
across GOARCH-dependent codepaths.
This CL introduces a software implementation that performs an IEEE 754
double-precision fused-multiply-add operation. Separate CLs include
hardware implementations when available. Otherwise, this software
fallback is given as the default implementation.
Specifically,
- arm64, ppc64, s390x: Uses the FMA instruction provided by all
of these ISAs.
- mips[64][le]: Falls back to this software implementation. Only
release 6 of the ISA includes a strict FMA instruction with
MADDF.D (not implementation defined). Because the number of R6
processors in the wild is scarce, the assembly implementation
is left as a future optimization.
- x86: Guards the use of VFMADD213SD by checking cpu.X86.HasFMA.
- arm: Guards the use of VFMA by checking cpu.ARM.HasVFPv4.
- software fallback: Translates MUSL's implementation with mostly
fixed-point arithmetic. This was chosen as an alternative to
BSD's after observing a nearly 50% speedup with MUSL's. It also aims
to portably raise exceptions without a function like feraiseexcept.
DO NOT SUBMIT. Depends on
https://go-review.googlesource.com/c/go/+/126315 being merged first.
Fixes #25819.
Change-Id: Iadadff2219638bacc9fec78d3ab885393fea4a08
---
M src/math/all_test.go
A src/math/fma.go
M src/math/stubs_mipsx.s
M src/math/unsafe.go
4 files changed, 516 insertions(+), 2 deletions(-)
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
3 comments:
Patch Set #8, Line 17: This CL introduces a software i
I dont think it does anymore but adds a public function
Sounds good to me. […]
Done
File src/math/stubs_mips64x.s:
I think these stubs are obsolete as Fma seems to be […]
Done
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
2 comments:
please revert the file so it does not contain white space cleanups these can be done in standalone cls.
import (
"unsafe"
)
if gofmt didnt automatically introduce this change please leave it out of the CL.
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Akhil Indurti uploaded patch set #10 to this change.
math: add guaranteed-precision FMA implementation
Currently, the precision of the float64 multiply-add operation
(x * y) + z varies across architectures. While generated code for
ppc64, s390x, and arm64 can guarantee that there is no intermediate
rounding on those platforms, other architectures like x86, mips, and
arm will exhibit different behavior depending on available instruction
set. Consequently, applications cannot rely on results being identical
across GOARCH-dependent codepaths.
This CL introduces a software implementation that performs an IEEE 754
double-precision fused-multiply-add operation. Separate CLs include
hardware implementations when available. Otherwise, this software
fallback is given as the default implementation.
Specifically,
- arm64, ppc64, s390x: Uses the FMA instruction provided by all
of these ISAs.
- mips[64][le]: Falls back to this software implementation. Only
release 6 of the ISA includes a strict FMA instruction with
MADDF.D (not implementation defined). Because the number of R6
processors in the wild is scarce, the assembly implementation
is left as a future optimization.
- x86: Guards the use of VFMADD213SD by checking cpu.X86.HasFMA.
- arm: Guards the use of VFMA by checking cpu.ARM.HasVFPv4.
- software fallback: Translates MUSL's implementation with mostly
fixed-point arithmetic. This was chosen as an alternative to
BSD's after observing a nearly 50% speedup with MUSL's. It also aims
to portably raise exceptions without a function like feraiseexcept.
DO NOT SUBMIT. Depends on
https://go-review.googlesource.com/c/go/+/126315 being merged first.
Fixes #25819.
Change-Id: Iadadff2219638bacc9fec78d3ab885393fea4a08
---
M src/math/all_test.go
A src/math/fma.go
2 files changed, 513 insertions(+), 0 deletions(-)
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
2 comments:
please revert the file so it does not contain white space cleanups these can be done in standalone c […]
done
import "unsafe"
/
if gofmt didnt automatically introduce this change please leave it out of the CL.
done
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Akhil Indurti uploaded patch set #11 to this change.
math: add guaranteed-precision FMA implementation
Currently, the precision of the float64 multiply-add operation
(x * y) + z varies across architectures. While generated code for
ppc64, s390x, and arm64 can guarantee that there is no intermediate
rounding on those platforms, other architectures like x86, mips, and
arm will exhibit different behavior depending on available instruction
set. Consequently, applications cannot rely on results being identical
across GOARCH-dependent codepaths.
This CL introduces a software implementation that performs an IEEE 754
double-precision fused-multiply-add operation. Separate CLs include
hardware implementations when available. Otherwise, this software
fallback is given as the default implementation.
Specifically,
- arm64, ppc64, s390x: Uses the FMA instruction provided by all
of these ISAs.
- mips[64][le]: Falls back to this software implementation. Only
release 6 of the ISA includes a strict FMA instruction with
MADDF.D (not implementation defined). Because the number of R6
processors in the wild is scarce, the assembly implementation
is left as a future optimization.
- x86: Guards the use of VFMADD213SD by checking cpu.X86.HasFMA.
- arm: Guards the use of VFMA by checking cpu.ARM.HasVFPv4.
- software fallback: Translates MUSL's implementation with mostly
fixed-point arithmetic. This was chosen as an alternative to
BSD's after observing a nearly 50% speedup with MUSL's. It also aims
to portably raise exceptions without a function like feraiseexcept.
Fixes #25819.
Change-Id: Iadadff2219638bacc9fec78d3ab885393fea4a08
---
M src/math/all_test.go
A src/math/fma.go
2 files changed, 513 insertions(+), 0 deletions(-)
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Akhil Indurti uploaded patch set #12 to this change.
Updates #25819.
Change-Id: Iadadff2219638bacc9fec78d3ab885393fea4a08
---
M src/math/all_test.go
A src/math/fma.go
2 files changed, 513 insertions(+), 0 deletions(-)
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Akhil Indurti removed Robert Griesemer from this change.
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
The code looks fine. The proposal is accepted.
The only thing that gives me pause is the licencing stuff. I'm going to cc people who understand that area just to make sure. @andybons @iant
Patch set 12:Code-Review +2
1 comment:
Patch Set #12, Line 3957: x = Fma(E, Pi, Phi)
Probably want x = Fma(E, Pi, x) so each iteration is data-dependent on the previous iteration.
(The other benchmarks in this file fail in this respect. We should probably fix them all. But might as well start now.)
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Sure there be further discussion about the name? (Fma vs. FMA vs. FusedMulAdd vs. something else?)
Or will that be caught in some "new API" pass before the 1.12 release?
Patch set 12:Code-Review -2
1 comment:
Patch Set #12, Line 22: // The above copyright notice and this permission notice shall be
This copyright licence is a problem. It means that everyone who distributes a Go program must incorporate this copyright and permission notice in their own license notice. That's pretty much a non-starter. Sorry.
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
1 comment:
Patch Set #12, Line 22: // The above copyright notice and this permission notice shall be
This copyright licence is a problem. […]
Is this not just the MIT license though? I thought we included MIT licensed code in the standard library:
https://github.com/golang/go/search?l=Go&p=1&q=The+above+copyright+notice+and+this+permission+notice+shall+be+included+in+all+copies+or+substantial+portions+of+the+Software.&type=
I suppose the alternative is to use the implementation that Julia uses (FreeBSD's) under the 2-clause license, which I also have a translation of.
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
1 comment:
Patch Set #12, Line 22: // The above copyright notice and this permission notice shall be
Is this not just the MIT license though? I thought we included MIT licensed code in the standard lib […]
It's just the MIT license, yes, but it's specifically copyright "Rich Felker, et al," which means that the current LICENSE file, which is copyright "The Go Authors", does not suffice to meet this restriction.
If there are other examples in the standard library (not the tools--the tools are not distributed with every Go program) then we will need to take a look at them.
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
1 comment:
Patch Set #12, Line 22: // The above copyright notice and this permission notice shall be
It's just the MIT license, yes, but it's specifically copyright "Rich Felker, et al," which means th […]
The best approach is to work out from first principles what the code needs to look like, without worrying about efficiency, and write that.
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
1 comment:
Patch Set #12, Line 22: // The above copyright notice and this permission notice shall be
The best approach is to work out from first principles what the code needs to look like, without wor […]
Ian, there are currently 9 files in the runtime with an MIT for people other than (but including) the Go Authors. These other people are Lucent and Vita Nuova.
runtime/memclr_arm.s
runtime/memmove_386.s
runtime/memmove_amd64.s
runtime/memmove_arm.s
runtime/memmove_plan9_386.s
runtime/memmove_plan9_amd64.s
runtime/vlop_386.s
runtime/vlop_arm.s
runtime/vlrt.go
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
1 comment:
Patch Set #12, Line 22: // The above copyright notice and this permission notice shall be
Ian, there are currently 9 files in the runtime with an MIT for people other than (but including) th […]
Thanks. That copyright license (Lucent, Vita Nuova) has been there since before the first public release. It's not something worth worrying about. But that doesn't make me comfortable adding new ones.
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
1 comment:
Patch Set #12, Line 22: // The above copyright notice and this permission notice shall be
Thanks. […]
I have been working on a custom software implementation based on the algorithm described in the "Handbook of Floating-Point Arithmetic" 2nd edition, section 7.5.4. However, I'm having some issues dealing with subnormal values. If anybody else wants to provide their own software implementation, feel free to work on it.
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
1 comment:
Patch Set #12, Line 22: // The above copyright notice and this permission notice shall be
I have been working on a custom software implementation based on the algorithm described in the "Han […]
By the way, it's also an option to ask Rich and whoever else contributed to this file to sign the CLA and confirm in this thread that they want their contribution submitted to the Go project under the CLA.
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Akhil Indurti uploaded patch set #13 to this change.
2 files changed, 505 insertions(+), 0 deletions(-)
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Patch Set 12:
(1 comment)
I have posted on the golang-dev mailing list about this option: https://groups.google.com/forum/#!topic/golang-dev/CBKfrWbxLtI.
Akhil Indurti uploaded patch set #14 to this change.
math: add guaranteed-precision FMA implementation
Currently, the precision of the float64 multiply-add operation
(x * y) + z varies across architectures. While generated code for
ppc64, s390x, and arm64 can guarantee that there is no intermediate
rounding on those platforms, other architectures like x86, mips, and
arm will exhibit different behavior depending on available instruction
set. Consequently, applications cannot rely on results being identical
across GOARCH-dependent codepaths.
This CL introduces a software implementation that performs an IEEE 754
double-precision fused-multiply-add operation. Separate CLs include
hardware implementations when available. Otherwise, this software
fallback is given as the default implementation.
Specifically,
- arm64, ppc64, s390x: Uses the FMA instruction provided by all
of these ISAs.
- mips[64][le]: Falls back to this software implementation. Only
release 6 of the ISA includes a strict FMA instruction with
MADDF.D (not implementation defined). Because the number of R6
processors in the wild is scarce, the assembly implementation
is left as a future optimization.
- x86: Guards the use of VFMADD213SD by checking cpu.X86.HasFMA.
- arm: Guards the use of VFMA by checking cpu.ARM.HasVFPv4.
- software fallback: Adapts algorithm used in MUSL's
implementation with mostly fixed-point arithmetic.
This was chosen as an alternative to BSD's after observing a
nearly 50% speedup with MUSL's. It also aims to portably
raise exceptions without a function like feraiseexcept.
Updates #25819.
Change-Id: Iadadff2219638bacc9fec78d3ab885393fea4a08
---
M src/math/all_test.go
M src/math/bits.go
A src/math/fma.go
3 files changed, 470 insertions(+), 11 deletions(-)
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Akhil Indurti uploaded patch set #15 to this change.
3 files changed, 472 insertions(+), 11 deletions(-)
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Patch set 15:Run-TryBot +1
TryBots beginning. Status page: https://farmer.golang.org/try?commit=e60585ab
Build is still in progress...
This change failed on freebsd-amd64-12_0:
See https://storage.googleapis.com/go-build-log/e60585ab/freebsd-amd64-12_0_59dded50.log
Other builds still in progress; subsequent failure notices suppressed until final report. Consult https://build.golang.org/ to see whether they are new failures. Keep in mind that TryBots currently test *exactly* your git commit, without rebasing. If your commit's git parent is old, the failure might've already been fixed.
10 of 21 TryBots failed:
Failed on freebsd-amd64-12_0: https://storage.googleapis.com/go-build-log/e60585ab/freebsd-amd64-12_0_59dded50.log
Failed on linux-386: https://storage.googleapis.com/go-build-log/e60585ab/linux-386_c0c6f3bf.log
Failed on android-amd64-emu: https://storage.googleapis.com/go-build-log/e60585ab/android-amd64-emu_3ca2160a.log
Failed on linux-amd64: https://storage.googleapis.com/go-build-log/e60585ab/linux-amd64_05579156.log
Failed on misc-compile-ppc: https://storage.googleapis.com/go-build-log/e60585ab/misc-compile-ppc_67569833.log
Failed on linux-amd64-race: https://storage.googleapis.com/go-build-log/e60585ab/linux-amd64-race_79b26adc.log
Failed on openbsd-amd64-64: https://storage.googleapis.com/go-build-log/e60585ab/openbsd-amd64-64_0c96d7b4.log
Failed on js-wasm: https://storage.googleapis.com/go-build-log/e60585ab/js-wasm_bc727464.log
Failed on windows-amd64-2016: https://storage.googleapis.com/go-build-log/e60585ab/windows-amd64-2016_da06f8fb.log
Failed on windows-386-2008: https://storage.googleapis.com/go-build-log/e60585ab/windows-386-2008_de0aaabb.log
Consult https://build.golang.org/ to see whether they are new failures. Keep in mind that TryBots currently test *exactly* your git commit, without rebasing. If your commit's git parent is old, the failure might've already been fixed.
Patch set 15:TryBot-Result -1
Patch Set 15: TryBot-Result-1
10 of 21 TryBots failed:
Failed on freebsd-amd64-12_0: https://storage.googleapis.com/go-build-log/e60585ab/freebsd-amd64-12_0_59dded50.log
Failed on linux-386: https://storage.googleapis.com/go-build-log/e60585ab/linux-386_c0c6f3bf.log
Failed on android-amd64-emu: https://storage.googleapis.com/go-build-log/e60585ab/android-amd64-emu_3ca2160a.log
Failed on linux-amd64: https://storage.googleapis.com/go-build-log/e60585ab/linux-amd64_05579156.log
Failed on misc-compile-ppc: https://storage.googleapis.com/go-build-log/e60585ab/misc-compile-ppc_67569833.log
Failed on linux-amd64-race: https://storage.googleapis.com/go-build-log/e60585ab/linux-amd64-race_79b26adc.log
Failed on openbsd-amd64-64: https://storage.googleapis.com/go-build-log/e60585ab/openbsd-amd64-64_0c96d7b4.log
Failed on js-wasm: https://storage.googleapis.com/go-build-log/e60585ab/js-wasm_bc727464.log
Failed on windows-amd64-2016: https://storage.googleapis.com/go-build-log/e60585ab/windows-amd64-2016_da06f8fb.log
Failed on windows-386-2008: https://storage.googleapis.com/go-build-log/e60585ab/windows-386-2008_de0aaabb.logConsult https://build.golang.org/ to see whether they are new failures. Keep in mind that TryBots currently test *exactly* your git commit, without rebasing. If your commit's git parent is old, the failure might've already been fixed.
Should I rebase this commit on top of master?
Should I rebase this commit on top of master?
I rebased it for you.
Patch set 16:Run-TryBot +1
TryBots beginning. Status page: https://farmer.golang.org/try?commit=519ad954
TryBots are happy.
Patch set 16:TryBot-Result +1
Thanks for looking into this fallback.
The implementation seems remarkably complicated.
Are there no algorithms implementing float64 FMA
in terms of float64 addition and multiplication?
For example, https://play.golang.org/p/C2CNHIhYOvG
is based on the building blocks in
http://web.mit.edu/tabbott/Public/quaddouble-debian/qd-2.3.4-old/docs/qd.pdf
and except for NaN/Inf border cases passes all the tests.
I don't know whether it is perfectly correct in all cases.
But for something we have to maintain, something along those lines
seems clearly preferable to the bit-level operations in this CL.
(It also seems like it would be faster.)
Patch Set 16:
Thanks for looking into this fallback.
The implementation seems remarkably complicated.
Are there no algorithms implementing float64 FMA
in terms of float64 addition and multiplication?For example, https://play.golang.org/p/C2CNHIhYOvG
is based on the building blocks in
http://web.mit.edu/tabbott/Public/quaddouble-debian/qd-2.3.4-old/docs/qd.pdf
and except for NaN/Inf border cases passes all the tests.
I don't know whether it is perfectly correct in all cases.
But for something we have to maintain, something along those lines
seems clearly preferable to the bit-level operations in this CL.
(It also seems like it would be faster.)
Sorry for the long response.
I ran the full test suite against the one you have provided
and here is a sample of 10 failing cases:
fma(-0.4843749999990904,-3.6893487872543293e+19,9.223653786709391e+18) == 2.709393697493899e+19; want 2.7093936974938993e+19
fma(-1.9999999999927274,31.998046636581414,15.874998092651364) == -48.12109518027876; want -48.12109518027875
fma(-10.685751956359399,-1.5951846316580996e-39,-1.1714765478653753e-38) == 5.330981819841231e-39; want 5.330981819841232e-39
fma(1.1754942807659169e-38,-0.5000000000000292,-2.9387379790034146e-39) == -8.816209382833343e-39; want -8.816209382833342e-39
fma(-3.24249303957913e-05,6143.999984741209,-0.5000000002401064) == -0.6992187720970828; want -0.6992187720970829
fma(-0.12501525875995864,1.999999046092853,-3.999984741210923) == -4.250015139477892; want -4.250015139477891
fma(-1.717097413070979e+19,1.3715840035743056e+16,1.6647801802673513e+35) == -6.903631640796252e+34; want -6.903631640796254e+34
fma(1.0001220740377903,0.12475585937477261,-0.24999999999636205) == -0.12522891117009755; want -0.12522891117009757
fma(7.9980468675494185,0.2500000004656577,-2.000976562499943) == -0.00146484188823659; want -0.0014648418882365862
fma(-0.18877129066661416,2.1171874999999996,-7.044236961874658) == -7.4439011788328795; want -7.44390117883288
This implementation is similar to another
mostly-float-arithmetic implementation found in
https://hal-ens-lyon.archives-ouvertes.fr/inria-00080427v2/document.
It also uses the split, two-sum, and two-prod operations
from the double-double/triple-double article, but also makes
use of a non-standard rounding mode. I have also experimented
with implementations using this idea, along with double-double,
triple-double, and big.Float. However, I always failed to get the
correct values with either subnormal inputs, or an overflow in
the split operation.
Specifically,
func split(x float64) (x1, x2 float64) {
t := (1<<27 + 1) * x <--------- This can overflow.
x1 = float64(t - float64(t-x))
x2 = float64(x - x1)
return x1, x2
}
In fact, I benchmarked FreeBSD's implementation
(using mostly float arithmetic) against MUSL's
(which uses mostly bit arithmetic), and found it to be slower:
BenchmarkBSD-4 20000000 63.0 ns/op
BenchmarkMUSL-4 50000000 29.1 ns/op
Moreover, here is a sample of 10 cases that FreeBSD's implementation
actually fails (these have also been tested against the C implementation):
FMA_BSD(7.999999999998067,16.000000357627865,-2.305843009213694e+18) == -2.305843009213694e+18; want -2.3058430092136937e+18
FMA_BSD(-0.9999999999999433,-2.9387351750380307e-39,-2.6469779601696886e-23) == -2.6469779601696886e-23; want -2.6469779601696883e-23
FMA_BSD(3.255268204692044e+19,0.00019829311349665364,-8.112963841460668e+31) == -8.112963841460668e+31; want -8.112963841460667e+31
FMA_BSD(-1.4614793365856989e+48,-1.2344238069487042e-63,-32) == -32; want -31.999999999999996
FMA_BSD(-9.041109663385309e-39,0.12499999999994327,1.3234889800848443e-23) == 1.3234889800848443e-23; want 1.3234889800848441e-23
FMA_BSD(-2.1265093767368582e-16,-1.2172237104420662e+09,-4.294967296e+09) == -4.294967296e+09; want -4.2949672959999995e+09
FMA_BSD(5.8771130217045684e-39,-0.00048826634883880626,2.5849394142282115e-26) == 2.5849394142282115e-26; want 2.5849394142282112e-26
FMA_BSD(-128.0000076291617,-2.9514790517935276e+20,-6.80564733841877e+38) == -6.80564733841877e+38; want -6.8056473384187685e+38
FMA_BSD(1.9058667961481752,105.20092326795302,-2.305843009213694e+18) == -2.305843009213694e+18; want -2.3058430092136937e+18
FMA_BSD(-15.999999046325694,2.0816681709136746e-16,32) == 32; want 31.999999999999996
The implementation described in this CL however
has been tested for nearly 6 hours with the test suite
and has not uncovered a single failure case.
Indeed, I made no attempt to deal with subnormals/overflow.
Interesting about the faster performance for the int code. Thanks.
Akhil Indurti uploaded patch set #17 to this change.
Updates #25819.
Change-Id: Iadadff2219638bacc9fec78d3ab885393fea4a08
---
M src/math/all_test.go
M src/math/bits.go
A src/math/fma.go
3 files changed, 472 insertions(+), 11 deletions(-)
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Rebased all commits in stack on top of master.
Hi again,
I'm still concerned about the remaining similarity to the MUSL code and also the complete lack of comments. The code is a big mystery that can't be explained without reference to MUSL, which is not ideal.
I sketched out a commented, from-scratch implementation without reference to either the MUSL code or this CL at https://play.golang.org/p/xnEAq0E77d6. It runs about 2X faster than the code in this CL too. The denormal handling is not exactly right, as shown by the prints, but it shouldn't be too hard to find. (It does pass all the tests in this CL despite that bug, which is a little concerning.)
I won't have time to try to finish it up for quite a while.
Are you interested in taking that code and making it work, in place of this MUSL code?
Thanks.
Best,
Russ
Patch Set 17:
Hi again,
I'm still concerned about the remaining similarity to the MUSL code and also the complete lack of comments. The code is a big mystery that can't be explained without reference to MUSL, which is not ideal.
I sketched out a commented, from-scratch implementation without reference to either the MUSL code or this CL at https://play.golang.org/p/xnEAq0E77d6. It runs about 2X faster than the code in this CL too. The denormal handling is not exactly right, as shown by the prints, but it shouldn't be too hard to find. (It does pass all the tests in this CL despite that bug, which is a little concerning.)
I won't have time to try to finish it up for quite a while.
Are you interested in taking that code and making it work, in place of this MUSL code?Thanks.
Best,
Russ
I will definitely take a crack at this! Thanks.
Patch Set 17:
Hi again,
I'm still concerned about the remaining similarity to the MUSL code and also the complete lack of comments. The code is a big mystery that can't be explained without reference to MUSL, which is not ideal.
I sketched out a commented, from-scratch implementation without reference to either the MUSL code or this CL at https://play.golang.org/p/xnEAq0E77d6. It runs about 2X faster than the code in this CL too. The denormal handling is not exactly right, as shown by the prints, but it shouldn't be too hard to find. (It does pass all the tests in this CL despite that bug, which is a little concerning.)
I won't have time to try to finish it up for quite a while.
Are you interested in taking that code and making it work, in place of this MUSL code?Thanks.
Best,
Russ
Here is an amended implementation at https://play.golang.org/p/H6svgmg4Jof.
The zero/inf/nan handling is modified to deal with the case when adding
opposite-signed infinities, e.g. where x*y = inf, and z = inf.
Subnormal values are normalized in the split function, which simplifies
subsequent code. Operands to addition are swapped based on their magnitude,
similar to the runtime’s softfloat implementation.
There is definitely room for performance improvements in the form of special
casing the effective addition/subtraction cases (product-anchored, addend-anchored, cancellation, etc…) and optimizing the helper functions. In particular,
the benchmark is very sensitive to the function (shrcompress) that compresses
the bottom n bits of a two word significand into a sticky bit. Still,
this implementation offers about a 20% speedup over MUSL’s, while passing
all of the tests.
Also, I agree that a random sampling of test cases makes it difficult to
discover future errors, so the playground link contains some test cases
that are categorized by the codepath that should be triggered.
I would appreciate it if you could take a look at it.
Akhil
Akhil Indurti uploaded patch set #18 to this change.
math: add guaranteed-precision FMA implementation
Currently, the precision of the float64 multiply-add operation
(x * y) + z varies across architectures. While generated code for
ppc64, s390x, and arm64 can guarantee that there is no intermediate
rounding on those platforms, other architectures like x86, mips, and
arm will exhibit different behavior depending on available instruction
set. Consequently, applications cannot rely on results being identical
across GOARCH-dependent codepaths.
This CL introduces a software implementation that performs an IEEE 754
double-precision fused-multiply-add operation. The only supported
rounding mode is round-to-nearest ties-to-even. Separate CLs include
hardware implementations when available. Otherwise, this software
fallback is given as the default implementation.
Specifically,
- arm64, ppc64, s390x: Uses the FMA instruction provided by all
of these ISAs.
- mips[64][le]: Falls back to this software implementation. Only
release 6 of the ISA includes a strict FMA instruction with
MADDF.D (not implementation defined). Because the number of R6
processors in the wild is scarce, the assembly implementation
is left as a future optimization.
- x86: Guards the use of VFMADD213SD by checking cpu.X86.HasFMA.
- arm: Guards the use of VFMA by checking cpu.ARM.HasVFPv4.
- software fallback: Uses mostly integer arithmetic except
for input that involves Inf, NaN, or zero.
Updates #25819.
Change-Id: Iadadff2219638bacc9fec78d3ab885393fea4a08
---
M src/math/all_test.go
A src/math/fma.go
2 files changed, 243 insertions(+), 0 deletions(-)
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Updated CL to make the new implementation easier to review.
Akhil Indurti uploaded patch set #19 to this change.
2 files changed, 238 insertions(+), 0 deletions(-)
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Patch set 19:-Code-Review
4 comments:
Patch Set #12, Line 3957: x = Fma(E, Pi, Phi)
Probably want x = Fma(E, Pi, x) so each iteration is data-dependent on the previous iteration. […]
This still needs doing.
Patch Set #19, Line 39: func shrcompress(hi, lo uint64, n uint) (rhi, rlo uint64) {
This function needs a comment about what it computes.
Patch Set #19, Line 94: if f := x/x + y/y; f != f || z == 0.0 {
Divides are kind of expensive. Isn't there a faster way to detect this condition?
Patch Set #19, Line 117: pm1, pm2
Here you're using 1,2 where above you use hi, lo. It would be better to keep the naming consistent.
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
4 comments:
Patch Set #12, Line 3957: x = Fma(E, Pi, Phi)
This still needs doing.
Ack
Patch Set #19, Line 39: func shrcompress(hi, lo uint64, n uint) (rhi, rlo uint64) {
This function needs a comment about what it computes.
Ack
Patch Set #19, Line 94: if f := x/x + y/y; f != f || z == 0.0 {
Divides are kind of expensive. […]
I assumed that hardware divides would be available on most platforms. A faster way to do this would be
bx := math.Float64bits(x)
by := math.Float64bits(y)
bz := math.Float64bits(z)
// Inf or NaN or zero involved.
// Let hardware figure it out - at most one rounding will happen.
if x == 0.0 || y == 0.0 || z == 0.0 || bx&uvinf == uvinf || by&uvinf == uvinf {
return x*y + z
}
// Handle non-normal z separately. Evaluating x*y+z with no
// intermediate rounding, where the product (x*y) and z are
// opposite-signed infinities, should result in infinity.
if bz&uvinf == uvinf {
return z
}
The underlying logic is that the first if statement checks to see if either x or y is zero/inf/nan, or z is 0. The second statement checks that z is inf/nan. You can techincally substitute the zero/inf/nan checks for x and y with
f == 0.0 || math.IsInf(f, 0) || math.IsNaN(f)
and even encapsulate that in a function, but there would be extraneous shifting and masking in that case.
Patch Set #19, Line 117: pm1, pm2
Here you're using 1,2 where above you use hi, lo. It would be better to keep the naming consistent.
Ack
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Akhil Indurti uploaded patch set #20 to this change.
2 files changed, 243 insertions(+), 0 deletions(-)
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
1 comment:
Patch Set #12, Line 3957: x = Fma(E, Pi, Phi)
Ack
I filed #33441 for the other benchmarks.
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Akhil Indurti removed Brad Fitzpatrick from this change.
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Akhil Indurti uploaded patch set #21 to this change.
2 files changed, 242 insertions(+), 0 deletions(-)
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Akhil Indurti uploaded patch set #22 to this change.
5 comments:
Patch Set #22, Line 2008: var fmaC = []struct{ x, y, z, want float64 }{
Could you add a comment about how these test cases were generated? How do you know what the right answer is? How do you know you're testing FMA? (Presumably, x*y+z != want using normal float64 arithmetic - should you assert that somewhere?)
n+1?
n+1?
Patch Set #22, Line 60: | u2&(1<<n-1)
Couldn't this be just | u2? n>=64 in this case.
Evaluating x*y+z with no
// intermediate rounding, where the product (x*y) and z are
// opposite-signed infinities, should result in infinity.
Where does this condition come from?
Wherever it comes from, it's weird enough that a description of this case should go in the doc comment for this function.
Also I don't think there's a test case for this.
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
4 comments:
n+1?
Right, since the sticky bit is the nth bit from the bottom, so that would be n+1 compressed bits. Thanks for catching that.
n+1?
Ack
Patch Set #22, Line 60: | u2&(1<<n-1)
Couldn't this be just | u2? n>=64 in this case.
Ack
Evaluating x*y+z with no
// intermediate rounding, where the product (x*y) and z are
// opposite-signed infinities, should result in infinity.
Where does this condition come from? […]
I can document this, but this behavior falls out of the definition of fma. I.e. if x and y are finite, their non-rounded product is pn, and their rounded product pr is infinite, then x*y+z is infinite. This is because the computation with no intermediate rounding (as done in fma) turns into pn+z, instead of pr+z. Perhaps I should change the comment to not say "opposite-signed infinities," since that sounds like pr+z is being performed.
Also, test cases 21 and 22 trigger this behavior.
// Opposite-signed addition of infinity where z = (+/-)inf, and x*y = -z
{31.99218749627471, -1.7976930544991702e+308, Inf(0), Inf(0)},
{-1.7976931281784667e+308, -2.0009765625002265, Inf(-1), Inf(-1)},
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
1 comment:
Evaluating x*y+z with no
// intermediate rounding, where the product (x*y) and z are
// opposite-signed infinities, should result in infinity.
I can document this, but this behavior falls out of the definition of fma. I.e. […]
Ah yes, I was confused by your text. I would say "evaluating x*y+z where x and y are finite, but z is infinite, should always result in z".
Those test cases look good, thanks. Not sure how I missed them.
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Akhil Indurti uploaded patch set #23 to this change.
2 files changed, 244 insertions(+), 0 deletions(-)
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
Hi! I just wanted to kindly ping to see if there's anything that needs to be done to get this and its dependent CLs ready for review. Thanks!
Code looks good.
Ian, you satisfied with the copyright situation now? If so, remove your -2.
Patch set 23:Run-TryBot +1
TryBots beginning. Status page: https://farmer.golang.org/try?commit=bebec1d6
TryBots are happy.
Patch set 23:TryBot-Result +1
Should I go ahead and rebase all of the commits in this stack?
Patch Set 23:
Should I go ahead and rebase all of the commits in this stack?
Sure, rebasing and rerunning the trybots would help.
Patch Set 23:
Patch Set 23:
Should I go ahead and rebase all of the commits in this stack?
Sure, rebasing and rerunning the trybots would help.
Thanks, Done.
Pinging Ian.
Removing my -2 as the license issues have been resolved.
Patch set 23:-Code-Review
Patch set 24:Run-TryBot +1
TryBots are happy.
Patch set 24:TryBot-Result +1
Keith, is this ready for a +2? You said "code looks good" but didn't +2 it. Thanks.
Patch set 24:Code-Review +2
Keith Randall submitted this change.
Reviewed-on: https://go-review.googlesource.com/c/go/+/127458
Run-TryBot: Ian Lance Taylor <ia...@golang.org>
TryBot-Result: Gobot Gobot <go...@golang.org>
Reviewed-by: Keith Randall <k...@golang.org>
---
M src/math/all_test.go
A src/math/fma.go
2 files changed, 244 insertions(+), 0 deletions(-)
diff --git a/src/math/all_test.go b/src/math/all_test.go
index 208c823..e8fa2b8 100644
--- a/src/math/all_test.go
+++ b/src/math/all_test.go
@@ -2005,6 +2005,64 @@
1023,
}
+// Test cases were generated with Berkeley TestFloat-3e/testfloat_gen.
+// http://www.jhauser.us/arithmetic/TestFloat.html.
+// The default rounding mode is selected (nearest/even), and exception flags are ignored.
+var fmaC = []struct{ x, y, z, want float64 }{
+ // Large exponent spread
+ {-3.999999999999087, -1.1123914289620494e-16, -7.999877929687506, -7.999877929687505},
+ {-262112.0000004768, -0.06251525855623184, 1.1102230248837136e-16, 16385.99945072085},
+ {-6.462348523533467e-27, -2.3763644720331857e-211, 4.000000000931324, 4.000000000931324},
+
+ // Effective addition
+ {-2.0000000037252907, 6.7904383376e-313, -3.3951933161e-313, -1.697607001654e-312},
+ {-0.12499999999999999, 512.007568359375, -1.4193627164960366e-16, -64.00094604492188},
+ {-2.7550648847397148e-39, -3.4028301595800694e+38, 0.9960937495343386, 1.9335955376735676},
+ {5.723369164769208e+24, 3.8149300927159385e-06, 1.84489958778182e+19, 4.028324913621874e+19},
+ {-0.4843749999990904, -3.6893487872543293e+19, 9.223653786709391e+18, 2.7093936974938993e+19},
+ {-3.8146972665201165e-06, 4.2949672959999385e+09, -2.2204460489938386e-16, -16384.000003844263},
+ {6.98156394130982e-309, -1.1072962560000002e+09, -4.4414561548793455e-308, -7.73065965765153e-300},
+
+ // Effective subtraction
+ {5e-324, 4.5, -2e-323, 0},
+ {5e-324, 7, -3.5e-323, 0},
+ {5e-324, 0.5000000000000001, -5e-324, Copysign(0, -1)},
+ {-2.1240680525e-314, -1.233647078189316e+308, -0.25781249999954525, -0.25780987964919844},
+ {8.579992955364441e-308, 0.6037391876780558, -4.4501307410480706e-308, 7.29947236107098e-309},
+ {-4.450143471986689e-308, -0.9960937499927239, -4.450419332475649e-308, -1.7659233458788e-310},
+ {1.4932076393918112, -2.2248022430460833e-308, 4.449875571054211e-308, 1.127783865601762e-308},
+
+ // Overflow
+ {-2.288020632214759e+38, -8.98846570988901e+307, 1.7696041796300924e+308, Inf(0)},
+ {1.4888652783208255e+308, -9.007199254742012e+15, -6.807282911929205e+38, Inf(-1)},
+ {9.142703268902826e+192, -1.3504889569802838e+296, -1.9082200803806996e-89, Inf(-1)},
+
+ // Finite x and y, but non-finite z.
+ {31.99218749627471, -1.7976930544991702e+308, Inf(0), Inf(0)},
+ {-1.7976931281784667e+308, -2.0009765625002265, Inf(-1), Inf(-1)},
+
+ // Special
+ {0, 0, 0, 0},
+ {-1.1754226043408471e-38, NaN(), Inf(0), NaN()},
+ {0, 0, 2.22507385643494e-308, 2.22507385643494e-308},
+ {-8.65697792e+09, NaN(), -7.516192799999999e+09, NaN()},
+ {-0.00012207403779029757, 3.221225471996093e+09, NaN(), NaN()},
+ {Inf(-1), 0.1252441407414153, -1.387184532981584e-76, Inf(-1)},
+ {Inf(0), 1.525878907671432e-05, -9.214364835452549e+18, Inf(0)},
+
+ // Random
+ {0.1777916152213626, -32.000015266239636, -2.2204459148334633e-16, -5.689334401293007},
+ {-2.0816681711722314e-16, -0.4997558592585846, -0.9465627129124969, -0.9465627129124968},
+ {-1.9999997615814211, 1.8518819259933516e+19, 16.874999999999996, -3.703763410463646e+19},
+ {-0.12499994039717421, 32767.99999976135, -2.0752587082923246e+19, -2.075258708292325e+19},
+ {7.705600568510257e-34, -1.801432979000528e+16, -0.17224197722973714, -0.17224197722973716},
+ {3.8988133103758913e-308, -0.9848632812499999, 3.893879244098556e-308, 5.40811742605814e-310},
+ {-0.012651981190687427, 6.911985574912436e+38, 6.669240527007144e+18, -8.745031148409496e+36},
+ {4.612811918325842e+18, 1.4901161193847641e-08, 2.6077032311277997e-08, 6.873625395187494e+10},
+ {-9.094947033611148e-13, 4.450691014249257e-308, 2.086006742350485e-308, 2.086006742346437e-308},
+ {-7.751454006381804e-05, 5.588653777189071e-308, -2.2207280111272877e-308, -2.2211612130544025e-308},
+}
+
func tolerance(a, b, e float64) bool {
// Multiplying by e here can underflow denormal values to zero.
// Check a==b so that at least if a and b are small and identical
@@ -2995,6 +3053,15 @@
}
}
+func TestFma(t *testing.T) {
+ for _, c := range fmaC {
+ got := Fma(c.x, c.y, c.z)
+ if !alike(got, c.want) {
+ t.Errorf("Fma(%g,%g,%g) == %g; want %g", c.x, c.y, c.z, got, c.want)
+ }
+ }
+}
+
// Check that math functions of high angle values
// return accurate results. [Since (vf[i] + large) - large != vf[i],
// testing for Trig(vf[i] + large) == Trig(vf[i]), where large is
@@ -3725,3 +3792,11 @@
}
GlobalF = float64(x)
}
+
+func BenchmarkFma(b *testing.B) {
+ x := 0.0
+ for i := 0; i < b.N; i++ {
+ x = Fma(E, Pi, x)
+ }
+ GlobalF = x
+}
diff --git a/src/math/fma.go b/src/math/fma.go
new file mode 100644
index 0000000..7624922
--- /dev/null
+++ b/src/math/fma.go
@@ -0,0 +1,169 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package math
+
+import "math/bits"
+
+func zero(x uint64) uint64 {
+ if x == 0 {
+ return 1
+ }
+ return 0
+ // branchless:
+ // return ((x>>1 | x&1) - 1) >> 63
+}
+
+func nonzero(x uint64) uint64 {
+ if x != 0 {
+ return 1
+ }
+ return 0
+ // branchless:
+ // return 1 - ((x>>1|x&1)-1)>>63
+}
+
+func shl(u1, u2 uint64, n uint) (r1, r2 uint64) {
+ r1 = u1<<n | u2>>(64-n) | u2<<(n-64)
+ r2 = u2 << n
+ return
+}
+
+func shr(u1, u2 uint64, n uint) (r1, r2 uint64) {
+ r2 = u2>>n | u1<<(64-n) | u1>>(n-64)
+ r1 = u1 >> n
+ return
+}
+
+// shrcompress compresses the bottom n+1 bits of the two-word
+// value into a single bit. the result is equal to the value
+// shifted to the right by n, except the result's 0th bit is
+// set to the bitwise OR of the bottom n+1 bits.
+func shrcompress(u1, u2 uint64, n uint) (r1, r2 uint64) {
+ // TODO: Performance here is really sensitive to the
+ // order/placement of these branches. n == 0 is common
+ // enough to be in the fast path. Perhaps more measurement
+ // needs to be done to find the optimal order/placement?
+ switch {
+ case n == 0:
+ return u1, u2
+ case n == 64:
+ return 0, u1 | nonzero(u2)
+ case n >= 128:
+ return 0, nonzero(u1 | u2)
+ case n < 64:
+ r1, r2 = shr(u1, u2, n)
+ r2 |= nonzero(u2 & (1<<n - 1))
+ case n < 128:
+ r1, r2 = shr(u1, u2, n)
+ r2 |= nonzero(u1&(1<<(n-64)-1) | u2)
+ }
+ return
+}
+
+func lz(u1, u2 uint64) (l int32) {
+ l = int32(bits.LeadingZeros64(u1))
+ if l == 64 {
+ l += int32(bits.LeadingZeros64(u2))
+ }
+ return l
+}
+
+// split splits b into sign, biased exponent, and mantissa.
+// It adds the implicit 1 bit to the mantissa for normal values,
+// and normalizes subnormal values.
+func split(b uint64) (sign uint32, exp int32, mantissa uint64) {
+ sign = uint32(b >> 63)
+ exp = int32(b>>52) & mask
+ mantissa = b & fracMask
+
+ if exp == 0 {
+ // Normalize value if subnormal.
+ shift := uint(bits.LeadingZeros64(mantissa) - 11)
+ mantissa <<= shift
+ exp = 1 - int32(shift)
+ } else {
+ // Add implicit 1 bit
+ mantissa |= 1 << 52
+ }
+ return
+}
+
+// Fma returns x * y + z, computed with only one rounding.
+func Fma(x, y, z float64) float64 {
+ bx, by, bz := Float64bits(x), Float64bits(y), Float64bits(z)
+
+ // Inf or NaN or zero involved. At most one rounding will occur.
+ if x == 0.0 || y == 0.0 || z == 0.0 || bx&uvinf == uvinf || by&uvinf == uvinf {
+ return x*y + z
+ }
+ // Handle non-finite z separately. Evaluating x*y+z where
+ // x and y are finite, but z is infinite, should always result in z.
+ if bz&uvinf == uvinf {
+ return z
+ }
+
+ // Inputs are (sub)normal.
+ // Split x, y, z into sign, exponent, mantissa.
+ xs, xe, xm := split(bx)
+ ys, ye, ym := split(by)
+ zs, ze, zm := split(bz)
+
+ // Compute product p = x*y as sign, exponent, two-word mantissa.
+ // Start with exponent. "is normal" bit isn't subtracted yet.
+ pe := xe + ye - bias + 1
+
+ // pm1:pm2 is the double-word mantissa for the product p.
+ // Shift left to leave top bit in product. Effectively
+ // shifts the 106-bit product to the left by 21.
+ pm1, pm2 := bits.Mul64(xm<<10, ym<<11)
+ zm1, zm2 := zm<<10, uint64(0)
+ ps := xs ^ ys // product sign
+
+ // normalize to 62nd bit
+ is62zero := uint((^pm1 >> 62) & 1)
+ pm1, pm2 = shl(pm1, pm2, is62zero)
+ pe -= int32(is62zero)
+
+ // Swap addition operands so |p| >= |z|
+ if pe < ze || (pe == ze && (pm1 < zm1 || (pm1 == zm1 && pm2 < zm2))) {
+ ps, pe, pm1, pm2, zs, ze, zm1, zm2 = zs, ze, zm1, zm2, ps, pe, pm1, pm2
+ }
+
+ // Align significands
+ zm1, zm2 = shrcompress(zm1, zm2, uint(pe-ze))
+
+ // Compute resulting significands, normalizing if necessary.
+ var m, c uint64
+ if ps == zs {
+ // Adding (pm1:pm2) + (zm1:zm2)
+ pm2, c = bits.Add64(pm2, zm2, 0)
+ pm1, _ = bits.Add64(pm1, zm1, c)
+ pe -= int32(^pm1 >> 63)
+ pm1, m = shrcompress(pm1, pm2, uint(64+pm1>>63))
+ } else {
+ // Subtracting (pm1:pm2) - (zm1:zm2)
+ // TODO: should we special-case cancellation?
+ pm2, c = bits.Sub64(pm2, zm2, 0)
+ pm1, _ = bits.Sub64(pm1, zm1, c)
+ nz := lz(pm1, pm2)
+ pe -= nz
+ m, pm2 = shl(pm1, pm2, uint(nz-1))
+ m |= nonzero(pm2)
+ }
+
+ // Round and break ties to even
+ if pe > 1022+bias || pe == 1022+bias && (m+1<<9)>>63 == 1 {
+ // rounded value overflows exponent range
+ return Float64frombits(uint64(ps)<<63 | uvinf)
+ }
+ if pe < 0 {
+ n := uint(-pe)
+ m = m>>n | nonzero(m&(1<<n-1))
+ pe = 0
+ }
+ m = ((m + 1<<9) >> 10) & ^zero((m&(1<<10-1))^1<<9)
+ pe &= -int32(nonzero(m))
+ return Float64frombits(uint64(ps)<<63 + uint64(pe)<<52 + m)
+}
To view, visit change 127458. To unsubscribe, or for help writing mail filters, visit settings.
RELNOTE=yes