crypto/internal/fips140/edwards25519/field: delete Square amd64 assembly
The preceding commit made the compiler-generated code faster
than the assembly.
Since the generic/assembly split is gone, use nicer function names.
The fact that they are functions instead of methods is itself a vestige
of their assembly roots. But unwinding that makes for a large diff.
goos: linux
goarch: amd64
cpu: AMD Ryzen Threadripper PRO 7975WX 32-Cores
pkg: crypto/ed25519
│ before │ after │
│ sec/op │ sec/op vs base │
KeyGeneration-64 12.70µ ± 1% 12.38µ ± 2% -2.53% (p=0.000 n=30)
NewKeyFromSeed-64 12.52µ ± 0% 12.27µ ± 1% -2.00% (p=0.000 n=30)
Signing-64 15.42µ ± 0% 14.81µ ± 0% -3.97% (p=0.000 n=30)
Verification-64 34.84µ ± 0% 34.68µ ± 0% -0.44% (p=0.006 n=30)
geomean 17.10µ 16.71µ -2.24%
pkg: crypto/internal/fips140/edwards25519
│ before │ after │
│ sec/op │ sec/op vs base │
EncodingDecoding-64 5.159µ ± 0% 4.589µ ± 1% -11.05% (p=0.000 n=30)
ScalarBaseMult-64 9.761µ ± 0% 9.780µ ± 1% ~ (p=0.965 n=30)
ScalarMult-64 31.99µ ± 0% 32.46µ ± 0% +1.47% (p=0.000 n=30)
VarTimeDoubleScalarBaseMult-64 29.82µ ± 0% 30.16µ ± 0% +1.14% (p=0.000 n=30)
geomean 14.80µ 14.48µ -2.20%
pkg: crypto/internal/fips140/edwards25519/field
│ before │ after │
│ sec/op │ sec/op vs base │
Add-64 2.571n ± 2% 2.573n ± 1% ~ (p=0.460 n=30)
Multiply-64 10.67n ± 0% 10.62n ± 0% -0.47% (p=0.001 n=30)
Square-64 8.849n ± 0% 8.412n ± 0% -4.94% (p=0.000 n=30)
Invert-64 2.401µ ± 0% 2.156µ ± 2% -10.20% (p=0.000 n=30)
Mult32-64 3.226n ± 0% 3.240n ± 0% +0.47% (p=0.004 n=30)
Bytes-64 7.974n ± 1% 7.905n ± 1% -0.87% (p=0.015 n=30)
geomean 15.70n 15.27n -2.74%
Change-Id: I995209e72e202e7ca4e436615424120ef09e8b37
diff --git a/src/crypto/internal/fips140/edwards25519/field/_asm/fe_amd64_asm.go b/src/crypto/internal/fips140/edwards25519/field/_asm/fe_amd64_asm.go
index ecb713b..e2fcf0b 100644
--- a/src/crypto/internal/fips140/edwards25519/field/_asm/fe_amd64_asm.go
+++ b/src/crypto/internal/fips140/edwards25519/field/_asm/fe_amd64_asm.go
@@ -19,7 +19,6 @@
Package("crypto/internal/fips140/edwards25519/field")
ConstraintExpr("!purego")
feMul()
- feSquare()
Generate()
}
@@ -37,95 +36,6 @@
func (c uint128) String() string { return c.name }
-func feSquare() {
- TEXT("feSquare", NOSPLIT, "func(out, a *Element)")
- Doc("feSquare sets out = a * a. It works like feSquareGeneric.")
- Pragma("noescape")
-
- a := Dereference(Param("a"))
- l0 := namedComponent{a.Field("l0"), "l0"}
- l1 := namedComponent{a.Field("l1"), "l1"}
- l2 := namedComponent{a.Field("l2"), "l2"}
- l3 := namedComponent{a.Field("l3"), "l3"}
- l4 := namedComponent{a.Field("l4"), "l4"}
-
- // r0 = l0×l0 + 19×2×(l1×l4 + l2×l3)
- r0 := uint128{"r0", GP64(), GP64()}
- mul64(r0, 1, l0, l0)
- addMul64(r0, 38, l1, l4)
- addMul64(r0, 38, l2, l3)
-
- // r1 = 2×l0×l1 + 19×2×l2×l4 + 19×l3×l3
- r1 := uint128{"r1", GP64(), GP64()}
- mul64(r1, 2, l0, l1)
- addMul64(r1, 38, l2, l4)
- addMul64(r1, 19, l3, l3)
-
- // r2 = = 2×l0×l2 + l1×l1 + 19×2×l3×l4
- r2 := uint128{"r2", GP64(), GP64()}
- mul64(r2, 2, l0, l2)
- addMul64(r2, 1, l1, l1)
- addMul64(r2, 38, l3, l4)
-
- // r3 = = 2×l0×l3 + 2×l1×l2 + 19×l4×l4
- r3 := uint128{"r3", GP64(), GP64()}
- mul64(r3, 2, l0, l3)
- addMul64(r3, 2, l1, l2)
- addMul64(r3, 19, l4, l4)
-
- // r4 = = 2×l0×l4 + 2×l1×l3 + l2×l2
- r4 := uint128{"r4", GP64(), GP64()}
- mul64(r4, 2, l0, l4)
- addMul64(r4, 2, l1, l3)
- addMul64(r4, 1, l2, l2)
-
- Comment("First reduction chain")
- maskLow51Bits := GP64()
- MOVQ(Imm((1<<51)-1), maskLow51Bits)
- c0, r0lo := shiftRightBy51(&r0)
- c1, r1lo := shiftRightBy51(&r1)
- c2, r2lo := shiftRightBy51(&r2)
- c3, r3lo := shiftRightBy51(&r3)
- c4, r4lo := shiftRightBy51(&r4)
- maskAndAdd(r0lo, maskLow51Bits, c4, 19)
- maskAndAdd(r1lo, maskLow51Bits, c0, 1)
- maskAndAdd(r2lo, maskLow51Bits, c1, 1)
- maskAndAdd(r3lo, maskLow51Bits, c2, 1)
- maskAndAdd(r4lo, maskLow51Bits, c3, 1)
-
- Comment("Second reduction chain (carryPropagate)")
- // c0 = r0 >> 51
- MOVQ(r0lo, c0)
- SHRQ(Imm(51), c0)
- // c1 = r1 >> 51
- MOVQ(r1lo, c1)
- SHRQ(Imm(51), c1)
- // c2 = r2 >> 51
- MOVQ(r2lo, c2)
- SHRQ(Imm(51), c2)
- // c3 = r3 >> 51
- MOVQ(r3lo, c3)
- SHRQ(Imm(51), c3)
- // c4 = r4 >> 51
- MOVQ(r4lo, c4)
- SHRQ(Imm(51), c4)
- maskAndAdd(r0lo, maskLow51Bits, c4, 19)
- maskAndAdd(r1lo, maskLow51Bits, c0, 1)
- maskAndAdd(r2lo, maskLow51Bits, c1, 1)
- maskAndAdd(r3lo, maskLow51Bits, c2, 1)
- maskAndAdd(r4lo, maskLow51Bits, c3, 1)
-
- Comment("Store output")
- out := Dereference(Param("out"))
- Store(r0lo, out.Field("l0"))
- Store(r1lo, out.Field("l1"))
- Store(r2lo, out.Field("l2"))
- Store(r3lo, out.Field("l3"))
- Store(r4lo, out.Field("l4"))
-
- RET()
-}
-
func feMul() {
TEXT("feMul", NOSPLIT, "func(out, a, b *Element)")
Doc("feMul sets out = a * b. It works like feMulGeneric.")
diff --git a/src/crypto/internal/fips140/edwards25519/field/fe_amd64.go b/src/crypto/internal/fips140/edwards25519/field/fe_amd64.go
index 00bf8f4..1f3ce86 100644
--- a/src/crypto/internal/fips140/edwards25519/field/fe_amd64.go
+++ b/src/crypto/internal/fips140/edwards25519/field/fe_amd64.go
@@ -8,8 +8,3 @@
//
//go:noescape
func feMul(out *Element, a *Element, b *Element)
-
-// feSquare sets out = a * a. It works like feSquareGeneric.
-//
-//go:noescape
-func feSquare(out *Element, a *Element)
diff --git a/src/crypto/internal/fips140/edwards25519/field/fe_amd64.s b/src/crypto/internal/fips140/edwards25519/field/fe_amd64.s
index 5e06e24..a24a241 100644
--- a/src/crypto/internal/fips140/edwards25519/field/fe_amd64.s
+++ b/src/crypto/internal/fips140/edwards25519/field/fe_amd64.s
@@ -229,170 +229,3 @@
MOVQ R13, 24(AX)
MOVQ R15, 32(AX)
RET
-
-// func feSquare(out *Element, a *Element)
-TEXT ·feSquare(SB), NOSPLIT, $0-16
- MOVQ a+8(FP), CX
-
- // r0 = l0×l0
- MOVQ (CX), AX
- MULQ (CX)
- MOVQ AX, SI
- MOVQ DX, BX
-
- // r0 += 38×l1×l4
- MOVQ 8(CX), DX
- LEAQ (DX)(DX*8), AX
- LEAQ (DX)(AX*2), AX
- SHLQ $0x01, AX
- MULQ 32(CX)
- ADDQ AX, SI
- ADCQ DX, BX
-
- // r0 += 38×l2×l3
- MOVQ 16(CX), DX
- LEAQ (DX)(DX*8), AX
- LEAQ (DX)(AX*2), AX
- SHLQ $0x01, AX
- MULQ 24(CX)
- ADDQ AX, SI
- ADCQ DX, BX
-
- // r1 = 2×l0×l1
- MOVQ (CX), AX
- SHLQ $0x01, AX
- MULQ 8(CX)
- MOVQ AX, R8
- MOVQ DX, DI
-
- // r1 += 38×l2×l4
- MOVQ 16(CX), DX
- LEAQ (DX)(DX*8), AX
- LEAQ (DX)(AX*2), AX
- SHLQ $0x01, AX
- MULQ 32(CX)
- ADDQ AX, R8
- ADCQ DX, DI
-
- // r1 += 19×l3×l3
- MOVQ 24(CX), DX
- LEAQ (DX)(DX*8), AX
- LEAQ (DX)(AX*2), AX
- MULQ 24(CX)
- ADDQ AX, R8
- ADCQ DX, DI
-
- // r2 = 2×l0×l2
- MOVQ (CX), AX
- SHLQ $0x01, AX
- MULQ 16(CX)
- MOVQ AX, R10
- MOVQ DX, R9
-
- // r2 += l1×l1
- MOVQ 8(CX), AX
- MULQ 8(CX)
- ADDQ AX, R10
- ADCQ DX, R9
-
- // r2 += 38×l3×l4
- MOVQ 24(CX), DX
- LEAQ (DX)(DX*8), AX
- LEAQ (DX)(AX*2), AX
- SHLQ $0x01, AX
- MULQ 32(CX)
- ADDQ AX, R10
- ADCQ DX, R9
-
- // r3 = 2×l0×l3
- MOVQ (CX), AX
- SHLQ $0x01, AX
- MULQ 24(CX)
- MOVQ AX, R12
- MOVQ DX, R11
-
- // r3 += 2×l1×l2
- MOVQ 8(CX), AX
- SHLQ $0x01, AX
- MULQ 16(CX)
- ADDQ AX, R12
- ADCQ DX, R11
-
- // r3 += 19×l4×l4
- MOVQ 32(CX), DX
- LEAQ (DX)(DX*8), AX
- LEAQ (DX)(AX*2), AX
- MULQ 32(CX)
- ADDQ AX, R12
- ADCQ DX, R11
-
- // r4 = 2×l0×l4
- MOVQ (CX), AX
- SHLQ $0x01, AX
- MULQ 32(CX)
- MOVQ AX, R14
- MOVQ DX, R13
-
- // r4 += 2×l1×l3
- MOVQ 8(CX), AX
- SHLQ $0x01, AX
- MULQ 24(CX)
- ADDQ AX, R14
- ADCQ DX, R13
-
- // r4 += l2×l2
- MOVQ 16(CX), AX
- MULQ 16(CX)
- ADDQ AX, R14
- ADCQ DX, R13
-
- // First reduction chain
- MOVQ $0x0007ffffffffffff, AX
- SHLQ $0x0d, SI, BX
- SHLQ $0x0d, R8, DI
- SHLQ $0x0d, R10, R9
- SHLQ $0x0d, R12, R11
- SHLQ $0x0d, R14, R13
- ANDQ AX, SI
- IMUL3Q $0x13, R13, R13
- ADDQ R13, SI
- ANDQ AX, R8
- ADDQ BX, R8
- ANDQ AX, R10
- ADDQ DI, R10
- ANDQ AX, R12
- ADDQ R9, R12
- ANDQ AX, R14
- ADDQ R11, R14
-
- // Second reduction chain (carryPropagate)
- MOVQ SI, BX
- SHRQ $0x33, BX
- MOVQ R8, DI
- SHRQ $0x33, DI
- MOVQ R10, R9
- SHRQ $0x33, R9
- MOVQ R12, R11
- SHRQ $0x33, R11
- MOVQ R14, R13
- SHRQ $0x33, R13
- ANDQ AX, SI
- IMUL3Q $0x13, R13, R13
- ADDQ R13, SI
- ANDQ AX, R8
- ADDQ BX, R8
- ANDQ AX, R10
- ADDQ DI, R10
- ANDQ AX, R12
- ADDQ R9, R12
- ANDQ AX, R14
- ADDQ R11, R14
-
- // Store output
- MOVQ out+0(FP), AX
- MOVQ SI, (AX)
- MOVQ R8, 8(AX)
- MOVQ R10, 16(AX)
- MOVQ R12, 24(AX)
- MOVQ R14, 32(AX)
- RET
diff --git a/src/crypto/internal/fips140/edwards25519/field/fe_amd64_noasm.go b/src/crypto/internal/fips140/edwards25519/field/fe_amd64_noasm.go
index 792b84f..a85eb10 100644
--- a/src/crypto/internal/fips140/edwards25519/field/fe_amd64_noasm.go
+++ b/src/crypto/internal/fips140/edwards25519/field/fe_amd64_noasm.go
@@ -7,7 +7,3 @@
package field
func feMul(v, x, y *Element) { feMulGeneric(v, x, y) }
-
-func feSquare(v, x *Element) { feSquareGeneric(v, x) }
-
-func feSquareN(v, a *Element, n int) { feSquareNGeneric(v, a, n) }
diff --git a/src/crypto/internal/fips140/edwards25519/field/fe_generic.go b/src/crypto/internal/fips140/edwards25519/field/fe_generic.go
index dd9a0fb..579eab6 100644
--- a/src/crypto/internal/fips140/edwards25519/field/fe_generic.go
+++ b/src/crypto/internal/fips140/edwards25519/field/fe_generic.go
@@ -183,7 +183,7 @@
v.l4 = rr4&maskLow51Bits + rr3>>51
}
-func feSquareGeneric(v, a *Element) {
+func feSquare(v, a *Element) {
l0 := a.l0
l1 := a.l1
l2 := a.l2
@@ -256,9 +256,9 @@
v.l4 = rr4&maskLow51Bits + rr3>>51
}
-// feSquareNGeneric squares a n times and writes the result to v.
+// feSquareN squares a n times and writes the result to v.
// It uses local variables to keep limbs in registers.
-func feSquareNGeneric(v, a *Element, n int) {
+func feSquareN(v, a *Element, n int) {
l0 := a.l0
l1 := a.l1
l2 := a.l2
diff --git a/src/crypto/internal/fips140/edwards25519/field/fe_squaren_amd64.go b/src/crypto/internal/fips140/edwards25519/field/fe_squaren_amd64.go
deleted file mode 100644
index 021899b..0000000
--- a/src/crypto/internal/fips140/edwards25519/field/fe_squaren_amd64.go
+++ /dev/null
@@ -1,14 +0,0 @@
-// Copyright (c) 2026 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build amd64 && !purego
-
-package field
-
-func feSquareN(v, a *Element, n int) {
- feSquare(v, a)
- for range n - 1 {
- feSquare(v, v)
- }
-}
diff --git a/src/crypto/internal/fips140/edwards25519/field/fe_test.go b/src/crypto/internal/fips140/edwards25519/field/fe_test.go
index 424074f..a98c152 100644
--- a/src/crypto/internal/fips140/edwards25519/field/fe_test.go
+++ b/src/crypto/internal/fips140/edwards25519/field/fe_test.go
@@ -514,51 +514,6 @@
}
}
-func TestFeSquareN(t *testing.T) {
- asmLikeGeneric := func(a Element) bool {
- for _, n := range []int{1, 2, 5, 10, 15, 50, 120} {
- t1 := a
- t2 := a
-
- feSquareNGeneric(&t1, &t1, n)
- feSquareN(&t2, &t2, n)
-
- if t1 != t2 {
- t.Logf("n=%d: got %#v, expected %#v", n, t2, t1)
- return false
- }
- if !isInBounds(&t2) {
- return false
- }
- }
- return true
- }
-
- if err := quick.Check(asmLikeGeneric, quickCheckConfig(1024)); err != nil {
- t.Error(err)
- }
-}
-
-func TestFeSquare(t *testing.T) {
- asmLikeGeneric := func(a Element) bool {
- t1 := a
- t2 := a
-
- feSquareGeneric(&t1, &t1)
- feSquare(&t2, &t2)
-
- if t1 != t2 {
- t.Logf("got: %#v,\nexpected: %#v", t1, t2)
- }
-
- return t1 == t2 && isInBounds(&t2)
- }
-
- if err := quick.Check(asmLikeGeneric, quickCheckConfig(1024)); err != nil {
- t.Error(err)
- }
-}
-
func TestFeMul(t *testing.T) {
asmLikeGeneric := func(a, b Element) bool {
a1 := a
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
of their assembly roots. But unwinding that makes for a large diff. I can send that unwinding as a follow-up if desired.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Code-Review | +2 |
of their assembly roots. But unwinding that makes for a large diff. I can send that unwinding as a follow-up if desired.
It'd be a bit weird to have Square work differently from Mul, and it's nice to have them next to each other in the code. Probably fine as is until we remove the Mul assembly, too. (wink)
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
of their assembly roots. But unwinding that makes for a large diff.Filippo ValsordaI can send that unwinding as a follow-up if desired.
It'd be a bit weird to have Square work differently from Mul, and it's nice to have them next to each other in the code. Probably fine as is until we remove the Mul assembly, too. (wink)
Mul is within striking range. Mainly needs a scheduler tweak, but introducing MULX would help too. But MULX would only help on GOAMD > 3 (or maybe >= 3, don’t recall). Is “fast on newer machines” good enough for asm deletion?
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
of their assembly roots. But unwinding that makes for a large diff.Filippo ValsordaI can send that unwinding as a follow-up if desired.
Josh Bleecher SnyderIt'd be a bit weird to have Square work differently from Mul, and it's nice to have then next to each other in the code. Probably fine as is until we remove the Mul assembly, too. (wink)
Mul is within striking range. Mainly needs a scheduler tweak, but introducing MULX would help too. But MULX would only help on GOAMD > 3 (or maybe >= 3, don’t recall). Is “fast on newer machines” good enough for asm deletion?
Depending on the definition of new, yes, we've stopped caring about speed on older machines, see https://github.com/golang/go/issues/69587.
However, GOAMD is opt-in, isn't it? Or does it support runtime feature-gating? It wouldn't be good to regress the default build.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
of their assembly roots. But unwinding that makes for a large diff.Filippo ValsordaI can send that unwinding as a follow-up if desired.
Josh Bleecher SnyderIt'd be a bit weird to have Square work differently from Mul, and it's nice to have then next to each other in the code. Probably fine as is until we remove the Mul assembly, too. (wink)
Filippo ValsordaMul is within striking range. Mainly needs a scheduler tweak, but introducing MULX would help too. But MULX would only help on GOAMD > 3 (or maybe >= 3, don’t recall). Is “fast on newer machines” good enough for asm deletion?
Depending on the definition of new, yes, we've stopped caring about speed on older machines, see https://github.com/golang/go/issues/69587.
However, GOAMD is opt-in, isn't it? Or does it support runtime feature-gating? It wouldn't be good to regress the default build.
Oh right. Sadness. OK, guess the burden lies with the scheduler then…it’s doing deeply silly things but only with a tiny subset of functions, those with many interleavable short carry chains. I’ve struggled to get the heuristics good in a way that doesn’t smell weird or add too much code. Will try again in a bit. (Unless say Daniel wants to take it over? Would be happy to share my goo so far.)
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
of their assembly roots. But unwinding that makes for a large diff.Filippo ValsordaI can send that unwinding as a follow-up if desired.
Josh Bleecher SnyderIt'd be a bit weird to have Square work differently from Mul, and it's nice to have then next to each other in the code. Probably fine as is until we remove the Mul assembly, too. (wink)
Filippo ValsordaMul is within striking range. Mainly needs a scheduler tweak, but introducing MULX would help too. But MULX would only help on GOAMD > 3 (or maybe >= 3, don’t recall). Is “fast on newer machines” good enough for asm deletion?
Josh Bleecher SnyderDepending on the definition of new, yes, we've stopped caring about speed on older machines, see https://github.com/golang/go/issues/69587.
However, GOAMD is opt-in, isn't it? Or does it support runtime feature-gating? It wouldn't be good to regress the default build.
Oh right. Sadness. OK, guess the burden lies with the scheduler then…it’s doing deeply silly things but only with a tiny subset of functions, those with many interleavable short carry chains. I’ve struggled to get the heuristics good in a way that doesn’t smell weird or add too much code. Will try again in a bit. (Unless say Daniel wants to take it over? Would be happy to share my goo so far.)
I am out of ideas for feMulGeneric.
Up to you whether you want to submit this + the previous change now or wait for Daniel to come up with something brilliant for feMulGeneric. It's kind of a hard nut to crack...
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |