Josh Bleecher Snyder (Gerrit)

unread,

May 16, 2026, 9:39:46 AM (3 days ago) May 16

to goph...@pubsubhelper.golang.org, Josh Bleecher Snyder, golang-co...@googlegroups.com

Josh Bleecher Snyder has uploaded the change for review

Commit message

crypto/internal/fips140/edwards25519/field: delete Square amd64 assembly

The preceding commit made the compiler-generated code faster
than the assembly.

Since the generic/assembly split is gone, use nicer function names.
The fact that they are functions instead of methods is itself a vestige
of their assembly roots. But unwinding that makes for a large diff.

goos: linux
goarch: amd64
cpu: AMD Ryzen Threadripper PRO 7975WX 32-Cores

pkg: crypto/ed25519
                │   before    │               after                │
                │   sec/op    │   sec/op     vs base               │
KeyGeneration-64    12.70µ ± 1%   12.38µ ± 2%  -2.53% (p=0.000 n=30)
NewKeyFromSeed-64   12.52µ ± 0%   12.27µ ± 1%  -2.00% (p=0.000 n=30)
Signing-64          15.42µ ± 0%   14.81µ ± 0%  -3.97% (p=0.000 n=30)
Verification-64     34.84µ ± 0%   34.68µ ± 0%  -0.44% (p=0.006 n=30)
geomean             17.10µ        16.71µ       -2.24%

pkg: crypto/internal/fips140/edwards25519
                                │   before    │                after                │
                                │   sec/op    │   sec/op     vs base                │
EncodingDecoding-64              5.159µ ± 0%   4.589µ ± 1%  -11.05% (p=0.000 n=30)
ScalarBaseMult-64                9.761µ ± 0%   9.780µ ± 1%        ~ (p=0.965 n=30)
ScalarMult-64                    31.99µ ± 0%   32.46µ ± 0%   +1.47% (p=0.000 n=30)
VarTimeDoubleScalarBaseMult-64   29.82µ ± 0%   30.16µ ± 0%   +1.14% (p=0.000 n=30)
geomean                          14.80µ        14.48µ        -2.20%

pkg: crypto/internal/fips140/edwards25519/field
            │   before    │                after                │
            │   sec/op    │   sec/op     vs base                │
Add-64        2.571n ± 2%   2.573n ± 1%        ~ (p=0.460 n=30)
Multiply-64   10.67n ± 0%   10.62n ± 0%   -0.47% (p=0.001 n=30)
Square-64     8.849n ± 0%   8.412n ± 0%   -4.94% (p=0.000 n=30)
Invert-64     2.401µ ± 0%   2.156µ ± 2%  -10.20% (p=0.000 n=30)
Mult32-64     3.226n ± 0%   3.240n ± 0%   +0.47% (p=0.004 n=30)
Bytes-64      7.974n ± 1%   7.905n ± 1%   -0.87% (p=0.015 n=30)
geomean       15.70n        15.27n        -2.74%


Change-Id: I995209e72e202e7ca4e436615424120ef09e8b37

Change diff

diff --git a/src/crypto/internal/fips140/edwards25519/field/_asm/fe_amd64_asm.go b/src/crypto/internal/fips140/edwards25519/field/_asm/fe_amd64_asm.go
index ecb713b..e2fcf0b 100644
--- a/src/crypto/internal/fips140/edwards25519/field/_asm/fe_amd64_asm.go
+++ b/src/crypto/internal/fips140/edwards25519/field/_asm/fe_amd64_asm.go
@@ -19,7 +19,6 @@
 	Package("crypto/internal/fips140/edwards25519/field")
 	ConstraintExpr("!purego")
 	feMul()
-	feSquare()
 	Generate()
 }
 
@@ -37,95 +36,6 @@
 
 func (c uint128) String() string { return c.name }
 
-func feSquare() {
-	TEXT("feSquare", NOSPLIT, "func(out, a *Element)")
-	Doc("feSquare sets out = a * a. It works like feSquareGeneric.")
-	Pragma("noescape")
-
-	a := Dereference(Param("a"))
-	l0 := namedComponent{a.Field("l0"), "l0"}
-	l1 := namedComponent{a.Field("l1"), "l1"}
-	l2 := namedComponent{a.Field("l2"), "l2"}
-	l3 := namedComponent{a.Field("l3"), "l3"}
-	l4 := namedComponent{a.Field("l4"), "l4"}
-
-	// r0 = l0×l0 + 19×2×(l1×l4 + l2×l3)
-	r0 := uint128{"r0", GP64(), GP64()}
-	mul64(r0, 1, l0, l0)
-	addMul64(r0, 38, l1, l4)
-	addMul64(r0, 38, l2, l3)
-
-	// r1 = 2×l0×l1 + 19×2×l2×l4 + 19×l3×l3
-	r1 := uint128{"r1", GP64(), GP64()}
-	mul64(r1, 2, l0, l1)
-	addMul64(r1, 38, l2, l4)
-	addMul64(r1, 19, l3, l3)
-
-	// r2 = = 2×l0×l2 + l1×l1 + 19×2×l3×l4
-	r2 := uint128{"r2", GP64(), GP64()}
-	mul64(r2, 2, l0, l2)
-	addMul64(r2, 1, l1, l1)
-	addMul64(r2, 38, l3, l4)
-
-	// r3 = = 2×l0×l3 + 2×l1×l2 + 19×l4×l4
-	r3 := uint128{"r3", GP64(), GP64()}
-	mul64(r3, 2, l0, l3)
-	addMul64(r3, 2, l1, l2)
-	addMul64(r3, 19, l4, l4)
-
-	// r4 = = 2×l0×l4 + 2×l1×l3 + l2×l2
-	r4 := uint128{"r4", GP64(), GP64()}
-	mul64(r4, 2, l0, l4)
-	addMul64(r4, 2, l1, l3)
-	addMul64(r4, 1, l2, l2)
-
-	Comment("First reduction chain")
-	maskLow51Bits := GP64()
-	MOVQ(Imm((1<<51)-1), maskLow51Bits)
-	c0, r0lo := shiftRightBy51(&r0)
-	c1, r1lo := shiftRightBy51(&r1)
-	c2, r2lo := shiftRightBy51(&r2)
-	c3, r3lo := shiftRightBy51(&r3)
-	c4, r4lo := shiftRightBy51(&r4)
-	maskAndAdd(r0lo, maskLow51Bits, c4, 19)
-	maskAndAdd(r1lo, maskLow51Bits, c0, 1)
-	maskAndAdd(r2lo, maskLow51Bits, c1, 1)
-	maskAndAdd(r3lo, maskLow51Bits, c2, 1)
-	maskAndAdd(r4lo, maskLow51Bits, c3, 1)
-
-	Comment("Second reduction chain (carryPropagate)")
-	// c0 = r0 >> 51
-	MOVQ(r0lo, c0)
-	SHRQ(Imm(51), c0)
-	// c1 = r1 >> 51
-	MOVQ(r1lo, c1)
-	SHRQ(Imm(51), c1)
-	// c2 = r2 >> 51
-	MOVQ(r2lo, c2)
-	SHRQ(Imm(51), c2)
-	// c3 = r3 >> 51
-	MOVQ(r3lo, c3)
-	SHRQ(Imm(51), c3)
-	// c4 = r4 >> 51
-	MOVQ(r4lo, c4)
-	SHRQ(Imm(51), c4)
-	maskAndAdd(r0lo, maskLow51Bits, c4, 19)
-	maskAndAdd(r1lo, maskLow51Bits, c0, 1)
-	maskAndAdd(r2lo, maskLow51Bits, c1, 1)
-	maskAndAdd(r3lo, maskLow51Bits, c2, 1)
-	maskAndAdd(r4lo, maskLow51Bits, c3, 1)
-
-	Comment("Store output")
-	out := Dereference(Param("out"))
-	Store(r0lo, out.Field("l0"))
-	Store(r1lo, out.Field("l1"))
-	Store(r2lo, out.Field("l2"))
-	Store(r3lo, out.Field("l3"))
-	Store(r4lo, out.Field("l4"))
-
-	RET()
-}
-
 func feMul() {
 	TEXT("feMul", NOSPLIT, "func(out, a, b *Element)")
 	Doc("feMul sets out = a * b. It works like feMulGeneric.")
diff --git a/src/crypto/internal/fips140/edwards25519/field/fe_amd64.go b/src/crypto/internal/fips140/edwards25519/field/fe_amd64.go
index 00bf8f4..1f3ce86 100644
--- a/src/crypto/internal/fips140/edwards25519/field/fe_amd64.go
+++ b/src/crypto/internal/fips140/edwards25519/field/fe_amd64.go
@@ -8,8 +8,3 @@
 //
 //go:noescape
 func feMul(out *Element, a *Element, b *Element)
-
-// feSquare sets out = a * a. It works like feSquareGeneric.
-//
-//go:noescape
-func feSquare(out *Element, a *Element)
diff --git a/src/crypto/internal/fips140/edwards25519/field/fe_amd64.s b/src/crypto/internal/fips140/edwards25519/field/fe_amd64.s
index 5e06e24..a24a241 100644
--- a/src/crypto/internal/fips140/edwards25519/field/fe_amd64.s
+++ b/src/crypto/internal/fips140/edwards25519/field/fe_amd64.s
@@ -229,170 +229,3 @@
 	MOVQ R13, 24(AX)
 	MOVQ R15, 32(AX)
 	RET
-
-// func feSquare(out *Element, a *Element)
-TEXT ·feSquare(SB), NOSPLIT, $0-16
-	MOVQ a+8(FP), CX
-
-	// r0 = l0×l0
-	MOVQ (CX), AX
-	MULQ (CX)
-	MOVQ AX, SI
-	MOVQ DX, BX
-
-	// r0 += 38×l1×l4
-	MOVQ 8(CX), DX
-	LEAQ (DX)(DX*8), AX
-	LEAQ (DX)(AX*2), AX
-	SHLQ $0x01, AX
-	MULQ 32(CX)
-	ADDQ AX, SI
-	ADCQ DX, BX
-
-	// r0 += 38×l2×l3
-	MOVQ 16(CX), DX
-	LEAQ (DX)(DX*8), AX
-	LEAQ (DX)(AX*2), AX
-	SHLQ $0x01, AX
-	MULQ 24(CX)
-	ADDQ AX, SI
-	ADCQ DX, BX
-
-	// r1 = 2×l0×l1
-	MOVQ (CX), AX
-	SHLQ $0x01, AX
-	MULQ 8(CX)
-	MOVQ AX, R8
-	MOVQ DX, DI
-
-	// r1 += 38×l2×l4
-	MOVQ 16(CX), DX
-	LEAQ (DX)(DX*8), AX
-	LEAQ (DX)(AX*2), AX
-	SHLQ $0x01, AX
-	MULQ 32(CX)
-	ADDQ AX, R8
-	ADCQ DX, DI
-
-	// r1 += 19×l3×l3
-	MOVQ 24(CX), DX
-	LEAQ (DX)(DX*8), AX
-	LEAQ (DX)(AX*2), AX
-	MULQ 24(CX)
-	ADDQ AX, R8
-	ADCQ DX, DI
-
-	// r2 = 2×l0×l2
-	MOVQ (CX), AX
-	SHLQ $0x01, AX
-	MULQ 16(CX)
-	MOVQ AX, R10
-	MOVQ DX, R9
-
-	// r2 += l1×l1
-	MOVQ 8(CX), AX
-	MULQ 8(CX)
-	ADDQ AX, R10
-	ADCQ DX, R9
-
-	// r2 += 38×l3×l4
-	MOVQ 24(CX), DX
-	LEAQ (DX)(DX*8), AX
-	LEAQ (DX)(AX*2), AX
-	SHLQ $0x01, AX
-	MULQ 32(CX)
-	ADDQ AX, R10
-	ADCQ DX, R9
-
-	// r3 = 2×l0×l3
-	MOVQ (CX), AX
-	SHLQ $0x01, AX
-	MULQ 24(CX)
-	MOVQ AX, R12
-	MOVQ DX, R11
-
-	// r3 += 2×l1×l2
-	MOVQ 8(CX), AX
-	SHLQ $0x01, AX
-	MULQ 16(CX)
-	ADDQ AX, R12
-	ADCQ DX, R11
-
-	// r3 += 19×l4×l4
-	MOVQ 32(CX), DX
-	LEAQ (DX)(DX*8), AX
-	LEAQ (DX)(AX*2), AX
-	MULQ 32(CX)
-	ADDQ AX, R12
-	ADCQ DX, R11
-
-	// r4 = 2×l0×l4
-	MOVQ (CX), AX
-	SHLQ $0x01, AX
-	MULQ 32(CX)
-	MOVQ AX, R14
-	MOVQ DX, R13
-
-	// r4 += 2×l1×l3
-	MOVQ 8(CX), AX
-	SHLQ $0x01, AX
-	MULQ 24(CX)
-	ADDQ AX, R14
-	ADCQ DX, R13
-
-	// r4 += l2×l2
-	MOVQ 16(CX), AX
-	MULQ 16(CX)
-	ADDQ AX, R14
-	ADCQ DX, R13
-
-	// First reduction chain
-	MOVQ   $0x0007ffffffffffff, AX
-	SHLQ   $0x0d, SI, BX
-	SHLQ   $0x0d, R8, DI
-	SHLQ   $0x0d, R10, R9
-	SHLQ   $0x0d, R12, R11
-	SHLQ   $0x0d, R14, R13
-	ANDQ   AX, SI
-	IMUL3Q $0x13, R13, R13
-	ADDQ   R13, SI
-	ANDQ   AX, R8
-	ADDQ   BX, R8
-	ANDQ   AX, R10
-	ADDQ   DI, R10
-	ANDQ   AX, R12
-	ADDQ   R9, R12
-	ANDQ   AX, R14
-	ADDQ   R11, R14
-
-	// Second reduction chain (carryPropagate)
-	MOVQ   SI, BX
-	SHRQ   $0x33, BX
-	MOVQ   R8, DI
-	SHRQ   $0x33, DI
-	MOVQ   R10, R9
-	SHRQ   $0x33, R9
-	MOVQ   R12, R11
-	SHRQ   $0x33, R11
-	MOVQ   R14, R13
-	SHRQ   $0x33, R13
-	ANDQ   AX, SI
-	IMUL3Q $0x13, R13, R13
-	ADDQ   R13, SI
-	ANDQ   AX, R8
-	ADDQ   BX, R8
-	ANDQ   AX, R10
-	ADDQ   DI, R10
-	ANDQ   AX, R12
-	ADDQ   R9, R12
-	ANDQ   AX, R14
-	ADDQ   R11, R14
-
-	// Store output
-	MOVQ out+0(FP), AX
-	MOVQ SI, (AX)
-	MOVQ R8, 8(AX)
-	MOVQ R10, 16(AX)
-	MOVQ R12, 24(AX)
-	MOVQ R14, 32(AX)
-	RET
diff --git a/src/crypto/internal/fips140/edwards25519/field/fe_amd64_noasm.go b/src/crypto/internal/fips140/edwards25519/field/fe_amd64_noasm.go
index 792b84f..a85eb10 100644
--- a/src/crypto/internal/fips140/edwards25519/field/fe_amd64_noasm.go
+++ b/src/crypto/internal/fips140/edwards25519/field/fe_amd64_noasm.go
@@ -7,7 +7,3 @@
 package field
 
 func feMul(v, x, y *Element) { feMulGeneric(v, x, y) }
-
-func feSquare(v, x *Element) { feSquareGeneric(v, x) }
-
-func feSquareN(v, a *Element, n int) { feSquareNGeneric(v, a, n) }
diff --git a/src/crypto/internal/fips140/edwards25519/field/fe_generic.go b/src/crypto/internal/fips140/edwards25519/field/fe_generic.go
index dd9a0fb..579eab6 100644
--- a/src/crypto/internal/fips140/edwards25519/field/fe_generic.go
+++ b/src/crypto/internal/fips140/edwards25519/field/fe_generic.go
@@ -183,7 +183,7 @@
 	v.l4 = rr4&maskLow51Bits + rr3>>51
 }
 
-func feSquareGeneric(v, a *Element) {
+func feSquare(v, a *Element) {
 	l0 := a.l0
 	l1 := a.l1
 	l2 := a.l2
@@ -256,9 +256,9 @@
 	v.l4 = rr4&maskLow51Bits + rr3>>51
 }
 
-// feSquareNGeneric squares a n times and writes the result to v.
+// feSquareN squares a n times and writes the result to v.
 // It uses local variables to keep limbs in registers.
-func feSquareNGeneric(v, a *Element, n int) {
+func feSquareN(v, a *Element, n int) {
 	l0 := a.l0
 	l1 := a.l1
 	l2 := a.l2
diff --git a/src/crypto/internal/fips140/edwards25519/field/fe_squaren_amd64.go b/src/crypto/internal/fips140/edwards25519/field/fe_squaren_amd64.go
deleted file mode 100644
index 021899b..0000000
--- a/src/crypto/internal/fips140/edwards25519/field/fe_squaren_amd64.go
+++ /dev/null
@@ -1,14 +0,0 @@
-// Copyright (c) 2026 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build amd64 && !purego
-
-package field
-
-func feSquareN(v, a *Element, n int) {
-	feSquare(v, a)
-	for range n - 1 {
-		feSquare(v, v)
-	}
-}
diff --git a/src/crypto/internal/fips140/edwards25519/field/fe_test.go b/src/crypto/internal/fips140/edwards25519/field/fe_test.go
index 424074f..a98c152 100644
--- a/src/crypto/internal/fips140/edwards25519/field/fe_test.go
+++ b/src/crypto/internal/fips140/edwards25519/field/fe_test.go
@@ -514,51 +514,6 @@
 	}
 }
 
-func TestFeSquareN(t *testing.T) {
-	asmLikeGeneric := func(a Element) bool {
-		for _, n := range []int{1, 2, 5, 10, 15, 50, 120} {
-			t1 := a
-			t2 := a
-
-			feSquareNGeneric(&t1, &t1, n)
-			feSquareN(&t2, &t2, n)
-
-			if t1 != t2 {
-				t.Logf("n=%d: got %#v, expected %#v", n, t2, t1)
-				return false
-			}
-			if !isInBounds(&t2) {
-				return false
-			}
-		}
-		return true
-	}
-
-	if err := quick.Check(asmLikeGeneric, quickCheckConfig(1024)); err != nil {
-		t.Error(err)
-	}
-}
-
-func TestFeSquare(t *testing.T) {
-	asmLikeGeneric := func(a Element) bool {
-		t1 := a
-		t2 := a
-
-		feSquareGeneric(&t1, &t1)
-		feSquare(&t2, &t2)
-
-		if t1 != t2 {
-			t.Logf("got: %#v,\nexpected: %#v", t1, t2)
-		}
-
-		return t1 == t2 && isInBounds(&t2)
-	}
-
-	if err := quick.Check(asmLikeGeneric, quickCheckConfig(1024)); err != nil {
-		t.Error(err)
-	}
-}
-
 func TestFeMul(t *testing.T) {
 	asmLikeGeneric := func(a, b Element) bool {
 		a1 := a

Change information

Files:

M src/crypto/internal/fips140/edwards25519/field/_asm/fe_amd64_asm.go
M src/crypto/internal/fips140/edwards25519/field/fe_amd64.go
M src/crypto/internal/fips140/edwards25519/field/fe_amd64.s
M src/crypto/internal/fips140/edwards25519/field/fe_amd64_noasm.go
M src/crypto/internal/fips140/edwards25519/field/fe_generic.go
D src/crypto/internal/fips140/edwards25519/field/fe_squaren_amd64.go
M src/crypto/internal/fips140/edwards25519/field/fe_test.go

Change size: L

Delta: 7 files changed, 3 insertions(+), 328 deletions(-)

Open in Gerrit

Related details

Attention set is empty

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

satisfied_requirement

open

diffy

Josh Bleecher Snyder (Gerrit)

unread,

May 16, 2026, 9:42:17 AM (3 days ago) May 16

to Josh Bleecher Snyder, goph...@pubsubhelper.golang.org, Filippo Valsorda, golang-co...@googlegroups.com

Attention needed from Filippo Valsorda

Josh Bleecher Snyder added 1 comment

Commit Message

Line 14, Patchset 1 (Latest):of their assembly roots. But unwinding that makes for a large diff.

Josh Bleecher Snyder . unresolved

I can send that unwinding as a follow-up if desired.

Open in Gerrit

Related details

Attention is currently required from:

Filippo Valsorda

Submit Requirements:

Code-Review

No-Unresolved-Comments

Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

unsatisfied_requirement

open

diffy

Filippo Valsorda (Gerrit)

unread,

May 16, 2026, 9:57:28 AM (3 days ago) May 16

to Josh Bleecher Snyder, goph...@pubsubhelper.golang.org, Filippo Valsorda, golang-co...@googlegroups.com

Attention needed from Josh Bleecher Snyder

Filippo Valsorda voted and added 1 comment

Votes added by Filippo Valsorda

Code-Review

+2

1 comment

Commit Message

Line 14, Patchset 1 (Latest):of their assembly roots. But unwinding that makes for a large diff.

Josh Bleecher Snyder . resolved

I can send that unwinding as a follow-up if desired.

Filippo Valsorda

It'd be a bit weird to have Square work differently from Mul, and it's nice to have them next to each other in the code. Probably fine as is until we remove the Mul assembly, too. (wink)

Open in Gerrit

Related details

Attention is currently required from:

Josh Bleecher Snyder

Submit Requirements:

Code-Review
No-Unresolved-Comments

Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

satisfied_requirement

unsatisfied_requirement

open

diffy

Josh Bleecher Snyder (Gerrit)

unread,

May 16, 2026, 10:04:39 AM (3 days ago) May 16

to Josh Bleecher Snyder, goph...@pubsubhelper.golang.org, Filippo Valsorda, golang-co...@googlegroups.com

Josh Bleecher Snyder added 1 comment

Commit Message

Line 14, Patchset 1 (Latest):of their assembly roots. But unwinding that makes for a large diff.

Josh Bleecher Snyder . resolved

I can send that unwinding as a follow-up if desired.

Filippo Valsorda

It'd be a bit weird to have Square work differently from Mul, and it's nice to have them next to each other in the code. Probably fine as is until we remove the Mul assembly, too. (wink)

Josh Bleecher Snyder

Mul is within striking range. Mainly needs a scheduler tweak, but introducing MULX would help too. But MULX would only help on GOAMD64 > v3 (or maybe >= v3, don’t recall). Is “fast on newer machines” good enough for asm deletion?

Open in Gerrit

Related details

Attention set is empty

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

satisfied_requirement

unsatisfied_requirement

open

diffy

Filippo Valsorda (Gerrit)

unread,

May 16, 2026, 10:06:38 AM (3 days ago) May 16

to Josh Bleecher Snyder, goph...@pubsubhelper.golang.org, Filippo Valsorda, golang-co...@googlegroups.com

Attention needed from Josh Bleecher Snyder

Filippo Valsorda added 1 comment

Commit Message

Line 14, Patchset 1 (Latest):of their assembly roots. But unwinding that makes for a large diff.

Josh Bleecher Snyder . resolved

I can send that unwinding as a follow-up if desired.

Filippo Valsorda

It'd be a bit weird to have Square work differently from Mul, and it's nice to have them next to each other in the code. Probably fine as is until we remove the Mul assembly, too. (wink)

Josh Bleecher Snyder

Mul is within striking range. Mainly needs a scheduler tweak, but introducing MULX would help too. But MULX would only help on GOAMD64 > v3 (or maybe >= v3, don’t recall). Is “fast on newer machines” good enough for asm deletion?

Filippo Valsorda

Depending on the definition of new, yes, we've stopped caring about speed on older machines, see https://github.com/golang/go/issues/69587.

However, GOAMD64 is opt-in, isn't it? Or does it support runtime feature-gating? It wouldn't be good to regress the default build.

Open in Gerrit

Related details

Attention is currently required from:

Josh Bleecher Snyder

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

satisfied_requirement

unsatisfied_requirement

open

diffy

Josh Bleecher Snyder (Gerrit)

unread,

May 16, 2026, 10:12:06 AM (3 days ago) May 16

to Josh Bleecher Snyder, goph...@pubsubhelper.golang.org, Filippo Valsorda, golang-co...@googlegroups.com

Josh Bleecher Snyder added 1 comment

Commit Message

Line 14, Patchset 1 (Latest):of their assembly roots. But unwinding that makes for a large diff.

Josh Bleecher Snyder . resolved

I can send that unwinding as a follow-up if desired.

Filippo Valsorda

It'd be a bit weird to have Square work differently from Mul, and it's nice to have them next to each other in the code. Probably fine as is until we remove the Mul assembly, too. (wink)

Josh Bleecher Snyder

Mul is within striking range. Mainly needs a scheduler tweak, but introducing MULX would help too. But MULX would only help on GOAMD64 > v3 (or maybe >= v3, don’t recall). Is “fast on newer machines” good enough for asm deletion?

Filippo Valsorda

Depending on the definition of new, yes, we've stopped caring about speed on older machines, see https://github.com/golang/go/issues/69587.
However, GOAMD64 is opt-in, isn't it? Or does it support runtime feature-gating? It wouldn't be good to regress the default build.

Josh Bleecher Snyder

Oh right. Sadness. OK, guess the burden lies with the scheduler then…it’s doing deeply silly things but only with a tiny subset of functions, those with many interleavable short carry chains. I’ve struggled to get the heuristics good in a way that doesn’t smell weird or add too much code. Will try again in a bit. (Unless say Daniel wants to take it over? Would be happy to share my goo so far.)

Open in Gerrit

Related details

Attention set is empty

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

satisfied_requirement

unsatisfied_requirement

open

diffy

Josh Bleecher Snyder (Gerrit)

unread,

May 18, 2026, 11:32:39 PM (9 hours ago) May 18

to Josh Bleecher Snyder, goph...@pubsubhelper.golang.org, Filippo Valsorda, golang-co...@googlegroups.com

Josh Bleecher Snyder added 1 comment

Commit Message

Line 14, Patchset 1 (Latest):of their assembly roots. But unwinding that makes for a large diff.

Josh Bleecher Snyder . resolved

I can send that unwinding as a follow-up if desired.

Filippo Valsorda

It'd be a bit weird to have Square work differently from Mul, and it's nice to have them next to each other in the code. Probably fine as is until we remove the Mul assembly, too. (wink)

Josh Bleecher Snyder

Mul is within striking range. Mainly needs a scheduler tweak, but introducing MULX would help too. But MULX would only help on GOAMD64 > v3 (or maybe >= v3, don’t recall). Is “fast on newer machines” good enough for asm deletion?

Filippo Valsorda

Depending on the definition of new, yes, we've stopped caring about speed on older machines, see https://github.com/golang/go/issues/69587.
However, GOAMD64 is opt-in, isn't it? Or does it support runtime feature-gating? It wouldn't be good to regress the default build.

Josh Bleecher Snyder

Oh right. Sadness. OK, guess the burden lies with the scheduler then…it’s doing deeply silly things but only with a tiny subset of functions, those with many interleavable short carry chains. I’ve struggled to get the heuristics good in a way that doesn’t smell weird or add too much code. Will try again in a bit. (Unless say Daniel wants to take it over? Would be happy to share my goo so far.)

Josh Bleecher Snyder

I am out of ideas for feMulGeneric.

Up to you whether you want to submit this + the previous change now or wait for Daniel to come up with something brilliant for feMulGeneric. It's kind of a hard nut to crack...

Open in Gerrit

Related details

Attention set is empty

Submit Requirements:

Code-Review
No-Unresolved-Comments
Review-Enforcement
TryBots-Pass

Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings.

Gerrit

satisfied_requirement

unsatisfied_requirement

open

diffy

Reply all

Reply to author

Forward

[go] crypto/internal/fips140/edwards25519/field: delete Square amd64 assembly

Josh Bleecher Snyder (Gerrit)

Josh Bleecher Snyder has uploaded the change for review

Commit message

Change diff

Change information

Related details

Josh Bleecher Snyder (Gerrit)

Josh Bleecher Snyder added 1 comment

Related details

Filippo Valsorda (Gerrit)

Filippo Valsorda voted and added 1 comment

Votes added by Filippo Valsorda

1 comment

Related details

Josh Bleecher Snyder (Gerrit)

Josh Bleecher Snyder added 1 comment

Related details

Filippo Valsorda (Gerrit)

Filippo Valsorda added 1 comment

Related details

Josh Bleecher Snyder (Gerrit)

Josh Bleecher Snyder added 1 comment

Related details

Josh Bleecher Snyder (Gerrit)

Josh Bleecher Snyder added 1 comment

Related details