Nick Craig-Wood has uploaded this change for review.
crypto/aes: ARM assembly versions of encrypt, decrypt and expandKey

ARM assembly for AES crypto adapted from OpenSSL, giving an
encrypt/decrypt speedup of 1.6-2.7x and a key scheduling speedup of
2.3-4.5x.

Raspberry Pi 3 BCM2709 ARMv7 Processor rev 5 (v7l)

name       old time/op    new time/op    delta
Encrypt-4  3.12µs ± 1%    1.13µs ± 2%    -63.86%  (p=0.000 n=20+20)
Decrypt-4  3.10µs ± 1%    1.21µs ± 2%    -60.91%  (p=0.000 n=20+20)
Expand-4   11.3µs ± 1%    2.5µs ± 1%     -78.11%  (p=0.000 n=16+16)

name       old speed       new speed       delta
Encrypt-4  5.13MB/s ± 2%   14.18MB/s ± 2%  +176.58%  (p=0.000 n=20+20)
Decrypt-4  5.16MB/s ± 1%   13.19MB/s ± 2%  +155.78%  (p=0.000 n=20+20)

Chromebook Samsung Exynos5 ARMv7 Processor rev 4 (v7l)

name       old time/op    new time/op    delta
Encrypt-2  342ns ± 1%     210ns ± 1%     -38.80%  (p=0.000 n=16+20)
Decrypt-2  343ns ± 6%     209ns ± 2%     -39.16%  (p=0.000 n=17+18)
Expand-2   1.64µs ± 5%    0.70µs ± 1%    -57.29%  (p=0.000 n=17+19)

name       old speed       new speed       delta
Encrypt-2  46.7MB/s ± 1%   76.2MB/s ± 1%   +63.35%  (p=0.000 n=16+20)
Decrypt-2  46.4MB/s ± 7%   76.4MB/s ± 2%   +64.75%  (p=0.000 n=18+18)

Issue #4299

Change-Id: I13df6a87f5697de255cb9a494022dd7f7dbde8f5
---
A src/crypto/aes/asm_arm.s
M src/crypto/aes/block.go
A src/crypto/aes/cipher_arm.go
M src/crypto/aes/cipher_generic.go
4 files changed, 946 insertions(+), 1 deletion(-)
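As a sanity check, the deltas above can be rederived from the time/op columns. A minimal sketch (the inputs are the rounded values from the table, so the results differ slightly from what benchstat computed on the raw samples):

```go
package main

import "fmt"

// speedup reports how many times faster the new code is (old time / new time).
func speedup(oldT, newT float64) float64 { return oldT / newT }

// delta reports the percentage change as benchstat does: (new-old)/old * 100.
func delta(oldT, newT float64) float64 { return (newT - oldT) / oldT * 100 }

func main() {
	// Raspberry Pi 3 Encrypt: 3.12µs -> 1.13µs
	fmt.Printf("%.2fx, %.2f%%\n", speedup(3.12, 1.13), delta(3.12, 1.13)) // 2.76x, -63.78%
	// Chromebook Encrypt: 342ns -> 210ns
	fmt.Printf("%.2fx, %.2f%%\n", speedup(342, 210), delta(342, 210)) // 1.63x, -38.60%
}
```

The two results bracket the "1.6-2.7x" range quoted in the commit message.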
diff --git a/src/crypto/aes/asm_arm.s b/src/crypto/aes/asm_arm.s
new file mode 100644
index 0000000..8b24e16
--- /dev/null
+++ b/src/crypto/aes/asm_arm.s
@@ -0,0 +1,842 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This is a derived work from OpenSSL of AES using assembly optimizations. The
+// original code was written by Andy Polyakov <ap...@openssl.org> and it's dual
+// licensed under OpenSSL and CRYPTOGAMS licenses depending on where you obtain
+// it. For further details see http://www.openssl.org/~appro/cryptogams/.
+
+// Original code can be found at the link below:
+// https://git.openssl.org/?p=openssl.git;a=blob;f=crypto/aes/asm/aes-armv4.pl
+
+// This code is based on crypto/aes/asm/aes-armv4.pl version
+// 6aa36e8e5a062e31543e7796f0351ff9628832ce from 21 May 2017
+
+// Apart from assembler syntax and calling convention changes, the
+// major change needed was to spill one register to the stack as Go
+// can't use R10
+
+// AES for ARMv4
+
+// January 2007.
+//
+// Code uses single 1K S-box and is >2 times faster than code generated
+// by gcc-3.4.1. This is thanks to unique feature of ARMv4 ISA, which
+// allows to merge logical or arithmetic operation with shift or rotate
+// in one instruction and emit combined result every cycle. The module
+// is endian-neutral. The performance is ~42 cycles/byte for 128-bit
+// key [on single-issue Xscale PXA250 core].
+
+// May 2007.
+//
+// AES_set_[en|de]crypt_key is added.
+
+// July 2010.
+//
+// Rescheduling for dual-issue pipeline resulted in 12% improvement on
+// Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
+
+// February 2011.
+//
+// Profiler-assisted and platform-specific optimization resulted in 16%
+// improvement on Cortex A8 core and ~21.5 cycles per byte.
+ +#include "textflag.h" + +#define s0 R0 +#define s1 R1 +#define s2 R2 +#define s3 R3 +#define t1 R4 +#define t2 R5 +#define t3 R6 +#define i1 R7 +#define i2 R8 +#define i3 R9 +#define mask80 i1 +#define mask1b i2 +#define mask7f i3 + +#define tbl R11 +#define key R12 +#define t4 R14 + +// #define ARM_ARCH_7 1 + +// func encryptBlockAsm(nr int, xk *uint32, dst, src *byte) +TEXT ·encryptBlockAsm(SB), NOSPLIT, $4-16 + MOVW src+12(FP), t4 + MOVW $·te0(SB), tbl + MOVW xk+4(FP), key + +#ifndef ARM_ARCH_7 + MOVBU 3(t4),s0 // load input data in endian-neutral + MOVBU 2(t4),t1 // manner... + MOVBU 1(t4),t2 + MOVBU 0(t4),t3 + ORR t1<<8,s0,s0 + MOVBU 7(t4),s1 + ORR t2<<16,s0,s0 + MOVBU 6(t4),t1 + ORR t3<<24,s0,s0 + MOVBU 5(t4),t2 + MOVBU 4(t4),t3 + ORR t1<<8,s1,s1 + MOVBU 11(t4),s2 + ORR t2<<16,s1,s1 + MOVBU 10(t4),t1 + ORR t3<<24,s1,s1 + MOVBU 9(t4),t2 + MOVBU 8(t4),t3 + ORR t1<<8,s2,s2 + MOVBU 15(t4),s3 + ORR t2<<16,s2,s2 + MOVBU 14(t4),t1 + ORR t3<<24,s2,s2 + MOVBU 13(t4),t2 + MOVBU 12(t4),t3 + ORR t1<<8,s3,s3 + ORR t2<<16,s3,s3 + ORR t3<<24,s3,s3 +#else + MOVW 0(t4),s0 + MOVW 4(t4),s1 + MOVW 8(t4),s2 + MOVW 12(t4),s3 + REV s0,s0 + REV s1,s1 + REV s2,s2 + REV s3,s3 +#endif + + MOVM.IA.W (key),[t1-i1] + EOR t1,s0,s0 + EOR t2,s1,s1 + EOR t3,s2,s2 + EOR i1,s3,s3 + MOVW $255,t4 + + AND s0,t4,i1 + AND s0>>8,t4,i2 + AND s0>>16,t4,i3 + MOVW s0>>24,s0 +enc_loop: + MOVW i1<<2(tbl),t1 // Te3[s0>>0] + AND s1>>16,t4,i1 // i0 + MOVW i2<<2(tbl),t2 // Te2[s0>>8] + AND s1,t4,i2 + MOVW i3<<2(tbl),t3 // Te1[s0>>16] + AND s1>>8,t4,i3 + MOVW s0<<2(tbl),s0 // Te0[s0>>24] + MOVW s1>>24,s1 + + MOVW i1<<2(tbl),i1 // Te1[s1>>16] + MOVW i2<<2(tbl),i2 // Te3[s1>>0] + MOVW i3<<2(tbl),i3 // Te2[s1>>8] + EOR i1@>8,s0,s0 + MOVW s1<<2(tbl),s1 // Te0[s1>>24] + AND s2>>8,t4,i1 // i0 + EOR i2@>8,t2,t2 + AND s2>>16,t4,i2 // i1 + EOR i3@>8,t3,t3 + AND s2,t4,i3 + MOVW i1<<2(tbl),i1 // Te2[s2>>8] + EOR t1@>24,s1,s1 + MOVW i2<<2(tbl),i2 // Te1[s2>>16] + MOVW s2>>24,s2 + + MOVW i3<<2(tbl),i3 // Te3[s2>>0] + EOR 
i1@>16,s0,s0 + MOVW s2<<2(tbl),s2 // Te0[s2>>24] + AND s3,t4,i1 // i0 + EOR i2@>8,s1,s1 + AND s3>>8,t4,i2 // i1 + EOR i3@>16,t3,t3 + AND s3>>16,t4,i3 // i2 + MOVW i1<<2(tbl),i1 // Te3[s3>>0] + EOR t2@>16,s2,s2 + MOVW i2<<2(tbl),i2 // Te2[s3>>8] + MOVW s3>>24,s3 + + MOVW i3<<2(tbl),i3 // Te1[s3>>16] + EOR i1@>24,s0,s0 + MOVW.P 16(key),i1 + EOR i2@>16,s1,s1 + MOVW s3<<2(tbl),s3 // Te0[s3>>24] + EOR i3@>8,s2,s2 + MOVW -12(key),t1 + EOR t3@>8,s3,s3 + + MOVW -8(key),t2 + EOR i1,s0,s0 + MOVW -4(key),t3 + AND s0,t4,i1 + EOR t1,s1,s1 + MOVW nr+0(FP), t1 + AND s0>>8,t4,i2 + EOR t2,s2,s2 + AND s0>>16,t4,i3 + EOR t3,s3,s3 + SUB.S $1,t1,t1 + MOVW s0>>24,s0 + + MOVW t1, nr+0(FP) + BGT enc_loop + + ADD $2,tbl,tbl + + MOVBU i1<<2(tbl),t1 // Te4[s0>>0] + AND s1>>16,t4,i1 // i0 + MOVBU i2<<2(tbl),t2 // Te4[s0>>8] + AND s1,t4,i2 + MOVBU i3<<2(tbl),t3 // Te4[s0>>16] + AND s1>>8,t4,i3 + MOVBU s0<<2(tbl),s0 // Te4[s0>>24] + MOVW s1>>24,s1 + + MOVBU i1<<2(tbl),i1 // Te4[s1>>16] + MOVBU i2<<2(tbl),i2 // Te4[s1>>0] + MOVBU i3<<2(tbl),i3 // Te4[s1>>8] + EOR s0<<8,i1,s0 + MOVBU s1<<2(tbl),s1 // Te4[s1>>24] + AND s2>>8,t4,i1 // i0 + EOR t2<<8,i2,t2 + AND s2>>16,t4,i2 // i1 + EOR t3<<8,i3,t3 + AND s2,t4,i3 + MOVBU i1<<2(tbl),i1 // Te4[s2>>8] + EOR s1<<24,t1,s1 + MOVBU i2<<2(tbl),i2 // Te4[s2>>16] + MOVW s2>>24,s2 + + MOVBU i3<<2(tbl),i3 // Te4[s2>>0] + EOR s0<<8,i1,s0 + MOVBU s2<<2(tbl),s2 // Te4[s2>>24] + AND s3,t4,i1 // i0 + EOR i2<<16,s1,s1 + AND s3>>8,t4,i2 // i1 + EOR t3<<8,i3,t3 + AND s3>>16,t4,i3 // i2 + MOVBU i1<<2(tbl),i1 // Te4[s3>>0] + EOR s2<<24,t2,s2 + MOVBU i2<<2(tbl),i2 // Te4[s3>>8] + MOVW s3>>24,s3 + + MOVBU i3<<2(tbl),i3 // Te4[s3>>16] + EOR s0<<8,i1,s0 + MOVW 0(key),i1 + MOVBU s3<<2(tbl),s3 // Te4[s3>>24] + EOR i2<<8,s1,s1 + MOVW 4(key),t1 + EOR i3<<16,s2,s2 + MOVW 8(key),t2 + EOR s3<<24,t3,s3 + MOVW 12(key),t3 + + EOR i1,s0,s0 + EOR t1,s1,s1 + EOR t2,s2,s2 + EOR t3,s3,s3 + + MOVW dst+8(FP), t4 + +#ifdef ARM_ARCH_7 + REV s0,s0 + REV s1,s1 + REV s2,s2 + REV s3,s3 + MOVW 
s0,0(t4) + MOVW s1,4(t4) + MOVW s2,8(t4) + MOVW s3,12(t4) +#else + MOVW s0>>24,t1 // write output in endian-neutral + MOVW s0>>16,t2 // manner... + MOVW s0>>8,t3 + MOVBU t1,0(t4) + MOVBU t2,1(t4) + MOVW s1>>24,t1 + MOVBU t3,2(t4) + MOVW s1>>16,t2 + MOVBU s0,3(t4) + MOVW s1>>8,t3 + MOVBU t1,4(t4) + MOVBU t2,5(t4) + MOVW s2>>24,t1 + MOVBU t3,6(t4) + MOVW s2>>16,t2 + MOVBU s1,7(t4) + MOVW s2>>8,t3 + MOVBU t1,8(t4) + MOVBU t2,9(t4) + MOVW s3>>24,t1 + MOVBU t3,10(t4) + MOVW s3>>16,t2 + MOVBU s2,11(t4) + MOVW s3>>8,t3 + MOVBU t1,12(t4) + MOVBU t2,13(t4) + MOVBU t3,14(t4) + MOVBU s3,15(t4) +#endif + RET + +// func expandKeyEncAsm(nr int, userKey *byte, enc *uint32) +TEXT ·expandKeyEncAsm(SB), NOSPLIT, $4-12 + MOVW userKey+4(FP), t4 // inp + MOVW nr+0(FP), tbl // rounds + MOVW enc+8(FP), key + +#ifndef ARM_ARCH_7 + MOVBU 3(t4),s0 // load input data in endian-neutral + MOVBU 2(t4),t1 // manner... + MOVBU 1(t4),t2 + MOVBU 0(t4),t3 + ORR t1<<8,s0,s0 + MOVBU 7(t4),s1 + ORR t2<<16,s0,s0 + MOVBU 6(t4),t1 + ORR t3<<24,s0,s0 + MOVBU 5(t4),t2 + MOVBU 4(t4),t3 + ORR t1<<8,s1,s1 + MOVBU 11(t4),s2 + ORR t2<<16,s1,s1 + MOVBU 10(t4),t1 + ORR t3<<24,s1,s1 + MOVBU 9(t4),t2 + MOVBU 8(t4),t3 + ORR t1<<8,s2,s2 + MOVBU 15(t4),s3 + ORR t2<<16,s2,s2 + MOVBU 14(t4),t1 + ORR t3<<24,s2,s2 + MOVBU 13(t4),t2 + MOVBU 12(t4),t3 + ORR t1<<8,s3,s3 + MOVW.P s0,16(key) + ORR t2<<16,s3,s3 + MOVW s1,-12(key) + ORR t3<<24,s3,s3 + MOVW s2,-8(key) + MOVW s3,-4(key) +#else + MOVW 0(t4),s0 + MOVW 4(t4),s1 + MOVW 8(t4),s2 + MOVW 12(t4),s3 + REV s0,s0 + REV s1,s1 + REV s2,s2 + REV s3,s3 + MOVW.P s0,16(key) + MOVW s1,-12(key) + MOVW s2,-8(key) + MOVW s3,-4(key) +#endif + + TEQ $10,tbl + BNE ek_not128 + MOVW $·rcon(SB), t3 + MOVW $·sbox0(SB), tbl // Te4 + MOVW $255,t4 + +ek_128_loop: + AND s3>>24,t4,t2 + AND s3>>16,t4,i1 + MOVBU t2<<0(tbl),t2 + AND s3>>8,t4,i2 + MOVBU i1<<0(tbl),i1 + AND s3,t4,i3 + MOVBU i2<<0(tbl),i2 + ORR i1<<24,t2,t2 + MOVBU i3<<0(tbl),i3 + ORR i2<<16,t2,t2 + MOVW.P 4(t3),t1 // rcon[i++] + ORR 
i3<<8,t2,t2 + EOR t1,t2,t2 + MOVW nr+0(FP), t1 + EOR t2,s0,s0 // rk[4]=rk[0]^... + EOR s0,s1,s1 // rk[5]=rk[1]^rk[4] + MOVW.P s0,16(key) + EOR s1,s2,s2 // rk[6]=rk[2]^rk[5] + MOVW s1,-12(key) + EOR s2,s3,s3 // rk[7]=rk[3]^rk[6] + SUB.S $1,t1,t1 + MOVW s2,-8(key) + MOVW t1, nr+0(FP) + MOVW s3,-4(key) + BNE ek_128_loop + SUB $176,key,R2 + B ek_done + +ek_not128: +#ifndef ARM_ARCH_7 + MOVBU 19(t4),i2 + MOVBU 18(t4),t1 + MOVBU 17(t4),t2 + MOVBU 16(t4),t3 + ORR t1<<8,i2,i2 + MOVBU 23(t4),i3 + ORR t2<<16,i2,i2 + MOVBU 22(t4),t1 + ORR t3<<24,i2,i2 + MOVBU 21(t4),t2 + MOVBU 20(t4),t3 + ORR t1<<8,i3,i3 + ORR t2<<16,i3,i3 + MOVW.P i2,8(key) + ORR t3<<24,i3,i3 + MOVW i3,-4(key) +#else + MOVW 16(t4),i2 + MOVW 20(t4),i3 + REV i2,i2 + REV i3,i3 + MOVW.P i2,8(key) + MOVW i3,-4(key) +#endif + + TEQ $12,tbl + BNE ek_not192 + MOVW $·sbox0(SB), tbl // Te4 + MOVW $·rcon(SB), t3 + MOVW $8,t1 + MOVW $255,t4 + MOVW t1, nr+0(FP) + +ek_192_loop: + AND i3>>24,t4,t2 + AND i3>>16,t4,i1 + MOVBU t2<<0(tbl),t2 + AND i3>>8,t4,i2 + MOVBU i1<<0(tbl),i1 + AND i3,t4,i3 + MOVBU i2<<0(tbl),i2 + ORR i1<<24,t2,t2 + MOVBU i3<<0(tbl),i3 + ORR i2<<16,t2,t2 + MOVW.P 4(t3),t1 // rcon[i++] + ORR i3<<8,t2,t2 + EOR t1,t2,i3 + MOVW nr+0(FP), t1 + EOR i3,s0,s0 // rk[6]=rk[0]^... 
+ EOR s0,s1,s1 // rk[7]=rk[1]^rk[6] + MOVW.P s0,24(key) + EOR s1,s2,s2 // rk[8]=rk[2]^rk[7] + MOVW s1,-20(key) + EOR s2,s3,s3 // rk[9]=rk[3]^rk[8] + SUB.S $1,t1,t1 + MOVW s2,-16(key) + MOVW t1, nr+0(FP) + MOVW s3,-12(key) + BEQ ek_done + + MOVW -32(key),i1 + MOVW -28(key),i2 + EOR s3,i1,i1 // rk[10]=rk[4]^rk[9] + EOR i1,i2,i3 // rk[11]=rk[5]^rk[10] + MOVW i1,-8(key) + MOVW i3,-4(key) + B ek_192_loop + +ek_not192: +#ifndef ARM_ARCH_7 + MOVBU 27(t4),i2 + MOVBU 26(t4),t1 + MOVBU 25(t4),t2 + MOVBU 24(t4),t3 + ORR t1<<8,i2,i2 + MOVBU 31(t4),i3 + ORR t2<<16,i2,i2 + MOVBU 30(t4),t1 + ORR t3<<24,i2,i2 + MOVBU 29(t4),t2 + MOVBU 28(t4),t3 + ORR t1<<8,i3,i3 + ORR t2<<16,i3,i3 + MOVW.P i2,8(key) + ORR t3<<24,i3,i3 + MOVW i3,-4(key) +#else + MOVW 24(t4),i2 + MOVW 28(t4),i3 + REV i2,i2 + REV i3,i3 + MOVW.P i2,8(key) + MOVW i3,-4(key) +#endif + + MOVW $·sbox0(SB), tbl // Te4 + MOVW $·rcon(SB), t3 + MOVW $7,t1 + MOVW $255,t4 + MOVW t1, nr+0(FP) + +ek_256_loop: + AND i3>>24,t4,t2 + AND i3>>16,t4,i1 + MOVBU t2<<0(tbl),t2 + AND i3>>8,t4,i2 + MOVBU i1<<0(tbl),i1 + AND i3,t4,i3 + MOVBU i2<<0(tbl),i2 + ORR i1<<24,t2,t2 + MOVBU i3<<0(tbl),i3 + ORR i2<<16,t2,t2 + MOVW.P 4(t3),t1 // rcon[i++] + ORR i3<<8,t2,t2 + EOR t1,t2,i3 + MOVW nr+0(FP), t1 + EOR i3,s0,s0 // rk[8]=rk[0]^... + EOR s0,s1,s1 // rk[9]=rk[1]^rk[8] + MOVW.P s0,32(key) + EOR s1,s2,s2 // rk[10]=rk[2]^rk[9] + MOVW s1,-28(key) + EOR s2,s3,s3 // rk[11]=rk[3]^rk[10] + SUB.S $1,t1,t1 + MOVW s2,-24(key) + MOVW t1, nr+0(FP) + MOVW s3,-20(key) + BEQ ek_done + + AND s3,t4,t2 + AND s3>>8,t4,i1 + MOVBU t2<<0(tbl),t2 + AND s3>>16,t4,i2 + MOVBU i1<<0(tbl),i1 + AND s3>>24,t4,i3 + MOVBU i2<<0(tbl),i2 + ORR i1<<8,t2,t2 + MOVBU i3<<0(tbl),i3 + ORR i2<<16,t2,t2 + MOVW -48(key),t1 + ORR i3<<24,t2,t2 + + MOVW -44(key),i1 + MOVW -40(key),i2 + EOR t2,t1,t1 // rk[12]=rk[4]^... 
+ MOVW -36(key),i3 + EOR t1,i1,i1 // rk[13]=rk[5]^rk[12] + MOVW t1,-16(key) + EOR i1,i2,i2 // rk[14]=rk[6]^rk[13] + MOVW i1,-12(key) + EOR i2,i3,i3 // rk[15]=rk[7]^rk[14] + MOVW i2,-8(key) + MOVW i3,-4(key) + B ek_256_loop + +ek_done: + RET + +// func expandKeyDecAsm(nr int, enc *uint32, dec *uint32) +TEXT ·expandKeyDecAsm(SB), NOSPLIT, $4-12 + MOVW nr+0(FP), t4 // rounds + MOVW enc+4(FP), i1 // input + ADD t4<<4,i1,i2 + MOVW dec+8(FP), key // output + ADD t4<<4,key,tbl + +dk_inv: MOVW.P 16(i1),s0 + MOVW -12(i1),s1 + MOVW -8(i1),s2 + MOVW -4(i1),s3 + MOVW.P -16(i2),t1 + MOVW 16+4(i2),t2 + MOVW 16+8(i2),t3 + MOVW 16+12(i2),i3 + MOVW.P s0,-16(tbl) + MOVW s1,16+4(tbl) + MOVW s2,16+8(tbl) + MOVW s3,16+12(tbl) + MOVW.P t1,16(key) + MOVW t2,-12(key) + MOVW t3,-8(key) + MOVW i3,-4(key) + TEQ i2,i1 + BNE dk_inv + + MOVW (i1),s0 + MOVW 4(i1),s1 + MOVW 8(i1),s2 + MOVW 12(i1),s3 + MOVW s0,(key) + MOVW s1,4(key) + MOVW s2,8(key) + MOVW s3,12(key) + SUB t4<<3,key,key + + MOVW.W 16(key),s0 // prefetch tp1 + MOVW $0x80,mask80 + MOVW $0x1b,mask1b + ORR $0x8000,mask80,mask80 + ORR $0x1b00,mask1b,mask1b + ORR mask80<<16,mask80,mask80 + ORR mask1b<<16,mask1b,mask1b + SUB $1,t4,t4 + MVN mask80,mask7f + MOVW t4<<2,t4 // (rounds-1)*4 + +dk_mix: AND mask80,s0,t1 + AND mask7f,s0,s1 + SUB t1>>7,t1,t1 + AND mask1b,t1,t1 + EOR s1<<1,t1,s1 // tp2 + + AND mask80,s1,t1 + AND mask7f,s1,s2 + SUB t1>>7,t1,t1 + AND mask1b,t1,t1 + EOR s2<<1,t1,s2 // tp4 + + AND mask80,s2,t1 + AND mask7f,s2,s3 + SUB t1>>7,t1,t1 + AND mask1b,t1,t1 + EOR s3<<1,t1,s3 // tp8 + + EOR s2,s1,t1 + EOR s3,s0,t2 // tp9 + EOR s3,t1,t1 // tpe + EOR s1@>24,t1,t1 + EOR t2@>24,t1,t1 // ^= ROTATE(tpb=tp9^tp2,8) + EOR s2@>16,t1,t1 + EOR t2@>16,t1,t1 // ^= ROTATE(tpd=tp9^tp4,16) + EOR t2@>8,t1,t1 // ^= ROTATE(tp9,24) + + MOVW 4(key),s0 // prefetch tp1 + MOVW.P t1,4(key) + SUB.S $1,t4,t4 + BNE dk_mix + + RET + +// func decryptBlockAsm(nr int, xk *uint32, dst, src *byte) +TEXT ·decryptBlockAsm(SB), NOSPLIT, $4-16 + MOVW src+12(FP), t4 + 
MOVW $·td0(SB), tbl + MOVW xk+4(FP), key + +#ifndef ARM_ARCH_7 + MOVBU 3(t4),s0 // load input data in endian-neutral + MOVBU 2(t4),t1 // manner... + MOVBU 1(t4),t2 + MOVBU 0(t4),t3 + ORR t1<<8,s0,s0 + MOVBU 7(t4),s1 + ORR t2<<16,s0,s0 + MOVBU 6(t4),t1 + ORR t3<<24,s0,s0 + MOVBU 5(t4),t2 + MOVBU 4(t4),t3 + ORR t1<<8,s1,s1 + MOVBU 11(t4),s2 + ORR t2<<16,s1,s1 + MOVBU 10(t4),t1 + ORR t3<<24,s1,s1 + MOVBU 9(t4),t2 + MOVBU 8(t4),t3 + ORR t1<<8,s2,s2 + MOVBU 15(t4),s3 + ORR t2<<16,s2,s2 + MOVBU 14(t4),t1 + ORR t3<<24,s2,s2 + MOVBU 13(t4),t2 + MOVBU 12(t4),t3 + ORR t1<<8,s3,s3 + ORR t2<<16,s3,s3 + ORR t3<<24,s3,s3 +#else + MOVW 0(t4),s0 + MOVW 4(t4),s1 + MOVW 8(t4),s2 + MOVW 12(t4),s3 + REV s0,s0 + REV s1,s1 + REV s2,s2 + REV s3,s3 +#endif + + MOVM.IA.W (key),[t1-i1] + EOR t1,s0,s0 + MOVW 240-16(key),t4 + EOR t2,s1,s1 + EOR t3,s2,s2 + EOR i1,s3,s3 + SUB $1,t4,t4 + MOVW $255,t4 + + AND s0>>16,t4,i1 + AND s0>>8,t4,i2 + AND s0,t4,i3 + MOVW s0>>24,s0 +dec_loop: + MOVW i1<<2(tbl),t1 // Td1[s0>>16] + AND s1,t4,i1 // i0 + MOVW i2<<2(tbl),t2 // Td2[s0>>8] + AND s1>>16,t4,i2 + MOVW i3<<2(tbl),t3 // Td3[s0>>0] + AND s1>>8,t4,i3 + MOVW s0<<2(tbl),s0 // Td0[s0>>24] + MOVW s1>>24,s1 + + MOVW i1<<2(tbl),i1 // Td3[s1>>0] + MOVW i2<<2(tbl),i2 // Td1[s1>>16] + MOVW i3<<2(tbl),i3 // Td2[s1>>8] + EOR i1@>24,s0,s0 + MOVW s1<<2(tbl),s1 // Td0[s1>>24] + AND s2>>8,t4,i1 // i0 + EOR t2@>8,i2,t2 + AND s2,t4,i2 // i1 + EOR t3@>8,i3,t3 + AND s2>>16,t4,i3 + MOVW i1<<2(tbl),i1 // Td2[s2>>8] + EOR t1@>8,s1,s1 + MOVW i2<<2(tbl),i2 // Td3[s2>>0] + MOVW s2>>24,s2 + + MOVW i3<<2(tbl),i3 // Td1[s2>>16] + EOR i1@>16,s0,s0 + MOVW s2<<2(tbl),s2 // Td0[s2>>24] + AND s3>>16,t4,i1 // i0 + EOR i2@>24,s1,s1 + AND s3>>8,t4,i2 // i1 + EOR t3@>8,i3,t3 + AND s3,t4,i3 // i2 + MOVW i1<<2(tbl),i1 // Td1[s3>>16] + EOR t2@>8,s2,s2 + MOVW i2<<2(tbl),i2 // Td2[s3>>8] + MOVW s3>>24,s3 + + MOVW i3<<2(tbl),i3 // Td3[s3>>0] + EOR i1@>8,s0,s0 + MOVW.P 16(key),i1 + EOR i2@>16,s1,s1 + MOVW s3<<2(tbl),s3 // Td0[s3>>24] + EOR 
i3@>24,s2,s2 + + MOVW -12(key),t1 + EOR i1,s0,s0 + MOVW -8(key),t2 + EOR t3@>8,s3,s3 + MOVW -4(key),t3 + AND s0>>16,t4,i1 + EOR t1,s1,s1 + MOVW nr+0(FP), t1 + AND s0>>8,t4,i2 + EOR t2,s2,s2 + AND s0,t4,i3 + EOR t3,s3,s3 + SUB.S $1,t1,t1 + MOVW s0>>24,s0 + + MOVW t1, nr+0(FP) + BGT dec_loop + + MOVW $·sbox1(SB),tbl + + MOVW 0(tbl),t2 // prefetch Td4 + MOVW 32(tbl),t3 + MOVW 64(tbl),t1 + MOVW 96(tbl),t2 + MOVW 128(tbl),t3 + MOVW 160(tbl),t1 + MOVW 192(tbl),t2 + MOVW 224(tbl),t3 + + MOVBU s0<<0(tbl),s0 // Td4[s0>>24] + MOVBU i1<<0(tbl),t1 // Td4[s0>>16] + AND s1,t4,i1 // i0 + MOVBU i2<<0(tbl),t2 // Td4[s0>>8] + AND s1>>16,t4,i2 + MOVBU i3<<0(tbl),t3 // Td4[s0>>0] + AND s1>>8,t4,i3 + + ADD s1>>24,tbl,s1 + MOVBU i1<<0(tbl),i1 // Td4[s1>>0] + MOVBU (s1),s1 // Td4[s1>>24] + MOVBU i2<<0(tbl),i2 // Td4[s1>>16] + EOR s0<<24,i1,s0 + MOVBU i3<<0(tbl),i3 // Td4[s1>>8] + EOR s1<<8,t1,s1 + AND s2>>8,t4,i1 // i0 + EOR i2<<8,t2,t2 + AND s2,t4,i2 // i1 + MOVBU i1<<0(tbl),i1 // Td4[s2>>8] + EOR i3<<8,t3,t3 + MOVBU i2<<0(tbl),i2 // Td4[s2>>0] + AND s2>>16,t4,i3 + + ADD s2>>24,tbl,s2 + MOVBU (s2),s2 // Td4[s2>>24] + EOR i1<<8,s0,s0 + MOVBU i3<<0(tbl),i3 // Td4[s2>>16] + EOR s1<<16,i2,s1 + AND s3>>16,t4,i1 // i0 + EOR s2<<16,t2,s2 + AND s3>>8,t4,i2 // i1 + MOVBU i1<<0(tbl),i1 // Td4[s3>>16] + EOR i3<<16,t3,t3 + MOVBU i2<<0(tbl),i2 // Td4[s3>>8] + AND s3,t4,i3 // i2 + + ADD s3>>24,tbl,s3 + MOVBU i3<<0(tbl),i3 // Td4[s3>>0] + MOVBU (s3),s3 // Td4[s3>>24] + EOR i1<<16,s0,s0 + MOVW 0(key),i1 + EOR i2<<8,s1,s1 + MOVW 4(key),t1 + EOR s2<<8,i3,s2 + MOVW 8(key),t2 + EOR s3<<24,t3,s3 + MOVW 12(key),t3 + + EOR i1,s0,s0 + EOR t1,s1,s1 + EOR t2,s2,s2 + EOR t3,s3,s3 + + MOVW dst+8(FP), t4 + +#ifdef ARM_ARCH_7 + REV s0,s0 + REV s1,s1 + REV s2,s2 + REV s3,s3 + MOVW s0,0(t4) + MOVW s1,4(t4) + MOVW s2,8(t4) + MOVW s3,12(t4) +#else + MOVW s0>>24,t1 // write output in endian-neutral + MOVW s0>>16,t2 // manner... 
+ MOVW s0>>8,t3 + MOVBU t1,0(t4) + MOVBU t2,1(t4) + MOVW s1>>24,t1 + MOVBU t3,2(t4) + MOVW s1>>16,t2 + MOVBU s0,3(t4) + MOVW s1>>8,t3 + MOVBU t1,4(t4) + MOVBU t2,5(t4) + MOVW s2>>24,t1 + MOVBU t3,6(t4) + MOVW s2>>16,t2 + MOVBU s1,7(t4) + MOVW s2>>8,t3 + MOVBU t1,8(t4) + MOVBU t2,9(t4) + MOVW s3>>24,t1 + MOVBU t3,10(t4) + MOVW s3>>16,t2 + MOVBU s2,11(t4) + MOVW s3>>8,t3 + MOVBU t1,12(t4) + MOVBU t2,13(t4) + MOVBU t3,14(t4) + MOVBU s3,15(t4) +#endif + RET diff --git a/src/crypto/aes/block.go b/src/crypto/aes/block.go index 41ea9cf..e525081 100644 --- a/src/crypto/aes/block.go +++ b/src/crypto/aes/block.go @@ -36,6 +36,8 @@ package aes +import "fmt" + // Encrypt one block from src into dst, using the expanded key xk. func encryptBlockGo(xk []uint32, dst, src []byte) { var s0, s1, s2, s3, t0, t1, t2, t3 uint32 @@ -56,6 +58,7 @@ nr := len(xk)/4 - 2 // - 2: one above, one more below k := 4 for r := 0; r < nr; r++ { + fmt.Printf("0x%08X 0x%08X 0x%08X 0x%08X\n", s0, s1, s2, s3) t0 = xk[k+0] ^ te0[uint8(s0>>24)] ^ te1[uint8(s1>>16)] ^ te2[uint8(s2>>8)] ^ te3[uint8(s3)] t1 = xk[k+1] ^ te0[uint8(s1>>24)] ^ te1[uint8(s2>>16)] ^ te2[uint8(s3>>8)] ^ te3[uint8(s0)] t2 = xk[k+2] ^ te0[uint8(s2>>24)] ^ te1[uint8(s3>>16)] ^ te2[uint8(s0>>8)] ^ te3[uint8(s1)] @@ -63,6 +66,7 @@ k += 4 s0, s1, s2, s3 = t0, t1, t2, t3 } + fmt.Printf("0x%08X 0x%08X 0x%08X 0x%08X\n", s0, s1, s2, s3) // Last round uses s-box directly and XORs to produce output. s0 = uint32(sbox0[t0>>24])<<24 | uint32(sbox0[t1>>16&0xff])<<16 | uint32(sbox0[t2>>8&0xff])<<8 | uint32(sbox0[t3&0xff]) diff --git a/src/crypto/aes/cipher_arm.go b/src/crypto/aes/cipher_arm.go new file mode 100644 index 0000000..d09f404 --- /dev/null +++ b/src/crypto/aes/cipher_arm.go @@ -0,0 +1,99 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package aes + +import ( + "crypto/cipher" + "fmt" +) + +// defined in asm_arm.s +func encryptBlockAsm(nr int, xk *uint32, dst, src *byte) +func decryptBlockAsm(nr int, xk *uint32, dst, src *byte) +func expandKeyEncAsm(nr int, userKey *byte, enc *uint32) +func expandKeyDecAsm(nr int, enc *uint32, dec *uint32) + +// func expandKeyAsm(nr int, key *byte, enc *uint32, dec *uint32) + +type aesCipherAsm struct { + aesCipher +} + +func newCipher(key []byte) (cipher.Block, error) { + n := len(key) + 28 + c := aesCipherAsm{aesCipher{make([]uint32, n), make([]uint32, n)}} + rounds := 10 + switch len(key) { + case 128 / 8: + rounds = 10 + case 192 / 8: + rounds = 12 + case 256 / 8: + rounds = 14 + } + //expandKeyAsm(rounds, &key[0], &c.enc[0], &c.dec[0]) + //expandKeyAsm(key, c.enc, c.dec) + expandKeyEncAsm(rounds, &key[0], &c.enc[0]) + expandKeyDecAsm(rounds, &c.enc[0], &c.dec[0]) + + // FIXME + // if hasGCMAsm() { + // return &aesCipherGCM{c}, nil + // } + + return &c, nil +} + +func (c *aesCipherAsm) BlockSize() int { return BlockSize } + +func (c *aesCipherAsm) Encrypt(dst, src []byte) { + if len(src) < BlockSize { + panic("crypto/aes: input not full block") + } + if len(dst) < BlockSize { + panic("crypto/aes: output not full block") + } + encryptBlockAsm(len(c.enc)/4-2, &c.enc[0], &dst[0], &src[0]) +} + +func (c *aesCipherAsm) Decrypt(dst, src []byte) { + if len(src) < BlockSize { + panic("crypto/aes: input not full block") + } + if len(dst) < BlockSize { + panic("crypto/aes: output not full block") + } + decryptBlockAsm(len(c.dec)/4-2, &c.dec[0], &dst[0], &src[0]) +} + +// expandKey is used by BenchmarkExpand to ensure that the asm implementation +// of key expansion is used for the benchmark when it is available. 
+func expandKey(key []byte, enc, dec []uint32) { + rounds := 10 // rounds needed for AES128 + switch len(key) { + case 192 / 8: + rounds = 12 + case 256 / 8: + rounds = 14 + } + //expandKeyAsm(rounds, &key[0], &enc[0], &dec[0]) + expandKeyEncAsm(rounds, &key[0], &enc[0]) + if dec != nil { + expandKeyDecAsm(rounds, &enc[0], &dec[0]) + } +} + +// for debugging +func printUint32(x uint32) { + fmt.Printf("0x%08X\n", x) +} + +// rcon table used by asm_arm.s +var rcon = [16]uint32{ + 0x01000000, 0x02000000, 0x04000000, 0x08000000, + 0x10000000, 0x20000000, 0x40000000, 0x80000000, + 0x1B000000, 0x36000000, 0, 0, + 0, 0, 0, 0, +} diff --git a/src/crypto/aes/cipher_generic.go b/src/crypto/aes/cipher_generic.go index ca74aa8..411adc8 100644 --- a/src/crypto/aes/cipher_generic.go +++ b/src/crypto/aes/cipher_generic.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !amd64,!s390x,!ppc64le +// +build !amd64,!s390x,!ppc64le,!arm package aes
To view, visit change 38366. To unsubscribe, visit settings.
Brad Fitzpatrick posted comments on this change.
Patch set 1:
RELNOTE=yes
Brad Fitzpatrick posted comments on this change.
Patch set 1: Run-TryBot+1
Gobot Gobot posted comments on this change.
Patch set 1:
TryBots beginning. Status page: http://farmer.golang.org/try?commit=e8a35f90
Build is still in progress... This change failed on darwin-amd64-10_11: See https://storage.googleapis.com/go-build-log/e8a35f90/darwin-amd64-10_11_4bb365ef.log
Consult https://build.golang.org/ to see whether it's a new failure. Other builds still in progress; subsequent failure notices suppressed until final report.
Gobot Gobot posted comments on this change.
Patch set 1: TryBot-Result-1
12 of 17 TryBots failed:
Failed on darwin-amd64-10_11: https://storage.googleapis.com/go-build-log/e8a35f90/darwin-amd64-10_11_4bb365ef.log
Failed on nacl-amd64p32: https://storage.googleapis.com/go-build-log/e8a35f90/nacl-amd64p32_6826e765.log
Failed on nacl-386: https://storage.googleapis.com/go-build-log/e8a35f90/nacl-386_cddf2da9.log
Failed on linux-amd64: https://storage.googleapis.com/go-build-log/e8a35f90/linux-amd64_f482b80e.log
Failed on linux-386: https://storage.googleapis.com/go-build-log/e8a35f90/linux-386_6bb42d4f.log
Failed on freebsd-amd64-gce101: https://storage.googleapis.com/go-build-log/e8a35f90/freebsd-amd64-gce101_f8b37538.log
Failed on windows-386-gce: https://storage.googleapis.com/go-build-log/e8a35f90/windows-386-gce_5ab97d33.log
Failed on windows-amd64-gce: https://storage.googleapis.com/go-build-log/e8a35f90/windows-amd64-gce_745a8d17.log
Failed on linux-amd64-race: https://storage.googleapis.com/go-build-log/e8a35f90/linux-amd64-race_26b5bb52.log
Failed on openbsd-amd64-60: https://storage.googleapis.com/go-build-log/e8a35f90/openbsd-amd64-60_3b8ccac0.log
Failed on misc-compile: https://storage.googleapis.com/go-build-log/e8a35f90/misc-compile_dd1a0025.log
Failed on linux-arm: https://storage.googleapis.com/go-build-log/e8a35f90/linux-arm_87fadba3.log
Consult https://build.golang.org/ to see whether they are new failures.
Nick Craig-Wood uploaded patch set #2 to this change.
crypto/aes: ARM assembly versions of encrypt, decrypt and expandKey

ARM assembly for AES crypto adapted from OpenSSL, giving an
encrypt/decrypt speedup of 1.6-2.7x and a key scheduling speedup of
2.3-4.5x.

Raspberry Pi 3 BCM2709 ARMv7 Processor rev 5 (v7l)

name       old time/op    new time/op    delta
Encrypt-4  3.12µs ± 1%    1.13µs ± 2%    -63.86%  (p=0.000 n=20+20)
Decrypt-4  3.10µs ± 1%    1.21µs ± 2%    -60.91%  (p=0.000 n=20+20)
Expand-4   11.3µs ± 1%    2.5µs ± 1%     -78.11%  (p=0.000 n=16+16)

name       old speed       new speed       delta
Encrypt-4  5.13MB/s ± 2%   14.18MB/s ± 2%  +176.58%  (p=0.000 n=20+20)
Decrypt-4  5.16MB/s ± 1%   13.19MB/s ± 2%  +155.78%  (p=0.000 n=20+20)

Chromebook Samsung Exynos5 ARMv7 Processor rev 4 (v7l)

name       old time/op    new time/op    delta
Encrypt-2  342ns ± 1%     210ns ± 1%     -38.80%  (p=0.000 n=16+20)
Decrypt-2  343ns ± 6%     209ns ± 2%     -39.16%  (p=0.000 n=17+18)
Expand-2   1.64µs ± 5%    0.70µs ± 1%    -57.29%  (p=0.000 n=17+19)

name       old speed       new speed       delta
Encrypt-2  46.7MB/s ± 1%   76.2MB/s ± 1%   +63.35%  (p=0.000 n=16+20)
Decrypt-2  46.4MB/s ± 7%   76.4MB/s ± 2%   +64.75%  (p=0.000 n=18+18)

Issue #4299

Change-Id: I13df6a87f5697de255cb9a494022dd7f7dbde8f5
---
A src/crypto/aes/asm_arm.s
A src/crypto/aes/cipher_arm.go
M src/crypto/aes/cipher_generic.go
3 files changed, 925 insertions(+), 1 deletion(-)
>8,s3,s3 + MOVW -4(key),t3 + AND s0>>16,t4,i1 + EOR t1,s1,s1 + MOVW nr+0(FP), t1 + AND s0>>8,t4,i2 + EOR t2,s2,s2 + AND s0,t4,i3 + EOR t3,s3,s3 + SUB.S $1,t1,t1 + MOVW s0>>24,s0 + + MOVW t1, nr+0(FP) + BGT dec_loop + + MOVW $·sbox1(SB),tbl + + MOVW 0(tbl),t2 // prefetch Td4 + MOVW 32(tbl),t3 + MOVW 64(tbl),t1 + MOVW 96(tbl),t2 + MOVW 128(tbl),t3 + MOVW 160(tbl),t1 + MOVW 192(tbl),t2 + MOVW 224(tbl),t3 + + MOVBU s0<<0(tbl),s0 // Td4[s0>>24] + MOVBU i1<<0(tbl),t1 // Td4[s0>>16] + AND s1,t4,i1 // i0 + MOVBU i2<<0(tbl),t2 // Td4[s0>>8] + AND s1>>16,t4,i2 + MOVBU i3<<0(tbl),t3 // Td4[s0>>0] + AND s1>>8,t4,i3 + + ADD s1>>24,tbl,s1 + MOVBU i1<<0(tbl),i1 // Td4[s1>>0] + MOVBU (s1),s1 // Td4[s1>>24] + MOVBU i2<<0(tbl),i2 // Td4[s1>>16] + EOR s0<<24,i1,s0 + MOVBU i3<<0(tbl),i3 // Td4[s1>>8] + EOR s1<<8,t1,s1 + AND s2>>8,t4,i1 // i0 + EOR i2<<8,t2,t2 + AND s2,t4,i2 // i1 + MOVBU i1<<0(tbl),i1 // Td4[s2>>8] + EOR i3<<8,t3,t3 + MOVBU i2<<0(tbl),i2 // Td4[s2>>0] + AND s2>>16,t4,i3 + + ADD s2>>24,tbl,s2 + MOVBU (s2),s2 // Td4[s2>>24] + EOR i1<<8,s0,s0 + MOVBU i3<<0(tbl),i3 // Td4[s2>>16] + EOR s1<<16,i2,s1 + AND s3>>16,t4,i1 // i0 + EOR s2<<16,t2,s2 + AND s3>>8,t4,i2 // i1 + MOVBU i1<<0(tbl),i1 // Td4[s3>>16] + EOR i3<<16,t3,t3 + MOVBU i2<<0(tbl),i2 // Td4[s3>>8] + AND s3,t4,i3 // i2 + + ADD s3>>24,tbl,s3 + MOVBU i3<<0(tbl),i3 // Td4[s3>>0] + MOVBU (s3),s3 // Td4[s3>>24] + EOR i1<<16,s0,s0 + MOVW 0(key),i1 + EOR i2<<8,s1,s1 + MOVW 4(key),t1 + EOR s2<<8,i3,s2 + MOVW 8(key),t2 + EOR s3<<24,t3,s3 + MOVW 12(key),t3 + + EOR i1,s0,s0 + EOR t1,s1,s1 + EOR t2,s2,s2 + EOR t3,s3,s3 + + MOVW dst+8(FP), t4 + +#ifdef ARM_ARCH_7 + REV s0,s0 + REV s1,s1 + REV s2,s2 + REV s3,s3 + MOVW s0,0(t4) + MOVW s1,4(t4) + MOVW s2,8(t4) + MOVW s3,12(t4) +#else + MOVW s0>>24,t1 // write output in endian-neutral + MOVW s0>>16,t2 // manner... 
+ MOVW s0>>8,t3 + MOVBU t1,0(t4) + MOVBU t2,1(t4) + MOVW s1>>24,t1 + MOVBU t3,2(t4) + MOVW s1>>16,t2 + MOVBU s0,3(t4) + MOVW s1>>8,t3 + MOVBU t1,4(t4) + MOVBU t2,5(t4) + MOVW s2>>24,t1 + MOVBU t3,6(t4) + MOVW s2>>16,t2 + MOVBU s1,7(t4) + MOVW s2>>8,t3 + MOVBU t1,8(t4) + MOVBU t2,9(t4) + MOVW s3>>24,t1 + MOVBU t3,10(t4) + MOVW s3>>16,t2 + MOVBU s2,11(t4) + MOVW s3>>8,t3 + MOVBU t1,12(t4) + MOVBU t2,13(t4) + MOVBU t3,14(t4) + MOVBU s3,15(t4) +#endif + RET diff --git a/src/crypto/aes/cipher_arm.go b/src/crypto/aes/cipher_arm.go new file mode 100644 index 0000000..a0b3823 --- /dev/null +++ b/src/crypto/aes/cipher_arm.go @@ -0,0 +1,82 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package aes + +import ( + "crypto/cipher" +) + +// defined in asm_arm.s +func encryptBlockAsm(nr int, xk *uint32, dst, src *byte) +func decryptBlockAsm(nr int, xk *uint32, dst, src *byte) +func expandKeyEncAsm(nr int, userKey *byte, enc *uint32) +func expandKeyDecAsm(nr int, enc *uint32, dec *uint32) + +type aesCipherAsm struct { + aesCipher +} + +func newCipher(key []byte) (cipher.Block, error) { + n := len(key) + 28 + c := aesCipherAsm{aesCipher{make([]uint32, n), make([]uint32, n)}} + rounds := 10 + switch len(key) { + case 128 / 8: + rounds = 10 + case 192 / 8: + rounds = 12 + case 256 / 8: + rounds = 14 + } + expandKeyEncAsm(rounds, &key[0], &c.enc[0]) + expandKeyDecAsm(rounds, &c.enc[0], &c.dec[0]) + return &c, nil +} + +func (c *aesCipherAsm) BlockSize() int { return BlockSize } + +func (c *aesCipherAsm) Encrypt(dst, src []byte) { + if len(src) < BlockSize { + panic("crypto/aes: input not full block") + } + if len(dst) < BlockSize { + panic("crypto/aes: output not full block") + } + encryptBlockAsm(len(c.enc)/4-2, &c.enc[0], &dst[0], &src[0]) +} + +func (c *aesCipherAsm) Decrypt(dst, src []byte) { + if len(src) < BlockSize { + panic("crypto/aes: input not full block") 
+ } + if len(dst) < BlockSize { + panic("crypto/aes: output not full block") + } + decryptBlockAsm(len(c.dec)/4-2, &c.dec[0], &dst[0], &src[0]) +} + +// expandKey is used by BenchmarkExpand to ensure that the asm implementation +// of key expansion is used for the benchmark when it is available. +func expandKey(key []byte, enc, dec []uint32) { + rounds := 10 // rounds needed for AES128 + switch len(key) { + case 192 / 8: + rounds = 12 + case 256 / 8: + rounds = 14 + } + expandKeyEncAsm(rounds, &key[0], &enc[0]) + if dec != nil { + expandKeyDecAsm(rounds, &enc[0], &dec[0]) + } +} + +// rcon table used by asm_arm.s +var rcon = [16]uint32{ + 0x01000000, 0x02000000, 0x04000000, 0x08000000, + 0x10000000, 0x20000000, 0x40000000, 0x80000000, + 0x1B000000, 0x36000000, 0, 0, + 0, 0, 0, 0, +} diff --git a/src/crypto/aes/cipher_generic.go b/src/crypto/aes/cipher_generic.go index ca74aa8..411adc8 100644 --- a/src/crypto/aes/cipher_generic.go +++ b/src/crypto/aes/cipher_generic.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !amd64,!s390x,!ppc64le +// +build !amd64,!s390x,!ppc64le,!arm package aes
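The magic numbers in this patch set's cipher_arm.go are all related: newCipher allocates len(key)+28 words for each schedule, the standard AES round count is that divided by 4 minus 1, and the nr passed to encryptBlockAsm/decryptBlockAsm (len/4 - 2) counts one round fewer, since the final S-box round is handled outside the asm main loop. A sketch of the arithmetic:

```go
package main

import "fmt"

// scheduleWords mirrors newCipher's make([]uint32, len(key)+28).
func scheduleWords(keyBytes int) int { return keyBytes + 28 }

// aesRounds is the standard AES round count: 10, 12 or 14.
func aesRounds(keyBytes int) int { return scheduleWords(keyBytes)/4 - 1 }

// innerRounds is the nr argument passed to the asm block functions:
// one fewer, because the last (MixColumns-free) round runs after the loop.
func innerRounds(keyBytes int) int { return scheduleWords(keyBytes)/4 - 2 }

func main() {
	for _, k := range []int{16, 24, 32} {
		fmt.Printf("AES-%d: %d words, %d rounds, nr=%d\n",
			k*8, scheduleWords(k), aesRounds(k), innerRounds(k))
	}
}
```

So for a 16-byte key the schedule is 44 words, expandKeyEncAsm is told 10 rounds, and the block functions receive nr=9.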
Brad Fitzpatrick posted comments on this change.
Patch set 2:
(1 comment)
File src/crypto/aes/cipher_arm.go:
Patch Set #2, Line 1: // Copyright 2012 The Go Authors. All rights reserved.
2017?
Nick Craig-Wood uploaded patch set #3 to this change.
crypto/aes: ARM assembly versions of encrypt, decrypt and expandKey

ARM assembly for AES crypto adapted from OpenSSL, giving an
encrypt/decrypt speedup of 1.6-2.7x and a key scheduling speedup of
2.3-4.5x.

Raspberry Pi 3 BCM2709 ARMv7 Processor rev 5 (v7l)

name       old time/op    new time/op    delta
Encrypt-4  3.12µs ± 1%    1.13µs ± 2%    -63.86%  (p=0.000 n=20+20)
Decrypt-4  3.10µs ± 1%    1.21µs ± 2%    -60.91%  (p=0.000 n=20+20)
Expand-4   11.3µs ± 1%    2.5µs ± 1%     -78.11%  (p=0.000 n=16+16)

name       old speed       new speed       delta
Encrypt-4  5.13MB/s ± 2%   14.18MB/s ± 2%  +176.58%  (p=0.000 n=20+20)
Decrypt-4  5.16MB/s ± 1%   13.19MB/s ± 2%  +155.78%  (p=0.000 n=20+20)

Chromebook Samsung Exynos5 ARMv7 Processor rev 4 (v7l)

name       old time/op    new time/op    delta
Encrypt-2  342ns ± 1%     210ns ± 1%     -38.80%  (p=0.000 n=16+20)
Decrypt-2  343ns ± 6%     209ns ± 2%     -39.16%  (p=0.000 n=17+18)
Expand-2   1.64µs ± 5%    0.70µs ± 1%    -57.29%  (p=0.000 n=17+19)

name       old speed       new speed       delta
Encrypt-2  46.7MB/s ± 1%   76.2MB/s ± 1%   +63.35%  (p=0.000 n=16+20)
Decrypt-2  46.4MB/s ± 7%   76.4MB/s ± 2%   +64.75%  (p=0.000 n=18+18)

Issue #4299

Change-Id: I13df6a87f5697de255cb9a494022dd7f7dbde8f5
---
A src/crypto/aes/asm_arm.s
A src/crypto/aes/cipher_arm.go
M src/crypto/aes/cipher_generic.go
3 files changed, 925 insertions(+), 1 deletion(-)
>8,s3,s3 + MOVW -4(key),t3 + AND s0>>16,t4,i1 + EOR t1,s1,s1 + MOVW nr+0(FP), t1 + AND s0>>8,t4,i2 + EOR t2,s2,s2 + AND s0,t4,i3 + EOR t3,s3,s3 + SUB.S $1,t1,t1 + MOVW s0>>24,s0 + + MOVW t1, nr+0(FP) + BGT dec_loop + + MOVW $·sbox1(SB),tbl + + MOVW 0(tbl),t2 // prefetch Td4 + MOVW 32(tbl),t3 + MOVW 64(tbl),t1 + MOVW 96(tbl),t2 + MOVW 128(tbl),t3 + MOVW 160(tbl),t1 + MOVW 192(tbl),t2 + MOVW 224(tbl),t3 + + MOVBU s0<<0(tbl),s0 // Td4[s0>>24] + MOVBU i1<<0(tbl),t1 // Td4[s0>>16] + AND s1,t4,i1 // i0 + MOVBU i2<<0(tbl),t2 // Td4[s0>>8] + AND s1>>16,t4,i2 + MOVBU i3<<0(tbl),t3 // Td4[s0>>0] + AND s1>>8,t4,i3 + + ADD s1>>24,tbl,s1 + MOVBU i1<<0(tbl),i1 // Td4[s1>>0] + MOVBU (s1),s1 // Td4[s1>>24] + MOVBU i2<<0(tbl),i2 // Td4[s1>>16] + EOR s0<<24,i1,s0 + MOVBU i3<<0(tbl),i3 // Td4[s1>>8] + EOR s1<<8,t1,s1 + AND s2>>8,t4,i1 // i0 + EOR i2<<8,t2,t2 + AND s2,t4,i2 // i1 + MOVBU i1<<0(tbl),i1 // Td4[s2>>8] + EOR i3<<8,t3,t3 + MOVBU i2<<0(tbl),i2 // Td4[s2>>0] + AND s2>>16,t4,i3 + + ADD s2>>24,tbl,s2 + MOVBU (s2),s2 // Td4[s2>>24] + EOR i1<<8,s0,s0 + MOVBU i3<<0(tbl),i3 // Td4[s2>>16] + EOR s1<<16,i2,s1 + AND s3>>16,t4,i1 // i0 + EOR s2<<16,t2,s2 + AND s3>>8,t4,i2 // i1 + MOVBU i1<<0(tbl),i1 // Td4[s3>>16] + EOR i3<<16,t3,t3 + MOVBU i2<<0(tbl),i2 // Td4[s3>>8] + AND s3,t4,i3 // i2 + + ADD s3>>24,tbl,s3 + MOVBU i3<<0(tbl),i3 // Td4[s3>>0] + MOVBU (s3),s3 // Td4[s3>>24] + EOR i1<<16,s0,s0 + MOVW 0(key),i1 + EOR i2<<8,s1,s1 + MOVW 4(key),t1 + EOR s2<<8,i3,s2 + MOVW 8(key),t2 + EOR s3<<24,t3,s3 + MOVW 12(key),t3 + + EOR i1,s0,s0 + EOR t1,s1,s1 + EOR t2,s2,s2 + EOR t3,s3,s3 + + MOVW dst+8(FP), t4 + +#ifdef ARM_ARCH_7 + REV s0,s0 + REV s1,s1 + REV s2,s2 + REV s3,s3 + MOVW s0,0(t4) + MOVW s1,4(t4) + MOVW s2,8(t4) + MOVW s3,12(t4) +#else + MOVW s0>>24,t1 // write output in endian-neutral + MOVW s0>>16,t2 // manner... 
+ MOVW s0>>8,t3 + MOVBU t1,0(t4) + MOVBU t2,1(t4) + MOVW s1>>24,t1 + MOVBU t3,2(t4) + MOVW s1>>16,t2 + MOVBU s0,3(t4) + MOVW s1>>8,t3 + MOVBU t1,4(t4) + MOVBU t2,5(t4) + MOVW s2>>24,t1 + MOVBU t3,6(t4) + MOVW s2>>16,t2 + MOVBU s1,7(t4) + MOVW s2>>8,t3 + MOVBU t1,8(t4) + MOVBU t2,9(t4) + MOVW s3>>24,t1 + MOVBU t3,10(t4) + MOVW s3>>16,t2 + MOVBU s2,11(t4) + MOVW s3>>8,t3 + MOVBU t1,12(t4) + MOVBU t2,13(t4) + MOVBU t3,14(t4) + MOVBU s3,15(t4) +#endif + RET diff --git a/src/crypto/aes/cipher_arm.go b/src/crypto/aes/cipher_arm.go new file mode 100644 index 0000000..d501c10 --- /dev/null +++ b/src/crypto/aes/cipher_arm.go @@ -0,0 +1,82 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package aes + +import ( + "crypto/cipher" +) + +// defined in asm_arm.s +func encryptBlockAsm(nr int, xk *uint32, dst, src *byte) +func decryptBlockAsm(nr int, xk *uint32, dst, src *byte) +func expandKeyEncAsm(nr int, userKey *byte, enc *uint32) +func expandKeyDecAsm(nr int, enc *uint32, dec *uint32) + +type aesCipherAsm struct { + aesCipher +} + +func newCipher(key []byte) (cipher.Block, error) { + n := len(key) + 28 + c := aesCipherAsm{aesCipher{make([]uint32, n), make([]uint32, n)}} + rounds := 10 + switch len(key) { + case 128 / 8: + rounds = 10 + case 192 / 8: + rounds = 12 + case 256 / 8: + rounds = 14 + } + expandKeyEncAsm(rounds, &key[0], &c.enc[0]) + expandKeyDecAsm(rounds, &c.enc[0], &c.dec[0]) + return &c, nil +} + +func (c *aesCipherAsm) BlockSize() int { return BlockSize } + +func (c *aesCipherAsm) Encrypt(dst, src []byte) { + if len(src) < BlockSize { + panic("crypto/aes: input not full block") + } + if len(dst) < BlockSize { + panic("crypto/aes: output not full block") + } + encryptBlockAsm(len(c.enc)/4-2, &c.enc[0], &dst[0], &src[0]) +} + +func (c *aesCipherAsm) Decrypt(dst, src []byte) { + if len(src) < BlockSize { + panic("crypto/aes: input not full block") 
+ } + if len(dst) < BlockSize { + panic("crypto/aes: output not full block") + } + decryptBlockAsm(len(c.dec)/4-2, &c.dec[0], &dst[0], &src[0]) +} + +// expandKey is used by BenchmarkExpand to ensure that the asm implementation +// of key expansion is used for the benchmark when it is available. +func expandKey(key []byte, enc, dec []uint32) { + rounds := 10 // rounds needed for AES128 + switch len(key) { + case 192 / 8: + rounds = 12 + case 256 / 8: + rounds = 14 + } + expandKeyEncAsm(rounds, &key[0], &enc[0]) + if dec != nil { + expandKeyDecAsm(rounds, &enc[0], &dec[0]) + } +} + +// rcon table used by asm_arm.s +var rcon = [16]uint32{ + 0x01000000, 0x02000000, 0x04000000, 0x08000000, + 0x10000000, 0x20000000, 0x40000000, 0x80000000, + 0x1B000000, 0x36000000, 0, 0, + 0, 0, 0, 0, +} diff --git a/src/crypto/aes/cipher_generic.go b/src/crypto/aes/cipher_generic.go index ca74aa8..411adc8 100644 --- a/src/crypto/aes/cipher_generic.go +++ b/src/crypto/aes/cipher_generic.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !amd64,!s390x,!ppc64le +// +build !amd64,!s390x,!ppc64le,!arm package aes
Brad Fitzpatrick posted comments on this change.
Patch set 3: Run-TryBot+1
Gobot Gobot posted comments on this change.
Patch set 3:
TryBots beginning. Status page: http://farmer.golang.org/try?commit=24efb630
Build is still in progress... This change failed on misc-compile: See https://storage.googleapis.com/go-build-log/24efb630/misc-compile_f8882959.log
Consult https://build.golang.org/ to see whether it's a new failure. Other builds still in progress; subsequent failure notices suppressed until final report.
Gobot Gobot posted comments on this change.
Patch set 3: TryBot-Result-1
1 of 17 TryBots failed: Failed on misc-compile: https://storage.googleapis.com/go-build-log/24efb630/misc-compile_f8882959.log
Consult https://build.golang.org/ to see whether they are new failures.
Cherry Zhang posted comments on this change.
Patch set 3:
(3 comments)
File src/crypto/aes/asm_arm.s:
Patch Set #3, Line 17: spill
where is the spill? I couldn't find it with a quick look.
Patch Set #3, Line 56: #define i3 R9
When dynamic linking, access of a global variable will use R9. You probably want to make sure R9 is not holding a live value when accessing a global. Also, R9 is reserved on NaCl.
add a comment that this is to save the link register
Josselin Costanzi posted comments on this change.
Patch set 3:
(1 comment)
File src/crypto/aes/asm_arm.s:
Patch Set #3, Line 14: // 6aa36e8e5a062e31543e7796f0351ff9628832ce from 21 May 2017
Wrong date
Nick Craig-Wood posted comments on this change.
Patch set 3:
(4 comments)
File src/crypto/aes/asm_arm.s:
Patch Set #3, Line 14: // 6aa36e8e5a062e31543e7796f0351ff9628832ce from 21 May 2017
Wrong date
Well spotted - that should be 2016 rather than some time into the mysterious future!
Patch Set #3, Line 17: spill
where is the spill? I couldn't find it with a quick look.
In the inner loops nr+0(FP) is saved to and restored from the stack
Patch Set #3, Line 56: #define i3 R9
Ah, I was under the impression that post https://github.com/golang/go/commit/89f185fe8a036b0fabce30b20c480cf1c832bdd7 it was safe to use R9.
When dynamic linking, access of a global variable will use R9
The only use of globals is fetching the table addresses. These are all addresses from within the same package.
MOVW $·te0(SB), tbl
Would that clobber R9 when dynamic linking?
I can probably work around that with a bit of care.
Also R9 is reserved on NaCl
This is a definite problem - I either need to fall back to the pure Go version on NaCl, or rewrite to not use R9.
Rewriting to not use R9 would mean spilling another register to the stack. That diverges further from the carefully tuned OpenSSL assembler code but perhaps obeys the ARM assembler contract better...
What is your opinion?
add a comment that this is to save the link register
Ack
Cherry Zhang posted comments on this change.
Patch set 3:
(2 comments)
In the inner loops nr+0(FP) is saved to and restored from the stack
Oh, it is spilled to an argument slot, not a local slot. Thanks.
Patch Set #3, Line 56: #define i3 R9
Ah, I was under the impression that post https://github.com/golang/go/commi
Only loading the address is probably ok for R9. In this case R11 may be used as a temporary register and get clobbered though. Since your target register is also R11, it is probably ok. You can try it by passing -shared to the assembler.
Falling back to the pure Go version on NaCl sounds good to me.
Nick Craig-Wood posted comments on this change.
Patch set 3:
(4 comments)
File src/crypto/aes/asm_arm.s:
Patch Set #3, Line 14: // 6aa36e8e5a062e31543e7796f0351ff9628832ce from 21 May 2017
Well spotted - that should be 2016 rather than some time into the mysteriou
Done
Patch Set #3, Line 17: spill
Oh, it is spilled to an argument slot, not a local slot. Thanks.
Done
Patch Set #3, Line 56: #define i3 R9
Only loading the address is probably ok for R9. In this case R11 may be use
I have excluded nacl from the build.
I've also looked at the assembly generated by go tool asm with and without -shared.
The linker appears to do the sensible thing to make it relocatable, converting
6b0:  e59f6268   ldr r6, [pc, #616]   ; 0x920
6b4:  e59fb268   ldr r11, [pc, #616]  ; 0x924
into
6b4:  e59f6280   ldr r6, [pc, #640]   ; 0x93c
6b8:  e08f6006   add r6, r15, r6
6bc:  e59fb27c   ldr r11, [pc, #636]  ; 0x940
6c0:  e08fb00b   add r11, r15, r11
So no use of any extra registers!
Ack
Done
Nick Craig-Wood uploaded patch set #4 to this change.
crypto/aes: ARM assembly versions of encrypt, decrypt and expandKey

ARM assembly for AES crypto adapted from openssl giving an encrypt/decrypt speed up of 1.6-2.7x and a key scheduling speedup of 2.3-4.5x.

Raspberry Pi 3 BCM2709 ARMv7 Processor rev 5 (v7l)

name       old time/op    new time/op    delta
Encrypt-4    3.12µs ± 1%    1.13µs ± 2%  -63.86%  (p=0.000 n=20+20)
Decrypt-4    3.10µs ± 1%    1.21µs ± 2%  -60.91%  (p=0.000 n=20+20)
Expand-4     11.3µs ± 1%     2.5µs ± 1%  -78.11%  (p=0.000 n=16+16)

name       old speed      new speed       delta
Encrypt-4  5.13MB/s ± 2%  14.18MB/s ± 2%  +176.58%  (p=0.000 n=20+20)
Decrypt-4  5.16MB/s ± 1%  13.19MB/s ± 2%  +155.78%  (p=0.000 n=20+20)

Chromebook Samsung Exynos5 ARMv7 Processor rev 4 (v7l)

name       old time/op  new time/op  delta
Encrypt-2   342ns ± 1%   210ns ± 1%  -38.80%  (p=0.000 n=16+20)
Decrypt-2   343ns ± 6%   209ns ± 2%  -39.16%  (p=0.000 n=17+18)
Expand-2    1.64µs ± 5%  0.70µs ± 1%  -57.29%  (p=0.000 n=17+19)

name       old speed      new speed      delta
Encrypt-2  46.7MB/s ± 1%  76.2MB/s ± 1%  +63.35%  (p=0.000 n=16+20)
Decrypt-2  46.4MB/s ± 7%  76.4MB/s ± 2%  +64.75%  (p=0.000 n=18+18)

Issue #4299

Change-Id: I13df6a87f5697de255cb9a494022dd7f7dbde8f5
---
A src/crypto/aes/asm_arm.s
A src/crypto/aes/cipher_arm.go
M src/crypto/aes/cipher_generic.go
3 files changed, 931 insertions(+), 0 deletions(-)
diff --git a/src/crypto/aes/asm_arm.s b/src/crypto/aes/asm_arm.s
new file mode 100644
index 0000000..3fcbae4
--- /dev/null
+++ b/src/crypto/aes/asm_arm.s
@@ -0,0 +1,846 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build arm,!nacl
+
+// This is a derived work from OpenSSL of AES using assembly optimizations. The
+// original code was written by Andy Polyakov <ap...@openssl.org> and it's dual
+// licensed under OpenSSL and CRYPTOGAMS licenses depending on where you obtain
+// it. For further details see http://www.openssl.org/~appro/cryptogams/.
+
+// Original code can be found at the link below:
+// https://git.openssl.org/?p=openssl.git;a=blob;f=crypto/aes/asm/aes-armv4.pl
+ +// This code is based on crypto/aes/asm/aes-armv4.pl
version +// 6aa36e8e5a062e31543e7796f0351ff9628832ce from 21 May 2016 + +// Apart from assembler syntax and calling convention changes, the +// major change needed was to spill one register to the stack as go +// can't use R10 + +// Note that we don't build for nacl since it needs R9 + +// AES for ARMv4 + +// January 2007. +// +// Code uses single 1K S-box and is >2 times faster than code generated +// by gcc-3.4.1. This is thanks to unique feature of ARMv4 ISA, which +// allows to merge logical or arithmetic operation with shift or rotate +// in one instruction and emit combined result every cycle. The module +// is endian-neutral. The performance is ~42 cycles/byte for 128-bit +// key [on single-issue Xscale PXA250 core]. + +// May 2007. +// +// AES_set_[en|de]crypt_key is added. + +// July 2010. +// +// Rescheduling for dual-issue pipeline resulted in 12% improvement on +// Cortex A8 core and ~25 cycles per byte processed with 128-bit key. + +// February 2011. +// +// Profiler-assisted and platform-specific optimization resulted in 16% +// improvement on Cortex A8 core and ~21.5 cycles per byte. + +#include "textflag.h" + +#define s0 R0 +#define s1 R1 +#define s2 R2 +#define s3 R3 +#define t1 R4 +#define t2 R5 +#define t3 R6 +#define i1 R7 +#define i2 R8 +#define i3 R9 // forbidden on nacl, check usage with -shared +#define mask80 i1 +#define mask1b i2 +#define mask7f i3 + +#define tbl R11 // can be used by the linker to synthesise instructions +#define key R12 +#define t4 R14 + +// #define ARM_ARCH_7 1 + +// func encryptBlockAsm(nr int, xk *uint32, dst, src *byte) +TEXT ·encryptBlockAsm(SB), NOSPLIT, $4-16 // $4 here to save LR + MOVW src+12(FP), t4 + MOVW $·te0(SB), tbl + MOVW xk+4(FP), key + +#ifndef ARM_ARCH_7 + MOVBU 3(t4),s0 // load input data in endian-neutral + MOVBU 2(t4),t1 // manner... 
+ MOVBU 1(t4),t2 + MOVBU 0(t4),t3 + ORR t1<<8,s0,s0 + MOVBU 7(t4),s1 + ORR t2<<16,s0,s0 + MOVBU 6(t4),t1 + ORR t3<<24,s0,s0 + MOVBU 5(t4),t2 + MOVBU 4(t4),t3 + ORR t1<<8,s1,s1 + MOVBU 11(t4),s2 + ORR t2<<16,s1,s1 + MOVBU 10(t4),t1 + ORR t3<<24,s1,s1 + MOVBU 9(t4),t2 + MOVBU 8(t4),t3 + ORR t1<<8,s2,s2 + MOVBU 15(t4),s3 + ORR t2<<16,s2,s2 + MOVBU 14(t4),t1 + ORR t3<<24,s2,s2 + MOVBU 13(t4),t2 + MOVBU 12(t4),t3 + ORR t1<<8,s3,s3 + ORR t2<<16,s3,s3 + ORR t3<<24,s3,s3 +#else + MOVW 0(t4),s0 + MOVW 4(t4),s1 + MOVW 8(t4),s2 + MOVW 12(t4),s3 + REV s0,s0 + REV s1,s1 + REV s2,s2 + REV s3,s3 +#endif + + MOVM.IA.W (key),[t1-i1] + EOR t1,s0,s0 + EOR t2,s1,s1 + EOR t3,s2,s2 + EOR i1,s3,s3 + MOVW $255,t4 + + AND s0,t4,i1 + AND s0>>8,t4,i2 + AND s0>>16,t4,i3 + MOVW s0>>24,s0 +enc_loop: + MOVW i1<<2(tbl),t1 // Te3[s0>>0] + AND s1>>16,t4,i1 // i0 + MOVW i2<<2(tbl),t2 // Te2[s0>>8] + AND s1,t4,i2 + MOVW i3<<2(tbl),t3 // Te1[s0>>16] + AND s1>>8,t4,i3 + MOVW s0<<2(tbl),s0 // Te0[s0>>24] + MOVW s1>>24,s1 + + MOVW i1<<2(tbl),i1 // Te1[s1>>16] + MOVW i2<<2(tbl),i2 // Te3[s1>>0] + MOVW i3<<2(tbl),i3 // Te2[s1>>8] + EOR i1@>8,s0,s0 + MOVW s1<<2(tbl),s1 // Te0[s1>>24] + AND s2>>8,t4,i1 // i0 + EOR i2@>8,t2,t2 + AND s2>>16,t4,i2 // i1 + EOR i3@>8,t3,t3 + AND s2,t4,i3 + MOVW i1<<2(tbl),i1 // Te2[s2>>8] + EOR t1@>24,s1,s1 + MOVW i2<<2(tbl),i2 // Te1[s2>>16] + MOVW s2>>24,s2 + + MOVW i3<<2(tbl),i3 // Te3[s2>>0] + EOR i1@>16,s0,s0 + MOVW s2<<2(tbl),s2 // Te0[s2>>24] + AND s3,t4,i1 // i0 + EOR i2@>8,s1,s1 + AND s3>>8,t4,i2 // i1 + EOR i3@>16,t3,t3 + AND s3>>16,t4,i3 // i2 + MOVW i1<<2(tbl),i1 // Te3[s3>>0] + EOR t2@>16,s2,s2 + MOVW i2<<2(tbl),i2 // Te2[s3>>8] + MOVW s3>>24,s3 + + MOVW i3<<2(tbl),i3 // Te1[s3>>16] + EOR i1@>24,s0,s0 + MOVW.P 16(key),i1 + EOR i2@>16,s1,s1 + MOVW s3<<2(tbl),s3 // Te0[s3>>24] + EOR i3@>8,s2,s2 + MOVW -12(key),t1 + EOR t3@>8,s3,s3 + + MOVW -8(key),t2 + EOR i1,s0,s0 + MOVW -4(key),t3 + AND s0,t4,i1 + EOR t1,s1,s1 + MOVW nr+0(FP), t1 + AND s0>>8,t4,i2 + EOR t2,s2,s2 + 
AND s0>>16,t4,i3 + EOR t3,s3,s3 + SUB.S $1,t1,t1 + MOVW s0>>24,s0 + + MOVW t1, nr+0(FP) + BGT enc_loop + + ADD $2,tbl,tbl + + MOVBU i1<<2(tbl),t1 // Te4[s0>>0] + AND s1>>16,t4,i1 // i0 + MOVBU i2<<2(tbl),t2 // Te4[s0>>8] + AND s1,t4,i2 + MOVBU i3<<2(tbl),t3 // Te4[s0>>16] + AND s1>>8,t4,i3 + MOVBU s0<<2(tbl),s0 // Te4[s0>>24] + MOVW s1>>24,s1 + + MOVBU i1<<2(tbl),i1 // Te4[s1>>16] + MOVBU i2<<2(tbl),i2 // Te4[s1>>0] + MOVBU i3<<2(tbl),i3 // Te4[s1>>8] + EOR s0<<8,i1,s0 + MOVBU s1<<2(tbl),s1 // Te4[s1>>24] + AND s2>>8,t4,i1 // i0 + EOR t2<<8,i2,t2 + AND s2>>16,t4,i2 // i1 + EOR t3<<8,i3,t3 + AND s2,t4,i3 + MOVBU i1<<2(tbl),i1 // Te4[s2>>8] + EOR s1<<24,t1,s1 + MOVBU i2<<2(tbl),i2 // Te4[s2>>16] + MOVW s2>>24,s2 + + MOVBU i3<<2(tbl),i3 // Te4[s2>>0] + EOR s0<<8,i1,s0 + MOVBU s2<<2(tbl),s2 // Te4[s2>>24] + AND s3,t4,i1 // i0 + EOR i2<<16,s1,s1 + AND s3>>8,t4,i2 // i1 + EOR t3<<8,i3,t3 + AND s3>>16,t4,i3 // i2 + MOVBU i1<<2(tbl),i1 // Te4[s3>>0] + EOR s2<<24,t2,s2 + MOVBU i2<<2(tbl),i2 // Te4[s3>>8] + MOVW s3>>24,s3 + + MOVBU i3<<2(tbl),i3 // Te4[s3>>16] + EOR s0<<8,i1,s0 + MOVW 0(key),i1 + MOVBU s3<<2(tbl),s3 // Te4[s3>>24] + EOR i2<<8,s1,s1 + MOVW 4(key),t1 + EOR i3<<16,s2,s2 + MOVW 8(key),t2 + EOR s3<<24,t3,s3 + MOVW 12(key),t3 + + EOR i1,s0,s0 + EOR t1,s1,s1 + EOR t2,s2,s2 + EOR t3,s3,s3 + + MOVW dst+8(FP), t4 + +#ifdef ARM_ARCH_7 + REV s0,s0 + REV s1,s1 + REV s2,s2 + REV s3,s3 + MOVW s0,0(t4) + MOVW s1,4(t4) + MOVW s2,8(t4) + MOVW s3,12(t4) +#else + MOVW s0>>24,t1 // write output in endian-neutral + MOVW s0>>16,t2 // manner... 
+ MOVW s0>>8,t3 + MOVBU t1,0(t4) + MOVBU t2,1(t4) + MOVW s1>>24,t1 + MOVBU t3,2(t4) + MOVW s1>>16,t2 + MOVBU s0,3(t4) + MOVW s1>>8,t3 + MOVBU t1,4(t4) + MOVBU t2,5(t4) + MOVW s2>>24,t1 + MOVBU t3,6(t4) + MOVW s2>>16,t2 + MOVBU s1,7(t4) + MOVW s2>>8,t3 + MOVBU t1,8(t4) + MOVBU t2,9(t4) + MOVW s3>>24,t1 + MOVBU t3,10(t4) + MOVW s3>>16,t2 + MOVBU s2,11(t4) + MOVW s3>>8,t3 + MOVBU t1,12(t4) + MOVBU t2,13(t4) + MOVBU t3,14(t4) + MOVBU s3,15(t4) +#endif + RET + +// func expandKeyEncAsm(nr int, userKey *byte, enc *uint32) +TEXT ·expandKeyEncAsm(SB), NOSPLIT, $4-12 // $4 here to save LR + MOVW userKey+4(FP), t4 // inp + MOVW nr+0(FP), tbl // rounds + MOVW enc+8(FP), key + +#ifndef ARM_ARCH_7 + MOVBU 3(t4),s0 // load input data in endian-neutral + MOVBU 2(t4),t1 // manner... + MOVBU 1(t4),t2 + MOVBU 0(t4),t3 + ORR t1<<8,s0,s0 + MOVBU 7(t4),s1 + ORR t2<<16,s0,s0 + MOVBU 6(t4),t1 + ORR t3<<24,s0,s0 + MOVBU 5(t4),t2 + MOVBU 4(t4),t3 + ORR t1<<8,s1,s1 + MOVBU 11(t4),s2 + ORR t2<<16,s1,s1 + MOVBU 10(t4),t1 + ORR t3<<24,s1,s1 + MOVBU 9(t4),t2 + MOVBU 8(t4),t3 + ORR t1<<8,s2,s2 + MOVBU 15(t4),s3 + ORR t2<<16,s2,s2 + MOVBU 14(t4),t1 + ORR t3<<24,s2,s2 + MOVBU 13(t4),t2 + MOVBU 12(t4),t3 + ORR t1<<8,s3,s3 + MOVW.P s0,16(key) + ORR t2<<16,s3,s3 + MOVW s1,-12(key) + ORR t3<<24,s3,s3 + MOVW s2,-8(key) + MOVW s3,-4(key) +#else + MOVW 0(t4),s0 + MOVW 4(t4),s1 + MOVW 8(t4),s2 + MOVW 12(t4),s3 + REV s0,s0 + REV s1,s1 + REV s2,s2 + REV s3,s3 + MOVW.P s0,16(key) + MOVW s1,-12(key) + MOVW s2,-8(key) + MOVW s3,-4(key) +#endif + + TEQ $10,tbl + BNE ek_not128 + MOVW $·rcon(SB), t3 + MOVW $·sbox0(SB), tbl // Te4 + MOVW $255,t4 + +ek_128_loop: + AND s3>>24,t4,t2 + AND s3>>16,t4,i1 + MOVBU t2<<0(tbl),t2 + AND s3>>8,t4,i2 + MOVBU i1<<0(tbl),i1 + AND s3,t4,i3 + MOVBU i2<<0(tbl),i2 + ORR i1<<24,t2,t2 + MOVBU i3<<0(tbl),i3 + ORR i2<<16,t2,t2 + MOVW.P 4(t3),t1 // rcon[i++] + ORR i3<<8,t2,t2 + EOR t1,t2,t2 + MOVW nr+0(FP), t1 + EOR t2,s0,s0 // rk[4]=rk[0]^... 
+ EOR s0,s1,s1 // rk[5]=rk[1]^rk[4] + MOVW.P s0,16(key) + EOR s1,s2,s2 // rk[6]=rk[2]^rk[5] + MOVW s1,-12(key) + EOR s2,s3,s3 // rk[7]=rk[3]^rk[6] + SUB.S $1,t1,t1 + MOVW s2,-8(key) + MOVW t1, nr+0(FP) + MOVW s3,-4(key) + BNE ek_128_loop + SUB $176,key,R2 + B ek_done + +ek_not128: +#ifndef ARM_ARCH_7 + MOVBU 19(t4),i2 + MOVBU 18(t4),t1 + MOVBU 17(t4),t2 + MOVBU 16(t4),t3 + ORR t1<<8,i2,i2 + MOVBU 23(t4),i3 + ORR t2<<16,i2,i2 + MOVBU 22(t4),t1 + ORR t3<<24,i2,i2 + MOVBU 21(t4),t2 + MOVBU 20(t4),t3 + ORR t1<<8,i3,i3 + ORR t2<<16,i3,i3 + MOVW.P i2,8(key) + ORR t3<<24,i3,i3 + MOVW i3,-4(key) +#else + MOVW 16(t4),i2 + MOVW 20(t4),i3 + REV i2,i2 + REV i3,i3 + MOVW.P i2,8(key) + MOVW i3,-4(key) +#endif + + TEQ $12,tbl + BNE ek_not192 + MOVW $·sbox0(SB), tbl // Te4 + MOVW $·rcon(SB), t3 + MOVW $8,t1 + MOVW $255,t4 + MOVW t1, nr+0(FP) + +ek_192_loop: + AND i3>>24,t4,t2 + AND i3>>16,t4,i1 + MOVBU t2<<0(tbl),t2 + AND i3>>8,t4,i2 + MOVBU i1<<0(tbl),i1 + AND i3,t4,i3 + MOVBU i2<<0(tbl),i2 + ORR i1<<24,t2,t2 + MOVBU i3<<0(tbl),i3 + ORR i2<<16,t2,t2 + MOVW.P 4(t3),t1 // rcon[i++] + ORR i3<<8,t2,t2 + EOR t1,t2,i3 + MOVW nr+0(FP), t1 + EOR i3,s0,s0 // rk[6]=rk[0]^... 
+ EOR s0,s1,s1 // rk[7]=rk[1]^rk[6] + MOVW.P s0,24(key) + EOR s1,s2,s2 // rk[8]=rk[2]^rk[7] + MOVW s1,-20(key) + EOR s2,s3,s3 // rk[9]=rk[3]^rk[8] + SUB.S $1,t1,t1 + MOVW s2,-16(key) + MOVW t1, nr+0(FP) + MOVW s3,-12(key) + BEQ ek_done + + MOVW -32(key),i1 + MOVW -28(key),i2 + EOR s3,i1,i1 // rk[10]=rk[4]^rk[9] + EOR i1,i2,i3 // rk[11]=rk[5]^rk[10] + MOVW i1,-8(key) + MOVW i3,-4(key) + B ek_192_loop + +ek_not192: +#ifndef ARM_ARCH_7 + MOVBU 27(t4),i2 + MOVBU 26(t4),t1 + MOVBU 25(t4),t2 + MOVBU 24(t4),t3 + ORR t1<<8,i2,i2 + MOVBU 31(t4),i3 + ORR t2<<16,i2,i2 + MOVBU 30(t4),t1 + ORR t3<<24,i2,i2 + MOVBU 29(t4),t2 + MOVBU 28(t4),t3 + ORR t1<<8,i3,i3 + ORR t2<<16,i3,i3 + MOVW.P i2,8(key) + ORR t3<<24,i3,i3 + MOVW i3,-4(key) +#else + MOVW 24(t4),i2 + MOVW 28(t4),i3 + REV i2,i2 + REV i3,i3 + MOVW.P i2,8(key) + MOVW i3,-4(key) +#endif + + MOVW $·sbox0(SB), tbl // Te4 + MOVW $·rcon(SB), t3 + MOVW $7,t1 + MOVW $255,t4 + MOVW t1, nr+0(FP) + +ek_256_loop: + AND i3>>24,t4,t2 + AND i3>>16,t4,i1 + MOVBU t2<<0(tbl),t2 + AND i3>>8,t4,i2 + MOVBU i1<<0(tbl),i1 + AND i3,t4,i3 + MOVBU i2<<0(tbl),i2 + ORR i1<<24,t2,t2 + MOVBU i3<<0(tbl),i3 + ORR i2<<16,t2,t2 + MOVW.P 4(t3),t1 // rcon[i++] + ORR i3<<8,t2,t2 + EOR t1,t2,i3 + MOVW nr+0(FP), t1 + EOR i3,s0,s0 // rk[8]=rk[0]^... + EOR s0,s1,s1 // rk[9]=rk[1]^rk[8] + MOVW.P s0,32(key) + EOR s1,s2,s2 // rk[10]=rk[2]^rk[9] + MOVW s1,-28(key) + EOR s2,s3,s3 // rk[11]=rk[3]^rk[10] + SUB.S $1,t1,t1 + MOVW s2,-24(key) + MOVW t1, nr+0(FP) + MOVW s3,-20(key) + BEQ ek_done + + AND s3,t4,t2 + AND s3>>8,t4,i1 + MOVBU t2<<0(tbl),t2 + AND s3>>16,t4,i2 + MOVBU i1<<0(tbl),i1 + AND s3>>24,t4,i3 + MOVBU i2<<0(tbl),i2 + ORR i1<<8,t2,t2 + MOVBU i3<<0(tbl),i3 + ORR i2<<16,t2,t2 + MOVW -48(key),t1 + ORR i3<<24,t2,t2 + + MOVW -44(key),i1 + MOVW -40(key),i2 + EOR t2,t1,t1 // rk[12]=rk[4]^... 
+ MOVW -36(key),i3 + EOR t1,i1,i1 // rk[13]=rk[5]^rk[12] + MOVW t1,-16(key) + EOR i1,i2,i2 // rk[14]=rk[6]^rk[13] + MOVW i1,-12(key) + EOR i2,i3,i3 // rk[15]=rk[7]^rk[14] + MOVW i2,-8(key) + MOVW i3,-4(key) + B ek_256_loop + +ek_done: + RET + +// func expandKeyDecAsm(nr int, enc *uint32, dec *uint32) +TEXT ·expandKeyDecAsm(SB), NOSPLIT, $4-12 // $4 here to save LR + MOVW nr+0(FP), t4 // rounds + MOVW enc+4(FP), i1 // input + ADD t4<<4,i1,i2 + MOVW dec+8(FP), key // output + ADD t4<<4,key,tbl + +dk_inv: MOVW.P 16(i1),s0 + MOVW -12(i1),s1 + MOVW -8(i1),s2 + MOVW -4(i1),s3 + MOVW.P -16(i2),t1 + MOVW 16+4(i2),t2 + MOVW 16+8(i2),t3 + MOVW 16+12(i2),i3 + MOVW.P s0,-16(tbl) + MOVW s1,16+4(tbl) + MOVW s2,16+8(tbl) + MOVW s3,16+12(tbl) + MOVW.P t1,16(key) + MOVW t2,-12(key) + MOVW t3,-8(key) + MOVW i3,-4(key) + TEQ i2,i1 + BNE dk_inv + + MOVW (i1),s0 + MOVW 4(i1),s1 + MOVW 8(i1),s2 + MOVW 12(i1),s3 + MOVW s0,(key) + MOVW s1,4(key) + MOVW s2,8(key) + MOVW s3,12(key) + SUB t4<<3,key,key + + MOVW.W 16(key),s0 // prefetch tp1 + MOVW $0x80,mask80 + MOVW $0x1b,mask1b + ORR $0x8000,mask80,mask80 + ORR $0x1b00,mask1b,mask1b + ORR mask80<<16,mask80,mask80 + ORR mask1b<<16,mask1b,mask1b + SUB $1,t4,t4 + MVN mask80,mask7f + MOVW t4<<2,t4 // (rounds-1)*4 + +dk_mix: AND mask80,s0,t1 + AND mask7f,s0,s1 + SUB t1>>7,t1,t1 + AND mask1b,t1,t1 + EOR s1<<1,t1,s1 // tp2 + + AND mask80,s1,t1 + AND mask7f,s1,s2 + SUB t1>>7,t1,t1 + AND mask1b,t1,t1 + EOR s2<<1,t1,s2 // tp4 + + AND mask80,s2,t1 + AND mask7f,s2,s3 + SUB t1>>7,t1,t1 + AND mask1b,t1,t1 + EOR s3<<1,t1,s3 // tp8 + + EOR s2,s1,t1 + EOR s3,s0,t2 // tp9 + EOR s3,t1,t1 // tpe + EOR s1@>24,t1,t1 + EOR t2@>24,t1,t1 // ^= ROTATE(tpb=tp9^tp2,8) + EOR s2@>16,t1,t1 + EOR t2@>16,t1,t1 // ^= ROTATE(tpd=tp9^tp4,16) + EOR t2@>8,t1,t1 // ^= ROTATE(tp9,24) + + MOVW 4(key),s0 // prefetch tp1 + MOVW.P t1,4(key) + SUB.S $1,t4,t4 + BNE dk_mix + + RET + +// func decryptBlockAsm(nr int, xk *uint32, dst, src *byte) +TEXT ·decryptBlockAsm(SB), NOSPLIT, $4-16 
// $4 here to save LR + MOVW src+12(FP), t4 + MOVW $·td0(SB), tbl + MOVW xk+4(FP), key + +#ifndef ARM_ARCH_7 + MOVBU 3(t4),s0 // load input data in endian-neutral + MOVBU 2(t4),t1 // manner... + MOVBU 1(t4),t2 + MOVBU 0(t4),t3 + ORR t1<<8,s0,s0 + MOVBU 7(t4),s1 + ORR t2<<16,s0,s0 + MOVBU 6(t4),t1 + ORR t3<<24,s0,s0 + MOVBU 5(t4),t2 + MOVBU 4(t4),t3 + ORR t1<<8,s1,s1 + MOVBU 11(t4),s2 + ORR t2<<16,s1,s1 + MOVBU 10(t4),t1 + ORR t3<<24,s1,s1 + MOVBU 9(t4),t2 + MOVBU 8(t4),t3 + ORR t1<<8,s2,s2 + MOVBU 15(t4),s3 + ORR t2<<16,s2,s2 + MOVBU 14(t4),t1 + ORR t3<<24,s2,s2 + MOVBU 13(t4),t2 + MOVBU 12(t4),t3 + ORR t1<<8,s3,s3 + ORR t2<<16,s3,s3 + ORR t3<<24,s3,s3 +#else + MOVW 0(t4),s0 + MOVW 4(t4),s1 + MOVW 8(t4),s2 + MOVW 12(t4),s3 + REV s0,s0 + REV s1,s1 + REV s2,s2 + REV s3,s3 +#endif + + MOVM.IA.W (key),[t1-i1] + EOR t1,s0,s0 + MOVW 240-16(key),t4 + EOR t2,s1,s1 + EOR t3,s2,s2 + EOR i1,s3,s3 + SUB $1,t4,t4 + MOVW $255,t4 + + AND s0>>16,t4,i1 + AND s0>>8,t4,i2 + AND s0,t4,i3 + MOVW s0>>24,s0 +dec_loop: + MOVW i1<<2(tbl),t1 // Td1[s0>>16] + AND s1,t4,i1 // i0 + MOVW i2<<2(tbl),t2 // Td2[s0>>8] + AND s1>>16,t4,i2 + MOVW i3<<2(tbl),t3 // Td3[s0>>0] + AND s1>>8,t4,i3 + MOVW s0<<2(tbl),s0 // Td0[s0>>24] + MOVW s1>>24,s1 + + MOVW i1<<2(tbl),i1 // Td3[s1>>0] + MOVW i2<<2(tbl),i2 // Td1[s1>>16] + MOVW i3<<2(tbl),i3 // Td2[s1>>8] + EOR i1@>24,s0,s0 + MOVW s1<<2(tbl),s1 // Td0[s1>>24] + AND s2>>8,t4,i1 // i0 + EOR t2@>8,i2,t2 + AND s2,t4,i2 // i1 + EOR t3@>8,i3,t3 + AND s2>>16,t4,i3 + MOVW i1<<2(tbl),i1 // Td2[s2>>8] + EOR t1@>8,s1,s1 + MOVW i2<<2(tbl),i2 // Td3[s2>>0] + MOVW s2>>24,s2 + + MOVW i3<<2(tbl),i3 // Td1[s2>>16] + EOR i1@>16,s0,s0 + MOVW s2<<2(tbl),s2 // Td0[s2>>24] + AND s3>>16,t4,i1 // i0 + EOR i2@>24,s1,s1 + AND s3>>8,t4,i2 // i1 + EOR t3@>8,i3,t3 + AND s3,t4,i3 // i2 + MOVW i1<<2(tbl),i1 // Td1[s3>>16] + EOR t2@>8,s2,s2 + MOVW i2<<2(tbl),i2 // Td2[s3>>8] + MOVW s3>>24,s3 + + MOVW i3<<2(tbl),i3 // Td3[s3>>0] + EOR i1@>8,s0,s0 + MOVW.P 16(key),i1 + EOR i2@>16,s1,s1 + 
MOVW s3<<2(tbl),s3 // Td0[s3>>24] + EOR i3@>24,s2,s2 + + MOVW -12(key),t1 + EOR i1,s0,s0 + MOVW -8(key),t2 + EOR t3@>8,s3,s3 + MOVW -4(key),t3 + AND s0>>16,t4,i1 + EOR t1,s1,s1 + MOVW nr+0(FP), t1 + AND s0>>8,t4,i2 + EOR t2,s2,s2 + AND s0,t4,i3 + EOR t3,s3,s3 + SUB.S $1,t1,t1 + MOVW s0>>24,s0 + + MOVW t1, nr+0(FP) + BGT dec_loop + + MOVW $·sbox1(SB),tbl + + MOVW 0(tbl),t2 // prefetch Td4 + MOVW 32(tbl),t3 + MOVW 64(tbl),t1 + MOVW 96(tbl),t2 + MOVW 128(tbl),t3 + MOVW 160(tbl),t1 + MOVW 192(tbl),t2 + MOVW 224(tbl),t3 + + MOVBU s0<<0(tbl),s0 // Td4[s0>>24] + MOVBU i1<<0(tbl),t1 // Td4[s0>>16] + AND s1,t4,i1 // i0 + MOVBU i2<<0(tbl),t2 // Td4[s0>>8] + AND s1>>16,t4,i2 + MOVBU i3<<0(tbl),t3 // Td4[s0>>0] + AND s1>>8,t4,i3 + + ADD s1>>24,tbl,s1 + MOVBU i1<<0(tbl),i1 // Td4[s1>>0] + MOVBU (s1),s1 // Td4[s1>>24] + MOVBU i2<<0(tbl),i2 // Td4[s1>>16] + EOR s0<<24,i1,s0 + MOVBU i3<<0(tbl),i3 // Td4[s1>>8] + EOR s1<<8,t1,s1 + AND s2>>8,t4,i1 // i0 + EOR i2<<8,t2,t2 + AND s2,t4,i2 // i1 + MOVBU i1<<0(tbl),i1 // Td4[s2>>8] + EOR i3<<8,t3,t3 + MOVBU i2<<0(tbl),i2 // Td4[s2>>0] + AND s2>>16,t4,i3 + + ADD s2>>24,tbl,s2 + MOVBU (s2),s2 // Td4[s2>>24] + EOR i1<<8,s0,s0 + MOVBU i3<<0(tbl),i3 // Td4[s2>>16] + EOR s1<<16,i2,s1 + AND s3>>16,t4,i1 // i0 + EOR s2<<16,t2,s2 + AND s3>>8,t4,i2 // i1 + MOVBU i1<<0(tbl),i1 // Td4[s3>>16] + EOR i3<<16,t3,t3 + MOVBU i2<<0(tbl),i2 // Td4[s3>>8] + AND s3,t4,i3 // i2 + + ADD s3>>24,tbl,s3 + MOVBU i3<<0(tbl),i3 // Td4[s3>>0] + MOVBU (s3),s3 // Td4[s3>>24] + EOR i1<<16,s0,s0 + MOVW 0(key),i1 + EOR i2<<8,s1,s1 + MOVW 4(key),t1 + EOR s2<<8,i3,s2 + MOVW 8(key),t2 + EOR s3<<24,t3,s3 + MOVW 12(key),t3 + + EOR i1,s0,s0 + EOR t1,s1,s1 + EOR t2,s2,s2 + EOR t3,s3,s3 + + MOVW dst+8(FP), t4 + +#ifdef ARM_ARCH_7 + REV s0,s0 + REV s1,s1 + REV s2,s2 + REV s3,s3 + MOVW s0,0(t4) + MOVW s1,4(t4) + MOVW s2,8(t4) + MOVW s3,12(t4) +#else + MOVW s0>>24,t1 // write output in endian-neutral + MOVW s0>>16,t2 // manner... 
+ MOVW s0>>8,t3 + MOVBU t1,0(t4) + MOVBU t2,1(t4) + MOVW s1>>24,t1 + MOVBU t3,2(t4) + MOVW s1>>16,t2 + MOVBU s0,3(t4) + MOVW s1>>8,t3 + MOVBU t1,4(t4) + MOVBU t2,5(t4) + MOVW s2>>24,t1 + MOVBU t3,6(t4) + MOVW s2>>16,t2 + MOVBU s1,7(t4) + MOVW s2>>8,t3 + MOVBU t1,8(t4) + MOVBU t2,9(t4) + MOVW s3>>24,t1 + MOVBU t3,10(t4) + MOVW s3>>16,t2 + MOVBU s2,11(t4) + MOVW s3>>8,t3 + MOVBU t1,12(t4) + MOVBU t2,13(t4) + MOVBU t3,14(t4) + MOVBU s3,15(t4) +#endif + RET diff --git a/src/crypto/aes/cipher_arm.go b/src/crypto/aes/cipher_arm.go new file mode 100644 index 0000000..9cd161c --- /dev/null +++ b/src/crypto/aes/cipher_arm.go @@ -0,0 +1,84 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build arm,!nacl + +package aes + +import ( + "crypto/cipher" +) + +// defined in asm_arm.s +func encryptBlockAsm(nr int, xk *uint32, dst, src *byte) +func decryptBlockAsm(nr int, xk *uint32, dst, src *byte) +func expandKeyEncAsm(nr int, userKey *byte, enc *uint32) +func expandKeyDecAsm(nr int, enc *uint32, dec *uint32) + +type aesCipherAsm struct { + aesCipher +} + +func newCipher(key []byte) (cipher.Block, error) { + n := len(key) + 28 + c := aesCipherAsm{aesCipher{make([]uint32, n), make([]uint32, n)}} + rounds := 10 + switch len(key) { + case 128 / 8: + rounds = 10 + case 192 / 8: + rounds = 12 + case 256 / 8: + rounds = 14 + } + expandKeyEncAsm(rounds, &key[0], &c.enc[0]) + expandKeyDecAsm(rounds, &c.enc[0], &c.dec[0]) + return &c, nil +} + +func (c *aesCipherAsm) BlockSize() int { return BlockSize } + +func (c *aesCipherAsm) Encrypt(dst, src []byte) { + if len(src) < BlockSize { + panic("crypto/aes: input not full block") + } + if len(dst) < BlockSize { + panic("crypto/aes: output not full block") + } + encryptBlockAsm(len(c.enc)/4-2, &c.enc[0], &dst[0], &src[0]) +} + +func (c *aesCipherAsm) Decrypt(dst, src []byte) { + if len(src) < BlockSize { + panic("crypto/aes: 
input not full block") + } + if len(dst) < BlockSize { + panic("crypto/aes: output not full block") + } + decryptBlockAsm(len(c.dec)/4-2, &c.dec[0], &dst[0], &src[0]) +} + +// expandKey is used by BenchmarkExpand to ensure that the asm implementation +// of key expansion is used for the benchmark when it is available. +func expandKey(key []byte, enc, dec []uint32) { + rounds := 10 // rounds needed for AES128 + switch len(key) { + case 192 / 8: + rounds = 12 + case 256 / 8: + rounds = 14 + } + expandKeyEncAsm(rounds, &key[0], &enc[0]) + if dec != nil { + expandKeyDecAsm(rounds, &enc[0], &dec[0]) + } +} + +// rcon table used by asm_arm.s +var rcon = [16]uint32{ + 0x01000000, 0x02000000, 0x04000000, 0x08000000, + 0x10000000, 0x20000000, 0x40000000, 0x80000000, + 0x1B000000, 0x36000000, 0, 0, + 0, 0, 0, 0, +} diff --git a/src/crypto/aes/cipher_generic.go b/src/crypto/aes/cipher_generic.go index ca74aa8..bfe2da8 100644 --- a/src/crypto/aes/cipher_generic.go +++ b/src/crypto/aes/cipher_generic.go @@ -3,6 +3,7 @@ // license that can be found in the LICENSE file. // +build !amd64,!s390x,!ppc64le +// +build !arm nacl package aes
Cherry Zhang posted comments on this change.
Patch set 4: Run-TryBot+1 Code-Review+1
I only looked at the assembly from a general perspective (Go's calling convention, etc.), not the algorithm. I'll leave the actual review to someone who knows the algorithm.
Gobot Gobot posted comments on this change.
Patch set 4:
TryBots beginning. Status page: http://farmer.golang.org/try?commit=ced74308
Gobot Gobot posted comments on this change.
Patch set 4: TryBot-Result+1
TryBots are happy.
Nick Craig-Wood posted comments on this change.
Patch set 4:
Patch Set 4: Run-TryBot+1 Code-Review+1
I only looked at the assembly from a general perspective (Go's calling convention, etc.), not the algorithm. Leave the actual review to someone who knows the algorithm.
The review is probably more a question of looking at
https://git.openssl.org/?p=openssl.git;a=blob;f=crypto/aes/asm/aes-armv4.pl
And make sure the translation to asm_arm.s is faithful. I think we can assume that the openssl team got the algorithm right, especially since the unit tests pass.
Please can you take a look at this? It has been languishing, unloved, probably because it has lots of ARM assembler in.
Thanks :-)
Brad Fitzpatrick posted comments on this change.
Patch set 4:
Adam is busy with something else for a bit and has no time for Go work at the moment.
Perhaps somebody else who knows ARM + crypto stuff can review.
Nick Craig-Wood posted comments on this change.
Patch set 4:
Adam is busy with something else for a bit and has no time for Go work at the moment.
Perhaps somebody else who knows ARM + crypto stuff can review.
Any suggestions?
Thanks
Brad Fitzpatrick posted comments on this change.
Patch set 4:
Filippo, is something you could review?
Emmanuel Odeke posted comments on this change.
Patch set 4:
/cc @Andreas too for crypto and ARM
Filippo Valsorda posted comments on this change.
Patch set 4:
Some preliminary comments. (Most important one is the one about the CLA.)
To review the assembly, I worked towards a minimal diff between this and the OpenSSL code. Here are the changes I did:
https://gist.github.com/FiloSottile/85eefcd4a7181678a28d68c8d354f28e#file-not-diff
Then by aliasing both lr and $rounds to t4 in the OpenSSL code:
https://gist.github.com/FiloSottile/85eefcd4a7181678a28d68c8d354f28e#file-aliasing-diff
Still have to review the diff fully, but seems sane. Have to check that the lr+rounds aliasing is correct. Also have to check that the tables match.
(Couldn't find confirmation anywhere that MOVM.W means MOVM with writeback. The Plan9 assembly reference just calls it a special addressing-mode bit.)
(6 comments)
File src/crypto/aes/asm_arm.s:
Patch Set #4, Line 10: // it. For further details see http://www.openssl.org/~appro/cryptogams/.
I assume the CLA and license concerns have been settled out of band?
Patch Set #4, Line 12: bellow
below
Patch Set #4, Line 13: // https://git.openssl.org/?p=openssl.git;a=blob;f=crypto/aes/asm/aes-armv4.pl
Patch Set #4, Line 60: #define i3 R9 // forbidden on nacl, check usage with -shared
Looks like you did check usage with -shared. If so, cut the comment.
Patch Set #4, Line 65: #define tbl R11 // can be used by the linker to synthesise instructions
tbl is used as a stable register (but not across functions). I'd either add a comment saying why it's ok, or remove the comment if you verified it's guaranteed not to be clobbered.
FWIW, https://golang.org/doc/asm still says
The registers R10 and R11 are reserved by the compiler and linker.
Patch Set #4, Line 69: // #define ARM_ARCH_7 1
Is this a magic Go #define, or are all #ifdef ARM_ARCH_7 dead code?
Go has GOARM to choose what version to target https://github.com/golang/go/wiki/GoArm
In any case, this needs a better comment or to be removed.
Andreas Auernhammer posted comments on this change.
Patch set 4:
(3 comments)
File src/crypto/aes/asm_arm.s:
Patch Set #4, Line 69: // #define ARM_ARCH_7 1
Is this a magic Go #define, or are all #ifdef ARM_ARCH_7 dead code?
This does not seem to work - at least for cross-compilation...
So I guess ARM_ARCH_7 code is currently dead.
File src/crypto/aes/cipher_generic.go:
Patch Set #4, Line 5: // +build !amd64,!s390x,!ppc64le
// +build !amd64,!s390x,!ppc64le,!arm nacl
Patch Set #4, Line 6: // +build !arm nacl
remove - see above
Filippo Valsorda posted comments on this change.
Patch set 4:
(1 comment)
Patch Set #4, Line 5: // +build !amd64,!s390x,!ppc64le
// +build !amd64,!s390x,!ppc64le,!arm nacl
No, that would be
(!amd64 AND !s390x AND !ppc64le AND !arm) OR nacl
while the multi-line was
(!amd64 AND !s390x AND !ppc64le) AND (!arm OR nacl)
The latter is correct.
Cherry Zhang posted comments on this change.
File src/crypto/aes/asm_arm.s:
Patch Set #4, Line 65: #define tbl R11 // can be used by the linker to synthesise instructions
tbl is used as a stable register (but not across functions). I'd either add
R10 is the g (goroutine) register. It is needed at any safe point (preemption point). If there is no safe point in the function, it is ok to use g register as long as it restores it upon return.
R11 is the temp register used in the assembler to synthesize instructions. It is possible to carefully write the code (e.g. not using synthesized instructions) so R11 is not clobbered.
Andreas Auernhammer posted comments on this change.
Patch Set #4, Line 5: // +build !amd64,!s390x,!ppc64le
No, that would be
Oh, yes I see - sry, never mind! Thanks Filippo!
Nick Craig-Wood posted comments on this change.
Patch set 4:
Thanks for the thorough review Filippo
I've replied in line to your comments.
I'll upload a new version when we've decided what to do about the ARM_ARCH_7 #ifdef.
To review the assembly, I worked towards a minimal diff between this and the OpenSSL code. Here are the changes I did:
- OpenSSL: __ARMEL__ defined (when ARM_ARCH_7 is defined)
- OpenSSL: __thumb2__ and __APPLE__ not defined
- OpenSSL: inlined armv4_AES_decrypt/encrypt
- OpenSSL: regex'd most instructions
- Go: removed needless Rx<<0
- Go: inlined maskXX defines
That looks correct.
https://gist.github.com/FiloSottile/85eefcd4a7181678a28d68c8d354f28e#file-not-diff
Then by aliasing both lr and $rounds to t4 in the OpenSSL code:
https://gist.github.com/FiloSottile/85eefcd4a7181678a28d68c8d354f28e#file-aliasing-diff
Still have to review the diff fully, but seems sane. Have to check that the lr+rounds aliasing is correct. Also have to check that the tables match.
(Couldn't find anywhere confirmation that MOVM.W means MOVM with writeback. The Plan9 assembly reference just calls it a special addressing mode bit.)
Yes, I think you'd probably have to look in the ARM Architecture Reference Manual for this; ARM assembly usually notates it differently.
Here are the ARM docs: MOVM corresponds to STMIA and LDMIA, and the writeback bit "W" is written as ! in ARM syntax.
http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0553a/BABCAEDD.html
FYI here is the perl program I used to convert ARM style assembler to Go style: https://gist.github.com/ncw/e2edab9aab09891f9d81a8ab2684137
It isn't a general purpose converter, it does just enough to convert the AES code!
(6 comments)
File src/crypto/aes/asm_arm.s:
Patch Set #4, Line 10: // it. For further details see http://www.openssl.org/~appro/cryptogams/.
I assume the CLA and license concerns have been settled out of band?
I assumed this was OK since I borrowed this wording from src/crypto/aes/asm_ppc64le.s
Patch Set #4, Line 12: bellow
below
Ack
Ack
Patch Set #4, Line 60: #define i3 R9 // forbidden on nacl, check usage with -shared
Looks like you did check usage with -shared. If so, cut the comment.
Ack
Patch Set #4, Line 65: #define tbl R11 // can be used by the linker to synthesise instructions
R10 is the g (goroutine) register. It is needed at any safe point (preempti
I have checked that R11 isn't in use in synthesized instructions so I've adjusted the comment slightly.
Patch Set #4, Line 69: // #define ARM_ARCH_7 1
This does not seem to work - at least for cross-compilation...
Those ARM_ARCH_7 defines do not work currently. As far as I know there is no compile-time way of detecting which ARM version you are targeting in ARM assembler code. It would be really, really useful if there was!
The commented-out code will make this run much faster on ARMv7. The REV instruction is available in ARMv6 and above, and OpenSSL assumes that unaligned loads will definitely be available on ARMv7 and above.
I'm happy to remove the #ifdefed code if that is the consensus, though I thought it merited discussion first.
FYI here is the perl program I used to convert ARM style assembler to Go style: https://gist.github.com/ncw/e2edab9aab09891f9d81a8ab2684137
It isn't a general purpose converter, it does just enough to convert the AES code!
Cut and paste fail - this is the URL
https://gist.github.com/ncw/e2edab9aab09891f9d81a8ab26841373
Filippo Valsorda posted comments on this change.
Patch set 4:
(3 comments)
Patch Set #4, Line 10: // it. For further details see http://www.openssl.org/~appro/cryptogams/.
I assumed this was OK since I borrowed this wording from src/crypto/aes/asm
Turns out this is ok! Yay.
https://go-review.googlesource.com/c/33587/#message-aedbbdb82b7c33eb462343d157d581e099d97206
Patch Set #4, Line 65: #define tbl R11 // can be used by the linker to synthesise instructions
I have checked that R11 isn't in use in synthesized instructions so I've ad
It looks like there is no CALL, so no preemption points? If that's the case, it would probably be nicer to store R10 and then use it, instead of aliasing lr+rounds. It would also make the review faster.
Patch Set #4, Line 69: // #define ARM_ARCH_7 1
Those ARM_ARCH_7 defines do not work currently. As far as I know there is n
I'm surprised there's no way of switching on GOARM. If that's the case, I'd open an issue to add a way, leave the ARM_ARCH_7 defines, and add a comment at the top mentioning ARM_ARCH_7, the fact that it's currently dead code, and a link to the issue.
Brad Fitzpatrick posted comments on this change.
Patch set 4:
R=go1.10
Nick, are you fine with this being in Go 1.10? That's what it looks like to me, but I haven't been following this CL closely.
If this is for Go 1.9, let me know, but I'm not sure how much review+testing remains.
Nick Craig-Wood posted comments on this change.
Patch set 4:
Patch Set 4:
R=go1.10
Nick, are you fine with this being in Go 1.10? That's what it looks like to me, but I haven't been following this CL closely.
If this is for Go 1.9, let me know, but I'm not sure how much review+testing remains.
I would have liked it for Go 1.9, but I didn't push the review process enough, so Go 1.10 seems sensible to me given where we are in the release cycle.
See inline comment!
(1 comment)
File src/crypto/aes/asm_arm.s:
Patch Set #4, Line 65: #define tbl R11 // can be used by the linker to synthesise instructions
It looks like there is no CALL, so no preemption points? If that's the case
I did a quick code grep and found that there are other bits of ARM code which use g (R10)...
So I'll change this to use R10, which will give a small performance gain.
It will take me a few days though as I've got lots of other stuff on.
Nick Craig-Wood uploaded patch set #5 to this change.
crypto/aes: ARM assembly versions of encrypt, decrypt and expandKey
ARM assembly for AES crypto adapted from openssl giving an
encrypt/decrypt speed up of 1.6-2.7x and a key scheduling speedup of
2.2-4.7x.
Raspberry Pi 3 BCM2709 ARMv7 Processor rev 5 (v7l)
name old time/op new time/op delta
Encrypt-4 3.12µs ± 1% 1.12µs ± 1% -63.96% (p=0.000 n=20+20)
Decrypt-4 3.10µs ± 1% 1.21µs ± 1% -61.10% (p=0.000 n=20+20)
Expand-4 11.3µs ± 1% 2.4µs ± 1% -78.38% (p=0.000 n=16+20)
name old speed new speed delta
Encrypt-4 5.13MB/s ± 2% 14.22MB/s ± 1% +177.32% (p=0.000 n=20+20)
Decrypt-4 5.16MB/s ± 1% 13.25MB/s ± 1% +157.06% (p=0.000 n=20+20)
Chromebook Samsung Exynos5 ARMv7 Processor rev 4 (v7l)
name old time/op new time/op delta
Encrypt-2 342ns ± 1% 217ns ± 3% -36.47% (p=0.000 n=16+19)
Decrypt-2 343ns ± 6% 221ns ± 7% -35.52% (p=0.000 n=17+18)
Expand-2 1.64µs ± 5% 0.73µs ±10% -55.56% (p=0.000 n=17+20)
name old speed new speed delta
Encrypt-2 46.7MB/s ± 1% 73.2MB/s ± 7% +56.86% (p=0.000 n=16+20)
Decrypt-2 46.4MB/s ± 7% 71.8MB/s ± 9% +54.90% (p=0.000 n=18+19)
Issue #4299
Change-Id: I13df6a87f5697de255cb9a494022dd7f7dbde8f5
---
A src/crypto/aes/asm_arm.s
A src/crypto/aes/cipher_arm.go
M src/crypto/aes/cipher_generic.go
3 files changed, 940 insertions(+), 0 deletions(-)
Uploaded patch set 5.
Here is the patch redone to use the g register, which means it is much closer to the openssl original code and hopefully much easier to review.
I've also addressed all the remaining issues, I think.
Filippo: Apologies for the delay in resubmitting - I finished this all bar one benchmark when I saw you at Gophercon and it kind of slipped off my radar since then!
Is there any interest in merging this for go 1.10? I feel the release freeze approaching ;-)
Ping Filippo and Cherry, I guess.
I'm afraid that we cannot derive work from OpenSSL's sources without respecting the OpenSSL license, but Go is 3-BSD licensed.
Patch set 5: Code-Review-2
Patch Set 5: Code-Review-2
I'm afraid that we cannot derive work from OpenSSL's sources without respecting the OpenSSL license, but Go is 3-BSD licensed.
The low-level assembler code in OpenSSL is licensed under the CRYPTOGAMS license: https://www.openssl.org/~appro/cryptogams/ which is a 3-clause BSD license (I think), which seems eminently compatible.
Also we already have code derived from OpenSSL, src/crypto/aes/asm_ppc64le.s and src/crypto/sha256/sha256block_ppc64le.s
Patch Set 5:
Patch Set 5: Code-Review-2
I'm afraid that we cannot derive work from OpenSSL's sources without respecting the OpenSSL license, but Go is 3-BSD licensed.
The low-level assembler code in OpenSSL is licensed under the CRYPTOGAMS license: https://www.openssl.org/~appro/cryptogams/ which is a 3-clause BSD license (I think), which seems eminently compatible.
Also we already have code derived from OpenSSL, src/crypto/aes/asm_ppc64le.s and src/crypto/sha256/sha256block_ppc64le.s
Yes, I believe those should be removed too. The comment in the code being added in this file (and in those) says that the license depends on where you get the code. Clearly you got the code from OpenSSL, since the code is not posted on the cryptogams site.
The comment in the code being added in this file (and in those) says that the license depends on where you get the code. Clearly you got the code from OpenSSL, since the code is not posted on the cryptogams site.
I agree that is unclear. I think the intention of the statement on the cryptogams site is clear, though: ap...@openssl.org evidently wants his work to have wider use. I emailed him to see if he could clarify the matter.
Patch Set 5:
The comment in the code being added in this file (and in those) says that the license depends on where you get the code. Clearly you got the code from OpenSSL, since the code is not posted on the cryptogams site.
I agree that is unclear. I think the intention of the statement on the cryptogams site is clear, though: ap...@openssl.org evidently wants his work to have wider use. I emailed him to see if he could clarify the matter.
Thanks very much. If fresh versions are posted on the cryptogams site then we should still remove these and start over from the cryptogams version (to make sure we're not picking up any OpenSSL contributions), but I don't think that should be too hard.