[PATCH 1/4] crypto: [sha] Intel SHA Extensions optimized SHA1 transform function

52 views
Skip to first unread message

Tim Chen

unread,
Sep 10, 2015, 6:27:13 PM9/10/15
to Herbert Xu, H. Peter Anvin, David S.Miller, Sean Gulley, Chandramouli Narayanan, Vinodh Gopal, James Guilford, Wajdi Feghali, Tim Chen, Jussi Kivilinna, linux-...@vger.kernel.org, linux-...@vger.kernel.org

This patch includes the Intel SHA Extensions optimized implementation
of SHA-1 update function. This function has been tested on Broxton
platform and measured a speed up of 3.6x over the SSSE3 implementation
for 4K blocks.

Originally-by: Chandramouli Narayanan <mouli...@yahoo.com>
Signed-off-by: Tim Chen <tim.c...@linux.intel.com>
---
arch/x86/crypto/sha1_ni_asm.S | 302 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 302 insertions(+)
create mode 100644 arch/x86/crypto/sha1_ni_asm.S

diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S
new file mode 100644
index 0000000..874a651
--- /dev/null
+++ b/arch/x86/crypto/sha1_ni_asm.S
@@ -0,0 +1,302 @@
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-1 update function
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Sean Gulley <sean.m...@intel.com>
+ * Tim Chen <tim.c...@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+
+#define DIGEST_PTR %rdi /* 1st arg */
+#define DATA_PTR %rsi /* 2nd arg */
+#define NUM_BLKS %rdx /* 3rd arg */
+
+#define RSPSAVE %rax
+
+/* gcc conversion */
+#define FRAME_SIZE 32 /* space for 2x16 bytes */
+
+#define ABCD %xmm0
+#define E0 %xmm1 /* Need two E's b/c they ping pong */
+#define E1 %xmm2
+#define MSG0 %xmm3
+#define MSG1 %xmm4
+#define MSG2 %xmm5
+#define MSG3 %xmm6
+#define SHUF_MASK %xmm7
+
+
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-1 update function
+ *
+ * The function takes a pointer to the current hash values, a pointer to the
+ * input data, and a number of 64 byte blocks to process. Once all blocks have
+ * been processed, the digest pointer is updated with the resulting hash value.
+ * The function only processes complete blocks, there is no functionality to
+ * store partial blocks. All message padding and hash value initialization must
+ * be done outside the update function.
+ *
+ * The indented lines in the loop are instructions related to rounds processing.
+ * The non-indented lines are instructions related to the message schedule.
+ *
+ * void sha1_ni_transform(uint32_t *digest, const void *data,
+ uint32_t numBlocks)
+ * digest : pointer to digest
+ * data: pointer to input data
+ * numBlocks: Number of blocks to process
+ */
+.text
+.align 32
+ENTRY(sha1_ni_transform)
+ mov %rsp, RSPSAVE
+ sub $FRAME_SIZE, %rsp
+ and $~0xF, %rsp
+
+ shl $6, NUM_BLKS /* convert to bytes */
+ jz .Ldone_hash
+ add DATA_PTR, NUM_BLKS /* pointer to end of data */
+
+ /* load initial hash values */
+ pinsrd $3, 1*16(DIGEST_PTR), E0
+ movdqu 0*16(DIGEST_PTR), ABCD
+ pand UPPER_WORD_MASK(%rip), E0
+ pshufd $0x1B, ABCD, ABCD
+
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+
+.Lloop0:
+ /* Save hash values for addition after rounds */
+ movdqa E0, (0*16)(%rsp)
+ movdqa ABCD, (1*16)(%rsp)
+
+ /* Rounds 0-3 */
+ movdqu 0*16(DATA_PTR), MSG0
+ pshufb SHUF_MASK, MSG0
+ paddd MSG0, E0
+ movdqa ABCD, E1
+ sha1rnds4 $0, E0, ABCD
+
+ /* Rounds 4-7 */
+ movdqu 1*16(DATA_PTR), MSG1
+ pshufb SHUF_MASK, MSG1
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1rnds4 $0, E1, ABCD
+ sha1msg1 MSG1, MSG0
+
+ /* Rounds 8-11 */
+ movdqu 2*16(DATA_PTR), MSG2
+ pshufb SHUF_MASK, MSG2
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1rnds4 $0, E0, ABCD
+ sha1msg1 MSG2, MSG1
+ pxor MSG2, MSG0
+
+ /* Rounds 12-15 */
+ movdqu 3*16(DATA_PTR), MSG3
+ pshufb SHUF_MASK, MSG3
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG3, MSG0
+ sha1rnds4 $0, E1, ABCD
+ sha1msg1 MSG3, MSG2
+ pxor MSG3, MSG1
+
+ /* Rounds 16-19 */
+ sha1nexte MSG0, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG0, MSG1
+ sha1rnds4 $0, E0, ABCD
+ sha1msg1 MSG0, MSG3
+ pxor MSG0, MSG2
+
+ /* Rounds 20-23 */
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG1, MSG2
+ sha1rnds4 $1, E1, ABCD
+ sha1msg1 MSG1, MSG0
+ pxor MSG1, MSG3
+
+ /* Rounds 24-27 */
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG2, MSG3
+ sha1rnds4 $1, E0, ABCD
+ sha1msg1 MSG2, MSG1
+ pxor MSG2, MSG0
+
+ /* Rounds 28-31 */
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG3, MSG0
+ sha1rnds4 $1, E1, ABCD
+ sha1msg1 MSG3, MSG2
+ pxor MSG3, MSG1
+
+ /* Rounds 32-35 */
+ sha1nexte MSG0, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG0, MSG1
+ sha1rnds4 $1, E0, ABCD
+ sha1msg1 MSG0, MSG3
+ pxor MSG0, MSG2
+
+ /* Rounds 36-39 */
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG1, MSG2
+ sha1rnds4 $1, E1, ABCD
+ sha1msg1 MSG1, MSG0
+ pxor MSG1, MSG3
+
+ /* Rounds 40-43 */
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG2, MSG3
+ sha1rnds4 $2, E0, ABCD
+ sha1msg1 MSG2, MSG1
+ pxor MSG2, MSG0
+
+ /* Rounds 44-47 */
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG3, MSG0
+ sha1rnds4 $2, E1, ABCD
+ sha1msg1 MSG3, MSG2
+ pxor MSG3, MSG1
+
+ /* Rounds 48-51 */
+ sha1nexte MSG0, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG0, MSG1
+ sha1rnds4 $2, E0, ABCD
+ sha1msg1 MSG0, MSG3
+ pxor MSG0, MSG2
+
+ /* Rounds 52-55 */
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG1, MSG2
+ sha1rnds4 $2, E1, ABCD
+ sha1msg1 MSG1, MSG0
+ pxor MSG1, MSG3
+
+ /* Rounds 56-59 */
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG2, MSG3
+ sha1rnds4 $2, E0, ABCD
+ sha1msg1 MSG2, MSG1
+ pxor MSG2, MSG0
+
+ /* Rounds 60-63 */
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG3, MSG0
+ sha1rnds4 $3, E1, ABCD
+ sha1msg1 MSG3, MSG2
+ pxor MSG3, MSG1
+
+ /* Rounds 64-67 */
+ sha1nexte MSG0, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG0, MSG1
+ sha1rnds4 $3, E0, ABCD
+ sha1msg1 MSG0, MSG3
+ pxor MSG0, MSG2
+
+ /* Rounds 68-71 */
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG1, MSG2
+ sha1rnds4 $3, E1, ABCD
+ pxor MSG1, MSG3
+
+ /* Rounds 72-75 */
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG2, MSG3
+ sha1rnds4 $3, E0, ABCD
+
+ /* Rounds 76-79 */
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1rnds4 $3, E1, ABCD
+
+ /* Add current hash values with previously saved */
+ sha1nexte (0*16)(%rsp), E0
+ paddd (1*16)(%rsp), ABCD
+
+ /* Increment data pointer and loop if more to process */
+ add $64, DATA_PTR
+ cmp NUM_BLKS, DATA_PTR
+ jne .Lloop0
+
+ /* Write hash values back in the correct order */
+ pshufd $0x1B, ABCD, ABCD
+ movdqu ABCD, 0*16(DIGEST_PTR)
+ pextrd $3, E0, 1*16(DIGEST_PTR)
+
+.Ldone_hash:
+ mov RSPSAVE, %rsp
+
+ ret
+ENDPROC(sha1_ni_transform)
+
+.data
+
+.align 64
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x000102030405060708090a0b0c0d0e0f
+UPPER_WORD_MASK:
+ .octa 0xFFFFFFFF000000000000000000000000
--
2.4.2



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Tim Chen

unread,
Sep 10, 2015, 6:27:25 PM9/10/15
to Herbert Xu, H. Peter Anvin, David S.Miller, Sean Gulley, Chandramouli Narayanan, Vinodh Gopal, James Guilford, Wajdi Feghali, Tim Chen, Jussi Kivilinna, linux-...@vger.kernel.org, linux-...@vger.kernel.org

This patch includes the Intel SHA Extensions optimized implementation
of SHA-256 update function. This function has been tested on Broxton
platform and measured a speed up of 3.6x over the SSSE3 implementation
for 4K blocks.

Originally-by: Chandramouli Narayanan <mouli...@yahoo.com>
Signed-off-by: Tim Chen <tim.c...@linux.intel.com>
---
arch/x86/crypto/sha256_ni_asm.S | 353 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 353 insertions(+)
create mode 100644 arch/x86/crypto/sha256_ni_asm.S

diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
new file mode 100644
index 0000000..748cdf2
--- /dev/null
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -0,0 +1,353 @@
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-256 update function
+#define SHA256CONSTANTS %rax
+
+#define MSG %xmm0
+#define STATE0 %xmm1
+#define STATE1 %xmm2
+#define MSGTMP0 %xmm3
+#define MSGTMP1 %xmm4
+#define MSGTMP2 %xmm5
+#define MSGTMP3 %xmm6
+#define MSGTMP4 %xmm7
+
+#define SHUF_MASK %xmm8
+
+#define ABEF_SAVE %xmm9
+#define CDGH_SAVE %xmm10
+
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-256 update function
+ *
+ * The function takes a pointer to the current hash values, a pointer to the
+ * input data, and a number of 64 byte blocks to process. Once all blocks have
+ * been processed, the digest pointer is updated with the resulting hash value.
+ * The function only processes complete blocks, there is no functionality to
+ * store partial blocks. All message padding and hash value initialization must
+ * be done outside the update function.
+ *
+ * The indented lines in the loop are instructions related to rounds processing.
+ * The non-indented lines are instructions related to the message schedule.
+ *
+ * void sha256_ni_transform(uint32_t *digest, const void *data,
+ uint32_t numBlocks);
+ * digest : pointer to digest
+ * data: pointer to input data
+ * numBlocks: Number of blocks to process
+ */
+
+.text
+.align 32
+ENTRY(sha256_ni_transform)
+
+ shl $6, NUM_BLKS /* convert to bytes */
+ jz .Ldone_hash
+ add DATA_PTR, NUM_BLKS /* pointer to end of data */
+
+ /*
+ * load initial hash values
+ * Need to reorder these appropriately
+ * DCBA, HGFE -> ABEF, CDGH
+ */
+ movdqu 0*16(DIGEST_PTR), STATE0
+ movdqu 1*16(DIGEST_PTR), STATE1
+
+ pshufd $0xB1, STATE0, STATE0 /* CDAB */
+ pshufd $0x1B, STATE1, STATE1 /* EFGH */
+ movdqa STATE0, MSGTMP4
+ palignr $8, STATE1, STATE0 /* ABEF */
+ pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */
+
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+ lea K256(%rip), SHA256CONSTANTS
+
+.Lloop0:
+ /* Save hash values for addition after rounds */
+ movdqa STATE0, ABEF_SAVE
+ movdqa STATE1, CDGH_SAVE
+
+ /* Rounds 0-3 */
+ movdqu 0*16(DATA_PTR), MSG
+ pshufb SHUF_MASK, MSG
+ movdqa MSG, MSGTMP0
+ paddd 0*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+
+ /* Rounds 4-7 */
+ movdqu 1*16(DATA_PTR), MSG
+ pshufb SHUF_MASK, MSG
+ movdqa MSG, MSGTMP1
+ paddd 1*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP1, MSGTMP0
+
+ /* Rounds 8-11 */
+ movdqu 2*16(DATA_PTR), MSG
+ pshufb SHUF_MASK, MSG
+ movdqa MSG, MSGTMP2
+ paddd 2*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP2, MSGTMP1
+
+ /* Rounds 12-15 */
+ movdqu 3*16(DATA_PTR), MSG
+ pshufb SHUF_MASK, MSG
+ movdqa MSG, MSGTMP3
+ paddd 3*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP3, MSGTMP4
+ palignr $4, MSGTMP2, MSGTMP4
+ paddd MSGTMP4, MSGTMP0
+ sha256msg2 MSGTMP3, MSGTMP0
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP3, MSGTMP2
+
+ /* Rounds 16-19 */
+ movdqa MSGTMP0, MSG
+ paddd 4*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP0, MSGTMP4
+ palignr $4, MSGTMP3, MSGTMP4
+ paddd MSGTMP4, MSGTMP1
+ sha256msg2 MSGTMP0, MSGTMP1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP0, MSGTMP3
+
+ /* Rounds 20-23 */
+ movdqa MSGTMP1, MSG
+ paddd 5*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP1, MSGTMP4
+ palignr $4, MSGTMP0, MSGTMP4
+ paddd MSGTMP4, MSGTMP2
+ sha256msg2 MSGTMP1, MSGTMP2
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP1, MSGTMP0
+
+ /* Rounds 24-27 */
+ movdqa MSGTMP2, MSG
+ paddd 6*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP2, MSGTMP4
+ palignr $4, MSGTMP1, MSGTMP4
+ paddd MSGTMP4, MSGTMP3
+ sha256msg2 MSGTMP2, MSGTMP3
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP2, MSGTMP1
+
+ /* Rounds 28-31 */
+ movdqa MSGTMP3, MSG
+ paddd 7*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP3, MSGTMP4
+ palignr $4, MSGTMP2, MSGTMP4
+ paddd MSGTMP4, MSGTMP0
+ sha256msg2 MSGTMP3, MSGTMP0
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP3, MSGTMP2
+
+ /* Rounds 32-35 */
+ movdqa MSGTMP0, MSG
+ paddd 8*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP0, MSGTMP4
+ palignr $4, MSGTMP3, MSGTMP4
+ paddd MSGTMP4, MSGTMP1
+ sha256msg2 MSGTMP0, MSGTMP1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP0, MSGTMP3
+
+ /* Rounds 36-39 */
+ movdqa MSGTMP1, MSG
+ paddd 9*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP1, MSGTMP4
+ palignr $4, MSGTMP0, MSGTMP4
+ paddd MSGTMP4, MSGTMP2
+ sha256msg2 MSGTMP1, MSGTMP2
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP1, MSGTMP0
+
+ /* Rounds 40-43 */
+ movdqa MSGTMP2, MSG
+ paddd 10*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP2, MSGTMP4
+ palignr $4, MSGTMP1, MSGTMP4
+ paddd MSGTMP4, MSGTMP3
+ sha256msg2 MSGTMP2, MSGTMP3
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP2, MSGTMP1
+
+ /* Rounds 44-47 */
+ movdqa MSGTMP3, MSG
+ paddd 11*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP3, MSGTMP4
+ palignr $4, MSGTMP2, MSGTMP4
+ paddd MSGTMP4, MSGTMP0
+ sha256msg2 MSGTMP3, MSGTMP0
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP3, MSGTMP2
+
+ /* Rounds 48-51 */
+ movdqa MSGTMP0, MSG
+ paddd 12*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP0, MSGTMP4
+ palignr $4, MSGTMP3, MSGTMP4
+ paddd MSGTMP4, MSGTMP1
+ sha256msg2 MSGTMP0, MSGTMP1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP0, MSGTMP3
+
+ /* Rounds 52-55 */
+ movdqa MSGTMP1, MSG
+ paddd 13*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP1, MSGTMP4
+ palignr $4, MSGTMP0, MSGTMP4
+ paddd MSGTMP4, MSGTMP2
+ sha256msg2 MSGTMP1, MSGTMP2
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+
+ /* Rounds 56-59 */
+ movdqa MSGTMP2, MSG
+ paddd 14*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP2, MSGTMP4
+ palignr $4, MSGTMP1, MSGTMP4
+ paddd MSGTMP4, MSGTMP3
+ sha256msg2 MSGTMP2, MSGTMP3
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+
+ /* Rounds 60-63 */
+ movdqa MSGTMP3, MSG
+ paddd 15*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+
+ /* Add current hash values with previously saved */
+ paddd ABEF_SAVE, STATE0
+ paddd CDGH_SAVE, STATE1
+
+ /* Increment data pointer and loop if more to process */
+ add $64, DATA_PTR
+ cmp NUM_BLKS, DATA_PTR
+ jne .Lloop0
+
+ /* Write hash values back in the correct order */
+ pshufd $0x1B, STATE0, STATE0 /* FEBA */
+ pshufd $0xB1, STATE1, STATE1 /* DCHG */
+ movdqa STATE0, MSGTMP4
+ pblendw $0xF0, STATE1, STATE0 /* DCBA */
+ palignr $8, MSGTMP4, STATE1 /* HGFE */
+
+ movdqu STATE0, 0*16(DIGEST_PTR)
+ movdqu STATE1, 1*16(DIGEST_PTR)
+
+.Ldone_hash:
+
+ ret
+ENDPROC(sha256_ni_transform)
+
+.data
+.align 64
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203

Tim Chen

unread,
Sep 10, 2015, 6:27:29 PM9/10/15
to Herbert Xu, H. Peter Anvin, David S.Miller, Sean Gulley, Chandramouli Narayanan, Vinodh Gopal, James Guilford, Wajdi Feghali, Tim Chen, Jussi Kivilinna, linux-...@vger.kernel.org, linux-...@vger.kernel.org

This patch adds the glue code to detect and utilize the Intel SHA
extensions optimized SHA1 and SHA256 update transforms when available.

This code has been tested on Broxton for functionality.

Originally-by: Chandramouli Narayanan <mouli...@yahoo.com>
Signed-off-by: Tim Chen <tim.c...@linux.intel.com>
---
arch/x86/crypto/sha1_ssse3_glue.c | 12 +++++++++++-
arch/x86/crypto/sha256_ssse3_glue.c | 38 ++++++++++++++++++++++---------------
2 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 7c48e8b..98be8cc 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -44,6 +44,10 @@ asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
unsigned int rounds);
#endif
+#ifdef CONFIG_AS_SHA1_NI
+asmlinkage void sha1_ni_transform(u32 *digest, const char *data,
+ unsigned int rounds);
+#endif

static void (*sha1_transform_asm)(u32 *, const char *, unsigned int);

@@ -166,12 +170,18 @@ static int __init sha1_ssse3_mod_init(void)
#endif
}
#endif
+#ifdef CONFIG_AS_SHA1_NI
+ if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
+ sha1_transform_asm = sha1_ni_transform;
+ algo_name = "SHA-NI";
+ }
+#endif

if (sha1_transform_asm) {
pr_info("Using %s optimized SHA-1 implementation\n", algo_name);
return crypto_register_shash(&alg);
}
- pr_info("Neither AVX nor AVX2 nor SSSE3 is available/usable.\n");
+ pr_info("Neither AVX nor AVX2 nor SSSE3/SHA-NI is available/usable.\n");

return -ENODEV;
}
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index f8097fc..9c7b22c 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -50,6 +50,10 @@ asmlinkage void sha256_transform_avx(u32 *digest, const char *data,
asmlinkage void sha256_transform_rorx(u32 *digest, const char *data,
u64 rounds);
#endif
+#ifdef CONFIG_AS_SHA256_NI
+asmlinkage void sha256_ni_transform(u32 *digest, const char *data,
+ u64 rounds); /*unsigned int rounds);*/
+#endif

static void (*sha256_transform_asm)(u32 *, const char *, u64);

@@ -142,36 +146,40 @@ static bool __init avx_usable(void)

static int __init sha256_ssse3_mod_init(void)
{
+ char *algo;
+
/* test for SSSE3 first */
- if (cpu_has_ssse3)
+ if (cpu_has_ssse3) {
sha256_transform_asm = sha256_transform_ssse3;
+ algo = "SSSE3";
+ }

#ifdef CONFIG_AS_AVX
/* allow AVX to override SSSE3, it's a little faster */
if (avx_usable()) {
+ sha256_transform_asm = sha256_transform_avx;
+ algo = "AVX";
#ifdef CONFIG_AS_AVX2
- if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2))
+ if (boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_BMI2)) {
sha256_transform_asm = sha256_transform_rorx;
- else
+ algo = "AVX2";
+ }
+#endif
+ }
#endif
- sha256_transform_asm = sha256_transform_avx;
+#ifdef CONFIG_AS_SHA256_NI
+ if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
+ sha256_transform_asm = sha256_ni_transform;
+ algo = "SHA-256-NI";
}
#endif

if (sha256_transform_asm) {
-#ifdef CONFIG_AS_AVX
- if (sha256_transform_asm == sha256_transform_avx)
- pr_info("Using AVX optimized SHA-256 implementation\n");
-#ifdef CONFIG_AS_AVX2
- else if (sha256_transform_asm == sha256_transform_rorx)
- pr_info("Using AVX2 optimized SHA-256 implementation\n");
-#endif
- else
-#endif
- pr_info("Using SSSE3 optimized SHA-256 implementation\n");
+ pr_info("Using %s optimized SHA-256 implementation\n", algo);
return crypto_register_shashes(algs, ARRAY_SIZE(algs));
}
- pr_info("Neither AVX nor SSSE3 is available/usable.\n");
+ pr_info("Neither AVX nor SSSE3/SHA-NI is available/usable.\n");

return -ENODEV;

Tim Chen

unread,
Sep 10, 2015, 6:27:38 PM9/10/15
to Herbert Xu, H. Peter Anvin, David S.Miller, Sean Gulley, Chandramouli Narayanan, Vinodh Gopal, James Guilford, Wajdi Feghali, Tim Chen, Jussi Kivilinna, linux-...@vger.kernel.org, linux-...@vger.kernel.org

This patch provides the configuration and build support to
include and build the optimized SHA1 and SHA256 update transforms
for the kernel's crypto library.

Originally-by: Chandramouli Narayanan <mouli...@yahoo.com>
Signed-off-by: Tim Chen <tim.c...@linux.intel.com>
---
arch/x86/Makefile | 6 ++++--
arch/x86/crypto/Makefile | 8 ++++++++
crypto/Kconfig | 10 ++++++----
3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 747860c..a8009c7 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -165,9 +165,11 @@ asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1)
avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
+sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
+sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)

-KBUILD_AFLAGS += $(asinstr) $(avx_instr) $(avx2_instr)
-KBUILD_CFLAGS += $(asinstr) $(avx_instr) $(avx2_instr)
+KBUILD_AFLAGS += $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr)
+KBUILD_CFLAGS += $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr)

LDFLAGS := -m elf_$(UTS_MACHINE)

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 9a2838c..b9b912a 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -5,6 +5,8 @@
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
+sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
+sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)

obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o

@@ -91,9 +93,15 @@ ifeq ($(avx2_supported),yes)
sha1-ssse3-y += sha1_avx2_x86_64_asm.o
poly1305-x86_64-y += poly1305-avx2-x86_64.o
endif
+ifeq ($(sha1_ni_supported),yes)
+sha1-ssse3-y += sha1_ni_asm.o
+endif
crc32c-intel-y := crc32c-intel_glue.o
crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
+ifeq ($(sha256_ni_supported),yes)
+sha256-ssse3-y += sha256_ni_asm.o
+endif
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 48ee3e1..fc93444 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -597,17 +597,18 @@ config CRYPTO_SHA1
SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2).

config CRYPTO_SHA1_SSSE3
- tristate "SHA1 digest algorithm (SSSE3/AVX/AVX2)"
+ tristate "SHA1 digest algorithm (SSSE3/AVX/AVX2/SHA-NI)"
depends on X86 && 64BIT
select CRYPTO_SHA1
select CRYPTO_HASH
help
SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2) implemented
using Supplemental SSE3 (SSSE3) instructions or Advanced Vector
- Extensions (AVX/AVX2), when available.
+ Extensions (AVX/AVX2) or SHA-NI (SHA Extensions New Instructions),
+ when available.

config CRYPTO_SHA256_SSSE3
- tristate "SHA256 digest algorithm (SSSE3/AVX/AVX2)"
+ tristate "SHA256 digest algorithm (SSSE3/AVX/AVX2/SHA-NI)"
depends on X86 && 64BIT
select CRYPTO_SHA256
select CRYPTO_HASH
@@ -615,7 +616,8 @@ config CRYPTO_SHA256_SSSE3
SHA-256 secure hash standard (DFIPS 180-2) implemented
using Supplemental SSE3 (SSSE3) instructions, or Advanced Vector
Extensions version 1 (AVX1), or Advanced Vector Extensions
- version 2 (AVX2) instructions, when available.
+ version 2 (AVX2) instructions, or SHA-NI (SHA Extensions New
+ Instructions) when available.

config CRYPTO_SHA512_SSSE3
tristate "SHA512 digest algorithm (SSSE3/AVX/AVX2)"

Stephan Mueller

unread,
Sep 10, 2015, 6:52:33 PM9/10/15
to Tim Chen, Herbert Xu, H. Peter Anvin, David S.Miller, Sean Gulley, Chandramouli Narayanan, Vinodh Gopal, James Guilford, Wajdi Feghali, Jussi Kivilinna, linux-...@vger.kernel.org, linux-...@vger.kernel.org
Am Donnerstag, 10. September 2015, 15:27:20 schrieb Tim Chen:

Hi Tim,

>This patch adds the glue code to detect and utilize the Intel SHA
>extensions optimized SHA1 and SHA256 update transforms when available.
>
>This code has been tested on Broxton for functionality.

A general comment on this file: shouldn't this file be cleaned and use the
standard mechanisms of the kernel crypto API?

This glue implements its own selection of which SHA implementation to use. But
the kernel crypto API implements that logic already. The issue with the
current implementation in this file is that you have no clue which particular
implementation of SHA is in use in one particular case.

So, may I suggest a restructuring to define independent instances of SHA, such
as

- cra_name == "sha1", cra_driver_name="sha1_ssse3", cra_priority=300
- cra_name == "sha1", cra_driver_name="sha1_avx", cra_priority=400
- cra_name == "sha1", cra_driver_name="sha1_avx2", cra_priority=500
- cra_name == "sha1", cra_driver_name="sha1_shavx", cra_priority=600

Similarly for the other SHAs?

In all the register functions for the ciphers, you can bail out if the
hardware does not support an implementation.

Ciao
Stephan

Tim Chen

unread,
Sep 10, 2015, 8:04:39 PM9/10/15
to Stephan Mueller, Herbert Xu, H. Peter Anvin, David S.Miller, Sean Gulley, Chandramouli Narayanan, Vinodh Gopal, James Guilford, Wajdi Feghali, Jussi Kivilinna, linux-...@vger.kernel.org, linux-...@vger.kernel.org
On Fri, 2015-09-11 at 00:52 +0200, Stephan Mueller wrote:
> Am Donnerstag, 10. September 2015, 15:27:20 schrieb Tim Chen:
>
> Hi Tim,
>
> >This patch adds the glue code to detect and utilize the Intel SHA
> >extensions optimized SHA1 and SHA256 update transforms when available.
> >
> >This code has been tested on Broxton for functionality.
>
> A general comment on this file: shouldn't this file be cleaned and use the
> standard mechanisms of the kernel crypto API?
>
> This glue implements its own selection of which SHA implementation to use. But
> the kernel crypto API implements that logic already. The issue with the
> current implementation in this file is that you have no clue which particular
> implementation of SHA is in use in one particular case.
>
> So, may I suggest a restructuring to define independent instances of SHA, such
> as
>
> - cra_name == "sha1", cra_driver_name="sha1_ssse3", cra_priority=300
> - cra_name == "sha1", cra_driver_name="sha1_avx", cra_priority=400
> - cra_name == "sha1", cra_driver_name="sha1_avx2", cra_priority=500
> - cra_name == "sha1", cra_driver_name="sha1_shavx", cra_priority=600
>
> Similarly for the other SHAs?
>
> In all the register functions for the ciphers, you can bail out if the
> hardware does not support an implementation.

Stephan,

Is there a scenario you can think of
when a lower performing sha1 transform needs to
be exposed as a separate driver?

Otherwise the glue code logic will only expose the
best performing one for a cpu and hide the others, which was intentional
on our part to prevent a lower performing sha from getting used.

Tim

Stephan Mueller

unread,
Sep 11, 2015, 1:02:18 PM9/11/15
to Tim Chen, Herbert Xu, H. Peter Anvin, David S.Miller, Sean Gulley, Chandramouli Narayanan, Vinodh Gopal, James Guilford, Wajdi Feghali, Jussi Kivilinna, linux-...@vger.kernel.org, linux-...@vger.kernel.org
Am Donnerstag, 10. September 2015, 17:04:31 schrieb Tim Chen:

Hi Tim,
>
>Is there a scenario you can think of
>when a lower performing sha1 transform needs to
>be exposed as a separate driver?

My immediate concern is testing: it is hard to test the individual
implementations.
>
>Otherwise the glue code logic will only expose the
>best performing one for a cpu and hide the others, which was intentional
>on our part to prevent a lower performing sha from getting used.

Agreed, but the kernel crypto API does that already using the priorities --
IMHO a very clean and easy to interpret solution.

Furthermore, if somebody really has a need to not use the fastest HW
implementation, the kernel crypto API allows him to do that. With the hard-
wired approach in the glue file, you are stuck.


Ciao
Stephan

Tim Chen

unread,
Sep 11, 2015, 2:49:42 PM9/11/15
to Stephan Mueller, Herbert Xu, H. Peter Anvin, David S.Miller, Sean Gulley, Chandramouli Narayanan, Vinodh Gopal, James Guilford, Wajdi Feghali, Jussi Kivilinna, linux-...@vger.kernel.org, linux-...@vger.kernel.org
On Fri, 2015-09-11 at 19:02 +0200, Stephan Mueller wrote:
> Am Donnerstag, 10. September 2015, 17:04:31 schrieb Tim Chen:
>
> Hi Tim,
> >
> >Is there a scenario you can think of
> >when a lower performing sha1 transform needs to
> >be exposed as a separate driver?
>
> My immediate concern is testing: it is hard to test the individual
> implementations.
> >

Not hard, just one line in the glue code to set the transform
to the one you need if you really want to test an individual
implementation. Usually users of sha don't care which sha driver
they got, but just the highest priority one.
So you will anyway need to patch and change the priority of the sha
driver to expose a specific one for testing.

> >Otherwise the glue code logic will only expose the
> >best performing one for a cpu and hide the others, which was intentional
> >on our part to prevent a lower performing sha from getting used.
>
> Agreed, but the kernel crypto API does that already using the priorities --
> IMHO a very clean and easy to interpret solution.
>
> Furthermore, if somebody really has a need to not use the fastest HW
> implementation, the kernel crypto API allows him to do that. With the hard-
> wired approach in the glue file, you are stuck.

Still, why would some kernel module specifically not want to
use the fastest HW implementation, and explicitly ask for
a slower driver?

Tim

David Miller

unread,
Sep 11, 2015, 3:15:53 PM9/11/15
to tim.c...@linux.intel.com, smue...@chronox.de, her...@gondor.apana.org.au, h...@zytor.com, sean.m...@intel.com, mouli...@yahoo.com, vinodh...@intel.com, james.g...@intel.com, wajdi.k...@intel.com, jussi.k...@iki.fi, linux-...@vger.kernel.org, linux-...@vger.kernel.org
From: Tim Chen <tim.c...@linux.intel.com>
Date: Fri, 11 Sep 2015 11:49:32 -0700

> Still, why would some kernel module specifically not want to
> use the fastest HW implementation, and explicitly ask for
> a slower driver?

Temporary workaround if a bug is found.

There is really no reason to prevent the user from having this
flexibility, and in return anyone can test any implementation
their cpu can support.

Stephan Mueller

unread,
Sep 11, 2015, 3:15:58 PM9/11/15
to Tim Chen, Herbert Xu, H. Peter Anvin, David S.Miller, Sean Gulley, Chandramouli Narayanan, Vinodh Gopal, James Guilford, Wajdi Feghali, Jussi Kivilinna, linux-...@vger.kernel.org, linux-...@vger.kernel.org
Am Freitag, 11. September 2015, 11:49:32 schrieb Tim Chen:

Hi Tim,

>On Fri, 2015-09-11 at 19:02 +0200, Stephan Mueller wrote:
>> Am Donnerstag, 10. September 2015, 17:04:31 schrieb Tim Chen:
>>
>> Hi Tim,
>>
>> >Is there a scenario you can think of
>> >when a lower performing sha1 transform needs to
>> >be exposed as a separate driver?
>>
>> My immediate concern is testing: it is hard to test the individual
>> implementations.
>
>Not hard, just one line in the glue code to set the transform
>to the one you need if you really want to test an individual
>implementation. Usually users of sha don't care which sha driver
>they get, but just the highest priority one.
>So you will anyway need to patch and change the priority of the sha
>driver to expose a specific one for testing.

Sure, it is not hard when you recompile. But when you have to test one given
kernel binary, it is a challenge.
>
>> >Otherwise the glue code logic will only expose the
>> >best performing one for a cpu and hide the others, which was intentional
>> >on our part to prevent a lower performing sha from getting used.
>>
>> Agreed, but the kernel crypto API does that already using the priorities --
>> IMHO a very clean and easy to interpret solution.
>>
>> Furthermore, if somebody really has a need to not use the fastest HW
>> implementation, the kernel crypto API allows him to do that. With the hard-
>> wired approach in the glue file, you are stuck.
>
>Still, why would some kernel module specifically not want to
>use the fastest HW implementation, and explicitly ask for
>a slower driver?

I have seen one instance where a hardware driver was broken on one particular
hardware platform. So, the only way was to disable it. In our case here, disabling
means to go back to the software implementation of SHA.

Ciao
Stephan

Tim Chen

unread,
Sep 11, 2015, 4:10:36 PM9/11/15
to David Miller, smue...@chronox.de, her...@gondor.apana.org.au, h...@zytor.com, sean.m...@intel.com, mouli...@yahoo.com, vinodh...@intel.com, james.g...@intel.com, wajdi.k...@intel.com, jussi.k...@iki.fi, linux-...@vger.kernel.org, linux-...@vger.kernel.org
On Fri, 2015-09-11 at 12:15 -0700, David Miller wrote:
> From: Tim Chen <tim.c...@linux.intel.com>
> Date: Fri, 11 Sep 2015 11:49:32 -0700
>
> > Still, why would some kernel module specifically not want to
> > use the fastest HW implementation, and explicitly ask for
> > a slower driver?
>
> Temporary workaround if a bug is found.
>
> There is really no reason to prevent the user from having this
> flexibility, and in return anyone can test any implementation
> their cpu can support.

Mmmm..., this is a restructuring of the algorithms within
the glue code into multiple drivers instead of one and exposing
them all. It is a bit orthogonal to the intention of this
patch set. I think it is better that I create a
separate patch on the glue code on top of this patch set
to implement this.

Herbert, do you agree with this approach?

Tim

Herbert Xu

unread,
Sep 12, 2015, 4:10:24 AM9/12/15
to Tim Chen, David Miller, smue...@chronox.de, h...@zytor.com, sean.m...@intel.com, mouli...@yahoo.com, vinodh...@intel.com, james.g...@intel.com, wajdi.k...@intel.com, jussi.k...@iki.fi, linux-...@vger.kernel.org, linux-...@vger.kernel.org
On Fri, Sep 11, 2015 at 01:10:27PM -0700, Tim Chen wrote:
>
> Mmmm..., this is a restructuring of the algorithms within
> the glue code into multiple drivers instead of one and exposing
> them all. It is a bit orthogonal to the intention of this
> patch set. I think it is better that I create a
> separate patch on the glue code on top of this patch set
> to implement this.
>
> Herbert, do you agree with this approach?

Yes I think we can work on the individual crypto registration
separately from this patch-set.

Thanks,
--
Email: Herbert Xu <her...@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
Reply all
Reply to author
Forward
0 new messages