From: Tong Tiangen <
tongt...@huawei.com>
The copy_mc_to_kernel() helper is memory copy implementation that handles
source exceptions. It can be used in memory copy scenarios that tolerate
hardware memory errors(e.g: pmem_read/dax_copy_to_iter).
Currently, only x86 and ppc support this helper, Add this for ARM64 as
well, if ARCH_HAS_COPY_MC is defined, by implementing copy_mc_to_kernel()
and memcpy_mc() functions.
Because there is no caller-saved GPR is available for saving "bytes not
copied" in memcpy(), the memcpy_mc() is referenced to the implementation
of copy_from_user(). In addition, the fixup of MOPS insn is not considered
at present.
[Ruidong: refactor memcpy_mc on top of the new memcpy implementation.]
Signed-off-by: Tong Tiangen <
tongt...@huawei.com>
arch/arm64/include/asm/string.h | 5 +
arch/arm64/include/asm/uaccess.h | 17 +++
arch/arm64/lib/Makefile | 2 +-
arch/arm64/lib/memcpy.S | 253 +++----------------------------
arch/arm64/lib/memcpy_mc.S | 56 +++++++
arch/arm64/lib/memcpy_template.S | 249 ++++++++++++++++++++++++++++++
mm/kasan/shadow.c | 12 ++
7 files changed, 359 insertions(+), 235 deletions(-)
create mode 100644 arch/arm64/lib/memcpy_mc.S
create mode 100644 arch/arm64/lib/memcpy_template.S
diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h
index 3a3264ff47b9..23eca4fb24fa 100644
--- a/arch/arm64/include/asm/string.h
+++ b/arch/arm64/include/asm/string.h
@@ -35,6 +35,10 @@ extern void *memchr(const void *, int, __kernel_size_t);
extern void *memcpy(void *, const void *, __kernel_size_t);
extern void *__memcpy(void *, const void *, __kernel_size_t);
+#define __HAVE_ARCH_MEMCPY_MC
+extern int memcpy_mc(void *, const void *, __kernel_size_t);
+extern int __memcpy_mc(void *, const void *, __kernel_size_t);
+
#define __HAVE_ARCH_MEMMOVE
extern void *memmove(void *, const void *, __kernel_size_t);
extern void *__memmove(void *, const void *, __kernel_size_t);
@@ -57,6 +61,7 @@ void memcpy_flushcache(void *dst, const void *src, size_t cnt);
*/
#define memcpy(dst, src, len) __memcpy(dst, src, len)
+#define memcpy_mc(dst, src, len) __memcpy_mc(dst, src, len)
#define memmove(dst, src, len) __memmove(dst, src, len)
#define memset(s, c, n) __memset(s, c, n)
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index b0c83a08dda9..93277eca2268 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -499,5 +499,22 @@ static inline size_t probe_subpage_writeable(const char __user *uaddr,
}
#endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */
+#ifdef CONFIG_ARCH_HAS_COPY_MC
+/**
+ * copy_mc_to_kernel - memory copy that handles source exceptions
+ *
+ * @to: destination address
+ * @from: source address
+ * @size: number of bytes to copy
+ *
+ * Return 0 for success, or bytes not copied.
+ */
+static inline unsigned long __must_check
+copy_mc_to_kernel(void *to, const void *from, unsigned long size)
+{
+ return memcpy_mc(to, from, size);
+}
+#define copy_mc_to_kernel copy_mc_to_kernel
+#endif
#endif /* __ASM_UACCESS_H */
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 1f4c3f743a20..a5820e6c33d4 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -7,7 +7,7 @@ lib-y := clear_user.o delay.o copy_from_user.o \
lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o
-lib-$(CONFIG_ARCH_HAS_COPY_MC) += copy_mc_page.o
+lib-$(CONFIG_ARCH_HAS_COPY_MC) += copy_mc_page.o memcpy_mc.o
obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index 9b99106fb95f..ef6aea2de9b4 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -15,247 +15,32 @@
*
*/
-#define L(label) .L ## label
+ .macro ldrb1 reg, addr:vararg
+ ldrb \reg, \addr
+ .endm
-#define dstin x0
-#define src x1
-#define count x2
-#define dst x3
-#define srcend x4
-#define dstend x5
-#define A_l x6
-#define A_lw w6
-#define A_h x7
-#define B_l x8
-#define B_lw w8
-#define B_h x9
-#define C_l x10
-#define C_lw w10
-#define C_h x11
-#define D_l x12
-#define D_h x13
-#define E_l x14
-#define E_h x15
-#define F_l x16
-#define F_h x17
-#define G_l count
-#define G_h dst
-#define H_l src
-#define H_h srcend
-#define tmp1 x14
+ .macro ldr1 reg, addr:vararg
+ ldr \reg, \addr
+ .endm
-/* This implementation handles overlaps and supports both memcpy and memmove
- from a single entry point. It uses unaligned accesses and branchless
- sequences to keep the code small, simple and improve performance.
+ .macro ldp1 reg1, reg2, addr:vararg
+ ldp \reg1, \reg2, \addr
+ .endm
- Copies are split into 3 main cases: small copies of up to 32 bytes, medium
- copies of up to 128 bytes, and large copies. The overhead of the overlap
- check is negligible since it is only required for large copies.
+ .macro ret1
+ ret
+ .endm
- Large copies use a software pipelined loop processing 64 bytes per iteration.
- The destination pointer is 16-byte aligned to minimize unaligned accesses.
- The loop tail is handled by always copying 64 bytes from the end.
-*/
+ .macro cpy1 dst, src, count
+ .arch_extension mops
+ cpyp [\dst]!, [\src]!, \count!
+ cpym [\dst]!, [\src]!, \count!
+ cpye [\dst]!, [\src]!, \count!
+ .endm
-SYM_FUNC_START_LOCAL(__pi_memcpy_generic)
- add srcend, src, count
- add dstend, dstin, count
- cmp count, 128
- b.hi L(copy_long)
- cmp count, 32
- b.hi L(copy32_128)
-
- /* Small copies: 0..32 bytes. */
- cmp count, 16
- b.lo L(copy16)
- ldp A_l, A_h, [src]
- ldp D_l, D_h, [srcend, -16]
- stp A_l, A_h, [dstin]
- stp D_l, D_h, [dstend, -16]
- ret
-
- /* Copy 8-15 bytes. */
-L(copy16):
- tbz count, 3, L(copy8)
- ldr A_l, [src]
- ldr A_h, [srcend, -8]
- str A_l, [dstin]
- str A_h, [dstend, -8]
- ret
-
- .p2align 3
- /* Copy 4-7 bytes. */
-L(copy8):
- tbz count, 2, L(copy4)
- ldr A_lw, [src]
- ldr B_lw, [srcend, -4]
- str A_lw, [dstin]
- str B_lw, [dstend, -4]
- ret
-
- /* Copy 0..3 bytes using a branchless sequence. */
-L(copy4):
- cbz count, L(copy0)
- lsr tmp1, count, 1
- ldrb A_lw, [src]
- ldrb C_lw, [srcend, -1]
- ldrb B_lw, [src, tmp1]
- strb A_lw, [dstin]
- strb B_lw, [dstin, tmp1]
- strb C_lw, [dstend, -1]
-L(copy0):
- ret
-
- .p2align 4
- /* Medium copies: 33..128 bytes. */
-L(copy32_128):
- ldp A_l, A_h, [src]
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
- ldp D_l, D_h, [srcend, -16]
- cmp count, 64
- b.hi L(copy128)
- stp A_l, A_h, [dstin]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
- stp D_l, D_h, [dstend, -16]
- ret
-
- .p2align 4
- /* Copy 65..128 bytes. */
-L(copy128):
- ldp E_l, E_h, [src, 32]
- ldp F_l, F_h, [src, 48]
- cmp count, 96
-
b.ls L(copy96)
- ldp G_l, G_h, [srcend, -64]
- ldp H_l, H_h, [srcend, -48]
- stp G_l, G_h, [dstend, -64]
- stp H_l, H_h, [dstend, -48]
-L(copy96):
- stp A_l, A_h, [dstin]
- stp B_l, B_h, [dstin, 16]
- stp E_l, E_h, [dstin, 32]
- stp F_l, F_h, [dstin, 48]
- stp C_l, C_h, [dstend, -32]
- stp D_l, D_h, [dstend, -16]
- ret
-
- .p2align 4
- /* Copy more than 128 bytes. */
-L(copy_long):
- /* Use backwards copy if there is an overlap. */
- sub tmp1, dstin, src
- cbz tmp1, L(copy0)
- cmp tmp1, count
- b.lo L(copy_long_backwards)
-
- /* Copy 16 bytes and then align dst to 16-byte alignment. */
-
- ldp D_l, D_h, [src]
- and tmp1, dstin, 15
- bic dst, dstin, 15
- sub src, src, tmp1
- add count, count, tmp1 /* Count is now 16 too large. */
- ldp A_l, A_h, [src, 16]
- stp D_l, D_h, [dstin]
- ldp B_l, B_h, [src, 32]
- ldp C_l, C_h, [src, 48]
- ldp D_l, D_h, [src, 64]!
- subs count, count, 128 + 16 /* Test and readjust count. */
-
b.ls L(copy64_from_end)
-
-L(loop64):
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [src, 16]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [src, 32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [src, 48]
- stp D_l, D_h, [dst, 64]!
- ldp D_l, D_h, [src, 64]!
- subs count, count, 64
- b.hi L(loop64)
-
- /* Write the last iteration and copy 64 bytes from the end. */
-L(copy64_from_end):
- ldp E_l, E_h, [srcend, -64]
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [srcend, -48]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [srcend, -32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [srcend, -16]
- stp D_l, D_h, [dst, 64]
- stp E_l, E_h, [dstend, -64]
- stp A_l, A_h, [dstend, -48]
- stp B_l, B_h, [dstend, -32]
- stp C_l, C_h, [dstend, -16]
- ret
-
- .p2align 4
-
- /* Large backwards copy for overlapping copies.
- Copy 16 bytes and then align dst to 16-byte alignment. */
-L(copy_long_backwards):
- ldp D_l, D_h, [srcend, -16]
- and tmp1, dstend, 15
- sub srcend, srcend, tmp1
- sub count, count, tmp1
- ldp A_l, A_h, [srcend, -16]
- stp D_l, D_h, [dstend, -16]
- ldp B_l, B_h, [srcend, -32]
- ldp C_l, C_h, [srcend, -48]
- ldp D_l, D_h, [srcend, -64]!
- sub dstend, dstend, tmp1
- subs count, count, 128
-
b.ls L(copy64_from_start)
-
-L(loop64_backwards):
- stp A_l, A_h, [dstend, -16]
- ldp A_l, A_h, [srcend, -16]
- stp B_l, B_h, [dstend, -32]
- ldp B_l, B_h, [srcend, -32]
- stp C_l, C_h, [dstend, -48]
- ldp C_l, C_h, [srcend, -48]
- stp D_l, D_h, [dstend, -64]!
- ldp D_l, D_h, [srcend, -64]!
- subs count, count, 64
- b.hi L(loop64_backwards)
-
- /* Write the last iteration and copy 64 bytes from the start. */
-L(copy64_from_start):
- ldp G_l, G_h, [src, 48]
- stp A_l, A_h, [dstend, -16]
- ldp A_l, A_h, [src, 32]
- stp B_l, B_h, [dstend, -32]
- ldp B_l, B_h, [src, 16]
- stp C_l, C_h, [dstend, -48]
- ldp C_l, C_h, [src]
- stp D_l, D_h, [dstend, -64]
- stp G_l, G_h, [dstin, 48]
- stp A_l, A_h, [dstin, 32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstin]
- ret
-SYM_FUNC_END(__pi_memcpy_generic)
-
-#ifdef CONFIG_AS_HAS_MOPS
- .arch_extension mops
SYM_FUNC_START(__pi_memcpy)
-alternative_if_not ARM64_HAS_MOPS
- b __pi_memcpy_generic
-alternative_else_nop_endif
-
- mov dst, dstin
- cpyp [dst]!, [src]!, count!
- cpym [dst]!, [src]!, count!
- cpye [dst]!, [src]!, count!
- ret
+#include "memcpy_template.S"
SYM_FUNC_END(__pi_memcpy)
-#else
-SYM_FUNC_ALIAS(__pi_memcpy, __pi_memcpy_generic)
-#endif
SYM_FUNC_ALIAS(__memcpy, __pi_memcpy)
EXPORT_SYMBOL(__memcpy)
diff --git a/arch/arm64/lib/memcpy_mc.S b/arch/arm64/lib/memcpy_mc.S
new file mode 100644
index 000000000000..90624d35af4b
--- /dev/null
+++ b/arch/arm64/lib/memcpy_mc.S
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2012-2021, Arm Limited.
+ *
+ * Adapted from the original at:
+ *
https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/asm-uaccess.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+ .macro ldrb1 reg, addr:vararg
+ KERNEL_MEM_ERR(9998f, ldrb \reg, \addr)
+ .endm
+
+ .macro ldr1 reg, addr:vararg
+ KERNEL_MEM_ERR(9998f, ldr \reg, \addr)
+ .endm
+
+ .macro ldp1 reg1, reg2, addr:vararg
+ KERNEL_MEM_ERR(9998f, ldp \reg1, \reg2, \addr)
+ .endm
+
+ .macro ret1
+ mov x0, #0
+ ret
+ .endm
+
+ .macro cpy1 dst, src, count
+ .arch_extension mops
+ USER_CPY(9998f, 0, cpyp [\dst]!, [\src]!, \count!)
+ USER_CPY(9996f, 0, cpym [\dst]!, [\src]!, \count!)
+ USER_CPY(9996f, 0, cpye [\dst]!, [\src]!, \count!)
+ .endm
+
+SYM_FUNC_START(__memcpy_mc)
+#include "memcpy_template.S"
+
+ // Exception fixups
+9996: b.cs 9998f
+ // Registers are in Option A format
+ add dst, dst, count
+9998: sub x0, dstend, dstin // bytes not copied
+ ret
+SYM_FUNC_END(__memcpy_mc)
+
+EXPORT_SYMBOL(__memcpy_mc)
+SYM_FUNC_ALIAS_WEAK(memcpy_mc, __memcpy_mc)
+EXPORT_SYMBOL(memcpy_mc)
diff --git a/arch/arm64/lib/memcpy_template.S b/arch/arm64/lib/memcpy_template.S
new file mode 100644
index 000000000000..205516c6e076
--- /dev/null
+++ b/arch/arm64/lib/memcpy_template.S
@@ -0,0 +1,249 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2012-2021, Arm Limited.
+ *
+ * Adapted from the original at:
+ *
https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define L(label) .L ## label
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_lw w10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l x14
+#define E_h x15
+#define F_l x16
+#define F_h x17
+#define G_l count
+#define G_h dst
+#define H_l src
+#define H_h srcend
+#define tmp1 x14
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small, simple and improve performance.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
+
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The destination pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+#ifdef CONFIG_AS_HAS_MOPS
+alternative_if_not ARM64_HAS_MOPS
+ b L(no_mops):
+alternative_else_nop_endif
+
+ cpy1 dst, src, count
+ ret1
+#endif
+
+L(no_mops):
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 128
+ b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+ ldp1 A_l, A_h, [src]
+ ldp1 D_l, D_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret1
+
+ /* Copy 8-15 bytes. */
+L(copy16):
+ tbz count, 3, L(copy8)
+ ldr1 A_l, [src]
+ ldr1 A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret1
+
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
+ ldr1 A_lw, [src]
+ ldr1 B_lw, [srcend, -4]
+ str A_lw, [dstin]
+ str B_lw, [dstend, -4]
+ ret1
+
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
+ lsr tmp1, count, 1
+ ldrb1 A_lw, [src]
+ ldrb1 C_lw, [srcend, -1]
+ ldrb1 B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret1
+
+ .p2align 4
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp1 A_l, A_h, [src]
+ ldp1 B_l, B_h, [src, 16]
+ ldp1 C_l, C_h, [srcend, -32]
+ ldp1 D_l, D_h, [srcend, -16]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret1
+
+ .p2align 4
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp1 E_l, E_h, [src, 32]
+ ldp1 F_l, F_h, [src, 48]
+ cmp count, 96
+
b.ls L(copy96)
+ ldp1 G_l, G_h, [srcend, -64]
+ ldp1 H_l, H_h, [srcend, -48]
+ stp G_l, G_h, [dstend, -64]
+ stp H_l, H_h, [dstend, -48]
+L(copy96):
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp E_l, E_h, [dstin, 32]
+ stp F_l, F_h, [dstin, 48]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret1
+
+ .p2align 4
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cbz tmp1, L(copy0)
+ cmp tmp1, count
+ b.lo L(copy_long_backwards)
+
+ /* Copy 16 bytes and then align dst to 16-byte alignment. */
+
+ ldp1 D_l, D_h, [src]
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp1 A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp1 B_l, B_h, [src, 32]
+ ldp1 C_l, C_h, [src, 48]
+ ldp1 D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+
b.ls L(copy64_from_end)
+
+L(loop64):
+ stp A_l, A_h, [dst, 16]
+ ldp1 A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp1 B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp1 C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp1 D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp1 E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp1 A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp1 B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp1 C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
+ ret1
+
+ .p2align 4
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align dst to 16-byte alignment. */
+L(copy_long_backwards):
+ ldp1 D_l, D_h, [srcend, -16]
+ and tmp1, dstend, 15
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp1 A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp1 B_l, B_h, [srcend, -32]
+ ldp1 C_l, C_h, [srcend, -48]
+ ldp1 D_l, D_h, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+
b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ stp A_l, A_h, [dstend, -16]
+ ldp1 A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp1 B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp1 C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp1 D_l, D_h, [srcend, -64]!
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp1 G_l, G_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp1 A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp1 B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp1 C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp G_l, G_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+ ret1
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index d286e0a04543..3128f0d9cc46 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -79,6 +79,18 @@ void *memcpy(void *dest, const void *src, size_t len)
}
#endif
+#ifdef __HAVE_ARCH_MEMCPY_MC
+#undef memcpy_mc
+int memcpy_mc(void *dest, const void *src, size_t len)
+{
+ if (!kasan_check_range(src, len, false, _RET_IP_) ||
+ !kasan_check_range(dest, len, true, _RET_IP_))
+ return (int)len;
+
+ return __memcpy_mc(dest, src, len);
+}
+#endif
+
void *__asan_memset(void *addr, int c, ssize_t len)
{
if (!kasan_check_range(addr, len, true, _RET_IP_))
--
2.39.3