Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

[PATCH] ARM: Use udiv/sdiv for __aeabi_{u}idiv library functions

19 views
Skip to first unread message

Stephen Boyd

unread,
Nov 7, 2013, 2:30:02 PM11/7/13
to
If we're running on a v7 ARM CPU, detect if the CPU supports the
sdiv/udiv instructions and replace the signed and unsigned
division library functions with an sdiv/udiv instruction.

Running the perf messaging benchmark in pipe mode

$ perf bench sched messaging -p

shows a modest improvement on my v7 CPU.

before:
(5.060 + 5.960 + 5.971 + 5.643 + 6.029 + 5.665 + 6.050 + 5.870 + 6.117 + 5.683) / 10 = 5.805

after:
(4.884 + 5.549 + 5.749 + 6.001 + 5.460 + 5.103 + 5.956 + 6.112 + 5.468 + 5.093) / 10 = 5.538

(5.805 - 5.538) / 5.805 = 4.6%

Signed-off-by: Stephen Boyd <sb...@codeaurora.org>
---

Should we add in the __div0() call if the denominator is 0?

arch/arm/kernel/setup.c | 10 +++++++++
arch/arm/lib/Makefile | 3 +++
arch/arm/lib/div-v7.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++
arch/arm/lib/lib1funcs.S | 16 +++++++++++++
4 files changed, 87 insertions(+)
create mode 100644 arch/arm/lib/div-v7.c

diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 0e1e2b3..7d519f4 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -30,6 +30,7 @@
#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/sort.h>
+#include <linux/static_key.h>

#include <asm/unified.h>
#include <asm/cp15.h>
@@ -365,6 +366,8 @@ void __init early_print(const char *str, ...)
printk("%s", buf);
}

+struct static_key cpu_has_idiv = STATIC_KEY_INIT_FALSE;
+
static void __init cpuid_init_hwcaps(void)
{
unsigned int divide_instrs, vmsa;
@@ -381,6 +384,13 @@ static void __init cpuid_init_hwcaps(void)
elf_hwcap |= HWCAP_IDIVT;
}

+#ifdef CONFIG_THUMB2_KERNEL
+ if (elf_hwcap & HWCAP_IDIVT)
+#else
+ if (elf_hwcap & HWCAP_IDIVA)
+#endif
+ static_key_slow_inc(&cpu_has_idiv);
+
/* LPAE implies atomic ldrd/strd instructions */
vmsa = (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xf) >> 0;
if (vmsa >= 5)
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index bd454b0..6ed6496 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -15,6 +15,9 @@ lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \
io-readsb.o io-writesb.o io-readsl.o io-writesl.o \
call_with_stack.o

+lib-$(CONFIG_CPU_V7) += div-v7.o
+CFLAGS_div-v7.o := -march=armv7-a
+
mmu-y := clear_user.o copy_page.o getuser.o putuser.o

# the code in uaccess.S is not preemption safe and
diff --git a/arch/arm/lib/div-v7.c b/arch/arm/lib/div-v7.c
new file mode 100644
index 0000000..96ceb92
--- /dev/null
+++ b/arch/arm/lib/div-v7.c
@@ -0,0 +1,58 @@
+/* Copyright (c) 2013, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/static_key.h>
+
+extern int ___aeabi_idiv(int, int);
+extern unsigned ___aeabi_uidiv(int, int);
+
+extern struct static_key cpu_has_idiv;
+
+int __aeabi_idiv(int numerator, int denominator)
+{
+ if (static_key_false(&cpu_has_idiv)) {
+ int ret;
+
+ asm volatile (
+ ".arch_extension idiv\n"
+ "sdiv %0, %1, %2"
+ : "=&r" (ret)
+ : "r" (numerator), "r" (denominator));
+
+ return ret;
+ }
+
+ return ___aeabi_idiv(numerator, denominator);
+}
+
+int __divsi3(int numerator, int denominator)
+ __attribute__((alias("__aeabi_idiv")));
+
+unsigned __aeabi_uidiv(int numerator, int denominator)
+{
+ if (static_key_false(&cpu_has_idiv)) {
+ int ret;
+
+ asm volatile (
+ ".arch_extension idiv\n"
+ "udiv %0, %1, %2"
+ : "=&r" (ret)
+ : "r" (numerator), "r" (denominator));
+
+ return ret;
+ }
+
+ return ___aeabi_uidiv(numerator, denominator);
+}
+
+unsigned __udivsi3(int numerator, int denominator)
+ __attribute__((alias("__aeabi_uidiv")));
diff --git a/arch/arm/lib/lib1funcs.S b/arch/arm/lib/lib1funcs.S
index c562f64..adea088 100644
--- a/arch/arm/lib/lib1funcs.S
+++ b/arch/arm/lib/lib1funcs.S
@@ -205,8 +205,12 @@ Boston, MA 02111-1307, USA. */
.endm


+#if defined(ZIMAGE) || !defined(CONFIG_CPU_V7)
ENTRY(__udivsi3)
ENTRY(__aeabi_uidiv)
+#else
+ENTRY(___aeabi_uidiv)
+#endif
UNWIND(.fnstart)

subs r2, r1, #1
@@ -232,8 +236,12 @@ UNWIND(.fnstart)
mov pc, lr

UNWIND(.fnend)
+#if defined(ZIMAGE) || !defined(CONFIG_CPU_V7)
ENDPROC(__udivsi3)
ENDPROC(__aeabi_uidiv)
+#else
+ENDPROC(___aeabi_uidiv)
+#endif

ENTRY(__umodsi3)
UNWIND(.fnstart)
@@ -253,8 +261,12 @@ UNWIND(.fnstart)
UNWIND(.fnend)
ENDPROC(__umodsi3)

+#if defined(ZIMAGE) || !defined(CONFIG_CPU_V7)
ENTRY(__divsi3)
ENTRY(__aeabi_idiv)
+#else
+ENTRY(___aeabi_idiv)
+#endif
UNWIND(.fnstart)

cmp r1, #0
@@ -293,8 +305,12 @@ UNWIND(.fnstart)
mov pc, lr

UNWIND(.fnend)
+#if defined(ZIMAGE) || !defined(CONFIG_CPU_V7)
ENDPROC(__divsi3)
ENDPROC(__aeabi_idiv)
+#else
+ENDPROC(___aeabi_idiv)
+#endif

ENTRY(__modsi3)
UNWIND(.fnstart)
--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by The Linux Foundation

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Rob Herring

unread,
Nov 7, 2013, 8:40:01 PM11/7/13
to
On Thu, Nov 7, 2013 at 1:20 PM, Stephen Boyd <sb...@codeaurora.org> wrote:
> If we're running on a v7 ARM CPU, detect if the CPU supports the
> sdiv/udiv instructions and replace the signed and unsigned
> division library functions with an sdiv/udiv instruction.

[snip]

> diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
> index bd454b0..6ed6496 100644
> --- a/arch/arm/lib/Makefile
> +++ b/arch/arm/lib/Makefile
> @@ -15,6 +15,9 @@ lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \
> io-readsb.o io-writesb.o io-readsl.o io-writesl.o \
> call_with_stack.o
>
> +lib-$(CONFIG_CPU_V7) += div-v7.o
> +CFLAGS_div-v7.o := -march=armv7-a

Won't this fail to build if the compiler doesn't have armv7-a support.
Perhaps we don't care about compilers that old.

Rob

Jean-Christophe PLAGNIOL-VILLARD

unread,
Nov 8, 2013, 5:00:02 AM11/8/13
to
if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && elf_hwcap & HWCAP_IDIVT)
> _______________________________________________
> linux-arm-kernel mailing list
> linux-ar...@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

Jean-Christophe PLAGNIOL-VILLARD

unread,
Nov 8, 2013, 7:10:02 AM11/8/13
to
On 19:34 Thu 07 Nov , Rob Herring wrote:
> On Thu, Nov 7, 2013 at 1:20 PM, Stephen Boyd <sb...@codeaurora.org> wrote:
> > If we're running on a v7 ARM CPU, detect if the CPU supports the
> > sdiv/udiv instructions and replace the signed and unsigned
> > division library functions with an sdiv/udiv instruction.
>
> [snip]
>
> > diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
> > index bd454b0..6ed6496 100644
> > --- a/arch/arm/lib/Makefile
> > +++ b/arch/arm/lib/Makefile
> > @@ -15,6 +15,9 @@ lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \
> > io-readsb.o io-writesb.o io-readsl.o io-writesl.o \
> > call_with_stack.o
> >
> > +lib-$(CONFIG_CPU_V7) += div-v7.o
> > +CFLAGS_div-v7.o := -march=armv7-a
>
> Won't this fail to build if the compiler doesn't have armv7-a support.
> Perhaps we don't care about compilers that old.

use the proper compiler to compile an armv7 kernel

Best Regards,
J.
>
> Rob
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-ar...@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

Christopher Covington

unread,
Nov 8, 2013, 11:50:01 AM11/8/13
to
Hi Stephen,

On 11/07/2013 02:20 PM, Stephen Boyd wrote:
> If we're running on a v7 ARM CPU, detect if the CPU supports the
> sdiv/udiv instructions and replace the signed and unsigned
> division library functions with an sdiv/udiv instruction.

[...]

> +++ b/arch/arm/lib/div-v7.c
> @@ -0,0 +1,58 @@
> +/* Copyright (c) 2013, The Linux Foundation. All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 and
> + * only version 2 as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + */
> +
> +#include <linux/static_key.h>
> +
> +extern int ___aeabi_idiv(int, int);
> +extern unsigned ___aeabi_uidiv(int, int);

Why are the input parameters signed?

> +extern struct static_key cpu_has_idiv;
> +
> +int __aeabi_idiv(int numerator, int denominator)
> +{
> + if (static_key_false(&cpu_has_idiv)) {
> + int ret;
> +
> + asm volatile (
> + ".arch_extension idiv\n"
> + "sdiv %0, %1, %2"
> + : "=&r" (ret)
> + : "r" (numerator), "r" (denominator));
> +
> + return ret;
> + }
> +
> + return ___aeabi_idiv(numerator, denominator);
> +}
> +
> +int __divsi3(int numerator, int denominator)
> + __attribute__((alias("__aeabi_idiv")));
> +
> +unsigned __aeabi_uidiv(int numerator, int denominator)

Unsigned inputs?

> +{
> + if (static_key_false(&cpu_has_idiv)) {
> + int ret;
> +
> + asm volatile (
> + ".arch_extension idiv\n"
> + "udiv %0, %1, %2"
> + : "=&r" (ret)
> + : "r" (numerator), "r" (denominator));
> +
> + return ret;
> + }
> +
> + return ___aeabi_uidiv(numerator, denominator);
> +}
> +
> +unsigned __udivsi3(int numerator, int denominator)
> + __attribute__((alias("__aeabi_uidiv")));

Unsigned inputs?

[...]

Thanks,
Christopher

--
Employee of Qualcomm Innovation Center, Inc.
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by the Linux Foundation.

Russell King - ARM Linux

unread,
Nov 8, 2013, 12:00:02 PM11/8/13
to
On Fri, Nov 08, 2013 at 10:58:42AM +0100, Jean-Christophe PLAGNIOL-VILLARD wrote:
> On 11:20 Thu 07 Nov , Stephen Boyd wrote:
> > @@ -381,6 +384,13 @@ static void __init cpuid_init_hwcaps(void)
> > elf_hwcap |= HWCAP_IDIVT;
> > }
> >
> > +#ifdef CONFIG_THUMB2_KERNEL
> if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && elf_hwcap & HWCAP_IDIVT)
> > + if (elf_hwcap & HWCAP_IDIVT)
> > +#else
> > + if (elf_hwcap & HWCAP_IDIVA)
> > +#endif

Take another look, and you'll see the change that you're suggesting is
wrong. Instead, the following may be a more reasonable suggestion as
a suitable replacement:

if (elf_hwcap & (IS_ENABLED(CONFIG_THUMB2_KERNEL) ?
HWCAP_IDIVT : HWCAP_IDIVA))

Russell King - ARM Linux

unread,
Nov 8, 2013, 12:00:02 PM11/8/13
to
On Fri, Nov 08, 2013 at 12:50:04PM +0100, Jean-Christophe PLAGNIOL-VILLARD wrote:
> On 19:34 Thu 07 Nov , Rob Herring wrote:
> > On Thu, Nov 7, 2013 at 1:20 PM, Stephen Boyd <sb...@codeaurora.org> wrote:
> > > If we're running on a v7 ARM CPU, detect if the CPU supports the
> > > sdiv/udiv instructions and replace the signed and unsigned
> > > division library functions with an sdiv/udiv instruction.
> >
> > [snip]
> >
> > > diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
> > > index bd454b0..6ed6496 100644
> > > --- a/arch/arm/lib/Makefile
> > > +++ b/arch/arm/lib/Makefile
> > > @@ -15,6 +15,9 @@ lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \
> > > io-readsb.o io-writesb.o io-readsl.o io-writesl.o \
> > > call_with_stack.o
> > >
> > > +lib-$(CONFIG_CPU_V7) += div-v7.o
> > > +CFLAGS_div-v7.o := -march=armv7-a
> >
> > Won't this fail to build if the compiler doesn't have armv7-a support.
> > Perhaps we don't care about compilers that old.
>
> use the proper compiler to compile an armv7 kernel

It's probably about time to get rid of the conditionals for this in
the main arch/arm/Makefile actually - some of those date back some
10 or so years. That's something for the v3.14 merge window.

Måns Rullgård

unread,
Nov 8, 2013, 12:20:02 PM11/8/13
to
Stephen Boyd <sb...@codeaurora.org> writes:

> +int __aeabi_idiv(int numerator, int denominator)
> +{
> + if (static_key_false(&cpu_has_idiv)) {
> + int ret;
> +
> + asm volatile (
> + ".arch_extension idiv\n"
> + "sdiv %0, %1, %2"
> + : "=&r" (ret)

There is no need for the & in the output constraint. Dropping it allows
using one of the source registers as destination which may sometimes be
beneficial.

> + : "r" (numerator), "r" (denominator));
> +
> + return ret;
> + }
> +
> + return ___aeabi_idiv(numerator, denominator);
> +}

--
Måns Rullgård
ma...@mansr.com

Stephen Boyd

unread,
Nov 8, 2013, 2:00:01 PM11/8/13
to
On 11/08/13 08:52, Russell King - ARM Linux wrote:
> On Fri, Nov 08, 2013 at 10:58:42AM +0100, Jean-Christophe PLAGNIOL-VILLARD wrote:
>> On 11:20 Thu 07 Nov , Stephen Boyd wrote:
>>> @@ -381,6 +384,13 @@ static void __init cpuid_init_hwcaps(void)
>>> elf_hwcap |= HWCAP_IDIVT;
>>> }
>>>
>>> +#ifdef CONFIG_THUMB2_KERNEL
>> if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && elf_hwcap & HWCAP_IDIVT)
>>> + if (elf_hwcap & HWCAP_IDIVT)
>>> +#else
>>> + if (elf_hwcap & HWCAP_IDIVA)
>>> +#endif
> Take another look, and you'll see the change that you're suggesting is
> wrong. Instead, the following may be a more reasonable suggestion as
> a suitable replacement:
>
> if (elf_hwcap & (IS_ENABLED(CONFIG_THUMB2_KERNEL) ?
> HWCAP_IDIVT : HWCAP_IDIVA))

I can use IS_ENABLED() but I'd prefer a local variable to make it
simpler in the conditional.

--
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation

Stephen Boyd

unread,
Nov 8, 2013, 2:00:01 PM11/8/13
to
On 11/08/13 08:54, Russell King - ARM Linux wrote:
> On Fri, Nov 08, 2013 at 12:50:04PM +0100, Jean-Christophe PLAGNIOL-VILLARD wrote:
>> On 19:34 Thu 07 Nov , Rob Herring wrote:
>>> On Thu, Nov 7, 2013 at 1:20 PM, Stephen Boyd <sb...@codeaurora.org> wrote:
>>>> If we're running on a v7 ARM CPU, detect if the CPU supports the
>>>> sdiv/udiv instructions and replace the signed and unsigned
>>>> division library functions with an sdiv/udiv instruction.
>>> [snip]
>>>
>>>> diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
>>>> index bd454b0..6ed6496 100644
>>>> --- a/arch/arm/lib/Makefile
>>>> +++ b/arch/arm/lib/Makefile
>>>> @@ -15,6 +15,9 @@ lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \
>>>> io-readsb.o io-writesb.o io-readsl.o io-writesl.o \
>>>> call_with_stack.o
>>>>
>>>> +lib-$(CONFIG_CPU_V7) += div-v7.o
>>>> +CFLAGS_div-v7.o := -march=armv7-a
>>> Won't this fail to build if the compiler doesn't have armv7-a support.
>>> Perhaps we don't care about compilers that old.
>> use the proper compiler to compile an armv7 kernel
> It's probably about time to get rid of the conditionals for this in
> the main arch/arm/Makefile actually - some of those date back some
> 10 or so years. That's something for the v3.14 merge window.

I'll take that as an endorsement for not falling back to -march=armv5t
-Wa,-march=armv7-a like is done in arch/arm/Makefile.

--
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation

Stephen Boyd

unread,
Nov 8, 2013, 2:00:02 PM11/8/13
to
Copy pasta. Fixed thanks.

--
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation

Stephen Boyd

unread,
Nov 8, 2013, 2:10:01 PM11/8/13
to
On 11/08/13 09:02, Måns Rullgård wrote:
> Stephen Boyd <sb...@codeaurora.org> writes:
>
>> +int __aeabi_idiv(int numerator, int denominator)
>> +{
>> + if (static_key_false(&cpu_has_idiv)) {
>> + int ret;
>> +
>> + asm volatile (
>> + ".arch_extension idiv\n"
>> + "sdiv %0, %1, %2"
>> + : "=&r" (ret)
> There is no need for the & in the output constraint. Dropping it allows
> using one of the source registers as destination which may sometimes be
> beneficial.

Ok. Thanks. That does seem to improve things.

before:

00000000 <__aeabi_idiv>:
0: e320f000 nop {0}
4: eafffffe b 0 <___aeabi_idiv>
8: e713f110 sdiv r3, r0, r1
c: e1a00003 mov r0, r3
10: e12fff1e bx lr

00000014 <__aeabi_uidiv>:
14: e320f000 nop {0}
18: eafffffe b 0 <___aeabi_uidiv>
1c: e733f110 udiv r3, r0, r1
20: e1a00003 mov r0, r3
24: e12fff1e bx lr

after:

00000000 <__aeabi_idiv>:
0: e320f000 nop {0}
4: eafffffe b 0 <___aeabi_idiv>
8: e710f110 sdiv r0, r0, r1
c: e12fff1e bx lr

00000010 <__aeabi_uidiv>:
10: e320f000 nop {0}
14: eafffffe b 0 <___aeabi_uidiv>
18: e730f110 udiv r0, r0, r1
1c: e12fff1e bx lr


--
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation

Stephen Boyd

unread,
Nov 8, 2013, 6:10:02 PM11/8/13
to
If we're running on a v7 ARM CPU, detect if the CPU supports the
sdiv/udiv instructions and replace the signed and unsigned
division library functions with an sdiv/udiv instruction.

Running the perf messaging benchmark in pipe mode

$ perf bench sched messaging -p

shows a modest improvement on my v7 CPU.

before:
(5.060 + 5.960 + 5.971 + 5.643 + 6.029 + 5.665 + 6.050 + 5.870 + 6.117 + 5.683) / 10 = 5.805

after:
(4.884 + 5.549 + 5.749 + 6.001 + 5.460 + 5.103 + 5.956 + 6.112 + 5.468 + 5.093) / 10 = 5.538

(5.805 - 5.538) / 5.805 = 4.6%

Signed-off-by: Stephen Boyd <sb...@codeaurora.org>
---

Changes since v1:
* Replace signed with unsigned in unsigned divide function
* drop & in inline assembly
* Use IS_ENABLED() instead of #ifdef
* Pass DIV_V7 into lib1funcs.S instead of depending on ZIMAGE or CPU_V7

arch/arm/kernel/setup.c | 13 ++++++++++-
arch/arm/lib/Makefile | 6 +++++
arch/arm/lib/div-v7.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++
arch/arm/lib/lib1funcs.S | 16 +++++++++++++
4 files changed, 92 insertions(+), 1 deletion(-)
create mode 100644 arch/arm/lib/div-v7.c

diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 0e1e2b3..f9e577a 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -30,6 +30,7 @@
#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/sort.h>
+#include <linux/static_key.h>

#include <asm/unified.h>
#include <asm/cp15.h>
@@ -365,9 +366,11 @@ void __init early_print(const char *str, ...)
printk("%s", buf);
}

+struct static_key cpu_has_idiv = STATIC_KEY_INIT_FALSE;
+
static void __init cpuid_init_hwcaps(void)
{
- unsigned int divide_instrs, vmsa;
+ unsigned int divide_instrs, vmsa, idiv_mask;

if (cpu_architecture() < CPU_ARCH_ARMv7)
return;
@@ -381,6 +384,14 @@ static void __init cpuid_init_hwcaps(void)
elf_hwcap |= HWCAP_IDIVT;
}

+ if (IS_ENABLED(CONFIG_THUMB2_KERNEL))
+ idiv_mask = HWCAP_IDIVT;
+ else
+ idiv_mask = HWCAP_IDIVA;
+
+ if (elf_hwcap & idiv_mask)
+ static_key_slow_inc(&cpu_has_idiv);
+
/* LPAE implies atomic ldrd/strd instructions */
vmsa = (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xf) >> 0;
if (vmsa >= 5)
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index bd454b0..38621729 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -15,6 +15,12 @@ lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \
io-readsb.o io-writesb.o io-readsl.o io-writesl.o \
call_with_stack.o

+lib-$(CONFIG_CPU_V7) += div-v7.o
+CFLAGS_div-v7.o := -march=armv7-a
+ifeq ($(CONFIG_CPU_V7),y)
+ AFLAGS_lib1funcs.o := -DDIV_V7
+endif
+
mmu-y := clear_user.o copy_page.o getuser.o putuser.o

# the code in uaccess.S is not preemption safe and
diff --git a/arch/arm/lib/div-v7.c b/arch/arm/lib/div-v7.c
new file mode 100644
index 0000000..e20945a
--- /dev/null
+++ b/arch/arm/lib/div-v7.c
@@ -0,0 +1,58 @@
+/* Copyright (c) 2013, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/static_key.h>
+
+extern int ___aeabi_idiv(int, int);
+extern unsigned ___aeabi_uidiv(int, int);
+
+extern struct static_key cpu_has_idiv;
+
+int __aeabi_idiv(int numerator, int denominator)
+{
+ if (static_key_false(&cpu_has_idiv)) {
+ int ret;
+
+ asm volatile (
+ ".arch_extension idiv\n"
+ "sdiv %0, %1, %2"
+ : "=r" (ret)
+ : "r" (numerator), "r" (denominator));
+
+ return ret;
+ }
+
+ return ___aeabi_idiv(numerator, denominator);
+}
+
+int __divsi3(int numerator, int denominator)
+ __attribute__((alias("__aeabi_idiv")));
+
+unsigned __aeabi_uidiv(unsigned numerator, unsigned denominator)
+{
+ if (static_key_false(&cpu_has_idiv)) {
+ unsigned ret;
+
+ asm volatile (
+ ".arch_extension idiv\n"
+ "udiv %0, %1, %2"
+ : "=r" (ret)
+ : "r" (numerator), "r" (denominator));
+
+ return ret;
+ }
+
+ return ___aeabi_uidiv(numerator, denominator);
+}
+
+unsigned __udivsi3(unsigned numerator, unsigned denominator)
+ __attribute__((alias("__aeabi_uidiv")));
diff --git a/arch/arm/lib/lib1funcs.S b/arch/arm/lib/lib1funcs.S
index c562f64..82bbcc7 100644
--- a/arch/arm/lib/lib1funcs.S
+++ b/arch/arm/lib/lib1funcs.S
@@ -205,8 +205,12 @@ Boston, MA 02111-1307, USA. */
.endm


+#ifdef DIV_V7
+ENTRY(___aeabi_uidiv)
+#else
ENTRY(__udivsi3)
ENTRY(__aeabi_uidiv)
+#endif
UNWIND(.fnstart)

subs r2, r1, #1
@@ -232,8 +236,12 @@ UNWIND(.fnstart)
mov pc, lr

UNWIND(.fnend)
+#ifdef DIV_V7
+ENDPROC(___aeabi_uidiv)
+#else
ENDPROC(__udivsi3)
ENDPROC(__aeabi_uidiv)
+#endif

ENTRY(__umodsi3)
UNWIND(.fnstart)
@@ -253,8 +261,12 @@ UNWIND(.fnstart)
UNWIND(.fnend)
ENDPROC(__umodsi3)

+#ifdef DIV_V7
+ENTRY(___aeabi_idiv)
+#else
ENTRY(__divsi3)
ENTRY(__aeabi_idiv)
+#endif
UNWIND(.fnstart)

cmp r1, #0
@@ -293,8 +305,12 @@ UNWIND(.fnstart)
mov pc, lr

UNWIND(.fnend)
+#ifdef DIV_V7
+ENDPROC(___aeabi_idiv)
+#else
ENDPROC(__divsi3)
ENDPROC(__aeabi_idiv)
+#endif

ENTRY(__modsi3)
UNWIND(.fnstart)
--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,

Matt Sealey

unread,
Nov 9, 2013, 1:50:02 AM11/9/13
to
On Fri, Nov 8, 2013 at 5:00 PM, Stephen Boyd <sb...@codeaurora.org> wrote:
> If we're running on a v7 ARM CPU, detect if the CPU supports the
> sdiv/udiv instructions and replace the signed and unsigned
> division library functions with an sdiv/udiv instruction.
>
> Running the perf messaging benchmark in pipe mode
>
> $ perf bench sched messaging -p
>
> shows a modest improvement on my v7 CPU.
>
> before:
> (5.060 + 5.960 + 5.971 + 5.643 + 6.029 + 5.665 + 6.050 + 5.870 + 6.117 + 5.683) / 10 = 5.805
>
> after:
> (4.884 + 5.549 + 5.749 + 6.001 + 5.460 + 5.103 + 5.956 + 6.112 + 5.468 + 5.093) / 10 = 5.538
>
> (5.805 - 5.538) / 5.805 = 4.6%

Even with the change to the output constraint suggested by Mans, you
get absolutely identical benchmark results? There's a lot of variance
in any case..

BTW has there been any evaluation of the penalty for the extra
branching, or the performance hit for the ARMv7-without-division
cases?

Ta,
Matt Sealey <ne...@bakuhatsu.net>

Måns Rullgård

unread,
Nov 9, 2013, 1:30:03 PM11/9/13
to
Matt Sealey <ne...@bakuhatsu.net> writes:

> BTW has there been any evaluation of the penalty for the extra
> branching, or the performance hit for the ARMv7-without-division
> cases?

The branches themselves probably have minimal overhead. There will
however be code to preserve call-clobbered registers (and move the
values to/from r0/r1) that would not be needed if the div instructions
were done inline (obviously such a kernel could only run on hardware
with division support).

--
Måns Rullgård
ma...@mansr.com

Nicolas Pitre

unread,
Nov 10, 2013, 12:10:02 AM11/10/13
to
On Fri, 8 Nov 2013, Stephen Boyd wrote:

> If we're running on a v7 ARM CPU, detect if the CPU supports the
> sdiv/udiv instructions and replace the signed and unsigned
> division library functions with an sdiv/udiv instruction.
>
> Running the perf messaging benchmark in pipe mode
>
> $ perf bench sched messaging -p
>
> shows a modest improvement on my v7 CPU.
>
> before:
> (5.060 + 5.960 + 5.971 + 5.643 + 6.029 + 5.665 + 6.050 + 5.870 + 6.117 + 5.683) / 10 = 5.805
>
> after:
> (4.884 + 5.549 + 5.749 + 6.001 + 5.460 + 5.103 + 5.956 + 6.112 + 5.468 + 5.093) / 10 = 5.538
>
> (5.805 - 5.538) / 5.805 = 4.6%
>
> Signed-off-by: Stephen Boyd <sb...@codeaurora.org>

Bah..... NAK.

We are doing runtime patching of the kernel for many many things
already. So why not do the same here?

The obvious strategy is to simply overwrite the start of the existing
__aeabi_idiv code with the "sdiv r0, r0, r1" and "bx lr" opcodes.

Similarly for the unsigned case.

That let you test the hardware capability only once during boot instead
of every time a divide operation is performed.


Nicolas

Uwe Kleine-König

unread,
Nov 11, 2013, 2:50:02 AM11/11/13
to
Hello,

On Fri, Nov 08, 2013 at 03:00:32PM -0800, Stephen Boyd wrote:
> diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
> index bd454b0..38621729 100644
> --- a/arch/arm/lib/Makefile
> +++ b/arch/arm/lib/Makefile
> @@ -15,6 +15,12 @@ lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \
> io-readsb.o io-writesb.o io-readsl.o io-writesl.o \
> call_with_stack.o
>
> +lib-$(CONFIG_CPU_V7) += div-v7.o
CPU_V7M could make use of that, too.
(If you follow Nico's advice to use runtime patching I cannot test it
for you on v7-M though as my machine has to use an XIP kernel.)

Best regards
Uwe

--
Pengutronix e.K. | Uwe Kleine-König |
Industrial Linux Solutions | http://www.pengutronix.de/ |

Stephen Boyd

unread,
Nov 11, 2013, 8:30:01 PM11/11/13
to
On 11/08/13 22:46, Matt Sealey wrote:
> On Fri, Nov 8, 2013 at 5:00 PM, Stephen Boyd <sb...@codeaurora.org> wrote:
>> If we're running on a v7 ARM CPU, detect if the CPU supports the
>> sdiv/udiv instructions and replace the signed and unsigned
>> division library functions with an sdiv/udiv instruction.
>>
>> Running the perf messaging benchmark in pipe mode
>>
>> $ perf bench sched messaging -p
>>
>> shows a modest improvement on my v7 CPU.
>>
>> before:
>> (5.060 + 5.960 + 5.971 + 5.643 + 6.029 + 5.665 + 6.050 + 5.870 + 6.117 + 5.683) / 10 = 5.805
>>
>> after:
>> (4.884 + 5.549 + 5.749 + 6.001 + 5.460 + 5.103 + 5.956 + 6.112 + 5.468 + 5.093) / 10 = 5.538
>>
>> (5.805 - 5.538) / 5.805 = 4.6%
> Even with the change to the output constraint suggested by Mans, you
> get absolutely identical benchmark results? There's a lot of variance
> in any case..

Yeah sorry I didn't run the testcase again to see if numbers changed
because I assumed one less instruction would be in the noise. I agree
there is a lot of variance so if you have any better
benchmarks/testcases please let me know.

>
> BTW has there been any evaluation of the penalty for the extra
> branching, or the performance hit for the ARMv7-without-division
> cases?

I haven't done any. I'll factor that in for the next round.

--
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation

Stephen Boyd

unread,
Nov 11, 2013, 9:40:01 PM11/11/13
to
On 11/10/13 23:46, Uwe Kleine-König wrote:
> Hello,
>
> On Fri, Nov 08, 2013 at 03:00:32PM -0800, Stephen Boyd wrote:
>> diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
>> index bd454b0..38621729 100644
>> --- a/arch/arm/lib/Makefile
>> +++ b/arch/arm/lib/Makefile
>> @@ -15,6 +15,12 @@ lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \
>> io-readsb.o io-writesb.o io-readsl.o io-writesl.o \
>> call_with_stack.o
>>
>> +lib-$(CONFIG_CPU_V7) += div-v7.o
> CPU_V7M could make use of that, too.
> (If you follow Nico's advice to use runtime patching I cannot test it
> for you on v7-M though as my machine has to use an XIP kernel.)
>

It already is runtime patching so I suspect you won't be able to test it
anyway. I suppose we need another config like MIGHT_HAVE_IDIV or
something that both v7 and v7M select?

--
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation

Stephen Boyd

unread,
Nov 11, 2013, 9:40:02 PM11/11/13
to
On 11/09/13 21:03, Nicolas Pitre wrote:
> Bah..... NAK. We are doing runtime patching of the kernel for many
> many things already. So why not do the same here?

static keys are a form of runtime patching, albeit not as extreme as
you're suggesting.

>
> The obvious strategy is to simply overwrite the start of the existing
> __aeabi_idiv code with the "sdiv r0, r0, r1" and "bx lr" opcodes.
>
> Similarly for the unsigned case.

I was thinking the same thing when I wrote this, but I didn't know how
to tell the compiler to either inline this function or to let me inline
an assembly stub with some section magic.

>
> That let you test the hardware capability only once during boot instead
> of every time a divide operation is performed.

The test for hardware capability really isn't done more than once during
boot. The assembly is like so at compile time

00000000 <__aeabi_idiv>:
0: nop {0}
4: b 0 <___aeabi_idiv>
8: sdiv r0, r0, r1
c: bx lr

and after we test and find support for the instruction it will be
replaced with

00000000 <__aeabi_idiv>:
0: b 8
4: b 0 <___aeabi_idiv>
8: sdiv r0, r0, r1
c: bx lr

Unfortunately we still have to jump to this function. It would be great
if we could inline this function at the call site but as I already said
I don't know how to do that.

--
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation

Måns Rullgård

unread,
Nov 12, 2013, 6:30:01 AM11/12/13
to
Ideally the bl instruction at the call site would be patched over with
sdiv/udiv when supported. This would leave things exactly as they are
for hardware without div capability and incur only the call setup cost
(but no actual call) on div-capable hardware. No, I don't know how to
achieve this.

--
Måns Rullgård
ma...@mansr.com

Nicolas Pitre

unread,
Nov 12, 2013, 9:10:01 AM11/12/13
to
What about this patch which I think is currently your best option. Note
it would need to use the facilities from asm/opcodes.h to make it endian
agnostic.

diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 6a1b8a81b1..379cffe4ab 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -383,6 +383,34 @@ static void __init cpuid_init_hwcaps(void)
elf_hwcap |= HWCAP_IDIVT;
}

+ /*
+ * Patch our division routines with the corresponding opcode
+ * if the hardware supports it.
+ */
+ if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVT)) {
+ extern char __aeabi_uidiv, __aeabi_idiv;
+ u16 *uidiv = (u16 *)&__aeabi_uidiv;
+ u16 *idiv = (u16 *)&__aeabi_idiv;
+
+ uidiv[0] = 0xfbb0; /* udiv r0, r0, r1 */
+ uidiv[1] = 0xf0f1;
+ uidiv[2] = 0x4770; /* bx lr */
+
+ idiv[0] = 0xfb90; /* sdiv r0, r0, r1 */
+ idiv[1] = 0xf0f1;
+ idiv[2] = 0x4770; /* bx lr */
+ } else if (!IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVA)) {
+ extern char __aeabi_uidiv, __aeabi_idiv;
+ u32 *uidiv = (u32 *)&__aeabi_uidiv;
+ u32 *idiv = (u32 *)&__aeabi_idiv;
+
+ uidiv[0] = 0xe730f110; /* udiv r0, r0, r1 */
+ uidiv[1] = 0xe12fff1e; /* bx lr */
+
+ idiv[0] = 0xe710f110; /* sdiv r0, r0, r1 */
+ idiv[1] = 0xe12fff1e; /* bx lr */
+ }
+
/* LPAE implies atomic ldrd/strd instructions */
vmsa = (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xf) >> 0;
if (vmsa >= 5)


Nicolas

Russell King - ARM Linux

unread,
Nov 12, 2013, 9:10:01 AM11/12/13
to
What about endianness, and what if XIP is enabled?

Nicolas Pitre

unread,
Nov 12, 2013, 9:20:02 AM11/12/13
to
Just as I said above the diff: this needs to be refined.

Obviously XIP can't use this and doesn't need it either, as an XIP kernel
should be optimized for the very platform it will run on, i.e. gcc
should already emit those div opcodes inline if appropriate.


Nicolas

Ben Dooks

unread,
Nov 12, 2013, 9:20:03 AM11/12/13
to
I was also going to add a note about endian-ness.

Given these are single instructions for ARM, is it possible we could
make a table of all the callers and fix them up when we initialise
as we do for the SMP/UP case and for page-offset?


--
Ben Dooks http://www.codethink.co.uk/
Senior Engineer Codethink - Providing Genius

Måns Rullgård

unread,
Nov 12, 2013, 9:30:02 AM11/12/13
to
Nicolas Pitre <nicola...@linaro.org> writes:

> What about this patch which I think is currently your best option. Note
> it would need to use the facilities from asm/opcodes.h to make it endian
> agnostic.
>
> diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
> index 6a1b8a81b1..379cffe4ab 100644
> --- a/arch/arm/kernel/setup.c
> +++ b/arch/arm/kernel/setup.c
> @@ -383,6 +383,34 @@ static void __init cpuid_init_hwcaps(void)
> elf_hwcap |= HWCAP_IDIVT;
> }
>
> + /*
> + * Patch our division routines with the corresponding opcode
> + * if the hardware supports it.
> + */
> + if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVT)) {
> + extern char __aeabi_uidiv, __aeabi_idiv;

It would be safer to declare these as arrays of unspecified size.
Otherwise the compiler might do evil things with what to it looks like
out of bounds indexing.

There should also be some cache maintenance after this patching, or is
that already happening for some other reason?

--
Måns Rullgård
ma...@mansr.com

Nicolas Pitre

unread,
Nov 12, 2013, 9:40:03 AM11/12/13
to
On Tue, 12 Nov 2013, Ben Dooks wrote:

> Given these are single instructions for ARM, is it possible we could
> make a table of all the callers and fix them up when we initialise
> as we do for the SMP/UP case and for page-offset?

Not really. Calls to those functions are generated by the compiler
implicitly when a divisor operand is used and therefore we cannot
annotate those calls. We'd have to use special accessors everywhere to
replace the standard division operand (like we do for 64 by 32 bit
divisions) but I doubt that people would accept that.

You cannot just scan the binary for the appropriate branch opcode either
as you may turn up false positives in literal pools.


Nicolas

Nicolas Pitre

unread,
Nov 12, 2013, 9:40:03 AM11/12/13
to
On Tue, 12 Nov 2013, Måns Rullgård wrote:

> Nicolas Pitre <nicola...@linaro.org> writes:
>
> > What about this patch which I think is currently your best option. Note
> > it would need to use the facilities from asm/opcodes.h to make it endian
> > agnostic.
> >
> > diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
> > index 6a1b8a81b1..379cffe4ab 100644
> > --- a/arch/arm/kernel/setup.c
> > +++ b/arch/arm/kernel/setup.c
> > @@ -383,6 +383,34 @@ static void __init cpuid_init_hwcaps(void)
> > elf_hwcap |= HWCAP_IDIVT;
> > }
> >
> > + /*
> > + * Patch our division routines with the corresponding opcode
> > + * if the hardware supports it.
> > + */
> > + if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVT)) {
> > + extern char __aeabi_uidiv, __aeabi_idiv;
>
> It would be safer to declare these as arrays of unspecified size.
> Otherwise the compiler might do evil things with what to it looks like
> out of bounds indexing.

Right.

>
> There should also be some cache maintenance after this patching, or is
> that already happening for some other reason?

This is so early during boot that the MMU isn't even fully initialized
yet. The cache will be flushed.


Nicolas

Måns Rullgård

unread,
Nov 12, 2013, 9:50:02 AM11/12/13
to
Nicolas Pitre <nicola...@linaro.org> writes:

> On Tue, 12 Nov 2013, Ben Dooks wrote:
>
>> Given these are single instructions for ARM, is it possible we could
>> make a table of all the callers and fix them up when we initialise
>> as we do for the SMP/UP case and for page-offset?
>
> Not really. Calls to those functions are generated by the compiler
> implicitly when a divisor operand is used and therefore we cannot
> annotate those calls. We'd have to use special accessors everywhere to
> replace the standard division operand (like we do for 64 by 32 bit
> divisions) but I doubt that people would accept that.

It might be possible to extract this information from relocation tables.

--
Måns Rullgård
ma...@mansr.com

Nicolas Pitre

unread,
Nov 12, 2013, 10:00:03 AM11/12/13
to
On Tue, 12 Nov 2013, Måns Rullgård wrote:

> Nicolas Pitre <nicola...@linaro.org> writes:
>
> > On Tue, 12 Nov 2013, Ben Dooks wrote:
> >
> >> Given these are single instructions for ARM, is it possible we could
> >> make a table of all the callers and fix them up when we initialise
> >> as we do for the SMP/UP case and for page-offset?
> >
> > Not really. Calls to those functions are generated by the compiler
> > implicitly when a divisor operand is used and therefore we cannot
> > annotate those calls. We'd have to use special accessors everywhere to
> > replace the standard division operand (like we do for 64 by 32 bit
> > divisions) but I doubt that people would accept that.
>
> It might be possible to extract this information from relocation tables.

True, but only for individual .o files. Once the linker puts them
together the information is lost, and trying to infer what the linker
has done is insane.

Filtering the compiler output to annotate idiv calls before it is
assembled would probably be a better solution.

Is it worth it? I'm not sure.


Nicolas

Nicolas Pitre

unread,
Nov 12, 2013, 10:30:01 AM11/12/13
to
Another solution is to patch the call site from within __aeabi_idiv at
run time using lr. That wouldn't work in the presence of tail call
optimization though.

Again this might not be worth it and patching __aeabi_idiv instead might
be a good enough compromise.


Nicolas

Måns Rullgård

unread,
Nov 12, 2013, 1:10:02 PM11/12/13
to
OK, here's an extremely ugly hootenanny of a patch. It seems to work on
an A7 Cubieboard2. I would never suggest actually doing this, but maybe
it can be useful for comparing performance against the more palatable
solutions.

diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 7397db6..cf1cd30 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -113,7 +113,7 @@ endif
endif

# Need -Uarm for gcc < 3.x
-KBUILD_CFLAGS +=$(CFLAGS_ABI) $(CFLAGS_THUMB2) $(arch-y) $(tune-y) $(call cc-option,-mshort-load-bytes,$(call cc-option,-malignment-traps,)) -msoft-float -Uarm
+KBUILD_CFLAGS +=$(CFLAGS_ABI) $(CFLAGS_THUMB2) $(arch-y) $(tune-y) $(call cc-option,-mshort-load-bytes,$(call cc-option,-malignment-traps,)) -msoft-float -Uarm -include asm/divhack.h
KBUILD_AFLAGS +=$(CFLAGS_ABI) $(AFLAGS_THUMB2) $(arch-y) $(tune-y) -include asm/unified.h -msoft-float

CHECKFLAGS += -D__arm__
diff --git a/arch/arm/include/asm/divhack.h b/arch/arm/include/asm/divhack.h
new file mode 100644
index 0000000..c750b78
--- /dev/null
+++ b/arch/arm/include/asm/divhack.h
@@ -0,0 +1,23 @@
+__asm__ (".macro dobl tgt \n"
+ " .ifc \\tgt, __aeabi_idiv \n"
+ " .L.sdiv.\\@: \n"
+ " .pushsection .sdiv_tab.init, \"a\", %progbits \n"
+ " .word .L.sdiv.\\@ \n"
+ " .popsection \n"
+ " .endif \n"
+ " .ifc \\tgt, __aeabi_uidiv \n"
+ " .L.udiv.\\@: \n"
+ " .pushsection .udiv_tab.init, \"a\", %progbits \n"
+ " .word .L.udiv.\\@ \n"
+ " .popsection \n"
+ " .endif \n"
+ " bl \\tgt \n"
+ ".endm \n"
+ ".macro defbl \n"
+ " .macro bl tgt \n"
+ " .purgem bl \n"
+ " dobl \\tgt \n"
+ " defbl \n"
+ " .endm \n"
+ ".endm \n"
+ "defbl \n");
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 067815c1..b3a3fe1 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -375,6 +375,18 @@ static void __init cpuid_init_hwcaps(void)
case 1:
elf_hwcap |= HWCAP_IDIVT;
}
+
+ if (!IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVA)) {
+ extern u32 __sdiv_tab_start, __sdiv_tab_end;
+ extern u32 __udiv_tab_start, __udiv_tab_end;
+ u32 *div;
+
+ for (div = &__sdiv_tab_start; div < &__sdiv_tab_end; div++)
+ *(u32 *)*div = 0xe710f110;
+
+ for (div = &__udiv_tab_start; div < &__udiv_tab_end; div++)
+ *(u32 *)*div = 0xe730f110;
+ }
}

static void __init feat_v6_fixup(void)
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 43a31fb..3d5c103 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -176,6 +176,8 @@ SECTIONS
CON_INITCALL
SECURITY_INITCALL
INIT_RAM_FS
+ __sdiv_tab_start = .; *(.sdiv_tab.init); __sdiv_tab_end = .;
+ __udiv_tab_start = .; *(.udiv_tab.init); __udiv_tab_end = .;
}
#ifndef CONFIG_XIP_KERNEL
.exit.data : {
0 new messages