This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Fwd: [PATCH, ARM] Improve 64 bit division performance
- From: Charles Baylis <charles dot baylis at linaro dot org>
- To: GCC Patches <gcc-patches at gcc dot gnu dot org>, Ramana Radhakrishnan <Ramana dot Radhakrishnan at arm dot com>, Richard Earnshaw <rearnsha at arm dot com>
- Date: Thu, 27 Feb 2014 16:38:40 +0000
- Subject: Fwd: [PATCH, ARM] Improve 64 bit division performance
- Authentication-results: sourceware.org; auth=none
- References: <CADnVucCt+c7adXBASfRkKFvYr+fdixQccegHAGsGGpu0uHmbSg at mail dot gmail dot com>
[resending as text/plain]
Hi
These patches optimise 64 bit division by removing the use of the
__gnu_[u]ldivmod_helper functions and hence avoiding the redundant
calculation of the remainder in those functions.
Bootstrapped, tested and checked for arm-unknown-linux-gnueabihf.
Benchmarked on Chromebook and Raspberry Pi using attached divbench3.c.
Loop1 varies the divisor and loop2 varies the dividend.
Chromebook:
before:
loop1 unsigned: 3.474419
loop2 unsigned: 6.564871
loop1 signed: 4.127967
loop2 signed: 6.071490
after:
loop1 unsigned: 2.781364
loop2 unsigned: 6.166478
loop1 signed: 2.800974
loop2 signed: 6.129588
Raspberry Pi:
before:
loop1 unsigned: 28.881753
loop2 unsigned: 19.876385
loop1 signed: 32.074941
loop2 signed: 20.594860
after:
loop1 unsigned: 24.893846
loop2 unsigned: 19.537562
loop1 signed: 25.334509
loop2 signed: 19.615088
Any comments? OK for stage 1?
Patch 1:
2014-02-27 Charles Baylis <charles.baylis@linaro.org>
* config/arm/bpabi.S (__aeabi_uldivmod): Perform division using call
to __udivmoddi4.
Patch 2:
2014-02-27 Charles Baylis <charles.baylis@linaro.org>
* config/arm/bpabi.S (__aeabi_ldivmod): Perform signed division via
call to __udivmoddi4 and fixing up for negative operands.
From 35254b813303e7fb40eb8aa0bb749216fd8f96fc Mon Sep 17 00:00:00 2001
From: Charles Baylis <charles.baylis@linaro.org>
Date: Tue, 25 Feb 2014 18:34:38 +0000
Subject: [PATCH 1/2] Optimise __aeabi_uldivmod
2014-02-25 Charles Baylis <charles.baylis@linaro.org>
* config/arm/bpabi.S (__aeabi_uldivmod): Perform division using call
to __udivmoddi4.
* config/arm/bpabi.S (__aeabi_uldivmod): Optimise stack pointer
manipulation.
---
libgcc/config/arm/bpabi.S | 25 ++++++++++++++++++++-----
1 file changed, 20 insertions(+), 5 deletions(-)
diff --git a/libgcc/config/arm/bpabi.S b/libgcc/config/arm/bpabi.S
index 7772301..e020af5 100644
--- a/libgcc/config/arm/bpabi.S
+++ b/libgcc/config/arm/bpabi.S
@@ -120,6 +120,16 @@ ARM_FUNC_START aeabi_ulcmp
#endif
.endm
+/* we can use STRD/LDRD on v5TE and later, and any Thumb-2 architecture. */
+#if (defined(__ARM_EABI__) \
+ && (defined(__thumb2__) \
+ || (__ARM_ARCH >= 5 && defined(__TARGET_FEATURE_DSP))))
+#define CAN_USE_LDRD 1
+#else
+#define CAN_USE_LDRD 0
+#endif
+
+
#ifdef L_aeabi_ldivmod
ARM_FUNC_START aeabi_ldivmod
@@ -149,18 +159,23 @@ ARM_FUNC_START aeabi_uldivmod
cfi_start __aeabi_uldivmod, LSYM(Lend_aeabi_uldivmod)
test_div_by_zero unsigned
- sub sp, sp, #8
-#if defined(__thumb2__)
- mov ip, sp
- push {ip, lr}
+#if defined(__thumb2__) && CAN_USE_LDRD
+ sub ip, sp, #8
+ strd ip,lr, [sp, #-16]!
#else
+ sub sp, sp, #8
do_push {sp, lr}
#endif
98: cfi_push 98b - __aeabi_uldivmod, 0xe, -0xc, 0x10
- bl SYM(__gnu_uldivmod_helper) __PLT__
+ bl SYM(__udivmoddi4) __PLT__
ldr lr, [sp, #4]
+#if CAN_USE_LDRD
+ ldrd r2, r3, [sp, #8]
+ add sp, sp, #16
+#else
add sp, sp, #8
do_pop {r2, r3}
+#endif
RET
cfi_end LSYM(Lend_aeabi_uldivmod)
--
1.8.3.2
From 975d9c624e77ee00476e6866250b0e2e31461fca Mon Sep 17 00:00:00 2001
From: Charles Baylis <charles.baylis@linaro.org>
Date: Tue, 25 Feb 2014 16:27:59 +0000
Subject: [PATCH 2/2] Optimise __aeabi_ldivmod
2014-02-25 Charles Baylis <charles.baylis@linaro.org>
* config/arm/bpabi.S (__aeabi_ldivmod): Perform signed division using
unsigned division via call to __udivmoddi4 and additional logic.
---
libgcc/config/arm/bpabi.S | 74 +++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 69 insertions(+), 5 deletions(-)
diff --git a/libgcc/config/arm/bpabi.S b/libgcc/config/arm/bpabi.S
index e020af5..8b75a28 100644
--- a/libgcc/config/arm/bpabi.S
+++ b/libgcc/config/arm/bpabi.S
@@ -136,20 +136,84 @@ ARM_FUNC_START aeabi_ldivmod
cfi_start __aeabi_ldivmod, LSYM(Lend_aeabi_ldivmod)
test_div_by_zero signed
- sub sp, sp, #8
-#if defined(__thumb2__)
- mov ip, sp
- push {ip, lr}
+#if defined(__thumb2__) && CAN_USE_LDRD
+ sub ip, sp, #8
+ strd ip,lr, [sp, #-16]!
#else
+ sub sp, sp, #8
do_push {sp, lr}
#endif
+ cmp xxh, #0
+ blt 1f
+ cmp yyh, #0
+ blt 2f
+
+98: cfi_push 98b - __aeabi_ldivmod, 0xe, -0xc, 0x10
+ bl SYM(__udivmoddi4) __PLT__
+ ldr lr, [sp, #4]
+#if CAN_USE_LDRD
+ ldrd r2, r3, [sp, #8]
+ add sp, sp, #16
+#else
+ add sp, sp, #8
+ do_pop {r2, r3}
+#endif
+ RET
+1: /* xxh:xxl is negative */
+ rsbs xxl, xxl, #0
+ sbc xxh, xxh, xxh, lsl #1
+ cmp yyh, #0
+ blt 3f
+98: cfi_push 98b - __aeabi_ldivmod, 0xe, -0xc, 0x10
+ bl SYM(__udivmoddi4) __PLT__
+ ldr lr, [sp, #4]
+#if CAN_USE_LDRD
+ ldrd r2, r3, [sp, #8]
+ add sp, sp, #16
+#else
+ add sp, sp, #8
+ do_pop {r2, r3}
+#endif
+ rsbs xxl, xxl, #0
+ sbc xxh, xxh, xxh, lsl #1
+ rsbs yyl, yyl, #0
+ sbc yyh, yyh, yyh, lsl #1
+ RET
+
+2: /* only yyh:yyl is negative */
+ rsbs yyl, yyl, #0
+ sbc yyh, yyh, yyh, lsl #1
98: cfi_push 98b - __aeabi_ldivmod, 0xe, -0xc, 0x10
- bl SYM(__gnu_ldivmod_helper) __PLT__
+ bl SYM(__udivmoddi4) __PLT__
ldr lr, [sp, #4]
+#if CAN_USE_LDRD
+ ldrd r2, r3, [sp, #8]
+ add sp, sp, #16
+#else
add sp, sp, #8
do_pop {r2, r3}
+#endif
+ rsbs xxl, xxl, #0
+ sbc xxh, xxh, xxh, lsl #1
RET
+
+3: /* both xxh:xxl and yyh:yyl are negative */
+ rsbs yyl, yyl, #0
+ sbc yyh, yyh, yyh, lsl #1
cfi_end LSYM(Lend_aeabi_ldivmod)
+98: cfi_push 98b - __aeabi_ldivmod, 0xe, -0xc, 0x10
+ bl SYM(__udivmoddi4) __PLT__
+ ldr lr, [sp, #4]
+#if CAN_USE_LDRD
+ ldrd r2, r3, [sp, #8]
+ add sp, sp, #16
+#else
+ add sp, sp, #8
+ do_pop {r2, r3}
+#endif
+ rsbs yyl, yyl, #0
+ sbc yyh, yyh, yyh, lsl #1
+ RET
#endif /* L_aeabi_ldivmod */
--
1.8.3.2
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/time.h>
/* Convert a struct timeval to seconds as a double: the tv_sec field plus
   the tv_usec field scaled from microseconds to seconds.  */
double tv_to_s(struct timeval tv)
{
    const double whole_seconds = tv.tv_sec;
    const double fraction = (double)tv.tv_usec / 1.0e6;

    return whole_seconds + fraction;
}
/* Loop increment: the value 0x7fffffffffff0000 divided into 100000000
   equal steps.  */
#define STEP (0x7fffffffffff0000/100000000)
/* Exclusive upper bound for the loop counters in main (the loops run while
   the counter is < END).  It is also passed as the fixed dividend in the
   loops that vary the divisor.  */
#define END (0x7fffffffffff0001-STEP)
/* Initial counter value for the loops that vary the dividend.  */
#define START1 (37ll)
/* Initial counter value for the loops that vary the divisor.  */
#define START2 (3ll)
/* Local declarations of the 64-bit division helpers being benchmarked, so
   that main can call them by name.
   NOTE(review): these are declared as returning a single 64-bit value; the
   helpers presumably also hand back a remainder, which this declaration
   ignores -- confirm against the ARM run-time ABI.  */
uint64_t __aeabi_uldivmod(uint64_t,uint64_t);
int64_t __aeabi_ldivmod(int64_t,int64_t);
/* Benchmark driver.  Runs four timed loops and prints the wall-clock time
   of each, measured with gettimeofday:
     loop1 unsigned - __aeabi_uldivmod with fixed dividend END, divisor
                      stepping from START2 by STEP;
     loop2 unsigned - __aeabi_uldivmod with fixed divisor 373459, dividend
                      stepping from START1 by STEP * 5;
     loop1 signed   - as loop1 unsigned, using __aeabi_ldivmod;
     loop2 signed   - as loop2 unsigned, using __aeabi_ldivmod.
   argc and argv are not used.  Always returns 0.  */
int main(int argc, char **argv)
{
    double elapsed[4];
    struct timeval t_begin, t_finish;
    /* Results are stored to volatile objects so that every call's result
       is written out.  */
    volatile uint64_t unsigned_sink;
    volatile int64_t signed_sink;

    /* Unsigned, varying divisor.  */
    gettimeofday (&t_begin, NULL);
    for (uint64_t divisor = START2; divisor < END; divisor += STEP)
        unsigned_sink = __aeabi_uldivmod(END, divisor);
    gettimeofday (&t_finish, NULL);
    elapsed[0] = tv_to_s (t_finish) - tv_to_s (t_begin);

    /* Unsigned, varying dividend.  */
    gettimeofday (&t_begin, NULL);
    for (uint64_t dividend = START1; dividend < END; dividend += STEP * 5)
        unsigned_sink = __aeabi_uldivmod(dividend, 373459);
    gettimeofday (&t_finish, NULL);
    elapsed[1] = tv_to_s (t_finish) - tv_to_s (t_begin);

    /* Signed, varying divisor.  */
    gettimeofday (&t_begin, NULL);
    for (int64_t divisor = START2; divisor < END; divisor += STEP)
        signed_sink = __aeabi_ldivmod(END, divisor);
    gettimeofday (&t_finish, NULL);
    elapsed[2] = tv_to_s (t_finish) - tv_to_s (t_begin);

    /* Signed, varying dividend.  */
    gettimeofday (&t_begin, NULL);
    for (int64_t dividend = START1; dividend < END; dividend += STEP * 5)
        signed_sink = __aeabi_ldivmod(dividend, 373459);
    gettimeofday (&t_finish, NULL);
    elapsed[3] = tv_to_s (t_finish) - tv_to_s (t_begin);

    printf ("loop1 unsigned: %12f\n"
            "loop2 unsigned: %12f\n"
            "loop1 signed: %12f\n"
            "loop2 signed: %12f\n",
            elapsed[0], elapsed[1], elapsed[2], elapsed[3]);
    return 0;
}