This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Fwd: [PATCH, ARM] Improve 64 bit division performance


[resending as text/plain]

Hi

These patches optimise 64-bit division by removing the use of the
__gnu_[u]ldivmod_helper functions, thereby avoiding the redundant
calculation of the remainder in those functions.

Bootstrapped, tested and checked for arm-unknown-linux-gnueabihf.

Benchmarked on Chromebook and Raspberry Pi using the attached divbench3.c.
Loop1 varies the divisor and loop2 varies the dividend; times are in seconds.

Chromebook:

before:
loop1 unsigned:         3.474419
loop2 unsigned:         6.564871
loop1 signed:           4.127967
loop2 signed:           6.071490

after:
loop1 unsigned:         2.781364
loop2 unsigned:         6.166478
loop1 signed:           2.800974
loop2 signed:           6.129588

Raspberry Pi:

before:
loop1 unsigned:        28.881753
loop2 unsigned:        19.876385
loop1 signed:          32.074941
loop2 signed:          20.594860

after:
loop1 unsigned:        24.893846
loop2 unsigned:        19.537562
loop1 signed:          25.334509
loop2 signed:          19.615088

Any comments? OK for stage 1?


Patch 1:

2014-02-27  Charles Baylis  <charles.baylis@linaro.org>

        * config/arm/bpabi.S (__aeabi_uldivmod): Perform division using call
        to __udivmoddi4.


Patch 2:

2014-02-27  Charles Baylis  <charles.baylis@linaro.org>

        * config/arm/bpabi.S (__aeabi_ldivmod): Perform signed division via
        call to __udivmoddi4 and fixing up for negative operands.
From 35254b813303e7fb40eb8aa0bb749216fd8f96fc Mon Sep 17 00:00:00 2001
From: Charles Baylis <charles.baylis@linaro.org>
Date: Tue, 25 Feb 2014 18:34:38 +0000
Subject: [PATCH 1/2] Optimise __aeabi_uldivmod

2014-02-25  Charles Baylis  <charles.baylis@linaro.org>

	* config/arm/bpabi.S (__aeabi_uldivmod): Perform division using call
	to __udivmoddi4.
	* config/arm/bpabi.S (__aeabi_uldivmod): Optimise stack pointer
	manipulation.
---
 libgcc/config/arm/bpabi.S | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/libgcc/config/arm/bpabi.S b/libgcc/config/arm/bpabi.S
index 7772301..e020af5 100644
--- a/libgcc/config/arm/bpabi.S
+++ b/libgcc/config/arm/bpabi.S
@@ -120,6 +120,16 @@ ARM_FUNC_START aeabi_ulcmp
 #endif
 .endm
 
+/* we can use STRD/LDRD on v5TE and later, and any Thumb-2 architecture. */
+#if (defined(__ARM_EABI__)                                            \
+     && (defined(__thumb2__)                                          \
+         || (__ARM_ARCH >= 5 && defined(__TARGET_FEATURE_DSP))))
+#define CAN_USE_LDRD 1
+#else
+#define CAN_USE_LDRD 0
+#endif
+
+
 #ifdef L_aeabi_ldivmod
 
 ARM_FUNC_START aeabi_ldivmod
@@ -149,18 +159,23 @@ ARM_FUNC_START aeabi_uldivmod
 	cfi_start	__aeabi_uldivmod, LSYM(Lend_aeabi_uldivmod)
 	test_div_by_zero unsigned
 
-	sub sp, sp, #8
-#if defined(__thumb2__)
-	mov ip, sp
-	push {ip, lr}
+#if defined(__thumb2__) && CAN_USE_LDRD
+	sub ip, sp, #8
+	strd ip,lr, [sp, #-16]!
 #else
+	sub sp, sp, #8
 	do_push {sp, lr}
 #endif
 98:	cfi_push 98b - __aeabi_uldivmod, 0xe, -0xc, 0x10
-	bl SYM(__gnu_uldivmod_helper) __PLT__
+	bl SYM(__udivmoddi4) __PLT__
 	ldr lr, [sp, #4]
+#if CAN_USE_LDRD
+	ldrd r2, r3, [sp, #8]
+	add sp, sp, #16
+#else
 	add sp, sp, #8
 	do_pop {r2, r3}
+#endif
 	RET
 	cfi_end	LSYM(Lend_aeabi_uldivmod)
 
-- 
1.8.3.2

From 975d9c624e77ee00476e6866250b0e2e31461fca Mon Sep 17 00:00:00 2001
From: Charles Baylis <charles.baylis@linaro.org>
Date: Tue, 25 Feb 2014 16:27:59 +0000
Subject: [PATCH 2/2] Optimise __aeabi_ldivmod

2014-02-25  Charles Baylis  <charles.baylis@linaro.org>

        * config/arm/bpabi.S (__aeabi_ldivmod): Perform signed division using
	unsigned division via call to __udivmoddi4 and additional logic.
---
 libgcc/config/arm/bpabi.S | 74 +++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 69 insertions(+), 5 deletions(-)

diff --git a/libgcc/config/arm/bpabi.S b/libgcc/config/arm/bpabi.S
index e020af5..8b75a28 100644
--- a/libgcc/config/arm/bpabi.S
+++ b/libgcc/config/arm/bpabi.S
@@ -136,20 +136,84 @@ ARM_FUNC_START aeabi_ldivmod
 	cfi_start	__aeabi_ldivmod, LSYM(Lend_aeabi_ldivmod)
 	test_div_by_zero signed
 
-	sub sp, sp, #8
-#if defined(__thumb2__)
-	mov ip, sp
-	push {ip, lr}
+#if defined(__thumb2__) && CAN_USE_LDRD
+	sub ip, sp, #8
+	strd ip,lr, [sp, #-16]!
 #else
+	sub sp, sp, #8
 	do_push {sp, lr}
 #endif
+	cmp xxh, #0
+	blt 1f
+	cmp yyh, #0
+	blt 2f
+
+98:	cfi_push 98b - __aeabi_ldivmod, 0xe, -0xc, 0x10
+	bl SYM(__udivmoddi4) __PLT__
+	ldr lr, [sp, #4]
+#if CAN_USE_LDRD
+	ldrd r2, r3, [sp, #8]
+	add sp, sp, #16
+#else
+	add sp, sp, #8
+	do_pop {r2, r3}
+#endif
+	RET
+1: /* xxh:xxl is negative */
+	rsbs xxl, xxl, #0
+	sbc xxh, xxh, xxh, lsl #1
+	cmp yyh, #0
+	blt 3f
+98:	cfi_push 98b - __aeabi_ldivmod, 0xe, -0xc, 0x10
+	bl SYM(__udivmoddi4) __PLT__
+	ldr lr, [sp, #4]
+#if CAN_USE_LDRD
+	ldrd r2, r3, [sp, #8]
+	add sp, sp, #16
+#else
+	add sp, sp, #8
+	do_pop {r2, r3}
+#endif
+	rsbs xxl, xxl, #0
+	sbc xxh, xxh, xxh, lsl #1
+	rsbs yyl, yyl, #0
+	sbc yyh, yyh, yyh, lsl #1
+	RET
+
+2: /* only yyh:yyl is negative */
+	rsbs yyl, yyl, #0
+	sbc yyh, yyh, yyh, lsl #1
 98:	cfi_push 98b - __aeabi_ldivmod, 0xe, -0xc, 0x10
-	bl SYM(__gnu_ldivmod_helper) __PLT__
+	bl SYM(__udivmoddi4) __PLT__
 	ldr lr, [sp, #4]
+#if CAN_USE_LDRD
+	ldrd r2, r3, [sp, #8]
+	add sp, sp, #16
+#else
 	add sp, sp, #8
 	do_pop {r2, r3}
+#endif
+	rsbs xxl, xxl, #0
+	sbc xxh, xxh, xxh, lsl #1
 	RET
+
+3: /* both xxh:xxl and yyh:yyl are negative; xxh:xxl was already negated at 1: */
+	rsbs yyl, yyl, #0		/* negate yyh:yyl: low word, sets carry */
+	sbc yyh, yyh, yyh, lsl #1	/* high word: yyh - 2*yyh - borrow */
+98:	cfi_push 98b - __aeabi_ldivmod, 0xe, -0xc, 0x10
+	bl SYM(__udivmoddi4) __PLT__
+	ldr lr, [sp, #4]		/* reload the return address saved on entry */
+#if CAN_USE_LDRD
+	ldrd r2, r3, [sp, #8]		/* fetch the two words from the 8-byte stack slot */
+	add sp, sp, #16
+#else
+	add sp, sp, #8
+	do_pop {r2, r3}
+#endif
+	rsbs yyl, yyl, #0		/* negate yyh:yyl again; NOTE(review): presumably the remainder in r2:r3 - confirm */
+	sbc yyh, yyh, yyh, lsl #1
+	RET
 	cfi_end	LSYM(Lend_aeabi_ldivmod)
 	
 #endif /* L_aeabi_ldivmod */
 
-- 
1.8.3.2

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/time.h>

/* Convert a struct timeval to seconds as a double: the whole seconds
   plus the microsecond field scaled by 1e-6.  */
double tv_to_s(struct timeval tv)
{
  const double whole_seconds = tv.tv_sec;
  const double fractional_seconds = (double)tv.tv_usec / 1.0e6;

  return whole_seconds + fractional_seconds;
}

/* Loop-control constants for the four timed loops in main().  */

/* Base increment of the loop variable.  The divisor-varying loops advance
   by STEP; the dividend-varying loops advance by STEP * 5.  */
#define STEP (0x7fffffffffff0000/100000000)
/* Exclusive upper bound of every loop variable.  Also passed as the fixed
   dividend in the divisor-varying loops.  */
#define END  (0x7fffffffffff0001-STEP)
/* Initial value for the loops that vary the dividend.  */
#define START1 (37ll)
/* Initial value for the loops that vary the divisor.  */
#define START2 (3ll)

/* Prototypes so the benchmark can call the 64-bit division routines
   directly by name.  Only a single 64-bit return value is declared here;
   NOTE(review): presumably that is the quotient, with the remainder
   returned in further registers that this prototype ignores - confirm.  */
uint64_t __aeabi_uldivmod(uint64_t,uint64_t);
int64_t __aeabi_ldivmod(int64_t,int64_t);

/* Benchmark driver: times four loops of 64-bit divisions (unsigned then
   signed, each once with a varying divisor and once with a varying
   dividend) and prints the elapsed wall-clock seconds of each.
   argc and argv are not used.  Always returns 0.  */
int main(int argc, char **argv)
{
  /* Time stamps taken right before and right after each timed loop.  */
  struct timeval t_begin, t_finish;

  /* Elapsed seconds per loop, in execution order.  */
  double unsigned_divisor_s, unsigned_dividend_s;
  double signed_divisor_s, signed_dividend_s;

  /* Volatile sinks: every call result is stored to one of these.  */
  volatile uint64_t usink;
  volatile int64_t ssink;

  uint64_t u;
  int64_t s;

  /* Unsigned: fixed dividend END, divisor stepping up from START2.  */
  gettimeofday (&t_begin, NULL);
  u = START2;
  while (u < END)
    {
      usink = __aeabi_uldivmod(END, u);
      u += STEP;
    }
  gettimeofday (&t_finish, NULL);
  unsigned_divisor_s = tv_to_s (t_finish) - tv_to_s (t_begin);

  /* Unsigned: fixed divisor, dividend stepping up from START1.  */
  gettimeofday (&t_begin, NULL);
  u = START1;
  while (u < END)
    {
      usink = __aeabi_uldivmod(u, 373459);
      u += STEP * 5;
    }
  gettimeofday (&t_finish, NULL);
  unsigned_dividend_s = tv_to_s (t_finish) - tv_to_s (t_begin);

  /* Signed: fixed dividend END, divisor stepping up from START2.  */
  gettimeofday (&t_begin, NULL);
  s = START2;
  while (s < END)
    {
      ssink = __aeabi_ldivmod(END, s);
      s += STEP;
    }
  gettimeofday (&t_finish, NULL);
  signed_divisor_s = tv_to_s (t_finish) - tv_to_s (t_begin);

  /* Signed: fixed divisor, dividend stepping up from START1.  */
  gettimeofday (&t_begin, NULL);
  s = START1;
  while (s < END)
    {
      ssink = __aeabi_ldivmod(s, 373459);
      s += STEP * 5;
    }
  gettimeofday (&t_finish, NULL);
  signed_dividend_s = tv_to_s (t_finish) - tv_to_s (t_begin);

  printf ("loop1 unsigned:     %12f\n"
          "loop2 unsigned:     %12f\n"
          "loop1 signed:       %12f\n"
          "loop2 signed:       %12f\n",
          unsigned_divisor_s, unsigned_dividend_s,
          signed_divisor_s, signed_dividend_s);

  return 0;
}

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]