This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCHv2, ARM, libgcc] New aeabi_idiv function for armv6-m
- From: "Andre Vieira (lists)" <Andre dot SimoesDiasVieira at arm dot com>
- To: gcc-patches at gcc dot gnu dot org
- Cc: Ramana Radhakrishnan <Ramana dot Radhakrishnan at arm dot com>
- Date: Wed, 6 Jul 2016 11:52:34 +0100
- Subject: Re: [PATCHv2, ARM, libgcc] New aeabi_idiv function for armv6-m
- Authentication-results: sourceware.org; auth=none
- References: <561D38F7.809@arm.com> <577672E3.6080002@foss.arm.com>
On 01/07/16 14:40, Ramana Radhakrishnan wrote:
>
>
> On 13/10/15 18:01, Andre Vieira wrote:
>> This patch ports the aeabi_idiv routine from Linaro Cortex-Strings (https://git.linaro.org/toolchain/cortex-strings.git), which was contributed by ARM under Free BSD license.
>>
>> The new aeabi_idiv routine is used to replace the one in libgcc/config/arm/lib1funcs.S. This replacement happens within the Thumb1 wrapper. The new routine is under LGPLv3 license.
>
> This is not under LGPLv3 . It is under GPLv3 with the runtime library exception license, there's a difference. Assuming your licensing expectation is ok .... read on for more of a review.
>
>>
>> The main advantage of this version is that it can improve the performance of the aeabi_idiv function for Thumb1. This solution will also increase the code size. So it will only be used if __OPTIMIZE_SIZE__ is not defined.
>>
>> Make check passed for armv6-m.
>>
>> libgcc/ChangeLog:
>> 2015-08-10 Hale Wang <hale.wang@arm.com>
>> Andre Vieira <andre.simoesdiasvieira@arm.com>
>>
>> * config/arm/lib1funcs.S: Add new wrapper.
>>
>> 0001-integer-division.patch
>>
>>
>> From 832a3d6af6f06399f70b5a4ac3727d55960c93b7 Mon Sep 17 00:00:00 2001
>> From: Andre Simoes Dias Vieira <andsim01@arm.com>
>> Date: Fri, 21 Aug 2015 14:23:28 +0100
>> Subject: [PATCH] new wrapper idivmod
>>
>> ---
>> libgcc/config/arm/lib1funcs.S | 250 ++++++++++++++++++++++++++++++++++++------
>> 1 file changed, 217 insertions(+), 33 deletions(-)
>>
>> diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
>> index 252efcbd5385cc58a5ce1e48c6816d36a6f4c797..c9e544114590da8cde88382bea0f67206e593816 100644
>> --- a/libgcc/config/arm/lib1funcs.S
>> +++ b/libgcc/config/arm/lib1funcs.S
>> @@ -306,34 +306,12 @@ LSYM(Lend_fde):
>> #ifdef __ARM_EABI__
>> .macro THUMB_LDIV0 name signed
>> #if defined(__ARM_ARCH_6M__)
>> - .ifc \signed, unsigned
>> - cmp r0, #0
>> - beq 1f
>> - mov r0, #0
>> - mvn r0, r0 @ 0xffffffff
>> -1:
>> - .else
>> - cmp r0, #0
>> - beq 2f
>> - blt 3f
>> +
>> + push {r0, lr}
>> mov r0, #0
>> - mvn r0, r0
>> - lsr r0, r0, #1 @ 0x7fffffff
>> - b 2f
>> -3: mov r0, #0x80
>> - lsl r0, r0, #24 @ 0x80000000
>> -2:
>> - .endif
>> - push {r0, r1, r2}
>> - ldr r0, 4f
>> - adr r1, 4f
>> - add r0, r1
>> - str r0, [sp, #8]
>> - @ We know we are not on armv4t, so pop pc is safe.
>> - pop {r0, r1, pc}
>> - .align 2
>> -4:
>> - .word __aeabi_idiv0 - 4b
>> + bl SYM(__aeabi_idiv0)
>> + pop {r1, pc}
>> +
>
> I'd still retain the comment about pop pc here because there's often a misconception of merging armv4t and armv6m code.
>
>> #elif defined(__thumb2__)
>> .syntax unified
>> .ifc \signed, unsigned
>> @@ -945,7 +923,170 @@ LSYM(Lover7):
>> add dividend, work
>> .endif
>> LSYM(Lgot_result):
>> -.endm
>> +.endm
>> +
>> +#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
>> +/* If performance is preferred, the following functions are provided. */
>> +
>
> Comment above #if please and also check elsewhere in patch.
>
>> +/* Branch to div(n), and jump to label if curbit is lo than divisior. */
>> +.macro BranchToDiv n, label
>> + lsr curbit, dividend, \n
>> + cmp curbit, divisor
>> + blo \label
>> +.endm
>> +
>> +/* Body of div(n). Shift the divisor in n bits and compare the divisor
>> + and dividend. Update the dividend as the substruction result. */
>> +.macro DoDiv n
>> + lsr curbit, dividend, \n
>> + cmp curbit, divisor
>> + bcc 1f
>> + lsl curbit, divisor, \n
>> + sub dividend, dividend, curbit
>> +
>> +1: adc result, result
>> +.endm
>> +
>> +/* The body of division with positive divisor. Unless the divisor is very
>> + big, shift it up in multiples of four bits, since this is the amount of
>> + unwinding in the main division loop. Continue shifting until the divisor
>> + is larger than the dividend. */
>> +.macro THUMB1_Div_Positive
>> + mov result, #0
>> + BranchToDiv #1, LSYM(Lthumb1_div1)
>> + BranchToDiv #4, LSYM(Lthumb1_div4)
>> + BranchToDiv #8, LSYM(Lthumb1_div8)
>> + BranchToDiv #12, LSYM(Lthumb1_div12)
>> + BranchToDiv #16, LSYM(Lthumb1_div16)
>> +LSYM(Lthumb1_div_large_positive):
>> + mov result, #0xff
>> + lsl divisor, divisor, #8
>> + rev result, result
>> + lsr curbit, dividend, #16
>> + cmp curbit, divisor
>> + blo 1f
>> + asr result, #8
>> + lsl divisor, divisor, #8
>> + beq LSYM(Ldivbyzero_waypoint)
>> +
>> +1: lsr curbit, dividend, #12
>> + cmp curbit, divisor
>> + blo LSYM(Lthumb1_div12)
>> + b LSYM(Lthumb1_div16)
>> +LSYM(Lthumb1_div_loop):
>> + lsr divisor, divisor, #8
>> +LSYM(Lthumb1_div16):
>> + Dodiv #15
>> + Dodiv #14
>> + Dodiv #13
>> + Dodiv #12
>> +LSYM(Lthumb1_div12):
>> + Dodiv #11
>> + Dodiv #10
>> + Dodiv #9
>> + Dodiv #8
>> + bcs LSYM(Lthumb1_div_loop)
>> +LSYM(Lthumb1_div8):
>> + Dodiv #7
>> + Dodiv #6
>> + Dodiv #5
>> +LSYM(Lthumb1_div5):
>> + Dodiv #4
>> +LSYM(Lthumb1_div4):
>> + Dodiv #3
>> +LSYM(Lthumb1_div3):
>> + Dodiv #2
>> +LSYM(Lthumb1_div2):
>> + Dodiv #1
>> +LSYM(Lthumb1_div1):
>> + sub divisor, dividend, divisor
>> + bcs 1f
>> + cpy divisor, dividend
>> +
>> +1: adc result, result
>> + cpy dividend, result
>> + RET
>> +
>> +LSYM(Ldivbyzero_waypoint):
>> + b LSYM(Ldiv0)
>> +.endm
>> +
>> +/* The body of division with negative divisor. Similar with
>> + THUMB1_Div_Positive except that the shift steps are in multiples
>> + of six bits. */
>> +.macro THUMB1_Div_Negative
>> + lsr result, divisor, #31
>> + beq 1f
>> + neg divisor, divisor
>> +
>> +1: asr curbit, dividend, #32
>> + bcc 2f
>> + neg dividend, dividend
>> +
>> +2: eor curbit, result
>> + mov result, #0
>> + cpy ip, curbit
>> + BranchToDiv #4, LSYM(Lthumb1_div_negative4)
>> + BranchToDiv #8, LSYM(Lthumb1_div_negative8)
>> +LSYM(Lthumb1_div_large):
>> + mov result, #0xfc
>> + lsl divisor, divisor, #6
>> + rev result, result
>> + lsr curbit, dividend, #8
>> + cmp curbit, divisor
>> + blo LSYM(Lthumb1_div_negative8)
>> +
>> + lsl divisor, divisor, #6
>> + asr result, result, #6
>> + cmp curbit, divisor
>> + blo LSYM(Lthumb1_div_negative8)
>> +
>> + lsl divisor, divisor, #6
>> + asr result, result, #6
>> + cmp curbit, divisor
>> + blo LSYM(Lthumb1_div_negative8)
>> +
>> + lsl divisor, divisor, #6
>> + beq LSYM(Ldivbyzero_negative)
>> + asr result, result, #6
>> + b LSYM(Lthumb1_div_negative8)
>> +LSYM(Lthumb1_div_negative_loop):
>> + lsr divisor, divisor, #6
>> +LSYM(Lthumb1_div_negative8):
>> + DoDiv #7
>> + DoDiv #6
>> + DoDiv #5
>> + DoDiv #4
>> +LSYM(Lthumb1_div_negative4):
>> + DoDiv #3
>> + DoDiv #2
>> + bcs LSYM(Lthumb1_div_negative_loop)
>> + DoDiv #1
>> + sub divisor, dividend, divisor
>> + bcs 1f
>> + cpy divisor, dividend
>> +
>> +1: cpy curbit, ip
>> + adc result, result
>> + asr curbit, curbit, #1
>> + cpy dividend, result
>> + bcc 2f
>> + neg dividend, dividend
>> + cmp curbit, #0
>> +
>> +2: bpl 3f
>> + neg divisor, divisor
>> +
>> +3: RET
>> +
>> +LSYM(Ldivbyzero_negative):
>> + cpy curbit, ip
>> + asr curbit, curbit, #1
>> + bcc LSYM(Ldiv0)
>> + neg dividend, dividend
>> +.endm
>> +#endif /* ARM Thumb version. */
>> +
>> /* ------------------------------------------------------------------------ */
>> /* Start of the Real Functions */
>> /* ------------------------------------------------------------------------ */
>> @@ -955,6 +1096,7 @@ LSYM(Lgot_result):
>>
>> FUNC_START udivsi3
>> FUNC_ALIAS aeabi_uidiv udivsi3
>> +#if defined(__OPTIMIZE_SIZE__)
>>
>> cmp divisor, #0
>> beq LSYM(Ldiv0)
>> @@ -972,6 +1114,14 @@ LSYM(udivsi3_skip_div0_test):
>> pop { work }
>> RET
>>
>> +#else
>> + /* Implementation of aeabi_uidiv for ARMv6m. This version is only
>> + used in ARMv6-M when we need an efficient implementation. */
>> +LSYM(udivsi3_skip_div0_test):
>> + THUMB1_Div_Positive
>> +
>> +#endif /* __OPTIMIZE_SIZE__ */
>> +
>> #elif defined(__ARM_ARCH_EXT_IDIV__)
>>
>> ARM_FUNC_START udivsi3
>> @@ -1023,12 +1173,21 @@ LSYM(udivsi3_skip_div0_test):
>> FUNC_START aeabi_uidivmod
>> cmp r1, #0
>> beq LSYM(Ldiv0)
>> +# if defined(__OPTIMIZE_SIZE__)
>> push {r0, r1, lr}
>> bl LSYM(udivsi3_skip_div0_test)
>> POP {r1, r2, r3}
>> mul r2, r0
>> sub r1, r1, r2
>> bx r3
>> +# else
>> + /* Both the quotient and remainder are calculated simultaneously
>> + in THUMB1_Div_Positive. There is no need to calculate the
>> + remainder again here. */
>> + b LSYM(udivsi3_skip_div0_test)
>> + RET
>> +# endif /* __OPTIMIZE_SIZE__ */
>> +
>> #elif defined(__ARM_ARCH_EXT_IDIV__)
>> ARM_FUNC_START aeabi_uidivmod
>> cmp r1, #0
>> @@ -1084,7 +1243,7 @@ LSYM(Lover10):
>> RET
>>
>> #else /* ARM version. */
>> -
>> +
>> FUNC_START umodsi3
>>
>> subs r2, r1, #1 @ compare divisor with 1
>> @@ -1109,8 +1268,9 @@ LSYM(Lover10):
>>
>> #if defined(__prefer_thumb__)
>>
>> - FUNC_START divsi3
>> + FUNC_START divsi3
>> FUNC_ALIAS aeabi_idiv divsi3
>> +#if defined(__OPTIMIZE_SIZE__)
>>
>> cmp divisor, #0
>> beq LSYM(Ldiv0)
>> @@ -1133,7 +1293,7 @@ LSYM(Lover11):
>> blo LSYM(Lgot_result)
>>
>> THUMB_DIV_MOD_BODY 0
>> -
>> +
>> mov r0, result
>> mov work, ip
>> cmp work, #0
>> @@ -1142,6 +1302,21 @@ LSYM(Lover11):
>> LSYM(Lover12):
>> pop { work }
>> RET
>> +#else
>> + /* Implementation of aeabi_idiv for ARMv6m. This version is only
>> + used in ARMv6-M when we need an efficient implementation. */
>> +LSYM(divsi3_skip_div0_test):
>> + cpy curbit, dividend
>> + orr curbit, divisor
>> + bmi LSYM(Lthumb1_div_negative)
>> +
>> +LSYM(Lthumb1_div_positive):
>> + THUMB1_Div_Positive
>> +
>> +LSYM(Lthumb1_div_negative):
>> + THUMB1_Div_Negative
>> +
>> +#endif /* __OPTIMIZE_SIZE__ */
>>
>> #elif defined(__ARM_ARCH_EXT_IDIV__)
>>
>> @@ -1154,8 +1329,8 @@ LSYM(Lover12):
>> RET
>>
>> #else /* ARM/Thumb-2 version. */
>> -
>> - ARM_FUNC_START divsi3
>> +
>> + ARM_FUNC_START divsi3
>> ARM_FUNC_ALIAS aeabi_idiv divsi3
>>
>> cmp r1, #0
>> @@ -1209,12 +1384,21 @@ LSYM(divsi3_skip_div0_test):
>> FUNC_START aeabi_idivmod
>> cmp r1, #0
>> beq LSYM(Ldiv0)
>> +# if defined(__OPTIMIZE_SIZE__)
>> push {r0, r1, lr}
>> bl LSYM(divsi3_skip_div0_test)
>> POP {r1, r2, r3}
>> mul r2, r0
>> sub r1, r1, r2
>> bx r3
>> +# else
>> + /* Both the quotient and remainder are calculated simultaneously
>> + in THUMB1_Div_Positive and THUMB1_Div_Negative. There is no
>> + need to calculate the remainder again here. */
>> + b LSYM(divsi3_skip_div0_test)
>> + RET
>> +# endif /* __OPTIMIZE_SIZE__ */
>> +
>> #elif defined(__ARM_ARCH_EXT_IDIV__)
>> ARM_FUNC_START aeabi_idivmod
>> cmp r1, #0
>> -- 1.9.1
>>
>
> Otherwise OK if no regressions and the following request passes.
>
> Can you ensure that libgcc for one ARM state and one Thumb2 state non-v6m configuration should give identical binaries with and without your patch, no ?
>
> regards
> Ramana
>
Hi Ramana,
Thank you for the comments. Sorry about the license; there must have been a
mix-up somewhere.
I put back the 'pop pc is safe' assembly comment and I moved some
comments before the #if and #else as requested. I left some in place
because they did not apply to the whole block but only to the first
assembly instruction after the #if/#else.
I checked that the assembly generated for libgcc was the same with and
without the patch for armv7-a in arm mode and armv7-m in thumb mode.
Is this OK?
Cheers,
Andre
libgcc/ChangeLog:
2016-07-06 Hale Wang <hale.wang@arm.com>
Andre Vieira <andre.simoesdiasvieira@arm.com>
* config/arm/lib1funcs.S: Add new wrapper.
From b5b129e698b9e7446907d3da1fbce0236b09b67c Mon Sep 17 00:00:00 2001
From: Andre Simoes Dias Vieira <andsim01@arm.com>
Date: Mon, 16 May 2016 18:34:52 +0100
Subject: [PATCH 1/2] integer division
---
libgcc/config/arm/lib1funcs.S | 250 ++++++++++++++++++++++++++++++++++++------
1 file changed, 218 insertions(+), 32 deletions(-)
diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index 375a5135110895faa44267ebee045fd315515027..8a245b7a9333b249f120cd7e7ecc77248b150610 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -306,34 +306,13 @@ LSYM(Lend_fde):
#ifdef __ARM_EABI__
.macro THUMB_LDIV0 name signed
#if defined(__ARM_ARCH_6M__)
- .ifc \signed, unsigned
- cmp r0, #0
- beq 1f
- mov r0, #0
- mvn r0, r0 @ 0xffffffff
-1:
- .else
- cmp r0, #0
- beq 2f
- blt 3f
+
+ push {r0, lr}
mov r0, #0
- mvn r0, r0
- lsr r0, r0, #1 @ 0x7fffffff
- b 2f
-3: mov r0, #0x80
- lsl r0, r0, #24 @ 0x80000000
-2:
- .endif
- push {r0, r1, r2}
- ldr r0, 4f
- adr r1, 4f
- add r0, r1
- str r0, [sp, #8]
+ bl SYM(__aeabi_idiv0)
@ We know we are not on armv4t, so pop pc is safe.
- pop {r0, r1, pc}
- .align 2
-4:
- .word __aeabi_idiv0 - 4b
+ pop {r1, pc}
+
#elif defined(__thumb2__)
.syntax unified
.ifc \signed, unsigned
@@ -945,7 +924,170 @@ LSYM(Lover7):
add dividend, work
.endif
LSYM(Lgot_result):
-.endm
+.endm
+
+/* If performance is preferred, the following functions are provided. */
+#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
+
+/* Branch to div(n), and jump to label if curbit is lower than divisor. */
+.macro BranchToDiv n, label
+ lsr curbit, dividend, \n
+ cmp curbit, divisor
+ blo \label
+.endm
+
+/* Body of div(n). Shift the divisor in n bits and compare the divisor
+ and dividend. Update the dividend as the subtraction result. */
+.macro DoDiv n
+ lsr curbit, dividend, \n
+ cmp curbit, divisor
+ bcc 1f
+ lsl curbit, divisor, \n
+ sub dividend, dividend, curbit
+
+1: adc result, result
+.endm
+
+/* The body of division with positive divisor. Unless the divisor is very
+ big, shift it up in multiples of four bits, since this is the amount of
+ unwinding in the main division loop. Continue shifting until the divisor
+ is larger than the dividend. */
+.macro THUMB1_Div_Positive
+ mov result, #0
+ BranchToDiv #1, LSYM(Lthumb1_div1)
+ BranchToDiv #4, LSYM(Lthumb1_div4)
+ BranchToDiv #8, LSYM(Lthumb1_div8)
+ BranchToDiv #12, LSYM(Lthumb1_div12)
+ BranchToDiv #16, LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_large_positive):
+ mov result, #0xff
+ lsl divisor, divisor, #8
+ rev result, result
+ lsr curbit, dividend, #16
+ cmp curbit, divisor
+ blo 1f
+ asr result, #8
+ lsl divisor, divisor, #8
+ beq LSYM(Ldivbyzero_waypoint)
+
+1: lsr curbit, dividend, #12
+ cmp curbit, divisor
+ blo LSYM(Lthumb1_div12)
+ b LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_loop):
+ lsr divisor, divisor, #8
+LSYM(Lthumb1_div16):
+ Dodiv #15
+ Dodiv #14
+ Dodiv #13
+ Dodiv #12
+LSYM(Lthumb1_div12):
+ Dodiv #11
+ Dodiv #10
+ Dodiv #9
+ Dodiv #8
+ bcs LSYM(Lthumb1_div_loop)
+LSYM(Lthumb1_div8):
+ Dodiv #7
+ Dodiv #6
+ Dodiv #5
+LSYM(Lthumb1_div5):
+ Dodiv #4
+LSYM(Lthumb1_div4):
+ Dodiv #3
+LSYM(Lthumb1_div3):
+ Dodiv #2
+LSYM(Lthumb1_div2):
+ Dodiv #1
+LSYM(Lthumb1_div1):
+ sub divisor, dividend, divisor
+ bcs 1f
+ cpy divisor, dividend
+
+1: adc result, result
+ cpy dividend, result
+ RET
+
+LSYM(Ldivbyzero_waypoint):
+ b LSYM(Ldiv0)
+.endm
+
+/* The body of division with negative divisor. Similar to
+ THUMB1_Div_Positive except that the shift steps are in multiples
+ of six bits. */
+.macro THUMB1_Div_Negative
+ lsr result, divisor, #31
+ beq 1f
+ neg divisor, divisor
+
+1: asr curbit, dividend, #32
+ bcc 2f
+ neg dividend, dividend
+
+2: eor curbit, result
+ mov result, #0
+ cpy ip, curbit
+ BranchToDiv #4, LSYM(Lthumb1_div_negative4)
+ BranchToDiv #8, LSYM(Lthumb1_div_negative8)
+LSYM(Lthumb1_div_large):
+ mov result, #0xfc
+ lsl divisor, divisor, #6
+ rev result, result
+ lsr curbit, dividend, #8
+ cmp curbit, divisor
+ blo LSYM(Lthumb1_div_negative8)
+
+ lsl divisor, divisor, #6
+ asr result, result, #6
+ cmp curbit, divisor
+ blo LSYM(Lthumb1_div_negative8)
+
+ lsl divisor, divisor, #6
+ asr result, result, #6
+ cmp curbit, divisor
+ blo LSYM(Lthumb1_div_negative8)
+
+ lsl divisor, divisor, #6
+ beq LSYM(Ldivbyzero_negative)
+ asr result, result, #6
+ b LSYM(Lthumb1_div_negative8)
+LSYM(Lthumb1_div_negative_loop):
+ lsr divisor, divisor, #6
+LSYM(Lthumb1_div_negative8):
+ DoDiv #7
+ DoDiv #6
+ DoDiv #5
+ DoDiv #4
+LSYM(Lthumb1_div_negative4):
+ DoDiv #3
+ DoDiv #2
+ bcs LSYM(Lthumb1_div_negative_loop)
+ DoDiv #1
+ sub divisor, dividend, divisor
+ bcs 1f
+ cpy divisor, dividend
+
+1: cpy curbit, ip
+ adc result, result
+ asr curbit, curbit, #1
+ cpy dividend, result
+ bcc 2f
+ neg dividend, dividend
+ cmp curbit, #0
+
+2: bpl 3f
+ neg divisor, divisor
+
+3: RET
+
+LSYM(Ldivbyzero_negative):
+ cpy curbit, ip
+ asr curbit, curbit, #1
+ bcc LSYM(Ldiv0)
+ neg dividend, dividend
+.endm
+#endif /* __prefer_thumb__ && !__OPTIMIZE_SIZE__ */
+
/* ------------------------------------------------------------------------ */
/* Start of the Real Functions */
/* ------------------------------------------------------------------------ */
@@ -955,6 +1097,7 @@ LSYM(Lgot_result):
FUNC_START udivsi3
FUNC_ALIAS aeabi_uidiv udivsi3
+#if defined(__OPTIMIZE_SIZE__)
cmp divisor, #0
beq LSYM(Ldiv0)
@@ -972,6 +1115,14 @@ LSYM(udivsi3_skip_div0_test):
pop { work }
RET
+/* Implementation of aeabi_uidiv for ARMv6-M. This version is only
+ used in ARMv6-M when we need an efficient implementation. */
+#else
+LSYM(udivsi3_skip_div0_test):
+ THUMB1_Div_Positive
+
+#endif /* __OPTIMIZE_SIZE__ */
+
#elif defined(__ARM_ARCH_EXT_IDIV__)
ARM_FUNC_START udivsi3
@@ -1023,12 +1174,21 @@ LSYM(udivsi3_skip_div0_test):
FUNC_START aeabi_uidivmod
cmp r1, #0
beq LSYM(Ldiv0)
+# if defined(__OPTIMIZE_SIZE__)
push {r0, r1, lr}
bl LSYM(udivsi3_skip_div0_test)
POP {r1, r2, r3}
mul r2, r0
sub r1, r1, r2
bx r3
+# else
+ /* Both the quotient and remainder are calculated simultaneously
+ in THUMB1_Div_Positive. There is no need to calculate the
+ remainder again here. */
+ b LSYM(udivsi3_skip_div0_test)
+ RET
+# endif /* __OPTIMIZE_SIZE__ */
+
#elif defined(__ARM_ARCH_EXT_IDIV__)
ARM_FUNC_START aeabi_uidivmod
cmp r1, #0
@@ -1084,7 +1244,7 @@ LSYM(Lover10):
RET
#else /* ARM version. */
-
+
FUNC_START umodsi3
subs r2, r1, #1 @ compare divisor with 1
@@ -1109,8 +1269,9 @@ LSYM(Lover10):
#if defined(__prefer_thumb__)
- FUNC_START divsi3
+ FUNC_START divsi3
FUNC_ALIAS aeabi_idiv divsi3
+#if defined(__OPTIMIZE_SIZE__)
cmp divisor, #0
beq LSYM(Ldiv0)
@@ -1133,7 +1294,7 @@ LSYM(Lover11):
blo LSYM(Lgot_result)
THUMB_DIV_MOD_BODY 0
-
+
mov r0, result
mov work, ip
cmp work, #0
@@ -1143,6 +1304,22 @@ LSYM(Lover12):
pop { work }
RET
+/* Implementation of aeabi_idiv for ARMv6-M. This version is only
+ used in ARMv6-M when we need an efficient implementation. */
+#else
+LSYM(divsi3_skip_div0_test):
+ cpy curbit, dividend
+ orr curbit, divisor
+ bmi LSYM(Lthumb1_div_negative)
+
+LSYM(Lthumb1_div_positive):
+ THUMB1_Div_Positive
+
+LSYM(Lthumb1_div_negative):
+ THUMB1_Div_Negative
+
+#endif /* __OPTIMIZE_SIZE__ */
+
#elif defined(__ARM_ARCH_EXT_IDIV__)
ARM_FUNC_START divsi3
@@ -1154,8 +1331,8 @@ LSYM(Lover12):
RET
#else /* ARM/Thumb-2 version. */
-
- ARM_FUNC_START divsi3
+
+ ARM_FUNC_START divsi3
ARM_FUNC_ALIAS aeabi_idiv divsi3
cmp r1, #0
@@ -1209,12 +1386,21 @@ LSYM(divsi3_skip_div0_test):
FUNC_START aeabi_idivmod
cmp r1, #0
beq LSYM(Ldiv0)
+# if defined(__OPTIMIZE_SIZE__)
push {r0, r1, lr}
bl LSYM(divsi3_skip_div0_test)
POP {r1, r2, r3}
mul r2, r0
sub r1, r1, r2
bx r3
+# else
+ /* Both the quotient and remainder are calculated simultaneously
+ in THUMB1_Div_Positive and THUMB1_Div_Negative. There is no
+ need to calculate the remainder again here. */
+ b LSYM(divsi3_skip_div0_test)
+ RET
+# endif /* __OPTIMIZE_SIZE__ */
+
#elif defined(__ARM_ARCH_EXT_IDIV__)
ARM_FUNC_START aeabi_idivmod
cmp r1, #0
--
1.9.1