This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCHv2, ARM, libgcc] New aeabi_idiv function for armv6-m
- From: "Andre Vieira (lists)" <Andre dot SimoesDiasVieira at arm dot com>
- To: gcc-patches at gcc dot gnu dot org
- Cc: Ramana Radhakrishnan <Ramana dot Radhakrishnan at arm dot com>
- Date: Mon, 11 Jul 2016 18:09:44 +0100
- Subject: Re: [PATCHv2, ARM, libgcc] New aeabi_idiv function for armv6-m
- Authentication-results: sourceware.org; auth=none
- References: <561D38F7.809@arm.com> <577672E3.6080002@foss.arm.com> <577CE2F2.1080805@arm.com>
On 06/07/16 11:52, Andre Vieira (lists) wrote:
> On 01/07/16 14:40, Ramana Radhakrishnan wrote:
>>
>>
>> On 13/10/15 18:01, Andre Vieira wrote:
>>> This patch ports the aeabi_idiv routine from Linaro Cortex-Strings (https://git.linaro.org/toolchain/cortex-strings.git), which was contributed by ARM under Free BSD license.
>>>
>>> The new aeabi_idiv routine is used to replace the one in libgcc/config/arm/lib1funcs.S. This replacement happens within the Thumb1 wrapper. The new routine is under LGPLv3 license.
>>
>> This is not under LGPLv3 . It is under GPLv3 with the runtime library exception license, there's a difference. Assuming your licensing expectation is ok .... read on for more of a review.
>>
>>>
>>> The main advantage of this version is that it can improve the performance of the aeabi_idiv function for Thumb1. This solution will also increase the code size. So it will only be used if __OPTIMIZE_SIZE__ is not defined.
>>>
>>> Make check passed for armv6-m.
>>>
>>> libgcc/ChangeLog:
>>> 2015-08-10 Hale Wang <hale.wang@arm.com>
>>> Andre Vieira <andre.simoesdiasvieira@arm.com>
>>>
>>> * config/arm/lib1funcs.S: Add new wrapper.
>>>
>>> 0001-integer-division.patch
>>>
>>>
>>> From 832a3d6af6f06399f70b5a4ac3727d55960c93b7 Mon Sep 17 00:00:00 2001
>>> From: Andre Simoes Dias Vieira <andsim01@arm.com>
>>> Date: Fri, 21 Aug 2015 14:23:28 +0100
>>> Subject: [PATCH] new wrapper idivmod
>>>
>>> ---
>>> libgcc/config/arm/lib1funcs.S | 250 ++++++++++++++++++++++++++++++++++++------
>>> 1 file changed, 217 insertions(+), 33 deletions(-)
>>>
>>> diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
>>> index 252efcbd5385cc58a5ce1e48c6816d36a6f4c797..c9e544114590da8cde88382bea0f67206e593816 100644
>>> --- a/libgcc/config/arm/lib1funcs.S
>>> +++ b/libgcc/config/arm/lib1funcs.S
>>> @@ -306,34 +306,12 @@ LSYM(Lend_fde):
>>> #ifdef __ARM_EABI__
>>> .macro THUMB_LDIV0 name signed
>>> #if defined(__ARM_ARCH_6M__)
>>> - .ifc \signed, unsigned
>>> - cmp r0, #0
>>> - beq 1f
>>> - mov r0, #0
>>> - mvn r0, r0 @ 0xffffffff
>>> -1:
>>> - .else
>>> - cmp r0, #0
>>> - beq 2f
>>> - blt 3f
>>> +
>>> + push {r0, lr}
>>> mov r0, #0
>>> - mvn r0, r0
>>> - lsr r0, r0, #1 @ 0x7fffffff
>>> - b 2f
>>> -3: mov r0, #0x80
>>> - lsl r0, r0, #24 @ 0x80000000
>>> -2:
>>> - .endif
>>> - push {r0, r1, r2}
>>> - ldr r0, 4f
>>> - adr r1, 4f
>>> - add r0, r1
>>> - str r0, [sp, #8]
>>> - @ We know we are not on armv4t, so pop pc is safe.
>>> - pop {r0, r1, pc}
>>> - .align 2
>>> -4:
>>> - .word __aeabi_idiv0 - 4b
>>> + bl SYM(__aeabi_idiv0)
>>> + pop {r1, pc}
>>> +
>>
>> I'd still retain the comment about pop pc here because there's often a misconception of merging armv4t and armv6m code.
>>
>>> #elif defined(__thumb2__)
>>> .syntax unified
>>> .ifc \signed, unsigned
>>> @@ -945,7 +923,170 @@ LSYM(Lover7):
>>> add dividend, work
>>> .endif
>>> LSYM(Lgot_result):
>>> -.endm
>>> +.endm
>>> +
>>> +#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
>>> +/* If performance is preferred, the following functions are provided. */
>>> +
>>
>> Comment above #if please and also check elsewhere in patch.
>>
>>> +/* Branch to div(n), and jump to label if curbit is lower than divisor. */
>>> +.macro BranchToDiv n, label
>>> + lsr curbit, dividend, \n
>>> + cmp curbit, divisor
>>> + blo \label
>>> +.endm
>>> +
>>> +/* Body of div(n). Shift the divisor in n bits and compare the divisor
>>> + and dividend. Update the dividend as the subtraction result. */
>>> +.macro DoDiv n
>>> + lsr curbit, dividend, \n
>>> + cmp curbit, divisor
>>> + bcc 1f
>>> + lsl curbit, divisor, \n
>>> + sub dividend, dividend, curbit
>>> +
>>> +1: adc result, result
>>> +.endm
>>> +
>>> +/* The body of division with positive divisor. Unless the divisor is very
>>> + big, shift it up in multiples of four bits, since this is the amount of
>>> + unwinding in the main division loop. Continue shifting until the divisor
>>> + is larger than the dividend. */
>>> +.macro THUMB1_Div_Positive
>>> + mov result, #0
>>> + BranchToDiv #1, LSYM(Lthumb1_div1)
>>> + BranchToDiv #4, LSYM(Lthumb1_div4)
>>> + BranchToDiv #8, LSYM(Lthumb1_div8)
>>> + BranchToDiv #12, LSYM(Lthumb1_div12)
>>> + BranchToDiv #16, LSYM(Lthumb1_div16)
>>> +LSYM(Lthumb1_div_large_positive):
>>> + mov result, #0xff
>>> + lsl divisor, divisor, #8
>>> + rev result, result
>>> + lsr curbit, dividend, #16
>>> + cmp curbit, divisor
>>> + blo 1f
>>> + asr result, #8
>>> + lsl divisor, divisor, #8
>>> + beq LSYM(Ldivbyzero_waypoint)
>>> +
>>> +1: lsr curbit, dividend, #12
>>> + cmp curbit, divisor
>>> + blo LSYM(Lthumb1_div12)
>>> + b LSYM(Lthumb1_div16)
>>> +LSYM(Lthumb1_div_loop):
>>> + lsr divisor, divisor, #8
>>> +LSYM(Lthumb1_div16):
>>> + Dodiv #15
>>> + Dodiv #14
>>> + Dodiv #13
>>> + Dodiv #12
>>> +LSYM(Lthumb1_div12):
>>> + Dodiv #11
>>> + Dodiv #10
>>> + Dodiv #9
>>> + Dodiv #8
>>> + bcs LSYM(Lthumb1_div_loop)
>>> +LSYM(Lthumb1_div8):
>>> + Dodiv #7
>>> + Dodiv #6
>>> + Dodiv #5
>>> +LSYM(Lthumb1_div5):
>>> + Dodiv #4
>>> +LSYM(Lthumb1_div4):
>>> + Dodiv #3
>>> +LSYM(Lthumb1_div3):
>>> + Dodiv #2
>>> +LSYM(Lthumb1_div2):
>>> + Dodiv #1
>>> +LSYM(Lthumb1_div1):
>>> + sub divisor, dividend, divisor
>>> + bcs 1f
>>> + cpy divisor, dividend
>>> +
>>> +1: adc result, result
>>> + cpy dividend, result
>>> + RET
>>> +
>>> +LSYM(Ldivbyzero_waypoint):
>>> + b LSYM(Ldiv0)
>>> +.endm
>>> +
>>> +/* The body of division with negative divisor. Similar to
>>> + THUMB1_Div_Positive except that the shift steps are in multiples
>>> + of six bits. */
>>> +.macro THUMB1_Div_Negative
>>> + lsr result, divisor, #31
>>> + beq 1f
>>> + neg divisor, divisor
>>> +
>>> +1: asr curbit, dividend, #32
>>> + bcc 2f
>>> + neg dividend, dividend
>>> +
>>> +2: eor curbit, result
>>> + mov result, #0
>>> + cpy ip, curbit
>>> + BranchToDiv #4, LSYM(Lthumb1_div_negative4)
>>> + BranchToDiv #8, LSYM(Lthumb1_div_negative8)
>>> +LSYM(Lthumb1_div_large):
>>> + mov result, #0xfc
>>> + lsl divisor, divisor, #6
>>> + rev result, result
>>> + lsr curbit, dividend, #8
>>> + cmp curbit, divisor
>>> + blo LSYM(Lthumb1_div_negative8)
>>> +
>>> + lsl divisor, divisor, #6
>>> + asr result, result, #6
>>> + cmp curbit, divisor
>>> + blo LSYM(Lthumb1_div_negative8)
>>> +
>>> + lsl divisor, divisor, #6
>>> + asr result, result, #6
>>> + cmp curbit, divisor
>>> + blo LSYM(Lthumb1_div_negative8)
>>> +
>>> + lsl divisor, divisor, #6
>>> + beq LSYM(Ldivbyzero_negative)
>>> + asr result, result, #6
>>> + b LSYM(Lthumb1_div_negative8)
>>> +LSYM(Lthumb1_div_negative_loop):
>>> + lsr divisor, divisor, #6
>>> +LSYM(Lthumb1_div_negative8):
>>> + DoDiv #7
>>> + DoDiv #6
>>> + DoDiv #5
>>> + DoDiv #4
>>> +LSYM(Lthumb1_div_negative4):
>>> + DoDiv #3
>>> + DoDiv #2
>>> + bcs LSYM(Lthumb1_div_negative_loop)
>>> + DoDiv #1
>>> + sub divisor, dividend, divisor
>>> + bcs 1f
>>> + cpy divisor, dividend
>>> +
>>> +1: cpy curbit, ip
>>> + adc result, result
>>> + asr curbit, curbit, #1
>>> + cpy dividend, result
>>> + bcc 2f
>>> + neg dividend, dividend
>>> + cmp curbit, #0
>>> +
>>> +2: bpl 3f
>>> + neg divisor, divisor
>>> +
>>> +3: RET
>>> +
>>> +LSYM(Ldivbyzero_negative):
>>> + cpy curbit, ip
>>> + asr curbit, curbit, #1
>>> + bcc LSYM(Ldiv0)
>>> + neg dividend, dividend
>>> +.endm
>>> +#endif /* ARM Thumb version. */
>>> +
>>> /* ------------------------------------------------------------------------ */
>>> /* Start of the Real Functions */
>>> /* ------------------------------------------------------------------------ */
>>> @@ -955,6 +1096,7 @@ LSYM(Lgot_result):
>>>
>>> FUNC_START udivsi3
>>> FUNC_ALIAS aeabi_uidiv udivsi3
>>> +#if defined(__OPTIMIZE_SIZE__)
>>>
>>> cmp divisor, #0
>>> beq LSYM(Ldiv0)
>>> @@ -972,6 +1114,14 @@ LSYM(udivsi3_skip_div0_test):
>>> pop { work }
>>> RET
>>>
>>> +#else
>>> + /* Implementation of aeabi_uidiv for ARMv6m. This version is only
>>> + used in ARMv6-M when we need an efficient implementation. */
>>> +LSYM(udivsi3_skip_div0_test):
>>> + THUMB1_Div_Positive
>>> +
>>> +#endif /* __OPTIMIZE_SIZE__ */
>>> +
>>> #elif defined(__ARM_ARCH_EXT_IDIV__)
>>>
>>> ARM_FUNC_START udivsi3
>>> @@ -1023,12 +1173,21 @@ LSYM(udivsi3_skip_div0_test):
>>> FUNC_START aeabi_uidivmod
>>> cmp r1, #0
>>> beq LSYM(Ldiv0)
>>> +# if defined(__OPTIMIZE_SIZE__)
>>> push {r0, r1, lr}
>>> bl LSYM(udivsi3_skip_div0_test)
>>> POP {r1, r2, r3}
>>> mul r2, r0
>>> sub r1, r1, r2
>>> bx r3
>>> +# else
>>> + /* Both the quotient and remainder are calculated simultaneously
>>> + in THUMB1_Div_Positive. There is no need to calculate the
>>> + remainder again here. */
>>> + b LSYM(udivsi3_skip_div0_test)
>>> + RET
>>> +# endif /* __OPTIMIZE_SIZE__ */
>>> +
>>> #elif defined(__ARM_ARCH_EXT_IDIV__)
>>> ARM_FUNC_START aeabi_uidivmod
>>> cmp r1, #0
>>> @@ -1084,7 +1243,7 @@ LSYM(Lover10):
>>> RET
>>>
>>> #else /* ARM version. */
>>> -
>>> +
>>> FUNC_START umodsi3
>>>
>>> subs r2, r1, #1 @ compare divisor with 1
>>> @@ -1109,8 +1268,9 @@ LSYM(Lover10):
>>>
>>> #if defined(__prefer_thumb__)
>>>
>>> - FUNC_START divsi3
>>> + FUNC_START divsi3
>>> FUNC_ALIAS aeabi_idiv divsi3
>>> +#if defined(__OPTIMIZE_SIZE__)
>>>
>>> cmp divisor, #0
>>> beq LSYM(Ldiv0)
>>> @@ -1133,7 +1293,7 @@ LSYM(Lover11):
>>> blo LSYM(Lgot_result)
>>>
>>> THUMB_DIV_MOD_BODY 0
>>> -
>>> +
>>> mov r0, result
>>> mov work, ip
>>> cmp work, #0
>>> @@ -1142,6 +1302,21 @@ LSYM(Lover11):
>>> LSYM(Lover12):
>>> pop { work }
>>> RET
>>> +#else
>>> + /* Implementation of aeabi_idiv for ARMv6m. This version is only
>>> + used in ARMv6-M when we need an efficient implementation. */
>>> +LSYM(divsi3_skip_div0_test):
>>> + cpy curbit, dividend
>>> + orr curbit, divisor
>>> + bmi LSYM(Lthumb1_div_negative)
>>> +
>>> +LSYM(Lthumb1_div_positive):
>>> + THUMB1_Div_Positive
>>> +
>>> +LSYM(Lthumb1_div_negative):
>>> + THUMB1_Div_Negative
>>> +
>>> +#endif /* __OPTIMIZE_SIZE__ */
>>>
>>> #elif defined(__ARM_ARCH_EXT_IDIV__)
>>>
>>> @@ -1154,8 +1329,8 @@ LSYM(Lover12):
>>> RET
>>>
>>> #else /* ARM/Thumb-2 version. */
>>> -
>>> - ARM_FUNC_START divsi3
>>> +
>>> + ARM_FUNC_START divsi3
>>> ARM_FUNC_ALIAS aeabi_idiv divsi3
>>>
>>> cmp r1, #0
>>> @@ -1209,12 +1384,21 @@ LSYM(divsi3_skip_div0_test):
>>> FUNC_START aeabi_idivmod
>>> cmp r1, #0
>>> beq LSYM(Ldiv0)
>>> +# if defined(__OPTIMIZE_SIZE__)
>>> push {r0, r1, lr}
>>> bl LSYM(divsi3_skip_div0_test)
>>> POP {r1, r2, r3}
>>> mul r2, r0
>>> sub r1, r1, r2
>>> bx r3
>>> +# else
>>> + /* Both the quotient and remainder are calculated simultaneously
>>> + in THUMB1_Div_Positive and THUMB1_Div_Negative. There is no
>>> + need to calculate the remainder again here. */
>>> + b LSYM(divsi3_skip_div0_test)
>>> + RET
>>> +# endif /* __OPTIMIZE_SIZE__ */
>>> +
>>> #elif defined(__ARM_ARCH_EXT_IDIV__)
>>> ARM_FUNC_START aeabi_idivmod
>>> cmp r1, #0
>>> -- 1.9.1
>>>
>>
>> Otherwise OK if no regressions and the following request passes.
>>
>> Can you ensure that libgcc for one ARM state and one Thumb2 state non-v6m configuration should give identical binaries with and without your patch, no ?
>>
>> regards
>> Ramana
>>
> Hi Ramana,
>
> Thank you for the comments. Sorry about the license, must have been a
> mixup somewhere.
>
> I put back the 'pop pc is safe' assembly comment and I moved some
> comments before the #if and #else as requested. I left some in place
> because they did not apply to the whole block but simply to the first
> assembly instruction after the #if/else.
>
> I checked that the assembly generated for libgcc was the same with and
> without the patch for armv7-a in arm mode and armv7-m in thumb mode.
>
> Is this OK?
>
> Cheers,
> Andre
>
> libgcc/ChangeLog:
> 2016-07-06 Hale Wang <hale.wang@arm.com>
> Andre Vieira <andre.simoesdiasvieira@arm.com>
>
> * config/arm/lib1funcs.S: Add new wrapper.
>
I had to rebase the patch due to the ARMv8-M patches. This implied
changing a context line that changed due to the code for ARMv6-M being
reused for ARMv8-M Baseline.
I ran regression tests for both ARMv6-M and ARMv8-M Baseline and
compared the generated libgcc for ARMv7-A in ARM mode and ARMv7-M in
Thumb mode, observing no changes.
Applying patch, as it was previously OK'ed.
Cheers,
Andre
>From ad757144fad2d9608ed840153071bb5d470193ef Mon Sep 17 00:00:00 2001
From: Andre Simoes Dias Vieira <andsim01@arm.com>
Date: Thu, 7 Jul 2016 10:49:19 +0100
Subject: [PATCH] integer division
---
libgcc/config/arm/lib1funcs.S | 250 ++++++++++++++++++++++++++++++++++++------
1 file changed, 218 insertions(+), 32 deletions(-)
diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index 96e206ee542126c5d68091087446afe9f01aa51f..ba52e7b762f5573445349a574a3878859a992f13 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -311,34 +311,13 @@ LSYM(Lend_fde):
#ifdef __ARM_EABI__
.macro THUMB_LDIV0 name signed
#ifdef NOT_ISA_TARGET_32BIT
- .ifc \signed, unsigned
- cmp r0, #0
- beq 1f
- mov r0, #0
- mvn r0, r0 @ 0xffffffff
-1:
- .else
- cmp r0, #0
- beq 2f
- blt 3f
+
+ push {r0, lr}
mov r0, #0
- mvn r0, r0
- lsr r0, r0, #1 @ 0x7fffffff
- b 2f
-3: mov r0, #0x80
- lsl r0, r0, #24 @ 0x80000000
-2:
- .endif
- push {r0, r1, r2}
- ldr r0, 4f
- adr r1, 4f
- add r0, r1
- str r0, [sp, #8]
+ bl SYM(__aeabi_idiv0)
@ We know we are not on armv4t, so pop pc is safe.
- pop {r0, r1, pc}
- .align 2
-4:
- .word __aeabi_idiv0 - 4b
+ pop {r1, pc}
+
#elif defined(__thumb2__)
.syntax unified
.ifc \signed, unsigned
@@ -950,7 +929,170 @@ LSYM(Lover7):
add dividend, work
.endif
LSYM(Lgot_result):
-.endm
+.endm
+
+/* If performance is preferred, the following functions are provided. */
+#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
+
+/* Branch to div(n), and jump to label if curbit is lower than divisor. */
+.macro BranchToDiv n, label
+ lsr curbit, dividend, \n
+ cmp curbit, divisor
+ blo \label
+.endm
+
+/* Body of div(n). Shift the divisor in n bits and compare the divisor
+ and dividend. Update the dividend as the subtraction result. */
+.macro DoDiv n
+ lsr curbit, dividend, \n
+ cmp curbit, divisor
+ bcc 1f
+ lsl curbit, divisor, \n
+ sub dividend, dividend, curbit
+
+1: adc result, result
+.endm
+
+/* The body of division with positive divisor. Unless the divisor is very
+ big, shift it up in multiples of four bits, since this is the amount of
+ unwinding in the main division loop. Continue shifting until the divisor
+ is larger than the dividend. */
+.macro THUMB1_Div_Positive
+ mov result, #0
+ BranchToDiv #1, LSYM(Lthumb1_div1)
+ BranchToDiv #4, LSYM(Lthumb1_div4)
+ BranchToDiv #8, LSYM(Lthumb1_div8)
+ BranchToDiv #12, LSYM(Lthumb1_div12)
+ BranchToDiv #16, LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_large_positive):
+ mov result, #0xff
+ lsl divisor, divisor, #8
+ rev result, result
+ lsr curbit, dividend, #16
+ cmp curbit, divisor
+ blo 1f
+ asr result, #8
+ lsl divisor, divisor, #8
+ beq LSYM(Ldivbyzero_waypoint)
+
+1: lsr curbit, dividend, #12
+ cmp curbit, divisor
+ blo LSYM(Lthumb1_div12)
+ b LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_loop):
+ lsr divisor, divisor, #8
+LSYM(Lthumb1_div16):
+ Dodiv #15
+ Dodiv #14
+ Dodiv #13
+ Dodiv #12
+LSYM(Lthumb1_div12):
+ Dodiv #11
+ Dodiv #10
+ Dodiv #9
+ Dodiv #8
+ bcs LSYM(Lthumb1_div_loop)
+LSYM(Lthumb1_div8):
+ Dodiv #7
+ Dodiv #6
+ Dodiv #5
+LSYM(Lthumb1_div5):
+ Dodiv #4
+LSYM(Lthumb1_div4):
+ Dodiv #3
+LSYM(Lthumb1_div3):
+ Dodiv #2
+LSYM(Lthumb1_div2):
+ Dodiv #1
+LSYM(Lthumb1_div1):
+ sub divisor, dividend, divisor
+ bcs 1f
+ cpy divisor, dividend
+
+1: adc result, result
+ cpy dividend, result
+ RET
+
+LSYM(Ldivbyzero_waypoint):
+ b LSYM(Ldiv0)
+.endm
+
+/* The body of division with negative divisor. Similar to
+ THUMB1_Div_Positive except that the shift steps are in multiples
+ of six bits. */
+.macro THUMB1_Div_Negative
+ lsr result, divisor, #31
+ beq 1f
+ neg divisor, divisor
+
+1: asr curbit, dividend, #32
+ bcc 2f
+ neg dividend, dividend
+
+2: eor curbit, result
+ mov result, #0
+ cpy ip, curbit
+ BranchToDiv #4, LSYM(Lthumb1_div_negative4)
+ BranchToDiv #8, LSYM(Lthumb1_div_negative8)
+LSYM(Lthumb1_div_large):
+ mov result, #0xfc
+ lsl divisor, divisor, #6
+ rev result, result
+ lsr curbit, dividend, #8
+ cmp curbit, divisor
+ blo LSYM(Lthumb1_div_negative8)
+
+ lsl divisor, divisor, #6
+ asr result, result, #6
+ cmp curbit, divisor
+ blo LSYM(Lthumb1_div_negative8)
+
+ lsl divisor, divisor, #6
+ asr result, result, #6
+ cmp curbit, divisor
+ blo LSYM(Lthumb1_div_negative8)
+
+ lsl divisor, divisor, #6
+ beq LSYM(Ldivbyzero_negative)
+ asr result, result, #6
+ b LSYM(Lthumb1_div_negative8)
+LSYM(Lthumb1_div_negative_loop):
+ lsr divisor, divisor, #6
+LSYM(Lthumb1_div_negative8):
+ DoDiv #7
+ DoDiv #6
+ DoDiv #5
+ DoDiv #4
+LSYM(Lthumb1_div_negative4):
+ DoDiv #3
+ DoDiv #2
+ bcs LSYM(Lthumb1_div_negative_loop)
+ DoDiv #1
+ sub divisor, dividend, divisor
+ bcs 1f
+ cpy divisor, dividend
+
+1: cpy curbit, ip
+ adc result, result
+ asr curbit, curbit, #1
+ cpy dividend, result
+ bcc 2f
+ neg dividend, dividend
+ cmp curbit, #0
+
+2: bpl 3f
+ neg divisor, divisor
+
+3: RET
+
+LSYM(Ldivbyzero_negative):
+ cpy curbit, ip
+ asr curbit, curbit, #1
+ bcc LSYM(Ldiv0)
+ neg dividend, dividend
+.endm
+#endif /* ARM Thumb version. */
+
/* ------------------------------------------------------------------------ */
/* Start of the Real Functions */
/* ------------------------------------------------------------------------ */
@@ -960,6 +1102,7 @@ LSYM(Lgot_result):
FUNC_START udivsi3
FUNC_ALIAS aeabi_uidiv udivsi3
+#if defined(__OPTIMIZE_SIZE__)
cmp divisor, #0
beq LSYM(Ldiv0)
@@ -977,6 +1120,14 @@ LSYM(udivsi3_skip_div0_test):
pop { work }
RET
+/* Implementation of aeabi_uidiv for ARMv6m. This version is only
+ used in ARMv6-M when we need an efficient implementation. */
+#else
+LSYM(udivsi3_skip_div0_test):
+ THUMB1_Div_Positive
+
+#endif /* __OPTIMIZE_SIZE__ */
+
#elif defined(__ARM_ARCH_EXT_IDIV__)
ARM_FUNC_START udivsi3
@@ -1028,12 +1179,21 @@ LSYM(udivsi3_skip_div0_test):
FUNC_START aeabi_uidivmod
cmp r1, #0
beq LSYM(Ldiv0)
+# if defined(__OPTIMIZE_SIZE__)
push {r0, r1, lr}
bl LSYM(udivsi3_skip_div0_test)
POP {r1, r2, r3}
mul r2, r0
sub r1, r1, r2
bx r3
+# else
+ /* Both the quotient and remainder are calculated simultaneously
+ in THUMB1_Div_Positive. There is no need to calculate the
+ remainder again here. */
+ b LSYM(udivsi3_skip_div0_test)
+ RET
+# endif /* __OPTIMIZE_SIZE__ */
+
#elif defined(__ARM_ARCH_EXT_IDIV__)
ARM_FUNC_START aeabi_uidivmod
cmp r1, #0
@@ -1089,7 +1249,7 @@ LSYM(Lover10):
RET
#else /* ARM version. */
-
+
FUNC_START umodsi3
subs r2, r1, #1 @ compare divisor with 1
@@ -1114,8 +1274,9 @@ LSYM(Lover10):
#if defined(__prefer_thumb__)
- FUNC_START divsi3
+ FUNC_START divsi3
FUNC_ALIAS aeabi_idiv divsi3
+#if defined(__OPTIMIZE_SIZE__)
cmp divisor, #0
beq LSYM(Ldiv0)
@@ -1138,7 +1299,7 @@ LSYM(Lover11):
blo LSYM(Lgot_result)
THUMB_DIV_MOD_BODY 0
-
+
mov r0, result
mov work, ip
cmp work, #0
@@ -1148,6 +1309,22 @@ LSYM(Lover12):
pop { work }
RET
+/* Implementation of aeabi_idiv for ARMv6m. This version is only
+ used in ARMv6-M when we need an efficient implementation. */
+#else
+LSYM(divsi3_skip_div0_test):
+ cpy curbit, dividend
+ orr curbit, divisor
+ bmi LSYM(Lthumb1_div_negative)
+
+LSYM(Lthumb1_div_positive):
+ THUMB1_Div_Positive
+
+LSYM(Lthumb1_div_negative):
+ THUMB1_Div_Negative
+
+#endif /* __OPTIMIZE_SIZE__ */
+
#elif defined(__ARM_ARCH_EXT_IDIV__)
ARM_FUNC_START divsi3
@@ -1159,8 +1336,8 @@ LSYM(Lover12):
RET
#else /* ARM/Thumb-2 version. */
-
- ARM_FUNC_START divsi3
+
+ ARM_FUNC_START divsi3
ARM_FUNC_ALIAS aeabi_idiv divsi3
cmp r1, #0
@@ -1214,12 +1391,21 @@ LSYM(divsi3_skip_div0_test):
FUNC_START aeabi_idivmod
cmp r1, #0
beq LSYM(Ldiv0)
+# if defined(__OPTIMIZE_SIZE__)
push {r0, r1, lr}
bl LSYM(divsi3_skip_div0_test)
POP {r1, r2, r3}
mul r2, r0
sub r1, r1, r2
bx r3
+# else
+ /* Both the quotient and remainder are calculated simultaneously
+ in THUMB1_Div_Positive and THUMB1_Div_Negative. There is no
+ need to calculate the remainder again here. */
+ b LSYM(divsi3_skip_div0_test)
+ RET
+# endif /* __OPTIMIZE_SIZE__ */
+
#elif defined(__ARM_ARCH_EXT_IDIV__)
ARM_FUNC_START aeabi_idivmod
cmp r1, #0
--
1.9.1