This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCHv2, ARM, libgcc] New aeabi_idiv function for armv6-m
- From: Ramana Radhakrishnan <ramana dot radhakrishnan at foss dot arm dot com>
- To: Andre Vieira <Andre dot SimoesDiasVieira at arm dot com>
- Cc: "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>
- Date: Fri, 1 Jul 2016 14:40:51 +0100
- Subject: Re: [PATCHv2, ARM, libgcc] New aeabi_idiv function for armv6-m
- Authentication-results: sourceware.org; auth=none
- References: <561D38F7 dot 809 at arm dot com>
On 13/10/15 18:01, Andre Vieira wrote:
> This patch ports the aeabi_idiv routine from Linaro Cortex-Strings (https://git.linaro.org/toolchain/cortex-strings.git), which was contributed by ARM under Free BSD license.
>
> The new aeabi_idiv routine is used to replace the one in libgcc/config/arm/lib1funcs.S. This replacement happens within the Thumb1 wrapper. The new routine is under LGPLv3 license.
This is not under LGPLv3 . It is under GPLv3 with the runtime library exception license, there's a difference. Assuming your licensing expectation is ok .... read on for more of a review.
>
> The main advantage of this version is that it can improve the performance of the aeabi_idiv function for Thumb1. This solution will also increase the code size. So it will only be used if __OPTIMIZE_SIZE__ is not defined.
>
> Make check passed for armv6-m.
>
> libgcc/ChangeLog:
> 2015-08-10 Hale Wang <hale.wang@arm.com>
> Andre Vieira <andre.simoesdiasvieira@arm.com>
>
> * config/arm/lib1funcs.S: Add new wrapper.
>
> 0001-integer-division.patch
>
>
> From 832a3d6af6f06399f70b5a4ac3727d55960c93b7 Mon Sep 17 00:00:00 2001
> From: Andre Simoes Dias Vieira <andsim01@arm.com>
> Date: Fri, 21 Aug 2015 14:23:28 +0100
> Subject: [PATCH] new wrapper idivmod
>
> ---
> libgcc/config/arm/lib1funcs.S | 250 ++++++++++++++++++++++++++++++++++++------
> 1 file changed, 217 insertions(+), 33 deletions(-)
>
> diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
> index 252efcbd5385cc58a5ce1e48c6816d36a6f4c797..c9e544114590da8cde88382bea0f67206e593816 100644
> --- a/libgcc/config/arm/lib1funcs.S
> +++ b/libgcc/config/arm/lib1funcs.S
> @@ -306,34 +306,12 @@ LSYM(Lend_fde):
> #ifdef __ARM_EABI__
> .macro THUMB_LDIV0 name signed
> #if defined(__ARM_ARCH_6M__)
> - .ifc \signed, unsigned
> - cmp r0, #0
> - beq 1f
> - mov r0, #0
> - mvn r0, r0 @ 0xffffffff
> -1:
> - .else
> - cmp r0, #0
> - beq 2f
> - blt 3f
> +
> + push {r0, lr}
> mov r0, #0
> - mvn r0, r0
> - lsr r0, r0, #1 @ 0x7fffffff
> - b 2f
> -3: mov r0, #0x80
> - lsl r0, r0, #24 @ 0x80000000
> -2:
> - .endif
> - push {r0, r1, r2}
> - ldr r0, 4f
> - adr r1, 4f
> - add r0, r1
> - str r0, [sp, #8]
> - @ We know we are not on armv4t, so pop pc is safe.
> - pop {r0, r1, pc}
> - .align 2
> -4:
> - .word __aeabi_idiv0 - 4b
> + bl SYM(__aeabi_idiv0)
> + pop {r1, pc}
> +
I'd still retain the comment about pop pc here because there's often a misconception of merging armv4t and armv6m code.
> #elif defined(__thumb2__)
> .syntax unified
> .ifc \signed, unsigned
> @@ -945,7 +923,170 @@ LSYM(Lover7):
> add dividend, work
> .endif
> LSYM(Lgot_result):
> -.endm
> +.endm
> +
> +#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
> +/* If performance is preferred, the following functions are provided. */
> +
Comment above #if please and also check elsewhere in patch.
> +/* Branch to div(n), and jump to label if curbit is lower than the divisor. */
> +.macro BranchToDiv n, label
> + lsr curbit, dividend, \n
> + cmp curbit, divisor
> + blo \label
> +.endm
> +
> +/* Body of div(n). Shift the divisor by n bits and compare the divisor
> + and dividend. Update the dividend with the subtraction result. */
> +.macro DoDiv n
> + lsr curbit, dividend, \n
> + cmp curbit, divisor
> + bcc 1f
> + lsl curbit, divisor, \n
> + sub dividend, dividend, curbit
> +
> +1: adc result, result
> +.endm
> +
> +/* The body of division with positive divisor. Unless the divisor is very
> + big, shift it up in multiples of four bits, since this is the amount of
> + unwinding in the main division loop. Continue shifting until the divisor
> + is larger than the dividend. */
> +.macro THUMB1_Div_Positive
> + mov result, #0
> + BranchToDiv #1, LSYM(Lthumb1_div1)
> + BranchToDiv #4, LSYM(Lthumb1_div4)
> + BranchToDiv #8, LSYM(Lthumb1_div8)
> + BranchToDiv #12, LSYM(Lthumb1_div12)
> + BranchToDiv #16, LSYM(Lthumb1_div16)
> +LSYM(Lthumb1_div_large_positive):
> + mov result, #0xff
> + lsl divisor, divisor, #8
> + rev result, result
> + lsr curbit, dividend, #16
> + cmp curbit, divisor
> + blo 1f
> + asr result, #8
> + lsl divisor, divisor, #8
> + beq LSYM(Ldivbyzero_waypoint)
> +
> +1: lsr curbit, dividend, #12
> + cmp curbit, divisor
> + blo LSYM(Lthumb1_div12)
> + b LSYM(Lthumb1_div16)
> +LSYM(Lthumb1_div_loop):
> + lsr divisor, divisor, #8
> +LSYM(Lthumb1_div16):
> + Dodiv #15
> + Dodiv #14
> + Dodiv #13
> + Dodiv #12
> +LSYM(Lthumb1_div12):
> + Dodiv #11
> + Dodiv #10
> + Dodiv #9
> + Dodiv #8
> + bcs LSYM(Lthumb1_div_loop)
> +LSYM(Lthumb1_div8):
> + Dodiv #7
> + Dodiv #6
> + Dodiv #5
> +LSYM(Lthumb1_div5):
> + Dodiv #4
> +LSYM(Lthumb1_div4):
> + Dodiv #3
> +LSYM(Lthumb1_div3):
> + Dodiv #2
> +LSYM(Lthumb1_div2):
> + Dodiv #1
> +LSYM(Lthumb1_div1):
> + sub divisor, dividend, divisor
> + bcs 1f
> + cpy divisor, dividend
> +
> +1: adc result, result
> + cpy dividend, result
> + RET
> +
> +LSYM(Ldivbyzero_waypoint):
> + b LSYM(Ldiv0)
> +.endm
> +
> +/* The body of division with negative divisor. Similar to
> + THUMB1_Div_Positive except that the shift steps are in multiples
> + of six bits. */
> +.macro THUMB1_Div_Negative
> + lsr result, divisor, #31
> + beq 1f
> + neg divisor, divisor
> +
> +1: asr curbit, dividend, #32
> + bcc 2f
> + neg dividend, dividend
> +
> +2: eor curbit, result
> + mov result, #0
> + cpy ip, curbit
> + BranchToDiv #4, LSYM(Lthumb1_div_negative4)
> + BranchToDiv #8, LSYM(Lthumb1_div_negative8)
> +LSYM(Lthumb1_div_large):
> + mov result, #0xfc
> + lsl divisor, divisor, #6
> + rev result, result
> + lsr curbit, dividend, #8
> + cmp curbit, divisor
> + blo LSYM(Lthumb1_div_negative8)
> +
> + lsl divisor, divisor, #6
> + asr result, result, #6
> + cmp curbit, divisor
> + blo LSYM(Lthumb1_div_negative8)
> +
> + lsl divisor, divisor, #6
> + asr result, result, #6
> + cmp curbit, divisor
> + blo LSYM(Lthumb1_div_negative8)
> +
> + lsl divisor, divisor, #6
> + beq LSYM(Ldivbyzero_negative)
> + asr result, result, #6
> + b LSYM(Lthumb1_div_negative8)
> +LSYM(Lthumb1_div_negative_loop):
> + lsr divisor, divisor, #6
> +LSYM(Lthumb1_div_negative8):
> + DoDiv #7
> + DoDiv #6
> + DoDiv #5
> + DoDiv #4
> +LSYM(Lthumb1_div_negative4):
> + DoDiv #3
> + DoDiv #2
> + bcs LSYM(Lthumb1_div_negative_loop)
> + DoDiv #1
> + sub divisor, dividend, divisor
> + bcs 1f
> + cpy divisor, dividend
> +
> +1: cpy curbit, ip
> + adc result, result
> + asr curbit, curbit, #1
> + cpy dividend, result
> + bcc 2f
> + neg dividend, dividend
> + cmp curbit, #0
> +
> +2: bpl 3f
> + neg divisor, divisor
> +
> +3: RET
> +
> +LSYM(Ldivbyzero_negative):
> + cpy curbit, ip
> + asr curbit, curbit, #1
> + bcc LSYM(Ldiv0)
> + neg dividend, dividend
> +.endm
> +#endif /* ARM Thumb version. */
> +
> /* ------------------------------------------------------------------------ */
> /* Start of the Real Functions */
> /* ------------------------------------------------------------------------ */
> @@ -955,6 +1096,7 @@ LSYM(Lgot_result):
>
> FUNC_START udivsi3
> FUNC_ALIAS aeabi_uidiv udivsi3
> +#if defined(__OPTIMIZE_SIZE__)
>
> cmp divisor, #0
> beq LSYM(Ldiv0)
> @@ -972,6 +1114,14 @@ LSYM(udivsi3_skip_div0_test):
> pop { work }
> RET
>
> +#else
> + /* Implementation of aeabi_uidiv for ARMv6m. This version is only
> + used in ARMv6-M when we need an efficient implementation. */
> +LSYM(udivsi3_skip_div0_test):
> + THUMB1_Div_Positive
> +
> +#endif /* __OPTIMIZE_SIZE__ */
> +
> #elif defined(__ARM_ARCH_EXT_IDIV__)
>
> ARM_FUNC_START udivsi3
> @@ -1023,12 +1173,21 @@ LSYM(udivsi3_skip_div0_test):
> FUNC_START aeabi_uidivmod
> cmp r1, #0
> beq LSYM(Ldiv0)
> +# if defined(__OPTIMIZE_SIZE__)
> push {r0, r1, lr}
> bl LSYM(udivsi3_skip_div0_test)
> POP {r1, r2, r3}
> mul r2, r0
> sub r1, r1, r2
> bx r3
> +# else
> + /* Both the quotient and remainder are calculated simultaneously
> + in THUMB1_Div_Positive. There is no need to calculate the
> + remainder again here. */
> + b LSYM(udivsi3_skip_div0_test)
> + RET
> +# endif /* __OPTIMIZE_SIZE__ */
> +
> #elif defined(__ARM_ARCH_EXT_IDIV__)
> ARM_FUNC_START aeabi_uidivmod
> cmp r1, #0
> @@ -1084,7 +1243,7 @@ LSYM(Lover10):
> RET
>
> #else /* ARM version. */
> -
> +
> FUNC_START umodsi3
>
> subs r2, r1, #1 @ compare divisor with 1
> @@ -1109,8 +1268,9 @@ LSYM(Lover10):
>
> #if defined(__prefer_thumb__)
>
> - FUNC_START divsi3
> + FUNC_START divsi3
> FUNC_ALIAS aeabi_idiv divsi3
> +#if defined(__OPTIMIZE_SIZE__)
>
> cmp divisor, #0
> beq LSYM(Ldiv0)
> @@ -1133,7 +1293,7 @@ LSYM(Lover11):
> blo LSYM(Lgot_result)
>
> THUMB_DIV_MOD_BODY 0
> -
> +
> mov r0, result
> mov work, ip
> cmp work, #0
> @@ -1142,6 +1302,21 @@ LSYM(Lover11):
> LSYM(Lover12):
> pop { work }
> RET
> +#else
> + /* Implementation of aeabi_idiv for ARMv6m. This version is only
> + used in ARMv6-M when we need an efficient implementation. */
> +LSYM(divsi3_skip_div0_test):
> + cpy curbit, dividend
> + orr curbit, divisor
> + bmi LSYM(Lthumb1_div_negative)
> +
> +LSYM(Lthumb1_div_positive):
> + THUMB1_Div_Positive
> +
> +LSYM(Lthumb1_div_negative):
> + THUMB1_Div_Negative
> +
> +#endif /* __OPTIMIZE_SIZE__ */
>
> #elif defined(__ARM_ARCH_EXT_IDIV__)
>
> @@ -1154,8 +1329,8 @@ LSYM(Lover12):
> RET
>
> #else /* ARM/Thumb-2 version. */
> -
> - ARM_FUNC_START divsi3
> +
> + ARM_FUNC_START divsi3
> ARM_FUNC_ALIAS aeabi_idiv divsi3
>
> cmp r1, #0
> @@ -1209,12 +1384,21 @@ LSYM(divsi3_skip_div0_test):
> FUNC_START aeabi_idivmod
> cmp r1, #0
> beq LSYM(Ldiv0)
> +# if defined(__OPTIMIZE_SIZE__)
> push {r0, r1, lr}
> bl LSYM(divsi3_skip_div0_test)
> POP {r1, r2, r3}
> mul r2, r0
> sub r1, r1, r2
> bx r3
> +# else
> + /* Both the quotient and remainder are calculated simultaneously
> + in THUMB1_Div_Positive and THUMB1_Div_Negative. There is no
> + need to calculate the remainder again here. */
> + b LSYM(divsi3_skip_div0_test)
> + RET
> +# endif /* __OPTIMIZE_SIZE__ */
> +
> #elif defined(__ARM_ARCH_EXT_IDIV__)
> ARM_FUNC_START aeabi_idivmod
> cmp r1, #0
> -- 1.9.1
>
Otherwise OK if no regressions and the following request passes.
Can you also verify that, for one ARM-state and one Thumb-2-state (non-v6m) configuration, libgcc builds to identical binaries with and without your patch?
regards
Ramana