/* longlong.S arithmetics for 64bit integers for ARM.

   Copyright (C) 2005 Free Software Foundation, Inc.
   Contributed by Fredrik Hederstierna, Purple Scout AB.

This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option) any
later version.

In addition to the permissions in the GNU General Public License, the
Free Software Foundation gives you unlimited permission to link the
compiled version of this file into combinations with other programs,
and to distribute those combinations without any restriction coming
from the use of this file. (The General Public License restrictions
do apply in other respects; for example, they cover modification of
the file, and distribution when not linked into a combine
executable.)

This file is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; see the file COPYING. If not, write to
the Free Software Foundation, 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */

/* This file is only compilable to a processor with long multiply
   instructions. The ARM_ARCH define equals 4 even for ARMv3M which also
   has long multiply.*/
#if (__ARM_ARCH__ > 3)

/* Special function that will always be coded in ARM assembly, even if in
   Thumb-only compilation.

   ARM_DIV_FUNC_END name
     Emits the common tail of a division/modulo routine.  It defines the
     LSYM(Ldiv0) label (the target the entry points in this file branch to
     when the divisor is zero), expands ARM_LDIV0 at that label, and then
     ends the function with FUNC_END.  The whole tail is bracketed by
     cfi_start / cfi_end.

   IN: name = function name; "__" is prepended for cfi_start.

   NOTE(review): cfi_start, cfi_end, LSYM, ARM_LDIV0 and FUNC_END are not
   defined in this file; presumably they are provided by the file that
   includes this one -- confirm.  */
.macro ARM_DIV_FUNC_END name
	cfi_start __\name, LSYM(Lend_div0)
LSYM(Ldiv0):
	ARM_LDIV0 \name
	cfi_end LSYM(Lend_div0)
	FUNC_END \name
.endm

/* -------------------------------------------------- */
/* HELPER MACROS FOR 64BIT OPERATIONS */
/* -------------------------------------------------- */

/* This define disables the table used for counting leading zeros.
It takes 256 bytes of memory (the table defined in this file has 256
   one-byte entries; an earlier version of this comment said 1k) and could
   speed up the calculation of clz in a system with a fast memory bus.
   By default it is disabled to save memory and because it can be slow if
   the memory bus is slow.
   Because this define is unconditional, ARM_USE_CLZ_TABLE is never
   defined unless this line is removed.  */
#define ARM_DISABLE_CLZ_TABLE

/* -------------------------------------------------- */
/* Macro to negate a 64bit number (make 2-complement).
 * IN/OUT: lo = low word of 64 bit number
 * IN/OUT: hi = high word of 64 bit number
 * Flags are modified (rsbs).  */
.macro ARM_NEG_64BIT lo, hi
	rsbs \lo, \lo, #0	/* lo = 0 - lo, carry records the borrow */
	rsc \hi, \hi, #0	/* hi = 0 - hi - borrow */
.endm /* ARM_NEG_64BIT */

/* -------------------------------------------------- */
/* Macro to 64bit logic shift left
 * IN/OUT: lo = low word of 64 bit value to shift
 * IN/OUT: hi = high word of 64 bit value to shift
 * IN: steps = steps to logic shift left
 * SCRATCH: tmp = scratch register
 *
 * On exit tmp holds (32 - steps) when steps < 32 and (steps - 32)
 * otherwise; ARM_NORMALIZE_64BIT relies on the first case.
 * Flags are modified (subs).  The order matters: hi is built from the
 * old lo before lo itself is shifted.  */
.macro ARM_LSHL_64BIT lo, hi, steps, tmp
	subs \tmp, \steps, #32		/* tmp = steps - 32, MI if steps < 32 */
	rsbmi \tmp, \steps, #32		/* steps < 32: tmp = 32 - steps */
	movmi \hi, \hi, lsl \steps	/* steps < 32: hi <<= steps */
	movpl \hi, \lo, lsl \tmp	/* steps >= 32: hi = lo << (steps - 32) */
	orrmi \hi, \hi, \lo, lsr \tmp	/* steps < 32: hi |= lo >> (32 - steps) */
	mov \lo, \lo, lsl \steps	/* lo <<= steps */
.endm /* ARM_LSHL_64BIT */

/* -------------------------------------------------- */
/* Macro to 64bit arithmetic shift right
 * IN/OUT: lo = low word of 64 bit value to shift
 * IN/OUT: hi = high word of 64 bit value to shift
 * IN: steps = steps to arithmetic shift right
 * SCRATCH: tmp = scratch register
 *
 * Mirror image of ARM_LSHL_64BIT: lo is built from the old hi before hi
 * itself is shifted.  Flags are modified (subs).  */
.macro ARM_ASHR_64BIT lo, hi, steps, tmp
	subs \tmp, \steps, #32		/* tmp = steps - 32, MI if steps < 32 */
	rsbmi \tmp, \steps, #32		/* steps < 32: tmp = 32 - steps */
	movmi \lo, \lo, lsr \steps	/* steps < 32: lo >>= steps (logical) */
	movpl \lo, \hi, asr \tmp	/* steps >= 32: lo = hi >> (steps - 32) */
	orrmi \lo, \lo, \hi, lsl \tmp	/* steps < 32: lo |= hi << (32 - steps) */
	mov \hi, \hi, asr \steps	/* hi >>= steps, sign-filling */
.endm /* ARM_ASHR_64BIT */

/* -------------------------------------------------- */
/* Macro to 64bit logic shift right
 * IN/OUT: lo = low word of 64 bit value to shift
 * IN/OUT: hi = high word of 64 bit value to shift
 * IN: steps = steps to logic shift right
 * SCRATCH: tmp = scratch register
 *
 * Same structure as ARM_ASHR_64BIT but zero-filling.
 * Flags are modified (subs).  */
.macro ARM_LSHR_64BIT lo, hi, steps, tmp
	subs \tmp, \steps, #32		/* tmp = steps - 32, MI if steps < 32 */
	rsbmi \tmp, \steps, #32		/* steps < 32: tmp = 32 - steps */
	movmi \lo, \lo, lsr \steps	/* steps < 32: lo >>= steps */
	movpl \lo, \hi, lsr \tmp	/* steps >= 32: lo = hi >> (steps - 32) */
	orrmi \lo, \lo, \hi, lsl \tmp	/* steps < 32: lo |= hi << (32 - steps) */
	mov \hi, \hi, lsr \steps	/* hi >>= steps, zero-filling */
.endm /* ARM_LSHR_64BIT */

/* 
-------------------------------------------------- */
/* Macro to 64bit logic shift right 1 step
 * IN/OUT: lo = low word of 64 bit number
 * IN/OUT: hi = high word of 64 bit number
 * Flags are modified (movs).  */
.macro ARM_LSHR_64BIT_1STEP lo, hi
	movs \hi, \hi, lsr #1	/* old bit 0 of hi goes to carry */
	mov \lo, \lo, rrx	/* carry is rotated into bit 31 of lo */
.endm /* ARM_LSHR_64BIT_1STEP */

/* -------------------------------------------------- */
/* Macro to 64bit arithmetic shift right 1 step
 * IN/OUT: lo = low word of 64 bit number
 * IN/OUT: hi = high word of 64 bit number
 * Flags are modified (movs).  */
.macro ARM_ASHR_64BIT_1STEP lo, hi
	movs \hi, \hi, asr #1	/* sign-preserving; old bit 0 goes to carry */
	mov \lo, \lo, rrx	/* carry is rotated into bit 31 of lo */
.endm /* ARM_ASHR_64BIT_1STEP */

/* -------------------------------------------------- */
/* Macro to 64bit logic shift left 1 step (multiply by 2)
 * IN/OUT: lo = low word of 64 bit number
 * IN/OUT: hi = high word of 64 bit number
 * Flags are modified (adds).  */
.macro ARM_LSHL_64BIT_1STEP lo, hi
	adds \lo, \lo, \lo	/* lo *= 2, carry = old bit 31 of lo */
	adc \hi, \hi, \hi	/* hi = 2 * hi + carry */
.endm /* ARM_LSHL_64BIT_1STEP */

/* -------------------------------------------------- */
/* Decide whether the lookup table is used for the last 8 bits in the clz
   routine for ARMv4: only below ARMv5, when not optimizing for size, and
   when ARM_DISABLE_CLZ_TABLE is not defined.  */
#if (__ARM_ARCH__ < 5) && ! defined (__OPTIMIZE_SIZE__) && ! defined(ARM_DISABLE_CLZ_TABLE)
#define ARM_USE_CLZ_TABLE
#endif /* (__ARM_ARCH__ < 5) */

/* -------------------------------------------------- */
/* Don't use CLZ-table if it's required to save space.
   Entry i of the table is the number of significant bits in the byte
   value i (0 for 0, 1 for 1, 2 for 2-3, ... 8 for 128-255); the CLZ
   macros subtract it from their running count.  256 entries of one byte
   each.  */
#if defined(ARM_USE_CLZ_TABLE)
.globl __arm_clz_tab
__arm_clz_tab:
.byte 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
.byte 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
.byte 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
.byte 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
.byte 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
.byte 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
.byte 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
.byte 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
#endif /* ARM_USE_CLZ_TABLE */

/* -------------------------------------------------- */
/* Count leading zeros in 32bit number.
 * IN: lo = word of 32 bit number (not altered)
 * OUT: result = number of leading zeros (0-32)
 * SCRATCH: tmp = scratch register (only used for ARMv4 with the table)
 * SCRATCH: tmp2 = scratch register2 (only used for ARMv4).
 *
 * NOTE: the flags are not a usable output.  The ARMv5 path (clz) leaves
 * them untouched, while the ARMv4 paths leave whatever the last cmp/tst
 * produced.
 *
 * ARMv4 method: result starts at 32 and the bit length of lo is
 * subtracted from it in steps of 16, 8, then either by table lookup or by
 * steps of 4, 2 and a final 0/1/2.  */
.macro ARM_CLZ_32BIT lo, result, tmp, tmp2
#if (__ARM_ARCH__ >= 5)
	clz \result, \lo
#else
	/* tmp2 is the working copy of lo that gets narrowed down.*/
	mov \tmp2, \lo
	mov \result, #32
	cmp \tmp2, #0x10000
	movhs \tmp2, \tmp2, lsr #16
	subhs \result, \result, #16
	tst \tmp2, #0xff00
	movne \tmp2, \tmp2, lsr #8
	subne \result, \result, #8
	/* tmp2 is now in the range 0-255.  */
#if defined(ARM_USE_CLZ_TABLE)
	ldr \tmp, =__arm_clz_tab
	ldrb \tmp2, [\tmp, \tmp2]
	sub \result, \result, \tmp2
#else
	tst \tmp2, #0xf0
	movne \tmp2, \tmp2, lsr #4
	subne \result, \result, #4
	tst \tmp2, #0xc
	movne \tmp2, \tmp2, lsr #2
	subne \result, \result, #2
	/* tmp2 is now in the range 0-3.  */
	cmp \tmp2, #1
	subhi \result, \result, #2 /* i.e. if tmp2 is 2 or 3 */
	subeq \result, \result, #1 /* i.e. if tmp2 == #1 */
#endif /* ARM_USE_CLZ_TABLE */
#endif /* ARM_ARCH_v5 */
.endm /* ARM_CLZ */

/* -------------------------------------------------- */
/* Count leading zeros for 64bit number.
 * IN: lo = low word of 64 bit number (not altered)
 * IN: hi = high word of 64 bit number (not altered)
 * OUT: result = number of leading zeros (0-64)
 * SCRATCH: tmp = scratch register (only used for ARMv4 with the table)
 * SCRATCH: tmp2 = scratch register2 (only used for ARMv4).
 *
 * Flags are modified on both paths (cmp / movs).
 * ARMv4 method: pick the most significant non-zero word, start the count
 * at 32 (hi != 0) or 64 (hi == 0), then proceed as in ARM_CLZ_32BIT.  */
.macro ARM_CLZ_64BIT lo, hi, result, tmp, tmp2
#if (__ARM_ARCH__ >= 5)
	cmp \hi, #0
	clzne \result, \hi
	clzeq \result, \lo
	addeq \result, \result, #32
#else
	/* set tmp2 to the most significant word, not zero.*/
	movs \tmp2, \hi
	/* if (hi == 0), then tmp2 = lo */
	moveq \tmp2, \lo
	/* tmp2 is now the word of lo/hi that is not 0. Now init result bit cnt.
	   The mov instructions above and below do not change the flags set
	   by movs, so the eq/ne conditions still describe hi.  */
	/* if (hi != 0), then check high word */
	movne \result, #32
	/* if (hi == 0), then check low word */
	moveq \result, #64
	cmp \tmp2, #0x10000
	movhs \tmp2, \tmp2, lsr #16
	subhs \result, \result, #16
	tst \tmp2, #0xff00
	movne \tmp2, \tmp2, lsr #8
	subne \result, \result, #8
	/* tmp2 is now in the range 0-255.  */
#if defined(ARM_USE_CLZ_TABLE)
	ldr \tmp, =__arm_clz_tab
	ldrb \tmp2, [\tmp, \tmp2]
	sub \result, \result, \tmp2
#else
	tst \tmp2, #0xf0
	movne \tmp2, \tmp2, lsr #4
	subne \result, \result, #4
	tst \tmp2, #0xc
	movne \tmp2, \tmp2, lsr #2
	subne \result, \result, #2
	/* tmp2 is now in the range 0-3.  */
	cmp \tmp2, #1
	subhi \result, \result, #2 /* i.e. if tmp2 is 2 or 3 */
	subeq \result, \result, #1 /* i.e. if tmp2 == #1 */
#endif /* ARM_USE_CLZ_TABLE */
#endif /* ARM_ARCH_v5 */
.endm /* ARM_CLZ_64BIT */

/* -------------------------------------------------- */
/* Calculate log2-1 for 64bit number, used as shift value
 * when dividing with divisor that is a power of 2.
* IN: lo = low word of 64 bit number (not altered)
 * IN: hi = high word of 64 bit number (not altered)
 * OUT: result = log2-1 (0-63), i.e. the index of the set bit
 * SCRATCH: tmp = scratch register (only used for ARMv4)
 *
 * Flags are modified on both paths.
 * The ARMv5 path computes 63 - clz(hi:lo).
 * NOTE: the ARMv4 path is only exact when the input is a power of two
 * (the only way it is used in this file).  Its last step adds 3 for any
 * remaining nibble value above 4, which would be one too many for the
 * values 5-7.  */
.macro ARM_LOG2_64BIT lo, hi, result, tmp
#if (__ARM_ARCH__ >= 5)
	cmp \hi, #0
	clzne \result, \hi
	clzeq \result, \lo
	addeq \result, \result, #32
	rsb \result, \result, #63
#else
	/* set tmp to the most significant word that is not zero.*/
	movs \tmp, \hi
	moveq \tmp, \lo /* if hi == 0, then tmp = lo */
	/* tmp is now the word of lo/hi that is not 0, Now init result bit cnt. */
	/* if (hi != 0), then checking high word */
	movne \result, #32
	/* if (hi == 0), then checking low word */
	moveq \result, #0
	/* Binary search alike algorithm to find log2.*/
	cmp \tmp, #(1 << 16)
	movhs \tmp, \tmp, lsr #16
	addhs \result, \result, #16
	cmp \tmp, #(1 << 8)
	movhs \tmp, \tmp, lsr #8
	addhs \result, \result, #8
	cmp \tmp, #(1 << 4)
	movhs \tmp, \tmp, lsr #4
	addhs \result, \result, #4
	/* tmp is now in the range 0-15; for a power of two it is 1, 2, 4
	   or 8, which must add 0, 1, 2 or 3 respectively.  */
	cmp \tmp, #(1 << 2)
	addhi \result, \result, #3		/* tmp == 8 */
	addls \result, \result, \tmp, lsr #1	/* tmp == 1, 2 or 4 */
#endif
.endm /* ARM_LOG2_64BIT */

/* -------------------------------------------------- */
/* Count leading zeros and normalize 32bit number.
 * IN/OUT: lo = word with 32bit number to normalize.
 * OUT: result = number of leading zeros (0-32) shifted.
 *
 * After the macro the most significant bit of lo is set (for lo != 0).
 * Flags are modified on the ARMv4 path.
 * NOTE: for lo == 0 the two paths differ: the ARMv5 path yields 32, the
 * ARMv4 path yields 31.  The callers in this file only pass a non-zero
 * value.  */
.macro ARM_NORMALIZE_32BIT lo, result
#if (__ARM_ARCH__ >= 5)
	clz \result, \lo
	mov \lo, \lo, lsl \result
#else
	/* Each step: if the top k bits are clear, shift left by k and count
	   k.  The mov/add instructions do not touch the flags, so both
	   conditional instructions after each cmp see the same condition.  */
	cmp \lo, #(1 << 16)
	movlo \lo, \lo, lsl #16
	movhs \result, #0
	movlo \result, #16
	cmp \lo, #(1 << 24)
	addlo \result, \result, #8
	movlo \lo, \lo, lsl #8
	cmp \lo, #(1 << 28)
	addlo \result, \result, #4
	movlo \lo, \lo, lsl #4
	cmp \lo, #(1 << 30)
	addlo \result, \result, #2
	movlo \lo, \lo, lsl #2
	cmp \lo, #(1 << 31)
	addlo \result, \result, #1
	movlo \lo, \lo, lsl #1
#endif
.endm /* ARM_NORMALIZE_32BIT */

/* -------------------------------------------------- */
/* Count leading zeros and normalize 64bit number.
 *
 * Note: It's required that (hi != 0).
*
 * IN/OUT: lo = low word of the 64bit number to normalize.
 * IN/OUT: hi = high word of the 64bit number to normalize.
 * OUT: result = number of leading zeros shifted out; since hi != 0 is
 *               required this is clz(hi), i.e. below 32.
 * OUT: tmp = 32 - result on both paths (see note below).
 *
 * Note: Register tmp is set to "rsb tmp, result, #32", used
 * in the calling function.  On the ARMv5 path this value is produced as
 * a side effect of ARM_LSHL_64BIT, so tmp is written on both paths.
 * Flags are modified on both paths.  */
.macro ARM_NORMALIZE_64BIT lo, hi, result, tmp
#if (__ARM_ARCH__ >= 5)
	clz \result, \hi
	ARM_LSHL_64BIT \lo, \hi, \result, \tmp
	/* Since hi != 0, clz(hi|lo) < 32, the LSHL shift macro will also set
	   tmp to 32-result:
	     subs \tmp, \steps, #32 (result will be minus)
	     rsbmi \tmp, \steps, #32
	   => no need for rsb \tmp, \result, #32 */
#else
	/* Normalize hi exactly as ARM_NORMALIZE_32BIT does.  */
	cmp \hi, #(1 << 16)
	movlo \hi, \hi, lsl #16
	movhs \result, #0
	movlo \result, #16
	cmp \hi, #(1 << 24)
	addlo \result, \result, #8
	movlo \hi, \hi, lsl #8
	cmp \hi, #(1 << 28)
	addlo \result, \result, #4
	movlo \hi, \hi, lsl #4
	cmp \hi, #(1 << 30)
	addlo \result, \result, #2
	movlo \hi, \hi, lsl #2
	cmp \hi, #(1 << 31)
	addlo \result, \result, #1
	movlo \hi, \hi, lsl #1
	/* Ok, now hi is finished and result is calculated.
	   Move the top bits of lo into the vacated low bits of hi, then
	   shift lo.  */
	rsb \tmp, \result, #32
	orr \hi, \hi, \lo, lsr \tmp
	mov \lo, \lo, lsl \result
#endif
.endm /* ARM_NORMALIZE_64BIT */

/* -------------------------------------------------- */
/* Macro to calculate unsigned 64/32bit division.

   Uses a semi-numeric algorithm that approximates q with n1, then
   recalculates a new n and adds new n1 to approximation etc.
   The d value must have most significant bit set to 1, and n >= d.

   Quick QRNND (QQRNND) Algorithm in pseudo code:

	q = n1
	m1m0 = n1 * d0
	r1 = (n1n0 - m1m0) >> 32
	while (r1 > 0)
	  {
	    q += r1
	    m1m0 += r1 * d0
	    r1 = (n1n0 - m1m0) >> 32
	  }
	r = n0 - m0
	if (d0 <= r)
	  {
	    r -= d0
	    q += 1
	  }

   (The final step is written here as the code implements it: both the
   subtraction and the increment happen under the same condition.)

   The algorithm converges quite fast and does fewer iterations than a
   normal QRNND division loop.

   Algorithm analysis. The worst case work done is O(n), where n is the
   number of bits in the high-word of the denominator. Worst case is when
   n1 much larger than d. Then it could take 32 iterations before function
   exits.
Worst case example n1=0xffffffff and d=0x80000001

   Note: 2nd and 4th argument always same when calling!
   The code relies on this: the early exit taken when n1 == 0 jumps
   straight to the final compare, which reads r without r ever having
   been loaded from n0.

 * OUT q = quotient, 32 bit number.  q is written even when set_q = 0,
 *         because it carries the start approximation into umull; it is
 *         just not kept up to date afterwards.
 * OUT r = remainder, 32 bit number
 * IN: n1 = high word of 64 bit dividend (not needed to be normalized)
 * IN: n0 = low word of 64 bit dividend, n1n0 >= d0
 *          (An earlier version of this comment had "low" and "high"
 *          swapped; the code subtracts the low product word from n0 and
 *          the high product word from n1.)
 * IN: d = divisor, 32 bit number, clz(_d) = 0
 * SCRATCH: nxlo = scratch register (low word of running product)
 * SCRATCH: nxhi = scratch register2 (high word of running product)
 * SCRATCH: tmp = scratch register3
 * INPARAM: set_q = if q should be set (running as division)
 * INPARAM: set_r = if r should be set (running as modulo)
 * INPARAM: check_early = check if (n1 == 0)
 *
 * Note the parameter order: nxhi comes before nxlo.
 * Flags are modified.  Uses the numeric local labels 0-3.  */
.macro ARM_QUICK_QRNND q, r, n1, n0, d, nxhi, nxlo, tmp, set_q, set_r, check_early
	movs \q, \n1
	.if \check_early
	/* It might be assumed that n1 is small.*/
	beq 3f
	.endif
	/* Try to make a good start approximation for q.
	   If d = 0xffffffff, then n1 is good approx for q.
	   But if d = 0x80000001, then this starting q is very bad,
	   almost a factor 2 wrong.*/
	/* By numeric experimentation, these start approximations is valid.
	   Calculations was done by evaluating function 1/x.
	   The additions are cumulative: the smaller d is, the more of
	   n1/2, n1/4, n1/8, n1/16 are added to q.  */
	cmp \d, #0xaa000000
	bhi 0f
	addls \q, \q, \n1, lsr #1
	cmp \d, #0x92000000
	addls \q, \q, \n1, lsr #2
	cmp \d, #0x88000000
	addls \q, \q, \n1, lsr #3
	cmp \d, #0x84000000
	addls \q, \q, \n1, lsr #4
0:
	/* Don't use d as last operand for multipliers, since clz(d)=0
	   which gives worst case timing for umull and umlal.*/
	umull \nxlo, \nxhi, \d, \q
	/* tmp = high word of (n1n0 - product); only the second, flag-setting
	   subtraction result is kept.  Z set means the high word is zero.  */
	subs \tmp, \n0, \nxlo
	sbcs \tmp, \n1, \nxhi
	beq 2f
1:
#if ! defined (__OPTIMIZE_SIZE__)
	/* Loop unroll in total four times.
	   Every refinement step is conditional on NE, so once the high word
	   of the difference has become zero the remaining unrolled steps only
	   recompute the same (zero) difference.  */
	.if \set_q
	addne \q, \q, \tmp
	.endif
	umlalne \nxlo, \nxhi, \d, \tmp
	subs \tmp, \n0, \nxlo
	sbcs \tmp, \n1, \nxhi
	.if \set_q
	addne \q, \q, \tmp
	.endif
	umlalne \nxlo, \nxhi, \d, \tmp
	subs \tmp, \n0, \nxlo
	sbcs \tmp, \n1, \nxhi
	.if \set_q
	addne \q, \q, \tmp
	.endif
	umlalne \nxlo, \nxhi, \d, \tmp
	subs \tmp, \n0, \nxlo
	sbcs \tmp, \n1, \nxhi
#endif /* __OPTIMIZE_SIZE__ */
	.if \set_q
	addne \q, \q, \tmp
	.endif
	umlalne \nxlo, \nxhi, \d, \tmp
	subs \tmp, \n0, \nxlo
	sbcs \tmp, \n1, \nxhi
	bne 1b
2:
	/* High word of the difference is zero: the low word is the
	   remainder candidate.  */
	subs \r, \n0, \nxlo
3:
	/* One final correction if the candidate is still >= d.  */
	cmp \d, \r
	.if \set_r
	/* Set output value of r */
	subls \r, \r, \d
	.endif
	.if \set_q
	/* Set output value of q */
	addls \q, \q, #1
	.endif
.endm /* ARM_QUICK_QRNND */

/* -------------------------------------------------- */
/* Temporary work registers, store these on stack if used.*/
#define work_reg_tmp1 r4
#define work_reg_tmp2 r5
#define work_reg_tmp3 r6
#define work_reg_tmp4 r7

/* Register aliases used as temporary variables.
   Several aliases share one register (btmp/m0 -> work_reg_tmp1,
   bm/nxhi -> work_reg_tmp2, n2/m1 -> work_reg_tmp3), so the two names of
   a pair must never be live at the same time.  */
#define btmp work_reg_tmp1
#define bm work_reg_tmp2
#define n2 work_reg_tmp3
#define m0 work_reg_tmp1
#define m1 work_reg_tmp3
#define nxlo work_reg_tmp4
#define nxhi work_reg_tmp2

/* -------------------------------------------------- */
/* Macro to store register on stack (pre-decrements sp by 4).
 * INPARAM: reg = register to save on stack.*/
.macro ARM_PUSH_REG reg
	str \reg, [sp, #-4]!
.endm /* ARM_PUSH_REG */

/* -------------------------------------------------- */
/* Macro to restore register from stack (post-increments sp by 4).
 * INPARAM: reg = register to restore from stack.*/
.macro ARM_POP_REG reg
	ldr \reg, [sp], #4
.endm /* ARM_POP_REG */

/* -------------------------------------------------- */
/* Macro to store all work registers on stack.
* INPARAM: save_ip = non-zero if IP must be saved together with the
 *                    four work registers.  */
.macro ARM_DIV_MOD_PUSH_WORK_REGS save_ip
	.ifeq \save_ip
	/* Work registers only.  */
	stmdb sp!, {work_reg_tmp1, work_reg_tmp2, work_reg_tmp3, work_reg_tmp4}
	.else
	/* Work registers plus IP.  */
	stmdb sp!, {work_reg_tmp1, work_reg_tmp2, work_reg_tmp3, work_reg_tmp4, ip}
	.endif
.endm /* ARM_DIV_MOD_PUSH_WORK_REGS */

/* -------------------------------------------------- */
/* Counterpart of ARM_DIV_MOD_PUSH_WORK_REGS: reload the work registers
 * from the stack.
 * INPARAM: save_ip = non-zero if IP was pushed as well and must be
 *                    reloaded; must match the value used for the push.  */
.macro ARM_DIV_MOD_POP_WORK_REGS save_ip
	.ifeq \save_ip
	/* Work registers only.  */
	ldmia sp!, {work_reg_tmp1, work_reg_tmp2, work_reg_tmp3, work_reg_tmp4}
	.else
	/* Work registers plus IP.  */
	ldmia sp!, {work_reg_tmp1, work_reg_tmp2, work_reg_tmp3, work_reg_tmp4, ip}
	.endif
.endm /* ARM_DIV_MOD_POP_WORK_REGS */

/* -------------------------------------------------- */
/* Leave the division body from one of its early exits.
 * INPARAM: dont_return = zero: branch to the body's common exit label
 *                        (999), which pops the work registers and lets
 *                        execution continue after the body macro.
 *                        Non-zero: pop the work registers here and
 *                        execute RET, so control never reaches the code
 *                        that follows the body macro.
 * INPARAM: save_ip = passed on to ARM_DIV_MOD_POP_WORK_REGS.  */
.macro ARM_DIV_MOD_RETURN dont_return, save_ip
	.ifeq \dont_return
	/* goto common exit point.*/
	b 999f
	.else
	ARM_DIV_MOD_POP_WORK_REGS \save_ip
	RET
	.endif
.endm /* ARM_DIV_MOD_RETURN */

/* -------------------------------------------------- */
/* Macro to divide or calculate modulo.
 * IN: n0 = low word of 64 bit dividend
 * IN: n1 = high word of 64 bit dividend
 * IN: d0 = low word of 64 bit divisor
 * IN: d1 = high word of 64 bit divisor
 * OUT: rq0 = low word of 64 bit remainder/quotient
 * OUT: rq1 = high word of 64 bit remainder/quotient
 * INPARAM: modulo = if modulo and not division code should be generated
 * INPARAM: dont_return = if the macro should execute RET itself instead
 *                        of continuing with the code that follows it.
 * INPARAM: save_ip = if macro should save and restore IP on stack.
 * INPARAM: bit32_zero = if most significant bits in n, d is 0 (sign.div/mod).
* SCRATCH ip = scratch register, if save_ip is set, then save it on stack.
 *
 * Preconditions established by the entry points in this file before the
 * macro is expanded: d != 0 and n > d (unsigned).
 *
 * NOTE: rq0/rq1 must be the same registers as n0/n1.  The power-of-two
 * division path (label 222) and the modulo variant of label 555 leave
 * their result in n0/n1 and never write rq0/rq1.  Every expansion in this
 * file passes aliasing registers.
 *
 * NOTE: d0/d1 are clobbered (d is normalized in place and d1 is handed to
 * ARM_QUICK_QRNND as a scratch register when d1 == 0).
 *
 * The work registers are pushed first and popped on every exit.  With
 * dont_return = 0 all exits meet at label 999 and execution continues
 * after the macro; otherwise every exit pops and executes RET.
 *
 * Numeric local labels used: 55, 66, 88, 99, 222, 555, 888, 999.  */
.macro ARM_DIV_MOD_64BIT_BODY n0, n1, d0, d1, rq0, rq1, modulo, dont_return, save_ip, bit32_zero
	/* Store work regs on stack.*/
	ARM_DIV_MOD_PUSH_WORK_REGS \save_ip

	/* Check power of 2 by check if ((d & (d - 1)) == 0)
	   is d a power of 2, then we can just shift n right log2(d).*/
	/* subtract #1 and store in tmp regs; nxhi:nxlo = d - 1 is reused as
	   the modulo bit mask at label 222.  */
	subs nxlo, \d0, #1
	sbc nxhi, \d1, #0
	/* calc d & (d-1); the high words are only tested when the low
	   words already gave zero.  */
	ands ip, nxlo, \d0
	andeqs ip, nxhi, \d1
	/* if both regs zero, then d is a power of 2 */
	beq 222f

	/* if (d1 == 0) */
	cmp \d1, #0
	bne 99f

	/* if (d0 > n1) */
	cmp \d0, \n1
	bls 66f

	/* check if clz(d0) = 0: bm becomes 0 (and Z is set) exactly when
	   bit 31 of d0 is set.  */
	mvns bm, \d0, asr #31
	/* if (bm == 0) */
	beq 55f

	/* 0q = nn / 0D */
	ARM_NORMALIZE_32BIT \d0, bm
	/* n1n0 = n1n0 << bm */
	ARM_LSHL_64BIT \n0, \n1, bm, ip
55:
	/* udiv_qrnnd (q0, n0, n1, n0, d0)*/
	/* n2 can be used as temp for q0, d1 can be used as temp for nxhi.*/
	.if \modulo
	/* params: set_q = 0, set_r = 1, check_early = 1 */
	ARM_QUICK_QRNND n2, \n0, \n1, \n0, \d0, \d1, nxlo, ip, 0, 1, 1
	/* Remainder in n0 >> bm.*/
	mov \rq0, \n0, lsr bm
	.else
	/* params: set_q = 1, set_r = 0, check_early = 1 */
	ARM_QUICK_QRNND n2, \n0, \n1, \n0, \d0, \d1, nxlo, ip, 1, 0, 1
	/* set q0 = n2 */
	mov \rq0, n2
	.endif
	/* rq1 = 0 */
	mov \rq1, #0
	/* Exit */
	ARM_DIV_MOD_RETURN \dont_return, \save_ip

66:
	/* if (d0 <= n1) */
	/* check if clz(d0) == 0 */
	mvns bm, \d0, asr #31
	/* if (bm == 0)
	   From (n1 >= d0) & (the most significant bit of d0 is set),
	   conclude (the most significant bit of n1 is set) &
	   (the leading quotient digit q1 = 1).
	     n1 -= d0
	     q1 = 1
	   The three instructions below are all conditional on the Z flag
	   produced by mvns.  */
	subeq \n1,\n1,\d0
	.ifeq \modulo
	/* use btmp as temp for rq1 */
	moveq btmp, #1
	.endif
	beq 88f

	/* qq = NN / 0d */
	ARM_NORMALIZE_32BIT \d0, bm
	/* btmp = 32 - bm */
	rsb btmp, bm, #32
	/* n2 = n1 >> btmp */
	mov n2, \n1, lsr btmp
	/* n1n0 = n1n0 << bm */
	ARM_LSHL_64BIT \n0, \n1, bm, ip

	/* udiv_qrnnd (q1, n1, n2, n1, d0)*/
	/* use btmp as temp for rq1, use d1 as temp for nxhi.*/
	.if \modulo
	/* params: set_q = 0, set_r = 1, check_early = 0 */
	ARM_QUICK_QRNND btmp, \n1, n2, \n1, \d0, \d1, nxlo, ip, 0, 1, 0
	.else
	/* params: set_q = 1, set_r = 1, check_early = 0 */
	ARM_QUICK_QRNND btmp, \n1, n2, \n1, \d0, \d1, nxlo, ip, 1, 1, 0
	.endif
88:
	/* udiv_qrnnd (q0, n0, n1, n0, d0)*/
	/* (n1 != d0) use n2 as tmp for rq0, use d1 as temp for nxhi.*/
	.if \modulo
	/* params: set_q = 0, set_r = 1, check_early = 1 */
	ARM_QUICK_QRNND n2, \n0, \n1, \n0, \d0, \d1, nxlo, ip, 0, 1, 1
	/* Remainder in n0 >> bm.*/
	mov \rq0, \n0, lsr bm
	mov \rq1, #0
	.else
	/* params: set_q = 1, set_r = 0, check_early = 1 */
	ARM_QUICK_QRNND n2, \n0, \n1, \n0, \d0, \d1, nxlo, ip, 1, 0, 1
	/* set rq0, rq1 */
	mov \rq0, n2
	/* btmp used as temp for q1.*/
	mov \rq1, btmp
	.endif
	/* Exit */
	ARM_DIV_MOD_RETURN \dont_return, \save_ip

222:
	/* Handle division with Pow2-divisor => simple shift */
	.if \modulo
	/* do bitwise AND with (divisor-1) which is the bitmask for modulo.*/
	and \rq0, nxlo, \n0
	and \rq1, nxhi, \n1
	.else
	/* Calc steps to shift for division */
	ARM_LOG2_64BIT \d0, \d1, bm, ip
	/* divide rest by shifting dividend; the quotient is left in n0/n1,
	   which must therefore be the same registers as rq0/rq1.  */
	ARM_LSHR_64BIT \n0, \n1, bm, ip
	.endif
	/* Exit */
	ARM_DIV_MOD_RETURN \dont_return, \save_ip

	.ifeq \bit32_zero
555:
	/* if (bm == 0) */
	/* From (n1 >= d1) /\ (the most significant bit of d1 is set),
	   conclude (the most significant bit of n1 is set) /\
	   (the quotient digit q0 = 0 or 1).
	   Since we in calling function checks that n > d, then q0 = 1 */
	.if \modulo
	/* remainder = n - d, left in n0/n1 (same registers as rq0/rq1).  */
	subs \n0, \n0, \d0
	sbc \n1, \n1, \d1
	.else
	mov \rq0, #1
	mov \rq1, #0
	.endif
	/* Exit */
	ARM_DIV_MOD_RETURN \dont_return, \save_ip
	.endif

99:
	/* if (d1 != 0), mean also (n1 != 0). */
	/* 0q = NN / dd */
	.ifeq \bit32_zero
	/* check if clz(d1) == 0 */
	mvns bm, \d1, asr #31
	beq 555b
	.endif

	/* Normalize d, since (d1 != 0), clz(d) = clz(d1) */
	/* count_leading_zeros (bm, d1) */
	/* btmp can be used as temp, btmp = 32 - bm */
	ARM_NORMALIZE_64BIT \d0, \d1, bm, btmp
	/* n2 = n1 >> btmp */
	mov n2, \n1, lsr btmp
	/* n1n0 = n1n0 << bm */
	ARM_LSHL_64BIT \n0, \n1, bm, ip

	.if \modulo
	/* Modulo need to save bm (same reg as nxhi)*/
	ARM_PUSH_REG bm
	.endif

	/* udiv_qrnnd (rq0, n1, n2, n1, d1)*/
	/* use nxlo as temp for q0, nxhi will overwrite bm, saved on stack.
	   btmp is no longer needed and serves as the nxlo scratch of the
	   QRNND macro.  */
	/* params: set_q = 1, set_r = 1, check_early = 0 */
	ARM_QUICK_QRNND nxlo, \n1, n2, \n1, \d1, nxhi, btmp, ip, 1, 1, 0

	/* m1m0 = q0 * d0 */
	umull m0, m1, nxlo, \d0

	.if \modulo
	/* try test n1n0 - m1m0; HS (no borrow) means n1n0 >= m1m0.  */
	subs ip, \n0, m0
	sbcs ip, \n1, m1
	bhs 888f
	/* q0 was one too large: m1m0 = m1m0 - d1d0 */
	subs m0, m0, \d0
	sbc m1, m1, \d1
888:
	/* Modulo need to restore bm (same reg as nxhi)*/
	ARM_POP_REG bm
	/* Remainder in (n1n0 - m1m0) >> bm.*/
	subs \rq0, \n0, m0
	sbc \rq1, \n1, m1
	/* r1r0 = n1n0 >> bm */
	ARM_LSHR_64BIT \rq0, \rq1, bm, ip
	.else
	/* opposite minus to get sharp diff: LO (borrow) means the estimate
	   q0 was one too large.  */
	subs ip, \n0, m0
	sbcs ip, \n1, m1
	sublo \rq0, nxlo, #1
	movhs \rq0, nxlo
	mov \rq1, #0
	.endif

999:
	/* Common exit point */
	ARM_DIV_MOD_POP_WORK_REGS \save_ip
	.if \dont_return
	RET
	.endif
.endm /* ARM_DIV_MOD_64BIT_BODY */

/* -------------------------------------------------- */
/* Functions for 64bit division and modulo */
/* -------------------------------------------------- */

/* Handle big and little endian variants.
*/
/* Register assignment of the two 64bit arguments (n = dividend in r0/r1,
   d = divisor in r2/r3) and of the 64bit result (r0/r1).  The result
   registers are the same as the dividend registers, which
   ARM_DIV_MOD_64BIT_BODY relies on.  */
#ifdef __ARMEB__
/* Big endian for 64bit numbers.*/
#define lln_lo r1
#define lln_hi r0
#define lld_lo r3
#define lld_hi r2
#define llq_lo r1
#define llq_hi r0
#else /* __ARMEB__ */
/* Little endian for 64bit numbers.*/
#define lln_lo r0
#define lln_hi r1
#define lld_lo r2
#define lld_hi r3
#define llq_lo r0
#define llq_hi r1
#endif /* __ARMEB__ */

/*--------------------------------------------------*/
/* Unsigned 64bit division */
/* IN: n in lln_hi:lln_lo, d in lld_hi:lld_lo.  OUT: n / d in
   llq_hi:llq_lo.  A zero divisor branches to LSYM(Ldiv0).  ip is used as
   scratch.
   NOTE(review): ARM_FUNC_START, RET, RETc and LSYM are not defined in
   this file; presumably they come from the including file -- confirm.  */
/*--------------------------------------------------*/
#ifdef L_udivdi3
	ARM_FUNC_START udivdi3
	/* Check div0 */
	orrs ip, lld_hi, lld_lo
	beq LSYM(Ldiv0)

	/* Check if (d >= n), then jump.  The Z flag left by sbcs (high-word
	   difference including borrow is zero) is consumed at label 9.  */
	subs ip, lld_lo, lln_lo
	sbcs ip, lld_hi, lln_hi
	bhs 9f

	/* Call 64bit division algorithm using division */
	/* params: modulo = 0, dont_return = 1, save_ip = 0, bit32_zero = 0 */
	ARM_DIV_MOD_64BIT_BODY lln_lo, lln_hi, lld_lo, lld_hi, llq_lo, llq_hi, 0, 1, 0, 0
	/* No return: with dont_return = 1 the body executes RET itself.  */
9:
	/* d >= n here, so the quotient is 0 or 1.  */
	/* result hi is zero (mov does not change the flags) */
	mov llq_hi, #0
	/* check lo if hi is same */
	cmpeq lld_lo, lln_lo
	/* if lo also same, n == d, result 1 */
	moveq llq_lo, #1
	/* else (d > n), result 0 */
	movne llq_lo, #0
	RET

	ARM_DIV_FUNC_END udivdi3
#endif /* L_udivdi3 */

/*--------------------------------------------------*/
/* Signed 64bit division */
/* IN: n in lln_hi:lln_lo, d in lld_hi:lld_lo.  OUT: n / d in
   llq_hi:llq_lo.  Both operands are made non-negative, divided as
   unsigned numbers, and the result is negated when the operand signs
   differed.  ip carries the result sign in bit 31; the body saves and
   restores it (save_ip = 1).  */
/*--------------------------------------------------*/
#ifdef L_divdi3
	ARM_FUNC_START divdi3
	/* Check div0 */
	orrs ip, lld_hi, lld_lo
	beq LSYM(Ldiv0)

	/* save the sign of the result (most significant bit in hi-word).*/
	eor ip, lln_hi, lld_hi

	/* flip sign for n */
	tst lln_hi, lln_hi
	bpl 2f
	ARM_NEG_64BIT lln_lo, lln_hi
2:
	/* flip sign for d */
	tst lld_hi, lld_hi
	bpl 3f
	ARM_NEG_64BIT lld_lo, lld_hi
3:
	/* Check not n less than d */
	/* Jump if (d < n) */
	cmp lln_hi, lld_hi
	bhi 5f
	blo 4f
	/* if same hi-word, check low-word */
	cmp lln_lo, lld_lo
	bhi 5f
4:
	/* both div less and div n: reached with NE when n < d and with EQ
	   when n == d.  */
	mov llq_hi, #0
	/* dividend < divisor, result zero */
	movne llq_lo, #0
	RETc(ne)
	/* dividend == divisor, result 1 (negated below if signs differed) */
	moveq llq_lo, #1
	tst ip, ip
	RETc(pl)
	ARM_NEG_64BIT llq_lo, llq_hi
	RET
5:
	/* Call 64bit division algorithm using division */
	/* params: modulo = 0, dont_return = 0, save_ip = 1, bit32_zero = 1 */
	ARM_DIV_MOD_64BIT_BODY lln_lo, lln_hi, lld_lo, lld_hi, llq_lo, llq_hi, 0, 0, 1, 1
	/* set correct sign of result */
	tst ip, ip
	RETc(pl)
	ARM_NEG_64BIT llq_lo, llq_hi
	/* return */
	RET

	ARM_DIV_FUNC_END divdi3
#endif /* L_divdi3 */

/*--------------------------------------------------*/
/* Unsigned 64bit modulo */
/* IN: n in lln_hi:lln_lo, d in lld_hi:lld_lo.  OUT: n % d in
   llq_hi:llq_lo.  A zero divisor branches to LSYM(Ldiv0).  ip is used as
   scratch.  */
/*--------------------------------------------------*/
#ifdef L_umoddi3
	ARM_FUNC_START umoddi3
	/* Check div0 */
	orrs ip, lld_hi, lld_lo
	beq LSYM(Ldiv0)

	/* Check if (d >= n).  The Z flag left by sbcs is consumed at
	   label 9.  */
	subs ip, lld_lo, lln_lo
	sbcs ip, lld_hi, lln_hi
	bhs 9f

	/* Call 64bit division algorithm using modulo */
	/* params: modulo = 1, dont_return = 1, save_ip = 0, bit32_zero = 0 */
	ARM_DIV_MOD_64BIT_BODY lln_lo, lln_hi, lld_lo, lld_hi, llq_lo, llq_hi, 1, 1, 0, 0
	/* No return: with dont_return = 1 the body executes RET itself.  */
9:
	/* d >= n here.  NE: d > n, the remainder is n, which already sits in
	   the result registers.  */
	RETc(ne)
	/* Check lo if hi is same */
	cmpeq lld_lo, lln_lo
	/* if lo also same, n == d, remainder 0 */
	moveq llq_hi, #0
	moveq llq_lo, #0
	/* else (d > n), result remains n */
	RET

	ARM_DIV_FUNC_END umoddi3
#endif /* L_umoddi3 */

/*--------------------------------------------------*/
/* Signed 64bit modulo */
/* IN: n in lln_hi:lln_lo, d in lld_hi:lld_lo.  OUT: n % d in
   llq_hi:llq_lo.  Both operands are made non-negative, the unsigned
   remainder is computed, and it is negated when the dividend was
   negative.  ip carries the dividend's high word (sign in bit 31); the
   body saves and restores it (save_ip = 1).  */
/*--------------------------------------------------*/
#ifdef L_moddi3
	ARM_FUNC_START moddi3
	/* Check div0 */
	orrs ip, lld_hi, lld_lo
	beq LSYM(Ldiv0)

	/* save the sign of the result (hibyte bit31) in ip */
	mov ip, lln_hi

	/* flip sign for n */
	tst lln_hi, lln_hi
	bpl 2f
	ARM_NEG_64BIT lln_lo, lln_hi
2:
	/* flip sign for d */
	tst lld_hi, lld_hi
	bpl 3f
	ARM_NEG_64BIT lld_lo, lld_hi
3:
	/* Check not n less than d */
	/* Jump if (d < n) */
	cmp lln_hi, lld_hi
	bhi 5f
	blo 4f
	/* if same hi-word, check low-word */
	cmp lln_lo, lld_lo
	bhi 5f
4:
	/* both div less and div n: reached with NE when n < d and with EQ
	   when n == d.  */
	/* dividend == divisor, result 0 */
	moveq llq_lo, #0
	moveq llq_hi, #0
	/* return */
	RETc(eq)
	/* dividend < divisor, keep nominator, set correct sign of result */
	tst ip, ip
	RETc(pl)
	/* flip sign */
	ARM_NEG_64BIT llq_lo, llq_hi
	/* return */
	RET
5:
	/* Call 64bit division algorithm using modulo */
	/* params: modulo = 1, dont_return = 0, save_ip = 1, bit32_zero = 1 */
	ARM_DIV_MOD_64BIT_BODY lln_lo, lln_hi, lld_lo, lld_hi, llq_lo, llq_hi, 1, 0, 1, 1
	/* set correct sign of result */
	tst ip, ip
	RETc(pl)
	/* flip sign */
	ARM_NEG_64BIT llq_lo, llq_hi
	/* return */
	RET

	ARM_DIV_FUNC_END moddi3
#endif /* L_moddi3 */

#endif /* __ARM_ARCH__ > 3 */
/* -------------------------------------------------- */