This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
Faster __mulsi3 for gcc on NEC v850
- To: gcc at gcc dot gnu dot org
- Subject: Faster __mulsi3 for gcc on NEC v850
- From: Matteo Frigo <athena at fftw dot org>
- Date: Tue, 24 Oct 2000 14:03:28 +0200
Dear gcc developers,
this email contains a replacement for the __mulsi3 routine that comes
with gcc-2.95 for the v850 processor (see
gcc/config/v850/lib1funcs.asm).
The replacement is much faster than the original one, because:
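1) it performs multiplication in chunks of 12 bits instead of 7.
A 32-bit operand then splits into 3 chunks rather than 5, and only the
partial products that land below bit 32 are needed (3+2+1 = 6 versus
5+4+3+2+1 = 15). Consequently, the new routine performs 6 16-bit
multiplications instead of 15 like the original routine.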
2) the loops in the original routine have been unrolled.
The new routine can be compiled with v850-gcc -O2 from the
following C code:
/* Width in bits of each operand chunk.  Two 12-bit chunks multiply to
   less than 2^24, so every partial product fits comfortably in an int
   and both factors fit in a (signed) short.  */
#define SHIFT 12
#define MASK ((1 << SHIFT) - 1)

/* Partial product of the SHIFT-bit chunk of `a' starting at bit I and the
   SHIFT-bit chunk of `b' starting at bit J.  Expects unsigned variables
   named `a' and `b' in the enclosing scope.  The chunks are stored in
   `short' temporaries so the compiler can use a 16-bit multiply.  The
   result is converted to unsigned so that the caller's shifts and
   additions wrap instead of overflowing a signed int.  Uses the GNU C
   statement-expression extension.  */
#define STEP(i, j) \
({ \
  short a_part = (a >> (i)) & MASK; \
  short b_part = (b >> (j)) & MASK; \
  (unsigned) (((int) a_part) * ((int) b_part)); \
})

/* Return the low bits of a * b (the product modulo 2^N, N = width of
   unsigned), built from six chunk-by-chunk partial products.

   With a = a0 + (a1 << SHIFT) + (a2 << 2*SHIFT), and likewise for b,
   only the partial products whose weight is below 3*SHIFT contribute to
   the sum; the rest are shifted out entirely for a 32-bit unsigned.

   All accumulation is done in unsigned arithmetic: the shifted partial
   sums routinely exceed INT_MAX, which would be undefined behaviour if
   computed as signed int.  */
int __mulsi3 (unsigned a, unsigned b)
{
  unsigned ret;

  ret = STEP (0, 0)
        + ((STEP (SHIFT, 0) + STEP (0, SHIFT)) << SHIFT)
        + ((STEP (0, 2 * SHIFT) + STEP (SHIFT, SHIFT) + STEP (2 * SHIFT, 0))
           << (2 * SHIFT));

  /* Conversion of an out-of-range unsigned value to int is
     implementation-defined rather than undefined; GCC reduces it
     modulo 2^N, which is the wrap-around result wanted here.  */
  return (int) ret;
}
Alternatively, the following output of gcc can replace __mulsi3 in
gcc/config/v850/lib1funcs.asm:
/* ___mulsi3: 32-bit product assembled from six 16-bit mulh products.

   Naming used in the comments below:
     a = value held in r6 on entry, b = value held in r7 on entry
     a0 = a & 0xFFF, a1 = (a >> 12) & 0xFFF, a2 = (a >> 24) & 0xFFF
     b0, b1, b2 likewise for b.

   Value left in r10 at the final jump:
     a0*b0 + ((a1*b0 + a0*b1) << 12) + ((a0*b2 + a1*b1 + a2*b0) << 24)

   Registers overwritten: r6, r7, r10-r16.

   NOTE(review): r6/r7 as the two arguments, r10 as the return value and
   r31 as the return address are presumably the v850 calling convention
   -- confirm against the ABI documentation.  */
___mulsi3:
mov r6,r13          /* r13 = a */
movea lo(4095),r0,r16          /* r16 = r0 + 4095: the 12-bit chunk mask (r0 presumably reads as zero) */
and r16,r13          /* r13 = a0 */
mov r7,r15          /* r15 = b */
and r16,r15          /* r15 = b0 */
mov r13,r10          /* r10 = a0 */
mulh r15,r10          /* r10 = a0*b0 */
shr 12,r6          /* r6 = a >> 12 */
mov r6,r14
and r16,r14          /* r14 = a1 */
mov r14,r11          /* r11 = a1 */
mulh r15,r11          /* r11 = a1*b0 */
shr 12,r7          /* r7 = b >> 12 */
mov r7,r12
and r16,r12          /* r12 = b1 */
shr 12,r7          /* r7 = b >> 24 */
and r16,r7          /* r7 = b2 */
mulh r13,r7          /* r7 = a0*b2 */
shr 12,r6          /* r6 = a >> 24 */
mulh r12,r13          /* r13 = a0*b1 (a0 itself is no longer needed) */
and r16,r6          /* r6 = a2 */
add r13,r11          /* r11 = a1*b0 + a0*b1 */
shl 12,r11          /* weight the middle terms by 2^12 */
add r11,r10          /* r10 = a0*b0 + ((a1*b0 + a0*b1) << 12) */
mov r14,r11          /* r11 = a1 */
mulh r12,r11          /* r11 = a1*b1 */
mulh r15,r6          /* r6 = a2*b0 */
add r11,r7          /* r7 = a0*b2 + a1*b1 */
add r6,r7          /* r7 = a0*b2 + a1*b1 + a2*b0 */
shl 24,r7          /* weight the high terms by 2^24; upper bits fall off */
add r7,r10          /* r10 = complete sum */
jmp [r31]          /* jump to the address held in r31 */
Feel free to distribute the new __mulsi3 under whatever license is
appropriate.
Regards,
Matteo Frigo