This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

Faster __mulsi3 for gcc on NEC v850


Dear gcc developers,

this email contains a replacement for the __mulsi3 routine that comes
with gcc-2.95 for the v850 processor (see
gcc/config/v850/lib1funcs.asm).  

The replacement is much faster than the original one, because:

1) it performs multiplication in chunks of 12 bits instead of 7.
   Only partial products whose combined shift is below 32 bits can
   affect the 32-bit result, so the new routine performs 6 16-bit
   multiplications instead of 15 like the original routine.

2) the loops in the original routine have been unrolled.

The new routine can be compiled with v850-gcc -O2 from the
following C code:

/* Width in bits of the chunks each operand is split into, and the mask
   that extracts one chunk.  */
#define SHIFT 12
#define MASK ((1 << SHIFT) - 1)

/* STEP(i, j): one partial product, chunk of `a` at bit offset i times
   chunk of `b` at bit offset j.  Both chunks are at most 12 bits wide, so
   they fit in a `short` and their product fits in an `int` without
   overflow; the short operands are what lets the compiler use a 16-bit
   multiply.  The product is handed back as `unsigned` so that the caller's
   shifts and additions wrap modulo 2^32 instead of overflowing a signed
   int (which is undefined behaviour in ISO C).  */
#define STEP(i, j)				\
({						\
     short a_part = (a >> (i)) & MASK;		\
     short b_part = (b >> (j)) & MASK;		\
     unsigned res = (unsigned) (((int)a_part) * ((int)b_part)); \
     res;					\
})

/* Return the low 32 bits of a * b.

   With a = a0 + a1*2^12 + a2*2^24 (and likewise for b), only the partial
   products whose combined shift is below 32 bits contribute to the result:
   one at shift 0, two at shift 12 and three at shift 24.  */
int __mulsi3 (unsigned a, unsigned b)
{
     unsigned ret;

     ret = STEP(0, 0) +
	  ((STEP(SHIFT, 0) + STEP(0, SHIFT)) << SHIFT) +
	  ((STEP(0, 2 * SHIFT) + STEP(SHIFT, SHIFT) + STEP(2 * SHIFT, 0))
	   << (2 * SHIFT));

     /* Reinterpret the wrapped 32-bit result as the signed return value.  */
     return (int) ret;
}


Alternatively, the following output of gcc can replace __mulsi3 in
gcc/config/v850/lib1funcs.asm:

/* ___mulsi3: 32-bit multiply assembled from six mulh partial products.
   Each operand is split into 12-bit chunks (a0/a1/a2 and b0/b1/b2, the
   top chunk being what remains after two shifts by 12), and the routine
   computes
       a0*b0 + ((a1*b0 + a0*b1) << 12) + ((a0*b2 + a1*b1 + a2*b0) << 24).
   NOTE(review): the register roles (r6 = a, r7 = b, r10 = result,
   r31 = return address) are inferred from the C source this code was
   generated from -- confirm against the v850 calling convention.
   Registers written: r6, r7, r10-r16.  */
___mulsi3:
	mov r6,r13
	movea lo(4095),r0,r16	/* r16 = 4095, the 12-bit chunk mask */
	and r16,r13		/* r13 = a0 */
	mov r7,r15
	and r16,r15		/* r15 = b0 */
	mov r13,r10
	mulh r15,r10		/* r10 = a0*b0 */
	shr 12,r6		/* r6 = a >> 12 */
	mov r6,r14
	and r16,r14		/* r14 = a1 */
	mov r14,r11
	mulh r15,r11		/* r11 = a1*b0 */
	shr 12,r7		/* r7 = b >> 12 */
	mov r7,r12
	and r16,r12		/* r12 = b1 */
	shr 12,r7		/* r7 = b >> 24 */
	and r16,r7		/* r7 = b2 */
	mulh r13,r7		/* r7 = a0*b2 */
	shr 12,r6		/* r6 = a >> 24 */
	mulh r12,r13		/* r13 = a0*b1 (a0 is not needed after this) */
	and r16,r6		/* r6 = a2 */
	add r13,r11		/* r11 = a1*b0 + a0*b1 */
	shl 12,r11
	add r11,r10		/* r10 = a0*b0 + (middle terms << 12) */
	mov r14,r11
	mulh r12,r11		/* r11 = a1*b1 */
	mulh r15,r6		/* r6 = a2*b0 */
	add r11,r7
	add r6,r7		/* r7 = a0*b2 + a1*b1 + a2*b0 */
	shl 24,r7		/* bits shifted past bit 31 are discarded */
	add r7,r10		/* r10 = final 32-bit product */
	jmp [r31]		/* jump to the address held in r31 */

Feel free to distribute the new __mulsi3 under whatever license is
appropriate.

Regards,
Matteo Frigo


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]