gcc 4.0.x: MMX built-ins regression


I am using MMX built-ins with gcc-4.0-20050825 and I am seeing unneeded movq
instructions being generated (at least I guess so; I am no assembler pro). I
don't know which gcc snapshot introduced this, but I know that some pre-release
gcc 4.0 didn't show this bad behaviour. (It's been some time since I played
with this...)

BTW, this is using gcc built-ins. The situation is much worse when using
intrinsics via mmintrin.h. (Again, the old pre-4.0 gcc didn't have the problem;
using gcc built-ins or mmintrin.h intrinsics made no difference; both generated
nice code.)

Here is the source mixed with the assembly, plus my comments (compiled with -O2 -g3):

Just shout if you need anything else.

typedef int v2si __attribute__ ((vector_size (8)));
typedef int di __attribute__ ((vector_size (8)));
typedef short v4hi __attribute__ ((vector_size (8)));

00000320 <MixAudio16_MMX_T>:

void MixAudio16_MMX_T(char* src1, char* src2, char* dst)
 320:	55                   	push   %ebp
 321:	89 e5                	mov    %esp,%ebp
 323:	83 ec 10             	sub    $0x10,%esp
	v4hi indata;
	v4hi signmask;
	v2si loout;
	v2si hiout;
	v2si temp;

	__attribute__((aligned(16))) static const short sm[4] =
	static const v4hi *m = (v4hi*)sm;

	indata   = *(v4hi*)src1;
	signmask = __builtin_ia32_pand(indata, *m);
 326:	8b 15 04 00 00 00    	mov    0x4,%edx
 32c:	8b 45 08             	mov    0x8(%ebp),%eax
 32f:	0f 6f 10             	movq   (%eax),%mm2
	signmask = __builtin_ia32_pcmpeqw(signmask, *m);
	loout = __builtin_ia32_punpcklwd(indata, signmask);
 332:	0f 6f ca             	movq   %mm2,%mm1
	hiout = __builtin_ia32_punpckhwd(indata, signmask);
	indata   = *(v4hi*)src2;
 335:	8b 45 0c             	mov    0xc(%ebp),%eax
 338:	0f 7f 55 f8          	movq   %mm2,0xfffffff8(%ebp)
 33c:	0f 6f 45 f8          	movq   0xfffffff8(%ebp),%mm0

Why not movq %mm2, %mm0?

 340:	0f db 02             	pand   (%edx),%mm0
 343:	0f 7f 45 f0          	movq   %mm0,0xfffffff0(%ebp)
 347:	0f 6f 45 f0          	movq   0xfffffff0(%ebp),%mm0

what the heck?

 34b:	0f 75 02             	pcmpeqw (%edx),%mm0
 34e:	0f 61 c8             	punpcklwd %mm0,%mm1
 351:	0f 69 d0             	punpckhwd %mm0,%mm2
 354:	0f 7f 4d f8          	movq   %mm1,0xfffffff8(%ebp)
 358:	0f 6f 5d f8          	movq   0xfffffff8(%ebp),%mm3

As above, this happens throughout, as you can see:

 35c:	0f 7f 55 f8          	movq   %mm2,0xfffffff8(%ebp)
 360:	0f 6f 10             	movq   (%eax),%mm2
 363:	0f 6f 65 f8          	movq   0xfffffff8(%ebp),%mm4
	signmask = __builtin_ia32_pand(indata, *m);
 367:	0f 7f 55 f8          	movq   %mm2,0xfffffff8(%ebp)
 36b:	0f 6f 45 f8          	movq   0xfffffff8(%ebp),%mm0
	signmask = __builtin_ia32_pcmpeqw(signmask, *m);

	temp  = __builtin_ia32_punpcklwd(indata, signmask);
 36f:	0f 6f ca             	movq   %mm2,%mm1
 372:	0f db 02             	pand   (%edx),%mm0
	loout = __builtin_ia32_paddd(loout, temp); \
	temp  = __builtin_ia32_punpckhwd(indata, signmask);
	hiout = __builtin_ia32_paddd(hiout, temp);
	*(v4hi*)dst = __builtin_ia32_packssdw(loout, hiout);
 375:	8b 45 10             	mov    0x10(%ebp),%eax
 378:	0f 7f 45 f0          	movq   %mm0,0xfffffff0(%ebp)
 37c:	0f 6f 45 f0          	movq   0xfffffff0(%ebp),%mm0
 380:	0f 75 02             	pcmpeqw (%edx),%mm0
 383:	0f 61 c8             	punpcklwd %mm0,%mm1
 386:	0f 69 d0             	punpckhwd %mm0,%mm2
 389:	0f 7f 4d f8          	movq   %mm1,0xfffffff8(%ebp)
 38d:	0f fe 5d f8          	paddd  0xfffffff8(%ebp),%mm3

Why not use the MMX register directly?

 391:	0f 7f 55 f8          	movq   %mm2,0xfffffff8(%ebp)
 395:	0f fe 65 f8          	paddd  0xfffffff8(%ebp),%mm4


 399:	0f 6b dc             	packssdw %mm4,%mm3
 39c:	0f 7f 18             	movq   %mm3,(%eax)
 39f:	0f 77                	emms
 3a1:	c9                   	leave
 3a2:	c3                   	ret

