
gcc 4.2.3 and MMX to mem move oddity


Hi,

I am playing with the following code (from ffmpeg), translated to intrinsics:


Original code:

#include <stdint.h>

#define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

void diff_pixels_mmx(char *block, const uint8_t *s1, const uint8_t *s2, long stride)
{
    long offset = -128;
    MOVQ_ZERO(mm7);
    do {
        asm volatile(
            "movq (%0), %%mm0         \n\t"
            "movq (%1), %%mm2         \n\t"
            "movq %%mm0, %%mm1        \n\t"
            "movq %%mm2, %%mm3        \n\t"
            "punpcklbw %%mm7, %%mm0   \n\t"
            "punpckhbw %%mm7, %%mm1   \n\t"
            "punpcklbw %%mm7, %%mm2   \n\t"
            "punpckhbw %%mm7, %%mm3   \n\t"
            "psubw %%mm2, %%mm0       \n\t"
            "psubw %%mm3, %%mm1       \n\t"
            "movq %%mm0, (%2, %3)     \n\t"
            "movq %%mm1, 8(%2, %3)    \n\t"
            : : "r" (s1), "r" (s2), "r" (block+64),  "r" (offset)
            : "memory");
        s1 += stride;
        s2 += stride;
        offset += 16;
    } while (offset < 0);
}

compiles to:

0000000000000000 <diff_pixels_mmx>:
   0:   0f ef ff                pxor   %mm7,%mm7
   3:   48 c7 c0 80 ff ff ff    mov    $0xffffffffffffff80,%rax
   a:   48 83 c7 40             add    $0x40,%rdi
   e:   66 90                   xchg   %ax,%ax
  10:   0f 6f 06                movq   (%rsi),%mm0
  13:   0f 6f 12                movq   (%rdx),%mm2
  16:   0f 6f c8                movq   %mm0,%mm1
  19:   0f 6f da                movq   %mm2,%mm3
  1c:   0f 60 c7                punpcklbw %mm7,%mm0
  1f:   0f 68 cf                punpckhbw %mm7,%mm1
  22:   0f 60 d7                punpcklbw %mm7,%mm2
  25:   0f 68 df                punpckhbw %mm7,%mm3
  28:   0f f9 c2                psubw  %mm2,%mm0
  2b:   0f f9 cb                psubw  %mm3,%mm1
  2e:   0f 7f 04 07             movq   %mm0,(%rdi,%rax,1)
  32:   0f 7f 4c 07 08          movq   %mm1,0x8(%rdi,%rax,1)
  37:   48 01 ce                add    %rcx,%rsi
  3a:   48 01 ca                add    %rcx,%rdx
  3d:   48 83 c0 10             add    $0x10,%rax
  41:   75 cd                   jne    10 <diff_pixels_mmx+0x10>
  43:   f3 c3                   repz retq
  45:   66 66 2e 0f 1f 84 00    nopw   %cs:0x0(%rax,%rax,1)
  4c:   00 00 00 00


This is the intrinsic version:

#include <stdint.h>
#include <mmintrin.h>

void diff_pixels_mmx3(char *block, const uint8_t *s1, const uint8_t *s2, long stride)
{
	long offset = -128;
	block+=64;
	__m64 mm7 = _mm_setzero_si64();
	do {
		__m64 mm0 = *(const __m64*)s1;
		__m64 mm2 = *(const __m64*)s2;
		__m64 mm1 = mm0;
		__m64 mm3 = mm2;
		mm0 = _mm_unpacklo_pi8(mm0, mm7);
		mm1 = _mm_unpackhi_pi8(mm1, mm7);
		mm2 = _mm_unpacklo_pi8(mm2, mm7);
		mm3 = _mm_unpackhi_pi8(mm3, mm7);
		mm0 = _mm_sub_pi16(mm0, mm2);
		mm1 = _mm_sub_pi16(mm1, mm3);
		*(__m64*)(block+offset) = mm0;
		*(__m64*)(block+offset+8) = mm1;
		s1 += stride;
		s2 += stride;
		offset += 16;
	} while (offset < 0);
}

compiles to:

00000000000000c0 <diff_pixels_mmx3>:
  c0:   53                      push   %rbx
  c1:   0f ef e4                pxor   %mm4,%mm4
  c4:   48 c7 c0 80 ff ff ff    mov    $0xffffffffffffff80,%rax
  cb:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  d0:   0f 6f 0e                movq   (%rsi),%mm1
  d3:   48 01 ce                add    %rcx,%rsi
  d6:   0f 6f 02                movq   (%rdx),%mm0
  d9:   48 01 ca                add    %rcx,%rdx
  dc:   0f 6f d1                movq   %mm1,%mm2
  df:   0f 6f d9                movq   %mm1,%mm3
  e2:   0f 6f c8                movq   %mm0,%mm1
  e5:   0f 68 c4                punpckhbw %mm4,%mm0
  e8:   0f 60 d4                punpcklbw %mm4,%mm2
  eb:   0f 68 dc                punpckhbw %mm4,%mm3
  ee:   0f 60 cc                punpcklbw %mm4,%mm1
  f1:   0f f9 d8                psubw  %mm0,%mm3
  f4:   0f f9 d1                psubw  %mm1,%mm2
  f7:   0f 7f 5c 24 f0          movq   %mm3,-0x10(%rsp)
  fc:   0f 7f 54 24 f8          movq   %mm2,-0x8(%rsp)
 101:   48 8b 5c 24 f8          mov    -0x8(%rsp),%rbx
 106:   48 89 5c 38 40          mov    %rbx,0x40(%rax,%rdi,1)
 10b:   48 8b 5c 24 f0          mov    -0x10(%rsp),%rbx
 110:   48 89 5c 38 48          mov    %rbx,0x48(%rax,%rdi,1)
 115:   48 83 c0 10             add    $0x10,%rax
 119:   75 b5                   jne    d0 <diff_pixels_mmx3+0x10>
 11b:   5b                      pop    %rbx
 11c:   c3                      retq


Flags used are: -O2 -march=k8
Compiler: gcc 4.2.3 (Gentoo), x86_64

As you can see, in the intrinsic version gcc moves the mmx register to the
stack, reloads it from the stack into a general-purpose register and writes
that to the destination. Why?
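
By the way, the round trip doesn't seem tied to the loop. A reduced test case
that I'd expect to show the same store pattern (untested sketch, the function
is mine):

#include <mmintrin.h>

/* Reduced sketch: a plain __m64 store. If gcc 4.2.3 also bounces this
   value through the stack and a GPR, the problem is in the generic
   __m64 store, not in the loop above. */
void store_m64(__m64 *dst, const __m64 *a, const __m64 *b)
{
    *dst = _mm_sub_pi16(*a, *b);
}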

I don't know whether earlier gcc 4.2 versions produced such stupid code.
Compiling as 32-bit shows similar stupidity, though there gcc reloads into an
mmx register...

(As a side note: why does gcc want to use rbx, requiring a push and pop? I
think other registers are free...)
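
In the meantime I can force the direct store with a one-line asm wrapper (a
workaround sketch that only papers over the codegen; movq_store is my own
helper):

#include <mmintrin.h>

/* Workaround sketch: the "y" constraint pins the value to an MMX
   register, so gcc has to emit a direct movq to memory instead of
   spilling through the stack and a GPR. */
static inline void movq_store(__m64 *dst, __m64 v)
{
    __asm__ __volatile__ ("movq %1, %0" : "=m" (*dst) : "y" (v));
}

Replacing the two stores in the loop with movq_store((__m64 *)(block +
offset), mm0) etc. should then keep the values in mmx registers.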

bye,
-- 
(°=                 =°)
//\ Prakash Punnoor /\\
V_/                 \_V
