inline asm mmx: how to movq from memory to %mmX

Jack Andrews effbiae@gmail.com
Sat Apr 21 13:46:00 GMT 2007


Ian wrote:
> For mmintrin.h functions, use __m64, not __v2si.
>
> Why the memcpy?  Use _mm_set_pi32(is[i], is[i + 1]).
>
> Don't extract the values by taking the address of q.  Instead do
> something like this:
>    union { long ai[2]; __m64 m } u;
>    u.m = q;
>    return u.ai[0] + u.ai[1];
>
> Ian
>

followed this and now the mmintrin version runs even slower!

my RUNME.sh stays the same, and my v.c is now:


#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <mmintrin.h>

typedef long I;typedef unsigned long J;
typedef char C;
#define IZ sizeof(I)
#define W 2


simd_mmintrin(n, is)
I *is;
{   __m64 q,r;
    I i;
    _m_empty();
    q=_m_from_int(0);
    for (i=0; i < n; i+=W) {
        r=_mm_set_pi32(is[i],is[i+1]);
        q=_m_paddd(q,r);
    }
    union {long a[2];__m64 m;}u;
    u.m=q;
    return u.a[0]+u.a[1];
}

simd_asm(n, is)
I *is;
{   I i,*r=malloc(IZ*W*8);
    asm("emms");
    asm("pxor %mm0,%mm0");
    for (i=0; i < n; i+=W) {
        asm("movq %0,%%mm1\n\t"
            "paddd %%mm1,%%mm0"
            :
            :"m"(is[i])           );
    }
    asm("movq %%mm0,%0":"=m"(*(__m64*)r));
    return r[0]+r[1];
}

sisd(n, is)
I *is;
{
    I i = 0, j = 0;
    for (i = 0; i < n; i++)
	j += is[i];
    return j;
}

main(c, v)
C **v;
{
    I n=atol(v[1]), z=atol(v[2]), m=atol(v[3]);
    I result, *is=malloc(IZ*(z*=2)), i;
    int(*fs[])()={sisd,simd_asm,simd_mmintrin,0};
    C*ss[]={"C (SISD)","ASM (SIMD)","MMINTRIN (SIMD)"};
    for(i=0;i<z;i++)is[i]=i;
    printf("\n\n---\nexpect: %d\n",(z)*(z-1)/2);
    printf("impl: %s\n",ss[m]);
    while (n--)
	result=fs[m](z, is);
    printf("%d\n",result);
}



More information about the Gcc-help mailing list