inline asm mmx: how to movq from memory to %mmX
Jack Andrews
effbiae@gmail.com
Sat Apr 21 13:46:00 GMT 2007
Ian wrote:
> For mmintrin.h functions, use __m64, not __v2si.
>
> Why the memcpy? Use _mm_set_pi32(is[i], is[i + 1]).
>
> Don't extract the values by taking the address of q. Instead do
> something like this:
> union { long ai[2]; __m64 m } u;
> u.m = q;
> return u.ai[0] + u.ai[1];
>
> Ian
>
followed this and now the mmintrin version runs even slower!
my RUNME.sh stays the same, and my v.c is now:
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <mmintrin.h>
typedef long I;typedef unsigned long J;
typedef char C;
#define IZ sizeof(I)
#define W 2
simd_mmintrin(n, is)
I *is;
{ __m64 q,r;
I i;
_m_empty();
q=_m_from_int(0);
for (i=0; i < n; i+=W) {
r=_mm_set_pi32(is[i],is[i+1]);
q=_m_paddd(q,r);
}
union {long a[2];__m64 m;}u;
u.m=q;
return u.a[0]+u.a[1];
}
simd_asm(n, is)
I *is;
{ I i,*r=malloc(IZ*W*8);
asm("emms");
asm("pxor %mm0,%mm0");
for (i=0; i < n; i+=W) {
asm("movq %0,%%mm1\n\t"
"paddd %%mm1,%%mm0"
:
:"m"(is[i]) );
}
asm("movq %%mm0,%0":"=m"(*(__m64*)r));
return r[0]+r[1];
}
sisd(n, is)
I *is;
{
I i = 0, j = 0;
for (i = 0; i < n; i++)
j += is[i];
return j;
}
main(c, v)
C **v;
{
I n=atol(v[1]), z=atol(v[2]), m=atol(v[3]);
I result, *is=malloc(IZ*(z*=2)), i;
int(*fs[])()={sisd,simd_asm,simd_mmintrin,0};
C*ss[]={"C (SISD)","ASM (SIMD)","MMINTRIN (SIMD)"};
for(i=0;i<z;i++)is[i]=i;
printf("\n\n---\nexpect: %d\n",(z)*(z-1)/2);
printf("impl: %s\n",ss[m]);
while (n--)
result=fs[m](z, is);
printf("%d\n",result);
}
More information about the Gcc-help
mailing list