This is the mail archive of the gcc-help@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: inline asm mmx: how to movq from memory to %mmX


<sent to ian taylor in error>:

You are missing commas between the operands.
    movq %0, %%mm1

thanks for that.


Instead of using inline assembler I recommend that you use the
functions in mmintrin.h.

i've written a program (appended below) that sums a list of 32 bit integers. there are 3 functions that do exactly the same thing (sum the list) and they get very different timings.

i have a script RUNME.sh:

$ sh RUNME.sh
---
expect: 199990000
impl: C (SISD)
199990000

real    0m0.604s
user    0m0.580s
sys     0m0.004s


--- expect: 199990000 impl: ASM (SIMD) 199990000


real 0m0.377s user 0m0.360s sys 0m0.008s


--- expect: 199990000 impl: MMINTRIN (SIMD) 199990000

real    0m1.235s
user    0m1.228s
sys     0m0.004s

so mmintr functions are slower than plain C and my assembly function
is faster than the others.  am i using mmintr correctly?

jack


$ cat RUNME.sh #!/bin/sh repeats=4000 # number of times to repeat the test vectorsize=10000 # size of the vector in 32 bit ints gcc -O -mmmx v.c -o v for which in 0 1 2; do time ./v $repeats $vectorsize $which; done

$ cat v.c
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <mmintrin.h>

typedef long I;typedef unsigned long J;
typedef char C;
#define IZ sizeof(I)
#define W 2


simd_mmintrin(n, is) I *is; { __v2si q,r; I i; _m_empty(); q=_m_from_int(0); for (i=0; i < n; i+=W) { memcpy(&r,is+i,IZ*W); q=_m_paddd(q,r); } I*qq=(I*)&q; return qq[0]+qq[1]; }

simd_asm(n, is)
I *is;
{   I i,*r=malloc(IZ*W*8);
  asm("emms");
  asm("pxor %mm0,%mm0");
  for (i=0; i < n; i+=W) {
      asm("movq %0,%%mm1\n\t"
          "paddd %%mm1,%%mm0"
          :
          :"m"(is[i])           );
  }
  asm("movq %%mm0,%0":"=m"(*(__m64*)r));
  return r[0]+r[1];
}

sisd(n, is)
I *is;
{
  I i = 0, j = 0;
  for (i = 0; i < n; i++)
      j += is[i];
  return j;
}

main(c, v)
C **v;
{
  I n=atol(v[1]), z=atol(v[2]), m=atol(v[3]);
  I result, *is=malloc(IZ*(z*=2)), i;
  int(*fs[])()={sisd,simd_asm,simd_mmintrin,0};
  C*ss[]={"C (SISD)","ASM (SIMD)","MMINTRIN (SIMD)"};
  for(i=0;i<z;i++)is[i]=i;
  printf("\n\n---\nexpect: %d\n",(z)*(z-1)/2);
  printf("impl: %s\n",ss[m]);
  while (n--)
      result=fs[m](z, is);
  printf("%d\n",result);
}


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]