Advice about using SIMD extensions

Richard Beare Richard.Beare@csiro.au
Thu Feb 24 12:45:00 GMT 2005


Hi Everyone,
This is probably a common query, but I haven't managed to find any hints 
about what I'm doing wrong.

I'm trying to use the SIMD extensions to accelerate array arithmetic. My 
test code is below. I'm running gcc-3.3.3 on a pentium 4 3GHz, running 
Fedora Core 2.

My problem is that the SIMD code seems to be running slower than the 
optimized standard code. In fact if I turn on the optimization and cpu 
flag then I get a huge slowdown.
I can confirm with objdump that faddp instructions are being generated 
at least some of the time.

I've experimented with a few different compilers (only stable versions) 
but not achieved any consistent speed up.

I'd have thought that this was the simplest example to accelerate.

Am I doing something obviously wrong at the C level? Is there a particular 
compiler version that is known to do this sort of thing well?

I would appreciate any advice.

Here is the log of some test runs:
============================================================
Standardized arithmetic

19.41user 0.01system 0:19.44elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+60minor)pagefaults 0swaps

Standard with optimization


cc -DDONORMAL -O2    -c -o vectrials.o vectrials.c
cc -static  vectrials.o   -o vectrials


Standardized arithmetic

5.48user 0.00system 0:05.49elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+60minor)pagefaults 0swaps
----------------------------
Vectorized without optimization


cc -DDOVEC -mcpu=pentium4    -c -o vectrials.o vectrials.c
cc -static  vectrials.o   -o vectrials


Vectorized arithmetic

9.02user 0.00system 0:09.03elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+60minor)pagefaults 0swaps

Vectorized with optimization

cc -DDOVEC -O2 -mcpu=pentium4    -c -o vectrials.o vectrials.c
cc -static  vectrials.o   -o vectrials

Vectorized arithmetic

35.89user 0.03system 0:36.17elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+58minor)pagefaults 0swaps
============================================================
And here is the code

#define _XOPEN_SOURCE 600
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>

/* Number of myvec elements in each test array. */
#define LEN 1000

/* Scalar element type carried in each vector lane. */
#define THISTYPE float

/*
 * 16-byte vector of THISTYPE (four floats, i.e. the V4SF machine mode).
 * Declared with vector_size on the real element type: the older
 * "int ... __attribute__ ((mode(V4SF)))" spelling is deprecated by GCC in
 * favour of vector_size, and it paired an int base type with a float mode.
 */
typedef THISTYPE myvec __attribute__ ((vector_size (16)));

/* Number of THISTYPE lanes held in one myvec. */
#define myvecSize (sizeof(myvec)/sizeof(THISTYPE))

/**********************************************/

/*
 * Allocate `size` bytes via posix_memalign, aligned to 2*sizeof(double)
 * bytes (16 where a double is 8 bytes), as wanted for the vector type.
 *
 * Returns the aligned block, or NULL after printing a diagnostic to stderr
 * when posix_memalign reports EINVAL or ENOMEM.
 */
void * myalloc(size_t size)
{
   const size_t boundary = 2*sizeof(double);
   void *block = NULL;
   const int rc = posix_memalign(&block, boundary, size);

   if (rc == EINVAL) {
     fprintf(stderr, "Alignment parameter no good\n");
     return NULL;
   }
   if (rc == ENOMEM) {
     fprintf(stderr, "Insufficient memory\n");
     return NULL;
   }

   /* Any other status: hand back whatever posix_memalign stored. */
   return block;
}

/**********************************************/

/*
 * Vector add: out[i] = in1[i] + in2[i] for i in [0, len), one myvec per
 * iteration. `len` counts myvec elements, not scalars.
 */
void f1(myvec *in1,myvec *in2, myvec *out, int len)
{
   const myvec *lhs = in1;
   const myvec *rhs = in2;
   myvec *dst = out;
   int remaining;

   for (remaining = len; remaining > 0; remaining--) {
     *dst++ = *lhs++ + *rhs++;
   }
}

/**********************************************/

/*
 * Scalar add: out[i] = in1[i] + in2[i] for i in [0, len), one THISTYPE per
 * iteration. `len` counts scalar elements.
 */
void f2(THISTYPE *in1, THISTYPE *in2, THISTYPE *out, int len)
{
   int idx = 0;

   while (idx < len) {
     const THISTYPE sum = in1[idx] + in2[idx];

     out[idx] = sum;
     idx++;
   }
}

/**********************************************/
/*
 * Fill the first `len` elements of I1 with 34.0 and of I2 with 354.0.
 */
void init(THISTYPE *I1, THISTYPE *I2, int len)
{
   THISTYPE *first = I1;
   THISTYPE *second = I2;
   int left;

   for (left = len; left > 0; --left) {
     *first++ = 34.0;
     *second++ = 354.0;
   }
}

/*
 * Print the first and last of the `len` elements of OO to stderr.
 * Reads OO[0] and OO[len-1], so `len` must be at least 1.
 */
void check(THISTYPE *OO, int len)
{
   const THISTYPE first = OO[0];
   const THISTYPE last = OO[len-1];

   fprintf(stderr, "First=%f, Last=%f\n", first, last);
}

#define TESTS 1000000

int main()
{
   myvec *input1, *input2, *output;
   THISTYPE *I1, *I2, *OO;
   int tt;

/*   fprintf(stderr, "(%d, %d, %d)\n", sizeof(double), sizeof(void *), 
sizeof(myvec)); */

   input1 = (myvec *)myalloc(LEN * sizeof(myvec));
   input2 = (myvec *)myalloc(LEN * sizeof(myvec));
   output = (myvec *)myalloc(LEN * sizeof(myvec));

   I1 = (THISTYPE *)input1;
   I2 = (THISTYPE *)input2;
   OO = (THISTYPE *)output;

   init(I1, I2, LEN*sizeof(myvec)/sizeof(THISTYPE));

#ifdef DOVEC
   /* the vectorized one */
   fprintf(stderr, "Vectorized arithmetic\n");
   for (tt=0;tt<TESTS;tt++) {
     f1(input1, input2, output, LEN);
   }
#endif

#ifdef DONORMAL
   fprintf(stderr, "Standardized arithmetic\n");
   for (tt=0;tt<TESTS;tt++) {
     f2(I1, I2, OO, LEN * sizeof(myvec)/sizeof(THISTYPE));
   }
#endif
   check(OO, LEN * sizeof(myvec)/sizeof(THISTYPE));
   return 0;
}


--
Richard Beare, CSIRO Mathematical & Information Sciences
Locked Bag 17, North Ryde, NSW 1670, Australia
Phone: +61-2-93253221 (GMT+~10hrs)  Fax: +61-2-93253200

Richard.Beare@csiro.au



More information about the Gcc-help mailing list