This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[Bug middle-end/39840] Non-optimal (or wrong) implementation of SSE intrinsics



------- Comment #2 from drepper at redhat dot com  2009-04-21 19:37 -------
[I couldn't attach the code as an attachment, bugzilla has a bug.]

The program below has to be compiled with -mavx to allow the AVX intrinsics
being used.  But this also triggers using the use of the vmovss instruction to
load the parameter for the sin() call from memory.

(Forget the reference to memset in the original report, it's as simple as
passing floating point parameters that triggers the problem.)

#include <math.h>
#include <stdio.h>
#include <immintrin.h>


static unsigned int eax, ebx, ecx, edx;


static int
has_avx (void)
{
  if ((ecx & (1 << 27)) == 0)
    /* No OSXSAVE.  */
    return 0;

  unsigned int feat_eax, feat_edx;
  asm ("xgetbv" : "=a" (feat_eax), "=d" (feat_edx) : "c" (0));
  if ((feat_eax & 6) != 6)
    return 0;

  return (ecx & (1 << 28)) != 0;
}


template <typename T, int N>
struct vec {
  union {
    T n[N];
    __v4sf f[N / (sizeof (__v4sf) / sizeof (T))];
    __v8sf fa[N / (sizeof (__v8sf) / sizeof (T))];
  };
};


template <typename T, int N>
T
optscalar(const vec<T,N> &src1, const vec<T,N> &src2)
{
  T r = 0;
  for (int i = 0; i < N; ++i)
    r += src1[i] * src2[i];
  return r;
}


template <int N>
float
optscalar(const vec<float,N> &src1, const vec<float,N> &src2)
{
  if (has_avx ())
    {
      __m256 tmp = _mm256_setzero_ps ();
      for (int i = 0; i < N / 8; ++i)
        tmp = _mm256_add_ps (tmp, _mm256_mul_ps (src1.fa[i], src2.fa[i]));
      tmp = _mm256_hadd_ps (tmp, tmp);
      tmp = _mm256_hadd_ps (tmp, tmp);
      tmp = _mm256_hadd_ps (tmp, tmp);
      union
      {
        __m256 v;
        float a[8];
      } cvt = { tmp };
      return cvt.a[0];
    }
  else
    {
      __m128 tmp = _mm_setzero_ps ();
      for (int i = 0; i < N / 4; ++i)
        tmp = _mm_add_ps (tmp, _mm_mul_ps (src1.f[i], src2.f[i]));
      tmp = _mm_hadd_ps (tmp, tmp);
      tmp = _mm_hadd_ps (tmp, tmp);
      return __builtin_ia32_vec_ext_v4sf (tmp, 0);
    }
}


#define N 100000
#define DEF(type) vec<type,N> v##type##1, v##type##2; type type##res, type##cmp
DEF(float);

float g;

int
main ()
{
  float f = sinf  (g);
  printf ("%g\n", f);

  asm volatile ("cpuid"
                : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
                : "0" (1));

  float floatres = optscalar (vfloat1, vfloat2);
  printf ("%g\n", floatres);

  return 0;
}


-- 

drepper at redhat dot com changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|WAITING                     |UNCONFIRMED


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39840


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]