This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

[Bug middle-end/39840] Non-optimal (or wrong) implementation of SSE intrinsics

From: "drepper at redhat dot com" <gcc-bugzilla at gcc dot gnu dot org>
To: gcc-bugs at gcc dot gnu dot org
Date: 21 Apr 2009 19:37:49 -0000
Subject: [Bug middle-end/39840] Non-optimal (or wrong) implementation of SSE intrinsics
References: <bug-39840-700@http.gcc.gnu.org/bugzilla/>
Reply-to: gcc-bugzilla at gcc dot gnu dot org


------- Comment #2 from drepper at redhat dot com  2009-04-21 19:37 -------
[I couldn't attach the code as an attachment, bugzilla has a bug.]

The program below has to be compiled with -mavx to allow the AVX intrinsics
to be used.  But this also triggers the use of the vmovss instruction to
load the parameter for the sinf() call from memory.

(Forget the reference to memset in the original report; simply passing
floating-point parameters is enough to trigger the problem.)

#include <math.h>
#include <stdio.h>
#include <immintrin.h>


/* Register values returned by the CPUID instruction.  They are filled in
   by main; has_avx reads ecx to test the feature bits.  */
static unsigned int eax, ebx, ecx, edx;


/* Report whether AVX can be used at run time.

   Relies on the file-scope `ecx' already holding the ECX output of
   CPUID leaf 1.  Returns 1 only when the OSXSAVE bit (27) is set, the
   low word returned by XGETBV with ECX = 0 has bits 1 and 2 both set,
   and the AVX bit (28) is set; returns 0 otherwise.  */
static int
has_avx (void)
{
  const unsigned int osxsave_mask = 1u << 27;
  const unsigned int avx_mask = 1u << 28;
  const unsigned int xcr0_needed = 6;

  /* XGETBV must not be executed unless OSXSAVE is reported.  */
  if (!(ecx & osxsave_mask))
    return 0;

  unsigned int xcr0_lo, xcr0_hi;
  asm ("xgetbv" : "=a" (xcr0_lo), "=d" (xcr0_hi) : "c" (0));

  /* Both state bits have to be enabled, and the CPU must advertise AVX.  */
  if ((xcr0_lo & xcr0_needed) == xcr0_needed && (ecx & avx_mask))
    return 1;

  return 0;
}


/* Fixed-size vector of N elements of type T.

   The anonymous union exposes the same storage in three ways:
     n  - N individual scalars,
     f  - an array of __v4sf (128-bit) vectors,
     fa - an array of __v8sf (256-bit) vectors.
   The lengths of f and fa are computed with integer division, so when N
   is not a multiple of the number of T elements per vector the vector
   views cover fewer elements than n does.  */
template <typename T, int N>
struct vec {
  union {
    T n[N];
    __v4sf f[N / (sizeof (__v4sf) / sizeof (T))];
    __v8sf fa[N / (sizeof (__v8sf) / sizeof (T))];
  };
};


/* Generic scalar (dot) product of two vectors.

   Multiplies the N elements of SRC1 and SRC2 pairwise and returns the
   sum of the products, accumulated in type T in index order.

   The elements are accessed through the scalar member `n' of the
   union: vec defines no operator[], so indexing the vec object itself
   does not compile once the template is instantiated.  */
template <typename T, int N>
T
optscalar(const vec<T,N> &src1, const vec<T,N> &src2)
{
  T r = 0;
  for (int i = 0; i < N; ++i)
    r += src1.n[i] * src2.n[i];
  return r;
}


/* Scalar (dot) product of two float vectors using SIMD intrinsics.

   When has_avx () reports AVX support the products are accumulated
   eight at a time in a 256-bit register, otherwise four at a time in a
   128-bit register.  The accumulator is then reduced to a single float.
   Elements that do not fill a whole SIMD vector (N not a multiple of
   the vector width) are added with a scalar loop so that no element is
   dropped.

   Returns the sum of src1.n[i] * src2.n[i] for 0 <= i < N, up to
   floating-point rounding caused by the changed summation order.  */
template <int N>
float
optscalar(const vec<float,N> &src1, const vec<float,N> &src2)
{
  float r;
  int done;     /* Number of leading elements handled by the SIMD loop.  */

  if (has_avx ())
    {
      __m256 acc = _mm256_setzero_ps ();
      for (int i = 0; i < N / 8; ++i)
        acc = _mm256_add_ps (acc, _mm256_mul_ps (src1.fa[i], src2.fa[i]));

      /* _mm256_hadd_ps adds only within each 128-bit lane, so repeated
         hadds never combine the two halves.  Fold the upper lane onto
         the lower one first, then reduce the four remaining floats.  */
      __m128 sum = _mm_add_ps (_mm256_castps256_ps128 (acc),
                               _mm256_extractf128_ps (acc, 1));
      sum = _mm_hadd_ps (sum, sum);
      sum = _mm_hadd_ps (sum, sum);
      r = _mm_cvtss_f32 (sum);
      done = N / 8 * 8;
    }
  else
    {
      __m128 acc = _mm_setzero_ps ();
      for (int i = 0; i < N / 4; ++i)
        acc = _mm_add_ps (acc, _mm_mul_ps (src1.f[i], src2.f[i]));

      /* Two horizontal adds leave the total in every element.  */
      acc = _mm_hadd_ps (acc, acc);
      acc = _mm_hadd_ps (acc, acc);
      r = _mm_cvtss_f32 (acc);
      done = N / 4 * 4;
    }

  /* Scalar tail for the elements past the last full SIMD vector.  */
  for (int i = done; i < N; ++i)
    r += src1.n[i] * src2.n[i];

  return r;
}


/* Number of elements in each test vector.  */
#define N 100000
/* For a given element TYPE, define two global vec<TYPE,N> operands named
   v<TYPE>1 and v<TYPE>2 and two scalars named <TYPE>res and <TYPE>cmp.  */
#define DEF(type) vec<type,N> v##type##1, v##type##2; type type##res, type##cmp
DEF(float);

/* Argument handed to sinf () in main.  It is never assigned explicitly,
   so it keeps its static zero initialisation.  */
float g;

/* Reproducer driver: prints sinf (g), then queries CPUID and prints the
   dot product of the two global float vectors.  Always returns 0.  */
int
main ()
{
  /* Floating-point call made before any CPU feature check; per the
     report, passing this parameter is what triggers the vmovss load.  */
  float f = sinf  (g);
  printf ("%g\n", f);

  /* Execute CPUID with EAX = 1 (the "0" constraint ties the input to the
     first output operand) and store the four result registers in the
     file-scope variables that has_avx inspects.  */
  asm volatile ("cpuid"
                : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
                : "0" (1));

  /* Note: this local shadows the file-scope floatres created by DEF.  */
  float floatres = optscalar (vfloat1, vfloat2);
  printf ("%g\n", floatres);

  return 0;
}


-- 

drepper at redhat dot com changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|WAITING                     |UNCONFIRMED


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39840

Follow-Ups:
- Re: [Bug middle-end/39840] Non-optimal (or wrong) implementation of SSE intrinsics
  - From: Andrew Thomas Pinski

References:
- [Bug middle-end/39840] New: Non-optimal (or wrong) implementation of SSE intrinsics
  - From: drepper at redhat dot com

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]