This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug middle-end/39840] Non-optimal (or wrong) implementation of SSE intrinsics
- From: "pinskia at gmail dot com" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: 21 Apr 2009 19:41:55 -0000
- Subject: [Bug middle-end/39840] Non-optimal (or wrong) implementation of SSE intrinsics
- References: <bug-39840-700@http.gcc.gnu.org/bugzilla/>
- Reply-to: gcc-bugzilla at gcc dot gnu dot org
------- Comment #3 from pinskia at gmail dot com 2009-04-21 19:41 -------
Subject: Re: Non-optimal (or wrong) implementation of SSE intrinsics
GCC 4.4 and above support different target options at the function
level, but not at the basic-block level. So you can create an internal
version of the function for AVX.
Sent from my iPhone
On Apr 21, 2009, at 12:37 PM, "drepper at redhat dot com"
<gcc-bugzilla@gcc.gnu.org> wrote:
>
>
> ------- Comment #2 from drepper at redhat dot com 2009-04-21 19:37
> -------
> [I couldn't attach the code as an attachment, bugzilla has a bug.]
>
> The program below has to be compiled with -mavx to allow the AVX
> intrinsics to be used. But this also triggers the use of the vmovss
> instruction to load the parameter for the sin() call from memory.
>
> (Forget the reference to memset in the original report, it's as
> simple as
> passing floating point parameters that triggers the problem.)
>
> #include <math.h>
> #include <stdio.h>
> #include <immintrin.h>
>
>
> static unsigned int eax, ebx, ecx, edx;
>
>
> static int
> has_avx (void)
> {
> if ((ecx & (1 << 27)) == 0)
> /* No OSXSAVE. */
> return 0;
>
> unsigned int feat_eax, feat_edx;
> asm ("xgetbv" : "=a" (feat_eax), "=d" (feat_edx) : "c" (0));
> if ((feat_eax & 6) != 6)
> return 0;
>
> return (ecx & (1 << 28)) != 0;
> }
>
>
> template <typename T, int N>
> struct vec {
> union {
> T n[N];
> __v4sf f[N / (sizeof (__v4sf) / sizeof (T))];
> __v8sf fa[N / (sizeof (__v8sf) / sizeof (T))];
> };
> };
>
>
> template <typename T, int N>
> T
> optscalar(const vec<T,N> &src1, const vec<T,N> &src2)
> {
> T r = 0;
> for (int i = 0; i < N; ++i)
> r += src1[i] * src2[i];
> return r;
> }
>
>
> template <int N>
> float
> optscalar(const vec<float,N> &src1, const vec<float,N> &src2)
> {
> if (has_avx ())
> {
> __m256 tmp = _mm256_setzero_ps ();
> for (int i = 0; i < N / 8; ++i)
> tmp = _mm256_add_ps (tmp, _mm256_mul_ps (src1.fa[i],
> src2.fa[i]));
> tmp = _mm256_hadd_ps (tmp, tmp);
> tmp = _mm256_hadd_ps (tmp, tmp);
> tmp = _mm256_hadd_ps (tmp, tmp);
> union
> {
> __m256 v;
> float a[8];
> } cvt = { tmp };
> return cvt.a[0];
> }
> else
> {
> __m128 tmp = _mm_setzero_ps ();
> for (int i = 0; i < N / 4; ++i)
> tmp = _mm_add_ps (tmp, _mm_mul_ps (src1.f[i], src2.f[i]));
> tmp = _mm_hadd_ps (tmp, tmp);
> tmp = _mm_hadd_ps (tmp, tmp);
> return __builtin_ia32_vec_ext_v4sf (tmp, 0);
> }
> }
>
>
> #define N 100000
> #define DEF(type) vec<type,N> v##type##1, v##type##2; type
> type##res, type##cmp
> DEF(float);
>
> float g;
>
> int
> main ()
> {
> float f = sinf (g);
> printf ("%g\n", f);
>
> asm volatile ("cpuid"
> : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
> : "0" (1));
>
> float floatres = optscalar (vfloat1, vfloat2);
> printf ("%g\n", floatres);
>
> return 0;
> }
>
>
> --
>
> drepper at redhat dot com changed:
>
> What |Removed |Added
> ---
> ---
> ----------------------------------------------------------------------
> Status|WAITING |UNCONFIRMED
>
>
> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39840
>
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39840