[Bug fortran/31067] MINLOC should sometimes be inlined (gas_dyn is sooooo sloooow)

jakub at gcc dot gnu dot org gcc-bugzilla@gcc.gnu.org
Mon Jul 27 11:03:00 GMT 2009



------- Comment #36 from jakub at gcc dot gnu dot org  2009-07-27 11:02 -------
Here is the loop in C and vectorized by hand as well:
#include <emmintrin.h>

/* Data searched by the scalar and the hand-vectorized MINLOC loops.  */
float arr[1024];

/* Scalar reference implementation of a MINLOC-style search.

   Scans arr[0] .. arr[END - 1] and returns the 1-based position of the
   first element that is strictly smaller than every element before it
   (i.e. the first occurrence of the minimum).  Returns 1 when END is 0
   or when no element is below __FLT_MAX__.  */
unsigned int
foo (unsigned int end)
{
  float best = __FLT_MAX__;
  unsigned int best_pos = 1;
  unsigned int idx = 0;

  while (idx < end)
    {
      float cur = arr[idx++];

      /* Only a strictly smaller value replaces the current minimum, so
         the earliest occurrence wins on ties.  */
      if (!(cur < best))
        continue;

      best = cur;
      /* idx was already advanced, so it is the 1-based position.  */
      best_pos = idx;
    }
  return best_pos;
}

/* Lane-wise select: takes the bits of A where MASK is all-ones and the
   bits of B where MASK is all-zeros.  */
static inline __m128
minloc_blend (__m128 mask, __m128 a, __m128 b)
{
  return _mm_or_ps (_mm_and_ps (mask, a), _mm_andnot_ps (mask, b));
}

/* Lane-wise mask that is all-ones where the candidate (CLIMIT, CPOS)
   must replace the current (LIMIT, POS): either its value is strictly
   smaller, or the values are equal and the candidate was found at a
   smaller position.  The positions are integers stored bit-for-bit in
   float lanes; they are compared as signed 32-bit values, which is
   fine as long as positions stay below 2^31.  */
static inline __m128
minloc_better (__m128 climit, __m128 cpos, __m128 limit, __m128 pos)
{
  __m128 less = _mm_cmplt_ps (climit, limit);
  __m128 tie = _mm_cmpeq_ps (climit, limit);
  __m128 earlier
    = _mm_castsi128_ps (_mm_cmplt_epi32 (_mm_castps_si128 (cpos),
                                         _mm_castps_si128 (pos)));
  return _mm_or_ps (less, _mm_and_ps (tie, earlier));
}

/* Hand-vectorized (SSE2) counterpart of foo.

   Returns the 1-based position of the first minimum of
   arr[0] .. arr[END - 1], or 1 when END is 0 or no element is below
   __FLT_MAX__.  Full groups of four elements are handled with vector
   operations, the remaining 0-3 elements with a scalar loop.  */
unsigned int
bar (unsigned int end)
{
  /* Per-lane running minimum and its 1-based position.  Lane j only
     ever sees the elements whose index is j modulo 4.  */
  __m128 pos = _mm_castsi128_ps (_mm_set1_epi32 (1));
  __m128 limit = _mm_set1_ps (__FLT_MAX__);
  /* 1-based positions of the four elements about to be loaded
     (lane 0 holds 1).  */
  __m128i curi = _mm_set_epi32 (4, 3, 2, 1);
  const __m128i inc = _mm_set1_epi32 (4);
  unsigned int i = 0;
  if (end >= 4)
    {
      /* i never exceeds end here, so end - i cannot wrap around.  */
      for (; end - i >= 4; i += 4)
        {
          __m128 val = _mm_loadu_ps (arr + i);
          __m128 mask = _mm_cmplt_ps (val, limit);
          /* Update limit and pos with the same mask so the two can
             never disagree (e.g. for a NaN input, for which the
             comparison is false and nothing must change).  */
          limit = minloc_blend (mask, val, limit);
          pos = minloc_blend (mask, _mm_castsi128_ps (curi), pos);
          curi = _mm_add_epi32 (curi, inc);
        }
      /* Reduction.  First fold lanes 2 and 3 onto lanes 0 and 1.  Equal
         minima in different lanes may have been found in any order, so
         ties must be resolved in favour of the smaller position.  */
      __m128 tmp1 = _mm_movehl_ps (limit, limit);
      __m128 tmp2 = _mm_movehl_ps (pos, pos);
      __m128 mask = minloc_better (tmp1, tmp2, limit, pos);
      limit = minloc_blend (mask, tmp1, limit);
      pos = minloc_blend (mask, tmp2, pos);
      /* Then fold lane 1 onto lane 0; only lane 0 is read afterwards.  */
      tmp1 = _mm_shuffle_ps (limit, limit, _MM_SHUFFLE (1, 1, 1, 1));
      tmp2 = _mm_shuffle_ps (pos, pos, _MM_SHUFFLE (1, 1, 1, 1));
      mask = minloc_better (tmp1, tmp2, limit, pos);
      limit = minloc_blend (mask, tmp1, limit);
      pos = minloc_blend (mask, tmp2, pos);
    }
  float limit_ = _mm_cvtss_f32 (limit);
  unsigned int pos_
    = (unsigned int) _mm_cvtsi128_si32 (_mm_castps_si128 (pos));
  /* Scalar tail.  These positions are larger than any seen above, so a
     strict comparison keeps the earliest minimum.  */
  for (; i < end; i++)
    if (arr[i] < limit_)
      {
        limit_ = arr[i];
        pos_ = i + 1;
      }
  return pos_;
}

/* Self-check: foo and bar must return the same position for the first
   32 elements of arr, both for the initial data and while the minimum
   is moved one element further on each iteration.  Aborts on the first
   mismatch, returns 0 otherwise.  */
int
main (void)
{
  unsigned int k;
  arr[0] = -1;
  arr[2] = -3;
  arr[8] = -5;
  arr[9] = -6;
  if (foo (32) != bar (32))
    __builtin_abort ();
  for (k = 10; k < 32; k++)
    {
      /* Convert before negating: -k on an unsigned int wraps around to
         a huge positive value, so the minimum would never move.  */
      arr[k] = -(float) k;
      if (foo (32) != bar (32))
        __builtin_abort ();
    }
  return 0;
}

I don't know how hard it would be to vectorize this in the vectorizer, but
clearly icc manages to handle it.
The loop is:
<bb 4>:
  # pos_22 = PHI <pos_1(7), 1(3)>
  # i_23 = PHI <i_15(7), 0(3)>
  # limit_24 = PHI <limit_4(7), 3.4028234663852885981170418348451692544e+38(3)>
  limit_11 = arr[i_23];
  D.2700_12 = limit_11 < limit_24;
  pos_1 = [cond_expr] D.2700_12 ? i_23 : pos_22;
  limit_4 = [cond_expr] D.2700_12 ? limit_11 : limit_24;
  i_15 = i_23 + 1;
  D.2703_9 = (long unsigned int) i_15;
  if (D.2703_9 < end_10(D))
    goto <bb 7>;
  else
    goto <bb 8>;

<bb 7>:
  goto <bb 4>;
before vectorization.


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=31067



More information about the Gcc-bugs mailing list