[Bug fortran/31067] MINLOC should sometimes be inlined (gas_dyn is sooooo sloooow)
jakub at gcc dot gnu dot org
gcc-bugzilla@gcc.gnu.org
Mon Jul 27 11:03:00 GMT 2009
------- Comment #36 from jakub at gcc dot gnu dot org 2009-07-27 11:02 -------
Here is the loop in C and vectorized by hand as well:
#include <emmintrin.h>
/* Data scanned by foo and bar; zero-initialized as a file-scope object.  */
float arr[1024];

/* Scalar reference implementation.
   Returns the 1-based position of the first smallest element among
   arr[0] .. arr[end - 1].  If no element is strictly below __FLT_MAX__
   (which includes end == 0) the result is 1.  */
unsigned int
foo (unsigned int end)
{
  float best = __FLT_MAX__;
  unsigned int best_pos = 1;
  unsigned int idx = 0;

  while (idx < end)
    {
      float cur = arr[idx];

      /* Advance first so that idx is already the 1-based position.  */
      idx++;
      if (cur < best)
        {
          best = cur;
          best_pos = idx;
        }
    }
  return best_pos;
}
/* Per-lane select: yields A in lanes where MASK is all-ones and B in
   lanes where MASK is all-zeros.  */
static __m128
minloc_select (__m128 mask, __m128 a, __m128 b)
{
  return _mm_or_ps (_mm_and_ps (mask, a), _mm_andnot_ps (mask, b));
}

/* Merge candidate lanes (OTHER_LIMIT, OTHER_POS) into (*LIMIT, *POS).
   A candidate wins when its value is strictly smaller, or when the values
   are equal and its position is smaller.  The tie rule is what keeps the
   result equal to the first occurrence found by a scalar scan.
   Positions are integer bit patterns carried in __m128 and are compared
   as signed 32-bit integers, so they are assumed to stay below 2^31.  */
static void
minloc_merge (__m128 *limit, __m128 *pos, __m128 other_limit,
	      __m128 other_pos)
{
  __m128 less = _mm_cmplt_ps (other_limit, *limit);
  __m128 earlier
    = _mm_castsi128_ps (_mm_cmplt_epi32 (_mm_castps_si128 (other_pos),
					 _mm_castps_si128 (*pos)));
  __m128 tie = _mm_and_ps (_mm_cmpeq_ps (other_limit, *limit), earlier);

  *limit = minloc_select (less, other_limit, *limit);
  *pos = minloc_select (_mm_or_ps (less, tie), other_pos, *pos);
}

/* Hand-vectorized (SSE2) counterpart of foo.
   Returns the 1-based position of the first smallest element among
   arr[0] .. arr[end - 1], or 1 if no element is strictly below
   __FLT_MAX__.  Four lanes each track the minimum and its position for
   the elements they see; the lanes are then reduced to one candidate and
   the remaining elements are handled by a scalar loop.  */
unsigned int
bar (unsigned int end)
{
  /* pos holds integer positions as raw bits; only bitwise operations and
     moves are applied to it.  */
  __m128 pos = _mm_castsi128_ps (_mm_set1_epi32 (1));
  __m128 limit = _mm_set1_ps (__FLT_MAX__);
  /* Lane n starts at position n + 1 (lowest lane first).  */
  __m128i curi = _mm_set_epi32 (4, 3, 2, 1);
  const __m128i inc = _mm_set1_epi32 (4);
  unsigned int i = 0;

  if (end > 4)
    {
      for (; i < end - 4; i += 4)
	{
	  __m128 val = _mm_loadu_ps (arr + i);
	  __m128 mask = _mm_cmplt_ps (val, limit);

	  /* Update limit with the same mask as pos instead of _mm_min_ps:
	     min would hand back a NaN from val while the mask is false,
	     leaving limit and pos out of step.  */
	  limit = minloc_select (mask, val, limit);
	  pos = minloc_select (mask, _mm_castsi128_ps (curi), pos);
	  curi = _mm_add_epi32 (curi, inc);
	}

      /* Reduction: fold lanes 2,3 onto lanes 0,1, then lane 1 onto
	 lane 0.  Only lane 0 is meaningful afterwards.  */
      minloc_merge (&limit, &pos, _mm_movehl_ps (limit, limit),
		    _mm_movehl_ps (pos, pos));
      minloc_merge (&limit, &pos,
		    _mm_shuffle_ps (limit, limit, _MM_SHUFFLE (1, 1, 1, 1)),
		    _mm_shuffle_ps (pos, pos, _MM_SHUFFLE (1, 1, 1, 1)));
    }

  float limit_ = _mm_cvtss_f32 (limit);
  unsigned int pos_
    = (unsigned int) _mm_cvtsi128_si32 (_mm_castps_si128 (pos));

  /* Scalar tail.  Every position here is larger than any position seen
     by the vector loop, so a strict comparison keeps the first
     occurrence.  */
  for (; i < end; i++)
    if (arr[i] < limit_)
      {
	limit_ = arr[i];
	pos_ = i + 1;
      }
  return pos_;
}
/* Self-test: aborts if foo and bar disagree on the first 32 elements of
   arr, first for a small fixed pattern and then after each step of
   moving the minimum one element further along.  Returns 0 on success.  */
int
main (void)
{
  unsigned int k;

  arr[0] = -1;
  arr[2] = -3;
  arr[8] = -5;
  arr[9] = -6;
  if (foo (32) != bar (32))
    __builtin_abort ();
  for (k = 10; k < 32; k++)
    {
      /* Convert before negating: -k on an unsigned int wraps to a huge
	 positive value, which would never become the new minimum.  */
      arr[k] = -(float) k;
      if (foo (32) != bar (32))
	__builtin_abort ();
    }
  return 0;
}
I don't know how hard it would be to vectorize this in the vectorizer, but clearly
icc manages to handle it.
The loop is:
<bb 4>:
# pos_22 = PHI <pos_1(7), 1(3)>
# i_23 = PHI <i_15(7), 0(3)>
# limit_24 = PHI <limit_4(7), 3.4028234663852885981170418348451692544e+38(3)>
limit_11 = arr[i_23];
D.2700_12 = limit_11 < limit_24;
pos_1 = [cond_expr] D.2700_12 ? i_23 : pos_22;
limit_4 = [cond_expr] D.2700_12 ? limit_11 : limit_24;
i_15 = i_23 + 1;
D.2703_9 = (long unsigned int) i_15;
if (D.2703_9 < end_10(D))
goto <bb 7>;
else
goto <bb 8>;
<bb 7>:
goto <bb 4>;
before vectorization.
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=31067
More information about the Gcc-bugs
mailing list