This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug rtl-optimization/21395] Performance degradation when building code that uses MMX intrinsics with gcc-4.0.0
- From: "pinskia at gcc dot gnu dot org" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: 5 May 2005 17:00:24 -0000
- Subject: [Bug rtl-optimization/21395] Performance degradation when building code that uses MMX intrinsics with gcc-4.0.0
- References: <20050505091136.21395.asuraparaju@gmail.com>
- Reply-to: gcc-bugzilla at gcc dot gnu dot org
------- Additional Comments From pinskia at gcc dot gnu dot org 2005-05-05 17:00 -------
Note with the following code, I get back to what it is without -mmx:
// Snippet (function body only; the enclosing signature and the declarations of
// yl, xl, pic_data and ref_data are not shown). It accumulates the absolute
// differences between the 16-bit samples of pic_data and ref_data using MMX
// intrinsics and returns the total as an int.

// Union used to read the two 32-bit lanes of the __m64 accumulator as plain
// ints (a.i[0], a.i[1]) instead of going through a pointer cast.
// NOTE(review): assumes int is 32 bits wide so that i[2] overlays j exactly.
union b
{
int i[2];
__m64 j;
}a;
// Accumulator holding two 32-bit partial sums, both initialised to zero.
__m64 sum = _mm_set_pi32(0, 0);
for (int j=0 ; j < yl ; j++)
{
// Row pointers for the current line of each image.
short *p = &pic_data[j][0];
short *r = &ref_data[j][0];
// Each iteration consumes four shorts (one __m64) from each row.
// NOTE(review): reads a full group of four even when fewer than four
// samples remain before xl, so this presumably expects xl to be a multiple
// of 4 and the rows to be suitable for __m64 loads; confirm with the caller.
for (int i=0 ; i < xl ; i+=4, p +=4, r+=4 )
{
__m64 pic = *(__m64 *)p;
__m64 ref = *(__m64 *)r;
// pic - ref (four packed 16-bit differences)
pic = _mm_sub_pi16 (pic, ref);
// abs (pic - ref): ref becomes a per-lane sign mask (all ones for negative
// lanes, zero otherwise); (x ^ mask) - mask negates only the negative lanes.
ref = _mm_srai_pi16(pic, 15);
pic = _mm_xor_si64(pic, ref);
pic = _mm_sub_pi16 (pic, ref);
// sum += abs(pic -ref)
// Widen the four 16-bit values to 32 bits before accumulating:
// ref is cleared, then the two high words of pic are interleaved with
// zero, which zero-extends them into two 32-bit lanes.
ref = _mm_xor_si64(ref, ref);
ref = _mm_unpackhi_pi16(pic, ref);
// The two low words are duplicated into both halves of each 32-bit lane and
// shifted right arithmetically by 16, which sign-extends them.
pic = _mm_unpacklo_pi16(pic, pic);
pic = _mm_srai_pi32 (pic, 16);
// Left disabled in the original; the high-half lanes in ref are already
// zero-extended 32-bit values at this point.
//ref = _mm_srai_pi32 (ref, 16);
// Add the widened low and high halves, then fold them into the accumulator.
pic = _mm_add_pi32 (pic, ref);
sum = _mm_add_pi32 (sum, pic);
}
}
// Store the accumulator into the union so its lanes can be read as ints.
a.j = sum;
// Earlier pointer-cast variant, kept for comparison with the union approach:
// int *result = (int *) &sum;
// Clear the MMX state (EMMS) before returning to ordinary code.
_mm_empty();
// return result[0] + result[1];
// Final result: the sum of the two 32-bit partial sums.
return a.i[0] + a.i[1];
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21395