This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[Bug rtl-optimization/21395] Performance degradation when building code that uses MMX intrinsics with gcc-4.0.0


------- Additional Comments From pinskia at gcc dot gnu dot org  2005-05-05 17:00 -------
Note that with the following code, I get back to what it is without -mmmx:
// Overlays an __m64 with an array of two ints, so a value stored through
// `j` can be read back element-wise through `i` without casting a pointer
// to the __m64 object.
// NOTE(review): assumes int is 32 bits, so that i[0] and i[1] together
// cover the whole __m64 -- confirm for the target.
union b
{
  int i[2];   // the two integer lanes, read back individually
  __m64 j;    // the packed MMX value that gets stored
}a;           // single instance `a`, defined together with the union type
    __m64 sum = _mm_set_pi32(0, 0);
    
        for (int j=0 ; j < yl ; j++)
        {
                short *p = &pic_data[j][0];
                short *r = &ref_data[j][0];
                
                for (int i=0 ; i < xl ; i+=4, p +=4, r+=4 )
        {       
           __m64 pic = *(__m64 *)p;
           __m64 ref = *(__m64 *)r;
            // pic - ref
            pic = _mm_sub_pi16 (pic, ref);
            // abs (pic - ref)
            ref = _mm_srai_pi16(pic, 15);
            pic = _mm_xor_si64(pic, ref);
            pic = _mm_sub_pi16 (pic, ref);
            // sum += abs(pic -ref)
            ref = _mm_xor_si64(ref, ref);
            ref = _mm_unpackhi_pi16(pic, ref);
            pic = _mm_unpacklo_pi16(pic, pic);
            pic = _mm_srai_pi32 (pic, 16);
            //ref = _mm_srai_pi32 (ref, 16);
            pic = _mm_add_pi32 (pic, ref);
            sum = _mm_add_pi32 (sum, pic);
        }
    }
    a.j = sum;
   // int *result = (int *) &sum;
    _mm_empty();

   // return result[0] + result[1];
   return a.i[0] + a.i[1];

-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21395


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]