[Bug rtl-optimization/22152] New: Poor loop optimization when using sse2 builtins - regression from 3.3
fjahanian at apple dot com
gcc-bugzilla@gcc.gnu.org
Wed Jun 22 20:06:00 GMT 2005
In the following trivial test case, gcc-4.1 produces very inefficient code for the loop. gcc-3.3 produces
much better code.
typedef int __m64 __attribute__ ((__vector_size__ (8)));
__m64 unsigned_add3( const __m64 *a, const __m64 *b, unsigned long count )
{
__m64 sum;
unsigned int i;
for( i = 1; i < count; i++ )
{
sum = (__m64) __builtin_ia32_paddq ((long long)a[i], (long long)b[i]);
}
return sum;
}
1) Loop when compiled with gcc-4.1 -O2 -msse2 (note in particular the extra movq to memory):
L4:
movl 12(%ebp), %esi
movq (%eax,%edx,8), %mm0
paddq (%esi,%edx,8), %mm0
incl %edx
cmpl %edx, %ecx
movq %mm0, -16(%ebp)
movl -16(%ebp), %esi
movl -12(%ebp), %edi
jne L4
2) Loop using gcc-3.3 compiled with -O2 -msse2:
L6:
movq (%esi,%edx,8), %mm0
paddq (%eax,%edx,8), %mm0
addl $1, %edx
cmpl %ecx, %edx
jb L6
AFAICT, the culprit is reload, which generates an extra load and store of %mm0:
(insn 62 30 63 2 (set (mem:V2SI (plus:SI (reg/f:SI 6 bp)
(const_int -16 [0xfffffffffffffff0])) [0 S8 A8])
(reg:V2SI 29 mm0)) 736 {*movv2si_internal} (nil)
(nil))
(insn 63 62 32 2 (set (reg/v:V2SI 4 si [orig:61 sum ] [61])
(mem:V2SI (plus:SI (reg/f:SI 6 bp)
(const_int -16 [0xfffffffffffffff0])) [0 S8 A8])) 736 {*movv2si_internal} (nil)
(nil))
Here is the larger test case from which above test was extracted:
#include <xmmintrin.h>
__m64 unsigned_add3( const __m64 *a, const __m64 *b, __m64 *result, unsigned long count )
{
__m64 carry, temp, sum, one, onesCarry, _a, _b;
unsigned int i;
if( count > 0 )
{
_a = a[0];
_b = b[0];
one = _mm_cmpeq_pi8( _a, _a ); //-1
one = _mm_sub_si64( _mm_xor_si64( one, one ), one ); //1
sum = _mm_add_si64( _a, _b );
onesCarry = _mm_and_si64( _a, _b ); //the 1's bit is set only if the 1's bit add generates a carry
onesCarry = _mm_and_si64( onesCarry, one ); //onesCarry &= 1
//Trim off the one's bit on both vA and vB to make room for a carry bit at the top after the add
_a = _mm_srli_si64( _a, 1 ); //vA >>= 1
_b = _mm_srli_si64( _b, 1 ); //vB >>= 1
//Add vA to vB and add the carry bit
carry = _mm_add_si64( _a, _b );
carry = _mm_add_si64( carry, onesCarry );
//right shift by 63 bits to get the carry bit for the high 64 bit quantity
carry = _mm_srli_si64( carry, 63 );
for( i = 1; i < count; i++ )
{
result[i-1] = sum;
_a = a[i];
_b = b[i];
onesCarry = _mm_and_si64( _a, _b );
onesCarry = _mm_and_si64( onesCarry, one );
sum = _mm_add_si64( _a, _b );
_a = _mm_add_si64( _a, onesCarry );
onesCarry = _mm_and_si64( carry, _a ); //find low bit carry
sum = _mm_add_si64( sum, carry ); //add in carry bit to low word sum
carry = _mm_add_si64( _a, onesCarry ); //add in low bit carry to high result
}
result[i-1] = sum;
}
return carry;
}
Again, gcc-3.3 produces much better code for this loop.
--
Summary: Poor loop optimization when using sse2 builtins -
regression from 3.3
Product: gcc
Version: 4.1.0
Status: UNCONFIRMED
Severity: normal
Priority: P2
Component: rtl-optimization
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: fjahanian at apple dot com
CC: gcc-bugs at gcc dot gnu dot org
GCC build triplet: apple-x86-darwin
GCC host triplet: apple-x86-darwin
GCC target triplet: apple-x86-darwin
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=22152
More information about the Gcc-bugs
mailing list