This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Unnecessary PRE optimization


Hello,
I have encountered an issue with the PRE optimization: it produces worse
code than the same compilation with PRE disabled.

This is the test function:

void foo(int *data, int *m_v4w, int num)
{
  int i;
  int m0;
  for( i=0; i<num; i++ )
  {
    int *data1 = (data[i] - 2);
    int *data2 =  data[i];
    int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int d0, d1, d2, d3;

    d0 = data1[0];
    d1 = data2[0];
    d2 = data1[1];
    d3 = data2[1];

    tmp0 = m_v4w[0];
    tmp1 = m_v4w[2];
    tmp2 = m_v4w[4];
    tmp3 = m_v4w[6];
    tmp4 = m_v4w[8];
    tmp5 = m_v4w[10];
    tmp6 = m_v4w[12];
    tmp7 = m_v4w[14];

    m0 = tmp0 * d0;
    m0 += tmp1 * d0;
    m0 += tmp2 * d1;
    m0 += tmp3 * d1;
    m0 += tmp4 * d2;
    m0 += tmp5 * d2;
    m0 += tmp6 * d3;
    m0 += tmp7 * d3;

    data2[0] =  m0;
  }             
}   

The following is the code generated for our processor (it should be easy to
understand). The PRE pass moves the address expressions (reg + constant offset)
out of the loop. This unnecessarily increases both code size and cycle count,
since our architecture supports a reg + constant offset addressing mode. It is
even worse in bigger loops: the extra live registers cause spills in register
allocation, and performance suffers further. This happens on both the 4.4 and
TRUNK versions. I also tested on x86 and see the same behavior.

gcc_compiled.:
	.section .text, "axU"
	.align 8
	.global	foo
	.type	foo,@function
foo:
	cmplew p0, r2, zr
	sbpt p0.0, [link]
	addw r9, r1, #0x08	:	addw r8, r1, #0x10
	addw r7, r1, #0x18	:	addw r6, r1, #0x20
	addw r5, r1, #0x28	:	addw r4, r1, #0x30
	addw r3, r1, #0x38
.L3:
	loop  r2,.L6
	ldw r10, [r0], #4!	:	ldw r11, [r1]
	ldw r14, [r9]	:	ldw r22, [r7]
	ldw r15, [r8]	:	ldw r21, [r5]
	ldw r20, [r3]	:	ldw r13, [r6]
	ldw r12, [r4]
	ldw r19, [r10]
	ldw r18, [r10, #-2]	:	ldw r17, [r10, #2]
	ldw r16, [r10, #4]	:	addw r14, r14, r11
	addw r15, r22, r15	:	addw r13, r21, r13
	addw r12, r20, r12
	mulw r15, r15, r19
	mulw r14, r14, r18	:	mulw r13, r13, r17
	mulw r12, r12, r16
	addw r11, r15, r14
	addw r11, r11, r13
	addw r11, r11, r12
.L6:
	stw r11, [r10]
.L7:
	sbl [link]
	.size	foo,.-foo


The following is the assembly code generated with the -fno-tree-pre option:
foo:
	cmplew p0, r2, zr
	sbpt p0.0, [link]
.L3:
	loop  r2,.L6
	ldw r3, [r0], #4!	:	ldw r4, [r1]
	ldw r7, [r1, #8]	:	ldw r15, [r1, #24]
	ldw r8, [r1, #16]	:	ldw r14, [r1, #40]
	ldw r13, [r1, #56]	:	ldw r6, [r1, #32]
	ldw r5, [r1, #48]
	ldw r12, [r3]
	ldw r11, [r3, #-2]	:	ldw r10, [r3, #2]
	ldw r9, [r3, #4]	:	addw r7, r7, r4
	addw r8, r15, r8	:	addw r6, r14, r6
	addw r5, r13, r5
	mulw r8, r8, r12
	mulw r7, r7, r11	:	mulw r6, r6, r10
	mulw r5, r5, r9
	addw r4, r8, r7
	addw r4, r4, r6
	addw r4, r4, r5
.L6:
	stw r4, [r3]
.L7:
	sbl [link]
	.size	foo,.-foo




Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]