This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
Unnecessary PRE optimization
- From: "Bingfeng Mei" <bmei at broadcom dot com>
- To: "gcc at gcc dot gnu dot org" <gcc at gcc dot gnu dot org>
- Cc: "dberlin at dberlin dot org" <dberlin at dberlin dot org>
- Date: Wed, 23 Dec 2009 03:49:55 -0800
- Subject: Unnecessary PRE optimization
Hello,
I encountered an issue with PRE optimization, which created worse
code than with no optimization.
This is the test function:
/*
 * Test case (from a GCC PRE bug report): an 8-tap weighted sum over
 * strided coefficients.
 *
 * data  : array of `num` buffer addresses stored as ints -- valid only
 *         where a pointer fits in an int (e.g. a 32-bit target).
 * m_v4w : coefficient array, read at even indices 0..14.
 * num   : iteration count; the loop body is skipped when num <= 0.
 *
 * The original posting relied on implicit int -> pointer conversion,
 * which is a C constraint violation; the casts below make the
 * conversions explicit without changing the computation.
 */
void foo(int *data, int *m_v4w, int num)
{
    int i;
    int m0;

    for (i = 0; i < num; i++)
    {
        /* The subtraction happens on the raw integer address (byte
           arithmetic), not as int* arithmetic: the generated code
           addresses data1[0]/data1[1] at byte offsets -2 and +2. */
        int *data1 = (int *)(data[i] - 2);
        int *data2 = (int *)data[i];
        int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
        int d0, d1, d2, d3;

        d0 = data1[0];
        d1 = data2[0];
        d2 = data1[1];
        d3 = data2[1];

        /* Eight strided coefficient loads; these are the reg+offset
           accesses whose addresses PRE hoists out of the loop. */
        tmp0 = m_v4w[0];
        tmp1 = m_v4w[2];
        tmp2 = m_v4w[4];
        tmp3 = m_v4w[6];
        tmp4 = m_v4w[8];
        tmp5 = m_v4w[10];
        tmp6 = m_v4w[12];
        tmp7 = m_v4w[14];

        /* m0 = (tmp0+tmp1)*d0 + (tmp2+tmp3)*d1
                + (tmp4+tmp5)*d2 + (tmp6+tmp7)*d3 */
        m0 = tmp0 * d0;
        m0 += tmp1 * d0;
        m0 += tmp2 * d1;
        m0 += tmp3 * d1;
        m0 += tmp4 * d2;
        m0 += tmp5 * d2;
        m0 += tmp6 * d3;
        m0 += tmp7 * d3;

        data2[0] = m0;
    }
}
The following is the code generated for our processor (easy to understand).
The PRE pass moves the address expressions (reg + constant offset) out of the loop.
This increases both code size and cycle count unnecessarily, since our architecture
supports the reg + constant offset addressing mode. It is even worse in bigger loops:
the extra register pressure causes spills in register allocation and performance
suffers further. This happens on both the 4.4 and trunk versions. I also tested
on x86, which shows the same behavior.
; -- Listing 1: output with PRE enabled (quoted from the bug report) --
; Register roles, deduced from the C source above: r0 = data,
; r1 = m_v4w, r2 = num.  The "insn : insn" pairs appear to be
; parallel-issue slots of this VLIW target.
; NOTE(review): ';' is assumed to be this assembler's comment
; character -- the listing is quoted email text and is not built here.
gcc_compiled.:
.section .text, "axU"
.align 8
.global foo
.type foo,@function
foo:
cmplew p0, r2, zr
sbpt p0.0, [link]                         ; skip everything when num <= 0 (presumably)
; PRE hoisted &m_v4w[2],[4],[6],[8],[10],[12],[14] (byte offsets
; 0x08..0x38) into r9..r3 before the loop: seven extra live registers.
addw r9, r1, #0x08 : addw r8, r1, #0x10
addw r7, r1, #0x18 : addw r6, r1, #0x20
addw r5, r1, #0x28 : addw r4, r1, #0x30
addw r3, r1, #0x38
.L3:
loop r2,.L6
ldw r10, [r0], #4! : ldw r11, [r1]        ; r10 = data[i] (r0 post-inc); r11 = tmp0
ldw r14, [r9] : ldw r22, [r7]             ; tmp1, tmp3 -- via hoisted pointers
ldw r15, [r8] : ldw r21, [r5]             ; tmp2, tmp5
ldw r20, [r3] : ldw r13, [r6]             ; tmp7, tmp4
ldw r12, [r4]                             ; tmp6
ldw r19, [r10]                            ; d1 = data2[0]
ldw r18, [r10, #-2] : ldw r17, [r10, #2]  ; d0 = data1[0], d2 = data1[1]
ldw r16, [r10, #4] : addw r14, r14, r11   ; d3 = data2[1]; r14 = tmp0+tmp1
addw r15, r22, r15 : addw r13, r21, r13   ; tmp2+tmp3, tmp4+tmp5
addw r12, r20, r12                        ; tmp6+tmp7
mulw r15, r15, r19                        ; (tmp2+tmp3)*d1
mulw r14, r14, r18 : mulw r13, r13, r17   ; (tmp0+tmp1)*d0, (tmp4+tmp5)*d2
mulw r12, r12, r16                        ; (tmp6+tmp7)*d3
addw r11, r15, r14
addw r11, r11, r13
addw r11, r11, r12                        ; r11 = m0 (sum of the four products)
.L6:
stw r11, [r10]                            ; data2[0] = m0
.L7:
sbl [link]                                ; return
.size foo,.-foo
The following is the assembly code generated with the -fno-tree-pre option.
; -- Listing 2: output with -fno-tree-pre (quoted from the bug report) --
; Register roles, deduced from the C source: r0 = data, r1 = m_v4w,
; r2 = num.  The coefficient loads keep their reg+offset form
; ([r1, #8] etc. = m_v4w[2..14]), so there is no pre-loop address
; setup and far fewer registers stay live.
; NOTE(review): ';' is assumed to be this assembler's comment
; character -- the listing is quoted email text and is not built here.
foo:
cmplew p0, r2, zr
sbpt p0.0, [link]                         ; skip the loop when num <= 0 (presumably)
.L3:
loop r2,.L6
ldw r3, [r0], #4! : ldw r4, [r1]          ; r3 = data[i] (r0 post-inc); r4 = tmp0
ldw r7, [r1, #8] : ldw r15, [r1, #24]     ; tmp1 = m_v4w[2], tmp3 = m_v4w[6]
ldw r8, [r1, #16] : ldw r14, [r1, #40]    ; tmp2 = m_v4w[4], tmp5 = m_v4w[10]
ldw r13, [r1, #56] : ldw r6, [r1, #32]    ; tmp7 = m_v4w[14], tmp4 = m_v4w[8]
ldw r5, [r1, #48]                         ; tmp6 = m_v4w[12]
ldw r12, [r3]                             ; d1 = data2[0]
ldw r11, [r3, #-2] : ldw r10, [r3, #2]    ; d0 = data1[0], d2 = data1[1]
ldw r9, [r3, #4] : addw r7, r7, r4        ; d3 = data2[1]; r7 = tmp0+tmp1
addw r8, r15, r8 : addw r6, r14, r6       ; tmp2+tmp3, tmp4+tmp5
addw r5, r13, r5                          ; tmp6+tmp7
mulw r8, r8, r12                          ; (tmp2+tmp3)*d1
mulw r7, r7, r11 : mulw r6, r6, r10       ; (tmp0+tmp1)*d0, (tmp4+tmp5)*d2
mulw r5, r5, r9                           ; (tmp6+tmp7)*d3
addw r4, r8, r7
addw r4, r4, r6
addw r4, r4, r5                           ; r4 = m0
.L6:
stw r4, [r3]                              ; data2[0] = m0
.L7:
sbl [link]                                ; return
.size foo,.-foo