[Bug middle-end/42973] New: [4.5 regression] IRA appraently systematically making reload too busy on 2 address instructions with 3 operands
hubicka at gcc dot gnu dot org
gcc-bugzilla@gcc.gnu.org
Fri Feb 5 13:09:00 GMT 2010
The following testcase (derived from a pattern appearing in the inner loops of the NAMD
SPEC2k6 benchmark):
double a[1000];
double b[1000];
void t(void)
{
int i;
double carried = 0;
double inv=a[99];
for (i=0;i<1000;i++)
{
carried = inv-carried;
double tmp = carried *carried;
carried = a[i];
b[i]-= tmp;
}
}
It needs one FP move in the inner loop, but with -funroll-loops we get 16
moves for 8 unrollings:
.L2:
movapd %xmm0, %xmm3
subsd %xmm2, %xmm3
movapd %xmm3, %xmm15
movsd a(%rax), %xmm13
mulsd %xmm3, %xmm15
movsd b(%rax), %xmm14
subsd %xmm15, %xmm14
movsd %xmm14, b(%rax)
leaq 8(%rax), %r10
movapd %xmm0, %xmm12
subsd %xmm13, %xmm12
movapd %xmm12, %xmm11
movsd a(%r10), %xmm9
mulsd %xmm12, %xmm11
movsd b(%r10), %xmm10
subsd %xmm11, %xmm10
movsd %xmm10, b(%r10)
leaq 16(%rax), %r9
movapd %xmm0, %xmm8
subsd %xmm9, %xmm8
movapd %xmm8, %xmm7
The problem is this instruction:
(insn 83 75 84 3 /home/jh/q.c:10 (set (reg/v:DF 109 [ carried ])
(minus:DF (reg/v:DF 70 [ inv ])
(reg/v:DF 104 [ carried ]))) 726 {*fop_df_1_sse}
(expr_list:REG_DEAD (reg/v:DF 104 [ carried ])
(nil)))
This instruction always needs one move (inv is invariant and must be preserved
across the loop). IRA allocates both 109 and 104 to the same register. This creates
a situation where reload needs two move instructions instead of one:
(insn 156 76 84 3 /home/jh/q.c:10 (set (reg:DF 22 xmm1)
(reg/v:DF 21 xmm0 [orig:62 inv ] [62])) 103 {*movdf_integer_rex64}
(nil))
(insn 84 156 157 3 /home/jh/q.c:10 (set (reg:DF 22 xmm1)
(minus:DF (reg:DF 22 xmm1)
(reg/v:DF 23 xmm2 [orig:89 carried.37 ] [89]))) 733 {*fop_df_1_sse}
(nil))
(insn 157 84 85 3 /home/jh/q.c:10 (set (reg:DF 23 xmm2 [96])
(reg:DF 22 xmm1)) 103 {*movdf_integer_rex64} (nil))
This causes a significant slowdown in the NAMD benchmark.
I looked at why regmove does not convert the instruction into 2-address form. It is
because:
if (! (src_note = find_reg_note (insn, REG_DEAD, src)))
{
/* We used to force the copy here like in other cases, but
it produces worse code, as it eliminates no copy
instructions and the copy emitted will be produced by
reload anyway. On patterns with multiple alternatives,
there may be better solution available.
In particular this change produced slower code for numeric
i387 programs. */
continue;
}
Following patch:
Index: regmove.c
===================================================================
--- regmove.c (revision 156173)
+++ regmove.c (working copy)
@@ -1037,6 +1037,11 @@ regmove_backward_pass (void)
In particular this change produced slower code for numeric
i387 programs. */
+ if (!copy_src)
+ {
+ copy_src = src;
+ copy_dst = dst;
+ }
continue;
}
reverts the ancient change and inserts the move. This leads to PR42961;
with a workaround we get:
.L2:
movapd %xmm0, %xmm8
subsd %xmm3, %xmm8
movsd a(%rax), %xmm6
mulsd %xmm8, %xmm8
movsd b(%rax), %xmm7
subsd %xmm8, %xmm7
movsd %xmm7, b(%rax)
leaq 8(%rax), %r10
movapd %xmm0, %xmm5
subsd %xmm6, %xmm5
movsd a(%r10), %xmm3
mulsd %xmm5, %xmm5
movsd b(%r10), %xmm4
subsd %xmm5, %xmm4
movsd %xmm4, b(%r10)
leaq 16(%rax), %r9
movapd %xmm0, %xmm1
subsd %xmm3, %xmm1
movsd a(%r9), %xmm15
mulsd %xmm1, %xmm1
movsd b(%r9), %xmm2
subsd %xmm1, %xmm2
movsd %xmm2, b(%r9)
leaq 24(%rax), %r8
i.e. problem is gone and we also get for % speedup at NAMD itself.
Martin pointed out that http://gcc.gnu.org/ml/gcc/2010-02/msg00055.html
seems related. I also see shuffle copies here:
cp0:a0(r60)<->a35(r68)@15:shuffle
cp1:a32(r78)<->a33(r74)@15:shuffle
cp2:a28(r84)<->a29(r77)@15:shuffle
cp3:a24(r90)<->a25(r83)@15:shuffle
cp4:a20(r96)<->a21(r89)@15:shuffle
cp5:a16(r102)<->a17(r95)@15:shuffle
cp6:a12(r108)<->a13(r101)@15:shuffle
cp7:a8(r113)<->a9(r107)@15:shuffle
So it seems that IRA produces this pattern by design. Instead of the regmove
patch above, can't we just make IRA record a conflict between the destination
and the second operand of a 2-address instruction (they can't sit in the same
register anyway) rather than recording the copy?
Honza
--
Summary: [4.5 regression] IRA appraently systematically making
reload too busy on 2 address instructions with 3
operands
Product: gcc
Version: 4.5.0
Status: UNCONFIRMED
Keywords: missed-optimization, ra
Severity: normal
Priority: P3
Component: middle-end
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: hubicka at gcc dot gnu dot org
GCC target triplet: x86_64-linux
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42973
More information about the Gcc-bugs
mailing list