This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
NON_OPTIMAL CODE GENERATED FOR SH4
- From: naveens at noida dot hcltech dot com
- To: gcc-gnats at gcc dot gnu dot org
- Cc: gcc at gcc dot gnu dot org, sanjivg at noida dot hcltech dot com
- Date: Fri, 11 Jan 2002 13:11:10 +0530
- Subject: NON_OPTIMAL CODE GENERATED FOR SH4
- Reply-to: naveens at noida dot hcltech dot com
>Submitter-Id: net
>Originator: Naveen Sharma
>Organization: HCLT
>Confidential: no
>Synopsis: NON_OPTIMAL CODE GENERATED FOR SH4
>Severity: serious
>Priority: medium
>Category:
>Class: pessimizes-code
>Release: 3.0.3
>Environment:
System: Linux 2.4.2-2 #1 Sun Apr 8 20:41:30 EDT 2001 i686 unknown
Architecture: i686
host: i686-pc-linux-gnu
build: i686-pc-linux-gnu
target: sh-unknown-linux-gnu
configured with: ../gcc/configure --target=sh-linux --prefix=/home/naveens/local --with-as=/home/gcc/gnu_env/tools/sh4/bin/sh-linux-as --with-ld=/home/gcc/gnu_env/tools/sh4/bin/sh-linux-ld --with-headers=/usr/sh4-linux/include --with-libs=/usr/sh4-linux/lib
>Description:
For the following piece of code:
void
loop_p (np, non0, coeff)
int np,non0;
double coeff[][2048];
{
int i, j, k;
double tmp1;
for (j = non0;j < np;j++)
for (k = 0;k < j;k++) {
coeff[j][j] -= tmp1 * coeff[j][k]; <-- HERE
}
}
The code generated for the innermost loop is:
.L9:
fmov.s @r2+,fr5
dt r3
fmov.s @r7+,fr3
fmov.s @r2+,fr4
fmov.s @r7,fr2 ! Note this
fmul dr6,dr4
add #-4,r7 <-- Subtract 4 ??
add #4,r7 <-- Add 4 ??
fsub dr4,dr2
fmov.s fr2,@r7 ! Note this
bf/s .L9
fmov.s fr3,@-r7
There are redundant instructions in the assembler output (marked with arrows).
These are generated during insn splitting after the reload pass (along with
the fmov.s insns marked "Note this").
They affect performance since they are in the innermost loop.
>How-To-Repeat:
Repeat with the code fragment given in the Description.
>Fix:
Investigation shows that the following pieces of code in sh.md cause the
problem. Could somebody comment on why they are written that way,
so that a fix can be worked out?
Line 2869 and after:
---------
.....
(define_split
[(set (match_operand:DF 0 "register_operand" "")
(match_operand:DF 1 "memory_operand" ""))
(use (match_operand:PSI 2 "fpscr_operand" "c"))
(clobber (match_scratch:SI 3 "X"))]
"TARGET_SH4 && ! TARGET_FMOVD && reload_completed
&& FP_OR_XD_REGISTER_P (true_regnum (operands[0]))"
[(const_int 0)]
"
{
int regno = true_regnum (operands[0]);
rtx addr, insn, adjust = NULL_RTX;
rtx mem2 = copy_rtx (operands[1]);
rtx reg0 = gen_rtx_REG (SFmode, regno + !! TARGET_LITTLE_ENDIAN);
rtx reg1 = gen_rtx_REG (SFmode, regno + ! TARGET_LITTLE_ENDIAN);
PUT_MODE (mem2, SFmode);
operands[1] = copy_rtx (mem2);
addr = XEXP (mem2, 0);
if (GET_CODE (addr) != POST_INC)
{
/* If we have to modify the stack pointer, the value that we have
read with post-increment might be modified by an interrupt,
so write it back. */
if (REGNO (addr) == STACK_POINTER_REGNUM)
adjust = gen_push_e (reg0);
else
adjust = gen_addsi3 (addr, addr, GEN_INT (-4)); <-- This is it !!
XEXP (mem2, 0) = addr = gen_rtx_POST_INC (SImode, addr);
}
.............
Line 2910 and after:
(define_split
[(set (match_operand:DF 0 "memory_operand" "")
(match_operand:DF 1 "register_operand" ""))
(use (match_operand:PSI 2 "fpscr_operand" "c"))
(clobber (match_scratch:SI 3 "X"))]
"TARGET_SH4 && ! TARGET_FMOVD && reload_completed
&& FP_OR_XD_REGISTER_P (true_regnum (operands[1]))"
[(const_int 0)]
"
{
int regno = true_regnum (operands[1]);
rtx insn, addr, adjust = NULL_RTX;
operands[0] = copy_rtx (operands[0]);
PUT_MODE (operands[0], SFmode);
insn = emit_insn (gen_movsf_ie (operands[0],
gen_rtx (REG, SFmode,
regno + ! TARGET_LITTLE_ENDIAN),
operands[2]));
operands[0] = copy_rtx (operands[0]);
addr = XEXP (operands[0], 0);
if (GET_CODE (addr) != PRE_DEC)
{
adjust = gen_addsi3 (addr, addr, GEN_INT (4)); <-- This is it.
emit_insn_before (adjust, insn);
XEXP (operands[0], 0) = addr = gen_rtx (PRE_DEC, SImode, addr);
}
addr = XEXP (addr, 0);
if (! adjust)
REG_NOTES (insn) = gen_rtx (EXPR_LIST, REG_INC, addr, NULL_RTX);
insn = emit_insn (gen_movsf_ie (operands[0],
gen_rtx (REG, SFmode,
regno + !! TARGET_LITTLE_ENDIAN),
operands[2]));
REG_NOTES (insn) = gen_rtx (EXPR_LIST, REG_INC, addr, NULL_RTX);
DONE;
}")
The complete assembler output is pasted below for reference.
.file "tmp1.c"
.little
.text
.align 5
.global loop_p
.type loop_p,@function
loop_p:
mov.l r8,@-r15
mov.l r9,@-r15
mov r4,r8
mov.l r14,@-r15
mov r6,r4
cmp/ge r8,r5
sts.l pr,@-r15
bt/s .L12
mov r15,r14
mov #11,r1
mov r5,r6
shld r1,r6
mov.w .L14,r1
mul.l r1,r5
mov r1,r9
mov.w .L15,r1
sts macl,r0
add r4,r0
.L5:
mov #0,r3
cmp/ge r5,r3
bt .L13
mov #3,r3
mov r6,r2
mov r0,r7
shld r3,r2
mov r5,r3
add r4,r2
.L9:
fmov.s @r2+,fr5
dt r3
fmov.s @r7+,fr3
fmov.s @r2+,fr4
fmov.s @r7,fr2
fmul dr6,dr4
add #-4,r7
add #4,r7
fsub dr4,dr2
fmov.s fr2,@r7
bf/s .L9
fmov.s fr3,@-r7
.L13:
add #1,r5
add r1,r6
cmp/ge r8,r5
bf/s .L5
add r9,r0
.L12:
mov r14,r15
lds.l @r15+,pr
mov.l @r15+,r14
mov.l @r15+,r9
rts
mov.l @r15+,r8
.align 1
.L14:
.short 16392
.L15:
.short 2048
.Lfe1:
.size loop_p,.Lfe1-loop_p
.ident "GCC: (GNU) 3.0.3"