This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug target/49473] New: [arm] poor scheduling of loads
- From: "philb at gnu dot org" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Mon, 20 Jun 2011 11:43:40 +0000
- Subject: [Bug target/49473] New: [arm] poor scheduling of loads
- Auto-submitted: auto-generated
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=49473
Summary: [arm] poor scheduling of loads
Product: gcc
Version: 4.7.0
Status: UNCONFIRMED
Severity: minor
Priority: P3
Component: target
AssignedTo: unassigned@gcc.gnu.org
ReportedBy: philb@gnu.org
Target: arm-linux
The instruction scheduler doesn't seem to be doing a very good job of
accounting for the load delay slots on ARM1136JF-S. See, for example, the
attached testcase, compiled with:
$ ./cc1 -fPIC -O2 -mtune=arm1136jf-s -march=armv6 -mfpu=vfp -mfloat-abi=soft
which yields:
gst_mpegts_demux_sink_setcaps:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
stmfd sp!, {r4, r5, r6, r7, r8, lr}
sub sp, sp, #16
mov r7, r1
bl gst_object_get_parent(PLT)
mov r1, #0
ldr r4, .L7
.LPIC0:
add r4, pc, r4
mov r5, r0
mov r0, r7
bl gst_caps_get_structure(PLT)
ldr r3, .L7+4
ldr r6, [r4, r3]
ldr r3, [r6, #0]
cmp r3, #3
mov r8, r0
bls .L5
ldr r3, .L7+8
ldr r1, .L7+12
.LPIC2:
add r3, pc, r3
add r2, r3, #64
stmia sp, {r1, r5}
str r2, [sp, #8]
str r7, [sp, #12]
add r2, r3, #12
mov r0, #0
mov r1, #4
add r3, r3, #32
bl gst_debug_log(PLT)
.L5:
ldr r4, .L7+16
add r2, r5, #32768
.LPIC1:
add r4, pc, r4
mov r0, r8
mov r1, r4
add r2, r2, #172
bl gst_structure_get_int(PLT)
cmp r0, #0
bne .L3
ldr r3, [r6, #0]
cmp r3, #3
bls .L3
mov r2, #484
add r3, r4, #88
stmia sp, {r2, r5}
str r3, [sp, #8]
mov r1, #4
add r2, r4, #12
add r3, r4, #32
bl gst_debug_log(PLT)
.L3:
mov r0, r5
bl gst_object_unref(PLT)
mov r0, #1
add sp, sp, #16
ldmfd sp!, {r4, r5, r6, r7, r8, pc}
Note that:
- the add at .LPIC0 will stall for two cycles because the preceding load
(ldr r4, .L7) has a result latency of three. The two subsequent MOVs could
have been scheduled in these slots since they don't have any data dependency
on the ADD;
- the add at .LPIC1 will stall for one cycle for the same reason (only one
instruction separates it from the load of r4), and the following MOV
(mov r0, r8) could likewise have been scheduled ahead of the ADD to fill that
slot.
On this topic I noticed that arm1136jfs.md has:
;; An alu op can start sooner after a load, if that alu op does not
;; have an early register dependency on the load
(define_bypass 2 "11_load1"
"11_alu_op")
(define_bypass 2 "11_load1"
"11_alu_shift_op"
"arm_no_early_alu_shift_value_dep")
(define_bypass 2 "11_load1"
"11_alu_shift_reg_op"
"arm_no_early_alu_shift_dep")
... which seems a little strange, since the result latency of LDR is three, not
two, according to the documentation. The above bypasses look like they would be
correct for instructions where the dependency is a Late Reg, but that isn't the
case for alu_ops.