This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug target/51509] New: Inefficient neon intrinsic code sequence
- From: "carrot at google dot com" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Mon, 12 Dec 2011 07:25:34 +0000
- Subject: [Bug target/51509] New: Inefficient neon intrinsic code sequence
- Auto-submitted: auto-generated
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=51509
Bug #: 51509
Summary: Inefficient neon intrinsic code sequence
Classification: Unclassified
Product: gcc
Version: 4.7.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
AssignedTo: unassigned@gcc.gnu.org
ReportedBy: carrot@google.com
Target: arm-linux-androideabi
Compile the following code with the options
-march=armv7-a -mfloat-abi=softfp -mfpu=neon -mthumb -O2 -Wall -fpic:
#include <arm_neon.h>
/* Reproducer for the inefficient NEON code sequence: load lane 0 of a
   4-vector structure from src, then store lane 0 of vectors 1 and 2 to dst
   as a 2-vector structure. */
void simple_vld_intrin(uint8_t *src, uint8_t *dst)
{
uint8x8x4_t x;
uint8x8x2_t y;
/* x is passed to vld4_lane_u8 without being initialized; the load only
   replaces lane 0 of each of its four vectors. */
x = vld4_lane_u8(src, x, 0);
/* Only lane 0 of each y vector is assigned; the remaining lanes of y are
   never written in this function. */
y.val[0][0] = x.val[1][0];
y.val[1][0] = x.val[2][0];
/* Store lane 0 of y.val[0] and y.val[1] to dst. */
vst2_lane_u8(dst, y, 0);
}
gcc 4.7 generates:
@ NOTE(review): full-line comments in this block are reviewer annotations,
@ not part of the compiler output.
@ .LC0 is eight zero words (32 bytes). The function below loads it into
@ d18-d21 with vldmia.
.LC0:
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
.word 0
@ NOTE(review): full-line comments added below are reviewer annotations; the
@ three comment lines directly after the label and the trailing "@ v8qi",
@ "@ ti" and "@ zero_extendqisi2" comments are GCC's own.
.text
.align 2
.global simple_vld_intrin
.thumb
.thumb_func
.type simple_vld_intrin, %function
simple_vld_intrin:
@ args = 0, pretend = 0, frame = 32
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
@ r2 = PC-relative offset of .LC0 (the literal stored at .L2).
ldr r2, .L2
@ Reserve the 32-byte frame that the vstmia below spills into.
sub sp, sp, #32
.LPIC0:
@ In Thumb state pc reads as .LPIC0+4, so r2 becomes the address of .LC0.
add r2, pc
@ Load the 32 zero bytes of .LC0 into d18-d21 (q9, q10).
vldmia r2, {d18-d21}
@ d19 and d20 are set to zero again even though the load already zeroed them.
vmov.i32 d19, #0 @ v8qi
vmov d20, d19 @ v8qi
@ Copy the zeroed q9/q10 into q11/q12 (d22-d25), the registers used for x.
vmov q11, q9 @ ti
vmov q12, q10 @ ti
@ Zero d16 and d17, the registers used for y.
vmov d16, d19 @ v8qi
vmov d17, d19 @ v8qi
@ x = vld4_lane_u8(src, x, 0): one byte from [r0] into lane 0 of each of d22-d25.
vld4.8 {d22[0], d23[0], d24[0], d25[0]}, [r0]
@ Spill all four x vectors to the stack frame.
vstmia sp, {d22-d25}
@ Reload x.val[1][0] from the spill: offset 8 is the first byte of d23
@ (little-endian layout).
ldrb r2, [sp, #8] @ zero_extendqisi2
@ y.val[0][0] = x.val[1][0]
vmov.8 d16[0], r2
@ x.val[2][0] is taken straight from d24 through r3, without using the spill.
vmov.u8 r3, d24[0]
@ y.val[1][0] = x.val[2][0]
vmov.8 d17[0], r3
@ vst2_lane_u8(dst, y, 0): store lane 0 of d16 and d17 to [r1].
vst2.8 {d16[0], d17[0]}, [r1]
@ Release the frame and return.
add sp, sp, #32
bx lr
.L3:
.align 2
.L2:
@ Offset from the pc value read at .LPIC0 (.LPIC0+4) to .LC0.
.word .LC0-(.LPIC0+4)
The ideal result would be:
@ NOTE(review): full-line comments in this block are reviewer annotations.
@ Load lane 0 of x.val[0..3] directly into d16-d19; no constant-pool load,
@ zeroing or stack spill.
vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r0]
@ Form y by copying x.val[1] and x.val[2] into the consecutive pair d20/d21.
@ NOTE(review): since d17/d18 are already consecutive, storing
@ {d17[0], d18[0]} directly would seem to make both copies unnecessary; confirm.
vmov d20, d17 @ v8qi
vmov d21, d18 @ v8qi
@ Store lane 0 of y.val[0] and y.val[1] to dst.
vst2.8 {d20[0], d21[0]}, [r1]
bx lr