Bug 43118 - vld4 and vst4 intrinsics are not handled correctly
Summary: vld4 and vst4 intrinsics are not handled correctly
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: target (show other bugs)
Version: 4.4.1
Importance: P3 enhancement
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks: 47562
  Show dependency tree / graph
 
Reported: 2010-02-19 10:51 UTC by Samuel Rødal
Modified: 2011-07-19 08:41 UTC (History)
6 users (show)

See Also:
Host: arm-none-linux-gnueabi-gcc
Target: arm-none-linux-gnueabi-gcc
Build: arm-none-linux-gnueabi-gcc
Known to work:
Known to fail: 4.4.3, 4.5.0
Last reconfirmed: 2010-02-19 13:45:57


Attachments

Note: You need to log in before you can comment on or make changes to this bug.
Description Samuel Rødal 2010-02-19 10:51:57 UTC
The vldX and vstX variations of the NEON intrinsics, where X > 1, seem to cause the compiler to generate an obscene amount of code.

Example:

void blend1(uint8_t *src, uint8_t *dst)
{
        uint8x8_t temp = vld1_u8(src);
        vst1_u8(dst, temp);
}

generates the sensible

        vld1.8  {d16}, [r0]
        vst1.8  {d16}, [r1]
        bx      lr

Whereas:

void blend4(uint8_t *src, uint8_t *dst)
{
        uint8x8x4_t temp = vld4_u8(src);
        vst4_u8(dst, temp);
}

generates

        stmfd   sp!, {r4, r5, r6}
        .save {r4, r5, r6}
.LCFI4:
        .pad #132
        sub     sp, sp, #132
.LCFI5:
        vld4.8  {d16-d19}, [r0]
        add     r6, sp, #64
        vstmia  r6, {d16-d19}
        mov     r5, r1
        ldmia   r6!, {r0, r1, r2, r3}
        add     ip, sp, #96
        mov     r4, ip
        stmia   r4!, {r0, r1, r2, r3}
        ldmia   r6, {r0, r1, r2, r3}
        stmia   r4, {r0, r1, r2, r3}
        ldmia   ip!, {r0, r1, r2, r3}
        add     ip, sp, #32
        mov     r6, ip
        stmia   r6!, {r0, r1, r2, r3}
        ldmia   r4, {r0, r1, r2, r3}
        stmia   r6, {r0, r1, r2, r3}
        ldmia   ip!, {r0, r1, r2, r3}
        mov     r4, sp
        stmia   r4!, {r0, r1, r2, r3}
        ldmia   r6, {r0, r1, r2, r3}
        stmia   r4, {r0, r1, r2, r3}
        vldmia  sp, {d16-d19}
        vst4.8  {d16-d19}, [r5]
        add     sp, sp, #132
        ldmfd   sp!, {r4, r5, r6}
        bx      lr

Compile flags used were "-mfloat-abi=softfp -mfpu=neon -O3".
Comment 1 Richard Biener 2010-02-19 11:08:17 UTC
Likely because of the union in

__extension__ static __inline void __attribute__ ((__always_inline__))
vst4_u8 (uint8_t * __a, uint8x8x4_t __b)
{
  union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
  __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o);
}

which does copy-initialization of __bu.  Also try GCC 4.5.
Comment 2 Ramana Radhakrishnan 2010-02-19 13:45:57 UTC
Trunk behaves similarly - I wonder if this is similar to 41021.

Here's what trunk generates. 

        push    {r4, r5, r6, r7}
        vld4.8  {d16-d19}, [r0]
        sub     sp, sp, #96
        mov     r7, r1
        vstmia  sp, {d16-d19}
        mov     r6, sp
        add     r5, sp, #64
        add     ip, sp, #32
        ldmia   r6!, {r0, r1, r2, r3}
        mov     r4, r5
        stmia   r5!, {r0, r1, r2, r3}
        ldmia   r6, {r0, r1, r2, r3}
        stmia   r5, {r0, r1, r2, r3}
        ldmia   r4!, {r0, r1, r2, r3}
        stmia   ip!, {r0, r1, r2, r3}
        ldmia   r4, {r0, r1, r2, r3}
        stmia   ip, {r0, r1, r2, r3}
        add     r3, sp, #32
        vldmia  r3, {d16-d19}
        vst4.8  {d16-d19}, [r7]
        add     sp, sp, #96
        pop     {r4, r5, r6, r7}
        bx      lr
Comment 3 Daniel Jacobowitz 2010-02-22 21:14:23 UTC
Subject: Re:  vld4 and vst4 intrinsics are not handled
 correctly

On Fri, Feb 19, 2010 at 11:08:18AM -0000, rguenth at gcc dot gnu dot org wrote:
> Likely because of the union in
> 
> __extension__ static __inline void __attribute__ ((__always_inline__))
> vst4_u8 (uint8_t * __a, uint8x8x4_t __b)
> {
>   union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
>   __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o);
> }
> 
> which does copy-initialization of __bu.

Right.  FYI, my best idea to date of how to fix this is to convert the
multiple-vector types (like uint8x8x4_t) to builtin types.  At that
point we can use the neon_reinterpret patterns to do the necessary
type punning without involving __builtin_neon_oi and the union.

Comment 4 Richard Biener 2010-02-23 10:42:23 UTC
(In reply to comment #3)
> Subject: Re:  vld4 and vst4 intrinsics are not handled
>  correctly
> 
> On Fri, Feb 19, 2010 at 11:08:18AM -0000, rguenth at gcc dot gnu dot org wrote:
> > Likely because of the union in
> > 
> > __extension__ static __inline void __attribute__ ((__always_inline__))
> > vst4_u8 (uint8_t * __a, uint8x8x4_t __b)
> > {
> >   union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
> >   __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o);
> > }
> > 
> > which does copy-initialization of __bu.
> 
> Right.  FYI, my best idea to date of how to fix this is to convert the
> multiple-vector types (like uint8x8x4_t) to builtin types.  At that
> point we can use the neon_reinterpret patterns to do the necessary
> type punning without involving __builtin_neon_oi and the union.

Ideally we'd be able to get rid of the extra temporary at the tree level.
Value-numbering can in theory do that, but I suppose the testcase at
hand is obfuscated enough to not do it.
Comment 5 Justin Lebar 2010-04-28 21:56:27 UTC
Is there a workaround for this, short of writing inline assembly?
Comment 6 ruZZ il 2010-09-15 20:54:13 UTC
this bug is bugging me too..
Comment 7 Ramana Radhakrishnan 2011-07-08 11:57:04 UTC
A recent version of 4.6.1 at -O1 appears to give me the output below. That would indicate this is fixed in trunk.

blend4:
	@ args = 0, pretend = 0, frame = 0
	@ frame_needed = 0, uses_anonymous_args = 0
	@ link register save eliminated.
	vld4.8	{d16-d19}, [r0]
	vst4.8	{d16-d19}, [r1]
	bx	lr
	.size	blend4, .-blend4
	.ident	"GCC: (GNU) 4.7.0 20110616 (experimental)"
	.section	.note.GNU-stack,"",%progbits


Ramana
Comment 8 Richard Sandiford 2011-07-19 08:41:04 UTC
(In reply to comment #7)
> A recent version of 4.6.1 at O1 appears to give me . That would indicate this
> is fixed in trunk.

Yeah, the bug was fixed as part of the load-lanes stuff.
Since it isn't a regression, and since the fix is too
invasive to backport, I hope it's OK to close as fixed.