Bug 31667 - Integer extensions vectorization could be improved
Summary: Integer extensions vectorization could be improved
Status: NEW
Alias: None
Product: gcc
Classification: Unclassified
Component: target
Version: 4.3.0
Importance: P3 enhancement
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks:
 
Reported: 2007-04-23 15:27 UTC by H.J. Lu
Modified: 2021-08-21 21:44 UTC
CC: 3 users

See Also:
Host:
Target: i?86-*-* x86_64-*-*
Build:
Known to work:
Known to fail:
Last reconfirmed: 2010-03-13 01:35:56


Attachments

Description H.J. Lu 2007-04-23 15:27:25 UTC
SSE4.1 has pmovzx and pmovsx. For code like:

[hjl@gnu-2 vect]$ cat pmovzxbw.c
typedef unsigned char vec_t;
typedef unsigned short vecx_t;

extern __attribute__((aligned(16))) vec_t x [64];
extern __attribute__((aligned(16))) vecx_t y [64];

void
foo ()
{
  int i;

  for (i = 0; i < 64; i++)
    y [i]  = x [i];
}

ICC generates:

        pmovzxbw  x(%rip), %xmm0                                #13.14
        pmovzxbw  8+x(%rip), %xmm1                              #13.14
        pmovzxbw  16+x(%rip), %xmm2                             #13.14
        pmovzxbw  24+x(%rip), %xmm3                             #13.14
        pmovzxbw  32+x(%rip), %xmm4                             #13.14
        pmovzxbw  40+x(%rip), %xmm5                             #13.14
        pmovzxbw  48+x(%rip), %xmm6                             #13.14
        pmovzxbw  56+x(%rip), %xmm7                             #13.14
        movdqa    %xmm0, y(%rip)                                #13.5
        movdqa    %xmm1, 16+y(%rip)                             #13.5
        movdqa    %xmm2, 32+y(%rip)                             #13.5
        movdqa    %xmm3, 48+y(%rip)                             #13.5
        movdqa    %xmm4, 64+y(%rip)                             #13.5
        movdqa    %xmm5, 80+y(%rip)                             #13.5
        movdqa    %xmm6, 96+y(%rip)                             #13.5
        movdqa    %xmm7, 112+y(%rip)                            #13.5
        ret                                                     #14.1
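
For reference, a minimal sketch of that desired shape written with SSE4.1 intrinsics (illustrative only and not part of the original report; assumes <smmintrin.h> and -msse4.1, and the function name foo_intrin is made up). _mm_cvtepu8_epi16 maps to pmovzxbw, and the 8-byte _mm_loadl_epi64 can be folded into its memory operand, giving one pmovzxbw plus one aligned store per 8 elements:

#include <smmintrin.h>

extern __attribute__((aligned(16))) unsigned char x [64];
extern __attribute__((aligned(16))) unsigned short y [64];

void
foo_intrin (void)
{
  int i;

  /* 8 elements per iteration: load 8 bytes, zero-extend to 8 x u16, store 16 bytes.  */
  for (i = 0; i < 64; i += 8)
    {
      __m128i bytes = _mm_loadl_epi64 ((const __m128i *) (x + i)); /* 8-byte load */
      __m128i words = _mm_cvtepu8_epi16 (bytes);                   /* pmovzxbw */
      _mm_store_si128 ((__m128i *) (y + i), words);                /* aligned store */
    }
}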
Comment 1 Andrew Pinski 2010-03-13 01:35:56 UTC
GCC 4.5 is able to produce pmovzxbw via sse4_1_zero_extendv8qiv8hi2, but that pattern does not accept a memory operand for operand 1.
	movdqa	x(%rip), %xmm0
	pmovzxbw	%xmm0, %xmm1
	psrldq	$8, %xmm0
	pmovzxbw	%xmm0, %xmm0
	movdqa	%xmm1, y(%rip)
	movdqa	%xmm0, y+16(%rip)
...
is what GCC currently produces.
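
For comparison, the register-only form described in comment 1 can be sketched with intrinsics as follows (again illustrative; the function name foo_reg_only is not from the report). Because the pattern only takes a register operand, each 16-byte movdqa load is split with psrldq and fed to pmovzxbw twice:

#include <smmintrin.h>

extern __attribute__((aligned(16))) unsigned char x [64];
extern __attribute__((aligned(16))) unsigned short y [64];

void
foo_reg_only (void)
{
  int i;

  for (i = 0; i < 64; i += 16)
    {
      __m128i v  = _mm_load_si128 ((const __m128i *) (x + i)); /* movdqa */
      __m128i lo = _mm_cvtepu8_epi16 (v);                      /* pmovzxbw reg,reg */
      __m128i hi = _mm_cvtepu8_epi16 (_mm_srli_si128 (v, 8));  /* psrldq $8 + pmovzxbw */
      _mm_store_si128 ((__m128i *) (y + i), lo);
      _mm_store_si128 ((__m128i *) (y + i + 8), hi);
    }
}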
Comment 2 xiaoyuanbo 2012-02-22 12:46:37 UTC
know as byte one it is simulor
Comment 3 Allan Jensen 2016-11-28 12:31:16 UTC
GCC 5 and 6 produce code with pmovzx when compiling the example with -O3 -msse4.1.

I assume this can be closed.
Comment 4 Allan Jensen 2016-11-28 12:38:34 UTC
(In reply to Allan Jensen from comment #3)
> GCC 5 and 6 produce code with pmovzx when compiling the example with -O3
> -msse4.1.
> 
> I assume this can be closed.

Note that, as comment 1 says, it will not use a memory operand for pmovzxbw; on the other hand it does half as many memory reads.

movdqa 0x0(%rip),%xmm0        # 8 <foo+0x8>
pmovzxbw %xmm0,%xmm1
psrldq $0x8,%xmm0
pmovzxbw %xmm0,%xmm0
movaps %xmm1,0x0(%rip)        # 1e <foo+0x1e>
movaps %xmm0,0x0(%rip)        # 25 <foo+0x25>
Comment 5 Andrew Pinski 2021-08-15 23:28:18 UTC
We produce this now:

        movdqa  x(%rip), %xmm1
        pxor    %xmm0, %xmm0
        movdqa  %xmm1, %xmm2
        punpckhbw       %xmm0, %xmm1
        movaps  %xmm1, y+16(%rip)
        movdqa  x+16(%rip), %xmm1
        punpcklbw       %xmm0, %xmm2
        movaps  %xmm2, y(%rip)
        movdqa  %xmm1, %xmm2
        punpckhbw       %xmm0, %xmm1
        movaps  %xmm1, y+48(%rip)
        movdqa  x+32(%rip), %xmm1
        punpcklbw       %xmm0, %xmm2
        movaps  %xmm2, y+32(%rip)
        movdqa  %xmm1, %xmm2
        punpckhbw       %xmm0, %xmm1
        movaps  %xmm1, y+80(%rip)
        movdqa  x+48(%rip), %xmm1
        punpcklbw       %xmm0, %xmm2
        movaps  %xmm2, y+64(%rip)
        movdqa  %xmm1, %xmm2
        punpckhbw       %xmm0, %xmm1
        punpcklbw       %xmm0, %xmm2
        movaps  %xmm1, y+112(%rip)
        movaps  %xmm2, y+96(%rip)

And even ICC produces a similar sequence, just scheduled differently.
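
The sequence above is the plain SSE2 zero-extension idiom: unpack each 16-byte vector against a zero register. As an illustrative sketch (not from the report; assumes only <emmintrin.h>, and the function name foo_sse2 is made up), it corresponds to:

#include <emmintrin.h>

extern __attribute__((aligned(16))) unsigned char x [64];
extern __attribute__((aligned(16))) unsigned short y [64];

void
foo_sse2 (void)
{
  const __m128i zero = _mm_setzero_si128 ();   /* pxor */
  int i;

  for (i = 0; i < 64; i += 16)
    {
      __m128i v  = _mm_load_si128 ((const __m128i *) (x + i)); /* movdqa */
      __m128i lo = _mm_unpacklo_epi8 (v, zero);                /* punpcklbw */
      __m128i hi = _mm_unpackhi_epi8 (v, zero);                /* punpckhbw */
      _mm_store_si128 ((__m128i *) (y + i), lo);
      _mm_store_si128 ((__m128i *) (y + i + 8), hi);
    }
}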
Comment 6 Allan Jensen 2021-08-21 21:44:10 UTC
(In reply to Andrew Pinski from comment #5)
> We produce this now:
> 
> [assembly from comment #5 snipped]
> 
> And even ICC produces a similar sequence, just scheduled differently.

I hope that is because you forgot -msse4.1?