SSE4.1 has pmovzx and pmovsx. For code like: [hjl@gnu-2 vect]$ cat pmovzxbw.c typedef unsigned char vec_t; typedef unsigned short vecx_t; extern __attribute__((aligned(16))) vec_t x [64]; extern __attribute__((aligned(16))) vecx_t y [64]; void foo () { int i; for (i = 0; i < 64; i++) y [i] = x [i]; } Icc generates pmovzxbw x(%rip), %xmm0 #13.14 pmovzxbw 8+x(%rip), %xmm1 #13.14 pmovzxbw 16+x(%rip), %xmm2 #13.14 pmovzxbw 24+x(%rip), %xmm3 #13.14 pmovzxbw 32+x(%rip), %xmm4 #13.14 pmovzxbw 40+x(%rip), %xmm5 #13.14 pmovzxbw 48+x(%rip), %xmm6 #13.14 pmovzxbw 56+x(%rip), %xmm7 #13.14 movdqa %xmm0, y(%rip) #13.5 movdqa %xmm1, 16+y(%rip) #13.5 movdqa %xmm2, 32+y(%rip) #13.5 movdqa %xmm3, 48+y(%rip) #13.5 movdqa %xmm4, 64+y(%rip) #13.5 movdqa %xmm5, 80+y(%rip) #13.5 movdqa %xmm6, 96+y(%rip) #13.5 movdqa %xmm7, 112+y(%rip) #13.5 ret #14.1
GCC 4.5 is able to produce pmovzxbw via sse4_1_zero_extendv8qiv8hi2 but it does not accept a memory operand for operand 1. movdqa x(%rip), %xmm0 pmovzxbw %xmm0, %xmm1 psrldq $8, %xmm0 pmovzxbw %xmm0, %xmm0 movdqa %xmm1, y(%rip) movdqa %xmm0, y+16(%rip) ... Is what GCC currently produces.
The signed variant (pmovsx) is similar.
GCC 5 and 6 produce code with pmovzx when compiling the example with -O3 -msse4.1 I assume this can be closed.
(In reply to Allan Jensen from comment #3) > Gcc 5 and 6 produces code with pmovzx when compiling the example with -O3 > -msse4.1 > > I assume this can be closed. Note that, as comment 1 says, it still does not use a memory operand for pmovzxbw, though it now does half as many memory reads. movdqa 0x0(%rip),%xmm0 # 8 <foo+0x8> pmovzxbw %xmm0,%xmm1 psrldq $0x8,%xmm0 pmovzxbw %xmm0,%xmm0 movaps %xmm1,0x0(%rip) # 1e <foo+0x1e> movaps %xmm0,0x0(%rip) # 25 <foo+0x25>
We produce this now: movdqa x(%rip), %xmm1 pxor %xmm0, %xmm0 movdqa %xmm1, %xmm2 punpckhbw %xmm0, %xmm1 movaps %xmm1, y+16(%rip) movdqa x+16(%rip), %xmm1 punpcklbw %xmm0, %xmm2 movaps %xmm2, y(%rip) movdqa %xmm1, %xmm2 punpckhbw %xmm0, %xmm1 movaps %xmm1, y+48(%rip) movdqa x+32(%rip), %xmm1 punpcklbw %xmm0, %xmm2 movaps %xmm2, y+32(%rip) movdqa %xmm1, %xmm2 punpckhbw %xmm0, %xmm1 movaps %xmm1, y+80(%rip) movdqa x+48(%rip), %xmm1 punpcklbw %xmm0, %xmm2 movaps %xmm2, y+64(%rip) movdqa %xmm1, %xmm2 punpckhbw %xmm0, %xmm1 punpcklbw %xmm0, %xmm2 movaps %xmm1, y+112(%rip) movaps %xmm2, y+96(%rip) And even ICC produce a similar thing except scheduled differently.
(In reply to Andrew Pinski from comment #5) > We produce this now: > > movdqa x(%rip), %xmm1 > pxor %xmm0, %xmm0 > movdqa %xmm1, %xmm2 > punpckhbw %xmm0, %xmm1 > movaps %xmm1, y+16(%rip) > movdqa x+16(%rip), %xmm1 > punpcklbw %xmm0, %xmm2 > movaps %xmm2, y(%rip) > movdqa %xmm1, %xmm2 > punpckhbw %xmm0, %xmm1 > movaps %xmm1, y+48(%rip) > movdqa x+32(%rip), %xmm1 > punpcklbw %xmm0, %xmm2 > movaps %xmm2, y+32(%rip) > movdqa %xmm1, %xmm2 > punpckhbw %xmm0, %xmm1 > movaps %xmm1, y+80(%rip) > movdqa x+48(%rip), %xmm1 > punpcklbw %xmm0, %xmm2 > movaps %xmm2, y+64(%rip) > movdqa %xmm1, %xmm2 > punpckhbw %xmm0, %xmm1 > punpcklbw %xmm0, %xmm2 > movaps %xmm1, y+112(%rip) > movaps %xmm2, y+96(%rip) > > And even ICC produce a similar thing except scheduled differently. I hope that is because you forgot -msse4.1?