The SUSE SPECfp2000 continuous regression tester [1] shows a ~20% runtime regression on Haswell around the end of August 2018.

[1] https://gcc.opensuse.org/gcc-old/SPEC/CFP/sb-czerny-head-64/
From the tester's data the last good revision is r263752 and the first bad one
is r263787. Bisecting points to Richard's vectorizer series r26377[1-4], more
specifically r263772. Perf shows nothing conclusive, just all functions slower
by the same percentage. The SPEC 2000 build scripts are oddly redirecting and
mangling output, so the -fopt-info output is illegible. Huh, or rather it's
even in the dumps when dumping with -optimized:

polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: polygon.c:140:4: note: loop vectorized using 32 byte vectors

Anyhow, differences are for example:

 fog.c:157:10: note: loop vectorized using 32 byte vectors
+fog.c:157:10: note: fog.c:157:10: note: loop versioned for vectorization because of possible aliasing

The above is

void gl_fog_color_vertices( GLcontext *ctx, GLuint n, GLfloat v[][4],
                            GLubyte color[][4] )
...
      case GL_EXP:
         d = -ctx->Fog.Density;
         for (i=0;i<n;i++) {
            GLfloat f = exp( d * ABSF(v[i][2]) );
            f = CLAMP( f, 0.0F, 1.0F );
            color[i][0] = f * color[i][0] + (1.0F-f) * fogr;
            color[i][1] = f * color[i][1] + (1.0F-f) * fogg;
            color[i][2] = f * color[i][2] + (1.0F-f) * fogb;
         }

and the testcase

void foo (unsigned int n, float v[][4], unsigned char color[][4],
          float fogr, float fogg, float fogb)
{
  for (unsigned int i = 0; i < n; ++i)
    {
      float f = v[i][2];
      color[i][0] = f * color[i][0] + (1.0F-f) * fogr;
      color[i][1] = f * color[i][1] + (1.0F-f) * fogg;
      color[i][2] = f * color[i][2] + (1.0F-f) * fogb;
    }
}

At r263771 this vectorizes to

foo:
.LFB0:
    .cfi_startproc
    testl %edi, %edi
    je .L8
    leal -1(%rdi), %eax
    vmovss .LC0(%rip), %xmm7
    addq $8, %rsi
    leaq 4(%rdx,%rax,4), %rcx
    .p2align 4,,10
    .p2align 3
.L3:
    vmovss (%rsi), %xmm6
    movzbl (%rdx), %eax
    vxorps %xmm4, %xmm4, %xmm4
    addq $4, %rdx
    addq $16, %rsi
    vcvtsi2ss %eax, %xmm4, %xmm4
    vsubss %xmm6, %xmm7, %xmm5
    vmulss %xmm0, %xmm5, %xmm3
    vfmadd132ss %xmm6, %xmm3, %xmm4
    vmulss %xmm1, %xmm5, %xmm3
    vmulss %xmm2, %xmm5, %xmm5
    vcvttss2si %xmm4, %eax
    vxorps %xmm4, %xmm4, %xmm4
    movb %al, -4(%rdx)
    movzbl -3(%rdx), %eax
    vcvtsi2ss %eax, %xmm4, %xmm4
    vfmadd132ss %xmm6, %xmm3, %xmm4
    vxorps %xmm3, %xmm3, %xmm3
    vcvttss2si %xmm4, %eax
    movb %al, -3(%rdx)
    movzbl -2(%rdx), %eax
    vcvtsi2ss %eax, %xmm3, %xmm3
    vfmadd132ss %xmm6, %xmm5, %xmm3
    vcvttss2si %xmm3, %eax
    movb %al, -2(%rdx)
    cmpq %rdx, %rcx
    jne .L3
.L8:
    ret

while at the next revision it is

foo:
.LFB0:
    .cfi_startproc
    pushq %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq %rsp, %rbp
    .cfi_def_cfa_register 6
    andq $-32, %rsp
    subq $520, %rsp
    vmovss %xmm0, -92(%rsp)
    vmovss %xmm1, -96(%rsp)
    vmovss %xmm2, -100(%rsp)
    testl %edi, %edi
    je .L20
    movl %edi, %eax
    leaq 8(%rsi), %rcx
    leal -1(%rdi), %r8d
    leaq -1(%rdx,%rax,4), %r9
    cmpq %r9, %rcx
    setnb %r9b
    salq $4, %rax
    leaq -4(%rsi,%rax), %rax
    cmpq %rax, %rdx
    setnb %al
    orb %al, %r9b
    je .L3
    cmpl $31, %r8d
    jbe .L3
    vmovaps %xmm0, %xmm4
    vmovaps %xmm1, %xmm5
    vunpcklps %xmm2, %xmm1, %xmm0
    movl %r8d, %r9d
    vunpcklps %xmm4, %xmm2, %xmm1
    vunpcklps %xmm5, %xmm4, %xmm2
    shrl $5, %r9d
    movq %rdx, %rax
    vmovlhps %xmm2, %xmm0, %xmm3
    vmovlhps %xmm0, %xmm1, %xmm0
    vmovlhps %xmm1, %xmm2, %xmm1
    salq $7, %r9
    vinsertf128 $0x1, %xmm0, %ymm3, %ymm5
    vmovaps .LC11(%rip), %ymm15
    addq %rdx, %r9
    vmovaps %ymm5, 488(%rsp)
    vinsertf128 $0x1, %xmm1, %ymm0, %ymm5
    vmovaps %ymm5, -88(%rsp)
    vinsertf128 $0x1, %xmm3, %ymm1, %ymm5
    vmovaps %ymm5, 456(%rsp)
    .p2align 4,,10
    .p2align 3
.L4:
    vmovups 32(%rcx), %ymm4
    vunpcklps 64(%rcx), %ymm4, %ymm14
    subq $-128, %rax
    addq $512, %rcx
    vmovdqa .LC1(%rip), %ymm4
    vmovups -352(%rcx), %ymm3
    vunpcklps -320(%rcx), %ymm3, %ymm11
    vmovups -320(%rcx), %ymm3
    vpermd %ymm14, %ymm4, %ymm7
    vmovups -448(%rcx), %ymm4
    vunpcklps -416(%rcx), %ymm4, %ymm13
    vmovdqa .LC2(%rip), %ymm4
    vunpcklps -288(%rcx), %ymm3, %ymm10
    vmovdqa .LC1(%rip), %ymm6
    vmovups -256(%rcx), %ymm3
    vpermd %ymm13, %ymm4, %ymm13
    vmovups -384(%rcx), %ymm4
    vunpcklps -352(%rcx), %ymm4, %ymm12
    vmovdqa .LC0(%rip), %ymm4
    vunpcklps -224(%rcx), %ymm3, %ymm9
    vmovups -224(%rcx), %ymm3
    vunpcklps -192(%rcx), %ymm3, %ymm8
    vpermd %ymm12, %ymm4, %ymm12
    vmovdqa .LC1(%rip), %ymm4
    vmovups -512(%rcx), %ymm5
    vmovups -192(%rcx), %ymm3
    vunpcklps -480(%rcx), %ymm5, %ymm0
    vpermd %ymm11, %ymm4, %ymm11
    vmovdqa .LC2(%rip), %ymm4
    vunpcklps -160(%rcx), %ymm3, %ymm3
    vmovdqa .LC0(%rip), %ymm5
    vpermd %ymm10, %ymm4, %ymm10
    vmovdqa .LC0(%rip), %ymm4
    vpermd %ymm0, %ymm5, %ymm5
    vmovdqa %ymm5, 424(%rsp)
    vpermd %ymm9, %ymm4, %ymm9
    vpermd %ymm8, %ymm6, %ymm4
    vmovdqa .LC2(%rip), %ymm6
    vmovdqa %ymm4, 200(%rsp)
    vpermd %ymm3, %ymm6, %ymm3
    vmovups -128(%rcx), %ymm6
    vunpcklps -96(%rcx), %ymm6, %ymm0
    vmovdqa %ymm3, 168(%rsp)
    vmovdqa .LC0(%rip), %ymm2
    vmovdqu -96(%rax), %ymm5
    vmovdqa .LC2(%rip), %ymm1
    vmovdqu -128(%rax), %ymm14
    vmovdqa %ymm7, 392(%rsp)
    vpermd %ymm0, %ymm2, %ymm6
    vmovups -96(%rcx), %ymm2
    vunpcklps -64(%rcx), %ymm2, %ymm0
    vpshufb .LC4(%rip), %ymm5, %ymm3
    vmovdqa .LC1(%rip), %ymm2
    vmovaps -88(%rsp), %ymm7
    vmovdqa %ymm6, 136(%rsp)
    vmovdqa %ymm13, 360(%rsp)
    vmovdqu -64(%rax), %ymm6
    vsubps 360(%rsp), %ymm15, %ymm13
    vmovdqa %ymm12, 328(%rsp)
    vpermd %ymm0, %ymm2, %ymm2
    vmovups -64(%rcx), %ymm0
    vunpcklps -32(%rcx), %ymm0, %ymm0
    vpshufb .LC7(%rip), %ymm6, %ymm4
    vmovdqa %ymm2, 104(%rsp)
    vpermd %ymm0, %ymm1, %ymm0
    vmovdqu -128(%rax), %ymm1
    vpermq $78, %ymm4, %ymm4
    vmovdqa %ymm0, 72(%rsp)
    vpshufb .LC5(%rip), %ymm14, %ymm0
    vmovdqu -32(%rax), %ymm14
    vpshufb .LC3(%rip), %ymm1, %ymm2
    vpermq $78, %ymm3, %ymm1
    vpermq $78, %ymm2, %ymm2
    vpor %ymm2, %ymm0, %ymm3
    vpshufb .LC6(%rip), %ymm5, %ymm2
    vpshufb .LC8(%rip), %ymm5, %ymm0
    vpor %ymm1, %ymm3, %ymm3
    vpermq $78, %ymm2, %ymm2
    vpshufb .LC3(%rip), %ymm6, %ymm1
    vpor %ymm4, %ymm1, %ymm1
    vpor %ymm2, %ymm0, %ymm2
    vpshufb .LC8(%rip), %ymm14, %ymm4
    vpor %ymm1, %ymm2, %ymm2
    vpermq $78, %ymm4, %ymm4
    vpshufb .LC9(%rip), %ymm6, %ymm1
    vpermq $78, %ymm1, %ymm1
    vpmovzxbw %xmm3, %ymm6
    vextracti128 $0x1, %ymm3, %xmm3
    vpshufb .LC10(%rip), %ymm14, %ymm0
    vpmovzxbw %xmm3, %ymm3
    vpmovzxbw %xmm2, %ymm5
    vpor %ymm4, %ymm0, %ymm0
    vextracti128 $0x1, %ymm2, %xmm2
    vsubps 392(%rsp), %ymm15, %ymm14
    vpor %ymm0, %ymm1, %ymm1
    vpmovzxbw %xmm2, %ymm2
    vmovaps 488(%rsp), %ymm0
    vsubps 328(%rsp), %ymm15, %ymm12
    vmovdqa %ymm10, 264(%rsp)
    vpmovzxbw %xmm1, %ymm4
    vsubps 264(%rsp), %ymm15, %ymm10
    vmovaps 456(%rsp), %ymm8
    vmulps %ymm0, %ymm13, %ymm13
    vextracti128 $0x1, %ymm1, %xmm1
    vmovdqa %ymm9, 232(%rsp)
    vsubps 232(%rsp), %ymm15, %ymm9
    vmovdqa %ymm11, 296(%rsp)
    vmulps %ymm8, %ymm12, %ymm12
    vpmovzxbw %xmm1, %ymm1
    vsubps 296(%rsp), %ymm15, %ymm11
    vmulps %ymm0, %ymm10, %ymm10
    vsubps 168(%rsp), %ymm15, %ymm0
    vmulps 488(%rsp), %ymm0, %ymm0
    vmulps %ymm8, %ymm9, %ymm9
    vsubps 200(%rsp), %ymm15, %ymm8
    vmulps %ymm7, %ymm14, %ymm14
    vmulps %ymm7, %ymm11, %ymm11
    vmulps %ymm7, %ymm8, %ymm8
    vmovaps %ymm0, 40(%rsp)
    vsubps 136(%rsp), %ymm15, %ymm0
    vmulps 456(%rsp), %ymm0, %ymm0
    vmovaps %ymm0, 8(%rsp)
    vsubps 104(%rsp), %ymm15, %ymm0
    vmulps %ymm7, %ymm0, %ymm7
    vsubps 72(%rsp), %ymm15, %ymm0
    vmulps 488(%rsp), %ymm0, %ymm0
    vmovaps %ymm7, -24(%rsp)
    vpmovzxwd %xmm6, %ymm7
    vextracti128 $0x1, %ymm6, %xmm6
    vmovaps %ymm0, -56(%rsp)
    vcvtdq2ps %ymm7, %ymm7
    vpmovzxwd %xmm6, %ymm6
    vsubps 424(%rsp), %ymm15, %ymm0
    vmulps 456(%rsp), %ymm0, %ymm0
    vcvtdq2ps %ymm6, %ymm6
    vfmadd231ps 392(%rsp), %ymm6, %ymm14
    vpmovzxwd %xmm5, %ymm6
    vfmadd231ps 424(%rsp), %ymm7, %ymm0
    vpmovzxwd %xmm3, %ymm7
    vextracti128 $0x1, %ymm3, %xmm3
    vcvtdq2ps %ymm6, %ymm6
    vpmovzxwd %xmm3, %ymm3
    vextracti128 $0x1, %ymm5, %xmm5
    vcvtdq2ps %ymm7, %ymm7
    vcvtdq2ps %ymm3, %ymm3
    vpmovzxwd %xmm5, %ymm5
    vfmadd231ps 328(%rsp), %ymm3, %ymm12
    vpmovzxwd %xmm2, %ymm3
    vextracti128 $0x1, %ymm2, %xmm2
    vcvtdq2ps %ymm5, %ymm5
    vcvtdq2ps %ymm3, %ymm3
    vfmadd231ps 296(%rsp), %ymm6, %ymm11
    vpmovzxwd %xmm2, %ymm2
    vcvttps2dq %ymm14, %ymm14
    vfmadd231ps 264(%rsp), %ymm5, %ymm10
    vmovaps 168(%rsp), %ymm5
    vfmadd231ps 232(%rsp), %ymm3, %ymm9
    vcvtdq2ps %ymm2, %ymm2
    vpmovzxwd %xmm4, %ymm3
    vcvttps2dq %ymm0, %ymm0
    vfmadd231ps 200(%rsp), %ymm2, %ymm8
    vcvtdq2ps %ymm3, %ymm3
    vpmovzxwd %xmm1, %ymm2
    vextracti128 $0x1, %ymm4, %xmm4
    vfmadd213ps 40(%rsp), %ymm3, %ymm5
    vextracti128 $0x1, %ymm1, %xmm1
    vpmovzxwd %xmm4, %ymm4
    vcvtdq2ps %ymm2, %ymm2
    vpmovzxwd %xmm1, %ymm1
    vcvtdq2ps %ymm4, %ymm4
    vcvttps2dq %ymm11, %ymm11
    vcvttps2dq %ymm12, %ymm12
    vfmadd231ps 360(%rsp), %ymm7, %ymm13
    vmovaps 8(%rsp), %ymm3
    vcvtdq2ps %ymm1, %ymm1
    vcvttps2dq %ymm10, %ymm10
    vpand .LC12(%rip), %ymm0, %ymm0
    vmovaps -24(%rsp), %ymm7
    vcvttps2dq %ymm9, %ymm9
    vpand .LC12(%rip), %ymm14, %ymm14
    vfmadd132ps 104(%rsp), %ymm7, %ymm2
    vcvttps2dq %ymm8, %ymm8
    vpand .LC12(%rip), %ymm11, %ymm11
    vpand .LC12(%rip), %ymm10, %ymm10
    vfmadd132ps 136(%rsp), %ymm3, %ymm4
    vpackusdw %ymm14, %ymm0, %ymm14
    vmovaps -56(%rsp), %ymm3
    vfmadd132ps 72(%rsp), %ymm3, %ymm1
    vpackusdw %ymm10, %ymm11, %ymm10
    vpand .LC12(%rip), %ymm9, %ymm0
    vcvttps2dq %ymm13, %ymm13
    vpand .LC12(%rip), %ymm8, %ymm8
    vpermq $216, %ymm10, %ymm6
    vpermq $216, %ymm14, %ymm14
    vpand .LC12(%rip), %ymm13, %ymm3
    vpand .LC12(%rip), %ymm12, %ymm12
    vcvttps2dq %ymm2, %ymm2
    vpackusdw %ymm8, %ymm0, %ymm0
    vpand .LC13(%rip), %ymm6, %ymm6
    vpand .LC12(%rip), %ymm2, %ymm2
    vpermq $216, %ymm0, %ymm0
    vpand .LC13(%rip), %ymm0, %ymm0
    vpackusdw %ymm12, %ymm3, %ymm3
    vcvttps2dq %ymm4, %ymm4
    vcvttps2dq %ymm1, %ymm1
    vpand .LC12(%rip), %ymm4, %ymm4
    vpand .LC12(%rip), %ymm1, %ymm1
    vpermq $216, %ymm3, %ymm3
    vpackuswb %ymm0, %ymm6, %ymm6
    vcvttps2dq %ymm5, %ymm0
    vpand .LC12(%rip), %ymm0, %ymm0
    vpand .LC13(%rip), %ymm14, %ymm7
    vpand .LC13(%rip), %ymm3, %ymm3
    vpackusdw %ymm1, %ymm2, %ymm1
    vpermq $216, %ymm6, %ymm6
    vpackusdw %ymm4, %ymm0, %ymm0
    vpermq $216, %ymm1, %ymm1
    vpand .LC13(%rip), %ymm1, %ymm1
    vpackuswb %ymm3, %ymm7, %ymm7
    vpermq $216, %ymm0, %ymm0
    vpand .LC13(%rip), %ymm0, %ymm0
    vpermq $216, %ymm7, %ymm7
    vpextrb $0, %xmm7, -128(%rax)
    vpextrb $1, %xmm7, -127(%rax)
    vpextrb $2, %xmm7, -126(%rax)
    vpextrb $3, %xmm7, -124(%rax)
    vpextrb $4, %xmm7, -123(%rax)
    vpextrb $5, %xmm7, -122(%rax)
    vpackuswb %ymm1, %ymm0, %ymm0
    vpextrb $6, %xmm7, -120(%rax)
    vpextrb $7, %xmm7, -119(%rax)
    vpextrb $8, %xmm7, -118(%rax)
    vpermq $216, %ymm0, %ymm0
    vpextrb $9, %xmm7, -116(%rax)
    vpextrb $10, %xmm7, -115(%rax)
    vpextrb $11, %xmm7, -114(%rax)
    vpextrb $12, %xmm7, -112(%rax)
    vpextrb $13, %xmm7, -111(%rax)
    vpextrb $14, %xmm7, -110(%rax)
    vpextrb $15, %xmm7, -108(%rax)
    vextracti128 $0x1, %ymm7, %xmm7
    vpextrb $0, %xmm6, -86(%rax)
    vpextrb $1, %xmm6, -84(%rax)
    vpextrb $2, %xmm6, -83(%rax)
    vpextrb $3, %xmm6, -82(%rax)
    vpextrb $4, %xmm6, -80(%rax)
    vpextrb $5, %xmm6, -79(%rax)
    vpextrb $6, %xmm6, -78(%rax)
    vpextrb $7, %xmm6, -76(%rax)
    vpextrb $8, %xmm6, -75(%rax)
    vpextrb $0, %xmm7, -107(%rax)
    vpextrb $1, %xmm7, -106(%rax)
    vpextrb $2, %xmm7, -104(%rax)
    vpextrb $3, %xmm7, -103(%rax)
    vpextrb $4, %xmm7, -102(%rax)
    vpextrb $5, %xmm7, -100(%rax)
    vpextrb $6, %xmm7, -99(%rax)
    vpextrb $7, %xmm7, -98(%rax)
    vpextrb $8, %xmm7, -96(%rax)
    vpextrb $9, %xmm7, -95(%rax)
    vpextrb $10, %xmm7, -94(%rax)
    vpextrb $11, %xmm7, -92(%rax)
    vpextrb $12, %xmm7, -91(%rax)
    vpextrb $13, %xmm7, -90(%rax)
    vpextrb $14, %xmm7, -88(%rax)
    vpextrb $15, %xmm7, -87(%rax)
    vpextrb $9, %xmm6, -74(%rax)
    vpextrb $10, %xmm6, -72(%rax)
    vpextrb $11, %xmm6, -71(%rax)
    vpextrb $12, %xmm6, -70(%rax)
    vpextrb $13, %xmm6, -68(%rax)
    vpextrb $14, %xmm6, -67(%rax)
    vpextrb $15, %xmm6, -66(%rax)
    vextracti128 $0x1, %ymm6, %xmm6
    vpextrb $0, %xmm0, -43(%rax)
    vpextrb $1, %xmm0, -42(%rax)
    vpextrb $2, %xmm0, -40(%rax)
    vpextrb $3, %xmm0, -39(%rax)
    vpextrb $4, %xmm0, -38(%rax)
    vpextrb $5, %xmm0, -36(%rax)
    vpextrb $6, %xmm0, -35(%rax)
    vpextrb $7, %xmm0, -34(%rax)
    vpextrb $8, %xmm0, -32(%rax)
    vpextrb $9, %xmm0, -31(%rax)
    vpextrb $0, %xmm6, -64(%rax)
    vpextrb $1, %xmm6, -63(%rax)
    vpextrb $2, %xmm6, -62(%rax)
    vpextrb $3, %xmm6, -60(%rax)
    vpextrb $4, %xmm6, -59(%rax)
    vpextrb $5, %xmm6, -58(%rax)
    vpextrb $6, %xmm6, -56(%rax)
    vpextrb $7, %xmm6, -55(%rax)
    vpextrb $8, %xmm6, -54(%rax)
    vpextrb $9, %xmm6, -52(%rax)
    vpextrb $10, %xmm6, -51(%rax)
    vpextrb $11, %xmm6, -50(%rax)
    vpextrb $12, %xmm6, -48(%rax)
    vpextrb $13, %xmm6, -47(%rax)
    vpextrb $14, %xmm6, -46(%rax)
    vpextrb $15, %xmm6, -44(%rax)
    vpextrb $10, %xmm0, -30(%rax)
    vpextrb $11, %xmm0, -28(%rax)
    vpextrb $12, %xmm0, -27(%rax)
    vpextrb $13, %xmm0, -26(%rax)
    vpextrb $14, %xmm0, -24(%rax)
    vpextrb $15, %xmm0, -23(%rax)
    vextracti128 $0x1, %ymm0, %xmm0
    vpextrb $0, %xmm0, -22(%rax)
    vpextrb $1, %xmm0, -20(%rax)
    vpextrb $2, %xmm0, -19(%rax)
    vpextrb $3, %xmm0, -18(%rax)
    vpextrb $4, %xmm0, -16(%rax)
    vpextrb $5, %xmm0, -15(%rax)
    vpextrb $6, %xmm0, -14(%rax)
    vpextrb $7, %xmm0, -12(%rax)
    vpextrb $8, %xmm0, -11(%rax)
    vpextrb $9, %xmm0, -10(%rax)
    vpextrb $10, %xmm0, -8(%rax)
    vpextrb $11, %xmm0, -7(%rax)
    vpextrb $12, %xmm0, -6(%rax)
    vpextrb $13, %xmm0, -4(%rax)
    vpextrb $14, %xmm0, -3(%rax)
    vpextrb $15, %xmm0, -2(%rax)
    cmpq %r9, %rax
    jne .L4
    vmovss -92(%rsp), %xmm5
    vmovss -96(%rsp), %xmm6
    andl $-32, %r8d
    vmovss .LC14(%rip), %xmm3
    vmovss -100(%rsp), %xmm7
    .p2align 4,,10
    .p2align 3
.L5:
    movl %r8d, %eax
    vxorps %xmm2, %xmm2, %xmm2
    incl %r8d
    movq %rax, %rcx
    salq $4, %rcx
    vmovss 8(%rsi,%rcx), %xmm0
    leaq (%rdx,%rax,4), %rcx
    movzbl (%rcx), %eax
    vsubss %xmm0, %xmm3, %xmm1
    vcvtsi2ss %eax, %xmm2, %xmm2
    vmulss %xmm1, %xmm5, %xmm4
    vfmadd132ss %xmm0, %xmm4, %xmm2
    vmulss %xmm1, %xmm6, %xmm4
    vmulss %xmm1, %xmm7, %xmm1
    vcvttss2si %xmm2, %eax
    vxorps %xmm2, %xmm2, %xmm2
    movb %al, (%rcx)
    movzbl 1(%rcx), %eax
    vcvtsi2ss %eax, %xmm2, %xmm2
    vfmadd132ss %xmm0, %xmm4, %xmm2
    vcvttss2si %xmm2, %eax
    vxorps %xmm2, %xmm2, %xmm2
    movb %al, 1(%rcx)
    movzbl 2(%rcx), %eax
    vcvtsi2ss %eax, %xmm2, %xmm2
    vfmadd132ss %xmm2, %xmm1, %xmm0
    vcvttss2si %xmm0, %eax
    movb %al, 2(%rcx)
    cmpl %r8d, %edi
    ja .L5
    vzeroupper
.L20:
    leave
    .cfi_remember_state
    .cfi_def_cfa 7, 8
    ret

Current trunk looks the same.
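For reference, the difference on the reduced testcase can be reproduced with something along these lines (these are the usual flags, not the SPEC harness ones, which I don't have at hand):

gcc -O3 -march=haswell -S -fopt-info-vec testcase.c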
Errr, before the change we _don't_ vectorize.
Reducing the VF here should be the goal. For the particular case "filling"
the holes with neutral data and blending in the original values at store time
will likely be optimal. So do

tem = vector load
zero all [4] elements
compute
blend in 'tem' into the [4] elements
vector store

eliding all the shuffling/striding. Should end up at a VF of 4 (SSE) or 8
(AVX). Doesn't fit very well into the current vectorizer architecture, so
currently we can only address this from the costing side. arm can probably
leverage load/store-lanes here. With char elements and an SLP size of 3 it's
probably the worst case we can think of.
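To illustrate the scheme, here is a rough sketch of the idea for one group of four RGBA pixels (this is only a sketch, not vectorizer output; the helper name, the per-pixel factor array f[] and the SSE4.1 target are assumptions):

#include <smmintrin.h>
#include <string.h>

static void
fog_blend_group4 (unsigned char (*color)[4], const float *f,
                  float fogr, float fogg, float fogb)
{
  /* tem = vector load of four whole RGBA pixels (16 bytes).  */
  __m128i tem = _mm_loadu_si128 ((const __m128i *) color);
  /* Neutral data (0.0) fills the alpha "hole" of the fog constant.  */
  __m128 fog = _mm_setr_ps (fogr, fogg, fogb, 0.0f);
  __m128 one = _mm_set1_ps (1.0f);
  __m128i res[4];

  for (int p = 0; p < 4; p++)
    {
      unsigned int bits;
      memcpy (&bits, &color[p][0], 4);
      /* Widen pixel p's four bytes to float and compute all four lanes;
         the alpha lane result is thrown away below.  */
      __m128 px = _mm_cvtepi32_ps (_mm_cvtepu8_epi32 (_mm_cvtsi32_si128 ((int) bits)));
      __m128 fp = _mm_set1_ps (f[p]);
      res[p] = _mm_cvttps_epi32 (_mm_add_ps (_mm_mul_ps (fp, px),
                                             _mm_mul_ps (_mm_sub_ps (one, fp), fog)));
    }

  /* Narrow the four int vectors back to 16 bytes (packus also clamps).  */
  __m128i out = _mm_packus_epi16 (_mm_packus_epi32 (res[0], res[1]),
                                  _mm_packus_epi32 (res[2], res[3]));

  /* Blend the original alpha bytes from 'tem' back in and do one full store.  */
  __m128i amask = _mm_set1_epi32 ((int) 0xff000000);
  out = _mm_blendv_epi8 (out, tem, amask);
  _mm_storeu_si128 ((__m128i *) color, out);
}

The point is the single 16-byte load/store pair per group replacing the twelve strided byte accesses, i.e. the VF-of-4 (SSE) shape mentioned above.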
The endless series of vpextrb looks terrible, can't that be handled by a possibly masked permutation?
(In reply to Richard Biener from comment #3)
> Reducing the VF here should be the goal. For the particular case "filling"
> the holes with neutral data and blending in the original values at store time
> will likely be optimal. So do
>
> tem = vector load
> zero all [4] elements
> compute
> blend in 'tem' into the [4] elements
> vector store

MASKMOVDQU [1] should be an excellent fit here.

[1] https://www.felixcloutier.com/x86/maskmovdqu
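For completeness, the intrinsic form of that suggestion, assuming the 16 packed result bytes for four pixels are already in a vector (as in the sketch after comment #3; the helper name is made up):

#include <emmintrin.h>

static void
store_rgb_only (__m128i out, unsigned char (*color)[4])
{
  /* Byte mask: MSB set means "store this byte"; the alpha bytes are never
     written, so nothing has to be reloaded or blended.  */
  const __m128i rgb_mask = _mm_set1_epi32 (0x00ffffff);
  _mm_maskmoveu_si128 (out, rgb_mask, (char *) color);
}

Note that MASKMOVDQU uses a non-temporal hint, which is one reason it may well end up slower in practice than the load/blend/store variant.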
On Wed, 11 Sep 2019, ubizjak at gmail dot com wrote:

> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91735
>
> --- Comment #5 from Uroš Bizjak <ubizjak at gmail dot com> ---
> (In reply to Richard Biener from comment #3)
> > Reducing the VF here should be the goal. For the particular case "filling"
> > the holes with neutral data and blending in the original values at store time
> > will likely be optimal. So do
> >
> > tem = vector load
> > zero all [4] elements
> > compute
> > blend in 'tem' into the [4] elements
> > vector store
>
> MASKMOVDQU [1] should be an excellent fit here.

Yes, but it's probably slower. And it avoids store data races, of course,
plus avoids epilogue peeling (eventually).
On Wed, 11 Sep 2019, jakub at gcc dot gnu.org wrote:

> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91735
>
> Jakub Jelinek <jakub at gcc dot gnu.org> changed:
>
>            What    |Removed                     |Added
> ----------------------------------------------------------------------------
>                  CC|                            |jakub at gcc dot gnu.org
>
> --- Comment #4 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
> The endless series of vpextrb looks terrible, can't that be handled by a
> possibly masked permutation?

Sure, just nobody implemented support for that in the strided store code
(likewise for strided loads). I'm also not sure it is really faster in the
end. Maybe VPMULTISHIFTQB can also help.
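As an illustration of the permute-plus-masked-store direction (not what the strided store code emits today; it assumes AVX-512BW/VBMI and that the 48 computed R/G/B bytes for 16 pixels already sit contiguously in the low bytes of a zmm register; the helper name and index vector are made up):

#include <immintrin.h>

static void
store_rgb_group16 (__m512i packed, unsigned char (*color)[4])
{
  /* idx[j] picks the source byte for memory byte j; every fourth entry is a
     don't-care because the store mask skips the alpha positions.  */
  const __m512i idx = _mm512_set_epi8 (
      0, 47, 46, 45,  0, 44, 43, 42,  0, 41, 40, 39,  0, 38, 37, 36,
      0, 35, 34, 33,  0, 32, 31, 30,  0, 29, 28, 27,  0, 26, 25, 24,
      0, 23, 22, 21,  0, 20, 19, 18,  0, 17, 16, 15,  0, 14, 13, 12,
      0, 11, 10,  9,  0,  8,  7,  6,  0,  5,  4,  3,  0,  2,  1,  0);
  __m512i scattered = _mm512_permutexvar_epi8 (idx, packed);   /* AVX512VBMI */
  /* 0x7777...: write bytes 0..2 of each pixel, leave byte 3 alone.  */
  _mm512_mask_storeu_epi8 (color, 0x7777777777777777ULL, scattered);
}

That is one vpermb plus one masked 64-byte store per 16 pixels instead of 48 vpextrb stores.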
The most trivial improvement is likely to recognize the vector parts we can store via HImode. There's already support for that but only if we can uniformly use HImode and not a mix of sizes.
(In reply to Richard Biener from comment #8)
> The most trivial improvement is likely to recognize the vector parts we can
> store via HImode. There's already support for that but only if we can
> uniformly use HImode and not a mix of sizes.

While for loads we need N "same" pieces to be able to build the CONSTRUCTOR,
for stores we can do arbitrary extracts, so the strided store code could be
refactored to decide this in the main loop walking over the actual elements
to store rather than computing it upfront (I sort-of copied the handling from
the strided load code, retaining this restriction). That might get rid of 1/3
of the pextracts.
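In scalar terms the saving per 3-byte group looks like this (a sketch of what the refactored strided store code could emit; the helper and its arguments are made up, little-endian layout assumed):

#include <string.h>

static void
store_rgb_pixel (unsigned char *dst, unsigned char r, unsigned char g,
                 unsigned char b)
{
  /* R and G are adjacent in memory, so one HImode store (a vpextrw from the
     packed vector) covers both; only B still needs a QImode extract+store.  */
  unsigned short rg = (unsigned short) (r | (g << 8));
  memcpy (dst, &rg, 2);
  dst[2] = b;
}

Two stores instead of three per group, i.e. the 1/3 of the pextracts mentioned above.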
Can't really decipher what clang does here. It seems to handle even/odd lanes separately, doing 24 vpextrb stores per loop iteration. Possibly simply an interleaving scheme...
Created attachment 46880 [details]
prototype

This improves code-gen to use pextrw where possible, but that doesn't make any
measurable difference in runtime. Maybe the example loop isn't representative,
or the improvement isn't big enough.
GCC 9.3.0 has been released, adjusting target milestone.
GCC 9.4 is being released, retargeting bugs to GCC 9.5.
GCC 9 branch is being closed.
GCC 10.4 is being released, retargeting bugs to GCC 10.5.
GCC 10 branch is being closed.