[Bug middle-end/108376] New: TSVC s1279 runs 40% faster with aocc than gcc at zen4
hubicka at gcc dot gnu.org
gcc-bugzilla@gcc.gnu.org
Wed Jan 11 18:56:30 GMT 2023
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108376
Bug ID: 108376
Summary: TSVC s1279 runs 40% faster with aocc than gcc at zen4
Product: gcc
Version: 13.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: hubicka at gcc dot gnu.org
Target Milestone: ---
jh@alberti:~/tsvc/bin> more s1279.c
#include <math.h>
#include <malloc.h>
/* TSVC benchmark kernel s1279, reduced to a standalone reproducer.
 * Element type and problem sizes follow the TSVC harness conventions. */
typedef float real_t;
#define iterations 1000000
#define LEN_1D 32000
#define LEN_2D 256
/* Static-storage arrays are zero-initialized by the C runtime.
 * Only a, b, c, d, e are read/written by the timed loop below;
 * aa, bb, cc and qq appear only in the commented-out dummy() call. */
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
real_t aa[LEN_2D][LEN_2D];
real_t bb[LEN_2D][LEN_2D];
real_t cc[LEN_2D][LEN_2D];
real_t qq;
int
main(void)
{
// reductions
// if to max reduction
/* NOTE(review): x is never assigned, yet it is returned at the end of
 * main -- reading an uninitialized automatic variable is undefined
 * behavior.  Kept as-is because this is the reproducer being reported. */
real_t x;
/* NOTE(review): ip is an int array but the allocation is sized with
 * sizeof(real_t); this only works because sizeof(int)==sizeof(float)
 * on the target.  ip is filled with a permutation of indices below but
 * is never read afterwards -- the timed loop does not use it. */
int * __restrict__ ip = (int *) malloc(LEN_1D*sizeof(real_t));
/* Fill ip in groups of five with a shuffled index pattern. */
for (int i = 0; i < LEN_1D; i = i+5){
(ip)[i] = (i+4);
(ip)[i+1] = (i+2);
(ip)[i+2] = (i);
(ip)[i+3] = (i+3);
(ip)[i+4] = (i+1);
}
/* Timed kernel: for each element, where a[i] < 0 and b[i] > a[i],
 * accumulate d[i]*e[i] into c[i].  The doubly-guarded store is what
 * the two compilers vectorize differently (masked compares vs. the
 * scalar extraction sequence shown in the report). */
for (int nl = 0; nl < iterations; nl++) {
for (int i = 0; i < LEN_1D; i++) {
if (a[i] < (real_t)0.) {
if (b[i] > a[i]) {
c[i] += d[i] * e[i];
}
}
}
//dummy(a, b, c, d, e, aa, bb, cc, 0.);
}
/* NOTE(review): returns the uninitialized x (see above); the float is
 * implicitly converted to int for main's return value. */
return x;
}
jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native s1279.c
jh@alberti:~/tsvc/bin> perf stat ./a.out
Performance counter stats for './a.out':
2762.85 msec task-clock:u # 0.999 CPUs utilized
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
265 page-faults:u # 95.915 /sec
10155904052 cycles:u # 3.676 GHz
(83.34%)
20767 stalled-cycles-frontend:u # 0.00% frontend cycles
idle (83.36%)
36970 stalled-cycles-backend:u # 0.00% backend cycles
idle (83.36%)
27985795691 instructions:u # 2.76 insn per cycle
# 0.00 stalled cycles per
insn (83.36%)
1999265642 branches:u # 723.624 M/sec
(83.36%)
502031 branch-misses:u # 0.03% of all branches
(83.23%)
2.764553907 seconds time elapsed
2.763249000 seconds user
0.000000000 seconds sys
jh@alberti:~/tsvc/bin> ~/aocc-compiler-4.0.0/bin/clang -Ofast -march=native
s1279.c
jh@alberti:~/tsvc/bin> perf stat ./a.out
Performance counter stats for './a.out':
1980.94 msec task-clock:u # 0.999 CPUs utilized
0 context-switches:u # 0.000 /sec
0 cpu-migrations:u # 0.000 /sec
77 page-faults:u # 38.871 /sec
7261166980 cycles:u # 3.666 GHz
(83.25%)
16796 stalled-cycles-frontend:u # 0.00% frontend cycles
idle (83.25%)
34506 stalled-cycles-backend:u # 0.00% backend cycles
idle (83.25%)
10498254812 instructions:u # 1.45 insn per cycle
# 0.00 stalled cycles per
insn (83.40%)
1500160478 branches:u # 757.299 M/sec
(83.45%)
1000905 branch-misses:u # 0.07% of all branches
(83.40%)
1.982364055 seconds time elapsed
1.981460000 seconds user
0.000000000 seconds sys
aocc does:
.LBB0_6: # %for.inc43.vec.bb
# in Loop: Header=BB0_2 Depth=2
addq $256, %rcx # imm = 0x100
cmpq $128000, %rcx # imm = 0x1F400
je .LBB0_7
.LBB0_2: # %vector.body
# Parent Loop BB0_1 Depth=1
# => This Inner Loop Header: Depth=2
vmovups a(%rcx), %zmm1
vmovups a+64(%rcx), %zmm2
vmovups a+128(%rcx), %zmm3
vmovups a+192(%rcx), %zmm4
# implicit-def: $k4
vcmpltps %zmm0, %zmm1, %k0
vcmpltps %zmm0, %zmm2, %k1
vcmpltps %zmm0, %zmm3, %k2
vcmpltps %zmm0, %zmm4, %k3
kunpckwd %k0, %k1, %k0
kunpckwd %k2, %k3, %k1
# implicit-def: $k2
# implicit-def: $k3
kunpckdq %k0, %k1, %k0
# implicit-def: $k1
kortestq %k0, %k0
je .LBB0_4
# %bb.3: # %if.then.vec.bb
# in Loop: Header=BB0_2 Depth=2
vcmpltps b(%rcx), %zmm1, %k1
vcmpltps b+64(%rcx), %zmm2, %k2
vcmpltps b+128(%rcx), %zmm3, %k3
vcmpltps b+192(%rcx), %zmm4, %k4
.LBB0_4: # %if.then.vec.join.bb
# in Loop: Header=BB0_2 Depth=2
kunpckwd %k1, %k2, %k5
kunpckwd %k3, %k4, %k6
kunpckdq %k5, %k6, %k5
ktestq %k0, %k5
je .LBB0_6
So the mask registers handle the conditionals
and GCC with 256bit vectors:
.L2:
vmovdqa %ymm7, %ymm1
vmovdqa %ymm8, %ymm0
addq $160, %rax
vpaddd %ymm4, %ymm8, %ymm8
vpaddd %ymm18, %ymm1, %ymm2
vpaddd %ymm17, %ymm1, %ymm1
vpaddd %ymm4, %ymm7, %ymm7
vextracti64x2 $1, %ymm2, %xmm3
vmovq %xmm2, -160(%rax)
vpextrq $1, %xmm2, -140(%rax)
vmovq %xmm1, -80(%rax)
vpextrq $1, %xmm1, -60(%rax)
valignq $3, %ymm2, %ymm2, %ymm2
vmovq %xmm3, -120(%rax)
vmovdqa %ymm0, %ymm3
vmovq %xmm2, -100(%rax)
vextracti64x2 $1, %ymm1, %xmm2
valignq $3, %ymm1, %ymm1, %ymm1
vmovq %xmm2, -40(%rax)
vpaddd %ymm5, %ymm0, %ymm2
vmovd %xmm2, -144(%rax)
vpextrd $1, %xmm2, -124(%rax)
vpextrd $2, %xmm2, -104(%rax)
vmovq %xmm1, -20(%rax)
vpaddd %ymm6, %ymm0, %ymm1
vpermt2d %ymm1, %ymm16, %ymm3
vpextrd $3, %xmm2, -84(%rax)
vmovq %xmm3, -152(%rax)
vmovdqa %ymm0, %ymm3
vpermt2d %ymm1, %ymm15, %ymm3
vmovq %xmm3, -132(%rax)
vmovdqa %ymm0, %ymm3
vpermt2d %ymm1, %ymm14, %ymm3
vmovq %xmm3, -112(%rax)
vmovdqa %ymm0, %ymm3
vpermt2d %ymm1, %ymm13, %ymm3
vmovq %xmm3, -92(%rax)
vmovdqa %ymm0, %ymm3
vpermt2d %ymm1, %ymm12, %ymm3
vmovq %xmm3, -72(%rax)
vmovdqa %ymm0, %ymm3
vpermt2d %ymm1, %ymm11, %ymm3
vmovq %xmm3, -52(%rax)
vmovdqa %ymm0, %ymm3
vpermt2d %ymm1, %ymm9, %ymm0
vmovq %xmm0, -12(%rax)
vpermt2d %ymm1, %ymm10, %ymm3
vextracti32x4 $1, %ymm2, %xmm0
vmovq %xmm3, -32(%rax)
vmovd %xmm0, -64(%rax)
valignd $5, %ymm2, %ymm2, %ymm0
vmovd %xmm0, -44(%rax)
valignd $6, %ymm2, %ymm2, %ymm0
valignd $7, %ymm2, %ymm2, %ymm2
vmovd %xmm0, -24(%rax)
vmovd %xmm2, -4(%rax)
cmpq %rax, %rcx
jne .L2
with 512bit vectors:
.L2:
vmovdqa32 %zmm5, %zmm1
addq $320, %rax
vpaddd %zmm2, %zmm5, %zmm5
vmovdqa32 %zmm6, %zmm0
vpaddd %zmm2, %zmm6, %zmm6
vpaddd %zmm24, %zmm1, %zmm25
vpaddd %zmm23, %zmm1, %zmm1
valignq $3, %ymm25, %ymm25, %ymm26
vmovq %xmm25, -320(%rax)
vpextrq $1, %xmm25, -300(%rax)
vmovq %xmm1, -160(%rax)
vpextrq $1, %xmm1, -140(%rax)
vextracti64x2 $1, %ymm25, %xmm27
vextracti64x4 $0x1, %zmm25, %ymm25
vmovq %xmm26, -260(%rax)
vmovq %xmm25, -240(%rax)
vpextrq $1, %xmm25, -220(%rax)
vextracti64x2 $1, %ymm25, %xmm26
vmovq %xmm27, -280(%rax)
valignq $3, %ymm25, %ymm25, %ymm25
vmovq %xmm26, -200(%rax)
vmovq %xmm25, -180(%rax)
valignq $3, %ymm1, %ymm1, %ymm25
vextracti64x2 $1, %ymm1, %xmm26
vextracti64x4 $0x1, %zmm1, %ymm1
vmovq %xmm25, -100(%rax)
vmovq %xmm1, -80(%rax)
vpextrq $1, %xmm1, -60(%rax)
vextracti64x2 $1, %ymm1, %xmm25
vmovq %xmm26, -120(%rax)
vmovdqa32 %zmm0, %zmm26
valignq $3, %ymm1, %ymm1, %ymm1
vmovq %xmm25, -40(%rax)
vpaddd %zmm3, %zmm0, %zmm25
vmovq %xmm1, -20(%rax)
vpaddd %zmm4, %zmm0, %zmm1
vpermt2d %zmm1, %zmm22, %zmm26
vmovq %xmm26, -312(%rax)
vmovdqa32 %zmm0, %zmm26
vpermt2d %zmm1, %zmm21, %zmm26
vmovq %xmm26, %rdx
vmovdqa32 %zmm0, %zmm26
movq %rdx, -292(%rax)
vpermt2d %zmm1, %zmm20, %zmm26
vmovq %xmm26, -272(%rax)
vmovdqa32 %zmm0, %zmm26
vpermt2d %zmm1, %zmm19, %zmm26
vmovq %xmm26, %rdx
vmovdqa32 %zmm0, %zmm26
movq %rdx, -252(%rax)
vpermt2d %zmm1, %zmm18, %zmm26
vmovq %xmm26, -232(%rax)
vmovdqa32 %zmm0, %zmm26
vpermt2d %zmm1, %zmm17, %zmm26
vmovq %xmm26, %rdx
vmovdqa32 %zmm0, %zmm26
movq %rdx, -212(%rax)
vpermt2d %zmm1, %zmm16, %zmm26
vmovq %xmm26, -192(%rax)
vmovdqa32 %zmm0, %zmm26
vmovq %xmm26, %rdx
vmovdqa32 %zmm0, %zmm26
movq %rdx, -172(%rax)
vpermt2d %zmm1, %zmm14, %zmm26
vmovq %xmm26, -152(%rax)
vmovdqa32 %zmm0, %zmm26
vpermt2d %zmm1, %zmm13, %zmm26
vmovq %xmm26, %rdx
vmovdqa32 %zmm0, %zmm26
movq %rdx, -132(%rax)
vpermt2d %zmm1, %zmm12, %zmm26
vmovq %xmm26, -112(%rax)
vmovdqa32 %zmm0, %zmm26
vpermt2d %zmm1, %zmm11, %zmm26
vmovq %xmm26, %rdx
vmovdqa32 %zmm0, %zmm26
movq %rdx, -92(%rax)
vpermt2d %zmm1, %zmm10, %zmm26
vmovq %xmm26, -72(%rax)
vmovdqa32 %zmm0, %zmm26
vpermt2d %zmm1, %zmm9, %zmm26
vmovq %xmm26, %rdx
vmovdqa32 %zmm0, %zmm26
vpermt2d %zmm1, %zmm7, %zmm0
vmovq %xmm0, -12(%rax)
movq %rdx, -52(%rax)
vmovdqa32 %ymm25, %ymm0
vpermt2d %zmm1, %zmm8, %zmm26
vextracti32x4 $1, %ymm25, %xmm1
vmovq %xmm26, -32(%rax)
vmovd %xmm25, -304(%rax)
vpextrd $1, %xmm0, -284(%rax)
vpextrd $2, %xmm0, -264(%rax)
vmovd %xmm1, -224(%rax)
valignd $5, %ymm25, %ymm25, %ymm1
vpextrd $3, %xmm0, -244(%rax)
valignd $7, %ymm25, %ymm25, %ymm0
vmovd %xmm1, -204(%rax)
valignd $6, %ymm25, %ymm25, %ymm1
vmovd %xmm0, -164(%rax)
vextracti32x8 $0x1, %zmm25, %ymm0
vmovd %xmm0, -144(%rax)
vpextrd $1, %xmm0, -124(%rax)
vmovd %xmm1, -184(%rax)
vextracti32x4 $1, %ymm0, %xmm1
vpextrd $2, %xmm0, -104(%rax)
vpextrd $3, %xmm0, -84(%rax)
vmovd %xmm1, -64(%rax)
valignd $5, %ymm0, %ymm0, %ymm1
vmovd %xmm1, -44(%rax)
valignd $6, %ymm0, %ymm0, %ymm1
valignd $7, %ymm0, %ymm0, %ymm0
vmovd %xmm1, -24(%rax)
vmovd %xmm0, -4(%rax)
cmpq %rax, %rcx
jne .L2
More information about the Gcc-bugs
mailing list