typedef float real_t;

#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256

void dummy(void);

real_t a[LEN_1D], b[LEN_1D], aa[LEN_2D][LEN_2D];

void main()
{
  // linear dependence testing
  // no dependence - vectorizable
  for (int nl = 0; nl < 2*iterations; nl++) {
      for (int i = 1; i < LEN_1D; i += 2) {
          a[i] = a[i - 1] + b[i];
      }
      dummy();
  }
}

This takes 0.73s with -march=native -Ofast -mprefer-avx128 and 0.81s with plain -march=native -Ofast.

The 128-bit version is:

main:
.LFB0:
        .cfi_startproc
        pushq %rbx
        .cfi_def_cfa_offset 16
        .cfi_offset 3, -16
        movl $200000, %ebx
.L2:
        xorl %eax, %eax
        .p2align 4
        .p2align 3
.L4:
        vmovaps a(%rax), %xmm2
        vmovups b+4(%rax), %xmm3
        addq $32, %rax
        vshufps $136, a-16(%rax), %xmm2, %xmm0
        vshufps $136, b-12(%rax), %xmm3, %xmm1
        vaddps %xmm1, %xmm0, %xmm0
        vmovss %xmm0, a-28(%rax)
        vextractps $1, %xmm0, a-20(%rax)
        vextractps $2, %xmm0, a-12(%rax)
        vextractps $3, %xmm0, a-4(%rax)
        cmpq $127968, %rax
        jne .L4
        vmovss b+127972(%rip), %xmm0
        xorl %eax, %eax
        vaddss a+127968(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127972(%rip)
        vmovss a+127976(%rip), %xmm0
        vaddss b+127980(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127980(%rip)
        vmovss a+127984(%rip), %xmm0
        vaddss b+127988(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127988(%rip)
        vmovss a+127992(%rip), %xmm0
        vaddss b+127996(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127996(%rip)
        call dummy

The 256-bit version is:

main:
.LFB0:
        .cfi_startproc
        pushq %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movq %rsp, %rbp
        .cfi_def_cfa_register 6
        pushq %rbx
        .cfi_offset 3, -24
        movl $200000, %ebx
        andq $-32, %rsp
        .p2align 4
        .p2align 3
.L2:
        xorl %eax, %eax
        .p2align 4
        .p2align 3
.L4:
        vmovaps a(%rax), %ymm4
        vmovups b+4(%rax), %ymm5
        addq $64, %rax
        vshufps $136, a-32(%rax), %ymm4, %ymm1
        vperm2f128 $3, %ymm1, %ymm1, %ymm2
        vshufps $68, %ymm2, %ymm1, %ymm0
        vshufps $238, %ymm2, %ymm1, %ymm2
        vshufps $136, b-28(%rax), %ymm5, %ymm1
        vinsertf128 $1, %xmm2, %ymm0, %ymm0
        vperm2f128 $3, %ymm1, %ymm1, %ymm2
        vshufps $68, %ymm2, %ymm1, %ymm3
        vshufps $238, %ymm2, %ymm1, %ymm2
        vinsertf128 $1, %xmm2, %ymm3, %ymm1
        vaddps %ymm1, %ymm0, %ymm0
        vmovss %xmm0, a-60(%rax)
        vextractps $1, %xmm0, a-52(%rax)
        vextractps $2, %xmm0, a-44(%rax)
        vextractps $3, %xmm0, a-36(%rax)
        vextractf128 $0x1, %ymm0, %xmm0
        vmovss %xmm0, a-28(%rax)
        vextractps $1, %xmm0, a-20(%rax)
        vextractps $2, %xmm0, a-12(%rax)
        vextractps $3, %xmm0, a-4(%rax)
        cmpq $127936, %rax
        jne .L4
        vmovaps a+127936(%rip), %xmm6
        vmovups b+127940(%rip), %xmm7
        xorl %eax, %eax
        vshufps $136, a+127952(%rip), %xmm6, %xmm0
        vshufps $136, b+127956(%rip), %xmm7, %xmm1
        vaddps %xmm1, %xmm0, %xmm0
        vmovss %xmm0, a+127940(%rip)
        vextractps $1, %xmm0, a+127948(%rip)
        vextractps $2, %xmm0, a+127956(%rip)
        vextractps $3, %xmm0, a+127964(%rip)
        vmovss b+127972(%rip), %xmm0
        vaddss a+127968(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127972(%rip)
        vmovss b+127980(%rip), %xmm0
        vaddss a+127976(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127980(%rip)
        vmovss b+127988(%rip), %xmm0
        vaddss a+127984(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127988(%rip)
        vmovss a+127992(%rip), %xmm0
        vaddss b+127996(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127996(%rip)
        vzeroupper
        call dummy
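Most of the shuffles in the 256-bit loop are spent reconstructing an "extract even elements" operation that AVX does not provide as a single instruction. A hand-written sketch of that primitive for reference (not from the report; even_128 and even_256 are made-up names):

#include <immintrin.h>

/* SSE: evens of {x0..x3} and {y0..y3} -> {x0, x2, y0, y2}: one vshufps.  */
static __m128 even_128 (__m128 x, __m128 y)
{
  return _mm_shuffle_ps (x, y, _MM_SHUFFLE (2, 0, 2, 0));   /* imm 136 */
}

/* AVX: evens of {x0..x7} and {y0..y7} -> {x0,x2,x4,x6, y0,y2,y4,y6}.
   vshufps only shuffles within each 128-bit lane, so a lane swap, two
   more shuffles and an insert are needed to restore element order --
   the vperm2f128/vshufps/vinsertf128 sequence in the loop above.  */
static __m256 even_256 (__m256 x, __m256 y)
{
  __m256 t    = _mm256_shuffle_ps (x, y, _MM_SHUFFLE (2, 0, 2, 0));
  __m256 swap = _mm256_permute2f128_ps (t, t, 0x01);
  __m256 lo   = _mm256_shuffle_ps (t, swap, _MM_SHUFFLE (1, 0, 1, 0));
  __m256 hi   = _mm256_shuffle_ps (t, swap, _MM_SHUFFLE (3, 2, 3, 2));
  return _mm256_insertf128_ps (lo, _mm256_castps256_ps128 (hi), 1);
}

That is one instruction with SSE against five with AVX, two of them lane-crossing, per extracted vector.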
We're generating quite inefficient vectorized code here, and the lack of cross-lane interleaving permutes hurts AVX vectorization: there is no single-instruction extract-even / extract-odd available, and we do not factor this in when costing. Surprisingly, not vectorizing at all is still slower (1.4s vs. 1.35s with AVX for me), but -funroll-loops without vectorizing is comparable to vectorizing with SSE, and unrolling on top of SSE vectorization doesn't help. In the end what we miss (apart from the bad use of interleaving) is the opportunity to use masked stores (and loads), which would halve the number of usable lanes but likely still provide a speedup over the scalar unrolled code.
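To make the masked-store idea concrete, here is a minimal hand-written sketch (not compiler output; masked_variant and odd are made-up names), reusing a[], b[] and LEN_1D from the reproducer above. Each iteration covers 8 consecutive elements with only the 4 odd lanes active, but it needs no cross-lane permutes and no scalar element stores:

#include <immintrin.h>

/* Only lanes 1, 3, 5 and 7 (the odd indices of each block) get stored.  */
static void masked_variant (void)
{
  const __m256i odd = _mm256_set_epi32 (-1, 0, -1, 0, -1, 0, -1, 0);

  for (int i = 0; i < LEN_1D; i += 8)      /* LEN_1D is a multiple of 8 */
    {
      __m256 va = _mm256_loadu_ps (&a[i]);   /* a[i] .. a[i+7]                */
      __m256 vb = _mm256_loadu_ps (&b[i]);   /* b[i] .. b[i+7]                */
      __m256 vp = _mm256_moveldup_ps (va);   /* odd lane k now holds a[i+k-1] */
      __m256 vs = _mm256_add_ps (vp, vb);    /* odd lane k: a[i+k-1] + b[i+k] */
      _mm256_maskstore_ps (&a[i], odd, vs);  /* write the odd lanes only      */
    }
}

The masked loads mentioned above would be the analogous way of fetching the a[i-1] values directly; the plain load plus vmovsldup here is just the simplest formulation. Whether either variant actually beats the unrolled scalar code is of course the conjecture stated above.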