Bug 99415 - s115 benchmark of TSVC is vectorized by icc and not by gcc
Summary: s115 benchmark of TSVC is vectorized by icc and not by gcc
Status: NEW
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 11.0
Importance: P3 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks: vectorizer TSVC
  Show dependency tree / graph
 
Reported: 2021-03-05 15:50 UTC by Jan Hubicka
Modified: 2024-09-18 22:00 UTC (History)
2 users (show)

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed: 2021-03-08 00:00:00


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Jan Hubicka 2021-03-05 15:50:14 UTC
/* Element type of the arrays used by the kernel. */
typedef float real_t;

/* The repetition count below is 1000*(iterations/LEN_2D) = 1000*390 = 390000
   (integer division). */
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256

real_t a[LEN_1D],aa[LEN_2D][LEN_2D];
/* Reproducer: the s115 loop nest, repeated 390000 times.
   For each j, every a[i] with i > j is updated with aa[j][i] * a[j]. */
void main()
{

    /* nl only repeats the kernel; it is not used inside the loop nest. */
    for (int nl = 0; nl < 1000*(iterations/LEN_2D); nl++) {
        /* For j == LEN_2D-1 the inner loop starts at i == LEN_2D and so runs
           zero times. */
        for (int j = 0; j < LEN_2D; j++) {
            /* i starts at j+1, so a[j] is never written inside this loop. */
            for (int i = j+1; i < LEN_2D; i++) {
                a[i] -= aa[j][i] * a[j];
            }
        }
    }

}

is built by icc as:
# main: compiler-generated code for the loop nest above (AT&T syntax, x86-64).
# This part is the prologue: frame setup, register saves, a runtime init call,
# and MXCSR setup. %eax is zeroed here and used as the nl counter (see ..B1.22).
main:
..B1.1:                         # Preds ..B1.0
                                # Execution count [1.17e-01]
        .cfi_startproc
..___tag_value_main.1:
..L2:
                                                          #9.1
        pushq     %rbp                                          #9.1
        .cfi_def_cfa_offset 16
        movq      %rsp, %rbp                                    #9.1
        .cfi_def_cfa 6, 16
        .cfi_offset 6, -16
        # Round %rsp down to a multiple of 128; %rbp keeps the pre-alignment value.
        andq      $-128, %rsp                                   #9.1
        # Save r14, r15, rbx (popped in reverse order in ..B1.23).
        pushq     %r14                                          #9.1
        pushq     %r15                                          #9.1
        pushq     %rbx                                          #9.1
        subq      $104, %rsp                                    #9.1
        # edi = 3, esi = 0 are set up before the call.
        # NOTE(review): presumably the two arguments of an Intel runtime
        # CPU-feature init routine; its semantics are not visible here.
        movl      $3, %edi                                      #9.1
        xorl      %esi, %esi                                    #9.1
        call      __intel_new_feature_proc_init                 #9.1
        .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
        .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
                                # LOE rbx r12 r13 r14 r15
..B1.29:                        # Preds ..B1.1
                                # Execution count [1.17e-01]
        # Read MXCSR to the stack, OR in 32832 (0x8040 = bit 15 | bit 6, the
        # flush-to-zero and denormals-are-zero bits), and load it back.
        vstmxcsr  (%rsp)                                        #9.1
        # eax = 0: initial value of the nl counter.
        xorl      %eax, %eax                                    #11.5
        orl       $32832, (%rsp)                                #9.1
        vldmxcsr  (%rsp)                                        #9.1
                                # LOE r12 r13 eax
# ..B1.2 - ..B1.6: start of one nl repetition and of one j iteration.
# Computes the inner-loop trip count, the number of scalar "peel" iterations
# needed before the 32-byte-wide loop, and the end index of the 16-wide loop.
..B1.2:                         # Preds ..B1.22 ..B1.29
                                # Execution count [4.50e+04]
        # Reset the per-j induction variables for j = 0:
        #   r11 = j             (incremented by 1 in ..B1.21)
        #   edi = j, becomes j+1 after the incl in ..B1.3
        #   rbx = 4*j           (byte offset of a[j]; +4 in ..B1.21)
        #   r9d = -j, becomes -(j+1) after the decl in ..B1.3
        #   rsi = 1024*j        (byte offset of row aa[j]; +1024 in ..B1.21)
        xorl      %r11d, %r11d                                  #12.9
        xorl      %edi, %edi                                    #12.9
        xorl      %ebx, %ebx                                    #12.9
        xorl      %r9d, %r9d                                    #12.9
        xorl      %esi, %esi                                    #12.9
                                # LOE rbx rsi r11 r12 r13 eax edi r9d
..B1.3:                         # Preds ..B1.21 ..B1.2
                                # Execution count [1.15e+07]
        # edi = j+1 (first i), r9d = -(j+1).  If j+1 >= 256 the inner loop has
        # no iterations, so go straight to the j-loop latch.
        incl      %edi                                          #13.28
        decl      %r9d                                          #13.28
        cmpl      $256, %edi                                    #13.35
        jge       ..B1.21       # Prob 50%                      #13.35
                                # LOE rbx rsi r11 r12 r13 eax edi r9d
..B1.4:                         # Preds ..B1.3
                                # Execution count [1.04e+07]
        # r10d = 256 - (j+1) = inner-loop trip count.
        # Fewer than 16 iterations: take the scalar-only path (..B1.25).
        lea       256(%r9), %r10d                               #13.35
        cmpl      $16, %r10d                                    #13.13
        jl        ..B1.25       # Prob 10%                      #13.13
                                # LOE rbx rsi r11 r12 r13 eax edi r9d r10d
..B1.5:                         # Preds ..B1.4
                                # Execution count [1.04e+07]
        # r8  = (address of aa[j][j+1]) & 31, its offset within a 32-byte line.
        # r14 = rsi + rbx = byte offset of aa[j][j] from aa.
        # edx = (32 - r8) / 4 = number of floats up to the next 32-byte boundary.
        # r8d = (r8 != 0) ? edx : 0 = peel count.
        lea       4+aa(%rsi,%rbx), %r8                          #14.25
        andq      $31, %r8                                      #13.13
        lea       (%rsi,%rbx), %r14                             #14.25
        movl      %r8d, %edx                                    #13.13
        negl      %edx                                          #13.13
        addl      $32, %edx                                     #13.13
        shrl      $2, %edx                                      #13.13
        testl     %r8d, %r8d                                    #13.13
        cmovne    %edx, %r8d                                    #13.13
        # If trip count < peel count + 16, fall back to the scalar-only path.
        lea       16(%r8), %ecx                                 #13.13
        cmpl      %ecx, %r10d                                   #13.13
        jl        ..B1.25       # Prob 10%                      #13.13
                                # LOE rbx rsi r8 r11 r12 r13 r14 eax edi r9d r10d
..B1.6:                         # Preds ..B1.5
                                # Execution count [1.15e+07]
        # ecx = r10d - ((r10d - r8d) & 15): index at which the 16-wide loop
        # stops (peel count plus a multiple of 16).
        # r15 = 0 and rdx = 0: counter and byte offset for the peel loop.
        movl      %r10d, %ecx                                   #13.13
        xorl      %r15d, %r15d                                  #13.13
        subl      %r8d, %ecx                                    #13.13
        xorl      %edx, %edx                                    #13.13
        andl      $15, %ecx                                     #13.13
        negl      %ecx                                          #13.13
        addl      %r10d, %ecx                                   #13.13
        # test clears CF, so jbe is taken exactly when the peel count is zero.
        testl     %r8d, %r8d                                    #13.13
        jbe       ..B1.10       # Prob 9%                       #13.13
                                # LOE rdx rbx rsi r8 r11 r12 r13 r14 r15 eax ecx edi r9d r10d
# ..B1.7 / ..B1.8: scalar peel loop, one float per iteration, run r8 times.
# rbx = 4*j, r14 = byte offset of aa[j][j]; r15 counts iterations and rdx is
# the matching byte offset (both start at 0, set in ..B1.6).
..B1.7:                         # Preds ..B1.6
                                # Execution count [1.04e+07]
        # xmm0 = a[j], loaded once before the loop.
        vmovss    a(%rbx), %xmm0                                #14.36
                                # LOE rdx rbx rsi r8 r11 r12 r13 r14 r15 eax ecx edi r9d r10d xmm0
..B1.8:                         # Preds ..B1.8 ..B1.7
                                # Execution count [3.33e+11]
        # xmm1 = aa[j][j+1+k], with k = r15 (rdx = 4*k).
        vmovss    4+aa(%rdx,%r14), %xmm1                        #14.25
        incq      %r15                                          #13.13
        # xmm1 = a[j+1+k] - xmm1 * xmm0, then stored back to a[j+1+k].
        vfnmadd213ss 4+a(%rdx,%rbx), %xmm0, %xmm1               #14.17
        vmovss    %xmm1, 4+a(%rdx,%rbx)                         #14.17
        addq      $4, %rdx                                      #13.13
        # Loop while r15 < r8 (unsigned).
        cmpq      %r8, %r15                                     #13.13
        jb        ..B1.8        # Prob 99%                      #13.13
                                # LOE rdx rbx rsi r8 r11 r12 r13 r14 r15 eax ecx edi r9d r10d xmm0
# ..B1.10 / ..B1.11: main vector loop, two 8-float ymm operations per
# iteration (16 floats).  On entry r8 = peel count, r11 = j,
# r14 = byte offset of aa[j][j], ecx = end index of this loop.
..B1.10:                        # Preds ..B1.8 ..B1.6
                                # Execution count [1.04e+07]
        # ymm0 = a[j] broadcast to all 8 lanes.
        vbroadcastss a(,%r11,4), %ymm0                          #14.36
        # r15 = r8 + j, so 4+a(,%r15,4) addresses a[j+1+r8].
        lea       (%r8,%r11), %r15                              #13.13
        # rdx = ecx sign-extended: loop bound for r8.
        movslq    %ecx, %rdx                                    #13.13
        .align    16,0x90
                                # LOE rdx rbx rsi r8 r11 r12 r13 r14 r15 eax ecx edi r9d r10d ymm0
..B1.11:                        # Preds ..B1.11 ..B1.10
                                # Execution count [3.33e+11]
        # ymm1 = aa[j][j+1+r8 .. +7], ymm2 = the following 8 floats (+32 bytes).
        vmovups   4+aa(%r14,%r8,4), %ymm1                       #14.25
        vmovups   36+aa(%r14,%r8,4), %ymm2                      #14.25
        # ymmN = a[...] - ymmN * ymm0 for both halves, then stored back to a[].
        vfnmadd213ps 4+a(,%r15,4), %ymm0, %ymm1                 #14.17
        vfnmadd213ps 36+a(,%r15,4), %ymm0, %ymm2                #14.17
        vmovups   %ymm1, 4+a(,%r15,4)                           #14.17
        vmovups   %ymm2, 36+a(,%r15,4)                          #14.17
        # Advance both indices by 16 elements; loop while r8 < rdx (unsigned).
        addq      $16, %r8                                      #13.13
        addq      $16, %r15                                     #13.13
        cmpq      %rdx, %r8                                     #13.13
        jb        ..B1.11       # Prob 99%                      #13.13
                                # LOE rdx rbx rsi r8 r11 r12 r13 r14 r15 eax ecx edi r9d r10d ymm0
# ..B1.12 - ..B1.15: remainder after the 16-wide loop, handled 4 floats at a
# time with xmm registers.  On entry ecx/rdx = number of elements already
# processed, r10d = inner trip count, r11 = j, rbx = 4*j.
..B1.12:                        # Preds ..B1.11
                                # Execution count [1.04e+07]
        # If ecx + 1 > r10d (unsigned) nothing is left: go to the j-loop latch.
        lea       1(%rcx), %r8d                                 #13.13
        cmpl      %r10d, %r8d                                   #13.13
        ja        ..B1.21       # Prob 50%                      #13.13
                                # LOE rdx rbx rsi r11 r12 r13 r14 eax ecx edi r9d r10d
..B1.13:                        # Preds ..B1.12
                                # Execution count [1.04e+07]
        # r10 = trip count - rdx = remaining iterations.
        # Fewer than 4 left: skip the 4-wide loop (..B1.24 sets r8 = 0).
        movslq    %r10d, %r10                                   #13.13
        subq      %rdx, %r10                                    #13.13
        cmpq      $4, %r10                                      #13.13
        jl        ..B1.24       # Prob 10%                      #13.13
                                # LOE rdx rbx rsi r10 r11 r12 r13 r14 eax ecx edi r9d
..B1.14:                        # Preds ..B1.13
                                # Execution count [1.04e+07]
        # r8  = r10 & -4: number of remaining elements covered by the 4-wide loop.
        # r14 += 4*rdx: byte offset into aa advanced past the processed elements.
        # rdx += r11: element index used to address a[] (processed count + j).
        # r15 = 0: loop counter.
        movl      %r10d, %r8d                                   #13.13
        lea       (%r14,%rdx,4), %r14                           #14.25
        andl      $-4, %r8d                                     #13.13
        addq      %r11, %rdx                                    #13.13
        movslq    %r8d, %r8                                     #13.13
        xorl      %r15d, %r15d                                  #13.13
                                # LOE rdx rbx rsi r8 r10 r11 r12 r13 r14 r15 eax ecx edi r9d
..B1.15:                        # Preds ..B1.15 ..B1.14
                                # Execution count [3.33e+11]
        # xmm1 = a[j] broadcast to 4 lanes; this load is repeated every iteration.
        vbroadcastss a(%rbx), %xmm1                             #14.36
        # xmm0 = 4 floats of row aa[j].
        vmovups   4+aa(%r14,%r15,4), %xmm0                      #14.25
        # xmm1 = a[...] - xmm0 * xmm1, then stored back to a[].
        vfnmadd213ps 4+a(,%rdx,4), %xmm0, %xmm1                 #14.17
        addq      $4, %r15                                      #13.13
        vmovups   %xmm1, 4+a(,%rdx,4)                           #14.17
        addq      $4, %rdx                                      #13.13
        # Loop while r15 < r8 (unsigned).
        cmpq      %r8, %r15                                     #13.13
        jb        ..B1.15       # Prob 99%                      #13.13
                                # LOE rdx rbx rsi r8 r10 r11 r12 r13 r14 r15 eax ecx edi r9d
# ..B1.17 - ..B1.19: final scalar remainder loop.
# On entry: ecx = elements processed before the remainder started (0 when
# coming from ..B1.26), r10 = number of remainder elements, r8 = how many of
# those are already done, r11 = j, rsi = 1024*j.
..B1.17:                        # Preds ..B1.15 ..B1.24 ..B1.26
                                # Execution count [1.15e+07]
        # r14 = 4*r8: byte offset of the first element still to do.
        # If r8 >= r10 (unsigned) nothing is left: go to the j-loop latch.
        lea       (,%r8,4), %r14                                #13.13
        cmpq      %r10, %r8                                     #13.13
        jae       ..B1.21       # Prob 9%                       #13.13
                                # LOE rbx rsi r8 r10 r11 r12 r13 r14 eax ecx edi r9d
..B1.18:                        # Preds ..B1.17
                                # Execution count [1.04e+07]
        # rdx = 4*ecx + 4*j           (base byte offset into a[])
        # rcx = rsi + 4*j + 4*ecx     (base byte offset into aa)
        movslq    %ecx, %rcx                                    #14.17
        lea       (%rsi,%r11,4), %r15                           #14.25
        lea       (,%rcx,4), %rdx                               #14.25
        lea       (%rdx,%r11,4), %rdx                           #14.17
        lea       (%r15,%rcx,4), %rcx                           #14.25
                                # LOE rdx rcx rbx rsi r8 r10 r11 r12 r13 r14 eax edi r9d
..B1.19:                        # Preds ..B1.19 ..B1.18
                                # Execution count [3.33e+11]
        # xmm1 = a[j], reloaded every iteration.
        vmovss    a(,%r11,4), %xmm1                             #14.36
        incq      %r8                                           #13.13
        # xmm0 = one element of row aa[j]; xmm1 = a[i] - xmm0 * xmm1, stored
        # back to the same a[i].
        vmovss    4+aa(%r14,%rcx), %xmm0                        #14.25
        vfnmadd213ss 4+a(%r14,%rdx), %xmm0, %xmm1               #14.17
        vmovss    %xmm1, 4+a(%r14,%rdx)                         #14.17
        addq      $4, %r14                                      #13.13
        # Loop while r8 < r10 (unsigned).
        cmpq      %r10, %r8                                     #13.13
        jb        ..B1.19       # Prob 99%                      #13.13
                                # LOE rdx rcx rbx rsi r8 r10 r11 r12 r13 r14 eax edi r9d
# ..B1.21 / ..B1.22: latches of the j loop and of the nl repetition loop.
..B1.21:                        # Preds ..B1.19 ..B1.25 ..B1.12 ..B1.17 ..B1.3
                                #      
                                # Execution count [1.15e+07]
        # Advance to the next j: rbx += 4 (byte offset of a[j]),
        # rsi += 1024 (one row of aa: 256 floats * 4 bytes), r11 += 1.
        addq      $4, %rbx                                      #13.28
        addq      $1024, %rsi                                   #13.28
        incq      %r11                                          #13.28
        # edi holds j+1 (incremented in ..B1.3); loop while it is below 256.
        cmpl      $256, %edi                                    #12.9
        jb        ..B1.3        # Prob 99%                      #12.9
                                # LOE rbx rsi r11 r12 r13 eax edi r9d
..B1.22:                        # Preds ..B1.21
                                # Execution count [4.50e+04]
        # Bytes 0F 1F 80 00 00 00 00 encode a 7-byte NOP (nopl 0x0(%rax)).
        .byte     15                                            #11.5
        .byte     31                                            #11.5
        .byte     128                                           #11.5
        .byte     0                                             #11.5
        .byte     0                                             #11.5
        .byte     0                                             #11.5
        .byte     0                                             #11.5
        # nl++; repeat while nl < 390000 (= 1000 * (100000 / 256)).
        incl      %eax                                          #11.5
        cmpl      $390000, %eax                                 #11.5
        jb        ..B1.2        # Prob 99%                      #11.5
                                # LOE r12 r13 eax
# ..B1.23: function epilogue.  ..B1.24 - ..B1.26: infrequent blocks that set
# up the scalar remainder loop (..B1.17) when the vector loops are skipped.
..B1.23:                        # Preds ..B1.22
                                # Execution count [1.17e-01]
        # Clear the upper halves of the ymm registers before returning.
        vzeroupper                                              #19.1
        # eax = 0 at return.
        xorl      %eax, %eax                                    #19.1
        # Undo the prologue: release 104 bytes, pop rbx/r15/r14 (reverse of the
        # push order), restore the unaligned %rsp from %rbp, pop %rbp.
        addq      $104, %rsp                                    #19.1
        .cfi_restore 3
        popq      %rbx                                          #19.1
        .cfi_restore 15
        popq      %r15                                          #19.1
        .cfi_restore 14
        popq      %r14                                          #19.1
        movq      %rbp, %rsp                                    #19.1
        popq      %rbp                                          #19.1
        .cfi_def_cfa 7, 8
        .cfi_restore 6
        ret                                                     #19.1
        .cfi_def_cfa 6, 16
        .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
        .cfi_offset 6, -16
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
        .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
                                # LOE
..B1.24:                        # Preds ..B1.13
                                # Execution count [1.04e+06]: Infreq
        # Fewer than 4 elements remain after the 16-wide loop: r8 = 0 so the
        # scalar loop handles all r10 of them.
        xorl      %r8d, %r8d                                    #13.13
        jmp       ..B1.17       # Prob 100%                     #13.13
                                # LOE rbx rsi r8 r10 r11 r12 r13 eax ecx edi r9d
..B1.25:                        # Preds ..B1.5 ..B1.4
                                # Execution count [1.15e+06]: Infreq
        # Inner loop too short for the vector path: ecx = 0 (nothing processed
        # yet).  If the trip count r10d is 0, go straight to the j-loop latch.
        xorl      %ecx, %ecx                                    #13.13
        cmpl      $1, %r10d                                     #13.13
        jb        ..B1.21       # Prob 50%                      #13.13
                                # LOE rbx rsi r11 r12 r13 eax ecx edi r9d r10d
..B1.26:                        # Preds ..B1.25
                                # Execution count [5.77e+05]: Infreq
        # r10 = trip count (sign-extended), r8 = 0: the scalar loop does the
        # whole inner loop.
        movslq    %r10d, %r10                                   #13.13
        xorl      %r8d, %r8d                                    #13.13
        jmp       ..B1.17       # Prob 100%                     #13.13

which runs in 0.7s, while the gcc binary needs 5.7s
Comment 1 Richard Biener 2021-03-08 08:47:44 UTC
It seems the benchmark is written in a way that confuses our loop header copying.  Writing

        /* Outer bound is LEN_2D-1 here: the dropped j == LEN_2D-1 iteration
           would start the inner loop at i == LEN_2D, i.e. run it zero times. */
        for (int j = 0; j < LEN_2D-1; j++) {
            for (int i = j+1; i < LEN_2D; i++) {
                a[i] -= aa[j][i] * a[j];
            }
        }

fixes the vectorizing.

Possibly a mistake users make, so probably worth investigating further.  Not
sure how to most easily address this - we'd like to peel the last iteration
of the outer loop, noting it does nothing.  Maybe loop-splitting can figure
this out?  Alternatively loop header copying should just do its job...

Hmm, actually loop-header copying does do its job but then there's jump
threading messing this up again (the loop header check is redundant for
all but the last iteration of the outer loop).  So -fno-tree-dominator-opts
fixes this as well.  And for some reason ch_vect thinks the loops are
all do-while loops.