Bug 99416 - s211 benchmark of TSVC is vectorized by icc and not by gcc
Summary: s211 benchmark of TSVC is vectorized by icc and not by gcc
Status: NEW
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 11.0
Importance: P3 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks: vectorizer
  Show dependency treegraph
 
Reported: 2021-03-05 16:20 UTC by Jan Hubicka
Modified: 2021-03-08 08:50 UTC (History)
4 users (show)

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed: 2021-03-08 00:00:00


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Jan Hubicka 2021-03-05 16:20:54 UTC
typedef float real_t;

#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256

/* TSVC s211 kernel arrays. */
real_t a[LEN_1D], b[LEN_1D], c[LEN_1D], d[LEN_1D], e[LEN_1D];

/*
 * TSVC s211: the inner loop carries two dependences on b[]:
 *   - a[i] reads b[i-1], written by statement 2 of the previous
 *     iteration (flow dependence, distance 1);
 *   - b[i] reads b[i+1] before iteration i+1 overwrites it
 *     (anti-dependence, distance 1).
 * Vectorizing the loop as-is is therefore invalid; ICC first applies
 * loop distribution (splitting the two statements into separate loops)
 * and then vectorizes each, which is what the quoted assembly shows.
 *
 * Was `void main()`; ISO C requires main to return int.
 */
int main(void)
{
    for (int nl = 0; nl < iterations; nl++) {
        for (int i = 1; i < LEN_1D - 1; i++) {
            a[i] = b[i - 1] + c[i] * d[i];
            b[i] = b[i + 1] - e[i] * d[i];
        }
    }
    return 0;
}


Icc produces:
ain:
..B1.1:                         # Preds ..B1.0
                                # Execution count [0.00e+00]
        .cfi_startproc
..___tag_value_ain.1:
..L2:
                                                          #9.1
        subq      $136, %rsp                                    #9.1
        .cfi_def_cfa_offset 144
        xorl      %edx, %edx                                    #11.5
        lea       12+d(%rip), %r8                               #14.38
        vmovss    (%r8), %xmm0                                  #14.38
        movl      $7, %edi                                      #13.38
        lea       12+e(%rip), %r9                               #14.38
        vmulss    (%r9), %xmm0, %xmm12                          #14.38
        xorl      %esi, %esi                                    #13.38
        lea       12+c(%rip), %r10                              #13.38
        vmulss    (%r10), %xmm0, %xmm0                          #13.38
        vmovss    16(%r8), %xmm4                                #14.38
        movl      $31977, %ecx                                  #12.9
        vmulss    16(%r9), %xmm4, %xmm14                        #14.38
        movl      $31975, %eax                                  #12.9
        lea       24+b(%rip), %r11                              #14.20
        vmovss    (%r11), %xmm11                                #14.20
        vmovss    4(%r8), %xmm6                                 #14.38
        vmovss    %xmm12, 104(%rsp)                             #14.38[spill]
        vmovss    %xmm11, 8(%rsp)                               #14.20[spill]
        vmulss    4(%r9), %xmm6, %xmm12                         #14.38
        vmulss    4(%r10), %xmm6, %xmm11                        #13.38
        vmovss    127984+d(%rip), %xmm6                         #14.38
        vmovss    8(%r8), %xmm13                                #14.38
        vmovss    %xmm14, 96(%rsp)                              #14.38[spill]
        vmulss    127984+e(%rip), %xmm6, %xmm14                 #14.38
        vmulss    8(%r9), %xmm13, %xmm1                         #14.38
        vmovss    %xmm14, 112(%rsp)                             #14.38[spill]
        vmovss    127988+d(%rip), %xmm14                        #14.38
        vmovss    %xmm1, 16(%rsp)                               #14.38[spill]
        vmulss    8(%r10), %xmm13, %xmm1                        #13.38
        vmulss    16(%r10), %xmm4, %xmm13                       #13.38
        vmulss    127988+e(%rip), %xmm14, %xmm4                 #14.38
        vmovss    %xmm4, 120(%rsp)                              #14.38[spill]
        vmulss    127988+c(%rip), %xmm14, %xmm4                 #13.38
        vmovss    -4(%r11), %xmm5                               #14.20
        vmovss    -8(%r8), %xmm2                                #14.38
        vmovss    12(%r8), %xmm15                               #14.38
        vmovss    %xmm4, 24(%rsp)                               #13.38[spill]
        vmovss    127992+d(%rip), %xmm4                         #14.38
        vmovss    %xmm5, (%rsp)                                 #14.20[spill]
        vmulss    -8(%r9), %xmm2, %xmm3                         #14.38
        vmulss    -8(%r10), %xmm2, %xmm5                        #13.38
        vmulss    12(%r9), %xmm15, %xmm2                        #14.38
        vmulss    12(%r10), %xmm15, %xmm15                      #13.38
        vmulss    127992+e(%rip), %xmm4, %xmm14                 #14.38
        vmulss    127992+c(%rip), %xmm4, %xmm4                  #13.38
        vmovss    -4(%r8), %xmm10                               #14.38
        vmulss    -4(%r9), %xmm10, %xmm7                        #14.38
        vmulss    -4(%r10), %xmm10, %xmm10                      #13.38
        vmovss    %xmm7, 88(%rsp)                               #14.38[spill]
        vmovss    %xmm4, 32(%rsp)                               #13.38[spill]
        vmovss    %xmm15, 56(%rsp)                              #13.31[spill]
        vmovss    %xmm14, 40(%rsp)                              #13.31[spill]
        vmovss    %xmm3, 80(%rsp)                               #13.31[spill]
        vmovss    -16(%r11), %xmm9                              #14.20
        vmovss    -12(%r11), %xmm8                              #14.20
        vmovss    -8(%r11), %xmm7                               #14.20
        vmovss    127984+c(%rip), %xmm4                         #13.31
        vmovss    %xmm1, 64(%rsp)                               #13.31[spill]
        vmovss    %xmm0, 48(%rsp)                               #13.31[spill]
        vmovss    %xmm2, 72(%rsp)                               #13.31[spill]
        vmovss    16(%rsp), %xmm14                              #13.31[spill]
        vmovss    8(%rsp), %xmm15                               #13.31[spill]
        vmovss    (%rsp), %xmm3                                 #13.31[spill]
                                # LOE rax rcx rbx rbp rsi rdi r12 r13 r14 r15 edx xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15
..B1.2:                         # Preds ..B1.10 ..B1.1
                                # Execution count [1.00e+05]
        movq      %rdi, %r8                                     #12.9
        vsubss    80(%rsp), %xmm9, %xmm0                        #14.38[spill]
        vsubss    88(%rsp), %xmm8, %xmm1                        #14.38[spill]
        vsubss    104(%rsp), %xmm7, %xmm2                       #14.38[spill]
        vsubss    %xmm14, %xmm15, %xmm7                         #14.38
        vsubss    %xmm12, %xmm3, %xmm3                          #14.38
        vmovss    28+b(%rip), %xmm8                             #14.20
        vmovss    32+b(%rip), %xmm15                            #14.20
        vmovss    %xmm0, 4+b(%rip)                              #14.13
        vmovss    %xmm1, 8+b(%rip)                              #14.13
        vmovss    %xmm2, 12+b(%rip)                             #14.13
        vmovss    %xmm3, 16+b(%rip)                             #14.13
        vmovss    %xmm7, 20+b(%rip)                             #14.13
        vsubss    72(%rsp), %xmm8, %xmm9                        #14.38[spill]
        vsubss    96(%rsp), %xmm15, %xmm0                       #14.38[spill]
        vmovss    %xmm9, 24+b(%rip)                             #14.13
        vmovss    %xmm0, 28+b(%rip)                             #14.13
                                # LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 edx xmm4 xmm5 xmm6 xmm10 xmm11 xmm12 xmm13 xmm14
..B1.3:                         # Preds ..B1.3 ..B1.2
                                # Execution count [3.20e+09]
        vmovups   4+e(,%r8,4), %ymm1                            #14.31
        lea       (,%r8,4), %r9                                 #14.13
        vmovups   36+e(,%r8,4), %ymm3                           #14.31
        vmovups   68+e(,%r8,4), %ymm8                           #14.31
        vmovups   100+e(,%r8,4), %ymm15                         #14.31
        vmovups   4+d(,%r8,4), %ymm0                            #14.38
        vmovups   36+d(,%r8,4), %ymm2                           #14.38
        vmovups   68+d(,%r8,4), %ymm7                           #14.38
        vmovups   100+d(,%r8,4), %ymm9                          #14.38
        vfnmadd213ps 8+b(,%r8,4), %ymm0, %ymm1                  #14.38
        vfnmadd213ps 40+b(,%r8,4), %ymm2, %ymm3                 #14.38
        vfnmadd213ps 72+b(,%r8,4), %ymm7, %ymm8                 #14.38
        vfnmadd213ps 104+b(,%r8,4), %ymm9, %ymm15               #14.38
        vmovups   %ymm1, 4+b(%r9)                               #14.13
        vmovups   %ymm3, 36+b(%r9)                              #14.13
        vmovups   %ymm8, 68+b(%r9)                              #14.13
        vmovups   %ymm15, 100+b(%r9)                            #14.13
        addq      $32, %r8                                      #12.9
        cmpq      $31975, %r8                                   #12.9
        jb        ..B1.3        # Prob 99%                      #12.9
                                # LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 edx xmm4 xmm5 xmm6 xmm10 xmm11 xmm12 xmm13 xmm14
..B1.4:                         # Preds ..B1.3
                                # Execution count [1.00e+05]
        movq      %rsi, %r9                                     #12.9
        movq      %rcx, %r8                                     #12.9
                                # LOE rax rcx rbx rbp rsi rdi r8 r9 r12 r13 r14 r15 edx xmm4 xmm5 xmm6 xmm10 xmm11 xmm12 xmm13 xmm14
..B1.5:                         # Preds ..B1.5 ..B1.4
                                # Execution count [3.20e+09]
        vmovups   127904+e(,%r9,4), %xmm1                       #14.31
        vmovups   127904+d(,%r9,4), %xmm0                       #14.38
        vfnmadd213ps b(,%r8,4), %xmm0, %xmm1                    #14.38
        addq      $4, %r8                                       #12.9
        vmovups   %xmm1, 127904+b(,%r9,4)                       #14.13
        addq      $4, %r9                                       #12.9
        cmpq      $20, %r9                                      #12.9
        jb        ..B1.5        # Prob 99%                      #12.9
                                # LOE rax rcx rbx rbp rsi rdi r8 r9 r12 r13 r14 r15 edx xmm4 xmm5 xmm6 xmm10 xmm11 xmm12 xmm13 xmm14
..B1.6:                         # Preds ..B1.5
                                # Execution count [1.00e+05]
        vmovss    127996+b(%rip), %xmm9                         #14.20
        movq      %rdi, %r8                                     #12.9
        vmovss    127992+b(%rip), %xmm1                         #14.20
        vmovss    127988+b(%rip), %xmm2                         #14.20
        vaddss    b(%rip), %xmm5, %xmm7                         #13.38
        vaddss    4+b(%rip), %xmm10, %xmm3                      #13.38
        vsubss    40(%rsp), %xmm9, %xmm8                        #14.38[spill]
        vsubss    112(%rsp), %xmm2, %xmm2                       #14.38[spill]
        vsubss    120(%rsp), %xmm1, %xmm1                       #14.38[spill]
        vmovss    %xmm7, 4+a(%rip)                              #13.13
        vmovss    16+b(%rip), %xmm7                             #13.20
        vmovss    %xmm3, 8+a(%rip)                              #13.13
        vmovss    8+b(%rip), %xmm9                              #13.20
        vmovss    %xmm8, 127992+b(%rip)                         #14.13
        vmovss    12+b(%rip), %xmm8                             #13.20
        vmovss    %xmm2, 127984+b(%rip)                         #14.13
        vaddss    %xmm11, %xmm8, %xmm0                          #13.38
        vaddss    64(%rsp), %xmm7, %xmm3                        #13.38[spill]
        vaddss    48(%rsp), %xmm9, %xmm15                       #13.38[spill]
        vmovss    %xmm3, 20+a(%rip)                             #13.13
        vmovss    20+b(%rip), %xmm3                             #13.20
        vmovss    %xmm15, 12+a(%rip)                            #13.13
        vmovss    %xmm0, 16+a(%rip)                             #13.13
        vmovss    %xmm1, 127988+b(%rip)                         #14.13
        vmovss    %xmm9, (%rsp)                                 #13.13[spill]
        vaddss    56(%rsp), %xmm3, %xmm15                       #13.38[spill]
        vmovss    %xmm15, 24+a(%rip)                            #13.13
        vmovss    24+b(%rip), %xmm15                            #13.20
        vaddss    %xmm13, %xmm15, %xmm0                         #13.38
        vmovss    %xmm0, 28+a(%rip)                             #13.13
                                # LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 edx xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15
..B1.7:                         # Preds ..B1.7 ..B1.6
                                # Execution count [3.20e+09]
        vmovups   4+c(,%r8,4), %ymm9                            #13.31
        lea       (,%r8,4), %r9                                 #13.13
        vmovups   4+d(,%r8,4), %ymm0                            #13.38
        vfmadd213ps b(,%r8,4), %ymm0, %ymm9                     #13.38
        vmovups   36+d(,%r8,4), %ymm0                           #13.38
        vmovups   %ymm9, 4+a(%r9)                               #13.13
        vmovups   36+c(,%r8,4), %ymm9                           #13.31
        vfmadd213ps 32+b(,%r8,4), %ymm0, %ymm9                  #13.38
        vmovups   68+d(,%r8,4), %ymm0                           #13.38
        vmovups   %ymm9, 36+a(%r9)                              #13.13
        vmovups   68+c(,%r8,4), %ymm9                           #13.31
        vfmadd213ps 64+b(,%r8,4), %ymm0, %ymm9                  #13.38
        vmovups   100+d(,%r8,4), %ymm0                          #13.38
        vmovups   %ymm9, 68+a(%r9)                              #13.13
        vmovups   100+c(,%r8,4), %ymm9                          #13.31
        vfmadd213ps 96+b(,%r8,4), %ymm0, %ymm9                  #13.38
        addq      $32, %r8                                      #12.9
        vmovups   %ymm9, 100+a(%r9)                             #13.13
        cmpq      $31975, %r8                                   #12.9
        jb        ..B1.7        # Prob 99%                      #12.9
                                # LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 edx xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15
..B1.8:                         # Preds ..B1.7
                                # Execution count [1.00e+05]
        movq      %rsi, %r9                                     #12.9
        movq      %rax, %r8                                     #12.9
                                # LOE rax rcx rbx rbp rsi rdi r8 r9 r12 r13 r14 r15 edx xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15
..B1.9:                         # Preds ..B1.9 ..B1.8
                                # Execution count [3.20e+09]
        vmovups   127904+c(,%r9,4), %xmm9                       #13.31
        vmovups   127904+d(,%r9,4), %xmm0                       #13.38
        vfmadd213ps b(,%r8,4), %xmm0, %xmm9                     #13.38
        addq      $4, %r8                                       #12.9
        vmovups   %xmm9, 127904+a(,%r9,4)                       #13.13
        addq      $4, %r9                                       #12.9
        cmpq      $20, %r9                                      #12.9
        jb        ..B1.9        # Prob 99%                      #12.9
                                # LOE rax rcx rbx rbp rsi rdi r8 r9 r12 r13 r14 r15 edx xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15
..B1.10:                        # Preds ..B1.9
                                # Execution count [1.07e+09]
        incl      %edx                                          #11.5
        vmovss    127980+b(%rip), %xmm0                         #13.20
        vmovss    (%rsp), %xmm9                                 #[spill]
        vfmadd231ss %xmm6, %xmm4, %xmm0                         #13.38
        cmpl      $100000, %edx                                 #11.5
        jb        ..B1.2        # Prob 99%                      #11.5
                                # LOE rax rcx rbx rbp rsi rdi r12 r13 r14 r15 edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15
..B1.11:                        # Preds ..B1.10
                                # Execution count [1.00e+00]
        vmovss    %xmm0, 127984+a(%rip)                         #13.13
        vaddss    32(%rsp), %xmm1, %xmm1                        #13.38[spill]
        vaddss    24(%rsp), %xmm2, %xmm2                        #13.38[spill]
        vmovss    %xmm1, 127992+a(%rip)                         #13.13
        vmovss    %xmm2, 127988+a(%rip)                         #13.13
        vzeroupper                                              #17.1
        addq      $136, %rsp                                    #17.1
        .cfi_def_cfa_offset 8
        ret                                                     #17.1
Comment 1 Richard Biener 2021-03-08 08:50:13 UTC
Confirmed.  ICC applies loop distribution but again our cost-modeling doesn't want that to happen.

I suspect we want to detect extra incentives there (make dependences "good",
allow interchange, etc.)