User account creation filtered due to spam.

Bug 81303 - [8 Regression] 410.bwaves regression caused by r249919
Summary: [8 Regression] 410.bwaves regression caused by r249919
Status: UNCONFIRMED
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 8.0
Importance: P3 normal
Target Milestone: 8.0
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks:
 
Reported: 2017-07-04 08:06 UTC by Richard Biener
Modified: 2017-07-12 14:19 UTC (History)
3 users (show)

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed:


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Richard Biener 2017-07-04 08:06:00 UTC
I see on Haswell with -march=native -Ofast a runtime increase from 177s to 254s.
This is caused by us vectorizing the innermost reduction loop in block_solver.f:mat_times_vec using strided loads (and with -mprefer-avx128 even peeling for alignment).
Comment 1 Richard Biener 2017-07-04 08:25:47 UTC
Looks like we peel for alignment which, for this loop, is quite pointless as it only runs 5 times, so for AVX256 we're likely running into peel for alignment, no vector iteration, epilogue.

Need to tame down that damn alignment peeling more ...

It peels 'x' btw.

block_solver.f:178:0: note: Cost model analysis:
  Vector inside of loop cost: 76
  Vector prologue cost: 61
  Vector epilogue cost: 62
  Scalar iteration cost: 28
  Scalar outside cost: 7
  Vector outside cost: 123
  prologue iterations: 2
  epilogue iterations: 2
  Calculated minimum iters for profitability: 5
block_solver.f:178:0: note:   Runtime profitability threshold = 4
block_solver.f:178:0: note:   Static estimate profitability threshold = 5

but that doesn't take into account that we may spend up to 3 scalar iterations
in the alignment prologue and thus with niter < 7 we may never enter
the vector loop.  The static estimate is similarly affected by this.
Comment 2 Richard Biener 2017-07-04 09:11:48 UTC
Without peeling for alignment the numbers improve but we still regress from 176s to 205s.  The innermost (unrolled) loop is:

.L11:
        vmovsd  (%rdi,%r15,2), %xmm2
        vmovsd  (%rsi,%r15,2), %xmm1
        movq    -56(%rbp), %rbx
        vmovhpd (%rdi,%r14), %xmm2, %xmm0
        vmovsd  (%rdi), %xmm2
        vmovhpd (%rsi,%r14), %xmm1, %xmm6
        vmovsd  (%rsi), %xmm1
        vmovhpd (%rdi,%r15), %xmm2, %xmm2
        vmovhpd (%rsi,%r15), %xmm1, %xmm1
        addq    %r11, %rdi
        addq    %r11, %rsi
        vinsertf128     $0x1, %xmm0, %ymm2, %ymm2
        vmovsd  (%rcx,%r15,2), %xmm0
        vinsertf128     $0x1, %xmm6, %ymm1, %ymm1
        vmulpd  (%rbx,%rax), %ymm2, %ymm3
        movq    -72(%rbp), %rbx
        vmovapd %ymm1, %ymm2
        vmovhpd (%rcx,%r14), %xmm0, %xmm6
        vmovsd  (%rcx), %xmm0
        vfmadd132pd     (%rbx,%rax), %ymm3, %ymm2
        movq    -88(%rbp), %rbx
        vmovhpd (%rcx,%r15), %xmm0, %xmm0
        addq    %r11, %rcx
        vinsertf128     $0x1, %xmm6, %ymm0, %ymm0
        vmulpd  (%rbx,%rax), %ymm0, %ymm3
        vmovsd  (%rdx,%r15,2), %xmm0
        movq    -64(%rbp), %rbx
        vmovhpd (%rdx,%r14), %xmm0, %xmm6
        vmovsd  (%rdx), %xmm0
        vmovhpd (%rdx,%r15), %xmm0, %xmm0
        addq    %r11, %rdx
        vinsertf128     $0x1, %xmm6, %ymm0, %ymm0
        vfmadd132pd     (%rbx,%rax), %ymm3, %ymm0
        vaddpd  %ymm0, %ymm2, %ymm1
        vmovsd  (%r9,%r15,2), %xmm0
        vmovhpd (%r9,%r14), %xmm0, %xmm3
        vmovsd  (%r9), %xmm0
        vmovhpd (%r9,%r15), %xmm0, %xmm0
        addq    %r11, %r9
        vinsertf128     $0x1, %xmm3, %ymm0, %ymm0
        vmulpd  (%r12,%rax), %ymm0, %ymm2
        vmovsd  (%r8,%r15,2), %xmm0
        vmovhpd (%r8,%r14), %xmm0, %xmm3
        vmovsd  (%r8), %xmm0
        vmovhpd (%r8,%r15), %xmm0, %xmm0
        movq    -80(%rbp), %rbx
        addq    %r11, %r8
        vinsertf128     $0x1, %xmm3, %ymm0, %ymm0
        vfmadd132pd     (%rbx,%rax), %ymm2, %ymm0
        vaddpd  %ymm0, %ymm1, %ymm0
        vmovsd  (%r10,%r15,2), %xmm1
        vmovhpd (%r10,%r14), %xmm1, %xmm2
        vmovsd  (%r10), %xmm1
        vmovhpd (%r10,%r15), %xmm1, %xmm1
        addq    %r11, %r10
        vinsertf128     $0x1, %xmm2, %ymm1, %ymm1
        vfmadd231pd     0(%r13,%rax), %ymm1, %ymm4
        addq    $32, %rax
        vaddpd  %ymm4, %ymm0, %ymm4
        cmpq    -96(%rbp), %rax
        jne     .L11

vs

.L10:
        vmovsd  (%rax,%rbx,8), %xmm0
        vmulsd  (%r15,%rdx), %xmm0, %xmm0
        vmovsd  (%r8,%rdx), %xmm1
        vfmadd132sd     (%rax,%r11,8), %xmm0, %xmm1
        vmovsd  (%rax,%rsi,8), %xmm0
        vmulsd  (%r12,%rdx), %xmm0, %xmm0
        vmovsd  0(%rbp,%rdx), %xmm4
        vfmadd231sd     (%rax), %xmm4, %xmm0
        vmovsd  (%r14,%rdx), %xmm5
        vmovsd  (%rdi,%rdx), %xmm6
        vfmadd231sd     (%rax,%r9,8), %xmm6, %xmm2
        vaddsd  %xmm0, %xmm1, %xmm0
        vmovsd  (%rax,%r10,8), %xmm1
        vmulsd  0(%r13,%rdx), %xmm1, %xmm1
        vfmadd231sd     (%rax,%rcx,8), %xmm5, %xmm1
        addq    -112(%rsp), %rdx
        addq    $8, %rax
        vaddsd  %xmm2, %xmm1, %xmm2
        vaddsd  %xmm2, %xmm0, %xmm2
        cmpq    -120(%rsp), %rax
        jne     .L10

looks like register pressure is high and IVO doesn't do the best job either.
The vectorized loop might also run into CPU arch limits with respect to
loop cache (it's 310 bytes long).