[Bug tree-optimization/98497] New: [Potential Perf regression] jne to hot branch instead je to cold

Sat Jan 2 03:12:01 GMT 2021

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98497

            Bug ID: 98497
           Summary: [Potential Perf regression] jne to hot branch instead
                    je to cold
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hiraditya at msn dot com
  Target Milestone: ---

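In the following code generated by gcc 10.2, the inner loop's exit test is emitted as `je .L6` (to the cold block) followed by an unconditional `jmp .L2` back to the loop head: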
```
# Excerpt: the hot inner loop and the first instruction of the block it exits to.
.L2:
# Load 16 bytes from [rax], advance rax by 16, and add the loaded vector
# into the accumulator xmm0.
        movups  xmm3, XMMWORD PTR [rax]
        add     rax, 16
        addps   xmm0, xmm3
# rdx holds the end pointer for this loop.
        cmp     rax, rdx
# The branch pair the report is about: leaving the loop is the taken `je`,
# while staying in the loop falls through to an unconditional `jmp`, so every
# non-final iteration executes two jump instructions.
        je      .L6
        jmp     .L2

# NOTE(review): the `.cold` suffix suggests the compiler placed the block
# below in the function's cold partition - confirm against GCC's
# -freorder-blocks-and-partition documentation.
matrix_sum_column_major.cold:
.L6:
        movaps  xmm2, xmm0
# .....

```

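I think `jne .L2; jmp .L6` should be more efficient, as it avoids one instruction
in the hot path: the loop back edge becomes a single conditional jump, and the
unconditional jump is only executed once, on loop exit.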

C code:
```
/*
 * Reproducer: returns the sum of x[j * n + i] over all 0 <= i, j < n.
 *
 * The incoming value of n is ignored: it is overwritten with 32767 below, so
 * x must point to at least 32767 * 32767 floats. The largest index used is
 * 32766 * 32767 + 32766, which still fits in an int.
 */
float matrix_sum_column_major(float* x, int n) {
    n = 32767;  /* fixed trip count; overrides the argument */
    float sum = 0;
    /* i is the outer index, so as written the inner loop walks x with a
       stride of n floats between consecutive accesses. */
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            sum += x[j * n + i];
    return sum;
}
```

gcc -Ofast -floop-nest-optimize -o -
```
# Compiler output for matrix_sum_column_major (Intel syntax).
# rdi is used as the base pointer of the float array and the result is left
# in xmm0 - presumably the SysV AMD64 convention; confirm for the target.
# The code reads 32767 consecutive runs of 32767 floats each (131068 bytes per
# run), i.e. memory is traversed contiguously rather than with the stride the
# C loop nest spells out.
matrix_sum_column_major:
# 4294836212 = 32767*131068 + 131056: offset from the array base of the value
# rdx reaches after the last outer iteration.
        mov     eax, 4294836212
# 131056 = 8191*16: end of the part of the first run that the inner loop
# consumes 16 bytes at a time.
        lea     rdx, [rdi+131056]
# xmm1 = running total across all runs.
        pxor    xmm1, xmm1
# rcx = sentinel that rdx is compared against to end the outer loop.
        lea     rcx, [rdi+rax]
.L3:
# Outer loop head: rax = start of the current run, xmm0 = four partial sums.
        mov     rax, rdi
        pxor    xmm0, xmm0
.L2:
# Inner loop: unaligned load of four floats, advance, accumulate.
        movups  xmm3, XMMWORD PTR [rax]
        add     rax, 16
        addps   xmm0, xmm3
        cmp     rax, rdx
# The branch pair the report is about: loop exit is the taken `je`, and the
# loop back edge is the fall-through unconditional `jmp`, so each non-final
# iteration executes two jump instructions instead of a single `jne .L2`.
        je      .L6
        jmp     .L2
# NOTE(review): the `.cold` suffix suggests this block was placed in the
# function's cold partition - confirm against GCC's block-partitioning docs.
matrix_sum_column_major.cold:
.L6:
# Reached with rax == rdx, i.e. 12 bytes (three floats) before the end of the
# current 131068-byte run.
        movaps  xmm2, xmm0
# Third leftover float goes straight into the total.
        addss   xmm1, DWORD PTR [rax+8]
# Move the inner-loop end pointer and the run start forward by one run
# (131068 = 32767*4 bytes).
        lea     rdx, [rax+131068]
        add     rdi, 131068
# Horizontal sum of the four lanes of xmm0; the result ends up in lane 0.
        movhlps xmm2, xmm0
        addps   xmm2, xmm0
        movaps  xmm0, xmm2
# 85 = 0b01010101: broadcast lane 1 of xmm2 into every lane of xmm0.
        shufps  xmm0, xmm2, 85
        addps   xmm0, xmm2
# First two leftover floats, then fold them and the vector sum into the total.
        movss   xmm2, DWORD PTR [rax+4]
        addss   xmm2, DWORD PTR [rax]
        addss   xmm1, xmm2
        addss   xmm1, xmm0
# Repeat until rdx reaches the sentinel computed at entry (32767 runs).
        cmp     rdx, rcx
        jne     .L3
# Return the total in xmm0.
        movaps  xmm0, xmm1
        ret
```

Link to godbolt: https://gcc.godbolt.org/z/ac7YY1