[Bug tree-optimization/98497] New: [Potential Perf regression] jne to hot branch instead je to cold
hiraditya at msn dot com
gcc-bugzilla@gcc.gnu.org
Sat Jan 2 03:12:01 GMT 2021
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98497
Bug ID: 98497
Summary: [Potential Perf regression] jne to hot branch instead
je to cold
Product: gcc
Version: unknown
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: hiraditya at msn dot com
Target Milestone: ---
The following code is generated by gcc 10.2 (excerpt showing the inner loop and its exit block):
```
# Inner loop: adds one 16-byte vector of packed floats into xmm0 per iteration.
.L2:
movups xmm3, XMMWORD PTR [rax] # unaligned 16-byte load from the current position
add rax, 16 # step to the next 16-byte vector
addps xmm0, xmm3 # xmm0 += loaded vector (four single-precision lanes)
cmp rax, rdx # compare the position against the bound held in rdx
je .L6 # loop exit: taken only when rax == rdx
jmp .L2 # back-edge: extra unconditional jump on every iteration that stays in the loop
# Exit target; the symbol name shows the compiler placed it in the function's .cold part.
matrix_sum_column_major.cold:
.L6:
movaps xmm2, xmm0 # copy of the vector accumulator
# .....
```
I think `jne .L2; jmp .L6` should be more efficient, as it avoids one instruction
(the unconditional `jmp .L2`) on every iteration of the hot path.
C code:
```
/*
 * Reproducer: returns the sum of the n * n floats starting at x.
 *
 * The element read is x[j * n + i] with j as the inner index, so successive
 * inner iterations touch addresses n floats apart.
 * The caller's n is discarded: it is overwritten with the constant 32767
 * (presumably so the compiler sees fixed trip counts).
 * x must provide at least n * n floats, because the largest index read is
 * (n - 1) * n + (n - 1).
 */
float matrix_sum_column_major(float* x, int n) {
n = 32767; /* fixed size; the argument value is ignored */
float sum = 0;
for (int i = 0; i < n; i++) /* outer index: offset within a run of n floats */
for (int j = 0; j < n; j++) /* inner index: scaled by n, i.e. stride-n access */
sum += x[j * n + i];
return sum;
}
```
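Compiled with: gcc -Ofast -floop-nest-optimize -o - (command as given; presumably with -S and the source file, since the output below is assembly)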
```
# float matrix_sum_column_major(float *x, int n) -- compiler output, Intel syntax.
# Register use as visible in this listing (presumably the System V AMD64 ABI):
#   rdi  = start of the current run of floats (initially x)
#   rax  = read pointer inside the current run
#   rdx  = end of the vectorised part of the current run
#   rcx  = value rdx reaches after the last run (outer-loop sentinel)
#   xmm0 = per-run vector accumulator, xmm1 = scalar running total (returned in xmm0)
# Constants: 131068 = 32767 * 4 bytes (one run of 32767 floats),
#            131056 = 8191 * 16 bytes (the part of a run handled 4 floats at a time),
#            4294836212 = 32767 * 131068 + 131056.
# The code reads memory contiguously, run by run, rather than with the stride-n
# pattern written in the C source.
matrix_sum_column_major:
mov eax, 4294836212 # 32-bit write zero-extends into rax
lea rdx, [rdi+131056] # end of the vectorised part of the first run
pxor xmm1, xmm1 # running total = 0
lea rcx, [rdi+rax] # sentinel: rdx equals this after the 32767th run
# Outer loop: one iteration per run of 32767 consecutive floats.
.L3:
mov rax, rdi # restart the read pointer at the beginning of the run
pxor xmm0, xmm0 # clear the vector accumulator
# Inner loop: 8191 iterations, 4 floats each.
.L2:
movups xmm3, XMMWORD PTR [rax] # unaligned 16-byte load
add rax, 16 # next vector
addps xmm0, xmm3 # accumulate four lanes
cmp rax, rdx # end of the vectorised part?
je .L6 # exit, taken once per run
jmp .L2 # back-edge: the extra jump this report is about
# Inner-loop exit, placed in the .cold part although it runs once per outer iteration.
matrix_sum_column_major.cold:
.L6:
movaps xmm2, xmm0 # xmm2 = copy of the accumulator
addss xmm1, DWORD PTR [rax+8] # total += third of the 3 leftover floats
lea rdx, [rax+131068] # vector end for the next run (rax is the current vector end)
add rdi, 131068 # advance to the next run
movhlps xmm2, xmm0 # low half of xmm2 = high half of xmm0
addps xmm2, xmm0 # lanes 0,1 = a0+a2, a1+a3
movaps xmm0, xmm2
shufps xmm0, xmm2, 85 # 85 = 0b01010101: broadcast lane 1 (a1+a3)
addps xmm0, xmm2 # lane 0 = a0+a1+a2+a3
movss xmm2, DWORD PTR [rax+4] # second leftover float
addss xmm2, DWORD PTR [rax] # + first leftover float
addss xmm1, xmm2 # total += the two leftovers
addss xmm1, xmm0 # total += horizontal sum of the vector accumulator
cmp rdx, rcx # all runs processed?
jne .L3
movaps xmm0, xmm1 # return value in xmm0
ret
```
Link to godbolt: https://gcc.godbolt.org/z/ac7YY1
More information about the Gcc-bugs
mailing list