[Bug tree-optimization/93056] New: Poor codegen for heapsort in stephanov_vector benchmark
hubicka at gcc dot gnu.org
gcc-bugzilla@gcc.gnu.org
Mon Dec 23 20:19:00 GMT 2019
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93056
Bug ID: 93056
Summary: Poor codegen for heapsort in stephanov_vector
benchmark
Product: gcc
Version: 10.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: hubicka at gcc dot gnu.org
Target Milestone: ---
Created attachment 47543
--> https://gcc.gnu.org/bugzilla/attachment.cgi?id=47543&action=edit
preprocessed benchmark with other tests disabled.
heap sort test in stepanov_vector benchmark runs about 50% slower when built
with GCC10 compared to clang8 (with -O3 -march=native, on bdver1 hardware)
Clang profile is:
33.19% stepanov_vector stepanov_vector [.]
benchmark::heapsort<std::reverse_iterator<std::reverse_iterator<__gnu_cxx::__normal_iterator<double*,
std::vector<double, std
16.73% stepanov_vector stepanov_vector [.]
benchmark::heapsort<reverse_iterator<reverse_iterator<__gnu_cxx::__normal_iterator<double*,
std::vector<double, std::allocato
16.07% stepanov_vector stepanov_vector [.]
benchmark::heapsort<__gnu_cxx::__normal_iterator<double*, std::vector<double,
std::allocator<double> > > >
15.61% stepanov_vector stepanov_vector [.] benchmark::heapsort<double*>
15.61% stepanov_vector stepanov_vector [.]
benchmark::heapsort<std::reverse_iterator<std::reverse_iterator<double*> > >
while GCC profile is:
32.90% stepanov_vector stepanov_vector [.]
benchmark::__sift_in<std::reverse_iterator<std::reverse_iterator<__gnu_cxx::__normal_iterator<double*,
std::vector<double, st
16.25% stepanov_vector stepanov_vector [.]
benchmark::__sift_in<std::reverse_iterator<std::reverse_iterator<double*> >,
double>
16.01% stepanov_vector stepanov_vector [.]
benchmark::__sift_in<reverse_iterator<reverse_iterator<__gnu_cxx::__normal_iterator<double*,
std::vector<double, std::allocat
15.94% stepanov_vector stepanov_vector [.] benchmark::__sift_in<double*,
double>
15.73% stepanov_vector stepanov_vector [.]
benchmark::__sift_in<__gnu_cxx::__normal_iterator<double*, std::vector<double,
std::allocator<double> > >, double>
In the hottest function the internal iterator loop leads to worse code:
Clang:
│ xor %edx,%edx
│ if ( *(begin+(i-1)) < *(begin+i))
0.88 │180:┌─→vmovsd (%rax,%rsi,8),%xmm1
6.16 │ │ xor %edi,%edi
6.45 │ │ vucomi -0x8(%rax,%rsi,8),%xmm1
22.87 │ │ seta %dil
6.74 │ │ or %rsi,%rdi
│ │ *(begin + free) = *(begin+(i-1));
7.62 │ │ mov -0x8(%rax,%rdi,8),%rcx
14.08 │ │ mov %rcx,(%rax,%rdx,8)
0.59 │ │ lea -0x1(%rdi),%rdx
│ │ for ( i = 2*(free+1); i < count; i += i) {
0.59 │ │ add %rdi,%rdi
2.05 │ │ mov %rdi,%rsi
2.05 │ │ cmp %r11,%rdi
0.29 │ └──jl 180
│ if (i == count) {
GCC:
│ nop
1.81 │18:┌─→mov %rax,%r10
│ │ if ( *(begin+(i-1)) < *(begin+i))
1.21 │1b:│ lea -0x1(%rcx),%rax
│ │_ZNK9__gnu_cxx17__normal_iteratorIPdSt6vectorIdSaIdEEEplEl():
2.02 │ │ lea 0x0(,%rax,8),%r9
0.40 │ │ lea (%rsi,%r9,1),%r8
0.40 │ │ lea 0x8(%rsi,%r9,1),%r9
│
│_ZN9benchmark9__sift_inISt16reverse_iteratorIS1_IN9__gnu_cxx17__normal_iteratorIPdSt6vectorIdSaIdEEEEEEdEEvlT_lT0_():
3.23 │ │ vmovsd (%r8),%xmm1
7.66 │ │ vmovsd (%r9),%xmm2
0.40 │ │ vcomis %xmm1,%xmm2
8.67 │ │↓ jbe 4d
4.64 │ │ vmovap %xmm2,%xmm1
│ │ i++;
18.55 │ │ mov %rcx,%rax
│ │ mov %r9,%r8
1.01 │ │ inc %rcx
│ │ for ( i = 2*(free+1); i < count; i += i) {
4.64 │4d:│ add %rcx,%rcx
│ │ *(begin + free) = *(begin+(i-1));
19.76 │ │ vmovsd %xmm1,(%rsi,%r10,8)
│ │ for ( i = 2*(free+1); i < count; i += i) {
8.06 │ │ cmp %rcx,%rdi
│ └──jg 18
│ free = i-1;
│ }
The code is:
void __sift_in(ptrdiff_t count, RandomAccessIterator begin, ptrdiff_t free_in,
T next)
{
ptrdiff_t i;
ptrdiff_t free = free_in;
// sift up the free node
for ( i = 2*(free+1); i < count; i += i) {
if ( *(begin+(i-1)) < *(begin+i))
i++;
*(begin + free) = *(begin+(i-1));
free = i-1;
}
// special case in sift up if the last inner node has only 1 child
if (i == count) {
*(begin + free) = *(begin+(i-1));
free = i-1;
}
// sift down the new item next
i = (free-1)/2;
while( (free > free_in) && *(begin+i) < next) {
*(begin + free) = *(begin+i);
free = i;
i = (free-1)/2;
}
*(begin + free) = next;
}
More information about the Gcc-bugs
mailing list