optimization problem: ptr not kept in register
Peter A. Felvegi
petschy@praire-chicken.com
Tue Mar 25 23:32:00 GMT 2014
Hello,
I've run into an optimization issue, please help me understand the why's
and maybe to resolve it.
The reduced test case is at the end. It encodes data into a buffer in a
loop with variable length encoding (not a working real encoding). For
some reason, the write ptr is not kept in a register, but loaded/stored
when used/updated. There is a potential function call in the loop, but
there are __builtin_expect hints, so I think it would be possible to use
a register for the ptr and store just before the call, and load it back
right after the call. This would speed up the common code path: less
code, less loads and stores.I measured around 20-30% more runtime,
compared to a version where a pointer goes in and the updated ptr is
returned. However, passing/returning the ptr has other issues, esp for a
decoder, that would return the decoded value normally, not the ptr.
The platform is amd64 debian jessie, g++ version is 4.8.2, tried with
4.9.0, 4.7 and 4.6 the same code generated with little variations.
To compile:
g++ -c 20140325-reg_vs_mem.cpp -g -std=c++0x -Wall -Wextra -Werror
-Wundef -Wshadow -O3
g++ -o 20140325-reg_vs_mem 20140325-reg_vs_mem.o -g
Dump of assembler code for function encode_node_list(OutBuf&, Node*):
// rdi/rbp points to the out buffer, rsi/rbx to the node
0x00000000004005e0 <+0>: push %rbp
0x00000000004005e1 <+1>: test %rsi,%rsi
0x00000000004005e4 <+4>: mov %rdi,%rbp
0x00000000004005e7 <+7>: push %rbx
0x00000000004005e8 <+8>: mov %rsi,%rbx
0x00000000004005eb <+11>: je 0x400611
<encode_node_list(OutBuf&, Node*)+49>
// why not load cur before the loop?
0x00000000004005ed <+13>: nopl (%rax)
// load cur. this is the start of the loop, so will be loaded in each iter
0x00000000004005f0 <+16>: mov 0x0(%rbp),%rax
// load the data from the node
0x00000000004005f4 <+20>: mov (%rbx),%esi
// calc new cur value
0x00000000004005f6 <+22>: lea 0x4(%rax),%rdx
0x00000000004005fa <+26>: cmp $0xff,%esi
// and store it back, even though the correct new value is not known at
this point
0x0000000000400600 <+32>: mov %rdx,0x0(%rbp)
0x0000000000400604 <+36>: jg 0x400614
<encode_node_list(OutBuf&, Node*)+52>
// the most likely path: store the data, fetch the next node and loop
0x0000000000400606 <+38>: mov %esi,(%rax)
0x0000000000400608 <+40>: mov 0x8(%rbx),%rbx
0x000000000040060c <+44>: test %rbx,%rbx
0x000000000040060f <+47>: jne 0x4005f0
<encode_node_list(OutBuf&, Node*)+16>
// cur could be stored here, if loaded just once before the loop: could
be in %rax the whole time in the likely case
0x0000000000400611 <+49>: pop %rbx
0x0000000000400612 <+50>: pop %rbp
0x0000000000400613 <+51>: retq
// calc new cur value
0x0000000000400614 <+52>: lea 0x8(%rax),%rdx
0x0000000000400618 <+56>: cmp $0xffff,%esi
0x000000000040061e <+62>: movl $0x0,(%rax)
// and store it, again. prev store was at +32
0x0000000000400624 <+68>: mov %rdx,0x0(%rbp)
0x0000000000400628 <+72>: jg 0x40062f
<encode_node_list(OutBuf&, Node*)+79>
0x000000000040062a <+74>: mov %esi,0x4(%rax)
0x000000000040062d <+77>: jmp 0x400608
<encode_node_list(OutBuf&, Node*)+40>
0x000000000040062f <+79>: movl $0x0,0x4(%rax)
0x0000000000400636 <+86>: mov %rbp,%rdi
// why not store just here?
0x0000000000400639 <+89>: callq 0x400570
<encode_noinline(OutBuf&, int)>
// and load it back here?
0x000000000040063e <+94>: jmp 0x400608
<encode_node_list(OutBuf&, Node*)+40>
Is it an optimization issue in the compiler, or is there a way to change
the source to generate better code, without messing up the interfaces?
Thanks, Peter
----8<----8<----
#define LIKELY(x_) __builtin_expect(!!(x_), 1)
#define UNLIKELY(x_) __builtin_expect(!!(x_), 0)
typedef unsigned int uint;
struct OutBuf
{
uint* cur;
uint* end;
uint* beg;
};
__attribute__((noinline))
void encode_noinline(OutBuf& ob, int v)
{
if (LIKELY(v < 0x1000000)) {
*ob.cur++ = v;
return;
}
*ob.cur++ = 0;
*ob.cur++ = v;
}
void encode(OutBuf& ob, int v)
{
if (LIKELY(v < 0x100)) {
*ob.cur++ = v;
return;
}
*ob.cur++ = 0;
if (LIKELY(v < 0x10000)) {
*ob.cur++ = v;
return;
}
*ob.cur++ = 0;
encode_noinline(ob, v);
}
struct Node
{
int data;
Node* next;
};
void encode_node_list(OutBuf& ob, Node* n)
{
while (n) {
encode(ob, n->data);
n = n->next;
}
}
int main()
{
}
More information about the Gcc-help
mailing list