optimization problem: ptr not kept in register

Peter A. Felvegi petschy@praire-chicken.com
Tue Mar 25 23:32:00 GMT 2014


Hello,

I've run into an optimization issue, please help me understand the why's 
and maybe to resolve it.

The reduced test case is at the end. It encodes data into a buffer in a 
loop with variable length encoding (not a working real encoding). For 
some reason, the write ptr is not kept in a register, but loaded/stored 
when used/updated. There is a potential function call in the loop, but 
there are __builtin_expect hints, so I think it would be possible to use 
a register for the ptr and store just before the call, and load it back 
right after the call. This would speed up the common code path: less 
code, less loads and stores.I measured around 20-30% more runtime, 
compared to a version where a pointer goes in and the updated ptr is 
returned. However, passing/returning the ptr has other issues, esp for a 
decoder, that would return the decoded value normally, not the ptr.

The platform is amd64 debian jessie, g++ version is 4.8.2, tried with 
4.9.0, 4.7 and 4.6 the same code generated with little variations.

To compile:
g++ -c 20140325-reg_vs_mem.cpp -g -std=c++0x -Wall -Wextra -Werror 
-Wundef -Wshadow -O3
g++ -o 20140325-reg_vs_mem 20140325-reg_vs_mem.o -g

Dump of assembler code for function encode_node_list(OutBuf&, Node*):
// rdi/rbp points to the out buffer, rsi/rbx to the node
    0x00000000004005e0 <+0>:    push   %rbp
    0x00000000004005e1 <+1>:    test   %rsi,%rsi
    0x00000000004005e4 <+4>:    mov    %rdi,%rbp
    0x00000000004005e7 <+7>:    push   %rbx
    0x00000000004005e8 <+8>:    mov    %rsi,%rbx
    0x00000000004005eb <+11>:    je     0x400611 
<encode_node_list(OutBuf&, Node*)+49>
// why not load cur before the loop?
    0x00000000004005ed <+13>:    nopl   (%rax)
// load cur. this is the start of the loop, so will be loaded in each iter
    0x00000000004005f0 <+16>:    mov    0x0(%rbp),%rax
// load the data from the node
    0x00000000004005f4 <+20>:    mov    (%rbx),%esi
// calc new cur value
    0x00000000004005f6 <+22>:    lea    0x4(%rax),%rdx
    0x00000000004005fa <+26>:    cmp    $0xff,%esi
// and store it back, even though the correct new value is not known at 
this point
    0x0000000000400600 <+32>:    mov    %rdx,0x0(%rbp)
    0x0000000000400604 <+36>:    jg     0x400614 
<encode_node_list(OutBuf&, Node*)+52>
// the most likely path: store the data, fetch the next node and loop
    0x0000000000400606 <+38>:    mov    %esi,(%rax)
    0x0000000000400608 <+40>:    mov    0x8(%rbx),%rbx
    0x000000000040060c <+44>:    test   %rbx,%rbx
    0x000000000040060f <+47>:    jne    0x4005f0 
<encode_node_list(OutBuf&, Node*)+16>
// cur could be stored here, if loaded just once before the loop: could 
be in %rax the whole time in the likely case
    0x0000000000400611 <+49>:    pop    %rbx
    0x0000000000400612 <+50>:    pop    %rbp
    0x0000000000400613 <+51>:    retq
// calc new cur value
    0x0000000000400614 <+52>:    lea    0x8(%rax),%rdx
    0x0000000000400618 <+56>:    cmp    $0xffff,%esi
    0x000000000040061e <+62>:    movl   $0x0,(%rax)
// and store it, again. prev store was at +32
    0x0000000000400624 <+68>:    mov    %rdx,0x0(%rbp)
    0x0000000000400628 <+72>:    jg     0x40062f 
<encode_node_list(OutBuf&, Node*)+79>
    0x000000000040062a <+74>:    mov    %esi,0x4(%rax)
    0x000000000040062d <+77>:    jmp    0x400608 
<encode_node_list(OutBuf&, Node*)+40>
    0x000000000040062f <+79>:    movl   $0x0,0x4(%rax)
    0x0000000000400636 <+86>:    mov    %rbp,%rdi
// why not store just here?
    0x0000000000400639 <+89>:    callq  0x400570 
<encode_noinline(OutBuf&, int)>
// and load it back here?
    0x000000000040063e <+94>:    jmp    0x400608 
<encode_node_list(OutBuf&, Node*)+40>

Is it an optimization issue in the compiler, or is there a way to change 
the source to generate better code, without messing up the interfaces?

Thanks, Peter

----8<----8<----
#define LIKELY(x_)    __builtin_expect(!!(x_), 1)
#define UNLIKELY(x_)  __builtin_expect(!!(x_), 0)

typedef unsigned int uint;

struct OutBuf
{
     uint*    cur;
     uint*    end;
     uint*    beg;
};

__attribute__((noinline))
void encode_noinline(OutBuf& ob, int v)
{
     if (LIKELY(v < 0x1000000)) {
         *ob.cur++ = v;
         return;
     }
     *ob.cur++ = 0;
     *ob.cur++ = v;
}

void encode(OutBuf& ob, int v)
{
     if (LIKELY(v < 0x100)) {
         *ob.cur++ = v;
         return;
     }
     *ob.cur++ = 0;
     if (LIKELY(v < 0x10000)) {
         *ob.cur++ = v;
         return;
     }
     *ob.cur++ = 0;
     encode_noinline(ob, v);
}

struct Node
{
     int    data;
     Node*    next;
};

void encode_node_list(OutBuf& ob, Node* n)
{
     while (n) {
         encode(ob, n->data);
         n = n->next;
     }
}

int main()
{
}



More information about the Gcc-help mailing list