What would it take to always force indirect inlining?
Daniel Santos
daniel.santos@pobox.com
Thu Jul 17 17:26:00 GMT 2014
I've recently discovered that a function marked always_inline but called
by pointer won't always be inlined. What would it take to ensure that
this either always happens or generates an error? Unfortunately, it's
breaking (well, failing to properly optimize) some code where I need the
optimizer to see what's in the inline function (which is a constant at
compile time) so it can optimize it into a REP MOVSx loop on x86 or
similar on other archs. I kinda designed the function so that it would
work that way, but it ends up making a function call and then can't
optimize any further.
static __always_inline void my_copy(const struct qsort_def *def, void
*dest, const void *src) {
const struct size_type __aligned(ALIGN_SIZE) *s = src;
struct size_type __aligned(ALIGN_SIZE) *d = dest;
// fprintf(stderr, "copy: d=%p, s=%p\n", d, s);
*d = *s;
0000000000000020 <my_copy> mov (%rdx),%rax
0000000000000023 <my_copy+0x3> mov %rax,(%rsi)
0000000000000026 <my_copy+0x6> retq
...
static __always_inline __flatten void
_quicksort_ror(const struct qsort_def *def, void *left, void *right,
void *tmp, size_t tmp_size) {
const size_t size = def->size;
char *r = right;
char *l = left;
const ssize_t dist = (r - l) / (ssize_t)def->size; /* left to
right offset */
00000000000003c1 <my_quicksort.isra.0+0x221> sub %rbx,%rdx
00000000000003c4 <my_quicksort.isra.0+0x224> test %rdx,%rdx
00000000000003c7 <my_quicksort.isra.0+0x227> lea 0x7(%rdx),%r12
00000000000003cb <my_quicksort.isra.0+0x22b> cmovns %rdx,%r12
if (size <= tmp_size) {
ssize_t i;
char *left_minus_one = l - size;
def->copy(def, tmp, r);
00000000000003cf <my_quicksort.isra.0+0x22f> mov %r13,%rdx
static __always_inline __flatten void
_quicksort_ror(const struct qsort_def *def, void *left, void *right,
void *tmp, size_t tmp_size) {
const size_t size = def->size;
char *r = right;
char *l = left;
const ssize_t dist = (r - l) / (ssize_t)def->size; /* left to
right offset */
00000000000003d2 <my_quicksort.isra.0+0x232> sar $0x3,%r12
if (size <= tmp_size) {
ssize_t i;
char *left_minus_one = l - size;
def->copy(def, tmp, r);
00000000000003d6 <my_quicksort.isra.0+0x236> callq 0000000000000020
<my_copy>
/* rep movs-friendly loop */
for (i = dist; i; --i) {
00000000000003db <my_quicksort.isra.0+0x23b> test %r12,%r12
00000000000003de <my_quicksort.isra.0+0x23e> je 000000000000041d
<my_quicksort.isra.0+0x27d>
00000000000003e0 <my_quicksort.isra.0+0x240> lea 0x0(,%r12,8),%rdx
00000000000003e8 <my_quicksort.isra.0+0x248> lea (%rbx,%rdx,1),%r14
00000000000003ec <my_quicksort.isra.0+0x24c> add %rdx,%r15
00000000000003ef <my_quicksort.isra.0+0x24f> xchg %ax,%ax
00000000000003f1 <my_quicksort.isra.0+0x251> data32 data32 data32 data32
data32 nopw %cs:0x0(%rax,%rax,1)
def->copy(def, &l[i * size], &left_minus_one[i
* size]);
0000000000000400 <my_quicksort.isra.0+0x260> mov %r15,%rdx
0000000000000403 <my_quicksort.isra.0+0x263> mov %r14,%rsi
0000000000000406 <my_quicksort.isra.0+0x266> mov $0x0,%edi
407: R_X86_64_32 .rodata+0x20
000000000000040b <my_quicksort.isra.0+0x26b> callq 0000000000000020
<my_copy>
0000000000000410 <my_quicksort.isra.0+0x270> sub $0x8,%r14
0000000000000414 <my_quicksort.isra.0+0x274> sub $0x8,%r15
ssize_t i;
char *left_minus_one = l - size;
def->copy(def, tmp, r);
/* rep movs-friendly loop */
for (i = dist; i; --i) {
0000000000000418 <my_quicksort.isra.0+0x278> dec %r12
000000000000041b <my_quicksort.isra.0+0x27b> jne 0000000000000400
<my_quicksort.isra.0+0x260>
def->copy(def, &l[i * size], &left_minus_one[i
* size]);
}
def->copy(def, left, tmp);
000000000000041d <my_quicksort.isra.0+0x27d> mov -0x450(%rbp),%rdx
0000000000000424 <my_quicksort.isra.0+0x284> mov %rbx,%rsi
0000000000000427 <my_quicksort.isra.0+0x287> mov $0x0,%edi
428: R_X86_64_32 .rodata+0x20
000000000000042c <my_quicksort.isra.0+0x28c> callq 0000000000000020
<my_copy>
0000000000000431 <my_quicksort.isra.0+0x291> jmpq 0000000000000378
<my_quicksort.isra.0+0x1d8>
If the optimizer had the body of my_copy above, it should be able to use
two pointers (one for l and another for left_minus_one) and a single
index as long as size is either 1, 2, 4 or 8. All in all, I need to
refine my strategy, but if I can solve this little part, it will help
greatly.
Thanks,
Daniel
More information about the Gcc
mailing list