targeting large function result

Mon Sep 23 10:27:00 GMT 2002

I am implementing a quadruple-precision integer package
and am looking for a way to construct the final result
directly into the destination, rather than into a temp
passed by the caller.  Is there any coding style or
incantation that can accomplish this?

The simple example below shows that the compiler can
adequately optimize an anonymous temporary within an
expression.  But, in spite of much effort, I cannot get
it to provide an assignment's lhs in lieu of a temp.

TIA for any help / suggestions,

/john

=== Source ===

class CNumeric128 {
    signed   int hi;
    unsigned int hl;
    unsigned int lh;
    unsigned int lo;
public:
    CNumeric128(signed int a, unsigned int b, unsigned int c, unsigned int d)
        : hi(a), hl(b), lh(c), lo(d) {}
};

inline static
CNumeric128 operator + (CNumeric128 const& left,
                        CNumeric128 const& right)
__attribute__((const, always_inline));

extern "C"
void overflow_128_arith(void);

inline static
CNumeric128 operator + (CNumeric128 const& __restrict__ left,
                        CNumeric128 const& __restrict__ right)
{ int hi, hl, lh, lo;
asm ( 
   "movl 12+%4,%3\n"
" movl 8+%4,%2\n" 
" movl 4+%4,%1\n" 
" movl %4,%0\n" 
" addl 12+%5,%3\n" 
" adcl 8+%5,%2\n" 
" adcl 4+%5,%1\n" 
" adcl %5,%0\n" 
" jno 1f\n" 
" call overflow_128_arith\n"
"1:\n"
 : "=&r" (hi), "=&r" (hl), "=&r" (lh), "=&r" (lo)
 : "o" (left), "o" (right) 
 : "cc"
);
return CNumeric128(hi,hl,lh,lo);
}

void test(CNumeric128* __restrict__ out,
          CNumeric128& __restrict__ op1,
          CNumeric128& __restrict__ op2,
          CNumeric128& __restrict__ op3)
{
    *out = op1 + op2 + op3;
}

=== Assembler ===

 .file "add128.cpp"
 .version "01.01"
gcc2_compiled.:
.text
 .align 4
.globl test__FP11CNumeric128R11CNumeric128N21
 .type  test__FP11CNumeric128R11CNumeric128N21,@function
test__FP11CNumeric128R11CNumeric128N21:
.LFB1:
 pushl %ebp
.LCFI0:
 movl %esp, %ebp
.LCFI1:
 pushl %edi
.LCFI2:
 pushl %esi
.LCFI3:
 pushl %ebx
.LCFI4:
 subl $60, %esp
.LCFI5:
 movl 12(%ebp), %edx
 movl 16(%ebp), %ecx
#APP
 movl 12+(%edx),%edi
 movl 8+(%edx),%esi
 movl 4+(%edx),%ebx
 movl (%edx),%eax
 addl 12+(%ecx),%edi
 adcl 8+(%ecx),%esi
 adcl 4+(%ecx),%ebx
 adcl (%ecx),%eax
 jno 1f
 call overflow_128_arith
1:

#NO_APP
 movl %ebx, %edx
 movl %esi, -48(%ebp)
 movl %edi, -44(%ebp)
 movl %eax, -56(%ebp)
 movl %edx, -52(%ebp)
 movl 20(%ebp), %esi
#APP
 movl 12+-56(%ebp),%ebx
 movl 8+-56(%ebp),%ecx
 movl 4+-56(%ebp),%edx
 movl -56(%ebp),%eax
 addl 12+(%esi),%ebx
 adcl 8+(%esi),%ecx
 adcl 4+(%esi),%edx
 adcl (%esi),%eax
 jno 1f
 call overflow_128_arith
1:

#NO_APP
 movl %ecx, -32(%ebp)
 movl %ebx, -28(%ebp)
 movl %eax, -40(%ebp)
 movl %edx, -36(%ebp)

 leal -40(%ebp), %esi  <<<< address of temp
 movl 8(%ebp), %edi    <<<< address of lhs
 cld                   <<<< copy direction
 movl $4, %ecx         <<<< 4 words
 rep                   <<<< block
 movsl                 <<<<   move

 addl $60, %esp
 popl %ebx
 popl %esi
 popl %edi
 popl %ebp
 ret
.LFE1:
.Lfe1:
 .size  test__FP11CNumeric128R11CNumeric128N21,.Lfe1-test__FP11CNumeric128R11CNumeric128N21
 .ident "GCC: (GNU) 2.96 20000731 (Red Hat Linux 7.1 2.96-81)"