[Bug tree-optimization/59478] New: Optimize variable access via byte copy

Wed Dec 11 23:41:00 GMT 2013

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=59478

            Bug ID: 59478
           Summary: Optimize variable access via byte copy
           Product: gcc
           Version: 4.9.0
            Status: UNCONFIRMED
          Severity: enhancement
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: olegendo at gcc dot gnu.org
            Target: sh*-*-*

The following happens at least on SH with trunk rev 205905 (4.9).
I'm not sure whether these issues are target specific or not.

Accessing float values as integers can be done in various ways.  One way is to
do a byte copy...

/* Return the bytes of VAL reinterpreted as an int.  The value is copied
   byte-wise into a char buffer and from there into the result; no numeric
   float-to-int conversion takes place.  */
int float_as_int (float val)
{
  /* Intermediate byte buffer that receives the bytes of VAL.  */
  char valbytes[sizeof (float)];
  __builtin_memcpy (valbytes, &val, sizeof (float));

  /* Only sizeof (float) bytes of RESULT are written; this assumes
     sizeof (int) == sizeof (float) -- TODO confirm for the target.  */
  int result;
  __builtin_memcpy (&result, valbytes, sizeof (float));

  return result;
}

The above compiled with -m4-single -ml -O2 results in:

        add     #-8,r15
        fmov.s  fr5,@r15
        mov.l   @r15,r0
        rts
        add     #8,r15

which is not so bad actually, but could be done better by utilizing the fpul
register, as it is done when using the union approach:

/* Return the bytes of VAL reinterpreted as an int by storing to the float
   member of a union and reading back its int member.  No numeric
   float-to-int conversion takes place.  Assumes sizeof (int) ==
   sizeof (float) -- TODO confirm for the target.  */
int float_as_int (float val)
{
  /* Both members share the same storage, so the store to F followed by the
     load from I hands back the stored bytes unchanged.  */
  union { int i; float f; } tmp;
  tmp.f = val;
  return tmp.i;
}

compiled with -m4-single -ml -O2:
        flds    fr5,fpul
        rts
        sts     fpul,r0

It seems that the memcpy variant above could be fixed with a combine pattern,
as combine is trying to match:

Failed to match this instruction:
(parallel [
        (set (mem/c:SF (plus:SI (reg/f:SI 153 sfp)
                    (const_int -8 [0xfffffffffffffff8])) [3  S4 A32])
            (reg:SF 69 fr5 [ val ]))
        (use (reg/v:PSI 151 ))
        (set (reg/f:SI 166)
            (plus:SI (reg/f:SI 153 sfp)
                (const_int -8 [0xfffffffffffffff8])))
    ])

However, this might have some side effects if the location in the stack frame
is actually never written.  So probably this should be handled earlier during
compilation.

When writing the memory copy manually, there seems to be another problem:

/* Return the bytes of VAL reinterpreted as an int, using hand-written
   byte copy loops instead of __builtin_memcpy.  No numeric float-to-int
   conversion takes place.  */
int float_as_int (float val)
{
  /* First loop: copy the bytes of VAL into an intermediate buffer.  */
  char valbytes[sizeof (float)];
  for (int i = 0; i < sizeof (float); ++i)
    valbytes[i] = ((char*)&val)[i];

  /* Second loop: copy the buffer into RESULT.  Only sizeof (float) bytes
     of RESULT are written; this assumes sizeof (int) == sizeof (float)
     -- TODO confirm for the target.  */
  int result;
  for (int i = 0; i < sizeof (float); ++i)
    ((char*)&result)[i] = valbytes[i];

  return result;
}

compiled with -m4-single -ml -O2:
        add     #-8,r15
        fmov.s  fr5,@r15       // store float at (sfp+0)

        mov.b   @(1,r15),r0    // load 4 bytes from (sfp+0)
        mov     r0,r7          // (loop is unrolled at -O2)
        mov.b   @(2,r15),r0
        mov     r0,r3
        mov.b   @(3,r15),r0
        mov     r0,r2
        mov.b   @r15,r0

        mov.b   r0,@(4,r15)   // store 4 bytes from (sfp+0) at (sfp+4)
        mov     r7,r0         // (loop is unrolled at -O2)
        mov.b   r0,@(5,r15)
        mov     r3,r0
        mov.b   r0,@(6,r15)
        mov     r2,r0
        mov.b   r0,@(7,r15)

        mov.l   @(4,r15),r0   // load int from (sfp+4)
        rts
        add     #8,r15

compiled with -m4-single -ml -O3:
        add     #-8,r15       // this is the same as using __builtin_memcpy
        fmov.s  fr5,@r15
        mov.l   @(0,r15),r0
        rts
        add     #8,r15

However, when just doing a simple byte-wise integer read, the loop is not
unrolled at -O2:
/* Assemble an int from the first sizeof (int) bytes at VAL, copying them
   one byte at a time in memory order.  VAL must point to at least
   sizeof (int) readable bytes.  */
int read_int (const char* val)
{
  int result;

  /* Fill RESULT through a char pointer, byte by byte.  */
  for (int i = 0; i < sizeof (int); ++i)
    ((char*)&result)[i] = val[i];

  return result;
}

compiled with -m4-single -ml -O2:
        add     #-4,r15
        mov     #0,r0
        mov     #0,r2
        mov     #4,r1
.L3:
        mov.b   @(r0,r4),r3
        add     #1,r2
        dt      r1
        mov.b   r3,@(r0,r15)
        bf/s    .L3
        mov     r2,r0
        mov.l   @(0,r15),r0
        rts
        add     #4,r15

compiled with -m4-single -ml -O3:
        add     #-4,r15
        mov.b   @r4,r0
        mov.b   r0,@r15
        mov.b   @(1,r4),r0
        mov.b   r0,@(1,r15)
        mov.b   @(2,r4),r0
        mov.b   r0,@(2,r15)
        mov.b   @(3,r4),r0
        mov.b   r0,@(3,r15)
        mov.l   @(0,r15),r0
        rts
        add     #4,r15