This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[Bug tree-optimization/33291] New: a+=2; a+=2 not simplified to a+=4; with -O3 (ok with gcc-4.2.1)


I triggered this in the inner loop of the CPU emulation code of openMSX
(http://openmsx.sf.net/). I tried to reduce the code. Below is the smallest
code I could come up with that still shows the problem:

-------------------------------------------
// Base class of the reduced test case. It owns the counter `a` that the
// delay() calls in CPU increment through add().
struct Clock {
 // Declared only; no definition appears in this test case.
 void f();
 // Adds n to the counter; defined inline in the class body.
 void add(unsigned n) { a += n; }
 int a;
};

// Derived class of the reduced test case. It inherits the counter from Clock
// and advances it in steps of 2 through delay().
// NOTE(review): the virtual destructor presumably places a vtable pointer
// ahead of the Clock subobject, which would match the 8(%rbx) accesses in the
// assembly listing that follows -- confirm against the ABI.
struct CPU : Clock {
 virtual ~CPU();
 // Declared only; taken by readFast() when the ptrs[] lookup yields null.
 unsigned char readSlow();
 void execute();

 // Adds 2 to the inherited counter via Clock::add().
 void delay() { add(2); }
 // Looks up ptrs[addr >> 8]. If the entry is non-null, calls delay() twice
 // and returns the byte at p[addr & 255]; otherwise returns readSlow().
 // The two back-to-back delay() calls (### 1, ### 2) are the ones this
 // report expects to be merged into a single "a += 4".
 unsigned char readFast() {
  if (unsigned char* p = ptrs[addr >> 8]) {
   // fast-path
   delay();     // ### 1
   delay();     // ### 2
   return p[addr & 255];
  } else {
   // slow-path
   return readSlow();
  }
 }

 // Pointer-to-member-function type used for the dispatch table below.
 typedef void (CPU::*FuncPtr)();
 // Dispatch table; execute() indexes it with the byte returned by readFast().
 static FuncPtr tab[256];
 // Indexed by addr >> 8 in readFast(); a null entry selects the slow path.
 unsigned char* ptrs[256];
 unsigned addr;
};

// Calls f() once, then loops forever: reads a byte with readFast(), calls
// delay() once more (# 3), and invokes the member function stored in tab[b]
// on this object. The loop has no exit condition.
void CPU::execute() {
 f();
 while (true) {
  unsigned char b = readFast();
  delay();       // # 3
  (this->*tab[b])();
 }
}
----------------------------------------

When compiled with SVN revision 128037 on a linux x86_64 machine:

> g++ -O3 -S CPU.ii
> cat -n CPU.s
     1          .file   "CPU.ii"
     2          .text
     3          .align 2
     4          .p2align 4,,15
     5  .globl _ZN3CPU7executeEv
     6          .type   _ZN3CPU7executeEv, @function
     7  _ZN3CPU7executeEv:
     8  .LFB5:
     9          pushq   %rbp
    10  .LCFI0:
    11          leaq    8(%rdi), %rbp
    12          pushq   %rbx
    13  .LCFI1:
    14          movq    %rdi, %rbx
    15          movq    %rbp, %rdi
    16          subq    $8, %rsp
    17  .LCFI2:
    18          call    _ZN5Clock1fEv
    19          .p2align 4,,10
    20          .p2align 3
    21  .L6:
    22          movl    2064(%rbx), %eax
    23          shrl    $8, %eax
    24          mov     %eax, %eax
    25          movq    16(%rbx,%rax,8), %rdx
    26          testq   %rdx, %rdx
    27          je      .L2
    28          movl    8(%rbx), %eax            ###
    29          addl    $2, %eax                 ### 1
    30          movl    %eax, (%rbp)             ###
    31          movl    8(%rbx), %eax            ###
    32          addl    $2, %eax                 ### 2
    33          movl    %eax, (%rbp)             ###
    34          movzbl  2064(%rbx), %eax
    35          movzbl  (%rdx,%rax), %edx
    36  .L3:
    37          movl    8(%rbx), %eax            #
    38          addl    $2, %eax                 # 3
    39          movl    %eax, (%rbp)             #
    40          movzbl  %dl, %eax
    41          salq    $4, %rax
    42          movq    _ZN3CPU3tabE(%rax), %rdx
    43          testb   $1, %dl
    44          jne     .L4
    45          movq    %rbx, %rdi
    46          addq    _ZN3CPU3tabE+8(%rax), %rdi
    47          call    *%rdx
    48          jmp     .L6
    49          .p2align 4,,10
    50          .p2align 3
    51  .L4:
    52          movq    %rbx, %rdi
    53          addq    _ZN3CPU3tabE+8(%rax), %rdi
    54          movq    (%rdi), %rax
    55          movq    -1(%rdx,%rax), %rdx
    56          call    *%rdx
    57          jmp     .L6
    58  .L2:
    59          movq    %rbx, %rdi
    60          call    _ZN3CPU8readSlowEv
    61          movl    %eax, %edx
    62          .p2align 4,,4
    63          .p2align 3
    64          jmp     .L3
    [skipped the rest of the output]

The missed optimization is visible in lines 28-33 of the listing above. It also
seems strange to me that the variable is read via 8(%rbx) but written via (%rbp).

gcc-4.2.1 does a better job on this: it optimizes the two consecutive delay()
calls to just:   addl $4, 8(%rbx)




Additionally, I would have preferred that all three delay() calls be
collapsed into a single instruction in the fast code path (and partly
duplicated as   a+=4; readSlow(); a+=2;   in the slow path). But I understand
this might be more difficult to implement.


-- 
           Summary: a+=2; a+=2  not simplified to  a+=4;  with -O3   (ok
                    with gcc-4.2.1)
           Product: gcc
           Version: 4.3.0
            Status: UNCONFIRMED
          Severity: minor
          Priority: P3
         Component: tree-optimization
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: wouter dot vermaelen at scarlet dot be


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33291


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]