This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Optimize LTO streaming core routine


This optimizes the very core streaming routines, 
streamer_write_char_stream and streamer_write_uhwi_stream and
streamer_write_hwi_stream.

In streamer_write_char_stream you
can notice that writing the char possibly clobbers the pointer
(and everything else) because it uses alias set zero and may
point to an arbitrary location.  Thus we have to CSE
current_pointer manually here.

This also leads to very inefficient loops in
streamer_write_uhwi_stream and streamer_write_hwi_stream.
To optimize them we have to manually inline streamer_write_char_stream
and apply loop invariant motion and unswitching.

In streamer_write_hwi_stream we do the same but also note that
the

!       more = !((work == 0 && (byte & 0x40) == 0)
!              || (work == -1 && (byte & 0x40) != 0));

test is very inefficient.  We can optimize that if we split
the shifting of work into two pieces like

!       /* If the lower 7-bits are sign-extended 0 or -1 we are finished.  */
!       work >>= 6;
!       more = !(work == 0 || work == -1);
        if (more)
!       {
!         /* More bits to follow.  */
!         work >>= 1;
!         byte |= 0x80;

which results in a very nice core loop

.L21:
        movq    %rbp, %rsi
        movl    %ebp, %ecx
        sarq    $6, %rsi
        andl    $127, %ecx
        addq    $1, %rsi
        cmpq    $1, %rsi
        jbe     .L26
        orb     $-128, %cl
        sarq    $7, %rbp
        addl    $1, %r12d
        movb    %cl, (%rax)
        addq    $1, %rax
        subl    $1, %edx
        jne     .L21

and threaded tail for the !more case:

.L26:
        movb    %cl, (%rax)
        subl    $1, %edx
        addq    $1, %rax
        addl    $1, %r12d
        movq    %rax, 16(%rbx)
        addl    %r12d, 32(%rbx)
        movl    %edx, 24(%rbx)
...
	ret

(above produced by g++ 4.6 with -O2).  Compared with what is
there before the patch:

...
        movl    24(%rdi), %eax
        jmp     .L48
        .p2align 4,,10
        .p2align 3
.L52:
        xorl    %r13d, %r13d
        testb   $64, %r12b
        je      .L46
.L45:
        orb     $-128, %r12b
        movl    $1, %r13d
.L46:
        testl   %eax, %eax
        jne     .L47
        movq    %rbx, %rdi
        call    _Z16lto_append_blockP17lto_output_stream
.L47:
        movq    16(%rbx), %rax
        movb    %r12b, (%rax)
        movl    24(%rbx), %eax
        addq    $1, 16(%rbx)
        addl    $1, 32(%rbx)
        subl    $1, %eax
        testl   %r13d, %r13d
        movl    %eax, 24(%rbx)
        je      .L51
.L48:
        movl    %ebp, %r12d
        sarq    $7, %rbp
        andl    $127, %r12d
        testq   %rbp, %rbp
        je      .L52
        cmpq    $-1, %rbp
        jne     .L45
        xorl    %r13d, %r13d
        testb   $64, %r12b
        jne     .L46
        jmp     .L45
        .p2align 4,,10
        .p2align 3
.L51:
        addq    $8, %rsp
...
	ret

that's a _lot_ better.

And hopefully gets streaming down in the profile somewhat.

LTO bootstrapped and tested on x86_64-unknown-linux-gnu, applied
to trunk.

Richard.

2013-06-12  Richard Biener  <rguenther@suse.de>

	* data-streamer.h (streamer_write_char_stream): CSE
	obs->current_pointer.
	* data-streamer-out.c (streamer_write_uhwi_stream): Inline
	streamer_write_char_stream manually and optimize the resulting loop.
	(streamer_write_hwi_stream): Likewise.

Index: gcc/data-streamer.h
===================================================================
*** gcc/data-streamer.h	(revision 199935)
--- gcc/data-streamer.h	(working copy)
*************** streamer_write_char_stream (struct lto_o
*** 183,190 ****
      lto_append_block (obs);
  
    /* Write the actual character.  */
!   *obs->current_pointer = c;
!   obs->current_pointer++;
    obs->total_size++;
    obs->left_in_block--;
  }
--- 183,191 ----
      lto_append_block (obs);
  
    /* Write the actual character.  */
!   char *current_pointer = obs->current_pointer;
!   *(current_pointer++) = c;
!   obs->current_pointer = current_pointer;
    obs->total_size++;
    obs->left_in_block--;
  }
Index: gcc/data-streamer-out.c
===================================================================
*** gcc/data-streamer-out.c	(revision 199935)
--- gcc/data-streamer-out.c	(working copy)
*************** void
*** 187,192 ****
--- 187,197 ----
  streamer_write_uhwi_stream (struct lto_output_stream *obs,
                              unsigned HOST_WIDE_INT work)
  {
+   if (obs->left_in_block == 0)
+     lto_append_block (obs);
+   char *current_pointer = obs->current_pointer;
+   unsigned int left_in_block = obs->left_in_block;
+   unsigned int size = 0;
    do
      {
        unsigned int byte = (work & 0x7f);
*************** streamer_write_uhwi_stream (struct lto_o
*** 195,203 ****
  	/* More bytes to follow.  */
  	byte |= 0x80;
  
!       streamer_write_char_stream (obs, byte);
      }
!   while (work != 0);
  }
  
  
--- 200,233 ----
  	/* More bytes to follow.  */
  	byte |= 0x80;
  
!       *(current_pointer++) = byte;
!       left_in_block--;
!       size++;
      }
!   while (work != 0 && left_in_block > 0);
!   if (work != 0)
!     {
!       obs->left_in_block = 0;
!       lto_append_block (obs);
!       current_pointer = obs->current_pointer;
!       left_in_block = obs->left_in_block;
!       do
! 	{
! 	  unsigned int byte = (work & 0x7f);
! 	  work >>= 7;
! 	  if (work != 0)
! 	    /* More bytes to follow.  */
! 	    byte |= 0x80;
! 
! 	  *(current_pointer++) = byte;
! 	  left_in_block--;
! 	  size++;
! 	}
!       while (work != 0);
!     }
!   obs->current_pointer = current_pointer;
!   obs->left_in_block = left_in_block;
!   obs->total_size += size;
  }
  
  
*************** streamer_write_uhwi_stream (struct lto_o
*** 206,226 ****
  void
  streamer_write_hwi_stream (struct lto_output_stream *obs, HOST_WIDE_INT work)
  {
!   int more, byte;
! 
    do
      {
!       byte = (work & 0x7f);
!       /* arithmetic shift */
!       work >>= 7;
!       more = !((work == 0 && (byte & 0x40) == 0)
! 	       || (work == -1 && (byte & 0x40) != 0));
        if (more)
! 	byte |= 0x80;
! 
!       streamer_write_char_stream (obs, byte);
      }
!   while (more);
  }
  
  /* Write a GCOV counter value WORK to OBS.  */
--- 236,291 ----
  void
  streamer_write_hwi_stream (struct lto_output_stream *obs, HOST_WIDE_INT work)
  {
!   if (obs->left_in_block == 0)
!     lto_append_block (obs);
!   char *current_pointer = obs->current_pointer;
!   unsigned int left_in_block = obs->left_in_block;
!   unsigned int size = 0;
!   bool more;
    do
      {
!       unsigned int byte = (work & 0x7f);
!       /* If the lower 7-bits are sign-extended 0 or -1 we are finished.  */
!       work >>= 6;
!       more = !(work == 0 || work == -1);
        if (more)
! 	{
! 	  /* More bits to follow.  */
! 	  work >>= 1;
! 	  byte |= 0x80;
! 	}
! 
!       *(current_pointer++) = byte;
!       left_in_block--;
!       size++;
!     }
!   while (more && left_in_block > 0);
!   if (more)
!     {
!       obs->left_in_block = 0;
!       lto_append_block (obs);
!       current_pointer = obs->current_pointer;
!       left_in_block = obs->left_in_block;
!       do
! 	{
! 	  unsigned int byte = (work & 0x7f);
! 	  work >>= 6;
! 	  more = !(work == 0 || work == -1);
! 	  if (more)
! 	    {
! 	      work >>= 1;
! 	      byte |= 0x80;
! 	    }
! 
! 	  *(current_pointer++) = byte;
! 	  left_in_block--;
! 	  size++;
! 	}
!       while (more);
      }
!   obs->current_pointer = current_pointer;
!   obs->left_in_block = left_in_block;
!   obs->total_size += size;
  }
  
  /* Write a GCOV counter value WORK to OBS.  */


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]