This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH] Optimize LTO streaming core routine
- From: Richard Biener <rguenther at suse dot de>
- To: gcc-patches at gcc dot gnu dot org
- Date: Wed, 12 Jun 2013 13:06:07 +0200 (CEST)
- Subject: [PATCH] Optimize LTO streaming core routine
This optimizes the very core streaming routines,
streamer_write_char_stream and streamer_write_uhwi_stream and
streamer_write_hwi_stream.
In streamer_write_char_stream you
can notice that writing the char possibly clobbers obs->current_pointer
(and everything else) because the store uses alias set zero and may
point to an arbitrary location. Thus we have to CSE
current_pointer manually here.
This also leads to very inefficient loops in
streamer_write_uhwi_stream and streamer_write_hwi_stream.
To optimize them we have to manually inline streamer_write_char_stream
and apply loop invariant motion and unswitching.
In streamer_write_hwi_stream we do the same but also note that
the
! more = !((work == 0 && (byte & 0x40) == 0)
! || (work == -1 && (byte & 0x40) != 0));
test is very inefficient. We can optimize that if we split
the shifting of work into two pieces like
! /* If the lower 7-bits are sign-extended 0 or -1 we are finished. */
! work >>= 6;
! more = !(work == 0 || work == -1);
if (more)
! {
! /* More bits to follow. */
! work >>= 1;
! byte |= 0x80;
which results in a very nice core loop
.L21:
movq %rbp, %rsi
movl %ebp, %ecx
sarq $6, %rsi
andl $127, %ecx
addq $1, %rsi
cmpq $1, %rsi
jbe .L26
orb $-128, %cl
sarq $7, %rbp
addl $1, %r12d
movb %cl, (%rax)
addq $1, %rax
subl $1, %edx
jne .L21
and threaded tail for the !more case:
.L26:
movb %cl, (%rax)
subl $1, %edx
addq $1, %rax
addl $1, %r12d
movq %rax, 16(%rbx)
addl %r12d, 32(%rbx)
movl %edx, 24(%rbx)
...
ret
(above produced by g++ 4.6 with -O2). Compared with what is
there before the patch:
...
movl 24(%rdi), %eax
jmp .L48
.p2align 4,,10
.p2align 3
.L52:
xorl %r13d, %r13d
testb $64, %r12b
je .L46
.L45:
orb $-128, %r12b
movl $1, %r13d
.L46:
testl %eax, %eax
jne .L47
movq %rbx, %rdi
call _Z16lto_append_blockP17lto_output_stream
.L47:
movq 16(%rbx), %rax
movb %r12b, (%rax)
movl 24(%rbx), %eax
addq $1, 16(%rbx)
addl $1, 32(%rbx)
subl $1, %eax
testl %r13d, %r13d
movl %eax, 24(%rbx)
je .L51
.L48:
movl %ebp, %r12d
sarq $7, %rbp
andl $127, %r12d
testq %rbp, %rbp
je .L52
cmpq $-1, %rbp
jne .L45
xorl %r13d, %r13d
testb $64, %r12b
jne .L46
jmp .L45
.p2align 4,,10
.p2align 3
.L51:
addq $8, %rsp
...
ret
that's a _lot_ better.
And hopefully gets streaming down in the profile somewhat.
LTO bootstrapped and tested on x86_64-unknown-linux-gnu, applied
to trunk.
Richard.
2013-06-12 Richard Biener <rguenther@suse.de>
* data-streamer.h (streamer_write_char_stream): CSE
obs->current_pointer.
* data-streamer-out.c (streamer_write_uhwi_stream): Inline
streamer_write_char_stream manually and optimize the resulting loop.
(streamer_write_hwi_stream): Likewise.
Index: gcc/data-streamer.h
===================================================================
*** gcc/data-streamer.h (revision 199935)
--- gcc/data-streamer.h (working copy)
*************** streamer_write_char_stream (struct lto_o
*** 183,190 ****
lto_append_block (obs);
/* Write the actual character. */
! *obs->current_pointer = c;
! obs->current_pointer++;
obs->total_size++;
obs->left_in_block--;
}
--- 183,191 ----
lto_append_block (obs);
/* Write the actual character. */
! char *current_pointer = obs->current_pointer;
! *(current_pointer++) = c;
! obs->current_pointer = current_pointer;
obs->total_size++;
obs->left_in_block--;
}
Index: gcc/data-streamer-out.c
===================================================================
*** gcc/data-streamer-out.c (revision 199935)
--- gcc/data-streamer-out.c (working copy)
*************** void
*** 187,192 ****
--- 187,197 ----
streamer_write_uhwi_stream (struct lto_output_stream *obs,
unsigned HOST_WIDE_INT work)
{
+ if (obs->left_in_block == 0)
+ lto_append_block (obs);
+ char *current_pointer = obs->current_pointer;
+ unsigned int left_in_block = obs->left_in_block;
+ unsigned int size = 0;
do
{
unsigned int byte = (work & 0x7f);
*************** streamer_write_uhwi_stream (struct lto_o
*** 195,203 ****
/* More bytes to follow. */
byte |= 0x80;
! streamer_write_char_stream (obs, byte);
}
! while (work != 0);
}
--- 200,233 ----
/* More bytes to follow. */
byte |= 0x80;
! *(current_pointer++) = byte;
! left_in_block--;
! size++;
}
! while (work != 0 && left_in_block > 0);
! if (work != 0)
! {
! obs->left_in_block = 0;
! lto_append_block (obs);
! current_pointer = obs->current_pointer;
! left_in_block = obs->left_in_block;
! do
! {
! unsigned int byte = (work & 0x7f);
! work >>= 7;
! if (work != 0)
! /* More bytes to follow. */
! byte |= 0x80;
!
! *(current_pointer++) = byte;
! left_in_block--;
! size++;
! }
! while (work != 0);
! }
! obs->current_pointer = current_pointer;
! obs->left_in_block = left_in_block;
! obs->total_size += size;
}
*************** streamer_write_uhwi_stream (struct lto_o
*** 206,226 ****
void
streamer_write_hwi_stream (struct lto_output_stream *obs, HOST_WIDE_INT work)
{
! int more, byte;
!
do
{
! byte = (work & 0x7f);
! /* arithmetic shift */
! work >>= 7;
! more = !((work == 0 && (byte & 0x40) == 0)
! || (work == -1 && (byte & 0x40) != 0));
if (more)
! byte |= 0x80;
!
! streamer_write_char_stream (obs, byte);
}
! while (more);
}
/* Write a GCOV counter value WORK to OBS. */
--- 236,291 ----
void
streamer_write_hwi_stream (struct lto_output_stream *obs, HOST_WIDE_INT work)
{
! if (obs->left_in_block == 0)
! lto_append_block (obs);
! char *current_pointer = obs->current_pointer;
! unsigned int left_in_block = obs->left_in_block;
! unsigned int size = 0;
! bool more;
do
{
! unsigned int byte = (work & 0x7f);
! /* If the lower 7-bits are sign-extended 0 or -1 we are finished. */
! work >>= 6;
! more = !(work == 0 || work == -1);
if (more)
! {
! /* More bits to follow. */
! work >>= 1;
! byte |= 0x80;
! }
!
! *(current_pointer++) = byte;
! left_in_block--;
! size++;
! }
! while (more && left_in_block > 0);
! if (more)
! {
! obs->left_in_block = 0;
! lto_append_block (obs);
! current_pointer = obs->current_pointer;
! left_in_block = obs->left_in_block;
! do
! {
! unsigned int byte = (work & 0x7f);
! work >>= 6;
! more = !(work == 0 || work == -1);
! if (more)
! {
! work >>= 1;
! byte |= 0x80;
! }
!
! *(current_pointer++) = byte;
! left_in_block--;
! size++;
! }
! while (more);
}
! obs->current_pointer = current_pointer;
! obs->left_in_block = left_in_block;
! obs->total_size += size;
}
/* Write a GCOV counter value WORK to OBS. */