Bug 87502 - Poor code generation for std::string("c-style string")
Summary: Poor code generation for std::string("c-style string")
Status: UNCONFIRMED
Alias: None
Product: gcc
Classification: Unclassified
Component: libstdc++ (show other bugs)
Version: 8.2.0
: P3 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks:
 
Reported: 2018-10-03 14:34 UTC by M Welinder
Modified: 2023-12-30 20:43 UTC (History)
4 users (show)

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed:


Attachments
Preprocessed source code (62.20 KB, text/plain)
2018-10-03 14:34 UTC, M Welinder
Details
Proof-on-concept patch (567 bytes, patch)
2018-10-04 18:19 UTC, M Welinder
Details | Diff

Note You need to log in before you can comment on or make changes to this bug.
Description M Welinder 2018-10-03 14:34:23 UTC
Created attachment 44776 [details]
Preprocessed source code

It appears that gcc is creating quite poor code when "c-style strings"
are used to construct std::string objects.  Ideally, the result ought
to be just a few move instructions for small strings.


Host: Linux x86_64 4.4.140-62-default (OpenSuSE)

Test code:
---------------------------------------------------------------
#include <string>

extern void foo (const std::string &);

void
bar ()
{
  foo ("abc");
  foo (std::string("abc"));
}
---------------------------------------------------------------



# /usr/local/products/gcc/8.2.0/bin/g++ -std=gnu++1z  -S -m32 -O3 ttt.C
# grep 'call.*construct' ttt.s 
	call	_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.constprop.18
	call	_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.constprop.18

Here gcc generates complete calls to the generic string construction
even though the strings are constructed from small, known strings.

"-std=gnu++1z" is important; "-m32" and "-O3" (as opposed to "-m64" and
"-O2") are not.

# /usr/local/products/gcc/8.2.0/bin/g++ -S -m32 -O3 ttt.C
# grep 'call.*construct' ttt.s
# (nada)

No calls -- good.  In this case gcc generates this fragment:

_Z3barv:
.LFB1084:
	.cfi_startproc
	.cfi_personality 0,__gxx_personality_v0
	.cfi_lsda 0,.LLSDA1084
	pushl	%ebp
	.cfi_def_cfa_offset 8
	.cfi_offset 5, -8
	movl	$25185, %edx
	movl	%esp, %ebp
	.cfi_def_cfa_register 5
	pushl	%edi
	pushl	%esi
	.cfi_offset 7, -12
	.cfi_offset 6, -16
	leal	-48(%ebp), %esi
	pushl	%ebx
	.cfi_offset 3, -20
	leal	-40(%ebp), %ebx
	subl	$56, %esp
	movl	%ebx, -48(%ebp)
	pushl	%esi
	movw	%dx, -40(%ebp)
	movb	$99, -38(%ebp)
	movl	$3, -44(%ebp)
	movb	$0, -37(%ebp)
.LEHB6:
	.cfi_escape 0x2e,0x10
	call	_Z3fooRKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE
[...]

This is better than a call, but not great:
1. The string is moved into position in three chunks (25185, 99, 0).
   This probably comes from inlined memcpy of 3 bytes, but the source
   is zero-terminated so rounding the memcpy size up to 4 would have
   been better.
2. It's unclear why 25185 is passed through a register.
Comment 1 Martin Liška 2018-10-04 15:04:58 UTC
(In reply to M Welinder from comment #0)
> Created attachment 44776 [details]
> Preprocessed source code
> 
> It appears that gcc is creating quite poor code when "c-style strings"
> are used to construct std::string objects.  Ideally, the result ought
> to be just a few move instructions for small strings.
> 
> 
> Host: Linux x86_64 4.4.140-62-default (OpenSuSE)
> 
> Test code:
> ---------------------------------------------------------------
> #include <string>
> 
> extern void foo (const std::string &);
> 
> void
> bar ()
> {
>   foo ("abc");
>   foo (std::string("abc"));
> }
> ---------------------------------------------------------------
> 
> 
> 
> # /usr/local/products/gcc/8.2.0/bin/g++ -std=gnu++1z  -S -m32 -O3 ttt.C
> # grep 'call.*construct' ttt.s 
> 	call
> _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S
> 8_St20forward_iterator_tag.constprop.18
> 	call
> _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S
> 8_St20forward_iterator_tag.constprop.18
> 
> Here gcc generates complete calls to the generic string construction
> even though the strings are constructed from small, known strings.

With -O2 -fdump-ipa-inline says:
function not declared inline and code size would grow

> 
> "-std=gnu++1z" is important; "-m32" and "-O3" (as opposed to "-m64" and
> "-O2") are not.

With -O3 more inlining happens.

> 
> # /usr/local/products/gcc/8.2.0/bin/g++ -S -m32 -O3 ttt.C
> # grep 'call.*construct' ttt.s
> # (nada)
> 
> No calls -- good.  In this case gcc generates this fragment:
> 
> _Z3barv:
> .LFB1084:
> 	.cfi_startproc
> 	.cfi_personality 0,__gxx_personality_v0
> 	.cfi_lsda 0,.LLSDA1084
> 	pushl	%ebp
> 	.cfi_def_cfa_offset 8
> 	.cfi_offset 5, -8
> 	movl	$25185, %edx
> 	movl	%esp, %ebp
> 	.cfi_def_cfa_register 5
> 	pushl	%edi
> 	pushl	%esi
> 	.cfi_offset 7, -12
> 	.cfi_offset 6, -16
> 	leal	-48(%ebp), %esi
> 	pushl	%ebx
> 	.cfi_offset 3, -20
> 	leal	-40(%ebp), %ebx
> 	subl	$56, %esp
> 	movl	%ebx, -48(%ebp)
> 	pushl	%esi
> 	movw	%dx, -40(%ebp)
> 	movb	$99, -38(%ebp)
> 	movl	$3, -44(%ebp)
> 	movb	$0, -37(%ebp)
> .LEHB6:
> 	.cfi_escape 0x2e,0x10
> 	call	_Z3fooRKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE
> [...]
> 
> This is better than a call, but not great:
> 1. The string is moved into position in three chunks (25185, 99, 0).
>    This probably comes from inlined memcpy of 3 bytes, but the source
>    is zero-terminated so rounding the memcpy size up to 4 would have
>    been better.

Yes we end up with:
  __builtin_memcpy (&D.30710.D.23004._M_local_buf, "abc", 3);


> 2. It's unclear why 25185 is passed through a register.

It's somehow connected to fact that constant are somehow expensive
on x86_64. Jakub can help here..
Comment 2 M Welinder 2018-10-04 18:19:27 UTC
Created attachment 44789 [details]
Proof-on-concept patch

This proof-of-concept improves the situation dramatically.  I don't
know if it is actually correct.  This is done, in part, by not discarding
the fact that the source is zero-terminated.

With this patch I get the code I expect, mostly.  For sizes 2 mod 4 I
still get the zero termination as a separate movb.

The destruction still stinks: the full destructor is inlined instead of
the small-string-only version (i.e., a no-op).  Evidently gcc cannot
see that the string remains a small-string.
Comment 3 Marc Glisse 2018-10-04 20:25:10 UTC
(In reply to M Welinder from comment #2)
> The destruction still stinks: the full destructor is inlined instead of
> the small-string-only version (i.e., a no-op).  Evidently gcc cannot
> see that the string remains a small-string.

void foo (const std::string &s){
  const_cast<std::string&>(s)="some longer string so it needs proper deletion";
}
Comment 4 Martin Liška 2018-10-05 07:03:08 UTC
Ccing Jonathan, libstdc++ maintainer.
Comment 5 Andrew Pinski 2018-10-05 07:22:49 UTC
The C++17 calling a function part of this issue is record as PR 86590 already.

The rest is a different issue and should be looked into.
Comment 6 M Welinder 2018-10-05 12:17:25 UTC
> const_cast<std::string&>(s)="some longer string so it needs proper deletion";

Is that really valid?

This comes down to whether the temporary object creating with the function
call is constant [in which case the above is UB] or not [in which case the
above is perfectly fine].

If I am reading http://cpp14.centaur.ath.cx/dcl.init.ref.html right then
the object is created with the const/volatile specifiers of the argument
type, in this case const.  If so, such an object must not be changed.

Just in case I am reading it wrong, I still say the generated code is
excessive.  gcc creates 16 instructions that, among other things, check
the _S_force_new flag.  Most of those instructions will never be executed
in practice.  I think the generated code, at least for the
temporary-from-small-string case, should simply be

    movq (%rsp), %rdi
    leaq 16(%rsp), %rax
    cmpq %rax, %rdi                ;; check if buffer is allocated
    je .L42
    call do_the_complicated_thing
.L42:
Comment 7 M Welinder 2018-10-05 12:48:58 UTC
Actually, it's more like 50+ instructions for destructing the string that
never (or almost never) needs destructing.  16 of those appear to need
linker fixup.

Sample for the C++14 mode:




	call	Z3fooRKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE
.LEHE5:
	movq	(%rsp), %rbx
	leaq	16(%rsp), %rax
	cmpq	%rax, %rbx
	je	.L72
	movq	16(%rsp), %rsi
	addq	$1, %rsi
	je	.L72
	testq	%rbx, %rbx
	je	.L72
	cmpq	$128, %rsi
	ja	.L73
	movl	_ZN9__gnu_cxx12__pool_allocIcE12_S_force_newE(%rip), %r9d
	testl	%r9d, %r9d
	jle	.L74
.L73:
	movq	%rbx, %rdi
	call	_ZdlPv
.L72:
[done]
...
[out-of-band code]
	.p2align 4,,10
	.p2align 3
.L74:
	movq	%rsp, %rdi
	movl	$_ZL28__gthrw___pthread_key_createPjPFvPvE, %ebp
	call	_ZN9__gnu_cxx17__pool_alloc_base16_M_get_free_listEm
	movq	%rsp, %rdi
	movq	%rax, %r12
	call	_ZN9__gnu_cxx17__pool_alloc_base12_M_get_mutexEv
	movq	%rax, %r13
	testq	%rbp, %rbp
	je	.L75
	movq	%rax, %rdi
	call	_ZL26__gthrw_pthread_mutex_lockP15pthread_mutex_t
	testl	%eax, %eax
	je	.L75
	movl	$8, %edi
	call	__cxa_allocate_exception
	movl	$_ZN9__gnu_cxx24__concurrence_lock_errorD1Ev, %edx
	movl	$_ZTIN9__gnu_cxx24__concurrence_lock_errorE, %esi
	movq	$_ZTVN9__gnu_cxx24__concurrence_lock_errorE+16, (%rax)
	movq	%rax, %rdi
.LEHB20:
	call	__cxa_throw
.LEHE20:
	.p2align 4,,10
	.p2align 3
.L75:
	movq	(%r12), %rax
	movq	%rax, (%rbx)
	movq	%rbx, (%r12)
	testq	%rbp, %rbp
	je	.L72
	movq	%r13, %rdi
	call	_ZL28__gthrw_pthread_mutex_unlockP15pthread_mutex_t
	testl	%eax, %eax
	je	.L72
	movl	$8, %edi
	call	__cxa_allocate_exception
	movl	$_ZN9__gnu_cxx26__concurrence_unlock_errorD1Ev, %edx
	movl	$_ZTIN9__gnu_cxx26__concurrence_unlock_errorE, %esi
	movq	$_ZTVN9__gnu_cxx26__concurrence_unlock_errorE+16, (%rax)
	movq	%rax, %rdi
.LEHB21:
	call	__cxa_throw
Comment 8 Andrew Pinski 2021-12-25 12:14:31 UTC
(In reply to Marc Glisse from comment #3)
> (In reply to M Welinder from comment #2)
> > The destruction still stinks: the full destructor is inlined instead of
> > the small-string-only version (i.e., a no-op).  Evidently gcc cannot
> > see that the string remains a small-string.
> 
> void foo (const std::string &s){
>   const_cast<std::string&>(s)="some longer string so it needs proper
> deletion";
> }

I submitted PR 103827 for the similar case but only passed via a hidden reference instead.