(Stumbled on this odd effect while examining codegen for operator<=>.)

The reduced sample compiles as C++11 and up (the std version is likely irrelevant), with the differing codegen appearing since GCC 10. Two identical functions:

bool cmp_x(cmp l, cmp r) noexcept
{
    return std::lexicographical_compare(begin(l), end(l),
                                        begin(r), end(r));
}

bool cmp_y(cmp l, cmp r) noexcept
{
    return std::lexicographical_compare(begin(l), end(l),
                                        begin(r), end(r));
}

generate very different code for cmp = std::array<int,64> at -O2 and -O3: the first is a compact loop, the second is much longer unrolled codegen. My benchmarks show a 40% difference in runtime; quick-bench shows 30%.

Compiler Explorer: https://godbolt.org/z/97box6
Quick-bench (1.3x): https://quick-bench.com/q/480qkw1sP4OWOH6JBxsm-J_9uOk

(Adding -fno-inline may provide a clue.)
Here's the code, compiler invocation and codegen output. The longer codegen expands memcpy to copy the std::array by-value arguments. -fno-inline shows the compiler calling the first function from the second; when it does so, it has to copy the arguments, as both functions take them by value.

/** compare_differing_codegen.cpp ******************/

#include <algorithm>
#include <array>

using cmp = std::array<int,64>;

bool cmp_x(cmp l, cmp r) noexcept
{
    return std::lexicographical_compare(begin(l), end(l),
                                        begin(r), end(r));
}

bool cmp_y(cmp l, cmp r) noexcept
{
    return std::lexicographical_compare(begin(l), end(l),
                                        begin(r), end(r));
}

/** compiler invocation **************************/

> g++ --version
g++ (GCC) 10.1.1 20200507 (Red Hat 10.1.1-1)

> g++ -std=c++11 -O2 compare_differing_codegen.cpp -S
> cat compare_differing_codegen.s

        .file   "compare_differing_codegen.cpp"
        .text
        .p2align 4
        .globl  _Z5cmp_xSt5arrayIiLm64EES0_
        .type   _Z5cmp_xSt5arrayIiLm64EES0_, @function
_Z5cmp_xSt5arrayIiLm64EES0_:
.LFB890:
        .cfi_startproc
        leaq    264(%rsp), %rcx
        leaq    8(%rsp), %rax
        movq    %rcx, %rdx
        .p2align 4,,10
        .p2align 3
.L4:
        movl    (%rdx), %esi
        cmpl    %esi, (%rax)
        jl      .L12
        jg      .L7
        addq    $4, %rax
        addq    $4, %rdx
        cmpq    %rcx, %rax
        jne     .L4
        leaq    520(%rsp), %rax
        cmpq    %rax, %rdx
        setne   %al
        ret
        .p2align 4,,10
        .p2align 3
.L12:
        movl    $1, %eax
        ret
        .p2align 4,,10
        .p2align 3
.L7:
        xorl    %eax, %eax
        ret
        .cfi_endproc
.LFE890:
        .size   _Z5cmp_xSt5arrayIiLm64EES0_, .-_Z5cmp_xSt5arrayIiLm64EES0_
        .p2align 4
        .globl  _Z5cmp_ySt5arrayIiLm64EES0_
        .type   _Z5cmp_ySt5arrayIiLm64EES0_, @function
_Z5cmp_ySt5arrayIiLm64EES0_:
.LFB909:
        .cfi_startproc
        subq    $400, %rsp
        .cfi_def_cfa_offset 408
        movdqu  408(%rsp), %xmm0
        leaq    -120(%rsp), %rdx
        movdqu  424(%rsp), %xmm1
        leaq    136(%rsp), %rax
        movdqu  440(%rsp), %xmm2
        movdqu  456(%rsp), %xmm3
        movdqu  472(%rsp), %xmm4
        movups  %xmm0, -120(%rsp)
        movdqu  488(%rsp), %xmm5
        movdqu  504(%rsp), %xmm6
        movups  %xmm1, -104(%rsp)
        movdqu  520(%rsp), %xmm7
        movdqu  536(%rsp), %xmm0
        movups  %xmm2, -88(%rsp)
        movdqu  552(%rsp), %xmm1
        movdqu  568(%rsp), %xmm2
        movups  %xmm3, -72(%rsp)
        movdqu  584(%rsp), %xmm3
        movups  %xmm4, -56(%rsp)
        movdqu  600(%rsp), %xmm4
        movups  %xmm5, -40(%rsp)
        movdqu  616(%rsp), %xmm5
        movups  %xmm6, -24(%rsp)
        movdqu  632(%rsp), %xmm6
        movups  %xmm7, -8(%rsp)
        movdqu  648(%rsp), %xmm7
        movups  %xmm0, 8(%rsp)
        movups  %xmm1, 24(%rsp)
        movups  %xmm2, 40(%rsp)
        movups  %xmm3, 56(%rsp)
        movups  %xmm4, 72(%rsp)
        movups  %xmm5, 88(%rsp)
        movups  %xmm6, 104(%rsp)
        movups  %xmm7, 120(%rsp)
        movdqu  664(%rsp), %xmm0
        movdqu  680(%rsp), %xmm1
        movdqu  696(%rsp), %xmm2
        movdqu  712(%rsp), %xmm3
        movdqu  728(%rsp), %xmm4
        movdqu  744(%rsp), %xmm5
        movups  %xmm0, 136(%rsp)
        movdqu  760(%rsp), %xmm6
        movups  %xmm1, 152(%rsp)
        movdqu  776(%rsp), %xmm7
        movdqu  792(%rsp), %xmm0
        movups  %xmm2, 168(%rsp)
        movdqu  808(%rsp), %xmm1
        movdqu  824(%rsp), %xmm2
        movups  %xmm3, 184(%rsp)
        movdqu  840(%rsp), %xmm3
        movups  %xmm4, 200(%rsp)
        movdqu  856(%rsp), %xmm4
        movups  %xmm5, 216(%rsp)
        movdqu  872(%rsp), %xmm5
        movups  %xmm6, 232(%rsp)
        movdqu  888(%rsp), %xmm6
        movups  %xmm7, 248(%rsp)
        movdqu  904(%rsp), %xmm7
        movups  %xmm0, 264(%rsp)
        movups  %xmm1, 280(%rsp)
        movups  %xmm2, 296(%rsp)
        movups  %xmm3, 312(%rsp)
        movups  %xmm4, 328(%rsp)
        movups  %xmm5, 344(%rsp)
        movups  %xmm6, 360(%rsp)
        movups  %xmm7, 376(%rsp)
        .p2align 4,,10
        .p2align 3
.L15:
        movl    (%rax), %ecx
        cmpl    %ecx, (%rdx)
        jl      .L16
        jg      .L17
        addq    $4, %rax
        leaq    392(%rsp), %rsi
        addq    $4, %rdx
        cmpq    %rsi, %rax
        jne     .L15
.L17:
        xorl    %eax, %eax
        addq    $400, %rsp
        .cfi_remember_state
        .cfi_def_cfa_offset 8
        ret
        .p2align 4,,10
        .p2align 3
.L16:
        .cfi_restore_state
        movl    $1, %eax
        addq    $400, %rsp
        .cfi_def_cfa_offset 8
        ret
        .cfi_endproc
.LFE909:
        .size   _Z5cmp_ySt5arrayIiLm64EES0_, .-_Z5cmp_ySt5arrayIiLm64EES0_
        .ident  "GCC: (GNU) 10.1.1 20200507 (Red Hat 10.1.1-1)"
        .section        .note.GNU-stack,"",@progbits
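For what it's worth, a by-reference variant should sidestep the copies entirely, since a merged wrapper then only has to forward two pointers instead of 2 x 256 bytes of stack. A sketch of mine (cmp_ref is a name I made up; not part of the benchmarks above):

/** by-reference variant (untested sketch) ********/

#include <algorithm>
#include <array>

using cmp = std::array<int,64>;

// With const references there are no by-value arguments for a wrapper to
// copy, so a tail call (or an alias) should always be possible.
bool cmp_ref(const cmp& l, const cmp& r) noexcept
{
    return std::lexicographical_compare(begin(l), end(l),
                                        begin(r), end(r));
}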
It's IPA ICF that makes the difference. My guess is the "thunk" isn't really a thunk: it copies the parameters by value (a failed tail call?).
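If so, disabling the pass should give both functions the short looping body again. Something like this (untested here; -fno-ipa-icf is the standard switch for that pass):

> g++ -std=c++11 -O2 -fno-ipa-icf compare_differing_codegen.cpp -S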
GCC 9 correctly applies tail calling.
GCC 10.2 is released, adjusting target milestone.
This looks like missed memory copy propagation to me. We inline the ICF'ed function back, but for some reason we end up with all those extra moves, so it does not seem to be a problem with a missed tail call.

IPA function summary for bool cmp_y(cmp, cmp)/767 inlinable
  global time:     92.095312
  self size:       13
  global size:     14
  min size:        11
  self stack:      0
  global stack:    0
    size:11.000000, time:90.095312
    size:3.000000, time:2.000000,  executed if:(not inlined)
  calls:
    bool cmp_x(cmp, cmp)/804 inlined
      freq:1.00
      Stack frame offset 0, callee self size 0
      __lexicographical_compare_impl.isra/803 inlined
        freq:1.00
        Stack frame offset 0, callee self size 0

Funny thing is that the inliner seems to believe it is going to reduce code size:

Considering bool cmp_x(cmp, cmp)/766 with 10 size
 to be inlined into bool cmp_y(cmp, cmp)/767 in unknown:0
 Estimated badness is -inf, frequency 1.00.
    Badness calculation for bool cmp_y(cmp, cmp)/767 -> bool cmp_x(cmp, cmp)/766
      size growth -3, time 16.000000 unspec 18.000000
      big_speedup
      -inf: Growth -3 <= 0
      Adjusted by hints -inf

The body is:

bool cmp_y (struct cmp l, struct cmp r)
{
  int * __first1;
  int * __first2;
  struct cmp l;
  struct cmp r;
  int _8;
  int _9;
  bool _17;

  <bb 2> [local count: 1073741824]:
  l = l;
  r = r;
  goto <bb 5>; [100.00%]

  <bb 3> [local count: 9416790681]:
  if (_8 > _9)
    goto <bb 6>; [3.66%]
  else
    goto <bb 4>; [96.34%]

  <bb 4> [local count: 9072136140]:
  __first1_11 = __first1_21 + 4;
  __first2_13 = __first2_2 + 4;
  if (&MEM <struct cmp> [(void *)&r + 256B] != __first2_13)
    goto <bb 5>; [95.91%]
  else
    goto <bb 6>; [4.09%]

  <bb 5> [local count: 9774538809]:
  # __first1_21 = PHI <__first1_11(4), &l._M_elems(2)>
  # __first2_2 = PHI <__first2_13(4), &r._M_elems(2)>
  _8 = MEM[(int *)__first1_21];
  _9 = MEM[(int *)__first2_2];
  if (_8 < _9)
    goto <bb 6>; [3.66%]
  else
    goto <bb 3>; [96.34%]

  <bb 6> [local count: 1073741824]:
  # _17 = PHI <0(3), 1(5), 0(4)>
  l ={v} {CLOBBER};
  r ={v} {CLOBBER};
  return _17;
}

Richi, in any case, we may want to avoid creating wrappers for functions with very large parameters?
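In C++ terms, the inlined body above behaves roughly like this paraphrase of mine (l_copy/r_copy are names I introduced): the GIMPLE's "l = l;" and "r = r;" are 256-byte copies into fresh stack slots, which is exactly the movdqu/movups sequence in the codegen from comment 1.

#include <algorithm>
#include <array>

using cmp = std::array<int,64>;

bool cmp_y(cmp l, cmp r) noexcept
{
    // The ICF-merged callee is inlined back, but its by-value parameters
    // survive as locals initialized from cmp_y's own parameters:
    cmp l_copy = l;   // corresponds to "l = l;" in the dump
    cmp r_copy = r;   // corresponds to "r = r;" in the dump
    return std::lexicographical_compare(begin(l_copy), end(l_copy),
                                        begin(r_copy), end(r_copy));
}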
Thinking of it, perhaps the inliner could also take a hint that it is inlining a tail call and not produce the unnecessary copy of the function parameter passed by value. More generally, mod/ref has a good chance to determine that the parameter in its original location is not modified by the call, so we could avoid the copy even for non-tail calls. Still, it would be interesting to know why copy propagation gives up.
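To make the mod/ref point concrete, a hypothetical illustration (not from this report): the copy is only semantically required when the callee actually writes to its by-value parameter.

#include <array>

using cmp = std::array<int,64>;

// Reads only: a caller forwarding its own by-value argument in tail
// position could let the callee reuse the original stack slot.
bool reads_only(cmp a) noexcept { return a[0] < a[63]; }

// Writes: here the copy matters for a non-tail call, since the caller
// may still inspect its own argument object after the call returns.
bool writes(cmp a) noexcept { a[0] = 1; return a[0] < a[63]; }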
GCC 10.3 is being released, retargeting bugs to GCC 10.4.
*** Bug 101474 has been marked as a duplicate of this bug. ***
GCC 10.4 is being released, retargeting bugs to GCC 10.5.
GCC 10 branch is being closed.