Re: Fix inchash handling of wide_ints (PR91242)


Richard Sandiford <richard.sandiford@arm.com> writes:
> Richard Biener <richard.guenther@gmail.com> writes:
>> On Mon, Jul 29, 2019 at 11:11 AM Richard Sandiford
>> <richard.sandiford@arm.com> wrote:
>>>
>>> inchash::hash::add_wide_int operated directly on the raw encoding
>>> of the wide_int, including any redundant upper bits.  The problem
>>> with that is that the upper bits are only defined for some wide-int
>>> storage types (including wide_int itself).  wi::to_wide(tree) instead
>>> returns a value that is extended according to the signedness of the
>>> type (so that wi::to_widest can use the same encoding) while rtxes
>>> have the awkward special case of BImode, which can be zero-extended
>>> rather than sign-extended.
>>>
>>> In the PR, we computed a hash for a "normal" sign-extended wide_int
>>> while the existing entries hashed wi::to_wide(tree).  This gives
>>> different results for unsigned types that have the top bit set.
>>>
>>> The patch fixes that by hashing the canonical sign-extended form even
>>> if the raw encoding happens to be different.
>>>
>>> Tested on aarch64-linux-gnu (with and without SVE), armeb-eabi
>>> and x86_64-linux-gnu.  OK to install?
>>
>> But only the most significant elt needs this treatment, no?  So
>> we can save some compile-time in the hashing by not doing
>> it for all elts.
>
> Yeah, we only need it for the most significant elt.  But the vast
> majority of constants need 64 bits or fewer of encoding anyway
> (even if the precision is greater than 64 bits), so that's usually the
> only one we need to process.  I'm not convinced handling multiple elts
> specially would pay off.
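
For reference, the version in the patch hashes every element through
sext_elt, so that the canonical sign-extended form is hashed whatever
the raw encoding of the storage type happens to be.  It is roughly
(a sketch, not the exact committed code):

  /* Add wide_int-based value X (patched version, sketch).  */
  template<typename T>
  void add_wide_int (const generic_wide_int<T> &x)
  {
    add_int (x.get_len ());
    for (unsigned i = 0; i < x.get_len (); i++)
      add_hwi (x.sext_elt (i));
  }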

To go into a bit more detail, the code for:

void
f1 (inchash::hash &h, const wide_int &x)
{
  h.add_wide_int (x);
}

doesn't change with the patch, because is_sign_extended is true for
wide_int.  The code on x86_64 from a bootstrap compiler is:

------------------------------------------------------------------------
	.cfi_startproc
	// Hash the length.
	movl	24(%rsi), %edx
	movl	(%rdi), %ecx
	movl	$-1640531527, %r8d
	subl	%edx, %r8d
	subl	%ecx, %edx
	movl	%r8d, %eax
	movl	%ecx, %r8d
	subl	%ecx, %eax
	shrl	$13, %r8d
	xorl	%eax, %r8d
	movl	%edx, %eax
	movl	%r8d, %edx
	subl	%r8d, %eax
	subl	%r8d, %ecx
	sall	$8, %edx
	xorl	%eax, %edx
	movl	%ecx, %eax
	movl	%edx, %ecx
	subl	%edx, %eax
	subl	%edx, %r8d
	shrl	$13, %ecx
	xorl	%eax, %ecx
	movl	%r8d, %eax
	movl	%ecx, %r8d
	subl	%ecx, %eax
	subl	%ecx, %edx
	shrl	$12, %r8d
	xorl	%eax, %r8d
	movl	%edx, %eax
	movl	%r8d, %edx
	subl	%r8d, %eax
	subl	%r8d, %ecx
	sall	$16, %edx
	xorl	%eax, %edx
	movl	%ecx, %eax
	movl	%edx, %ecx
	subl	%edx, %eax
	subl	%edx, %r8d
	shrl	$5, %ecx
	xorl	%eax, %ecx
	movl	%ecx, %eax
	subl	%ecx, %r8d
	subl	%ecx, %edx
	shrl	$3, %eax
	xorl	%eax, %r8d
	movl	%edx, %eax
	movl	%r8d, %edx
	subl	%r8d, %eax
	sall	$10, %edx
	xorl	%eax, %edx
	movl	%ecx, %eax
	subl	%r8d, %eax
	xorl	%r8d, %r8d
	subl	%edx, %eax
	shrl	$15, %edx
	xorl	%edx, %eax
	movl	%eax, (%rdi)
	// Hash the elements.
	movl	24(%rsi), %edx
	testl	%edx, %edx
	je	.L444
	.p2align 4,,10
	.p2align 3
.L446:
	movl	%r8d, %edx
	movq	(%rsi,%rdx,8), %r9
	movq	%r9, %rdx
	subl	%eax, %r9d
	sarq	$32, %rdx
	movl	%r9d, %ecx
	movl	%eax, %r9d
	subl	%edx, %ecx
	shrl	$13, %r9d
	subl	%eax, %edx
	xorl	%ecx, %r9d
	movl	%edx, %ecx
	movl	%r9d, %edx
	subl	%r9d, %ecx
	subl	%r9d, %eax
	sall	$8, %edx
	xorl	%ecx, %edx
	movl	%edx, %ecx
	subl	%edx, %eax
	subl	%edx, %r9d
	shrl	$13, %ecx
	xorl	%ecx, %eax
	movl	%r9d, %ecx
	movl	%eax, %r9d
	subl	%eax, %ecx
	subl	%eax, %edx
	shrl	$12, %r9d
	xorl	%ecx, %r9d
	movl	%edx, %ecx
	movl	%r9d, %edx
	subl	%r9d, %ecx
	subl	%r9d, %eax
	sall	$16, %edx
	xorl	%ecx, %edx
	movl	%edx, %ecx
	subl	%edx, %eax
	subl	%edx, %r9d
	shrl	$5, %ecx
	xorl	%eax, %ecx
	movl	%ecx, %eax
	subl	%ecx, %r9d
	subl	%ecx, %edx
	shrl	$3, %eax
	xorl	%eax, %r9d
	movl	%edx, %eax
	movl	%r9d, %edx
	subl	%r9d, %eax
	sall	$10, %edx
	xorl	%eax, %edx
	movl	%ecx, %eax
	addl	$1, %r8d
	subl	%r9d, %eax
	subl	%edx, %eax
	shrl	$15, %edx
	xorl	%edx, %eax
	movl	%eax, (%rdi)
	cmpl	%r8d, 24(%rsi)
	ja	.L446
.L444:
	ret
------------------------------------------------------------------------
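
The reason f1 doesn't change is that sext_elt only has to do extra
work for storage types that aren't guaranteed to be in canonical
sign-extended form.  Its shape is roughly (again a sketch, not the
exact wide-int.h source):

  /* Return element I of the canonical sign-extended form of the value.
     is_sign_extended is a compile-time property of the storage type,
     so for wide_int the extension code folds away completely.  */
  HOST_WIDE_INT
  sext_elt (unsigned int i) const
  {
    HOST_WIDE_INT res = elt (i);
    if (!is_sign_extended)
      {
        unsigned int lsb = i * HOST_BITS_PER_WIDE_INT;
        if (get_precision () - lsb < HOST_BITS_PER_WIDE_INT)
          /* Sign-extend RES from the top bit of the precision.  */
          res = sext_hwi (res, get_precision () - lsb);
      }
    return res;
  }

For wide_int_ref the flag is false, which is where the salq/sarq pair
before the mixing code in f2 below comes from.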

The code for:

void
f2 (inchash::hash &h, const wide_int_ref &x)
{
  h.add_wide_int (x);
}

does change, and becomes:

------------------------------------------------------------------------
	.cfi_startproc
	// Hash the length.
	movl	8(%rsi), %ecx
	movl	(%rdi), %edx
	movl	$-1640531527, %r8d
	subl	%ecx, %r8d
	subl	%edx, %ecx
	movl	%r8d, %eax
	movl	%edx, %r8d
	subl	%edx, %eax
	shrl	$13, %r8d
	xorl	%eax, %r8d
	movl	%ecx, %eax
	movl	%r8d, %ecx
	subl	%r8d, %eax
	subl	%r8d, %edx
	sall	$8, %ecx
	xorl	%eax, %ecx
	movl	%edx, %eax
	movl	%ecx, %edx
	subl	%ecx, %eax
	subl	%ecx, %r8d
	shrl	$13, %edx
	xorl	%eax, %edx
	movl	%r8d, %eax
	movl	%edx, %r8d
	subl	%edx, %eax
	subl	%edx, %ecx
	shrl	$12, %r8d
	xorl	%eax, %r8d
	movl	%ecx, %eax
	movl	%r8d, %ecx
	subl	%r8d, %eax
	subl	%r8d, %edx
	sall	$16, %ecx
	xorl	%eax, %ecx
	movl	%edx, %eax
	movl	%ecx, %edx
	subl	%ecx, %eax
	subl	%ecx, %r8d
	shrl	$5, %edx
	xorl	%eax, %edx
	movl	%edx, %eax
	subl	%edx, %r8d
	subl	%edx, %ecx
	shrl	$3, %eax
	xorl	%eax, %r8d
	movl	%r8d, %eax
	subl	%r8d, %ecx
	sall	$10, %eax
	xorl	%eax, %ecx
	subl	%r8d, %edx
	subl	%ecx, %edx
	shrl	$15, %ecx
	movl	%ecx, %eax
	xorl	%edx, %eax
	movl	%eax, (%rdi)
	// Hash the elements.
	movl	8(%rsi), %edx
	testl	%edx, %edx
	je	.L457
	pushq	%rbx
	.cfi_def_cfa_offset 16
	.cfi_offset 3, -16
	movq	(%rsi), %r11
	xorl	%r9d, %r9d
	movl	$64, %r10d
	.p2align 4,,10
	.p2align 3
.L453:
	movl	%r9d, %edx
	movl	12(%rsi), %ecx
	movq	(%r11,%rdx,8), %r8
	movl	%r9d, %edx
	sall	$6, %edx
	movl	%ecx, %ebx
	subl	%edx, %ebx
	cmpl	$63, %ebx
	ja	.L452
	movl	%r10d, %ebx
	subl	%ecx, %ebx
	movl	%ebx, %ecx
	addl	%edx, %ecx
	salq	%cl, %r8
	sarq	%cl, %r8
.L452:
	movq	%r8, %rcx
	movl	%eax, %edx
	subl	%eax, %r8d
	sarq	$32, %rcx
	shrl	$13, %edx
	subl	%ecx, %r8d
	subl	%eax, %ecx
	xorl	%edx, %r8d
	movl	%r8d, %edx
	subl	%r8d, %ecx
	subl	%r8d, %eax
	sall	$8, %edx
	xorl	%edx, %ecx
	movl	%ecx, %edx
	subl	%ecx, %eax
	subl	%ecx, %r8d
	shrl	$13, %edx
	xorl	%edx, %eax
	movl	%eax, %edx
	subl	%eax, %r8d
	subl	%eax, %ecx
	shrl	$12, %edx
	xorl	%edx, %r8d
	movl	%r8d, %edx
	subl	%r8d, %ecx
	subl	%r8d, %eax
	sall	$16, %edx
	xorl	%edx, %ecx
	movl	%ecx, %edx
	subl	%ecx, %eax
	subl	%ecx, %r8d
	shrl	$5, %edx
	xorl	%eax, %edx
	movl	%edx, %eax
	subl	%edx, %r8d
	subl	%edx, %ecx
	shrl	$3, %eax
	xorl	%eax, %r8d
	movl	%r8d, %eax
	subl	%r8d, %ecx
	sall	$10, %eax
	xorl	%eax, %ecx
	subl	%r8d, %edx
	addl	$1, %r9d
	subl	%ecx, %edx
	shrl	$15, %ecx
	movl	%ecx, %eax
	xorl	%edx, %eax
	movl	%eax, (%rdi)
	cmpl	%r9d, 8(%rsi)
	ja	.L453
	popq	%rbx
	.cfi_def_cfa_offset 8
	ret
------------------------------------------------------------------------

If I instead change the function to something like:

  /* Add wide_int-based value X.  */
  template<typename T>
  void add_wide_int (const generic_wide_int<T> &x)
  {
    add_int (x.get_len ());
    for (unsigned i = 1; i < x.get_len (); i++)
      add_hwi (x.elt (i - 1));
    add_hwi (x.sext_elt (x.get_len () - 1));
  }

then the function becomes much bigger for both f1 and f2.  The f1
version is:

------------------------------------------------------------------------
	.cfi_startproc
	// Hash the length.
	movl	24(%rsi), %ecx
	movl	(%rdi), %edx
	movl	$-1640531527, %r8d
	subl	%ecx, %r8d
	subl	%edx, %ecx
	movl	%r8d, %eax
	movl	%edx, %r8d
	subl	%edx, %eax
	shrl	$13, %r8d
	xorl	%eax, %r8d
	movl	%ecx, %eax
	movl	%r8d, %ecx
	subl	%r8d, %eax
	subl	%r8d, %edx
	sall	$8, %ecx
	xorl	%eax, %ecx
	movl	%edx, %eax
	movl	%ecx, %edx
	subl	%ecx, %eax
	subl	%ecx, %r8d
	shrl	$13, %edx
	xorl	%eax, %edx
	movl	%r8d, %eax
	movl	%edx, %r8d
	subl	%edx, %eax
	subl	%edx, %ecx
	shrl	$12, %r8d
	xorl	%eax, %r8d
	movl	%ecx, %eax
	movl	%r8d, %ecx
	subl	%r8d, %eax
	subl	%r8d, %edx
	sall	$16, %ecx
	xorl	%eax, %ecx
	movl	%edx, %eax
	movl	%ecx, %edx
	subl	%ecx, %eax
	subl	%ecx, %r8d
	shrl	$5, %edx
	xorl	%eax, %edx
	movl	%edx, %eax
	subl	%edx, %r8d
	subl	%edx, %ecx
	shrl	$3, %eax
	xorl	%eax, %r8d
	movl	%r8d, %eax
	subl	%r8d, %ecx
	sall	$10, %eax
	xorl	%eax, %ecx
	subl	%r8d, %edx
	subl	%ecx, %edx
	shrl	$15, %ecx
	movl	%ecx, %eax
	xorl	%edx, %eax
	movl	%eax, (%rdi)
	// Hash the elements using elt().
	movl	24(%rsi), %edx
	cmpl	$1, %edx
	jbe	.L445
	xorl	%r9d, %r9d
	jmp	.L448
	.p2align 4,,10
	.p2align 3
.L454:
	subl	$1, %edx
	movq	(%rsi,%rdx,8), %r8
	sarq	$63, %r8
.L447:
	movq	%r8, %rcx
	subl	%eax, %r8d
	sarq	$32, %rcx
	movl	%r8d, %edx
	movl	%eax, %r8d
	subl	%ecx, %edx
	shrl	$13, %r8d
	subl	%eax, %ecx
	xorl	%edx, %r8d
	movl	%ecx, %edx
	movl	%r8d, %ecx
	subl	%r8d, %edx
	subl	%r8d, %eax
	sall	$8, %ecx
	xorl	%edx, %ecx
	subl	%ecx, %eax
	subl	%ecx, %r8d
	movl	%eax, %edx
	movl	%ecx, %eax
	shrl	$13, %eax
	xorl	%edx, %eax
	movl	%r8d, %edx
	movl	%eax, %r8d
	subl	%eax, %edx
	subl	%eax, %ecx
	shrl	$12, %r8d
	xorl	%edx, %r8d
	subl	%r8d, %ecx
	subl	%r8d, %eax
	movl	%ecx, %edx
	movl	%r8d, %ecx
	sall	$16, %ecx
	xorl	%edx, %ecx
	movl	%ecx, %edx
	subl	%ecx, %eax
	subl	%ecx, %r8d
	shrl	$5, %edx
	xorl	%eax, %edx
	movl	%edx, %eax
	subl	%edx, %r8d
	subl	%edx, %ecx
	shrl	$3, %eax
	xorl	%eax, %r8d
	movl	%r8d, %eax
	subl	%r8d, %ecx
	sall	$10, %eax
	xorl	%eax, %ecx
	subl	%r8d, %edx
	addq	$1, %r9
	subl	%ecx, %edx
	shrl	$15, %ecx
	movl	%ecx, %eax
	leal	1(%r9), %ecx
	xorl	%edx, %eax
	movl	%eax, (%rdi)
	movl	24(%rsi), %edx
	cmpl	%ecx, %edx
	jbe	.L445
.L448:
	cmpl	%r9d, %edx
	jbe	.L454
	movq	(%rsi,%r9,8), %r8
	jmp	.L447
	.p2align 4,,10
	.p2align 3
.L445:
	// Hash the final element using sext_elt().
	addl	$-1, %edx
	jc	.L450
	movabsq	$34359738360, %rdx
	movq	(%rsi,%rdx), %rcx
	sarq	$63, %rcx
.L452:
	movq	%rcx, %rdx
	subl	%eax, %ecx
	sarq	$32, %rdx
	movl	%ecx, %esi
	movl	%eax, %ecx
	subl	%edx, %esi
	shrl	$13, %ecx
	subl	%eax, %edx
	xorl	%esi, %ecx
	movl	%edx, %esi
	movl	%ecx, %edx
	subl	%ecx, %esi
	subl	%ecx, %eax
	sall	$8, %edx
	xorl	%esi, %edx
	movl	%edx, %esi
	subl	%edx, %eax
	subl	%edx, %ecx
	shrl	$13, %esi
	xorl	%esi, %eax
	movl	%ecx, %esi
	movl	%eax, %ecx
	subl	%eax, %esi
	subl	%eax, %edx
	shrl	$12, %ecx
	xorl	%esi, %ecx
	movl	%edx, %esi
	movl	%ecx, %edx
	subl	%ecx, %esi
	subl	%ecx, %eax
	sall	$16, %edx
	xorl	%esi, %edx
	subl	%edx, %eax
	subl	%edx, %ecx
	movl	%eax, %esi
	movl	%edx, %eax
	shrl	$5, %eax
	xorl	%esi, %eax
	movl	%eax, %esi
	subl	%eax, %ecx
	subl	%eax, %edx
	shrl	$3, %esi
	xorl	%esi, %ecx
	movl	%ecx, %esi
	subl	%ecx, %edx
	sall	$10, %esi
	xorl	%esi, %edx
	subl	%ecx, %eax
	subl	%edx, %eax
	shrl	$15, %edx
	xorl	%edx, %eax
	movl	%eax, (%rdi)
	ret
.L450:
	movl	%edx, %edx
	movq	(%rsi,%rdx,8), %rcx
	jmp	.L452
	.cfi_endproc
------------------------------------------------------------------------

That's a lot of extra code. :-) And by splitting the final element out,
we lose the optimisation that avoids the sign_mask path in elt(), so
that we end up with an if-then-else diamond that wasn't there before:

	// Hash the final element using sext_elt().
	addl	$-1, %edx
	jc	.L450
	// sign_mask
	movabsq	$34359738360, %rdx
	movq	(%rsi,%rdx), %rcx
	sarq	$63, %rcx
.L452:
	...
.L450:
	// normal read
	movl	%edx, %edx
	movq	(%rsi,%rdx,8), %rcx
	jmp	.L452

The branch should of course be predictable.  And the branch around
the elt() handling should be predictable if it really is the case
that most wide ints don't need it.  But it's still a lot of code
that isn't marked cold.  (And marking it cold would make the
optimisation pointless, since the optimisation is targetting the
case in which the code is executed.)
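
For completeness, elt () itself is roughly:

  /* Sketch of elt (): elements beyond the encoded length are implicit
     copies of the sign mask.  */
  HOST_WIDE_INT
  elt (unsigned int i) const
  {
    if (i < get_len ())
      return get_val ()[i];   /* normal read */
    return sign_mask ();      /* replicated sign bit */
  }

The two arms of the diamond above correspond to these two returns.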

Thanks,
Richard

