[wide-int] More optimisations
- From: Richard Sandiford <rdsandiford at googlemail dot com>
- To: gcc-patches at gcc dot gnu dot org
- Cc: zadeck at naturalbridge dot com, mikestump at comcast dot net, rguenther at suse dot de
- Date: Sun, 27 Oct 2013 14:49:08 +0000
- Subject: [wide-int] More optimisations
This patch adds some more optimisations to the wi:: comparison functions.
It uses the:
#define CONSTANT(X) (__builtin_constant_p (X) && (X))
idiom that was mentioned before, except that I thought CONSTANT would be
too easily confused with CONSTANT_P, so I went for CAN_TELL instead.
Better names welcome.
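For illustration, the effect of the idiom is roughly this (the helper
below is made up for the example and is not part of the patch):

#define CAN_TELL(X) (__builtin_constant_p (X) && (X))

static inline int
clamp_nonnegative (int x)
{
  /* Once this is inlined with optimisation, __builtin_constant_p (x >= 0)
     evaluates to 1 when the caller passes a literal, so the whole
     condition folds and the fast path is taken with no runtime test.
     For runtime values the condition folds to 0 and we fall through
     to the general test, so semantics never change.  */
  if (CAN_TELL (x >= 0))
    return x;
  return x < 0 ? 0 : x;
}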
The changes are:
- Add a fast path to eq_p for when one of the inputs isn't sign-extended.
This includes code to handle compile-time 0 specially.
- Add the opposite optimisation to Mike's lts_p change, if we can tell at
compile time that it applies.
- Add fast paths to ltu_p for constants.
E.g.:
bool
f1 (const_tree x)
{
return wi::eq_p (x, 0);
}
now gives:
xorl %eax, %eax
cmpw $1, 4(%rdi)
je .L5
rep ret
.p2align 4,,10
.p2align 3
.L5:
cmpq $0, 16(%rdi)
sete %al
ret
bool
f2 (const_tree x, HOST_WIDE_INT y)
{
return wi::eq_p (x, y);
}
gives:
movq 8(%rdi), %rax
movzwl 52(%rax), %edx
xorl %eax, %eax
andw $1023, %dx
cmpw $1, 4(%rdi)
je .L10
rep ret
.p2align 4,,10
.p2align 3
.L10:
xorq 16(%rdi), %rsi
movzwl %dx, %edx
movl $64, %ecx
subl %edx, %ecx
movq %rsi, %rax
salq %cl, %rax
testl %ecx, %ecx
cmovg %rax, %rsi
testq %rsi, %rsi
sete %al
ret
bool
f3 (HOST_WIDE_INT x, const_tree y)
{
return wi::lts_p (x, y);
}
is similarly ugly because of the way that it ignores TYPE_SIGN and so has
to explicitly sign-extend "small-prec" cases:
movq 8(%rsi), %rax
movzwl 4(%rsi), %ecx
movzwl 52(%rax), %edx
andl $1023, %edx
cmpl $1, %ecx
je .L16
leal -1(%rcx), %eax
sall $6, %ecx
subl %edx, %ecx
movq 16(%rsi,%rax,8), %rax
movq %rax, %rdx
salq %cl, %rdx
testl %ecx, %ecx
cmovg %rdx, %rax
sarq $63, %rax
addl $1, %eax
ret
.p2align 4,,10
.p2align 3
.L16:
cmpl $63, %edx
movq 16(%rsi), %rax
ja .L13
movb $64, %cl
subl %edx, %ecx
salq %cl, %rax
sarq %cl, %rax
.L13:
cmpq %rdi, %rax
setg %al
ret
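The salq/sarq pair in the .L16 arm is just the usual shift-pair sign
extension of a sub-HWI value.  As a sketch (not code from the patch,
and with an invented function name), it is equivalent to:

static inline HOST_WIDE_INT
sext_hwi_sketch (unsigned HOST_WIDE_INT val, unsigned int prec)
{
  int shift = HOST_BITS_PER_WIDE_INT - prec;
  if (shift <= 0)
    return (HOST_WIDE_INT) val;
  /* Move the significant bits to the top, then arithmetic-shift them
     back down so the sign bit fills the excess bits.  */
  return (HOST_WIDE_INT) (val << shift) >> shift;
}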
but:
bool
f4 (HOST_WIDE_INT x, const_tree y)
{
return wi::lts_p (x, wi::to_widest (y));
}
is a bit more respectable:
movzwl 6(%rsi), %eax
cmpl $1, %eax
je .L20
subl $1, %eax
movq 16(%rsi,%rax,8), %rax
sarq $63, %rax
addl $1, %eax
ret
.p2align 4,,10
.p2align 3
.L20:
cmpq %rdi, 16(%rsi)
setg %al
ret
For similar reasons:
bool
f5 (const_tree x)
{
return wi::ltu_p (x, 100);
}
gives:
movq 8(%rdi), %rax
movzwl 52(%rax), %ecx
xorl %eax, %eax
andw $1023, %cx
cmpw $1, 4(%rdi)
je .L26
rep ret
.p2align 4,,10
.p2align 3
.L26:
cmpw $63, %cx
ja .L23
movl $1, %eax
salq %cl, %rax
subq $1, %rax
andq 16(%rdi), %rax
.L24:
cmpq $99, %rax
setbe %al
ret
.p2align 4,,10
.p2align 3
.L23:
movq 16(%rdi), %rax
jmp .L24
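Here the .L26 arm is the matching zero extension: it masks off the
excess bits before the unsigned comparison.  As a sketch (again not
code from the patch, with an invented function name):

static inline unsigned HOST_WIDE_INT
zext_hwi_sketch (unsigned HOST_WIDE_INT val, unsigned int prec)
{
  if (prec >= HOST_BITS_PER_WIDE_INT)
    return val;
  /* Keep only the low PREC bits; the excess bits must read as zero
     before the unsigned comparison against 99.  */
  return val & (((unsigned HOST_WIDE_INT) 1 << prec) - 1);
}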
but:
bool
f6 (const_tree x)
{
return wi::ltu_p (wi::to_widest (x), 100);
}
gives:
xorl %eax, %eax
cmpw $1, 6(%rdi)
je .L30
rep ret
.p2align 4,,10
.p2align 3
.L30:
cmpq $99, 16(%rdi)
setbe %al
ret
Tested on powerpc64-linux-gnu and x86_64-linux-gnu. OK for wide-int?
Thanks,
Richard
Index: gcc/system.h
===================================================================
--- gcc/system.h 2013-10-27 14:25:19.144723977 +0000
+++ gcc/system.h 2013-10-27 14:25:20.716738045 +0000
@@ -711,6 +711,12 @@ #define gcc_unreachable() __builtin_unre
#define gcc_unreachable() (fancy_abort (__FILE__, __LINE__, __FUNCTION__))
#endif
+#if GCC_VERSION >= 3001
+#define CAN_TELL(X) (__builtin_constant_p (X) && (X))
+#else
+#define CAN_TELL(X) (false && (X))
+#endif
+
/* Until we can use STATIC_ASSERT. */
#define STATIC_ASSERT(X) \
typedef int assertion1[(X) ? 1 : -1] ATTRIBUTE_UNUSED
Index: gcc/wide-int.h
===================================================================
--- gcc/wide-int.h 2013-10-27 14:25:19.144723977 +0000
+++ gcc/wide-int.h 2013-10-27 14:37:34.834443832 +0000
@@ -1495,6 +1495,7 @@ wi::eq_p (const T1 &x, const T2 &y)
WIDE_INT_REF_FOR (T2) yi (y, precision);
if (xi.is_sign_extended && yi.is_sign_extended)
{
+ /* This case reduces to array equality. */
if (xi.len != yi.len)
return false;
unsigned int i = 0;
@@ -1504,10 +1505,21 @@ wi::eq_p (const T1 &x, const T2 &y)
while (++i != xi.len);
return true;
}
- if (precision <= HOST_BITS_PER_WIDE_INT)
+ if (yi.len == 1)
{
- unsigned HOST_WIDE_INT diff = xi.ulow () ^ yi.ulow ();
- return (diff << (-precision % HOST_BITS_PER_WIDE_INT)) == 0;
+ /* XI is only equal to YI if it too has a single HWI. */
+ if (xi.len != 1)
+ return false;
+ /* Excess bits in xi.val[0] will be signs or zeros, so comparisons
+ with 0 are simple. */
+ if (CAN_TELL (yi.val[0] == 0))
+ return xi.val[0] == 0;
+ /* Otherwise flush out any excess bits first. */
+ unsigned HOST_WIDE_INT diff = xi.val[0] ^ yi.val[0];
+ int excess = HOST_BITS_PER_WIDE_INT - precision;
+ if (excess > 0)
+ diff <<= excess;
+ return diff == 0;
}
return eq_p_large (xi.val, xi.len, yi.val, yi.len, precision);
}
@@ -1528,20 +1540,28 @@ wi::lts_p (const T1 &x, const T2 &y)
unsigned int precision = get_binary_precision (x, y);
WIDE_INT_REF_FOR (T1) xi (x, precision);
WIDE_INT_REF_FOR (T2) yi (y, precision);
- // We optimize x < y, where y is 64 or fewer bits.
+ /* We optimize x < y, where y is 64 or fewer bits. */
if (wi::fits_shwi_p (yi))
{
- // If x fits directly into a shwi, we can compare directly.
+ /* Make lts_p (x, 0) as efficient as wi::neg_p (x). */
+ if (CAN_TELL (yi.val[0] == 0))
+ return neg_p (xi);
+ /* If x fits directly into a shwi, we can compare directly. */
if (wi::fits_shwi_p (xi))
return xi.to_shwi () < yi.to_shwi ();
- // If x doesn't fit and is negative, then it must be more
- // negative than any value in y, and hence smaller than y.
- if (neg_p (xi, SIGNED))
+ /* If x doesn't fit and is negative, then it must be more
+ negative than any value in y, and hence smaller than y. */
+ if (neg_p (xi))
return true;
- // If x is positive, then it must be larger than any value in y,
- // and hence greater than y.
+ /* If x is positive, then it must be larger than any value in y,
+ and hence greater than y. */
return false;
}
+ /* Optimize the opposite case, if it can be detected at compile time. */
+ if (CAN_TELL (xi.len == 1))
+ /* If YI is negative it is lower than the least HWI.
+ If YI is positive it is greater than the greatest HWI. */
+ return !neg_p (yi);
return lts_p_large (xi.val, xi.len, precision, yi.val, yi.len);
}
@@ -1553,6 +1573,12 @@ wi::ltu_p (const T1 &x, const T2 &y)
unsigned int precision = get_binary_precision (x, y);
WIDE_INT_REF_FOR (T1) xi (x, precision);
WIDE_INT_REF_FOR (T2) yi (y, precision);
+ /* Optimize comparisons with constants and with sub-HWI unsigned
+ integers. */
+ if (CAN_TELL (yi.len == 1 && yi.val[0] >= 0))
+ return xi.len == 1 && xi.to_uhwi () < (unsigned HOST_WIDE_INT) yi.val[0];
+ if (CAN_TELL (xi.len == 1 && xi.val[0] >= 0))
+ return yi.len != 1 || yi.to_uhwi () > (unsigned HOST_WIDE_INT) xi.val[0];
if (precision <= HOST_BITS_PER_WIDE_INT)
{
unsigned HOST_WIDE_INT xl = xi.to_uhwi ();