This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH, i386, x86_64]: Expand __builtin_parity() into optimized asm code.
- From: Uros Bizjak <ubizjak at gmail dot com>
- To: GCC Patches <gcc-patches at gcc dot gnu dot org>
- Date: Sun, 11 Feb 2007 17:52:25 +0100
- Subject: [PATCH, i386, x86_64]: Expand __builtin_parity() into optimized asm code.
Hello!
This patch expands __builtin_parity() into optimized i386 asm code.
The processor sets parity (P) flag for all logical operations, but P
flag depends only on 8-bit result in the lowest part of the register.
However, using a sequence of XOR operations, this limitation can be
overcome.
Currently, 64-bit parity is calculated as:
parity:
movq %rdi, %rax
shrq $32, %rax
xorq %rdi, %rax
movq %rax, %rdx
shrq $16, %rdx
xorq %rax, %rdx
movq %rdx, %rax
shrq $8, %rax
xorq %rdx, %rax
movq %rax, %rcx
shrq $4, %rcx
xorq %rax, %rcx
movl $27030, %eax
andl $15, %ecx
sarl %cl, %eax
andl $1, %eax
ret
Using attached patch, the sequence is much shorter:
testll:
movl %edi, %edx
shrq $32, %rdi
xorl %edi, %edx
movl %edx, %eax
shrl $16, %edx
xorl %edx, %eax
xorb %ah, %al
setnp %al
movzbl %al, %eax
ret
The patch also implements a couple of optimizations for memory and
register passing parameters for i386 to produce:
testll:
movl 8(%esp), %edx
xorl 4(%esp), %edx
movl %edx, %eax
shrl $16, %edx
xorl %edx, %eax
xorb %ah, %al
...
and (-mregparm=3):
testll:
xorl %eax, %edx
movl %edx, %ecx
shrl $16, %edx
xorl %edx, %ecx
xorb %ch, %cl
...
Additionally, patch implements post-reload splitting, so following testcase:
int test(unsigned char a)
{
unsigned char x = a + 1;
return __builtin_parity(x);
}
compiles to:
test:
addl $1, %edi
xorl %eax, %eax
testb %dil, %dil
setnp %al
ret
The performance impact can be shown using following testcase
(Core 2 Duo XE, 64bit):
--cut here--
static int test(unsigned int a)
{
return __builtin_parity(a);
}
int main()
{
unsigned int i;
int b = 0;
for (i = 0; i < 0xffffffff; i++)
b += test(i);
printf("%i\n", b);
return 0;
}
--cut here--
Execution time for an unpatched gcc:
user 0m14.537s
Execution time for a patched gcc:
user 0m6.236s
Patch was bootstrapped on x86_64-pc-linux-gnu and regression tested for
all default languages. I'll wait a couple of days for comments or
suggestions before patch is committed to SVN.
2007-02-11 Uros Bizjak <ubizjak@gmail.com>
* config/i386/i386.md (paritydi2, paritysi2): New expanders.
(paritydi2_cmp, paritysi2_cmp): New insn and split patterns.
(*parityhi2_cmp, *parityqi2_cmp): New insn patterns.
testsuite/ChangeLog:
* gcc.target/i386/parity-1.c: New test.
* gcc.target/i386/parity-2.c: New test.
Uros.
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md (revision 121814)
+++ config/i386/i386.md (working copy)
@@ -15103,6 +15103,130 @@
[(set_attr "prefix_rep" "1")
(set_attr "type" "bitmanip")
(set_attr "mode" "HI")])
+
+(define_expand "paritydi2"
+ [(set (match_operand:DI 0 "register_operand" "")
+ (parity:DI (match_operand:DI 1 "nonimmediate_operand" "")))]
+ "! TARGET_POPCNT"
+{
+ rtx scratch = gen_reg_rtx (QImode);
+ rtx cond;
+
+ emit_insn (gen_paritydi2_cmp (NULL_RTX, NULL_RTX,
+ NULL_RTX, operands[1]));
+
+ cond = gen_rtx_fmt_ee (ORDERED, QImode,
+ gen_rtx_REG (CCmode, FLAGS_REG),
+ const0_rtx);
+ emit_insn (gen_rtx_SET (VOIDmode, scratch, cond));
+
+ if (TARGET_64BIT)
+ emit_insn (gen_zero_extendqidi2 (operands[0], scratch));
+ else
+ {
+ rtx tmp = gen_reg_rtx (SImode);
+
+ emit_insn (gen_zero_extendqisi2 (tmp, scratch));
+ emit_insn (gen_zero_extendsidi2 (operands[0], tmp));
+ }
+ DONE;
+})
+
+(define_insn_and_split "paritydi2_cmp"
+ [(set (reg:CC FLAGS_REG)
+ (parity:CC (match_operand:DI 3 "nonimmediate_operand" "0,m")))
+ (clobber (match_scratch:DI 0 "=r,X"))
+ (clobber (match_scratch:SI 1 "=r,r"))
+ (clobber (match_scratch:HI 2 "=Q,Q"))]
+ "! TARGET_POPCNT"
+ "#"
+ "&& reload_completed"
+ [(parallel
+ [(set (match_dup 1)
+ (xor:SI (match_dup 1) (match_dup 4)))
+ (clobber (reg:CC FLAGS_REG))])
+ (parallel
+ [(set (reg:CC FLAGS_REG)
+ (parity:CC (match_dup 1)))
+ (clobber (match_dup 1))
+ (clobber (match_dup 2))])]
+{
+ operands[4] = gen_lowpart (SImode, operands[3]);
+
+ if (MEM_P (operands[3]))
+ emit_move_insn (operands[1], gen_highpart (SImode, operands[3]));
+ else if (! TARGET_64BIT)
+ operands[1] = gen_highpart (SImode, operands[3]);
+ else
+ {
+ emit_move_insn (operands[1], gen_lowpart (SImode, operands[3]));
+ emit_insn (gen_lshrdi3 (operands[3], operands[3], GEN_INT (32)));
+ }
+})
+
+(define_expand "paritysi2"
+ [(set (match_operand:SI 0 "register_operand" "")
+ (parity:SI (match_operand:SI 1 "nonimmediate_operand" "")))]
+ "! TARGET_POPCNT"
+{
+ rtx scratch = gen_reg_rtx (QImode);
+ rtx cond;
+
+ emit_insn (gen_paritysi2_cmp (NULL_RTX, NULL_RTX, operands[1]));
+
+ cond = gen_rtx_fmt_ee (ORDERED, QImode,
+ gen_rtx_REG (CCmode, FLAGS_REG),
+ const0_rtx);
+ emit_insn (gen_rtx_SET (VOIDmode, scratch, cond));
+
+ emit_insn (gen_zero_extendqisi2 (operands[0], scratch));
+ DONE;
+})
+
+(define_insn_and_split "paritysi2_cmp"
+ [(set (reg:CC FLAGS_REG)
+ (parity:CC (match_operand:SI 2 "nonimmediate_operand" "0,m")))
+ (clobber (match_scratch:SI 0 "=r,X"))
+ (clobber (match_scratch:HI 1 "=Q,Q"))]
+ "! TARGET_POPCNT"
+ "#"
+ "&& reload_completed"
+ [(parallel
+ [(set (match_dup 1)
+ (xor:HI (match_dup 1) (match_dup 3)))
+ (clobber (reg:CC FLAGS_REG))])
+ (parallel
+ [(set (reg:CC FLAGS_REG)
+ (parity:CC (match_dup 1)))
+ (clobber (match_dup 1))])]
+{
+ operands[3] = gen_lowpart (HImode, operands[2]);
+
+ if (MEM_P (operands[2]))
+ emit_move_insn (operands[1], gen_highpart (HImode, operands[2]));
+ else
+ {
+ emit_move_insn (operands[1], gen_lowpart (HImode, operands[2]));
+ emit_insn (gen_lshrsi3 (operands[2], operands[2], GEN_INT (16)));
+ }
+})
+
+(define_insn "*parityhi2_cmp"
+ [(set (reg:CC FLAGS_REG)
+ (parity:CC (match_operand:HI 1 "register_operand" "0")))
+ (clobber (match_scratch:HI 0 "=Q"))]
+ "! TARGET_POPCNT"
+ "xor{b}\t{%h0, %b0|%b0, %h0}"
+ [(set_attr "length" "2")
+ (set_attr "mode" "HI")])
+
+(define_insn "*parityqi2_cmp"
+ [(set (reg:CC FLAGS_REG)
+ (parity:CC (match_operand:QI 0 "register_operand" "q")))]
+ "! TARGET_POPCNT"
+ "test{b}\t%0, %0"
+ [(set_attr "length" "2")
+ (set_attr "mode" "QI")])
;; Thread-local storage patterns for ELF.
;;
Index: testsuite/gcc.target/i386/parity-1.c
===================================================================
--- testsuite/gcc.target/i386/parity-1.c (revision 0)
+++ testsuite/gcc.target/i386/parity-1.c (revision 0)
@@ -0,0 +1,9 @@
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler "setnp" } } */
+
+int foo(unsigned int x)
+{
+ return __builtin_parity(x);
+}
+
Index: testsuite/gcc.target/i386/parity-2.c
===================================================================
--- testsuite/gcc.target/i386/parity-2.c (revision 0)
+++ testsuite/gcc.target/i386/parity-2.c (revision 0)
@@ -0,0 +1,9 @@
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler "setnp" } } */
+
+int foo(unsigned long long int x)
+{
+ return __builtin_parityll(x);
+}
+