This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH, i386, x86_64]: Expand __builtin_parity() into optimized asm code.


Hello!

This patch expands __builtin_parity() into optimized i386 asm code.

The processor sets the parity (P) flag for all logical operations, but
the P flag depends only on the 8-bit result in the lowest part of the
register. However, using a sequence of XOR operations, this limitation
can be overridden.

Currently, 64-bit parity is calculated as:

parity:
         movq    %rdi, %rax
         shrq    $32, %rax
         xorq    %rdi, %rax
         movq    %rax, %rdx
         shrq    $16, %rdx
         xorq    %rax, %rdx
         movq    %rdx, %rax
         shrq    $8, %rax
         xorq    %rdx, %rax
         movq    %rax, %rcx
         shrq    $4, %rcx
         xorq    %rax, %rcx
         movl    $27030, %eax
         andl    $15, %ecx
         sarl    %cl, %eax
         andl    $1, %eax
         ret

Using attached patch, the sequence is much shorter:

testll:
         movl    %edi, %edx
         shrq    $32, %rdi
         xorl    %edi, %edx
         movl    %edx, %eax
         shrl    $16, %edx
         xorl    %edx, %eax
         xorb    %ah, %al
         setnp   %al
         movzbl  %al, %eax
         ret

The patch also implements a couple of optimizations for parameters
passed in memory and in registers on i386, to produce:

testll:
         movl    8(%esp), %edx
         xorl    4(%esp), %edx
         movl    %edx, %eax
         shrl    $16, %edx
         xorl    %edx, %eax
         xorb    %ah, %al
         ...

and (-mregparm=3):

testll:
         xorl    %eax, %edx
         movl    %edx, %ecx
         shrl    $16, %edx
         xorl    %edx, %ecx
         xorb    %ch, %cl
         ...

Additionally, the patch implements post-reload splitting, so the following testcase:

int test(unsigned char a)
{
   unsigned char x = a + 1;
   return __builtin_parity(x);
}

compiles to:

test:
         addl    $1, %edi
         xorl    %eax, %eax
         testb   %dil, %dil
         setnp   %al
         ret

The performance impact can be shown using the following testcase
(Core 2 Duo XE, 64-bit):

--cut here--
static int test(unsigned int a)
{
   return __builtin_parity(a);
}

int main()
{
   unsigned int i;
   int b = 0;

   for (i = 0; i < 0xffffffff; i++)
     b += test(i);

   printf("%i\n", b);
   return 0;
}
--cut here--

Execution time for an unpatched gcc:

user 0m14.537s

Execution time for a patched gcc:

user 0m6.236s

The patch was bootstrapped on x86_64-pc-linux-gnu and regression tested
for all default languages. I'll wait a couple of days for comments or
suggestions before the patch is committed to SVN.

2007-02-11 Uros Bizjak <ubizjak@gmail.com>

         * config/i386/i386.md (paritydi2, paritysi2): New expanders.
         (paritydi2_cmp, paritysi2_cmp): New insn and split patterns.
         (*parityhi2_cmp, *parityqi2_cmp): New insn patterns.

testsuite/ChangeLog:

         * gcc.target/i386/parity-1.c: New test.
         * gcc.target/i386/parity-2.c: New test.

Uros.

Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md	(revision 121814)
+++ config/i386/i386.md	(working copy)
@@ -15103,6 +15103,130 @@
   [(set_attr "prefix_rep" "1")
    (set_attr "type" "bitmanip")
    (set_attr "mode" "HI")])
+
+(define_expand "paritydi2"
+  [(set (match_operand:DI 0 "register_operand" "")
+	(parity:DI (match_operand:DI 1 "nonimmediate_operand" "")))]
+  "! TARGET_POPCNT"
+{
+  rtx scratch = gen_reg_rtx (QImode);
+  rtx cond;
+
+  emit_insn (gen_paritydi2_cmp (NULL_RTX, NULL_RTX,
+				NULL_RTX, operands[1]));
+
+  cond = gen_rtx_fmt_ee (ORDERED, QImode,
+			 gen_rtx_REG (CCmode, FLAGS_REG),
+			 const0_rtx);
+  emit_insn (gen_rtx_SET (VOIDmode, scratch, cond));
+
+  if (TARGET_64BIT)
+    emit_insn (gen_zero_extendqidi2 (operands[0], scratch));
+  else
+    {
+      rtx tmp = gen_reg_rtx (SImode);
+
+      emit_insn (gen_zero_extendqisi2 (tmp, scratch));
+      emit_insn (gen_zero_extendsidi2 (operands[0], tmp));
+    }
+  DONE;
+})
+
+(define_insn_and_split "paritydi2_cmp"
+  [(set (reg:CC FLAGS_REG)
+	(parity:CC (match_operand:DI 3 "nonimmediate_operand" "0,m")))
+   (clobber (match_scratch:DI 0 "=r,X"))
+   (clobber (match_scratch:SI 1 "=r,r"))
+   (clobber (match_scratch:HI 2 "=Q,Q"))]
+  "! TARGET_POPCNT"
+  "#"
+  "&& reload_completed"
+  [(parallel
+     [(set (match_dup 1)
+	   (xor:SI (match_dup 1) (match_dup 4)))
+      (clobber (reg:CC FLAGS_REG))])
+   (parallel
+     [(set (reg:CC FLAGS_REG)
+	   (parity:CC (match_dup 1)))
+      (clobber (match_dup 1))
+      (clobber (match_dup 2))])]
+{
+  operands[4] = gen_lowpart (SImode, operands[3]);
+
+  if (MEM_P (operands[3]))
+    emit_move_insn (operands[1], gen_highpart (SImode, operands[3]));
+  else if (! TARGET_64BIT)
+    operands[1] = gen_highpart (SImode, operands[3]);
+  else
+    {
+      emit_move_insn (operands[1], gen_lowpart (SImode, operands[3]));
+      emit_insn (gen_lshrdi3 (operands[3], operands[3], GEN_INT (32)));
+    }
+})
+
+(define_expand "paritysi2"
+  [(set (match_operand:SI 0 "register_operand" "")
+	(parity:SI (match_operand:SI 1 "nonimmediate_operand" "")))]
+  "! TARGET_POPCNT"
+{
+  rtx scratch = gen_reg_rtx (QImode);
+  rtx cond;
+
+  emit_insn (gen_paritysi2_cmp (NULL_RTX, NULL_RTX, operands[1]));
+
+  cond = gen_rtx_fmt_ee (ORDERED, QImode,
+			 gen_rtx_REG (CCmode, FLAGS_REG),
+			 const0_rtx);
+  emit_insn (gen_rtx_SET (VOIDmode, scratch, cond));
+
+  emit_insn (gen_zero_extendqisi2 (operands[0], scratch));
+  DONE;
+})
+
+(define_insn_and_split "paritysi2_cmp"
+  [(set (reg:CC FLAGS_REG)
+	(parity:CC (match_operand:SI 2 "nonimmediate_operand" "0,m")))
+   (clobber (match_scratch:SI 0 "=r,X"))
+   (clobber (match_scratch:HI 1 "=Q,Q"))]
+  "! TARGET_POPCNT"
+  "#"
+  "&& reload_completed"
+  [(parallel
+     [(set (match_dup 1)
+	   (xor:HI (match_dup 1) (match_dup 3)))
+      (clobber (reg:CC FLAGS_REG))])
+   (parallel
+     [(set (reg:CC FLAGS_REG)
+	   (parity:CC (match_dup 1)))
+      (clobber (match_dup 1))])]
+{
+  operands[3] = gen_lowpart (HImode, operands[2]);
+
+  if (MEM_P (operands[2]))
+    emit_move_insn (operands[1], gen_highpart (HImode, operands[2]));
+  else
+    {
+      emit_move_insn (operands[1], gen_lowpart (HImode, operands[2]));
+      emit_insn (gen_lshrsi3 (operands[2], operands[2], GEN_INT (16)));
+    }
+})
+
+(define_insn "*parityhi2_cmp"
+  [(set (reg:CC FLAGS_REG)
+	(parity:CC (match_operand:HI 1 "register_operand" "0")))
+   (clobber (match_scratch:HI 0 "=Q"))]
+  "! TARGET_POPCNT"
+  "xor{b}\t{%h0, %b0|%b0, %h0}"
+  [(set_attr "length" "2")
+   (set_attr "mode" "HI")])
+
+(define_insn "*parityqi2_cmp"
+  [(set (reg:CC FLAGS_REG)
+	(parity:CC (match_operand:QI 0 "register_operand" "q")))]
+  "! TARGET_POPCNT"
+  "test{b}\t%0, %0"
+  [(set_attr "length" "2")
+   (set_attr "mode" "QI")])
 
 ;; Thread-local storage patterns for ELF.
 ;;
Index: testsuite/gcc.target/i386/parity-1.c
===================================================================
--- testsuite/gcc.target/i386/parity-1.c	(revision 0)
+++ testsuite/gcc.target/i386/parity-1.c	(revision 0)
@@ -0,0 +1,9 @@
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler "setnp" } } */
+
+int foo(unsigned int x)
+{
+  return __builtin_parity(x);
+}
+
Index: testsuite/gcc.target/i386/parity-2.c
===================================================================
--- testsuite/gcc.target/i386/parity-2.c	(revision 0)
+++ testsuite/gcc.target/i386/parity-2.c	(revision 0)
@@ -0,0 +1,9 @@
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler "setnp" } } */
+
+int foo(unsigned long long int x)
+{
+  return __builtin_parityll(x);
+}
+

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]