This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[RFC PATCH, i386]: Generate bit test (bt) instructions


Hello!

According to Intel Technology Journal [1], page 270, bt instruction runs 20% faster on Core2 Duo than equivalent generic code.

---Quote from p.270---
The bit test instruction bt was introduced in the i386™
processor. In some implementations, including the Intel
NetBurst® micro-architecture, the instruction has a high
latency. The Intel Core micro-architecture executes bt in
a single cycle, when the bit base operand is a register.
Therefore, the Intel C++/Fortran compiler uses the bt
instruction to implement a common bit test idiom when
optimizing for the Intel Core micro-architecture. The
optimized code runs about 20% faster than the generic
version on an Intel Core 2 Duo processor. Both of these
versions are shown below:

C source code
int x, n;
...
if (x & (1 << n)) ...

Generic code generation
; edx contains x, ecx contains n.
mov eax, 1
shl eax, cl
test edx, eax
je taken

Intel Core micro-architecture code generation
; edx contains x, eax contains n.
bt edx, eax
jae taken
---/Quote---

GCC compiles following code:

--cut here--
void foo (void);

int test (int x, int n)
{
if (x & (1 << n))
foo ();

return 0;
}
--cut here--

using -O2 to:

test:
subl $12, %esp
movl 16(%esp), %eax
movl 20(%esp), %ecx
sarl %cl, %eax
testb $1, %al
je .L2
call foo
.L2:
xorl %eax, %eax
addl $12, %esp
ret

With attached patch, -O2 -mtune=core2 produces:

test:
subl $12, %esp
movl 20(%esp), %edx
movl 16(%esp), %eax
btl %edx, %eax
jnc .L2
call foo
.L2:
xorl %eax, %eax
addl $12, %esp
ret

The patch without TARGET_USE_BT insn predicates was used to bootstrap gcc on i686-pc-linux-gnu and x86_64-pc-linux-gnu, where it converts >1800 shift-and-test sequences into equivalent bt instructions.

Attached patch adds TARGET_USE_BT insn predicates and adds core2 to TARGET_USE_BT group.


2008-06-09 Uros Bizjak <ubizjak@gmail.com>


PR target/36473
* config/i386/i386.c (ix86_tune_features) [TUNE_USE_BT]: Add m_CORE2.
* config/i386/predicates.md (bt_comparison_operator): New predicate.
* config/i386/i386.md (*btdi_rex64): New instruction pattern.
(*btsi): Ditto.
(*jcc_btdi_rex64): New instruction and split pattern.
(*jcc_btsi): Ditto.
(*jcc_btsi_1): Ditto.
(*btsq): Fix Intel asm dialect operand order.
(*btrq): Ditto.
(*btcq): Ditto.


The patch was bootstrapped and regression tested on x86_64-pc-linux-gnu as well as i686-pc-linux-gnu, with and without TARGET_USE_BT insn predicates.


[1] Inside the Intel® 10.1 Compilers: New Threadizer and New Vectorizer for Intel® Core™2 Processors, Intel Technology Journal, Vol. 11, Issue 4, November 15, 2007, http://download.intel.com/technology/itj/2007/v11i4/1-inside/1-Inside_the_Intel_Compilers.pdf

Uros.
Index: testsuite/gcc.target/i386/bt-2.c
===================================================================
--- testsuite/gcc.target/i386/bt-2.c	(revision 0)
+++ testsuite/gcc.target/i386/bt-2.c	(revision 0)
@@ -0,0 +1,16 @@
+/* PR target/36473 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=core2" } */
+
+extern void foo (void);
+
+int test(long x, long n)
+{
+  if (x & ( (long)0x01 << n ))
+    foo ();
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler "btl\[ \t\]" { target { ! lp64 } } } } */
+/* { dg-final { scan-assembler "btq\[ \t\]" { target lp64 } } } */
Index: testsuite/gcc.target/i386/bt-1.c
===================================================================
--- testsuite/gcc.target/i386/bt-1.c	(revision 0)
+++ testsuite/gcc.target/i386/bt-1.c	(revision 0)
@@ -0,0 +1,15 @@
+/* PR target/36473 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=core2" } */
+
+extern void foo (void);
+
+int test(int x, int n)
+{
+  if (x & ( 0x01 << n ))
+    foo ();
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler "btl\[ \t\]" } } */
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md	(revision 136593)
+++ config/i386/i386.md	(working copy)
@@ -13691,7 +13691,7 @@
 	(const_int 1))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_64BIT && (TARGET_USE_BT || reload_completed)"
-  "bts{q} %1,%0"
+  "bts{q}\t{%1, %0|%0, %1}"
   [(set_attr "type" "alu1")])
 
 (define_insn "*btrq"
@@ -13701,7 +13701,7 @@
 	(const_int 0))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_64BIT && (TARGET_USE_BT || reload_completed)"
-  "btr{q} %1,%0"
+  "btr{q}\t{%1, %0|%0, %1}"
   [(set_attr "type" "alu1")])
 
 (define_insn "*btcq"
@@ -13711,7 +13711,7 @@
 	(not:DI (zero_extract:DI (match_dup 0) (const_int 1) (match_dup 1))))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_64BIT && (TARGET_USE_BT || reload_completed)"
-  "btc{q} %1,%0"
+  "btc{q}\t{%1, %0|%0, %1}"
   [(set_attr "type" "alu1")])
 
 ;; Allow Nocona to avoid these instructions if a register is available.
@@ -13812,6 +13812,30 @@
   emit_insn (gen_xordi3 (operands[0], operands[0], op1));
   DONE;
 })
+
+(define_insn "*btdi_rex64"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC
+	  (zero_extract:DI
+	    (match_operand:DI 0 "register_operand" "r")
+	    (const_int 1)
+	    (match_operand:DI 1 "register_operand" "r"))
+	(const_int 0)))]
+  "TARGET_64BIT && TARGET_USE_BT"
+  "bt{q}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")])
+
+(define_insn "*btsi"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC
+	  (zero_extract:SI
+	    (match_operand:SI 0 "register_operand" "r")
+	    (const_int 1)
+	    (match_operand:SI 1 "register_operand" "r"))
+	(const_int 0)))]
+  "TARGET_USE_BT"
+  "bt{l}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")])
 
 ;; Store-flag instructions.
 
@@ -14057,6 +14081,104 @@
     FAIL;
 })
 
+;; zero_extend in SImode is correct, since this is what combine pass
+;; generates from shift insn with QImode operand.  Actually, the mode of
+;; operand 2 (bit offset operand) doesn't matter since bt insn takes
+;; appropriate modulo of the bit offset.
+
+(define_insn_and_split "*jcc_btdi_rex64"
+  [(set (pc)
+  	(if_then_else (match_operator 0 "bt_comparison_operator"
+			[(zero_extract:DI
+			   (match_operand:DI 1 "register_operand" "r")
+			   (const_int 1)
+			   (zero_extend:SI
+			     (match_operand:QI 2 "register_operand" "r")))
+			 (const_int 0)])
+		      (label_ref (match_operand 3 "" ""))
+		      (pc)))]
+  "TARGET_64BIT && TARGET_USE_BT"
+  "#"
+  "&& 1"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC
+	  (zero_extract:DI
+	    (match_dup 1)
+	    (const_int 1)
+	    (match_dup 2))
+	  (const_int 0)))
+   (set (pc)
+	(if_then_else (match_op_dup 0 [(reg:CCC FLAGS_REG) (const_int 0)])
+		      (label_ref (match_dup 3))
+		      (pc)))]
+{
+  operands[2] = simplify_gen_subreg (DImode, operands[2], QImode, 0);
+
+  PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0])));
+})
+
+(define_insn_and_split "*jcc_btsi"
+  [(set (pc)
+  	(if_then_else (match_operator 0 "bt_comparison_operator"
+			[(zero_extract:SI
+			   (match_operand:SI 1 "register_operand" "r")
+			   (const_int 1)
+			   (zero_extend:SI
+			     (match_operand:QI 2 "register_operand" "r")))
+			 (const_int 0)])
+		      (label_ref (match_operand 3 "" ""))
+		      (pc)))]
+  "TARGET_USE_BT"
+  "#"
+  "&& 1"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC
+	  (zero_extract:SI
+	    (match_dup 1)
+	    (const_int 1)
+	    (match_dup 2))
+	  (const_int 0)))
+   (set (pc)
+	(if_then_else (match_op_dup 0 [(reg:CCC FLAGS_REG) (const_int 0)])
+		      (label_ref (match_dup 3))
+		      (pc)))]
+{
+  operands[2] = simplify_gen_subreg (SImode, operands[2], QImode, 0);
+
+  PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0])));
+})
+
+(define_insn_and_split "*jcc_btsi_1"
+  [(set (pc)
+  	(if_then_else (match_operator 0 "bt_comparison_operator"
+			[(and:SI
+			   (lshiftrt:SI
+			     (match_operand:SI 1 "register_operand" "r")
+			     (match_operand:QI 2 "register_operand" "r"))
+			   (const_int 1))
+			 (const_int 0)])
+		      (label_ref (match_operand 3 "" ""))
+		      (pc)))]
+  "TARGET_USE_BT"
+  "#"
+  "&& 1"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC
+	  (zero_extract:SI
+	    (match_dup 1)
+	    (const_int 1)
+	    (match_dup 2))
+	  (const_int 0)))
+   (set (pc)
+	(if_then_else (match_op_dup 0 [(reg:CCC FLAGS_REG) (const_int 0)])
+		      (label_ref (match_dup 3))
+		      (pc)))]
+{
+  operands[2] = simplify_gen_subreg (SImode, operands[2], QImode, 0);
+
+  PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0])));
+})
+
 ;; Define combination compare-and-branch fp compare instructions to use
 ;; during early optimization.  Splitting the operation apart early makes
 ;; for bad code when we want to reverse the operation.
Index: config/i386/predicates.md
===================================================================
--- config/i386/predicates.md	(revision 136593)
+++ config/i386/predicates.md	(working copy)
@@ -920,6 +920,9 @@
 (define_predicate "ix86_comparison_uns_operator"
   (match_code "ne,eq,geu,gtu,leu,ltu"))
 
+(define_predicate "bt_comparison_operator"
+  (match_code "ne,eq"))
+
 ;; Return 1 if OP is a valid comparison operator in valid mode.
 (define_predicate "ix86_comparison_operator"
   (match_operand 0 "comparison_operator")
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 136593)
+++ config/i386/i386.c	(working copy)
@@ -1390,7 +1390,7 @@ unsigned int ix86_tune_features[X86_TUNE
   m_PPRO | m_AMD_MULTIPLE | m_K6_GEODE | m_PENT | m_CORE2 | m_GENERIC,
 
   /* X86_TUNE_USE_BT */
-  m_AMD_MULTIPLE,
+  m_AMD_MULTIPLE | m_CORE2,
 
   /* X86_TUNE_USE_INCDEC */
   ~(m_PENT4 | m_NOCONA | m_GENERIC),

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]