This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[RFC PATCH, i386]: Generate bit test (bt) instructions
- From: Uros Bizjak <ubizjak at gmail dot com>
- To: GCC Patches <gcc-patches at gcc dot gnu dot org>
- Cc: "H.J. Lu" <hjl dot tools at gmail dot com>, Michael Meissner <michael dot meissner at amd dot com>
- Date: Mon, 09 Jun 2008 20:18:19 +0200
- Subject: [RFC PATCH, i386]: Generate bit test (bt) instructions
Hello!
According to Intel Technology Journal [1], page 270, bt instruction runs
20% faster on Core2 Duo than equivalent generic code.
---Quote from p.270---
The bit test instruction bt was introduced in the i386™
processor. In some implementations, including the Intel
NetBurst® micro-architecture, the instruction has a high
latency. The Intel Core micro-architecture executes bt in
a single cycle, when the bit base operand is a register.
Therefore, the Intel C++/Fortran compiler uses the bt
instruction to implement a common bit test idiom when
optimizing for the Intel Core micro-architecture. The
optimized code runs about 20% faster than the generic
version on an Intel Core 2 Duo processor. Both of these
versions are shown below:
C source code
int x, n;
...
if (x & (1 << n)) ...
Generic code generation
; edx contains x, ecx contains n.
mov eax, 1
shl eax, cl
test edx, eax
je taken
Intel Core micro-architecture code generation
; edx contains x, eax contains n.
bt edx, eax
jae taken
---/Quote---
GCC compiles following code:
--cut here--
void foo (void);
int test (int x, int n)
{
if (x & (1 << n))
foo ();
return 0;
}
--cut here--
using -O2 to:
test:
subl $12, %esp
movl 16(%esp), %eax
movl 20(%esp), %ecx
sarl %cl, %eax
testb $1, %al
je .L2
call foo
.L2:
xorl %eax, %eax
addl $12, %esp
ret
With attached patch, -O2 -mtune=core2 produces:
test:
subl $12, %esp
movl 20(%esp), %edx
movl 16(%esp), %eax
btl %edx, %eax
jnc .L2
call foo
.L2:
xorl %eax, %eax
addl $12, %esp
ret
The patch without TARGET_USE_BT insn predicates was used to bootstrap
gcc on i686-pc-linux-gnu and x86_64-pc-linux-gnu, where it converts
>1800 shift-and-test sequences into equivalent bt instructions.
Attached patch adds TARGET_USE_BT insn predicates and adds core2 to
TARGET_USE_BT group.
2008-06-09 Uros Bizjak <ubizjak@gmail.com>
PR target/36473
* config/i386/i386.c (ix86_tune_features) [TUNE_USE_BT]: Add m_CORE2.
* config/i386/predicates.md (bt_comparison_operator): New predicate.
* config/i386/i386.md (*btdi_rex64): New instruction pattern.
(*btsi): Ditto.
(*jcc_btdi_rex64): New instruction and split pattern.
(*jcc_btsi): Ditto.
(*jcc_btsi_1): Ditto.
(*btsq): Fix Intel asm dialect operand order.
(*btrq): Ditto.
(*btcq): Ditto.
The patch was bootstrapped and regression tested on x86_64-pc-linux-gnu
as well as i686-pc-linux-gnu, with and without TARGET_USE_BT insn
predicates.
[1] Inside the Intel® 10.1 Compilers: New Threadizer and New Vectorizer
for Intel® Core™2 Processors, Intel Technology Journal, Vol. 11, Issue
4, November 15, 2007,
http://download.intel.com/technology/itj/2007/v11i4/1-inside/1-Inside_the_Intel_Compilers.pdf
Uros.
Index: testsuite/gcc.target/i386/bt-2.c
===================================================================
--- testsuite/gcc.target/i386/bt-2.c (revision 0)
+++ testsuite/gcc.target/i386/bt-2.c (revision 0)
@@ -0,0 +1,16 @@
+/* PR target/36473 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=core2" } */
+
+extern void foo (void);
+
+int test(long x, long n)
+{
+ if (x & ( (long)0x01 << n ))
+ foo ();
+
+ return 0;
+}
+
+/* { dg-final { scan-assembler "btl\[ \t\]" { target { ! lp64 } } } } */
+/* { dg-final { scan-assembler "btq\[ \t\]" { target lp64 } } } */
Index: testsuite/gcc.target/i386/bt-1.c
===================================================================
--- testsuite/gcc.target/i386/bt-1.c (revision 0)
+++ testsuite/gcc.target/i386/bt-1.c (revision 0)
@@ -0,0 +1,15 @@
+/* PR target/36473 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=core2" } */
+
+extern void foo (void);
+
+int test(int x, int n)
+{
+ if (x & ( 0x01 << n ))
+ foo ();
+
+ return 0;
+}
+
+/* { dg-final { scan-assembler "btl\[ \t\]" } } */
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md (revision 136593)
+++ config/i386/i386.md (working copy)
@@ -13691,7 +13691,7 @@
(const_int 1))
(clobber (reg:CC FLAGS_REG))]
"TARGET_64BIT && (TARGET_USE_BT || reload_completed)"
- "bts{q} %1,%0"
+ "bts{q}\t{%1, %0|%0, %1}"
[(set_attr "type" "alu1")])
(define_insn "*btrq"
@@ -13701,7 +13701,7 @@
(const_int 0))
(clobber (reg:CC FLAGS_REG))]
"TARGET_64BIT && (TARGET_USE_BT || reload_completed)"
- "btr{q} %1,%0"
+ "btr{q}\t{%1, %0|%0, %1}"
[(set_attr "type" "alu1")])
(define_insn "*btcq"
@@ -13711,7 +13711,7 @@
(not:DI (zero_extract:DI (match_dup 0) (const_int 1) (match_dup 1))))
(clobber (reg:CC FLAGS_REG))]
"TARGET_64BIT && (TARGET_USE_BT || reload_completed)"
- "btc{q} %1,%0"
+ "btc{q}\t{%1, %0|%0, %1}"
[(set_attr "type" "alu1")])
;; Allow Nocona to avoid these instructions if a register is available.
@@ -13812,6 +13812,30 @@
emit_insn (gen_xordi3 (operands[0], operands[0], op1));
DONE;
})
+
+(define_insn "*btdi_rex64"
+ [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (zero_extract:DI
+ (match_operand:DI 0 "register_operand" "r")
+ (const_int 1)
+ (match_operand:DI 1 "register_operand" "r"))
+ (const_int 0)))]
+ "TARGET_64BIT && TARGET_USE_BT"
+ "bt{q}\t{%1, %0|%0, %1}"
+ [(set_attr "type" "alu1")])
+
+(define_insn "*btsi"
+ [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (zero_extract:SI
+ (match_operand:SI 0 "register_operand" "r")
+ (const_int 1)
+ (match_operand:SI 1 "register_operand" "r"))
+ (const_int 0)))]
+ "TARGET_USE_BT"
+ "bt{l}\t{%1, %0|%0, %1}"
+ [(set_attr "type" "alu1")])
;; Store-flag instructions.
@@ -14057,6 +14081,104 @@
FAIL;
})
+;; zero_extend in SImode is correct, since this is what combine pass
+;; generates from shift insn with QImode operand. Actually, the mode of
+;; operand 2 (bit offset operand) doesn't matter since bt insn takes
+;; appropriate modulo of the bit offset.
+
+(define_insn_and_split "*jcc_btdi_rex64"
+ [(set (pc)
+ (if_then_else (match_operator 0 "bt_comparison_operator"
+ [(zero_extract:DI
+ (match_operand:DI 1 "register_operand" "r")
+ (const_int 1)
+ (zero_extend:SI
+ (match_operand:QI 2 "register_operand" "r")))
+ (const_int 0)])
+ (label_ref (match_operand 3 "" ""))
+ (pc)))]
+ "TARGET_64BIT && TARGET_USE_BT"
+ "#"
+ "&& 1"
+ [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (zero_extract:DI
+ (match_dup 1)
+ (const_int 1)
+ (match_dup 2))
+ (const_int 0)))
+ (set (pc)
+ (if_then_else (match_op_dup 0 [(reg:CCC FLAGS_REG) (const_int 0)])
+ (label_ref (match_dup 3))
+ (pc)))]
+{
+ operands[2] = simplify_gen_subreg (DImode, operands[2], QImode, 0);
+
+ PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0])));
+})
+
+(define_insn_and_split "*jcc_btsi"
+ [(set (pc)
+ (if_then_else (match_operator 0 "bt_comparison_operator"
+ [(zero_extract:SI
+ (match_operand:SI 1 "register_operand" "r")
+ (const_int 1)
+ (zero_extend:SI
+ (match_operand:QI 2 "register_operand" "r")))
+ (const_int 0)])
+ (label_ref (match_operand 3 "" ""))
+ (pc)))]
+ "TARGET_USE_BT"
+ "#"
+ "&& 1"
+ [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (zero_extract:SI
+ (match_dup 1)
+ (const_int 1)
+ (match_dup 2))
+ (const_int 0)))
+ (set (pc)
+ (if_then_else (match_op_dup 0 [(reg:CCC FLAGS_REG) (const_int 0)])
+ (label_ref (match_dup 3))
+ (pc)))]
+{
+ operands[2] = simplify_gen_subreg (SImode, operands[2], QImode, 0);
+
+ PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0])));
+})
+
+(define_insn_and_split "*jcc_btsi_1"
+ [(set (pc)
+ (if_then_else (match_operator 0 "bt_comparison_operator"
+ [(and:SI
+ (lshiftrt:SI
+ (match_operand:SI 1 "register_operand" "r")
+ (match_operand:QI 2 "register_operand" "r"))
+ (const_int 1))
+ (const_int 0)])
+ (label_ref (match_operand 3 "" ""))
+ (pc)))]
+ "TARGET_USE_BT"
+ "#"
+ "&& 1"
+ [(set (reg:CCC FLAGS_REG)
+ (compare:CCC
+ (zero_extract:SI
+ (match_dup 1)
+ (const_int 1)
+ (match_dup 2))
+ (const_int 0)))
+ (set (pc)
+ (if_then_else (match_op_dup 0 [(reg:CCC FLAGS_REG) (const_int 0)])
+ (label_ref (match_dup 3))
+ (pc)))]
+{
+ operands[2] = simplify_gen_subreg (SImode, operands[2], QImode, 0);
+
+ PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0])));
+})
+
;; Define combination compare-and-branch fp compare instructions to use
;; during early optimization. Splitting the operation apart early makes
;; for bad code when we want to reverse the operation.
Index: config/i386/predicates.md
===================================================================
--- config/i386/predicates.md (revision 136593)
+++ config/i386/predicates.md (working copy)
@@ -920,6 +920,9 @@
(define_predicate "ix86_comparison_uns_operator"
(match_code "ne,eq,geu,gtu,leu,ltu"))
+(define_predicate "bt_comparison_operator"
+ (match_code "ne,eq"))
+
;; Return 1 if OP is a valid comparison operator in valid mode.
(define_predicate "ix86_comparison_operator"
(match_operand 0 "comparison_operator")
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c (revision 136593)
+++ config/i386/i386.c (working copy)
@@ -1390,7 +1390,7 @@ unsigned int ix86_tune_features[X86_TUNE
m_PPRO | m_AMD_MULTIPLE | m_K6_GEODE | m_PENT | m_CORE2 | m_GENERIC,
/* X86_TUNE_USE_BT */
- m_AMD_MULTIPLE,
+ m_AMD_MULTIPLE | m_CORE2,
/* X86_TUNE_USE_INCDEC */
~(m_PENT4 | m_NOCONA | m_GENERIC),