[PATCH, testsuite, i386] BMI2 support for GCC

Uros Bizjak ubizjak@gmail.com
Sat Aug 20 20:05:00 GMT 2011


On Fri, Aug 19, 2011 at 4:51 PM, Kirill Yukhin <kirill.yukhin@gmail.com> wrote:

>>> Updated patch is attached.

Comments inline.

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 53c5944..bff1a05 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -79,6 +79,7 @@ along with GCC; see the file COPYING3.  If not see
   (OPTION_MASK_ISA_ABM | OPTION_MASK_ISA_POPCNT)

 #define OPTION_MASK_ISA_BMI_SET OPTION_MASK_ISA_BMI
+#define OPTION_MASK_ISA_BMI2_SET OPTION_MASK_ISA_BMI2

Are you sure that -mbmi2 does not imply -mbmi?

@@ -13285,6 +13291,7 @@ put_condition_code (enum rtx_code code, enum
machine_mode mode, int reverse,
    If CODE is 't', pretend the mode is V8SFmode.
    If CODE is 'h', pretend the reg is the 'high' byte register.
    If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
+   If CODE is 'N', print the half mode high register.
    If CODE is 'd', duplicate the operand for AVX instruction.
  */

   If CODE is 'N', print the high register of a double word register pair.

@@ -13294,6 +13301,15 @@ print_reg (rtx x, int code, FILE *file)
   const char *reg;
   bool duplicated = code == 'd' && TARGET_AVX;

+  if (code == 'N')
+    {
+      enum machine_mode mode = GET_MODE (x);
+      enum machine_mode half_mode = mode == TImode ? DImode : SImode;
+      x = simplify_gen_subreg (half_mode, x, mode,
+			       GET_MODE_SIZE (half_mode));
+      code = 0;
+    }
+

No need to check modes, we _KNOW_ that DWI expands to double word
modes. Also, handling of 'N' should be put a couple of lines lower,
like:

     code = 16;
   else if (code == 't')
     code = 32;
+  else if (code == 'N')
+    {
+      gcc_assert (mode == GET_MODE_WIDER_MODE (word_mode));
+      x = gen_highpart (word_mode, x);
+      code = GET_MODE_SIZE (word_mode);
+    }
   else
     code = GET_MODE_SIZE (GET_MODE (x));

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e7ae397..05f7666 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md

 (define_c_enum "unspecv" [
@@ -751,14 +756,17 @@
 ;; Base name for insn mnemonic.
 (define_code_attr logic [(and "and") (ior "or") (xor "xor")])

+;; Mapping of shift operators
+(define_code_iterator any_shift [ashift lshiftrt ashiftrt])
+
 ;; Mapping of shift-right operators
 (define_code_iterator any_shiftrt [lshiftrt ashiftrt])

 ;; Base name for define_insn
-(define_code_attr shiftrt_insn [(lshiftrt "lshr") (ashiftrt "ashr")])
+(define_code_attr shift_insn [(ashift "ashl") (lshiftrt "lshr")
(ashiftrt "ashr")])

 ;; Base name for insn mnemonic.
-(define_code_attr shiftrt [(lshiftrt "shr") (ashiftrt "sar")])
+(define_code_attr shift [(ashift "shl") (lshiftrt "shr") (ashiftrt "sar")])

These renames should be part of another follow-up patch.

 ;; Mapping of rotate operators
 (define_code_iterator any_rotate [rotate rotatert])
@@ -777,6 +785,8 @@

 ;; Used in signed and unsigned widening multiplications.
 (define_code_iterator any_extend [sign_extend zero_extend])
+(define_code_attr any_extend [(sign_extend "SIGN_EXTEND")
+			      (zero_extend "ZERO_EXTEND")])

No. Pattern should be splitted instead.

 ;; Various insn prefixes for signed and unsigned operations.
 (define_code_attr u [(sign_extend "") (zero_extend "u")
@@ -6837,7 +6847,17 @@
 		       (match_operand:DWIH 1 "nonimmediate_operand" ""))
 		     (any_extend:<DWI>
 		       (match_operand:DWIH 2 "register_operand" ""))))
-	      (clobber (reg:CC FLAGS_REG))])])
+	      (clobber (reg:CC FLAGS_REG))])]
+  ""
+{
+  if (TARGET_BMI2 && <any_extend> == ZERO_EXTEND)
+    {
+      emit_insn (gen_bmi2_umul<mode><dwi>3_1 (operands[0],
+					      operands[1],
+					      operands[2]));
+      DONE;
+    }
+})

Please split the expander instead!

+;; Update pattern if BMI2 is available
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand" "")
+	(any_shift:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "")
+	  (subreg:QI
+	      (match_operand:SI 2 "register_operand" "") 0)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI2 && ix86_binary_operator_ok (<CODE>, <MODE>mode,
operands) && !reload_completed"
+  [(set (match_dup 0)
+        (any_shift:SWI48 (match_dup 1) (match_dup 2)))]
+{
+  if (can_create_pseudo_p () && <MODE>mode != SImode)
+    {
+      rtx tmp = gen_rtx_REG (<MODE>mode, 0);
+      emit_insn (gen_extendsidi2 (tmp, operands[2]));
+      operands[2] = tmp;
+    }
+})

Why splitters? Generate the shifts directly from the expander, fixing
the operands on-the-fly if necessary. Also, do not rename half of the
shift expanders and insn patterns just to introduce *ONE* extra RTX...

@@ -15745,8 +15763,23 @@ ix86_expand_binary_operator (enum rtx_code
code, enum machine_mode mode,
     }

Don't expand RORX through ix86_expand_binary_operator, generate it
directly from expander. You are complicating things with splitters too
much!

I will rewrite this part of i386.md.

@@ -12346,6 +12422,42 @@
   "xor{b}\t{%h0, %b0|%b0, %h0}"
   [(set_attr "length" "2")
    (set_attr "mode" "HI")])
+
+;; BMI2 instructions.
+(define_insn "bmi2_bzhi_<mode>3"

Please put these into "Bit manipulation instructions." section.  Also,
if possible, UNSPECs should not be used. It is much better to describe
pattern with generic RTXes, this way you will enable many existing
optimizations that (obviously) don't know how to handle UNSPECs.

+++ b/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1.c
@@ -0,0 +1,35 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_bzhi_u32 (unsigned a, int l)
+{
+  unsigned res = a;
+  int i;
+  for (i=0; i<32-l; ++i)
+    res &= ~(1 << (31 - i));
+
+  return res;
+}

Please add spaces around operators (also in other testcases).

+/* { dg-do run { target { bmi2 && { ia32 } } } } */
+/* { dg-options "-mbmi2 -Ofast" } */

Don't use -Ofast in the testsuite.  Use explicit -O2, -ffast-math,
etc. Options come and go from -Ofast.

+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_mul_u32 (unsigned a, unsigned b)
+{
+  unsigned long long res = 0;
+  volatile unsigned dummy = 0;
+  int i;
+  for (i=0; i<b; ++i)
+    res += (unsigned long long)(dummy? 0 : a);

Spaces.

+    res = (unsigned long long)a * b;

And here.

+  for (i=0; i<b; ++i) {

And here.  Please follow GNU coding standards [1].

+    /* Block loop opts  */
+    res += (unsigned __int128)(dummy? 0 : a);
+  }

Why? There are other ways to suppress unwanted optimizations, using
volatile and asm. See many examples through the testsuite.

+++ b/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+int
+calc_sarx_u32 (int a, int l)
+{
+  int res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res >>= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  int src = 0xfce7ace0;
+  int res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_sarx_u32 (src, i + 1);
+    res = src >> (i + 1);
+
+    printf ("%x %x\n", res_ref, res);

If you _REALLY_ need debugging printfs in the test, then protect it
with #ifdef DEBUG and put them just before abort (again, see many
examples). But more or less, these printfs are just annoying.


[1] http://www.gnu.org/prep/standards/standards.html

Uros.



More information about the Gcc-patches mailing list