This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.



Re: [PATCH][RFC][x86] Fix PR91154, add SImode smax, allow SImode add in SSE regs


On Thu, 1 Aug 2019, Uros Bizjak wrote:

> On Thu, Aug 1, 2019 at 11:28 AM Richard Biener <rguenther@suse.de> wrote:

> > So you unconditionally add a smaxdi3 pattern - indeed this looks
> > necessary even when going the STV route.  The actual regression
> > for the testcase could also be solved by turning the smaxsi3
> > back into a compare and jump rather than a conditional move sequence.
> > So I wonder how you'd do that given that there's pass_if_after_reload
> > after pass_split_after_reload and I'm not sure we can split
> > as late as pass_split_before_sched2 (there's also a split _after_
> > sched2 on x86 it seems).
> >
> > So how would you go implement {s,u}{min,max}{si,di}3 for the
> > case STV doesn't end up doing any transform?

> If STV doesn't transform the insn, then a pre-reload splitter splits
> the insn back to compare+cmove.
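
[Aside, not from the thread: a minimal sketch of what such a pre-reload
splitter could look like.  The "&& 1" split condition, the GE form of the
cmove and the use of ix86_expand_compare here are assumptions for
illustration, not something proposed in this discussion.]

;; Sketch only: split a scalar smaxsi3 that STV left unconverted back
;; into a flags-setting compare followed by a conditional move, to be
;; picked up by the existing cmp/cmov patterns before register allocation.
(define_insn_and_split "smaxsi3"
  [(set (match_operand:SI 0 "register_operand")
	(smax:SI (match_operand:SI 1 "register_operand")
		 (match_operand:SI 2 "register_operand")))
   (clobber (reg:CC FLAGS_REG))]
  "TARGET_STV && TARGET_SSE4_1"
  "#"
  "&& 1"
  [(set (match_dup 0)
	(if_then_else:SI (match_dup 3) (match_dup 1) (match_dup 2)))]
{
  /* Emit the compare and get back the GE test of the flags register;
     smax (a, b) then becomes a >= b ? a : b via the cmove.  */
  operands[3] = ix86_expand_compare (GE, operands[1], operands[2]);
})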

OK, that would work.  But then there's no way to force a jumpy sequence,
which we know is faster than compare+cmove, because the later RTL
if-conversion passes happily re-discover the smax (or conditional move)
sequence.

> However, considering that the SImode move from/to an int/xmm register
> is relatively cheap, the cost function should be tuned so that STV
> always converts the smaxsi3 pattern.

Note that on Zen, and even more so on bdverN, the int/xmm transition
makes it not just unprofitable but a _lot_ slower than the cmp/cmov
sequence... (for the loop in hmmer, which is the only one where I see
any effect from any of my patches).  So identifying chains that
start/end in memory is important for cost reasons.

> Please note that the cost function also considers the cost of moves
> from/to xmm.  So the cost of the whole chain would disable the
> transformation.
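
[Aside, not from the thread: the shape of that cost decision, for
illustration only -- this is not the actual compute_convert_gain code and
the names below are made up.]

/* Hypothetical sketch: the per-insn gain of a chain is weighed against
   the scalar<->vector moves needed at the chain boundaries; the chain
   is converted only when the balance stays positive.  A chain that
   starts and ends in memory has no boundary moves to pay for, which is
   why identifying such chains matters for the cost model.  */
static int
chain_gain_sketch (int summed_insn_gain,
		   int defs_used_outside_chain,
		   int uses_defined_outside_chain,
		   int vec_to_int_move_cost, int int_to_vec_move_cost)
{
  /* Each value flowing out of the chain needs a vector->scalar move,
     each value flowing into it needs a scalar->vector move.  */
  int boundary_cost = defs_used_outside_chain * vec_to_int_move_cost
		      + uses_defined_outside_chain * int_to_vec_move_cost;

  return summed_insn_gain - boundary_cost;
}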

> > So I think the splitting has to happen after the last if-conversion
> > pass (and thus we may need to allocate a scratch register for this
> > purpose?)

> I really hope that the underlying issue will be solved by a
> machine-dependent pass inserted somewhere after the pre-reload split.
> This way, we can split the unconverted smax to a cmove, and this later
> pass would handle jcc and cmove instructions.  Until then... yes, your
> proposed approach is one of the ways to avoid unwanted if-conversion,
> although sometimes we would like to split to cmove instead.

So the following makes STV also consider SImode chains, re-using the
DImode chain code.  I've kept a simple incomplete smaxsi3 pattern
and also did not alter the {SI,DI}mode chain cost function - it's
quite off for TARGET_64BIT.  With this I get the expected conversion
for the testcase derived from hmmer.

No further testing so far.

Is it OK to re-use the DImode chain code this way?  I'll clean things
up some more of course.

Still need help with the actual patterns for minmax and what the splitters
should look like.

Richard.

Index: gcc/config/i386/i386-features.c
===================================================================
--- gcc/config/i386/i386-features.c	(revision 274037)
+++ gcc/config/i386/i386-features.c	(working copy)
@@ -276,8 +276,11 @@

 /* Initialize new chain.  */

-scalar_chain::scalar_chain ()
+scalar_chain::scalar_chain (enum machine_mode smode_, enum machine_mode vmode_)
 {
+  smode = smode_;
+  vmode = vmode_;
+
   chain_id = ++max_id;

    if (dump_file)
@@ -473,7 +476,7 @@
 {
   gcc_assert (CONST_INT_P (exp));

-  if (standard_sse_constant_p (exp, V2DImode))
+  if (standard_sse_constant_p (exp, vmode))
     return COSTS_N_INSNS (1);
   return ix86_cost->sse_load[1];
 }
@@ -534,6 +537,9 @@
       else if (GET_CODE (src) == NEG
 	       || GET_CODE (src) == NOT)
 	gain += ix86_cost->add - COSTS_N_INSNS (1);
+      else if (GET_CODE (src) == SMAX
+	       || GET_CODE (src) == SMIN)
+	gain += COSTS_N_INSNS (3);
       else if (GET_CODE (src) == COMPARE)
 	{
 	  /* Assume comparison cost is the same.  */
@@ -573,7 +579,7 @@
 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
 {
   if (x == reg)
-    return gen_rtx_SUBREG (V2DImode, new_reg, 0);
+    return gen_rtx_SUBREG (vmode, new_reg, 0);

   const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
   int i, j;
@@ -707,7 +713,7 @@
   bitmap_copy (conv, insns);

   if (scalar_copy)
-    scopy = gen_reg_rtx (DImode);
+    scopy = gen_reg_rtx (smode);

   for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
     {
@@ -750,6 +756,10 @@
 		  gen_rtx_VEC_SELECT (SImode,
 				      gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
 	    }
+	  else if (smode == SImode)
+	    {
+	      emit_move_insn (scopy, gen_rtx_SUBREG (SImode, reg, 0));
+	    }
 	  else
 	    {
 	      rtx vcopy = gen_reg_rtx (V2DImode);
@@ -816,14 +826,14 @@
   if (GET_CODE (*op) == NOT)
     {
       convert_op (&XEXP (*op, 0), insn);
-      PUT_MODE (*op, V2DImode);
+      PUT_MODE (*op, vmode);
     }
   else if (MEM_P (*op))
     {
-      rtx tmp = gen_reg_rtx (DImode);
+      rtx tmp = gen_reg_rtx (GET_MODE (*op));

       emit_insn_before (gen_move_insn (tmp, *op), insn);
-      *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
+      *op = gen_rtx_SUBREG (vmode, tmp, 0);

       if (dump_file)
 	fprintf (dump_file, "  Preloading operand for insn %d into r%d\n",
@@ -841,24 +851,30 @@
 	    gcc_assert (!DF_REF_CHAIN (ref));
 	    break;
 	  }
-      *op = gen_rtx_SUBREG (V2DImode, *op, 0);
+      *op = gen_rtx_SUBREG (vmode, *op, 0);
     }
   else if (CONST_INT_P (*op))
     {
       rtx vec_cst;
-      rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
+      rtx tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (smode), 0);

       /* Prefer all ones vector in case of -1.  */
       if (constm1_operand (*op, GET_MODE (*op)))
-	vec_cst = CONSTM1_RTX (V2DImode);
+	vec_cst = CONSTM1_RTX (vmode);
       else
-	vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
-					gen_rtvec (2, *op, const0_rtx));
+	{
+	  unsigned n = GET_MODE_NUNITS (vmode);
+	  rtx *v = XALLOCAVEC (rtx, n);
+	  v[0] = *op;
+	  for (unsigned i = 1; i < n; ++i)
+	    v[i] = const0_rtx;
+	  vec_cst = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
+	}

-      if (!standard_sse_constant_p (vec_cst, V2DImode))
+      if (!standard_sse_constant_p (vec_cst, vmode))
 	{
 	  start_sequence ();
-	  vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
+	  vec_cst = validize_mem (force_const_mem (vmode, vec_cst));
 	  rtx_insn *seq = get_insns ();
 	  end_sequence ();
 	  emit_insn_before (seq, insn);
@@ -870,7 +886,7 @@
   else
     {
       gcc_assert (SUBREG_P (*op));
-      gcc_assert (GET_MODE (*op) == V2DImode);
+      gcc_assert (GET_MODE (*op) == vmode);
     }
 }

@@ -888,9 +904,9 @@
     {
       /* There are no scalar integer instructions and therefore
 	 temporary register usage is required.  */
-      rtx tmp = gen_reg_rtx (DImode);
+      rtx tmp = gen_reg_rtx (GET_MODE (dst));
       emit_conversion_insns (gen_move_insn (dst, tmp), insn);
-      dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
+      dst = gen_rtx_SUBREG (vmode, tmp, 0);
     }

   switch (GET_CODE (src))
@@ -899,7 +915,7 @@
     case ASHIFTRT:
     case LSHIFTRT:
       convert_op (&XEXP (src, 0), insn);
-      PUT_MODE (src, V2DImode);
+      PUT_MODE (src, vmode);
       break;

     case PLUS:
@@ -907,25 +923,27 @@
     case IOR:
     case XOR:
     case AND:
+    case SMAX:
+    case SMIN:
       convert_op (&XEXP (src, 0), insn);
       convert_op (&XEXP (src, 1), insn);
-      PUT_MODE (src, V2DImode);
+      PUT_MODE (src, vmode);
       break;

     case NEG:
       src = XEXP (src, 0);
       convert_op (&src, insn);
-      subreg = gen_reg_rtx (V2DImode);
-      emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
-      src = gen_rtx_MINUS (V2DImode, subreg, src);
+      subreg = gen_reg_rtx (vmode);
+      emit_insn_before (gen_move_insn (subreg, CONST0_RTX (vmode)), insn);
+      src = gen_rtx_MINUS (vmode, subreg, src);
       break;

     case NOT:
       src = XEXP (src, 0);
       convert_op (&src, insn);
-      subreg = gen_reg_rtx (V2DImode);
-      emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
-      src = gen_rtx_XOR (V2DImode, src, subreg);
+      subreg = gen_reg_rtx (vmode);
+      emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (vmode)), insn);
+      src = gen_rtx_XOR (vmode, src, subreg);
       break;

     case MEM:
@@ -939,17 +957,17 @@
       break;

     case SUBREG:
-      gcc_assert (GET_MODE (src) == V2DImode);
+      gcc_assert (GET_MODE (src) == vmode);
       break;

     case COMPARE:
       src = SUBREG_REG (XEXP (XEXP (src, 0), 0));

-      gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
-		  || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
+      gcc_assert ((REG_P (src) && GET_MODE (src) == GET_MODE_INNER (vmode))
+		  || (SUBREG_P (src) && GET_MODE (src) == vmode));

       if (REG_P (src))
-	subreg = gen_rtx_SUBREG (V2DImode, src, 0);
+	subreg = gen_rtx_SUBREG (vmode, src, 0);
       else
 	subreg = copy_rtx_if_shared (src);
       emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
@@ -1186,7 +1204,7 @@
 		     (const_int 0 [0])))  */

 static bool
-convertible_comparison_p (rtx_insn *insn)
+convertible_comparison_p (rtx_insn *insn, enum machine_mode mode)
 {
   if (!TARGET_SSE4_1)
     return false;
@@ -1219,12 +1237,12 @@

   if (!SUBREG_P (op1)
       || !SUBREG_P (op2)
-      || GET_MODE (op1) != SImode
-      || GET_MODE (op2) != SImode
+      || GET_MODE (op1) != mode
+      || GET_MODE (op2) != mode
       || ((SUBREG_BYTE (op1) != 0
-	   || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
+	   || SUBREG_BYTE (op2) != GET_MODE_SIZE (mode))
 	  && (SUBREG_BYTE (op2) != 0
-	      || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
+	      || SUBREG_BYTE (op1) != GET_MODE_SIZE (mode))))
     return false;

   op1 = SUBREG_REG (op1);
@@ -1232,7 +1250,7 @@

   if (op1 != op2
       || !REG_P (op1)
-      || GET_MODE (op1) != DImode)
+      || GET_MODE (op1) != GET_MODE_WIDER_MODE (mode).else_blk ())
     return false;

   return true;
@@ -1241,7 +1259,7 @@
 /* The DImode version of scalar_to_vector_candidate_p.  */

 static bool
-dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
+dimode_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
 {
   rtx def_set = single_set (insn);

@@ -1255,12 +1273,12 @@
   rtx dst = SET_DEST (def_set);

   if (GET_CODE (src) == COMPARE)
-    return convertible_comparison_p (insn);
+    return convertible_comparison_p (insn, mode);

   /* We are interested in DImode promotion only.  */
-  if ((GET_MODE (src) != DImode
+  if ((GET_MODE (src) != mode
        && !CONST_INT_P (src))
-      || GET_MODE (dst) != DImode)
+      || GET_MODE (dst) != mode)
     return false;

   if (!REG_P (dst) && !MEM_P (dst))
@@ -1285,12 +1303,14 @@
     case IOR:
     case XOR:
     case AND:
+    case SMAX:
+    case SMIN:
       if (!REG_P (XEXP (src, 1))
 	  && !MEM_P (XEXP (src, 1))
 	  && !CONST_INT_P (XEXP (src, 1)))
 	return false;

-      if (GET_MODE (XEXP (src, 1)) != DImode
+      if (GET_MODE (XEXP (src, 1)) != mode
 	  && !CONST_INT_P (XEXP (src, 1)))
 	return false;
       break;
@@ -1319,7 +1339,7 @@
 	  || !REG_P (XEXP (XEXP (src, 0), 0))))
       return false;

-  if (GET_MODE (XEXP (src, 0)) != DImode
+  if (GET_MODE (XEXP (src, 0)) != mode
       && !CONST_INT_P (XEXP (src, 0)))
     return false;

@@ -1392,7 +1412,7 @@
   if (TARGET_64BIT)
     return timode_scalar_to_vector_candidate_p (insn);
   else
-    return dimode_scalar_to_vector_candidate_p (insn);
+    return dimode_scalar_to_vector_candidate_p (insn, DImode);
 }

 /* The DImode version of remove_non_convertible_regs.  */
@@ -1577,11 +1597,12 @@
 convert_scalars_to_vector ()
 {
   basic_block bb;
-  bitmap candidates;
+  bitmap candidates, sicandidates;
   int converted_insns = 0;

   bitmap_obstack_initialize (NULL);
   candidates = BITMAP_ALLOC (NULL);
+  sicandidates = BITMAP_ALLOC (NULL);

   calculate_dominance_info (CDI_DOMINATORS);
   df_set_flags (DF_DEFER_INSN_RESCAN);
@@ -1605,28 +1626,43 @@

 	    bitmap_set_bit (candidates, INSN_UID (insn));
 	  }
+	else if (dimode_scalar_to_vector_candidate_p (insn, SImode))
+	  {
+	    if (dump_file)
+	      fprintf (dump_file, "  insn %d is marked as a SI candidate\n",
+		       INSN_UID (insn));
+
+	    bitmap_set_bit (sicandidates, INSN_UID (insn));
+	  }
     }

   remove_non_convertible_regs (candidates);
+  dimode_remove_non_convertible_regs (sicandidates);

-  if (bitmap_empty_p (candidates))
+  if (bitmap_empty_p (candidates)
+      && bitmap_empty_p (sicandidates))
     if (dump_file)
       fprintf (dump_file, "There are no candidates for optimization.\n");

-  while (!bitmap_empty_p (candidates))
+  bitmap cand = candidates;
+  do
     {
-      unsigned uid = bitmap_first_set_bit (candidates);
+  while (!bitmap_empty_p (cand))
+    {
+      unsigned uid = bitmap_first_set_bit (cand);
       scalar_chain *chain;

-      if (TARGET_64BIT)
+      if (TARGET_64BIT && cand == candidates)
 	chain = new timode_scalar_chain;
-      else
-	chain = new dimode_scalar_chain;
+      else if (cand == candidates)
+	chain = new dimode_scalar_chain (DImode, V2DImode);
+      else if (cand == sicandidates)
+	chain = new dimode_scalar_chain (SImode, V4SImode);

       /* Find instructions chain we want to convert to vector mode.
 	 Check all uses and definitions to estimate all required
 	 conversions.  */
-      chain->build (candidates, uid);
+      chain->build (cand, uid);

       if (chain->compute_convert_gain () > 0)
 	converted_insns += chain->convert ();
@@ -1637,11 +1673,17 @@

       delete chain;
     }
+  if (cand == sicandidates)
+    break;
+  cand = sicandidates;
+    }
+  while (1);

   if (dump_file)
     fprintf (dump_file, "Total insns converted: %d\n", converted_insns);

   BITMAP_FREE (candidates);
+  BITMAP_FREE (sicandidates);
   bitmap_obstack_release (NULL);
   df_process_deferred_rescans ();

Index: gcc/config/i386/i386-features.h
===================================================================
--- gcc/config/i386/i386-features.h	(revision 274037)
+++ gcc/config/i386/i386-features.h	(working copy)
@@ -127,11 +127,16 @@
 class scalar_chain
 {
  public:
-  scalar_chain ();
+  scalar_chain (enum machine_mode, enum machine_mode);
   virtual ~scalar_chain ();

   static unsigned max_id;

+  /* Scalar mode.  */
+  enum machine_mode smode;
+  /* Vector mode.  */
+  enum machine_mode vmode;
+
   /* ID of a chain.  */
   unsigned int chain_id;
   /* A queue of instructions to be included into a chain.  */
@@ -162,6 +167,8 @@
 class dimode_scalar_chain : public scalar_chain
 {
  public:
+  dimode_scalar_chain (enum machine_mode smode_, enum machine_mode vmode_)
+    : scalar_chain (smode_, vmode_) {}
   int compute_convert_gain ();
  private:
   void mark_dual_mode_def (df_ref def);
@@ -178,6 +185,8 @@
 class timode_scalar_chain : public scalar_chain
 {
  public:
+  timode_scalar_chain () : scalar_chain (TImode, V1TImode) {}
+
   /* Convert from TImode to V1TImode is always faster.  */
   int compute_convert_gain () { return 1; }

Index: gcc/config/i386/i386.md
===================================================================
--- gcc/config/i386/i386.md	(revision 274037)
+++ gcc/config/i386/i386.md	(working copy)
@@ -5325,6 +5325,16 @@
        (const_string "SI")
        (const_string "<MODE>")))])

+;; min/max patterns
+
+(define_insn "smaxsi3"
+  [(set (match_operand:SI 0 "register_operand")
+  	(smax:SI (match_operand:SI 1 "register_operand")
+		 (match_operand:SI 2 "register_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_STV && TARGET_SSE4_1"
+  "#")
+
 ;; Add instructions

 (define_expand "add<mode>3"

