
[RFA][PATCH][PR tree-optimization/45685]



So for this source, compiled for x86_64 with -O3:

typedef unsigned long int uint64_t;
typedef long int int64_t;
int summation_helper_1(int64_t* products, uint64_t count)
{
        int s = 0;
        uint64_t i;
        for(i=0; i<count; i++)
        {
                int64_t val = (products[i]>0) ? 1 : -1;
                products[i] *= val;
                if(products[i] != i)
                        val = -val;
                products[i] = val;
                s += val;
        }
        return s;
}


int summation_helper_2(int64_t* products, uint64_t count)
{
        int s = 0;
        uint64_t i;
        for(i=0; i<count; i++)
        {
                int val = (products[i]>0) ? 1 : -1;
                products[i] *= val;
                if(products[i] != i)
                        val = -val;
                products[i] = val;
                s += val;
        }
        return s;
}


The loops we generate are pretty bad and have regressed relative to older versions of GCC.

For the first loop, we have the following .optimized output:

  <bb 4>:
  # s_28 = PHI <s_20(5), 0(3)>
  # i_27 = PHI <i_21(5), 0(3)>
  _11 = MEM[base: products_9(D), index: i_27, step: 8, offset: 0B];
  val_4 = _11 > 0 ? 1 : -1;
  prephitmp_38 = _11 > 0 ? -1 : 1;
  prephitmp_39 = _11 > 0 ? 4294967295 : 1;
  prephitmp_41 = _11 > 0 ? 1 : 4294967295;
  _12 = val_4 * _11;
  _14 = (long unsigned int) _12;
  val_3 = _14 != i_27 ? prephitmp_38 : val_4;
  prephitmp_44 = _14 != i_27 ? prephitmp_39 : prephitmp_41;
  MEM[base: products_9(D), index: i_27, step: 8, offset: 0B] = val_3;
  s.1_18 = (unsigned int) s_28;
  _19 = prephitmp_44 + s.1_18;
  s_20 = (int) _19;
  i_21 = i_27 + 1;
  if (i_21 != count_7(D))
    goto <bb 5>;
  else
    goto <bb 6>;

  <bb 5>:
  goto <bb 4>;


Note the series of COND_EXPRs. A couple of them are just conditional negations, which can be implemented with a straight-line code sequence. Using that straight-line sequence results in:

  <bb 4>:
  # s_31 = PHI <s_20(5), 0(3)>
  # i_32 = PHI <i_21(5), 0(3)>
  _11 = MEM[base: products_9(D), index: i_32, step: 8, offset: 0B];
  val_4 = _11 > 0 ? 1 : -1;
  _12 = val_4 * _11;
  _14 = (long unsigned int) _12;
  _24 = _14 != i_32;
  _25 = (int64_t) _24;
  _29 = -_25;
  _28 = _29 ^ val_4;
  _27 = _28 + _25;
  MEM[base: products_9(D), index: i_32, step: 8, offset: 0B] = _27;
  _17 = (unsigned int) _27;
  s.1_18 = (unsigned int) s_31;
  _19 = _17 + s.1_18;
  s_20 = (int) _19;
  i_21 = i_32 + 1;
  if (i_21 != count_7(D))
    goto <bb 5>;
  else
    goto <bb 6>;

  <bb 5>:
  goto <bb 4>;

This *appears* worse. However, that code is much easier for the RTL optimizers to handle. Looking at what current trunk generates at the assembly level, we have:

.L3:
        movq    (%rdi,%rcx,8), %rdx
        testq   %rdx, %rdx
        setg    %r8b
        movzbl  %r8b, %r10d
        movzbl  %r8b, %r8d
        leaq    -1(%r10,%r10), %r10
        leal    -1(%r8,%r8), %r8d
        movq    %r10, %r11
        imulq   %rdx, %r11
        testq   %rdx, %rdx
        setle   %dl
        movzbl  %dl, %r9d
        movzbl  %dl, %edx
        leaq    -1(%r9,%r9), %r9
        leal    -1(%rdx,%rdx), %edx
        cmpq    %rcx, %r11
        cmove   %r10, %r9
        cmove   %r8d, %edx
        movq    %r9, (%rdi,%rcx,8)
        addq    $1, %rcx
        addl    %edx, %eax
        cmpq    %rsi, %rcx
        jne     .L3
(Ick)

With the conditional negation patch, that turns into:

.L3:
        movq    (%rdi,%rcx,8), %r8
        xorl    %edx, %edx
        testq   %r8, %r8
        setg    %dl
        leaq    -1(%rdx,%rdx), %rdx
        imulq   %rdx, %r8
        cmpq    %rcx, %r8
        setne   %r8b
        movzbl  %r8b, %r8d
        movq    %r8, %r9
        negq    %r9
        xorq    %r9, %rdx
        addq    %r8, %rdx
        movq    %rdx, (%rdi,%rcx,8)
        addq    $1, %rcx
        addl    %edx, %eax
        cmpq    %rsi, %rcx
        jne     .L3

No branches within the loop, no conditional moves either. In all it's 5 instructions shorter.
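
The straight-line sequence phiopt now emits is just the usual branchless conditional-negation identity: negating VAL exactly when a 0/1 condition C is true is (VAL ^ -C) + C, since -C is either all-ones or zero. A minimal C sketch of the identity (purely illustrative, not part of the patch; the helper name is made up):

#include <assert.h>
#include <stdint.h>

/* Negate VAL iff COND (0 or 1) is true, with no branch: -COND is
   all-ones when COND is 1, so VAL ^ -COND is ~VAL and adding COND
   back supplies the +1 of two's-complement negation.  When COND is
   0 both steps are no-ops.  */
static int64_t
cond_neg (int64_t val, int64_t cond)
{
  return (val ^ -cond) + cond;
}

int
main (void)
{
  assert (cond_neg (5, 1) == -5);
  assert (cond_neg (5, 0) == 5);
  assert (cond_neg (-7, 1) == 7);
  assert (cond_neg (-7, 0) == -7);
  return 0;
}

That xor/negate/add triple is exactly what shows up as the negq/xorq/addq in the patched assembly above.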


The second loop shows similar effects, though they're not as dramatic.

Before:
  <bb 4>:
  # s_27 = PHI <s_19(5), 0(3)>
  # i_26 = PHI <i_20(5), 0(3)>
  _11 = MEM[base: products_9(D), index: i_26, step: 8, offset: 0B];
  val_4 = _11 > 0 ? 1 : -1;
  prephitmp_32 = _11 > 0 ? 1 : -1;
  prephitmp_33 = _11 > 0 ? -1 : 1;
  prephitmp_34 = _11 > 0 ? -1 : 1;
  _13 = _11 * prephitmp_32;
  _15 = (long unsigned int) _13;
  val_3 = _15 != i_26 ? prephitmp_33 : val_4;
  prephitmp_36 = _15 != i_26 ? prephitmp_34 : prephitmp_32;
  MEM[base: products_9(D), index: i_26, step: 8, offset: 0B] = prephitmp_36;
  s_19 = val_3 + s_27;
  i_20 = i_26 + 1;
  if (i_20 != count_7(D))
    goto <bb 5>;
  else
    goto <bb 6>;

 <bb 5>:
  goto <bb 4>;


This results in the following assembly:

.L8:
        movq    (%rdi,%r8,8), %rdx
        testq   %rdx, %rdx
        movq    %rdx, %r11
        setg    %cl
        movzbl  %cl, %r10d
        movzbl  %cl, %ecx
        leaq    -1(%rcx,%rcx), %rcx
        leal    -1(%r10,%r10), %r10d
        imulq   %rcx, %r11
        testq   %rdx, %rdx
        setle   %dl
        movzbl  %dl, %r9d
        movzbl  %dl, %edx
        leaq    -1(%r9,%r9), %r9
        leal    -1(%rdx,%rdx), %edx
        cmpq    %r8, %r11
        cmovne  %r9, %rcx
        cmove   %r10d, %edx
        movq    %rcx, (%rdi,%r8,8)
        addq    $1, %r8
        addl    %edx, %eax
        cmpq    %rsi, %r8
        jne     .L8



With the conditional negation patch:

 <bb 4>:
  # s_31 = PHI <s_20(5), 0(3)>
  # i_32 = PHI <i_21(5), 0(3)>
  _11 = MEM[base: products_9(D), index: i_32, step: 8, offset: 0B];
  val_4 = _11 > 0 ? 1 : -1;
  _12 = val_4 * _11;
  _14 = (long unsigned int) _12;
  _24 = _14 != i_32;
  _25 = (int64_t) _24;
  _29 = -_25;
  _28 = _29 ^ val_4;
  _27 = _28 + _25;
  MEM[base: products_9(D), index: i_32, step: 8, offset: 0B] = _27;
  _17 = (unsigned int) _27;
  s.1_18 = (unsigned int) s_31;
  _19 = _17 + s.1_18;
  s_20 = (int) _19;
  i_21 = i_32 + 1;
  if (i_21 != count_7(D))
    goto <bb 5>;
  else
    goto <bb 6>;

 <bb 5>:
  goto <bb 4>;

Again, this looks worse than the original, but it optimizes nicely into:

.L8:
        movq    (%rdi,%r8,8), %r9
        testq   %r9, %r9
        setg    %cl
        movzbl  %cl, %edx
        leaq    -1(%rdx,%rdx), %rdx
        imulq   %r9, %rdx
        xorl    %r9d, %r9d
        cmpq    %r8, %rdx
        movzbl  %cl, %edx
        setne   %r9b
        leal    -1(%rdx,%rdx), %edx
        movl    %r9d, %r10d
        negl    %r10d
        xorl    %r10d, %edx
        addl    %r9d, %edx
        movslq  %edx, %rcx
        addl    %edx, %eax
        movq    %rcx, (%rdi,%r8,8)
        addq    $1, %r8
        cmpq    %rsi, %r8
        jne     .L8
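
One note on the inversion handling in the patch below, with a tiny sketch that is illustrative only (the helper name is invented, not from the patch): when the negation sits on the false edge, the same identity works once the inverted 0/1 condition is fed in, which is why neg_replacement first tries invert_tree_comparison and only falls back to xor-ing the boolean result with 1.

#include <assert.h>
#include <stdint.h>

/* Negate VAL iff COND (0 or 1) is false: invert the boolean and
   feed it into the same (val ^ -c) + c sequence.  */
static int64_t
cond_neg_on_false (int64_t val, int64_t cond)
{
  int64_t c = cond ^ 1;
  return (val ^ -c) + c;
}

int
main (void)
{
  assert (cond_neg_on_false (5, 0) == -5);
  assert (cond_neg_on_false (5, 1) == 5);
  return 0;
}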


Bootstrapped and regression tested on x86_64-unknown-linux-gnu. OK for the trunk?

	PR tree-optimization/45685
	* tree-ssa-phiopt.c (neg_replacement): New function.
	(tree_ssa_phiopt_worker): Call it.
	
	PR tree-optimization/45685
	* gcc.dg/tree-ssa/pr45685.c: New test.


diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr45685.c b/gcc/testsuite/gcc.dg/tree-ssa/pr45685.c
new file mode 100644
index 0000000..0628943
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr45685.c
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-phiopt1-details" } */
+
+typedef unsigned long int uint64_t;
+typedef long int int64_t;
+int summation_helper_1(int64_t* products, uint64_t count)
+{
+	int s = 0;
+	uint64_t i;
+	for(i=0; i<count; i++)
+	{	
+		int64_t val = (products[i]>0) ? 1 : -1;
+		products[i] *= val;
+		if(products[i] != i)
+			val = -val;
+		products[i] = val;
+		s += val;
+	}
+	return s;
+}
+
+
+int summation_helper_2(int64_t* products, uint64_t count)
+{
+	int s = 0;
+	uint64_t i;
+	for(i=0; i<count; i++)
+	{	
+		int val = (products[i]>0) ? 1 : -1;
+		products[i] *= val;
+		if(products[i] != i)
+			val = -val;
+		products[i] = val;
+		s += val;
+	}
+	return s;
+}
+
+/* { dg-final { scan-tree-dump-times "converted to straightline code" 2 "phiopt1" } } */
+/* { dg-final { cleanup-tree-dump "phiopt1" } } */
+
diff --git a/gcc/tree-ssa-phiopt.c b/gcc/tree-ssa-phiopt.c
index 11e565f..2522255 100644
--- a/gcc/tree-ssa-phiopt.c
+++ b/gcc/tree-ssa-phiopt.c
@@ -69,6 +69,8 @@ static bool minmax_replacement (basic_block, basic_block,
 				edge, edge, gimple, tree, tree);
 static bool abs_replacement (basic_block, basic_block,
 			     edge, edge, gimple, tree, tree);
+static bool neg_replacement (basic_block, basic_block,
+			     edge, edge, gimple, tree, tree);
 static bool cond_store_replacement (basic_block, basic_block, edge, edge,
 				    struct pointer_set_t *);
 static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block);
@@ -489,6 +491,8 @@ tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads)
 	    cfgchanged = true;
 	  else if (abs_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
 	    cfgchanged = true;
+	  else if (neg_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
+	    cfgchanged = true;
 	  else if (minmax_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
 	    cfgchanged = true;
 	}
@@ -1285,6 +1289,143 @@ abs_replacement (basic_block cond_bb, basic_block middle_bb,
   return true;
 }
 
+/*  The function neg_replacement replaces conditional negation with
+    equivalent straight line code.  Returns TRUE if replacement is done,
+    otherwise returns FALSE.
+
+    COND_BB branches around negation occurring in MIDDLE_BB.
+
+    E0 and E1 are edges out of COND_BB.  E0 reaches MIDDLE_BB and
+    E1 reaches the other successor which should contain PHI with
+    arguments ARG0 and ARG1.
+
+    Assuming negation is to occur when the condition is true,
+    then the non-branching sequence is:
+
+       result = (rhs ^ -cond) + cond
+
+    Inverting the condition or its result gives us negation
+    when the original condition is false.  */
+
+static bool
+neg_replacement (basic_block cond_bb, basic_block middle_bb,
+		 edge e0 ATTRIBUTE_UNUSED, edge e1,
+		 gimple phi, tree arg0, tree arg1)
+{
+  gimple new_stmt, cond;
+  gimple_stmt_iterator gsi;
+  gimple assign;
+  edge true_edge, false_edge;
+  tree rhs, lhs;
+  enum tree_code cond_code;
+  bool invert = false;
+
+  /* This transformation performs logical operations on the
+     incoming arguments.  So force them to be integral types.   */
+  if (!INTEGRAL_TYPE_P (TREE_TYPE (arg0)))
+    return false;
+
+  /* MIDDLE_BB must have only one executable statement which must have the
+     form arg0 = -arg1 or arg1 = -arg0.  */
+
+  assign = last_and_only_stmt (middle_bb);
+  /* If we did not find the proper negation assignment, then we can not
+     optimize.  */
+  if (assign == NULL)
+    return false;
+
+  /* If we got here, then we have found the only executable statement
+     in MIDDLE_BB.  If it is anything other than arg0 = -arg1 or
+     arg1 = -arg0, then we can not optimize.  */
+  if (gimple_code (assign) != GIMPLE_ASSIGN)
+    return false;
+
+  lhs = gimple_assign_lhs (assign);
+
+  if (gimple_assign_rhs_code (assign) != NEGATE_EXPR)
+    return false;
+
+  rhs = gimple_assign_rhs1 (assign);
+
+  /* The assignment has to be arg0 = -arg1 or arg1 = -arg0.  */
+  if (!(lhs == arg0 && rhs == arg1)
+      && !(lhs == arg1 && rhs == arg0))
+    return false;
+
+  /* The basic sequence assumes we negate when the condition is true.
+     If we need the opposite, then we will either need to invert the
+     condition or its result.  */
+  extract_true_false_edges_from_block (cond_bb, &true_edge, &false_edge);
+  invert = false_edge->dest == middle_bb;
+
+  /* Unlike abs_replacement, we can handle arbitrary conditionals here.  */
+  cond = last_stmt (cond_bb);
+  cond_code = gimple_cond_code (cond);
+
+  /* If inversion is needed, first try to invert the test since
+     that's cheapest.  */
+  if (invert)
+    {
+      enum tree_code new_code
+	= invert_tree_comparison (cond_code,
+				  HONOR_NANS (TYPE_MODE (TREE_TYPE (rhs))));
+
+      /* If invert_tree_comparison was successful, then use its return
+	 value as the new code and note that inversion is no longer
+	 needed.  */
+      if (new_code != ERROR_MARK)
+	{
+	  cond_code = new_code;
+	  invert = false;
+	}
+    }
+
+  tree cond_val = make_ssa_name (boolean_type_node, NULL);
+  new_stmt = gimple_build_assign_with_ops (cond_code, cond_val,
+					   gimple_cond_lhs (cond),
+					   gimple_cond_rhs (cond));
+  gsi = gsi_last_bb (cond_bb);
+  gsi_insert_before (&gsi, new_stmt, GSI_NEW_STMT);
+
+  /* If we still need inversion, then invert the result of the
+     condition.  */
+  if (invert)
+    {
+      tree tmp = make_ssa_name (boolean_type_node, NULL);
+      new_stmt = gimple_build_assign_with_ops (BIT_XOR_EXPR, tmp,
+					       cond_val, boolean_true_node);
+      gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
+      cond_val = tmp;
+    }
+
+  /* Get the condition in the right type so that we can perform
+     logical and arithmetic operations on it.  */
+  tree cond_val_converted = make_ssa_name (TREE_TYPE (rhs), NULL);
+  new_stmt = gimple_build_assign_with_ops (NOP_EXPR, cond_val_converted,
+					   cond_val, NULL_TREE);
+  gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
+
+  tree neg_cond_val_converted = make_ssa_name (TREE_TYPE (rhs), NULL);
+  new_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, neg_cond_val_converted,
+					   cond_val_converted, NULL_TREE);
+  gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
+
+  tree tmp = make_ssa_name (TREE_TYPE (rhs), NULL);
+  new_stmt = gimple_build_assign_with_ops (BIT_XOR_EXPR, tmp,
+					   rhs, neg_cond_val_converted);
+  gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
+
+  tree new_lhs = make_ssa_name (TREE_TYPE (rhs), NULL);
+  new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, new_lhs,
+					   tmp, cond_val_converted);
+  gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT);
+
+  replace_phi_edge_with_variable (cond_bb, e1, phi, new_lhs);
+
+  /* Note that we optimized this PHI.  */
+  return true;
+}
+
 /* Auxiliary functions to determine the set of memory accesses which
    can't trap because they are preceded by accesses to the same memory
    portion.  We do that for MEM_REFs, so we only need to track
