This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH][PR target/15184] Fix for direct byte access on x86



So here's the updated patch which handles all 4 testcases from the PR as well as a couple of my own.

As discussed earlier this week, we have to twiddle the heuristic for when to enable 4 insn combinations to make 4-insn combos where the first insn is a load and the last a store profitable. And we still need to handle the MEMs being different modes.

The new part of the patch is extending make_field_assignment to handle cases where SUBREGs appear in the source of the potential field assignment.

In particular make_field_assignment has some nice code to detect a field assignment that looks like

(set (x) (ior (and (x) (const_int C)) y)))

There's obviously a variety of tests, but that's the overall structure of the RTL.

To handle the cases in PR 15184, we just need to be able to handle a couple annoying SUBREGs:

(set (x)
 (subreg (ior (and (subreg (x)) (const_int C) y)

The outer subreg is going to be a narrowing subreg to narrow the logicals to whatever mode the destination uses. The inner subreg widens the source to whatever mode is needed for the logicals.

Once the subregs are handled, the everything "just works" as I outlined in my earlier message.

Bootstrapped & regression tested on x86_64-unknown-linux and powerpc64-unknown-linux.

Applying to the trunk.

Jeff

commit 90e5056d53d680fcc47693a118bfd75f7af0b435
Author: Jeff Law <law@redhat.com>
Date:   Tue Jan 27 16:32:06 2015 -0700

    	PR target/15184
    	* combine.c (try_combine): If I0 is a memory load and I3 a store
    	to a related address, increase the "goodness" of doing a 4-insn
    	combination with I0-I3.
    	(make_field_assignment): Handle SUBREGs in the ior+and case.
    
    	PR target/15184
    	* gcc.target/i386/pr15184-1.c: New test.
    	* gcc.target/i386/pr15184-2.c: New test.

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 7424d9f..0c02d43 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,11 @@
+2015-01-29  Jeff Law  <law@redhat.com>
+
+	PR target/15184 
+	* combine.c (try_combine): If I0 is a memory load and I3 a store
+	to a related address, increase the "goodness" of doing a 4-insn
+	combination with I0-I3.
+	(make_field_assignment): Handle SUBREGs in the ior+and case.
+
 2015-01-29  Yuri Rumyantsev  <ysrumyan@gmail.com>
 
 	PR tree-optimization/64746
diff --git a/gcc/combine.c b/gcc/combine.c
index 5c763b4..24418df 100644
--- a/gcc/combine.c
+++ b/gcc/combine.c
@@ -2620,6 +2620,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0,
       int i;
       int ngood = 0;
       int nshift = 0;
+      rtx set0, set3;
 
       if (!flag_expensive_optimizations)
 	return 0;
@@ -2643,6 +2644,34 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0,
 		   || GET_CODE (src) == LSHIFTRT)
 	    nshift++;
 	}
+
+      /* If I0 loads a memory and I3 sets the same memory, then I2 and I3
+	 are likely manipulating its value.  Ideally we'll be able to combine
+	 all four insns into a bitfield insertion of some kind. 
+
+	 Note the source in I0 might be inside a sign/zero extension and the
+	 memory modes in I0 and I3 might be different.  So extract the address
+	 from the destination of I3 and search for it in the source of I0.
+
+	 In the event that there's a match but the source/dest do not actually
+	 refer to the same memory, the worst that happens is we try some
+	 combinations that we wouldn't have otherwise.  */
+      if ((set0 = single_set (i0))
+	  /* Ensure the source of SET0 is a MEM, possibly buried inside
+	     an extension.  */
+	  && (GET_CODE (SET_SRC (set0)) == MEM
+	      || ((GET_CODE (SET_SRC (set0)) == ZERO_EXTEND
+		   || GET_CODE (SET_SRC (set0)) == SIGN_EXTEND)
+		  && GET_CODE (XEXP (SET_SRC (set0), 0)) == MEM))
+	  && (set3 = single_set (i3))
+	  /* Ensure the destination of SET3 is a MEM.  */
+	  && GET_CODE (SET_DEST (set3)) == MEM
+	  /* Would it be better to extract the base address for the MEM
+	     in SET3 and look for that?  I don't have cases where it matters
+	     but I could envision such cases.  */
+	  && rtx_referenced_p (XEXP (SET_DEST (set3), 0), SET_SRC (set0)))
+	ngood += 2;
+
       if (ngood < 2 && nshift < 2)
 	return 0;
     }
@@ -9272,6 +9301,13 @@ make_field_assignment (rtx x)
      to the appropriate position, force it to the required mode, and
      make the extraction.  Check for the AND in both operands.  */
 
+  /* One or more SUBREGs might obscure the constant-position field
+     assignment.  The first one we are likely to encounter is an outer
+     narrowing SUBREG, which we can just strip for the purposes of
+     identifying the constant-field assignment.  */
+  if (GET_CODE (src) == SUBREG && subreg_lowpart_p (src))
+    src = SUBREG_REG (src);
+
   if (GET_CODE (src) != IOR && GET_CODE (src) != XOR)
     return x;
 
@@ -9282,10 +9318,38 @@ make_field_assignment (rtx x)
       && CONST_INT_P (XEXP (rhs, 1))
       && rtx_equal_for_field_assignment_p (XEXP (rhs, 0), dest))
     c1 = INTVAL (XEXP (rhs, 1)), other = lhs;
+  /* The second SUBREG that might get in the way is a paradoxical
+     SUBREG around the first operand of the AND.  We want to 
+     pretend the operand is as wide as the destination here.   We
+     do this by creating a new MEM in the wider mode for the sole
+     purpose of the call to rtx_equal_for_field_assignment_p.   Also
+     note this trick only works for MEMs.  */
+  else if (GET_CODE (rhs) == AND
+	   && paradoxical_subreg_p (XEXP (rhs, 0))
+	   && GET_CODE (SUBREG_REG (XEXP (rhs, 0))) == MEM
+	   && CONST_INT_P (XEXP (rhs, 1))
+	   && rtx_equal_for_field_assignment_p (gen_rtx_MEM (GET_MODE (dest),
+							     XEXP (SUBREG_REG (XEXP (rhs, 0)), 0)),
+						dest))
+    c1 = INTVAL (XEXP (rhs, 1)), other = lhs;
   else if (GET_CODE (lhs) == AND
 	   && CONST_INT_P (XEXP (lhs, 1))
 	   && rtx_equal_for_field_assignment_p (XEXP (lhs, 0), dest))
     c1 = INTVAL (XEXP (lhs, 1)), other = rhs;
+  /* The second SUBREG that might get in the way is a paradoxical
+     SUBREG around the first operand of the AND.  We want to 
+     pretend the operand is as wide as the destination here.   We
+     do this by creating a new MEM in the wider mode for the sole
+     purpose of the call to rtx_equal_for_field_assignment_p.   Also
+     note this trick only works for MEMs.  */
+  else if (GET_CODE (lhs) == AND
+	   && paradoxical_subreg_p (XEXP (lhs, 0))
+	   && GET_CODE (SUBREG_REG (XEXP (lhs, 0))) == MEM
+	   && CONST_INT_P (XEXP (lhs, 1))
+	   && rtx_equal_for_field_assignment_p (gen_rtx_MEM (GET_MODE (dest),
+							     XEXP (SUBREG_REG (XEXP (lhs, 0)), 0)),
+						dest))
+    c1 = INTVAL (XEXP (lhs, 1)), other = rhs;
   else
     return x;
 
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 69488ec..c60230f 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,9 @@
+2015-01-29  Jeff Law  <law@redhat.com>
+
+	PR target/15184
+	* gcc.target/i386/pr15184-1.c: New test.
+	* gcc.target/i386/pr15184-2.c: New test.
+
 2015-01-29  Yuri Rumyantsev  <ysrumyan@gmail.com>
 
 	PR tree-optimization/64746
diff --git a/gcc/testsuite/gcc.target/i386/pr15184-1.c b/gcc/testsuite/gcc.target/i386/pr15184-1.c
new file mode 100644
index 0000000..9eb544c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr15184-1.c
@@ -0,0 +1,33 @@
+/* PR 15184 first two tests, plus two addition ones.  */
+/* { dg-do compile } */
+/* { dg-options "-O2 -m32 -march=pentiumpro" } */
+
+#define regparm __attribute__((__regparm__(3)))
+
+extern unsigned int x;
+extern unsigned short y;
+
+void regparm f0(unsigned char c)
+{
+       x = (x & 0xFFFFFF00) | (unsigned int)c;
+}
+
+void regparm f1(unsigned char c)
+{
+     x = (x & 0xFFFF00FF) | ((unsigned int)c << 8);
+}
+
+void regparm f2(unsigned char c)
+{
+     x = (x & 0xFF00FFFF) | ((unsigned int)c << 16);
+}
+void regparm f3(unsigned char c)
+{
+     x = (x & 0x00FFFFFF) | ((unsigned int)c << 24);
+}
+
+
+/* Each function should compile down to a byte move from
+   the input register into x, possibly at an offset within x.  */
+/* { dg-final { scan-assembler-times "\tmovb\t%al, x" 4 } } */
+
diff --git a/gcc/testsuite/gcc.target/i386/pr15184-2.c b/gcc/testsuite/gcc.target/i386/pr15184-2.c
new file mode 100644
index 0000000..99fdbc8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr15184-2.c
@@ -0,0 +1,23 @@
+/* PR 15184 second two tests
+/* { dg-do compile } */
+/* { dg-options "-O2 -m32 -march=pentiumpro" } */
+
+#define regparm __attribute__((__regparm__(3)))
+
+extern unsigned int x;
+extern unsigned short y;
+
+void regparm g0(unsigned char c)
+{
+        y = (y & 0xFF00) | (unsigned short)c;
+}
+
+void regparm g1(unsigned char c)
+{
+        y = (y & 0x00FF) | ((unsigned short)c << 8);
+}
+
+/* Each function should compile down to a byte move from
+   the input register into x, possibly at an offset within x.  */
+/* { dg-final { scan-assembler-times "\tmovb\t%al, y" 2 } } */
+

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]