[PATCH] Optimize common rotate constructs (PR middle-end/29749)

Jakub Jelinek jakub@redhat.com
Wed Nov 21 21:52:00 GMT 2007


Hi!

As shown in the attached testcase, the GCC folder is currently able
to recognize only the g1 case as LROTATE_EXPR (in the e1 and f1 cases,
while the expression is not folded into LROTATE_EXPR, the RTL passes
can still recognize it as a rotation).  But I believe it is far more
common to open code rotation using patterns like {e,f,g}2 or {e,f,g}3,
and those are not optimized and often generate worse code.
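
For reference, the unsigned int variants of these patterns from the
attached testcase look like this (e1/e2/e3 and f1/f2/f3 are the
analogous unsigned char and unsigned short versions):

unsigned int
g1 (unsigned int a)
{
  return a >> 24 | a << 8;
}

unsigned int
g2 (unsigned int a)
{
  return (a & 0xff000000) >> 24 | (a & 0x00ffffff) << 8;
}

unsigned int
g3 (unsigned int a)
{
  return ((a >> 24) & 0x000000ff) | ((a << 8) & 0xffffff00U);
}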

The attached patch allows {e,f,g}{2,3} to be folded exactly like
{e,f,g}1, as this diff between the 003t.original dumps without and
with the patch shows:
diff -up fold-rotate-1.c.003t.original1 fold-rotate-1.c.003t.original2 | grep ^[+-]
--- fold-rotate-1.c.003t.original1      2007-11-21 19:51:59.000000000 +0100
+++ fold-rotate-1.c.003t.original2      2007-11-21 19:48:58.000000000 +0100
-  return (unsigned char) ((signed char) (((int) a & 224) >> 5) | (signed char) (((int) a & 31) << 3));
+  return (unsigned char) ((signed char) (a >> 5) | (signed char) ((int) a << 3));
-  return (unsigned char) ((signed char) (a >> 5) & 7 | (signed char) ((int) a << 3) & -8);
+  return (unsigned char) ((signed char) (a >> 5) | (signed char) ((int) a << 3));
-  return (short unsigned int) ((short int) (((int) a & 65280) >> 8) | (short int) (((int) a & 255) << 8));
+  return (short unsigned int) ((short int) (a >> 8) | (short int) ((int) a << 8));
-  return (short unsigned int) ((short int) (a >> 8) & 255 | (short int) ((int) a << 8) & -256);
+  return (short unsigned int) ((short int) (a >> 8) | (short int) ((int) a << 8));
-  return (a & 4278190080) >> 24 | (a & 16777215) << 8;
+  return a r<< 8;
-  return a >> 24 & 255 | a << 8 & 4294967040;
+  return a r<< 8;

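To see why the masks disappear, here is roughly how the two new folds
compose on g2 (assuming 32-bit unsigned int; the e2/f2 cases work the
same way modulo the integer promotions):

  (a & 0xff000000) >> 24
    -> (a >> 24) & (0xff000000 >> 24)            [(X & C2) >> C1 hunk]
    == (a >> 24) & 0xff
    -> (a >> 24) & (0xff | ~(~0U >> 24))         [(X >> C1) & C2 hunk]
    == (a >> 24) & 0xffffffff, i.e. just a >> 24

  (a & 0x00ffffff) << 8
    -> (a << 8) & (0x00ffffff << 8)              [(X & C2) << C1 hunk]
    == (a << 8) & 0xffffff00
    -> (a << 8) & (0xffffff00 | ((1 << 8) - 1))  [(X << C1) & C2 hunk]
    == (a << 8) & 0xffffffff, i.e. just a << 8

so g2 reduces to a >> 24 | a << 8, which is the g1 form the folder
already turns into a r<< 8.
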
On x86_64 the assembly difference is mainly in the e2 and f2 routines:
 e2:
 .LFB3:
-       movzbl  %dil, %edi
-       leal    0(,%rdi,8), %eax
-       shrl    $5, %edi
-       orl     %edi, %eax
+       movl    %edi, %eax
+       rolb    $3, %al
        ret
 .LFE3:
        .size   e2, .-e2
@@ -47,11 +45,8 @@ f1:
        .type   f2, @function
 f2:
 .LFB6:
-       movzwl  %di, %edi
        movl    %edi, %eax
-       shrl    $8, %edi
-       sall    $8, %eax
-       orl     %edi, %eax
+       rolw    $8, %ax
        ret

I've also checked the ppc64, i386 and sparc assembly; there were several
code improvements and I haven't seen any code quality regressions.
Bootstrapped/regtested on x86_64-linux.  Ok for trunk?

As a follow-up, perhaps another hunk in fold_binary's BIT_IOR_EXPR case
could attempt to fold {e,f}{1,2,3} into {unsigned char,unsigned short}
LROTATE_EXPR.
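
For illustration only (this is not implemented by the patch), the e
cases would then ideally end up in the 003t.original dump as a single
8-bit rotate, something along the lines of:

  return (unsigned char) ((unsigned char) a r<< 3);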

2007-11-21  Jakub Jelinek  <jakub@redhat.com>

	PR middle-end/29749
	* fold-const.c (fold_binary) <case BIT_AND_EXPR>: Optimize
	(X << C1) & C2 into (X << C1) & (C2 | ((1 << C1) - 1))
	and (X >> C1) & C2 into (X >> C1) & (C2 | ~((type) -1 >> C1)).
	(fold_binary) <case LSHIFT_EXPR, case RSHIFT_EXPR>: Optimize
	(X & C2) << C1 into (X << C1) & (C2 << C1) and
	(X & C2) >> C1 into (X >> C1) & (C2 >> C1) if that allows further
	optimizations.

	* gcc.dg/fold-rotate-1.c: New test.

--- gcc/fold-const.c.jj	2007-11-18 20:15:02.000000000 +0100
+++ gcc/fold-const.c	2007-11-21 15:48:06.000000000 +0100
@@ -10973,6 +10973,100 @@ fold_binary (enum tree_code code, tree t
 	    return build_int_cst (type, residue & low);
 	}
 
+      /* Fold (X << C1) & C2 into (X << C1) & (C2 | ((1 << C1) - 1))
+	      (X >> C1) & C2 into (X >> C1) & (C2 | ~((type) -1 >> C1))
+	 if the new mask might be further optimized.  */
+      if ((TREE_CODE (arg0) == LSHIFT_EXPR
+	   || TREE_CODE (arg0) == RSHIFT_EXPR)
+	  && host_integerp (TREE_OPERAND (arg0, 1), 1)
+	  && host_integerp (arg1, TYPE_UNSIGNED (TREE_TYPE (arg1)))
+	  && tree_low_cst (TREE_OPERAND (arg0, 1), 1)
+	     < TYPE_PRECISION (TREE_TYPE (arg0))
+	  && TYPE_PRECISION (TREE_TYPE (arg0)) <= HOST_BITS_PER_WIDE_INT
+	  && tree_low_cst (TREE_OPERAND (arg0, 1), 1) > 0)
+	{
+	  unsigned int shiftc = tree_low_cst (TREE_OPERAND (arg0, 1), 1);
+	  unsigned HOST_WIDE_INT mask
+	    = tree_low_cst (arg1, TYPE_UNSIGNED (TREE_TYPE (arg1)));
+	  unsigned HOST_WIDE_INT newmask, zerobits = 0;
+	  tree shift_type = TREE_TYPE (arg0);
+
+	  if (TREE_CODE (arg0) == LSHIFT_EXPR)
+	    zerobits = ((((unsigned HOST_WIDE_INT) 1) << shiftc) - 1);
+	  else if (TREE_CODE (arg0) == RSHIFT_EXPR
+		   && TYPE_PRECISION (TREE_TYPE (arg0))
+		      == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (arg0))))
+	    {
+	      unsigned int prec = TYPE_PRECISION (TREE_TYPE (arg0));
+	      tree arg00 = TREE_OPERAND (arg0, 0);
+	      /* See if more bits can be proven as zero because of
+		 zero extension.  */
+	      if (TREE_CODE (arg00) == NOP_EXPR
+		  && TYPE_UNSIGNED (TREE_TYPE (TREE_OPERAND (arg00, 0))))
+		{
+		  tree inner_type = TREE_TYPE (TREE_OPERAND (arg00, 0));
+		  if (TYPE_PRECISION (inner_type)
+		      == GET_MODE_BITSIZE (TYPE_MODE (inner_type))
+		      && TYPE_PRECISION (inner_type) < prec)
+		    {
+		      prec = TYPE_PRECISION (inner_type);
+		      /* See if we can shorten the right shift.  */
+		      if (shiftc < prec)
+			shift_type = inner_type;
+		    }
+		}
+	      zerobits = ~(unsigned HOST_WIDE_INT) 0;
+	      zerobits >>= HOST_BITS_PER_WIDE_INT - shiftc;
+	      zerobits <<= prec - shiftc;
+	      /* For arithmetic shift if sign bit could be set, zerobits
+		 can contain actually sign bits, so no transformation is
+		 possible, unless MASK masks them all away.  In that
+		 case the shift needs to be converted into logical shift.  */
+	      if (!TYPE_UNSIGNED (TREE_TYPE (arg0))
+		  && prec == TYPE_PRECISION (TREE_TYPE (arg0)))
+		{
+		  if ((mask & zerobits) == 0)
+		    shift_type = unsigned_type_for (TREE_TYPE (arg0));
+		  else
+		    zerobits = 0;
+		}
+	    }
+
+	  /* ((X << 16) & 0xff00) is (X, 0).  */
+	  if ((mask & zerobits) == mask)
+	    return omit_one_operand (type, build_int_cst (type, 0), arg0);
+
+	  newmask = mask | zerobits;
+	  if (newmask != mask && (newmask & (newmask + 1)) == 0)
+	    {
+	      unsigned int prec;
+
+	      /* Only do the transformation if NEWMASK is some integer
+		 mode's mask.  */
+	      for (prec = BITS_PER_UNIT;
+		   prec < HOST_BITS_PER_WIDE_INT; prec <<= 1)
+		if (newmask == (((unsigned HOST_WIDE_INT) 1) << prec) - 1)
+		  break;
+	      if (prec < HOST_BITS_PER_WIDE_INT
+		  || newmask == ~(unsigned HOST_WIDE_INT) 0)
+		{
+		  if (shift_type != TREE_TYPE (arg0))
+		    {
+		      tem = fold_build2 (TREE_CODE (arg0), shift_type,
+					 fold_convert (shift_type,
+						       TREE_OPERAND (arg0, 0)),
+					 TREE_OPERAND (arg0, 1));
+		      tem = fold_convert (type, tem);
+		    }
+		  else
+		    tem = op0;
+		  return fold_build2 (BIT_AND_EXPR, type, tem,
+				      build_int_cst_type (TREE_TYPE (op1),
+							  newmask));
+		}
+	    }
+	}
+
       goto associate;
 
     case RDIV_EXPR:
@@ -11526,6 +11620,25 @@ fold_binary (enum tree_code code, tree t
 	      == (unsigned int) GET_MODE_BITSIZE (TYPE_MODE (type))))
 	return TREE_OPERAND (arg0, 0);
 
+      /* Fold (X & C2) << C1 into (X << C1) & (C2 << C1)
+	      (X & C2) >> C1 into (X >> C1) & (C2 >> C1)
+	 if the latter can be further optimized.  */
+      if ((code == LSHIFT_EXPR || code == RSHIFT_EXPR)
+	  && TREE_CODE (arg0) == BIT_AND_EXPR
+	  && TREE_CODE (arg1) == INTEGER_CST
+	  && TREE_CODE (TREE_OPERAND (arg0, 1)) == INTEGER_CST)
+	{
+	  tree mask = fold_build2 (code, type,
+				   fold_convert (type, TREE_OPERAND (arg0, 1)),
+				   arg1);
+	  tree shift = fold_build2 (code, type,
+				    fold_convert (type, TREE_OPERAND (arg0, 0)),
+				    arg1);
+	  tem = fold_binary (BIT_AND_EXPR, type, shift, mask);
+	  if (tem)
+	    return tem;
+	}
+
       return NULL_TREE;
 
     case MIN_EXPR:
--- gcc/testsuite/gcc.dg/fold-rotate-1.c.jj	2007-11-21 17:18:50.000000000 +0100
+++ gcc/testsuite/gcc.dg/fold-rotate-1.c	2007-11-21 17:18:34.000000000 +0100
@@ -0,0 +1,74 @@
+/* PR middle-end/29749 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-original" } */
+
+#if __SCHAR_MAX__ == 127
+
+unsigned char
+e1 (unsigned char a)
+{
+  return a >> 5 | a << 3;
+}
+
+unsigned char
+e2 (unsigned char a)
+{
+  return (a & 0xe0) >> 5 | (a & 0x1f) << 3;
+}
+
+unsigned char
+e3 (unsigned char a)
+{
+  return ((a >> 5) & 0x07) | ((a << 3) & 0xf8);
+}
+
+#endif
+
+#if __SHRT_MAX__ == 32767
+
+unsigned short
+f1 (unsigned short a)
+{
+  return a >> 8 | a << 8;
+}
+
+unsigned short
+f2 (unsigned short a)
+{
+  return (a & 0xff00) >> 8 | (a & 0x00ff) << 8;
+}
+
+unsigned short
+f3 (unsigned short a)
+{
+  return ((a >> 8) & 0x00ff) | ((a << 8) & 0xff00);
+}
+
+#endif
+
+#if __INT_MAX__ == 2147483647
+
+unsigned int
+g1 (unsigned int a)
+{
+  return a >> 24 | a << 8;
+}
+
+unsigned int
+g2 (unsigned int a)
+{
+  return (a & 0xff000000) >> 24 | (a & 0x00ffffff) << 8;
+}
+
+unsigned int
+g3 (unsigned int a)
+{
+  return ((a >> 24) & 0x000000ff) | ((a << 8) & 0xffffff00U);
+}
+
+#endif
+
+int i;
+
+/* { dg-final { scan-tree-dump-times "&" 0 "original" } } */
+/* { dg-final { cleanup-tree-dump "original" } } */

	Jakub


