This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH, SH] Improve builtin strncmp for small lengths


Hello,

This patch unrolls string compare for length < 8 and residual bytes
after the word at a time loops (with cmp/str), using base+offset
addressing mode.
It also allows the builtin to be inlined for non-constant lengths.

No new regressions. Updated the test case to cover the small-length and non-constant-length cases.

OK for trunk ?

thanks






2014-01-09  Christian Bruel  <christian.bruel@st.com>

	* gcc/config/sh/sh-mem.cc (sh_expand_cmpnstr): Unroll small sizes and
	  optimize non-constant lengths.

2014-01-09  Christian Bruel  <christian.bruel@st.com>

	* gcc.target/sh/cmpstrn.c: New case.

Index: gcc/config/sh/sh-mem.cc
===================================================================
--- gcc/config/sh/sh-mem.cc	(revision 206385)
+++ gcc/config/sh/sh-mem.cc	(working copy)
@@ -324,7 +324,6 @@ sh_expand_cmpnstr (rtx *operands)
   rtx addr2 = operands[2];
   rtx s1_addr = copy_addr_to_reg (XEXP (addr1, 0));
   rtx s2_addr = copy_addr_to_reg (XEXP (addr2, 0));
-  rtx tmp0 = gen_reg_rtx (SImode);
   rtx tmp1 = gen_reg_rtx (SImode);
   rtx tmp2 = gen_reg_rtx (SImode);
 
@@ -334,98 +333,128 @@ sh_expand_cmpnstr (rtx *operands)
   rtx L_end_loop_byte = gen_label_rtx ();
 
   rtx len = force_reg (SImode, operands[3]);
-  int constp = (CONST_INT_P (operands[3]));
-  int bytes = (constp ? INTVAL (operands[3]) : 0);
-  int witers = bytes / 4;
+  int constp = CONST_INT_P (operands[3]);
 
-  /* We could still loop on a register count. Not found very
-     convincing to optimize yet.  */
-  if (! constp)
-    return false;
+  /* Loop on a register count. */
+  if (constp)
+    {
+      rtx tmp0 = gen_reg_rtx (SImode);
+      rtx tmp3 = gen_reg_rtx (SImode);
+      rtx lenw = gen_reg_rtx (SImode);
 
-  if (witers > 1)
-    {
       rtx L_loop_long = gen_label_rtx ();
       rtx L_end_loop_long = gen_label_rtx ();
-      rtx tmp3 = gen_reg_rtx (SImode);
-      rtx lenw = gen_reg_rtx (SImode);
+      rtx L_small = gen_label_rtx ();
+
       int align = INTVAL (operands[4]);
+      int bytes = INTVAL (operands[3]);
+      int witers = bytes / 4;
 
-      emit_move_insn (tmp0, const0_rtx);
+      if (witers > 1)
+        {
+          addr1 = adjust_automodify_address (addr1, SImode, s1_addr, 0);
+          addr2 = adjust_automodify_address (addr2, SImode, s2_addr, 0);
 
-      if (align < 4)
-	{
-	  emit_insn (gen_iorsi3 (tmp1, s1_addr, s2_addr));
-	  emit_insn (gen_tstsi_t (GEN_INT (3), tmp1));
-	  jump = emit_jump_insn (gen_branch_false (L_loop_byte));
-	  add_int_reg_note (jump, REG_BR_PROB, prob_likely);
-	}
+          emit_move_insn (tmp0, const0_rtx);
 
-      addr1 = adjust_automodify_address (addr1, SImode, s1_addr, 0);
-      addr2 = adjust_automodify_address (addr2, SImode, s2_addr, 0);
+          if (align < 4)
+            {
+              emit_insn (gen_iorsi3 (tmp1, s1_addr, s2_addr));
+              emit_insn (gen_tstsi_t (GEN_INT (3), tmp1));
+              jump = emit_jump_insn (gen_branch_false (L_loop_byte));
+              add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+            }
 
-      /* word count. Do we have iterations ? */
-      emit_insn (gen_lshrsi3 (lenw, len, GEN_INT (2)));
+          /* word count. Do we have iterations ? */
+          emit_insn (gen_lshrsi3 (lenw, len, GEN_INT (2)));
 
-      /*start long loop.  */
-      emit_label (L_loop_long);
+          /*start long loop.  */
+          emit_label (L_loop_long);
 
-      /* tmp2 is aligned, OK to load.  */
-      emit_move_insn (tmp2, addr2);
-      emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, 4));
+          /* tmp2 is aligned, OK to load.  */
+          emit_move_insn (tmp2, addr2);
+          emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, GET_MODE_SIZE (SImode)));
 
-      /* tmp1 is aligned, OK to load.  */
-      emit_move_insn (tmp1, addr1);
-      emit_move_insn (s1_addr, plus_constant (Pmode, s1_addr, 4));
+          /* tmp1 is aligned, OK to load.  */
+          emit_move_insn (tmp1, addr1);
+          emit_move_insn (s1_addr, plus_constant (Pmode, s1_addr, GET_MODE_SIZE (SImode)));
 
-      /* Is there a 0 byte ?  */
-      emit_insn (gen_andsi3 (tmp3, tmp2, tmp1));
+          /* Is there a 0 byte ?  */
+          emit_insn (gen_andsi3 (tmp3, tmp2, tmp1));
 
-      emit_insn (gen_cmpstr_t (tmp0, tmp3));
-      jump = emit_jump_insn (gen_branch_true (L_end_loop_long));
-      add_int_reg_note (jump, REG_BR_PROB, prob_unlikely);
+          emit_insn (gen_cmpstr_t (tmp0, tmp3));
+          jump = emit_jump_insn (gen_branch_true (L_end_loop_long));
+          add_int_reg_note (jump, REG_BR_PROB, prob_unlikely);
 
-      emit_insn (gen_cmpeqsi_t (tmp1, tmp2));
-      jump = emit_jump_insn (gen_branch_false (L_end_loop_long));
-      add_int_reg_note (jump, REG_BR_PROB, prob_unlikely);
+          emit_insn (gen_cmpeqsi_t (tmp1, tmp2));
+          jump = emit_jump_insn (gen_branch_false (L_end_loop_long));
+          add_int_reg_note (jump, REG_BR_PROB, prob_unlikely);
 
-      if (TARGET_SH2)
-	emit_insn (gen_dect (lenw, lenw));
-      else
-	{
-	  emit_insn (gen_addsi3 (lenw, lenw, GEN_INT (-1)));
-	  emit_insn (gen_tstsi_t (lenw, lenw));
-	}
-      jump = emit_jump_insn (gen_branch_false (L_loop_long));
-      add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+          if (TARGET_SH2)
+            emit_insn (gen_dect (lenw, lenw));
+          else
+            {
+              emit_insn (gen_addsi3 (lenw, lenw, GEN_INT (-1)));
+              emit_insn (gen_tstsi_t (lenw, lenw));
+            }
 
-      /* end loop.  Reached max iterations.  */
-      if (bytes % 4 == 0)
-	{
-	  /* Done.  */
-	  jump = emit_jump_insn (gen_jump_compact (L_return));
-	  emit_barrier_after (jump);
-	}
-      else
-	{
-	  /* Remaining bytes to read.   */
-	  emit_move_insn (len, GEN_INT (bytes % 4));
-	  jump = emit_jump_insn (gen_jump_compact (L_loop_byte));
-	  emit_barrier_after (jump);
-	}
+          jump = emit_jump_insn (gen_branch_false (L_loop_long));
+          add_int_reg_note (jump, REG_BR_PROB, prob_likely);
 
-      emit_label (L_end_loop_long);
+          /* end loop.  Reached max iterations.  */
+          if (bytes % 4 == 0)
+            {
+              /* Done.  */
+              jump = emit_jump_insn (gen_jump_compact (L_return));
+              emit_barrier_after (jump);
+            }
+          else
+            {
+              /* Remaining bytes to read.   */
+              jump = emit_jump_insn (gen_jump_compact (L_small));
+              emit_barrier_after (jump);
+            }
 
-      /* Remaining bytes to read.   */
-      emit_move_insn (len, GEN_INT (4));
+          emit_label (L_end_loop_long);
 
-      /* Found last word.  Restart it byte per byte. */
-      emit_move_insn (s1_addr, plus_constant (Pmode, s1_addr, -4));
-      emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, -4));
+          /* Found last word.  Restart it byte per byte. */
+          bytes =  4;
+          emit_move_insn (s1_addr, plus_constant (Pmode, s1_addr, -GET_MODE_SIZE (SImode)));
+          emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, -GET_MODE_SIZE (SImode)));
+        }
+
+      emit_label (L_small);
+
+      gcc_assert (bytes <= 7);
+
+      addr1 = adjust_automodify_address (addr1, QImode, s1_addr, 0);
+      addr2 = adjust_automodify_address (addr2, QImode, s2_addr, 0);
+
+      while (bytes--)
+        {
+          emit_insn (gen_extendqisi2 (tmp1, addr1));
+          emit_insn (gen_extendqisi2 (tmp2, addr2));
+
+          emit_insn (gen_cmpeqsi_t (tmp2, const0_rtx));
+          jump = emit_jump_insn (gen_branch_true (L_end_loop_byte));
+          add_int_reg_note (jump, REG_BR_PROB, prob_unlikely);
+
+          emit_insn (gen_cmpeqsi_t (tmp1, tmp2));
+          if (flag_delayed_branch)
+            emit_insn (gen_zero_extendqisi2 (tmp2, gen_lowpart (QImode, tmp2)));
+          jump = emit_jump_insn (gen_branch_false (L_end_loop_byte));
+          add_int_reg_note (jump, REG_BR_PROB, prob_unlikely);
+
+          addr1 = adjust_address (addr1, QImode, GET_MODE_SIZE (QImode));
+          addr2 = adjust_address (addr2, QImode, GET_MODE_SIZE (QImode));
+        }
+
+      jump = emit_jump_insn (gen_jump_compact( L_end_loop_byte));
+      emit_barrier_after (jump);
     }
 
-  addr1 = adjust_address (addr1, QImode, 0);
-  addr2 = adjust_address (addr2, QImode, 0);
+  addr1 = adjust_automodify_address (addr1, QImode, s1_addr, 0);
+  addr2 = adjust_automodify_address (addr2, QImode, s2_addr, 0);
 
   emit_label (L_loop_byte);
 
Index: gcc/testsuite/gcc.target/sh/cmpstrn.c
===================================================================
--- gcc/testsuite/gcc.target/sh/cmpstrn.c	(revision 206385)
+++ gcc/testsuite/gcc.target/sh/cmpstrn.c	(working copy)
@@ -6,16 +6,23 @@
 /* { dg-final { scan-assembler-not "jmp" } } */
 /* { dg-final { scan-assembler-times "cmp/str" 1 } } */
 
-/* Test that the cmp/str loop is optimized out.  */
-test01(const char *s1, const char *s2, int n)
+/* Test that cmp/str is not used for small lengths.  */
+test01(const char *s1)
 {
   return __builtin_strncmp (s1, "abcde", 3);
 }
 
 /* Test that the cmp/str loop is used.  */
-test02(const char *s1, const char *s2, int n)
+test02(const char *s1)
 {
   return __builtin_strncmp (s1, "abcdefghi", 8);
 }
 
+/* Test that no call is generated  */
+test03(const char *s1, int n)
+{
+  return __builtin_strncmp (s1, "abcde", n);
+}
 
+
+

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]