This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug target/60884] [SH] improve inlined strlen-like builtin functions
- From: "olegendo at gcc dot gnu.org" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Sat, 03 May 2014 22:01:25 +0000
- Subject: [Bug target/60884] [SH] improve inlined strlen-like builtin functions
- Auto-submitted: auto-generated
- References: <bug-60884-4 at http dot gcc dot gnu dot org/bugzilla/>
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60884
Oleg Endo <olegendo at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |christian.bruel at st dot com
--- Comment #1 from Oleg Endo <olegendo at gcc dot gnu.org> ---
With the following patch applied to current trunk (r210026)
Index: gcc/config/sh/sh-mem.cc
===================================================================
--- gcc/config/sh/sh-mem.cc (revision 210037)
+++ gcc/config/sh/sh-mem.cc (working copy)
@@ -568,7 +568,7 @@
addr1 = adjust_automodify_address (addr1, SImode, current_addr, 0);
- /*start long loop. */
+ /* start long loop. */
emit_label (L_loop_long);
/* tmp1 is aligned, OK to load. */
@@ -589,29 +589,15 @@
addr1 = adjust_address (addr1, QImode, 0);
/* unroll remaining bytes. */
- emit_insn (gen_extendqisi2 (tmp1, addr1));
- emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
- jump = emit_jump_insn (gen_branch_true (L_return));
- add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+ for (int i = 0; i < 4; ++i)
+ {
+ emit_insn (gen_extendqisi2 (tmp1, addr1));
+ emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
+ emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
+ jump = emit_jump_insn (gen_branch_true (L_return));
+ add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+ }
- emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
-
- emit_insn (gen_extendqisi2 (tmp1, addr1));
- emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
- jump = emit_jump_insn (gen_branch_true (L_return));
- add_int_reg_note (jump, REG_BR_PROB, prob_likely);
-
- emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
-
- emit_insn (gen_extendqisi2 (tmp1, addr1));
- emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
- jump = emit_jump_insn (gen_branch_true (L_return));
- add_int_reg_note (jump, REG_BR_PROB, prob_likely);
-
- emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
-
- emit_insn (gen_extendqisi2 (tmp1, addr1));
- jump = emit_jump_insn (gen_jump_compact (L_return));
emit_barrier_after (jump);
/* start byte loop. */
@@ -626,10 +612,9 @@
/* end loop. */
- emit_insn (gen_addsi3 (start_addr, start_addr, GEN_INT (1)));
-
emit_label (L_return);
+ emit_insn (gen_addsi3 (start_addr, start_addr, GEN_INT (1)));
emit_insn (gen_subsi3 (operands[0], current_addr, start_addr));
return true;
I get the following when compiling
unsigned int test (const char* x)
{
return __builtin_strlen (x);
}
with -O2 -m4:
_test:
mov r4,r0
tst #3,r0
bf/s .L12
mov r4,r1
mov #0,r3
.L4:
mov.l @r1+,r2
cmp/str r3,r2
bf .L4
add #-4,r1
mov.b @r1+,r2
tst r2,r2
bt .L2
mov.b @r1+,r2
tst r2,r2
bt .L2
mov.b @r1+,r2
tst r2,r2
mov #-1,r2
negc r2,r2
add r2,r1
.L2:
mov r1,r0
rts
subc r4,r0
.align 1
.L12:
mov.b @r1+,r2
tst r2,r2
bf/s .L12
mov r1,r0
rts
subc r4,r0
which is 5 insns shorter than the currently expanded sequence.
It seems that the later optimization passes are able to figure out that the 4th
byte load is not needed and eliminate it.
Moving the 'emit_insn (gen_addsi3 ...' after the return label allows it to
utilize the subc insn, which is difficult to get otherwise, as combine looks
only at one BB at a time.
Christian, what do you think? Any objections?