This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug target/60884] [SH] improve inlined strlen-like builtin functions
- From: "olegendo at gcc dot gnu.org" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Sat, 03 May 2014 22:01:25 +0000
- Subject: [Bug target/60884] [SH] improve inlined strlen-like builtin functions
- Auto-submitted: auto-generated
- References: <bug-60884-4 at http dot gcc dot gnu dot org/bugzilla/>
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60884
Oleg Endo <olegendo at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |christian.bruel at st dot com
--- Comment #1 from Oleg Endo <olegendo at gcc dot gnu.org> ---
With the following patch applied to current trunk (r210026)
Index: gcc/config/sh/sh-mem.cc
===================================================================
--- gcc/config/sh/sh-mem.cc (revision 210037)
+++ gcc/config/sh/sh-mem.cc (working copy)
@@ -568,7 +568,7 @@
addr1 = adjust_automodify_address (addr1, SImode, current_addr, 0);
- /*start long loop. */
+ /* start long loop. */
emit_label (L_loop_long);
/* tmp1 is aligned, OK to load. */
@@ -589,29 +589,15 @@
addr1 = adjust_address (addr1, QImode, 0);
/* unroll remaining bytes. */
- emit_insn (gen_extendqisi2 (tmp1, addr1));
- emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
- jump = emit_jump_insn (gen_branch_true (L_return));
- add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+ for (int i = 0; i < 4; ++i)
+ {
+ emit_insn (gen_extendqisi2 (tmp1, addr1));
+ emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
+ emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
+ jump = emit_jump_insn (gen_branch_true (L_return));
+ add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+ }
- emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
-
- emit_insn (gen_extendqisi2 (tmp1, addr1));
- emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
- jump = emit_jump_insn (gen_branch_true (L_return));
- add_int_reg_note (jump, REG_BR_PROB, prob_likely);
-
- emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
-
- emit_insn (gen_extendqisi2 (tmp1, addr1));
- emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
- jump = emit_jump_insn (gen_branch_true (L_return));
- add_int_reg_note (jump, REG_BR_PROB, prob_likely);
-
- emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
-
- emit_insn (gen_extendqisi2 (tmp1, addr1));
- jump = emit_jump_insn (gen_jump_compact (L_return));
emit_barrier_after (jump);
/* start byte loop. */
@@ -626,10 +612,9 @@
/* end loop. */
- emit_insn (gen_addsi3 (start_addr, start_addr, GEN_INT (1)));
-
emit_label (L_return);
+ emit_insn (gen_addsi3 (start_addr, start_addr, GEN_INT (1)));
emit_insn (gen_subsi3 (operands[0], current_addr, start_addr));
return true;
I get the following when compiling
unsigned int test (const char* x)
{
return __builtin_strlen (x);
}
with -O2 -m4:
_test:
mov r4,r0
tst #3,r0
bf/s .L12
mov r4,r1
mov #0,r3
.L4:
mov.l @r1+,r2
cmp/str r3,r2
bf .L4
add #-4,r1
mov.b @r1+,r2
tst r2,r2
bt .L2
mov.b @r1+,r2
tst r2,r2
bt .L2
mov.b @r1+,r2
tst r2,r2
mov #-1,r2
negc r2,r2
add r2,r1
.L2:
mov r1,r0
rts
subc r4,r0
.align 1
.L12:
mov.b @r1+,r2
tst r2,r2
bf/s .L12
mov r1,r0
rts
subc r4,r0
which is 5 insns shorter than the currently expanded sequence.
It seems that the later optimization passes are able to figure out that the 4th
byte load is not needed and eliminate it.
Moving the 'emit_insn (gen_addsi3 ...' after the return label allows it to
utilize the subc insn, which is difficult to get otherwise, as combine looks
only at one BB at a time.
Christian, what do you think? Any objections?