This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH, SH] Add support for inlined builtin-strcmp (1/2)
- From: Christian Bruel <christian dot bruel at st dot com>
- To: "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>
- Cc: Kaz Kojima <kkojima at rr dot iij4u dot or dot jp>
- Date: Thu, 17 Oct 2013 16:13:25 +0200
- Subject: [PATCH, SH] Add support for inlined builtin-strcmp (1/2)
- Authentication-results: sourceware.org; auth=none
Hello,
This patch just reorganizes the SH code used for memory builtins into
its own file, in preparation for the RTL strcmp hoisting in the next part.
OK for trunk?
Thanks
Christian
2013-10-17 Christian Bruel <christian.bruel@st.com>
* config.gcc (sh-*): Add sh-mem.o to extra_objs.
* gcc/config/sh/t-sh (sh-mem.o): New rule.
* gcc/config/sh/sh-mem.c (force_into, expand_block_move): Moved here.
* gcc/config/sh/sh.c (force_into, expand_block_move): Move to sh-mem.c
Index: gcc/config/sh/sh-mem.c
===================================================================
--- gcc/config/sh/sh-mem.c (revision 0)
+++ gcc/config/sh/sh-mem.c (working copy)
@@ -0,0 +1,176 @@
+/* Helper routines for memory move and comparison insns.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "expr.h"
+#include "tm_p.h"
+
+/* Like force_operand, but guarantees that VALUE ends up in TARGET. */
+static void
+force_into (rtx value, rtx target)
+{
+ value = force_operand (value, target);
+ if (! rtx_equal_p (value, target))
+ emit_insn (gen_move_insn (target, value));
+}
+
+/* Emit code to perform a block move. Choose the best method.
+
+ OPERANDS[0] is the destination.
+ OPERANDS[1] is the source.
+ OPERANDS[2] is the size.
+ OPERANDS[3] is the alignment safe to use. */
+bool
+expand_block_move (rtx *operands)
+{
+ int align = INTVAL (operands[3]);
+ int constp = (CONST_INT_P (operands[2]));
+ int bytes = (constp ? INTVAL (operands[2]) : 0);
+
+ if (! constp)
+ return false;
+
+ /* If we could use mov.l to move words and dest is word-aligned, we
+ can use movua.l for loads and still generate a relatively short
+ and efficient sequence. */
+ if (TARGET_SH4A_ARCH && align < 4
+ && MEM_ALIGN (operands[0]) >= 32
+ && can_move_by_pieces (bytes, 32))
+ {
+ rtx dest = copy_rtx (operands[0]);
+ rtx src = copy_rtx (operands[1]);
+ /* We could use different pseudos for each copied word, but
+ since movua can only load into r0, it's kind of
+ pointless. */
+ rtx temp = gen_reg_rtx (SImode);
+ rtx src_addr = copy_addr_to_reg (XEXP (src, 0));
+ int copied = 0;
+
+ while (copied + 4 <= bytes)
+ {
+ rtx to = adjust_address (dest, SImode, copied);
+ rtx from = adjust_automodify_address (src, BLKmode,
+ src_addr, copied);
+
+ set_mem_size (from, 4);
+ emit_insn (gen_movua (temp, from));
+ emit_move_insn (src_addr, plus_constant (Pmode, src_addr, 4));
+ emit_move_insn (to, temp);
+ copied += 4;
+ }
+
+ if (copied < bytes)
+ move_by_pieces (adjust_address (dest, BLKmode, copied),
+ adjust_automodify_address (src, BLKmode,
+ src_addr, copied),
+ bytes - copied, align, 0);
+
+ return true;
+ }
+
+ /* If it isn't a constant number of bytes, or if it doesn't have 4 byte
+ alignment, or if it isn't a multiple of 4 bytes, then fail. */
+ if (align < 4 || (bytes % 4 != 0))
+ return false;
+
+ if (TARGET_HARD_SH4)
+ {
+ if (bytes < 12)
+ return false;
+ else if (bytes == 12)
+ {
+ rtx func_addr_rtx = gen_reg_rtx (Pmode);
+ rtx r4 = gen_rtx_REG (SImode, 4);
+ rtx r5 = gen_rtx_REG (SImode, 5);
+
+ function_symbol (func_addr_rtx, "__movmemSI12_i4", SFUNC_STATIC);
+ force_into (XEXP (operands[0], 0), r4);
+ force_into (XEXP (operands[1], 0), r5);
+ emit_insn (gen_block_move_real_i4 (func_addr_rtx));
+ return true;
+ }
+ else if (! optimize_size)
+ {
+ const char *entry_name;
+ rtx func_addr_rtx = gen_reg_rtx (Pmode);
+ int dwords;
+ rtx r4 = gen_rtx_REG (SImode, 4);
+ rtx r5 = gen_rtx_REG (SImode, 5);
+ rtx r6 = gen_rtx_REG (SImode, 6);
+
+ entry_name = (bytes & 4 ? "__movmem_i4_odd" : "__movmem_i4_even");
+ function_symbol (func_addr_rtx, entry_name, SFUNC_STATIC);
+ force_into (XEXP (operands[0], 0), r4);
+ force_into (XEXP (operands[1], 0), r5);
+
+ dwords = bytes >> 3;
+ emit_insn (gen_move_insn (r6, GEN_INT (dwords - 1)));
+ emit_insn (gen_block_lump_real_i4 (func_addr_rtx));
+ return true;
+ }
+ else
+ return false;
+ }
+ if (bytes < 64)
+ {
+ char entry[30];
+ rtx func_addr_rtx = gen_reg_rtx (Pmode);
+ rtx r4 = gen_rtx_REG (SImode, 4);
+ rtx r5 = gen_rtx_REG (SImode, 5);
+
+ sprintf (entry, "__movmemSI%d", bytes);
+ function_symbol (func_addr_rtx, entry, SFUNC_STATIC);
+ force_into (XEXP (operands[0], 0), r4);
+ force_into (XEXP (operands[1], 0), r5);
+ emit_insn (gen_block_move_real (func_addr_rtx));
+ return true;
+ }
+
+ /* This is the same number of bytes as a memcpy call, but to a different
+ less common function name, so this will occasionally use more space. */
+ if (! optimize_size)
+ {
+ rtx func_addr_rtx = gen_reg_rtx (Pmode);
+ int final_switch, while_loop;
+ rtx r4 = gen_rtx_REG (SImode, 4);
+ rtx r5 = gen_rtx_REG (SImode, 5);
+ rtx r6 = gen_rtx_REG (SImode, 6);
+
+ function_symbol (func_addr_rtx, "__movmem", SFUNC_STATIC);
+ force_into (XEXP (operands[0], 0), r4);
+ force_into (XEXP (operands[1], 0), r5);
+
+ /* r6 controls the size of the move. 16 is decremented from it
+ for each 64 bytes moved. Then the negative bit left over is used
+ as an index into a list of move instructions. e.g., a 72 byte move
+ would be set up with size(r6) = 14, for one iteration through the
+ big while loop, and a switch of -2 for the last part. */
+
+ final_switch = 16 - ((bytes / 4) % 16);
+ while_loop = ((bytes / 4) / 16 - 1) * 16;
+ emit_insn (gen_move_insn (r6, GEN_INT (while_loop + final_switch)));
+ emit_insn (gen_block_lump_real (func_addr_rtx));
+ return true;
+ }
+
+ return false;
+}
Index: gcc/config/sh/sh.c
===================================================================
--- gcc/config/sh/sh.c (revision 203751)
+++ gcc/config/sh/sh.c (working copy)
@@ -174,7 +174,6 @@ static bool shmedia_space_reserved_for_target_regi
static void split_branches (rtx);
static int branch_dest (rtx);
-static void force_into (rtx, rtx);
static void print_slot (rtx);
static rtx add_constant (rtx, enum machine_mode, rtx);
static void dump_table (rtx, rtx);
@@ -1621,157 +1620,6 @@ sh_encode_section_info (tree decl, rtx rtl, int fi
SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FUNCVEC_FUNCTION;
}
-/* Like force_operand, but guarantees that VALUE ends up in TARGET. */
-static void
-force_into (rtx value, rtx target)
-{
- value = force_operand (value, target);
- if (! rtx_equal_p (value, target))
- emit_insn (gen_move_insn (target, value));
-}
-
-/* Emit code to perform a block move. Choose the best method.
-
- OPERANDS[0] is the destination.
- OPERANDS[1] is the source.
- OPERANDS[2] is the size.
- OPERANDS[3] is the alignment safe to use. */
-bool
-expand_block_move (rtx *operands)
-{
- int align = INTVAL (operands[3]);
- int constp = (CONST_INT_P (operands[2]));
- int bytes = (constp ? INTVAL (operands[2]) : 0);
-
- if (! constp)
- return false;
-
- /* If we could use mov.l to move words and dest is word-aligned, we
- can use movua.l for loads and still generate a relatively short
- and efficient sequence. */
- if (TARGET_SH4A_ARCH && align < 4
- && MEM_ALIGN (operands[0]) >= 32
- && can_move_by_pieces (bytes, 32))
- {
- rtx dest = copy_rtx (operands[0]);
- rtx src = copy_rtx (operands[1]);
- /* We could use different pseudos for each copied word, but
- since movua can only load into r0, it's kind of
- pointless. */
- rtx temp = gen_reg_rtx (SImode);
- rtx src_addr = copy_addr_to_reg (XEXP (src, 0));
- int copied = 0;
-
- while (copied + 4 <= bytes)
- {
- rtx to = adjust_address (dest, SImode, copied);
- rtx from = adjust_automodify_address (src, BLKmode,
- src_addr, copied);
-
- set_mem_size (from, 4);
- emit_insn (gen_movua (temp, from));
- emit_move_insn (src_addr, plus_constant (Pmode, src_addr, 4));
- emit_move_insn (to, temp);
- copied += 4;
- }
-
- if (copied < bytes)
- move_by_pieces (adjust_address (dest, BLKmode, copied),
- adjust_automodify_address (src, BLKmode,
- src_addr, copied),
- bytes - copied, align, 0);
-
- return true;
- }
-
- /* If it isn't a constant number of bytes, or if it doesn't have 4 byte
- alignment, or if it isn't a multiple of 4 bytes, then fail. */
- if (align < 4 || (bytes % 4 != 0))
- return false;
-
- if (TARGET_HARD_SH4)
- {
- if (bytes < 12)
- return false;
- else if (bytes == 12)
- {
- rtx func_addr_rtx = gen_reg_rtx (Pmode);
- rtx r4 = gen_rtx_REG (SImode, 4);
- rtx r5 = gen_rtx_REG (SImode, 5);
-
- function_symbol (func_addr_rtx, "__movmemSI12_i4", SFUNC_STATIC);
- force_into (XEXP (operands[0], 0), r4);
- force_into (XEXP (operands[1], 0), r5);
- emit_insn (gen_block_move_real_i4 (func_addr_rtx));
- return true;
- }
- else if (! optimize_size)
- {
- const char *entry_name;
- rtx func_addr_rtx = gen_reg_rtx (Pmode);
- int dwords;
- rtx r4 = gen_rtx_REG (SImode, 4);
- rtx r5 = gen_rtx_REG (SImode, 5);
- rtx r6 = gen_rtx_REG (SImode, 6);
-
- entry_name = (bytes & 4 ? "__movmem_i4_odd" : "__movmem_i4_even");
- function_symbol (func_addr_rtx, entry_name, SFUNC_STATIC);
- force_into (XEXP (operands[0], 0), r4);
- force_into (XEXP (operands[1], 0), r5);
-
- dwords = bytes >> 3;
- emit_insn (gen_move_insn (r6, GEN_INT (dwords - 1)));
- emit_insn (gen_block_lump_real_i4 (func_addr_rtx));
- return true;
- }
- else
- return false;
- }
- if (bytes < 64)
- {
- char entry[30];
- rtx func_addr_rtx = gen_reg_rtx (Pmode);
- rtx r4 = gen_rtx_REG (SImode, 4);
- rtx r5 = gen_rtx_REG (SImode, 5);
-
- sprintf (entry, "__movmemSI%d", bytes);
- function_symbol (func_addr_rtx, entry, SFUNC_STATIC);
- force_into (XEXP (operands[0], 0), r4);
- force_into (XEXP (operands[1], 0), r5);
- emit_insn (gen_block_move_real (func_addr_rtx));
- return true;
- }
-
- /* This is the same number of bytes as a memcpy call, but to a different
- less common function name, so this will occasionally use more space. */
- if (! optimize_size)
- {
- rtx func_addr_rtx = gen_reg_rtx (Pmode);
- int final_switch, while_loop;
- rtx r4 = gen_rtx_REG (SImode, 4);
- rtx r5 = gen_rtx_REG (SImode, 5);
- rtx r6 = gen_rtx_REG (SImode, 6);
-
- function_symbol (func_addr_rtx, "__movmem", SFUNC_STATIC);
- force_into (XEXP (operands[0], 0), r4);
- force_into (XEXP (operands[1], 0), r5);
-
- /* r6 controls the size of the move. 16 is decremented from it
- for each 64 bytes moved. Then the negative bit left over is used
- as an index into a list of move instructions. e.g., a 72 byte move
- would be set up with size(r6) = 14, for one iteration through the
- big while loop, and a switch of -2 for the last part. */
-
- final_switch = 16 - ((bytes / 4) % 16);
- while_loop = ((bytes / 4) / 16 - 1) * 16;
- emit_insn (gen_move_insn (r6, GEN_INT (while_loop + final_switch)));
- emit_insn (gen_block_lump_real (func_addr_rtx));
- return true;
- }
-
- return false;
-}
-
/* Prepare operands for a move define_expand; specifically, one of the
operands must be in a register. */
void
Index: gcc/config/sh/t-sh
===================================================================
--- gcc/config/sh/t-sh (revision 203751)
+++ gcc/config/sh/t-sh (working copy)
@@ -16,6 +16,10 @@
# along with GCC; see the file COPYING3. If not see
# <http://www.gnu.org/licenses/>.
+sh-mem.o: $(srcdir)/config/sh/sh-mem.c \
+ $(CONFIG_H) $(SYSTEM_H) $(TREE_H) $(TM_H) $(TM_P_H)
+ $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) $<
+
sh-c.o: $(srcdir)/config/sh/sh-c.c \
$(CONFIG_H) $(SYSTEM_H) $(TREE_H) $(TM_H) $(TM_P_H) coretypes.h
$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
Index: gcc/config.gcc
===================================================================
--- gcc/config.gcc (revision 203751)
+++ gcc/config.gcc (working copy)
@@ -465,7 +465,7 @@ sh[123456789lbe]*-*-* | sh-*-*)
cpu_type=sh
need_64bit_hwint=yes
extra_options="${extra_options} fused-madd.opt"
- extra_objs="${extra_objs} sh_treg_combine.o"
+ extra_objs="${extra_objs} sh_treg_combine.o sh-mem.o"
;;
v850*-*-*)
cpu_type=v850