[gcc(refs/users/acsawdey/heads/memcpy-unaligned-vsx)] rs6000: unaligned VSX in memcpy/memmove expansion
Aaron Sawdey
acsawdey@gcc.gnu.org
Mon Aug 17 19:48:06 GMT 2020
https://gcc.gnu.org/g:c19bbd22a410954ddf01522a066cedf386b5cf45
commit c19bbd22a410954ddf01522a066cedf386b5cf45
Author: Aaron Sawdey <acsawdey@linux.ibm.com>
Date: Fri Aug 7 19:02:20 2020 -0500
rs6000: unaligned VSX in memcpy/memmove expansion
This patch adds a few new instructions to the inline expansion of
memcpy/memmove. Generation of all of these is controlled by the
option -mblock-ops-unaligned-vsx, which is on by default if the
target has TARGET_EFFICIENT_UNALIGNED_VSX.
* unaligned vsx load/store (V2DImode)
* unaligned vsx pair load/store (POImode), which is also controlled
by -mblock-ops-vector-pair in case it is not wanted at some point.
The default for -mblock-ops-vector-pair is on if the target has
TARGET_MMA and TARGET_EFFICIENT_UNALIGNED_VSX. This is redundant,
but it is nice for the future to clearly specify what is required.
* unaligned vsx lxvl/stxvl, but generally only to do the remainder
of a copy/move we started with some vsx loads/stores; also prefer
lb/lh/lw/ld if the remainder is 1/2/4/8 bytes (see the example below).
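As a rough illustration (not part of the patch; the function name and
the 37-byte size are hypothetical), a fixed-size copy like the one
below is the kind of case the new paths target: on a power10 target
the first 32 bytes can be moved by one unaligned VSX pair load/store
and the remaining 5 bytes by lxvl/stxvl, while a 40-byte copy would
instead finish with a single 8-byte gpr load/store.

/* Hypothetical example, not from the patch or the testsuite.  */
#include <string.h>

void
copy37 (char *dst, const char *src)
{
  /* 37 = 32 (VSX pair) + 5 (lxvl/stxvl remainder, not 1/2/4/8).  */
  memcpy (dst, src, 37);
}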
Testing of this is accomplished by gcc.dg/memcmp-1.c, which does two
memcpy() calls for each memcmp(). If the memcpy() calls don't do the
right thing, the memcmp() will fail unexpectedly.
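The shape of that check is roughly the following (a simplified sketch
of the idea, not the actual test source; the helper name and buffer
sizes are made up, and the real test varies lengths and alignments
far more widely):

/* Sketch of the gcc.dg/memcmp-1.c idea: fill two buffers with the
   same bytes via memcpy (the expansion under test), then compare
   them with memcmp; a bad copy shows up as an unexpected result.  */
#include <string.h>
#include <stdlib.h>

static void
check_copy (const char *src, size_t n)
{
  char a[64], b[64];   /* caller guarantees n <= 64 */
  memcpy (a, src, n);
  memcpy (b, src, n);
  if (memcmp (a, b, n) != 0)
    abort ();
}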
gcc/ChangeLog:
* config/rs6000/rs6000-string.c (gen_lxvl_stxvl_move):
Helper function.
(expand_block_move): Add lxvl/stxvl, vector pair, and
unaligned VSX.
* config/rs6000/rs6000.c (rs6000_option_override_internal):
Default value for -mblock-ops-vector-pair.
* config/rs6000/rs6000.opt: Add -mblock-ops-vector-pair.
Diff:
---
gcc/config/rs6000/rs6000-string.c | 105 +++++++++++++++++++++++++++++++++-----
gcc/config/rs6000/rs6000.c | 14 ++++-
gcc/config/rs6000/rs6000.opt | 4 ++
3 files changed, 107 insertions(+), 16 deletions(-)
diff --git a/gcc/config/rs6000/rs6000-string.c b/gcc/config/rs6000/rs6000-string.c
index c35d93180ca..7c481d6fa40 100644
--- a/gcc/config/rs6000/rs6000-string.c
+++ b/gcc/config/rs6000/rs6000-string.c
@@ -2708,6 +2708,36 @@ gen_lvx_v4si_move (rtx dest, rtx src)
return gen_altivec_lvx_v4si_internal (dest, src);
}
+static rtx
+gen_lxvl_stxvl_move (rtx dest, rtx src, int length)
+{
+ gcc_assert (MEM_P (dest) ^ MEM_P (src));
+ gcc_assert (GET_MODE (dest) == V16QImode && GET_MODE (src) == V16QImode);
+ gcc_assert (length <= 16);
+
+ bool is_store = MEM_P (dest);
+
+ /* If the address form is not a simple register, make it so. */
+ if (is_store)
+ {
+ dest = XEXP (dest, 0);
+ if (!REG_P (dest))
+ dest = force_reg (Pmode, dest);
+ }
+ else
+ {
+ src = XEXP (src, 0);
+ if (!REG_P (src))
+ src = force_reg (Pmode, src);
+ }
+
+ rtx len = force_reg (DImode, gen_int_mode (length, DImode));
+ if (is_store)
+ return gen_stxvl (src, dest, len);
+ else
+ return gen_lxvl (dest, src, len);
+}
+
/* Expand a block move operation, and return 1 if successful. Return 0
if we should let the compiler generate normal code.
@@ -2750,18 +2780,56 @@ expand_block_move (rtx operands[], bool might_overlap)
if (bytes > rs6000_block_move_inline_limit)
return 0;
+ int orig_bytes = bytes;
for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
{
union {
- rtx (*movmemsi) (rtx, rtx, rtx, rtx);
rtx (*mov) (rtx, rtx);
+ rtx (*movlen) (rtx, rtx, int);
} gen_func;
machine_mode mode = BLKmode;
rtx src, dest;
-
- /* Altivec first, since it will be faster than a string move
- when it applies, and usually not significantly larger. */
- if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
+ bool move_with_length = false;
+
+ /* Use POImode for paired vsx load/store. Use V2DI for single
+ unaligned vsx load/store, for consistency with what other
+ expansions (compare) already do, and so we can use lxvd2x on
+ p8. Order is VSX pair unaligned, VSX unaligned, Altivec, VSX
+ with length < 16 (if allowed), then gpr load/store. */
+
+ if (TARGET_MMA && TARGET_BLOCK_OPS_UNALIGNED_VSX
+ && TARGET_BLOCK_OPS_VECTOR_PAIR
+ && bytes >= 32
+ && (align >= 256 || !STRICT_ALIGNMENT))
+ {
+ move_bytes = 32;
+ mode = POImode;
+ gen_func.mov = gen_movpoi;
+ }
+ else if (TARGET_POWERPC64 && TARGET_BLOCK_OPS_UNALIGNED_VSX
+ && VECTOR_MEM_VSX_P (V2DImode)
+ && bytes >= 16 && (align >= 128 || !STRICT_ALIGNMENT))
+ {
+ move_bytes = 16;
+ mode = V2DImode;
+ gen_func.mov = gen_vsx_movv2di_64bit;
+ }
+ else if (TARGET_BLOCK_OPS_UNALIGNED_VSX
+ && TARGET_POWER10 && bytes < 16
+ && orig_bytes > 16
+ && !(bytes == 1 || bytes == 2
+ || bytes == 4 || bytes == 8)
+ && (align >= 128 || !STRICT_ALIGNMENT))
+ {
+ /* Only use lxvl/stxvl if it could replace multiple ordinary
+ loads+stores. Also don't use it unless we likely already
+ did one vsx copy so we aren't mixing gpr and vsx. */
+ move_bytes = bytes;
+ mode = V16QImode;
+ gen_func.movlen = gen_lxvl_stxvl_move;
+ move_with_length = true;
+ }
+ else if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
{
move_bytes = 16;
mode = V4SImode;
@@ -2818,7 +2886,14 @@ expand_block_move (rtx operands[], bool might_overlap)
gen_func.mov = gen_movqi;
}
- /* Mode is always set to something other than BLKmode by one of the
+ /* If we can't succeed in doing the move in one pass, we can't
+ do it in the might_overlap case. Bail out and return
+ failure. */
+ if (might_overlap && (num_reg+1) >= MAX_MOVE_REG
+ && bytes > move_bytes)
+ return 0;
+
+ /* Mode is always set to something other than BLKmode by one of the
cases of the if statement above. */
gcc_assert (mode != BLKmode);
@@ -2826,15 +2901,17 @@ expand_block_move (rtx operands[], bool might_overlap)
dest = adjust_address (orig_dest, mode, offset);
rtx tmp_reg = gen_reg_rtx (mode);
-
- loads[num_reg] = (*gen_func.mov) (tmp_reg, src);
- stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
- /* If we didn't succeed in doing it in one pass, we can't do it in the
- might_overlap case. Bail out and return failure. */
- if (might_overlap && num_reg >= MAX_MOVE_REG
- && bytes > move_bytes)
- return 0;
+ if (move_with_length)
+ {
+ loads[num_reg] = (*gen_func.movlen) (tmp_reg, src, move_bytes);
+ stores[num_reg++] = (*gen_func.movlen) (dest, tmp_reg, move_bytes);
+ }
+ else
+ {
+ loads[num_reg] = (*gen_func.mov) (tmp_reg, src);
+ stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
+ }
/* Emit loads and stores saved up. */
if (num_reg >= MAX_MOVE_REG || bytes == move_bytes)
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index fe93cf6ff2b..1c1caa90ede 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -4018,6 +4018,14 @@ rs6000_option_override_internal (bool global_init_p)
rs6000_isa_flags &= ~OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX;
}
+ if (!(rs6000_isa_flags_explicit & OPTION_MASK_BLOCK_OPS_VECTOR_PAIR))
+ {
+ if (TARGET_MMA && TARGET_EFFICIENT_UNALIGNED_VSX)
+ rs6000_isa_flags |= OPTION_MASK_BLOCK_OPS_VECTOR_PAIR;
+ else
+ rs6000_isa_flags &= ~OPTION_MASK_BLOCK_OPS_VECTOR_PAIR;
+ }
+
/* Use long double size to select the appropriate long double. We use
TYPE_PRECISION to differentiate the 3 different long double types. We map
128 into the precision used for TFmode. */
@@ -23222,8 +23230,10 @@ struct rs6000_opt_mask {
static struct rs6000_opt_mask const rs6000_opt_masks[] =
{
{ "altivec", OPTION_MASK_ALTIVEC, false, true },
- { "block-ops-unaligned-vsx", OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX,
- false, true },
+ { "block-ops-unaligned-vsx", OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX,
+ false, true },
+ { "block-ops-vector-pair", OPTION_MASK_BLOCK_OPS_VECTOR_PAIR,
+ false, true },
{ "cmpb", OPTION_MASK_CMPB, false, true },
{ "crypto", OPTION_MASK_CRYPTO, false, true },
{ "direct-move", OPTION_MASK_DIRECT_MOVE, false, true },
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
index 9d3e740e930..4354cf1b898 100644
--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -328,6 +328,10 @@ mblock-ops-unaligned-vsx
Target Report Mask(BLOCK_OPS_UNALIGNED_VSX) Var(rs6000_isa_flags)
Generate unaligned VSX load/store for inline expansion of memcpy/memmove.
+mblock-ops-vector-pair
+Target Undocumented Mask(BLOCK_OPS_VECTOR_PAIR) Var(rs6000_isa_flags)
+Generate unaligned VSX vector pair load/store for inline expansion of memcpy/memmove.
+
mblock-compare-inline-limit=
Target Report Var(rs6000_block_compare_inline_limit) Init(63) RejectNegative Joined UInteger Save
Max number of bytes to compare without loops.