This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH, rs6000] Allow memmove/memset builtin expansion to use unaligned vsx on p8/p9
- From: Aaron Sawdey <acsawdey at linux dot vnet dot ibm dot com>
- To: gcc-patches at gcc dot gnu dot org
- Cc: Segher Boessenkool <segher at kernel dot crashing dot org>, David Edelsohn <dje dot gcc at gmail dot com>, Bill Schmidt <wschmidt at linux dot vnet dot ibm dot com>
- Date: Wed, 13 Dec 2017 14:07:44 -0600
- Subject: [PATCH, rs6000] Allow memmove/memset builtin expansion to use unaligned vsx on p8/p9
- Authentication-results: sourceware.org; auth=none
This patch allows the use of unaligned VSX loads/stores for builtin
expansion of memset and memmove on P8/P9. Performance of unaligned VSX
instructions is good on these processors.
OK for trunk if bootstrap/regtest on ppc64le passes?
2017-12-13 Aaron Sawdey <acsawdey@linux.vnet.ibm.com>
* config/rs6000/rs6000-string.c (expand_block_move): Allow the use of
unaligned VSX load/store on P8/P9.
(expand_block_clear): Allow the use of unaligned VSX load/store on P8/P9.
--
Aaron Sawdey, Ph.D. acsawdey@linux.vnet.ibm.com
050-2/C113 (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain
Index: gcc/config/rs6000/rs6000-string.c
===================================================================
--- gcc/config/rs6000/rs6000-string.c (revision 255585)
+++ gcc/config/rs6000/rs6000-string.c (working copy)
@@ -67,11 +67,14 @@
if (bytes <= 0)
return 1;
+ bool isP8 = (rs6000_cpu == PROCESSOR_POWER8);
+ bool isP9 = (rs6000_cpu == PROCESSOR_POWER9);
+
/* Use the builtin memset after a point, to avoid huge code bloat.
When optimize_size, avoid any significant code bloat; calling
memset is about 4 instructions, so allow for one instruction to
load zero and three to do clearing. */
- if (TARGET_ALTIVEC && align >= 128)
+ if (TARGET_ALTIVEC && (align >= 128 || isP8 || isP9))
clear_step = 16;
else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
clear_step = 8;
@@ -88,7 +91,7 @@
machine_mode mode = BLKmode;
rtx dest;
- if (bytes >= 16 && TARGET_ALTIVEC && align >= 128)
+ if (bytes >= 16 && TARGET_ALTIVEC && (align >= 128 || isP8 || isP9))
{
clear_bytes = 16;
mode = V4SImode;
@@ -1247,6 +1250,9 @@
if (bytes > rs6000_block_move_inline_limit)
return 0;
+ bool isP8 = (rs6000_cpu == PROCESSOR_POWER8);
+ bool isP9 = (rs6000_cpu == PROCESSOR_POWER9);
+
for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
{
union {
@@ -1258,7 +1264,7 @@
/* Altivec first, since it will be faster than a string move
when it applies, and usually not significantly larger. */
- if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
+ if (TARGET_ALTIVEC && bytes >= 16 && (isP8 || isP9 || align >= 128))
{
move_bytes = 16;
mode = V4SImode;