[PATCH] Simplify ix86_expand_vector_move_misalign
- From: "H.J. Lu" <hongjiu dot lu at intel dot com>
- To: gcc-patches at gcc dot gnu dot org
- Cc: Uros Bizjak <ubizjak at gmail dot com>
- Date: Tue, 19 Apr 2016 07:48:43 -0700
- Subject: [PATCH] Simplify ix86_expand_vector_move_misalign
- Reply-to: "H.J. Lu" <hjl dot tools at gmail dot com>
Since the mov<mode>_internal patterns handle both aligned and unaligned
loads and stores, we can simplify ix86_avx256_split_vector_move_misalign
and ix86_expand_vector_move_misalign.
Tested on x86-64. OK for trunk?
H.J.
---
* config/i386/i386.c (ix86_avx256_split_vector_move_misalign):
Short-cut unaligned load and store cases. Handle all integer
vector modes.
(ix86_expand_vector_move_misalign): Short-cut unaligned load
and store cases. Call ix86_avx256_split_vector_move_misalign
directly without checking mode class.
---
gcc/config/i386/i386.c | 252 ++++++++++++++++---------------------------------
1 file changed, 81 insertions(+), 171 deletions(-)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 4e48572..e056f68 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -18820,7 +18820,39 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
rtx (*extract) (rtx, rtx, rtx);
machine_mode mode;
- switch (GET_MODE (op0))
+ if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
+ || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
+ {
+ emit_insn (gen_rtx_SET (op0, op1));
+ return;
+ }
+
+ rtx orig_op0 = NULL_RTX;
+ mode = GET_MODE (op0);
+ switch (GET_MODE_CLASS (mode))
+ {
+ case MODE_VECTOR_INT:
+ case MODE_INT:
+ if (mode != V32QImode)
+ {
+ if (!MEM_P (op0))
+ {
+ orig_op0 = op0;
+ op0 = gen_reg_rtx (V32QImode);
+ }
+ else
+ op0 = gen_lowpart (V32QImode, op0);
+ op1 = gen_lowpart (V32QImode, op1);
+ mode = V32QImode;
+ }
+ break;
+ case MODE_VECTOR_FLOAT:
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ switch (mode)
{
default:
gcc_unreachable ();
@@ -18840,34 +18872,25 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
if (MEM_P (op1))
{
- if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD
- && optimize_insn_for_speed_p ())
- {
- rtx r = gen_reg_rtx (mode);
- m = adjust_address (op1, mode, 0);
- emit_move_insn (r, m);
- m = adjust_address (op1, mode, 16);
- r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
- emit_move_insn (op0, r);
- }
- else
- emit_insn (gen_rtx_SET (op0, op1));
+ rtx r = gen_reg_rtx (mode);
+ m = adjust_address (op1, mode, 0);
+ emit_move_insn (r, m);
+ m = adjust_address (op1, mode, 16);
+ r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
+ emit_move_insn (op0, r);
}
else if (MEM_P (op0))
{
- if (TARGET_AVX256_SPLIT_UNALIGNED_STORE
- && optimize_insn_for_speed_p ())
- {
- m = adjust_address (op0, mode, 0);
- emit_insn (extract (m, op1, const0_rtx));
- m = adjust_address (op0, mode, 16);
- emit_insn (extract (m, op1, const1_rtx));
- }
- else
- emit_insn (gen_rtx_SET (op0, op1));
+ m = adjust_address (op0, mode, 0);
+ emit_insn (extract (m, op1, const0_rtx));
+ m = adjust_address (op0, mode, 16);
+ emit_insn (extract (m, op1, const1_rtx));
}
else
gcc_unreachable ();
+
+ if (orig_op0)
+ emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
}
/* Implement the movmisalign patterns for SSE. Non-SSE modes go
@@ -18925,118 +18948,50 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
void
ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
{
- rtx op0, op1, orig_op0 = NULL_RTX, m;
+ rtx op0, op1, m;
op0 = operands[0];
op1 = operands[1];
- if (GET_MODE_SIZE (mode) == 64)
+ /* Use unaligned load/store for AVX512 or when optimizing for size. */
+ if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
{
- switch (GET_MODE_CLASS (mode))
- {
- case MODE_VECTOR_INT:
- case MODE_INT:
- if (GET_MODE (op0) != V16SImode)
- {
- if (!MEM_P (op0))
- {
- orig_op0 = op0;
- op0 = gen_reg_rtx (V16SImode);
- }
- else
- op0 = gen_lowpart (V16SImode, op0);
- }
- op1 = gen_lowpart (V16SImode, op1);
- /* FALLTHRU */
-
- case MODE_VECTOR_FLOAT:
-
- emit_insn (gen_rtx_SET (op0, op1));
- if (orig_op0)
- emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
- break;
-
- default:
- gcc_unreachable ();
- }
-
+ emit_insn (gen_rtx_SET (op0, op1));
return;
}
- if (TARGET_AVX
- && GET_MODE_SIZE (mode) == 32)
+ if (TARGET_AVX)
{
- switch (GET_MODE_CLASS (mode))
- {
- case MODE_VECTOR_INT:
- case MODE_INT:
- if (GET_MODE (op0) != V32QImode)
- {
- if (!MEM_P (op0))
- {
- orig_op0 = op0;
- op0 = gen_reg_rtx (V32QImode);
- }
- else
- op0 = gen_lowpart (V32QImode, op0);
- }
- op1 = gen_lowpart (V32QImode, op1);
- /* FALLTHRU */
-
- case MODE_VECTOR_FLOAT:
- ix86_avx256_split_vector_move_misalign (op0, op1);
- if (orig_op0)
- emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
- break;
+ if (GET_MODE_SIZE (mode) == 32)
+ ix86_avx256_split_vector_move_misalign (op0, op1);
+ else
+ /* Always use 128-bit mov<mode>_internal pattern for AVX. */
+ emit_insn (gen_rtx_SET (op0, op1));
+ return;
+ }
- default:
- gcc_unreachable ();
- }
+ if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
+ || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
+ {
+ emit_insn (gen_rtx_SET (op0, op1));
+ return;
+ }
+ /* ??? If we have typed data, then it would appear that using
+ movdqu is the only way to get unaligned data loaded with
+ integer type. */
+ if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+ {
+ emit_insn (gen_rtx_SET (op0, op1));
return;
}
if (MEM_P (op1))
{
- /* Normal *mov<mode>_internal pattern will handle
- unaligned loads just fine if misaligned_operand
- is true, and without the UNSPEC it can be combined
- with arithmetic instructions. */
- if (TARGET_AVX
- && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
- || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
- && misaligned_operand (op1, GET_MODE (op1)))
- emit_insn (gen_rtx_SET (op0, op1));
- /* ??? If we have typed data, then it would appear that using
- movdqu is the only way to get unaligned data loaded with
- integer type. */
- else if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
- {
- if (GET_MODE (op0) != V16QImode)
- {
- orig_op0 = op0;
- op0 = gen_reg_rtx (V16QImode);
- }
- op1 = gen_lowpart (V16QImode, op1);
- /* We will eventually emit movups based on insn attributes. */
- emit_insn (gen_rtx_SET (op0, op1));
- if (orig_op0)
- emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
- }
- else if (TARGET_SSE2 && mode == V2DFmode)
+ if (TARGET_SSE2 && mode == V2DFmode)
{
rtx zero;
- if (TARGET_AVX
- || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
- || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
- || optimize_insn_for_size_p ())
- {
- /* We will eventually emit movups based on insn attributes. */
- emit_insn (gen_rtx_SET (op0, op1));
- return;
- }
-
/* When SSE registers are split into halves, we can avoid
writing to the top half twice. */
if (TARGET_SSE_SPLIT_REGS)
@@ -19066,24 +19021,6 @@ ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
{
rtx t;
- if (TARGET_AVX
- || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
- || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
- || optimize_insn_for_size_p ())
- {
- if (GET_MODE (op0) != V4SFmode)
- {
- orig_op0 = op0;
- op0 = gen_reg_rtx (V4SFmode);
- }
- op1 = gen_lowpart (V4SFmode, op1);
- emit_insn (gen_rtx_SET (op0, op1));
- if (orig_op0)
- emit_move_insn (orig_op0,
- gen_lowpart (GET_MODE (orig_op0), op0));
- return;
- }
-
if (mode != V4SFmode)
t = gen_reg_rtx (V4SFmode);
else
@@ -19104,49 +19041,22 @@ ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
}
else if (MEM_P (op0))
{
- if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
- {
- op0 = gen_lowpart (V16QImode, op0);
- op1 = gen_lowpart (V16QImode, op1);
- /* We will eventually emit movups based on insn attributes. */
- emit_insn (gen_rtx_SET (op0, op1));
- }
- else if (TARGET_SSE2 && mode == V2DFmode)
- {
- if (TARGET_AVX
- || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
- || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
- || optimize_insn_for_size_p ())
- /* We will eventually emit movups based on insn attributes. */
- emit_insn (gen_rtx_SET (op0, op1));
- else
- {
- m = adjust_address (op0, DFmode, 0);
- emit_insn (gen_sse2_storelpd (m, op1));
- m = adjust_address (op0, DFmode, 8);
- emit_insn (gen_sse2_storehpd (m, op1));
- }
+ if (TARGET_SSE2 && mode == V2DFmode)
+ {
+ m = adjust_address (op0, DFmode, 0);
+ emit_insn (gen_sse2_storelpd (m, op1));
+ m = adjust_address (op0, DFmode, 8);
+ emit_insn (gen_sse2_storehpd (m, op1));
}
else
{
if (mode != V4SFmode)
op1 = gen_lowpart (V4SFmode, op1);
- if (TARGET_AVX
- || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
- || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
- || optimize_insn_for_size_p ())
- {
- op0 = gen_lowpart (V4SFmode, op0);
- emit_insn (gen_rtx_SET (op0, op1));
- }
- else
- {
- m = adjust_address (op0, V2SFmode, 0);
- emit_insn (gen_sse_storelps (m, op1));
- m = adjust_address (op0, V2SFmode, 8);
- emit_insn (gen_sse_storehps (m, op1));
- }
+ m = adjust_address (op0, V2SFmode, 0);
+ emit_insn (gen_sse_storelps (m, op1));
+ m = adjust_address (op0, V2SFmode, 8);
+ emit_insn (gen_sse_storehps (m, op1));
}
}
else
--
2.5.5