This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.
Re: [PATCH] Simplify ix86_expand_vector_move_misalign
- From: Uros Bizjak <ubizjak at gmail dot com>
- To: "H.J. Lu" <hjl dot tools at gmail dot com>
- Cc: "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>
- Date: Wed, 20 Apr 2016 13:09:53 +0200
- Subject: Re: [PATCH] Simplify ix86_expand_vector_move_misalign
- Authentication-results: sourceware.org; auth=none
- References: <20160419144843 dot GA7801 at intel dot com>
On Tue, Apr 19, 2016 at 4:48 PM, H.J. Lu <hongjiu.lu@intel.com> wrote:
> Since mov<mode>_internal patterns handle both aligned/unaligned load
> and store, we can simplify ix86_avx256_split_vector_move_misalign and
> ix86_expand_vector_move_misalign.
>
> Tested on x86-64. OK for trunk?
>
> H.J.
> ---
> * config/i386/i386.c (ix86_avx256_split_vector_move_misalign):
> Short-cut unaligned load and store cases. Handle all integer
> vector modes.
> (ix86_expand_vector_move_misalign): Short-cut unaligned load
> and store cases. Call ix86_avx256_split_vector_move_misalign
> directly without checking mode class.
LGTM, but it is hard to review interwoven code movements and deletions...
Hopefully OK.
Thanks,
Uros.
> ---
> gcc/config/i386/i386.c | 252 ++++++++++++++++---------------------------------
> 1 file changed, 81 insertions(+), 171 deletions(-)
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 4e48572..e056f68 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -18820,7 +18820,39 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
> rtx (*extract) (rtx, rtx, rtx);
> machine_mode mode;
>
> - switch (GET_MODE (op0))
> + if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
> + || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
> + {
> + emit_insn (gen_rtx_SET (op0, op1));
> + return;
> + }
> +
> + rtx orig_op0 = NULL_RTX;
> + mode = GET_MODE (op0);
> + switch (GET_MODE_CLASS (mode))
> + {
> + case MODE_VECTOR_INT:
> + case MODE_INT:
> + if (mode != V32QImode)
> + {
> + if (!MEM_P (op0))
> + {
> + orig_op0 = op0;
> + op0 = gen_reg_rtx (V32QImode);
> + }
> + else
> + op0 = gen_lowpart (V32QImode, op0);
> + op1 = gen_lowpart (V32QImode, op1);
> + mode = V32QImode;
> + }
> + break;
> + case MODE_VECTOR_FLOAT:
> + break;
> + default:
> + gcc_unreachable ();
> + }
> +
> + switch (mode)
> {
> default:
> gcc_unreachable ();
> @@ -18840,34 +18872,25 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
>
> if (MEM_P (op1))
> {
> - if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD
> - && optimize_insn_for_speed_p ())
> - {
> - rtx r = gen_reg_rtx (mode);
> - m = adjust_address (op1, mode, 0);
> - emit_move_insn (r, m);
> - m = adjust_address (op1, mode, 16);
> - r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
> - emit_move_insn (op0, r);
> - }
> - else
> - emit_insn (gen_rtx_SET (op0, op1));
> + rtx r = gen_reg_rtx (mode);
> + m = adjust_address (op1, mode, 0);
> + emit_move_insn (r, m);
> + m = adjust_address (op1, mode, 16);
> + r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
> + emit_move_insn (op0, r);
> }
> else if (MEM_P (op0))
> {
> - if (TARGET_AVX256_SPLIT_UNALIGNED_STORE
> - && optimize_insn_for_speed_p ())
> - {
> - m = adjust_address (op0, mode, 0);
> - emit_insn (extract (m, op1, const0_rtx));
> - m = adjust_address (op0, mode, 16);
> - emit_insn (extract (m, op1, const1_rtx));
> - }
> - else
> - emit_insn (gen_rtx_SET (op0, op1));
> + m = adjust_address (op0, mode, 0);
> + emit_insn (extract (m, op1, const0_rtx));
> + m = adjust_address (op0, mode, 16);
> + emit_insn (extract (m, op1, const1_rtx));
> }
> else
> gcc_unreachable ();
> +
> + if (orig_op0)
> + emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
> }
>
> /* Implement the movmisalign patterns for SSE. Non-SSE modes go
> @@ -18925,118 +18948,50 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
> void
> ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
> {
> - rtx op0, op1, orig_op0 = NULL_RTX, m;
> + rtx op0, op1, m;
>
> op0 = operands[0];
> op1 = operands[1];
>
> - if (GET_MODE_SIZE (mode) == 64)
> + /* Use unaligned load/store for AVX512 or when optimizing for size. */
> + if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
> {
> - switch (GET_MODE_CLASS (mode))
> - {
> - case MODE_VECTOR_INT:
> - case MODE_INT:
> - if (GET_MODE (op0) != V16SImode)
> - {
> - if (!MEM_P (op0))
> - {
> - orig_op0 = op0;
> - op0 = gen_reg_rtx (V16SImode);
> - }
> - else
> - op0 = gen_lowpart (V16SImode, op0);
> - }
> - op1 = gen_lowpart (V16SImode, op1);
> - /* FALLTHRU */
> -
> - case MODE_VECTOR_FLOAT:
> -
> - emit_insn (gen_rtx_SET (op0, op1));
> - if (orig_op0)
> - emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
> - break;
> -
> - default:
> - gcc_unreachable ();
> - }
> -
> + emit_insn (gen_rtx_SET (op0, op1));
> return;
> }
>
> - if (TARGET_AVX
> - && GET_MODE_SIZE (mode) == 32)
> + if (TARGET_AVX)
> {
> - switch (GET_MODE_CLASS (mode))
> - {
> - case MODE_VECTOR_INT:
> - case MODE_INT:
> - if (GET_MODE (op0) != V32QImode)
> - {
> - if (!MEM_P (op0))
> - {
> - orig_op0 = op0;
> - op0 = gen_reg_rtx (V32QImode);
> - }
> - else
> - op0 = gen_lowpart (V32QImode, op0);
> - }
> - op1 = gen_lowpart (V32QImode, op1);
> - /* FALLTHRU */
> -
> - case MODE_VECTOR_FLOAT:
> - ix86_avx256_split_vector_move_misalign (op0, op1);
> - if (orig_op0)
> - emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
> - break;
> + if (GET_MODE_SIZE (mode) == 32)
> + ix86_avx256_split_vector_move_misalign (op0, op1);
> + else
> + /* Always use 128-bit mov<mode>_internal pattern for AVX. */
> + emit_insn (gen_rtx_SET (op0, op1));
> + return;
> + }
>
> - default:
> - gcc_unreachable ();
> - }
> + if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
> + || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
> + {
> + emit_insn (gen_rtx_SET (op0, op1));
> + return;
> + }
>
> + /* ??? If we have typed data, then it would appear that using
> + movdqu is the only way to get unaligned data loaded with
> + integer type. */
> + if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
> + {
> + emit_insn (gen_rtx_SET (op0, op1));
> return;
> }
>
> if (MEM_P (op1))
> {
> - /* Normal *mov<mode>_internal pattern will handle
> - unaligned loads just fine if misaligned_operand
> - is true, and without the UNSPEC it can be combined
> - with arithmetic instructions. */
> - if (TARGET_AVX
> - && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
> - || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
> - && misaligned_operand (op1, GET_MODE (op1)))
> - emit_insn (gen_rtx_SET (op0, op1));
> - /* ??? If we have typed data, then it would appear that using
> - movdqu is the only way to get unaligned data loaded with
> - integer type. */
> - else if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
> - {
> - if (GET_MODE (op0) != V16QImode)
> - {
> - orig_op0 = op0;
> - op0 = gen_reg_rtx (V16QImode);
> - }
> - op1 = gen_lowpart (V16QImode, op1);
> - /* We will eventually emit movups based on insn attributes. */
> - emit_insn (gen_rtx_SET (op0, op1));
> - if (orig_op0)
> - emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
> - }
> - else if (TARGET_SSE2 && mode == V2DFmode)
> + if (TARGET_SSE2 && mode == V2DFmode)
> {
> rtx zero;
>
> - if (TARGET_AVX
> - || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
> - || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
> - || optimize_insn_for_size_p ())
> - {
> - /* We will eventually emit movups based on insn attributes. */
> - emit_insn (gen_rtx_SET (op0, op1));
> - return;
> - }
> -
> /* When SSE registers are split into halves, we can avoid
> writing to the top half twice. */
> if (TARGET_SSE_SPLIT_REGS)
> @@ -19066,24 +19021,6 @@ ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
> {
> rtx t;
>
> - if (TARGET_AVX
> - || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
> - || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
> - || optimize_insn_for_size_p ())
> - {
> - if (GET_MODE (op0) != V4SFmode)
> - {
> - orig_op0 = op0;
> - op0 = gen_reg_rtx (V4SFmode);
> - }
> - op1 = gen_lowpart (V4SFmode, op1);
> - emit_insn (gen_rtx_SET (op0, op1));
> - if (orig_op0)
> - emit_move_insn (orig_op0,
> - gen_lowpart (GET_MODE (orig_op0), op0));
> - return;
> - }
> -
> if (mode != V4SFmode)
> t = gen_reg_rtx (V4SFmode);
> else
> @@ -19104,49 +19041,22 @@ ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
> }
> else if (MEM_P (op0))
> {
> - if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
> - {
> - op0 = gen_lowpart (V16QImode, op0);
> - op1 = gen_lowpart (V16QImode, op1);
> - /* We will eventually emit movups based on insn attributes. */
> - emit_insn (gen_rtx_SET (op0, op1));
> - }
> - else if (TARGET_SSE2 && mode == V2DFmode)
> - {
> - if (TARGET_AVX
> - || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
> - || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
> - || optimize_insn_for_size_p ())
> - /* We will eventually emit movups based on insn attributes. */
> - emit_insn (gen_rtx_SET (op0, op1));
> - else
> - {
> - m = adjust_address (op0, DFmode, 0);
> - emit_insn (gen_sse2_storelpd (m, op1));
> - m = adjust_address (op0, DFmode, 8);
> - emit_insn (gen_sse2_storehpd (m, op1));
> - }
> + if (TARGET_SSE2 && mode == V2DFmode)
> + {
> + m = adjust_address (op0, DFmode, 0);
> + emit_insn (gen_sse2_storelpd (m, op1));
> + m = adjust_address (op0, DFmode, 8);
> + emit_insn (gen_sse2_storehpd (m, op1));
> }
> else
> {
> if (mode != V4SFmode)
> op1 = gen_lowpart (V4SFmode, op1);
>
> - if (TARGET_AVX
> - || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
> - || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
> - || optimize_insn_for_size_p ())
> - {
> - op0 = gen_lowpart (V4SFmode, op0);
> - emit_insn (gen_rtx_SET (op0, op1));
> - }
> - else
> - {
> - m = adjust_address (op0, V2SFmode, 0);
> - emit_insn (gen_sse_storelps (m, op1));
> - m = adjust_address (op0, V2SFmode, 8);
> - emit_insn (gen_sse_storehps (m, op1));
> - }
> + m = adjust_address (op0, V2SFmode, 0);
> + emit_insn (gen_sse_storelps (m, op1));
> + m = adjust_address (op0, V2SFmode, 8);
> + emit_insn (gen_sse_storehps (m, op1));
> }
> }
> else
> --
> 2.5.5
>
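
[Editorial note, not part of the thread above: a small illustrative sketch of what the simplified expanders boil down to at the source level. The typedef, the helper names copy32/split_load/split_store, and the suggested "gcc -O2 -mavx -S" invocation are assumptions made for this example, not anything taken from the patch; the intrinsics merely mirror the two 16-byte loads/stores plus VEC_CONCAT/vextract that ix86_avx256_split_vector_move_misalign emits when the split tuning flags are in effect.]

#include <immintrin.h>

/* A 32-byte vector type whose alignment is reduced to 1 byte, so that
   accesses through it take the movmisalign<mode> path, i.e. go through
   ix86_expand_vector_move_misalign.  */
typedef float v8sf_u __attribute__ ((vector_size (32), aligned (1)));

void
copy32 (v8sf_u *dst, const v8sf_u *src)
{
  /* With the patch, this expands to a plain SET; mov<mode>_internal
     then picks the unaligned form, or the 256-bit move is split into
     two 128-bit halves as sketched below.  */
  *dst = *src;
}

/* Source-level equivalent of the split load: two 16-byte unaligned
   loads combined with VEC_CONCAT (vinsertf128).  */
__m256
split_load (const float *p)
{
  __m128 lo = _mm_loadu_ps (p);
  __m128 hi = _mm_loadu_ps (p + 4);
  return _mm256_insertf128_ps (_mm256_castps128_ps256 (lo), hi, 1);
}

/* Source-level equivalent of the split store: the low half is stored
   directly, the high half via vextractf128.  */
void
split_store (float *p, __m256 v)
{
  _mm_storeu_ps (p, _mm256_castps256_ps128 (v));
  _mm_storeu_ps (p + 4, _mm256_extractf128_ps (v, 1));
}

Whether copy32 compiles to two unaligned 32-byte moves (a vmovups load and store) or to the split form depends on the -mtune-controlled TARGET_AVX256_SPLIT_UNALIGNED_LOAD/STORE flags, which the new shortcut at the top of ix86_avx256_split_vector_move_misalign checks before splitting.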