[RFC PATCH] For TARGET_AVX use *mov<mode>_internal for misaligned loads
- From: Jakub Jelinek <jakub at redhat dot com>
- To: Richard Henderson <rth at redhat dot com>, Uros Bizjak <ubizjak at gmail dot com>, Kirill Yukhin <kirill dot yukhin at gmail dot com>
- Cc: gcc-patches at gcc dot gnu dot org
- Date: Wed, 30 Oct 2013 10:47:13 +0100
- Subject: [RFC PATCH] For TARGET_AVX use *mov<mode>_internal for misaligned loads
- Reply-to: Jakub Jelinek <jakub at redhat dot com>
Hi!
Yesterday I noticed that for AVX, which allows unaligned memory operands in
arithmetic instructions, we still don't combine unaligned loads with those
arithmetic instructions.  So, say, for -O2 -mavx -ftree-vectorize
void
f1 (int *__restrict e, int *__restrict f)
{
int i;
for (i = 0; i < 1024; i++)
e[i] = f[i] * 7;
}

void
f2 (int *__restrict e, int *__restrict f)
{
int i;
for (i = 0; i < 1024; i++)
e[i] = f[i];
}
we have:
vmovdqu (%rsi,%rax), %xmm0
vpmulld %xmm1, %xmm0, %xmm0
vmovups %xmm0, (%rdi,%rax)
in the first loop, i.e. the unaligned load isn't folded into the vpmulld
memory operand.

Apparently all the MODE_VECTOR_INT and MODE_VECTOR_FLOAT
*mov<mode>_internal patterns (and various others) use misaligned_operand
to decide whether they should emit vmovaps or vmovups (etc.), so, as
suggested by Richard on IRC, it isn't necessary to either allow
UNSPEC_LOADU in the memory operands of all the various non-move AVX
instructions for TARGET_AVX, or to add extra patterns to help combine;
instead, this patch just uses *mov<mode>_internal in that case (assuming
an initially misaligned_operand doesn't become !misaligned_operand through
RTL optimizations).  Additionally, the patch avoids gen_lowpart on the
non-MEM lhs of the unaligned loads, which usually makes combine fail, by
doing the load into a temporary pseudo in that case and then doing a
pseudo-to-pseudo move with gen_lowpart on the rhs (which will be merged
soon after into the following instructions).
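With the change, combine should then be able to fold the misaligned load
directly into the arithmetic instruction, so for the f1 loop above the
inner loop would hopefully become something like (hand-written for
illustration, not actual compiler output):
vpmulld (%rsi,%rax), %xmm1, %xmm0
vmovups %xmm0, (%rdi,%rax)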
I'll bootstrap/regtest this on x86_64-linux and i686-linux; unfortunately,
my bootstrap/regtest server isn't AVX capable.
2013-10-30 Jakub Jelinek <jakub@redhat.com>
* config/i386/i386.c (ix86_avx256_split_vector_move_misalign): If
op1 is misaligned_operand, just use *mov<mode>_internal insn
rather than UNSPEC_LOADU load.
(ix86_expand_vector_move_misalign): Likewise (for TARGET_AVX only).
Avoid gen_lowpart on op0 if it isn't MEM.
--- gcc/config/i386/i386.c.jj 2013-10-30 08:15:38.000000000 +0100
+++ gcc/config/i386/i386.c 2013-10-30 10:20:22.684708729 +0100
@@ -16560,6 +16560,12 @@ ix86_avx256_split_vector_move_misalign (
r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
emit_move_insn (op0, r);
}
+ /* Normal *mov<mode>_internal pattern will handle
+ unaligned loads just fine if misaligned_operand
+ is true, and without the UNSPEC it can be combined
+ with arithmetic instructions. */
+ else if (misaligned_operand (op1, GET_MODE (op1)))
+ emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
else
emit_insn (load_unaligned (op0, op1));
}
@@ -16634,7 +16640,7 @@ ix86_avx256_split_vector_move_misalign (
void
ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
{
- rtx op0, op1, m;
+ rtx op0, op1, orig_op0 = NULL_RTX, m;
rtx (*load_unaligned) (rtx, rtx);
rtx (*store_unaligned) (rtx, rtx);
@@ -16647,7 +16653,16 @@ ix86_expand_vector_move_misalign (enum m
{
case MODE_VECTOR_INT:
case MODE_INT:
- op0 = gen_lowpart (V16SImode, op0);
+ if (GET_MODE (op0) != V16SImode)
+ {
+ if (!MEM_P (op0))
+ {
+ orig_op0 = op0;
+ op0 = gen_reg_rtx (V16SImode);
+ }
+ else
+ op0 = gen_lowpart (V16SImode, op0);
+ }
op1 = gen_lowpart (V16SImode, op1);
/* FALLTHRU */
@@ -16676,6 +16691,8 @@ ix86_expand_vector_move_misalign (enum m
emit_insn (store_unaligned (op0, op1));
else
gcc_unreachable ();
+ if (orig_op0)
+ emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
break;
default:
@@ -16692,12 +16709,23 @@ ix86_expand_vector_move_misalign (enum m
{
case MODE_VECTOR_INT:
case MODE_INT:
- op0 = gen_lowpart (V32QImode, op0);
+ if (GET_MODE (op0) != V32QImode)
+ {
+ if (!MEM_P (op0))
+ {
+ orig_op0 = op0;
+ op0 = gen_reg_rtx (V32QImode);
+ }
+ else
+ op0 = gen_lowpart (V32QImode, op0);
+ }
op1 = gen_lowpart (V32QImode, op1);
/* FALLTHRU */
case MODE_VECTOR_FLOAT:
ix86_avx256_split_vector_move_misalign (op0, op1);
+ if (orig_op0)
+ emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
break;
default:
@@ -16709,15 +16737,30 @@ ix86_expand_vector_move_misalign (enum m
if (MEM_P (op1))
{
+ /* Normal *mov<mode>_internal pattern will handle
+ unaligned loads just fine if misaligned_operand
+ is true, and without the UNSPEC it can be combined
+ with arithmetic instructions. */
+ if (TARGET_AVX
+ && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+ || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
+ && misaligned_operand (op1, GET_MODE (op1)))
+ emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
/* ??? If we have typed data, then it would appear that using
movdqu is the only way to get unaligned data loaded with
integer type. */
- if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+ else if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
{
- op0 = gen_lowpart (V16QImode, op0);
+ if (GET_MODE (op0) != V16QImode)
+ {
+ orig_op0 = op0;
+ op0 = gen_reg_rtx (V16QImode);
+ }
op1 = gen_lowpart (V16QImode, op1);
/* We will eventually emit movups based on insn attributes. */
emit_insn (gen_sse2_loaddquv16qi (op0, op1));
+ if (orig_op0)
+ emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
}
else if (TARGET_SSE2 && mode == V2DFmode)
{
@@ -16765,9 +16808,16 @@ ix86_expand_vector_move_misalign (enum m
|| TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
|| optimize_insn_for_size_p ())
{
- op0 = gen_lowpart (V4SFmode, op0);
+ if (GET_MODE (op0) != V4SFmode)
+ {
+ orig_op0 = op0;
+ op0 = gen_reg_rtx (V4SFmode);
+ }
op1 = gen_lowpart (V4SFmode, op1);
emit_insn (gen_sse_loadups (op0, op1));
+ if (orig_op0)
+ emit_move_insn (orig_op0,
+ gen_lowpart (GET_MODE (orig_op0), op0));
return;
}
Jakub