+2011-03-27 H.J. Lu <hongjiu.lu@intel.com>
+
+ * config/i386/i386.c (flag_opts): Add -mavx256-split-unaligned-load
+ and -mavx256-split-unaligned-store.
+ (ix86_option_override_internal): Split 32-byte AVX unaligned
+ load/store by default.
+ (ix86_avx256_split_vector_move_misalign): New.
+ (ix86_expand_vector_move_misalign): Use it.
+
+ * config/i386/i386.opt: Add -mavx256-split-unaligned-load and
+ -mavx256-split-unaligned-store.
+
+ * config/i386/sse.md (*avx_mov<mode>_internal): Verify unaligned
+ 256bit load/store. Generate unaligned store on misaligned memory
+ operand.
+ (*avx_movu<ssemodesuffix><avxmodesuffix>): Verify unaligned
+ 256bit load/store.
+ (*avx_movdqu<avxmodesuffix>): Likewise.
+
+ * doc/invoke.texi: Document -mavx256-split-unaligned-load and
+ -mavx256-split-unaligned-store.
+
2011-03-27 Richard Sandiford <rdsandiford@googlemail.com>
PR target/38598
{ "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
{ "-m8bit-idiv", MASK_USE_8BIT_IDIV },
{ "-mvzeroupper", MASK_VZEROUPPER },
+ { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
+ { "-mavx256-split-unaligned-stroe", MASK_AVX256_SPLIT_UNALIGNED_STORE},
};
const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
if (TARGET_AVX)
{
/* When not optimizing for size, enable vzeroupper optimization for
- TARGET_AVX with -fexpensive-optimizations. */
- if (!optimize_size
- && flag_expensive_optimizations
- && !(target_flags_explicit & MASK_VZEROUPPER))
- target_flags |= MASK_VZEROUPPER;
+ TARGET_AVX with -fexpensive-optimizations and split 32-byte
+ AVX unaligned load/store. */
+ if (!optimize_size)
+ {
+ if (flag_expensive_optimizations
+ && !(target_flags_explicit & MASK_VZEROUPPER))
+ target_flags |= MASK_VZEROUPPER;
+ if (!(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
+ target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
+ if (!(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
+ target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
+ }
}
else
{
emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
}
+/* Split 32-byte AVX unaligned load and store if needed. */
+
+static void
+ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
+{
+ rtx m;
+ rtx (*extract) (rtx, rtx, rtx);
+ rtx (*move_unaligned) (rtx, rtx);
+ enum machine_mode mode;
+
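+ /* Select the 16-byte half mode and the matching extract and
+    unaligned-move generators for the 256-bit mode being moved.  */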
+ switch (GET_MODE (op0))
+ {
+ default:
+ gcc_unreachable ();
+ case V32QImode:
+ extract = gen_avx_vextractf128v32qi;
+ move_unaligned = gen_avx_movdqu256;
+ mode = V16QImode;
+ break;
+ case V8SFmode:
+ extract = gen_avx_vextractf128v8sf;
+ move_unaligned = gen_avx_movups256;
+ mode = V4SFmode;
+ break;
+ case V4DFmode:
+ extract = gen_avx_vextractf128v4df;
+ move_unaligned = gen_avx_movupd256;
+ mode = V2DFmode;
+ break;
+ }
+
+ if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
+ {
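+ /* Load the lower 128 bits of OP1 into a register, then build the
+    256-bit value by concatenating it with the upper 128 bits; the
+    vec_concat is emitted as a vinsertf128 from memory.  */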
+ rtx r = gen_reg_rtx (mode);
+ m = adjust_address (op1, mode, 0);
+ emit_move_insn (r, m);
+ m = adjust_address (op1, mode, 16);
+ r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
+ emit_move_insn (op0, r);
+ }
+ else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
+ {
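+ /* Extract the lower and upper 128-bit halves of OP1 and store
+    each half into memory.  */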
+ m = adjust_address (op0, mode, 0);
+ emit_insn (extract (m, op1, const0_rtx));
+ m = adjust_address (op0, mode, 16);
+ emit_insn (extract (m, op1, const1_rtx));
+ }
+ else
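+ /* Splitting is disabled for this direction; emit a single 256-bit
+    unaligned move.  */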
+ emit_insn (move_unaligned (op0, op1));
+}
+
/* Implement the movmisalign patterns for SSE. Non-SSE modes go
straight to ix86_expand_vector_move. */
/* Code generation for scalar reg-reg moves of single and double precision data:
case 32:
op0 = gen_lowpart (V32QImode, op0);
op1 = gen_lowpart (V32QImode, op1);
- emit_insn (gen_avx_movdqu256 (op0, op1));
+ ix86_avx256_split_vector_move_misalign (op0, op1);
break;
default:
gcc_unreachable ();
emit_insn (gen_avx_movups (op0, op1));
break;
case V8SFmode:
- emit_insn (gen_avx_movups256 (op0, op1));
+ ix86_avx256_split_vector_move_misalign (op0, op1);
break;
case V2DFmode:
if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
emit_insn (gen_avx_movupd (op0, op1));
break;
case V4DFmode:
- emit_insn (gen_avx_movupd256 (op0, op1));
+ ix86_avx256_split_vector_move_misalign (op0, op1);
break;
default:
gcc_unreachable ();
m8bit-idiv
Target Report Mask(USE_8BIT_IDIV) Save
Expand 32bit/64bit integer divide into 8bit unsigned integer divide with run-time check
+
+mavx256-split-unaligned-load
+Target Report Mask(AVX256_SPLIT_UNALIGNED_LOAD) Save
+Split 32-byte AVX unaligned load
+
+mavx256-split-unaligned-store
+Target Report Mask(AVX256_SPLIT_UNALIGNED_STORE) Save
+Split 32-byte AVX unaligned store
return standard_sse_constant_opcode (insn, operands[1]);
case 1:
case 2:
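+ /* 256-bit moves with misaligned operands that should have been
+    split in ix86_avx256_split_vector_move_misalign must not reach
+    this pattern.  */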
+ if (GET_MODE_ALIGNMENT (<MODE>mode) == 256
+ && ((TARGET_AVX256_SPLIT_UNALIGNED_STORE
+ && misaligned_operand (operands[0], <MODE>mode))
+ || (TARGET_AVX256_SPLIT_UNALIGNED_LOAD
+ && misaligned_operand (operands[1], <MODE>mode))))
+ gcc_unreachable ();
switch (get_attr_mode (insn))
{
case MODE_V8SF:
case MODE_V4SF:
- return "vmovaps\t{%1, %0|%0, %1}";
+ if (misaligned_operand (operands[0], <MODE>mode)
+ || misaligned_operand (operands[1], <MODE>mode))
+ return "vmovups\t{%1, %0|%0, %1}";
+ else
+ return "vmovaps\t{%1, %0|%0, %1}";
case MODE_V4DF:
case MODE_V2DF:
- if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
+ if (misaligned_operand (operands[0], <MODE>mode)
+ || misaligned_operand (operands[1], <MODE>mode))
+ return "vmovupd\t{%1, %0|%0, %1}";
+ else if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
return "vmovaps\t{%1, %0|%0, %1}";
else
return "vmovapd\t{%1, %0|%0, %1}";
default:
- if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
+ if (misaligned_operand (operands[0], <MODE>mode)
+ || misaligned_operand (operands[1], <MODE>mode))
+ return "vmovdqu\t{%1, %0|%0, %1}";
+ else if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
return "vmovaps\t{%1, %0|%0, %1}";
else
return "vmovdqa\t{%1, %0|%0, %1}";
UNSPEC_MOVU))]
"AVX_VEC_FLOAT_MODE_P (<MODE>mode)
&& !(MEM_P (operands[0]) && MEM_P (operands[1]))"
- "vmovu<ssemodesuffix>\t{%1, %0|%0, %1}"
+{
+ if (GET_MODE_ALIGNMENT (<MODE>mode) == 256
+ && ((TARGET_AVX256_SPLIT_UNALIGNED_STORE
+ && misaligned_operand (operands[0], <MODE>mode))
+ || (TARGET_AVX256_SPLIT_UNALIGNED_LOAD
+ && misaligned_operand (operands[1], <MODE>mode))))
+ gcc_unreachable ();
+ return "vmovu<ssemodesuffix>\t{%1, %0|%0, %1}";
+}
[(set_attr "type" "ssemov")
(set_attr "movu" "1")
(set_attr "prefix" "vex")
[(match_operand:AVXMODEQI 1 "nonimmediate_operand" "xm,x")]
UNSPEC_MOVU))]
"TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
- "vmovdqu\t{%1, %0|%0, %1}"
+{
+ if (GET_MODE_ALIGNMENT (<MODE>mode) == 256
+ && ((TARGET_AVX256_SPLIT_UNALIGNED_STORE
+ && misaligned_operand (operands[0], <MODE>mode))
+ || (TARGET_AVX256_SPLIT_UNALIGNED_LOAD
+ && misaligned_operand (operands[1], <MODE>mode))))
+ gcc_unreachable ();
+ return "vmovdqu\t{%1, %0|%0, %1}";
+}
[(set_attr "type" "ssemov")
(set_attr "movu" "1")
(set_attr "prefix" "vex")
-momit-leaf-frame-pointer -mno-red-zone -mno-tls-direct-seg-refs @gol
-mcmodel=@var{code-model} -mabi=@var{name} @gol
-m32 -m64 -mlarge-data-threshold=@var{num} @gol
--msse2avx -mfentry -m8bit-idiv}
+-msse2avx -mfentry -m8bit-idiv @gol
+-mavx256-split-unaligned-load -mavx256-split-unaligned-store}
@emph{i386 and x86-64 Windows Options}
@gccoptlist{-mconsole -mcygwin -mno-cygwin -mdll @gol
to 255, 8bit unsigned integer divide will be used instead of
32bit/64bit integer divide.
+@item -mavx256-split-unaligned-load
+@itemx -mavx256-split-unaligned-store
+@opindex avx256-split-unaligned-load
+@opindex avx256-split-unaligned-store
+Split 32-byte AVX unaligned load and store into two 16-byte halves.
+These splits are enabled by default when AVX is enabled, unless
+optimizing for size.
+
@end table
These @samp{-m} switches are supported in addition to the above
+2011-03-27 H.J. Lu <hongjiu.lu@intel.com>
+
+ * gcc.target/i386/avx256-unaligned-load-1.c: New.
+ * gcc.target/i386/avx256-unaligned-load-2.c: Likewise.
+ * gcc.target/i386/avx256-unaligned-load-3.c: Likewise.
+ * gcc.target/i386/avx256-unaligned-load-4.c: Likewise.
+ * gcc.target/i386/avx256-unaligned-load-5.c: Likewise.
+ * gcc.target/i386/avx256-unaligned-load-6.c: Likewise.
+ * gcc.target/i386/avx256-unaligned-load-7.c: Likewise.
+ * gcc.target/i386/avx256-unaligned-store-1.c: Likewise.
+ * gcc.target/i386/avx256-unaligned-store-2.c: Likewise.
+ * gcc.target/i386/avx256-unaligned-store-3.c: Likewise.
+ * gcc.target/i386/avx256-unaligned-store-4.c: Likewise.
+ * gcc.target/i386/avx256-unaligned-store-5.c: Likewise.
+ * gcc.target/i386/avx256-unaligned-store-6.c: Likewise.
+ * gcc.target/i386/avx256-unaligned-store-7.c: Likewise.
+
2011-03-27 Thomas Koenig <tkoenig@gcc.gnu.org>
PR fortran/47065
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-load" } */
+
+#define N 1024
+
+float a[N], b[N+3], c[N];
+
+void
+avx_test (void)
+{
+ int i;
+
+ for (i = 0; i < N; i++)
+ c[i] = a[i] * b[i+3];
+}
+
+/* { dg-final { scan-assembler-not "\\*avx_movups256/1" } } */
+/* { dg-final { scan-assembler "\\*avx_movups/1" } } */
+/* { dg-final { scan-assembler "vinsertf128" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-load" } */
+
+#define N 1024
+
+char **ep;
+char **fp;
+
+void
+avx_test (void)
+{
+ int i;
+ char **ap;
+ char **bp;
+ char **cp;
+
+ ap = ep;
+ bp = fp;
+ for (i = 128; i >= 0; i--)
+ {
+ *ap++ = *cp++;
+ *bp++ = 0;
+ }
+}
+
+/* { dg-final { scan-assembler-not "\\*avx_movdqu256/1" } } */
+/* { dg-final { scan-assembler "\\*avx_movdqu/1" } } */
+/* { dg-final { scan-assembler "vinsertf128" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-load" } */
+
+#define N 1024
+
+double a[N], b[N+3], c[N];
+
+void
+avx_test (void)
+{
+ int i;
+
+ for (i = 0; i < N; i++)
+ c[i] = a[i] * b[i+3];
+}
+
+/* { dg-final { scan-assembler-not "\\*avx_movupd256/1" } } */
+/* { dg-final { scan-assembler "\\*avx_movupd/1" } } */
+/* { dg-final { scan-assembler "vinsertf128" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -dp -mavx -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store" } */
+
+#define N 1024
+
+float a[N], b[N+3];
+
+void
+avx_test (void)
+{
+ int i;
+
+ for (i = 0; i < N; i++)
+ b[i] = a[i+3] * 2;
+}
+
+/* { dg-final { scan-assembler "\\*avx_movups256/1" } } */
+/* { dg-final { scan-assembler-not "\\*avx_movups/1" } } */
+/* { dg-final { scan-assembler-not "vinsertf128" } } */
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-load" } */
+
+#include "avx-check.h"
+
+#define N 8
+
+float a[N+3] = { -1, -1, -1, 24.43, 68.346, 43.35,
+ 546.46, 46.79, 82.78, 82.7, 9.4 };
+float b[N];
+float c[N];
+
+void
+foo (void)
+{
+ int i;
+
+ for (i = 0; i < N; i++)
+ b[i] = a[i+3] * 2;
+}
+
+__attribute__ ((noinline))
+float
+bar (float x)
+{
+ return x * 2;
+}
+
+void
+avx_test (void)
+{
+ int i;
+
+ foo ();
+
+ for (i = 0; i < N; i++)
+ c[i] = bar (a[i+3]);
+
+ for (i = 0; i < N; i++)
+ if (b[i] != c[i])
+ abort ();
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-load" } */
+
+#include "avx-check.h"
+
+#define N 4
+
+double a[N+3] = { -1, -1, -1, 24.43, 68.346, 43.35, 546.46 };
+double b[N];
+double c[N];
+
+void
+foo (void)
+{
+ int i;
+
+ for (i = 0; i < N; i++)
+ b[i] = a[i+3] * 2;
+}
+
+__attribute__ ((noinline))
+double
+bar (double x)
+{
+ return x * 2;
+}
+
+void
+avx_test (void)
+{
+ int i;
+
+ foo ();
+
+ for (i = 0; i < N; i++)
+ c[i] = bar (a[i+3]);
+
+ for (i = 0; i < N; i++)
+ if (b[i] != c[i])
+ abort ();
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-load" } */
+
+#include "avx-check.h"
+
+#define N 128
+
+char **ep;
+char **fp;
+char **mp;
+char **lp;
+
+__attribute__ ((noinline))
+void
+foo (void)
+{
+ mp = (char **) malloc ((N + 1) * sizeof (char *));
+ lp = (char **) malloc ((N + 1) * sizeof (char *));
+ ep = (char **) malloc ((N + 1) * sizeof (char *));
+ fp = (char **) malloc ((N + 1) * sizeof (char *));
+}
+
+void
+avx_test (void)
+{
+ int i;
+ char **ap, **bp, **cp, **dp;
+ char *str = "STR";
+
+ foo ();
+
+ cp = mp;
+ dp = lp;
+
+ for (i = N; i >= 0; i--)
+ {
+ *cp++ = str;
+ *dp++ = str;
+ }
+
+ ap = ep;
+ bp = fp;
+ cp = mp;
+ dp = lp;
+
+ for (i = N; i >= 0; i--)
+ {
+ *ap++ = *cp++;
+ *bp++ = *dp++;
+ }
+
+ for (i = N; i >= 0; i--)
+ {
+ if (strcmp (*--ap, "STR") != 0)
+ abort ();
+ if (strcmp (*--bp, "STR") != 0)
+ abort ();
+ }
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-store" } */
+
+#define N 1024
+
+float a[N], b[N+3], c[N], d[N];
+
+void
+avx_test (void)
+{
+ int i;
+
+ for (i = 0; i < N; i++)
+ b[i+3] = a[i] * 10.0;
+
+ for (i = 0; i < N; i++)
+ d[i] = c[i] * 20.0;
+}
+
+/* { dg-final { scan-assembler-not "\\*avx_movups256/2" } } */
+/* { dg-final { scan-assembler "movups.*\\*avx_movv4sf_internal/3" } } */
+/* { dg-final { scan-assembler "vextractf128" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-store" } */
+
+#define N 1024
+
+char **ep;
+char **fp;
+
+void
+avx_test (void)
+{
+ int i;
+ char **ap;
+ char **bp;
+ char **cp;
+
+ ap = ep;
+ bp = fp;
+ for (i = 128; i >= 0; i--)
+ {
+ *ap++ = *cp++;
+ *bp++ = 0;
+ }
+}
+
+/* { dg-final { scan-assembler-not "\\*avx_movdqu256/2" } } */
+/* { dg-final { scan-assembler "movdqu.*\\*avx_movv16qi_internal/3" } } */
+/* { dg-final { scan-assembler "vextractf128" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-store" } */
+
+#define N 1024
+
+double a[N], b[N+3], c[N], d[N];
+
+void
+avx_test (void)
+{
+ int i;
+
+ for (i = 0; i < N; i++)
+ b[i+3] = a[i] * 10.0;
+
+ for (i = 0; i < N; i++)
+ d[i] = c[i] * 20.0;
+}
+
+/* { dg-final { scan-assembler-not "\\*avx_movupd256/2" } } */
+/* { dg-final { scan-assembler "movupd.*\\*avx_movv2df_internal/3" } } */
+/* { dg-final { scan-assembler "vextractf128" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -dp -mavx -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store" } */
+
+#define N 1024
+
+float a[N], b[N+3], c[N];
+
+void
+avx_test (void)
+{
+ int i;
+
+ for (i = 0; i < N; i++)
+ b[i+3] = a[i] * c[i];
+}
+
+/* { dg-final { scan-assembler "\\*avx_movups256/2" } } */
+/* { dg-final { scan-assembler-not "\\*avx_movups/2" } } */
+/* { dg-final { scan-assembler-not "\\*avx_movv4sf_internal/3" } } */
+/* { dg-final { scan-assembler-not "vextractf128" } } */
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-store" } */
+
+#include "avx-check.h"
+
+#define N 8
+
+float a[N] = { 24.43, 68.346, 43.35, 546.46, 46.79, 82.78, 82.7, 9.4 };
+float b[N+3];
+float c[N+3];
+
+void
+foo (void)
+{
+ int i;
+
+ for (i = 0; i < N; i++)
+ b[i+3] = a[i] * 2;
+}
+
+__attribute__ ((noinline))
+float
+bar (float x)
+{
+ return x * 2;
+}
+
+void
+avx_test (void)
+{
+ int i;
+
+ foo ();
+
+ for (i = 0; i < N; i++)
+ c[i+3] = bar (a[i]);
+
+ for (i = 0; i < N; i++)
+ if (b[i+3] != c[i+3])
+ abort ();
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-store" } */
+
+#include "avx-check.h"
+
+#define N 4
+
+double a[N] = { 24.43, 68.346, 43.35, 546.46 };
+double b[N+3];
+double c[N+3];
+
+void
+foo (void)
+{
+ int i;
+
+ for (i = 0; i < N; i++)
+ b[i+3] = a[i] * 2;
+}
+
+__attribute__ ((noinline))
+double
+bar (double x)
+{
+ return x * 2;
+}
+
+void
+avx_test (void)
+{
+ int i;
+
+ foo ();
+
+ for (i = 0; i < N; i++)
+ c[i+3] = bar (a[i]);
+
+ for (i = 0; i < N; i++)
+ if (b[i+3] != c[i+3])
+ abort ();
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-store" } */
+
+#include "avx-check.h"
+
+#define N 128
+
+char **ep;
+char **fp;
+
+__attribute__ ((noinline))
+void
+foo (void)
+{
+ ep = (char **) malloc ((N + 1) * sizeof (char *));
+ fp = (char **) malloc ((N + 1) * sizeof (char *));
+}
+
+void
+avx_test (void)
+{
+ int i;
+ char **ap, **bp;
+ char *str = "STR";
+
+ foo ();
+
+ ap = ep;
+ bp = fp;
+
+ for (i = N; i >= 0; i--)
+ {
+ *ap++ = str;
+ *bp++ = str;
+ }
+
+ for (i = N; i >= 0; i--)
+ {
+ if (strcmp (*--ap, "STR") != 0)
+ abort ();
+ if (strcmp (*--bp, "STR") != 0)
+ abort ();
+ }
+}