This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[patch, fortran] Enable FMA for AVX2 and AVX512F for matmul
- From: Thomas Koenig <tkoenig at netcologne dot de>
- To: "fortran at gcc dot gnu dot org" <fortran at gcc dot gnu dot org>, gcc-patches <gcc-patches at gcc dot gnu dot org>
- Date: Wed, 1 Mar 2017 22:00:08 +0100
- Subject: [patch, fortran] Enable FMA for AVX2 and AVX512F for matmul
- Authentication-results: sourceware.org; auth=none
Hello world,
the attached patch enables FMA for the AVX2 and AVX512F variants of
matmul. This should bring a very nice speedup (although I have
been unable to run benchmarks due to lack of a suitable machine).
Question: Is this still appropriate for the current state of trunk?
Or would it rather be OK for when GCC 8 opens (which might still be
some time in the future)?
2017-03-01 Thomas Koenig <tkoenig@gcc.gnu.org>
PR fortran/78379
* m4/matmul.m4 (matmul_'rtype_code`_avx2): Also generate for
reals. Add fma to target options.
(matmul_'rtype_code`_avx512f): Add fma to target options.
(matmul_'rtype_code`): Call AVX2 and AVX512F only if
FMA is available.
* generated/matmul_c10.c: Regenerated.
* generated/matmul_c16.c: Regenerated.
* generated/matmul_c4.c: Regenerated.
* generated/matmul_c8.c: Regenerated.
* generated/matmul_i1.c: Regenerated.
* generated/matmul_i16.c: Regenerated.
* generated/matmul_i2.c: Regenerated.
* generated/matmul_i4.c: Regenerated.
* generated/matmul_i8.c: Regenerated.
* generated/matmul_r10.c: Regenerated.
* generated/matmul_r16.c: Regenerated.
* generated/matmul_r4.c: Regenerated.
* generated/matmul_r8.c: Regenerated.
Regards
Thomas
Index: m4/matmul.m4
===================================================================
--- m4/matmul.m4 (Revision 245760)
+++ m4/matmul.m4 (Arbeitskopie)
@@ -75,14 +75,6 @@
int blas_limit, blas_call gemm);
export_proto(matmul_'rtype_code`);
-'ifelse(rtype_letter,`r',dnl
-`#if defined(HAVE_AVX) && defined(HAVE_AVX2)
-/* REAL types generate identical code for AVX and AVX2. Only generate
- an AVX2 function if we are dealing with integer. */
-#undef HAVE_AVX2
-#endif')
-`
-
/* Put exhaustive list of possible architectures here here, ORed together. */
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
@@ -101,7 +93,7 @@
`static void
'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
- int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX2 */
@@ -110,7 +102,7 @@
`static void
'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
- int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx512f,fma")));
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX512F */
@@ -138,7 +130,9 @@
{
/* Run down the available processors in order of preference. */
#ifdef HAVE_AVX512F
- if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
+
{
matmul_p = matmul_'rtype_code`_avx512f;
goto tailcall;
@@ -147,7 +141,8 @@
#endif /* HAVE_AVX512F */
#ifdef HAVE_AVX2
- if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_p = matmul_'rtype_code`_avx2;
goto tailcall;