This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [patch, libfortran] Add AVX-specific matmul
Well, here is a newer version of the patch.
I wrote a few configure tests to check for AVX.
This version has the advantage that, if anybody
uses 32-bit programs with AVX, they would also benefit.
Jakub, would you be OK with that patch?
I do not yet want to commit this because it needs more
testing on different platforms to see if it actually
performs better.
Regarding putting the blocked part into something separate:
Quite doable, but I would rather like to do this in a follow-up
patch, if we decide to do it.
Regards
Thomas
2016-11-17 Thomas Koenig <tkoenig@gcc.gnu.org>
PR fortran/78379
* acinclude.m4 (LIBGFOR_CHECK_AVX): New test.
(LIBGFOR_CHECK_AVX2): New test.
(LIBGFOR_CHECK_AVX512F): New test.
* configure.ac: Call LIBGFOR_CHECK_AVX, LIBGFOR_CHECK_AVX2
and LIBGFOR_CHECK_AVX512F.
* config.h.in: Regenerated.
* configure: Regenerated.
* m4/matmul.m4: For AVX, AVX2 and AVX_512F, make the work function
static with target_clones for AVX, AVX2, AVX512F and default, and
create a wrapper function to call it.
* generated/matmul_c10.c: Regenerated.
* generated/matmul_c16.c: Regenerated.
* generated/matmul_c4.c: Regenerated.
* generated/matmul_c8.c: Regenerated.
* generated/matmul_i1.c: Regenerated.
* generated/matmul_i16.c: Regenerated.
* generated/matmul_i2.c: Regenerated.
* generated/matmul_i4.c: Regenerated.
* generated/matmul_i8.c: Regenerated.
* generated/matmul_r10.c: Regenerated.
* generated/matmul_r16.c: Regenerated.
* generated/matmul_r4.c: Regenerated.
* generated/matmul_r8.c: Regenerated.
Index: acinclude.m4
===================================================================
--- acinclude.m4 (Revision 242477)
+++ acinclude.m4 (Arbeitskopie)
@@ -393,3 +393,54 @@ AC_DEFUN([LIBGFOR_CHECK_STRERROR_R], [
[Define if strerror_r takes two arguments and is available in <string.h>.]),)
CFLAGS="$ac_save_CFLAGS"
])
+
+dnl Check for AVX
+
+AC_DEFUN([LIBGFOR_CHECK_AVX], [
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mavx"
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+ void _mm256_zeroall (void)
+ {
+ __builtin_ia32_vzeroall ();
+ }]], [[]])],
+ AC_DEFINE(HAVE_AVX, 1,
+ [Define if AVX instructions can be compiled.]),
+ [])
+ CFLAGS="$ac_save_CFLAGS"
+])
+
+dnl Check for AVX2
+
+AC_DEFUN([LIBGFOR_CHECK_AVX2], [
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mavx2"
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+ typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+ __v4di
+ mm256_is32_andnotsi256 (__v4di __X, __v4di __Y)
+ {
+ return __builtin_ia32_andnotsi256 (__X, __Y);
+ }]], [[]])],
+ AC_DEFINE(HAVE_AVX2, 1,
+ [Define if AVX2 instructions can be compiled.]),
+ [])
+ CFLAGS="$ac_save_CFLAGS"
+])
+
+dnl Check for AVX512f
+
+AC_DEFUN([LIBGFOR_CHECK_AVX512F], [
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mavx512f"
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+ typedef double __m512d __attribute__ ((__vector_size__ (64)));
+ __m512d _mm512_add (__m512d a)
+ {
+ return __builtin_ia32_addpd512_mask (a, a, a, 1, 4);
+ }]], [[]])],
+ AC_DEFINE(HAVE_AVX512F, 1,
+ [Define if AVX512f instructions can be compiled.]),
+ [])
+ CFLAGS="$ac_save_CFLAGS"
+])
Index: config.h.in
===================================================================
--- config.h.in (Revision 242477)
+++ config.h.in (Arbeitskopie)
@@ -78,6 +78,15 @@
/* Define to 1 if the target supports __attribute__((visibility(...))). */
#undef HAVE_ATTRIBUTE_VISIBILITY
+/* Define if AVX instructions can be compiled. */
+#undef HAVE_AVX
+
+/* Define if AVX2 instructions can be compiled. */
+#undef HAVE_AVX2
+
+/* Define if AVX512f instructions can be compiled. */
+#undef HAVE_AVX512F
+
/* Define to 1 if you have the `cabs' function. */
#undef HAVE_CABS
Index: configure
===================================================================
--- configure (Revision 242477)
+++ configure (Arbeitskopie)
@@ -26174,6 +26174,93 @@ $as_echo "#define HAVE_CRLF 1" >>confdefs.h
fi
+# Check whether we support AVX extensions
+
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mavx"
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+ void _mm256_zeroall (void)
+ {
+ __builtin_ia32_vzeroall ();
+ }
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+$as_echo "#define HAVE_AVX 1" >>confdefs.h
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ CFLAGS="$ac_save_CFLAGS"
+
+
+# Check whether we support AVX2 extensions
+
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mavx2"
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+ typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+ __v4di
+ mm256_is32_andnotsi256 (__v4di __X, __v4di __Y)
+ {
+ return __builtin_ia32_andnotsi256 (__X, __Y);
+ }
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+$as_echo "#define HAVE_AVX2 1" >>confdefs.h
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ CFLAGS="$ac_save_CFLAGS"
+
+
+# Check whether we support AVX512f extensions
+
+ ac_save_CFLAGS="$CFLAGS"
+ CFLAGS="-O2 -mavx512f"
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+ typedef double __m512d __attribute__ ((__vector_size__ (64)));
+ __m512d _mm512_add (__m512d a)
+ {
+ return __builtin_ia32_addpd512_mask (a, a, a, 1, 4);
+ }
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+$as_echo "#define HAVE_AVX512F 1" >>confdefs.h
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ CFLAGS="$ac_save_CFLAGS"
+
+
cat >confcache <<\_ACEOF
# This file is a shell script that caches the results of configure
# tests run on this system so they can be shared between configure
Index: configure.ac
===================================================================
--- configure.ac (Revision 242477)
+++ configure.ac (Arbeitskopie)
@@ -609,6 +609,15 @@ LIBGFOR_CHECK_UNLINK_OPEN_FILE
# Check whether line terminator is LF or CRLF
LIBGFOR_CHECK_CRLF
+# Check whether we support AVX extensions
+LIBGFOR_CHECK_AVX
+
+# Check whether we support AVX2 extensions
+LIBGFOR_CHECK_AVX2
+
+# Check whether we support AVX512f extensions
+LIBGFOR_CHECK_AVX512F
+
AC_CACHE_SAVE
if test ${multilib} = yes; then
Index: generated/matmul_c10.c
===================================================================
--- generated/matmul_c10.c (Revision 242477)
+++ generated/matmul_c10.c (Arbeitskopie)
@@ -75,11 +75,47 @@ extern void matmul_c10 (gfc_array_c10 * const rest
int blas_limit, blas_call gemm);
export_proto(matmul_c10);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static aux_matmul - function.
+ The user-callable function will then automagically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_c10 (gfc_array_c10 * const restrict retarray,
+ gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_c10 (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_c10 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c10 (gfc_array_c10 * const restrict retarray,
+ gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+matmul_c10 (gfc_array_c10 * const restrict retarray,
+ gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_COMPLEX_10 * restrict abase;
const GFC_COMPLEX_10 * restrict bbase;
GFC_COMPLEX_10 * restrict dest;
Index: generated/matmul_c16.c
===================================================================
--- generated/matmul_c16.c (Revision 242477)
+++ generated/matmul_c16.c (Arbeitskopie)
@@ -75,11 +75,47 @@ extern void matmul_c16 (gfc_array_c16 * const rest
int blas_limit, blas_call gemm);
export_proto(matmul_c16);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static aux_matmul - function.
+ The user-callable function will then automagically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_c16 (gfc_array_c16 * const restrict retarray,
+ gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_c16 (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_c16 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c16 (gfc_array_c16 * const restrict retarray,
+ gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+matmul_c16 (gfc_array_c16 * const restrict retarray,
+ gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_COMPLEX_16 * restrict abase;
const GFC_COMPLEX_16 * restrict bbase;
GFC_COMPLEX_16 * restrict dest;
Index: generated/matmul_c4.c
===================================================================
--- generated/matmul_c4.c (Revision 242477)
+++ generated/matmul_c4.c (Arbeitskopie)
@@ -75,11 +75,47 @@ extern void matmul_c4 (gfc_array_c4 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_c4);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static aux_matmul - function.
+ The user-callable function will then automagically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_c4 (gfc_array_c4 * const restrict retarray,
+ gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_c4 (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_c4 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c4 (gfc_array_c4 * const restrict retarray,
+ gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+matmul_c4 (gfc_array_c4 * const restrict retarray,
+ gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_COMPLEX_4 * restrict abase;
const GFC_COMPLEX_4 * restrict bbase;
GFC_COMPLEX_4 * restrict dest;
Index: generated/matmul_c8.c
===================================================================
--- generated/matmul_c8.c (Revision 242477)
+++ generated/matmul_c8.c (Arbeitskopie)
@@ -75,11 +75,47 @@ extern void matmul_c8 (gfc_array_c8 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_c8);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static aux_matmul - function.
+ The user-callable function will then automagically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_c8 (gfc_array_c8 * const restrict retarray,
+ gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_c8 (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_c8 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c8 (gfc_array_c8 * const restrict retarray,
+ gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+matmul_c8 (gfc_array_c8 * const restrict retarray,
+ gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_COMPLEX_8 * restrict abase;
const GFC_COMPLEX_8 * restrict bbase;
GFC_COMPLEX_8 * restrict dest;
Index: generated/matmul_i1.c
===================================================================
--- generated/matmul_i1.c (Revision 242477)
+++ generated/matmul_i1.c (Arbeitskopie)
@@ -75,11 +75,47 @@ extern void matmul_i1 (gfc_array_i1 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_i1);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static aux_matmul - function.
+ The user-callable function will then automagically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_i1 (gfc_array_i1 * const restrict retarray,
+ gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_i1 (gfc_array_i1 * const restrict retarray,
gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_i1 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i1 (gfc_array_i1 * const restrict retarray,
+ gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+matmul_i1 (gfc_array_i1 * const restrict retarray,
+ gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_INTEGER_1 * restrict abase;
const GFC_INTEGER_1 * restrict bbase;
GFC_INTEGER_1 * restrict dest;
Index: generated/matmul_i16.c
===================================================================
--- generated/matmul_i16.c (Revision 242477)
+++ generated/matmul_i16.c (Arbeitskopie)
@@ -75,11 +75,47 @@ extern void matmul_i16 (gfc_array_i16 * const rest
int blas_limit, blas_call gemm);
export_proto(matmul_i16);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static aux_matmul - function.
+ The user-callable function will then automagically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_i16 (gfc_array_i16 * const restrict retarray,
+ gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_i16 (gfc_array_i16 * const restrict retarray,
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_i16 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i16 (gfc_array_i16 * const restrict retarray,
+ gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+matmul_i16 (gfc_array_i16 * const restrict retarray,
+ gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_INTEGER_16 * restrict abase;
const GFC_INTEGER_16 * restrict bbase;
GFC_INTEGER_16 * restrict dest;
Index: generated/matmul_i2.c
===================================================================
--- generated/matmul_i2.c (Revision 242477)
+++ generated/matmul_i2.c (Arbeitskopie)
@@ -75,11 +75,47 @@ extern void matmul_i2 (gfc_array_i2 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_i2);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static aux_matmul - function.
+ The user-callable function will then automagically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_i2 (gfc_array_i2 * const restrict retarray,
+ gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_i2 (gfc_array_i2 * const restrict retarray,
gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_i2 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i2 (gfc_array_i2 * const restrict retarray,
+ gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+matmul_i2 (gfc_array_i2 * const restrict retarray,
+ gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_INTEGER_2 * restrict abase;
const GFC_INTEGER_2 * restrict bbase;
GFC_INTEGER_2 * restrict dest;
Index: generated/matmul_i4.c
===================================================================
--- generated/matmul_i4.c (Revision 242477)
+++ generated/matmul_i4.c (Arbeitskopie)
@@ -75,11 +75,47 @@ extern void matmul_i4 (gfc_array_i4 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_i4);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static aux_matmul - function.
+ The user-callable function will then automagically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_i4 (gfc_array_i4 * const restrict retarray,
+ gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_i4 (gfc_array_i4 * const restrict retarray,
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_i4 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i4 (gfc_array_i4 * const restrict retarray,
+ gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+matmul_i4 (gfc_array_i4 * const restrict retarray,
+ gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_INTEGER_4 * restrict abase;
const GFC_INTEGER_4 * restrict bbase;
GFC_INTEGER_4 * restrict dest;
Index: generated/matmul_i8.c
===================================================================
--- generated/matmul_i8.c (Revision 242477)
+++ generated/matmul_i8.c (Arbeitskopie)
@@ -75,11 +75,47 @@ extern void matmul_i8 (gfc_array_i8 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_i8);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static aux_matmul - function.
+ The user-callable function will then automagically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_i8 (gfc_array_i8 * const restrict retarray,
+ gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_i8 (gfc_array_i8 * const restrict retarray,
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_i8 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i8 (gfc_array_i8 * const restrict retarray,
+ gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+matmul_i8 (gfc_array_i8 * const restrict retarray,
+ gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_INTEGER_8 * restrict abase;
const GFC_INTEGER_8 * restrict bbase;
GFC_INTEGER_8 * restrict dest;
Index: generated/matmul_r10.c
===================================================================
--- generated/matmul_r10.c (Revision 242477)
+++ generated/matmul_r10.c (Arbeitskopie)
@@ -75,11 +75,47 @@ extern void matmul_r10 (gfc_array_r10 * const rest
int blas_limit, blas_call gemm);
export_proto(matmul_r10);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static aux_matmul - function.
+ The user-callable function will then automagically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_r10 (gfc_array_r10 * const restrict retarray,
+ gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_r10 (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_r10 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r10 (gfc_array_r10 * const restrict retarray,
+ gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+matmul_r10 (gfc_array_r10 * const restrict retarray,
+ gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_REAL_10 * restrict abase;
const GFC_REAL_10 * restrict bbase;
GFC_REAL_10 * restrict dest;
Index: generated/matmul_r16.c
===================================================================
--- generated/matmul_r16.c (Revision 242477)
+++ generated/matmul_r16.c (Arbeitskopie)
@@ -75,11 +75,47 @@ extern void matmul_r16 (gfc_array_r16 * const rest
int blas_limit, blas_call gemm);
export_proto(matmul_r16);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static aux_matmul - function.
+ The user-callable function will then automagically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_r16 (gfc_array_r16 * const restrict retarray,
+ gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_r16 (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_r16 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r16 (gfc_array_r16 * const restrict retarray,
+ gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+matmul_r16 (gfc_array_r16 * const restrict retarray,
+ gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_REAL_16 * restrict abase;
const GFC_REAL_16 * restrict bbase;
GFC_REAL_16 * restrict dest;
Index: generated/matmul_r4.c
===================================================================
--- generated/matmul_r4.c (Revision 242477)
+++ generated/matmul_r4.c (Arbeitskopie)
@@ -75,11 +75,47 @@ extern void matmul_r4 (gfc_array_r4 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_r4);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static aux_matmul - function.
+ The user-callable function will then automagically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_r4 (gfc_array_r4 * const restrict retarray,
+ gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_r4 (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_r4 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r4 (gfc_array_r4 * const restrict retarray,
+ gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+matmul_r4 (gfc_array_r4 * const restrict retarray,
+ gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_REAL_4 * restrict abase;
const GFC_REAL_4 * restrict bbase;
GFC_REAL_4 * restrict dest;
Index: generated/matmul_r8.c
===================================================================
--- generated/matmul_r8.c (Revision 242477)
+++ generated/matmul_r8.c (Arbeitskopie)
@@ -75,11 +75,47 @@ extern void matmul_r8 (gfc_array_r8 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_r8);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static aux_matmul - function.
+ The user-callable function will then automagically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_r8 (gfc_array_r8 * const restrict retarray,
+ gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_r8 (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_r8 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r8 (gfc_array_r8 * const restrict retarray,
+ gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+matmul_r8 (gfc_array_r8 * const restrict retarray,
+ gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_REAL_8 * restrict abase;
const GFC_REAL_8 * restrict bbase;
GFC_REAL_8 * restrict dest;
Index: m4/matmul.m4
===================================================================
--- m4/matmul.m4 (Revision 242477)
+++ m4/matmul.m4 (Arbeitskopie)
@@ -76,11 +76,47 @@ extern void matmul_'rtype_code` ('rtype` * const r
int blas_limit, blas_call gemm);
export_proto(matmul_'rtype_code`);
+#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static aux_matmul - function.
+ The user-callable function will then automagically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_'rtype_code` ('rtype` * const restrict retarray,
+ 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones(
+#if defined(HAVE_AVX)
+"avx",
+#endif
+#if defined(HAVE_AVX2)
+"avx2",
+#endif
+#if defined(HAVE_AVX512F)
+"avx512f",
+#endif
+"default")));
+
void
matmul_'rtype_code` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_'rtype_code` (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_'rtype_code` ('rtype` * const restrict retarray,
+ 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+matmul_'rtype_code` ('rtype` * const restrict retarray,
+ 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const 'rtype_name` * restrict abase;
const 'rtype_name` * restrict bbase;
'rtype_name` * restrict dest;