[patch, fortran] Enable FMA for AVX2 and AVX512F for matmul

Thu Mar 2 12:02:00 GMT 2017

On Thu, Mar 02, 2017 at 12:57:05PM +0100, Thomas Koenig wrote:
> --- m4/matmul.m4	(Revision 245836)
> +++ m4/matmul.m4	(Arbeitskopie)
> @@ -123,9 +123,14 @@ void matmul_'rtype_code` ('rtype` * const restrict
>  	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
>  	int blas_limit, blas_call gemm) = NULL;

Please drop the " = NULL" here

> +  void (*matmul_fn) ('rtype` * const restrict retarray, 
> +	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
> +	int blas_limit, blas_call gemm) = NULL;

and here as well.  The first one because static variables are
zero-initialized by default, the latter because it makes no sense to
initialize it and then immediately overwrite it in the next statement.

> +
> +  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
>    if (matmul_p == NULL)

This needs to test matmul_fn == NULL instead of matmul_p == NULL.

> @@ -151,14 +156,15 @@ void matmul_'rtype_code` ('rtype` * const restrict
>  #ifdef HAVE_AVX
>        	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
>   	    {
> -              matmul_p = matmul_'rtype_code`_avx;
> -	      goto tailcall;
> +              matmul_fn = matmul_'rtype_code`_avx;
> +	      goto store;
>  	    }
>  #endif  /* HAVE_AVX */
>          }
> +   store:
> +      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
>     }
>  
> -tailcall:
>     (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);

And this needs to use *matmul_fn instead of *matmul_p too.
The whole point is that matmul_p is only ever read using __atomic_load_n
and only written (when a store is needed at all) using __atomic_store_n;
everything else should go through the local matmul_fn.

Ok with those changes.

	Jakub