[Patch, libgfortran] PR21468 Vectorizing matmul, other perf improvements

Sun Nov 13 15:58:00 GMT 2005

Hello,

I have looked at PR21468, "Vectorizing libgfortran". The attached
patch enables vectorizing the main loop for matmul in case of unit
stride. It doesn't change the code much, only adds const and restrict
appropriately to the type declarations, so that the vectorizer can
safely vectorize the main loop. 

While I don't have the figures handy at the moment, as far as I recall
it improved matmul_r4 on my AMD64 by about 20-30% for arrays that fit
into cache.

Another thing that it does is to enable -funroll-loops for matmul_* and
matmull_*. This is actually a bigger performance booster than
vectorizing, almost doubling performance.

The results below show the difference loop unrolling makes for
matmull_l4. The difference is similar for matmul_r4.

With -ftree-vectorize -funroll-loops for libmatmul.la and the default
flags for libmatmull.la:

 Multiplying two          600 x         600  matrices consumes cpu time as:
 matmul:    2.612602      seconds.
      100000  times matrix multiply for            9 x           9  matrices:
 matmul:   0.2409642      seconds.
      100000  times logical matmul for            9 x           9  matrix:
  0.4789269      seconds.

With -funroll-loops for libmatmull.la too:

 Multiplying two          600 x         600  matrices consumes cpu time as:
 matmul:    2.653597      seconds.
      100000  times matrix multiply for            9 x           9  matrices:
 matmul:   0.2399640      seconds.
      100000  times logical matmul for            9 x           9  matrix:
  0.3049529      seconds.


I think that differences this large warrant exceptions to the normal
"-g -O2". The attached diff to Makefile.am achieves this.

Finally, I think that in general the intrinsic function arguments
(which are all intent(in)) should be declared like

const gfc_type_t * const restrict foo

Unfortunately the first const won't work for array intrinsics, like
matmul, since they sometimes change the descriptors. Anyway, I plan to
do this for other intrinsics after this patch is OK'd. Hopefully it'll
allow some further vectorization with very little effort.

Regtested on i686-pc-linux-gnu.

-- 
Janne Blomqvist
-------------- next part --------------
2005-11-13  Janne Blomqvist  <jb@gcc.gnu.org>

	PR fortran/21468
	* Makefile.am: Add special flags for compiling matmul.
	* m4/matmul.m4: Add const and restrict to type declarations as
	appropriate.
	* m4/matmull.m4: Likewise.
	* Makefile.in: Regenerated.
	* aclocal.m4: Likewise.
	* generated/matmul_*.c: Likewise.
	
-------------- next part --------------
Index: Makefile.am
===================================================================

--- Makefile.am	(revision 106827)
+++ Makefile.am	(working copy)
@@ -431,7 +431,7 @@
 gfor_built_src= $(i_all_c) $(i_any_c) $(i_count_c) $(i_maxloc0_c) \
     $(i_maxloc1_c) $(i_maxval_c) $(i_minloc0_c) $(i_minloc1_c) $(i_minval_c) \
     $(i_product_c) $(i_sum_c) $(i_dotprod_c) $(i_dotprodl_c) $(i_dotprodc_c) \
-    $(i_matmul_c) $(i_matmull_c) $(i_transpose_c) $(i_shape_c) $(i_eoshift1_c) \
+    $(i_transpose_c) $(i_shape_c) $(i_eoshift1_c) \
     $(i_eoshift3_c) $(i_cshift1_c) $(i_reshape_c) $(in_pack_c) $(in_unpack_c) \
     $(i_exponent_c) $(i_fraction_c) $(i_nearest_c) $(i_set_exponent_c) \
     $(i_pow_c) \
@@ -571,11 +571,19 @@
 intrinsics/dprod_r8.f90 \
 intrinsics/f2c_specifics.F90
 
+EXTRA_LTLIBRARIES = libmatmul.la libmatmull.la
+libmatmul_la_SOURCES = $(i_matmul_c)
+libmatmul_la_CFLAGS = -ftree-vectorize -funroll-loops $(AM_CFLAGS)
+libmatmull_la_SOURCES =  $(i_matmull_c)
+libmatmull_la_CFLAGS = -funroll-loops $(AM_CFLAGS)
+
 BUILT_SOURCES=$(gfor_built_src) $(gfor_built_specific_src) \
-    $(gfor_built_specific2_src)
+    $(gfor_built_specific2_src) $(libmatmul_la_SOURCES) $(libmatmull_la_SOURCES)
 libgfortran_la_SOURCES = $(gfor_src) $(gfor_built_src) $(gfor_io_src) \
     $(gfor_helper_src) $(gfor_io_headers) $(gfor_specific_src)
 
+libgfortran_la_LIBADD =  libmatmul.la libmatmull.la
+
 I_M4_DEPS=m4/iparm.m4
 I_M4_DEPS0=$(I_M4_DEPS) m4/iforeach.m4
 I_M4_DEPS1=$(I_M4_DEPS) m4/ifunction.m4
Index: m4/matmul.m4
===================================================================
--- m4/matmul.m4	(revision 106827)
+++ m4/matmul.m4	(working copy)
@@ -49,15 +49,17 @@
          C(I,J) = C(I,J)+A(I,K)*B(K,J)
 */
 
-extern void matmul_`'rtype_code (rtype * retarray, rtype * a, rtype * b);
+extern void matmul_`'rtype_code (rtype * const restrict retarray, 
+	rtype * const restrict a, rtype * const restrict b);
 export_proto(matmul_`'rtype_code);
 
 void
-matmul_`'rtype_code (rtype * retarray, rtype * a, rtype * b)
+matmul_`'rtype_code (rtype * const restrict retarray, 
+	rtype * const restrict a, rtype * const restrict b)
 {
-  rtype_name *abase;
-  rtype_name *bbase;
-  rtype_name *dest;
+  const rtype_name * restrict abase;
+  const rtype_name * restrict bbase;
+  rtype_name * restrict dest;
 
   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
   index_type x, y, n, count, xcount, ycount;
@@ -106,12 +108,10 @@
       retarray->offset = 0;
     }
 
-  abase = a->data;
-  bbase = b->data;
-  dest = retarray->data;
-
   if (retarray->dim[0].stride == 0)
     retarray->dim[0].stride = 1;
+
+  /* This prevents constifying the input arguments.  */
   if (a->dim[0].stride == 0)
     a->dim[0].stride = 1;
   if (b->dim[0].stride == 0)
@@ -177,9 +177,9 @@
 
   if (rxstride == 1 && axstride == 1 && bxstride == 1)
     {
-      rtype_name *bbase_y;
-      rtype_name *dest_y;
-      rtype_name *abase_n;
+      const rtype_name * restrict bbase_y;
+      rtype_name * restrict dest_y;
+      const rtype_name * restrict abase_n;
       rtype_name bbase_yn;
 
       if (rystride == ycount)
Index: m4/matmull.m4
===================================================================
--- m4/matmull.m4	(revision 106827)
+++ m4/matmull.m4	(working copy)
@@ -39,15 +39,17 @@
 /* Dimensions: retarray(x,y) a(x, count) b(count,y).
    Either a or b can be rank 1.  In this case x or y is 1.  */
 
-extern void matmul_`'rtype_code (rtype *, gfc_array_l4 *, gfc_array_l4 *);
+extern void matmul_`'rtype_code (rtype * const restrict, 
+	gfc_array_l4 * const restrict, gfc_array_l4 * const restrict);
 export_proto(matmul_`'rtype_code);
 
 void
-matmul_`'rtype_code (rtype * retarray, gfc_array_l4 * a, gfc_array_l4 * b)
+matmul_`'rtype_code (rtype * const restrict retarray, 
+	gfc_array_l4 * const restrict a, gfc_array_l4 * const restrict b)
 {
-  GFC_INTEGER_4 *abase;
-  GFC_INTEGER_4 *bbase;
-  rtype_name *dest;
+  const GFC_INTEGER_4 * restrict abase;
+  const GFC_INTEGER_4 * restrict bbase;
+  rtype_name * restrict dest;
   index_type rxstride;
   index_type rystride;
   index_type xcount;
@@ -57,8 +59,8 @@
   index_type x;
   index_type y;
 
-  GFC_INTEGER_4 *pa;
-  GFC_INTEGER_4 *pb;
+  const GFC_INTEGER_4 * restrict pa;
+  const GFC_INTEGER_4 * restrict pb;
   index_type astride;
   index_type bstride;
   index_type count;
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 185 bytes
Desc: not available
URL: <http://gcc.gnu.org/pipermail/gcc-patches/attachments/20051113/3ef37042/attachment.sig>