This is the mail archive of the fortran@gcc.gnu.org mailing list for the GNU Fortran project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: patches for increased performance of matmul, dotprod, transpose


Tobias Schlüter wrote:

Hi Tim,


If the manual loop unrolling is really helpful, I think an optimizer bug
should be filed.  On IRC our optimizer guys told me that at least the
modification to matmul should already be done automatically, so I'm wondering,
 if you have any benchmark numbers supporting this modification?  Can't the
same effect be obtained by building libgfortran with -funroll-loops?

A procedural point: your patch doesn't conform to the GNU coding style
(comments should begin with a capital letter and end in "punctuation + two
spaces + */", always put blanks before and after operators, indent comments to
code); also if you find you really need to hand-optimize code, please add a
comment explaining why, adding "FIXME" and a PR number if you think you're
working around an optimizer bug.

WRT your mail:  paragraphs make reading and also responding to emails much
easier.  ChangeLogs are also useful for referencing parts when discussing the
patch.

- Tobi


These changes are complementary to -funroll-loops. That optimization does not duplicate any of these source changes. gcc might be considered deficient in that it does not automatically perform scalar reductions (to move store instructions out of a loop, where there is more than one sum) or strength reduction (to move integer multiplication out of a loop), where it is dictated in the matmul patch. I did not change the existing usage of * (multiplication) without spaces surrounding the operator, but included the spaces in my additional copies of those expressions.

Performance of Livermore Fortran Kernels 3 and 21 on Pentium D x86-64 exceeds ifort with these patches. Performance of Kernel 4 is doubled from previous libgfortran, and Kernel 6 is speeded up significantly. This assumes those kernels are written using dotprod() and matmul().
I verified performance gain also on ia64 linux and Pentium-m cygwin.


changelog
gcc/fortran
* transpose.m4: Swap inner and outer loops, to make the destination stride 1, taking advantage of Write Combine buffering architectures.
* dotprod.m4: Accumulate sum by pairs, to permit up to double the operations in the pipeline.
Optimization primarily to speed up floating point (not complex).
* matmul.m4: Swap inner loops, moving store instructions out of inner loop. Calculate results
in pairs, combining operations with a common multiplier. Dictate scalar reduction of sums and
strength reduction of index multiplication, where not optimized automatically by gcc. Net 50%
reduction in memory traffic.


*** orig/matmul.m4	Sun Aug 28 10:00:21 2005
--- matmul.m4	Mon Aug 29 12:04:28 2005
*************** sinclude(`matmul_asm_'rtype_code`.m4')dn
*** 179,183 ****
        rtype_name *dest_y;
        rtype_name *abase_n;
-       rtype_name bbase_yn;
  
        if (rystride == ycount)
--- 179,182 ----
*************** sinclude(`matmul_asm_'rtype_code`.m4')dn
*** 188,191 ****
--- 187,191 ----
  	    for (x = 0; x < xcount; x++)
  	      dest[x + y*rystride] = (rtype_name)0;
+ 	/* Possible FIXME:  original code does not surround * by spaces.  */
  	}
  
*************** sinclude(`matmul_asm_'rtype_code`.m4')dn
*** 194,206 ****
  	  bbase_y = bbase + y*bystride;
  	  dest_y = dest + y*rystride;
! 	  for (n = 0; n < count; n++)
  	    {
! 	      abase_n = abase + n*aystride;
! 	      bbase_yn = bbase_y[n];
! 	      for (x = 0; x < xcount; x++)
  		{
! 		  dest_y[x] += abase_n[x] * bbase_yn;
  		}
  	    }
  	}
      }
--- 194,220 ----
  	  bbase_y = bbase + y*bystride;
  	  dest_y = dest + y*rystride;
! 	  for (x = 0; x < xcount-1; x += 2)
! 	  /* Loops interchanged, loop unrolled by 2 to improve addition
! 	  	throughput and register locality.  Not necessarily 
! 		advantageous for integer version.  */
  	    {
! 	      rtype_name sum=0,sum1=0;
! 	      /* Possible FIXME:  sums should be registerized automatically
! 	      	by compiler  */
! 	      for (n = 0; n < count; n++)
  		{
! 		  abase_n = abase + n * aystride;
! 		  sum += abase_n[x] * bbase_y[n];
! 		  sum1 += abase_n[x+1] * bbase_y[n];
  		}
+ 	      dest_y[x] += sum;
+ 	      dest_y[x+1] += sum1;
  	    }
+ 	  if ( x < xcount)
+ 	      for (n = 0; n < count; n++)
+ 		{
+ 		  abase_n = abase + n * aystride;
+ 		  dest_y[x] += abase_n[x] * bbase_y[n];
+ 		}
  	}
      }
*************** sinclude(`matmul_asm_'rtype_code`.m4')dn
*** 209,219 ****
        for (y = 0; y < ycount; y++)
  	for (x = 0; x < xcount; x++)
! 	  dest[x*rxstride + y*rystride] = (rtype_name)0;
  
        for (y = 0; y < ycount; y++)
- 	for (n = 0; n < count; n++)
  	  for (x = 0; x < xcount; x++)
! 	    /* dest[x,y] += a[x,n] * b[n,y] */
! 	    dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
      }
  }
--- 223,243 ----
        for (y = 0; y < ycount; y++)
  	for (x = 0; x < xcount; x++)
! 	  dest[x*rxstride + y*rystride] = (GFC_REAL_8)0;
  
        for (y = 0; y < ycount; y++)
  	  for (x = 0; x < xcount; x++)
! 	    {
! 	      rtype_name sum=0;
! 	      /* Possible FIXME:  strength reduction for n * [ab]ystride
! 	      	should not require writing out by hand */
! 	      index_type nay = 0, nbx = 0;
! 	      for (n = 0; n < count; n++)
! 		{
! 		  sum += abase[x * axstride + nay] * bbase[nbx + y * bystride];
! 		  nay += aystride;
! 		  nbx += bxstride;
! 		}
! 	      dest[x * rxstride + y * rystride] += sum;
! 	    }
      }
  }
*** orig/transpose.m4	Sun Aug 28 10:02:33 2005
--- transpose.m4	Mon Aug 29 11:29:42 2005
*************** transpose_`'rtype_code (rtype * ret, rty
*** 85,99 ****
    sptr = source->data;
  
!   for (y=0; y < ycount; y++)
      {
!       for (x=0; x < xcount; x++)
!         {
            *rptr = *sptr;
  
!           sptr += sxstride;
!           rptr += rystride;
          }
!         sptr += systride - (sxstride * xcount);
!         rptr += rxstride - (rystride * xcount);
      }
  }
--- 85,101 ----
    sptr = source->data;
  
! /* Keep writes local by working on minimum rstride.
! 	Improve Write Combine buffering  */
!   for (x=0; x < xcount; x++)
      {
!       for (y=0; y < ycount; y++)
! 	{
            *rptr = *sptr;
  
!           sptr += systride;
!           rptr += rxstride;
          }
!         sptr += sxstride - (systride * ycount);
!         rptr += rystride - (rxstride * ycount);
      }
  }
*** orig/dotprod.m4	Sun Aug 28 10:49:51 2005
--- dotprod.m4	Mon Aug 29 11:30:46 2005
*************** dot_product_`'rtype_code (rtype * a, rty
*** 67,76 ****
  sinclude(`dotprod_asm_'rtype_code`.m4')dnl
  
!   while (count--)
      {
!       res += *pa * *pb;
!       pa += astride;
!       pb += bstride;
      }
  
    return res;
--- 67,82 ----
  sinclude(`dotprod_asm_'rtype_code`.m4')dnl
  
!   for(;count>1;count-=2)
!   /* Summing by pairs allows up to double pipeline throughput
! 	when multiply issue rate exceeds add latency.  
! 	Not as beneficial for complex or integer on many platforms.  */
      {
!       res += *pa * *pb + pa[astride] * pb[bstride];
!       pa += astride*2;
!       pb += bstride*2;
      }
+   /* Take care of odd remainder */
+   if (count>0)
+       res += *pa * *pb;
  
    return res;

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]