[Patch, libgfortran] PR21468 Vectorizing matmul, other perf improvements

Janne Blomqvist jblomqvi@cc.hut.fi
Mon Nov 14 20:18:00 GMT 2005


On Sun, Nov 13, 2005 at 08:48:55PM +0100, Thomas Koenig wrote:
> On Sun, Nov 13, 2005 at 05:52:37PM +0200, Janne Blomqvist wrote:
> 
> > I have looked at PR21468, "Vectorizing libgfortran". The attached
> > patch enables vectorizing the main loop for matmul in case of unit
> > stride. It doesn't change the code much, only adds const and restrict
> > appropriately to the type declarations, so that the vectorizer can
> > safely vectorize the main loop. 
> 
> This is OK.  You can commit this part.

Committed. Matmul is now vectorized by default on those targets where
SIMD instructions are available.

> Maybe this needs to become platform-dependent.  I think we need
> some more benchmarks here.

I updated the benchmark I posted yesterday slightly and did some
measurements on a 1.8 GHz A64 (i686-pc-linux-gnu; sorry, no 64-bit
results). It seems that while -funroll-loops improves performance over
the baseline, it doesn't bring any further improvement when combined
with vectorization. Perhaps it's different on x86-64, where there are
twice as many SSE2 registers. If that's the case, I propose that we
enable -funroll-loops, as it increases performance for "bare" x86
(which doesn't vectorize, since SSE2 isn't used by default). As for
-frename-registers, mentioned by Tim Prince, it is enabled by default
when -funroll-loops is used.

trunk:

 Matrix side size    Matmul (Gflops/s)    sgemm (Gflops/s)
 ========================================================
    2                0.084                0.022
    4                0.346                0.157
    8                0.794                0.455
   16                0.714                0.741
   32                0.901                1.250
   64                0.989                1.332
  128                1.022                2.827
  256                0.848                4.466
  512                0.773                4.790
 1024                0.769                4.969
 2048                0.778                4.972

With -funroll-loops:

 Matrix side size    Matmul (Gflops/s)    sgemm (Gflops/s)
 ========================================================
    2                0.056                0.020
    4                0.388                0.149
    8                0.794                0.459
   16                1.042                0.719
   32                1.205                1.282
   64                1.264                1.407
  128                1.172                2.827
  256                1.000                4.187
  512                0.912                4.877
 1024                0.906                4.946
 2048                0.918                5.047


With -ftree-vectorize and -msse2 (SSE2 is the default on x86-64, and is
needed for vectorization):

 Matrix side size    Matmul (Gflops/s)    sgemm (Gflops/s)
 ========================================================
    2                0.059                0.021
    4                0.274                0.159
    8                0.467                0.467
   16                0.807                0.730
   32                1.316                1.235
   64                1.491                1.350
  128                1.922                2.746
  256                1.675                4.785
  512                1.060                4.877
 1024                1.060                4.901
 2048                1.065                5.029


Both -ftree-vectorize and -funroll-loops (and -msse2 as well):

 Matrix side size    Matmul (Gflops/s)    sgemm (Gflops/s)
 ========================================================
    2                0.047                0.022
    4                0.261                0.163
    8                0.380                0.459
   16                0.645                0.741
   32                1.000                1.282
   64                1.469                1.427
  128                1.884                2.827
  256                1.675                4.785
  512                1.052                4.877
 1024                1.058                4.946
 2048                1.059                5.042


-- 
Janne Blomqvist
-------------- next part --------------
program matmul_bench
  implicit none

  integer, parameter :: wp = selected_real_kind(4), &
       dp = selected_real_kind(15)

  call runbench (2500)

contains

  ! Run matrix mult benchmark with different sized arrays.
  subroutine runbench (nmax)
    integer, intent(in) :: nmax
    real(wp), allocatable, dimension(:,:) :: a, b, res
    integer :: n, loop
    real(dp) :: time, flops, time2

    print *, 'Matrix side size    Matmul (Gflops/s)    sgemm (Gflops/s)'
    print *, '========================================================'
    n = 2
    do
       allocate (a(n,n), b(n,n), res(n,n))
       call random_number (a)
       call random_number (b)
       res = 0.0_wp
       ! matmul for a square matrix is (2n-1)*n**2 flops. The loop count
       ! is chosen so each timing loop does about 1e8 flops in total;
       ! assuming roughly a 1 Gflop/s CPU, that takes about 0.1 seconds
       ! and should be enough.
       flops = (2.0_dp * real (n, dp) - 1.0_dp) * real (n, dp)**2
       loop = max (int (1.0e8_dp / flops), 1)
       call matmul_timing (a, b, res, loop, time)
       res = 0.0_wp
       call sgemm_timing (a, b, res, loop, time2)
       print '(I5,15X,F6.3,15X,F6.3)', n, &
            flops * real(loop, dp) / time / 1.0e9_dp, &
            flops * real (loop, dp) / time2 / 1.0e9_dp
       deallocate (a, b, res)
       n = n * 2
       if (n > nmax) exit
    end do
  end subroutine runbench

  ! Actual routine, and timing.
  subroutine matmul_timing (a, b, res, loop, time)
    real(wp), intent(in), dimension(:, :) :: a, b
    real(wp), intent(inout) :: res(:,:)
    integer, intent(in) :: loop
    real(dp), intent(out) :: time
    real(dp) :: t1, t2
    integer :: i

    call cpu_time (t1)
    do i = 1, loop
       res = matmul (a, b)
    end do
    call cpu_time (t2)
    time = t2 - t1
  end subroutine matmul_timing

  subroutine sgemm_timing (a, b, res, loop, time)
    real(wp), intent(in), dimension(:, :) :: a, b
    real(wp), intent(inout) :: res(:,:)
    integer, intent(in) :: loop
    real(dp), intent(out) :: time
    real(dp) :: t1, t2
    integer :: i, n

    n = size (a, 1)
    call cpu_time (t1)
    do i = 1, loop
       call sgemm('n','n',n, n, n, 1.0_wp, a, n, b, n, 0.0_wp, res, n)
    end do
    call cpu_time (t2)
    time = t2 - t1
  end subroutine sgemm_timing
    
end program matmul_bench