Bug 57169 - fully unrolled matrix multiplication not vectorized
Summary: fully unrolled matrix multiplication not vectorized
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization
Version: 4.9.0
Importance: P3 normal
Target Milestone: 10.0
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks: vectorizer
 
Reported: 2013-05-04 08:55 UTC by vincenzo Innocente
Modified: 2021-12-28 06:23 UTC
CC List: 0 users

See Also:
Host:
Target:
Build:
Known to work: 10.1.0, 12.0
Known to fail:
Last reconfirmed: 2013-05-06 00:00:00


Attachments

Description vincenzo Innocente 2013-05-04 08:55:02 UTC
A lot of legacy code still fully unrolls linear algebra for small dimensions.

As shown below, GCC fails to vectorize an unrolled 4x4 matrix multiplication,
while it vectorizes the corresponding loop version well.

Sample code:

alignas(32) float a[4][4];
alignas(32) float b[4][4];
alignas(32) float c[4][4];

void matmul() {
   for (int i=0;i!=4;++i)
     for (int j=0;j!=4;++j) {
       float sum=0;
       for (int k=0;k!=4;++k)
             sum += a[i][k]*b[k][j];
       c[i][j]=sum;
     }
}


alignas(32) float src1[4][4];
alignas(32) float src2[4][4];
alignas(32) float dest[4][4];

void matmulU(){
  dest[0][0] = src1[0][0] * src2[0][0] + src1[0][1] * src2[1][0] + src1[0][2] * src2[2][0] + src1[0][3] * src2[3][0]; 
  dest[0][1] = src1[0][0] * src2[0][1] + src1[0][1] * src2[1][1] + src1[0][2] * src2[2][1] + src1[0][3] * src2[3][1]; 
  dest[0][2] = src1[0][0] * src2[0][2] + src1[0][1] * src2[1][2] + src1[0][2] * src2[2][2] + src1[0][3] * src2[3][2]; 
  dest[0][3] = src1[0][0] * src2[0][3] + src1[0][1] * src2[1][3] + src1[0][2] * src2[2][3] + src1[0][3] * src2[3][3]; 
  dest[1][0] = src1[1][0] * src2[0][0] + src1[1][1] * src2[1][0] + src1[1][2] * src2[2][0] + src1[1][3] * src2[3][0]; 
  dest[1][1] = src1[1][0] * src2[0][1] + src1[1][1] * src2[1][1] + src1[1][2] * src2[2][1] + src1[1][3] * src2[3][1]; 
  dest[1][2] = src1[1][0] * src2[0][2] + src1[1][1] * src2[1][2] + src1[1][2] * src2[2][2] + src1[1][3] * src2[3][2]; 
  dest[1][3] = src1[1][0] * src2[0][3] + src1[1][1] * src2[1][3] + src1[1][2] * src2[2][3] + src1[1][3] * src2[3][3]; 
  dest[2][0] = src1[2][0] * src2[0][0] + src1[2][1] * src2[1][0] + src1[2][2] * src2[2][0] + src1[2][3] * src2[3][0]; 
  dest[2][1] = src1[2][0] * src2[0][1] + src1[2][1] * src2[1][1] + src1[2][2] * src2[2][1] + src1[2][3] * src2[3][1]; 
  dest[2][2] = src1[2][0] * src2[0][2] + src1[2][1] * src2[1][2] + src1[2][2] * src2[2][2] + src1[2][3] * src2[3][2]; 
  dest[2][3] = src1[2][0] * src2[0][3] + src1[2][1] * src2[1][3] + src1[2][2] * src2[2][3] + src1[2][3] * src2[3][3]; 
  dest[3][0] = src1[3][0] * src2[0][0] + src1[3][1] * src2[1][0] + src1[3][2] * src2[2][0] + src1[3][3] * src2[3][0]; 
  dest[3][1] = src1[3][0] * src2[0][1] + src1[3][1] * src2[1][1] + src1[3][2] * src2[2][1] + src1[3][3] * src2[3][1]; 
  dest[3][2] = src1[3][0] * src2[0][2] + src1[3][1] * src2[1][2] + src1[3][2] * src2[2][2] + src1[3][3] * src2[3][2]; 
  dest[3][3] = src1[3][0] * src2[0][3] + src1[3][1] * src2[1][3] + src1[3][2] * src2[2][3] + src1[3][3] * src2[3][3]; 
};

Generated asm:

c++ -v
Using built-in specs.
COLLECT_GCC=c++
COLLECT_LTO_WRAPPER=/usr/local/libexec/gcc/x86_64-apple-darwin12.3.0/4.9.0/lto-wrapper
Target: x86_64-apple-darwin12.3.0
Configured with: ./configure --disable-multilib --disable-bootstrap --enable-lto -disable-libitm --enable-languages=c,c++,fortran,lto --no-create --no-recursion
Thread model: posix
gcc version 4.9.0 20130428 (experimental) [trunk revision 198366] (GCC) 
Vincenzos-MacBook-Pro:vectorize innocent$ c++ -O3 -march=corei7-avx  -std=c++11 -S matmul.cc -mavx2 -mfma
Vincenzos-MacBook-Pro:vectorize innocent$ cat matmul.s
	.text
	.align 4,0x90
	.globl __Z6matmulv
__Z6matmulv:
LFB0:
	vmovss	8+_b(%rip), %xmm7
	vmovss	24+_b(%rip), %xmm1
	vinsertps	$0x10, 12+_b(%rip), %xmm7, %xmm0
	vmovss	_b(%rip), %xmm7
	vmovss	16+_b(%rip), %xmm2
	vinsertps	$0x10, 4+_b(%rip), %xmm7, %xmm8
	vmovss	40+_b(%rip), %xmm3
	vmovlhps	%xmm0, %xmm8, %xmm8
	vmovss	32+_b(%rip), %xmm4
	vinsertf128	$1, %xmm8, %ymm8, %ymm8
	vinsertps	$0x10, 28+_b(%rip), %xmm1, %xmm0
	vmovss	56+_b(%rip), %xmm7
	vinsertps	$0x10, 20+_b(%rip), %xmm2, %xmm6
	vmovlhps	%xmm0, %xmm6, %xmm6
	vmovss	48+_b(%rip), %xmm1
	vinsertf128	$1, %xmm6, %ymm6, %ymm6
	vinsertps	$0x10, 44+_b(%rip), %xmm3, %xmm0
	vinsertps	$0x10, 36+_b(%rip), %xmm4, %xmm5
	vmovlhps	%xmm0, %xmm5, %xmm5
	vinsertps	$0x10, 60+_b(%rip), %xmm7, %xmm0
	vinsertps	$0x10, 52+_b(%rip), %xmm1, %xmm4
	vmovlhps	%xmm0, %xmm4, %xmm4
	vxorps	%xmm7, %xmm7, %xmm7
	vmovaps	_a(%rip), %ymm0
	vinsertf128	$1, %xmm5, %ymm5, %ymm5
	vinsertf128	$1, %xmm4, %ymm4, %ymm4
	vpermilps	$255, %ymm0, %ymm1
	vpermilps	$170, %ymm0, %ymm2
	vpermilps	$85, %ymm0, %ymm3
	vpermilps	$0, %ymm0, %ymm0
	vfmadd132ps	%ymm8, %ymm7, %ymm0
	vfmadd132ps	%ymm6, %ymm0, %ymm3
	vmovaps	32+_a(%rip), %ymm0
	vfmadd132ps	%ymm5, %ymm3, %ymm2
	vfmadd132ps	%ymm4, %ymm2, %ymm1
	vmovaps	%ymm1, _c(%rip)
	vpermilps	$170, %ymm0, %ymm2
	vpermilps	$255, %ymm0, %ymm1
	vpermilps	$85, %ymm0, %ymm3
	vpermilps	$0, %ymm0, %ymm0
	vfmadd132ps	%ymm8, %ymm7, %ymm0
	vfmadd132ps	%ymm6, %ymm0, %ymm3
	vfmadd132ps	%ymm5, %ymm3, %ymm2
	vfmadd132ps	%ymm4, %ymm2, %ymm1
	vmovaps	%ymm1, 32+_c(%rip)
	vzeroupper
	ret
LFE0:
	.align 4,0x90
	.globl __Z7matmulUv
__Z7matmulUv:
LFB1:
	vmovss	4+_src1(%rip), %xmm5
	vmovss	16+_src2(%rip), %xmm15
	vmovss	_src1(%rip), %xmm4
	vmulss	%xmm15, %xmm5, %xmm1
	vmovss	8+_src1(%rip), %xmm2
	vmovss	12+_src1(%rip), %xmm0
	vmovss	_src2(%rip), %xmm14
	vmovss	32+_src2(%rip), %xmm13
	vmovss	48+_src2(%rip), %xmm12
	vfmadd231ss	%xmm14, %xmm4, %xmm1
	vmovss	20+_src2(%rip), %xmm11
	vfmadd231ss	%xmm13, %xmm2, %xmm1
	vfmadd231ss	%xmm12, %xmm0, %xmm1
	vmovss	%xmm1, _dest(%rip)
	vmovss	4+_src2(%rip), %xmm10
	vmulss	%xmm11, %xmm5, %xmm1
	vmovss	36+_src2(%rip), %xmm9
	vmovss	52+_src2(%rip), %xmm8
	vmovss	24+_src2(%rip), %xmm7
	vmovss	28+_src2(%rip), %xmm6
	vfmadd231ss	%xmm10, %xmm4, %xmm1
	vfmadd231ss	%xmm9, %xmm2, %xmm1
	vfmadd231ss	%xmm8, %xmm0, %xmm1
	vmovss	%xmm1, 4+_dest(%rip)
	vmulss	%xmm7, %xmm5, %xmm1
	vmovss	44+_src2(%rip), %xmm3
	vmulss	%xmm6, %xmm5, %xmm5
	vfmadd231ss	8+_src2(%rip), %xmm4, %xmm1
	vfmadd231ss	40+_src2(%rip), %xmm2, %xmm1
	vfmadd231ss	56+_src2(%rip), %xmm0, %xmm1
	vfmadd231ss	12+_src2(%rip), %xmm4, %xmm5
	vfmadd231ss	%xmm3, %xmm2, %xmm5
	vfmadd231ss	60+_src2(%rip), %xmm0, %xmm5
	vmovss	%xmm5, 12+_dest(%rip)
	vmovss	20+_src1(%rip), %xmm5
	vmovss	%xmm1, 8+_dest(%rip)
	vmovss	16+_src1(%rip), %xmm4
	vmulss	%xmm5, %xmm15, %xmm1
	vmovss	24+_src1(%rip), %xmm2
	vmovss	28+_src1(%rip), %xmm0
	vfmadd231ss	%xmm4, %xmm14, %xmm1
	vfmadd231ss	%xmm2, %xmm13, %xmm1
	vfmadd231ss	%xmm0, %xmm12, %xmm1
	vmovss	%xmm1, 16+_dest(%rip)
	vmulss	%xmm5, %xmm11, %xmm1
	vfmadd231ss	%xmm4, %xmm10, %xmm1
	vfmadd231ss	%xmm2, %xmm9, %xmm1
	vfmadd231ss	%xmm0, %xmm8, %xmm1
	vmovss	%xmm1, 20+_dest(%rip)
	vmulss	%xmm5, %xmm7, %xmm1
	vmulss	%xmm5, %xmm6, %xmm5
	vfmadd231ss	8+_src2(%rip), %xmm4, %xmm1
	vfmadd231ss	40+_src2(%rip), %xmm2, %xmm1
	vfmadd231ss	56+_src2(%rip), %xmm0, %xmm1
	vmovss	%xmm1, 24+_dest(%rip)
	vfmadd231ss	12+_src2(%rip), %xmm4, %xmm5
	vfmadd231ss	%xmm2, %xmm3, %xmm5
	vfmadd231ss	60+_src2(%rip), %xmm0, %xmm5
	vmovss	%xmm5, 28+_dest(%rip)
	vmovss	36+_src1(%rip), %xmm5
	vmovss	32+_src1(%rip), %xmm4
	vmulss	%xmm5, %xmm15, %xmm1
	vmovss	40+_src1(%rip), %xmm2
	vmovss	44+_src1(%rip), %xmm0
	vfmadd231ss	%xmm4, %xmm14, %xmm1
	vfmadd231ss	%xmm2, %xmm13, %xmm1
	vfmadd231ss	%xmm0, %xmm12, %xmm1
	vmovss	%xmm1, 32+_dest(%rip)
	vmulss	%xmm5, %xmm11, %xmm1
	vfmadd231ss	%xmm4, %xmm10, %xmm1
	vfmadd231ss	%xmm2, %xmm9, %xmm1
	vfmadd231ss	%xmm0, %xmm8, %xmm1
	vmovss	%xmm1, 36+_dest(%rip)
	vmulss	%xmm5, %xmm7, %xmm1
	vmulss	%xmm5, %xmm6, %xmm5
	vfmadd231ss	8+_src2(%rip), %xmm4, %xmm1
	vfmadd231ss	40+_src2(%rip), %xmm2, %xmm1
	vfmadd231ss	56+_src2(%rip), %xmm0, %xmm1
	vfmadd231ss	12+_src2(%rip), %xmm4, %xmm5
	vfmadd231ss	%xmm2, %xmm3, %xmm5
	vfmadd231ss	60+_src2(%rip), %xmm0, %xmm5
	vmovss	%xmm5, 44+_dest(%rip)
	vmovss	52+_src1(%rip), %xmm5
	vmovss	48+_src1(%rip), %xmm4
	vmovss	%xmm1, 40+_dest(%rip)
	vmulss	%xmm5, %xmm15, %xmm15
	vmovss	56+_src1(%rip), %xmm2
	vmulss	%xmm5, %xmm11, %xmm11
	vmovss	60+_src1(%rip), %xmm0
	vmulss	%xmm5, %xmm7, %xmm7
	vmulss	%xmm5, %xmm6, %xmm5
	vfmadd231ss	%xmm4, %xmm14, %xmm15
	vfmadd231ss	%xmm2, %xmm13, %xmm15
	vfmadd231ss	%xmm0, %xmm12, %xmm15
	vfmadd132ss	%xmm4, %xmm11, %xmm10
	vmovss	%xmm15, 48+_dest(%rip)
	vfmadd132ss	%xmm2, %xmm10, %xmm9
	vfmadd231ss	8+_src2(%rip), %xmm4, %xmm7
	vfmadd231ss	%xmm0, %xmm8, %xmm9
	vfmadd231ss	40+_src2(%rip), %xmm2, %xmm7
	vfmadd132ss	12+_src2(%rip), %xmm5, %xmm4
	vfmadd132ss	%xmm3, %xmm4, %xmm2
	vfmadd231ss	56+_src2(%rip), %xmm0, %xmm7
	vfmadd231ss	60+_src2(%rip), %xmm0, %xmm2
	vmovss	%xmm9, 52+_dest(%rip)
	vmovss	%xmm7, 56+_dest(%rip)
	vmovss	%xmm2, 60+_dest(%rip)
	ret
Comment 1 Richard Biener 2013-05-06 11:33:29 UTC
This is because basic-block SLP does not support vectorizing reductions.
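
(Illustration, not from the original testcase: a minimal sketch of the kind of basic-block reduction meant here. The whole add chain sits in one basic block, with no surrounding loop, so only basic-block SLP, not the loop vectorizer, could turn it into a vector multiply plus a horizontal reduction. The names x, y and dot4 are made up for this sketch.)

alignas(16) float x[4];
alignas(16) float y[4];

float dot4() {
  // Straight-line dot product: an associative chain of scalar adds with
  // no loop around it, i.e. a reduction confined to a single basic block.
  return x[0]*y[0] + x[1]*y[1] + x[2]*y[2] + x[3]*y[3];
}

Each dest[i][j] statement in matmulU above has the same shape.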
Comment 2 Ilya Palachev 2014-03-27 10:07:57 UTC
(In reply to Richard Biener from comment #1)
> This is because basic-block SLP does not support vectorizing reductions.


The page http://gcc.gnu.org/wiki/VectorizationTasks states that generalizing reduction support (http://gcc.gnu.org/ml/gcc-patches/2006-04/msg00172.html) could help fix PR25621.

Does this bug have the same cause as the one discussed in PR25621?
Comment 3 Andrew Pinski 2021-12-28 06:23:06 UTC
GCC 6-9 vectorizes both, but matmul uses scalar loads until GCC 10.

So this is all fixed in GCC 10+.
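
(Retest sketch, assuming the usual workflow rather than what was actually run: compiling the testcase above with a recent GCC and asking for a vectorizer report, e.g.

  g++ -O3 -march=corei7-avx -mavx2 -mfma -std=c++11 -fopt-info-vec -S matmul.cc

should report both matmul and matmulU as vectorized on GCC 10+, with the generated assembly using vector loads instead of the scalar vmovss sequences shown in the original report. Exact output depends on GCC version and target.)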