[Bug target/62178] New: [AArch64] Performance regression on matrix matrix multiply due to r211211
- From: "spop at gcc dot gnu.org" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Mon, 18 Aug 2014 22:47:00 +0000
- Subject: [Bug target/62178] New: [AArch64] Performance regression on matrix matrix multiply due to r211211
- Auto-submitted: auto-generated
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=62178
            Bug ID: 62178
           Summary: [AArch64] Performance regression on matrix matrix
                    multiply due to r211211
           Product: gcc
           Version: 5.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: spop at gcc dot gnu.org
int a[30 +1][30 +1], b[30 +1][30 +1], r[30 +1][30 +1];

void Intmm (int run) {
    int i, j, k;
    for ( i = 1; i <= 30; i++ )
        for ( j = 1; j <= 30; j++ ) {
            r[i][j] = 0;
            for(k = 1; k <= 30; k++ )
                r[i][j] += a[i][k]*b[k][j];
        }
}
Compile this at -O3 with the last good compiler r211210 and with the first bad
compiler r211211, then diff the assembly.
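A minimal sketch of the reproduce steps, assuming the testcase above is saved as
intmm.c and an aarch64 cross compiler is used (the file name and compiler name
are assumptions, not part of the report):

    aarch64-linux-gnu-gcc -O3 -S intmm.c -o good.s    # compiler built from r211210
    aarch64-linux-gnu-gcc -O3 -S intmm.c -o bad.s     # compiler built from r211211
    diff -u good.s bad.s

The diff between the two: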
--- good.s 2014-08-18 17:44:26.179506000 -0500
+++ bad.s 2014-08-18 17:44:26.213807000 -0500
@@ -6,45 +6,44 @@
.type Intmm, %function
Intmm:
movi v3.2s, 0
- adrp x6, a+128
- adrp x8, r+128
- adrp x10, r+3848
- adrp x9, b+128
- adrp x7, b+248
- add x6, x6, :lo12:a+128
- add x8, x8, :lo12:r+128
- add x10, x10, :lo12:r+3848
- add x9, x9, :lo12:b+128
- add x7, x7, :lo12:b+248
+ adrp x6, r+128
+ adrp x4, a+124
+ adrp x8, r+3848
+ adrp x7, b
+ add x6, x6, :lo12:r+128
+ add x4, x4, :lo12:a+124
+ add x8, x8, :lo12:r+3848
+ add x7, x7, :lo12:b
.L2:
- mov x5, x8
- mov x4, x8
- mov x3, x9
+ mov x5, 0
.L4:
- str d3, [x4]
- add x2, x3, 3720
- movi v0.2s, 0
- mov x1, x6
- mov x0, x3
+ str d3, [x6, x5]
+ add x3, x5, 128
+ movi v1.2s, 0
+ add x3, x3, x7
+ mov x0, 0
.L3:
- ldr d1, [x0]
- add x0, x0, 124
- ld1r {v2.2s}, [x1], 4
- cmp x0, x2
- mla v0.2s, v2.2s, v1.2s
+ add x1, x4, x0
+ lsl x2, x0, 5
+ sub x2, x2, x0
+ add x0, x0, 4
+ cmp x0, 120
+ ldr w1, [x1, 4]
+ ldr d2, [x3, x2]
+ dup v0.2s, w1
+ mla v1.2s, v0.2s, v2.2s
bne .L3
- str d0, [x5], 8
- add x3, x3, 8
- cmp x3, x7
- add x4, x4, 8
+ str d1, [x6, x5]
+ add x5, x5, 8
+ cmp x5, 120
bne .L4
- add x8, x8, 124
add x6, x6, 124
- cmp x8, x10
+ add x4, x4, 124
+ cmp x6, x8
bne .L2
ret
.size Intmm, .-Intmm
.comm r,3844,8
.comm b,3844,8
.comm a,3844,8
Note that the innermost loop .L3 contains 4 more instructions with the bad
compiler, due to more scalar computations for the addressing modes:
.L3:
- ldr d1, [x0]
- add x0, x0, 124
- ld1r {v2.2s}, [x1], 4
- cmp x0, x2
- mla v0.2s, v2.2s, v1.2s
+ add x1, x4, x0
+ lsl x2, x0, 5
+ sub x2, x2, x0
+ add x0, x0, 4
+ cmp x0, 120
+ ldr w1, [x1, 4]
+ ldr d2, [x3, x2]
+ dup v0.2s, w1
+ mla v1.2s, v0.2s, v2.2s
bne .L3
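As a C-level illustration of where the extra instructions come from (a sketch only,
not compiler output; the function and variable names are made up), the good code
strength-reduces the k counter into pointers that step by one element through
a[i][.] and by one 124-byte row through b[.][j], compared against a precomputed end
pointer, while the bad code keeps k live, multiplies it by the 31-int row stride of
b on every iteration (the lsl/sub pair), and splits the single ld1r load-and-splat
of a[i][k] into a scalar ldr plus dup:

extern int a[31][31], b[31][31];

/* Sketch of the good inner loop after strength reduction: the vectorized
   code accumulates columns j and j+1 at the same time.  */
static void inner_good (int i, int j, int *acc0, int *acc1)
{
    const int *ap = &a[i][1];        /* advanced by one int per iteration (ld1r post-increment) */
    const int *bp = &b[1][j];        /* advanced by one row, 31 ints = 124 bytes, per iteration */
    const int *bend = bp + 30 * 31;  /* precomputed end pointer (the add x2, x3, 3720) */
    for (; bp != bend; ap += 1, bp += 31)
    {
        *acc0 += ap[0] * bp[0];
        *acc1 += ap[0] * bp[1];
    }
}

/* Sketch of the bad inner loop: k stays live and the address of b[k][j]
   is rebuilt from k on every iteration.  */
static void inner_bad (int i, int j, int *acc0, int *acc1)
{
    int k;
    for (k = 1; k <= 30; k++)
    {
        *acc0 += a[i][k] * b[k][j];      /* offset (k*31 + j)*4: the k*31 is the lsl/sub pair */
        *acc1 += a[i][k] * b[k][j + 1];
    }
}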