This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[Bug target/62178] New: [AArch64] Performance regression on matrix matrix multiply due to r211211


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=62178

            Bug ID: 62178
           Summary: [AArch64] Performance regression on matrix matrix
                    multiply due to r211211
           Product: gcc
           Version: 5.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: spop at gcc dot gnu.org

/* Three global 31x31 int matrices: r receives the product of a and b. */
int a[30 +1][30 +1], b[30 +1][30 +1], r[30 +1][30 +1];

/* Reproducer: computes r = a * b over indices 1..30 in each dimension.
   Row 0 and column 0 of the arrays are neither read nor written here.
   The `run` parameter is not used by the body. */
void Intmm (int run) {
  int i, j, k;

  for ( i = 1; i <= 30; i++ )
    for ( j = 1; j <= 30; j++ ) {
      r[i][j] = 0;
      /* Dot product of row i of a with column j of b; b is walked down a
         column, so consecutive k accesses are one full row apart. */
      for(k = 1; k <= 30; k++ )
        r[i][j] += a[i][k]*b[k][j];
    }
}

Compile this at -O3 with the last good compiler (r211210) and with the first bad
compiler (r211211), then diff the generated assembly:

--- good.s    2014-08-18 17:44:26.179506000 -0500
+++ bad.s    2014-08-18 17:44:26.213807000 -0500
@@ -6,45 +6,44 @@
     .type    Intmm, %function
 Intmm:
     movi    v3.2s, 0
-    adrp    x6, a+128
-    adrp    x8, r+128
-    adrp    x10, r+3848
-    adrp    x9, b+128
-    adrp    x7, b+248
-    add    x6, x6, :lo12:a+128
-    add    x8, x8, :lo12:r+128
-    add    x10, x10, :lo12:r+3848
-    add    x9, x9, :lo12:b+128
-    add    x7, x7, :lo12:b+248
+    adrp    x6, r+128
+    adrp    x4, a+124
+    adrp    x8, r+3848
+    adrp    x7, b
+    add    x6, x6, :lo12:r+128
+    add    x4, x4, :lo12:a+124
+    add    x8, x8, :lo12:r+3848
+    add    x7, x7, :lo12:b
 .L2:
-    mov    x5, x8
-    mov    x4, x8
-    mov    x3, x9
+    mov    x5, 0
 .L4:
-    str    d3, [x4]
-    add    x2, x3, 3720
-    movi    v0.2s, 0
-    mov    x1, x6
-    mov    x0, x3
+    str    d3, [x6, x5]
+    add    x3, x5, 128
+    movi    v1.2s, 0
+    add    x3, x3, x7
+    mov    x0, 0
 .L3:
-    ldr    d1, [x0]
-    add    x0, x0, 124
-    ld1r    {v2.2s}, [x1], 4
-    cmp    x0, x2
-    mla    v0.2s, v2.2s, v1.2s
+    add    x1, x4, x0
+    lsl    x2, x0, 5
+    sub    x2, x2, x0
+    add    x0, x0, 4
+    cmp    x0, 120
+    ldr    w1, [x1, 4]
+    ldr    d2, [x3, x2]
+    dup    v0.2s, w1
+    mla    v1.2s, v0.2s, v2.2s
     bne    .L3
-    str    d0, [x5], 8
-    add    x3, x3, 8
-    cmp    x3, x7
-    add    x4, x4, 8
+    str    d1, [x6, x5]
+    add    x5, x5, 8
+    cmp    x5, 120
     bne    .L4
-    add    x8, x8, 124
     add    x6, x6, 124
-    cmp    x8, x10
+    add    x4, x4, 124
+    cmp    x6, x8
     bne    .L2
     ret
     .size    Intmm, .-Intmm
     .comm    r,3844,8
     .comm    b,3844,8
     .comm    a,3844,8

Note that the innermost loop .L3 contains 4 more instructions (10 instead of 6)
with the bad compiler, due to extra scalar computations for the addressing modes:

 .L3:
-    ldr    d1, [x0]
-    add    x0, x0, 124
-    ld1r    {v2.2s}, [x1], 4
-    cmp    x0, x2
-    mla    v0.2s, v2.2s, v1.2s
+    add    x1, x4, x0
+    lsl    x2, x0, 5
+    sub    x2, x2, x0
+    add    x0, x0, 4
+    cmp    x0, 120
+    ldr    w1, [x1, 4]
+    ldr    d2, [x3, x2]
+    dup    v0.2s, w1
+    mla    v1.2s, v0.2s, v2.2s
     bne    .L3


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]