/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
/* Vector */
{
COSTS_N_INSNS (1), /* Alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
/* Vector */
{
COSTS_N_INSNS (1), /* Alu. */
- COSTS_N_INSNS (4) /* Mult. */
+ COSTS_N_INSNS (4), /* Mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
/* Vector */
{
COSTS_N_INSNS (1), /* Alu. */
- COSTS_N_INSNS (4) /* Mult. */
+ COSTS_N_INSNS (4), /* Mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
)
(define_insn "aarch64_simd_dup<mode>"
- [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
+ [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
(vec_duplicate:VDQF_F16
- (match_operand:<VEL> 1 "register_operand" "w")))]
+ (match_operand:<VEL> 1 "register_operand" "w,r")))]
"TARGET_SIMD"
- "dup\\t%0.<Vtype>, %1.<Vetype>[0]"
- [(set_attr "type" "neon_dup<q>")]
+ "@
+ dup\\t%0.<Vtype>, %1.<Vetype>[0]
+ dup\\t%0.<Vtype>, %<vw>1"
+ [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
)
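
As an aside (not part of the patch): the new "w,r" alternative lets a scalar that lives in a general-purpose register be broadcast with a single DUP, instead of an FMOV into the FP register file followed by a lane dup. A minimal sketch, assuming the register allocator selects the new alternative:

  #include <arm_neon.h>
  #include <string.h>

  /* Hypothetical example: 'bits' arrives in w0.  With the second
     alternative the broadcast can be emitted as "dup v0.4s, w0".  */
  float32x4_t
  dup_from_gp (unsigned bits)
  {
    float f;
    memcpy (&f, &bits, sizeof f);  /* Value stays in a GP register.  */
    return vdupq_n_f32 (f);
  }
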
(define_insn "aarch64_dup_lane<mode>"
rtx op0, op1, op2;
const struct cpu_cost_table *extra_cost
= aarch64_tune_params.insn_extra_cost;
- int code = GET_CODE (x);
+ rtx_code code = GET_CODE (x);
scalar_int_mode int_mode;
/* By default, assume that everything has equivalent cost to the
we must cost the explicit register move. */
if (mode == DImode
- && GET_MODE (op0) == SImode
- && outer == SET)
+ && GET_MODE (op0) == SImode)
{
int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
mode, MULT, 1, speed);
return true;
}
+ break;
+ case CONST_VECTOR:
+ {
+ /* Load using MOVI/MVNI. */
+ if (aarch64_simd_valid_immediate (x, NULL))
+ *cost = extra_cost->vect.movi;
+ else /* Load using constant pool. */
+ *cost = extra_cost->ldst.load;
+ break;
+ }
+ case VEC_CONCAT:
+ /* Depending on the operation, either DUP or INS.
+ For now, keep default costing. */
+ break;
+ case VEC_DUPLICATE:
+ /* Load using a DUP. */
+ *cost = extra_cost->vect.dup;
+ return false;
+ case VEC_SELECT:
+ {
+ rtx op0 = XEXP (x, 0);
+ *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
- /* Fall through. */
+ /* A select of the low part is a free subreg; the high part costs a DUP, any other lane an explicit extract. */
+ rtx op1 = XEXP (x, 1);
+ if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
+ ;
+ else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
+ *cost = extra_cost->vect.dup;
+ else
+ *cost = extra_cost->vect.extract;
+ return true;
+ }
default:
break;
}
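
For reference, a sketch of constants that exercise the two new CONST_VECTOR paths (illustration only; the exact sequence chosen depends on target and tuning):

  #include <arm_neon.h>

  uint32x4_t
  movi_const (void)
  {
    /* 0xff is a valid SIMD immediate, so the constant is costed as
       vect.movi (a single MOVI).  */
    return vdupq_n_u32 (0xff);
  }

  uint32x4_t
  pool_const (void)
  {
    /* 0x12345678 is not encodable as a SIMD immediate, so the
       constant is costed as a load (ldst.load), typically
       adrp + ldr from the literal pool.  */
    return vdupq_n_u32 (0x12345678);
  }
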
{
const int alu;
const int mult;
+ const int movi;
+ const int dup;
+ const int extract;
};
struct cpu_cost_table
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
/* Vector */
{
COSTS_N_INSNS (0), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
/* Vector */
{
COSTS_N_INSNS (2), /* alu. */
- COSTS_N_INSNS (8) /* mult. */
+ COSTS_N_INSNS (8), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
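
For scale, GCC expresses rtx costs in quarter-instruction units:

  /* From gcc/rtl.h.  */
  #define COSTS_N_INSNS(N) ((N) * 4)

so the entries above rate a DUP or a lane extract at twice the cost of a MOVI or a plain vector ALU operation.
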
--- /dev/null
+/* { dg-do compile { target { lp64 } } } */
+/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+**test1:
+** adrp x[0-9]+, .LC[0-9]+
+** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+** add v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** str q[0-9]+, \[x[0-9]+\]
+** fmov x[0-9]+, d[0-9]+
+** orr x[0-9]+, x[0-9]+, x[0-9]+
+** ret
+*/
+
+uint64_t
+test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+ uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL };
+ uint64_t res = a | arr[0];
+ uint64x2_t val = vld1q_u64 (arr);
+ *rt = vaddq_u64 (val, b);
+ return res;
+}
+
+/*
+**test2:
+** adrp x[0-9]+, .LC[0-9]+
+** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+** add v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** str q[0-9]+, \[x[0-9]+\]
+** fmov x[0-9]+, d[0-9]+
+** orr x[0-9]+, x[0-9]+, x[0-9]+
+** ret
+*/
+
+uint64_t
+test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+ uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
+ uint64_t arr = vgetq_lane_u64 (val, 0);
+ uint64_t res = a | arr;
+ *rt = vaddq_u64 (val, b);
+ return res;
+}
+
+/*
+**test3:
+** adrp x[0-9]+, .LC[0-9]+
+** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** str q[0-9]+, \[x1\]
+** fmov w[0-9]+, s[0-9]+
+** orr w[0-9]+, w[0-9]+, w[0-9]+
+** ret
+*/
+
+uint32_t
+test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
+{
+ uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
+ uint32_t res = a | arr[0];
+ uint32x4_t val = vld1q_u32 (arr);
+ *rt = vaddq_u32 (val, b);
+ return res;
+}
+
+/*
+**test4:
+** ushr v[0-9]+.16b, v[0-9]+.16b, 7
+** mov x[0-9]+, 16512
+** movk x[0-9]+, 0x1020, lsl 16
+** movk x[0-9]+, 0x408, lsl 32
+** movk x[0-9]+, 0x102, lsl 48
+** fmov d[0-9]+, x[0-9]+
+** pmull v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
+** dup v[0-9]+.2d, v[0-9]+.d\[0\]
+** pmull2 v[0-9]+.1q, v[0-9]+.2d, v[0-9]+.2d
+** trn2 v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** umov w[0-9]+, v[0-9]+.h\[3\]
+** ret
+*/
+
+uint64_t
+test4 (uint8x16_t input)
+{
+ uint8x16_t bool_input = vshrq_n_u8 (input, 7);
+ poly64x2_t mask = vdupq_n_p64 (0x0102040810204080UL);
+ poly64_t prodL = vmull_p64 ((poly64_t) vgetq_lane_p64 ((poly64x2_t) bool_input, 0),
+                             vgetq_lane_p64 (mask, 0));
+ poly64_t prodH = vmull_high_p64 ((poly64x2_t) bool_input, mask);
+ uint8x8_t res = vtrn2_u8 ((uint8x8_t) prodL, (uint8x8_t) prodH);
+ return vget_lane_u16 ((uint16x4_t) res, 3);
+}
+
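
The lane-extract costing shows up directly in these tests: test1 and test2 read lane 0 of a 128-bit vector, which the new VEC_SELECT handling treats as a cheap subreg (a single fmov to the general file), so CSE keeps the vector value live instead of rematerializing the constant on the scalar side. A minimal sketch of the two lane cases (illustration only, not part of the test):

  #include <arm_neon.h>

  /* Lane 0 is the low-part subreg, typically "fmov x0, d0"; lane 1
     needs a real element extract such as "fmov x0, v0.d[1]".  */
  uint64_t low_lane (uint64x2_t v) { return vgetq_lane_u64 (v, 0); }
  uint64_t high_lane (uint64x2_t v) { return vgetq_lane_u64 (v, 1); }
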