From ec870680166db6e508fd994d73dbb9ae401600dc Mon Sep 17 00:00:00 2001 From: Michael Meissner Date: Wed, 4 Nov 2020 12:56:12 -0500 Subject: [PATCH] PowerPC: PR target/81594 gcc/ 2020-11-04 Michael Meissner PR target/81594 * config/rs6000/predicates.md (ds_form_memory): New predicate. * config/rs6000/vsx.md (concatv2di_store): New insn. (dupv2di_store): New insn. gcc/testsuite/ 2020-11-04 Michael Meissner PR target/81594 * gcc.target/powerpc/pr81594.c: New test. --- gcc/config/rs6000/predicates.md | 42 +++++++++++ gcc/config/rs6000/vsx.md | 84 ++++++++++++++++++++++ gcc/testsuite/gcc.target/powerpc/pr81594.c | 61 ++++++++++++++++ 3 files changed, 187 insertions(+) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr81594.c diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index 4c2fe7fa3129..048763633bcf 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -1876,3 +1876,45 @@ { return address_is_prefixed (XEXP (op, 0), mode, NON_PREFIXED_DEFAULT); }) + +;; Return true if the operand is a valid memory operand with an offsettable +;; address that can be split into 2 sub-addresses, each of which is a valid +;; DS-form (bottom 2 bits of the offset are 0). This is used to optimize +;; creating a vector of two DImode elements and then storing the vector. We +;; want to eliminate the direct moves from GPRs to form the vector and do the +;; store directly from the GPRs. 
+
+(define_predicate "ds_form_memory"
+  (match_code "mem")
+{
+  if (!memory_operand (op, mode))
+    return false;
+
+  rtx mem_addr = XEXP (op, 0);
+
+  /* A bare register address is always accepted; the second store can use
+     a displacement of 8 from the same register.  */
+  if (REG_P (mem_addr) || SUBREG_P (mem_addr))
+    return true;
+
+  /* Otherwise only accept base register + constant displacement.  */
+  if (GET_CODE (mem_addr) != PLUS
+      || !base_reg_operand (XEXP (mem_addr, 0), Pmode)
+      || !CONST_INT_P (XEXP (mem_addr, 1)))
+    return false;
+
+  HOST_WIDE_INT disp = INTVAL (XEXP (mem_addr, 1));
+
+  /* With prefixed addressing, only the 34-bit range check (with 8 extra
+     bytes for the second store) applies.  */
+  if (TARGET_PREFIXED)
+    return SIGNED_34BIT_OFFSET_EXTRA_P (disp, GET_MODE_SIZE (DImode));
+
+  /* Without it, both stores must be valid for STD, a DS-form instruction
+     that needs the bottom 2 bits of the displacement to be 0.  Adding 8
+     leaves those bits unchanged, so testing DISP covers DISP + 8 as well.  */
+  if ((disp & 0x3) != 0)
+    return false;
+
+  return SIGNED_16BIT_OFFSET_EXTRA_P (disp, GET_MODE_SIZE (DImode));
+})
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 947631d83ee3..1ef9e9b95fc8 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -2981,6 +2981,90 @@
 }
   [(set_attr "type" "vecperm")])
 
+;; If the only use for a VEC_CONCAT is to store 2 64-bit values, replace it
+;; with two stores.  Only do this on DImode, since it saves doing 1 direct move
+;; on power9, and 2 direct moves + XXPERMDI on power8 to form the vector so we
+;; can do a vector store.  This typically shows up with -O3 where two stores
+;; are combined into a vector.
+;;
+;; Typically DFmode would generate XXPERMDI and a vector store.  Benchmarks
+;; like Spec show that is typically the same speed or faster than doing the two
+;; scalar DFmode stores.
+(define_insn_and_split "*concatv2di_store"
+  [(set (match_operand:V2DI 0 "memory_operand" "=m,m,m,m")
+	(vec_concat:V2DI
+	 (match_operand:DI 1 "gpc_reg_operand" "r,wa,r,wa")
+	 (match_operand:DI 2 "gpc_reg_operand" "r,wa,wa,r")))
+   (clobber (match_scratch:DI 3 "=&b,&b,&b,&b"))]
+  "TARGET_DIRECT_MOVE_64BIT"
+  "#"
+  "&& 1"
+  [(set (match_dup 4)
+	(match_dup 5))
+   (set (match_dup 6)
+	(match_dup 7))]
+{
+  rtx mem = operands[0];
+
+  /* If the address can't be used directly for both stores, copy it to the
+     temporary base register.  */
+  if (!ds_form_memory (mem, V2DImode))
+    {
+      rtx new_addr = operands[3];
+      if (GET_CODE (new_addr) == SCRATCH)
+	new_addr = gen_reg_rtx (Pmode);
+
+      emit_move_insn (new_addr, XEXP (mem, 0));
+      /* Only the address form changes, so keep the MEM attributes.  */
+      mem = replace_equiv_address (mem, new_addr);
+    }
+
+  /* Because we are creating scalar stores, we don't have to swap the order
+     of the elements and then swap the stores to get the right order on
+     little endian systems.  */
+  operands[4] = adjust_address (mem, DImode, 0);
+  operands[5] = operands[1];
+  operands[6] = adjust_address (mem, DImode, 8);
+  operands[7] = operands[2];
+}
+  [(set_attr "length" "8")
+   (set_attr "type" "store,fpstore,fpstore,store")])
+
+;; Optimize creating a vector with 2 duplicate DImode elements and storing it.
+(define_insn_and_split "*dupv2di_store"
+  [(set (match_operand:V2DI 0 "memory_operand" "=m,m")
+	(vec_duplicate:V2DI
+	 (match_operand:DI 1 "gpc_reg_operand" "r,wa")))
+   (clobber (match_scratch:DI 2 "=&b,&b"))]
+  "TARGET_DIRECT_MOVE_64BIT"
+  "#"
+  "&& 1"
+  [(set (match_dup 3)
+	(match_dup 1))
+   (set (match_dup 4)
+	(match_dup 1))]
+{
+  rtx mem = operands[0];
+
+  /* If the address can't be used directly for both stores, copy it to the
+     temporary base register.  
*/
+  if (!ds_form_memory (mem, V2DImode))
+    {
+      rtx new_addr = operands[2];
+      if (GET_CODE (new_addr) == SCRATCH)
+	new_addr = gen_reg_rtx (Pmode);
+
+      emit_move_insn (new_addr, XEXP (mem, 0));
+      /* Only the address form changes, so keep the MEM attributes.  */
+      mem = replace_equiv_address (mem, new_addr);
+    }
+
+  operands[3] = adjust_address (mem, DImode, 0);
+  operands[4] = adjust_address (mem, DImode, 8);
+}
+  [(set_attr "length" "8")
+   (set_attr "type" "store,fpstore")])
+
 ;; Special purpose concat using xxpermdi to glue two single precision values
 ;; together, relying on the fact that internally scalar floats are represented
 ;; as doubles.  This is used to initialize a V4SF vector with 4 floats
diff --git a/gcc/testsuite/gcc.target/powerpc/pr81594.c b/gcc/testsuite/gcc.target/powerpc/pr81594.c
new file mode 100644
index 000000000000..35a9749db380
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr81594.c
@@ -0,0 +1,61 @@
+/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-options "-mdejagnu-cpu=power8 -O2" } */
+
+/* PR target/81594.  Optimize creating a vector of 2 64-bit elements and then
+   storing the vector into separate stores.  */
+
+void
+store_v2di_0 (vector unsigned long long *p,
+	      unsigned long long a,
+	      unsigned long long b)
+{
+  *p = (vector unsigned long long) { a, b };
+}
+
+void
+store_v2di_4 (vector unsigned long long *p,
+	      unsigned long long a,
+	      unsigned long long b)
+{
+  p[4] = (vector unsigned long long) { a, b };
+}
+
+void
+store_v2di_splat_0 (vector unsigned long long *p, unsigned long long a)
+{
+  *p = (vector unsigned long long) { a, a };
+}
+
+void
+store_v2di_splat_8 (vector unsigned long long *p, unsigned long long a)
+{
+  p[8] = (vector unsigned long long) { a, a };
+}
+
+/* 2047 is the largest index that can be used with DS-form instructions.  
*/
+void
+store_v2di_2047 (vector unsigned long long *p,
+		 unsigned long long a,
+		 unsigned long long b)
+{
+  p[2047] = (vector unsigned long long) { a, b };
+}
+
+/* Index 2048 is byte offset 32768, past the signed 16-bit DS-form range, so
+   the constant must be loaded (or, with prefixed addressing, a prefixed form
+   is generated instead).  Two separate stores should still be issued.  */
+void
+store_v2di_2048 (vector unsigned long long *p,
+		 unsigned long long a,
+		 unsigned long long b)
+{
+  p[2048] = (vector unsigned long long) { a, b };
+}
+
+/* { dg-final { scan-assembler-not {\mstxv\M} } } */
+/* { dg-final { scan-assembler-not {\mstxvx\M} } } */
+/* { dg-final { scan-assembler-not {\mmfvsrd\M} } } */
+/* { dg-final { scan-assembler-not {\mmtvsrd\M} } } */
+/* { dg-final { scan-assembler-not {\mmtvsrdd\M} } } */
+/* { dg-final { scan-assembler-not {\mxxpermdi\M} } } */
-- 
2.43.5