[PATCH v2] aarch64: Improve popcount for bytes [PR113042]
Andrew Pinski
quic_apinski@quicinc.com
Mon Jun 10 19:22:55 GMT 2024
For popcount for bytes, we don't need the reduction addition
after the vector cnt instruction as we are only counting one
byte's popcount.
This changes the popcount extend to cover all ALLI rather than GPI.
Changes since v1:
* v2 - Use ALLI iterator and combine all into one pattern.
Add new testcases popcnt[6-8].c.
Bootstrapped and tested on aarch64-linux-gnu with no regressions.
PR target/113042
gcc/ChangeLog:
* config/aarch64/aarch64.md (popcount<mode>2): Update pattern
to support ALLI modes.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/popcnt5.c: New test.
* gcc.target/aarch64/popcnt6.c: New test.
Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
---
gcc/config/aarch64/aarch64.md | 52 +++++++++++++++++++---
gcc/testsuite/gcc.target/aarch64/popcnt5.c | 19 ++++++++
gcc/testsuite/gcc.target/aarch64/popcnt6.c | 19 ++++++++
gcc/testsuite/gcc.target/aarch64/popcnt7.c | 18 ++++++++
gcc/testsuite/gcc.target/aarch64/popcnt8.c | 18 ++++++++
5 files changed, 119 insertions(+), 7 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt5.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt6.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt7.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt8.c
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 389a1906e23..dd88fd891b5 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5332,28 +5332,66 @@ (define_insn "*aarch64_popcount<mode>2_cssc_insn"
;; MOV w0, v2.b[0]
(define_expand "popcount<mode>2"
- [(set (match_operand:GPI 0 "register_operand")
- (popcount:GPI (match_operand:GPI 1 "register_operand")))]
+ [(set (match_operand:ALLI 0 "register_operand")
+ (popcount:ALLI (match_operand:ALLI 1 "register_operand")))]
"TARGET_CSSC || TARGET_SIMD"
{
+ rtx in = operands[1];
+ rtx out = operands[0];
+ if (TARGET_CSSC
+ && (<MODE>mode == HImode
+ || <MODE>mode == QImode))
+ {
+ rtx tmp = gen_reg_rtx (SImode);
+ rtx out1 = gen_reg_rtx (SImode);
+ if (<MODE>mode == HImode)
+ emit_insn (gen_zero_extendhisi2 (tmp, in));
+ else
+ emit_insn (gen_zero_extendqisi2 (tmp, in));
+ emit_insn (gen_popcountsi2 (out1, tmp));
+ emit_move_insn (out, gen_lowpart (<MODE>mode, out1));
+ DONE;
+ }
if (!TARGET_CSSC)
{
rtx v = gen_reg_rtx (V8QImode);
rtx v1 = gen_reg_rtx (V8QImode);
rtx in = operands[1];
rtx out = operands[0];
- if(<MODE>mode == SImode)
+ /* SImode and HImode should be zero extended to DImode. */
+ if (<MODE>mode == SImode || <MODE>mode == HImode)
{
rtx tmp;
tmp = gen_reg_rtx (DImode);
- /* If we have SImode, zero extend to DImode, pop count does
- not change if we have extra zeros. */
- emit_insn (gen_zero_extendsidi2 (tmp, in));
+ /* If we have SImode, zero extend to DImode,
+ pop count does not change if we have extra zeros. */
+ if (<MODE>mode == SImode)
+ emit_insn (gen_zero_extendsidi2 (tmp, in));
+ else
+ emit_insn (gen_zero_extendhidi2 (tmp, in));
in = tmp;
}
emit_move_insn (v, gen_lowpart (V8QImode, in));
emit_insn (gen_popcountv8qi2 (v1, v));
- emit_insn (gen_aarch64_zero_extend<mode>_reduc_plus_v8qi (out, v1));
+ /* QImode, just extract from the v8qi vector. */
+ if (<MODE>mode == QImode)
+ {
+ emit_move_insn (out, gen_lowpart (QImode, v1));
+ }
+ /* HI and SI, reduction is zero extended to SImode. */
+ else if (<MODE>mode == SImode || <MODE>mode == HImode)
+ {
+ rtx out1;
+ out1 = gen_reg_rtx (SImode);
+ emit_insn (gen_aarch64_zero_extendsi_reduc_plus_v8qi (out1, v1));
+ emit_move_insn (out, gen_lowpart (<MODE>mode, out1));
+ }
+ /* DImode, reduction is zero extended to DImode. */
+ else
+ {
+ gcc_assert (<MODE>mode == DImode);
+ emit_insn (gen_aarch64_zero_extenddi_reduc_plus_v8qi (out, v1));
+ }
DONE;
}
})
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt5.c b/gcc/testsuite/gcc.target/aarch64/popcnt5.c
new file mode 100644
index 00000000000..406369d9b29
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt5.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* PR target/113042 */
+
+#pragma GCC target "+nocssc"
+
+/*
+** h8:
+** ldr b[0-9]+, \[x0\]
+** cnt v[0-9]+.8b, v[0-9]+.8b
+** smov w0, v[0-9]+.b\[0\]
+** ret
+*/
+/* We should not need the addv here since we only need a byte popcount. */
+
+unsigned h8 (const unsigned char *a) {
+ return __builtin_popcountg (a[0]);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt6.c b/gcc/testsuite/gcc.target/aarch64/popcnt6.c
new file mode 100644
index 00000000000..e882cb24126
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt6.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* PR target/113042 */
+
+#pragma GCC target "+nocssc"
+
+/*
+** h8:
+** ldr h[0-9]+, \[x0\]
+** cnt v[0-9]+.8b, v[0-9]+.8b
+** addv b[0-9]+, v[0-9]+.8b
+** fmov w0, s[0-9]+
+** ret
+*/
+
+unsigned h8 (const unsigned short *a) {
+ return __builtin_popcountg (a[0]);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt7.c b/gcc/testsuite/gcc.target/aarch64/popcnt7.c
new file mode 100644
index 00000000000..8dfff211ae4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt7.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* PR target/113042 */
+
+#pragma GCC target "+cssc"
+
+/*
+** h8:
+** ldrb w[0-9]+, \[x0\]
+** cnt w[0-9]+, w[0-9]+
+** ret
+*/
+/* We should not produce any extra zero extend for this code */
+
+unsigned h8 (const unsigned char *a) {
+ return __builtin_popcountg (a[0]);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt8.c b/gcc/testsuite/gcc.target/aarch64/popcnt8.c
new file mode 100644
index 00000000000..66a88b6a929
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt8.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* PR target/113042 */
+
+#pragma GCC target "+cssc"
+
+/*
+** h8:
+** ldrh w[0-9]+, \[x0\]
+** cnt w[0-9]+, w[0-9]+
+** ret
+*/
+/* We should not produce any extra zero extend for this code */
+
+unsigned h8 (const unsigned short *a) {
+ return __builtin_popcountg (a[0]);
+}
--
2.42.0
More information about the Gcc-patches
mailing list