Bug 101927 - reduction of popcount of 8bits can be improved
Summary: reduction of popcount of 8bits can be improved
Status: UNCONFIRMED
Alias: None
Product: gcc
Classification: Unclassified
Component: target (show other bugs)
Version: 12.0
Importance: P3 enhancement
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks: vectorizer
  Show dependency treegraph
 
Reported: 2021-08-16 05:02 UTC by Andrew Pinski
Modified: 2023-12-16 03:53 UTC (History)
1 user (show)

See Also:
Host:
Target: aarch64
Build:
Known to work:
Known to fail:
Last reconfirmed:


Attachments

Note: You need to log in before you can comment on or make changes to this bug.
Description Andrew Pinski 2021-08-16 05:02:09 UTC
Take:

#include <stdlib.h>
#include <stdint.h>

/*
 * Bit-level Hamming distance between two byte buffers: for each index
 * x in [0, l), count the set bits of a[x] ^ b[x] and sum those counts.
 *
 * a, b: input byte buffers; each is read at indices 0 .. l-1.  Both are
 *       restrict-qualified pointers to const data.
 * l:    number of bytes to process from each buffer.
 *
 * Returns the accumulated bit count; 0 when l is 0 (the loop body never
 * runs and r keeps its initial value).
 */
size_t hd (const uint8_t *restrict a, const uint8_t *restrict b, size_t l) {
  size_t r = 0, x;
  /* The uint8_t operands are promoted to int before the XOR, so each
     popcount argument lies in 0..255 and contributes at most 8 to r.  */
  for (x = 0; x < l; x++)
    r += __builtin_popcount (a[x] ^ b[x]);

  return r;
}

At -O3, GCC does not vectorize this loop.
Clang/LLVM does:
.LBB0_5:                                // =>This Inner Loop Header: Depth=1
        ld1     { v3.b }[0], [x8]
        sub     x12, x8, #2
        ld1     { v5.b }[0], [x10]
        ld1     { v4.b }[0], [x12]
        sub     x12, x10, #2
        ld1     { v6.b }[0], [x12]
        add     x12, x8, #1
        ld1     { v3.b }[4], [x12]
        add     x12, x10, #1
        ld1     { v5.b }[4], [x12]
        sub     x12, x8, #1
        ld1     { v4.b }[4], [x12]
        sub     x12, x10, #1
        ld1     { v6.b }[4], [x12]
        eor     v3.8b, v5.8b, v3.8b
        ushll   v3.2d, v3.2s, #0
        and     v3.16b, v3.16b, v1.16b
        eor     v4.8b, v6.8b, v4.8b
        ushll   v4.2d, v4.2s, #0
        and     v4.16b, v4.16b, v1.16b
        cnt     v3.16b, v3.16b
        cnt     v4.16b, v4.16b
        uaddlp  v3.8h, v3.16b
        uaddlp  v4.8h, v4.16b
        uaddlp  v3.4s, v3.8h
        uaddlp  v4.4s, v4.8h
        add     x8, x8, #4
        subs    x11, x11, #4
        uadalp  v2.2d, v3.4s
        uadalp  v0.2d, v4.4s
        add     x10, x10, #4
        b.ne    .LBB0_5

------ CUT ----
Note: I think we could do even better than this.