This patch improves GCC’s vectorization of __builtin_popcount for the aarch64 target
by adding popcount patterns for vector modes besides QImode, i.e., HImode,
SImode and DImode.
With this patch, we now generate the following for V8HI:
cnt v1.16b, v0.16b
uaddlp v2.8h, v1.16b
For V4HI, we generate:
cnt v1.8b, v0.8b
uaddlp v2.4h, v1.8b
For V4SI, we generate:
cnt v1.16b, v0.16b
uaddlp v2.8h, v1.16b
uaddlp v3.4s, v2.8h
For V4SI with TARGET_DOTPROD, we generate the following instead:
movi v0.4s, #0
movi v1.16b, #1
cnt v3.16b, v2.16b
udot v0.4s, v3.16b, v1.16b
For V2SI, we generate:
cnt v1.8b, v0.8b
uaddlp v2.4h, v1.8b
uaddlp v3.2s, v2.4h
For V2SI with TARGET_DOTPROD, we generate the following instead:
movi v0.8b, #0
movi v1.8b, #1
cnt v3.8b, v2.8b
udot v0.2s, v3.8b, v1.8b
For V2DI, we generate:
cnt v1.16b, v0.16b
uaddlp v2.8h, v1.16b
uaddlp v3.4s, v2.8h
uaddlp v4.2d, v3.4s
For V2DI with TARGET_DOTPROD, we generate the following instead:
movi v0.4s, #0
movi v1.16b, #1
cnt v3.16b, v2.16b
udot v0.4s, v3.16b, v1.16b
uaddlp v0.2d, v0.4s