https://godbolt.org/z/9K3369

#include <array>
#include <cstdint>

struct number
{
    std::array<std::uint64_t, 5> num;

    inline constexpr std::uint64_t& operator[](std::size_t position) noexcept
    {
        return num[position];
    }

    inline constexpr std::uint64_t const& operator[](std::size_t position) const noexcept
    {
        return num[position];
    }
};

number add_reduce(number const& a, number const& b) noexcept
{
    constexpr auto reduce_mask_51{(static_cast<std::uint64_t>(1) << 51) - 1};
    number out;
    std::uint64_t c{(a[0] + b[0]) >> 51};
    out[1] = a[1] + b[1] + c;
    c = (out[1] >> 51);
    out[1] &= reduce_mask_51;
    out[2] = a[2] + b[2] + c;
    c = (out[2] >> 51);
    out[2] &= reduce_mask_51;
    out[3] = a[3] + b[3] + c;
    c = (out[3] >> 51);
    out[3] &= reduce_mask_51;
    out[4] = a[4] + b[4] + c;
    c = (out[4] >> 51);
    out[4] &= reduce_mask_51;
    out[0] = c * 19;
    return out;
}

gcc:

add_reduce(number const&, number const&):
        movq    (%rdx), %rax
        addq    (%rsi), %rax
        movq    %rdi, %r8
        movq    %rdx, %rdi
        shrq    $51, %rax
        movq    8(%rdx), %rdx
        addq    8(%rsi), %rdx
        movq    %rsi, %rcx
        movabsq $2251799813685247, %rsi
        addq    %rdx, %rax
        movq    %rax, %rdx
        shrq    $51, %rax
        andq    %rsi, %rdx
        movq    %rdx, 8(%r8)
        movq    16(%rdi), %rdx
        addq    16(%rcx), %rdx
        addq    %rdx, %rax
        movq    %rax, %rdx
        shrq    $51, %rax
        andq    %rsi, %rdx
        movq    %rdx, 16(%r8)
        movq    24(%rdi), %rdx
        addq    24(%rcx), %rdx
        addq    %rax, %rdx
        movq    %rdx, %rax
        shrq    $51, %rdx
        andq    %rsi, %rax
        movq    %rax, 24(%r8)
        movq    32(%rdi), %rax
        addq    32(%rcx), %rax
        addq    %rdx, %rax
        andq    %rax, %rsi
        shrq    $51, %rax
        leaq    (%rax,%rax,8), %rdx
        movq    %rsi, 32(%r8)
        leaq    (%rax,%rdx,2), %rax
        movq    %rax, (%r8)
        movq    %r8, %rax
        ret

clang:

add_reduce(number const&, number const&):        # @add_reduce(number const&, number const&)
        movq    %rdi, %rax
        movq    (%rdx), %rcx
        movq    8(%rdx), %rdi
        addq    (%rsi), %rcx
        shrq    $51, %rcx
        addq    8(%rsi), %rdi
        addq    %rcx, %rdi
        movq    %rdi, %rcx
        shrq    $51, %rcx
        movabsq $2251799813685247, %r8   # imm = 0x7FFFFFFFFFFFF
        andq    %r8, %rdi
        movq    %rdi, 8(%rax)
        movq    16(%rdx), %rdi
        addq    16(%rsi), %rdi
        addq    %rcx, %rdi
        movq    %rdi, %rcx
        shrq    $51, %rcx
        andq    %r8, %rdi
        movq    %rdi, 16(%rax)
        movq    24(%rdx), %rdi
        addq    24(%rsi), %rdi
        addq    %rcx, %rdi
        movq    %rdi, %rcx
        andq    %r8, %rdi
        movq    %rdi, 24(%rax)
        movq    32(%rdx), %rdx
        addq    32(%rsi), %rdx
        shrq    $51, %rcx
        addq    %rcx, %rdx
        movq    %rdx, %rcx
        shrq    $51, %rcx
        andq    %r8, %rdx
        movq    %rdx, 32(%rax)
        leaq    (%rcx,%rcx,8), %rdx
        leaq    (%rcx,%rdx,2), %rcx
        movq    %rcx, (%rax)
        retq

clang with -march=native:

.LCPI0_0:
        .quad   2251799813685247
add_reduce(number const&, number const&):        # @add_reduce(number const&, number const&)
        movq    %rdi, %rax
        movq    (%rdx), %rcx
        movq    8(%rdx), %rdi
        addq    (%rsi), %rcx
        shrq    $51, %rcx
        addq    8(%rsi), %rdi
        addq    %rcx, %rdi
        vmovq   %rdi, %xmm0
        shrq    $51, %rdi
        movq    16(%rdx), %rcx
        addq    16(%rsi), %rcx
        addq    %rdi, %rcx
        vmovq   %rcx, %xmm1
        shrq    $51, %rcx
        movq    24(%rdx), %rdi
        addq    24(%rsi), %rdi
        addq    %rcx, %rdi
        vmovq   %rdi, %xmm2
        shrq    $51, %rdi
        movq    32(%rdx), %rcx
        addq    32(%rsi), %rcx
        addq    %rdi, %rcx
        vpunpcklqdq     %xmm1, %xmm0, %xmm0     # xmm0 = xmm0[0],xmm1[0]
        vmovq   %rcx, %xmm1
        vpunpcklqdq     %xmm1, %xmm2, %xmm1     # xmm1 = xmm2[0],xmm1[0]
        vinserti128     $1, %xmm1, %ymm0, %ymm0
        vpandq  .LCPI0_0(%rip){1to4}, %ymm0, %ymm0
        shrq    $51, %rcx
        vmovdqu %ymm0, 8(%rax)
        leaq    (%rcx,%rcx,8), %rdx
        leaq    (%rcx,%rdx,2), %rcx
        movq    %rcx, (%rax)
        vzeroupper
        retq
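For reference, the vector part of the -march=native output above corresponds roughly to the following hand-written sketch. This is an illustration, not the reported code: AVX2 intrinsics are used here for portability of the example (clang actually emits an EVEX vpandq with an embedded broadcast), and mask_and_store and s1..s4 are hypothetical names.

#include <immintrin.h>
#include <cstdint>

// Illustration only: the four carried sums s1..s4 still come from scalar
// adds; only the masking and the 32-byte store to out[1..4] are vector ops.
inline void mask_and_store(std::uint64_t* out,
                           std::uint64_t s1, std::uint64_t s2,
                           std::uint64_t s3, std::uint64_t s4) noexcept
{
    __m256i v = _mm256_set_epi64x(s4, s3, s2, s1);      // vmovq + vpunpcklqdq + vinserti128
    __m256i mask = _mm256_set1_epi64x((1ll << 51) - 1); // broadcast of .LCPI0_0
    v = _mm256_and_si256(v, mask);                      // the vpandq
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(out + 1), v); // vmovdqu %ymm0, 8(%rax)
}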
GCC fails to SLP vectorize this because SLP discovery fails for this reduction scheme; for some reason we are not falling back to scalar operand construction (building the vector lanes from the scalar carried sums) for the masking and the store. That masking-and-store sequence is the only thing clang seems to vectorize.
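The adds themselves cannot form an SLP group at all: rewriting the unrolled body as a loop (an equivalent restatement of the reproducer; the name add_reduce_loop is made up) makes the dependence structure explicit. Each limb's add consumes the carry shifted out of the previous limb, so the adds form a single serial chain, and only the per-limb AND and the stores are lane-independent, which is exactly the part clang picks up.

// Loop form of add_reduce's body, equivalent to the unrolled reproducer;
// shown only to make the dependence structure explicit.
number add_reduce_loop(number const& a, number const& b) noexcept
{
    constexpr auto reduce_mask_51{(static_cast<std::uint64_t>(1) << 51) - 1};
    number out;
    std::uint64_t c{(a[0] + b[0]) >> 51};
    for (std::size_t i{1}; i < 5; ++i)
    {
        out[i] = a[i] + b[i] + c;  // consumes the carry from limb i-1: serial
        c = out[i] >> 51;          // produces the carry for limb i+1: serial
        out[i] &= reduce_mask_51;  // independent across limbs: the vectorizable part
    }
    out[0] = c * 19;               // fold the final carry: 2^255 == 19 (mod 2^255 - 19)
    return out;
}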