This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
loops and optimizations
- From: Gero Peterhoff <g dot peterhoff at t-online dot de>
- To: gcc at gcc dot gnu dot org
- Date: Mon, 2 Sep 2019 09:19:18 +0200
- Subject: loops and optimizations
Hello gcc team,
I have a big problem with code generation for conditions and loop control and in general.
1) Once a function is defined as inline, all attributes and pragmas are ignored. There is therefore no control at all for loops within inline functions.
2) Only (almost) always conditional jumps are generated:
- even mini-functions do not produce cmov
- the jump control itself is completely inefficient and therefore slow, since conditional jumps are not summarized
- The functionality is always moved to the end of a function and must return to the exit point.
All this means that a lot of codebloats are generated. It jumps around in confusion. This kill any jump prediction.
An Example for swap bits in a integer:
#define bitsof(Arg) std::size_t(sizeof(Arg) * CHAR_BIT)
using numeric_type = std::uint8_t;
template <typename Type, typename Bits>
constexpr bool is_bits(const Type&, const Bits& bits) noexcept
{
return bits < bitsof(Type);
}
template <bool Numeric, typename... Args>
constexpr bool all_of(const Args... args) noexcept
{
static_assert
(
sizeof...(Args) <= std::size_t{std::numeric_limits<numeric_type>::max()} &&
sizeof...(Args) > 0,
"invalid argument count"
);
if constexpr (Numeric)
return (numeric_type(bool(args)) + ...) == numeric_type(sizeof...(Args));
else
return (bool(args) && ...);
}
template <size_t Test, typename Type, typename Bit>
constexpr Type swap(const Type& arg, const Bit& bit1, const Bit& bit2) noexcept
{
// Test 0 = default
// Test 1 = fold
// Test 2 = numeric
using unsigned_type = std::make_unsigned_t<Type>;
auto work = [&]() constexpr noexcept -> Type
{
unsigned_type
res = arg;
res = ((res>>bit1) ^ (res>>bit2)) & unsigned_type{1};
res = (res<<bit1) | (res<<bit2);
res ^= arg;
return res;
};
if constexpr (Test == 2)
return all_of<true>(bit1!=bit2, is_bits(arg, bit1), is_bits(arg, bit2)) ? work() : arg;
else if constexpr (Test == 1)
return all_of<false>(bit1!=bit2, is_bits(arg, bit1), is_bits(arg, bit2)) ? work() : arg;
else
return (bit1!=bit2 && is_bits(arg, bit1) && is_bits(arg, bit2)) ? work() : arg;
}
generates:
1) default:
2 jumps to different targets: 20->28, 26->30
return back 4e->28
0000000000000000 <unsigned int silent::bits::swap<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)>:
0: 49 89 d0 mov %rdx,%r8
3: 0f b6 16 movzbl (%rsi),%edx
6: 41 0f b6 08 movzbl (%r8),%ecx
a: 44 8b 07 mov (%rdi),%r8d
d: 80 fa 1f cmp $0x1f,%dl
10: 0f 96 c0 setbe %al
13: 38 d1 cmp %dl,%cl
15: 0f b6 f9 movzbl %cl,%edi
18: 40 0f 95 c6 setne %sil
1c: 21 f0 and %esi,%eax
1e: a8 01 test $0x1,%al
20: 74 06 je 28 <unsigned int silent::bits::swapx<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x28>
22: 66 83 ff 1f cmp $0x1f,%di
26: 76 08 jbe 30 <unsigned int silent::bits::swapx<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x30>
28: 44 89 c0 mov %r8d,%eax
2b: c3 retq
2c: 0f 1f 40 00 nopl 0x0(%rax)
30: c4 c2 6b f7 c0 shrx %edx,%r8d,%eax
35: c4 c2 73 f7 f0 shrx %ecx,%r8d,%esi
3a: 31 f0 xor %esi,%eax
3c: 83 e0 01 and $0x1,%eax
3f: c4 e2 69 f7 d0 shlx %edx,%eax,%edx
44: c4 e2 71 f7 c8 shlx %ecx,%eax,%ecx
49: 09 ca or %ecx,%edx
4b: 41 31 d0 xor %edx,%r8d
4e: eb d8 jmp 28 <unsigned int silent::bits::swapx<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x28>
2) fold-expr:
3 jumps with one target: 11,17,1d->3d
~25% faster than 1)
0000000000000000 <unsigned int silent::bits::swap<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)>:
0: 0f b6 12 movzbl (%rdx),%edx
3: 0f b6 0e movzbl (%rsi),%ecx
6: 44 8b 07 mov (%rdi),%r8d
9: 0f b6 f2 movzbl %dl,%esi
c: 89 c8 mov %ecx,%eax
e: 66 39 ce cmp %cx,%si
11: 74 2a je 3d <unsigned int silent::bits::swapx<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x3d>
13: 66 83 f9 1f cmp $0x1f,%cx
17: 77 24 ja 3d <unsigned int silent::bits::swapx<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x3d>
19: 66 83 fe 1f cmp $0x1f,%si
1d: 77 1e ja 3d <unsigned int silent::bits::swapx<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x3d>
1f: c4 c2 7b f7 c8 shrx %eax,%r8d,%ecx
24: c4 c2 6b f7 f0 shrx %edx,%r8d,%esi
29: 31 f1 xor %esi,%ecx
2b: 83 e1 01 and $0x1,%ecx
2e: c4 e2 79 f7 c1 shlx %eax,%ecx,%eax
33: c4 e2 69 f7 d1 shlx %edx,%ecx,%edx
38: 09 d0 or %edx,%eax
3a: 41 31 c0 xor %eax,%r8d
3d: 44 89 c0 mov %r8d,%eax
40: c3 retq
3) numeric
calculate condiditions by numeric:
1 jump to 1 target: 29->49
~50% faster than 1)
0000000000000000 <unsigned int silent::bits::swap<2ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)>:
0: 0f b6 0a movzbl (%rdx),%ecx
3: 0f b6 16 movzbl (%rsi),%edx
6: 31 c0 xor %eax,%eax
8: 44 8b 07 mov (%rdi),%r8d
b: 80 f9 1f cmp $0x1f,%cl
e: 0f 96 c0 setbe %al
11: 31 f6 xor %esi,%esi
13: 80 fa 1f cmp $0x1f,%dl
16: 40 0f 96 c6 setbe %sil
1a: 01 f0 add %esi,%eax
1c: 31 f6 xor %esi,%esi
1e: 38 ca cmp %cl,%dl
20: 40 0f 95 c6 setne %sil
24: 01 f0 add %esi,%eax
26: 83 f8 03 cmp $0x3,%eax
29: 75 1e jne 49 <unsigned int silent::bits::swapx<2ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x49>
2b: c4 c2 6b f7 c0 shrx %edx,%r8d,%eax
30: c4 c2 73 f7 f0 shrx %ecx,%r8d,%esi
35: 31 f0 xor %esi,%eax
37: 83 e0 01 and $0x1,%eax
3a: c4 e2 69 f7 d0 shlx %edx,%eax,%edx
3f: c4 e2 71 f7 c8 shlx %ecx,%eax,%ecx
44: 09 ca or %ecx,%edx
46: 41 31 d0 xor %edx,%r8d
49: 44 89 c0 mov %r8d,%eax
4c: c3 retq
but clang can more tricky with adc:
(code by godbolt)
mov rax, rdi
xor ecx, ecx
cmp sil, dl
setne cl
cmp sil, 64
adc ecx, 0
cmp dl, 64
adc ecx, 0
cmp ecx, 3
jne .LBB0_2
shrx rcx, rax, rsi
shrx rdi, rax, rdx
xor edi, ecx
and edi, 1
shlx rcx, rdi, rsi
shlx rdx, rdi, rdx
or rdx, rcx
xor rax, rdx
.LBB0_2:
ret
I do not understand why you still insist on outdated function attributes, rather than on optimization/loop control within a function.
best regards
Gero