GCC does not optimize out functions without side effects with asm statements inside loop even if return value is ignored

Andrew Pinski pinskia@gmail.com
Sat Oct 7 21:32:00 GMT 2017


On Sat, Oct 7, 2017 at 2:22 PM, Saldyrkine, Mikhail
<Mikhail.Saldyrkine@gs.com> wrote:
> The "uint64_t test_noasm(uint64_t idx)" function has the same loop, and that function is optimized out.

There is a difference there: objects is limited to 1024 elements.  Loading past
the array bounds is undefined.

Thanks,
Andrew


> I've changed the code to constrain the loop iterations, and the compiler:
> - unrolled the loop
> - did not eliminate the function as it does when asm is not used
> It looks like the "infinite loop" is not the root cause.
>
> inline uint64_t test_asm_inside_loop(uint64_t idx) {
>     uint64_t result;
>     for( int i = 0; i < capacity; ++i )
>     {
>         asm ("movq (%1,%2,8), %0" : "=r" (result) : "r" (objects), "r" (idx) );
>         if( result > 128 )
>             return result;
>         ++idx;
>     }
>     return 0;
> }
>
> Dump of assembler code for function _Z28compile_test_asm_inside_loopv:
>    0x0000000000400b40 <+0>:     xor    %eax,%eax
>    0x0000000000400b42 <+2>:     mov    $0x602080,%edx
>    0x0000000000400b47 <+7>:     mov    (%rdx,%rax,8),%rcx
>    0x0000000000400b4b <+11>:    cmp    $0x80,%rcx
>    0x0000000000400b52 <+18>:    ja     0x400c38 <_Z28compile_test_asm_inside_loopv+248>
>    0x0000000000400b58 <+24>:    mov    $0x1,%eax
>    0x0000000000400b5d <+29>:    mov    (%rdx,%rax,8),%rsi
>    0x0000000000400b61 <+33>:    cmp    $0x80,%rsi
>    0x0000000000400b68 <+40>:    ja     0x400c38 <_Z28compile_test_asm_inside_loopv+248>
>    0x0000000000400b6e <+46>:    lea    0x1(%rax),%rdi
>    0x0000000000400b72 <+50>:    mov    (%rdx,%rdi,8),%r8
>    0x0000000000400b76 <+54>:    cmp    $0x80,%r8
>    0x0000000000400b7d <+61>:    ja     0x400c38 <_Z28compile_test_asm_inside_loopv+248>
>    0x0000000000400b83 <+67>:    lea    0x2(%rax),%r9
>    0x0000000000400b87 <+71>:    mov    (%rdx,%r9,8),%r10
>    0x0000000000400b8b <+75>:    cmp    $0x80,%r10
>    0x0000000000400b92 <+82>:    ja     0x400c38 <_Z28compile_test_asm_inside_loopv+248>
>    0x0000000000400b98 <+88>:    lea    0x3(%rax),%r11
>    0x0000000000400b9c <+92>:    mov    (%rdx,%r11,8),%rcx
>    0x0000000000400ba0 <+96>:    cmp    $0x80,%rcx
>    0x0000000000400ba7 <+103>:   ja     0x400c38 <_Z28compile_test_asm_inside_loopv+248>
>    0x0000000000400bad <+109>:   lea    0x4(%rax),%rsi
>    0x0000000000400bb1 <+113>:   mov    (%rdx,%rsi,8),%r8
>    0x0000000000400bb5 <+117>:   cmp    $0x80,%r8
>    0x0000000000400bbc <+124>:   ja     0x400c38 <_Z28compile_test_asm_inside_loopv+248>
>    0x0000000000400bbe <+126>:   lea    0x5(%rax),%r9
>    0x0000000000400bc2 <+130>:   mov    (%rdx,%r9,8),%r10
>    0x0000000000400bc6 <+134>:   cmp    $0x80,%r10
>    0x0000000000400bcd <+141>:   ja     0x400c38 <_Z28compile_test_asm_inside_loopv+248>
>    0x0000000000400bcf <+143>:   lea    0x6(%rax),%r11
>    0x0000000000400bd3 <+147>:   mov    (%rdx,%r11,8),%rcx
>    0x0000000000400bd7 <+151>:   cmp    $0x80,%rcx
>    0x0000000000400bde <+158>:   ja     0x400c38 <_Z28compile_test_asm_inside_loopv+248>
>    0x0000000000400be0 <+160>:   lea    0x7(%rax),%rsi
>    0x0000000000400be4 <+164>:   mov    (%rdx,%rsi,8),%r8
>    0x0000000000400be8 <+168>:   cmp    $0x80,%r8
>    0x0000000000400bef <+175>:   ja     0x400c38 <_Z28compile_test_asm_inside_loopv+248>
>    0x0000000000400bf1 <+177>:   lea    0x8(%rax),%r9
>    0x0000000000400bf5 <+181>:   mov    (%rdx,%r9,8),%r10
>    0x0000000000400bf9 <+185>:   cmp    $0x80,%r10
>    0x0000000000400c00 <+192>:   ja     0x400c38 <_Z28compile_test_asm_inside_loopv+248>
>    0x0000000000400c02 <+194>:   add    $0x9,%rax
>    0x0000000000400c06 <+198>:   mov    (%rdx,%rax,8),%rax
>    0x0000000000400c0a <+202>:   cmp    $0x80,%rax
>    0x0000000000400c10 <+208>:   ja     0x400c38 <_Z28compile_test_asm_inside_loopv+248>
>    0x0000000000400c12 <+210>:   lea    0x9(%rdi),%r11
>    0x0000000000400c16 <+214>:   mov    (%rdx,%r11,8),%rcx
>    0x0000000000400c1a <+218>:   cmp    $0x80,%rcx
>    0x0000000000400c21 <+225>:   ja     0x400c38 <_Z28compile_test_asm_inside_loopv+248>
>    0x0000000000400c23 <+227>:   lea    0xa(%rdi),%rax
>    0x0000000000400c27 <+231>:   cmp    $0x400,%rax
>    0x0000000000400c2d <+237>:   jne    0x400b5d <_Z28compile_test_asm_inside_loopv+29>
>    0x0000000000400c33 <+243>:   repz retq
>    0x0000000000400c35 <+245>:   nopl   (%rax)
>    0x0000000000400c38 <+248>:   repz retq
>
> -----Original Message-----
> From: Andrew Pinski [mailto:pinskia@gmail.com]
> Sent: Saturday, October 07, 2017 3:04 PM
> To: Saldyrkine, Mikhail [Sec Div]
> Cc: gcc-bugs@gcc.gnu.org
> Subject: Re: GCC does not optimize out functions without side effects with asm statements inside loop even if return value is ignored
>
> On Sat, Oct 7, 2017 at 8:39 AM, Saldyrkine, Mikhail
> <Mikhail.Saldyrkine@gs.com> wrote:
>> g++ (GCC) 6.3.1 20170216 (Red Hat 6.3.1-3)
>>
>> In the case below, compile_test_asm_inside_loop invokes test_asm_inside_loop and ignores the result.
>> The call to test_asm_inside_loop is expected to be eliminated, since the return value is not used and there are no side effects.
>> The call elimination works fine without asm and without a loop.
>> It does not work with asm inside a loop.
>
> Because the loop could be an infinite loop and GCC does not know how
> many times the inline-asm is going to be called and if there are other
> side effects.
>
> Let's look at the function:
> inline uint64_t test_asm_inside_loop(uint64_t idx) {
>     while(true)
>     {
>         uint64_t result;
>         asm ("movq (%1,%2,8), %0" : "=r" (result) : "r" (objects), "r" (idx) );
>         if( result > 128 )
>             return result;
>         ++idx;
>     }
> }
>
> The loop is only broken out of when result is > 128.  result from the
> inline-asm is used as the breakout from the loop.
>
> Thanks,
> Andrew
>
>>
>> TEST CODE
>>
>> #include <iostream>
>> #include <assert.h>
>>
>> using namespace std;
>> constexpr static size_t capacity = 1024;
>> uint64_t objects[capacity];
>>
>> // THE FUNCTION IS ELIMINATED BY COMPILER IF OUTPUT IS NOT USED
>> inline uint64_t test_noloop(uint64_t idx) {
>>     uint64_t result;
>>     asm ("movq (%1,%2,8), %0" : "=r" (result) : "r" (objects), "r" (idx) );
>>     if( result > 128 )
>>         return result;
>>     return 0;
>> }
>>
>> // THE FUNCTION IS ELIMINATED BY COMPILER IF OUTPUT IS NOT USED
>> inline uint64_t test_noasm(uint64_t idx) {
>>     while(true)
>>     {
>>         if( objects[idx] > 128 )
>>             return objects[idx];
>>         ++idx;
>>     }
>> }
>>
>> // THE FUNCTION IS KEPT EVEN IF THE RESULT IS NOT USED - ASM INSIDE LOOP CAUSING THE ISSUE
>> inline uint64_t test_asm_inside_loop(uint64_t idx) {
>>     while(true)
>>     {
>>         uint64_t result;
>>         asm ("movq (%1,%2,8), %0" : "=r" (result) : "r" (objects), "r" (idx) );
>>         if( result > 128 )
>>             return result;
>>         ++idx;
>>     }
>> }
>>
>> void init() {
>>     srand(time(nullptr));
>>     for( size_t i = 0; i < capacity - 1; ++i )
>>         objects[i] = random() % 256;
>>     objects[capacity-1] = 255;
>> }
>>
>> // TEST THAT test_noasm AND test_asm_inside_loop PRODUCE THE SAME RESULT
>> void sanity_test() {
>>     for( size_t i = 0; i < capacity; ++i ) {
>>         assert( test_noasm(i) == test_asm_inside_loop(i));
>>     }
>> }
>>
>> void compile_test_noasm() {
>>     test_noasm(0);
>> }
>>
>> void compile_test_noloop() {
>>     test_noloop(0);
>> }
>>
>> void compile_test_asm_inside_loop() {
>>     test_asm_inside_loop(0);
>> }
>>
>> int main( int argc, char* argv[] ) {
>>     init();
>>     sanity_test();
>>     compile_test_noasm();
>>     compile_test_noloop();
>>     compile_test_asm_inside_loop();
>> }
>>
>> COMPILATION AND DISASSEMBLER RESULTS:
>>
>> /opt/rh/devtoolset-6//root/bin/g++  -O3 -funroll-loops  loop_optimization.cpp; gdb -batch -ex "file a.out" -ex "disas compile_test_noasm" -ex "disas compile_test_noloop" -ex "disas compile_test_asm_inside_loop"
>> Dump of assembler code for function _Z18compile_test_noasmv:
>>    0x0000000000400970 <+0>:     repz retq
>> End of assembler dump.
>> Dump of assembler code for function _Z19compile_test_noloopv:
>>    0x0000000000400980 <+0>:     repz retq
>> End of assembler dump.
>> Dump of assembler code for function _Z28compile_test_asm_inside_loopv:
>>    0x0000000000400990 <+0>:     xor    %edx,%edx
>>    0x0000000000400992 <+2>:     mov    $0x601080,%ecx
>>    0x0000000000400997 <+7>:     xor    %eax,%eax
>>    0x0000000000400999 <+9>:     mov    (%rcx,%rdx,8),%rsi
>>    0x000000000040099d <+13>:    cmp    $0x80,%rsi
>>    0x00000000004009a4 <+20>:    ja     0x4009c1 <_Z28compile_test_asm_inside_loopv+49>
>>    0x00000000004009a6 <+22>:    nopw   %cs:0x0(%rax,%rax,1)
>>    0x00000000004009b0 <+32>:    add    $0x1,%rax
>>    0x00000000004009b4 <+36>:    mov    (%rcx,%rax,8),%rdi
>>    0x00000000004009b8 <+40>:    cmp    $0x80,%rdi
>>    0x00000000004009bf <+47>:    jbe    0x4009b0 <_Z28compile_test_asm_inside_loopv+32>
>>    0x00000000004009c1 <+49>:    repz retq
>> End of assembler dump.
>>
>>



More information about the Gcc-bugs mailing list