[Bug c++/64704] New: software crashed when using vectorizing optimization

Wed Jan 21 02:37:00 GMT 2015

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64704

            Bug ID: 64704
           Summary: software crashed when using vectorizing optimization
           Product: gcc
           Version: 4.8.2
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c++
          Assignee: unassigned at gcc dot gnu.org
          Reporter: zhangyajie_koy at 126 dot com

When executing the following for() loop, the program crashes.
// Adds up the memory that icmp6Ptr points to as consecutive 16-bit words,
// starting from a 32-bit sum seeded with 0xffff, and returns that sum
// converted to uint16.
// NOTE(review): the `update` parameter and the local `type` are never read
// in the code shown here.
uint16 MessageBuffer::icmp6Checksum(int update)
{
   TRACE_FUNCTION_ENTRY("");
   register uint32 sum = 0xffff;

   // icmp6Ptr is passed by address to findPayloadType(); presumably that call
   // points it at the ICMPv6 header -- TODO confirm. If it is left NULL, the
   // loop below dereferences a null pointer whenever len > 1.
   struct icmp6_hdr *icmp6Ptr = NULL;
   uint8 type = findPayloadType((void**)&icmp6Ptr);

   register int i;
   uint16 len = getLength();
   // NOTE(review): nothing visible here guarantees that icmp6Ptr is 2-byte
   // aligned. The disassembly quoted later in this report derives its peel
   // count from (address & 0xf) >> 1 and then loads with movdqa, so an odd
   // address would still be misaligned at the movdqa -- a likely cause of the
   // reported crash; confirm by checking the pointer value at run time.
   register uint16 *ptr = (uint16 *)icmp6Ptr;

   // i advances by 2 for every 16-bit word read; presumably len is a byte
   // count, in which case an odd trailing byte is not added -- TODO confirm.
   for (i = 0; i < len - 1; i += 2)
   {
        sum += *ptr++;
   }
   // The uint32 sum is converted to the uint16 return type here; assuming
   // uint16 is a 16-bit type, the upper bits are discarded rather than folded
   // back into the low 16 bits.
   return (sum);
}

This code runs OK when the compiler is 4.4.1 and the OS is Ubuntu 9.10. When the
compiler is 4.8.2 and the OS is Ubuntu 14.04, it crashes. I checked the assembly
code of this for() loop: with 4.8.2, it is optimized in two ways. First, loop
unrolling: the loop is pre-unrolled 10 times. Second, auto-vectorization.
After several tests, I found that when the actual number of loop iterations is
less than 10, it runs OK, while if it is greater than 10, it crashes. So
something must be wrong with the auto-vectorization.
When I modify the makefile to disable the auto-vectorization by using
-O3 -fno-tree-vectorize, it is OK. The assembly code for the for() loop is shown
below.
  for loop unrolling optimization begin:
 13081bc:       45 8d 4d ff             lea    -0x1(%r13),%r9d
 13081c0:       45 85 c9                test   %r9d,%r9d
 13081c3:       0f 8e 9e 02 00 00       jle    1308467
<_ZN13MessageBuffer13icmp6ChecksumEi+0x3a7>
 13081c9:       41 8d 75 fe             lea    -0x2(%r13),%esi
 13081cd:       48 89 da                mov    %rbx,%rdx
 13081d0:       83 e2 0f                and    $0xf,%edx
 13081d3:       d1 ee                   shr    %esi
 13081d5:       48 d1 ea                shr    %rdx
 13081d8:       8d 7e 01                lea    0x1(%rsi),%edi
 13081db:       48 f7 da                neg    %rdx
 13081de:       83 e2 07                and    $0x7,%edx
 13081e1:       39 d7                   cmp    %edx,%edi
 13081e3:       89 f9                   mov    %edi,%ecx
 13081e5:       0f 46 d7                cmovbe %edi,%edx
 13081e8:       83 ff 0a                cmp    $0xa,%edi
 13081eb:       0f 87 0f 02 00 00       ja     1308400
<_ZN13MessageBuffer13icmp6ChecksumEi+0x340>
 13081f1:       44 0f b7 03             movzwl (%rbx),%r8d
 13081f5:       48 8d 53 02             lea    0x2(%rbx),%rdx
 13081f9:       44 01 c0                add    %r8d,%eax
 13081fc:       83 f9 01                cmp    $0x1,%ecx
 13081ff:       0f 86 95 02 00 00       jbe    130849a
<_ZN13MessageBuffer13icmp6ChecksumEi+0x3da>
 1308205:       44 0f b7 43 02          movzwl 0x2(%rbx),%r8d
 130820a:       48 8d 53 04             lea    0x4(%rbx),%rdx
 130820e:       44 01 c0                add    %r8d,%eax
 1308211:       83 f9 02                cmp    $0x2,%ecx
 1308214:       0f 86 75 02 00 00       jbe    130848f
<_ZN13MessageBuffer13icmp6ChecksumEi+0x3cf>
 130821a:       44 0f b7 43 04          movzwl 0x4(%rbx),%r8d
 130821f:       48 8d 53 06             lea    0x6(%rbx),%rdx
 1308223:       44 01 c0                add    %r8d,%eax
 1308226:       83 f9 03                cmp    $0x3,%ecx
 1308229:       0f 86 97 02 00 00       jbe    13084c6
<_ZN13MessageBuffer13icmp6ChecksumEi+0x406>
 130822f:       44 0f b7 43 06          movzwl 0x6(%rbx),%r8d
 1308234:       48 8d 53 08             lea    0x8(%rbx),%rdx
 1308238:       44 01 c0                add    %r8d,%eax
 130823b:       83 f9 04                cmp    $0x4,%ecx
 130823e:       0f 86 77 02 00 00       jbe    13084bb
<_ZN13MessageBuffer13icmp6ChecksumEi+0x3fb>
 1308244:       44 0f b7 43 08          movzwl 0x8(%rbx),%r8d
 1308249:       48 8d 53 0a             lea    0xa(%rbx),%rdx
 130824d:       44 01 c0                add    %r8d,%eax
 1308250:       83 f9 05                cmp    $0x5,%ecx
 1308253:       0f 86 57 02 00 00       jbe    13084b0
<_ZN13MessageBuffer13icmp6ChecksumEi+0x3f0>
 1308259:       44 0f b7 43 0a          movzwl 0xa(%rbx),%r8d
 130825e:       48 8d 53 0c             lea    0xc(%rbx),%rdx
 1308262:       44 01 c0                add    %r8d,%eax
 1308265:       83 f9 06                cmp    $0x6,%ecx
 1308268:       0f 86 37 02 00 00       jbe    13084a5
<_ZN13MessageBuffer13icmp6ChecksumEi+0x3e5>
 130826e:       44 0f b7 43 0c          movzwl 0xc(%rbx),%r8d
 1308273:       48 8d 53 0e             lea    0xe(%rbx),%rdx
 1308277:       44 01 c0                add    %r8d,%eax
 130827a:       83 f9 07                cmp    $0x7,%ecx
 130827d:       0f 86 f6 01 00 00       jbe    1308479
<_ZN13MessageBuffer13icmp6ChecksumEi+0x3b9>
 1308283:       44 0f b7 43 0e          movzwl 0xe(%rbx),%r8d
 1308288:       48 8d 53 10             lea    0x10(%rbx),%rdx
 130828c:       44 01 c0                add    %r8d,%eax
 130828f:       83 f9 08                cmp    $0x8,%ecx
 1308292:       0f 86 d6 01 00 00       jbe    130846e
<_ZN13MessageBuffer13icmp6ChecksumEi+0x3ae>
 1308298:       44 0f b7 43 10          movzwl 0x10(%rbx),%r8d
 130829d:       48 8d 53 12             lea    0x12(%rbx),%rdx
 13082a1:       44 01 c0                add    %r8d,%eax
 13082a4:       83 f9 09                cmp    $0x9,%ecx
 13082a7:       0f 86 d7 01 00 00       jbe    1308484
<_ZN13MessageBuffer13icmp6ChecksumEi+0x3c4>
 13082ad:       44 0f b7 43 12          movzwl 0x12(%rbx),%r8d
 13082b2:       48 8d 53 14             lea    0x14(%rbx),%rdx
 13082b6:       44 01 c0                add    %r8d,%eax
 13082b9:       41 b8 14 00 00 00       mov    $0x14,%r8d
 13082bf:       39 f9                   cmp    %edi,%ecx
 13082c1:       0f 84 e2 00 00 00       je     13083a9
<_ZN13MessageBuffer13icmp6ChecksumEi+0x2e9>
 for loop unrolling optimization end:

 for loop auto-vectorizing optimization begin:
 13082c7:       41 89 fe                mov    %edi,%r14d
 13082ca:       41 89 ca                mov    %ecx,%r10d
 13082cd:       41 29 ce                sub    %ecx,%r14d
 13082d0:       44 89 f5                mov    %r14d,%ebp
 13082d3:       c1 ed 03                shr    $0x3,%ebp
 13082d6:       8d 0c ed 00 00 00 00    lea    0x0(,%rbp,8),%ecx
 13082dd:       85 c9                   test   %ecx,%ecx
 13082df:       74 69                   je     130834a
<_ZN13MessageBuffer13icmp6ChecksumEi+0x28a>
 13082e1:       66 0f ef c0             pxor   %xmm0,%xmm0
 13082e5:       4e 8d 1c 53             lea    (%rbx,%r10,2),%r11
 13082e9:       66 0f ef d2             pxor   %xmm2,%xmm2
 13082ed:       45 31 d2                xor    %r10d,%r10d
 13082f0:       66 41 0f 6f 0b          movdqa (%r11),%xmm1
 13082f5:       41 83 c2 01             add    $0x1,%r10d
 13082f9:       49 83 c3 10             add    $0x10,%r11
 13082fd:       44 39 d5                cmp    %r10d,%ebp
 1308300:       66 0f 6f e1             movdqa %xmm1,%xmm4
 1308304:       66 0f 69 ca             punpckhwd %xmm2,%xmm1
 1308308:       66 0f 61 e2             punpcklwd %xmm2,%xmm4
 130830c:       66 0f fe c4             paddd  %xmm4,%xmm0
 1308310:       66 0f fe c1             paddd  %xmm1,%xmm0
 1308314:       77 da                   ja     13082f0
<_ZN13MessageBuffer13icmp6ChecksumEi+0x230>
 1308316:       66 0f 6f e8             movdqa %xmm0,%xmm5
 130831a:       41 89 ca                mov    %ecx,%r10d
 130831d:       45 8d 04 48             lea    (%r8,%rcx,2),%r8d
 1308321:       4a 8d 14 52             lea    (%rdx,%r10,2),%rdx
 1308325:       66 0f 73 dd 08          psrldq $0x8,%xmm5
 130832a:       66 0f fe c5             paddd  %xmm5,%xmm0
 130832e:       66 0f 6f f0             movdqa %xmm0,%xmm6
 1308332:       66 0f 73 de 04          psrldq $0x4,%xmm6
 1308337:       66 0f fe c6             paddd  %xmm6,%xmm0
 130833b:       66 0f 7e 44 24 0c       movd   %xmm0,0xc(%rsp)
 1308341:       03 44 24 0c             add    0xc(%rsp),%eax
 1308345:       41 39 ce                cmp    %ecx,%r14d
 1308348:       74 5f                   je     13083a9
<_ZN13MessageBuffer13icmp6ChecksumEi+0x2e9>
 130834a:       0f b7 0a                movzwl (%rdx),%ecx
 130834d:       01 c8                   add    %ecx,%eax
 130834f:       41 8d 48 02             lea    0x2(%r8),%ecx
 1308353:       44 39 c9                cmp    %r9d,%ecx
 1308356:       7d 51                   jge    13083a9
<_ZN13MessageBuffer13icmp6ChecksumEi+0x2e9>
 1308358:       0f b7 4a 02             movzwl 0x2(%rdx),%ecx
 130835c:       01 c8                   add    %ecx,%eax
 130835e:       41 8d 48 04             lea    0x4(%r8),%ecx
 1308362:       41 39 c9                cmp    %ecx,%r9d
 1308365:       7e 42                   jle    13083a9
<_ZN13MessageBuffer13icmp6ChecksumEi+0x2e9>
 1308367:       0f b7 4a 04             movzwl 0x4(%rdx),%ecx
 130836b:       01 c8                   add    %ecx,%eax
 130836d:       41 8d 48 06             lea    0x6(%r8),%ecx
 1308371:       41 39 c9                cmp    %ecx,%r9d
 1308374:       7e 33                   jle    13083a9
<_ZN13MessageBuffer13icmp6ChecksumEi+0x2e9>
 1308376:       0f b7 4a 06             movzwl 0x6(%rdx),%ecx
 130837a:       01 c8                   add    %ecx,%eax
 130837c:       41 8d 48 08             lea    0x8(%r8),%ecx
 1308380:       41 39 c9                cmp    %ecx,%r9d
 1308383:       7e 24                   jle    13083a9
<_ZN13MessageBuffer13icmp6ChecksumEi+0x2e9>
 1308385:       0f b7 4a 08             movzwl 0x8(%rdx),%ecx
 1308389:       01 c8                   add    %ecx,%eax
 130838b:       41 8d 48 0a             lea    0xa(%r8),%ecx
 130838f:       41 39 c9                cmp    %ecx,%r9d
 1308392:       7e 15                   jle    13083a9
<_ZN13MessageBuffer13icmp6ChecksumEi+0x2e9>
 1308394:       0f b7 4a 0a             movzwl 0xa(%rdx),%ecx
 1308398:       41 83 c0 0c             add    $0xc,%r8d
 130839c:       01 c8                   add    %ecx,%eax
 130839e:       45 39 c1                cmp    %r8d,%r9d
 13083a1:       7e 06                   jle    13083a9
<_ZN13MessageBuffer13icmp6ChecksumEi+0x2e9>
 13083a3:       0f b7 52 0c             movzwl 0xc(%rdx),%edx
 13083a7:       01 d0                   add    %edx,%eax
 13083a9:       48 8d 5c 73 02          lea    0x2(%rbx,%rsi,2),%rbx
 13083ae:       01 ff                   add    %edi,%edi
 for loop auto vectorizing optimization end:

Our CPU info is:
Intel(R) Xeon(R) CPU E5-2687W 0 @ 3.10GHz
I have 2 CPUs, with 8 cores each.