Created attachment 36833 [details] unreduced testcase. Since r230091, GCC apparently miscompiles the glibc dynamic linker when using -O3: markus@x4 glibc-build % gdb --args /var/tmp/glibc-build/elf/ld-linux-x86-64.so.2 /home/markus/bin/xmonad Reading symbols from /var/tmp/glibc-build/elf/ld-linux-x86-64.so.2...done. (gdb) run Starting program: /var/tmp/glibc-build/elf/ld-linux-x86-64.so.2 /home/markus/bin/xmonad [Thread debugging using libthread_db enabled] Using host libthread_db library "/lib/libthread_db.so.1". Program received signal SIGSEGV, Segmentation fault. _dl_lookup_symbol_x (undef_name=0x7ffff75cb3c5 "__gmpn_invert_limb", undef_map=0x7ffff76494d0, ref=ref@entry=0x7fffffffa170, symbol_scope=0x7ffff7649828, version=0x0, type_class=type_class@entry=1, flags=1, skip_map=0x0) at dl-lookup.c:809 809 struct sym_val current_value = { NULL, NULL }; (gdb) bt #0 _dl_lookup_symbol_x (undef_name=0x7ffff75cb3c5 "__gmpn_invert_limb", undef_map=0x7ffff76494d0, ref=ref@entry=0x7fffffffa170, symbol_scope=0x7ffff7649828, version=0x0, type_class=type_class@entry=1, flags=1, skip_map=0x0) at dl-lookup.c:809 #1 0x0000555555564413 in _dl_fixup (l=<optimized out>, reloc_arg=<optimized out>) at ../elf/dl-runtime.c:111 #2 0x000055555556ce1f in _dl_runtime_resolve_sse () at ../sysdeps/x86_64/dl-trampoline.h:112 #3 0x00007ffff762093a in __gmpn_divrem_1 () from /usr/lib/libgmp.so.10 #4 0xee6b280000000000 in ?? () #5 0x00007ffff6b0b138 in ?? () #6 0x00007ffff6b057d0 in ?? () #7 0x00000000ffffffe2 in ?? () #8 0x00007ffff6b057b8 in ?? () #9 0x00007ffff6b051b0 in ?? () #10 0x00007ffff6b0b1b0 in ?? () #11 0x00000000006b47d8 in ?? () #12 0x00000000006142a9 in ?? () #13 0x0000000000000000 in ?? () (gdb) disass Dump of assembler code for function _dl_lookup_symbol_x: ... 
0x000055555555e915 <+117>: jne 0x55555555e900 <_dl_lookup_symbol_x+96> 0x000055555555e917 <+119>: mov %eax,%eax 0x000055555555e919 <+121>: mov %rax,-0xb0(%rbp) 0x000055555555e920 <+128>: pxor %xmm0,%xmm0 0x000055555555e924 <+132>: mov $0xffffffff,%eax 0x000055555555e929 <+137>: incq 0x21d048(%rip) # 0x55555577b978 <_rtld_local+2424> 0x000055555555e930 <+144>: test %r12,%r12 0x000055555555e933 <+147>: mov %rax,-0xa0(%rbp) => 0x000055555555e93a <+154>: movaps %xmm0,-0x90(%rbp) 0x000055555555e941 <+161>: je 0x55555555e950 <_dl_lookup_symbol_x+176> rax 0xffffffff 4294967295 rbx 0x7ffff763c360 140737343898464 rcx 0x7ffff75cb3d7 140737343435735 rdx 0x0 0 rsi 0x33f9b936d4fb49a0 3745228210287888800 rdi 0x7ffff75cb3c5 140737343435717 rbp 0x7fffffffa148 0x7fffffffa148 rsp 0x7fffffffa058 0x7fffffffa058 r8 0x0 0 r9 0x1 1 r10 0x7ffff76494d0 140737343952080 r11 0x7ffff7620840 140737343785024 r12 0x0 0 r13 0x0 0 r14 0x7ffff76494d0 140737343952080 r15 0x7fffffffa170 140737488331120 rip 0x55555555e93a 0x55555555e93a <_dl_lookup_symbol_x+154> eflags 0x10246 [ PF ZF IF RF ] cs 0x33 51 ss 0x2b 43 ds 0x0 0 es 0x0 0 fs 0x0 0 gs 0x0 0 Comparison of disassembly of the function in question left with __attribute__ ((__target__ ("no-mmx,no-sse"))) right without (segfaulting). 
add %rsi,%rax | add %rsi,%rax add %rdx,%rax | add %rdx,%rax movzbl (%rcx),%edx | movzbl (%rcx),%edx test %dl,%dl | test %dl,%dl jne da0 <_dl_lookup_symbol_x+0x60> | jne da0 <_dl_lookup_symbol_x+0x60> mov %eax,%ebx | mov %eax,%ebx mov $0xffffffff,%eax | pxor %xmm0,%xmm0 test %r13,%r13 | mov $0xffffffff,%eax movq $0x0,-0x80(%rbp) | test %r13,%r13 | mov %rax,-0x88(%rbp) mov %rax,-0x88(%rbp) | movaps %xmm0,-0x80(%rbp) movq $0x0,-0x78(%rbp) | je ddf <_dl_lookup_symbol_x+0x9f> | testl $0xfffffffa,0x10(%rbp) je de7 <_dl_lookup_symbol_x+0xa7> | jne 18e4 <_dl_lookup_symbol_x+0xba4> testl $0xfffffffa,0x10(%rbp) | mov -0xb8(%rbp),%rax jne 196c <_dl_lookup_symbol_x+0xc2c> | test %r14,%r14 mov -0xb8(%rbp),%rax | mov (%rax),%r9 test %r14,%r14 | jne 1000 <_dl_lookup_symbol_x+0x2c0> mov (%rax),%r9 | test %r9,%r9 jne 1040 <_dl_lookup_symbol_x+0x300> | movq $0x0,-0xc0(%rbp) markus@x4 elf % gcc -O3 -c dl-lookup.i markus@x4 elf % objdump -dr dl-lookup.o | grep movaps 14f4: 0f 29 45 80 movaps %xmm0,-0x80(%rbp) 163b: 0f 29 45 90 movaps %xmm0,-0x70(%rbp) With -fno-vect-cost-model even 4.9 generates the segfaulting instruction.
You have left out bits of the disassembly which would show if the stack is misaligned on entry of the function. (This would be a bug in GHC, not GCC.)
0000000000001470 <_dl_lookup_symbol_x>: 1470: 55 push %rbp 1471: 48 89 e5 mov %rsp,%rbp 1474: 41 57 push %r15 1476: 41 56 push %r14 1478: 41 55 push %r13 147a: 41 54 push %r12 147c: 49 89 d5 mov %rdx,%r13 147f: 53 push %rbx 1480: 49 89 f7 mov %rsi,%r15 1483: 4d 89 c6 mov %r8,%r14 1486: 48 81 ec a8 00 00 00 sub $0xa8,%rsp 148d: 0f b6 17 movzbl (%rdi),%edx 1490: 48 89 bd 68 ff ff ff mov %rdi,-0x98(%rbp) 1497: 48 89 8d 50 ff ff ff mov %rcx,-0xb0(%rbp) 149e: 44 89 8d 5c ff ff ff mov %r9d,-0xa4(%rbp) 14a5: 4c 8b 65 18 mov 0x18(%rbp),%r12 14a9: 84 d2 test %dl,%dl 14ab: 0f 84 cb 02 00 00 je 177c <_dl_lookup_symbol_x+0x30c> 14b1: 48 89 f9 mov %rdi,%rcx 14b4: b8 05 15 00 00 mov $0x1505,%eax 14b9: 0f 1f 80 00 00 00 00 nopl 0x0(%rax) 14c0: 48 89 c6 mov %rax,%rsi 14c3: 48 83 c1 01 add $0x1,%rcx 14c7: 48 c1 e6 05 shl $0x5,%rsi 14cb: 48 01 f0 add %rsi,%rax 14ce: 48 01 d0 add %rdx,%rax 14d1: 0f b6 11 movzbl (%rcx),%edx 14d4: 84 d2 test %dl,%dl 14d6: 75 e8 jne 14c0 <_dl_lookup_symbol_x+0x50> 14d8: 89 c0 mov %eax,%eax 14da: 48 89 85 60 ff ff ff mov %rax,-0xa0(%rbp) 14e1: 66 0f ef c0 pxor %xmm0,%xmm0 14e5: b8 ff ff ff ff mov $0xffffffff,%eax 14ea: 4d 85 f6 test %r14,%r14 14ed: 48 89 85 78 ff ff ff mov %rax,-0x88(%rbp) 14f4: 0f 29 45 80 movaps %xmm0,-0x80(%rbp)
(In reply to Markus Trippelsdorf from comment #0) > rbp 0x7fffffffa148 0x7fffffffa148 (In reply to Markus Trippelsdorf from comment #2) > 0000000000001470 <_dl_lookup_symbol_x>: > 1470: 55 push %rbp > 1471: 48 89 e5 mov %rsp,%rbp > 14f4: 0f 29 45 80 movaps %xmm0,-0x80(%rbp) If I'm reading this correctly, %rsp was a multiple of 16 when the function was entered (%rbp holds 0x7fffffffa148, the value of %rsp after the push of %rbp, so %rsp was 0x7fffffffa150 on entry). That is not correct: at function entry, %rsp has to be congruent to 8 modulo 16. This looks like a GHC or GHCi bug (which may already have been fixed).
We vectorize zeroing of protected_value and current_value, both of which are analyzed as being properly aligned. They are of type struct sym_val { const Elf64_Sym *s; struct link_map *m; }; which would have natural alignment of 8 bytes only. The actual decls look like arg 0 <var_decl 0x7ffff62d4e10 current_value type <record_type 0x7ffff67b2690 sym_val sizes-gimplified type_0 TI size <integer_cst 0x7ffff68cccf0 constant 128> unit size <integer_cst 0x7ffff68ccd08 constant 16> align 64 symtab 0 alias set 44 canonical type 0x7ffff67b2690 fields <field_decl 0x7ffff6873098 s> context <translation_unit_decl 0x7ffff671dbb8 D.10151> pointer_to_this <pointer_type 0x7ffff64dd5e8> chain <type_decl 0x7ffff6873000 D.9849>> addressable used TI file dl-lookup.c line 810 col 18 size <integer_cst 0x7ffff68cccf0 128> unit size <integer_cst 0x7ffff68ccd08 16> align 128 context <function_decl 0x7ffff6a968c0 _dl_lookup_symbol_x> and thus indeed have a DECL_ALIGN of 16 bytes. expand_one_var computes align as else align = MINIMUM_ALIGNMENT (var, DECL_MODE (var), DECL_ALIGN (var)); 128 as well and registers that with the output machinery. Partition 2: size 64 align 16 all protected_value all Partition 1: size 16 align 16 current_value Partition 0: size 8 align 8 old_hash which looks ok as well. Breakpoint 11, expand_one_stack_var_at ( decl=<var_decl 0x7ffff62ddc60 protected_value>, base=0x7ffff68cc3a8, base_align=128, offset=-64) at /space/rguenther/src/svn/trunk3/gcc/cfgexpand.c:978 978 gcc_assert (offset == trunc_int_for_mode (offset, Pmode)); Breakpoint 11, expand_one_stack_var_at ( decl=<var_decl 0x7ffff62d4e10 current_value>, base=0x7ffff68cc3a8, base_align=128, offset=-80) at /space/rguenther/src/svn/trunk3/gcc/cfgexpand.c:978 978 gcc_assert (offset == trunc_int_for_mode (offset, Pmode)); ok as well. So does the initial RTL (well, virtual-stack-vars is still live). So this is an RTL optimization or target issue.
(In reply to Florian Weimer from comment #3) > (In reply to Markus Trippelsdorf from comment #0) > > rbp 0x7fffffffa148 0x7fffffffa148 > > (In reply to Markus Trippelsdorf from comment #2) > > 0000000000001470 <_dl_lookup_symbol_x>: > > 1470: 55 push %rbp > > 1471: 48 89 e5 mov %rsp,%rbp > > > 14f4: 0f 29 45 80 movaps %xmm0,-0x80(%rbp) > > If I'm reading this correctly, %rsp was a multiple of 16 when the function > is entered, but this is not correct, it has to be congruent 8 modulo 16. > > This looks like a GHC or GHCi bug (which may already have been fixed). Ok. Well, I'm running GHC version 7.10.2, which is the latest. (Not sure how to report this issue to the ghc folks).
(In reply to Markus Trippelsdorf from comment #5) > Ok. Well, I'm running GHC version 7.10.2, which is the latest. > (Not sure how to report this issue to the ghc folks). To be honest, if Richard thinks there is a GCC bug here (and I read comment #4 this way), he's likely right and I'm wrong.
(In reply to Florian Weimer from comment #6) > (In reply to Markus Trippelsdorf from comment #5) > > > Ok. Well, I'm running GHC version 7.10.2, which is the latest. > > (Not sure how to report this issue to the ghc folks). > > To be honest, if Richard thinks there is a GCC bug here (and I read comment > #4 this way), he's likely right and I'm wrong. Well, only Haskell programs crash. So I think you're right.
(In reply to Markus Trippelsdorf from comment #7) > Well, only Haskell programs crash. This is a valid point. You can open a GHC ticket in Trac, after registering: <https://ghc.haskell.org/trac/ghc/>
For the curious: https://ghc.haskell.org/trac/ghc/ticket/11133
(In reply to Florian Weimer from comment #6) > (In reply to Markus Trippelsdorf from comment #5) > > > Ok. Well, I'm running GHC version 7.10.2, which is the latest. > > (Not sure how to report this issue to the ghc folks). > > To be honest, if Richard thinks there is a GCC bug here (and I read comment > #4 this way), he's likely right and I'm wrong. No, I just assumed it is a bug because Markus inspected the assembler and analyzed it to be broken. I only verified that everything is fine from vectorization through RTL expansion.