This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug c/68725] New: suboptimal handling of constant compound literals
- From: "rv at rasmusvillemoes dot dk" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Sat, 05 Dec 2015 18:46:26 +0000
- Subject: [Bug c/68725] New: suboptimal handling of constant compound literals
- Auto-submitted: auto-generated
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68725
Bug ID: 68725
Summary: suboptimal handling of constant compound literals
Product: gcc
Version: 5.1.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
Assignee: unassigned at gcc dot gnu.org
Reporter: rv at rasmusvillemoes dot dk
CC: rostedt at goodmis dot org
Target Milestone: ---
The motivation for this comes from the linux kernel's
include/trace/trace_events.h file (hence the cc Steven Rostedt), in
particular the __print_flags and __print_symbolic macros. They are
invoked multiple times with the same list of flag_array initializers,
which causes identical copies of the defined static array to be stored
in .rodata. The obvious solution, to define the array once in some .c
file and declare it extern, is rather inconvenient to try to retrofit
to the way the tracing subsystem works.
So I decided to try and see if using anonymous objects (compound
literals) would work. It turns out it did, and then it didn't. gcc is
smart enough to only emit a single copy to .rodata (at least within a
single translation unit, but that's good enough in this
case). However, at each use site, gcc decides to make a stack copy and
pass a pointer to that copy on to the trace_print_flags_seq function. This
is very inefficient, completely redundant, and since some of the
arrays are over 1k in size, unacceptable in kernel code.
The problem can be seen in the example below. For smaller arrays, gcc
doesn't seem to put a copy in .rodata; it constructs the array on the
stack with a sequence of movq instructions, which is even more
inefficient (the .text to build the array takes more space than a copy
of the array in .rodata + a memcpy would) and equally wasteful. And it
gets even worse when one looks at the g functions, where the stack use
is doubled, and the copy/construction is done twice.
For const-qualified compound literals with compile-time constant
initializers, which are not explicitly used to initialize another
object, I don't see any reason to actually construct such an
object. [Maybe as an optimization if the object only takes up a few
words, but even that is questionable.] As soon as the size is greater
than, say, 32 bytes, I think it would be much better to just refer to a
single copy in .rodata.
I've tried gcc 4.9, 5.1 and both -O2, -O3, and they all show the same
behaviour.
// gcc -std=gnu89 -O2 -o complit.o -c complit.c
#include <stddef.h>
/*
 * One entry of a flag table: a mask value and an associated string.
 * flag_pair() fills an entry as { value, "spelling of the argument" }.
 */
struct flag_name {
	unsigned long mask;
	const char *name;
};
/*
 * Single-bit masks of type unsigned long, FLAG_n == 1UL << n for n = 0..32.
 * NOTE: FLAG_32 shifts by 32, which is only defined where unsigned long is
 * wider than 32 bits; the report's object file is elf64-x86-64.
 */
#define FLAG_0 (1UL << 0)
#define FLAG_1 (1UL << 1)
#define FLAG_2 (1UL << 2)
#define FLAG_3 (1UL << 3)
#define FLAG_4 (1UL << 4)
#define FLAG_5 (1UL << 5)
#define FLAG_6 (1UL << 6)
#define FLAG_7 (1UL << 7)
#define FLAG_8 (1UL << 8)
#define FLAG_9 (1UL << 9)
#define FLAG_10 (1UL << 10)
#define FLAG_11 (1UL << 11)
#define FLAG_12 (1UL << 12)
#define FLAG_13 (1UL << 13)
#define FLAG_14 (1UL << 14)
#define FLAG_15 (1UL << 15)
#define FLAG_16 (1UL << 16)
#define FLAG_17 (1UL << 17)
#define FLAG_18 (1UL << 18)
#define FLAG_19 (1UL << 19)
#define FLAG_20 (1UL << 20)
#define FLAG_21 (1UL << 21)
#define FLAG_22 (1UL << 22)
#define FLAG_23 (1UL << 23)
#define FLAG_24 (1UL << 24)
#define FLAG_25 (1UL << 25)
#define FLAG_26 (1UL << 26)
#define FLAG_27 (1UL << 27)
#define FLAG_28 (1UL << 28)
#define FLAG_29 (1UL << 29)
#define FLAG_30 (1UL << 30)
#define FLAG_31 (1UL << 31)
#define FLAG_32 (1UL << 32)
/*
 * Expands to a brace-enclosed initializer { f, "f" }: the argument's value
 * followed by its spelling as a string literal. The # operator stringifies
 * the argument as written, so flag_pair(FLAG_0) yields the name "FLAG_0".
 */
#define flag_pair(f) {f, #f}
/*
 * Initializer list with one flag_pair() entry for each of FLAG_0..FLAG_32
 * (33 entries). There is no trailing comma, so a user can append further
 * entries after a comma.
 */
#define FLAG_NAMES \
flag_pair(FLAG_0), \
flag_pair(FLAG_1), \
flag_pair(FLAG_2), \
flag_pair(FLAG_3), \
flag_pair(FLAG_4), \
flag_pair(FLAG_5), \
flag_pair(FLAG_6), \
flag_pair(FLAG_7), \
flag_pair(FLAG_8), \
flag_pair(FLAG_9), \
flag_pair(FLAG_10), \
flag_pair(FLAG_11), \
flag_pair(FLAG_12), \
flag_pair(FLAG_13), \
flag_pair(FLAG_14), \
flag_pair(FLAG_15), \
flag_pair(FLAG_16), \
flag_pair(FLAG_17), \
flag_pair(FLAG_18), \
flag_pair(FLAG_19), \
flag_pair(FLAG_20), \
flag_pair(FLAG_21), \
flag_pair(FLAG_22), \
flag_pair(FLAG_23), \
flag_pair(FLAG_24), \
flag_pair(FLAG_25), \
flag_pair(FLAG_26), \
flag_pair(FLAG_27), \
flag_pair(FLAG_28), \
flag_pair(FLAG_29), \
flag_pair(FLAG_30), \
flag_pair(FLAG_31), \
flag_pair(FLAG_32)
/*
 * Short variant of FLAG_NAMES: flag_pair() entries for FLAG_0..FLAG_4 only
 * (5 entries), used by f2() and g2() to show the small-array case.
 */
#define FLAG_NAMES2 \
flag_pair(FLAG_0), \
flag_pair(FLAG_1), \
flag_pair(FLAG_2), \
flag_pair(FLAG_3), \
flag_pair(FLAG_4)
/*
 * External consumer of a flag table; only declared here, never defined, so
 * the compiler cannot see how the array is used.
 * NOTE(review): every caller below ends the table with {-1UL, NULL},
 * presumably as a terminating sentinel -- the definition is not shown.
 */
void print_flags(const char *s, unsigned long flags, const struct flag_name
*names);
/*
 * Large-array case, single use: passes a const compound literal holding the
 * 33 FLAG_NAMES entries plus a final {-1UL, NULL} entry to print_flags().
 * In the disassembly quoted in this report, gcc reserves 0x228 bytes of
 * stack and copies 0x44 quadwords from .rodata with rep movsq instead of
 * passing the .rodata address directly.
 */
void f(unsigned long flags)
{
print_flags("foo", flags, (const struct flag_name[]){ FLAG_NAMES,
{-1UL, NULL}});
}
/*
 * Large-array case, two uses: calls print_flags() twice with textually
 * identical const compound literals, masking flags down to its low three
 * bits between the calls. In the quoted disassembly the frame grows to
 * 0x440 bytes and the rep movsq copy from .rodata is performed once per call.
 */
void g(unsigned long flags)
{
print_flags("bar", flags, (const struct flag_name[]){ FLAG_NAMES,
{-1UL, NULL}});
/* Keep only bits 0..2 for the second call. */
flags &= 0x07;
print_flags("baz", flags, (const struct flag_name[]){ FLAG_NAMES,
{-1UL, NULL}});
}
/*
 * Small-array case, single use: same as f() but with the 5-entry
 * FLAG_NAMES2 list plus the final {-1UL, NULL} entry. In the quoted
 * disassembly gcc builds the array on the stack with a sequence of movq
 * stores rather than copying from (or pointing at) .rodata.
 */
void f2(unsigned long flags)
{
print_flags("foo", flags, (const struct flag_name[]){ FLAG_NAMES2,
{-1UL, NULL}});
}
/*
 * Small-array case, two uses: same as g() but with FLAG_NAMES2. In the
 * quoted disassembly the array is constructed twice with movq stores, at
 * (%rsp) and at 0x60(%rsp), within a 0xc8-byte frame.
 */
void g2(unsigned long flags)
{
print_flags("bar", flags, (const struct flag_name[]){ FLAG_NAMES2,
{-1UL, NULL}});
/* Keep only bits 0..2 for the second call. */
flags &= 0x07;
print_flags("baz", flags, (const struct flag_name[]){ FLAG_NAMES2,
{-1UL, NULL}});
}
objdump output:
complit.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <f>:
0: 48 81 ec 28 02 00 00 sub $0x228,%rsp
7: 49 89 f8 mov %rdi,%r8
a: be 00 00 00 00 mov $0x0,%esi
b: R_X86_64_32 .rodata
f: 48 89 e7 mov %rsp,%rdi
12: 48 89 e2 mov %rsp,%rdx
15: b9 44 00 00 00 mov $0x44,%ecx
1a: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi)
1d: 4c 89 c6 mov %r8,%rsi
20: bf 00 00 00 00 mov $0x0,%edi
21: R_X86_64_32 .rodata.str1.1
25: e8 00 00 00 00 callq 2a <f+0x2a>
26: R_X86_64_PC32 print_flags-0x4
2a: 48 81 c4 28 02 00 00 add $0x228,%rsp
31: c3 retq
32: 66 66 66 66 66 2e 0f 1f 84 00 00 00 data16 data16 data16 data16 nopw %cs:0x0(%rax,%rax,1)
3e: 00 00
0000000000000040 <g>:
40: 53 push %rbx
41: 48 89 fb mov %rdi,%rbx
44: be 00 00 00 00 mov $0x0,%esi
45: R_X86_64_32 .rodata
49: b9 44 00 00 00 mov $0x44,%ecx
4e: 48 81 ec 40 04 00 00 sub $0x440,%rsp
55: 48 89 e2 mov %rsp,%rdx
58: 48 89 e7 mov %rsp,%rdi
5b: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi)
5e: 48 89 de mov %rbx,%rsi
61: bf 00 00 00 00 mov $0x0,%edi
62: R_X86_64_32 .rodata.str1.1+0x102
66: 83 e3 07 and $0x7,%ebx
69: e8 00 00 00 00 callq 6e <g+0x2e>
6a: R_X86_64_PC32 print_flags-0x4
6e: 48 8d 94 24 20 02 00 00 lea 0x220(%rsp),%rdx
76: 48 8d bc 24 20 02 00 00 lea 0x220(%rsp),%rdi
7e: be 00 00 00 00 mov $0x0,%esi
7f: R_X86_64_32 .rodata
83: b9 44 00 00 00 mov $0x44,%ecx
88: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi)
8b: 48 89 de mov %rbx,%rsi
8e: bf 00 00 00 00 mov $0x0,%edi
8f: R_X86_64_32 .rodata.str1.1+0x106
93: e8 00 00 00 00 callq 98 <g+0x58>
94: R_X86_64_PC32 print_flags-0x4
98: 48 81 c4 40 04 00 00 add $0x440,%rsp
9f: 5b pop %rbx
a0: c3 retq
a1: 66 66 66 66 66 66 2e 0f 1f 84 00 00 data16 data16 data16 data16 data16 nopw %cs:0x0(%rax,%rax,1)
ad: 00 00 00
00000000000000b0 <f2>:
b0: 48 83 ec 68 sub $0x68,%rsp
b4: 48 89 fe mov %rdi,%rsi
b7: bf 00 00 00 00 mov $0x0,%edi
b8: R_X86_64_32 .rodata.str1.1
bc: 48 89 e2 mov %rsp,%rdx
bf: 48 c7 04 24 01 00 00 00 movq $0x1,(%rsp)
c7: 48 c7 44 24 08 00 00 00 00 movq $0x0,0x8(%rsp)
cc: R_X86_64_32S .rodata.str1.1+0x4
d0: 48 c7 44 24 10 02 00 00 00 movq $0x2,0x10(%rsp)
d9: 48 c7 44 24 18 00 00 00 00 movq $0x0,0x18(%rsp)
de: R_X86_64_32S .rodata.str1.1+0xb
e2: 48 c7 44 24 20 04 00 00 00 movq $0x4,0x20(%rsp)
eb: 48 c7 44 24 28 00 00 00 00 movq $0x0,0x28(%rsp)
f0: R_X86_64_32S .rodata.str1.1+0x12
f4: 48 c7 44 24 30 08 00 00 00 movq $0x8,0x30(%rsp)
fd: 48 c7 44 24 38 00 00 00 00 movq $0x0,0x38(%rsp)
102: R_X86_64_32S .rodata.str1.1+0x19
106: 48 c7 44 24 40 10 00 00 00 movq $0x10,0x40(%rsp)
10f: 48 c7 44 24 48 00 00 00 00 movq $0x0,0x48(%rsp)
114: R_X86_64_32S .rodata.str1.1+0x20
118: 48 c7 44 24 50 ff ff ff ff movq $0xffffffffffffffff,0x50(%rsp)
121: 48 c7 44 24 58 00 00 00 00 movq $0x0,0x58(%rsp)
12a: e8 00 00 00 00 callq 12f <f2+0x7f>
12b: R_X86_64_PC32 print_flags-0x4
12f: 48 83 c4 68 add $0x68,%rsp
133: c3 retq
134: 66 66 66 2e 0f 1f 84 00 00 00 00 00 data16 data16 nopw %cs:0x0(%rax,%rax,1)
0000000000000140 <g2>:
140: 55 push %rbp
141: 53 push %rbx
142: 48 c7 c5 ff ff ff ff mov $0xffffffffffffffff,%rbp
149: 48 89 fe mov %rdi,%rsi
14c: 48 89 fb mov %rdi,%rbx
14f: bf 00 00 00 00 mov $0x0,%edi
150: R_X86_64_32 .rodata.str1.1+0x102
154: 48 81 ec c8 00 00 00 sub $0xc8,%rsp
15b: 83 e3 07 and $0x7,%ebx
15e: 48 89 e2 mov %rsp,%rdx
161: 48 89 6c 24 50 mov %rbp,0x50(%rsp)
166: 48 c7 04 24 01 00 00 00 movq $0x1,(%rsp)
16e: 48 c7 44 24 08 00 00 00 00 movq $0x0,0x8(%rsp)
173: R_X86_64_32S .rodata.str1.1+0x4
177: 48 c7 44 24 10 02 00 00 00 movq $0x2,0x10(%rsp)
180: 48 c7 44 24 18 00 00 00 00 movq $0x0,0x18(%rsp)
185: R_X86_64_32S .rodata.str1.1+0xb
189: 48 c7 44 24 20 04 00 00 00 movq $0x4,0x20(%rsp)
192: 48 c7 44 24 28 00 00 00 00 movq $0x0,0x28(%rsp)
197: R_X86_64_32S .rodata.str1.1+0x12
19b: 48 c7 44 24 30 08 00 00 00 movq $0x8,0x30(%rsp)
1a4: 48 c7 44 24 38 00 00 00 00 movq $0x0,0x38(%rsp)
1a9: R_X86_64_32S .rodata.str1.1+0x19
1ad: 48 c7 44 24 40 10 00 00 00 movq $0x10,0x40(%rsp)
1b6: 48 c7 44 24 48 00 00 00 00 movq $0x0,0x48(%rsp)
1bb: R_X86_64_32S .rodata.str1.1+0x20
1bf: 48 c7 44 24 58 00 00 00 00 movq $0x0,0x58(%rsp)
1c8: e8 00 00 00 00 callq 1cd <g2+0x8d>
1c9: R_X86_64_PC32 print_flags-0x4
1cd: 48 8d 54 24 60 lea 0x60(%rsp),%rdx
1d2: 48 89 de mov %rbx,%rsi
1d5: bf 00 00 00 00 mov $0x0,%edi
1d6: R_X86_64_32 .rodata.str1.1+0x106
1da: 48 89 ac 24 b0 00 00 00 mov %rbp,0xb0(%rsp)
1e2: 48 c7 44 24 60 01 00 00 00 movq $0x1,0x60(%rsp)
1eb: 48 c7 44 24 68 00 00 00 00 movq $0x0,0x68(%rsp)
1f0: R_X86_64_32S .rodata.str1.1+0x4
1f4: 48 c7 44 24 70 02 00 00 00 movq $0x2,0x70(%rsp)
1fd: 48 c7 44 24 78 00 00 00 00 movq $0x0,0x78(%rsp)
202: R_X86_64_32S .rodata.str1.1+0xb
206: 48 c7 84 24 80 00 00 00 04 00 00 00 movq $0x4,0x80(%rsp)
212: 48 c7 84 24 88 00 00 00 00 00 00 00 movq $0x0,0x88(%rsp)
21a: R_X86_64_32S .rodata.str1.1+0x12
21e: 48 c7 84 24 90 00 00 00 08 00 00 00 movq $0x8,0x90(%rsp)
22a: 48 c7 84 24 98 00 00 00 00 00 00 00 movq $0x0,0x98(%rsp)
232: R_X86_64_32S .rodata.str1.1+0x19
236: 48 c7 84 24 a0 00 00 00 10 00 00 00 movq $0x10,0xa0(%rsp)
242: 48 c7 84 24 a8 00 00 00 00 00 00 00 movq $0x0,0xa8(%rsp)
24a: R_X86_64_32S .rodata.str1.1+0x20
24e: 48 c7 84 24 b8 00 00 00 00 00 00 00 movq $0x0,0xb8(%rsp)
25a: e8 00 00 00 00 callq 25f <g2+0x11f>
25b: R_X86_64_PC32 print_flags-0x4
25f: 48 81 c4 c8 00 00 00 add $0xc8,%rsp
266: 5b pop %rbx
267: 5d pop %rbp
268: c3 retq