Created attachment 51494 [details] test.c Structs with flexible array members are not optimized on the stack. Testing with the test.c file shown below (which outputs the runtime in seconds), we see that the compiled program is significantly slower when the struct has a flexible array member than when it does not. This was tested on GCC 9.3.0 and 10.3.0 on Ubuntu 20.04. $ gcc -O3 test.c $ ./a.out 0.302769 $ gcc -O3 -DUSE_FLEX_ARR=1 test.c $ ./a.out 0.728760 clang does not have this issue. $ clang -O3 test.c $ ./a.out 0.312194 $ clang -O3 -DUSE_FLEX_ARR=1 test.c $ ./a.out 0.301175 This is what test.c looks like: #include <stdlib.h> #include <stdio.h> #include <time.h> #include <string.h> struct Test { long is_a; union { struct { long one; long two; long three; } a; struct { int one; int two; int three; int four; #if USE_FLEX_ARR char arr[]; #endif } b; } as; }; #define COUNT 100000000 static inline struct Test make_test_a(struct Test *test) { if (test->is_a) { return *test; } else { struct Test ret; ret.as.a.one = test->as.b.one; ret.as.a.two = test->as.b.two; ret.as.a.three = test->as.b.three; return ret; } } /* This function should be optimized to not allocate struct Test on the stack * since it only uses attribute "three". */ static inline long get_three(struct Test *test) { return make_test_a(test).as.a.three; } int main(int argc, char *argv[]) { struct timespec start, end; struct Test *mem = malloc(sizeof(struct Test) * COUNT); memset(mem, 0, sizeof(struct Test) * COUNT); clock_gettime(CLOCK_MONOTONIC, &start); { for (int i = 0; i < COUNT; i++) { long three = get_three(&mem[i]); if (three) { /* Impossible case. */ printf("what\n"); } } } clock_gettime(CLOCK_MONOTONIC, &end); double time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1000000000.0; printf("%f\n", time); return 0; }
The difference shows up during inlining ....
Adding a char member may also change TBAA, so check whether it still reproduces when the flexible array member is declared as int[] instead.
get_three after einline w/o USE_FLEX_ARR defined (this is changing the type to int): <bb 2> : _5 = test_2(D)->is_a; if (_5 != 0) goto <bb 3>; [50.00%] else goto <bb 4>; [50.00%] <bb 3> : D.3292 = *test_2(D); goto <bb 5>; [100.00%] <bb 4> : _6 = test_2(D)->as.b.one; _7 = (long int) _6; _8 = test_2(D)->as.b.two; _9 = (long int) _8; _10 = test_2(D)->as.b.three; _11 = (long int) _10; D.3292.as.a.one = _7; D.3292.as.a.two = _9; D.3292.as.a.three = _11; <bb 5> : _4 = D.3292.as.a.three; return _4; With USE_FLEX_ARR defined to 1: <bb 2> : _5 = test_2(D)->is_a; if (_5 != 0) goto <bb 3>; [50.00%] else goto <bb 4>; [50.00%] <bb 3> : D.3293 = *test_2(D); goto <bb 5>; [100.00%] <bb 4> : _6 = test_2(D)->as.b.one; _7 = (long int) _6; ret.as.a.one = _7; _8 = test_2(D)->as.b.two; _9 = (long int) _8; ret.as.a.two = _9; _10 = test_2(D)->as.b.three; _11 = (long int) _10; ret.as.a.three = _11; D.3293 = ret; ret ={v} {CLOBBER}; <bb 5> : _4 = D.3293.as.a.three; return _4; As far as I can tell the IR is the same before that, even RSO: D.3293 = make_test_a (test_2(D)); [return slot optimization] But it looks like, in the flex array case, RSO does not actually happen.
Oh it is just esra (and SRA) rejecting the struct: Rejected (3268): zero structure field size: ret Rejected (3311): zero structure field size: ret Rejected (3309): zero structure field size: D.3309 I had read the dump order incorrectly and such.
On Thu, 23 Sep 2021, pinskia at gcc dot gnu.org wrote: > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102452 > > --- Comment #3 from Andrew Pinski <pinskia at gcc dot gnu.org> --- > get_three after einline w/o USE_FLEX_ARR defined (this is changing the type to > int): > > <bb 2> : > _5 = test_2(D)->is_a; > if (_5 != 0) > goto <bb 3>; [50.00%] > else > goto <bb 4>; [50.00%] > > <bb 3> : > D.3292 = *test_2(D); > goto <bb 5>; [100.00%] > > <bb 4> : > _6 = test_2(D)->as.b.one; > _7 = (long int) _6; > _8 = test_2(D)->as.b.two; > _9 = (long int) _8; > _10 = test_2(D)->as.b.three; > _11 = (long int) _10; > D.3292.as.a.one = _7; > D.3292.as.a.two = _9; > D.3292.as.a.three = _11; > > <bb 5> : > _4 = D.3292.as.a.three; > return _4; > > With USE_FLEX_ARR defined to 1: > <bb 2> : > _5 = test_2(D)->is_a; > if (_5 != 0) > goto <bb 3>; [50.00%] > else > goto <bb 4>; [50.00%] > > <bb 3> : > D.3293 = *test_2(D); > goto <bb 5>; [100.00%] > > <bb 4> : > _6 = test_2(D)->as.b.one; > _7 = (long int) _6; > ret.as.a.one = _7; > _8 = test_2(D)->as.b.two; > _9 = (long int) _8; > ret.as.a.two = _9; > _10 = test_2(D)->as.b.three; > _11 = (long int) _10; > ret.as.a.three = _11; > D.3293 = ret; > ret ={v} {CLOBBER}; > > <bb 5> : > _4 = D.3293.as.a.three; > return _4; > > > as far as I can tell the IR is the same before that, > even RSO: > D.3293 = make_test_a (test_2(D)); [return slot optimization] > > > But it looks in the case of the flex array case, RSO does not actually happen. Are you sure it's not SRA behaving differently? The IL inlined early is only showing in later dumps...
On Thu, 23 Sep 2021, pinskia at gcc dot gnu.org wrote: > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102452 > > Andrew Pinski <pinskia at gcc dot gnu.org> changed: > > What |Removed |Added > ---------------------------------------------------------------------------- > Severity|normal |enhancement > Component|ipa |tree-optimization > > --- Comment #4 from Andrew Pinski <pinskia at gcc dot gnu.org> --- > Oh it is just esra (and SRA) rejecting the struct: > Rejected (3268): zero structure field size: ret > > Rejected (3311): zero structure field size: ret > Rejected (3309): zero structure field size: D.3309 > > I had read the dump order incorrectly and such. Yeah - not sure why exactly it excuses itself for !DECL_SIZE fields, but well. Something to improve. (just ignore those fields)