For T being an arithmetic type, gcc 4.1 generates what looks like suboptimal assembly code for inline C++ functions that take their argument(s) by const reference (const T&) vs. the same functions that take their argument(s) by value (T). Ideally, the code generated for test_min_ref() and test_min_ptr() below would be the same as for test_min_val(). The SPARC assembly at the bottom shows that the code generated for both test_min_ref() and test_min_ptr() is worse than that for test_min_val(): each stores its arguments to the stack and reloads the result through a pointer. template <class T> inline const T& min_ref (const T &x, const T &y) { return x < y ? x : y; } template <class T> inline const T* min_ptr (const T *x, const T *y) { return *x < *y ? x : y; } template <class T> inline T min_val (T x, T y) { return x < y ? x : y; } int test_min_ref (int x, int y) { return min_ref (x, y); } int test_min_ptr (int x, int y) { return *min_ptr (&x, &y); } int test_min_val (int x, int y) { return min_val (x, y); } .file "t.cpp" .section ".text" .align 4 .global _Z12test_min_refii .type _Z12test_min_refii, #function .proc 04 _Z12test_min_refii: .LLFB5: cmp %o0, %o1 st %o0, [%sp+68] st %o1, [%sp+72] bge .LL7 add %sp, 68, %o0 jmp %o7+8 ld [%o0], %o0 .LL7: add %sp, 72, %o0 jmp %o7+8 ld [%o0], %o0 .LLFE5: .size _Z12test_min_refii, .-_Z12test_min_refii .global __gxx_personality_v0 .align 4 .global _Z12test_min_ptrii .type _Z12test_min_ptrii, #function .proc 04 _Z12test_min_ptrii: .LLFB6: cmp %o0, %o1 st %o0, [%sp+68] st %o1, [%sp+72] bge .LL13 add %sp, 68, %o0 jmp %o7+8 ld [%o0], %o0 .LL13: add %sp, 72, %o0 jmp %o7+8 ld [%o0], %o0 .LLFE6: .size _Z12test_min_ptrii, .-_Z12test_min_ptrii .align 4 .global _Z12test_min_valii .type _Z12test_min_valii, #function .proc 04 _Z12test_min_valii: .LLFB7: cmp %o0, %o1 bg,a .LL17 mov %o1, %o0 .LL17: jmp %o7+8 nop .LLFE7: .size _Z12test_min_valii, .-_Z12test_min_valii .ident "GCC: (GNU) 4.1.0"
Confirmed. This is because we don't optimize the following: int test_min_ref(int, int) (x, y) { int D.2516; int D.2517; const int & x; int D.2458; <bb 2>: # VUSE <x_10(D)> D.2516_6 = x; # VUSE <y_11(D)> D.2517_7 = y; if (D.2516_6 < D.2517_7) goto <L1>; else goto <L2>; <L1>:; # x_8 = PHI <&y(2), &x(3)> <L2>:; # VUSE <x_10(D), y_11(D)> D.2458_2 = *x_8; return D.2458_2; } We could "backward propagate" the dereference *x_8 into the PHI node. This missed optimization also keeps x and y addressable, which is bad.
Created attachment 13026 [details] prototype patch. It works for the testcase but is otherwise unchecked. It does not look through copies, so it doesn't trigger during early optimization; we might as well move it to phiopts, which is not run during early optimization.
I'll take care of it.
*** This bug has been marked as a duplicate of 19431 ***