This testcase: --cut here-- typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); typedef int __v4si __attribute__ ((__vector_size__ (16))); __m128i _mm_set_epi32 (int __q3, int __q2, int __q1, int __q0) { return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 }; } --cut here-- compiles using -O2 -m64 -msse2 to: movq %rsi, -8(%rsp) # 28 *movdi_1_rex64/4 [length = 5] movq -8(%rsp), %xmm1 # 29 *movdi_1_rex64/17 [length = 6] movq %rdi, -8(%rsp) # 30 *movdi_1_rex64/4 [length = 5] movq -8(%rsp), %xmm0 # 31 *movdi_1_rex64/17 [length = 6] movq %rcx, -8(%rsp) # 32 *movdi_1_rex64/4 [length = 5] punpckldq %xmm0, %xmm1 # 9 *vec_concatv2si_sse2/1= 3] movq -8(%rsp), %xmm0 # 33 *movdi_1_rex64/17 [length = 6] movq %rdx, -8(%rsp) # 34 *movdi_1_rex64/4 [length = 5] movq -8(%rsp), %xmm2 # 35 *movdi_1_rex64/17 [length = 6] punpckldq %xmm2, %xmm0 # 10 *vec_concatv2si_sse2/1 punpcklqdq %xmm1, %xmm0 # 11 *vec_concatv4si_1/1 ret # 38 return_internal [length = 1] when -march=core2 is added to compile flags, so TARGET_INTER_UNIT_MOVES is enabled, following code is produced: movd %edi, %xmm0 # 29 *movsi_1/11 [length = 3] movd %esi, %xmm1 # 28 *movsi_1/11 [length = 3] movd %edx, %xmm2 # 31 *movsi_1/11 [length = 3] punpckldq %xmm0, %xmm1 # 9 *vec_concatv2si_sse2/1 movd %ecx, %xmm0 # 30 *movsi_1/11 [length = 3] punpckldq %xmm2, %xmm0 # 10 *vec_concatv2si_sse2/1 punpcklqdq %xmm1, %xmm0 # 11 *vec_concatv4si_1/1 ret # 34 return_internal [length = 1] also, when compiled with -m32 -O2 -msse2 -mregparm=3: subl $4, %esp # 37 pro_epilogue_adjust_stack_1/1 movl %edx, (%esp) # 28 *movsi_1/2 [length = 3] movd 8(%esp), %xmm3 # 33 *movsi_1/12 [length = 5] movd (%esp), %xmm0 # 29 *movsi_1/12 [length = 4] movl %eax, (%esp) # 31 *movsi_1/2 [length = 3] movd (%esp), %xmm2 # 32 *movsi_1/12 [length = 4] movl %ecx, (%esp) # 35 *movsi_1/2 [length = 3] punpckldq %xmm2, %xmm0 # 9 *vec_concatv2si_sse2/1 movd (%esp), %xmm2 # 36 *movsi_1/12 [length = 4] movq %xmm0, %xmm1 # 30 
*movv2si_internal/7 [length = 4] punpckldq %xmm2, %xmm3 # 10 *vec_concatv2si_sse2/1 addl $4, %esp # 40 pro_epilogue_adjust_stack_1/1 movq %xmm3, %xmm0 # 34 *movv2si_internal/7 [length = 4] punpcklqdq %xmm1, %xmm0 # 11 *vec_concatv4si_1/1 ret # 41 return_internal [length = 1] The problem is that gcc generates 64-bit reg->mem->xmmreg moves (see the first asm code dump) for 32-bit values when direct reg->xmmreg moves are disabled. This happens only for 64-bit targets; the code for 32-bit targets is what is expected. For the first asm code dump, we have the following RTX for (insn 9) in _lreg: (insn:HI 2 7 3 2 uuu.c:7 (set (reg/v:SI 59 [ __q3 ]) (reg:SI 5 di [ __q3 ])) 47 {*movsi_1} (insn:HI 3 2 4 2 uuu.c:7 (set (reg/v:SI 60 [ __q2 ]) (reg:SI 4 si [ __q2 ])) 47 {*movsi_1} (insn:HI 9 6 10 2 uuu.c:7 (set (reg:V2SI 65) (vec_concat:V2SI (reg/v:SI 60 [ __q2 ]) (reg/v:SI 59 [ __q3 ]))) 1338 {*vec_concatv2si_sse2} Reload says: Reloads for insn # 9 Reload 0: reload_in (SI) = (reg/v:SI 4 si [orig:60 __q2 ] [60]) reload_out (V2SI) = (reg:V2SI 22 xmm1 [65]) SSE_REGS, RELOAD_OTHER (opnum = 0) reload_in_reg: (reg/v:SI 4 si [orig:60 __q2 ] [60]) reload_out_reg: (reg:V2SI 22 xmm1 [65]) reload_reg_rtx: (reg:V2SI 22 xmm1 [65]) Reload 1: reload_in (SI) = (reg/v:SI 5 di [orig:59 __q3 ] [59]) SSE_REGS, RELOAD_FOR_INPUT (opnum = 2) reload_in_reg: (reg/v:SI 5 di [orig:59 __q3 ] [59]) reload_reg_rtx: (reg:SI 21 xmm0) And this results in: (insn 28 6 29 2 uuu.c:7 (set (mem/c:DI (plus:DI (reg/f:DI 7 sp) (const_int -8 [0xfffffffffffffff8])) [0 S8 A8]) (reg:DI 4 si)) 89 {*movdi_1_rex64} (nil)) (insn 29 28 30 2 uuu.c:7 (set (reg:DI 22 xmm1) (mem/c:DI (plus:DI (reg/f:DI 7 sp) (const_int -8 [0xfffffffffffffff8])) [0 S8 A8])) 89 {*movdi_1_rex64} (nil)) (insn 30 29 31 2 uuu.c:7 (set (mem/c:DI (plus:DI (reg/f:DI 7 sp) (const_int -8 [0xfffffffffffffff8])) [0 S8 A8]) (reg:DI 5 di)) 89 {*movdi_1_rex64} (nil)) (insn 31 30 9 2 uuu.c:7 (set (reg:DI 21 xmm0) (mem/c:DI (plus:DI (reg/f:DI 7 sp) (const_int -8 
[0xfffffffffffffff8])) [0 S8 A8])) 89 {*movdi_1_rex64} (nil)) (insn:HI 9 31 32 2 uuu.c:7 (set (reg:V2SI 22 xmm1 [65]) (vec_concat:V2SI (reg:SI 22 xmm1) (reg:SI 21 xmm0))) 1338 {*vec_concatv2si_sse2} (nil)) Do we really need 64bit moves here?
32bit targets (-mregparm=3) choose 32bit moves: Reloads for insn # 9 Reload 0: reload_in (SI) = (reg/v:SI 1 dx [orig:60 __q2 ] [60]) reload_out (V2SI) = (reg:V2SI 22 xmm1 [65]) SSE_REGS, RELOAD_OTHER (opnum = 0) reload_in_reg: (reg/v:SI 1 dx [orig:60 __q2 ] [60]) reload_out_reg: (reg:V2SI 22 xmm1 [65]) reload_reg_rtx: (reg:V2SI 21 xmm0) Reload 1: reload_in (SI) = (reg/v:SI 0 ax [orig:59 __q3 ] [59]) SSE_REGS, RELOAD_FOR_INPUT (opnum = 2) reload_in_reg: (reg/v:SI 0 ax [orig:59 __q3 ] [59]) reload_reg_rtx: (reg:SI 23 xmm2) This results in: (insn 28 6 29 2 uuu.c:7 (set (mem/c:SI (reg/f:SI 7 sp) [0 S4 A8]) (reg/v:SI 1 dx [orig:60 __q2 ] [60])) 47 {*movsi_1} (nil)) (insn 29 28 31 2 uuu.c:7 (set (reg:SI 21 xmm0) (mem/c:SI (reg/f:SI 7 sp) [0 S4 A8])) 47 {*movsi_1} (nil)) (insn 31 29 32 2 uuu.c:7 (set (mem/c:SI (reg/f:SI 7 sp) [0 S4 A8]) (reg/v:SI 0 ax [orig:59 __q3 ] [59])) 47 {*movsi_1} (nil)) (insn 32 31 9 2 uuu.c:7 (set (reg:SI 23 xmm2) (mem/c:SI (reg/f:SI 7 sp) [0 S4 A8])) 47 {*movsi_1} (nil)) (insn:HI 9 32 30 2 uuu.c:7 (set (reg:V2SI 21 xmm0) (vec_concat:V2SI (reg:SI 21 xmm0) (reg:SI 23 xmm2))) 1338 {*vec_concatv2si_sse2} (nil))
It is actually a target issue. SECONDARY_MEMORY_NEEDED_MODE should be defined to handle this. From reload.c: rtx get_secondary_mem (rtx x ATTRIBUTE_UNUSED, enum machine_mode mode, int opnum, enum reload_type type) { rtx loc; int mem_valid; /* By default, if MODE is narrower than a word, widen it to a word. This is required because most machines that require these memory locations do not support short load and stores from all registers (e.g., FP registers). */ #ifdef SECONDARY_MEMORY_NEEDED_MODE mode = SECONDARY_MEMORY_NEEDED_MODE (mode); #else if (GET_MODE_BITSIZE (mode) < BITS_PER_WORD && INTEGRAL_MODE_P (mode)) mode = mode_for_size (BITS_PER_WORD, GET_MODE_CLASS (mode), 0); #endif
Patch in testing: Index: i386.h =================================================================== --- i386.h (revision 135408) +++ i386.h (working copy) @@ -1516,6 +1516,14 @@ #define SECONDARY_MEMORY_NEEDED(CLASS1, CLASS2, MODE) \ ix86_secondary_memory_needed ((CLASS1), (CLASS2), (MODE), 1) +/* Get_secondary_mem widens integral modes to BITS_PER_WORD. + There is no need to emit full 64 bit move for integral modes + that can be moved using 32 bit move. */ +#define SECONDARY_MEMORY_NEEDED_MODE(MODE) \ + (GET_MODE_BITSIZE (MODE) < 32 && INTEGRAL_MODE_P (MODE) \ + ? mode_for_size (32, GET_MODE_CLASS (MODE), 0) \ + : MODE) + /* QImode spills from non-QI registers need a scratch. This does not happen often -- the only example so far requires an uninitialized pseudo. */
Subject: Bug 36246 Author: uros Date: Fri May 16 18:34:04 2008 New Revision: 135437 URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=135437 Log: PR target/36246 * config/i386/i386.h (SECONDARY_MEMORY_NEEDED_MODE): New define. testsuite/ChangeLog: PR target/36246 * gcc.target/i386/pr36246.c: New test. Added: trunk/gcc/testsuite/gcc.target/i386/pr36246.c Modified: trunk/gcc/ChangeLog trunk/gcc/config/i386/i386.h trunk/gcc/testsuite/ChangeLog
Fixed.
The gcc.target/i386/pr36246.c test case fails on i686-apple-darwin9 at -m64 as follows: Executing on host: /sw/src/fink.build/gcc44-4.3.999-20081115/darwin_objdir/gcc/xgcc -B/sw/src/fink.build/gcc44-4.3.999-20081115/darwin_objdir/gcc/ /sw/src/fink.build/gcc44-4.3.999-20081115/gcc-4.4-20081115/gcc/testsuite/gcc.target/i386/pr36246.c -O2 -mtune=generic -S -m64 -o pr36246.s (timeout = 300) PASS: gcc.target/i386/pr36246.c (test for excess errors) FAIL: gcc.target/i386/pr36246.c scan-assembler-not movq
Created attachment 16693 [details] assembly file generated for gcc.target/i386/pr36246.c at -m64 on i686-apple-darwin9