[hjl@gnu-6 sse-1]$ cat d.c #include <emmintrin.h> __m128i foo2 (int x1, int x2, int x3, int x4) { return _mm_set_epi32 (x1, x2, x3, x4); } [hjl@gnu-6 sse-1]$ make d.s /export/build/gnu/gcc-stack-internal/build-x86_64-linux/gcc/xgcc -B/export/build/gnu/gcc-stack-internal/build-x86_64-linux/gcc/ -msse4 -march=core2 -m3dnow -msse5 -maes -mpclmul -Wall -O2 -S d.c [hjl@gnu-6 sse-1]$ cat d.s .file "d.c" .text .p2align 4,,15 .globl foo2 .type foo2, @function foo2: .LFB518: movd %edx, %xmm0 movd %edi, %xmm2 movd %ecx, %xmm1 punpckldq %xmm0, %xmm1 movd %esi, %xmm0 punpckldq %xmm2, %xmm0 movq %xmm1, %xmm2 punpcklqdq %xmm0, %xmm2 movdqa %xmm2, %xmm0 <<< --- Can we optimize it out? ret
Also do we need "movq %xmm1, %xmm2"?
(In reply to comment #1) > Also do we need "movq %xmm1, %xmm2"? We can help RA a bit by emitting RTL sequence that requires less pseudos. Index: i386.c =================================================================== --- i386.c (revision 135220) +++ i386.c (working copy) @@ -23859,14 +23859,14 @@ /* For V4SF and V4SI, we implement a concat of two V2 vectors. Recurse to load the two halves. */ + op1 = gen_reg_rtx (half_mode); + v = gen_rtvec (2, XVECEXP (vals, 0, 2), XVECEXP (vals, 0, 3)); + ix86_expand_vector_init (false, op1, gen_rtx_PARALLEL (half_mode, v)); + op0 = gen_reg_rtx (half_mode); v = gen_rtvec (2, XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1)); ix86_expand_vector_init (false, op0, gen_rtx_PARALLEL (half_mode, v)); - op1 = gen_reg_rtx (half_mode); - v = gen_rtvec (2, XVECEXP (vals, 0, 2), XVECEXP (vals, 0, 3)); - ix86_expand_vector_init (false, op1, gen_rtx_PARALLEL (half_mode, v)); - use_vec_concat = true; } break; @@ -23883,10 +23883,10 @@ if (use_vec_concat) { + if (!register_operand (op1, half_mode)) + op1 = force_reg (half_mode, op1); if (!register_operand (op0, half_mode)) op0 = force_reg (half_mode, op0); - if (!register_operand (op1, half_mode)) - op1 = force_reg (half_mode, op1); emit_insn (gen_rtx_SET (VOIDmode, target, gen_rtx_VEC_CONCAT (mode, op0, op1))); This patch will produce: _mm_set_epi32: .LFB2: movd %edi, %xmm0 movd %esi, %xmm1 movd %edx, %xmm2 punpckldq %xmm0, %xmm1 movd %ecx, %xmm0 punpckldq %xmm2, %xmm0 punpcklqdq %xmm1, %xmm0 ret
This bug may be related to PR 30961. Another example: bash-3.2$ cat d.c #include <emmintrin.h> __m128i foo2 (long long x1, long long x2) { return _mm_set_epi64x (x1, x2); } bash-3.2$ /export/build/gnu/gcc-stack/build-x86_64-linux/gcc/xgcc -B./ -B/export/build/gnu/gcc-stack/build-x86_64-linux/gcc/ -Wall -I.. -O2 -march=core2 -msse4 -fno-asynchronous-unwind-tables -DDEBUG -S d.c bash-3.2$ cat d.s .file "d.c" .text .p2align 4,,15 .globl foo2 .type foo2, @function foo2: movd %rsi, %xmm1 pinsrq $0x1, %rdi, %xmm1 movdqa %xmm1, %xmm0 ret .size foo2, .-foo2 d.c.176r.lreg has ;; Pred edge ENTRY [100.0%] (fallthru) (note:HI 5 1 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK) (insn:HI 2 5 3 2 d.c:5 (set (reg/v:DI 59 [ x1 ]) (reg:DI 5 di [ x1 ])) 89 {*movdi_1_rex64} (expr_list:REG_DEAD (reg:DI 5 di [ x1 ]) (nil))) (insn:HI 3 2 4 2 d.c:5 (set (reg/v:DI 60 [ x2 ]) (reg:DI 4 si [ x2 ])) 89 {*movdi_1_rex64} (expr_list:REG_DEAD (reg:DI 4 si [ x2 ]) (nil))) (note:HI 4 3 7 2 NOTE_INSN_FUNCTION_BEG) (note:HI 7 4 12 2 NOTE_INSN_DELETED) (insn:HI 12 7 18 2 d.c:7 (set (reg/i:V2DI 21 xmm0 [ <result> ]) (vec_concat:V2DI (reg/v:DI 60 [ x2 ]) (reg/v:DI 59 [ x1 ]))) 1340 {*vec_concatv2di_rex64_sse4_1} (expr_lis t:REG_DEAD (reg/v:DI 60 [ x2 ]) (expr_list:REG_DEAD (reg/v:DI 59 [ x1 ]) (nil)))) (insn:HI 18 12 0 2 d.c:7 (use (reg/i:V2DI 21 xmm0 [ <result> ])) -1 (nil)) ;; End of basic block 2 -> ( 1) ;; lr out 6 [bp] 7 [sp] 16 [argp] 20 [frame] 21 [xmm0] ;; live out 6 [bp] 7 [sp] 16 [argp] 20 [frame] 21 [xmm0] But d.c.177r.greg has ;; Pred edge ENTRY [100.0%] (fallthru) (note:HI 5 1 4 2 [bb 2] NOTE_INSN_BASIC_BLOCK) (note:HI 4 5 7 2 NOTE_INSN_FUNCTION_BEG) (note:HI 7 4 24 2 NOTE_INSN_DELETED) (insn 24 7 12 2 d.c:7 (set (reg:DI 22 xmm1) (reg/v:DI 4 si [orig:60 x2 ] [60])) 89 {*movdi_1_rex64} (nil)) (insn:HI 12 24 25 2 d.c:7 (set (reg:V2DI 22 xmm1) (vec_concat:V2DI (reg:DI 22 xmm1) (reg/v:DI 5 di [orig:59 x1 ] [59]))) 1340 {*vec_concatv2di_rex64_sse 4_1} (nil)) (insn 25 12 18 2 d.c:7 (set (reg/i:V2DI 21 xmm0 [ <result> ]) (reg:V2DI 22 xmm1)) 1019 {*movv2di_internal} (nil)) (insn 18 25 23 2 d.c:7 (use (reg/i:V2DI 21 xmm0 [ <result> ])) -1 (nil)) ;; End of basic block 2 -> ( 1) ;; lr out 7 [sp] 21 [xmm0] ;; live out 7 [sp] 21 [xmm0] Ulrich, do you have any idea why reload won't use xmm0? Thanks.
It looks like reload doesn't check any vector instructions. I guess there may be many missed optimizations with vector instructions.
Subject: Bug 36222 Author: uros Date: Tue May 13 21:33:40 2008 New Revision: 135275 URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=135275 Log: PR target/36222 * config/i386/i386.c (ix86_expand_vector_init_general): Rearrange op0 and op1 expansion before vector concat to have less live pseudos. testsuite/ChangeLog: PR target/36222 * gcc.target/i386/pr36222-1.c: New test. Added: trunk/gcc/testsuite/gcc.target/i386/pr36222-1.c Modified: trunk/gcc/ChangeLog trunk/gcc/config/i386/i386.c trunk/gcc/testsuite/ChangeLog
There is still underlying RA problem left. Compiling the testcase from Comment 0 on 32bit i686 (-O2 -msse2), we get: _mm_set_epi32: pushl %ebp movl %esp, %ebp movd 8(%ebp), %xmm0 movd 12(%ebp), %xmm2 punpckldq %xmm0, %xmm2 movd 16(%ebp), %xmm0 >>> movq %xmm2, %xmm1 movd 20(%ebp), %xmm2 popl %ebp punpckldq %xmm0, %xmm2 >>> movq %xmm2, %xmm0 punpcklqdq %xmm1, %xmm0 ret Two movq insns can be avoided by rearranging of registers.
find_reloads in reload.c has /* Special case a simple move with an input reload and a destination of a hard reg, if the hard reg is ok, use it. */ for (i = 0; i < n_reloads; i++) if (rld[i].when_needed == RELOAD_FOR_INPUT && GET_CODE (PATTERN (insn)) == SET && REG_P (SET_DEST (PATTERN (insn))) && (SET_SRC (PATTERN (insn)) == rld[i].in || SET_SRC (PATTERN (insn)) == rld[i].in_reg) && !elimination_target_reg_p (SET_DEST (PATTERN (insn)))) { rtx dest = SET_DEST (PATTERN (insn)); unsigned int regno = REGNO (dest); If it can be extended to handle vector instructions, it will help this bug.
That special case in find_reloads is really about a different situation. We do not have a simple move here. The problem also is not really related to vector instruction in particular; reload doesn't at all care what the instructions actually do ... There are two problems involved in this particular case, each of which suffices to prevent optimal register allocation. Before reload, we have (insn:HI 10 27 11 2 d.c:7 (set (reg:V2SI 66) (vec_concat:V2SI (mem/c/i:SI (reg/f:SI 16 argp) [2 x1+0 S4 A32]) (reg/v:SI 60 [ x2 ]))) 1338 {*vec_concatv2si_sse2} (expr_list:REG_DEAD (reg/v:SI 60 [ x2 ]) (nil))) where local register allocation has already selected hard registers for *both* operands 0 and 2: ;; Register 60 in 21. ;; Register 66 in 21. Now, the insn pattern offers those alternatives: (define_insn "*vec_concatv2si_sse2" [(set (match_operand:V2SI 0 "register_operand" "=x,x ,*y,*y") (vec_concat:V2SI (match_operand:SI 1 "nonimmediate_operand" " 0,rm, 0,rm") (match_operand:SI 2 "reg_or_0_operand" " x,C ,*y, C")))] As operand 2 is not zero ("C" constraint), reload must choose the first alternative, which has a matching constraint between operands 0 and 1. This means that the choices selected by local-alloc (both operand 0 and 2 in the same register) *force* a reload of operand 0 here. Why does local-alloc choose the same register for the two operands? This happens because in general, there is no conflict between a register that is used and set in the same insn, because usually the same hard register *can* be used for both. In this case this is not true, but local-alloc does not recognize this. There is indeed code in block_alloc that tries to handle matching constraints, but this only recognizes the more typical scenario where *every* alternative requires a match. Here, we seemingly have alternatives that do not require a match -- of course, that doesn't help because in these alternatives operand 2 is extremely constrained ("C" will only accept a constant zero) and so they aren't actually usable ... Even assuming local-alloc had made better choices, reload would still generate an output reload. This second problem really comes down to use of matching constraints between operands of different modes / sizes. Once operand 0 was assigned to a hard register by local-alloc, reload would generally attempt to also assign operand 1 to the same register, in order to fulfill the matching constraint without requiring an output reload. This is done by the routine find_dummy_reload. However, in this particular case, that routine immediately fails due to: /* If operands exceed a word, we can't use either of them unless they have the same size. */ if (GET_MODE_SIZE (outmode) != GET_MODE_SIZE (inmode) && (GET_MODE_SIZE (outmode) > UNITS_PER_WORD || GET_MODE_SIZE (inmode) > UNITS_PER_WORD)) return 0; because operand 0 is two words in size, while operand 1 is just a single word in size. I'm not completely sure this check (which has been in SVN forever) is still required today ... In any case, the simplest work-around might be to write that pattern in a way that is easier to handle by local-alloc / reload: the two cases x <- 0, x and x <- rm, C are nearly completely unrelated; why not split them into two different insn patterns?
Did you investigate whether IRA fixes this issue?
Fixed. Gcc 4.4.0 revision 144128 generates: foo2: movd %edi, %xmm0 movd %esi, %xmm1 movd %edx, %xmm2 punpckldq %xmm0, %xmm1 movd %ecx, %xmm0 punpckldq %xmm2, %xmm0 punpcklqdq %xmm1, %xmm0 ret .size foo2, .-foo2