Bug 36222 - x86 fails to optimize out __v4si -> __m128i move
Summary: x86 fails to optimize out __v4si -> __m128i move
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: target (show other bugs)
Version: 4.4.0
Importance: P3 normal
Target Milestone: 4.4.0
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization, ra
Depends on: 30961
Blocks:
  Show dependency treegraph
 
Reported: 2008-05-12 15:37 UTC by H.J. Lu
Modified: 2009-02-13 15:57 UTC (History)
2 users (show)

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed: 2008-05-16 19:09:08


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description H.J. Lu 2008-05-12 15:37:35 UTC
[hjl@gnu-6 sse-1]$ cat d.c
#include <emmintrin.h>

__m128i
foo2 (int x1, int x2, int x3, int x4)
{
  return _mm_set_epi32 (x1, x2, x3, x4);
}
[hjl@gnu-6 sse-1]$ make d.s
/export/build/gnu/gcc-stack-internal/build-x86_64-linux/gcc/xgcc -B/export/build/gnu/gcc-stack-internal/build-x86_64-linux/gcc/ -msse4 -march=core2 -m3dnow -msse5 -maes -mpclmul -Wall -O2 -S d.c
[hjl@gnu-6 sse-1]$ cat d.s
        .file   "d.c"
        .text
        .p2align 4,,15
.globl foo2
        .type   foo2, @function
foo2:
.LFB518:
        movd    %edx, %xmm0
        movd    %edi, %xmm2
        movd    %ecx, %xmm1
        punpckldq       %xmm0, %xmm1
        movd    %esi, %xmm0
        punpckldq       %xmm2, %xmm0
        movq    %xmm1, %xmm2
        punpcklqdq      %xmm0, %xmm2
        movdqa  %xmm2, %xmm0  <<< --- Can we optimize it out?
        ret
Comment 1 H.J. Lu 2008-05-12 15:57:16 UTC
Also do we need "movq    %xmm1, %xmm2"?
Comment 2 Uroš Bizjak 2008-05-12 19:23:48 UTC
(In reply to comment #1)
> Also do we need "movq    %xmm1, %xmm2"?

We can help RA a bit by emitting RTL sequence that requires less pseudos.

Index: i386.c
===================================================================
--- i386.c      (revision 135220)
+++ i386.c      (working copy)
@@ -23859,14 +23859,14 @@
        /* For V4SF and V4SI, we implement a concat of two V2 vectors.
           Recurse to load the two halves.  */
 
+       op1 = gen_reg_rtx (half_mode);
+       v = gen_rtvec (2, XVECEXP (vals, 0, 2), XVECEXP (vals, 0, 3));
+       ix86_expand_vector_init (false, op1, gen_rtx_PARALLEL (half_mode, v));
+
        op0 = gen_reg_rtx (half_mode);
        v = gen_rtvec (2, XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1));
        ix86_expand_vector_init (false, op0, gen_rtx_PARALLEL (half_mode, v));
 
-       op1 = gen_reg_rtx (half_mode);
-       v = gen_rtvec (2, XVECEXP (vals, 0, 2), XVECEXP (vals, 0, 3));
-       ix86_expand_vector_init (false, op1, gen_rtx_PARALLEL (half_mode, v));
-
        use_vec_concat = true;
       }
       break;
@@ -23883,10 +23883,10 @@
 
   if (use_vec_concat)
     {
+      if (!register_operand (op1, half_mode))
+       op1 = force_reg (half_mode, op1);
       if (!register_operand (op0, half_mode))
        op0 = force_reg (half_mode, op0);
-      if (!register_operand (op1, half_mode))
-       op1 = force_reg (half_mode, op1);
 
       emit_insn (gen_rtx_SET (VOIDmode, target,
                              gen_rtx_VEC_CONCAT (mode, op0, op1)));


This patch will produce:

_mm_set_epi32:
.LFB2:
        movd    %edi, %xmm0
        movd    %esi, %xmm1
        movd    %edx, %xmm2
        punpckldq       %xmm0, %xmm1
        movd    %ecx, %xmm0
        punpckldq       %xmm2, %xmm0
        punpcklqdq      %xmm1, %xmm0
        ret
Comment 3 H.J. Lu 2008-05-13 13:58:18 UTC
This bug may be related to PR 30961. Another example:

bash-3.2$ cat d.c
#include <emmintrin.h>

__m128i
foo2 (long long x1, long long x2)
{
  return _mm_set_epi64x (x1, x2);
}
bash-3.2$ /export/build/gnu/gcc-stack/build-x86_64-linux/gcc/xgcc -B./ -B/export/build/gnu/gcc-stack/build-x86_64-linux/gcc/ -Wall -I.. -O2 -march=core2 -msse4 -fno-asynchronous-unwind-tables -DDEBUG -S d.c 
bash-3.2$ cat d.s
        .file   "d.c"
        .text
        .p2align 4,,15
.globl foo2
        .type   foo2, @function
foo2:
        movd    %rsi, %xmm1
        pinsrq  $0x1, %rdi, %xmm1
        movdqa  %xmm1, %xmm0
        ret
        .size   foo2, .-foo2

d.c.176r.lreg has

;; Pred edge  ENTRY [100.0%]  (fallthru)
(note:HI 5 1 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)

(insn:HI 2 5 3 2 d.c:5 (set (reg/v:DI 59 [ x1 ])
        (reg:DI 5 di [ x1 ])) 89 {*movdi_1_rex64} (expr_list:REG_DEAD (reg:DI 5 di [ x1 ])
        (nil)))

(insn:HI 3 2 4 2 d.c:5 (set (reg/v:DI 60 [ x2 ])
        (reg:DI 4 si [ x2 ])) 89 {*movdi_1_rex64} (expr_list:REG_DEAD (reg:DI 4 si [ x2 ])
        (nil)))

(note:HI 4 3 7 2 NOTE_INSN_FUNCTION_BEG)

(note:HI 7 4 12 2 NOTE_INSN_DELETED)

(insn:HI 12 7 18 2 d.c:7 (set (reg/i:V2DI 21 xmm0 [ <result> ])
        (vec_concat:V2DI (reg/v:DI 60 [ x2 ])
            (reg/v:DI 59 [ x1 ]))) 1340 {*vec_concatv2di_rex64_sse4_1} (expr_list:REG_DEAD (reg/v:DI 60 [ x2 ])
        (expr_list:REG_DEAD (reg/v:DI 59 [ x1 ])
            (nil))))

(insn:HI 18 12 0 2 d.c:7 (use (reg/i:V2DI 21 xmm0 [ <result> ])) -1 (nil))
;; End of basic block 2 -> ( 1)
;; lr  out       6 [bp] 7 [sp] 16 [argp] 20 [frame] 21 [xmm0]
;; live  out     6 [bp] 7 [sp] 16 [argp] 20 [frame] 21 [xmm0]

But d.c.177r.greg has
;; Pred edge  ENTRY [100.0%]  (fallthru)
(note:HI 5 1 4 2 [bb 2] NOTE_INSN_BASIC_BLOCK)

(note:HI 4 5 7 2 NOTE_INSN_FUNCTION_BEG)

(note:HI 7 4 24 2 NOTE_INSN_DELETED)

(insn 24 7 12 2 d.c:7 (set (reg:DI 22 xmm1)
        (reg/v:DI 4 si [orig:60 x2 ] [60])) 89 {*movdi_1_rex64} (nil))

(insn:HI 12 24 25 2 d.c:7 (set (reg:V2DI 22 xmm1)
        (vec_concat:V2DI (reg:DI 22 xmm1)
            (reg/v:DI 5 di [orig:59 x1 ] [59]))) 1340 {*vec_concatv2di_rex64_sse4_1} (nil))

(insn 25 12 18 2 d.c:7 (set (reg/i:V2DI 21 xmm0 [ <result> ])
        (reg:V2DI 22 xmm1)) 1019 {*movv2di_internal} (nil))

(insn 18 25 23 2 d.c:7 (use (reg/i:V2DI 21 xmm0 [ <result> ])) -1 (nil))
;; End of basic block 2 -> ( 1)
;; lr  out       7 [sp] 21 [xmm0]
;; live  out     7 [sp] 21 [xmm0]

Ulrich, do you have any idea why reload won't use xmm0? Thanks.
Comment 4 H.J. Lu 2008-05-13 14:09:40 UTC
It looks like reload doesn't check any vector instructions. I guess there may
be many missed optimizations with vector instructions.
Comment 5 uros 2008-05-13 21:34:29 UTC
Subject: Bug 36222

Author: uros
Date: Tue May 13 21:33:40 2008
New Revision: 135275

URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=135275
Log:
        PR target/36222
        * config/i386/i386.c (ix86_expand_vector_init_general): Rearrange op0
        and op1 expansion before vector concat to have less live pseudos.

testsuite/ChangeLog:

        PR target/36222
        * gcc.target/i386/pr36222-1.c: New test.


Added:
    trunk/gcc/testsuite/gcc.target/i386/pr36222-1.c
Modified:
    trunk/gcc/ChangeLog
    trunk/gcc/config/i386/i386.c
    trunk/gcc/testsuite/ChangeLog

Comment 6 Uroš Bizjak 2008-05-16 19:09:08 UTC
There is still underlying RA problem left. Compiling the testcase from Comment 0 on 32bit i686 (-O2 -msse2), we get:

_mm_set_epi32:
	pushl	%ebp
	movl	%esp, %ebp
	movd	8(%ebp), %xmm0
	movd	12(%ebp), %xmm2
	punpckldq	%xmm0, %xmm2
	movd	16(%ebp), %xmm0
>>>	movq	%xmm2, %xmm1
	movd	20(%ebp), %xmm2
	popl	%ebp
	punpckldq	%xmm0, %xmm2
>>>	movq	%xmm2, %xmm0
	punpcklqdq	%xmm1, %xmm0
	ret

Two movq insns can be avoided by rearranging registers.
Comment 7 H.J. Lu 2008-05-16 21:54:17 UTC
find_reloads in reload.c has

  /* Special case a simple move with an input reload and a
     destination of a hard reg, if the hard reg is ok, use it.  */
  for (i = 0; i < n_reloads; i++) 
    if (rld[i].when_needed == RELOAD_FOR_INPUT
        && GET_CODE (PATTERN (insn)) == SET
        && REG_P (SET_DEST (PATTERN (insn)))
        && (SET_SRC (PATTERN (insn)) == rld[i].in
            || SET_SRC (PATTERN (insn)) == rld[i].in_reg)
        && !elimination_target_reg_p (SET_DEST (PATTERN (insn))))
      {    
        rtx dest = SET_DEST (PATTERN (insn));
        unsigned int regno = REGNO (dest);

If it can be extended to handle vector instructions, it will help this
bug.
Comment 8 Ulrich Weigand 2008-05-18 15:58:41 UTC
That special case in find_reloads is really about a different situation.
We do not have a simple move here.

The problem also is not really related to vector instructions in particular;
reload doesn't at all care what the instructions actually do ...

There are two problems involved in this particular case, each of which
suffices to prevent optimal register allocation.

Before reload, we have

(insn:HI 10 27 11 2 d.c:7 (set (reg:V2SI 66)
        (vec_concat:V2SI (mem/c/i:SI (reg/f:SI 16 argp) [2 x1+0 S4 A32])
            (reg/v:SI 60 [ x2 ]))) 1338 {*vec_concatv2si_sse2} (expr_list:REG_DEAD (reg/v:SI 60 [ x2 ])
        (nil)))

where local register allocation has already selected hard registers
for *both* operands 0 and 2:

;; Register 60 in 21.
;; Register 66 in 21.

Now, the insn pattern offers those alternatives:

(define_insn "*vec_concatv2si_sse2"
  [(set (match_operand:V2SI 0 "register_operand"     "=x,x ,*y,*y")
        (vec_concat:V2SI
          (match_operand:SI 1 "nonimmediate_operand" " 0,rm, 0,rm")
          (match_operand:SI 2 "reg_or_0_operand"     " x,C ,*y, C")))]

As operand 2 is not zero ("C" constraint), reload must choose
the first alternative, which has a matching constraint between
operands 0 and 1.

This means that the choices selected by local-alloc (both operand
0 and 2 in the same register) *force* a reload of operand 0 here.

Why does local-alloc choose the same register for the two operands?
This happens because in general, there is no conflict between a
register that is used and set in the same insn, because usually the
same hard register *can* be used for both.

In this case this is not true, but local-alloc does not recognize
this.  There is indeed code in block_alloc that tries to handle
matching constraints, but this only recognizes the more typical
scenario where *every* alternative requires a match.

Here, we seemingly have alternatives that do not require a match
-- of course, that doesn't help because in these alternatives
operand 2 is extremely constrained ("C" will only accept a constant
zero) and so they aren't actually usable ...


Even assuming local-alloc had made better choices, reload would still
generate an output reload.  This second problem really comes down to
use of matching constraints between operands of different modes / sizes.

Once operand 0 was assigned to a hard register by local-alloc, reload
would generally attempt to also assign operand 1 to the same register,
in order to fulfill the matching constraint without requiring an
output reload.

This is done by the routine find_dummy_reload.  However, in this
particular case, that routine immediately fails due to:

  /* If operands exceed a word, we can't use either of them
     unless they have the same size.  */
  if (GET_MODE_SIZE (outmode) != GET_MODE_SIZE (inmode)
      && (GET_MODE_SIZE (outmode) > UNITS_PER_WORD
          || GET_MODE_SIZE (inmode) > UNITS_PER_WORD))
    return 0;

because operand 0 is two words in size, while operand 1 is just
a single word in size.  I'm not completely sure this check (which
has been in SVN forever) is still required today ...


In any case, the simplest work-around might be to write that pattern
in a way that is easier to handle by local-alloc / reload:  the two
cases x <- 0, x  and x <- rm, C are nearly completely unrelated; why
not split them into two different insn patterns?


Comment 9 Richard Biener 2008-05-18 16:58:31 UTC
Did you investigate whether IRA fixes this issue?
Comment 10 H.J. Lu 2009-02-13 15:57:28 UTC
Fixed. Gcc 4.4.0 revision 144128 generates:

foo2:
	movd	%edi, %xmm0
	movd	%esi, %xmm1
	movd	%edx, %xmm2
	punpckldq	%xmm0, %xmm1
	movd	%ecx, %xmm0
	punpckldq	%xmm2, %xmm0
	punpcklqdq	%xmm1, %xmm0
	ret
	.size	foo2, .-foo2