50696 – [x32] Unnecessary lea

Bug 50696 - [x32] Unnecessary lea

Summary: [x32] Unnecessary lea

Status:	UNCONFIRMED

Alias:	None

Product:	gcc
Classification:	Unclassified
Component:	rtl-optimization (show other bugs)
Version:	4.7.0

Importance:	P3 normal
Target Milestone:	---
Assignee:	Not yet assigned to anyone

URL:	http://gcc.gnu.org/ml/gcc-patches/201...
Keywords:	patch

Depends on:
Blocks:

Reported:	2011-10-11 17:36 UTC by H.J. Lu
Modified:	2021-02-16 15:47 UTC (History)
CC List:	3 users (show)

See Also:	50633
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed:

Attachments
Add an attachment (proposed patch, testcase, etc.)

Note You need to log in before you can comment on or make changes to this bug.

Description H.J. Lu 2011-10-11 17:36:32 UTC

[hjl@gnu-mic-2 pr50633]$ cat x.i
struct s { int val[16]; };

extern double f (struct s pb, double pc);

int main ()
{
  struct s x;
  int i;

  for (i = 0; i < 16; i++)
    x.val[i] = i + 1;
  if (f (x, 10000.0L) != 10136.0L)
    __builtin_abort ();
  return 0;
}
[hjl@gnu-mic-2 pr50633]$ make x.s
/export/build/gnu/gcc-x32/build-x86_64-linux/gcc/xgcc
-B/export/build/gnu/gcc-x32/build-x86_64-linux/gcc/ -mx32 -O -S x.i
[hjl@gnu-mic-2 pr50633]$ cat x.s
    .file    "x.i"
    .text
    .globl    main
    .type    main, @function
main:
.LFB0:
    .cfi_startproc
    subq    $136, %rsp
    .cfi_def_cfa_offset 144
    movl    $0, %eax
    movl    %esp, %ecx
    addl    $60, %ecx
.L2:
    addl    $1, %eax
    leal    (%rcx,%rax,4), %edx
    movl    %eax, (%edx)
    cmpl    $16, %eax
    jne    .L2
    movq    64(%rsp), %rax
    movq    %rax, (%rsp)
    movq    72(%rsp), %rax
    movq    %rax, 8(%rsp)
    movq    80(%rsp), %rax
    movq    %rax, 16(%rsp)
    movq    88(%rsp), %rax
    movq    %rax, 24(%rsp)
    movq    96(%rsp), %rax
    movq    %rax, 32(%rsp)
    movq    104(%rsp), %rax
    movq    %rax, 40(%rsp)
    movq    112(%rsp), %rax
    movq    %rax, 48(%rsp)
    movq    120(%rsp), %rax
    movq    %rax, 56(%rsp)
    movsd    .LC0(%rip), %xmm0
    call    f
    ucomisd    .LC1(%rip), %xmm0
    jp    .L5
    je    .L7
.L5:
    call    abort
.L7:
    movl    $0, %eax
    addq    $136, %rsp
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE0:
    .size    main, .-main

    leal    (%rcx,%rax,4), %edx
    movl    %eax, (%edx)

can be combined into

       movl    %eax, (%ecx,%eax,4)

[reply] [-] Comment 4 H.J. Lu 2011-10-06 19:19:23 UTC

Combine failed:

(set (mem:SI (and:DI (plus:DI (subreg:DI (mult:SI (reg/v:SI 84 [ i ])
                        (const_int 4 [0x4])) 0)
                (subreg:DI (reg:SI 106) 0)) 
            (const_int 4294967292 [0xfffffffc])) [3 MEM[symbol: x, index:
D.2741_12, step: 4, offset: 4294967292B]+0 S4 A32])
    (reg/v:SI 84 [ i ])) 

for

(insn 37 35 39 3 (set (reg:SI 90)
        (plus:SI (mult:SI (reg/v:SI 84 [ i ])
                (const_int 4 [0x4]))
            (reg:SI 106))) x.i:11 247 {*leasi_2}
     (nil))

(insn 39 37 41 3 (set (mem:SI (zero_extend:DI (reg:SI 90)) [3 MEM[symbol: x,
index: D.2741_12, step: 4, offset: 4294967292B]+0 S4 A32])
        (reg/v:SI 84 [ i ])) x.i:11 64 {*movsi_internal}
     (expr_list:REG_DEAD (reg:SI 90)
        (nil)))

Since address is 32bit aligned, 0xfffffffc is the same as
0xffffffff.  But we don't have this information.

why combine creates:

Failed to match this instruction:
(set (mem:SI (and:DI (plus:DI (subreg:DI (mult:SI (reg/v:SI 85 [ i ])
                        (const_int 4 [0x4])) 0)
                (subreg:DI (reg:SI 106) 0))
            (const_int 4294967292 [0xfffffffc])) [0 MEM[symbol: x, index:
D.2741_1, step: 4, offset: 4294967292B]+0 S4 A32])
    (reg/v:SI 85 [ i ]))

Considering that this is in fact zero-extension, the "optimized" pattern is
worse than sticking subreg to the whole address, i.e.

(and:DI (subreg:DI (plus:SI (mult:SI (reg/v:SI 85 [ i ]) (const_int 4 [0x4]))
                            (reg:SI 106)) 0)
        (const_int 4294967295 [0xffffffff]))

Please note that we have registers in two different modes in the former
pattern. The later pattern would be recognized by i386.c code.

Comment 1 H.J. Lu 2011-10-11 17:59:50 UTC

It is generated by expand_compound_operation.

Comment 2 H.J. Lu 2011-10-11 20:09:10 UTC

For

(insn 37 35 39 3 (parallel [
            (set (reg:SI 88) 
                (plus:SI (reg:SI 89) 
                    (reg:SI 100)))
            (clobber (reg:CC 17 flags))
        ]) x.i:12 253 {*addsi_1}
     (expr_list:REG_DEAD (reg:SI 89) 
        (expr_list:REG_UNUSED (reg:CC 17 flags)
            (nil))))

(insn 39 37 41 3 (set (mem:SI (zero_extend:DI (reg:SI 88)) [0 MEM[symbol: x, index: D.2735_1, step: 4, offset: 4294967292B]+0 S4 A32])
        (reg/v:SI 85 [ i ])) x.i:12 64 {*movsi_internal}
     (expr_list:REG_DEAD (reg:DI 87)
        (nil)))

combine replaces zero_extend with and. It may be a valid
option for normal computation.  But it messes up the
POINTERS_EXTEND_UNSIGNED > 0 target where address is
zero-extendeded from ptr_mode to Pmode.

Comment 3 H.J. Lu 2011-10-11 20:11:59 UTC

Does this patch

diff --git a/gcc/combine.c b/gcc/combine.c
index 6c3b17c..52259b6 100644
--- a/gcc/combine.c
+++ b/gcc/combine.c
@@ -5078,6 +5078,22 @@ subst (rtx x, rtx from, rtx to, int in_dest, int in_cond, int unique_copy)
 	    }
 	}
     }
+#ifdef POINTERS_EXTEND_UNSIGNED
+  else if (POINTERS_EXTEND_UNSIGNED > 0
+	   && code == MEM
+	   && GET_MODE (XEXP (x, 0)) == Pmode
+	   && GET_MODE (from) == ptr_mode
+	   && GET_CODE (XEXP (x, 0)) == ZERO_EXTEND
+	   && COMBINE_RTX_EQUAL_P (XEXP (XEXP (x, 0), 0), from))
+    {
+      /* If an address is zero-extended from ptr_mode to
+	 Pmode, replace FROM with TO.  */
+      SUBST (XEXP (XEXP (x, 0), 0),
+	     (unique_copy && n_occurrences
+	      ? copy_rtx (to) : to));
+      n_occurrences++;
+    }
+#endif
   else
     {
       len = GET_RTX_LENGTH (code);

make any senses?

Comment 4 H.J. Lu 2011-10-11 22:13:47 UTC

const_32bit_mask is incorrect since combine may optimize VAL
in ADDR & VAL from 0xffffffff to 0xfffffffc.  Even if we take
this into account, we can't decompose

(plus:DI (subreg:DI (mult:SI (reg/v:SI 85 [ i ])
            (const_int 4 [0x4])) 0)
    (subreg:DI (reg:SI 100) 0))

Comment 5 H.J. Lu 2011-10-11 23:29:05 UTC

This patch changes combine not to generate:

(plus:DI (subreg:DI (mult:SI (reg/v:SI 85 [ i ])
            (const_int 4 [0x4])) 0)
    (subreg:DI (reg:SI 100) 0))

and changes const_32bit_mask to match what combine may generate:

diff --git a/gcc/combine.c b/gcc/combine.c
index 6c3b17c..147d158 100644
--- a/gcc/combine.c
+++ b/gcc/combine.c
@@ -6905,10 +6905,17 @@ expand_compound_operation (rtx x)
       tem = gen_lowpart (mode, XEXP (x, 0));
       if (!tem || GET_CODE (tem) == CLOBBER)
 	return x;
-      tem = simplify_shift_const (NULL_RTX, ASHIFT, mode,
-				  tem, modewidth - pos - len);
-      tem = simplify_shift_const (NULL_RTX, unsignedp ? LSHIFTRT : ASHIFTRT,
-				  mode, tem, modewidth - len);
+      if (GET_CODE (x) == ZERO_EXTEND)
+	tem = gen_rtx_AND (mode, tem,
+			   GEN_INT (GET_MODE_MASK (GET_MODE (XEXP (x, 0)))));
+      else
+	{
+	  tem = simplify_shift_const (NULL_RTX, ASHIFT, mode,
+				      tem, modewidth - pos - len);
+	  tem = simplify_shift_const (NULL_RTX,
+				      unsignedp ? LSHIFTRT : ASHIFTRT,
+				      mode, tem, modewidth - len);
+	}
     }
   else if (unsignedp && len < HOST_BITS_PER_WIDE_INT)
     tem = simplify_and_const_int (NULL_RTX, GET_MODE (x),
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 349f5b0..03da1fb 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -600,8 +600,8 @@
 ;; Match exactly 0x0FFFFFFFF in anddi as a zero-extension operation
 (define_predicate "const_32bit_mask"
   (and (match_code "const_int")
-       (match_test "trunc_int_for_mode (INTVAL (op), DImode)
-		    == (HOST_WIDE_INT) 0xffffffff")))
+       (match_test "(trunc_int_for_mode (INTVAL (op), DImode) & (HOST_WIDE_INT)
 0xffffff00)
+		    == (HOST_WIDE_INT) 0xffffff00")))
 
 ;; Match 2, 4, or 8.  Used for leal multiplicands.
 (define_predicate "const248_operand"

Comment 6 Paolo Bonzini 2011-10-14 16:24:38 UTC

H.J., can you please add a link to the patch that you are testing and that Kenner informally approved?

Comment 7 H.J. Lu 2011-10-14 16:35:45 UTC

A patch is posted at

http://gcc.gnu.org/ml/gcc-patches/2011-10/msg01307.html

Comment 8 Eric Gallager 2018-03-12 04:24:53 UTC

(In reply to H.J. Lu from comment #7)
> A patch is posted at
> 
> http://gcc.gnu.org/ml/gcc-patches/2011-10/msg01307.html

Adding "patch" keyword