[hjl@gnu-mic-2 pr50633]$ cat x.i struct s { int val[16]; }; extern double f (struct s pb, double pc); int main () { struct s x; int i; for (i = 0; i < 16; i++) x.val[i] = i + 1; if (f (x, 10000.0L) != 10136.0L) __builtin_abort (); return 0; } [hjl@gnu-mic-2 pr50633]$ make x.s /export/build/gnu/gcc-x32/build-x86_64-linux/gcc/xgcc -B/export/build/gnu/gcc-x32/build-x86_64-linux/gcc/ -mx32 -O -S x.i [hjl@gnu-mic-2 pr50633]$ cat x.s .file "x.i" .text .globl main .type main, @function main: .LFB0: .cfi_startproc subq $136, %rsp .cfi_def_cfa_offset 144 movl $0, %eax movl %esp, %ecx addl $60, %ecx .L2: addl $1, %eax leal (%rcx,%rax,4), %edx movl %eax, (%edx) cmpl $16, %eax jne .L2 movq 64(%rsp), %rax movq %rax, (%rsp) movq 72(%rsp), %rax movq %rax, 8(%rsp) movq 80(%rsp), %rax movq %rax, 16(%rsp) movq 88(%rsp), %rax movq %rax, 24(%rsp) movq 96(%rsp), %rax movq %rax, 32(%rsp) movq 104(%rsp), %rax movq %rax, 40(%rsp) movq 112(%rsp), %rax movq %rax, 48(%rsp) movq 120(%rsp), %rax movq %rax, 56(%rsp) movsd .LC0(%rip), %xmm0 call f ucomisd .LC1(%rip), %xmm0 jp .L5 je .L7 .L5: call abort .L7: movl $0, %eax addq $136, %rsp .cfi_def_cfa_offset 8 ret .cfi_endproc .LFE0: .size main, .-main leal (%rcx,%rax,4), %edx movl %eax, (%edx) can be combined into movl %eax, (%ecx,%eax,4) [reply] [-] Comment 4 H.J. Lu 2011-10-06 19:19:23 UTC Combine failed: (set (mem:SI (and:DI (plus:DI (subreg:DI (mult:SI (reg/v:SI 84 [ i ]) (const_int 4 [0x4])) 0) (subreg:DI (reg:SI 106) 0)) (const_int 4294967292 [0xfffffffc])) [3 MEM[symbol: x, index: D.2741_12, step: 4, offset: 4294967292B]+0 S4 A32]) (reg/v:SI 84 [ i ])) for (insn 37 35 39 3 (set (reg:SI 90) (plus:SI (mult:SI (reg/v:SI 84 [ i ]) (const_int 4 [0x4])) (reg:SI 106))) x.i:11 247 {*leasi_2} (nil)) (insn 39 37 41 3 (set (mem:SI (zero_extend:DI (reg:SI 90)) [3 MEM[symbol: x, index: D.2741_12, step: 4, offset: 4294967292B]+0 S4 A32]) (reg/v:SI 84 [ i ])) x.i:11 64 {*movsi_internal} (expr_list:REG_DEAD (reg:SI 90) (nil))) Since address is 32bit aligned, 0xfffffffc is the same as 0xffffffff. But we don't have this information. why combine creates: Failed to match this instruction: (set (mem:SI (and:DI (plus:DI (subreg:DI (mult:SI (reg/v:SI 85 [ i ]) (const_int 4 [0x4])) 0) (subreg:DI (reg:SI 106) 0)) (const_int 4294967292 [0xfffffffc])) [0 MEM[symbol: x, index: D.2741_1, step: 4, offset: 4294967292B]+0 S4 A32]) (reg/v:SI 85 [ i ])) Considering that this is in fact zero-extension, the "optimized" pattern is worse than sticking subreg to the whole address, i.e. (and:DI (subreg:DI (plus:SI (mult:SI (reg/v:SI 85 [ i ]) (const_int 4 [0x4])) (reg:SI 106)) 0) (const_int 4294967295 [0xffffffff])) Please note that we have registers in two different modes in the former pattern. The later pattern would be recognized by i386.c code.
It is generated by expand_compound_operation.
For (insn 37 35 39 3 (parallel [ (set (reg:SI 88) (plus:SI (reg:SI 89) (reg:SI 100))) (clobber (reg:CC 17 flags)) ]) x.i:12 253 {*addsi_1} (expr_list:REG_DEAD (reg:SI 89) (expr_list:REG_UNUSED (reg:CC 17 flags) (nil)))) (insn 39 37 41 3 (set (mem:SI (zero_extend:DI (reg:SI 88)) [0 MEM[symbol: x, index: D.2735_1, step: 4, offset: 4294967292B]+0 S4 A32]) (reg/v:SI 85 [ i ])) x.i:12 64 {*movsi_internal} (expr_list:REG_DEAD (reg:DI 87) (nil))) combine replaces zero_extend with and. It may be a valid option for normal computation. But it messes up the POINTERS_EXTEND_UNSIGNED > 0 target where address is zero-extendeded from ptr_mode to Pmode.
Does this patch diff --git a/gcc/combine.c b/gcc/combine.c index 6c3b17c..52259b6 100644 --- a/gcc/combine.c +++ b/gcc/combine.c @@ -5078,6 +5078,22 @@ subst (rtx x, rtx from, rtx to, int in_dest, int in_cond, int unique_copy) } } } +#ifdef POINTERS_EXTEND_UNSIGNED + else if (POINTERS_EXTEND_UNSIGNED > 0 + && code == MEM + && GET_MODE (XEXP (x, 0)) == Pmode + && GET_MODE (from) == ptr_mode + && GET_CODE (XEXP (x, 0)) == ZERO_EXTEND + && COMBINE_RTX_EQUAL_P (XEXP (XEXP (x, 0), 0), from)) + { + /* If an address is zero-extended from ptr_mode to + Pmode, replace FROM with TO. */ + SUBST (XEXP (XEXP (x, 0), 0), + (unique_copy && n_occurrences + ? copy_rtx (to) : to)); + n_occurrences++; + } +#endif else { len = GET_RTX_LENGTH (code); make any senses?
const_32bit_mask is incorrect since combine may optimize VAL in ADDR & VAL from 0xffffffff to 0xfffffffc. Even if we take this into account, we can't decompose (plus:DI (subreg:DI (mult:SI (reg/v:SI 85 [ i ]) (const_int 4 [0x4])) 0) (subreg:DI (reg:SI 100) 0))
This patch changes combine not to generate: (plus:DI (subreg:DI (mult:SI (reg/v:SI 85 [ i ]) (const_int 4 [0x4])) 0) (subreg:DI (reg:SI 100) 0)) and changes const_32bit_mask to match what combine may generate: diff --git a/gcc/combine.c b/gcc/combine.c index 6c3b17c..147d158 100644 --- a/gcc/combine.c +++ b/gcc/combine.c @@ -6905,10 +6905,17 @@ expand_compound_operation (rtx x) tem = gen_lowpart (mode, XEXP (x, 0)); if (!tem || GET_CODE (tem) == CLOBBER) return x; - tem = simplify_shift_const (NULL_RTX, ASHIFT, mode, - tem, modewidth - pos - len); - tem = simplify_shift_const (NULL_RTX, unsignedp ? LSHIFTRT : ASHIFTRT, - mode, tem, modewidth - len); + if (GET_CODE (x) == ZERO_EXTEND) + tem = gen_rtx_AND (mode, tem, + GEN_INT (GET_MODE_MASK (GET_MODE (XEXP (x, 0))))); + else + { + tem = simplify_shift_const (NULL_RTX, ASHIFT, mode, + tem, modewidth - pos - len); + tem = simplify_shift_const (NULL_RTX, + unsignedp ? LSHIFTRT : ASHIFTRT, + mode, tem, modewidth - len); + } } else if (unsignedp && len < HOST_BITS_PER_WIDE_INT) tem = simplify_and_const_int (NULL_RTX, GET_MODE (x), diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 349f5b0..03da1fb 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -600,8 +600,8 @@ ;; Match exactly 0x0FFFFFFFF in anddi as a zero-extension operation (define_predicate "const_32bit_mask" (and (match_code "const_int") - (match_test "trunc_int_for_mode (INTVAL (op), DImode) - == (HOST_WIDE_INT) 0xffffffff"))) + (match_test "(trunc_int_for_mode (INTVAL (op), DImode) & (HOST_WIDE_INT) 0xffffff00) + == (HOST_WIDE_INT) 0xffffff00"))) ;; Match 2, 4, or 8. Used for leal multiplicands. (define_predicate "const248_operand"
H.J., can you please add a link to the patch that you are testing and that Kenner informally approved?
A patch is posted at http://gcc.gnu.org/ml/gcc-patches/2011-10/msg01307.html
(In reply to H.J. Lu from comment #7) > A patch is posted at > > http://gcc.gnu.org/ml/gcc-patches/2011-10/msg01307.html Adding "patch" keyword