For this code: extern unsigned int S[]; extern unsigned int state[]; unsigned int foo () { register unsigned int t; register int j; j=0; t=0; for (j=0; j<16; j+=4) { t= state[j+ 0]^=S[t]; t= state[j+ 1]^=S[t]; t= state[j+ 2]^=S[t]; t= state[j+ 3]^=S[t]; } t=(t)&0xff; return t; } With -O3, gcc generates: foo: .LFB2: xorl %esi, %esi xorl %ecx, %ecx .p2align 4,,7 .L2: movslq %esi,%rdx mov %ecx, %eax movl S(,%rax,4), %eax xorl state(,%rdx,4), %eax movl %eax, state(,%rdx,4) leal 1(%rsi), %edx mov %eax, %eax <====== Why? movl S(,%rax,4), %eax movslq %edx,%rdx xorl state(,%rdx,4), %eax movl %eax, state(,%rdx,4) leal 2(%rsi), %edx mov %eax, %eax <====== Why? movl S(,%rax,4), %eax movslq %edx,%rdx xorl state(,%rdx,4), %eax movl %eax, state(,%rdx,4) leal 3(%rsi), %edx mov %eax, %eax <====== Why? movl S(,%rax,4), %ecx leal 4(%rsi), %eax movslq %edx,%rdx xorl state(,%rdx,4), %ecx cmpl $16, %eax movl %eax, %esi movl %ecx, state(,%rdx,4) jne .L2 movzbl %cl,%eax ret With -O3, gcc 3.4.3 gets foo: .LFB2: xorl %edi, %edi xorl %ecx, %ecx movl $state, %esi .p2align 4,,7 .L5: movslq %ecx,%rdx mov %edi, %eax addl $4, %ecx movl S(,%rax,4), %r11d xorl state(,%rdx,4), %r11d mov %r11d, %r10d <====== Why? movl %r11d, state(,%rdx,4) movl S(,%r10,4), %r9d xorl state+4(,%rdx,4), %r9d mov %r9d, %r8d <====== Why? movl %r9d, 4(%rsi,%rdx,4) movl S(,%r8,4), %edi xorl state+8(,%rdx,4), %edi mov %edi, %eax <====== Why? movl %edi, 8(%rsi,%rdx,4) movl S(,%rax,4), %eax xorl state+12(,%rdx,4), %eax cmpl $15, %ecx movl %eax, 12(%rsi,%rdx,4) movl %eax, %edi jle .L5 andl $255, %eax ret
The patch listed here might help: <http://gcc.gnu.org/ml/gcc/2004-09/msg00377.html>.
The funny instruction is a zero extension done via a 32-bit move. tree-ssa-ivopts should promote the induction variable to a 64-bit counter for sure.
Actually this can be reproduced without an IV.
ivopts gets rid of the extends once the record_bounds pass is enabled by the following patch. The nonsensical "mov %eax, %eax" type moves however still remain in the code. Zdenek Index: tree-optimize.c =================================================================== RCS file: /cvs/gcc/gcc/gcc/tree-optimize.c,v retrieving revision 2.45 diff -c -3 -p -r2.45 tree-optimize.c *** tree-optimize.c 9 Sep 2004 20:53:36 -0000 2.45 --- tree-optimize.c 11 Sep 2004 19:44:56 -0000 *************** init_tree_optimization_passes (void) *** 392,397 **** --- 392,398 ---- NEXT_PASS (pass_loop_init); NEXT_PASS (pass_lim); NEXT_PASS (pass_iv_canon); + NEXT_PASS (pass_record_bounds); NEXT_PASS (pass_if_conversion); NEXT_PASS (pass_vectorize); NEXT_PASS (pass_linear_transform); Index: tree-pass.h =================================================================== RCS file: /cvs/gcc/gcc/gcc/tree-pass.h,v retrieving revision 2.15 diff -c -3 -p -r2.15 tree-pass.h *** tree-pass.h 9 Sep 2004 20:53:37 -0000 2.15 --- tree-pass.h 11 Sep 2004 19:44:56 -0000 *************** extern struct tree_opt_pass pass_loop; *** 126,131 **** --- 126,132 ---- extern struct tree_opt_pass pass_loop_init; extern struct tree_opt_pass pass_lim; extern struct tree_opt_pass pass_iv_canon; + extern struct tree_opt_pass pass_record_bounds; extern struct tree_opt_pass pass_if_conversion; extern struct tree_opt_pass pass_vectorize; extern struct tree_opt_pass pass_complete_unroll; Index: tree-ssa-loop.c =================================================================== RCS file: /cvs/gcc/gcc/gcc/tree-ssa-loop.c,v retrieving revision 2.17 diff -c -3 -p -r2.17 tree-ssa-loop.c *** tree-ssa-loop.c 8 Sep 2004 15:28:56 -0000 2.17 --- tree-ssa-loop.c 11 Sep 2004 19:44:57 -0000 *************** struct tree_opt_pass pass_iv_canon = *** 263,268 **** --- 263,297 ---- 0 /* letter */ }; + /* Record bounds on numbers of iterations of loops. 
*/ + + static void + tree_ssa_loop_bounds (void) + { + if (!current_loops) + return; + + estimate_numbers_of_iterations (current_loops); + scev_reset (); + } + + struct tree_opt_pass pass_record_bounds = + { + "bounds", /* name */ + NULL, /* gate */ + tree_ssa_loop_bounds, /* execute */ + NULL, /* sub */ + NULL, /* next */ + 0, /* static_pass_number */ + 0, /* tv_id */ + PROP_cfg | PROP_ssa, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + 0, /* todo_flags_finish */ + 0 /* letter */ + }; + /* Complete unrolling of loops. */ static void
Mainline today produces: foo: .LFB2: xorl %eax, %eax movl $4, %ecx movl $state, %edx .p2align 4,,7 .L2: mov %eax, %eax movl S(,%rax,4), %eax xorl (%rdx), %eax movl %eax, (%rdx) mov %eax, %eax movl S(,%rax,4), %eax xorl 4(%rdx), %eax movl %eax, 4(%rdx) mov %eax, %eax movl S(,%rax,4), %eax xorl 8(%rdx), %eax movl %eax, 8(%rdx) mov %eax, %eax movl S(,%rax,4), %eax xorl 12(%rdx), %eax movl %eax, 12(%rdx) addq $16, %rdx decl %ecx jne .L2 andl $255, %eax ret
Looking at the mainline result, I still see "mov %eax, %eax", which was the original bug report for mainline.
Confirmed again.
Uhm, right. OK then, lemme have a look.
My first suspicion was the `register' keyword, but that's not it: -------------------------------------------------- extern unsigned int S[]; extern unsigned int state[]; unsigned int foo () { unsigned int t; int j; t=0; for (j=0; j<16; j+=4) { t = state[j + 0] ^= S[t]; t = state[j + 1] ^= S[t]; } return t; } -------------------------------------------------- --> -------------------------------------------------- foo: .LFB2: xorl %eax, %eax movl $4, %ecx movl $state, %edx .p2align 4,,7 .L2: mov %eax, %eax movl S(,%rax,4), %eax xorl (%rdx), %eax movl %eax, (%rdx) mov %eax, %eax <-- bah! movl S(,%rax,4), %eax xorl 4(%rdx), %eax movl %eax, 4(%rdx) addq $16, %rdx decl %ecx jne .L2 rep ; ret .LFE2: --------------------------------------------------
GCC can't see that this is a NOP because of the zero_extend: #(insn 34 30 37 (set (reg:DI 0 ax [orig:76 D.1460 ] [76]) # (zero_extend:DI (reg:SI 0 ax [orig:70 D.1460 ] [70]))) 111 {zero_extendsidi2_rex64} (insn_list:REG_DEP_ANTI 30 (insn_list:REG_DEP_TRUE 29 (nil))) # (nil)) mov %eax, %eax # D.1460, D.1460 # 34 zero_extendsidi2_rex64/1 [length = 3] Perhaps we should have a peephole2 for this. I'm curious why we can not eliminate the move earlier on, though.
GCC doesn't know/remember that movl S(,%rax,4), %eax will zero-extend to 64 bits. I don't know whether you can touch only the lower 32 bits.
Steven, you do realize this is essentially unfixable without a new pass that optimally places widened operations, don't you?
Subject: Re: Redundant instructions in loop optimization On Thursday 27 January 2005 07:05, rth at gcc dot gnu dot org wrote: > ------- Additional Comments From rth at gcc dot gnu dot org 2005-01-27 > 06:05 ------- Steven, you do realize this is essentially unfixable without > a new pass that optimially places widened operations, don't you? No, I didn't :-) So you suggest closing this as SUSPEND?
More knowledgeable sources than me say: "[mov %eax, %eax ] is not nop. 32bit operations implicitly zero extend, so this is zero extension. There is no movqzx."
moron alert
/me should read up on the amd64 instruction set first :-(
(In reply to comment #13) > No, I didn't :-) > > So you suggest closing this as SUSPEND? Yes. But note that IBM Haifa is going to submit a pass for this, and the approach has been outlined before.
New patch was posted: http://gcc.gnu.org/ml/gcc-patches/2005-09/msg01769.html
Even with the new patch http://gcc.gnu.org/ml/gcc-patches/2006-02/msg01994.html I still got the same result. The new see pass won't touch (insn:HI 13 9 14 2 (set (reg/v:SI 73 [ t ]) (mem/s:SI (symbol_ref:DI ("state") [flags 0x40] <var_decl 0x2a98541420 state>) [3 state+0 S4 A32])) 40 {*movsi_1} (nil) (nil)) (insn:HI 14 13 16 2 (parallel [ (set (reg/v:SI 73 [ t ]) (xor:SI (mem/s:SI (symbol_ref:DI ("S") [flags 0x40] <var_decl 0x2a985412c0 S>) [3 S+0 S4 A32]) (reg/v:SI 73 [ t ]))) (clobber (reg:CC 17 flags)) ]) 340 {*xorsi_1} (insn_list:REG_DEP_TRUE 13 (nil)) (expr_list:REG_UNUSED (reg:CC 17 flags) (expr_list:REG_EQUAL (xor:SI (mem/s:SI (symbol_ref:DI ("S") [flags 0x40] <var_decl 0x2a985412c0 S>) [3 S+0 S4 A32]) (mem/s:SI (symbol_ref:DI ("state") [flags 0x40] <var_decl 0x2a98541420 state>) [3 state+0 S4 A32])) (nil)))) (insn:HI 16 14 18 2 (set (mem/s:SI (symbol_ref:DI ("state") [flags 0x40] <var_decl 0x2a98541420 state>) [3 state+0 S4 A32]) (reg/v:SI 73 [ t ])) 40 {*movsi_1} (insn_list:REG_DEP_TRUE 14 (nil)) (nil)) (insn:HI 18 16 21 2 (set (reg:DI 79 [ t ]) (zero_extend:DI (reg/v:SI 73 [ t ]))) 111 {zero_extendsidi2_rex64} (nil) (expr_list:REG_DEAD (reg/v:SI 73 [ t ]) (nil)))
It is my intention to fix see.c to work on x86* hardware, so I'm taking this bug.
Collection of important related links: http://gcc.gnu.org/ml/gcc-patches/2006-04/msg00766.html http://gcc.gnu.org/bugzilla/show_bug.cgi?id=27437#c5
With the current implementation of SEE it is almost impossible to make it work on x86. You have to take into account the liveness of the flags register, and there currently is no way to include that in the dataflow equations. Maybe someone else knows how to do this...
> Gcc doesn't know/remember > > movl S(,%rax,4), %eax > > will zero extend to 64bit. I don't know you can touch only the lower > 32bit bits. This could be fixed by LOAD_EXTEND_OP, right?
I tried: --- config/i386/i386.h.zero 2009-02-18 08:42:40.000000000 -0800 +++ config/i386/i386.h 2009-02-18 13:16:26.000000000 -0800 @@ -1940,6 +1940,11 @@ do { \ is done just by pretending it is already truncated. */ #define TRULY_NOOP_TRUNCATION(OUTPREC, INPREC) 1 +/* When in 64-bit mode, move insns will zero extend SImode. All other + references are unknown. */ +#define LOAD_EXTEND_OP(MODE) \ + (TARGET_64BIT && (MODE) == SImode ? ZERO_EXTEND : UNKNOWN) + /* A macro to update M and UNSIGNEDP when an object whose type is TYPE and which has the specified mode and signedness is to be stored in a register. This macro is only called when TYPE is a It makes no difference.
All 32bit load insns are zero extended to 64bit, not just move.
*** Bug 34653 has been marked as a duplicate of this bug. ***
Fixed by revision 159342: http://gcc.gnu.org/ml/gcc-cvs/2010-05/msg00394.html
Delivery has failed to these recipients or distribution lists: xuepeng.guo@intel.com<mailto:xuepeng.guo@intel.com> The recipient's mailbox is full and can't accept messages now. Microsoft Exchange will not try to redeliver this message for you. Please try resending this message later, or contact the recipient directly. ________________________________ Sent by Microsoft Exchange Server 2007 Diagnostic information for administrators: Generating server: ccr.corp.intel.com xuepeng.guo@intel.com #550 5.2.2 STOREDRV.Deliver: mailbox full. The following information should help identify the cause: "MapiExceptionShutoffQuotaExceeded:16.18969:8C000000, 17.27161:0000000094000000000000000000000000000000, 255.23226:00000000, 255.27962:FE000000, 255.17082:DD040000, 0.26937:94000000, 4.21921:DD040000, 255.27962:FA000000, 255.1494:00000000, 255.26426:FE000000, 4.7588:0F010480, 4.6564:0F010480, 4.4740:05000780, 4.6276:05000780, 4.5721:DD040000, 4.6489:DD040000, 4.2199:DD040000, 4.17097:DD040000, 4.8620:DD040000, 255.1750:EC030000, 0.26849:0F010480, 255.21817:DD040000, 0.26297:0F010480, 4.16585:DD040000, 0.32441:DD040000, 4.1706:DD040000, 0.24761:DD040000, 4.20665:DD040000, 0.25785:2F000000, 4.29881:DD040000". 
## Original message headers: Received: from rrsmsx602.amr.corp.intel.com (10.31.0.33) by SHSMSX602.ccr.corp.intel.com (10.239.4.104) with Microsoft SMTP Server (TLS) id 8.2.254.0; Wed, 2 Feb 2011 04:38:11 +0800 Received: from azsmga001.ch.intel.com (10.2.17.19) by rrsmsx602-1.rr.intel.com (10.31.0.33) with Microsoft SMTP Server id 8.2.254.0; Tue, 1 Feb 2011 13:38:09 -0700 Received: from azsmga101.ch.intel.com ([10.2.16.36]) by azsmga001-1.ch.intel.com with ESMTP; 01 Feb 2011 12:38:06 -0800 X-IronPort-Anti-Spam-Filtered: true X-IronPort-Anti-Spam-Result: AooAAED+R03RhLSDkWdsb2JhbACEF5JPjjkBAQEJCwoHEQUfrQCQPYEngzd1BIwj X-IronPort-AV: E=Sophos;i="4.60,411,1291622400"; d="scan'208";a="709060835" Received: from server1.sourceware.org (HELO sourceware.org) ([209.132.180.131]) by mga03.intel.com with SMTP; 01 Feb 2011 12:38:06 -0800 Received: (qmail 28249 invoked by uid 22791); 1 Feb 2011 20:38:06 -0000 X-SWARE-Spam-Status: No, hits=-2.7 required=5.0 tests=ALL_TRUSTED,AWL,BAYES_00 X-Spam-Status: No, hits=-2.7 required=5.0 tests=ALL_TRUSTED,AWL,BAYES_00 X-Spam-Check-By: sourceware.org Received: from localhost (HELO gcc.gnu.org) (127.0.0.1) by sourceware.org (qpsmtpd/0.43rc1) with ESMTP; Tue, 01 Feb 2011 20:38:02 +0000 From: pinskia at gcc dot gnu.org <gcc-bugzilla@gcc.gnu.org> To: <xuepeng.guo@intel.com> Subject: [Bug rtl-optimization/17387] Redundant zero extension instructions in loop optimization X-Bugzilla-Reason: CC X-Bugzilla-Type: changed X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: rtl-optimization X-Bugzilla-Keywords: missed-optimization X-Bugzilla-Severity: enhancement X-Bugzilla-Who: pinskia at gcc dot gnu.org X-Bugzilla-Status: RESOLVED X-Bugzilla-Priority: P2 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: 4.6.0 X-Bugzilla-Changed-Fields: CC Message-ID: <bug-17387-15152-Pem3O8aLw6@http.gcc.gnu.org/bugzilla/> In-Reply-To: <bug-17387-15152@http.gcc.gnu.org/bugzilla/> References: 
<bug-17387-15152@http.gcc.gnu.org/bugzilla/> X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated Content-Type: text/plain; charset="UTF-8" MIME-Version: 1.0 Date: Tue, 1 Feb 2011 20:38:01 +0000 Return-Path: gcc-bugzilla@gcc.gnu.org
Delivery has failed to these recipients or distribution lists: Joey.ye@intel.com<mailto:Joey.ye@intel.com> The recipient's mailbox is full and can't accept messages now. Microsoft Exchange will not try to redeliver this message for you. Please try resending this message later, or contact the recipient directly. ________________________________ Sent by Microsoft Exchange Server 2007 Diagnostic information for administrators: Generating server: ccr.corp.intel.com Joey.ye@intel.com #550 5.2.2 STOREDRV.Deliver: mailbox full. The following information should help identify the cause: "MapiExceptionShutoffQuotaExceeded:16.18969:8C000000, 17.27161:0000000094000000000000000000000000000000, 255.23226:00000000, 255.27962:FE000000, 255.17082:DD040000, 0.26937:0A000000, 4.21921:DD040000, 255.27962:FA000000, 255.1494:94000000, 255.26426:FE000000, 4.7588:0F010480, 4.6564:0F010480, 4.4740:05000780, 4.6276:05000780, 4.5721:DD040000, 4.6489:DD040000, 4.2199:DD040000, 4.17097:DD040000, 4.8620:DD040000, 255.1750:EC030000, 0.26849:0F010480, 255.21817:DD040000, 0.26297:0F010480, 4.16585:DD040000, 0.32441:DD040000, 4.1706:DD040000, 0.24761:DD040000, 4.20665:DD040000, 0.25785:0F010480, 4.29881:DD040000". 
## Original message headers: Received: from orsmsx604.amr.corp.intel.com (10.22.226.87) by shsmsx601.ccr.corp.intel.com (10.239.4.112) with Microsoft SMTP Server (TLS) id 8.2.254.0; Wed, 2 Feb 2011 04:48:29 +0800 Received: from fmsmga001.fm.intel.com (10.253.24.23) by orsmsx604-1.jf.intel.com (10.22.226.87) with Microsoft SMTP Server id 8.2.254.0; Tue, 1 Feb 2011 12:48:27 -0800 Received: from fmsmga102.fm.intel.com ([10.1.193.69]) by fmsmga001-1.fm.intel.com with ESMTP; 01 Feb 2011 12:48:27 -0800 X-IronPort-Anti-Spam-Filtered: true X-IronPort-Anti-Spam-Result: AooAAG0ASE3RhLSDkWdsb2JhbACEF5JPjjkBAQEJCwoHEQUfrQCQO4Engzd1BIwj X-IronPort-AV: E=Sophos;i="4.60,411,1291622400"; d="scan'208";a="1155131181" Received: from server1.sourceware.org (HELO sourceware.org) ([209.132.180.131]) by mga11.intel.com with SMTP; 01 Feb 2011 12:48:27 -0800 Received: (qmail 3519 invoked by uid 22791); 1 Feb 2011 20:48:27 -0000 X-SWARE-Spam-Status: No, hits=-2.7 required=5.0 tests=ALL_TRUSTED,AWL,BAYES_00 X-Spam-Status: No, hits=-2.7 required=5.0 tests=ALL_TRUSTED,AWL,BAYES_00 X-Spam-Check-By: sourceware.org Received: from localhost (HELO gcc.gnu.org) (127.0.0.1) by sourceware.org (qpsmtpd/0.43rc1) with ESMTP; Tue, 01 Feb 2011 20:48:24 +0000 From: pinskia at gcc dot gnu.org <gcc-bugzilla@gcc.gnu.org> To: <Joey.ye@intel.com> Subject: [Bug rtl-optimization/17387] Redundant zero extension instructions in loop optimization X-Bugzilla-Reason: CC X-Bugzilla-Type: changed X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: rtl-optimization X-Bugzilla-Keywords: missed-optimization X-Bugzilla-Severity: enhancement X-Bugzilla-Who: pinskia at gcc dot gnu.org X-Bugzilla-Status: RESOLVED X-Bugzilla-Priority: P2 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: 4.6.0 X-Bugzilla-Changed-Fields: CC Message-ID: <bug-17387-14503-GLv9NmIyjX@http.gcc.gnu.org/bugzilla/> In-Reply-To: <bug-17387-14503@http.gcc.gnu.org/bugzilla/> References: 
<bug-17387-14503@http.gcc.gnu.org/bugzilla/> X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated Content-Type: text/plain; charset="UTF-8" MIME-Version: 1.0 Date: Tue, 1 Feb 2011 20:48:23 +0000 Return-Path: gcc-bugzilla@gcc.gnu.org