Created attachment 22343 [details] preprocessed file When compiling the following function for ia64: int nor(char* __restrict__ c, char* __restrict__ d) { int i, sum = 0; for (i = 0; i < 256; i++) d[i] = c[i]; return sum; } Before sched1 we have: (insn 91 329 330 4 (set (mem:V8QI (reg/v/f:DI 435 [ d ]) [0 MEM[(char *)d_43]+0 S8 A64]) (reg:V8QI 430 [ vect_var_?25 ])) a.c:5 384 {*movv8qi_internal} (expr_list:REG_DEAD (reg:V8QI 430 [ vect_var_?25 ]) (nil))) (insn 330 91 253 4 (set (reg/f:DI 450) (plus:DI (reg/v/f:DI 435 [ d ]) (const_int 8 [0x8]))) a.c:5 205 {adddi3} (nil)) (insn 253 330 254 4 (set (reg:V8QI 455 [ vect_var_?25 ]) (mem:V8QI (reg:DI 449) [0 MEM[(char *)D.2084_33]+0 S8 A64])) a.c:5 384 {*movv8qi_internal} (nil)) Insn 91 is a store and insn 253 is a load from a different location (two different restrict pointers). These insns should not have a dependency between them, but we can see in sched1 that they do: ;; ====================================================== ;; -- basic block 4 from 89 to 340 -- before reload ;; ====================================================== ;; --------------- forward dependences: ------------ ;; --- Region Dependences --- b 4 bb 0 ;; insn code bb dep prio cost reservation ;; ---- ---- -- --- ---- ---- ----------- ;; 89 384 4 0 1 1 2_M_only_um01 : 340 321 320 310 299 288 277 266 255 91 ;; 329 205 4 0 3 1 2_A : 340 320 254 253 ;; 91 384 4 1 0 1 2_M_only_um23 : 340 322 321 319 310 308 299 297 288 286 277 275 266 264 255 253 ;; 330 205 4 0 2 1 2_A : 340 322 256 255 ;; 253 384 4 2 1 1 2_M_only_um01 : 340 321 310 299 288 277 266 255 ;; 254 205 4 1 2 1 2_A : 340 264 ;; 255 384 4 4 0 1 2_M_only_um23 : 340 321 319 310 308 299 297 288 286 277 275 266 264 A similar bug was fixed a few months ago (http://gcc.gnu.org/bugzilla/show_bug.cgi?id=44479). It might be connected. Using built-in specs. 
COLLECT_GCC=./xgcc Target: ia64-linux-elf Configured with: Thread model: single gcc version 4.6.0 20101106 (experimental) (GCC) COLLECT_GCC_OPTIONS='-v' '-save-temps' '-O3' '-funroll-loops' '-fdump-rtl-all' '-fsched-verbose=8' cc1 -E -quiet -v -iprefix /home/swproj/sw/users/eyalhar/ia64-new/gcc/../lib/gcc/ia64-linux-elf/4.6.0/ a.c -funroll-loops -fdump-rtl-all -fsched-verbose=8 -O3 -fpch-preprocess -o a.i ignoring nonexistent directory "/home/swproj/sw/users/eyalhar/ia64-new/gcc/../lib/gcc/ia64-linux-elf/4.6.0/include" ignoring nonexistent directory "/home/swproj/sw/users/eyalhar/ia64-new/gcc/../lib/gcc/ia64-linux-elf/4.6.0/include-fixed" ignoring nonexistent directory "/home/swproj/sw/users/eyalhar/ia64-new/gcc/../lib/gcc/ia64-linux-elf/4.6.0/../../../../ia64-linux-elf/sys-include" ignoring nonexistent directory "/home/swproj/sw/users/eyalhar/ia64-new/gcc/../lib/gcc/ia64-linux-elf/4.6.0/../../../../ia64-linux-elf/include" ignoring nonexistent directory "/home/swproj/sw/users/eyalhar/ia64-new/gcc/../lib/gcc/../../lib/gcc/ia64-linux-elf/4.6.0/include" ignoring nonexistent directory "/home/swproj/sw/users/eyalhar/ia64-new/gcc/../lib/gcc/../../lib/gcc/ia64-linux-elf/4.6.0/include-fixed" ignoring nonexistent directory "/home/swproj/sw/users/eyalhar/ia64-new/gcc/../lib/gcc/../../lib/gcc/ia64-linux-elf/4.6.0/../../../../ia64-linux-elf/sys-include" ignoring nonexistent directory "/home/swproj/sw/users/eyalhar/ia64-new/gcc/../lib/gcc/../../lib/gcc/ia64-linux-elf/4.6.0/../../../../ia64-linux-elf/include" #include "..." search starts here: #include <...> search starts here: End of search list. 
COLLECT_GCC_OPTIONS='-v' '-save-temps' '-O3' '-funroll-loops' '-fdump-rtl-all' '-fsched-verbose=8' cc1 -fpreprocessed a.i -quiet -dumpbase a.c -auxbase a -O3 -version -funroll-loops -fdump-rtl-all -fsched-verbose=8 -o a.s GNU C (GCC) version 4.6.0 20101106 (experimental) (ia64-linux-elf) compiled by GNU C version 4.1.2 20080704 (Red Hat 4.1.2-44), GMP version 4.3.2, MPFR version 2.4.2, MPC version 0.8.1 GGC heuristics: --param ggc-min-expand=30 --param ggc-min-heapsize=4096 GNU C (GCC) version 4.6.0 20101106 (experimental) (ia64-linux-elf) compiled by GNU C version 4.1.2 20080704 (Red Hat 4.1.2-44), GMP version 4.3.2, MPFR version 2.4.2, MPC version 0.8.1 GGC heuristics: --param ggc-min-expand=30 --param ggc-min-heapsize=4096 Compiler executable checksum: 48128c38ed97d38ae5641e25d3ca761e COLLECT_GCC_OPTIONS='-v' '-save-temps' '-O3' '-funroll-loops' '-fdump-rtl-all' '-fsched-verbose=8' as -N so -o a.o a.s
On x86_64 restrict information is there (-fdump-tree-optimized-alias): <bb 5>: # PT = nonlocal { PARM_RESTRICT.0 } (restr) vect_p.27_71 = c_8(D) + prolog_loop_niters.16_27; # PT = nonlocal { PARM_RESTRICT.1 } (restr) vect_p.32_75 = d_5(D) + prolog_loop_niters.16_27; <bb 6>: # ivtmp.52_46 = PHI <ivtmp.52_31(6), 0(5)> # ivtmp.55_47 = PHI <ivtmp.55_33(6), 0(5)> vect_var_.28_74 = MEM[base: vect_p.27_71, index: ivtmp.55_47, offset: 0B]; MEM[base: vect_p.32_75, index: ivtmp.55_47, offset: 0B] = vect_var_.28_74; ivtmp.52_31 = ivtmp.52_46 + 1; ivtmp.55_33 = ivtmp.55_47 + 16; if (ivtmp.52_31 < bnd.20_49) goto <bb 6>; else goto <bb 7>;
It seems to me that the restrict information is also there on ia64, but the dependencies are still wrong: ;; Function nor (nor) nor (char * restrict c, char * restrict d) { long unsigned int D.2085; long unsigned int D.2086; vector(8) char * restrict D.2087; void * D.2084; long unsigned int ivtmp?52; void * D.2071; void * D.2070; long unsigned int ivtmp?41; long unsigned int ivtmp?38; vector(8) char vect_var_?25; long int andmask?20; long int orptrs1?19; long int addr2int1?18; long int addr2int0?14; int i; char D.2021; <bb 2>: addr2int0?14_22 = (long int) c_8(D); addr2int1?18_24 = (long int) d_5(D); orptrs1?19_25 = addr2int1?18_24 | addr2int0?14_22; andmask?20_26 = orptrs1?19_25 & 7; if (andmask?20_26 == 0) goto <bb 3>; else goto <bb 5>; <bb 3>: ivtmp?52_37 = (long unsigned int) c_8(D); D.2085_32 = (long unsigned int) d_5(D); D.2086_48 = D.2085_32 + 256; D.2087_49 = (vector(8) char * restrict) D.2086_48; <bb 4>: # PT = nonlocal { PARM_RESTRICT?1 } (restr) # ALIGN = 8, MISALIGN = 0 # d_43 = PHI <d_44(4), d_5(D)(3)> # ivtmp?52_31 = PHI <ivtmp?52_30(4), ivtmp?52_37(3)> # PT = nonlocal { PARM_RESTRICT?0 } (restr) # ALIGN = 8, MISALIGN = 0 D.2084_33 = (void *) ivtmp?52_31; vect_var_?25_41 = MEM[(char *)D.2084_33]; ivtmp?52_30 = ivtmp?52_31 + 8; MEM[(char *)d_43] = vect_var_?25_41; # PT = nonlocal { PARM_RESTRICT?1 } (restr) d_44 = d_43 + 8; if (d_44 != D.2087_49) goto <bb 4>; else goto <bb 7>; <bb 5>: ivtmp?38_4 = (long unsigned int) c_8(D); ivtmp?41_10 = (long unsigned int) d_5(D); <bb 6>: # i_28 = PHI <0(5), i_36(6)> # ivtmp?38_19 = PHI <ivtmp?38_4(5), ivtmp?38_17(6)> # ivtmp?41_6 = PHI <ivtmp?41_10(5), ivtmp?41_9(6)> # PT = nonlocal { PARM_RESTRICT?0 } (restr) D.2070_11 = (void *) ivtmp?38_19; D.2021_34 = MEM[(char *)D.2070_11]; ivtmp?38_17 = ivtmp?38_19 + 1; # PT = nonlocal { PARM_RESTRICT?1 } (restr) D.2071_13 = (void *) ivtmp?41_6; MEM[(char *)D.2071_13] = D.2021_34; ivtmp?41_9 = ivtmp?41_6 + 1; i_36 = i_28 + 1; if (i_36 != 256) goto <bb 6>; else goto <bb 7>; <bb 7>: return 0; }
It also happens on i386 with ./cc1 -O3 a.c -fdump-rtl-all -funroll-loops -fsched-verbose=8 -fschedule-insns: In asmcons: (insn 16 15 17 4 a.c:5 (set (reg:SI 72 [ vect_var_.26 ]) (mem:SI (plus:SI (reg/v/f:SI 74 [ c ]) (reg:SI 70 [ ivtmp.42 ])) [0 MEM[(char * restrict)vect_p.22]+0 S4 A32])) 50 {*movsi_internal} (nil)) (insn 17 16 134 4 a.c:5 (set (mem:SI (plus:SI (reg/v/f:SI 75 [ d ]) (reg:SI 70 [ ivtmp.42 ])) [0 MEM[(char * restrict)vect_p.27]+0 S4 A32]) (reg:SI 72 [ vect_var_.26 ])) 50 {*movsi_internal} (expr_list:REG_DEAD (reg:SI 72 [ vect_var_.26 ]) (nil))) (insn 134 17 135 4 a.c:5 (set (reg:SI 82 [ vect_var_.26 ]) (mem:SI (plus:SI (plus:SI (reg/v/f:SI 74 [ c ]) (reg:SI 70 [ ivtmp.42 ])) (const_int 4 [0x4])) [0 MEM[(char * restrict)vect_p.22]+0 S4 A32])) 50 {*movsi_internal} (nil)) and in sched1: ;; ====================================================== ;; -- basic block 4 from 16 to 198 -- before reload ;; ====================================================== ;; --------------- forward dependences: ------------ ;; --- Region Dependences --- b 4 bb 0 ;; insn code bb dep prio cost reservation ;; ---- ---- -- --- ---- ---- ----------- ;; 16 50 4 0 35 4 decodern,p2 : 198 196 195 185 175 165 155 145 135 17 ;; 17 50 4 1 31 1 decoder0,(p4+p3) : 198 196 194 184 174 164 154 144 134 ;; 134 50 4 1 31 4 decodern,p2 : 198 196 195 185 175 165 155 145 135 ;; 135 50 4 2 27 1 decoder0,(p4+p3) : 198 196 194 184 174 164 154 144 There should not be any dependency between 17 (store) and 134 (load). BTW, I failed to build i386 from the current snapshot, so I used an old one. (../../gcc-4.6-20101106/gcc/config/i386/bdver1.md:528: unknown mode `V4DF' ../../gcc-4.6-20101106/gcc/config/i386/bdver1.md:528: following context is `0 "register_operand")' )
I suspect this has long been fixed.
It is. Slightly altered testcase (to avoid unrolling on GIMPLE), with -fno-schedule-insns2 on x86_64: .L2: movdqu (%rdi,%rax), %xmm1 paddb %xmm0, %xmm1 movups %xmm1, (%rsi,%rax) movdqu 16(%rdi,%rax), %xmm2 paddb %xmm0, %xmm2 movups %xmm2, 16(%rsi,%rax) movdqu 32(%rdi,%rax), %xmm3 paddb %xmm0, %xmm3 movups %xmm3, 32(%rsi,%rax) movdqu 48(%rdi,%rax), %xmm4 paddb %xmm0, %xmm4 movups %xmm4, 48(%rsi,%rax) movdqu 64(%rdi,%rax), %xmm5 paddb %xmm0, %xmm5 movups %xmm5, 64(%rsi,%rax) movdqu 80(%rdi,%rax), %xmm6 paddb %xmm0, %xmm6 movups %xmm6, 80(%rsi,%rax) movdqu 96(%rdi,%rax), %xmm7 paddb %xmm0, %xmm7 movups %xmm7, 96(%rsi,%rax) movdqu 112(%rdi,%rax), %xmm8 paddb %xmm0, %xmm8 movups %xmm8, 112(%rsi,%rax) subq $-128, %rax cmpq $4096, %rax jne .L2 and without: .L2: movdqu (%rdi,%rax), %xmm1 movdqu 16(%rdi,%rax), %xmm2 movdqu 32(%rdi,%rax), %xmm3 movdqu 48(%rdi,%rax), %xmm4 movdqu 64(%rdi,%rax), %xmm5 paddb %xmm0, %xmm1 paddb %xmm0, %xmm2 movdqu 80(%rdi,%rax), %xmm6 movdqu 96(%rdi,%rax), %xmm7 paddb %xmm0, %xmm3 paddb %xmm0, %xmm4 movups %xmm1, (%rsi,%rax) movdqu 112(%rdi,%rax), %xmm8 paddb %xmm0, %xmm5 paddb %xmm0, %xmm6 movups %xmm2, 16(%rsi,%rax) paddb %xmm0, %xmm7 movups %xmm3, 32(%rsi,%rax) paddb %xmm0, %xmm8 movups %xmm4, 48(%rsi,%rax) movups %xmm5, 64(%rsi,%rax) movups %xmm6, 80(%rsi,%rax) movups %xmm7, 96(%rsi,%rax) movups %xmm8, 112(%rsi,%rax) subq $-128, %rax cmpq $4096, %rax jne .L2 and that's only possible if this dependence is not visible. GCC 4.3 still shows the problem; GCC 7 doesn't.