This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug tree-optimization/66003] New: missed cse opportunity in addr expressions because of tree pre/lim
- From: "amker at gcc dot gnu.org" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Mon, 04 May 2015 08:32:32 +0000
- Subject: [Bug tree-optimization/66003] New: missed cse opportunity in addr expressions because of tree pre/lim
- Auto-submitted: auto-generated
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66003
Bug ID: 66003
Summary: missed cse opportunity in addr expressions because of
tree pre/lim
Product: gcc
Version: 6.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: amker at gcc dot gnu.org
Target Milestone: ---
The simple test case below is reduced from SPEC:
typedef struct
{
int x;
int y;
} coord;
extern unsigned short **org;
extern coord *c;
void bar (unsigned short *ptr);
void foo (int s, int n)
{
unsigned short arr[256], *ptr = arr;
int x, y;
for (y = c->y; y < c->y + 16; y++)
for (x = c->x; x < c->x + 16; x++)
*ptr++ = org [y][x];
bar (ptr);
}
When compiling with the two command lines below:
A: $gcc -Ofast -S test.c -o x.S
B: $gcc -Ofast -S test.c -o y.S -fno-tree-pre -fno-tree-loop-im
The assembly difference is as below:
$ diff x.S y.S
12,14c12,34
< subq $520, %rsp
< .cfi_def_cfa_offset 528
< movq c(%rip), %rdx
---
> pushq %r15
> .cfi_def_cfa_offset 16
> .cfi_offset 15, -16
> pushq %r14
> .cfi_def_cfa_offset 24
> .cfi_offset 14, -24
> pushq %r13
> .cfi_def_cfa_offset 32
> .cfi_offset 13, -32
> pushq %r12
> .cfi_def_cfa_offset 40
> .cfi_offset 12, -40
> pushq %rbp
> .cfi_def_cfa_offset 48
> .cfi_offset 6, -48
> pushq %rbx
> .cfi_def_cfa_offset 56
> .cfi_offset 3, -56
> subq $568, %rsp
> .cfi_def_cfa_offset 624
> movq c(%rip), %rax
> movslq (%rax), %rsi
> movslq 4(%rax), %rdx
16,20c36,58
< movslq 4(%rdx), %rcx
< leaq (%rax,%rcx,8), %rsi
< movslq (%rdx), %rcx
< movq %rsp, %rax
< addq %rcx, %rcx
---
> addq %rsi, %rsi
> leaq 24(%rsi), %rcx
> leaq 22(%rsi), %rdi
> leaq 2(%rsi), %r15
> leaq 4(%rsi), %r14
> leaq 6(%rsi), %r13
> leaq 8(%rsi), %r12
> movq %rcx, 8(%rsp)
> leaq 26(%rsi), %rcx
> leaq 10(%rsi), %rbp
> leaq 12(%rsi), %rbx
> leaq 14(%rsi), %r11
> leaq 16(%rsi), %r10
> movq %rcx, 16(%rsp)
> leaq 28(%rsi), %rcx
> leaq 18(%rsi), %r9
> leaq 20(%rsi), %r8
> movq %rdi, 40(%rsp)
> movq %rcx, 24(%rsp)
> leaq 30(%rsi), %rcx
> movq %rcx, 32(%rsp)
> leaq (%rax,%rdx,8), %rcx
> leaq 48(%rsp), %rax
24c62
< movq (%rsi), %rdx
---
> movq (%rcx), %rdx
26,27c64,65
< addq $8, %rsi
< movzwl (%rdx,%rcx), %edi
---
> addq $8, %rcx
> movzwl (%rdx,%rsi), %edi
29c67
< movzwl 2(%rdx,%rcx), %edi
---
> movzwl (%rdx,%r15), %edi
31c69
< movzwl 4(%rdx,%rcx), %edi
---
> movzwl (%rdx,%r14), %edi
33c71
< movzwl 6(%rdx,%rcx), %edi
---
> movzwl (%rdx,%r13), %edi
35c73
< movzwl 8(%rdx,%rcx), %edi
---
> movzwl (%rdx,%r12), %edi
37c75
< movzwl 10(%rdx,%rcx), %edi
---
> movzwl (%rdx,%rbp), %edi
39c77
< movzwl 12(%rdx,%rcx), %edi
---
> movzwl (%rdx,%rbx), %edi
41c79
< movzwl 14(%rdx,%rcx), %edi
---
> movzwl (%rdx,%r11), %edi
43c81
< movzwl 16(%rdx,%rcx), %edi
---
> movzwl (%rdx,%r10), %edi
45c83
< movzwl 18(%rdx,%rcx), %edi
---
> movzwl (%rdx,%r9), %edi
47c85
< movzwl 20(%rdx,%rcx), %edi
---
> movzwl (%rdx,%r8), %edi
49c87,88
< movzwl 22(%rdx,%rcx), %edi
---
> movq 40(%rsp), %rdi
> movzwl (%rdx,%rdi), %edi
51c90,91
< movzwl 24(%rdx,%rcx), %edi
---
> movq 8(%rsp), %rdi
> movzwl (%rdx,%rdi), %edi
53c93,94
< movzwl 26(%rdx,%rcx), %edi
---
> movq 16(%rsp), %rdi
> movzwl (%rdx,%rdi), %edi
55c96,97
< movzwl 28(%rdx,%rcx), %edi
---
> movq 24(%rsp), %rdi
> movzwl (%rdx,%rdi), %edi
57c99,100
< movzwl 30(%rdx,%rcx), %edx
---
> movq 32(%rsp), %rdi
> movzwl (%rdx,%rdi), %edx
59c102
< leaq 512(%rsp), %rdx
---
> leaq 560(%rsp), %rdx
64c107,119
< addq $520, %rsp
---
> addq $568, %rsp
> .cfi_def_cfa_offset 56
> popq %rbx
> .cfi_def_cfa_offset 48
> popq %rbp
> .cfi_def_cfa_offset 40
> popq %r12
> .cfi_def_cfa_offset 32
> popq %r13
> .cfi_def_cfa_offset 24
> popq %r14
> .cfi_def_cfa_offset 16
> popq %r15
The tree-pre dump is as below:
<bb 2>:
c.0_8 = c;
y_9 = c.0_8->y;
_47 = y_9 + 15;
pretmp_112 = c.0_8->x;
pretmp_128 = org;
pretmp_144 = (long unsigned int) pretmp_112;
pretmp_159 = pretmp_144 * 2;
pretmp_160 = pretmp_112 + 1;
pretmp_175 = (long unsigned int) pretmp_160;
pretmp_176 = pretmp_175 * 2;
pretmp_191 = pretmp_112 + 2;
pretmp_192 = (long unsigned int) pretmp_191;
pretmp_207 = pretmp_192 * 2;
pretmp_208 = pretmp_112 + 3;
pretmp_223 = (long unsigned int) pretmp_208;
pretmp_224 = pretmp_223 * 2;
pretmp_239 = pretmp_112 + 4;
pretmp_240 = (long unsigned int) pretmp_239;
pretmp_255 = pretmp_240 * 2;
pretmp_256 = pretmp_112 + 5;
pretmp_271 = (long unsigned int) pretmp_256;
pretmp_283 = pretmp_271 * 2;
pretmp_12 = pretmp_112 + 6;
pretmp_50 = (long unsigned int) pretmp_12;
pretmp_51 = pretmp_50 * 2;
pretmp_52 = pretmp_112 + 7;
pretmp_53 = (long unsigned int) pretmp_52;
pretmp_65 = pretmp_53 * 2;
pretmp_66 = pretmp_112 + 8;
pretmp_67 = (long unsigned int) pretmp_66;
pretmp_68 = pretmp_67 * 2;
pretmp_69 = pretmp_112 + 9;
pretmp_81 = (long unsigned int) pretmp_69;
pretmp_82 = pretmp_81 * 2;
pretmp_83 = pretmp_112 + 10;
pretmp_84 = (long unsigned int) pretmp_83;
pretmp_85 = pretmp_84 * 2;
pretmp_97 = pretmp_112 + 11;
pretmp_98 = (long unsigned int) pretmp_97;
pretmp_99 = pretmp_98 * 2;
pretmp_100 = pretmp_112 + 12;
pretmp_101 = (long unsigned int) pretmp_100;
pretmp_113 = pretmp_101 * 2;
pretmp_114 = pretmp_112 + 13;
pretmp_115 = (long unsigned int) pretmp_114;
pretmp_116 = pretmp_115 * 2;
pretmp_117 = pretmp_112 + 14;
pretmp_129 = (long unsigned int) pretmp_117;
pretmp_130 = pretmp_129 * 2;
pretmp_131 = pretmp_112 + 15;
pretmp_132 = (long unsigned int) pretmp_131;
pretmp_133 = pretmp_132 * 2;
<bb 3>:
# ptr_48 = PHI <&arr(2), ptr_272(3)>
# y_64 = PHI <y_9(2), y_25(3)>
_34 = (long unsigned int) y_64;
_35 = _34 * 8;
_36 = pretmp_128 + _35;
_37 = *_36;
_40 = _37 + pretmp_159;
_41 = *_40;
*ptr_48 = _41;
_56 = _37 + pretmp_176;
_57 = *_56;
MEM[(short unsigned int *)ptr_48 + 2B] = _57;
_72 = _37 + pretmp_207;
_73 = *_72;
MEM[(short unsigned int *)ptr_48 + 4B] = _73;
_88 = _37 + pretmp_224;
_89 = *_88;
MEM[(short unsigned int *)ptr_48 + 6B] = _89;
_104 = _37 + pretmp_255;
_105 = *_104;
MEM[(short unsigned int *)ptr_48 + 8B] = _105;
_120 = _37 + pretmp_283;
_121 = *_120;
MEM[(short unsigned int *)ptr_48 + 10B] = _121;
_136 = _37 + pretmp_51;
_137 = *_136;
MEM[(short unsigned int *)ptr_48 + 12B] = _137;
_152 = _37 + pretmp_65;
_153 = *_152;
MEM[(short unsigned int *)ptr_48 + 14B] = _153;
_168 = _37 + pretmp_68;
_169 = *_168;
MEM[(short unsigned int *)ptr_48 + 16B] = _169;
_184 = _37 + pretmp_82;
_185 = *_184;
MEM[(short unsigned int *)ptr_48 + 18B] = _185;
_200 = _37 + pretmp_85;
_201 = *_200;
MEM[(short unsigned int *)ptr_48 + 20B] = _201;
_216 = _37 + pretmp_99;
_217 = *_216;
MEM[(short unsigned int *)ptr_48 + 22B] = _217;
_232 = _37 + pretmp_113;
_233 = *_232;
MEM[(short unsigned int *)ptr_48 + 24B] = _233;
_248 = _37 + pretmp_116;
_249 = *_248;
MEM[(short unsigned int *)ptr_48 + 26B] = _249;
_264 = _37 + pretmp_130;
_265 = *_264;
MEM[(short unsigned int *)ptr_48 + 28B] = _265;
ptr_272 = &MEM[(void *)ptr_48 + 32B];
_280 = _37 + pretmp_133;
_281 = *_280;
MEM[(short unsigned int *)ptr_48 + 30B] = _281;
y_25 = y_64 + 1;
if (y_25 > _47)
goto <bb 4>;
else
goto <bb 3>;
PRE hoists the index part of the address expression "base + (reg + i) * 2" out
of the first loop. This increases register pressure and prevents GCC from using
the powerful addressing modes available on x86.
On other targets such as ARM, only the register-pressure issue may apply.
Both PRE and LIM perform the same transformation.