This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[Bug tree-optimization/66003] New: missed cse opportunity in addr expressions because of tree pre/lim


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66003

            Bug ID: 66003
           Summary: missed cse opportunity in addr expressions because of
                    tree pre/lim
           Product: gcc
           Version: 6.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: amker at gcc dot gnu.org
  Target Milestone: ---

The simple test case below is reduced from SPEC:
/* Pair of integer coordinates.  foo() reads the x and y fields through
   the global pointer c and uses them as the starting column and row of
   its loops.  */
typedef struct
{
  int x;
  int y;
} coord;

/* Externally defined inputs: foo() indexes org as org[y][x] and takes
   the starting x/y of its loops from *c.  */
extern unsigned short **org;
extern coord *c;
/* Externally defined; foo() calls it with the pointer just past the last
   element it stored.  */
void bar (unsigned short *ptr);
/* Reduced reproducer: copy the 16x16 block of org whose first element is
   at row c->y, column c->x into the local array arr, row by row, then
   pass bar() the pointer just past the last element written.  The
   parameters s and n are not used in this function.  */
void foo (int s, int n)
{
  /* 256 elements matches the 16 * 16 stores done by the loops below.  */
  unsigned short arr[256], *ptr = arr;
  int x, y;

  /* As written, both loop conditions re-read c->y / c->x on every test.  */
  for (y = c->y; y < c->y + 16; y++)
    for (x = c->x; x < c->x + 16; x++)
      *ptr++ = org [y][x];

  bar (ptr);
}

When compiling with the two command lines below:
A: $gcc -Ofast -S test.c -o x.S
B: $gcc -Ofast -S test.c -o y.S -fno-tree-pre -fno-tree-loop-im

The difference in the generated assembly is as follows:

$ diff  x.S y.S
12,14c12,34
<       subq    $520, %rsp
<       .cfi_def_cfa_offset 528
<       movq    c(%rip), %rdx
---
> 	pushq	%r15
> 	.cfi_def_cfa_offset 16
> 	.cfi_offset 15, -16
> 	pushq	%r14
> 	.cfi_def_cfa_offset 24
> 	.cfi_offset 14, -24
> 	pushq	%r13
> 	.cfi_def_cfa_offset 32
> 	.cfi_offset 13, -32
> 	pushq	%r12
> 	.cfi_def_cfa_offset 40
> 	.cfi_offset 12, -40
> 	pushq	%rbp
> 	.cfi_def_cfa_offset 48
> 	.cfi_offset 6, -48
> 	pushq	%rbx
> 	.cfi_def_cfa_offset 56
> 	.cfi_offset 3, -56
> 	subq	$568, %rsp
> 	.cfi_def_cfa_offset 624
> 	movq	c(%rip), %rax
> 	movslq	(%rax), %rsi
> 	movslq	4(%rax), %rdx
16,20c36,58
<       movslq  4(%rdx), %rcx
<       leaq    (%rax,%rcx,8), %rsi
<       movslq  (%rdx), %rcx
<       movq    %rsp, %rax
<       addq    %rcx, %rcx
---
> 	addq	%rsi, %rsi
> 	leaq	24(%rsi), %rcx
> 	leaq	22(%rsi), %rdi
> 	leaq	2(%rsi), %r15
> 	leaq	4(%rsi), %r14
> 	leaq	6(%rsi), %r13
> 	leaq	8(%rsi), %r12
> 	movq	%rcx, 8(%rsp)
> 	leaq	26(%rsi), %rcx
> 	leaq	10(%rsi), %rbp
> 	leaq	12(%rsi), %rbx
> 	leaq	14(%rsi), %r11
> 	leaq	16(%rsi), %r10
> 	movq	%rcx, 16(%rsp)
> 	leaq	28(%rsi), %rcx
> 	leaq	18(%rsi), %r9
> 	leaq	20(%rsi), %r8
> 	movq	%rdi, 40(%rsp)
> 	movq	%rcx, 24(%rsp)
> 	leaq	30(%rsi), %rcx
> 	movq	%rcx, 32(%rsp)
> 	leaq	(%rax,%rdx,8), %rcx
> 	leaq	48(%rsp), %rax
24c62
<       movq    (%rsi), %rdx
---
> 	movq	(%rcx), %rdx
26,27c64,65
<       addq    $8, %rsi
<       movzwl  (%rdx,%rcx), %edi
---
> 	addq	$8, %rcx
> 	movzwl	(%rdx,%rsi), %edi
29c67
<       movzwl  2(%rdx,%rcx), %edi
---
> 	movzwl	(%rdx,%r15), %edi
31c69
<       movzwl  4(%rdx,%rcx), %edi
---
> 	movzwl	(%rdx,%r14), %edi
33c71
<       movzwl  6(%rdx,%rcx), %edi
---
> 	movzwl	(%rdx,%r13), %edi
35c73
<       movzwl  8(%rdx,%rcx), %edi
---
> 	movzwl	(%rdx,%r12), %edi
37c75
<       movzwl  10(%rdx,%rcx), %edi
---
> 	movzwl	(%rdx,%rbp), %edi
39c77
<       movzwl  12(%rdx,%rcx), %edi
---
> 	movzwl	(%rdx,%rbx), %edi
41c79
<       movzwl  14(%rdx,%rcx), %edi
---
> 	movzwl	(%rdx,%r11), %edi
43c81
<       movzwl  16(%rdx,%rcx), %edi
---
> 	movzwl	(%rdx,%r10), %edi
45c83
<       movzwl  18(%rdx,%rcx), %edi
---
> 	movzwl	(%rdx,%r9), %edi
47c85
<       movzwl  20(%rdx,%rcx), %edi
---
> 	movzwl	(%rdx,%r8), %edi
49c87,88
<       movzwl  22(%rdx,%rcx), %edi
---
> 	movq	40(%rsp), %rdi
> 	movzwl	(%rdx,%rdi), %edi
51c90,91
<       movzwl  24(%rdx,%rcx), %edi
---
> 	movq	8(%rsp), %rdi
> 	movzwl	(%rdx,%rdi), %edi
53c93,94
<       movzwl  26(%rdx,%rcx), %edi
---
> 	movq	16(%rsp), %rdi
> 	movzwl	(%rdx,%rdi), %edi
55c96,97
<       movzwl  28(%rdx,%rcx), %edi
---
> 	movq	24(%rsp), %rdi
> 	movzwl	(%rdx,%rdi), %edi
57c99,100
<       movzwl  30(%rdx,%rcx), %edx
---
> 	movq	32(%rsp), %rdi
> 	movzwl	(%rdx,%rdi), %edx
59c102
<       leaq    512(%rsp), %rdx
---
> 	leaq	560(%rsp), %rdx
64c107,119
<       addq    $520, %rsp
---
> 	addq	$568, %rsp
> 	.cfi_def_cfa_offset 56
> 	popq	%rbx
> 	.cfi_def_cfa_offset 48
> 	popq	%rbp
> 	.cfi_def_cfa_offset 40
> 	popq	%r12
> 	.cfi_def_cfa_offset 32
> 	popq	%r13
> 	.cfi_def_cfa_offset 24
> 	popq	%r14
> 	.cfi_def_cfa_offset 16
> 	popq	%r15

The tree-pre dump is shown below:

  <bb 2>:
  c.0_8 = c;
  y_9 = c.0_8->y;
  _47 = y_9 + 15;
  pretmp_112 = c.0_8->x;
  pretmp_128 = org;
  pretmp_144 = (long unsigned int) pretmp_112;
  pretmp_159 = pretmp_144 * 2;
  pretmp_160 = pretmp_112 + 1;
  pretmp_175 = (long unsigned int) pretmp_160;
  pretmp_176 = pretmp_175 * 2;
  pretmp_191 = pretmp_112 + 2;
  pretmp_192 = (long unsigned int) pretmp_191;
  pretmp_207 = pretmp_192 * 2;
  pretmp_208 = pretmp_112 + 3;
  pretmp_223 = (long unsigned int) pretmp_208;
  pretmp_224 = pretmp_223 * 2;
  pretmp_239 = pretmp_112 + 4;
  pretmp_240 = (long unsigned int) pretmp_239;
  pretmp_255 = pretmp_240 * 2;
  pretmp_256 = pretmp_112 + 5;
  pretmp_271 = (long unsigned int) pretmp_256;
  pretmp_283 = pretmp_271 * 2;
  pretmp_12 = pretmp_112 + 6;
  pretmp_50 = (long unsigned int) pretmp_12;
  pretmp_51 = pretmp_50 * 2;
  pretmp_52 = pretmp_112 + 7;
  pretmp_53 = (long unsigned int) pretmp_52;
  pretmp_65 = pretmp_53 * 2;
  pretmp_66 = pretmp_112 + 8;
  pretmp_67 = (long unsigned int) pretmp_66;
  pretmp_68 = pretmp_67 * 2;
  pretmp_69 = pretmp_112 + 9;
  pretmp_81 = (long unsigned int) pretmp_69;
  pretmp_82 = pretmp_81 * 2;
  pretmp_83 = pretmp_112 + 10;
  pretmp_84 = (long unsigned int) pretmp_83;
  pretmp_85 = pretmp_84 * 2;
  pretmp_97 = pretmp_112 + 11;
  pretmp_98 = (long unsigned int) pretmp_97;
  pretmp_99 = pretmp_98 * 2;
  pretmp_100 = pretmp_112 + 12;
  pretmp_101 = (long unsigned int) pretmp_100;
  pretmp_113 = pretmp_101 * 2;
  pretmp_114 = pretmp_112 + 13;
  pretmp_115 = (long unsigned int) pretmp_114;
  pretmp_116 = pretmp_115 * 2;
  pretmp_117 = pretmp_112 + 14;
  pretmp_129 = (long unsigned int) pretmp_117;
  pretmp_130 = pretmp_129 * 2;
  pretmp_131 = pretmp_112 + 15;
  pretmp_132 = (long unsigned int) pretmp_131;
  pretmp_133 = pretmp_132 * 2;

  <bb 3>:
  # ptr_48 = PHI <&arr(2), ptr_272(3)>
  # y_64 = PHI <y_9(2), y_25(3)>
  _34 = (long unsigned int) y_64;
  _35 = _34 * 8;
  _36 = pretmp_128 + _35;
  _37 = *_36;
  _40 = _37 + pretmp_159;
  _41 = *_40;
  *ptr_48 = _41;
  _56 = _37 + pretmp_176;
  _57 = *_56;
  MEM[(short unsigned int *)ptr_48 + 2B] = _57;
  _72 = _37 + pretmp_207;
  _73 = *_72;
  MEM[(short unsigned int *)ptr_48 + 4B] = _73;
  _88 = _37 + pretmp_224;
  _89 = *_88;
  MEM[(short unsigned int *)ptr_48 + 6B] = _89;
  _104 = _37 + pretmp_255;
  _105 = *_104;
  MEM[(short unsigned int *)ptr_48 + 8B] = _105;
  _120 = _37 + pretmp_283;
  _121 = *_120;
  MEM[(short unsigned int *)ptr_48 + 10B] = _121;
  _136 = _37 + pretmp_51;
  _137 = *_136;
  MEM[(short unsigned int *)ptr_48 + 12B] = _137;
  _152 = _37 + pretmp_65;
  _153 = *_152;
  MEM[(short unsigned int *)ptr_48 + 14B] = _153;
  _168 = _37 + pretmp_68;
  _169 = *_168;
  MEM[(short unsigned int *)ptr_48 + 16B] = _169;
  _184 = _37 + pretmp_82;
  _185 = *_184;
  MEM[(short unsigned int *)ptr_48 + 18B] = _185;
  _200 = _37 + pretmp_85;
  _201 = *_200;
  MEM[(short unsigned int *)ptr_48 + 20B] = _201;
  _216 = _37 + pretmp_99;
  _217 = *_216;
  MEM[(short unsigned int *)ptr_48 + 22B] = _217;
  _232 = _37 + pretmp_113;
  _233 = *_232;
  MEM[(short unsigned int *)ptr_48 + 24B] = _233;
  _248 = _37 + pretmp_116;
  _249 = *_248;
  MEM[(short unsigned int *)ptr_48 + 26B] = _249;
  _264 = _37 + pretmp_130;
  _265 = *_264;
  MEM[(short unsigned int *)ptr_48 + 28B] = _265;
  ptr_272 = &MEM[(void *)ptr_48 + 32B];
  _280 = _37 + pretmp_133;
  _281 = *_280;
  MEM[(short unsigned int *)ptr_48 + 30B] = _281;
  y_25 = y_64 + 1;
  if (y_25 > _47)
    goto <bb 4>;
  else
    goto <bb 3>;

PRE hoists the index part of the address expression "base + (reg + i) * 2" out
of the first loop.  This increases register pressure and prevents GCC from
using the more powerful addressing modes available on x86.

On other targets such as ARM, only the register-pressure issue may apply.

Both PRE and LIM perform the same transformation.

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]