Bug 46928 - data dependence analysis fails on constant array accesses
Summary: data dependence analysis fails on constant array accesses
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 4.6.0
: P3 normal
Target Milestone: ---
Assignee: Sebastian Pop
URL:
Keywords:
Depends on:
Blocks:
 
Reported: 2010-12-13 18:06 UTC by Sebastian Pop
Modified: 2010-12-15 05:07 UTC (History)
2 users (show)

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed: 2010-12-13 18:09:36


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Sebastian Pop 2010-12-13 18:06:39 UTC
In this code we should be able to convert the inner loop to memset zero with ./cc1 -O3 bug.c

typedef int mad_fixed_t;
struct mad_pcm
{
  unsigned int samplerate;
  unsigned short channels;
  unsigned short length;
  mad_fixed_t samples[2][1152];
};
struct mad_synth
{
  mad_fixed_t filter[2][2][2][16][8];
  unsigned int phase;
  struct mad_pcm pcm;
};
void mad_synth_mute (struct mad_synth *synth);
void
mad_synth_mute (struct mad_synth *synth)
{
  unsigned int ch;
  unsigned int s;
  unsigned int v;

  ch = 0U;
  while (ch < 2U)
    {
      s = 0U;
      while (s < 16U)
	{
	  v = 0U;
	  while (v < 8U)
	    {
	      synth->filter[ch][1][1][s][v] = 0;
	      synth->filter[ch][1][0][s][v] = 0;
	      synth->filter[ch][0][1][s][v] = 0;
	      synth->filter[ch][0][0][s][v] = 0;
	      v++;
	    }
	  s++;
	}
      ch++;
    }
  return;
}

When looking at the output of the dump
./cc1  -O3 -fdump-tree-ldist-details bug.c
the data dependence analysis fails to analyze the overlapping iterations for s_29:

(compute_affine_dependence
  (stmt_a = 
synth_7(D)->filter[ch_28][0][1][s_29][v_30] = 0;
)
  (stmt_b = 
synth_7(D)->filter[ch_28][0][0][s_29][v_30] = 0;
)
(subscript_dependence_tester 
(analyze_overlapping_iterations 
  (chrec_a = {0, +, 1}_3)
  (chrec_b = {0, +, 1}_3)
  (overlap_iterations_a = [0]
)
  (overlap_iterations_b = [0]
)
)
(analyze_overlapping_iterations 
  (chrec_a = s_29)
  (chrec_b = s_29)
  (overlap_iterations_a = not known
)
  (overlap_iterations_b = not known
)
)
(dependence classified: scev_not_known)
)
)

When we remove the s loop, we transform the following code with memset zero:

  ch = 0U;
  while (ch < 2U)
    {
	  v = 0U;
	  while (v < 8U)
	    {
	      synth->filter[ch][1][1][0][v] = 0;
	      synth->filter[ch][1][0][0][v] = 0;
	      synth->filter[ch][0][1][0][v] = 0;
	      synth->filter[ch][0][0][0][v] = 0;
	      v++;
	    }
      ch++;
    }
Comment 1 Sebastian Pop 2010-12-13 18:09:36 UTC
Mine.

BTW, this PR comes from example 2 of http://blog.regehr.org/archives/320
Comment 2 davidxl 2010-12-13 18:38:40 UTC
Is memset guaranteed to be asynchronous safe? Otherwise such transformation is not legal in signal handle code.

Icc does not covert it to memset, but unrolls differently -- this is where the tuing in gcc should be done as well.

David
Comment 3 Sebastian Pop 2010-12-13 18:58:41 UTC
Patch here:
http://gcc.gnu.org/ml/gcc-patches/2010-12/msg01016.html
Comment 4 sebpop@gmail.com 2010-12-13 19:05:58 UTC
The code that is produced looks like this just after loop
distribution, i.e., we generate memset zero only by distributing the
innermost loop:

mad_synth_mute (struct mad_synth * synth)
{
  long unsigned int D.2739;
  long unsigned int D.2740;
  long unsigned int D.2741;
  long unsigned int D.2742;
  long unsigned int D.2743;
  struct mad_synth * D.2744;
  long unsigned int D.2730;
  long unsigned int D.2731;
  long unsigned int D.2732;
  long unsigned int D.2733;
  <unnamed-signed:64> D.2734;
  <unnamed-signed:64> D.2735;
  <unnamed-signed:64> D.2736;
  long unsigned int D.2737;
  struct mad_synth * D.2738;
  long unsigned int D.2721;
  long unsigned int D.2722;
  long unsigned int D.2723;
  long unsigned int D.2724;
  <unnamed-signed:64> D.2725;
  <unnamed-signed:64> D.2726;
  <unnamed-signed:64> D.2727;
  long unsigned int D.2728;
  struct mad_synth * D.2729;
  long unsigned int D.2712;
  long unsigned int D.2713;
  long unsigned int D.2714;
  long unsigned int D.2715;
  <unnamed-signed:64> D.2716;
  <unnamed-signed:64> D.2717;
  <unnamed-signed:64> D.2718;
  long unsigned int D.2719;
  struct mad_synth * D.2720;
  unsigned int pretmp.2;
  unsigned int v;
  unsigned int s;
  unsigned int ch;

<bb 2>:
  goto <bb 10>;

<bb 5>:
Invalid sum of incoming frequencies 139, should be 1111
  s_9 = s_29 + 1;
  if (s_9 != 16)
    goto <bb 6>;
  else
    goto <bb 8>;

<bb 6>:

<bb 7>:
Invalid sum of outgoing probabilities 12.5%
  # s_29 = PHI <0(10), s_9(6)>
  D.2712_25 = (long unsigned int) s_29;
  D.2713_22 = (long unsigned int) ch_28;
  D.2714_1 = D.2713_22 * 64;
  D.2715_2 = D.2712_25 + D.2714_1;
  D.2716_3 = (<unnamed-signed:64>) D.2715_2;
  D.2717_26 = D.2716_3 + 48;
  D.2718_27 = D.2717_26 * 32;
  D.2719_24 = (long unsigned int) D.2718_27;
  D.2720_23 = synth_7(D) + D.2719_24;
  __builtin_memset (D.2720_23, 0, 32);
  D.2721_13 = (long unsigned int) s_29;
  D.2722_12 = (long unsigned int) ch_28;
  D.2723_11 = D.2722_12 * 64;
  D.2724_6 = D.2721_13 + D.2723_11;
  D.2725_20 = (<unnamed-signed:64>) D.2724_6;
  D.2726_32 = D.2725_20 + 32;
  D.2727_33 = D.2726_32 * 32;
  D.2728_34 = (long unsigned int) D.2727_33;
  D.2729_35 = synth_7(D) + D.2728_34;
  __builtin_memset (D.2729_35, 0, 32);
  D.2730_36 = (long unsigned int) s_29;
  D.2731_37 = (long unsigned int) ch_28;
  D.2732_38 = D.2731_37 * 64;
  D.2733_39 = D.2730_36 + D.2732_38;
  D.2734_40 = (<unnamed-signed:64>) D.2733_39;
  D.2735_41 = D.2734_40 + 16;
  D.2736_42 = D.2735_41 * 32;
  D.2737_43 = (long unsigned int) D.2736_42;
  D.2738_44 = synth_7(D) + D.2737_43;
  __builtin_memset (D.2738_44, 0, 32);
  D.2739_45 = (long unsigned int) ch_28;
  D.2740_46 = D.2739_45 * 64;
  D.2741_47 = (long unsigned int) s_29;
  D.2742_48 = D.2740_46 + D.2741_47;
  D.2743_49 = D.2742_48 * 32;
  D.2744_50 = synth_7(D) + D.2743_49;
  __builtin_memset (D.2744_50, 0, 32);
  goto <bb 5>;

<bb 8>:
  ch_10 = ch_28 + 1;
  if (ch_10 != 2)
    goto <bb 9>;
  else
    goto <bb 11>;

<bb 9>:

<bb 10>:
  # ch_28 = PHI <0(2), ch_10(9)>
  goto <bb 7>;

<bb 11>:
  return;

}

and the assembler:

mad_synth_mute:
.LFB0:
	.cfi_startproc
	movq	%rdi, %r9
	xorl	%r8d, %r8d
.L2:
	leaq	16(%r8), %rsi
	movq	%r9, %rdx
	movq	%r8, %rax
	.p2align 4,,10
	.p2align 3
.L3:
	leaq	48(%rax), %rcx
	salq	$5, %rcx
	addq	%rdi, %rcx
	movq	$0, (%rcx)
	movq	$0, 8(%rcx)
	movq	$0, 16(%rcx)
	movq	$0, 24(%rcx)
	leaq	32(%rax), %rcx
	salq	$5, %rcx
	addq	%rdi, %rcx
	movq	$0, (%rcx)
	movq	$0, 8(%rcx)
	movq	$0, 16(%rcx)
	movq	$0, 24(%rcx)
	leaq	16(%rax), %rcx
	addq	$1, %rax
	salq	$5, %rcx
	addq	%rdi, %rcx
	movq	$0, (%rcx)
	movq	$0, 8(%rcx)
	movq	$0, 16(%rcx)
	movq	$0, 24(%rcx)
	movq	$0, (%rdx)
	movq	$0, 8(%rdx)
	movq	$0, 16(%rdx)
	movq	$0, 24(%rdx)
	addq	$32, %rdx
	cmpq	%rsi, %rax
	jne	.L3
	addq	$64, %r8
	addq	$2048, %r9
	cmpq	$128, %r8
	jne	.L2
	rep
	ret
Comment 5 Sebastian Pop 2010-12-13 19:30:19 UTC
The code from comment #4 comes from 
./cc1 -O2 -ftree-loop-distribution -fdump-tree-ldist-details
as on -O3 we end up unrolling all the loops.
Comment 6 Sebastian Pop 2010-12-15 05:04:43 UTC
Author: spop
Date: Wed Dec 15 05:04:40 2010
New Revision: 167843

URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=167843
Log:
Fix PR46928: handle "A[p] == A[p]" in data dep analysis with p a parameter of the loop.

2010-12-14  Sebastian Pop  <sebastian.pop@amd.com>

	PR tree-optimization/46928
	* tree-data-ref.c (analyze_overlapping_iterations): Handle "A[p] == A[p]"
	in data dependence analysis with p a parameter of the loop.

	* gcc.dg/tree-ssa/ldist-17.c: New.

Added:
    trunk/gcc/testsuite/gcc.dg/tree-ssa/ldist-17.c
Modified:
    trunk/gcc/ChangeLog
    trunk/gcc/testsuite/ChangeLog
    trunk/gcc/tree-data-ref.c
Comment 7 Sebastian Pop 2010-12-15 05:07:27 UTC
Fixed.