[Bug target/17593] New: Overaggressive Speculative Code Motion

Tue Sep 21 20:53:00 GMT 2004

Description:
This example comes from procedure "mainGtU" in blocksort.c which is part of 
bzip2-1.0.2

The procedure is not scheduled because it exceeds the threshold allowed 
for "max-sched-region-blocks" which defaults to 10.  If we use --param to bump 
that value to 30, then gcc will schedule this procedure.

The bad news is that once scheduled, it performs worse than if it were not 
scheduled.  The main reason appears to be that overaggressive speculative 
motions are taking place within the "do" loop.  Many of the adds/shifts 
required in the loop are moved prior to the first exit condition.  Thus, if 
the loop exits early, we are computing a lot of results that will never get 
used.

In addition, the aggressive code motion introduces much higher register 
pressure causing the procedure to use non-volatile registers and in turn 
adding save-restore code to the prolog and epilog.

Testcase: 
typedef unsigned char   Bool;
typedef unsigned int    UInt32;
typedef unsigned char   UChar;
typedef unsigned short  UInt16;
typedef int             Int32;
#define True  ((Bool)1)
#define False ((Bool)0)
#define AssertD(cond,msg) /* */

Bool mainGtU ( UInt32  i1, 
               UInt32  i2,
               UChar*  block, 
               UInt16* quadrant,
               UInt32  nblock,
               Int32*  budget )
{
   Int32  k;
   UChar  c1, c2;
   UInt16 s1, s2;

   AssertD ( i1 != i2, "mainGtU" );
   /* 1 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;
   /* 2 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;
   /* 3 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;
   /* 4 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;
   /* 5 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;
   /* 6 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;
   /* 7 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;
   /* 8 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;
   /* 9 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;
   /* 10 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;
   /* 11 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;
   /* 12 */
   c1 = block[i1]; c2 = block[i2];
   if (c1 != c2) return (c1 > c2);
   i1++; i2++;

   k = nblock + 8;

   do {
      /* 1 */
      c1 = block[i1]; c2 = block[i2];
      if (c1 != c2) return (c1 > c2);
      s1 = quadrant[i1]; s2 = quadrant[i2];
      if (s1 != s2) return (s1 > s2);
      i1++; i2++;
      /* 2 */
      c1 = block[i1]; c2 = block[i2];
      if (c1 != c2) return (c1 > c2);
      s1 = quadrant[i1]; s2 = quadrant[i2];
      if (s1 != s2) return (s1 > s2);
      i1++; i2++;
      /* 3 */
      c1 = block[i1]; c2 = block[i2];
      if (c1 != c2) return (c1 > c2);
      s1 = quadrant[i1]; s2 = quadrant[i2];
      if (s1 != s2) return (s1 > s2);
      i1++; i2++;
      /* 4 */
      c1 = block[i1]; c2 = block[i2];
      if (c1 != c2) return (c1 > c2);
      s1 = quadrant[i1]; s2 = quadrant[i2];
      if (s1 != s2) return (s1 > s2);
      i1++; i2++;
      /* 5 */
      c1 = block[i1]; c2 = block[i2];
      if (c1 != c2) return (c1 > c2);
      s1 = quadrant[i1]; s2 = quadrant[i2];
      if (s1 != s2) return (s1 > s2);
      i1++; i2++;
      /* 6 */
      c1 = block[i1]; c2 = block[i2];
      if (c1 != c2) return (c1 > c2);
      s1 = quadrant[i1]; s2 = quadrant[i2];
      if (s1 != s2) return (s1 > s2);
      i1++; i2++;
      /* 7 */
      c1 = block[i1]; c2 = block[i2];
      if (c1 != c2) return (c1 > c2);
      s1 = quadrant[i1]; s2 = quadrant[i2];
      if (s1 != s2) return (s1 > s2);
      i1++; i2++;
      /* 8 */
      c1 = block[i1]; c2 = block[i2];
      if (c1 != c2) return (c1 > c2);
      s1 = quadrant[i1]; s2 = quadrant[i2];
      if (s1 != s2) return (s1 > s2);
      i1++; i2++;

      if (i1 >= nblock) i1 -= nblock;
      if (i2 >= nblock) i2 -= nblock;

      k -= 8;
      (*budget)--;
   }
      while (k >= 0);

   return False;
}

Assembly for non-schedule case:

 Using gcc 4.0:

 gcc -c -m32 -O2 -mcpu=power4 bzip2_mainGtU.c

mainGtU:
	lbzx 0,3,5
	mr 11,3
	lbzx 9,5,4
	stwu 1,-16(1)
	cmpw 7,0,9
	stw 31,12(1)
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	beq- 7,.L85
.L4:
	lwz 31,12(1)
	addi 1,1,16
	blr
.L85:
	add 10,11,5
	add 12,4,5
	lbz 0,1(10)
	lbz 9,1(12)
	cmpw 7,0,9
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	bne+ 7,.L4
	lbz 0,2(10)
	lbz 9,2(12)
	cmpw 7,0,9
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	bne+ 7,.L4
	lbz 0,3(10)
	lbz 9,3(12)
	cmpw 7,0,9
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	bne+ 7,.L4
	lbz 0,4(10)
	lbz 9,4(12)
	cmpw 7,0,9
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	bne+ 7,.L4
	lbz 0,5(10)
	lbz 9,5(12)
	cmpw 7,0,9
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	bne+ 7,.L4
	lbz 0,6(10)
	lbz 9,6(12)
	cmpw 7,0,9
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	bne+ 7,.L4
	lbz 0,7(10)
	lbz 9,7(12)
	cmpw 7,0,9
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	bne+ 7,.L4
	lbz 0,8(10)
	lbz 9,8(12)
	cmpw 7,0,9
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	bne+ 7,.L4
	lbz 0,9(10)
	lbz 9,9(12)
	cmpw 7,0,9
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	bne+ 7,.L4
	lbz 0,10(10)
	lbz 9,10(12)
	cmpw 7,0,9
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	bne+ 7,.L4
	lbz 0,11(10)
	lbz 9,11(12)
	cmpw 7,0,9
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	bne+ 7,.L4
	addi 10,11,12
	addi 4,4,12
	addi 31,7,8
	li 12,0
.L27:
	lbzx 3,5,10
	lbzx 0,5,4
	cmpw 7,3,0
	bne- 7,.L81
	slwi 0,10,1
	slwi 9,4,1
	lhzx 3,6,0
	lhzx 0,9,6
	cmpw 7,3,0
	bne- 7,.L81
	addi 11,10,1
	addi 9,4,1
	lbzx 3,5,11
	lbzx 0,5,9
	cmpw 7,3,0
	bne- 7,.L81
	slwi 0,11,1
	slwi 9,9,1
	lhzx 3,6,0
	lhzx 0,9,6
	cmpw 7,3,0
	bne- 7,.L81
	addi 11,10,2
	addi 9,4,2
	lbzx 3,5,11
	lbzx 0,5,9
	cmpw 7,3,0
	bne- 7,.L81
	slwi 0,11,1
	slwi 9,9,1
	lhzx 3,6,0
	lhzx 0,9,6
	cmpw 7,3,0
	bne- 7,.L81
	addi 11,10,3
	addi 9,4,3
	lbzx 3,5,11
	lbzx 0,5,9
	cmpw 7,3,0
	bne- 7,.L81
	slwi 0,11,1
	slwi 9,9,1
	lhzx 3,6,0
	lhzx 0,9,6
	cmpw 7,3,0
	bne- 7,.L81
	addi 11,10,4
	addi 9,4,4
	lbzx 3,5,11
	lbzx 0,5,9
	cmpw 7,3,0
	bne- 7,.L81
	slwi 0,11,1
	slwi 9,9,1
	lhzx 3,6,0
	lhzx 0,9,6
	cmpw 7,3,0
	bne- 7,.L81
	addi 11,10,5
	addi 9,4,5
	lbzx 3,5,11
	lbzx 0,5,9
	cmpw 7,3,0
	bne- 7,.L81
	slwi 0,11,1
	slwi 9,9,1
	lhzx 3,6,0
	lhzx 0,9,6
	cmpw 7,3,0
	bne- 7,.L81
	addi 11,10,6
	addi 9,4,6
	lbzx 3,5,11
	lbzx 0,5,9
	cmpw 7,3,0
	bne- 7,.L81
	slwi 0,11,1
	slwi 9,9,1
	lhzx 3,6,0
	lhzx 0,9,6
	cmpw 7,3,0
	bne- 7,.L81
	addi 11,10,7
	addi 9,4,7
	lbzx 3,5,11
	lbzx 0,5,9
	cmpw 7,3,0
	bne- 7,.L81
	slwi 0,11,1
	slwi 9,9,1
	lhzx 3,6,0
	lhzx 0,9,6
	cmpw 7,3,0
	bne- 7,.L81
	addi 10,10,8
	addi 4,4,8
	cmplw 7,7,10
	bgt- 7,.L60
	subf 10,7,10
.L60:
	cmplw 7,7,4
	bgt- 7,.L62
	subf 4,7,4
.L62:
	addi 12,12,-8
	lwz 9,0(8)
	add. 0,31,12
	addi 9,9,-1
	stw 9,0(8)
	bge+ 0,.L27
	li 3,0
	b .L4
.L81:
	subfc 3,3,0
	subfe 3,3,3
	neg 3,3
	b .L4

Assembly for scheduled case:

 Using gcc 4.0:

 gcc -c -m32 -O2 -mcpu=power4 --param max-sched-region-blocks=30
                                                      bzip2_mainGtU.c

 See notes within code:

mainGtU:
	mflr 0
	stwu 1,-80(1)
	mr 11,3
	lbzx 9,5,4
	stw 0,84(1)
	lbzx 0,3,5
	stw 14,8(1)  - register save code due to increased reg pressure
	cmpw 7,0,9
	stw 15,12(1) .
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	stw 16,16(1) .
	stw 17,20(1) .
	stw 18,24(1) .
	stw 19,28(1) .
	stw 20,32(1) .
	stw 21,36(1) .
	stw 22,40(1) .
	stw 23,44(1) .
	stw 24,48(1) .
	stw 25,52(1) .
	stw 26,56(1) .
	stw 27,60(1) .
	stw 28,64(1) .
	stw 29,68(1) .
	stw 30,72(1) .
	stw 31,76(1) --------------------------------------------------
	beq- 7,.L85
.L4:
	lwz 0,84(1)
	lwz 14,8(1)  - register restore code due to increased reg pressure
	lwz 15,12(1) .
	mtlr 0
	lwz 16,16(1) .
	lwz 17,20(1) .
	lwz 18,24(1) .
	lwz 19,28(1) .
	lwz 20,32(1) .
	lwz 21,36(1) .
	lwz 22,40(1) .
	lwz 23,44(1) .
	lwz 24,48(1) .
	lwz 25,52(1) .
	lwz 26,56(1) .
	lwz 27,60(1) .
	lwz 28,64(1) .
	lwz 29,68(1) .
	lwz 30,72(1) .
	lwz 31,76(1) -----------------------------------------------------
	addi 1,1,80
	blr
.L85:
	add 10,11,5
	add 12,4,5
	lbz 0,1(10)
	lbz 9,1(12)
	cmpw 7,0,9
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	bne+ 7,.L4
	lbz 0,2(10)
	lbz 9,2(12)
	cmpw 7,0,9
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	bne+ 7,.L4
	lbz 0,3(10)
	lbz 9,3(12)
	cmpw 7,0,9
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	bne+ 7,.L4
	lbz 0,4(10)
	lbz 9,4(12)
	cmpw 7,0,9
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	bne+ 7,.L4
	lbz 0,5(10)
	lbz 9,5(12)
	cmpw 7,0,9
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	bne+ 7,.L4
	lbz 0,6(10)
	lbz 9,6(12)
	cmpw 7,0,9
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	bne+ 7,.L4
	lbz 0,7(10)
	lbz 9,7(12)
	cmpw 7,0,9
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	bne+ 7,.L4
	lbz 0,8(10)
	lbz 9,8(12)
	cmpw 7,0,9
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	bne+ 7,.L4
	lbz 0,9(10)
	lbz 9,9(12)
	cmpw 7,0,9
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	bne+ 7,.L4
	lbz 0,10(10)
	lbz 9,10(12)
	cmpw 7,0,9
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	bne+ 7,.L4
	lbz 0,11(10)
	lbz 9,11(12)
	cmpw 7,0,9
	subfc 3,0,9
	subfe 3,3,3
	neg 3,3
	bne+ 7,.L4
	addi 11,11,12
	addi 4,4,12
	addi 22,7,8
	li 23,0
.L27:                   - start of "do" loop.
	lbzx 3,5,11
	addi 9,11,1     - many of the add/shifts in loop get
	addi 10,4,1     . speculatively moved here.  They cause the
	lbzx 0,5,4      . pathlength of the procedure to increase in
	addi 29,11,2    . the case where the loop exits early.
	addi 24,4,2     . 
	addi 30,11,3    . They are also the cause of the increased 
	addi 25,4,3     . register pressure that is causing the reg
	cmpw 7,3,0      . allocator to resort to the use of non-volatiles
	slwi 28,11,1    . 
	slwi 27,4,1     .
	slwi 21,9,1     . Opportunities exist near the bottom of the   
	slwi 20,10,1    . loop to perform some of these in parallel
	slwi 19,29,1    . with other instructions rather than increase
	slwi 18,24,1    . the path length here.
	slwi 17,30,1    .
	slwi 16,25,1    .
	addi 31,11,4    .
	addi 26,4,4     .
	bne- 7,.L81     - first possible loop exit.
	lhzx 3,28,6
	slwi 15,31,1
	slwi 14,26,1
	lhzx 0,27,6
	cmpw 7,3,0
	bne- 7,.L81
	lbzx 3,5,9
	addi 12,11,5
	addi 27,4,5
	lbzx 0,5,10
	cmpw 7,3,0
	bne- 7,.L81
	slwi 0,12,1
	lhzx 3,21,6
	mtctr 0
	slwi 0,27,1
	mtlr 0
	lhzx 0,20,6
	cmpw 7,3,0
	bne- 7,.L81
	lbzx 3,5,29
	addi 10,11,6
	addi 28,4,6
	lbzx 0,5,24
	cmpw 7,3,0
	bne- 7,.L81
	lhzx 3,19,6
	slwi 24,10,1
	slwi 21,28,1
	lhzx 0,18,6
	cmpw 7,3,0
	bne- 7,.L81
	lbzx 3,5,30
	addi 9,11,7
	addi 29,4,7
	lbzx 0,5,25
	cmpw 7,3,0
	bne- 7,.L81
	lhzx 3,17,6
	slwi 30,9,1
	slwi 25,29,1
	lhzx 0,16,6
	cmpw 7,3,0
	bne- 7,.L81
	lbzx 3,5,31
	addi 11,11,8
	addi 23,23,-8
	lbzx 0,5,26
	addi 4,4,8
	cmplw 6,7,11
	add. 31,22,23
	cmplw 1,7,4
	cmpw 7,3,0
	bne- 7,.L81
	lhzx 3,15,6
	lhzx 0,14,6
	cmpw 7,3,0
	bne- 7,.L81
	lbzx 3,5,12
	lbzx 0,5,27
	cmpw 7,3,0
	bne- 7,.L81
	mfctr 0
	mflr 31
	lhzx 3,6,0
	lhzx 0,31,6
	cmpw 7,3,0
	bne- 7,.L81
	lbzx 3,5,10
	lbzx 0,5,28
	cmpw 7,3,0
	bne- 7,.L81
	lhzx 3,24,6
	lhzx 0,21,6
	cmpw 7,3,0
	bne- 7,.L81
	lbzx 3,5,9
	lbzx 0,5,29
	cmpw 7,3,0
	bne- 7,.L81
	lhzx 3,30,6
	lhzx 0,25,6
	cmpw 7,3,0
	bne- 7,.L81
	bgt- 6,.L60
	subf 11,7,11
.L60:
	bgt- 1,.L62
	subf 4,7,4
.L62:
	lwz 9,0(8)
	addi 9,9,-1
	stw 9,0(8)
	bge+ 0,.L27
	li 3,0
	b .L4
.L81:
	subfc 3,3,0
	subfe 3,3,3
	neg 3,3
	b .L4

-- 
           Summary: Overaggressive Speculative Code Motion
           Product: gcc
           Version: 4.0.0
            Status: UNCONFIRMED
          Severity: enhancement
          Priority: P1
         Component: target
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: steinmtz at us dot ibm dot com
                CC: gcc-bugs at gcc dot gnu dot org,steinmtz at us dot ibm
                    dot com
 GCC build triplet: powerpc64-linux
  GCC host triplet: powerpc64-linux
GCC target triplet: powerpc64-linux

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17593