PATCH: PR target/46519: Missing vzeroupper

H.J. Lu hjl.tools@gmail.com
Sat Dec 18 20:11:00 GMT 2010


On Sat, Dec 18, 2010 at 9:48 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Fri, Dec 17, 2010 at 8:03 PM, H.J. Lu <hongjiu.lu@intel.com> wrote:
>
>> This patch fixes another missing vzeroupper.  OK for trunk?
>>
>> Thanks.
>>
>>
>> H.J.
>> ---
>> gcc/
>>
>> 2010-12-17  H.J. Lu  <hongjiu.lu@intel.com>
>>
>>        PR target/46519
>>        * config/i386/i386.c (rescan_move_or_delete_vzeroupper): Stop
>>        rescanning predecessor edges if one of them uses upper 128bits.
>>
>> gcc/testsuite/
>>
>> 2010-12-17  H.J. Lu  <hongjiu.lu@intel.com>
>>
>>        PR target/46519
>>        * gfortran.dg/pr46519-2.f90: New.
>
> OK.
>
> Thanks,
> Uros.
>

I'd like to apply this patch instead. It removes escan_move_or_delete_vzeroupper
and rewrites move_or_delete_vzeroupper_1 to avoid recursive call. It first scans
all basic blocks repeatedly until no basic block changes the upper
128bits of AVX
to used at exit.  Then it rescans all basic blocks with unknown upper
128bit state.
OK for trunk?

Thanks.

-- 
H.J.
---
gcc/

2010-12-18  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/46519
	* config/i386/i386.c (block_info_def): Remove referenced, count
	and rescanned.
	(move_or_delete_vzeroupper_2): Updated.
	(move_or_delete_vzeroupper_1): Rewritten to avoid recursive call.
	(rescan_move_or_delete_vzeroupper): Removed.
	(move_or_delete_vzeroupper): Repeat processing all basic blocks
	until no basic block state is changed to used at exit.

gcc/testsuite/

2010-12-18  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/46519
	* gfortran.dg/pr46519-2.f90: New.
-------------- next part --------------
gcc/

2010-12-18  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/46519
	* config/i386/i386.c (block_info_def): Remove referenced, count
	and rescanned.
	(move_or_delete_vzeroupper_2): Updated.
	(move_or_delete_vzeroupper_1): Rewritten to avoid recursive call.
	(rescan_move_or_delete_vzeroupper): Removed.
	(move_or_delete_vzeroupper): Repeat processing all basic blocks
	until no basic block state is changed to used at exit.

gcc/testsuite/

2010-12-18  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/46519
	* gfortran.dg/pr46519-2.f90: New.

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 40999c8..28b26ef 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -68,14 +68,8 @@ typedef struct block_info_def
 {
   /* State of the upper 128bits of any AVX registers at exit.  */
   enum upper_128bits_state state;
-  /* If the upper 128bits of any AVX registers are referenced.  */
-  enum upper_128bits_state referenced;
-  /* Number of vzerouppers in this block.  */
-  unsigned int count;
   /* TRUE if block has been processed.  */
   bool processed;
-  /* TRUE if block has been rescanned.  */
-  bool rescanned;
 } *block_info;
 
 #define BLOCK_INFO(B)   ((block_info) (B)->aux)
@@ -127,8 +121,6 @@ move_or_delete_vzeroupper_2 (basic_block bb,
   rtx vzeroupper_insn = NULL_RTX;
   rtx pat;
   int avx256;
-  enum upper_128bits_state referenced = BLOCK_INFO (bb)->referenced;
-  int count = BLOCK_INFO (bb)->count;
 
   if (dump_file)
     fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
@@ -191,24 +183,20 @@ move_or_delete_vzeroupper_2 (basic_block bb,
 	      /* Delete pending vzeroupper insertion.  */
 	      if (vzeroupper_insn)
 		{
-		  count--;
 		  delete_insn (vzeroupper_insn);
 		  vzeroupper_insn = NULL_RTX;
 		}
 	    }
-	  else if (state != used && referenced != unused)
+	  else if (state != used)
 	    {
 	      /* No need to call note_stores if the upper 128bits of
 		 AVX registers are never referenced.  */
 	      note_stores (pat, check_avx256_stores, &state);
-	      if (state == used)
-		referenced = used;
 	    }
 	  continue;
 	}
 
       /* Process vzeroupper intrinsic.  */
-      count++;
       avx256 = INTVAL (XVECEXP (pat, 0, 0));
 
       if (state == unused)
@@ -226,7 +214,6 @@ move_or_delete_vzeroupper_2 (basic_block bb,
 	      fprintf (dump_file, "Delete redundant vzeroupper:\n");
 	      print_rtl_single (dump_file, insn);
 	    }
-	  count--;
 	  delete_insn (insn);
 	}
       else
@@ -246,7 +233,6 @@ move_or_delete_vzeroupper_2 (basic_block bb,
 		  fprintf (dump_file, "Delete callee pass vzeroupper:\n");
 		  print_rtl_single (dump_file, insn);
 		}
-	      count--;
 	      delete_insn (insn);
 	    }
 	  else
@@ -256,30 +242,22 @@ move_or_delete_vzeroupper_2 (basic_block bb,
 
   BLOCK_INFO (bb)->state = state;
 
-  if (BLOCK_INFO (bb)->referenced == unknown)
-    {
-      /* The upper 128bits of AVX registers are never referenced if
-	 REFERENCED isn't updated.  */
-      if (referenced == unknown)
-	referenced = unused;
-      BLOCK_INFO (bb)->referenced = referenced;
-      BLOCK_INFO (bb)->count = count;
-    }
-
   if (dump_file)
     fprintf (dump_file, " [bb %i] exit: upper 128bits: %d\n",
 	     bb->index, state);
 }
 
 /* Helper function for move_or_delete_vzeroupper.  Process vzeroupper
-   in BLOCK and its predecessor blocks recursively.  */
+   in BLOCK and check its predecessor blocks.  Treat UNKNOWN state
+   as USED if UNKNOWN_IS_UNUSED is true.  */
 
 static void
-move_or_delete_vzeroupper_1 (basic_block block)
+move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
 {
   edge e;
   edge_iterator ei;
-  enum upper_128bits_state state;
+  enum upper_128bits_state state, old_state, new_state;
+  bool seen_unknown;
 
   if (dump_file)
     fprintf (dump_file, " Process [bb %i]: status: %d\n",
@@ -288,83 +266,42 @@ move_or_delete_vzeroupper_1 (basic_block block)
   if (BLOCK_INFO (block)->processed)
     return;
 
-  BLOCK_INFO (block)->processed = true;
-
-  state = unknown;
+  state = unused;
 
-  /* Process all predecessor edges of this block.  */
+  /* Check all predecessor edges of this block.  */
+  seen_unknown = false;
   FOR_EACH_EDGE (e, ei, block->preds)
     {
       if (e->src == block)
 	continue;
-      move_or_delete_vzeroupper_1 (e->src);
       switch (BLOCK_INFO (e->src)->state)
 	{
 	case unknown:
-	  if (state == unused)
-	    state = unknown;
+	  if (!unknown_is_unused)
+	    seen_unknown = true;
+	case unused:
 	  break;
 	case used:
 	  state = used;
-	  break;
-	case unused:
-	  break;
+	  goto done;
 	}
     }
 
-  /* If state of any predecessor edges is unknown, we need to rescan.  */
-  if (state == unknown)
-    cfun->machine->rescan_vzeroupper_p = 1;
+  if (seen_unknown)
+    state = unknown;
 
-  /* Process this block.  */
+done:
+  old_state = BLOCK_INFO (block)->state;
   move_or_delete_vzeroupper_2 (block, state);
-}
-
-/* Helper function for move_or_delete_vzeroupper.  Rescan vzeroupper
-   in BLOCK and its predecessor blocks recursively.  */
-
-static void
-rescan_move_or_delete_vzeroupper (basic_block block)
-{
-  edge e;
-  edge_iterator ei;
-  enum upper_128bits_state state;
-
-  if (dump_file)
-    fprintf (dump_file, " Rescan [bb %i]: status: %d\n",
-	     block->index, BLOCK_INFO (block)->rescanned);
-
-  if (BLOCK_INFO (block)->rescanned)
-    return;
-
-  BLOCK_INFO (block)->rescanned = true;
-
-  state = unused;
+  new_state = BLOCK_INFO (block)->state;
 
-  /* Rescan all predecessor edges of this block.  */
-  FOR_EACH_EDGE (e, ei, block->preds)
-    {
-      if (e->src == block)
-	continue;
-      rescan_move_or_delete_vzeroupper (e->src);
-      /* For rescan, UKKNOWN state is treated as UNUSED.  */
-      if (BLOCK_INFO (e->src)->state == used)
-	state = used;
-    }
+  if (state != unknown || new_state == used)
+    BLOCK_INFO (block)->processed = true;
 
-  /* Rescan this block only if there are vzerouppers or the upper
-     128bits of AVX registers are referenced.  */
-  if (BLOCK_INFO (block)->count == 0
-      && (state == used || BLOCK_INFO (block)->referenced != used))
-    {
-      if (state == used)
-	BLOCK_INFO (block)->state = state;
-      if (dump_file)
-	fprintf (dump_file, " [bb %i] exit: upper 128bits: %d\n",
-		 block->index, BLOCK_INFO (block)->state);
-    }
-  else
-    move_or_delete_vzeroupper_2 (block, state);
+  /* Need to rescan if the upper 128bits of AVX registers are changed
+     to USED at exit.  */
+  if (new_state != old_state && new_state == used)
+    cfun->machine->rescan_vzeroupper_p = 1;
 }
 
 /* Go through the instruction stream looking for vzeroupper.  Delete
@@ -377,7 +314,7 @@ move_or_delete_vzeroupper (void)
   edge e;
   edge_iterator ei;
   basic_block bb;
-  unsigned int count = 0;
+  unsigned int count;
 
   /* Set up block info for each basic block.  */
   alloc_aux_for_blocks (sizeof (struct block_info_def));
@@ -392,28 +329,30 @@ move_or_delete_vzeroupper (void)
 				   cfun->machine->caller_pass_avx256_p
 				   ? used : unused);
       BLOCK_INFO (e->dest)->processed = true;
-      BLOCK_INFO (e->dest)->rescanned = true;
     }
 
   /* Process all basic blocks.  */
-  if (dump_file)
-    fprintf (dump_file, "Process all basic blocks\n");
-
-  FOR_EACH_BB (bb)
-    {
-      move_or_delete_vzeroupper_1 (bb);
-      count += BLOCK_INFO (bb)->count;
-    }
-
-  /* Rescan all basic blocks if needed.  */
-  if (count && cfun->machine->rescan_vzeroupper_p)
+  count = 0;
+  do
     {
       if (dump_file)
-	fprintf (dump_file, "Rescan all basic blocks\n");
-
+	fprintf (dump_file, "Process all basic blocks: trip %d\n",
+		 count);
+      cfun->machine->rescan_vzeroupper_p = 0;
       FOR_EACH_BB (bb)
-	rescan_move_or_delete_vzeroupper (bb);
+	move_or_delete_vzeroupper_1 (bb, false);
     }
+  while (cfun->machine->rescan_vzeroupper_p && count++ < 20);
+
+  /* FIXME: Is 20 big enough?  */
+  if (count >= 20)
+    gcc_unreachable ();
+
+  if (dump_file)
+    fprintf (dump_file, "Process all basic blocks\n");
+
+  FOR_EACH_BB (bb)
+    move_or_delete_vzeroupper_1 (bb, true);
 
   free_aux_for_blocks ();
 }
diff --git a/gcc/testsuite/gfortran.dg/pr46519-2.f90 b/gcc/testsuite/gfortran.dg/pr46519-2.f90
new file mode 100644
index 0000000..b4d6980
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/pr46519-2.f90
@@ -0,0 +1,31 @@
+! { dg-do compile { target i?86-*-* x86_64-*-* } }
+! { dg-options "-O3 -mavx -mvzeroupper -mtune=generic -dp" }
+
+  SUBROUTINE func(kts, kte, qrz, qiz, rho)
+  IMPLICIT NONE
+  INTEGER, INTENT(IN)               :: kts, kte
+  REAL,    DIMENSION(kts:kte), INTENT(INOUT) :: qrz, qiz, rho
+  INTEGER                              :: k
+  REAL, DIMENSION(kts:kte)    ::  praci, vtiold
+  REAL                          :: fluxout
+  INTEGER                       :: min_q, max_q, var
+  do k=kts,kte
+    praci(k)=0.0
+  enddo
+  min_q=kte
+  max_q=kts-1
+  DO var=1,20
+    do k=max_q,min_q,-1
+       fluxout=rho(k)*qrz(k)
+    enddo
+    qrz(min_q-1)=qrz(min_q-1)+fluxout
+  ENDDO
+  DO var=1,20
+    do k=kts,kte-1
+      vtiold(k)= (rho(k))**0.16
+    enddo
+  ENDDO
+  STOP
+  END SUBROUTINE func
+
+! { dg-final { scan-assembler "avx_vzeroupper" } }


More information about the Gcc-patches mailing list