[PATCH] rs6000: Enable limited unrolling at -O2

Jiufu Guo guojiufu@linux.ibm.com
Fri Oct 25 14:49:00 GMT 2019


Hi,

In PR88760, there are a few disscussion about improve or tune unroller for
targets. And we would agree to enable unroller for small loops at O2 first.
And we could see performance improvement(~10%) for below code:
```
  subroutine foo (i, i1, block)
    integer :: i, i1
    integer :: block(9, 9, 9)
    block(i:9,1,i1) = block(i:9,1,i1) - 10
  end subroutine foo

```
This kind of code occurs a few times in exchange2 benchmark.

Similar C code:
```
  for (i = 0; i < n; i++)
    arr[i] = arr[i] - 10;
```

On powerpc64le, for O2 , enable -funroll-loops and limit
PARAM_MAX_UNROLL_TIMES=2 and PARAM_MAX_UNROLLED_INSNS=20, we can see >2%
overall improvement for SPEC2017.

This patch is only for rs6000 in which we see visible performance improvement.

Bootstrapped on powerpc64le, and cases are updated. Is this ok for trunk?

Jiufu Guo
BR


gcc/
2019-10-25  Jiufu Guo  <guojiufu@linux.ibm.com>	    

	PR tree-optimization/88760
	* config/rs6000/rs6000-common.c (rs6000_option_optimization_table): for
	O2, enable funroll-loops.
	* config/rs6000/rs6000.c (rs6000_option_override_internal): if unroller
	is enabled throught O2, set constrains to PARAM_MAX_UNROLL_TIMES=2 and
	PARAM_MAX_UNROLLED_INSNS=20 for small loops.
	
gcc.testsuite/
2019-10-25  Jiufu Guo  <guojiufu@linux.ibm.com>

	PR tree-optimization/88760
	* gcc.target/powerpc/small-loop-unroll.c: New test.
	* c-c++-common/tsan/thread_leak2.c: Update test.
	* gcc.dg/pr59643.c: Update test.
	* gcc.target/powerpc/loop_align.c: Update test.
	* gcc.target/powerpc/ppc-fma-1.c: Update test.
	* gcc.target/powerpc/ppc-fma-2.c: Update test.
	* gcc.target/powerpc/ppc-fma-3.c: Update test.
	* gcc.target/powerpc/ppc-fma-4.c: Update test.
	* gcc.target/powerpc/pr78604.c: Update test.

---
 gcc/common/config/rs6000/rs6000-common.c             |  1 +
 gcc/config/rs6000/rs6000.c                           | 20 ++++++++++++++++++++
 gcc/testsuite/c-c++-common/tsan/thread_leak2.c       |  1 +
 gcc/testsuite/gcc.dg/pr59643.c                       |  1 +
 gcc/testsuite/gcc.target/powerpc/loop_align.c        |  2 +-
 gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c         |  2 +-
 gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c         |  2 +-
 gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c         |  2 +-
 gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c         |  2 +-
 gcc/testsuite/gcc.target/powerpc/pr78604.c           |  2 +-
 gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c | 13 +++++++++++++
 11 files changed, 42 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c

diff --git a/gcc/common/config/rs6000/rs6000-common.c b/gcc/common/config/rs6000/rs6000-common.c
index 4b0c205..b947196 100644
--- a/gcc/common/config/rs6000/rs6000-common.c
+++ b/gcc/common/config/rs6000/rs6000-common.c
@@ -35,6 +35,7 @@ static const struct default_options rs6000_option_optimization_table[] =
     { OPT_LEVELS_ALL, OPT_fsplit_wide_types_early, NULL, 1 },
     /* Enable -fsched-pressure for first pass instruction scheduling.  */
     { OPT_LEVELS_1_PLUS, OPT_fsched_pressure, NULL, 1 },
+    { OPT_LEVELS_2_PLUS, OPT_funroll_loops, NULL, 1 },
     { OPT_LEVELS_NONE, 0, NULL, 0 }
   };
 
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index a129137d..9a8ff9a 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -4540,6 +4540,26 @@ rs6000_option_override_internal (bool global_init_p)
 			     global_options.x_param_values,
 			     global_options_set.x_param_values);
 
+      /* unroll very small loops 2 time if no -funroll-loops.  */
+      if (!global_options_set.x_flag_unroll_loops
+	  && !global_options_set.x_flag_unroll_all_loops)
+	{
+	  maybe_set_param_value (PARAM_MAX_UNROLL_TIMES, 2,
+				 global_options.x_param_values,
+				 global_options_set.x_param_values);
+
+	  maybe_set_param_value (PARAM_MAX_UNROLLED_INSNS, 20,
+				 global_options.x_param_values,
+				 global_options_set.x_param_values);
+
+	  /* If fweb or frename-registers are not specificed in command-line,
+	     do not turn them on implicitly.  */
+	  if (!global_options_set.x_flag_web)
+	    global_options.x_flag_web = 0;
+	  if (!global_options_set.x_flag_rename_registers)
+	    global_options.x_flag_rename_registers = 0;
+	}
+
       /* If using typedef char *va_list, signal that
 	 __builtin_va_start (&ap, 0) can be optimized to
 	 ap = __builtin_next_arg (0).  */
diff --git a/gcc/testsuite/c-c++-common/tsan/thread_leak2.c b/gcc/testsuite/c-c++-common/tsan/thread_leak2.c
index c9b8046..17aa5c6 100644
--- a/gcc/testsuite/c-c++-common/tsan/thread_leak2.c
+++ b/gcc/testsuite/c-c++-common/tsan/thread_leak2.c
@@ -1,4 +1,5 @@
 /* { dg-shouldfail "tsan" } */
+/* { dg-additional-options "-fno-unroll-loops" { target { powerpc-*-* powerpc64-*-* powerpc64le-*-* } } } */
 
 #include <pthread.h>
 #include <unistd.h>
diff --git a/gcc/testsuite/gcc.dg/pr59643.c b/gcc/testsuite/gcc.dg/pr59643.c
index de78d60..3aef439 100644
--- a/gcc/testsuite/gcc.dg/pr59643.c
+++ b/gcc/testsuite/gcc.dg/pr59643.c
@@ -1,6 +1,7 @@
 /* PR tree-optimization/59643 */
 /* { dg-do compile } */
 /* { dg-options "-O3 -fdump-tree-pcom-details" } */
+/* { dg-additional-options "-fno-unroll-loops" { target { powerpc-*-* powerpc64-*-* powerpc64le-*-* } } } */
 
 void
 foo (double *a, double *b, double *c, double d, double e, int n)
diff --git a/gcc/testsuite/gcc.target/powerpc/loop_align.c b/gcc/testsuite/gcc.target/powerpc/loop_align.c
index ebe3782..ef67f77 100644
--- a/gcc/testsuite/gcc.target/powerpc/loop_align.c
+++ b/gcc/testsuite/gcc.target/powerpc/loop_align.c
@@ -1,6 +1,6 @@
 /* { dg-do compile { target { powerpc*-*-* } } } */
 /* { dg-skip-if "" { powerpc*-*-darwin* powerpc-ibm-aix* } } */
-/* { dg-options "-O2 -mdejagnu-cpu=power7 -falign-functions=16" } */
+/* { dg-options "-O2 -mdejagnu-cpu=power7 -falign-functions=16 -fno-unroll-loops" } */
 /* { dg-final { scan-assembler ".p2align 5" } } */
 
 void f(double *a, double *b, double *c, unsigned long n) {
diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
index b4945e6..2a5b92c 100644
--- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
+++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
@@ -1,7 +1,7 @@
 /* { dg-do compile { target { powerpc*-*-* } } } */
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_vsx_ok } */
-/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math" } */
+/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -fno-unroll-loops" } */
 /* { dg-final { scan-assembler-times "xvmadd" 4 } } */
 /* { dg-final { scan-assembler-times "xsmadd\|fmadd\ " 2 } } */
 /* { dg-final { scan-assembler-times "fmadds" 2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
index 5ed630a..bf2c67f 100644
--- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
+++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
@@ -1,7 +1,7 @@
 /* { dg-do compile { target { powerpc*-*-* } } } */
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_vsx_ok } */
-/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -ffp-contract=off" } */
+/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -ffp-contract=off -fno-unroll-loops" } */
 /* { dg-final { scan-assembler-times "xvmadd" 2 } } */
 /* { dg-final { scan-assembler-times "xsmadd\|fmadd\ " 1 } } */
 /* { dg-final { scan-assembler-times "fmadds" 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c
index ef252b3..8608116 100644
--- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c
+++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c
@@ -2,7 +2,7 @@
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_altivec_ok } */
 /* { dg-require-effective-target powerpc_fprs } */
-/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math" } */
+/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -fno-unroll-loops" } */
 /* { dg-final { scan-assembler-times "vmaddfp" 2 } } */
 /* { dg-final { scan-assembler-times "fmadd " 2 } } */
 /* { dg-final { scan-assembler-times "fmadds" 2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c b/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c
index c2eaf1a..291c2ee 100644
--- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c
+++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c
@@ -2,7 +2,7 @@
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_altivec_ok } */
 /* { dg-require-effective-target powerpc_fprs } */
-/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -ffp-contract=off" } */
+/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -ffp-contract=off -fno-unroll-loops" } */
 /* { dg-final { scan-assembler-times "vmaddfp" 1 } } */
 /* { dg-final { scan-assembler-times "fmadd " 1 } } */
 /* { dg-final { scan-assembler-times "fmadds" 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr78604.c b/gcc/testsuite/gcc.target/powerpc/pr78604.c
index 76d8945..35bfdb3 100644
--- a/gcc/testsuite/gcc.target/powerpc/pr78604.c
+++ b/gcc/testsuite/gcc.target/powerpc/pr78604.c
@@ -1,7 +1,7 @@
 /* { dg-do compile { target { powerpc*-*-* } } } */
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_p8vector_ok } */
-/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize -fdump-tree-vect-details" } */
+/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize -fdump-tree-vect-details -fno-unroll-loops" } */
 
 #ifndef SIZE
 #define SIZE 1024
diff --git a/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c b/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c
new file mode 100644
index 0000000..888ea50b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-rtl-loop2_unroll" } */
+
+void __attribute__ ((noinline)) foo(int n, int *arr)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    arr[i] = arr[i] - 10;
+}
+/* { dg-final { scan-rtl-dump-times "Unrolled loop 1 times" 1 "loop2_unroll" } } */
+/* { dg-final { scan-assembler-times "lwz" 3 } } */
+/* { dg-final { scan-assembler-times "stw" 3 } } */
+
-- 
2.7.4



More information about the Gcc-patches mailing list