From 2711355fbc306e7e0f59b3247943614eba4a382b Mon Sep 17 00:00:00 2001
From: Zdenek Dvorak
Date: Thu, 1 Mar 2007 23:14:23 +0100
Subject: [PATCH] tree-ssa-loop-prefetch.c (determine_unroll_factor): Bound
 the unroll factor by the estimated number of iterations.

	* tree-ssa-loop-prefetch.c (determine_unroll_factor): Bound the unroll
	factor by the estimated number of iterations.
	(loop_prefetch_arrays): Do not prefetch in loops that iterate fewer
	times than the prefetch latency.

	* gcc.dg/tree-ssa/prefetch-4.c: New test.
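
To illustrate (the numbers here are made up, only the shape matters): if
one iteration of a loop body costs time = 5 insns and PREFETCH_LATENCY
is 100, a prefetch must be issued AHEAD = (100 + 5 - 1) / 5 = 20
iterations before the data is used.  A loop of the shape

	int xxx[20];

	void foo (int n)
	{
	  int i;

	  for (i = 0; i < n; i++)
	    xxx[i] = i;
	}

rolls at most 20 times, so the prefetched lines could only arrive as the
loop exits; loop_prefetch_arrays now gives up on such loops, and
determine_unroll_factor caps the unroll factor by the estimated number
of iterations, so that the unrolled loop body cannot be dead on entry.
This is the pattern the new prefetch-4.c test checks.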
From-SVN: r122435
---
 gcc/ChangeLog                              |   7 ++
 gcc/config/i386/driver-i386.c              | 136 ++++++++++++++++++++-
 gcc/testsuite/ChangeLog                    |   4 +
 gcc/testsuite/gcc.dg/tree-ssa/prefetch-4.c |  18 +++
 gcc/tree-ssa-loop-prefetch.c               |  45 ++++---
 5 files changed, 192 insertions(+), 18 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/prefetch-4.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b0e26a9bb138..9c75bc251ed3 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2007-03-01  Zdenek Dvorak
+
+	* tree-ssa-loop-prefetch.c (determine_unroll_factor): Bound the unroll
+	factor by the estimated number of iterations.
+	(loop_prefetch_arrays): Do not prefetch in loops that iterate fewer
+	times than the prefetch latency.
+
 2007-03-01  Richard Henderson
 
 	* expr.c (emit_move_complex_push): Export.
diff --git a/gcc/config/i386/driver-i386.c b/gcc/config/i386/driver-i386.c
index df43512f7750..3a5d29de30df 100644
--- a/gcc/config/i386/driver-i386.c
+++ b/gcc/config/i386/driver-i386.c
@@ -47,6 +47,131 @@ const char *host_detect_local_cpu (int argc, const char **argv);
 #define bit_3DNOWP (1 << 30)
 #define bit_LM (1 << 29)
 
+/* Returns command-line parameters that describe an L1_ASSOC-associative
+   cache of L1_SIZEKB kilobytes with lines of L1_LINE bytes.  */
+
+static char *
+describe_cache (unsigned l1_sizekb, unsigned l1_line,
+		unsigned l1_assoc ATTRIBUTE_UNUSED)
+{
+  char size[1000], line[1000];
+  unsigned size_in_lines;
+
+  /* At the moment, the gcc middle end does not use the information about
+     the associativity of the cache.  */
+
+  size_in_lines = (l1_sizekb * 1024) / l1_line;
+
+  sprintf (size, "--param l1-cache-size=%u", size_in_lines);
+  sprintf (line, "--param l1-cache-line-size=%u", l1_line);
+
+  return concat (size, " ", line, " ", NULL);
+}
+
+/* Returns the description of caches for an AMD processor.  */
+
+static char *
+detect_caches_amd (unsigned max_ext_level)
+{
+  unsigned eax, ebx, ecx, edx;
+  unsigned l1_sizekb, l1_line, l1_assoc;
+
+  if (max_ext_level < 0x80000005)
+    return NULL;
+
+  cpuid (0x80000005, eax, ebx, ecx, edx);
+
+  l1_line = ecx & 0xff;
+  l1_sizekb = (ecx >> 24) & 0xff;
+  l1_assoc = (ecx >> 16) & 0xff;
+
+  return describe_cache (l1_sizekb, l1_line, l1_assoc);
+}
+
+/* Decodes the cache descriptors in REG, storing the L1 cache size (in kB),
+   the line size and the associativity to *L1_SIZEKB, *L1_LINE and
+   *L1_ASSOC.  */
+
+static void
+decode_caches_intel (unsigned reg, unsigned *l1_sizekb, unsigned *l1_line,
+		     unsigned *l1_assoc)
+{
+  unsigned i, val;
+
+  if (((reg >> 31) & 1) != 0)
+    return;
+
+  for (i = 0; i < 4; i++)
+    {
+      val = reg & 0xff;
+      reg >>= 8;
+
+      switch (val)
+	{
+	case 0xa:
+	  *l1_sizekb = 8;
+	  *l1_line = 32;
+	  *l1_assoc = 2;
+	  break;
+	case 0xc:
+	  *l1_sizekb = 16;
+	  *l1_line = 32;
+	  *l1_assoc = 4;
+	  break;
+	case 0x2c:
+	  *l1_sizekb = 32;
+	  *l1_line = 64;
+	  *l1_assoc = 8;
+	  break;
+	case 0x60:
+	  *l1_sizekb = 16;
+	  *l1_line = 64;
+	  *l1_assoc = 8;
+	  break;
+	case 0x66:
+	  *l1_sizekb = 8;
+	  *l1_line = 64;
+	  *l1_assoc = 4;
+	  break;
+	case 0x67:
+	  *l1_sizekb = 16;
+	  *l1_line = 64;
+	  *l1_assoc = 4;
+	  break;
+	case 0x68:
+	  *l1_sizekb = 32;
+	  *l1_line = 64;
+	  *l1_assoc = 4;
+	  break;
+
+	default:
+	  break;
+	}
+    }
+}
+
+/* Returns the description of caches for an Intel processor.  */
+
+static char *
+detect_caches_intel (unsigned max_level)
+{
+  unsigned eax, ebx, ecx, edx;
+  unsigned l1_sizekb = 0, l1_line = 0, assoc = 0;
+
+  if (max_level < 2)
+    return NULL;
+
+  cpuid (2, eax, ebx, ecx, edx);
+
+  decode_caches_intel (eax, &l1_sizekb, &l1_line, &assoc);
+  decode_caches_intel (ebx, &l1_sizekb, &l1_line, &assoc);
+  decode_caches_intel (ecx, &l1_sizekb, &l1_line, &assoc);
+  decode_caches_intel (edx, &l1_sizekb, &l1_line, &assoc);
+  if (!l1_sizekb)
+    return (char *) "";
+
+  return describe_cache (l1_sizekb, l1_line, assoc);
+}
+
 /* This will be called by the spec parser in gcc.c when it sees
    a %:local_cpu_detect(args) construct.  Currently it will be called
    with either "arch" or "tune" as argument depending on if -march=native
@@ -62,6 +187,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
 const char *host_detect_local_cpu (int argc, const char **argv)
 {
   const char *cpu = NULL;
+  const char *cache = "";
   enum processor_type processor = PROCESSOR_I386;
   unsigned int eax, ebx, ecx, edx;
   unsigned int max_level;
@@ -126,6 +252,14 @@ const char *host_detect_local_cpu (int argc, const char **argv)
 
   is_amd = vendor == *(unsigned int*)"Auth";
 
+  if (!arch)
+    {
+      if (is_amd)
+	cache = detect_caches_amd (ext_level);
+      else if (vendor == *(unsigned int*)"Genu")
+	cache = detect_caches_intel (max_level);
+    }
+
   if (is_amd)
     {
      if (has_mmx)
@@ -283,7 +417,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
     }
 
 done:
-  return concat ("-m", argv[0], "=", cpu, NULL);
+  return concat (cache, "-m", argv[0], "=", cpu, NULL);
 }
 #else
 /* If we aren't compiling with GCC we just provide a minimal
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 1f432cf5b33c..e2cf6d9f96bd 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2007-03-01  Zdenek Dvorak
+
+	* gcc.dg/tree-ssa/prefetch-4.c: New test.
+
 2007-03-01  Simon Baldwin
 
 	PR c++/23689
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/prefetch-4.c b/gcc/testsuite/gcc.dg/tree-ssa/prefetch-4.c
new file mode 100644
index 000000000000..8a5230eedd34
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/prefetch-4.c
@@ -0,0 +1,18 @@
+/* The loop rolls too little, hence the prefetching would not be useful.  */
+
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target ilp32 } */
+/* { dg-options "-O2 -fprefetch-loop-arrays -march=athlon -fdump-tree-final_cleanup" } */
+
+int xxx[20];
+
+void foo (int n)
+{
+  int i;
+
+  for (i = 0; i < n; i++)
+    xxx[i] = i;
+}
+
+/* { dg-final { scan-tree-dump-times "prefetch" 0 "final_cleanup" } } */
+/* { dg-final { cleanup-tree-dump "final_cleanup" } } */
diff --git a/gcc/tree-ssa-loop-prefetch.c b/gcc/tree-ssa-loop-prefetch.c
index e0612b9a56e8..53977d8bddd5 100644
--- a/gcc/tree-ssa-loop-prefetch.c
+++ b/gcc/tree-ssa-loop-prefetch.c
@@ -885,13 +885,14 @@ should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
 
 /* Determine the coefficient by that unroll LOOP, from the information
    contained in the list of memory references REFS.  Description of
-   umber of iterations of LOOP is stored to DESC.  AHEAD is the number
-   of iterations ahead that we need to prefetch.  NINSNS is number of
-   insns of the LOOP.  */
+   number of iterations of LOOP is stored to DESC.  NINSNS is the number of
+   insns of the LOOP.  EST_NITER is the estimated number of iterations of
+   the loop, or -1 if no estimate is available.  */
 
 static unsigned
 determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
-			 unsigned ninsns, struct tree_niter_desc *desc)
+			 unsigned ninsns, struct tree_niter_desc *desc,
+			 HOST_WIDE_INT est_niter)
 {
   unsigned upper_bound;
   unsigned nfactor, factor, mod_constraint;
@@ -906,6 +907,12 @@ determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
      gains from better scheduling and decreasing loop overhead, which is not
      the case here.  */
   upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
+
+  /* If we unrolled the loop more times than it iterates, the unrolled version
+     of the loop would never be entered.  */
+  if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
+    upper_bound = est_niter;
+
   if (upper_bound <= 1)
     return 1;
 
@@ -935,7 +942,8 @@ static bool
 loop_prefetch_arrays (struct loop *loop)
 {
   struct mem_ref_group *refs;
-  unsigned ahead, ninsns, unroll_factor;
+  unsigned ahead, ninsns, time, unroll_factor;
+  HOST_WIDE_INT est_niter;
   struct tree_niter_desc desc;
   bool unrolled = false;
@@ -950,21 +958,24 @@ loop_prefetch_arrays (struct loop *loop)
 
   /* Step 3: determine the ahead and unroll factor.  */
 
-  /* FIXME: We should use not size of the loop, but the average number of
-     instructions executed per iteration of the loop.  */
-  ninsns = tree_num_loop_insns (loop, &eni_time_weights);
-  ahead = (PREFETCH_LATENCY + ninsns - 1) / ninsns;
-  unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc);
-  if (dump_file && (dump_flags & TDF_DETAILS))
-    fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);
+  /* FIXME: the time should be weighted by the probabilities of the blocks in
+     the loop body.  */
+  time = tree_num_loop_insns (loop, &eni_time_weights);
+  ahead = (PREFETCH_LATENCY + time - 1) / time;
+  est_niter = estimated_loop_iterations_int (loop, false);
 
-  /* If the loop rolls less than the required unroll factor, prefetching
-     is useless.  */
-  if (unroll_factor > 1
-      && cst_and_fits_in_hwi (desc.niter)
-      && (unsigned HOST_WIDE_INT) int_cst_value (desc.niter) < unroll_factor)
+  /* The prefetches will run for AHEAD iterations of the original loop.  Unless
+     the loop rolls at least AHEAD times, prefetching the references does not
+     make sense.  */
+  if (est_niter >= 0 && est_niter <= (HOST_WIDE_INT) ahead)
     goto fail;
 
+  ninsns = tree_num_loop_insns (loop, &eni_size_weights);
+  unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
+					   est_niter);
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);
+
   /* Step 4: what to prefetch?  */
   if (!schedule_prefetches (refs, unroll_factor, ahead))
     goto fail;
-- 
2.43.5
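
A worked example of the cache detection above, with illustrative values
(the cpuid word and the cpu name are assumptions, not taken from the
patch): on an AMD processor whose cpuid leaf 0x80000005 returns
ecx = 0x40020140, detect_caches_amd decodes

	l1_line   = ecx & 0xff         = 64   (bytes)
	l1_sizekb = (ecx >> 24) & 0xff = 64   (kB)
	l1_assoc  = (ecx >> 16) & 0xff = 2    (ways)

and describe_cache converts the size into lines, 64 * 1024 / 64 = 1024,
so the string handed back to the spec parser becomes roughly

	--param l1-cache-size=1024 --param l1-cache-line-size=64 -mtune=k8

which is how the cache parameters consumed by the loop prefetching pass
are set up automatically for native tuning.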