]>
Commit | Line | Data |
---|---|---|
a29c7ea6 | 1 | /* Loop unrolling and peeling. |
0c20a65f | 2 | Copyright (C) 2002, 2003 Free Software Foundation, Inc. |
a29c7ea6 ZD |
3 | |
4 | This file is part of GCC. | |
5 | ||
6 | GCC is free software; you can redistribute it and/or modify it under | |
7 | the terms of the GNU General Public License as published by the Free | |
8 | Software Foundation; either version 2, or (at your option) any later | |
9 | version. | |
10 | ||
11 | GCC is distributed in the hope that it will be useful, but WITHOUT ANY | |
12 | WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
13 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
14 | for more details. | |
15 | ||
16 | You should have received a copy of the GNU General Public License | |
17 | along with GCC; see the file COPYING. If not, write to the Free | |
18 | Software Foundation, 59 Temple Place - Suite 330, Boston, MA | |
19 | 02111-1307, USA. */ | |
20 | ||
21 | #include "config.h" | |
22 | #include "system.h" | |
23 | #include "coretypes.h" | |
24 | #include "tm.h" | |
25 | #include "rtl.h" | |
26 | #include "hard-reg-set.h" | |
27 | #include "basic-block.h" | |
28 | #include "cfgloop.h" | |
29 | #include "cfglayout.h" | |
30 | #include "params.h" | |
31 | #include "output.h" | |
32 | #include "expr.h" | |
33 | ||
34 | /* This pass performs loop unrolling and peeling. We only perform these | |
e0bb17a8 | 35 | optimizations on innermost loops (with single exception) because |
a29c7ea6 ZD |
36 | the impact on performance is greatest here, and we want to avoid |
37 | unnecessary code size growth. The gain is caused by greater sequentiality | |
4d6922ee | 38 | of code, better code to optimize for further passes and in some cases |
a29c7ea6 ZD |
39 | by fewer testings of exit conditions. The main problem is code growth, |
40 | that impacts performance negatively due to effect of caches. | |
41 | ||
42 | What we do: | |
43 | ||
44 | -- complete peeling of once-rolling loops; this is the above mentioned | |
45 | exception, as this causes loop to be cancelled completely and | |
46 | does not cause code growth | |
47 | -- complete peeling of loops that roll (small) constant times. | |
48 | -- simple peeling of first iterations of loops that do not roll much | |
49 | (according to profile feedback) | |
50 | -- unrolling of loops that roll constant times; this is almost always | |
51 | win, as we get rid of exit condition tests. | |
52 | -- unrolling of loops that roll number of times that we can compute | |
53 | in runtime; we also get rid of exit condition tests here, but there | |
54 | is the extra expense for calculating the number of iterations | |
55 | -- simple unrolling of remaining loops; this is performed only if we | |
56 | are asked to, as the gain is questionable in this case and often | |
57 | it may even slow down the code | |
58 | For more detailed descriptions of each of those, see comments at | |
59 | appropriate function below. | |
60 | ||
61 | There is a lot of parameters (defined and described in params.def) that | |
62 | control how much we unroll/peel. | |
63 | ||
64 | ??? A great problem is that we don't have a good way how to determine | |
65 | how many times we should unroll the loop; the experiments I have made | |
66 | showed that this choice may affect performance in order of several %. | |
67 | */ | |
68 | ||
0c20a65f AJ |
69 | static void decide_unrolling_and_peeling (struct loops *, int); |
70 | static void peel_loops_completely (struct loops *, int); | |
d47cc544 SB |
71 | static void decide_peel_simple (struct loop *, int); |
72 | static void decide_peel_once_rolling (struct loop *, int); | |
73 | static void decide_peel_completely (struct loop *, int); | |
74 | static void decide_unroll_stupid (struct loop *, int); | |
75 | static void decide_unroll_constant_iterations (struct loop *, int); | |
76 | static void decide_unroll_runtime_iterations (struct loop *, int); | |
0c20a65f AJ |
77 | static void peel_loop_simple (struct loops *, struct loop *); |
78 | static void peel_loop_completely (struct loops *, struct loop *); | |
79 | static void unroll_loop_stupid (struct loops *, struct loop *); | |
80 | static void unroll_loop_constant_iterations (struct loops *, struct loop *); | |
81 | static void unroll_loop_runtime_iterations (struct loops *, struct loop *); | |
a29c7ea6 ZD |
82 | |
83 | /* Unroll and/or peel (depending on FLAGS) LOOPS. */ | |
84 | void | |
0c20a65f | 85 | unroll_and_peel_loops (struct loops *loops, int flags) |
a29c7ea6 ZD |
86 | { |
87 | struct loop *loop, *next; | |
50654f6c | 88 | bool check; |
a29c7ea6 ZD |
89 | |
90 | /* First perform complete loop peeling (it is almost surely a win, | |
91 | and affects parameters for further decision a lot). */ | |
92 | peel_loops_completely (loops, flags); | |
93 | ||
94 | /* Now decide rest of unrolling and peeling. */ | |
95 | decide_unrolling_and_peeling (loops, flags); | |
96 | ||
97 | loop = loops->tree_root; | |
98 | while (loop->inner) | |
99 | loop = loop->inner; | |
100 | ||
101 | /* Scan the loops, inner ones first. */ | |
102 | while (loop != loops->tree_root) | |
103 | { | |
104 | if (loop->next) | |
105 | { | |
106 | next = loop->next; | |
107 | while (next->inner) | |
108 | next = next->inner; | |
109 | } | |
110 | else | |
111 | next = loop->outer; | |
112 | ||
50654f6c | 113 | check = true; |
a29c7ea6 ZD |
114 | /* And perform the appropriate transformations. */ |
115 | switch (loop->lpt_decision.decision) | |
116 | { | |
117 | case LPT_PEEL_COMPLETELY: | |
118 | /* Already done. */ | |
119 | abort (); | |
120 | case LPT_PEEL_SIMPLE: | |
121 | peel_loop_simple (loops, loop); | |
122 | break; | |
123 | case LPT_UNROLL_CONSTANT: | |
124 | unroll_loop_constant_iterations (loops, loop); | |
125 | break; | |
126 | case LPT_UNROLL_RUNTIME: | |
127 | unroll_loop_runtime_iterations (loops, loop); | |
128 | break; | |
129 | case LPT_UNROLL_STUPID: | |
130 | unroll_loop_stupid (loops, loop); | |
131 | break; | |
132 | case LPT_NONE: | |
50654f6c | 133 | check = false; |
a29c7ea6 ZD |
134 | break; |
135 | default: | |
136 | abort (); | |
137 | } | |
138 | if (check) | |
139 | { | |
140 | #ifdef ENABLE_CHECKING | |
d47cc544 | 141 | verify_dominators (CDI_DOMINATORS); |
a29c7ea6 ZD |
142 | verify_loop_structure (loops); |
143 | #endif | |
144 | } | |
145 | loop = next; | |
146 | } | |
50654f6c ZD |
147 | |
148 | iv_analysis_done (); | |
149 | } | |
150 | ||
151 | /* Check whether exit of the LOOP is at the end of loop body. */ | |
152 | ||
153 | static bool | |
154 | loop_exit_at_end_p (struct loop *loop) | |
155 | { | |
156 | struct niter_desc *desc = get_simple_loop_desc (loop); | |
157 | rtx insn; | |
158 | ||
159 | if (desc->in_edge->dest != loop->latch) | |
160 | return false; | |
161 | ||
162 | /* Check that the latch is empty. */ | |
163 | FOR_BB_INSNS (loop->latch, insn) | |
164 | { | |
165 | if (INSN_P (insn)) | |
166 | return false; | |
167 | } | |
168 | ||
169 | return true; | |
a29c7ea6 ZD |
170 | } |
171 | ||
172 | /* Check whether to peel LOOPS (depending on FLAGS) completely and do so. */ | |
173 | static void | |
0c20a65f | 174 | peel_loops_completely (struct loops *loops, int flags) |
a29c7ea6 ZD |
175 | { |
176 | struct loop *loop, *next; | |
177 | ||
178 | loop = loops->tree_root; | |
179 | while (loop->inner) | |
180 | loop = loop->inner; | |
181 | ||
182 | while (loop != loops->tree_root) | |
183 | { | |
184 | if (loop->next) | |
185 | { | |
186 | next = loop->next; | |
187 | while (next->inner) | |
188 | next = next->inner; | |
189 | } | |
190 | else | |
191 | next = loop->outer; | |
192 | ||
193 | loop->lpt_decision.decision = LPT_NONE; | |
0c20a65f | 194 | |
a29c7ea6 | 195 | if (rtl_dump_file) |
50654f6c | 196 | fprintf (rtl_dump_file, "\n;; *** Considering loop %d for complete peeling ***\n", |
a29c7ea6 ZD |
197 | loop->num); |
198 | ||
a29c7ea6 ZD |
199 | loop->ninsns = num_loop_insns (loop); |
200 | ||
d47cc544 | 201 | decide_peel_once_rolling (loop, flags); |
a29c7ea6 | 202 | if (loop->lpt_decision.decision == LPT_NONE) |
d47cc544 | 203 | decide_peel_completely (loop, flags); |
a29c7ea6 ZD |
204 | |
205 | if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY) | |
206 | { | |
207 | peel_loop_completely (loops, loop); | |
208 | #ifdef ENABLE_CHECKING | |
d47cc544 | 209 | verify_dominators (CDI_DOMINATORS); |
a29c7ea6 ZD |
210 | verify_loop_structure (loops); |
211 | #endif | |
212 | } | |
213 | loop = next; | |
214 | } | |
215 | } | |
216 | ||
217 | /* Decide whether unroll or peel LOOPS (depending on FLAGS) and how much. */ | |
218 | static void | |
0c20a65f | 219 | decide_unrolling_and_peeling (struct loops *loops, int flags) |
a29c7ea6 ZD |
220 | { |
221 | struct loop *loop = loops->tree_root, *next; | |
222 | ||
223 | while (loop->inner) | |
224 | loop = loop->inner; | |
225 | ||
226 | /* Scan the loops, inner ones first. */ | |
227 | while (loop != loops->tree_root) | |
228 | { | |
229 | if (loop->next) | |
230 | { | |
231 | next = loop->next; | |
232 | while (next->inner) | |
233 | next = next->inner; | |
234 | } | |
235 | else | |
236 | next = loop->outer; | |
237 | ||
238 | loop->lpt_decision.decision = LPT_NONE; | |
239 | ||
240 | if (rtl_dump_file) | |
50654f6c | 241 | fprintf (rtl_dump_file, "\n;; *** Considering loop %d ***\n", loop->num); |
a29c7ea6 ZD |
242 | |
243 | /* Do not peel cold areas. */ | |
244 | if (!maybe_hot_bb_p (loop->header)) | |
245 | { | |
246 | if (rtl_dump_file) | |
247 | fprintf (rtl_dump_file, ";; Not considering loop, cold area\n"); | |
248 | loop = next; | |
249 | continue; | |
250 | } | |
251 | ||
252 | /* Can the loop be manipulated? */ | |
253 | if (!can_duplicate_loop_p (loop)) | |
254 | { | |
255 | if (rtl_dump_file) | |
256 | fprintf (rtl_dump_file, | |
257 | ";; Not considering loop, cannot duplicate\n"); | |
258 | loop = next; | |
259 | continue; | |
260 | } | |
261 | ||
262 | /* Skip non-innermost loops. */ | |
263 | if (loop->inner) | |
264 | { | |
265 | if (rtl_dump_file) | |
266 | fprintf (rtl_dump_file, ";; Not considering loop, is not innermost\n"); | |
267 | loop = next; | |
268 | continue; | |
269 | } | |
270 | ||
271 | loop->ninsns = num_loop_insns (loop); | |
272 | loop->av_ninsns = average_num_loop_insns (loop); | |
273 | ||
274 | /* Try transformations one by one in decreasing order of | |
275 | priority. */ | |
276 | ||
d47cc544 | 277 | decide_unroll_constant_iterations (loop, flags); |
a29c7ea6 | 278 | if (loop->lpt_decision.decision == LPT_NONE) |
d47cc544 | 279 | decide_unroll_runtime_iterations (loop, flags); |
a29c7ea6 | 280 | if (loop->lpt_decision.decision == LPT_NONE) |
d47cc544 | 281 | decide_unroll_stupid (loop, flags); |
a29c7ea6 | 282 | if (loop->lpt_decision.decision == LPT_NONE) |
d47cc544 | 283 | decide_peel_simple (loop, flags); |
a29c7ea6 ZD |
284 | |
285 | loop = next; | |
286 | } | |
287 | } | |
288 | ||
289 | /* Decide whether the LOOP is once rolling and suitable for complete | |
290 | peeling. */ | |
291 | static void | |
d47cc544 | 292 | decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED) |
a29c7ea6 | 293 | { |
50654f6c ZD |
294 | struct niter_desc *desc; |
295 | ||
a29c7ea6 | 296 | if (rtl_dump_file) |
50654f6c | 297 | fprintf (rtl_dump_file, "\n;; Considering peeling once rolling loop\n"); |
a29c7ea6 ZD |
298 | |
299 | /* Is the loop small enough? */ | |
300 | if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns) | |
301 | { | |
302 | if (rtl_dump_file) | |
303 | fprintf (rtl_dump_file, ";; Not considering loop, is too big\n"); | |
304 | return; | |
305 | } | |
306 | ||
307 | /* Check for simple loops. */ | |
50654f6c | 308 | desc = get_simple_loop_desc (loop); |
a29c7ea6 ZD |
309 | |
310 | /* Check number of iterations. */ | |
50654f6c ZD |
311 | if (!desc->simple_p |
312 | || desc->assumptions | |
313 | || !desc->const_iter | |
314 | || desc->niter != 0) | |
a29c7ea6 ZD |
315 | { |
316 | if (rtl_dump_file) | |
317 | fprintf (rtl_dump_file, ";; Unable to prove that the loop rolls exactly once\n"); | |
318 | return; | |
319 | } | |
320 | ||
321 | /* Success. */ | |
322 | if (rtl_dump_file) | |
323 | fprintf (rtl_dump_file, ";; Decided to peel exactly once rolling loop\n"); | |
324 | loop->lpt_decision.decision = LPT_PEEL_COMPLETELY; | |
325 | } | |
326 | ||
327 | /* Decide whether the LOOP is suitable for complete peeling. */ | |
328 | static void | |
d47cc544 | 329 | decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED) |
a29c7ea6 ZD |
330 | { |
331 | unsigned npeel; | |
50654f6c | 332 | struct niter_desc *desc; |
a29c7ea6 ZD |
333 | |
334 | if (rtl_dump_file) | |
50654f6c | 335 | fprintf (rtl_dump_file, "\n;; Considering peeling completely\n"); |
a29c7ea6 ZD |
336 | |
337 | /* Skip non-innermost loops. */ | |
338 | if (loop->inner) | |
339 | { | |
340 | if (rtl_dump_file) | |
341 | fprintf (rtl_dump_file, ";; Not considering loop, is not innermost\n"); | |
342 | return; | |
343 | } | |
344 | ||
35b07080 ZD |
345 | /* Do not peel cold areas. */ |
346 | if (!maybe_hot_bb_p (loop->header)) | |
347 | { | |
348 | if (rtl_dump_file) | |
349 | fprintf (rtl_dump_file, ";; Not considering loop, cold area\n"); | |
350 | return; | |
351 | } | |
352 | ||
353 | /* Can the loop be manipulated? */ | |
354 | if (!can_duplicate_loop_p (loop)) | |
355 | { | |
356 | if (rtl_dump_file) | |
357 | fprintf (rtl_dump_file, | |
358 | ";; Not considering loop, cannot duplicate\n"); | |
359 | return; | |
360 | } | |
361 | ||
3dc575ff | 362 | /* npeel = number of iterations to peel. */ |
a29c7ea6 ZD |
363 | npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns; |
364 | if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES)) | |
365 | npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES); | |
366 | ||
367 | /* Is the loop small enough? */ | |
368 | if (!npeel) | |
369 | { | |
370 | if (rtl_dump_file) | |
371 | fprintf (rtl_dump_file, ";; Not considering loop, is too big\n"); | |
372 | return; | |
373 | } | |
374 | ||
375 | /* Check for simple loops. */ | |
50654f6c | 376 | desc = get_simple_loop_desc (loop); |
a29c7ea6 ZD |
377 | |
378 | /* Check number of iterations. */ | |
50654f6c ZD |
379 | if (!desc->simple_p |
380 | || desc->assumptions | |
381 | || !desc->const_iter) | |
a29c7ea6 ZD |
382 | { |
383 | if (rtl_dump_file) | |
384 | fprintf (rtl_dump_file, ";; Unable to prove that the loop iterates constant times\n"); | |
385 | return; | |
386 | } | |
387 | ||
50654f6c | 388 | if (desc->niter > npeel - 1) |
a29c7ea6 ZD |
389 | { |
390 | if (rtl_dump_file) | |
0c20a65f | 391 | { |
a29c7ea6 | 392 | fprintf (rtl_dump_file, ";; Not peeling loop completely, rolls too much ("); |
50654f6c | 393 | fprintf (rtl_dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter); |
f937d5e6 | 394 | fprintf (rtl_dump_file, " iterations > %d [maximum peelings])\n", npeel); |
a29c7ea6 ZD |
395 | } |
396 | return; | |
397 | } | |
398 | ||
399 | /* Success. */ | |
400 | if (rtl_dump_file) | |
401 | fprintf (rtl_dump_file, ";; Decided to peel loop completely\n"); | |
402 | loop->lpt_decision.decision = LPT_PEEL_COMPLETELY; | |
403 | } | |
404 | ||
405 | /* Peel all iterations of LOOP, remove exit edges and cancel the loop | |
406 | completely. The transformation done: | |
0c20a65f | 407 | |
a29c7ea6 ZD |
408 | for (i = 0; i < 4; i++) |
409 | body; | |
410 | ||
411 | ==> | |
0c20a65f AJ |
412 | |
413 | i = 0; | |
a29c7ea6 ZD |
414 | body; i++; |
415 | body; i++; | |
416 | body; i++; | |
417 | body; i++; | |
418 | */ | |
419 | static void | |
0c20a65f | 420 | peel_loop_completely (struct loops *loops, struct loop *loop) |
a29c7ea6 ZD |
421 | { |
422 | sbitmap wont_exit; | |
423 | unsigned HOST_WIDE_INT npeel; | |
a29c7ea6 | 424 | unsigned n_remove_edges, i; |
50654f6c ZD |
425 | edge *remove_edges, ei; |
426 | struct niter_desc *desc = get_simple_loop_desc (loop); | |
0c20a65f | 427 | |
a29c7ea6 ZD |
428 | npeel = desc->niter; |
429 | ||
35b07080 ZD |
430 | if (npeel) |
431 | { | |
432 | wont_exit = sbitmap_alloc (npeel + 1); | |
433 | sbitmap_ones (wont_exit); | |
434 | RESET_BIT (wont_exit, 0); | |
50654f6c | 435 | if (desc->noloop_assumptions) |
35b07080 ZD |
436 | RESET_BIT (wont_exit, 1); |
437 | ||
438 | remove_edges = xcalloc (npeel, sizeof (edge)); | |
439 | n_remove_edges = 0; | |
440 | ||
441 | if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), | |
442 | loops, npeel, | |
443 | wont_exit, desc->out_edge, remove_edges, &n_remove_edges, | |
444 | DLTHE_FLAG_UPDATE_FREQ)) | |
445 | abort (); | |
446 | ||
447 | free (wont_exit); | |
448 | ||
449 | /* Remove the exit edges. */ | |
450 | for (i = 0; i < n_remove_edges; i++) | |
451 | remove_path (loops, remove_edges[i]); | |
452 | free (remove_edges); | |
453 | } | |
a29c7ea6 | 454 | |
50654f6c ZD |
455 | ei = desc->in_edge; |
456 | free_simple_loop_desc (loop); | |
457 | ||
35b07080 ZD |
458 | /* Now remove the unreachable part of the last iteration and cancel |
459 | the loop. */ | |
50654f6c | 460 | remove_path (loops, ei); |
a29c7ea6 ZD |
461 | |
462 | if (rtl_dump_file) | |
463 | fprintf (rtl_dump_file, ";; Peeled loop completely, %d times\n", (int) npeel); | |
464 | } | |
465 | ||
466 | /* Decide whether to unroll LOOP iterating constant number of times and how much. */ | |
50654f6c | 467 | |
a29c7ea6 | 468 | static void |
d47cc544 | 469 | decide_unroll_constant_iterations (struct loop *loop, int flags) |
a29c7ea6 | 470 | { |
50654f6c ZD |
471 | unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i; |
472 | struct niter_desc *desc; | |
a29c7ea6 ZD |
473 | |
474 | if (!(flags & UAP_UNROLL)) | |
475 | { | |
476 | /* We were not asked to, just return back silently. */ | |
477 | return; | |
478 | } | |
479 | ||
480 | if (rtl_dump_file) | |
50654f6c ZD |
481 | fprintf (rtl_dump_file, |
482 | "\n;; Considering unrolling loop with constant number of iterations\n"); | |
a29c7ea6 ZD |
483 | |
484 | /* nunroll = total number of copies of the original loop body in | |
485 | unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */ | |
486 | nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns; | |
487 | nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns; | |
488 | if (nunroll > nunroll_by_av) | |
489 | nunroll = nunroll_by_av; | |
490 | if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES)) | |
491 | nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES); | |
492 | ||
493 | /* Skip big loops. */ | |
494 | if (nunroll <= 1) | |
495 | { | |
496 | if (rtl_dump_file) | |
497 | fprintf (rtl_dump_file, ";; Not considering loop, is too big\n"); | |
498 | return; | |
499 | } | |
500 | ||
501 | /* Check for simple loops. */ | |
50654f6c | 502 | desc = get_simple_loop_desc (loop); |
a29c7ea6 ZD |
503 | |
504 | /* Check number of iterations. */ | |
50654f6c | 505 | if (!desc->simple_p || !desc->const_iter || desc->assumptions) |
a29c7ea6 ZD |
506 | { |
507 | if (rtl_dump_file) | |
508 | fprintf (rtl_dump_file, ";; Unable to prove that the loop iterates constant times\n"); | |
509 | return; | |
510 | } | |
511 | ||
512 | /* Check whether the loop rolls enough to consider. */ | |
50654f6c | 513 | if (desc->niter < 2 * nunroll) |
a29c7ea6 ZD |
514 | { |
515 | if (rtl_dump_file) | |
516 | fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n"); | |
517 | return; | |
518 | } | |
519 | ||
520 | /* Success; now compute number of iterations to unroll. We alter | |
521 | nunroll so that as few as possible copies of loop body are | |
e0bb17a8 | 522 | necessary, while still not decreasing the number of unrollings |
a29c7ea6 ZD |
523 | too much (at most by 1). */ |
524 | best_copies = 2 * nunroll + 10; | |
525 | ||
526 | i = 2 * nunroll + 2; | |
50654f6c ZD |
527 | if (i - 1 >= desc->niter) |
528 | i = desc->niter - 2; | |
a29c7ea6 ZD |
529 | |
530 | for (; i >= nunroll - 1; i--) | |
531 | { | |
50654f6c | 532 | unsigned exit_mod = desc->niter % (i + 1); |
a29c7ea6 | 533 | |
50654f6c | 534 | if (!loop_exit_at_end_p (loop)) |
a29c7ea6 | 535 | n_copies = exit_mod + i + 1; |
50654f6c ZD |
536 | else if (exit_mod != (unsigned) i |
537 | || desc->noloop_assumptions != NULL_RTX) | |
a29c7ea6 ZD |
538 | n_copies = exit_mod + i + 2; |
539 | else | |
540 | n_copies = i + 1; | |
541 | ||
542 | if (n_copies < best_copies) | |
543 | { | |
544 | best_copies = n_copies; | |
545 | best_unroll = i; | |
546 | } | |
547 | } | |
548 | ||
549 | if (rtl_dump_file) | |
550 | fprintf (rtl_dump_file, ";; max_unroll %d (%d copies, initial %d).\n", | |
551 | best_unroll + 1, best_copies, nunroll); | |
552 | ||
553 | loop->lpt_decision.decision = LPT_UNROLL_CONSTANT; | |
554 | loop->lpt_decision.times = best_unroll; | |
50654f6c ZD |
555 | |
556 | if (rtl_dump_file) | |
557 | fprintf (rtl_dump_file, | |
558 | ";; Decided to unroll the constant times rolling loop, %d times.\n", | |
559 | loop->lpt_decision.times); | |
a29c7ea6 ZD |
560 | } |
561 | ||
562 | /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES + 1 | |
0c20a65f AJ |
563 | times. The transformation does this: |
564 | ||
a29c7ea6 ZD |
565 | for (i = 0; i < 102; i++) |
566 | body; | |
0c20a65f | 567 | |
a29c7ea6 | 568 | ==> |
0c20a65f | 569 | |
a29c7ea6 ZD |
570 | i = 0; |
571 | body; i++; | |
572 | body; i++; | |
573 | while (i < 102) | |
574 | { | |
575 | body; i++; | |
576 | body; i++; | |
577 | body; i++; | |
578 | body; i++; | |
579 | } | |
580 | */ | |
581 | static void | |
0c20a65f | 582 | unroll_loop_constant_iterations (struct loops *loops, struct loop *loop) |
a29c7ea6 ZD |
583 | { |
584 | unsigned HOST_WIDE_INT niter; | |
585 | unsigned exit_mod; | |
586 | sbitmap wont_exit; | |
587 | unsigned n_remove_edges, i; | |
588 | edge *remove_edges; | |
589 | unsigned max_unroll = loop->lpt_decision.times; | |
50654f6c ZD |
590 | struct niter_desc *desc = get_simple_loop_desc (loop); |
591 | bool exit_at_end = loop_exit_at_end_p (loop); | |
a29c7ea6 ZD |
592 | |
593 | niter = desc->niter; | |
594 | ||
50654f6c | 595 | if (niter <= max_unroll + 1) |
a29c7ea6 ZD |
596 | abort (); /* Should not get here (such loop should be peeled instead). */ |
597 | ||
598 | exit_mod = niter % (max_unroll + 1); | |
599 | ||
600 | wont_exit = sbitmap_alloc (max_unroll + 1); | |
601 | sbitmap_ones (wont_exit); | |
602 | ||
603 | remove_edges = xcalloc (max_unroll + exit_mod + 1, sizeof (edge)); | |
604 | n_remove_edges = 0; | |
605 | ||
50654f6c | 606 | if (!exit_at_end) |
a29c7ea6 | 607 | { |
50654f6c | 608 | /* The exit is not at the end of the loop; leave exit test |
a29c7ea6 ZD |
609 | in the first copy, so that the loops that start with test |
610 | of exit condition have continuous body after unrolling. */ | |
611 | ||
612 | if (rtl_dump_file) | |
613 | fprintf (rtl_dump_file, ";; Condition on beginning of loop.\n"); | |
614 | ||
615 | /* Peel exit_mod iterations. */ | |
616 | RESET_BIT (wont_exit, 0); | |
50654f6c | 617 | if (desc->noloop_assumptions) |
a29c7ea6 ZD |
618 | RESET_BIT (wont_exit, 1); |
619 | ||
50654f6c ZD |
620 | if (exit_mod) |
621 | { | |
622 | if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), | |
623 | loops, exit_mod, | |
624 | wont_exit, desc->out_edge, | |
625 | remove_edges, &n_remove_edges, | |
626 | DLTHE_FLAG_UPDATE_FREQ)) | |
627 | abort (); | |
628 | ||
629 | desc->noloop_assumptions = NULL_RTX; | |
630 | desc->niter -= exit_mod; | |
631 | desc->niter_max -= exit_mod; | |
632 | } | |
a29c7ea6 ZD |
633 | |
634 | SET_BIT (wont_exit, 1); | |
635 | } | |
636 | else | |
637 | { | |
638 | /* Leave exit test in last copy, for the same reason as above if | |
639 | the loop tests the condition at the end of loop body. */ | |
640 | ||
641 | if (rtl_dump_file) | |
642 | fprintf (rtl_dump_file, ";; Condition on end of loop.\n"); | |
643 | ||
644 | /* We know that niter >= max_unroll + 2; so we do not need to care of | |
645 | case when we would exit before reaching the loop. So just peel | |
50654f6c ZD |
646 | exit_mod + 1 iterations. */ |
647 | if (exit_mod != max_unroll | |
648 | || desc->noloop_assumptions) | |
a29c7ea6 ZD |
649 | { |
650 | RESET_BIT (wont_exit, 0); | |
50654f6c | 651 | if (desc->noloop_assumptions) |
a29c7ea6 ZD |
652 | RESET_BIT (wont_exit, 1); |
653 | ||
654 | if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), | |
655 | loops, exit_mod + 1, | |
656 | wont_exit, desc->out_edge, remove_edges, &n_remove_edges, | |
657 | DLTHE_FLAG_UPDATE_FREQ)) | |
658 | abort (); | |
659 | ||
50654f6c ZD |
660 | desc->niter -= exit_mod + 1; |
661 | desc->niter_max -= exit_mod + 1; | |
662 | desc->noloop_assumptions = NULL_RTX; | |
663 | ||
a29c7ea6 ZD |
664 | SET_BIT (wont_exit, 0); |
665 | SET_BIT (wont_exit, 1); | |
666 | } | |
667 | ||
668 | RESET_BIT (wont_exit, max_unroll); | |
669 | } | |
670 | ||
671 | /* Now unroll the loop. */ | |
672 | if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop), | |
673 | loops, max_unroll, | |
674 | wont_exit, desc->out_edge, remove_edges, &n_remove_edges, | |
675 | DLTHE_FLAG_UPDATE_FREQ)) | |
676 | abort (); | |
677 | ||
678 | free (wont_exit); | |
679 | ||
50654f6c ZD |
680 | if (exit_at_end) |
681 | { | |
682 | basic_block exit_block = desc->in_edge->src->rbi->copy; | |
683 | /* Find a new in and out edge; they are in the last copy we have made. */ | |
684 | ||
685 | if (exit_block->succ->dest == desc->out_edge->dest) | |
686 | { | |
687 | desc->out_edge = exit_block->succ; | |
688 | desc->in_edge = exit_block->succ->succ_next; | |
689 | } | |
690 | else | |
691 | { | |
692 | desc->out_edge = exit_block->succ->succ_next; | |
693 | desc->in_edge = exit_block->succ; | |
694 | } | |
695 | } | |
696 | ||
697 | desc->niter /= max_unroll + 1; | |
698 | desc->niter_max /= max_unroll + 1; | |
699 | desc->niter_expr = GEN_INT (desc->niter); | |
700 | ||
a29c7ea6 ZD |
701 | /* Remove the edges. */ |
702 | for (i = 0; i < n_remove_edges; i++) | |
703 | remove_path (loops, remove_edges[i]); | |
704 | free (remove_edges); | |
705 | ||
706 | if (rtl_dump_file) | |
707 | fprintf (rtl_dump_file, ";; Unrolled loop %d times, constant # of iterations %i insns\n",max_unroll, num_loop_insns (loop)); | |
708 | } | |
709 | ||
710 | /* Decide whether to unroll LOOP iterating runtime computable number of times | |
711 | and how much. */ | |
712 | static void | |
d47cc544 | 713 | decide_unroll_runtime_iterations (struct loop *loop, int flags) |
a29c7ea6 ZD |
714 | { |
715 | unsigned nunroll, nunroll_by_av, i; | |
50654f6c | 716 | struct niter_desc *desc; |
a29c7ea6 ZD |
717 | |
718 | if (!(flags & UAP_UNROLL)) | |
719 | { | |
720 | /* We were not asked to, just return back silently. */ | |
721 | return; | |
722 | } | |
723 | ||
724 | if (rtl_dump_file) | |
50654f6c ZD |
725 | fprintf (rtl_dump_file, |
726 | "\n;; Considering unrolling loop with runtime computable number of iterations\n"); | |
a29c7ea6 ZD |
727 | |
728 | /* nunroll = total number of copies of the original loop body in | |
729 | unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */ | |
730 | nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns; | |
731 | nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns; | |
732 | if (nunroll > nunroll_by_av) | |
733 | nunroll = nunroll_by_av; | |
734 | if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES)) | |
735 | nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES); | |
736 | ||
737 | /* Skip big loops. */ | |
738 | if (nunroll <= 1) | |
739 | { | |
740 | if (rtl_dump_file) | |
741 | fprintf (rtl_dump_file, ";; Not considering loop, is too big\n"); | |
742 | return; | |
743 | } | |
744 | ||
745 | /* Check for simple loops. */ | |
50654f6c | 746 | desc = get_simple_loop_desc (loop); |
a29c7ea6 ZD |
747 | |
748 | /* Check simpleness. */ | |
50654f6c | 749 | if (!desc->simple_p || desc->assumptions) |
a29c7ea6 ZD |
750 | { |
751 | if (rtl_dump_file) | |
50654f6c ZD |
752 | fprintf (rtl_dump_file, |
753 | ";; Unable to prove that the number of iterations can be counted in runtime\n"); | |
a29c7ea6 ZD |
754 | return; |
755 | } | |
756 | ||
50654f6c | 757 | if (desc->const_iter) |
a29c7ea6 ZD |
758 | { |
759 | if (rtl_dump_file) | |
760 | fprintf (rtl_dump_file, ";; Loop iterates constant times\n"); | |
761 | return; | |
762 | } | |
763 | ||
764 | /* If we have profile feedback, check whether the loop rolls. */ | |
765 | if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll) | |
766 | { | |
767 | if (rtl_dump_file) | |
768 | fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n"); | |
769 | return; | |
770 | } | |
771 | ||
772 | /* Success; now force nunroll to be power of 2, as we are unable to | |
773 | cope with overflows in computation of number of iterations. */ | |
50654f6c ZD |
774 | for (i = 1; 2 * i <= nunroll; i *= 2) |
775 | continue; | |
a29c7ea6 ZD |
776 | |
777 | loop->lpt_decision.decision = LPT_UNROLL_RUNTIME; | |
778 | loop->lpt_decision.times = i - 1; | |
50654f6c ZD |
779 | |
780 | if (rtl_dump_file) | |
781 | fprintf (rtl_dump_file, | |
782 | ";; Decided to unroll the runtime computable times rolling loop, %d times.\n", | |
783 | loop->lpt_decision.times); | |
a29c7ea6 ZD |
784 | } |
785 | ||
786 | /* Unroll LOOP for that we are able to count number of iterations in runtime | |
787 | LOOP->LPT_DECISION.TIMES + 1 times. The transformation does this (with some | |
788 | extra care for case n < 0): | |
0c20a65f | 789 | |
a29c7ea6 ZD |
790 | for (i = 0; i < n; i++) |
791 | body; | |
0c20a65f | 792 | |
a29c7ea6 | 793 | ==> |
0c20a65f | 794 | |
a29c7ea6 ZD |
795 | i = 0; |
796 | mod = n % 4; | |
0c20a65f | 797 | |
a29c7ea6 ZD |
798 | switch (mod) |
799 | { | |
800 | case 3: | |
801 | body; i++; | |
802 | case 2: | |
803 | body; i++; | |
804 | case 1: | |
805 | body; i++; | |
806 | case 0: ; | |
807 | } | |
0c20a65f | 808 | |
a29c7ea6 ZD |
809 | while (i < n) |
810 | { | |
811 | body; i++; | |
812 | body; i++; | |
813 | body; i++; | |
814 | body; i++; | |
815 | } | |
816 | */ | |
817 | static void | |
0c20a65f | 818 | unroll_loop_runtime_iterations (struct loops *loops, struct loop *loop) |
a29c7ea6 | 819 | { |
50654f6c | 820 | rtx old_niter, niter, init_code, branch_code, tmp; |
a29c7ea6 ZD |
821 | unsigned i, j, p; |
822 | basic_block preheader, *body, *dom_bbs, swtch, ezc_swtch; | |
823 | unsigned n_dom_bbs; | |
824 | sbitmap wont_exit; | |
825 | int may_exit_copy; | |
826 | unsigned n_peel, n_remove_edges; | |
827 | edge *remove_edges, e; | |
828 | bool extra_zero_check, last_may_exit; | |
829 | unsigned max_unroll = loop->lpt_decision.times; | |
50654f6c ZD |
830 | struct niter_desc *desc = get_simple_loop_desc (loop); |
831 | bool exit_at_end = loop_exit_at_end_p (loop); | |
a29c7ea6 ZD |
832 | |
833 | /* Remember blocks whose dominators will have to be updated. */ | |
834 | dom_bbs = xcalloc (n_basic_blocks, sizeof (basic_block)); | |
835 | n_dom_bbs = 0; | |
836 | ||
837 | body = get_loop_body (loop); | |
838 | for (i = 0; i < loop->num_nodes; i++) | |
839 | { | |
840 | unsigned nldom; | |
841 | basic_block *ldom; | |
842 | ||
d47cc544 | 843 | nldom = get_dominated_by (CDI_DOMINATORS, body[i], &ldom); |
a29c7ea6 ZD |
844 | for (j = 0; j < nldom; j++) |
845 | if (!flow_bb_inside_loop_p (loop, ldom[j])) | |
846 | dom_bbs[n_dom_bbs++] = ldom[j]; | |
847 | ||
848 | free (ldom); | |
849 | } | |
850 | free (body); | |
851 | ||
50654f6c | 852 | if (!exit_at_end) |
a29c7ea6 ZD |
853 | { |
854 | /* Leave exit in first copy (for explanation why see comment in | |
855 | unroll_loop_constant_iterations). */ | |
856 | may_exit_copy = 0; | |
857 | n_peel = max_unroll - 1; | |
858 | extra_zero_check = true; | |
859 | last_may_exit = false; | |
860 | } | |
861 | else | |
862 | { | |
863 | /* Leave exit in last copy (for explanation why see comment in | |
864 | unroll_loop_constant_iterations). */ | |
865 | may_exit_copy = max_unroll; | |
866 | n_peel = max_unroll; | |
867 | extra_zero_check = false; | |
868 | last_may_exit = true; | |
869 | } | |
870 | ||
871 | /* Get expression for number of iterations. */ | |
872 | start_sequence (); | |
50654f6c ZD |
873 | old_niter = niter = gen_reg_rtx (desc->mode); |
874 | tmp = force_operand (copy_rtx (desc->niter_expr), niter); | |
875 | if (tmp != niter) | |
876 | emit_move_insn (niter, tmp); | |
a29c7ea6 ZD |
877 | |
878 | /* Count modulo by ANDing it with max_unroll; we use the fact that | |
879 | the number of unrollings is a power of two, and thus this is correct | |
880 | even if there is overflow in the computation. */ | |
50654f6c | 881 | niter = expand_simple_binop (desc->mode, AND, |
a29c7ea6 ZD |
882 | niter, |
883 | GEN_INT (max_unroll), | |
884 | NULL_RTX, 0, OPTAB_LIB_WIDEN); | |
885 | ||
886 | init_code = get_insns (); | |
887 | end_sequence (); | |
888 | ||
889 | /* Precondition the loop. */ | |
d47cc544 | 890 | loop_split_edge_with (loop_preheader_edge (loop), init_code); |
a29c7ea6 ZD |
891 | |
892 | remove_edges = xcalloc (max_unroll + n_peel + 1, sizeof (edge)); | |
893 | n_remove_edges = 0; | |
894 | ||
895 | wont_exit = sbitmap_alloc (max_unroll + 2); | |
896 | ||
897 | /* Peel the first copy of loop body (almost always we must leave exit test | |
898 | here; the only exception is when we have extra zero check and the number | |
50654f6c ZD |
899 | of iterations is reliable. Also record the place of (possible) extra |
900 | zero check. */ | |
a29c7ea6 | 901 | sbitmap_zero (wont_exit); |
50654f6c ZD |
902 | if (extra_zero_check |
903 | && !desc->noloop_assumptions) | |
a29c7ea6 ZD |
904 | SET_BIT (wont_exit, 1); |
905 | ezc_swtch = loop_preheader_edge (loop)->src; | |
906 | if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), | |
907 | loops, 1, | |
908 | wont_exit, desc->out_edge, remove_edges, &n_remove_edges, | |
909 | DLTHE_FLAG_UPDATE_FREQ)) | |
910 | abort (); | |
911 | ||
912 | /* Record the place where switch will be built for preconditioning. */ | |
913 | swtch = loop_split_edge_with (loop_preheader_edge (loop), | |
d47cc544 | 914 | NULL_RTX); |
a29c7ea6 ZD |
915 | |
916 | for (i = 0; i < n_peel; i++) | |
917 | { | |
918 | /* Peel the copy. */ | |
919 | sbitmap_zero (wont_exit); | |
920 | if (i != n_peel - 1 || !last_may_exit) | |
921 | SET_BIT (wont_exit, 1); | |
922 | if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), | |
923 | loops, 1, | |
924 | wont_exit, desc->out_edge, remove_edges, &n_remove_edges, | |
925 | DLTHE_FLAG_UPDATE_FREQ)) | |
0c20a65f | 926 | abort (); |
a29c7ea6 | 927 | |
91f4cfe3 ZD |
928 | /* Create item for switch. */ |
929 | j = n_peel - i - (extra_zero_check ? 0 : 1); | |
930 | p = REG_BR_PROB_BASE / (i + 2); | |
931 | ||
d47cc544 | 932 | preheader = loop_split_edge_with (loop_preheader_edge (loop), NULL_RTX); |
50654f6c ZD |
933 | branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ, |
934 | block_label (preheader), p, NULL_RTX); | |
91f4cfe3 | 935 | |
d47cc544 SB |
936 | swtch = loop_split_edge_with (swtch->pred, branch_code); |
937 | set_immediate_dominator (CDI_DOMINATORS, preheader, swtch); | |
91f4cfe3 ZD |
938 | swtch->succ->probability = REG_BR_PROB_BASE - p; |
939 | e = make_edge (swtch, preheader, | |
940 | swtch->succ->flags & EDGE_IRREDUCIBLE_LOOP); | |
941 | e->probability = p; | |
a29c7ea6 ZD |
942 | } |
943 | ||
944 | if (extra_zero_check) | |
945 | { | |
946 | /* Add branch for zero iterations. */ | |
947 | p = REG_BR_PROB_BASE / (max_unroll + 1); | |
948 | swtch = ezc_swtch; | |
d47cc544 | 949 | preheader = loop_split_edge_with (loop_preheader_edge (loop), NULL_RTX); |
50654f6c ZD |
950 | branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ, |
951 | block_label (preheader), p, NULL_RTX); | |
a29c7ea6 | 952 | |
d47cc544 SB |
953 | swtch = loop_split_edge_with (swtch->succ, branch_code); |
954 | set_immediate_dominator (CDI_DOMINATORS, preheader, swtch); | |
a29c7ea6 | 955 | swtch->succ->probability = REG_BR_PROB_BASE - p; |
72b8d451 ZD |
956 | e = make_edge (swtch, preheader, |
957 | swtch->succ->flags & EDGE_IRREDUCIBLE_LOOP); | |
a29c7ea6 ZD |
958 | e->probability = p; |
959 | } | |
960 | ||
961 | /* Recount dominators for outer blocks. */ | |
d47cc544 | 962 | iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, n_dom_bbs); |
a29c7ea6 ZD |
963 | |
964 | /* And unroll loop. */ | |
965 | ||
966 | sbitmap_ones (wont_exit); | |
967 | RESET_BIT (wont_exit, may_exit_copy); | |
968 | ||
969 | if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop), | |
0c20a65f | 970 | loops, max_unroll, |
a29c7ea6 ZD |
971 | wont_exit, desc->out_edge, remove_edges, &n_remove_edges, |
972 | DLTHE_FLAG_UPDATE_FREQ)) | |
973 | abort (); | |
974 | ||
975 | free (wont_exit); | |
976 | ||
50654f6c ZD |
977 | if (exit_at_end) |
978 | { | |
979 | basic_block exit_block = desc->in_edge->src->rbi->copy; | |
980 | /* Find a new in and out edge; they are in the last copy we have made. */ | |
981 | ||
982 | if (exit_block->succ->dest == desc->out_edge->dest) | |
983 | { | |
984 | desc->out_edge = exit_block->succ; | |
985 | desc->in_edge = exit_block->succ->succ_next; | |
986 | } | |
987 | else | |
988 | { | |
989 | desc->out_edge = exit_block->succ->succ_next; | |
990 | desc->in_edge = exit_block->succ; | |
991 | } | |
992 | } | |
993 | ||
a29c7ea6 ZD |
994 | /* Remove the edges. */ |
995 | for (i = 0; i < n_remove_edges; i++) | |
996 | remove_path (loops, remove_edges[i]); | |
997 | free (remove_edges); | |
998 | ||
50654f6c ZD |
999 | /* We must be careful when updating the number of iterations due to |
1000 | preconditioning and the fact that the value must be valid at entry | |
1001 | of the loop. After passing through the above code, we see that | |
1002 | the correct new number of iterations is this: */ | |
1003 | if (desc->const_iter) | |
1004 | abort (); | |
1005 | desc->niter_expr = | |
1006 | simplify_gen_binary (UDIV, desc->mode, old_niter, GEN_INT (max_unroll + 1)); | |
1007 | desc->niter_max /= max_unroll + 1; | |
1008 | if (exit_at_end) | |
1009 | { | |
1010 | desc->niter_expr = | |
1011 | simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx); | |
1012 | desc->noloop_assumptions = NULL_RTX; | |
1013 | desc->niter_max--; | |
1014 | } | |
1015 | ||
a29c7ea6 ZD |
1016 | if (rtl_dump_file) |
1017 | fprintf (rtl_dump_file, | |
1018 | ";; Unrolled loop %d times, counting # of iterations in runtime, %i insns\n", | |
1019 | max_unroll, num_loop_insns (loop)); | |
1020 | } | |
0c20a65f | 1021 | |
a29c7ea6 ZD |
1022 | /* Decide whether to simply peel LOOP and how much. */ |
1023 | static void | |
d47cc544 | 1024 | decide_peel_simple (struct loop *loop, int flags) |
a29c7ea6 ZD |
1025 | { |
1026 | unsigned npeel; | |
50654f6c | 1027 | struct niter_desc *desc; |
a29c7ea6 ZD |
1028 | |
1029 | if (!(flags & UAP_PEEL)) | |
1030 | { | |
1031 | /* We were not asked to, just return back silently. */ | |
1032 | return; | |
1033 | } | |
1034 | ||
1035 | if (rtl_dump_file) | |
50654f6c | 1036 | fprintf (rtl_dump_file, "\n;; Considering simply peeling loop\n"); |
a29c7ea6 | 1037 | |
3dc575ff | 1038 | /* npeel = number of iterations to peel. */ |
a29c7ea6 ZD |
1039 | npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns; |
1040 | if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES)) | |
1041 | npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES); | |
1042 | ||
1043 | /* Skip big loops. */ | |
1044 | if (!npeel) | |
1045 | { | |
1046 | if (rtl_dump_file) | |
1047 | fprintf (rtl_dump_file, ";; Not considering loop, is too big\n"); | |
1048 | return; | |
1049 | } | |
1050 | ||
1051 | /* Check for simple loops. */ | |
50654f6c | 1052 | desc = get_simple_loop_desc (loop); |
a29c7ea6 ZD |
1053 | |
1054 | /* Check number of iterations. */ | |
50654f6c | 1055 | if (desc->simple_p && !desc->assumptions && desc->const_iter) |
a29c7ea6 ZD |
1056 | { |
1057 | if (rtl_dump_file) | |
1058 | fprintf (rtl_dump_file, ";; Loop iterates constant times\n"); | |
1059 | return; | |
1060 | } | |
1061 | ||
1062 | /* Do not simply peel loops with branches inside -- it increases number | |
1063 | of mispredicts. */ | |
50654f6c | 1064 | if (num_loop_branches (loop) > 1) |
a29c7ea6 ZD |
1065 | { |
1066 | if (rtl_dump_file) | |
1067 | fprintf (rtl_dump_file, ";; Not peeling, contains branches\n"); | |
1068 | return; | |
1069 | } | |
1070 | ||
1071 | if (loop->header->count) | |
1072 | { | |
1073 | unsigned niter = expected_loop_iterations (loop); | |
1074 | if (niter + 1 > npeel) | |
1075 | { | |
1076 | if (rtl_dump_file) | |
1077 | { | |
1078 | fprintf (rtl_dump_file, ";; Not peeling loop, rolls too much ("); | |
1079 | fprintf (rtl_dump_file, HOST_WIDEST_INT_PRINT_DEC, (HOST_WIDEST_INT) (niter + 1)); | |
1080 | fprintf (rtl_dump_file, " iterations > %d [maximum peelings])\n", npeel); | |
1081 | } | |
1082 | return; | |
1083 | } | |
1084 | npeel = niter + 1; | |
1085 | } | |
1086 | else | |
1087 | { | |
1088 | /* For now we have no good heuristics to decide whether loop peeling | |
1089 | will be effective, so disable it. */ | |
1090 | if (rtl_dump_file) | |
1091 | fprintf (rtl_dump_file, | |
1092 | ";; Not peeling loop, no evidence it will be profitable\n"); | |
1093 | return; | |
1094 | } | |
1095 | ||
1096 | /* Success. */ | |
1097 | loop->lpt_decision.decision = LPT_PEEL_SIMPLE; | |
1098 | loop->lpt_decision.times = npeel; | |
50654f6c ZD |
1099 | |
1100 | if (rtl_dump_file) | |
1101 | fprintf (rtl_dump_file, ";; Decided to simply peel the loop, %d times.\n", | |
1102 | loop->lpt_decision.times); | |
a29c7ea6 ZD |
1103 | } |
1104 | ||
1105 | /* Peel a LOOP LOOP->LPT_DECISION.TIMES times. The transformation: | |
1106 | while (cond) | |
1107 | body; | |
1108 | ||
1109 | ==> | |
1110 | ||
1111 | if (!cond) goto end; | |
1112 | body; | |
1113 | if (!cond) goto end; | |
1114 | body; | |
1115 | while (cond) | |
1116 | body; | |
1117 | end: ; | |
1118 | */ | |
1119 | static void | |
0c20a65f | 1120 | peel_loop_simple (struct loops *loops, struct loop *loop) |
a29c7ea6 ZD |
1121 | { |
1122 | sbitmap wont_exit; | |
1123 | unsigned npeel = loop->lpt_decision.times; | |
50654f6c | 1124 | struct niter_desc *desc = get_simple_loop_desc (loop); |
a29c7ea6 ZD |
1125 | |
1126 | wont_exit = sbitmap_alloc (npeel + 1); | |
1127 | sbitmap_zero (wont_exit); | |
1128 | ||
1129 | if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), | |
1130 | loops, npeel, wont_exit, NULL, NULL, NULL, | |
1131 | DLTHE_FLAG_UPDATE_FREQ)) | |
1132 | abort (); | |
0c20a65f | 1133 | |
a29c7ea6 ZD |
1134 | free (wont_exit); |
1135 | ||
50654f6c ZD |
1136 | if (desc->simple_p) |
1137 | { | |
1138 | if (desc->const_iter) | |
1139 | { | |
1140 | desc->niter -= npeel; | |
1141 | desc->niter_expr = GEN_INT (desc->niter); | |
1142 | desc->noloop_assumptions = NULL_RTX; | |
1143 | } | |
1144 | else | |
1145 | { | |
1146 | /* We cannot just update niter_expr, as its value might be clobbered | |
1147 | inside loop. We could handle this by counting the number into | |
1148 | temporary just like we do in runtime unrolling, but it does not | |
1149 | seem worthwhile. */ | |
1150 | free_simple_loop_desc (loop); | |
1151 | } | |
1152 | } | |
a29c7ea6 ZD |
1153 | if (rtl_dump_file) |
1154 | fprintf (rtl_dump_file, ";; Peeling loop %d times\n", npeel); | |
1155 | } | |
1156 | ||
1157 | /* Decide whether to unroll LOOP stupidly and how much. */ | |
1158 | static void | |
d47cc544 | 1159 | decide_unroll_stupid (struct loop *loop, int flags) |
a29c7ea6 ZD |
1160 | { |
1161 | unsigned nunroll, nunroll_by_av, i; | |
50654f6c | 1162 | struct niter_desc *desc; |
a29c7ea6 ZD |
1163 | |
1164 | if (!(flags & UAP_UNROLL_ALL)) | |
1165 | { | |
1166 | /* We were not asked to, just return back silently. */ | |
1167 | return; | |
1168 | } | |
1169 | ||
1170 | if (rtl_dump_file) | |
50654f6c | 1171 | fprintf (rtl_dump_file, "\n;; Considering unrolling loop stupidly\n"); |
a29c7ea6 ZD |
1172 | |
1173 | /* nunroll = total number of copies of the original loop body in | |
1174 | unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */ | |
1175 | nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns; | |
1176 | nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns; | |
1177 | if (nunroll > nunroll_by_av) | |
1178 | nunroll = nunroll_by_av; | |
1179 | if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES)) | |
1180 | nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES); | |
1181 | ||
1182 | /* Skip big loops. */ | |
1183 | if (nunroll <= 1) | |
1184 | { | |
1185 | if (rtl_dump_file) | |
1186 | fprintf (rtl_dump_file, ";; Not considering loop, is too big\n"); | |
1187 | return; | |
1188 | } | |
1189 | ||
1190 | /* Check for simple loops. */ | |
50654f6c | 1191 | desc = get_simple_loop_desc (loop); |
a29c7ea6 ZD |
1192 | |
1193 | /* Check simpleness. */ | |
50654f6c | 1194 | if (desc->simple_p && !desc->assumptions) |
a29c7ea6 ZD |
1195 | { |
1196 | if (rtl_dump_file) | |
1197 | fprintf (rtl_dump_file, ";; The loop is simple\n"); | |
1198 | return; | |
1199 | } | |
1200 | ||
1201 | /* Do not unroll loops with branches inside -- it increases number | |
1202 | of mispredicts. */ | |
50654f6c | 1203 | if (num_loop_branches (loop) > 1) |
a29c7ea6 ZD |
1204 | { |
1205 | if (rtl_dump_file) | |
1206 | fprintf (rtl_dump_file, ";; Not unrolling, contains branches\n"); | |
1207 | return; | |
1208 | } | |
1209 | ||
1210 | /* If we have profile feedback, check whether the loop rolls. */ | |
50654f6c ZD |
1211 | if (loop->header->count |
1212 | && expected_loop_iterations (loop) < 2 * nunroll) | |
a29c7ea6 ZD |
1213 | { |
1214 | if (rtl_dump_file) | |
1215 | fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n"); | |
1216 | return; | |
1217 | } | |
1218 | ||
1219 | /* Success. Now force nunroll to be power of 2, as it seems that this | |
e0bb17a8 | 1220 | improves results (partially because of better alignments, partially |
a29c7ea6 | 1221 | because of some dark magic). */ |
50654f6c ZD |
1222 | for (i = 1; 2 * i <= nunroll; i *= 2) |
1223 | continue; | |
a29c7ea6 ZD |
1224 | |
1225 | loop->lpt_decision.decision = LPT_UNROLL_STUPID; | |
1226 | loop->lpt_decision.times = i - 1; | |
50654f6c ZD |
1227 | |
1228 | if (rtl_dump_file) | |
1229 | fprintf (rtl_dump_file, | |
1230 | ";; Decided to unroll the loop stupidly, %d times.\n", | |
1231 | loop->lpt_decision.times); | |
a29c7ea6 ZD |
1232 | } |
1233 | ||
1234 | /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times. The transformation: | |
1235 | while (cond) | |
1236 | body; | |
1237 | ||
1238 | ==> | |
1239 | ||
1240 | while (cond) | |
1241 | { | |
1242 | body; | |
1243 | if (!cond) break; | |
1244 | body; | |
1245 | if (!cond) break; | |
1246 | body; | |
1247 | if (!cond) break; | |
1248 | body; | |
1249 | } | |
1250 | */ | |
1251 | static void | |
0c20a65f | 1252 | unroll_loop_stupid (struct loops *loops, struct loop *loop) |
a29c7ea6 ZD |
1253 | { |
1254 | sbitmap wont_exit; | |
1255 | unsigned nunroll = loop->lpt_decision.times; | |
50654f6c | 1256 | struct niter_desc *desc = get_simple_loop_desc (loop); |
a29c7ea6 ZD |
1257 | |
1258 | wont_exit = sbitmap_alloc (nunroll + 1); | |
1259 | sbitmap_zero (wont_exit); | |
1260 | ||
1261 | if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop), | |
1262 | loops, nunroll, wont_exit, NULL, NULL, NULL, | |
1263 | DLTHE_FLAG_UPDATE_FREQ)) | |
1264 | abort (); | |
1265 | ||
1266 | free (wont_exit); | |
0c20a65f | 1267 | |
50654f6c ZD |
1268 | if (desc->simple_p) |
1269 | { | |
1270 | /* We indeed may get here provided that there are nontrivial assumptions | |
1271 | for a loop to be really simple. We could update the counts, but the | |
1272 | problem is that we are unable to decide which exit will be taken | |
1273 | (not really true in case the number of iterations is constant, | |
1274 | but noone will do anything with this information, so we do not | |
1275 | worry about it). */ | |
1276 | desc->simple_p = false; | |
1277 | } | |
1278 | ||
a29c7ea6 ZD |
1279 | if (rtl_dump_file) |
1280 | fprintf (rtl_dump_file, ";; Unrolled loop %d times, %i insns\n", | |
1281 | nunroll, num_loop_insns (loop)); | |
1282 | } |