This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH] Improve TRUTH_{AND,OR}IF_EXPR expansion (PR rtl-optimization/78200)
- From: Jakub Jelinek <jakub at redhat dot com>
- To: Richard Biener <rguenther at suse dot de>
- Cc: gcc-patches at gcc dot gnu dot org, Venkataramanan dot Kumar at amd dot com
- Date: Tue, 27 Mar 2018 11:10:23 +0200
- Subject: [PATCH] Improve TRUTH_{AND,OR}IF_EXPR expansion (PR rtl-optimization/78200)
- Reply-to: Jakub Jelinek <jakub at redhat dot com>
Hi!
The following patch attempts to improve expansion when we have code like:
<bb 16> [local count: 102513059]:
if_conversion_var_52 = MEM[base: st1_22, offset: 0B];
if (if_conversion_var_52 < 0)
goto <bb 17>; [41.00%]
else
goto <bb 18>; [59.00%]
...
<bb 18> [local count: 60482706]:
_81 = _11 == 2;
_82 = if_conversion_var_52 > 0;
_79 = _81 & _82;
if (_79 != 0)
goto <bb 19>; [29.26%]
else
goto <bb 20>; [70.74%]
Here, the single pred of the bb performs a similar comparison to the one that
one of the & (or |) operands does, and as & (or |) is not ordered, we can
choose which operand to expand first. If we expand if_conversion_var_52 > 0
first, there is a chance that we can reuse the flags from the previous
comparison. The patch does this only if there are no (non-virtual) PHIs in the
current bb and all stmts before the current condition are TERable, so that
there is nothing that would clobber the flags in between.
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk if it
helps for SPEC? Venkat, do you think you could benchmark it in the setup
where you've measured the slowdown to see if it helps? I see the patch
changes the loop:
.p2align 4,,10
.p2align 3
.L11:
+ jle .L10
cmpl $2, %eax
- jne .L10
- testq %r8, %r8
- jg .L12
+ je .L12
.p2align 4,,10
.p2align 3
.L10:
addq %r9, %rsi
cmpq %rsi, %rdx
jbe .L35
.L13:
movl 24(%rsi), %eax
testl %eax, %eax
jle .L10
movq (%rsi), %r8
testq %r8, %r8
jns .L11
cmpl $1, %eax
jne .L10
.L12:
addq $1, %r10
movl $1, %r11d
movq st5(,%r10,8), %rax
movq %rsi, (%rax)
addq %r9, %rsi
movq %r8, 8(%rax)
movq st5(,%rdi,8), %rax
movq %r8, 16(%rax)
cmpq %rsi, %rdx
ja .L13
which I assume should be an improvement, since it saves one extra
comparison.
2018-03-27 Jakub Jelinek <jakub@redhat.com>
Richard Biener <rguenth@suse.de>
PR rtl-optimization/78200
* cfgexpand.c (gimple_bb_info_for_bb): New variable.
(expand_bb_seq, expand_phi_nodes): New functions.
(expand_gimple_cond): Use heuristics if it is desirable to
swap TRUTH_{AND,OR}IF_EXPR operands.
(expand_gimple_basic_block): Remember GIMPLE il for bbs
being expanded or already expanded.
(pass_expand::execute): Initialize and then free the
gimple_bb_info_for_bb vector.
--- gcc/cfgexpand.c.jj 2018-02-09 06:44:36.413805556 +0100
+++ gcc/cfgexpand.c 2018-03-26 13:35:57.536509844 +0200
@@ -2357,6 +2357,34 @@ label_rtx_for_bb (basic_block bb ATTRIBU
return l;
}
+/* Maps blocks to their GIMPLE IL. */
+static vec<gimple_bb_info, va_heap, vl_embed> *gimple_bb_info_for_bb;
+
+/* Like bb_seq, except that during expansion returns the GIMPLE seq
+ even for the blocks that have been already expanded or are being
+ currently expanded. */
+
+static gimple_seq
+expand_bb_seq (basic_block bb)
+{
+ if ((bb->flags & BB_RTL)
+ && (unsigned) bb->index < vec_safe_length (gimple_bb_info_for_bb))
+ return (*gimple_bb_info_for_bb)[bb->index].seq;
+ return bb_seq (bb);
+}
+
+/* Like phi_nodes, except that during expansion returns the GIMPLE PHIs
+ even for the blocks that have been already expanded or are being
+ currently expanded. */
+
+static gimple_seq
+expand_phi_nodes (basic_block bb)
+{
+ if ((bb->flags & BB_RTL)
+ && (unsigned) bb->index < vec_safe_length (gimple_bb_info_for_bb))
+ return (*gimple_bb_info_for_bb)[bb->index].phi_nodes;
+ return phi_nodes (bb);
+}
/* A subroutine of expand_gimple_cond. Given E, a fallthrough edge
of a basic block where we just expanded the conditional at the end,
@@ -2475,6 +2503,65 @@ expand_gimple_cond (basic_block bb, gcon
op0 = gimple_assign_rhs1 (second);
op1 = gimple_assign_rhs2 (second);
}
+
+ /* We'll expand RTL for op0 first, see if we'd better expand RTL
+ for op1 first. Do that if the previous bb ends with
+ if (x op cst), op1's def_stmt rhs is x op2 cst and there are
+ no non-virtual PHIs nor non-TERed stmts in BB before STMT. */
+ while (TREE_CODE (op1) == SSA_NAME
+ && (code == TRUTH_ANDIF_EXPR || code == TRUTH_ORIF_EXPR)
+ && single_pred_p (bb))
+ {
+ gimple *def1 = SSA_NAME_DEF_STMT (op1);
+ if (!is_gimple_assign (def1)
+ || (TREE_CODE_CLASS (gimple_assign_rhs_code (def1))
+ != tcc_comparison))
+ break;
+
+ basic_block pred = single_pred (bb);
+ gimple_seq pred_seq = expand_bb_seq (pred);
+ gimple_stmt_iterator i = gsi_last (pred_seq);
+ if (!gsi_end_p (i) && is_gimple_debug (gsi_stmt (i)))
+ gsi_prev_nondebug (&i);
+
+ gimple *last = gsi_stmt (i);
+ if (!last
+ || gimple_code (last) != GIMPLE_COND
+ || gimple_assign_rhs1 (def1) != gimple_cond_lhs (last)
+ || !operand_equal_p (gimple_assign_rhs2 (def1),
+ gimple_cond_rhs (last), 0))
+ break;
+
+ gimple_seq phi_seq = expand_phi_nodes (bb);
+ i = gsi_start (phi_seq);
+ if (!gsi_end_p (i)
+ && !virtual_operand_p (gimple_phi_result (gsi_stmt (i))))
+ break;
+
+ /* Check if all stmts before STMT are TERed. */
+ gimple_seq cur_seq = expand_bb_seq (bb);
+ i = gsi_start_nondebug (cur_seq);
+ int cnt = 100;
+ while (!gsi_end_p (i) && gsi_stmt (i) != stmt)
+ {
+ gimple *g = gsi_stmt (i);
+ if (!is_gimple_assign (g))
+ break;
+ if (--cnt == 0)
+ break;
+ tree lhs = gimple_assign_lhs (g);
+ if (TREE_CODE (lhs) != SSA_NAME
+ || !get_gimple_for_ssa_name (lhs))
+ break;
+ gsi_next_nondebug (&i);
+ }
+
+ if (gsi_stmt (i) != stmt)
+ break;
+
+ std::swap (op0, op1);
+ break;
+ }
}
}
}
@@ -5501,6 +5588,10 @@ expand_gimple_basic_block (basic_block b
if (optimize)
reorder_operands (bb);
stmts = bb_seq (bb);
+ if ((unsigned) bb->index >= vec_safe_length (gimple_bb_info_for_bb))
+ vec_safe_grow_cleared (gimple_bb_info_for_bb, bb->index + 1);
+ (*gimple_bb_info_for_bb)[bb->index].seq = stmts;
+ (*gimple_bb_info_for_bb)[bb->index].phi_nodes = phi_nodes (bb);
bb->il.gimple.seq = NULL;
bb->il.gimple.phi_nodes = NULL;
rtl_profile_for_bb (bb);
@@ -6419,6 +6510,8 @@ pass_expand::execute (function *fun)
>= PARAM_VALUE (PARAM_MAX_DEBUG_MARKER_COUNT))
cfun->debug_nonbind_markers = false;
+ vec_safe_grow_cleared (gimple_bb_info_for_bb, n_basic_blocks_for_fn (cfun));
+
lab_rtx_for_bb = new hash_map<basic_block, rtx_code_label *>;
FOR_BB_BETWEEN (bb, init_block->next_bb, EXIT_BLOCK_PTR_FOR_FN (fun),
next_bb)
@@ -6433,6 +6526,8 @@ pass_expand::execute (function *fun)
deep_ter_debug_map = NULL;
}
+ vec_free (gimple_bb_info_for_bb);
+
/* Free stuff we no longer need after GIMPLE optimizations. */
free_dominance_info (CDI_DOMINATORS);
free_dominance_info (CDI_POST_DOMINATORS);
Jakub