[Bug tree-optimization/95855] New: A missing ifcvt optimization to generate fcsel
yangyang305 at huawei dot com
gcc-bugzilla@gcc.gnu.org
Wed Jun 24 02:48:07 GMT 2020
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95855
Bug ID: 95855
Summary: A missing ifcvt optimization to generate fcsel
Product: gcc
Version: 11.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: yangyang305 at huawei dot com
Target Milestone: ---
For the following case,
// Reproducer for the missed if-conversion.
// For each i in [0, num) it keeps, per input array (d1, d2, d3), the larger of
// the current element and the value carried in dmax[], writes the d3 value
// carried in dmax[2] to ip[i], and finally returns the sum of the three
// carried values.
double test(double* d1, double* d2, double* d3, int num, double* ip) {
// NOTE(review): dmax has no initializer, yet every element is read in the
// first iteration before being written, so the initial values are
// indeterminate.
double dmax[3];
for (int i = 0; i < num; i++) {
dmax[0] = d1[i] < dmax[0] ? dmax[0] : d1[i];
dmax[1] = d2[i] < dmax[1] ? dmax[1] : d2[i];
// Same select pattern as the two statements above; this is the only one
// whose result is also stored to memory inside the loop (next line).
dmax[2] = d3[i] < dmax[2] ? dmax[2] : d3[i];
ip[i] = dmax[2];
}
return dmax[0] + dmax[1] + dmax[2];
}
gcc -O3 generates:
movi d0, #0
mov x5, 0
cmp w3, 0
ble .L3
.p2align 3,,7
.L2:
ldr d4, [x0, x5, lsl 3]
ldr d3, [x1, x5, lsl 3]
ldr d0, [x2, x5, lsl 3]
fcmpe d1, d4
fcsel d1, d1, d4, gt
fcmpe d2, d3
fcsel d2, d2, d3, gt
fcmpe d5, d0
bgt .L8
str d0, [x4, x5, lsl 3]
add x5, x5, 1
cmp w3, w5
ble .L3
fmov d5, d0
b .L2
.p2align 2,,3
.L8:
str d5, [x4, x5, lsl 3]
add x5, x5, 1
cmp w3, w5
bgt .L2
fmov d0, d5
.L3:
fadd d1, d1, d2
fadd d0, d1, d0
ret
GCC generates an fcsel instruction for
"dmax[0] = d1[i] < dmax[0] ? dmax[0] : d1[i];" and
"dmax[1] = d2[i] < dmax[1] ? dmax[1] : d2[i];", but does not do so for
"dmax[2] = d3[i] < dmax[2] ? dmax[2] : d3[i];".
pass_split_paths splits the corresponding basic block, so pass_rtl_ifcvt fails
to generate the fcsel instruction. Moreover, I have found that pass_split_paths
already contains some checks to avoid spoiling if-conversion, but the above
case is not covered by them. I plan to add a check to pass_split_paths to fix
this problem and have prepared the following patch:
diff -uprN a/gcc/gimple-ssa-split-paths.c b/gcc/gimple-ssa-split-paths.c
--- a/gcc/gimple-ssa-split-paths.c
+++ b/gcc/gimple-ssa-split-paths.c
@@ -34,6 +34,7 @@ along with GCC; see the file COPYING3. If not see
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
+#include "cfghooks.h"
/* Given LATCH, the latch block in a loop, see if the shape of the
path reaching LATCH is suitable for being split by duplication.
@@ -254,6 +255,44 @@ is_feasible_trace (basic_block bb)
}
}
+ /* Canonicalize the form. */
+ if (single_pred_p (pred1) && single_pred (pred1) == pred2
+ && empty_block_p (pred1))
+ std::swap (pred1, pred2);
+
+ /* This is meant to catch another kind of cases that are likely opportunities
+ for if-conversion. After canonicalizing, PRED2 must be an empty block and
+ PRED1 must be the only predecessor of PRED2. Moreover, PRED1 is supposed
+ to end with a cond_stmt which has the same args with the PHI in BB. */
+ if (single_pred_p (pred2) && single_pred (pred2) == pred1
+ && empty_block_p (pred2))
+ {
+ gimple *cond_stmt = last_stmt (pred1);
+ if (cond_stmt && gimple_code (cond_stmt) == GIMPLE_COND)
+ {
+ tree lhs = gimple_cond_lhs (cond_stmt);
+ tree rhs = gimple_cond_rhs (cond_stmt);
+
+ gimple_stmt_iterator gsi;
+ for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ gimple *phi = gsi_stmt (gsi);
+ if ((gimple_phi_arg_def (phi, 0) == lhs
+ && gimple_phi_arg_def (phi, 1) == rhs)
+ || (gimple_phi_arg_def (phi, 0) == rhs
+ && gimple_phi_arg_def (phi, 1) == lhs))
+ {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file,
+ "Block %d appears to be optimized to a join "
+ "point for if-convertable half-diamond.\n",
+ bb->index);
+ return false;
+ }
+ }
+ }
+ }
+
/* If the joiner has no PHIs with useful uses there is zero chance
of CSE/DCE/jump-threading possibilities exposed by duplicating it. */
bool found_useful_phi = false;
With this patch, gcc -O3 generates:
cmp w3, 0
ble .L2
mov x5, 0
.p2align 3,,7
.L9:
ldr d5, [x0, x5, lsl 3]
ldr d4, [x1, x5, lsl 3]
ldr d3, [x2, x5, lsl 3]
fcmpe d0, d5
fcsel d0, d0, d5, gt
fcmpe d2, d4
fcsel d2, d2, d4, gt
fcmpe d1, d3
fcsel d1, d1, d3, gt
str d1, [x4, x5, lsl 3]
add x5, x5, 1
cmp w3, w5
bgt .L9
.L2:
fadd d0, d0, d2
fadd d0, d0, d1
ret
More information about the Gcc-bugs
mailing list