[Bug tree-optimization/95855] New: A missing ifcvt optimization to generate fcsel

yangyang305 at huawei dot com gcc-bugzilla@gcc.gnu.org
Wed Jun 24 02:48:07 GMT 2020


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95855

            Bug ID: 95855
           Summary: A missing ifcvt optimization to generate fcsel
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: yangyang305 at huawei dot com
  Target Milestone: ---

For the following case,

/* Reproducer for the missed if-conversion.  For each i in [0, num) the loop
   keeps, in dmax[0], dmax[1] and dmax[2], the larger of the previous value
   and d1[i], d2[i] and d3[i] respectively, stores the current dmax[2] into
   ip[i], and finally returns dmax[0] + dmax[1] + dmax[2].  */
double test(double* d1, double* d2, double* d3, int num, double* ip) {
  /* NOTE(review): dmax is read on the first iteration without having been
     initialized; left exactly as in the report so the generated code
     matches the listings.  */
  double dmax[3];
  for (int i = 0; i < num; i++) {
    /* Three conditional selects of identical shape.  */
    dmax[0] = d1[i] < dmax[0] ? dmax[0] : d1[i];
    dmax[1] = d2[i] < dmax[1] ? dmax[1] : d2[i];
    dmax[2] = d3[i] < dmax[2] ? dmax[2] : d3[i];
    /* Only the third selected value is also stored to memory each
       iteration.  */
    ip[i] = dmax[2];
  }
  return dmax[0] + dmax[1] + dmax[2];
}

gcc -O3 generates:

        movi    d0, #0
        mov     x5, 0
        cmp     w3, 0
        ble     .L3
        .p2align 3,,7
.L2:
        ldr     d4, [x0, x5, lsl 3]
        ldr     d3, [x1, x5, lsl 3]
        ldr     d0, [x2, x5, lsl 3]
        fcmpe   d1, d4
        fcsel   d1, d1, d4, gt
        fcmpe   d2, d3
        fcsel   d2, d2, d3, gt
        fcmpe   d5, d0
        bgt     .L8
        str     d0, [x4, x5, lsl 3]
        add     x5, x5, 1
        cmp     w3, w5
        ble     .L3
        fmov    d5, d0
        b       .L2
        .p2align 2,,3
.L8:
        str     d5, [x4, x5, lsl 3]
        add     x5, x5, 1
        cmp     w3, w5
        bgt     .L2
        fmov    d0, d5
.L3:
        fadd    d1, d1, d2
        fadd    d0, d1, d0
        ret

GCC generates an fcsel instruction for
"dmax[0] = d1[i] < dmax[0] ? dmax[0] : d1[i];" and
"dmax[1] = d2[i] < dmax[1] ? dmax[1] : d2[i];", but not for
"dmax[2] = d3[i] < dmax[2] ? dmax[2] : d3[i];".

Pass_split_paths splits the corresponding bb, so pass_rtl_ifcvt fails to
generate the fcsel instruction. Moreover, I have found that pass_split_paths
already contains some checks to avoid spoiling if-conversion, but the
above case is not covered. I plan to add some checks in pass_split_paths to fix
this problem and have prepared the following patch:

diff -uprN a/gcc/gimple-ssa-split-paths.c b/gcc/gimple-ssa-split-paths.c
--- a/gcc/gimple-ssa-split-paths.c
+++ b/gcc/gimple-ssa-split-paths.c
@@ -34,6 +34,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "gimple-ssa.h"
 #include "tree-phinodes.h"
 #include "ssa-iterators.h"
+#include "cfghooks.h"

 /* Given LATCH, the latch block in a loop, see if the shape of the
    path reaching LATCH is suitable for being split by duplication.
@@ -254,6 +255,44 @@ is_feasible_trace (basic_block bb)
        }
     }

+  /* Canonicalize the form.  */
+  if (single_pred_p (pred1) && single_pred (pred1) == pred2
+      && empty_block_p (pred1))
+    std::swap (pred1, pred2);
+
+  /* This is meant to catch another kind of cases that are likely
opportunities
+     for if-conversion. After canonicalizing, PRED2 must be an empty block and
+     PRED1 must be the only predecessor of PRED2. Moreover, PRED1 is supposed
+     to end with a cond_stmt which has the same args with the PHI in BB. */
+  if (single_pred_p (pred2) && single_pred (pred2) == pred1
+      && empty_block_p (pred2))
+    {
+      gimple *cond_stmt = last_stmt (pred1);
+      if (cond_stmt && gimple_code (cond_stmt) == GIMPLE_COND)
+       {
+         tree lhs = gimple_cond_lhs (cond_stmt);
+         tree rhs = gimple_cond_rhs (cond_stmt);
+
+         gimple_stmt_iterator gsi;
+         for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+           {
+             gimple *phi = gsi_stmt (gsi);
+             if ((gimple_phi_arg_def (phi, 0) == lhs
+                  && gimple_phi_arg_def (phi, 1) == rhs)
+                 || (gimple_phi_arg_def (phi, 0) == rhs
+                     && gimple_phi_arg_def (phi, 1) == lhs))
+               {
+                 if (dump_file && (dump_flags & TDF_DETAILS))
+                   fprintf (dump_file,
+                            "Block %d appears to be optimized to a join "
+                            "point for if-convertable half-diamond.\n",
+                            bb->index);
+                 return false;
+               }
+           }
+       }
+    }
+
   /* If the joiner has no PHIs with useful uses there is zero chance
      of CSE/DCE/jump-threading possibilities exposed by duplicating it.  */
   bool found_useful_phi = false;

With this patch, gcc -O3 generates:

        cmp     w3, 0
        ble     .L2
        mov     x5, 0
        .p2align 3,,7
.L9:
        ldr     d5, [x0, x5, lsl 3]
        ldr     d4, [x1, x5, lsl 3]
        ldr     d3, [x2, x5, lsl 3]
        fcmpe   d0, d5
        fcsel   d0, d0, d5, gt
        fcmpe   d2, d4
        fcsel   d2, d2, d4, gt
        fcmpe   d1, d3
        fcsel   d1, d1, d3, gt
        str     d1, [x4, x5, lsl 3]
        add     x5, x5, 1
        cmp     w3, w5
        bgt     .L9
.L2:
        fadd    d0, d0, d2
        fadd    d0, d0, d1
        ret


More information about the Gcc-bugs mailing list