Improve byte-wise DSE (modref-dse-[45].c failures)

Richard Biener richard.guenther@gmail.com
Tue Nov 23 09:45:40 GMT 2021


On Tue, Nov 23, 2021 at 8:26 AM Jan Hubicka via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi,
> testcase modref-dse-4.c and modref-dse-5.c fails on some targets because they
> depend on store merging.  What really happens is that without store merging
> we produce for kill_me combined write that is ao_ref with offset=0, size=32
> and max_size=96.  We have size != max_size because we do not track the info that
> all 3 writes must happen in a group, and consider the case where only some of them are done.
>
> This disables byte-wise DSE which checks that size == max_size.  This is
> completely unnecessary for store being proved to be dead or load being checked
> to not read live bytes.  It is only necessary for kill store that is used to
> prove that given store is dead.
>
> While looking into this I also noticed that we check that everything is byte
> aligned.  This is also unnecessary and with access merging in modref may more
> commonly fire on accesses that we could otherwise handle.
>
> This patch fixes both and also changes the interface to normalize_ref, which I found
> confusing since it modifies the ref. Instead of that we have get_byte_range
> that is computing range in bytes (since that is what we need to maintain the
> bitmap) and has additional parameter specifying if the store in question should
> be turned into sub-range or super-range depending whether we compute range
> for kill or load.
>
> Bootstrapped/regtested x86_64-linux OK?

OK.

Thanks,
Richard.

> gcc/ChangeLog:
>
> 2021-11-23  Jan Hubicka  <hubicka@ucw.cz>
>
>         * tree-ssa-dse.c (valid_ao_ref_for_dse): Rename to ...
>         (valid_ao_ref_kill_for_dse): ... this; do not check that boundaries
>         are divisible by BITS_PER_UNIT.
>         (get_byte_aligned_range_containing_ref): New function.
>         (get_byte_aligned_range_contained_in_ref): New function.
>         (normalize_ref): Rename to ...
>         (get_byte_range): ... this one; handle accesses not aligned to byte
>         boundary; return range in bytes rather than updating ao_ref.
>         (clear_live_bytes_for_ref): Take write ref by reference; simplify using
>         get_byte_range.
>         (setup_live_bytes_from_ref): Likewise.
>         (clear_bytes_written_by): Update.
>         (live_bytes_read): Update.
>         (dse_classify_store): Simplify tests before live_bytes_read checks.
>
> gcc/testsuite/ChangeLog:
>
> 2021-11-23  Jan Hubicka  <hubicka@ucw.cz>
>
>         * gcc.dg/tree-ssa/modref-dse-4.c: Update template.
>         * gcc.dg/tree-ssa/modref-dse-5.c: Update template.
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/modref-dse-4.c b/gcc/testsuite/gcc.dg/tree-ssa/modref-dse-4.c
> index 81aa7dc587c..19e91b00f15 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/modref-dse-4.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/modref-dse-4.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2 -fdump-tree-dse2-details"  } */
> +/* { dg-options "-O2 -fdump-tree-dse1-details"  } */
>  struct a {int a,b,c;};
>  __attribute__ ((noinline))
>  void
> @@ -23,4 +23,4 @@ set (struct a *a)
>    my_pleasure (a);
>    a->b=1;
>  }
> -/* { dg-final { scan-tree-dump "Deleted dead store: kill_me" "dse2" } } */
> +/* { dg-final { scan-tree-dump "Deleted dead store: kill_me" "dse1" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/modref-dse-5.c b/gcc/testsuite/gcc.dg/tree-ssa/modref-dse-5.c
> index ad35b70136f..dc2c2892615 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/modref-dse-5.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/modref-dse-5.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2 -fdump-tree-dse2-details"  } */
> +/* { dg-options "-O2 -fdump-tree-dse1-details"  } */
>  struct a {int a,b,c;};
>  __attribute__ ((noinline))
>  void
> @@ -36,8 +36,7 @@ set (struct a *a)
>  {
>    wrap (0, a);
>    int ret = wrap2 (0, a);
> -  //int ret = my_pleasure (a);
>    a->b=1;
>    return ret;
>  }
> -/* { dg-final { scan-tree-dump "Deleted dead store: wrap" "dse2" } } */
> +/* { dg-final { scan-tree-dump "Deleted dead store: wrap" "dse1" } } */
> diff --git a/gcc/tree-ssa-dse.c b/gcc/tree-ssa-dse.c
> index 9531d892f76..8717d654e5a 100644
> --- a/gcc/tree-ssa-dse.c
> +++ b/gcc/tree-ssa-dse.c
> @@ -156,57 +156,137 @@ initialize_ao_ref_for_dse (gimple *stmt, ao_ref *write)
>  }
>
>  /* Given REF from the alias oracle, return TRUE if it is a valid
> -   memory reference for dead store elimination, false otherwise.
> +   kill memory reference for dead store elimination, false otherwise.
>
>     In particular, the reference must have a known base, known maximum
>     size, start at a byte offset and have a size that is one or more
>     bytes.  */
>
>  static bool
> -valid_ao_ref_for_dse (ao_ref *ref)
> +valid_ao_ref_kill_for_dse (ao_ref *ref)
>  {
>    return (ao_ref_base (ref)
>           && known_size_p (ref->max_size)
>           && maybe_ne (ref->size, 0)
>           && known_eq (ref->max_size, ref->size)
> -         && known_ge (ref->offset, 0)
> -         && multiple_p (ref->offset, BITS_PER_UNIT)
> -         && multiple_p (ref->size, BITS_PER_UNIT));
> +         && known_ge (ref->offset, 0));
> +}
> +
> +/* Given REF from the alias oracle, return TRUE if it is a valid
> +   load or store memory reference for dead store elimination, false otherwise.
> +
> +   Unlike for valid_ao_ref_kill_for_dse we can accept writes where max_size
> +   is not same as size since we can handle conservatively the larger range.  */
> +
> +static bool
> +valid_ao_ref_for_dse (ao_ref *ref)
> +{
> +  return (ao_ref_base (ref)
> +         && known_size_p (ref->max_size)
> +         && known_ge (ref->offset, 0));
> +}
> +
> +/* Initialize OFFSET and SIZE to a range known to contain REF
> +   where the boundaries are divisible by BITS_PER_UNIT (but still in bits).
> +   Return false if this is impossible.  */
> +
> +static bool
> +get_byte_aligned_range_containing_ref (ao_ref *ref, poly_int64 *offset,
> +                                      HOST_WIDE_INT *size)
> +{
> +  if (!known_size_p (ref->max_size))
> +    return false;
> +  *offset = aligned_lower_bound (ref->offset, BITS_PER_UNIT);
> +  poly_int64 end = aligned_upper_bound (ref->offset + ref->max_size,
> +                                       BITS_PER_UNIT);
> +  return (end - *offset).is_constant (size);
> +}
> +
> +/* Initialize OFFSET and SIZE to a range known to be contained in REF
> +   where the boundaries are divisible by BITS_PER_UNIT (but still in bits).
> +   Return false if this is impossible.  */
> +
> +static bool
> +get_byte_aligned_range_contained_in_ref (ao_ref *ref, poly_int64 *offset,
> +                                        HOST_WIDE_INT *size)
> +{
> +  if (!known_size_p (ref->size)
> +      || !known_eq (ref->size, ref->max_size))
> +    return false;
> +  *offset = aligned_upper_bound (ref->offset, BITS_PER_UNIT);
> +  poly_int64 end = aligned_lower_bound (ref->offset + ref->max_size,
> +                                       BITS_PER_UNIT);
> +  /* For bit accesses we can get -1 here, but also 0 sized kill is not
> +     useful.  */
> +  if (!known_gt (end, *offset))
> +    return false;
> +  return (end - *offset).is_constant (size);
>  }
>
> -/* Try to normalize COPY (an ao_ref) relative to REF.  Essentially when we are
> -   done COPY will only refer bytes found within REF.  Return true if COPY
> -   is known to intersect at least one byte of REF.  */
> +/* Compute byte range (returned in RET_OFFSET and RET_SIZE) for access COPY
> +   inside REF.  If KILL is true, then COPY represents a kill and the byte range
> +   needs to be fully contained in bit range given by COPY.  If KILL is false
> +   then the byte range returned must contain the range of COPY.  */
>
>  static bool
> -normalize_ref (ao_ref *copy, ao_ref *ref)
> +get_byte_range (ao_ref *copy, ao_ref *ref, bool kill,
> +               HOST_WIDE_INT *ret_offset, HOST_WIDE_INT *ret_size)
>  {
> -  if (!ordered_p (copy->offset, ref->offset))
> +  HOST_WIDE_INT copy_size, ref_size;
> +  poly_int64 copy_offset, ref_offset;
> +  HOST_WIDE_INT diff;
> +
> +  /* First translate from bits to bytes, rounding to bigger or smaller ranges
> +     as needed.  Kills need to always be rounded to smaller ranges while
> +     uses and stores to larger ranges.  */
> +  if (kill)
> +    {
> +      if (!get_byte_aligned_range_contained_in_ref (copy, &copy_offset,
> +                                                   &copy_size))
> +       return false;
> +    }
> +  else
> +    {
> +      if (!get_byte_aligned_range_containing_ref (copy, &copy_offset,
> +                                                 &copy_size))
> +       return false;
> +    }
> +
> +  if (!get_byte_aligned_range_containing_ref (ref, &ref_offset, &ref_size)
> +      || !ordered_p (copy_offset, ref_offset))
>      return false;
>
> +  /* Switch sizes from bits to bytes so we do not need to care about
> +     overflows.  Offset calculation needs to stay in bits until we compute
> +     the difference and can switch to HOST_WIDE_INT.  */
> +  copy_size /= BITS_PER_UNIT;
> +  ref_size /= BITS_PER_UNIT;
> +
>    /* If COPY starts before REF, then reset the beginning of
>       COPY to match REF and decrease the size of COPY by the
>       number of bytes removed from COPY.  */
> -  if (maybe_lt (copy->offset, ref->offset))
> +  if (maybe_lt (copy_offset, ref_offset))
>      {
> -      poly_int64 diff = ref->offset - copy->offset;
> -      if (maybe_le (copy->size, diff))
> +      if (!(ref_offset - copy_offset).is_constant (&diff)
> +         || copy_size < diff / BITS_PER_UNIT)
>         return false;
> -      copy->size -= diff;
> -      copy->offset = ref->offset;
> +      copy_size -= diff / BITS_PER_UNIT;
> +      copy_offset = ref_offset;
>      }
>
> -  poly_int64 diff = copy->offset - ref->offset;
> -  if (maybe_le (ref->size, diff))
> +  if (!(copy_offset - ref_offset).is_constant (&diff)
> +      || ref_size <= diff / BITS_PER_UNIT)
>      return false;
>
>    /* If COPY extends beyond REF, chop off its size appropriately.  */
> -  poly_int64 limit = ref->size - diff;
> -  if (!ordered_p (limit, copy->size))
> -    return false;
> +  HOST_WIDE_INT limit = ref_size - diff / BITS_PER_UNIT;
>
> -  if (maybe_gt (copy->size, limit))
> -    copy->size = limit;
> +  if (copy_size > limit)
> +    copy_size = limit;
> +  *ret_size = copy_size;
> +  if (!(copy_offset - ref_offset).is_constant (ret_offset))
> +    return false;
> +  *ret_offset /= BITS_PER_UNIT;
>    return true;
>  }
>
> @@ -214,20 +294,14 @@ normalize_ref (ao_ref *copy, ao_ref *ref)
>     Verify we have the same base memory address, the write
>     has a known size and overlaps with REF.  */
>  static void
> -clear_live_bytes_for_ref (sbitmap live_bytes, ao_ref *ref, ao_ref write)
> +clear_live_bytes_for_ref (sbitmap live_bytes, ao_ref *ref, ao_ref *write)
>  {
>    HOST_WIDE_INT start, size;
>
> -  if (valid_ao_ref_for_dse (&write)
> -      && operand_equal_p (write.base, ref->base, OEP_ADDRESS_OF)
> -      && known_eq (write.size, write.max_size)
> -      /* normalize_ref modifies write and for that reason write is not
> -        passed by reference.  */
> -      && normalize_ref (&write, ref)
> -      && (write.offset - ref->offset).is_constant (&start)
> -      && write.size.is_constant (&size))
> -    bitmap_clear_range (live_bytes, start / BITS_PER_UNIT,
> -                       size / BITS_PER_UNIT);
> +  if (valid_ao_ref_kill_for_dse (write)
> +      && operand_equal_p (write->base, ref->base, OEP_ADDRESS_OF)
> +      && get_byte_range (write, ref, true, &start, &size))
> +    bitmap_clear_range (live_bytes, start, size);
>  }
>
>  /* Clear any bytes written by STMT from the bitmap LIVE_BYTES.  The base
> @@ -250,12 +324,12 @@ clear_bytes_written_by (sbitmap live_bytes, gimple *stmt, ao_ref *ref)
>        if (summary && !interposed)
>         for (auto kill : summary->kills)
>           if (kill.get_ao_ref (as_a <gcall *> (stmt), &write))
> -           clear_live_bytes_for_ref (live_bytes, ref, write);
> +           clear_live_bytes_for_ref (live_bytes, ref, &write);
>      }
>    if (!initialize_ao_ref_for_dse (stmt, &write))
>      return;
>
> -  clear_live_bytes_for_ref (live_bytes, ref, write);
> +  clear_live_bytes_for_ref (live_bytes, ref, &write);
>  }
>
>  /* REF is a memory write.  Extract relevant information from it and
> @@ -267,9 +341,11 @@ setup_live_bytes_from_ref (ao_ref *ref, sbitmap live_bytes)
>  {
>    HOST_WIDE_INT const_size;
>    if (valid_ao_ref_for_dse (ref)
> -      && ref->size.is_constant (&const_size)
> -      && (const_size / BITS_PER_UNIT
> -         <= param_dse_max_object_size))
> +      && ((aligned_upper_bound (ref->offset + ref->max_size, BITS_PER_UNIT)
> +          - aligned_lower_bound (ref->offset,
> +                                 BITS_PER_UNIT)).is_constant (&const_size))
> +      && (const_size / BITS_PER_UNIT <= param_dse_max_object_size)
> +      && const_size > 1)
>      {
>        bitmap_clear (live_bytes);
>        bitmap_set_range (live_bytes, 0, const_size / BITS_PER_UNIT);
> @@ -645,24 +721,21 @@ maybe_trim_partially_dead_store (ao_ref *ref, sbitmap live, gimple *stmt)
>     location.  So callers do not see those modifications.  */
>
>  static bool
> -live_bytes_read (ao_ref use_ref, ao_ref *ref, sbitmap live)
> +live_bytes_read (ao_ref *use_ref, ao_ref *ref, sbitmap live)
>  {
>    /* We have already verified that USE_REF and REF hit the same object.
>       Now verify that there's actually an overlap between USE_REF and REF.  */
>    HOST_WIDE_INT start, size;
> -  if (normalize_ref (&use_ref, ref)
> -      && (use_ref.offset - ref->offset).is_constant (&start)
> -      && use_ref.size.is_constant (&size))
> +  if (get_byte_range (use_ref, ref, false, &start, &size))
>      {
>        /* If USE_REF covers all of REF, then it will hit one or more
>          live bytes.   This avoids useless iteration over the bitmap
>          below.  */
> -      if (start == 0 && known_eq (size, ref->size))
> +      if (start == 0 && known_eq (size * 8, ref->size))
>         return true;
>
>        /* Now check if any of the remaining bits in use_ref are set in LIVE.  */
> -      return bitmap_bit_in_range_p (live, start / BITS_PER_UNIT,
> -                                   (start + size - 1) / BITS_PER_UNIT);
> +      return bitmap_bit_in_range_p (live, start, (start + size - 1));
>      }
>    return true;
>  }
> @@ -861,16 +934,18 @@ dse_classify_store (ao_ref *ref, gimple *stmt,
>             {
>               /* Handle common cases where we can easily build an ao_ref
>                  structure for USE_STMT and in doing so we find that the
> -                references hit non-live bytes and thus can be ignored.  */
> +                references hit non-live bytes and thus can be ignored.
> +
> +                TODO: We can also use modref summary to handle calls.  */
>               if (byte_tracking_enabled
>                   && is_gimple_assign (use_stmt))
>                 {
>                   ao_ref use_ref;
>                   ao_ref_init (&use_ref, gimple_assign_rhs1 (use_stmt));
>                   if (valid_ao_ref_for_dse (&use_ref)
> -                     && use_ref.base == ref->base
> -                     && known_eq (use_ref.size, use_ref.max_size)
> -                     && !live_bytes_read (use_ref, ref, live_bytes))
> +                     && operand_equal_p (use_ref.base, ref->base,
> +                                         OEP_ADDRESS_OF)
> +                     && !live_bytes_read (&use_ref, ref, live_bytes))
>                     {
>                       /* If this is a store, remember it as we possibly
>                          need to walk the defs uses.  */


More information about the Gcc-patches mailing list