Fix VIEW_CONVERT_EXPRs for VECTOR_BOOLEAN_TYPE_Ps

Richard Biener <richard.guenther@gmail.com>
Wed Dec 4 12:48:00 GMT 2019


On Wed, Dec 4, 2019 at 11:48 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> In r278410 I added code to handle VIEW_CONVERT_EXPRs between
> variable-length vectors.  This included support for decoding
> a VECTOR_BOOLEAN_TYPE_P with subbyte elements.
>
> However, it turns out that we were already mishandling such bool vectors
> for fixed-length vectors: we treated each element as a stand-alone byte
> instead of putting multiple elements into the same byte.  I think in
> principle this could have been an issue for AVX512 as well.
>
> This patch adds encoding support for boolean vectors and reuses
> a version of the new decode support for fixed-length vectors.
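>
> The decode direction is the mirror image; again a sketch with a
> hypothetical helper, assuming 1-bit elements:
>
>   /* Unpack COUNT single-bit elements from BYTES, reading bit i
>      (lsb-first within each byte) as element i; in the real code a set
>      bit becomes build_all_ones_cst and a clear bit build_zero_cst.  */
>   static void
>   unpack_bool_elts (const unsigned char *bytes, unsigned int count,
>                     unsigned char *elts)
>   {
>     for (unsigned int i = 0; i < count; ++i)
>       elts[i] = (bytes[i / 8] >> (i % 8)) & 1;
>   }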
>
> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

OK.

Thanks,
Richard.

> Richard
>
>
> 2019-12-04  Richard Sandiford  <richard.sandiford@arm.com>
>
> gcc/
>         * fold-const.c (native_encode_vector_part): Handle
>         VECTOR_BOOLEAN_TYPE_Ps that have subbyte precision.
>         (native_decode_vector_tree): Delete, moving the bulk of the code to...
>         (native_interpret_vector_part): ...this new function.  Use a pointer
>         and length instead of a vec<> and start index.
>         (native_interpret_vector): Use native_interpret_vector_part.
>         (fold_view_convert_vector_encoding): Likewise.
>
> gcc/testsuite/
>         * gcc.target/aarch64/sve/acle/general/whilelt_5.c: New test.
>
> Index: gcc/fold-const.c
> ===================================================================
> --- gcc/fold-const.c    2019-12-02 17:51:02.287225873 +0000
> +++ gcc/fold-const.c    2019-12-04 10:46:30.201176596 +0000
> @@ -7727,21 +7727,53 @@ native_encode_complex (const_tree expr,
>  native_encode_vector_part (const_tree expr, unsigned char *ptr, int len,
>                            int off, unsigned HOST_WIDE_INT count)
>  {
> -  unsigned HOST_WIDE_INT i;
> -  int size, offset;
> -  tree itype, elem;
> -
> -  offset = 0;
> -  itype = TREE_TYPE (TREE_TYPE (expr));
> -  size = GET_MODE_SIZE (SCALAR_TYPE_MODE (itype));
> -  for (i = 0; i < count; i++)
> +  tree itype = TREE_TYPE (TREE_TYPE (expr));
> +  if (VECTOR_BOOLEAN_TYPE_P (TREE_TYPE (expr))
> +      && TYPE_PRECISION (itype) <= BITS_PER_UNIT)
> +    {
> +      /* This is the only case in which elements can be smaller than a byte.
> +        Element 0 is always in the lsb of the containing byte.  */
> +      unsigned int elt_bits = TYPE_PRECISION (itype);
> +      int total_bytes = CEIL (elt_bits * count, BITS_PER_UNIT);
> +      if ((off == -1 && total_bytes > len) || off >= total_bytes)
> +       return 0;
> +
> +      if (off == -1)
> +       off = 0;
> +
> +      /* Zero the buffer and then set bits later where necessary.  */
> +      int extract_bytes = MIN (len, total_bytes - off);
> +      if (ptr)
> +       memset (ptr, 0, extract_bytes);
> +
> +      unsigned int elts_per_byte = BITS_PER_UNIT / elt_bits;
> +      unsigned int first_elt = off * elts_per_byte;
> +      unsigned int extract_elts = extract_bytes * elts_per_byte;
> +      for (unsigned int i = 0; i < extract_elts; ++i)
> +       {
> +         tree elt = VECTOR_CST_ELT (expr, first_elt + i);
> +         if (TREE_CODE (elt) != INTEGER_CST)
> +           return 0;
> +
> +         if (ptr && wi::extract_uhwi (wi::to_wide (elt), 0, 1))
> +           {
> +             unsigned int bit = i * elt_bits;
> +             ptr[bit / BITS_PER_UNIT] |= 1 << (bit % BITS_PER_UNIT);
> +           }
> +       }
> +      return extract_bytes;
> +    }
> +
> +  int offset = 0;
> +  int size = GET_MODE_SIZE (SCALAR_TYPE_MODE (itype));
> +  for (unsigned HOST_WIDE_INT i = 0; i < count; i++)
>      {
>        if (off >= size)
>         {
>           off -= size;
>           continue;
>         }
> -      elem = VECTOR_CST_ELT (expr, i);
> +      tree elem = VECTOR_CST_ELT (expr, i);
>        int res = native_encode_expr (elem, ptr ? ptr + offset : NULL,
>                                     len - offset, off);
>        if ((off == -1 && res != size) || res == 0)
> @@ -7976,6 +8008,55 @@ native_interpret_complex (tree type, con
>    return build_complex (type, rpart, ipart);
>  }
>
> +/* Read a vector of type TYPE from the target memory image given by BYTES,
> +   which contains LEN bytes.  The vector is known to be encodable using
> +   NPATTERNS interleaved patterns with NELTS_PER_PATTERN elements each.
> +
> +   Return the vector on success, otherwise return null.  */
> +
> +static tree
> +native_interpret_vector_part (tree type, const unsigned char *bytes,
> +                             unsigned int len, unsigned int npatterns,
> +                             unsigned int nelts_per_pattern)
> +{
> +  tree elt_type = TREE_TYPE (type);
> +  if (VECTOR_BOOLEAN_TYPE_P (type)
> +      && TYPE_PRECISION (elt_type) <= BITS_PER_UNIT)
> +    {
> +      /* This is the only case in which elements can be smaller than a byte.
> +        Element 0 is always in the lsb of the containing byte.  */
> +      unsigned int elt_bits = TYPE_PRECISION (elt_type);
> +      if (elt_bits * npatterns * nelts_per_pattern > len * BITS_PER_UNIT)
> +       return NULL_TREE;
> +
> +      tree_vector_builder builder (type, npatterns, nelts_per_pattern);
> +      for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
> +       {
> +         unsigned int bit_index = i * elt_bits;
> +         unsigned int byte_index = bit_index / BITS_PER_UNIT;
> +         unsigned int lsb = bit_index % BITS_PER_UNIT;
> +         builder.quick_push (bytes[byte_index] & (1 << lsb)
> +                             ? build_all_ones_cst (elt_type)
> +                             : build_zero_cst (elt_type));
> +       }
> +      return builder.build ();
> +    }
> +
> +  unsigned int elt_bytes = tree_to_uhwi (TYPE_SIZE_UNIT (elt_type));
> +  if (elt_bytes * npatterns * nelts_per_pattern > len)
> +    return NULL_TREE;
> +
> +  tree_vector_builder builder (type, npatterns, nelts_per_pattern);
> +  for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
> +    {
> +      tree elt = native_interpret_expr (elt_type, bytes, elt_bytes);
> +      if (!elt)
> +       return NULL_TREE;
> +      builder.quick_push (elt);
> +      bytes += elt_bytes;
> +    }
> +  return builder.build ();
> +}
>
>  /* Subroutine of native_interpret_expr.  Interpret the contents of
>     the buffer PTR of length LEN as a VECTOR_CST of type TYPE.
> @@ -7984,8 +8065,8 @@ native_interpret_complex (tree type, con
>  static tree
>  native_interpret_vector (tree type, const unsigned char *ptr, unsigned int len)
>  {
> -  tree etype, elem;
> -  unsigned int i, size;
> +  tree etype;
> +  unsigned int size;
>    unsigned HOST_WIDE_INT count;
>
>    etype = TREE_TYPE (type);
> @@ -7994,15 +8075,7 @@ native_interpret_vector (tree type, cons
>        || size * count > len)
>      return NULL_TREE;
>
> -  tree_vector_builder elements (type, count, 1);
> -  for (i = 0; i < count; ++i)
> -    {
> -      elem = native_interpret_expr (etype, ptr+(i*size), size);
> -      if (!elem)
> -       return NULL_TREE;
> -      elements.quick_push (elem);
> -    }
> -  return elements.build ();
> +  return native_interpret_vector_part (type, ptr, len, count, 1);
>  }
>
>
> @@ -8064,54 +8137,6 @@ can_native_interpret_type_p (tree type)
>      }
>  }
>
> -/* Read a vector of type TYPE from the target memory image given by BYTES,
> -   starting at byte FIRST_BYTE.  The vector is known to be encodable using
> -   NPATTERNS interleaved patterns with NELTS_PER_PATTERN elements each,
> -   and BYTES is known to have enough bytes to supply NPATTERNS *
> -   NELTS_PER_PATTERN vector elements.  Each element of BYTES contains
> -   BITS_PER_UNIT bits and the bytes are in target memory order.
> -
> -   Return the vector on success, otherwise return null.  */
> -
> -static tree
> -native_decode_vector_tree (tree type, vec<unsigned char> bytes,
> -                          unsigned int first_byte, unsigned int npatterns,
> -                          unsigned int nelts_per_pattern)
> -{
> -  tree_vector_builder builder (type, npatterns, nelts_per_pattern);
> -  tree elt_type = TREE_TYPE (type);
> -  unsigned int elt_bits = tree_to_uhwi (TYPE_SIZE (elt_type));
> -  if (VECTOR_BOOLEAN_TYPE_P (type) && elt_bits <= BITS_PER_UNIT)
> -    {
> -      /* This is the only case in which elements can be smaller than a byte.
> -        Element 0 is always in the lsb of the containing byte.  */
> -      elt_bits = TYPE_PRECISION (elt_type);
> -      for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
> -       {
> -         unsigned int bit_index = first_byte * BITS_PER_UNIT + i * elt_bits;
> -         unsigned int byte_index = bit_index / BITS_PER_UNIT;
> -         unsigned int lsb = bit_index % BITS_PER_UNIT;
> -         builder.quick_push (bytes[byte_index] & (1 << lsb)
> -                             ? build_all_ones_cst (elt_type)
> -                             : build_zero_cst (elt_type));
> -       }
> -    }
> -  else
> -    {
> -      unsigned int elt_bytes = elt_bits / BITS_PER_UNIT;
> -      for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
> -       {
> -         tree elt = native_interpret_expr (elt_type, &bytes[first_byte],
> -                                           elt_bytes);
> -         if (!elt)
> -           return NULL_TREE;
> -         builder.quick_push (elt);
> -         first_byte += elt_bytes;
> -       }
> -    }
> -  return builder.build ();
> -}
> -
>  /* Try to view-convert VECTOR_CST EXPR to VECTOR_TYPE TYPE by operating
>     directly on the VECTOR_CST encoding, in a way that works for variable-
>     length vectors.  Return the resulting VECTOR_CST on success or null
> @@ -8168,8 +8193,8 @@ fold_view_convert_vector_encoding (tree
>
>    /* Reencode the bytes as TYPE.  */
>    unsigned int type_npatterns = type_sequence_bits / type_elt_bits;
> -  return native_decode_vector_tree (type, buffer, 0, type_npatterns,
> -                                   nelts_per_pattern);
> +  return native_interpret_vector_part (type, &buffer[0], buffer.length (),
> +                                      type_npatterns, nelts_per_pattern);
>  }
>
>  /* Fold a VIEW_CONVERT_EXPR of a constant expression EXPR to type
> Index: gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_5.c
> ===================================================================
> --- /dev/null   2019-09-17 11:41:18.176664108 +0100
> +++ gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_5.c       2019-12-04 10:46:30.213176516 +0000
> @@ -0,0 +1,163 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target lp64 } */
> +/* { dg-additional-options "-O -msve-vector-bits=512 -fdump-tree-optimized" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include <arm_sve.h>
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/*
> +** load_vl1:
> +**     ptrue   (p[0-7])\.[bhsd], vl1
> +**     ld1h    z0\.h, \1/z, \[x0\]
> +**     ret
> +*/
> +svint16_t
> +load_vl1 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 1), ptr);
> +}
> +
> +/*
> +** load_vl2:
> +**     ptrue   (p[0-7])\.h, vl2
> +**     ld1h    z0\.h, \1/z, \[x0\]
> +**     ret
> +*/
> +svint16_t
> +load_vl2 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 2), ptr);
> +}
> +
> +/*
> +** load_vl3:
> +**     ptrue   (p[0-7])\.h, vl3
> +**     ld1h    z0\.h, \1/z, \[x0\]
> +**     ret
> +*/
> +svint16_t
> +load_vl3 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 3), ptr);
> +}
> +
> +/*
> +** load_vl4:
> +**     ptrue   (p[0-7])\.h, vl4
> +**     ld1h    z0\.h, \1/z, \[x0\]
> +**     ret
> +*/
> +svint16_t
> +load_vl4 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 4), ptr);
> +}
> +
> +/*
> +** load_vl5:
> +**     ptrue   (p[0-7])\.h, vl5
> +**     ld1h    z0\.h, \1/z, \[x0\]
> +**     ret
> +*/
> +svint16_t
> +load_vl5 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 5), ptr);
> +}
> +
> +/*
> +** load_vl6:
> +**     ptrue   (p[0-7])\.h, vl6
> +**     ld1h    z0\.h, \1/z, \[x0\]
> +**     ret
> +*/
> +svint16_t
> +load_vl6 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 6), ptr);
> +}
> +
> +/*
> +** load_vl7:
> +**     ptrue   (p[0-7])\.h, vl7
> +**     ld1h    z0\.h, \1/z, \[x0\]
> +**     ret
> +*/
> +svint16_t
> +load_vl7 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 7), ptr);
> +}
> +
> +/*
> +** load_vl8:
> +**     ptrue   (p[0-7])\.h, vl8
> +**     ld1h    z0\.h, \1/z, \[x0\]
> +**     ret
> +*/
> +svint16_t
> +load_vl8 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 8), ptr);
> +}
> +
> +/*
> +** load_vl9:
> +**     mov     (x[0-9]+), #?9
> +**     whilelo (p[0-7])\.h, xzr, \1
> +**     ld1h    z0\.h, \2/z, \[x0\]
> +**     ret
> +*/
> +svint16_t
> +load_vl9 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 9), ptr);
> +}
> +
> +/*
> +** load_vl15:
> +**     mov     (x[0-9]+), #?15
> +**     whilelo (p[0-7])\.h, xzr, \1
> +**     ld1h    z0\.h, \2/z, \[x0\]
> +**     ret
> +*/
> +svint16_t
> +load_vl15 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 15), ptr);
> +}
> +
> +/*
> +** load_vl16:
> +**     ptrue   (p[0-7])\.h, vl16
> +**     ld1h    z0\.h, \1/z, \[x0\]
> +**     ret
> +*/
> +svint16_t
> +load_vl16 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 16), ptr);
> +}
> +
> +/*
> +** load_vl17:
> +**     mov     (x[0-9]+), #?17
> +**     whilelo (p[0-7])\.h, xzr, \1
> +**     ld1h    z0\.h, \2/z, \[x0\]
> +**     ret
> +*/
> +svint16_t
> +load_vl17 (int16_t *ptr)
> +{
> +  return svld1 (svwhilelt_b16 (0, 17), ptr);
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +/* { dg-final { scan-tree-dump-not "VIEW_CONVERT_EXPR" "optimized" } } */
