This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[Bug middle-end/81502] In some cases the data is moved to memory unnecessarily [partial regression]


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81502

--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
Note that with -mtune=intel we already get

_Z3barPv:
.LFB526:
        .cfi_startproc
        movq    %rdi, %xmm0
        movd    %xmm0, %eax
        ret

but yes, the intermediate temporary is unnecessary.  We do not optimize
this to a BIT_INSERT_EXPR because the vector element type doesn't match the
insertion quantity.  We can relax that a bit with the following:

Index: gcc/tree-ssa.c
===================================================================
--- gcc/tree-ssa.c      (revision 250386)
+++ gcc/tree-ssa.c      (working copy)
@@ -1513,8 +1513,8 @@ non_rewritable_lvalue_p (tree lhs)
       if (DECL_P (decl)
          && VECTOR_TYPE_P (TREE_TYPE (decl))
          && TYPE_MODE (TREE_TYPE (decl)) != BLKmode
-         && types_compatible_p (TREE_TYPE (lhs),
-                                TREE_TYPE (TREE_TYPE (decl)))
+         && operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (lhs)),
+                             TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (decl))), 0)
          && tree_fits_uhwi_p (TREE_OPERAND (lhs, 1))
          && tree_int_cst_lt (TREE_OPERAND (lhs, 1),
                              TYPE_SIZE_UNIT (TREE_TYPE (decl)))
@@ -1839,8 +1839,9 @@ execute_update_addresses_taken (void)
                    && bitmap_bit_p (suitable_for_renaming, DECL_UID (sym))
                    && VECTOR_TYPE_P (TREE_TYPE (sym))
                    && TYPE_MODE (TREE_TYPE (sym)) != BLKmode
-                   && types_compatible_p (TREE_TYPE (lhs),
-                                          TREE_TYPE (TREE_TYPE (sym)))
+                   && operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (lhs)),
+                                       TYPE_SIZE_UNIT
+                                         (TREE_TYPE (TREE_TYPE (sym))), 0)
                    && tree_fits_uhwi_p (TREE_OPERAND (lhs, 1))
                    && tree_int_cst_lt (TREE_OPERAND (lhs, 1),
                                        TYPE_SIZE_UNIT (TREE_TYPE (sym)))
@@ -1848,6 +1849,18 @@ execute_update_addresses_taken (void)
                        % tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (lhs)))) == 0)
                  {
                    tree val = gimple_assign_rhs1 (stmt);
+                   if (! types_compatible_p (TREE_TYPE (lhs),
+                                             TREE_TYPE (TREE_TYPE (sym))))
+                     {
+                       tree tem = make_ssa_name (TREE_TYPE (TREE_TYPE (sym)));
+                       gimple *pun
+                         = gimple_build_assign (tem,
+                                                build1 (VIEW_CONVERT_EXPR,
+                                                        TREE_TYPE (TREE_TYPE
+                                                                 (sym)), val));
+                       gsi_insert_before (&gsi, pun, GSI_SAME_STMT);
+                       val = tem;
+                     }
                    tree bitpos
                      = wide_int_to_tree (bitsizetype,
                                          mem_ref_offset (lhs) * BITS_PER_UNIT);

this gets us to

int bar(void*) (void * ptr)
{
  int res;
  __m128i word;
  long long int _2;
  unsigned int _4;

  <bb 2> [100.00%] [count: INV]:
  _2 = (long long int) ptr_6(D);
  word_3 = BIT_INSERT_EXPR <{ 0, 0 }, _2, 0 (64 bits)>;
  _4 = BIT_FIELD_REF <word_3, 32, 0>;
  res_5 = (int) _4;
  return res_5;

in .optimized which shows (already known) missed foldings for bit-field-ref
of bit-insert.  That's a complicated one btw, extracting a component from
a vector insert.

Oh, and it misses bit-insert -> CONSTRUCTOR, thus

word_3 = { _2, 0 };

(simplify
 (bit_insert VECTOR_CST@0 @1 @2)
 {
   vec<constructor_elt, va_gc> *v;
   vec_alloc (v, TYPE_VECTOR_SUBPARTS (type));
   for (unsigned i = 0; i < VECTOR_CST_NELTS (@0); ++i)
     {
       constructor_elt elt = { NULL_TREE, VECTOR_CST_ELT (@0, i) };
       v->quick_push (elt);
     }
   (*v)[TREE_INT_CST_LOW (@2) 
        / TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (type)))].value = @1;
   build_constructor (type, v);
 })

that gets us to

  <bb 2> [100.00%] [count: INV]:
  _2 = (long long int) ptr_6(D);
  word_3 = {_2, 0};
  _4 = BIT_FIELD_REF <word_3, 32, 0>;
  res_5 = (int) _4;
  return res_5;

where we still need that BIT_FIELD_REF simplification.  The IL is already
in this form when we run into FRE1 so handling it there should be
possible in principle.  Or we can fold

  word_3 = {_2, 0};
  _4 = BIT_FIELD_REF <word_3, 32, 0>;

to

  _4 = BIT_FIELD_REF <_2, 32, 0 [+adjustment]>;

thus a BIT_FIELD_REF on a CONSTRUCTOR to one on the element.

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]