[Bug fortran/90539] [10 Regression] 481.wrf slowdown by 25% on Intel Kaby with -Ofast -march=native starting with r271377

marxin at gcc dot gnu.org gcc-bugzilla@gcc.gnu.org
Wed May 22 10:34:00 GMT 2019


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90539

--- Comment #15 from Martin Liška <marxin at gcc dot gnu.org> ---
The resulting difference in the original dump file is:

BEFORE:

            D.20757 = _gfortran_internal_pack (&parm.2491);
            __result_nf90_put_var_1d_eigh = nf_put_vara_double
((integer(kind=4) *) ncid, (integer(kind=4) *) varid, &localstart, &localcount,
D.20757);
            if ((real(kind=8)[0:] *) parm.2491.data != (real(kind=8)[0:] *)
D.20757)
              {
                _gfortran_internal_unpack (&parm.2491, D.20757);
                __builtin_free (D.20757);
              }

AFTER:

            D.20757 = offset.2468;
            D.20758 = ubound.2466;
            D.20759 = D.20758 + -1;
                        typedef real(kind=8) [0:];
            atmp.2492.dtype = {.elem_len=8, .rank=1, .type=3};
            atmp.2492.dim[0].stride = 1;
            atmp.2492.dim[0].lbound = 0;
            atmp.2492.dim[0].ubound = D.20759;
            D.20767 = D.20759 < 0;
            D.20768 = D.20759 + 1;
            atmp.2492.span = 8;
            D.20769 = (void * restrict) __builtin_malloc (D.20767 ? 1 :
MAX_EXPR <(unsigned long) (D.20768 * 8), 1>);
            D.20770 = D.20769;
            atmp.2492.data = D.20770;
            atmp.2492.offset = 0;
            {
              integer(kind=8) S.2493;
              integer(kind=8) D.20772;

              D.20772 = stride.2467;
              S.2493 = 0;
              while (1)
                {
                  if (S.2493 > D.20759) goto L.778;
                  (*(real(kind=8)[0:] * restrict) atmp.2492.data)[S.2493] =
(*values.0)[(S.2493 + 1) * D.20772 + D.20757];
                  S.2493 = S.2493 + 1;
                }
              L.778:;
            }
            __result_nf90_put_var_1d_eigh = nf_put_vara_double
((integer(kind=4) *) ncid, (integer(kind=4) *) varid, &localstart, &localcount,
(real(kind=8)[0:] * restrict) atmp.2492.data);
            D.20774 = offset.2468;
            D.20775 = ubound.2466;
            {
              integer(kind=8) S.2494;
              integer(kind=8) D.20778;

              D.20778 = stride.2467;
              D.20776 = -1;
              S.2494 = 1;
              while (1)
                {
                  if (S.2494 > D.20775) goto L.779;
                  (*values.0)[S.2494 * D.20778 + D.20774] = (*(real(kind=8)[0:]
* restrict) atmp.2492.data)[S.2494 + D.20776];
                  S.2494 = S.2494 + 1;
                }
              L.779:;
            }
            __builtin_free ((void *) atmp.2492.data);

@Thomas: Can you please provide another hint as to what to do now?


More information about the Gcc-bugs mailing list