This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

[Bug rtl-optimization/23322] [4.1 regression] performance regression, possibly related to caching

From: "rguenth at gcc dot gnu dot org" <gcc-bugzilla at gcc dot gnu dot org>
To: gcc-bugs at gcc dot gnu dot org
Date: 11 Aug 2005 10:30:28 -0000
Subject: [Bug rtl-optimization/23322] [4.1 regression] performance regression, possibly related to caching
References: <20050811005656.23322.danalis@cis.udel.edu>
Reply-to: gcc-bugzilla at gcc dot gnu dot org

------- Additional Comments From rguenth at gcc dot gnu dot org  2005-08-11 10:30 -------
I cannot confirm your observations.  Instead, with -O2 the timings are about
the same for 4.0.2 (20050728) and 4.1.0 (20050803), while with -O3 the code
from the 4.0.2 compiler seems to be about 2x faster, even though the tree
optimizers do a better job in the 4.1 case.

One difference is:

(4.1)
.L37:
        movl    $0, (%eax)
        movl    $1074266112, 4(%eax)
        addl    $8, %eax
        cmpl    %eax, %edx
        jne     .L37

vs.

(4.0)
        fldl    init_value
.L16:
        fstl    (%eax)
        addl    $8, %eax
        cmpl    %eax, %ecx
        jne     .L16

(known bug, I think - Andrew will know the PR)

The other one is

(4.0)
.L8:
        fldz
        xorl    %eax, %eax
        fstl    -16(%ebp)
        .p2align 4,,15
.L11:
        faddl   (%ebx,%eax,8)
        incl    %eax
        cmpl    %edx, %eax
        fstl    -16(%ebp)
        jne     .L11
        fstp    %st(0)
        jmp     .L10

vs.

(4.1)
        fldz
        xorl    %eax, %eax
        fstpl   -16(%ebp)
        jmp     .L31
        .p2align 4,,7
.L43:
        fstp    %st(0)
.L31:
        fldl    -16(%ebp)
        faddl   (%ebx,%eax,8)
        incl    %eax
        cmpl    %edx, %eax
        fstl    -16(%ebp)
        jne     .L43

which certainly explains the big difference.  This is just

<L37>:;
  result = 0.0;
  n = 0;

<L8>:;
  result = MEM[base: first, index: (double *) n, step: 8B] + result;
  n = n + 1;
  if (n != D.34008) goto <L8>; else goto <L34>;

by the way, or, written as source code,

static double test0(double* first, double* last) {
    double result = 0;
    for (int n = 0; n < last - first; ++n) result += first[n];
    return result;
}

Note that when this function is compiled stand-alone, both compilers produce
identical (good) assembly:

        fldz
        xorl    %eax, %eax
        .p2align 4,,15
.L5:
        faddl   (%ecx,%eax,8)
        incl    %eax
        cmpl    %edx, %eax
        jne     .L5

so it looks to me that RTL optimization goes berserk and messes things up
here.  The cerr effect may have something to do with aliasing (though again
at the RTL level, I think).  IVOPTs dumps show

(4.0)
  # result_67 = PHI <result_32(19), 0.0(17)>;
  # n_4 = PHI <n_66(19), 0(17)>;
<L6>:;
  D.32905_127 = (unsigned int) n_4;
  D.32906_126 = (double *) D.32905_127;
  D.32907_125 = D.32906_126 * 8B;
  D.32908_124 = first_11 + D.32907_125;
  D.32848_69 = D.32908_124;
  #   VUSE <init_value_5>;
  #   VUSE <data_12>;
  #   VUSE <Data_129>;
  #   VUSE <cerr_3>;
  D.32849_64 = *D.32848_69;
  result_32 = D.32849_64 + result_67;
  n_66 = n_4 + 1;
  if (n_66 != D.32844_140) goto <L34>; else goto <L35>;

(4.1)
  # n_105 = PHI <n_66(11), 0(9)>;
  # result_103 = PHI <result_65(11), 0.0(9)>;
<L8>:;
  D.34086_22 = (double *) n_105;
  #   VUSE <cerr_13>;
  #   VUSE <data_36>;
  #   VUSE <Data_27>;
  D.34003_64 = MEM[base: first_11, index: D.34086_22, step: 8B];
  result_65 = D.34003_64 + result_103;
  n_66 = n_105 + 1;
  if (n_66 != D.34008_102) goto <L33>; else goto <L34>;

which shows there's no real difference in tree-level alias information.
For the separate function we do

  # n_27 = PHI <n_19(3), 0(1)>;
  # result_25 = PHI <result_18(3), 0.0(1)>;
<L0>:;
  D.1814_2 = (double *) n_27;
  #   VUSE <TMT.8_20>;
  D.1750_17 = MEM[base: first_7, index: D.1814_2, step: 8B];
  result_18 = D.1750_17 + result_25;
  n_19 = n_27 + 1;
  if (n_19 != D.1744_24) goto <L9>; else goto <L10>;

though.  I'll make this rtl-optimization until someone tries another architecture.

-- 
           What    |Removed                     |Added
----------------------------------------------------------------------------
          Component|target                      |rtl-optimization
           Keywords|                            |missed-optimization
            Summary|performance regression,     |[4.1 regression] performance
                   |possibly related to caching |regression, possibly related
                   |                            |to caching


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=23322

References:
- [Bug target/23322] New: performance regression, possibly related to caching
  - From: danalis at cis dot udel dot edu

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]