Bug 87608 - Very slow swap operations
Summary: Very slow swap operations
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization
Version: 9.0
Importance: P3 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks:
 
Reported: 2018-10-13 23:33 UTC by Thomas Koenig
Modified: 2018-10-23 12:18 UTC
CC List: 1 user

See Also:
Host:
Target: x86_64-pc-linux-gnu
Build:
Known to work:
Known to fail:
Last reconfirmed: 2018-10-15 00:00:00


Description Thomas Koenig 2018-10-13 23:33:49 UTC
The following test program, which I received from somebody else
(reproduced with permission), takes about three times as many cycles
when compiled with gcc as with clang: 1428 cycles vs. 544 cycles,
including measurement overhead.
 
#include <stdio.h>

extern "C" long rdtsc(void);

/* Conditional swap: afterwards *(a) holds the smaller and *(b) the
   larger of the two values; uses the caller-provided temporary t.  */
#define cond_swap5(a,b)\
    t = *(a);\
    *(a) = (t<*(b))?t:*(b);\
    *(b) = (t<*(b))?*(b):t;

// Generic case: a no-op in this reduced testcase.
template<int n>
void static_sort1(int *a){
    return;
}

// Specialization for n == 32: a fixed network of conditional swaps.
template<>
void static_sort1<32>(int* first){
    int t;
    static_sort1<16>(first);
    static_sort1<16>(first+16);

    cond_swap5(first + 0u, first + 16u);
    cond_swap5(first + 8u, first + 24u);
    cond_swap5(first + 8u, first + 16u);
    cond_swap5(first + 4u, first + 20u);
    cond_swap5(first + 12u, first + 28u);
    cond_swap5(first + 12u, first + 20u);
    cond_swap5(first + 4u, first + 8u);
    cond_swap5(first + 12u, first + 16u);
    cond_swap5(first + 20u, first + 24u);
    cond_swap5(first + 2u, first + 18u);
    cond_swap5(first + 10u, first + 26u);
    cond_swap5(first + 10u, first + 18u);
    cond_swap5(first + 6u, first + 22u);
    cond_swap5(first + 14u, first + 30u);
    cond_swap5(first + 14u, first + 22u);
    cond_swap5(first + 6u, first + 10u);
    cond_swap5(first + 14u, first + 18u);
    cond_swap5(first + 22u, first + 26u);
    cond_swap5(first + 2u, first + 4u);
    cond_swap5(first + 6u, first + 8u);
    cond_swap5(first + 10u, first + 12u);
    cond_swap5(first + 14u, first + 16u);
    cond_swap5(first + 18u, first + 20u);
    cond_swap5(first + 22u, first + 24u);
    cond_swap5(first + 26u, first + 28u);
    cond_swap5(first + 1u, first + 17u);
    cond_swap5(first + 9u, first + 25u);
    cond_swap5(first + 9u, first + 17u);
    cond_swap5(first + 5u, first + 21u);
    cond_swap5(first + 13u, first + 29u);
    cond_swap5(first + 13u, first + 21u);
    cond_swap5(first + 5u, first + 9u);
    cond_swap5(first + 13u, first + 17u);
    cond_swap5(first + 21u, first + 25u);
    cond_swap5(first + 3u, first + 19u);
    cond_swap5(first + 11u, first + 27u);
    cond_swap5(first + 11u, first + 19u);
    cond_swap5(first + 7u, first + 23u);
    cond_swap5(first + 15u, first + 31u);
    cond_swap5(first + 15u, first + 23u);
    cond_swap5(first + 7u, first + 11u);
    cond_swap5(first + 15u, first + 19u);
    cond_swap5(first + 23u, first + 27u);
    cond_swap5(first + 3u, first + 5u);
    cond_swap5(first + 7u, first + 9u);
    cond_swap5(first + 11u, first + 13u);
    cond_swap5(first + 15u, first + 17u);
    cond_swap5(first + 19u, first + 21u);
    cond_swap5(first + 23u, first + 25u);
    cond_swap5(first + 27u, first + 29u);
    cond_swap5(first + 1u, first + 2u);
    cond_swap5(first + 3u, first + 4u);
    cond_swap5(first + 5u, first + 6u);
    cond_swap5(first + 7u, first + 8u);
    cond_swap5(first + 9u, first + 10u);
    cond_swap5(first + 11u, first + 12u);
    cond_swap5(first + 13u, first + 14u);
    cond_swap5(first + 15u, first + 16u);
    cond_swap5(first + 17u, first + 18u);
    cond_swap5(first + 19u, first + 20u);
    cond_swap5(first + 21u, first + 22u);
    cond_swap5(first + 23u, first + 24u);
    cond_swap5(first + 25u, first + 26u);
    cond_swap5(first + 27u, first + 28u);
    cond_swap5(first + 29u, first + 30u);
}

int main(){
    int a[32];
    long t1, t2;
    for (int i=0; i<32; i++)
        a[i] = 20*i - 32*i*i;

    t1 = rdtsc();
    static_sort1<32>(a);
    t2 = rdtsc();
    for (int i=0; i<32; i++)
        printf("%d ",a[i]);
    printf("\n %ld\n", t2-t1);
    return 0;
}
$ cat rdtsc.s
        .file   "rdtsc.s"
        .text
        .globl  rdtsc
        .type   rdtsc, @function
rdtsc:
.LFB0:
        rdtsc                   # TSC: low 32 bits -> %eax, high 32 bits -> %edx
        shl     $32, %rdx
        or      %rdx, %rax      # combine into a single 64-bit value in %rax
        ret
.LFE0:
        .size   rdtsc, .-rdtsc
        .section        .note.GNU-stack,"",@progbits
$ g++ -march=native -mtune=native -O3 j2.c rdtsc.s
$ ./a.out
-7872 -17952 -8908 -19500 -10008 -21112 -11172 -22788 -12400 -24528 -13692 -26332 -15048 -28200 -16468 -30132 0 -1888 -12 -2412 -88 -3000 -228 -3652 -432 -4368 -700 -5148 -1032 -5992 -1428 -6900 
 1428
$ clang++ -O3 -stdlib=libc++ j2.c rdtsc.s
clang-3.8: warning: treating 'c' input as 'c++' when in C++ mode, this behavior is deprecated
$ ./a.out
-7872 -17952 -8908 -19500 -10008 -21112 -11172 -22788 -12400 -24528 -13692 -26332 -15048 -28200 -16468 -30132 0 -1888 -12 -2412 -88 -3000 -228 -3652 -432 -4368 -700 -5148 -1032 -5992 -1428 -6900 
 544

This is on x86_64-pc-linux-gnu with an AMD Ryzen 7.
Comment 1 Alexander Monakov 2018-10-14 12:11:39 UTC
Note the compiler can evaluate the initialization loop and then also evaluate the effect of the static_sort1 call, so the testcase might give misleading results. To avoid that, pass the address of 'a' to rdtsc, or introduce a compiler barrier with an asm:

  asm volatile ("" :: "r"(a) : "memory");

Furthermore, note that the CPU executes the rdtsc instruction without waiting for all preceding computations to complete. Using lfence just before rdtsc will ensure that rdtsc reads the cycle counter only after all preceding computations are done.
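
For illustration, a sketch that combines both suggestions in the timing harness (rdtsc_serialized is a hypothetical replacement for the external rdtsc, not part of the original testcase):

  static inline long rdtsc_serialized(void) {
      unsigned lo, hi;
      /* lfence makes rdtsc wait for all preceding instructions */
      asm volatile ("lfence\n\trdtsc" : "=a"(lo), "=d"(hi));
      return ((long)hi << 32) | lo;
  }

  ...
  asm volatile ("" :: "r"(a) : "memory");  /* compiler barrier */
  t1 = rdtsc_serialized();
  static_sort1<32>(a);
  asm volatile ("" :: "r"(a) : "memory");  /* keep the sort from sinking past t2 */
  t2 = rdtsc_serialized();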

On this testcase I think LLVM introduces ternary select operations in the IR fairly early and then works with straight-line code; in contrast, in GCC we scalarize the array and have a soup of BBs and phi nodes throughout the gimple passes, which would be very hard to clean up properly on RTL.
Comment 2 Thomas Koenig 2018-10-14 16:04:12 UTC
(In reply to Alexander Monakov from comment #1)
> Note the compiler can evaluate the initialization loop and then also
> evaluate the effect of the static_sort1 call, so the testcase might give
> misleading results. To avoid that, pass the address of 'a' to rdtsc, or
> introduce a compiler barrier with an asm:
> 
>   asm volatile ("" :: "r"(a) : "memory");
> 
> Furthermore, note that the CPU executes the rdtsc instruction without
> waiting for all preceding computations to complete. Using lfence just before
> rdtsc will ensure that rdtsc reads the cycle counter only after all
> preceding computations are done.

Thanks for the hint.

I added the memory barrier to the code; it made no appreciable
difference to the timing.
Comment 3 Richard Biener 2018-10-15 07:53:20 UTC
I believe this is another case where an early phiopt pass would help.  We have
a sequence of

  <bb 2> :
  _1 = first_522(D) + 64;
  t_525 = *first_522(D);
  _3 = MEM[(int *)first_522(D) + 64B];
  if (_3 <= t_525)
    goto <bb 3>; [INV]
  else
    goto <bb 4>; [INV]

  <bb 3> :

  <bb 4> :
  # iftmp.0_391 = PHI <_3(3), t_525(2)>

in which the VRP1 pass mixes up the CFG via jump threading so that the
later phiopt pass no longer sees these min/max patterns.
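
For reference, phiopt recognizes this diamond as a minimum and replaces the PHI with straight-line code; a sketch of the gimple one would expect:

  <bb 2> :
  t_525 = *first_522(D);
  _3 = MEM[(int *)first_522(D) + 64B];
  iftmp.0_391 = MIN_EXPR <_3, t_525>;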

I have a nearly finished patch (which I even posted) to move phiopt1
earlier, but it had some fallout.  I'm not sure I can find enough cycles
to finish it for GCC 9.  One idea was to restrict it to min/max/abs
recognition, because that's what most PRs talk about.  Another idea
would be to move both ifcombine and phiopt before the first real
jump-threading pass (VRP1), since most "bad" effects of moving phiopt
were related to missed ifcombine optimizations.
Comment 4 Richard Biener 2018-10-23 11:35:28 UTC
Author: rguenth
Date: Tue Oct 23 11:34:56 2018
New Revision: 265421

URL: https://gcc.gnu.org/viewcvs?rev=265421&root=gcc&view=rev
Log:
2018-10-23  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/87105
	PR tree-optimization/87608
	* passes.def (pass_all_early_optimizations): Add early phi-opt
	after dce.
	* tree-ssa-phiopt.c (value_replacement): Ignore NOPs and predicts in
	addition to debug stmts.
	(tree_ssa_phiopt_worker): Add early_p argument, do only min/max
	and abs replacement early.
	* tree-cfg.c (gimple_empty_block_p): Likewise.

	* g++.dg/tree-ssa/phiopt-1.C: New testcase.
	* g++.dg/vect/slp-pr87105.cc: Likewise.
	* g++.dg/tree-ssa/pr21463.C: Scan phiopt2 because this testcase
	relies on phiprop run before.
	* g++.dg/tree-ssa/pr30738.C: Likewise.
	* g++.dg/tree-ssa/pr57380.C: Likewise.
	* gcc.dg/tree-ssa/pr84859.c: Likewise.
	* gcc.dg/tree-ssa/pr45397.c: Scan phiopt2 because phiopt1 is
	confused by copies in the IL left by EVRP.
	* gcc.dg/tree-ssa/phi-opt-5.c: Likewise, this time confused
	by predictors.
	* gcc.dg/tree-ssa/phi-opt-12.c: Scan phiopt2.
	* gcc.dg/pr24574.c: Likewise.
	* g++.dg/tree-ssa/pr86544.C: Scan phiopt4.

Added:
    trunk/gcc/testsuite/g++.dg/tree-ssa/phiopt-1.C
    trunk/gcc/testsuite/g++.dg/vect/slp-pr87105.cc
Modified:
    trunk/gcc/ChangeLog
    trunk/gcc/passes.def
    trunk/gcc/testsuite/ChangeLog
    trunk/gcc/testsuite/g++.dg/tree-ssa/pr21463.C
    trunk/gcc/testsuite/g++.dg/tree-ssa/pr30738.C
    trunk/gcc/testsuite/g++.dg/tree-ssa/pr57380.C
    trunk/gcc/testsuite/g++.dg/tree-ssa/pr86544.C
    trunk/gcc/testsuite/gcc.dg/pr24574.c
    trunk/gcc/testsuite/gcc.dg/tree-ssa/20040514-1.c
    trunk/gcc/testsuite/gcc.dg/tree-ssa/20040518-1.c
    trunk/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-12.c
    trunk/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-5.c
    trunk/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-6.c
    trunk/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-8.c
    trunk/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c
    trunk/gcc/testsuite/gcc.dg/tree-ssa/pr45397.c
    trunk/gcc/testsuite/gcc.dg/tree-ssa/pr84859.c
    trunk/gcc/testsuite/gcc.dg/uninit-15.c
    trunk/gcc/tree-cfg.c
    trunk/gcc/tree-ssa-phiopt.c
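
For reference, the "Add early phi-opt after dce" entry above corresponds to a passes.def change of roughly this shape (a sketch; surrounding passes elided):

	PUSH_INSERT_PASSES_WITHIN (pass_all_early_optimizations)
	    ...
	    NEXT_PASS (pass_dce);
	    NEXT_PASS (pass_phiopt, true /* early_p */);
	    ...
	POP_INSERT_PASSES ()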
Comment 5 Richard Biener 2018-10-23 12:18:43 UTC
Fixed on trunk.