This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

optimizing predictable branches on x86


Hi list,

gcc-4.3 appears to make quite heavy use of cmov to eliminate
conditional branches on x86(-64) architecture, even for those
branches that are determined to be predictable.

The problem with this is that the data dependancy introduced
by the cmov can restrict execution, wheras a predicted branch
will not. The Intel manual says not to use cmov for
predictable branches.

I have a test case which I /believe/ demonstrates that a cond
jump is not a great deal slower in the case where there are
no data dependancy hazards hit, and can be quite a bit faster
in the case where there is a dependancy -- if the branch is
predictable. It also shows that if the branch is unpredictable
then cmov can be a good win (as expected).

Compiled with:
gcc-4.3 -O3 -falign-functions=64 -falign-loops=64 -falign-jumps=64
        -falign-labels=64 -march=core2/opteron

Opteron:
 no deps,   predictable -- C    code took   8.92ns per iteration
 no deps,   predictable -- cmov code took   9.09ns per iteration
 no deps,   predictable -- jmp  code took   9.60ns per iteration[1]
has deps,   predictable -- C    code took  20.04ns per iteration
has deps,   predictable -- cmov code took  18.09ns per iteration
has deps,   predictable -- jmp  code took  14.97ns per iteration[2]
 no deps, unpredictable -- C    code took   8.92ns per iteration
 no deps, unpredictable -- cmov code took   9.09ns per iteration
 no deps, unpredictable -- jmp  code took  15.19ns per iteration[3]
has deps, unpredictable -- C    code took  32.07ns per iteration
has deps, unpredictable -- cmov code took  33.15ns per iteration
has deps, unpredictable -- jmp  code took  69.04ns per iteration[4]

[1] jmp is slightly slower, can it be improved?
[2] nice improvement here
[3] mispredict penalty, cmov is a big win
[4] mispredict which includes load missing cache!

Core2:
 no deps,   predictable -- C    code took   4.24ns per iteration
 no deps,   predictable -- cmov code took   4.27ns per iteration
 no deps,   predictable -- jmp  code took   4.24ns per iteration
has deps,   predictable -- C    code took   6.04ns per iteration
has deps,   predictable -- cmov code took   6.59ns per iteration
has deps,   predictable -- jmp  code took   5.04ns per iteration
 no deps, unpredictable -- C    code took   4.24ns per iteration
 no deps, unpredictable -- cmov code took   4.25ns per iteration
 no deps, unpredictable -- jmp  code took   8.79ns per iteration
has deps, unpredictable -- C    code took  49.78ns per iteration
has deps, unpredictable -- cmov code took  50.28ns per iteration
has deps, unpredictable -- jmp  code took  75.67ns per iteration

Core2 follows a similar pattern, although it's not seeing any
slowdown in the "no deps, predictable, jmp" case like K8 does.

Any comments? (please cc me) Should gcc be using conditional jumps
more often eg. in the case of __builtin_expect())?

Thanks,
Nick
//#define noinline __attribute__((noinline))
#define noinline
#define likely(x) __builtin_expect(!!(x), 1)

static noinline int test_cmov(int a, int b)
{
	int ret;

	asm volatile (
		"cmpl	%1, %2\n\t"
		"cmovle	%2, %1\n\t"
		"movl	%1, %0\n\t"
		: "=r" (ret)
		: "r" (a), "r" (b));

	return ret;
}

static noinline int test_jmp(int a, int b)
{
	int ret;

	asm volatile (
		"cmpl	%1, %2\n\t"
		"jle	1f\n\t"
		"movl	%1, %0\n\t"
		"2:\n\t"
		".subsection 1\n\t"
		"1:\n\t"
		"movl	%2, %0\n\t"
		"jmp	2b\n\t"
		".previous\n\t"
		: "=r" (ret)
		: "r" (a), "r" (b));

	return ret;
}

static noinline int test_c(int a, int b)
{
	int ret;

	if (likely(a < b))
		ret = a;
	else
		ret = b;

	return ret;
}

#define ITERS	100000000
#define SIZE	(4*1024*1024)
static int array1[SIZE];
static int array2[SIZE];
static volatile int result[SIZE];

#include <sys/time.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

static void print_time(const char *str, struct timeval *s, struct timeval *e, unsigned long iters)
{
	unsigned long long us, ns;
	double ns_iter;

	us = 1000000*(e->tv_sec - s->tv_sec) + (e->tv_usec - s->tv_usec);
	ns = us * 1000;
	ns_iter = (double)ns / iters;
	ns /= iters;

	printf("%s took % 6.2lfns per iteration\n", str, ns_iter);
}

int main(void)
{
	struct timeval s, e;
	int i;

	assert(test_cmov(1, 2) == test_c(1, 2));
	assert(test_cmov(1, 1) == test_c(1, 1));
	assert(test_cmov(2, 1) == test_c(2, 1));
	assert(test_jmp(1, 2) == test_c(1, 2));
	assert(test_jmp(1, 1) == test_c(1, 1));
	assert(test_jmp(2, 1) == test_c(2, 1));

	for (i = 0; i < SIZE; i++) {
		array1[i] = i;
		array2[i] = i + 1;
		result[i] = 0;
	}

	gettimeofday(&s, NULL);
	for (i = 0; i < ITERS; i++) {
		result[i%SIZE] = test_c(array1[i%SIZE], array2[i%SIZE]);
	}
	gettimeofday(&e, NULL);
	print_time(" no deps,   predictable -- C    code", &s, &e, ITERS);

	gettimeofday(&s, NULL);
	for (i = 0; i < ITERS; i++) {
		result[i%SIZE] = test_cmov(array1[i%SIZE], array2[i%SIZE]);
	}
	gettimeofday(&e, NULL);
	print_time(" no deps,   predictable -- cmov code", &s, &e, ITERS);

	gettimeofday(&s, NULL);
	for (i = 0; i < ITERS; i++) {
		result[i%SIZE] = test_jmp(array1[i%SIZE], array2[i%SIZE]);
	}
	gettimeofday(&e, NULL);
	print_time(" no deps,   predictable -- jmp  code", &s, &e, ITERS);


	gettimeofday(&s, NULL);
	for (i = 0; i < ITERS; i++) {
		unsigned int r;
		r = test_c(array1[i%SIZE], array2[i%SIZE]);
		result[r%SIZE]++;
	}
	gettimeofday(&e, NULL);
	print_time("has deps,   predictable -- C    code", &s, &e, ITERS);

	gettimeofday(&s, NULL);
	for (i = 0; i < ITERS; i++) {
		unsigned int r;
		r = test_cmov(array1[i%SIZE], array2[i%SIZE]);
		result[r%SIZE]++;
	}
	gettimeofday(&e, NULL);
	print_time("has deps,   predictable -- cmov code", &s, &e, ITERS);

	gettimeofday(&s, NULL);
	for (i = 0; i < ITERS; i++) {
		unsigned int r;
		r = test_jmp(array1[i%SIZE], array2[i%SIZE]);
		result[r%SIZE]++;
	}
	gettimeofday(&e, NULL);
	print_time("has deps,   predictable -- jmp  code", &s, &e, ITERS);


	for (i = 0; i < SIZE; i++) {
		array1[i] = random();
		array2[i] = random();
		result[i] = 0;
	}

	gettimeofday(&s, NULL);
	for (i = 0; i < ITERS; i++) {
		result[i%SIZE] = test_c(array1[i%SIZE], array2[i%SIZE]);
	}
	gettimeofday(&e, NULL);
	print_time(" no deps, unpredictable -- C    code", &s, &e, ITERS);

	gettimeofday(&s, NULL);
	for (i = 0; i < ITERS; i++) {
		result[i%SIZE] = test_cmov(array1[i%SIZE], array2[i%SIZE]);
	}
	gettimeofday(&e, NULL);
	print_time(" no deps, unpredictable -- cmov code", &s, &e, ITERS);

	gettimeofday(&s, NULL);
	for (i = 0; i < ITERS; i++) {
		result[i%SIZE] = test_jmp(array1[i%SIZE], array2[i%SIZE]);
	}
	gettimeofday(&e, NULL);
	print_time(" no deps, unpredictable -- jmp  code", &s, &e, ITERS);


	gettimeofday(&s, NULL);
	for (i = 0; i < ITERS; i++) {
		unsigned int r;
		r = test_c(array1[i%SIZE], array2[i%SIZE]);
		result[r%SIZE]++;
	}
	gettimeofday(&e, NULL);
	print_time("has deps, unpredictable -- C    code", &s, &e, ITERS);

	gettimeofday(&s, NULL);
	for (i = 0; i < ITERS; i++) {
		unsigned int r;
		r = test_cmov(array1[i%SIZE], array2[i%SIZE]);
		result[r%SIZE]++;
	}
	gettimeofday(&e, NULL);
	print_time("has deps, unpredictable -- cmov code", &s, &e, ITERS);

	gettimeofday(&s, NULL);
	for (i = 0; i < ITERS; i++) {
		unsigned int r;
		r = test_jmp(array1[i%SIZE], array2[i%SIZE]);
		result[r%SIZE]++;
	}
	gettimeofday(&e, NULL);
	print_time("has deps, unpredictable -- jmp  code", &s, &e, ITERS);

	return 0;
}

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]