This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
optimizing predictable branches on x86
- From: Nick Piggin <nickpiggin at yahoo dot com dot au>
- To: gcc at gcc dot gnu dot org
- Date: Tue, 26 Feb 2008 14:34:11 +1100
- Subject: optimizing predictable branches on x86
Hi list,
gcc-4.3 appears to make quite heavy use of cmov to eliminate
conditional branches on x86(-64) architecture, even for those
branches that are determined to be predictable.
The problem with this is that the data dependancy introduced
by the cmov can restrict execution, wheras a predicted branch
will not. The Intel manual says not to use cmov for
predictable branches.
I have a test case which I /believe/ demonstrates that a cond
jump is not a great deal slower in the case where there are
no data dependancy hazards hit, and can be quite a bit faster
in the case where there is a dependancy -- if the branch is
predictable. It also shows that if the branch is unpredictable
then cmov can be a good win (as expected).
Compiled with:
gcc-4.3 -O3 -falign-functions=64 -falign-loops=64 -falign-jumps=64
-falign-labels=64 -march=core2/opteron
Opteron:
no deps, predictable -- C code took 8.92ns per iteration
no deps, predictable -- cmov code took 9.09ns per iteration
no deps, predictable -- jmp code took 9.60ns per iteration[1]
has deps, predictable -- C code took 20.04ns per iteration
has deps, predictable -- cmov code took 18.09ns per iteration
has deps, predictable -- jmp code took 14.97ns per iteration[2]
no deps, unpredictable -- C code took 8.92ns per iteration
no deps, unpredictable -- cmov code took 9.09ns per iteration
no deps, unpredictable -- jmp code took 15.19ns per iteration[3]
has deps, unpredictable -- C code took 32.07ns per iteration
has deps, unpredictable -- cmov code took 33.15ns per iteration
has deps, unpredictable -- jmp code took 69.04ns per iteration[4]
[1] jmp is slightly slower, can it be improved?
[2] nice improvement here
[3] mispredict penalty, cmov is a big win
[4] mispredict which includes load missing cache!
Core2:
no deps, predictable -- C code took 4.24ns per iteration
no deps, predictable -- cmov code took 4.27ns per iteration
no deps, predictable -- jmp code took 4.24ns per iteration
has deps, predictable -- C code took 6.04ns per iteration
has deps, predictable -- cmov code took 6.59ns per iteration
has deps, predictable -- jmp code took 5.04ns per iteration
no deps, unpredictable -- C code took 4.24ns per iteration
no deps, unpredictable -- cmov code took 4.25ns per iteration
no deps, unpredictable -- jmp code took 8.79ns per iteration
has deps, unpredictable -- C code took 49.78ns per iteration
has deps, unpredictable -- cmov code took 50.28ns per iteration
has deps, unpredictable -- jmp code took 75.67ns per iteration
Core2 follows a similar pattern, although it's not seeing any
slowdown in the "no deps, predictable, jmp" case like K8 does.
Any comments? (please cc me) Should gcc be using conditional jumps
more often eg. in the case of __builtin_expect())?
Thanks,
Nick
//#define noinline __attribute__((noinline))
#define noinline
#define likely(x) __builtin_expect(!!(x), 1)
static noinline int test_cmov(int a, int b)
{
int ret;
asm volatile (
"cmpl %1, %2\n\t"
"cmovle %2, %1\n\t"
"movl %1, %0\n\t"
: "=r" (ret)
: "r" (a), "r" (b));
return ret;
}
static noinline int test_jmp(int a, int b)
{
int ret;
asm volatile (
"cmpl %1, %2\n\t"
"jle 1f\n\t"
"movl %1, %0\n\t"
"2:\n\t"
".subsection 1\n\t"
"1:\n\t"
"movl %2, %0\n\t"
"jmp 2b\n\t"
".previous\n\t"
: "=r" (ret)
: "r" (a), "r" (b));
return ret;
}
static noinline int test_c(int a, int b)
{
int ret;
if (likely(a < b))
ret = a;
else
ret = b;
return ret;
}
#define ITERS 100000000
#define SIZE (4*1024*1024)
static int array1[SIZE];
static int array2[SIZE];
static volatile int result[SIZE];
#include <sys/time.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
static void print_time(const char *str, struct timeval *s, struct timeval *e, unsigned long iters)
{
unsigned long long us, ns;
double ns_iter;
us = 1000000*(e->tv_sec - s->tv_sec) + (e->tv_usec - s->tv_usec);
ns = us * 1000;
ns_iter = (double)ns / iters;
ns /= iters;
printf("%s took % 6.2lfns per iteration\n", str, ns_iter);
}
int main(void)
{
struct timeval s, e;
int i;
assert(test_cmov(1, 2) == test_c(1, 2));
assert(test_cmov(1, 1) == test_c(1, 1));
assert(test_cmov(2, 1) == test_c(2, 1));
assert(test_jmp(1, 2) == test_c(1, 2));
assert(test_jmp(1, 1) == test_c(1, 1));
assert(test_jmp(2, 1) == test_c(2, 1));
for (i = 0; i < SIZE; i++) {
array1[i] = i;
array2[i] = i + 1;
result[i] = 0;
}
gettimeofday(&s, NULL);
for (i = 0; i < ITERS; i++) {
result[i%SIZE] = test_c(array1[i%SIZE], array2[i%SIZE]);
}
gettimeofday(&e, NULL);
print_time(" no deps, predictable -- C code", &s, &e, ITERS);
gettimeofday(&s, NULL);
for (i = 0; i < ITERS; i++) {
result[i%SIZE] = test_cmov(array1[i%SIZE], array2[i%SIZE]);
}
gettimeofday(&e, NULL);
print_time(" no deps, predictable -- cmov code", &s, &e, ITERS);
gettimeofday(&s, NULL);
for (i = 0; i < ITERS; i++) {
result[i%SIZE] = test_jmp(array1[i%SIZE], array2[i%SIZE]);
}
gettimeofday(&e, NULL);
print_time(" no deps, predictable -- jmp code", &s, &e, ITERS);
gettimeofday(&s, NULL);
for (i = 0; i < ITERS; i++) {
unsigned int r;
r = test_c(array1[i%SIZE], array2[i%SIZE]);
result[r%SIZE]++;
}
gettimeofday(&e, NULL);
print_time("has deps, predictable -- C code", &s, &e, ITERS);
gettimeofday(&s, NULL);
for (i = 0; i < ITERS; i++) {
unsigned int r;
r = test_cmov(array1[i%SIZE], array2[i%SIZE]);
result[r%SIZE]++;
}
gettimeofday(&e, NULL);
print_time("has deps, predictable -- cmov code", &s, &e, ITERS);
gettimeofday(&s, NULL);
for (i = 0; i < ITERS; i++) {
unsigned int r;
r = test_jmp(array1[i%SIZE], array2[i%SIZE]);
result[r%SIZE]++;
}
gettimeofday(&e, NULL);
print_time("has deps, predictable -- jmp code", &s, &e, ITERS);
for (i = 0; i < SIZE; i++) {
array1[i] = random();
array2[i] = random();
result[i] = 0;
}
gettimeofday(&s, NULL);
for (i = 0; i < ITERS; i++) {
result[i%SIZE] = test_c(array1[i%SIZE], array2[i%SIZE]);
}
gettimeofday(&e, NULL);
print_time(" no deps, unpredictable -- C code", &s, &e, ITERS);
gettimeofday(&s, NULL);
for (i = 0; i < ITERS; i++) {
result[i%SIZE] = test_cmov(array1[i%SIZE], array2[i%SIZE]);
}
gettimeofday(&e, NULL);
print_time(" no deps, unpredictable -- cmov code", &s, &e, ITERS);
gettimeofday(&s, NULL);
for (i = 0; i < ITERS; i++) {
result[i%SIZE] = test_jmp(array1[i%SIZE], array2[i%SIZE]);
}
gettimeofday(&e, NULL);
print_time(" no deps, unpredictable -- jmp code", &s, &e, ITERS);
gettimeofday(&s, NULL);
for (i = 0; i < ITERS; i++) {
unsigned int r;
r = test_c(array1[i%SIZE], array2[i%SIZE]);
result[r%SIZE]++;
}
gettimeofday(&e, NULL);
print_time("has deps, unpredictable -- C code", &s, &e, ITERS);
gettimeofday(&s, NULL);
for (i = 0; i < ITERS; i++) {
unsigned int r;
r = test_cmov(array1[i%SIZE], array2[i%SIZE]);
result[r%SIZE]++;
}
gettimeofday(&e, NULL);
print_time("has deps, unpredictable -- cmov code", &s, &e, ITERS);
gettimeofday(&s, NULL);
for (i = 0; i < ITERS; i++) {
unsigned int r;
r = test_jmp(array1[i%SIZE], array2[i%SIZE]);
result[r%SIZE]++;
}
gettimeofday(&e, NULL);
print_time("has deps, unpredictable -- jmp code", &s, &e, ITERS);
return 0;
}