This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
OpenMP performance loss
- From: Benoit Pradelle <b dot pradelle at gmail dot com>
- To: gcc at gcc dot gnu dot org
- Date: Tue, 19 May 2009 14:22:30 +0200
- Subject: OpenMP performance loss
Hello all,
I recently encountered a strange performance drop in a test program. I
have two versions of the same code (or so I believe): one
parallelized with OpenMP pragmas and the other manually parallelized
with pthreads. The test machine has an Intel Core i7 920
processor (x86_64, 4 cores, with Hyper-Threading enabled). Both
versions are compiled with the -O2 option.
gcc -v gives me:
Utilisation des specs internes.
Target: x86_64-unknown-linux-gnu
Configuré avec: ../configure --prefix=/usr --enable-shared
--enable-languages=c,c++,fortran,objc,obj-c++ --enable-threads=posix
--mandir=/usr/share/man --infodir=/usr/share/info
--enable-__cxa_atexit --disable-multilib --libdir=/usr/lib
--libexecdir=/usr/lib --enable-clocale=gnu --disable-libstdcxx-pch
--with-tune=generic
Modèle de thread: posix
gcc version 4.4.0 (GCC)
uname -a:
Linux 2.6.29-ARCH #1 SMP PREEMPT Sat May 9 14:09:36 CEST 2009 x86_64
Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz GenuineIntel GNU/Linux
The problem appears when at least one core is heavily loaded. I
simulated the machine load using a simple program that loops forever,
performing multiplications on random parts of an 8 MB array.
I obtain very consistent results with both versions, except when I use
exactly 8 threads. In this special case, the performance of the pthread
version is still very consistent, but the OpenMP version suffers from
a huge performance drop: the execution time is multiplied by a factor
of around 5 compared to the runtime with 7 or 9 threads.
What surprised me is that this performance drop appears neither in the
pthread version nor when using 7 or 9 threads with the OpenMP
version. The problem only appears when at least one core is heavily
loaded.
Correct me if I'm wrong, but I think that both versions of the code are pretty similar.
Could someone help me understand this performance drop? Am I doing something
wrong with OpenMP? Why does this performance drop only happen when
using exactly 8 threads?
Thank you in advance,
Benoit Pradelle.
#include <omp.h>
#include <time.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
/* Logical matrix size: the loops in main() visit N rows of N columns. */
#define N 1024
/* Extra doubles at the end of every row: element (i, j) lives at index
 * i * (N + PAD) + j. The kernel reads column j + 5, so for the last columns
 * the read lands in this padding. */
#define PAD 17
/* Matrix updated inside the OpenMP parallel region of main(). */
double A[N * (N + PAD)];
/* Reference matrix updated by the serial "checking" loops of main() and
 * compared element by element against A. */
double C[N * (N + PAD)];
/*
 * OpenMP version of the test kernel.
 *
 * Usage: prog [nb_threads]
 *   nb_threads  optional positive thread count handed to
 *               omp_set_num_threads(); when omitted the thread count is
 *               left to the OpenMP runtime.
 *
 * Fills A and C identically, updates A inside an OpenMP parallel region and
 * C with the same loops run serially, then prints one "CHECK FAILED" line
 * per element that differs.
 *
 * Returns EXIT_FAILURE when nb_threads is not a positive integer,
 * EXIT_SUCCESS otherwise (including when the check reports mismatches).
 */
int main(int argc, char **argv) {
    int i, j, k;

    /* Do not let the runtime adjust the number of threads on its own. */
    omp_set_dynamic(0);
    if (argc > 1) {
        /* Largest int value, derived without needing <limits.h>. */
        const long int_max = (long) (((unsigned int) -1) >> 1);
        char *end;
        long requested = strtol(argv[1], &end, 10);

        /* strtol rather than atoi: non-numeric, zero and negative values
         * are rejected because omp_set_num_threads() needs a positive int. */
        if (end == argv[1] || *end != '\0' ||
            requested < 1 || requested > int_max) {
            fprintf(stderr, "usage: %s [nb_threads >= 1]\n", argv[0]);
            return EXIT_FAILURE;
        }
        omp_set_num_threads((int) requested);
    }

    /* initialization */
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            A[i * (N + PAD) + j] = i * j;
            C[i * (N + PAD) + j] = i * j;
        }
    }

    /* Every thread of the team runs the outer i loop; the "omp for" splits
     * the j iterations of row i among the threads. The implicit barrier at
     * the end of each "omp for" keeps row i + 1 untouched until row i is
     * complete. */
#pragma omp parallel private(i, j, k) shared(A)
    {
        for (i = 0; i < N - 1; i++) {
#pragma omp for
            for (j = 0; j < N; j++) {
                for (k = 0; k < N; k++) {
                    A[i * (N + PAD) + j] =
                        A[(i + 1) * (N + PAD) + j + 5] + j * i - k;
                }
            }
        }
    }

    /* checking: same computation, run serially on the reference matrix */
    for (i = 0; i < N - 1; i++) {
        for (j = 0; j < N; j++) {
            for (k = 0; k < N; k++) {
                C[i * (N + PAD) + j] =
                    C[(i + 1) * (N + PAD) + j + 5] + j * i - k;
            }
        }
    }
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            if (A[i * (N + PAD) + j] != C[i * (N + PAD) + j]) {
                printf("CHECK FAILED at %d %d\n", i, j);
            }
        }
    }
    return EXIT_SUCCESS;
}
#include <pthread.h>
#include <time.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>
/* Logical matrix size: the loops visit N rows of N columns. */
#define N 1024
/* Extra doubles at the end of every row: element (i, j) lives at index
 * i * (N + PAD) + j. The kernel reads column j + 5, so for the last columns
 * the read lands in this padding. */
#define PAD 17
/* Matrix updated by the worker threads in routine(). */
double A[N * (N + PAD)];
/* Reference matrix updated by the serial "checking" loops of main() and
 * compared element by element against A. */
double C[N * (N + PAD)];
/* Per-thread arguments: main() fills one entry per worker and passes its
 * address to routine() through pthread_create(). */
typedef struct {
/* Index of this worker, 0 .. nb_ths - 1; selects its range of columns. */
unsigned int id;
/* Total number of workers sharing the columns. */
unsigned int nb_ths;
} tattrs;
/* State of the hand-written barrier (see init_barrier() / wait_barrier()). */
/* Number of arrivals required before the barrier opens; set by init_barrier(). */
static unsigned int limit;
/* Number of threads that have arrived so far; reset to 0 when the barrier opens. */
static unsigned int count;
/* Mutex held by wait_barrier() while it reads or modifies count. */
static pthread_mutex_t lock;
/* Condition variable on which early arrivals block until the last arrival
 * broadcasts. */
static pthread_cond_t cond;
/*
 * Initialize the barrier so that it opens once `lim` threads have called
 * wait_barrier().
 *
 * lim: number of participating threads.
 *
 * Must be called once, before any thread uses the barrier. A mutex or
 * condition variable that failed to initialize cannot be used safely, so
 * a failure is reported on stderr and terminates the process.
 */
void init_barrier(unsigned int lim) {
    int err;

    limit = lim;
    count = 0;

    err = pthread_mutex_init(&lock, NULL);
    if (err == 0) {
        err = pthread_cond_init(&cond, NULL);
    }
    if (err != 0) {
        fprintf(stderr, "init_barrier: initialization failed (error %d)\n",
                err);
        exit(EXIT_FAILURE);
    }
}
/*
 * Block the calling thread until `limit` threads have reached the barrier,
 * then release them all. The barrier is reusable: the last arrival resets
 * the arrival count for the next round.
 */
void wait_barrier(void) {
    /* Number of rounds completed so far; only touched with `lock` held. */
    static unsigned int generation;
    unsigned int arrival_generation;

    pthread_mutex_lock(&lock);
    arrival_generation = generation;
    count++;
    if (count >= limit) {
        /* Last arrival: start a new round and wake everybody up. */
        count = 0;
        generation++;
        pthread_cond_broadcast(&cond);
    } else {
        /* pthread_cond_wait() may return without a broadcast (spurious
         * wakeup), so keep waiting until the round has really changed.
         * Comparing against the generation, rather than count, stays
         * correct even if fast threads already entered the next round. */
        while (arrival_generation == generation) {
            pthread_cond_wait(&cond, &lock);
        }
    }
    pthread_mutex_unlock(&lock);
}
/*
 * Worker thread body: updates this thread's slice of columns of A, one row
 * at a time, meeting the other workers at the barrier after every row.
 *
 * args: pointer to this worker's tattrs (id and total worker count).
 * Returns NULL.
 */
void *routine(void *args) {
    const tattrs *self = args;
    /* Each worker owns N / nb_ths consecutive columns; the last worker
     * also takes the columns left over by the integer division. */
    const unsigned int width = N / self->nb_ths;
    const unsigned int first = self->id * width;
    const unsigned int last =
        (self->id == self->nb_ths - 1) ? N : first + width;
    unsigned int row, col, step;

    for (row = 0; row < N - 1; row++) {
        for (col = first; col < last; col++) {
            for (step = 0; step < N; step++) {
                /* NOTE: col * row - step is evaluated in unsigned
                 * arithmetic, so it wraps around instead of going
                 * negative when col * row < step. */
                A[row * (N + PAD) + col] =
                    A[(row + 1) * (N + PAD) + col + 5] + col * row - step;
            }
        }
        /* Row `row` must be finished by every worker before any of them
         * starts overwriting row `row + 1`, which this row reads from. */
        wait_barrier();
    }
    return NULL;
}
/*
 * pthread version of the test kernel.
 *
 * Usage: prog [nb_threads]
 *   nb_threads  optional positive number of worker threads (default 8).
 *
 * Fills A and C identically, lets nb_threads workers update A through
 * routine(), recomputes the same update serially on C, then prints one
 * "CHECK FAILED" line per element that differs.
 *
 * Returns EXIT_FAILURE on an invalid argument or when the per-thread data
 * cannot be allocated, EXIT_SUCCESS otherwise (including when the check
 * reports mismatches). The process exits with EXIT_FAILURE if a worker
 * thread cannot be created.
 */
int main(int argc, char **argv) {
    pthread_t *tids;
    tattrs *ctx;
    unsigned int nb_ths = 8;
    unsigned int i, j, k;

    if (argc > 1) {
        char *end;
        long requested = strtol(argv[1], &end, 10);

        /* strtol rather than atoi: non-numeric, zero and negative values
         * are rejected instead of silently becoming 0 or a huge unsigned
         * thread count. */
        if (end == argv[1] || *end != '\0' || requested < 1 ||
            (unsigned long) requested > (unsigned int) -1) {
            fprintf(stderr, "usage: %s [nb_threads >= 1]\n", argv[0]);
            return EXIT_FAILURE;
        }
        nb_ths = (unsigned int) requested;
    }

    /* initialization */
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            A[i * (N + PAD) + j] = i * j;
            C[i * (N + PAD) + j] = i * j;
        }
    }

    init_barrier(nb_ths);
    /* calloc checks the count * size multiplication for overflow. */
    tids = calloc(nb_ths, sizeof *tids);
    ctx = calloc(nb_ths, sizeof *ctx);
    if (tids == NULL || ctx == NULL) {
        fprintf(stderr, "cannot allocate data for %u threads\n", nb_ths);
        free(tids);
        free(ctx);
        return EXIT_FAILURE;
    }

    /* run */
    for (i = 0; i < nb_ths; i++) {
        int err;

        ctx[i].id = i;
        ctx[i].nb_ths = nb_ths;
        err = pthread_create(&tids[i], NULL, routine, &ctx[i]);
        if (err != 0) {
            /* The workers already started wait in the barrier for nb_ths
             * participants and would never finish, so they cannot be
             * joined; exit() ends the whole process instead. */
            fprintf(stderr, "cannot create thread %u (error %d)\n", i, err);
            exit(EXIT_FAILURE);
        }
    }
    for (i = 0; i < nb_ths; i++) {
        int err = pthread_join(tids[i], NULL);

        if (err != 0) {
            fprintf(stderr, "cannot join thread %u (error %d)\n", i, err);
        }
    }
    free(tids);
    free(ctx);

    /* checking: same computation, run serially on the reference matrix.
     * i, j and k are unsigned here exactly as in routine(), so both
     * computations use the same arithmetic. */
    for (i = 0; i < N - 1; i++) {
        for (j = 0; j < N; j++) {
            for (k = 0; k < N; k++) {
                C[i * (N + PAD) + j] =
                    C[(i + 1) * (N + PAD) + j + 5] + j * i - k;
            }
        }
    }
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            if (A[i * (N + PAD) + j] != C[i * (N + PAD) + j]) {
                /* %u matches the unsigned indices; output is unchanged. */
                printf("CHECK FAILED at %u %u\n", i, j);
            }
        }
    }
    return EXIT_SUCCESS;
}