This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

OpenMP performance loss


Hello all,

I recently encountered a strange performance drop in a test code. I
have two versions of the same code (or so I believe): one
parallelized with OpenMP pragmas and the other parallelized manually
with pthreads. The test machine has an Intel Core i7 920
processor (x86_64, 4 cores, with HyperThreading enabled). Both
versions are compiled with the -O2 option.

gcc -v gives me:

Utilisation des specs internes.
Target: x86_64-unknown-linux-gnu
Configuré avec: ../configure --prefix=/usr --enable-shared
--enable-languages=c,c++,fortran,objc,obj-c++ --enable-threads=posix
--mandir=/usr/share/man --infodir=/usr/share/info
--enable-__cxa_atexit --disable-multilib --libdir=/usr/lib
--libexecdir=/usr/lib --enable-clocale=gnu --disable-libstdcxx-pch
--with-tune=generic
Modèle de thread: posix
gcc version 4.4.0 (GCC)

uname -a:

Linux 2.6.29-ARCH #1 SMP PREEMPT Sat May 9 14:09:36 CEST 2009 x86_64
Intel(R) Core(TM) i7 CPU 920 @ 2.67GHz GenuineIntel GNU/Linux

The problem appears when at least one core is heavily loaded. I
simulated the machine load with a simple program that loops forever,
performing multiplications on random parts of an 8 MB array.

I obtain very consistent results with both versions except when I use
exactly 8 threads. In this special case, the performance of the pthread
version is still very consistent, but the OpenMP version suffers from
a huge performance drop: the execution time is multiplied by a factor
of about 5 compared to the run time with 7 or 9 threads.

What surprised me is that this performance drop appears neither in the
pthread version nor in the OpenMP version when using 7 or 9 threads.
Again, the problem only appears when at least one core is heavily
loaded.

Correct me if I'm wrong, but I think that both versions of the code are essentially equivalent.

Could someone help me with this performance drop? Am I doing something
wrong with OpenMP? Why does this performance drop only happen when
using exactly 8 threads?

Thank you in advance,

Benoit Pradelle.
#include <omp.h>
#include <time.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

/* Number of rows, and of columns actually initialized/compared, per array. */
#define N 1024
/*
 * Extra elements appended to every row, giving a row stride of N + PAD.
 * The kernel reads column j + 5, i.e. up to 5 elements past column N - 1,
 * so those reads land in this padding instead of in the next row.
 */
#define PAD 17

/* Array updated by the OpenMP-parallel kernel (N rows of N + PAD doubles). */
double A[N * (N + PAD)];
/* Reference array updated by the sequential replay of the same kernel. */
double C[N * (N + PAD)];

/*
 * OpenMP version of the benchmark.
 *
 * Usage: prog [nb_threads]
 *
 * When a thread count is given it must be a positive decimal integer; it is
 * passed to omp_set_num_threads(). Without an argument the OpenMP runtime's
 * own default team size is used. The kernel is run in parallel on A, replayed
 * sequentially on C, and the two arrays are then compared element by element.
 *
 * Returns EXIT_SUCCESS once the comparison has run (mismatches are only
 * reported on stdout), or EXIT_FAILURE if the thread-count argument is
 * not a valid positive integer.
 */
int main(int argc, char **argv) {
    int i, j, k;

    /* Keep the team size fixed: no dynamic adjustment by the runtime. */
    omp_set_dynamic(0);

    if (argc > 1) {
        /* Largest value an int can hold, computed without <limits.h>. */
        const long max_ths = (long) ((unsigned int) -1 >> 1);
        char *end;
        long req = strtol(argv[1], &end, 10);

        /* Reject empty/non-numeric input, trailing garbage, zero,
         * negative values and anything that does not fit in an int. */
        if (end == argv[1] || *end != '\0' || req < 1 || req > max_ths) {
            fprintf(stderr, "invalid thread count: %s\n", argv[1]);
            return EXIT_FAILURE;
        }
        omp_set_num_threads((int) req);
    }

    /* initialization: both arrays start from the same contents */
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            A[i * (N + PAD) + j] = i * j;
            C[i * (N + PAD) + j] = i * j;
        }
    }

    /*
     * Parallel kernel. Every thread walks the rows in order; the columns of
     * each row are divided among the team by the "omp for". Row i is computed
     * from row i + 1, and the implicit barrier at the end of the "omp for"
     * (there is no nowait clause) keeps any thread from overwriting row
     * i + 1 while another thread is still reading it.
     */
#pragma omp parallel private(i, j, k) shared(A)
    {
        for (i = 0; i < N - 1; i++) {
            #pragma omp for
            for (j = 0; j < N; j++) {
                /* Only the last k iteration determines the stored value;
                 * the loop is kept because it is the benchmark's workload. */
                for (k = 0; k < N; k++) {
                    A[i * (N + PAD) + j] =
                        A[(i + 1) * (N + PAD) + j + 5] + j * i - k;
                }
            }
        }
    }

    /* checking: replay the same kernel sequentially on C */
    for (i = 0; i < N - 1; i++) {
        for (j = 0; j < N; j++) {
            for (k = 0; k < N; k++) {
                C[i * (N + PAD) + j] =
                    C[(i + 1) * (N + PAD) + j + 5] + j * i - k;
            }
        }
    }

    /* Report every element where the parallel result differs. */
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            if (A[i * (N + PAD) + j] != C[i * (N + PAD) + j]) {
                printf("CHECK FAILED at %d %d\n", i, j);
            }
        }
    }

    return EXIT_SUCCESS;
}

#include <pthread.h>
#include <time.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>

/* Number of rows, and of columns actually initialized/compared, per array. */
#define N 1024
/*
 * Extra elements appended to every row, giving a row stride of N + PAD.
 * The kernel reads column j + 5, i.e. up to 5 elements past column N - 1,
 * so those reads land in this padding instead of in the next row.
 */
#define PAD 17

/* Array updated concurrently by the worker threads (N rows of N + PAD doubles). */
double A[N * (N + PAD)];
/* Reference array updated by the sequential replay of the same kernel in main(). */
double C[N * (N + PAD)];

/*
 * Per-thread arguments handed to routine() through pthread_create().
 * Together the two fields determine which slice of columns a thread owns.
 */
typedef struct {
    unsigned int id;        /* index of this thread, 0 .. nb_ths - 1 */
    unsigned int nb_ths;    /* total number of worker threads */
} tattrs;


/* State of the hand-written barrier shared by all worker threads. */
static unsigned int limit;      /* number of arrivals that releases the barrier */
static unsigned int count;      /* arrivals so far in the current round */
static pthread_mutex_t lock;    /* held while count is read or modified */
static pthread_cond_t cond;     /* early arrivals wait here; the last one broadcasts */


/*
 * Prepare the barrier for use by `lim` threads.
 *
 * Creates the mutex and condition variable with default attributes, clears
 * the arrival counter and records how many arrivals release the barrier.
 */
void init_barrier(unsigned int lim) {
    /* synchronization objects first, with default attributes */
    pthread_cond_init(&cond, NULL);
    pthread_mutex_init(&lock, NULL);

    /* nobody has arrived yet; `lim` arrivals complete a round */
    count = 0;
    limit = lim;
}

/*
 * Block the calling thread until `limit` threads have called wait_barrier().
 *
 * The last thread to arrive resets the arrival counter, advances the
 * generation number and wakes everyone else. Earlier arrivals wait until the
 * generation number changes, re-checking it after every wakeup:
 * pthread_cond_wait() is allowed to return spuriously, and without this
 * re-check a thread could leave the barrier before all threads have arrived.
 */
void wait_barrier() {
    /* Round number of the barrier; only read or written with `lock` held. */
    static unsigned int generation;
    unsigned int my_generation;

    pthread_mutex_lock(&lock);
    my_generation = generation;
    count++;
    if (count >= limit) {
        /* Last arrival: open a new round, then release the waiters. */
        count = 0;
        generation++;
        pthread_cond_broadcast(&cond);
    } else {
        /* Wait for this round to complete, ignoring spurious wakeups. */
        while (my_generation == generation) {
            pthread_cond_wait(&cond, &lock);
        }
    }
    pthread_mutex_unlock(&lock);
}

/*
 * Worker thread body.
 *
 * args points to this thread's tattrs. Each thread owns a fixed slice of
 * N / nb_ths columns (the last thread also takes the remainder) and, for
 * every row i in 0 .. N - 2, recomputes its slice of row i from row i + 1.
 * All threads meet at wait_barrier() after each row so that no thread starts
 * on the next row before the others have finished the current one.
 *
 * Always returns NULL.
 */
void *routine(void *args) {
    const tattrs *ctx = args;
    /* The slice bounds depend only on the thread's identity, so they are
     * computed once instead of on every row. */
    const unsigned int chunk = N / ctx->nb_ths;
    const unsigned int jmin = ctx->id * chunk;
    const unsigned int jmax =
        (ctx->id == ctx->nb_ths - 1) ? N : jmin + chunk;
    unsigned int i, j, k;

    for (i = 0; i < N - 1; i++) {
        for (j = jmin; j < jmax; j++) {
            /*
             * i, j and k are unsigned, so j * i - k is evaluated in unsigned
             * arithmetic and wraps to a large positive value whenever
             * k > j * i. The checking loop in main() uses the same types.
             * Only the last k iteration determines the stored value; the
             * loop is kept because it is the benchmark's workload.
             */
            for (k = 0; k < N; k++) {
                A[i * (N + PAD) + j] =
                    A[(i + 1) * (N + PAD) + j + 5] + j * i - k;
            }
        }

        wait_barrier();
    }

    return NULL;
}


/*
 * pthread version of the benchmark.
 *
 * Usage: prog [nb_threads]     (default: 8 threads)
 *
 * Initializes A and C identically, runs the kernel on A with nb_threads
 * worker threads synchronized by the hand-written barrier, replays the same
 * kernel sequentially on C and compares the two arrays element by element.
 *
 * Returns EXIT_SUCCESS once the comparison has run (mismatches are only
 * reported on stdout), or EXIT_FAILURE if the thread count is invalid, memory
 * cannot be allocated, or a thread cannot be created.
 */
int main(int argc, char **argv) {
    pthread_t *tids;
    tattrs *ctx;
    unsigned int nb_ths = 8;
    unsigned int i, j, k;

    if (argc > 1) {
        /* Largest value an unsigned int can hold, without <limits.h>. */
        const unsigned long max_ths = (unsigned int) -1;
        char *end;
        long req = strtol(argv[1], &end, 10);

        /* Reject empty/non-numeric input, trailing garbage, zero,
         * negative values and anything that does not fit in nb_ths. */
        if (end == argv[1] || *end != '\0' || req < 1
                || (unsigned long) req > max_ths) {
            fprintf(stderr, "invalid thread count: %s\n", argv[1]);
            return EXIT_FAILURE;
        }
        nb_ths = (unsigned int) req;
    }

    /* initialization: both arrays start from the same contents */
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            A[i * (N + PAD) + j] = i * j;
            C[i * (N + PAD) + j] = i * j;
        }
    }

    init_barrier(nb_ths);

    /* calloc() also guards the count * size multiplication against overflow */
    tids = calloc(nb_ths, sizeof(*tids));
    ctx = calloc(nb_ths, sizeof(*ctx));
    if (tids == NULL || ctx == NULL) {
        fprintf(stderr, "cannot allocate memory for %u threads\n", nb_ths);
        free(tids);
        free(ctx);
        return EXIT_FAILURE;
    }

    /* run */
    for (i = 0; i < nb_ths; i++) {
        int err;

        ctx[i].id = i;
        ctx[i].nb_ths = nb_ths;
        err = pthread_create(&tids[i], NULL, routine, &ctx[i]);
        if (err != 0) {
            fprintf(stderr, "cannot create thread %u (error %d)\n", i, err);
            /*
             * The threads already started would wait at the barrier forever
             * for the missing ones, so they cannot be joined. Returning from
             * main() ends the whole process, those threads included.
             */
            return EXIT_FAILURE;
        }
    }

    for (i = 0; i < nb_ths; i++) {
        pthread_join(tids[i], NULL);
    }

    free(tids);
    free(ctx);

    /*
     * checking: replay the same kernel sequentially on C. i, j and k are
     * unsigned here exactly as in routine(), so j * i - k wraps in the same
     * way and the two arrays remain comparable.
     */
    for (i = 0; i < N - 1; i++) {
        for (j = 0; j < N; j++) {
            for (k = 0; k < N; k++) {
                C[i * (N + PAD) + j] =
                    C[(i + 1) * (N + PAD) + j + 5] + j * i - k;
            }
        }
    }

    /* Report every element where the threaded result differs. */
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            if (A[i * (N + PAD) + j] != C[i * (N + PAD) + j]) {
                printf("CHECK FAILED at %d %d\n", i, j);
            }
        }
    }

    return EXIT_SUCCESS;
}


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]