[gomp-nvptx 1/2] libgomp: avoid malloc calls in gomp_nvptx_main
Alexander Monakov
amonakov@ispras.ru
Thu Mar 24 21:25:00 GMT 2016
Avoid calling malloc where stack storage is easy to use instead, because
device-side malloc is very slow in CUDA. This cuts about 60-80 microseconds
from target region entry/exit time, reducing the overhead of an empty target
region from ~95 to ~17 microseconds (as measured on a GTX Titan).
* config/nvptx/target.c (GOMP_teams): Do not call 'free'.
* config/nvptx/team.c (gomp_nvptx_main): Use 'alloca' instead of
'malloc' to obtain storage. Do not call 'free'.
* team.c (gomp_free_thread) [__nvptx__]: Do not call 'free'.
---
libgomp/ChangeLog.gomp-nvptx | 7 +++++++
libgomp/config/nvptx/target.c | 1 -
libgomp/config/nvptx/team.c | 9 +++++----
libgomp/team.c | 4 +++-
4 files changed, 15 insertions(+), 6 deletions(-)
diff --git a/libgomp/config/nvptx/target.c b/libgomp/config/nvptx/target.c
index dbf4710..38ea7f7 100644
--- a/libgomp/config/nvptx/target.c
+++ b/libgomp/config/nvptx/target.c
@@ -43,7 +43,6 @@ GOMP_teams (unsigned int num_teams, unsigned int thread_limit)
else if (block_id >= num_teams)
{
gomp_free_thread (nvptx_thrs);
- free (nvptx_thrs);
asm ("exit;");
}
gomp_num_teams_var = num_teams - 1;
diff --git a/libgomp/config/nvptx/team.c b/libgomp/config/nvptx/team.c
index b9f9f9f..933f5a0 100644
--- a/libgomp/config/nvptx/team.c
+++ b/libgomp/config/nvptx/team.c
@@ -29,6 +29,7 @@
#include "libgomp.h"
#include <stdlib.h>
+#include <string.h>
struct gomp_thread *nvptx_thrs __attribute__((shared));
@@ -46,10 +47,11 @@ gomp_nvptx_main (void (*fn) (void *), void *fn_data)
/* Starting additional threads is not supported. */
gomp_global_icv.dyn_var = true;
- nvptx_thrs = gomp_malloc_cleared (ntids * sizeof (*nvptx_thrs));
+ nvptx_thrs = alloca (ntids * sizeof (*nvptx_thrs));
+ memset (nvptx_thrs, 0, ntids * sizeof (*nvptx_thrs));
- struct gomp_thread_pool *pool = gomp_malloc (sizeof (*pool));
- pool->threads = gomp_malloc (ntids * sizeof (*pool->threads));
+ struct gomp_thread_pool *pool = alloca (sizeof (*pool));
+ pool->threads = alloca (ntids * sizeof (*pool->threads));
for (tid = 0; tid < ntids; tid++)
pool->threads[tid] = nvptx_thrs + tid;
pool->threads_size = ntids;
@@ -63,7 +65,6 @@ gomp_nvptx_main (void (*fn) (void *), void *fn_data)
fn (fn_data);
gomp_free_thread (nvptx_thrs);
- free (nvptx_thrs);
}
else
{
diff --git a/libgomp/team.c b/libgomp/team.c
index 9a43a10..e301345 100644
--- a/libgomp/team.c
+++ b/libgomp/team.c
@@ -274,10 +274,12 @@ gomp_free_thread (void *arg __attribute__((unused)))
gomp_mutex_unlock (&gomp_managed_threads_lock);
#endif
}
- free (pool->threads);
if (pool->last_team)
free_team (pool->last_team);
+#ifndef __nvptx__
+ free (pool->threads);
free (pool);
+#endif
thr->thread_pool = NULL;
}
if (thr->ts.level == 0 && __builtin_expect (thr->ts.team != NULL, 0))
More information about the Gcc-patches mailing list