]> gcc.gnu.org Git - gcc.git/blame - libgomp/plugin/plugin-gcn.c
libgomp: Fix GOMP_DEVICE_NUM_VAR stringification during offload image load
[gcc.git] / libgomp / plugin / plugin-gcn.c
CommitLineData
237957cc
AS
/* Plugin for AMD GCN execution.

   Copyright (C) 2013-2021 Free Software Foundation, Inc.

   Contributed by Mentor Embedded

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
28
29/* {{{ Includes and defines */
30
31#include "config.h"
83177ca9 32#include "symcat.h"
237957cc
AS
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36#include <pthread.h>
37#include <inttypes.h>
38#include <stdbool.h>
39#include <limits.h>
40#include <hsa.h>
85f0a4d9 41#include <hsa_ext_amd.h>
237957cc
AS
42#include <dlfcn.h>
43#include <signal.h>
44#include "libgomp-plugin.h"
45#include "gomp-constants.h"
46#include <elf.h>
47#include "oacc-plugin.h"
48#include "oacc-int.h"
49#include <assert.h>
50
/* AMDGPU ELF relocation types.  These probably won't be in elf.h for a
   while, so define them here, guarded in case the host's elf.h catches up.
   The comments give the relocation computations (S = symbol, A = addend,
   P = place, G/GOT = GOT entry/base, B = image base).  */
#ifndef R_AMDGPU_NONE
#define R_AMDGPU_NONE		0
#define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
#define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
#define R_AMDGPU_ABS64		3	/* S + A  */
#define R_AMDGPU_REL32		4	/* S + A - P  */
#define R_AMDGPU_REL64		5	/* S + A - P  */
#define R_AMDGPU_ABS32		6	/* S + A  */
#define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
#define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
#define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
/* Note: value 12 is deliberately skipped here.  */
#define R_AMDGPU_RELATIVE64	13	/* B + A  */
#endif
237957cc
AS
67
/* GCN specific definitions for asynchronous queues.  */

#define ASYNC_QUEUE_SIZE 64	/* Entries in each goacc_asyncqueue buffer.  */
#define DRAIN_QUEUE_SYNCHRONOUS_P false
#define DEBUG_QUEUES 0		/* Extra queue tracing (0 = off).  */
#define DEBUG_THREAD_SLEEP 0
#define DEBUG_THREAD_SIGNAL 0

/* Defaults.  */
#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */
78
/* Secure getenv() which returns NULL if running as SUID/SGID.  */
#ifndef HAVE_SECURE_GETENV
#ifdef HAVE___SECURE_GETENV
#define secure_getenv __secure_getenv
#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
  && defined(HAVE_GETGID) && defined(HAVE_GETEGID)

#include <unistd.h>

/* Implementation of secure_getenv() for targets where it is not provided but
   we have at least means to test real and effective IDs.  */

static char *
secure_getenv (const char *name)
{
  /* Only honor the environment when real and effective IDs match,
     i.e. we are not running SUID/SGID.  */
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}

#else
/* No way to check privileges on this target; plain getenv is the best
   we can do.  */
#define secure_getenv getenv
#endif
#endif
104
105/* }}} */
106/* {{{ Types */
107
/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.
   One of these exists per host thread using OpenACC (see gcn_thread()).  */

struct gcn_thread
{
  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
  int async;
};
115
/* As an HSA runtime is dlopened, following structure defines function
   pointers utilized by the HSA plug-in.  Each member mirrors the HSA
   runtime entry point of the same name (minus the "_fn" suffix) and is
   resolved from the dlopened library.  */

struct hsa_runtime_fn_info
{
  /* HSA runtime.  */
  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
					const char **status_string);
  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
					 hsa_agent_info_t attribute,
					 void *value);
  hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
				      hsa_isa_info_t attribute,
				      uint32_t index,
				      void *value);
  hsa_status_t (*hsa_init_fn) (void);
  hsa_status_t (*hsa_iterate_agents_fn)
    (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
					  hsa_region_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_queue_create_fn)
    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
     void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
     void *data, uint32_t private_segment_size,
     uint32_t group_segment_size, hsa_queue_t **queue);
  hsa_status_t (*hsa_agent_iterate_regions_fn)
    (hsa_agent_t agent,
     hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
  hsa_status_t (*hsa_executable_create_fn)
    (hsa_profile_t profile, hsa_executable_state_t executable_state,
     const char *options, hsa_executable_t *executable);
  hsa_status_t (*hsa_executable_global_variable_define_fn)
    (hsa_executable_t executable, const char *variable_name, void *address);
  hsa_status_t (*hsa_executable_load_code_object_fn)
    (hsa_executable_t executable, hsa_agent_t agent,
     hsa_code_object_t code_object, const char *options);
  hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
					   const char *options);
  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
					uint32_t num_consumers,
					const hsa_agent_t *consumers,
					hsa_signal_t *signal);
  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
					  void **ptr);
  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
					      hsa_access_permission_t access);
  hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_executable_get_symbol_fn)
    (hsa_executable_t executable, const char *module_name,
     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
     hsa_executable_symbol_t *symbol);
  hsa_status_t (*hsa_executable_symbol_get_info_fn)
    (hsa_executable_symbol_t executable_symbol,
     hsa_executable_symbol_info_t attribute, void *value);
  hsa_status_t (*hsa_executable_iterate_symbols_fn)
    (hsa_executable_t executable,
     hsa_status_t (*callback)(hsa_executable_t executable,
			      hsa_executable_symbol_t symbol, void *data),
     void *data);
  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
						    uint64_t value);
  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
    (hsa_signal_t signal, hsa_signal_condition_t condition,
     hsa_signal_value_t compare_value, uint64_t timeout_hint,
     hsa_wait_state_t wait_state_hint);
  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);

  hsa_status_t (*hsa_code_object_deserialize_fn)
    (void *serialized_code_object, size_t serialized_code_object_size,
     const char *options, hsa_code_object_t *code_object);
};
199
/* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */

struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};
212
/* Collection of information needed for a dispatch of a kernel from a
   kernel.  */

struct kernel_dispatch
{
  /* The agent the kernel will be dispatched on.  */
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for kernel arguments passing.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};
232
/* Structure of the kernargs segment, supporting console output.

   This needs to match the definitions in Newlib, and the expectations
   in libgomp target code -- do not change the layout without updating
   both.  */

struct kernargs {
  /* Leave space for the real kernel arguments.
     OpenACC and OpenMP only use one pointer.  */
  int64_t dummy1;
  int64_t dummy2;

  /* A pointer to struct output, below, for console output data.  */
  int64_t out_ptr;

  /* A pointer to struct heap, below.  */
  int64_t heap_ptr;

  /* A pointer to an ephemeral memory arena.
     Only needed for OpenMP.  */
  int64_t arena_ptr;

  /* Output data.  */
  struct output {
    int return_value;
    /* Next slot the device will write into QUEUE.  */
    unsigned int next_output;
    struct printf_data {
      int written;
      char msg[128];
      int type;
      union {
	int64_t ivalue;
	double dvalue;
	char text[128];
      };
    } queue[1024];
    /* Number of entries the host has consumed from QUEUE.  */
    unsigned int consumed;
  } output_data;
};
271
/* A queue entry for a future asynchronous launch.  */

struct kernel_launch
{
  struct kernel_info *kernel;
  /* Kernel argument block to pass at launch.  */
  void *vars;
  struct GOMP_kernel_launch_attributes kla;
};
280
/* A queue entry for a future callback: FN is invoked with DATA.  */

struct callback
{
  void (*fn)(void *);
  void *data;
};
288
/* A data struct for the copy_data callback.  */

struct copy_data
{
  void *dst;
  const void *src;
  size_t len;
  /* If set, SRC is presumably freed once the copy is done -- confirm
     against the copy_data callback implementation.  */
  bool free_src;
  struct goacc_asyncqueue *aq;
};
299
/* A queue entry for a placeholder.  These correspond to a wait event.
   EXECUTED is signalled via COND under MUTEX.  */

struct placeholder
{
  int executed;
  pthread_cond_t cond;
  pthread_mutex_t mutex;
};
308
/* A queue entry for a wait directive.  */

struct asyncwait_info
{
  /* The placeholder whose completion this wait blocks on.  */
  struct placeholder *placeholderp;
};
315
/* Encode the type of an entry in an async queue (selects the active
   member of queue_entry's union).  */

enum entry_type
{
  KERNEL_LAUNCH,
  CALLBACK,
  ASYNC_WAIT,
  ASYNC_PLACEHOLDER
};
325
/* An entry in an async queue: a tagged union, discriminated by TYPE.  */

struct queue_entry
{
  enum entry_type type;
  union {
    struct kernel_launch launch;
    struct callback callback;
    struct asyncwait_info asyncwait;
    struct placeholder placeholder;
  } u;
};
338
/* An async queue header.

   OpenMP may create one of these.
   OpenACC may create many.  */

struct goacc_asyncqueue
{
  struct agent_info *agent;
  hsa_queue_t *hsa_queue;

  /* Thread draining this queue, with condvars to sleep/wake it; all of
     the queue state below is guarded by MUTEX.  */
  pthread_t thread_drain_queue;
  pthread_mutex_t mutex;
  pthread_cond_t queue_cond_in;
  pthread_cond_t queue_cond_out;
  /* Fixed-capacity buffer of pending entries; QUEUE_FIRST/QUEUE_N index
     into it (ring-buffer style -- confirm against the drain code).  */
  struct queue_entry queue[ASYNC_QUEUE_SIZE];
  int queue_first;
  int queue_n;
  int drain_queue_stop;

  int id;
  /* Links for the per-agent list of queues.  */
  struct goacc_asyncqueue *prev;
  struct goacc_asyncqueue *next;
};
362
/* Mkoffload uses this structure to describe a kernel.

   OpenMP kernel dimensions are passed at runtime.
   OpenACC kernel dimensions are passed at compile time, here.  */

struct hsa_kernel_description
{
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
  int sgpr_count;
  /* "vpgr" (sic); the spelling must stay in sync with mkoffload.  */
  int vpgr_count;
};
375
/* Mkoffload uses this structure to describe an offload variable:
   its name and its device address.  */

struct global_var_info
{
  const char *name;
  void *address;
};
383
/* Mkoffload uses this structure to describe all the kernels in a
   loadable module.  These are passed to libgomp via static constructors.  */

struct gcn_image_desc
{
  struct gcn_image {
    size_t size;
    void *image;
  } *gcn_image;
  const unsigned kernel_count;
  /* Array of KERNEL_COUNT kernel descriptions.  */
  struct hsa_kernel_description *kernel_infos;
  const unsigned global_variable_count;
};
397
/* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
   support.
   See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table  */

typedef enum {
  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
  EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030
} EF_AMDGPU_MACH;

/* Mask extracting the EF_AMDGPU_MACH field from an ELF header's e_flags.
   Note: "static const", not the obsolescent "const static" specifier
   order (C11 6.11.5).  */
static const int EF_AMDGPU_MACH_MASK = 0x000000ff;
typedef EF_AMDGPU_MACH gcn_isa;
411
237957cc
AS
/* Description of an HSA GPU agent (device) and the program associated with
   it.  */

struct agent_info
{
  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
  hsa_agent_t id;
  /* The user-visible device number.  */
  int device_id;
  /* Whether the agent has been initialized.  The fields below are usable only
     if it has been.  */
  bool initialized;

  /* The instruction set architecture of the device.  */
  gcn_isa device_isa;
  /* Name of the agent.  */
  char name[64];
  /* Name of the vendor of the agent.  */
  char vendor_name[64];
  /* Command queues of the agent.  */
  hsa_queue_t *sync_queue;
  struct goacc_asyncqueue *async_queues, *omp_async_queue;
  pthread_mutex_t async_queues_mutex;

  /* The HSA memory region from which to allocate kernel arguments.  */
  hsa_region_t kernarg_region;

  /* The HSA memory region from which to allocate device data.  */
  hsa_region_t data_region;

  /* Allocated team arenas.  */
  struct team_arena_list *team_arena_list;
  pthread_mutex_t team_arena_write_lock;

  /* Read-write lock that protects kernels which are running or about to be run
     from interference with loading and unloading of images.  Needs to be
     locked for reading while a kernel is being run, and for writing if the
     list of modules is manipulated (and thus the HSA program invalidated).  */
  pthread_rwlock_t module_rwlock;

  /* The module associated with this agent.  */
  struct module_info *module;

  /* Mutex enforcing that only one thread will finalize the HSA program.  A
     thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t prog_mutex;
  /* Flag whether the HSA program that consists of all the modules has been
     finalized.  */
  bool prog_finalized;
  /* HSA executable - the finalized program that is used to locate kernels.  */
  hsa_executable_t executable;
};
465
/* Information required to identify, finalize and run any given kernel.  */

enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};

struct kernel_info
{
  /* Name of the kernel, required to locate it within the GCN object-code
     module.  */
  const char *name;
  /* The specific agent the kernel has been or will be finalized for and run
     on.  */
  struct agent_info *agent;
  /* The specific module where the kernel takes place.  */
  struct module_info *module;
  /* Information provided by mkoffload associated with the kernel.  */
  struct hsa_kernel_description *description;
  /* Mutex enforcing that at most one thread ever initializes a kernel for
     use.  A thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t init_mutex;
  /* Flag indicating whether the kernel has been initialized and all fields
     below it contain valid data.  */
  bool initialized;
  /* Flag indicating that the kernel has a problem that blocks an execution.  */
  bool initialization_failed;
  /* The object to be put into the dispatch queue.  */
  uint64_t object;
  /* Required size of kernel arguments.  */
  uint32_t kernarg_segment_size;
  /* Required size of group segment.  */
  uint32_t group_segment_size;
  /* Required size of private segment.  */
  uint32_t private_segment_size;
  /* Set up for OpenMP or OpenACC?  */
  enum offload_kind kind;
};
502
/* Information about a particular GCN module, its image and kernels.  */

struct module_info
{
  /* The description with which the program has registered the image.  */
  struct gcn_image_desc *image_desc;
  /* GCN heap allocation.  */
  struct heap *heap;
  /* Physical boundaries of the loaded module.  */
  Elf64_Addr phys_address_start;
  Elf64_Addr phys_address_end;

  /* Whether the module's constructors have been run yet.  */
  bool constructors_run_p;
  struct kernel_info *init_array_func, *fini_array_func;

  /* Number of kernels in this module.  */
  int kernel_count;
  /* An array of kernel_info structures describing each kernel in this
     module (flexible array member, KERNEL_COUNT entries).  */
  struct kernel_info kernels[];
};
524
/* A linked list of memory arenas allocated on the device.
   These are only used by OpenMP, as a means to optimize per-team malloc.  */

struct team_arena_list
{
  struct team_arena_list *next;

  /* The number of teams determines the size of the allocation.  */
  int num_teams;
  /* The device address of the arena itself.  */
  void *arena;
  /* A flag to prevent two asynchronous kernels trying to use the same arena.
     The mutex is locked until the kernel exits.  */
  pthread_mutex_t in_use;
};
540
/* Information about the whole HSA environment and all of its agents.  */

struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
  /* Driver version string.  */
  char driver_version_s[30];
};
554
/* Format of the on-device heap.

   This must match the definition in Newlib and gcn-run.  */

struct heap {
  int64_t size;
  char data[0];	/* Trailing heap storage of SIZE bytes.  */
};
563
564/* }}} */
565/* {{{ Global variables */
566
/* Information about the whole HSA environment and all of its agents.  */

static struct hsa_context_info hsa_context;

/* HSA runtime functions that are initialized in init_hsa_context.  */

static struct hsa_runtime_fn_info hsa_fns;

/* Heap space, allocated target-side, provided for use of newlib malloc.
   Each module should have its own heap allocated.
   Beware that heap usage increases with OpenMP teams.  See also arenas.  */

static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;

/* Flag to decide whether print to stderr information about what is going on.
   Set in init_debug depending on environment variables.  */

static bool debug;

/* Flag to decide if the runtime should suppress a possible fallback to host
   execution.  */

static bool suppress_host_fallback;

/* Flag to locate HSA runtime shared library that is dlopened
   by this plug-in.  */

static const char *hsa_runtime_lib;

/* Flag to decide if the runtime should support also CPU devices (can be
   a simulator).  */

static bool support_cpu_devices;

/* Runtime dimension overrides.  Zero indicates default.
   (There is deliberately no Y override here.)  */

static int override_x_dim = 0;
static int override_z_dim = 0;
605
606/* }}} */
607/* {{{ Debug & Diagnostic */
608
/* Print a message to stderr if GCN_DEBUG value is set to true.
   The macro expands to a single statement (do/while (false) with NO
   trailing semicolon) so that "if (x) DEBUG_PRINT (...); else ..." is
   valid; the previous trailing semicolon made that a syntax error.  */

#define DEBUG_PRINT(...) \
  do \
    { \
      if (debug) \
	{ \
	  fprintf (stderr, __VA_ARGS__); \
	} \
    } \
  while (false)

/* Flush stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_FLUSH()			\
  do {					\
    if (debug)				\
      fflush (stderr);			\
  } while (false)

/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
   is set to true.  */

#define DEBUG_LOG(prefix, ...)			\
  do						\
    {						\
      DEBUG_PRINT (prefix);			\
      DEBUG_PRINT (__VA_ARGS__);		\
      DEBUG_FLUSH ();				\
    } while (false)

/* Print a debugging message to stderr.  */

#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)

/* Print a warning message to stderr.  */

#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
647
648/* Print HSA warning STR with an HSA STATUS code. */
649
650static void
651hsa_warn (const char *str, hsa_status_t status)
652{
653 if (!debug)
654 return;
655
656 const char *hsa_error_msg = "[unknown]";
657 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
658
659 fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
660 hsa_error_msg);
661}
662
663/* Report a fatal error STR together with the HSA error corresponding to STATUS
664 and terminate execution of the current process. */
665
666static void
667hsa_fatal (const char *str, hsa_status_t status)
668{
669 const char *hsa_error_msg = "[unknown]";
670 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
671 GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
672 hsa_error_msg);
673}
674
675/* Like hsa_fatal, except only report error message, and return FALSE
676 for propagating error processing to outside of plugin. */
677
678static bool
679hsa_error (const char *str, hsa_status_t status)
680{
681 const char *hsa_error_msg = "[unknown]";
682 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
683 GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
684 hsa_error_msg);
685 return false;
686}
687
/* Dump information about the available hardware: system endianness and
   supported HSA extensions.  Output goes through the GCN_DEBUG/GCN_WARNING
   macros, so it only appears when debugging is enabled.  */

static void
dump_hsa_system_info (void)
{
  hsa_status_t status;

  hsa_endianness_t endianness;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
					   &endianness);
  if (status == HSA_STATUS_SUCCESS)
    switch (endianness)
      {
      case HSA_ENDIANNESS_LITTLE:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
	break;
      case HSA_ENDIANNESS_BIG:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
	break;
      default:
	GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
      }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");

  uint8_t extensions[128];
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
					   &extensions);
  if (status == HSA_STATUS_SUCCESS)
    {
      /* Only the IMAGES bit of the first extension byte is reported.  */
      if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
	GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
    }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
}
724
725/* Dump information about the available hardware. */
726
727static void
728dump_machine_model (hsa_machine_model_t machine_model, const char *s)
729{
730 switch (machine_model)
731 {
732 case HSA_MACHINE_MODEL_SMALL:
733 GCN_DEBUG ("%s: SMALL\n", s);
734 break;
735 case HSA_MACHINE_MODEL_LARGE:
736 GCN_DEBUG ("%s: LARGE\n", s);
737 break;
738 default:
739 GCN_WARNING ("%s: UNKNOWN\n", s);
740 break;
741 }
742}
743
744/* Dump information about the available hardware. */
745
746static void
747dump_profile (hsa_profile_t profile, const char *s)
748{
749 switch (profile)
750 {
751 case HSA_PROFILE_FULL:
752 GCN_DEBUG ("%s: FULL\n", s);
753 break;
754 case HSA_PROFILE_BASE:
755 GCN_DEBUG ("%s: BASE\n", s);
756 break;
757 default:
758 GCN_WARNING ("%s: UNKNOWN\n", s);
759 break;
760 }
761}
762
763/* Dump information about a device memory region. */
764
765static hsa_status_t
766dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
767{
768 hsa_status_t status;
769
770 hsa_region_segment_t segment;
771 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
772 &segment);
773 if (status == HSA_STATUS_SUCCESS)
774 {
775 if (segment == HSA_REGION_SEGMENT_GLOBAL)
776 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
777 else if (segment == HSA_REGION_SEGMENT_READONLY)
778 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
779 else if (segment == HSA_REGION_SEGMENT_PRIVATE)
780 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
781 else if (segment == HSA_REGION_SEGMENT_GROUP)
782 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
783 else
784 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
785 }
786 else
787 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
788
789 if (segment == HSA_REGION_SEGMENT_GLOBAL)
790 {
791 uint32_t flags;
792 status
793 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
794 &flags);
795 if (status == HSA_STATUS_SUCCESS)
796 {
797 if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
798 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
799 if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
800 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
801 if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
802 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
803 }
804 else
805 GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
806 }
807
808 size_t size;
809 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
810 if (status == HSA_STATUS_SUCCESS)
811 GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
812 else
813 GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
814
815 status
816 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
817 &size);
818 if (status == HSA_STATUS_SUCCESS)
819 GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
820 else
821 GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
822
823 bool alloc_allowed;
824 status
825 = hsa_fns.hsa_region_get_info_fn (region,
826 HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
827 &alloc_allowed);
828 if (status == HSA_STATUS_SUCCESS)
829 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
830 else
831 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
832
833 if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
834 return HSA_STATUS_SUCCESS;
835
836 status
837 = hsa_fns.hsa_region_get_info_fn (region,
838 HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
839 &size);
840 if (status == HSA_STATUS_SUCCESS)
841 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
842 else
843 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
844
845 size_t align;
846 status
847 = hsa_fns.hsa_region_get_info_fn (region,
848 HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
849 &align);
850 if (status == HSA_STATUS_SUCCESS)
851 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
852 else
853 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
854
855 return HSA_STATUS_SUCCESS;
856}
857
858/* Dump information about all the device memory regions. */
859
860static void
861dump_hsa_regions (hsa_agent_t agent)
862{
863 hsa_status_t status;
864 status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
865 dump_hsa_region,
866 NULL);
867 if (status != HSA_STATUS_SUCCESS)
868 hsa_error ("Dumping hsa regions failed", status);
869}
870
/* Dump information about the available devices.  Used as an
   hsa_iterate_agents callback; DATA is unused.  Queries and prints each
   agent attribute independently, warning (rather than aborting) on any
   failed query, and always returns HSA_STATUS_SUCCESS so that iteration
   continues to the next agent.  */

static hsa_status_t
dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
{
  hsa_status_t status;

  /* BUF is reused for both the agent name and the vendor name.  */
  char buf[64];
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");

  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");

  hsa_machine_model_t machine_model;
  status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
				     &machine_model);
  if (status == HSA_STATUS_SUCCESS)
    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
  else
    GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");

  hsa_profile_t profile;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
					  &profile);
  if (status == HSA_STATUS_SUCCESS)
    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
  else
    GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");

  hsa_device_type_t device_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
					  &device_type);
  if (status == HSA_STATUS_SUCCESS)
    {
      switch (device_type)
	{
	case HSA_DEVICE_TYPE_CPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
	  break;
	case HSA_DEVICE_TYPE_GPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
	  break;
	case HSA_DEVICE_TYPE_DSP:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
	  break;
	default:
	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
	  break;
	}
    }
  else
    GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");

  /* Note: this is an AMD vendor extension attribute, not core HSA.  */
  uint32_t cu_count;
  status = hsa_fns.hsa_agent_get_info_fn
    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
  else
    GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");

  uint32_t size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
					  &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");

  uint32_t max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
					  &max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");

  uint32_t max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
					  &max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");

  uint32_t grid_max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
					  &grid_max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");

  uint32_t grid_max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
					  &grid_max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");

  dump_hsa_regions (agent);

  return HSA_STATUS_SUCCESS;
}
988
989/* Forward reference. */
990
991static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);
992
993/* Helper function for dump_executable_symbols. */
994
995static hsa_status_t
996dump_executable_symbol (hsa_executable_t executable,
997 hsa_executable_symbol_t symbol,
998 void *data __attribute__((unused)))
999{
1000 char *name = get_executable_symbol_name (symbol);
1001
1002 if (name)
1003 {
1004 GCN_DEBUG ("executable symbol: %s\n", name);
1005 free (name);
1006 }
1007
1008 return HSA_STATUS_SUCCESS;
1009}
1010
1011/* Dump all global symbol in an executable. */
1012
1013static void
1014dump_executable_symbols (hsa_executable_t executable)
1015{
1016 hsa_status_t status;
1017 status
1018 = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
1019 dump_executable_symbol,
1020 NULL);
1021 if (status != HSA_STATUS_SUCCESS)
1022 hsa_fatal ("Could not dump HSA executable symbols", status);
1023}
1024
/* Dump kernel DISPATCH data structure and indent it by INDENT spaces.
   Debug aid only; writes directly to stderr.  The heap and arena pointers
   live in the kernargs block shared with the device.  */

static void
print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
{
  struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;

  fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
  fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
  fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
  fprintf (stderr, "%*sheap address: %p\n", indent, "",
	   (void*)kernargs->heap_ptr);
  fprintf (stderr, "%*sarena address: %p\n", indent, "",
	   (void*)kernargs->arena_ptr);
  fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
  fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
	   dispatch->private_segment_size);
  fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
	   dispatch->group_segment_size);
  fprintf (stderr, "\n");
}
1046
1047/* }}} */
1048/* {{{ Utility functions */
1049
/* Return the per-thread OpenACC state, cast to this plugin's
   gcn_thread type.  */

static inline struct gcn_thread *
gcn_thread (void)
{
  void *thread_data = GOMP_PLUGIN_acc_thread ();
  return (struct gcn_thread *) thread_data;
}
1057
1058/* Initialize debug and suppress_host_fallback according to the environment. */
1059
1060static void
1061init_environment_variables (void)
1062{
1063 if (secure_getenv ("GCN_DEBUG"))
1064 debug = true;
1065 else
1066 debug = false;
1067
1068 if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
1069 suppress_host_fallback = true;
1070 else
1071 suppress_host_fallback = false;
1072
1073 hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
1074 if (hsa_runtime_lib == NULL)
7c1e856b 1075 hsa_runtime_lib = "libhsa-runtime64.so.1";
237957cc
AS
1076
1077 support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
1078
1079 const char *x = secure_getenv ("GCN_NUM_TEAMS");
1080 if (!x)
1081 x = secure_getenv ("GCN_NUM_GANGS");
1082 if (x)
1083 override_x_dim = atoi (x);
1084
1085 const char *z = secure_getenv ("GCN_NUM_THREADS");
1086 if (!z)
1087 z = secure_getenv ("GCN_NUM_WORKERS");
1088 if (z)
1089 override_z_dim = atoi (z);
1090
1091 const char *heap = secure_getenv ("GCN_HEAP_SIZE");
1092 if (heap)
1093 {
1094 size_t tmp = atol (heap);
1095 if (tmp)
1096 gcn_kernel_heap_size = tmp;
1097 }
1098}
1099
/* Return malloc'd string with name of SYMBOL.
   Queries the name length first, then the name itself, and explicitly
   NUL-terminates the result (the HSA length does not appear to include a
   terminator, hence the len + 1 allocation).  Returns NULL on any HSA
   error; the caller owns and must free the returned string.  */

static char *
get_executable_symbol_name (hsa_executable_symbol_t symbol)
{
  hsa_status_t status;
  char *res;
  uint32_t len;
  const hsa_executable_symbol_info_t info_name_length
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
						      &len);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get length of symbol name", status);
      return NULL;
    }

  res = GOMP_PLUGIN_malloc (len + 1);

  const hsa_executable_symbol_info_t info_name
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);

  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get symbol name", status);
      free (res);
      return NULL;
    }

  /* Ensure termination even if the runtime did not write a NUL.  */
  res[len] = '\0';

  return res;
}
1137
237957cc
AS
1138/* Get the number of GPU Compute Units. */
1139
1140static int
1141get_cu_count (struct agent_info *agent)
1142{
1143 uint32_t cu_count;
1144 hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
1145 (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
1146 if (status == HSA_STATUS_SUCCESS)
1147 return cu_count;
1148 else
1149 return 64; /* The usual number for older devices. */
1150}
1151
/* Calculate the maximum grid size for OMP threads / OACC workers.
   This depends on the kernel's resource usage levels.

   FIXME Do something more intelligent here.
   GCN can always run 4 threads within a Compute Unit, but
   more than that depends on register usage.  */

static int
limit_worker_threads (int threads)
{
  const int max_threads = 16;
  return threads > max_threads ? max_threads : threads;
}
1165
/* Parse the target attributes INPUT provided by the compiler and return true
   if we should run anything all.  If INPUT is NULL, fill DEF with default
   values, then store INPUT or DEF into *RESULT.

   This is used for OpenMP only.  */

static bool
parse_target_attributes (void **input,
			 struct GOMP_kernel_launch_attributes *def,
			 struct GOMP_kernel_launch_attributes **result,
			 struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  /* Walk the id/value argument list; a NULL entry terminates it.  */
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      /* The value is either embedded in the id word or follows in the
	 next slot, depending on GOMP_TARGET_ARG_SUBSEQUENT_PARAM.  */
      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *input++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      val = (val > INT_MAX) ? INT_MAX : val;

      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
	  && ((id & GOMP_TARGET_ARG_ID_MASK)
	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
	{
	  /* Explicit launch attributes take precedence; stop scanning.  */
	  grid_attrs_found = true;
	  break;
	}
      else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
	       == GOMP_TARGET_ARG_DEVICE_ALL)
	{
	  gcn_dims_found = true;
	  switch (id & GOMP_TARGET_ARG_ID_MASK)
	    {
	    case GOMP_TARGET_ARG_NUM_TEAMS:
	      gcn_teams = val;
	      break;
	    case GOMP_TARGET_ARG_THREAD_LIMIT:
	      gcn_threads = limit_worker_threads (val);
	      break;
	    default:
	      ;
	    }
	}
    }

  if (gcn_dims_found)
    {
      bool gfx900_workaround_p = false;

      /* Vega 10 (gfx900) has a hardware issue when running many threads;
	 cap the default unless the user overrode the thread count.  */
      if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
	  && gcn_threads == 0 && override_z_dim == 0)
	{
	  gfx900_workaround_p = true;
	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
		       "threads to at most 4 per team.\n");
	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
		       "GCN_NUM_THREADS=16\n");
	}

      /* Ideally, when a dimension isn't explicitly specified, we should
	 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
	 In practice, we tune for peak performance on BabelStream, which
	 for OpenACC is currently 32 threads per CU. */
      def->ndim = 3;
      if (gcn_teams <= 0 && gcn_threads <= 0)
	{
	  /* Set up a reasonable number of teams and threads. */
	  gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
	  def->gdims[2] = gcn_threads;
	}
      else if (gcn_teams <= 0 && gcn_threads > 0)
	{
	  /* Auto-scale the number of teams with the number of threads. */
	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
	  def->gdims[2] = gcn_threads;
	}
      else if (gcn_teams > 0 && gcn_threads <= 0)
	{
	  int max_threads = gfx900_workaround_p ? 4 : 16;

	  /* Auto-scale the number of threads with the number of teams. */
	  def->gdims[0] = gcn_teams;
	  def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
	  if (def->gdims[2] == 0)
	    def->gdims[2] = 1;
	  else if (def->gdims[2] > max_threads)
	    def->gdims[2] = max_threads;
	}
      else
	{
	  def->gdims[0] = gcn_teams;
	  def->gdims[2] = gcn_threads;
	}
      def->gdims[1] = 64; /* Each thread is 64 work items wide. */
      def->wdims[0] = 1; /* Single team per work-group. */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      /* Neither explicit attributes nor dimensions found: launch a
	 minimal 1x1x1 grid.  */
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  /* Use the compiler-provided launch attributes verbatim.  */
  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;
  *result = kla;
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
		 kla->gdims[i], kla->wdims[i]);
      /* A zero-sized grid dimension means there is nothing to run.  */
      if (kla->gdims[i] == 0)
	return false;
    }
  return true;
}
1308
/* Return the group size given the requested GROUP size, GRID size and number
   of grid dimensions NDIM.

   A GROUP of zero selects a per-dimensionality default; the result is
   always clamped so a work-group never exceeds the grid it tiles.  */

static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics. */
      switch (ndim)
	{
	case 1:
	  group = 64;
	  break;
	case 2:
	  group = 8;
	  break;
	default:
	  group = 4;
	  break;
	}
    }

  return group > grid ? grid : group;
}
1330
/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.
   HEADER occupies the low half-word, REST the high half-word; the release
   ordering publishes the packet contents written before this call.  */

static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  uint32_t word = (uint32_t) header | ((uint32_t) rest << 16);
  __atomic_store_n (packet, word, __ATOMIC_RELEASE);
}
1338
/* A never-called callback for the HSA command queues.  These signal events
   that we don't use, so we trigger an error.

   This "queue" is not to be confused with the async queues, below.

   Any invocation therefore indicates a runtime fault; hsa_fatal does
   not return.  */

static void
hsa_queue_callback (hsa_status_t status,
		    hsa_queue_t *queue __attribute__ ((unused)),
		    void *data __attribute__ ((unused)))
{
  hsa_fatal ("Asynchronous queue error", status);
}
1351
1352/* }}} */
1353/* {{{ HSA initialization */
1354
/* Populate hsa_fns with the function addresses from libhsa-runtime64.so.

   The library is loaded lazily via dlopen so that systems without an HSA
   runtime can still run with host fallback.  Returns false if the library
   or any required symbol is missing; the hsa_fns table may then be
   partially filled and must not be used.  */

static bool
init_hsa_runtime_functions (void)
{
/* Look up one symbol and bail out of the enclosing function if absent.  */
#define DLSYM_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function); \
  if (hsa_fns.function##_fn == NULL) \
    return false;
  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
  if (handle == NULL)
    return false;

  DLSYM_FN (hsa_status_string)
  DLSYM_FN (hsa_system_get_info)
  DLSYM_FN (hsa_agent_get_info)
  DLSYM_FN (hsa_init)
  DLSYM_FN (hsa_iterate_agents)
  DLSYM_FN (hsa_region_get_info)
  DLSYM_FN (hsa_queue_create)
  DLSYM_FN (hsa_agent_iterate_regions)
  DLSYM_FN (hsa_executable_destroy)
  DLSYM_FN (hsa_executable_create)
  DLSYM_FN (hsa_executable_global_variable_define)
  DLSYM_FN (hsa_executable_load_code_object)
  DLSYM_FN (hsa_executable_freeze)
  DLSYM_FN (hsa_signal_create)
  DLSYM_FN (hsa_memory_allocate)
  DLSYM_FN (hsa_memory_assign_agent)
  DLSYM_FN (hsa_memory_copy)
  DLSYM_FN (hsa_memory_free)
  DLSYM_FN (hsa_signal_destroy)
  DLSYM_FN (hsa_executable_get_symbol)
  DLSYM_FN (hsa_executable_symbol_get_info)
  DLSYM_FN (hsa_executable_iterate_symbols)
  DLSYM_FN (hsa_queue_add_write_index_release)
  DLSYM_FN (hsa_queue_load_read_index_acquire)
  DLSYM_FN (hsa_signal_wait_acquire)
  DLSYM_FN (hsa_signal_store_relaxed)
  DLSYM_FN (hsa_signal_store_release)
  DLSYM_FN (hsa_signal_load_acquire)
  DLSYM_FN (hsa_queue_destroy)
  DLSYM_FN (hsa_code_object_deserialize)
  return true;
#undef DLSYM_FN
}
1401
1402/* Return true if the agent is a GPU and can accept of concurrent submissions
1403 from different threads. */
1404
1405static bool
1406suitable_hsa_agent_p (hsa_agent_t agent)
1407{
1408 hsa_device_type_t device_type;
1409 hsa_status_t status
1410 = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
1411 &device_type);
1412 if (status != HSA_STATUS_SUCCESS)
1413 return false;
1414
1415 switch (device_type)
1416 {
1417 case HSA_DEVICE_TYPE_GPU:
1418 break;
1419 case HSA_DEVICE_TYPE_CPU:
1420 if (!support_cpu_devices)
1421 return false;
1422 break;
1423 default:
1424 return false;
1425 }
1426
1427 uint32_t features = 0;
1428 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
1429 &features);
1430 if (status != HSA_STATUS_SUCCESS
1431 || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
1432 return false;
1433 hsa_queue_type_t queue_type;
1434 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
1435 &queue_type);
1436 if (status != HSA_STATUS_SUCCESS
1437 || (queue_type != HSA_QUEUE_TYPE_MULTI))
1438 return false;
1439
1440 return true;
1441}
1442
1443/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
1444 agent_count in hsa_context. */
1445
1446static hsa_status_t
1447count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
1448{
1449 if (suitable_hsa_agent_p (agent))
1450 hsa_context.agent_count++;
1451 return HSA_STATUS_SUCCESS;
1452}
1453
1454/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
1455 id to the describing structure in the hsa context. The index of the
1456 structure is pointed to by DATA, increment it afterwards. */
1457
1458static hsa_status_t
1459assign_agent_ids (hsa_agent_t agent, void *data)
1460{
1461 if (suitable_hsa_agent_p (agent))
1462 {
1463 int *agent_index = (int *) data;
1464 hsa_context.agents[*agent_index].id = agent;
1465 ++*agent_index;
1466 }
1467 return HSA_STATUS_SUCCESS;
1468}
1469
1470/* Initialize hsa_context if it has not already been done.
1471 Return TRUE on success. */
1472
1473static bool
1474init_hsa_context (void)
1475{
1476 hsa_status_t status;
1477 int agent_index = 0;
1478
1479 if (hsa_context.initialized)
1480 return true;
1481 init_environment_variables ();
1482 if (!init_hsa_runtime_functions ())
1483 {
1484 GCN_WARNING ("Run-time could not be dynamically opened\n");
1485 if (suppress_host_fallback)
1486 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
1487 return false;
1488 }
1489 status = hsa_fns.hsa_init_fn ();
1490 if (status != HSA_STATUS_SUCCESS)
1491 return hsa_error ("Run-time could not be initialized", status);
1492 GCN_DEBUG ("HSA run-time initialized for GCN\n");
1493
1494 if (debug)
1495 dump_hsa_system_info ();
1496
1497 status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
1498 if (status != HSA_STATUS_SUCCESS)
1499 return hsa_error ("GCN GPU devices could not be enumerated", status);
1500 GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
1501
1502 hsa_context.agents
1503 = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
1504 * sizeof (struct agent_info));
1505 status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
966de09b
AS
1506 if (status != HSA_STATUS_SUCCESS)
1507 return hsa_error ("Scanning compute agents failed", status);
237957cc
AS
1508 if (agent_index != hsa_context.agent_count)
1509 {
1510 GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
1511 return false;
1512 }
1513
1514 if (debug)
1515 {
1516 status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
1517 if (status != HSA_STATUS_SUCCESS)
1518 GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
1519 }
1520
2e5ea579
FH
1521 uint16_t minor, major;
1522 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
1523 &minor);
1524 if (status != HSA_STATUS_SUCCESS)
1525 GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
1526 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
1527 &major);
1528 if (status != HSA_STATUS_SUCCESS)
1529 GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
1530
1531 size_t len = sizeof hsa_context.driver_version_s;
1532 int printed = snprintf (hsa_context.driver_version_s, len,
1533 "HSA Runtime %hu.%hu", (unsigned short int)major,
1534 (unsigned short int)minor);
1535 if (printed >= len)
1536 GCN_WARNING ("HSA runtime version string was truncated."
1537 "Version %hu.%hu is too long.", (unsigned short int)major,
1538 (unsigned short int)minor);
1539
237957cc
AS
1540 hsa_context.initialized = true;
1541 return true;
1542}
1543
1544/* Verify that hsa_context has already been initialized and return the
1545 agent_info structure describing device number N. Return NULL on error. */
1546
1547static struct agent_info *
1548get_agent_info (int n)
1549{
1550 if (!hsa_context.initialized)
1551 {
1552 GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
1553 return NULL;
1554 }
1555 if (n >= hsa_context.agent_count)
1556 {
1557 GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
1558 return NULL;
1559 }
1560 if (!hsa_context.agents[n].initialized)
1561 {
1562 GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
1563 return NULL;
1564 }
1565 return &hsa_context.agents[n];
1566}
1567
1568/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
1569
1570 Selects (breaks at) a suitable region of type KIND. */
1571
1572static hsa_status_t
1573get_memory_region (hsa_region_t region, hsa_region_t *retval,
1574 hsa_region_global_flag_t kind)
1575{
1576 hsa_status_t status;
1577 hsa_region_segment_t segment;
1578
1579 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
1580 &segment);
1581 if (status != HSA_STATUS_SUCCESS)
1582 return status;
1583 if (segment != HSA_REGION_SEGMENT_GLOBAL)
1584 return HSA_STATUS_SUCCESS;
1585
1586 uint32_t flags;
1587 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
1588 &flags);
1589 if (status != HSA_STATUS_SUCCESS)
1590 return status;
1591 if (flags & kind)
1592 {
1593 *retval = region;
1594 return HSA_STATUS_INFO_BREAK;
1595 }
1596 return HSA_STATUS_SUCCESS;
1597}
1598
1599/* Callback of hsa_agent_iterate_regions.
1600
1601 Selects a kernargs memory region. */
1602
1603static hsa_status_t
1604get_kernarg_memory_region (hsa_region_t region, void *data)
1605{
1606 return get_memory_region (region, (hsa_region_t *)data,
1607 HSA_REGION_GLOBAL_FLAG_KERNARG);
1608}
1609
1610/* Callback of hsa_agent_iterate_regions.
1611
1612 Selects a coarse-grained memory region suitable for the heap and
1613 offload data. */
1614
1615static hsa_status_t
1616get_data_memory_region (hsa_region_t region, void *data)
1617{
1618 return get_memory_region (region, (hsa_region_t *)data,
1619 HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
1620}
1621
7d593fd6
FH
1622static int
1623elf_gcn_isa_field (Elf64_Ehdr *image)
1624{
1625 return image->e_flags & EF_AMDGPU_MACH_MASK;
1626}
1627
7d593fd6
FH
1628const static char *gcn_gfx803_s = "gfx803";
1629const static char *gcn_gfx900_s = "gfx900";
1630const static char *gcn_gfx906_s = "gfx906";
3535402e 1631const static char *gcn_gfx908_s = "gfx908";
7d593fd6
FH
1632const static int gcn_isa_name_len = 6;
1633
1634/* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
1635 support the ISA. */
1636
1637static const char*
1638isa_hsa_name (int isa) {
1639 switch(isa)
1640 {
7d593fd6
FH
1641 case EF_AMDGPU_MACH_AMDGCN_GFX803:
1642 return gcn_gfx803_s;
1643 case EF_AMDGPU_MACH_AMDGCN_GFX900:
1644 return gcn_gfx900_s;
1645 case EF_AMDGPU_MACH_AMDGCN_GFX906:
1646 return gcn_gfx906_s;
3535402e
AS
1647 case EF_AMDGPU_MACH_AMDGCN_GFX908:
1648 return gcn_gfx908_s;
7d593fd6
FH
1649 }
1650 return NULL;
1651}
1652
1653/* Returns the user-facing name that GCC uses to identify the architecture (e.g.
1654 with -march) or NULL if we do not support the ISA.
1655 Keep in sync with /gcc/config/gcn/gcn.{c,opt}. */
1656
1657static const char*
1658isa_gcc_name (int isa) {
1659 switch(isa)
1660 {
7d593fd6
FH
1661 case EF_AMDGPU_MACH_AMDGCN_GFX803:
1662 return "fiji";
1663 default:
1664 return isa_hsa_name (isa);
1665 }
1666}
1667
1668/* Returns the code which is used in the GCN object code to identify the ISA with
1669 the given name (as used by the HSA runtime). */
1670
1671static gcn_isa
1672isa_code(const char *isa) {
7d593fd6
FH
1673 if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
1674 return EF_AMDGPU_MACH_AMDGCN_GFX803;
1675
1676 if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
1677 return EF_AMDGPU_MACH_AMDGCN_GFX900;
1678
1679 if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
1680 return EF_AMDGPU_MACH_AMDGCN_GFX906;
1681
3535402e
AS
1682 if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len))
1683 return EF_AMDGPU_MACH_AMDGCN_GFX908;
1684
7d593fd6
FH
1685 return -1;
1686}
1687
237957cc
AS
1688/* }}} */
1689/* {{{ Run */
1690
1691/* Create or reuse a team arena.
1692
1693 Team arenas are used by OpenMP to avoid calling malloc multiple times
1694 while setting up each team. This is purely a performance optimization.
1695
1696 Allocating an arena also costs performance, albeit on the host side, so
1697 this function will reuse an existing arena if a large enough one is idle.
1698 The arena is released, but not deallocated, when the kernel exits. */
1699
1700static void *
1701get_team_arena (struct agent_info *agent, int num_teams)
1702{
1703 struct team_arena_list **next_ptr = &agent->team_arena_list;
1704 struct team_arena_list *item;
1705
1706 for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
1707 {
1708 if (item->num_teams < num_teams)
1709 continue;
1710
1711 if (pthread_mutex_trylock (&item->in_use))
1712 continue;
1713
1714 return item->arena;
1715 }
1716
1717 GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams);
1718
1719 if (pthread_mutex_lock (&agent->team_arena_write_lock))
1720 {
1721 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1722 return false;
1723 }
1724 item = malloc (sizeof (*item));
1725 item->num_teams = num_teams;
1726 item->next = NULL;
1727 *next_ptr = item;
1728
1729 if (pthread_mutex_init (&item->in_use, NULL))
1730 {
1731 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
1732 return false;
1733 }
1734 if (pthread_mutex_lock (&item->in_use))
1735 {
1736 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1737 return false;
1738 }
1739 if (pthread_mutex_unlock (&agent->team_arena_write_lock))
1740 {
1741 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1742 return false;
1743 }
1744
1745 const int TEAM_ARENA_SIZE = 64*1024; /* Must match libgomp.h. */
1746 hsa_status_t status;
1747 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1748 TEAM_ARENA_SIZE*num_teams,
1749 &item->arena);
1750 if (status != HSA_STATUS_SUCCESS)
1751 hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
1752 status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id,
1753 HSA_ACCESS_PERMISSION_RW);
1754 if (status != HSA_STATUS_SUCCESS)
1755 hsa_fatal ("Could not assign arena memory to device", status);
1756
1757 return item->arena;
1758}
1759
1760/* Mark a team arena available for reuse. */
1761
1762static void
1763release_team_arena (struct agent_info* agent, void *arena)
1764{
1765 struct team_arena_list *item;
1766
1767 for (item = agent->team_arena_list; item; item = item->next)
1768 {
1769 if (item->arena == arena)
1770 {
1771 if (pthread_mutex_unlock (&item->in_use))
1772 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1773 return;
1774 }
1775 }
1776 GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1777}
1778
1779/* Clean up all the allocated team arenas. */
1780
1781static bool
1782destroy_team_arenas (struct agent_info *agent)
1783{
1784 struct team_arena_list *item, *next;
1785
1786 for (item = agent->team_arena_list; item; item = next)
1787 {
1788 next = item->next;
1789 hsa_fns.hsa_memory_free_fn (item->arena);
1790 if (pthread_mutex_destroy (&item->in_use))
1791 {
1792 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
1793 return false;
1794 }
1795 free (item);
1796 }
1797 agent->team_arena_list = NULL;
1798
1799 return true;
1800}
1801
/* Allocate memory on a specified device.

   Allocates SIZE bytes from AGENT's coarse-grained data region, grants the
   device read/write access, and (when an OpenACC profiling callback is
   active on this thread) dispatches an acc_ev_alloc event.  Returns NULL
   on HSA failure.  */

static void *
alloc_by_agent (struct agent_info *agent, size_t size)
{
  GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);

  /* Zero-size allocations are invalid, so in order to return a valid pointer
     we need to pass a valid size.  One source of zero-size allocations is
     kernargs for kernels that have no inputs or outputs (the kernel may
     only use console output, for example).  */
  if (size == 0)
    size = 4;

  void *ptr;
  hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
							size, &ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not allocate device memory", status);
      return NULL;
    }

  status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
					       HSA_ACCESS_PERMISSION_RW);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not assign data memory to device", status);
      return NULL;
    }

  /* Fire the OpenACC profiling alloc event if a tool is attached.  */
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_alloc;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
	= acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = size;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
					    api_info);
    }

  return ptr;
}
1864
/* Create kernel dispatch data structure for given KERNEL, along with
   the necessary device signals and memory allocations.

   The returned shadow structure owns a completion signal and the kernargs
   block; release_kernel_dispatch undoes all of it.  Calls hsa_fatal (does
   not return) on HSA allocation failures.  */

static struct kernel_dispatch *
create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
{
  struct agent_info *agent = kernel->agent;
  struct kernel_dispatch *shadow
    = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));

  shadow->agent = kernel->agent;
  shadow->object = kernel->object;

  /* The sync signal starts at 1 and is decremented by the device on
     kernel completion.  */
  hsa_signal_t sync_signal;
  hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating the GCN sync signal", status);

  shadow->signal = sync_signal.handle;
  shadow->private_segment_size = kernel->private_segment_size;
  shadow->group_segment_size = kernel->group_segment_size;

  /* We expect kernels to request a single pointer, explicitly, and the
     rest of struct kernargs, implicitly.  If they request anything else
     then something is wrong.  */
  if (kernel->kernarg_segment_size > 8)
    {
      GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
      return NULL;
    }

  status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
					   sizeof (struct kernargs),
					   &shadow->kernarg_address);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
  struct kernargs *kernargs = shadow->kernarg_address;

  /* Zero-initialize the output_data (minimum needed).  */
  kernargs->out_ptr = (int64_t)&kernargs->output_data;
  kernargs->output_data.next_output = 0;
  for (unsigned i = 0;
       i < (sizeof (kernargs->output_data.queue)
	    / sizeof (kernargs->output_data.queue[0]));
       i++)
    kernargs->output_data.queue[i].written = 0;
  kernargs->output_data.consumed = 0;

  /* Pass in the heap location.  */
  kernargs->heap_ptr = (int64_t)kernel->module->heap;

  /* Create an arena (OpenMP kernels only; OpenACC does not use one).  */
  if (kernel->kind == KIND_OPENMP)
    kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams);
  else
    kernargs->arena_ptr = 0;

  /* Ensure we can recognize unset return values.  */
  kernargs->output_data.return_value = 0xcafe0000;

  return shadow;
}
1927
/* Output any data written to console output from the kernel.  It is expected
   that this function is polled during kernel execution.

   We print all entries from the last item printed to the next entry without
   a "written" flag.  If the "final" flag is set then it'll continue right to
   the end.

   The print buffer is circular, but the from and to locations don't wrap when
   the buffer does, so the output limit is UINT_MAX.  The target blocks on
   output when the buffer is full.  */

static void
console_output (struct kernel_info *kernel, struct kernargs *kernargs,
		bool final)
{
  unsigned int limit = (sizeof (kernargs->output_data.queue)
			/ sizeof (kernargs->output_data.queue[0]));

  /* "consumed" is shared with the device; acquire pairs with the device's
     release when it publishes an entry.  */
  unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
				       __ATOMIC_ACQUIRE);
  unsigned int to = kernargs->output_data.next_output;

  if (from > to)
    {
      /* Overflow.  */
      if (final)
	printf ("GCN print buffer overflowed.\n");
      return;
    }

  unsigned int i;
  for (i = from; i < to; i++)
    {
      /* Indices are monotonic; modulo maps them into the ring.  */
      struct printf_data *data = &kernargs->output_data.queue[i%limit];

      /* An unwritten slot means the device has not finished this entry
	 yet; stop unless this is the final drain.  */
      if (!data->written && !final)
	break;

      switch (data->type)
	{
	case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
	case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
	case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
	case 3: printf ("%.128s%.128s", data->msg, data->text); break;
	default: printf ("GCN print buffer error!\n"); break;
	}
      data->written = 0;
      /* Release lets the device reuse the slot and see the new consumed
	 index.  */
      __atomic_store_n (&kernargs->output_data.consumed, i+1,
			__ATOMIC_RELEASE);
    }
  fflush (stdout);
}
1980
1981/* Release data structure created for a kernel dispatch in SHADOW argument,
1982 and clean up the signal and memory allocations. */
1983
1984static void
1985release_kernel_dispatch (struct kernel_dispatch *shadow)
1986{
1987 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
1988
1989 struct kernargs *kernargs = shadow->kernarg_address;
1990 void *arena = (void *)kernargs->arena_ptr;
1991 if (arena)
1992 release_team_arena (shadow->agent, arena);
1993
1994 hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
1995
1996 hsa_signal_t s;
1997 s.handle = shadow->signal;
1998 hsa_fns.hsa_signal_destroy_fn (s);
1999
2000 free (shadow);
2001}
2002
/* Extract the properties from a kernel binary.

   Looks up the kernel descriptor symbol ("<name>.kd") in the agent's
   frozen executable and records the kernel object handle and the kernarg,
   group and private segment sizes.  On lookup failure the kernel is
   marked initialization_failed; on later query failures hsa_fatal is
   called (does not return).  */

static void
init_kernel_properties (struct kernel_info *kernel)
{
  hsa_status_t status;
  struct agent_info *agent = kernel->agent;
  hsa_executable_symbol_t kernel_symbol;
  /* strlen + 4 covers the ".kd" suffix plus the NUL terminator.  */
  char *buf = alloca (strlen (kernel->name) + 4);
  sprintf (buf, "%s.kd", kernel->name);
  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						 buf, agent->id,
						 0, &kernel_symbol);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_warn ("Could not find symbol for kernel in the code object", status);
      fprintf (stderr, "not found name: '%s'\n", buf);
      dump_executable_symbols (agent->executable);
      goto failure;
    }
  GCN_DEBUG ("Located kernel %s\n", kernel->name);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not extract a kernel object from its symbol", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
     &kernel->kernarg_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel argument size", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
     &kernel->group_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel group segment size", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
     &kernel->private_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel private segment size",
	       status);

  /* The kernel type is not known until something tries to launch it.  */
  kernel->kind = KIND_UNKNOWN;

  GCN_DEBUG ("Kernel structure for %s fully initialized with "
	     "following segment sizes: \n", kernel->name);
  GCN_DEBUG ("  group_segment_size: %u\n",
	     (unsigned) kernel->group_segment_size);
  GCN_DEBUG ("  private_segment_size: %u\n",
	     (unsigned) kernel->private_segment_size);
  GCN_DEBUG ("  kernarg_segment_size: %u\n",
	     (unsigned) kernel->kernarg_segment_size);
  return;

failure:
  kernel->initialization_failed = true;
}
2061
2062/* Do all the work that is necessary before running KERNEL for the first time.
2063 The function assumes the program has been created, finalized and frozen by
2064 create_and_finalize_hsa_program. */
2065
2066static void
2067init_kernel (struct kernel_info *kernel)
2068{
2069 if (pthread_mutex_lock (&kernel->init_mutex))
2070 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2071 if (kernel->initialized)
2072 {
2073 if (pthread_mutex_unlock (&kernel->init_mutex))
2074 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2075 "mutex");
2076
2077 return;
2078 }
2079
2080 init_kernel_properties (kernel);
2081
2082 if (!kernel->initialization_failed)
2083 {
2084 GCN_DEBUG ("\n");
2085
2086 kernel->initialized = true;
2087 }
2088 if (pthread_mutex_unlock (&kernel->init_mutex))
2089 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2090 "mutex");
2091}
2092
/* Run KERNEL on its agent, pass VARS to it as arguments and take
   launch attributes from KLA.

   MODULE_LOCKED indicates that the caller already holds the lock and
   run_kernel need not lock it again.
   If AQ is NULL then agent->sync_queue will be used.

   Blocks until the kernel completes, draining device console output while
   waiting.  May call exit() to propagate a device-side exit code, or
   abort() on device-side abort or exit-value corruption.  */

static void
run_kernel (struct kernel_info *kernel, void *vars,
	    struct GOMP_kernel_launch_attributes *kla,
	    struct goacc_asyncqueue *aq, bool module_locked)
{
  GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
	     kernel->description->vpgr_count);

  /* Reduce the number of threads/workers if there are insufficient
     VGPRs available to run the kernels together.  */
  if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
    {
      /* NOTE(review): assumes VGPRs are allocated in granules of 4 out of
	 a 256-register budget, times 4 SIMD units -- confirm against the
	 target ISA documentation.  */
      int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
      int max_threads = (256 / granulated_vgprs) * 4;
      if (kla->gdims[2] > max_threads)
	{
	  GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
		       " per team/gang - reducing to %d threads/workers.\n",
		       kla->gdims[2], max_threads);
	  kla->gdims[2] = max_threads;
	}
    }

  GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
	     (aq ? aq->id : 0));
  GCN_DEBUG ("GCN launch attribs: gdims:[");
  int i;
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i]);
    }
  DEBUG_PRINT ("], normalized gdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
    }
  DEBUG_PRINT ("], wdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->wdims[i]);
    }
  DEBUG_PRINT ("]\n");
  DEBUG_FLUSH ();

  struct agent_info *agent = kernel->agent;
  if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");

  if (!agent->initialized)
    GOMP_PLUGIN_fatal ("Agent must be initialized");

  if (!kernel->initialized)
    GOMP_PLUGIN_fatal ("Called kernel must be initialized");

  hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);

  /* Reserve a slot in the AQL queue for the dispatch packet.  */
  uint64_t index
    = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
  GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);

  /* Wait until the queue is not full before writing the packet.   */
  while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
	 >= command_q->size)
    ;

  /* Do not allow the dimensions to be overridden when running
     constructors or destructors.  */
  int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
  int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;

  hsa_kernel_dispatch_packet_t *packet;
  packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
	   + index % command_q->size;

  /* Clear everything but the 4-byte packet header; the header is written
     last, with release semantics, to hand the packet to the device.  */
  memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
  packet->grid_size_x = override_x ? : kla->gdims[0];
  packet->workgroup_size_x = get_group_size (kla->ndim,
					     packet->grid_size_x,
					     kla->wdims[0]);

  if (kla->ndim >= 2)
    {
      packet->grid_size_y = kla->gdims[1];
      packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
						 kla->wdims[1]);
    }
  else
    {
      packet->grid_size_y = 1;
      packet->workgroup_size_y = 1;
    }

  if (kla->ndim == 3)
    {
      packet->grid_size_z = limit_worker_threads (override_z
						  ? : kla->gdims[2]);
      packet->workgroup_size_z = get_group_size (kla->ndim,
						 packet->grid_size_z,
						 kla->wdims[2]);
    }
  else
    {
      packet->grid_size_z = 1;
      packet->workgroup_size_z = 1;
    }

  GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
	     " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
	     packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
	     packet->grid_size_x / packet->workgroup_size_x,
	     packet->grid_size_y / packet->workgroup_size_y,
	     packet->grid_size_z / packet->workgroup_size_z,
	     packet->workgroup_size_x, packet->workgroup_size_y,
	     packet->workgroup_size_z);

  /* The shadow structure owns the kernarg memory and completion signal
     for this dispatch; released below after completion.  */
  struct kernel_dispatch *shadow
    = create_kernel_dispatch (kernel, packet->grid_size_x);
  shadow->queue = command_q;

  if (debug)
    {
      fprintf (stderr, "\nKernel has following dependencies:\n");
      print_kernel_dispatch (shadow, 2);
    }

  packet->private_segment_size = kernel->private_segment_size;
  packet->group_segment_size = kernel->group_segment_size;
  packet->kernel_object = kernel->object;
  packet->kernarg_address = shadow->kernarg_address;
  hsa_signal_t s;
  s.handle = shadow->signal;
  packet->completion_signal = s;
  /* The device decrements the signal to 0 on completion.  */
  hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
  /* The first kernarg is the VARS pointer itself.  */
  memcpy (shadow->kernarg_address, &vars, sizeof (vars));

  GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");

  uint16_t header;
  header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

  GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
	     agent->device_id);

  /* Publish the header atomically, then ring the doorbell so the device
     starts processing the packet.  */
  packet_store_release ((uint32_t *) packet, header,
			(uint16_t) kla->ndim
			<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);

  hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
				       index);

  GCN_DEBUG ("Kernel dispatched, waiting for completion\n");

  /* Root signal waits with 1ms timeout.  The periodic wakeups let us
     flush any console output the kernel has produced so far.  */
  while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
					     1000 * 1000,
					     HSA_WAIT_STATE_BLOCKED) != 0)
    {
      console_output (kernel, shadow->kernarg_address, false);
    }
  console_output (kernel, shadow->kernarg_address, true);

  /* Capture the return value before the kernargs block is freed.  */
  struct kernargs *kernargs = shadow->kernarg_address;
  unsigned int return_value = (unsigned int)kernargs->output_data.return_value;

  release_kernel_dispatch (shadow);

  if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");

  /* The top 16 bits encode how the kernel terminated: 0xcafe means exit
     was not called; 0xffff means the device code called exit.  */
  unsigned int upper = (return_value & ~0xffff) >> 16;
  if (upper == 0xcafe)
    ; // exit not called, normal termination.
  else if (upper == 0xffff)
    ; // exit called.
  else
    {
      GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
			 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
			 return_value);
      abort ();
    }

  if (upper == 0xffff)
    {
      /* Bits 8-15 carry a signal number raised on the device.  */
      unsigned int signal = (return_value >> 8) & 0xff;

      if (signal == SIGABRT)
	{
	  GCN_WARNING ("GCN Kernel aborted\n");
	  abort ();
	}
      else if (signal != 0)
	{
	  GCN_WARNING ("GCN Kernel received unknown signal\n");
	  abort ();
	}

      /* Propagate the device-side exit code to the host process.  */
      GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
      exit (return_value & 0xff);
    }
}
2309
2310/* }}} */
2311/* {{{ Load/Unload */
2312
2313/* Initialize KERNEL from D and other parameters. Return true on success. */
2314
2315static bool
2316init_basic_kernel_info (struct kernel_info *kernel,
2317 struct hsa_kernel_description *d,
2318 struct agent_info *agent,
2319 struct module_info *module)
2320{
2321 kernel->agent = agent;
2322 kernel->module = module;
2323 kernel->name = d->name;
5a28e272 2324 kernel->description = d;
237957cc
AS
2325 if (pthread_mutex_init (&kernel->init_mutex, NULL))
2326 {
2327 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2328 return false;
2329 }
2330 return true;
2331}
2332
7d593fd6
FH
2333/* Check that the GCN ISA of the given image matches the ISA of the agent. */
2334
2335static bool
2336isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2337{
2338 int isa_field = elf_gcn_isa_field (image);
2339 const char* isa_s = isa_hsa_name (isa_field);
2340 if (!isa_s)
2341 {
2342 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2343 return false;
2344 }
2345
2346 if (isa_field != agent->device_isa)
2347 {
2348 char msg[120];
2349 const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2350 const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2351 assert (agent_isa_s);
2352 assert (agent_isa_gcc_s);
2353
2354 snprintf (msg, sizeof msg,
2355 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2356 "Try to recompile with '-foffload=-march=%s'.\n",
2357 isa_s, agent_isa_s, agent_isa_gcc_s);
2358
2359 hsa_error (msg, HSA_STATUS_ERROR);
2360 return false;
2361 }
2362
2363 return true;
2364}
2365
/* Create and finalize the program consisting of all loaded modules.

   Idempotent: protected by the agent's program mutex, and a no-op once
   agent->prog_finalized is set.  Creates the HSA executable, deserializes
   and loads the agent's GCN code object (if a module is present),
   allocates and assigns the device heap, then freezes the executable.
   Returns true on success, false on any failure.  */

static bool
create_and_finalize_hsa_program (struct agent_info *agent)
{
  hsa_status_t status;
  bool res = true;
  if (pthread_mutex_lock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return false;
    }
  if (agent->prog_finalized)
    goto final;

  status
    = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
					HSA_EXECUTABLE_STATE_UNFROZEN,
					"", &agent->executable);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not create GCN executable", status);
      goto fail;
    }

  /* Load any GCN modules.  */
  struct module_info *module = agent->module;
  if (module)
    {
      Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;

      /* Refuse to load an image built for a different GPU ISA.  */
      if (!isa_matches_agent (agent, image))
	goto fail;

      hsa_code_object_t co = { 0 };
      status = hsa_fns.hsa_code_object_deserialize_fn
	(module->image_desc->gcn_image->image,
	 module->image_desc->gcn_image->size,
	 NULL, &co);
      if (status != HSA_STATUS_SUCCESS)
	{
	  hsa_error ("Could not deserialize GCN code object", status);
	  goto fail;
	}

      status = hsa_fns.hsa_executable_load_code_object_fn
	(agent->executable, agent->id, co, "");
      if (status != HSA_STATUS_SUCCESS)
	{
	  hsa_error ("Could not load GCN code object", status);
	  goto fail;
	}

      /* Allocate the device-side heap once per module and record its
	 size in the heap's own header.  */
      if (!module->heap)
	{
	  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
						   gcn_kernel_heap_size,
						   (void**)&module->heap);
	  if (status != HSA_STATUS_SUCCESS)
	    {
	      hsa_error ("Could not allocate memory for GCN heap", status);
	      goto fail;
	    }

	  status = hsa_fns.hsa_memory_assign_agent_fn
	    (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
	  if (status != HSA_STATUS_SUCCESS)
	    {
	      hsa_error ("Could not assign GCN heap memory to device", status);
	      goto fail;
	    }

	  hsa_fns.hsa_memory_copy_fn (&module->heap->size,
				      &gcn_kernel_heap_size,
				      sizeof (gcn_kernel_heap_size));
	}

    }

  if (debug)
    dump_executable_symbols (agent->executable);

  status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not freeze the GCN executable", status);
      goto fail;
    }

final:
  /* NOTE(review): the failure path also reaches here and sets
     prog_finalized = true, so a failed finalization is never retried --
     confirm this is intentional.  */
  agent->prog_finalized = true;

  if (pthread_mutex_unlock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
      res = false;
    }

  return res;

fail:
  res = false;
  goto final;
}
2470
2471/* Free the HSA program in agent and everything associated with it and set
2472 agent->prog_finalized and the initialized flags of all kernels to false.
2473 Return TRUE on success. */
2474
2475static bool
2476destroy_hsa_program (struct agent_info *agent)
2477{
2478 if (!agent->prog_finalized)
2479 return true;
2480
2481 hsa_status_t status;
2482
2483 GCN_DEBUG ("Destroying the current GCN program.\n");
2484
2485 status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2486 if (status != HSA_STATUS_SUCCESS)
2487 return hsa_error ("Could not destroy GCN executable", status);
2488
2489 if (agent->module)
2490 {
2491 int i;
2492 for (i = 0; i < agent->module->kernel_count; i++)
2493 agent->module->kernels[i].initialized = false;
2494
2495 if (agent->module->heap)
2496 {
2497 hsa_fns.hsa_memory_free_fn (agent->module->heap);
2498 agent->module->heap = NULL;
2499 }
2500 }
2501 agent->prog_finalized = false;
2502 return true;
2503}
2504
2505/* Deinitialize all information associated with MODULE and kernels within
2506 it. Return TRUE on success. */
2507
2508static bool
2509destroy_module (struct module_info *module, bool locked)
2510{
2511 /* Run destructors before destroying module. */
2512 struct GOMP_kernel_launch_attributes kla =
2513 { 3,
2514 /* Grid size. */
2515 { 1, 64, 1 },
2516 /* Work-group size. */
2517 { 1, 64, 1 }
2518 };
2519
2520 if (module->fini_array_func)
2521 {
2522 init_kernel (module->fini_array_func);
2523 run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2524 }
2525 module->constructors_run_p = false;
2526
2527 int i;
2528 for (i = 0; i < module->kernel_count; i++)
2529 if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2530 {
2531 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2532 "mutex");
2533 return false;
2534 }
2535
2536 return true;
2537}
2538
2539/* }}} */
2540/* {{{ Async */
2541
/* Execute the queue entry at position INDEX of async queue AQ.
   (The comment previously here -- "Callback of dispatch queues to report
   errors" -- described a different function.)

   KERNEL_LAUNCH runs a kernel to completion; CALLBACK invokes a host
   function with its opaque data; ASYNC_WAIT blocks until another queue's
   placeholder has been marked executed; ASYNC_PLACEHOLDER marks this
   queue's own placeholder executed and wakes any waiter.  */

static void
execute_queue_entry (struct goacc_asyncqueue *aq, int index)
{
  struct queue_entry *entry = &aq->queue[index];

  switch (entry->type)
    {
    case KERNEL_LAUNCH:
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
		   aq->agent->device_id, aq->id, index);
      run_kernel (entry->u.launch.kernel,
		  entry->u.launch.vars,
		  &entry->u.launch.kla, aq, false);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
		   aq->agent->device_id, aq->id, index);
      break;

    case CALLBACK:
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
		   aq->agent->device_id, aq->id, index);
      entry->u.callback.fn (entry->u.callback.data);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
		   aq->agent->device_id, aq->id, index);
      break;

    case ASYNC_WAIT:
      {
	/* FIXME: is it safe to access a placeholder that may already have
	   been executed?  */
	struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;

	if (DEBUG_QUEUES)
	  GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
		     aq->agent->device_id, aq->id, index);

	pthread_mutex_lock (&placeholderp->mutex);

	/* Sleep until the owning queue marks the placeholder executed.  */
	while (!placeholderp->executed)
	  pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);

	pthread_mutex_unlock (&placeholderp->mutex);

	/* The placeholder's synchronization objects are single-use;
	   destroy them once the wait has completed.  */
	if (pthread_cond_destroy (&placeholderp->cond))
	  GOMP_PLUGIN_error ("Failed to destroy serialization cond");

	if (pthread_mutex_destroy (&placeholderp->mutex))
	  GOMP_PLUGIN_error ("Failed to destroy serialization mutex");

	if (DEBUG_QUEUES)
	  GCN_DEBUG ("Async thread %d:%d: Executing async wait "
		     "entry (%d) done\n", aq->agent->device_id, aq->id, index);
      }
      break;

    case ASYNC_PLACEHOLDER:
      pthread_mutex_lock (&entry->u.placeholder.mutex);
      entry->u.placeholder.executed = 1;
      pthread_cond_signal (&entry->u.placeholder.cond);
      pthread_mutex_unlock (&entry->u.placeholder.mutex);
      break;

    default:
      GOMP_PLUGIN_fatal ("Unknown queue element");
    }
}
2613
/* This function is run as a thread to service an async queue in the
   background.  It runs continuously until the stop flag is set.

   THREAD_ARG is the queue to service.  Entries are executed with
   aq->mutex dropped (so producers can keep enqueueing); the mutex is
   re-taken only to update the queue indices and to sleep on the
   condition variables.  Sets drain_queue_stop to 2 before returning so
   finalize_async_thread can observe the thread has finished.  */

static void *
drain_queue (void *thread_arg)
{
  struct goacc_asyncqueue *aq = thread_arg;

  /* Debug mode: the queue is drained synchronously by wait_queue instead;
     this thread just marks itself finished.  */
  if (DRAIN_QUEUE_SYNCHRONOUS_P)
    {
      aq->drain_queue_stop = 2;
      return NULL;
    }

  pthread_mutex_lock (&aq->mutex);

  while (true)
    {
      if (aq->drain_queue_stop)
	break;

      if (aq->queue_n > 0)
	{
	  /* Drop the lock while the (possibly long-running) entry
	     executes.  */
	  pthread_mutex_unlock (&aq->mutex);
	  execute_queue_entry (aq, aq->queue_first);

	  pthread_mutex_lock (&aq->mutex);
	  aq->queue_first = ((aq->queue_first + 1)
			     % ASYNC_QUEUE_SIZE);
	  aq->queue_n--;

	  /* Wake anyone blocked in wait_for_queue_nonfull/wait_queue.  */
	  if (DEBUG_THREAD_SIGNAL)
	    GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
		       aq->agent->device_id, aq->id);
	  pthread_cond_broadcast (&aq->queue_cond_out);
	  pthread_mutex_unlock (&aq->mutex);

	  if (DEBUG_QUEUES)
	    GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
		       aq->id);
	  pthread_mutex_lock (&aq->mutex);
	}
      else
	{
	  /* Queue empty: sleep until a producer signals cond_in.  */
	  if (DEBUG_THREAD_SLEEP)
	    GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
		       aq->agent->device_id, aq->id);
	  pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
	  if (DEBUG_THREAD_SLEEP)
	    GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
		       aq->agent->device_id, aq->id);
	}
    }

  /* Tell finalize_async_thread we are done.  */
  aq->drain_queue_stop = 2;
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_broadcast (&aq->queue_cond_out);
  pthread_mutex_unlock (&aq->mutex);

  GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
  return NULL;
}
2678
2679/* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2680 is not usually the case. This is just a debug tool. */
2681
2682static void
2683drain_queue_synchronous (struct goacc_asyncqueue *aq)
2684{
2685 pthread_mutex_lock (&aq->mutex);
2686
2687 while (aq->queue_n > 0)
2688 {
2689 execute_queue_entry (aq, aq->queue_first);
2690
2691 aq->queue_first = ((aq->queue_first + 1)
2692 % ASYNC_QUEUE_SIZE);
2693 aq->queue_n--;
2694 }
2695
2696 pthread_mutex_unlock (&aq->mutex);
2697}
2698
d88b27da
JB
2699/* Block the current thread until an async queue is writable. The aq->mutex
2700 lock should be held on entry, and remains locked on exit. */
237957cc
AS
2701
2702static void
2703wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2704{
2705 if (aq->queue_n == ASYNC_QUEUE_SIZE)
2706 {
237957cc
AS
2707 /* Queue is full. Wait for it to not be full. */
2708 while (aq->queue_n == ASYNC_QUEUE_SIZE)
2709 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
237957cc
AS
2710 }
2711}
2712
2713/* Request an asynchronous kernel launch on the specified queue. This
2714 may block if the queue is full, but returns without waiting for the
2715 kernel to run. */
2716
2717static void
2718queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2719 void *vars, struct GOMP_kernel_launch_attributes *kla)
2720{
2721 assert (aq->agent == kernel->agent);
2722
237957cc
AS
2723 pthread_mutex_lock (&aq->mutex);
2724
d88b27da
JB
2725 wait_for_queue_nonfull (aq);
2726
237957cc
AS
2727 int queue_last = ((aq->queue_first + aq->queue_n)
2728 % ASYNC_QUEUE_SIZE);
2729 if (DEBUG_QUEUES)
2730 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2731 aq->id, queue_last);
2732
2733 aq->queue[queue_last].type = KERNEL_LAUNCH;
2734 aq->queue[queue_last].u.launch.kernel = kernel;
2735 aq->queue[queue_last].u.launch.vars = vars;
2736 aq->queue[queue_last].u.launch.kla = *kla;
2737
2738 aq->queue_n++;
2739
2740 if (DEBUG_THREAD_SIGNAL)
2741 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2742 aq->agent->device_id, aq->id);
2743 pthread_cond_signal (&aq->queue_cond_in);
2744
2745 pthread_mutex_unlock (&aq->mutex);
2746}
2747
2748/* Request an asynchronous callback on the specified queue. The callback
2749 function will be called, with the given opaque data, from the appropriate
2750 async thread, when all previous items on that queue are complete. */
2751
2752static void
2753queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2754 void *data)
2755{
237957cc
AS
2756 pthread_mutex_lock (&aq->mutex);
2757
d88b27da
JB
2758 wait_for_queue_nonfull (aq);
2759
237957cc
AS
2760 int queue_last = ((aq->queue_first + aq->queue_n)
2761 % ASYNC_QUEUE_SIZE);
2762 if (DEBUG_QUEUES)
2763 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2764 aq->id, queue_last);
2765
2766 aq->queue[queue_last].type = CALLBACK;
2767 aq->queue[queue_last].u.callback.fn = fn;
2768 aq->queue[queue_last].u.callback.data = data;
2769
2770 aq->queue_n++;
2771
2772 if (DEBUG_THREAD_SIGNAL)
2773 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2774 aq->agent->device_id, aq->id);
2775 pthread_cond_signal (&aq->queue_cond_in);
2776
2777 pthread_mutex_unlock (&aq->mutex);
2778}
2779
2780/* Request that a given async thread wait for another thread (unspecified) to
2781 reach the given placeholder. The wait will occur when all previous entries
2782 on the queue are complete. A placeholder is effectively a kind of signal
2783 which simply sets a flag when encountered in a queue. */
2784
2785static void
2786queue_push_asyncwait (struct goacc_asyncqueue *aq,
2787 struct placeholder *placeholderp)
2788{
237957cc
AS
2789 pthread_mutex_lock (&aq->mutex);
2790
d88b27da
JB
2791 wait_for_queue_nonfull (aq);
2792
237957cc
AS
2793 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2794 if (DEBUG_QUEUES)
2795 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2796 aq->id, queue_last);
2797
2798 aq->queue[queue_last].type = ASYNC_WAIT;
2799 aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2800
2801 aq->queue_n++;
2802
2803 if (DEBUG_THREAD_SIGNAL)
2804 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2805 aq->agent->device_id, aq->id);
2806 pthread_cond_signal (&aq->queue_cond_in);
2807
2808 pthread_mutex_unlock (&aq->mutex);
2809}
2810
2811/* Add a placeholder into an async queue. When the async thread reaches the
2812 placeholder it will set the "executed" flag to true and continue.
2813 Another thread may be waiting on this thread reaching the placeholder. */
2814
2815static struct placeholder *
2816queue_push_placeholder (struct goacc_asyncqueue *aq)
2817{
2818 struct placeholder *placeholderp;
2819
237957cc
AS
2820 pthread_mutex_lock (&aq->mutex);
2821
d88b27da
JB
2822 wait_for_queue_nonfull (aq);
2823
237957cc
AS
2824 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2825 if (DEBUG_QUEUES)
2826 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2827 aq->id, queue_last);
2828
2829 aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2830 placeholderp = &aq->queue[queue_last].u.placeholder;
2831
2832 if (pthread_mutex_init (&placeholderp->mutex, NULL))
2833 {
2834 pthread_mutex_unlock (&aq->mutex);
2835 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2836 }
2837
2838 if (pthread_cond_init (&placeholderp->cond, NULL))
2839 {
2840 pthread_mutex_unlock (&aq->mutex);
2841 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2842 }
2843
2844 placeholderp->executed = 0;
2845
2846 aq->queue_n++;
2847
2848 if (DEBUG_THREAD_SIGNAL)
2849 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2850 aq->agent->device_id, aq->id);
2851 pthread_cond_signal (&aq->queue_cond_in);
2852
2853 pthread_mutex_unlock (&aq->mutex);
2854
2855 return placeholderp;
2856}
2857
/* Signal an asynchronous thread to terminate, and wait for it to do so.

   drain_queue_stop protocol: 0 = running, 1 = stop requested,
   2 = thread finished.  If the thread has already finished (== 2) this
   returns immediately; otherwise it requests the stop, waits for the
   thread to acknowledge, and joins it.  */

static void
finalize_async_thread (struct goacc_asyncqueue *aq)
{
  pthread_mutex_lock (&aq->mutex);
  if (aq->drain_queue_stop == 2)
    {
      /* Thread already exited; nothing to do.  */
      pthread_mutex_unlock (&aq->mutex);
      return;
    }

  aq->drain_queue_stop = 1;

  /* Wake the async thread in case it is sleeping on an empty queue.  */
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  /* Wait for the thread to set drain_queue_stop to 2; it broadcasts
     queue_cond_out just before doing so.  */
  while (aq->drain_queue_stop != 2)
    {
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
		   " to sleep\n", aq->agent->device_id, aq->id);
      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
		   aq->agent->device_id, aq->id);
    }

  GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
	     aq->id);
  pthread_mutex_unlock (&aq->mutex);

  int err = pthread_join (aq->thread_drain_queue, NULL);
  if (err != 0)
    GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
		       aq->agent->device_id, aq->id, strerror (err));
  GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
}
2898
2899/* Set up an async queue for OpenMP. There will be only one. The
2900 implementation simply uses an OpenACC async queue.
2901 FIXME: is this thread-safe if two threads call this function? */
2902
2903static void
2904maybe_init_omp_async (struct agent_info *agent)
2905{
2906 if (!agent->omp_async_queue)
2907 agent->omp_async_queue
2908 = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
2909}
2910
8d2f4ddf
JB
2911/* A wrapper that works around an issue in the HSA runtime with host-to-device
2912 copies from read-only pages. */
2913
2914static void
2915hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
2916{
2917 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
2918
2919 if (status == HSA_STATUS_SUCCESS)
2920 return;
2921
2922 /* It appears that the copy fails if the source data is in a read-only page.
2923 We can't detect that easily, so try copying the data to a temporary buffer
2924 and doing the copy again if we got an error above. */
2925
2926 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
2927 "[%p:+%d]\n", (void *) src, (int) len);
2928
2929 void *src_copy = malloc (len);
2930 memcpy (src_copy, src, len);
2931 status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
2932 free (src_copy);
2933 if (status != HSA_STATUS_SUCCESS)
2934 GOMP_PLUGIN_error ("memory copy failed");
2935}
2936
237957cc
AS
2937/* Copy data to or from a device. This is intended for use as an async
2938 callback event. */
2939
2940static void
2941copy_data (void *data_)
2942{
2943 struct copy_data *data = (struct copy_data *)data_;
2944 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
2945 data->aq->agent->device_id, data->aq->id, data->len, data->src,
2946 data->dst);
8d2f4ddf 2947 hsa_memory_copy_wrapper (data->dst, data->src, data->len);
237957cc
AS
2948 if (data->free_src)
2949 free ((void *) data->src);
2950 free (data);
2951}
2952
/* Free device data.  This is intended for use as an async callback event:
   PTR is the device pointer to release.  The device ordinal argument to
   GOMP_OFFLOAD_free is passed as 0 (it is unused by the free path, hence
   the "?:?" in the debug output -- the queue identity is unknown here).  */

static void
gomp_offload_free (void *ptr)
{
  GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
  GOMP_OFFLOAD_free (0, ptr);
}
2961
2962/* Request an asynchronous data copy, to or from a device, on a given queue.
2963 The event will be registered as a callback. If FREE_SRC is true
2964 then the source data will be freed following the copy. */
2965
2966static void
2967queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
2968 size_t len, bool free_src)
2969{
2970 if (DEBUG_QUEUES)
2971 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
2972 aq->agent->device_id, aq->id, len, src, dst);
2973 struct copy_data *data
2974 = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
2975 data->dst = dst;
2976 data->src = src;
2977 data->len = len;
2978 data->free_src = free_src;
2979 data->aq = aq;
2980 queue_push_callback (aq, copy_data, data);
2981}
2982
2983/* Return true if the given queue is currently empty. */
2984
2985static int
2986queue_empty (struct goacc_asyncqueue *aq)
2987{
2988 pthread_mutex_lock (&aq->mutex);
2989 int res = aq->queue_n == 0 ? 1 : 0;
2990 pthread_mutex_unlock (&aq->mutex);
2991
2992 return res;
2993}
2994
2995/* Wait for a given queue to become empty. This implements an OpenACC wait
2996 directive. */
2997
2998static void
2999wait_queue (struct goacc_asyncqueue *aq)
3000{
3001 if (DRAIN_QUEUE_SYNCHRONOUS_P)
3002 {
3003 drain_queue_synchronous (aq);
3004 return;
3005 }
3006
3007 pthread_mutex_lock (&aq->mutex);
3008
3009 while (aq->queue_n > 0)
3010 {
3011 if (DEBUG_THREAD_SLEEP)
3012 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3013 aq->agent->device_id, aq->id);
3014 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3015 if (DEBUG_THREAD_SLEEP)
3016 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id,
3017 aq->id);
3018 }
3019
3020 pthread_mutex_unlock (&aq->mutex);
3021 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
3022}
3023
3024/* }}} */
3025/* {{{ OpenACC support */
3026
/* Execute an OpenACC kernel, synchronously or asynchronously.  */

static void
gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
	  void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
	  struct goacc_asyncqueue *aq)
{
  if (!GOMP_OFFLOAD_can_run (kernel))
    GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");

  /* If we get here then this must be an OpenACC kernel.  */
  kernel->kind = KIND_OPENACC;

  /* devaddrs must be double-indirect on the target.  Build a device-side
     array of pointers, taking each entry from devaddrs if present, else
     from hostaddrs.  */
  void **ind_da = alloc_by_agent (kernel->agent, sizeof (void*) * mapnum);
  for (size_t i = 0; i < mapnum; i++)
    hsa_fns.hsa_memory_copy_fn (&ind_da[i],
				devaddrs[i] ? &devaddrs[i] : &hostaddrs[i],
				sizeof (void *));

  /* Locate this kernel's descriptor in the image's kernel table.
     NOTE(review): the match is by pointer equality, not strcmp -- this
     relies on kernel->name aliasing the string in kernel_infos; and
     hsa_kernel_desc is dereferenced below without a NULL check, so a
     kernel missing from the table would crash.  Confirm both invariants
     hold for every image this plugin loads.  */
  struct hsa_kernel_description *hsa_kernel_desc = NULL;
  for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
    {
      struct hsa_kernel_description *d
	= &kernel->module->image_desc->kernel_infos[i];
      if (d->name == kernel->name)
	{
	  hsa_kernel_desc = d;
	  break;
	}
    }

  /* We may have statically-determined dimensions in
     hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
     invocation at runtime in dims[].  We allow static dimensions to take
     priority over dynamic dimensions when present (non-zero).  */
  if (hsa_kernel_desc->oacc_dims[0] > 0)
    dims[0] = hsa_kernel_desc->oacc_dims[0];
  if (hsa_kernel_desc->oacc_dims[1] > 0)
    dims[1] = hsa_kernel_desc->oacc_dims[1];
  if (hsa_kernel_desc->oacc_dims[2] > 0)
    dims[2] = hsa_kernel_desc->oacc_dims[2];

  /* Ideally, when a dimension isn't explicitly specified, we should
     tune it to run 40 (or 32?) threads per CU with no threads getting queued.
     In practice, we tune for peak performance on BabelStream, which
     for OpenACC is currently 32 threads per CU.  */
  if (dims[0] == 0 && dims[1] == 0)
    {
      /* If any of the OpenACC dimensions remain 0 then we get to pick a
	 number.  There isn't really a correct answer for this without a clue
	 about the problem size, so let's do a reasonable number of workers
	 and gangs.  */

      dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs.  */
      dims[1] = 8; /* Workers.  */
    }
  else if (dims[0] == 0 && dims[1] > 0)
    {
      /* Auto-scale the number of gangs with the requested number of workers.  */
      dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
    }
  else if (dims[0] > 0 && dims[1] == 0)
    {
      /* Auto-scale the number of workers with the requested number of gangs.
	 The result is clamped to [1, 16].  */
      dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
      if (dims[1] == 0)
	dims[1] = 1;
      if (dims[1] > 16)
	dims[1] = 16;
    }

  /* The incoming dimensions are expressed in terms of gangs, workers, and
     vectors.  The HSA dimensions are expressed in terms of "work-items",
     which means multiples of vector lanes.

     The "grid size" specifies the size of the problem space, and the
     "work-group size" specifies how much of that we want a single compute
     unit to chew on at once.

     The three dimensions do not really correspond to hardware, but the
     important thing is that the HSA runtime will launch as many
     work-groups as it takes to process the entire grid, and each
     work-group will contain as many wave-fronts as it takes to process
     the work-items in that group.

     Essentially, as long as we set the Y dimension to 64 (the number of
     vector lanes in hardware), and the Z group size to the maximum (16),
     then we will get the gangs (X) and workers (Z) launched as we expect.

     The reason for the apparent reversal of vector and worker dimension
     order is to do with the way the run-time distributes work-items across
     v1 and v2.  */
  struct GOMP_kernel_launch_attributes kla =
    {3,
     /* Grid size.  */
     {dims[0], 64, dims[1]},
     /* Work-group size.  */
     {1, 64, 16}
    };

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info enqueue_launch_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      /* Emit an acc_ev_enqueue_launch_start profiling event before the
	 launch is performed or queued.  */
      prof_info->event_type = acc_ev_enqueue_launch_start;

      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      enqueue_launch_event_info.launch_event.valid_bytes
	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
      enqueue_launch_event_info.launch_event.parent_construct
	= acc_construct_parallel;
      enqueue_launch_event_info.launch_event.implicit = 1;
      enqueue_launch_event_info.launch_event.tool_info = NULL;
      enqueue_launch_event_info.launch_event.kernel_name
	= (char *) kernel->name;
      enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
      enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
      enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
	&enqueue_launch_event_info, api_info);
    }

  if (!async)
    {
      /* Synchronous launch: run now, then free the indirect pointer array
	 immediately.  */
      run_kernel (kernel, ind_da, &kla, NULL, false);
      gomp_offload_free (ind_da);
    }
  else
    {
      /* Asynchronous launch: ind_da must stay live until the queued kernel
	 has run, so free it via a queued callback after the launch.  */
      queue_push_launch (aq, kernel, ind_da, &kla);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
		   aq->agent->device_id, aq->id, ind_da);
      queue_push_callback (aq, gomp_offload_free, ind_da);
    }

  if (profiling_dispatch_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_end;
      enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
					    &enqueue_launch_event_info,
					    api_info);
    }
}
3180
3181/* }}} */
3182/* {{{ Generic Plugin API */
3183
/* Return the name of the accelerator, which is "gcn".  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  static const char plugin_name[] = "gcn";
  return plugin_name;
}
3191
3192/* Return the specific capabilities the HSA accelerator have. */
3193
3194unsigned int
3195GOMP_OFFLOAD_get_caps (void)
3196{
3197 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3198 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3199 | GOMP_OFFLOAD_CAP_OPENACC_200;
3200}
3201
/* Identify as GCN accelerator.  Reported to libgomp so it can match this
   plugin against images compiled for the GCN target.  */

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_GCN;
}
3209
/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}
3218
3219/* Return the number of GCN devices on the system. */
3220
3221int
3222GOMP_OFFLOAD_get_num_devices (void)
3223{
3224 if (!init_hsa_context ())
3225 return 0;
3226 return hsa_context.agent_count;
3227}
3228
/* Initialize device (agent) number N so that it can be used for computation.
   Return TRUE on success.  Idempotent: returns immediately if the agent is
   already initialized.  */

bool
GOMP_OFFLOAD_init_device (int n)
{
  if (!init_hsa_context ())
    return false;
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
      return false;
    }
  struct agent_info *agent = &hsa_context.agents[n];

  if (agent->initialized)
    return true;

  agent->device_id = n;

  /* Set up the locks guarding the agent's module, program, async-queue
     list, and team arena list.  */
  if (pthread_rwlock_init (&agent->module_rwlock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
      return false;
    }
  if (pthread_mutex_init (&agent->prog_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->team_arena_write_lock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
      return false;
    }
  agent->async_queues = NULL;
  agent->omp_async_queue = NULL;
  agent->team_arena_list = NULL;

  /* Query the agent properties needed to create the synchronous command
     queue and to select the ISA.  */
  uint32_t queue_size;
  hsa_status_t status;
  status = hsa_fns.hsa_agent_get_info_fn (agent->id,
					  HSA_AGENT_INFO_QUEUE_MAX_SIZE,
					  &queue_size);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error requesting maximum queue size of the GCN agent",
		      status);

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
					  &agent->name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the name of the agent", status);

  agent->device_isa = isa_code (agent->name);
  if (agent->device_isa < 0)
    return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
					  &agent->vendor_name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the vendor name of the agent", status);

  status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
					HSA_QUEUE_TYPE_MULTI,
					hsa_queue_callback, NULL, UINT32_MAX,
					UINT32_MAX, &agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error creating command queue", status);

  /* Find a memory region suitable for kernel arguments.  The iterate
     callback stops with HSA_STATUS_INFO_BREAK once a region is chosen;
     a handle still equal to (uint64_t) -1 means none was found.  */
  agent->kernarg_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
						 get_kernarg_memory_region,
						 &agent->kernarg_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->kernarg_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
			 "arguments");
      return false;
    }
  GCN_DEBUG ("Selected kernel arguments memory region:\n");
  dump_hsa_region (agent->kernarg_region, NULL);

  /* Likewise find a region for general device data.  */
  agent->data_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
						 get_data_memory_region,
						 &agent->data_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->data_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for device "
			 "data");
      return false;
    }
  GCN_DEBUG ("Selected device data memory region:\n");
  dump_hsa_region (agent->data_region, NULL);

  GCN_DEBUG ("GCN agent %d initialized\n", n);

  agent->initialized = true;
  return true;
}
3340
/* Load GCN object-code module described by struct gcn_image_desc in
   TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
   If there are any constructors then run them.  Returns the number of
   entries written to *TARGET_TABLE, or -1 on error.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table)
{
  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
      return -1;
    }

  struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
  struct agent_info *agent;
  struct addr_pair *pair;
  struct module_info *module;
  struct kernel_info *kernel;
  int kernel_count = image_desc->kernel_count;
  unsigned var_count = image_desc->global_variable_count;
  /* One extra table slot for GOMP_DEVICE_NUM_VAR, looked up below.  */
  int other_count = 1;

  agent = get_agent_info (ord);
  if (!agent)
    return -1;

  if (pthread_rwlock_wrlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
      return -1;
    }
  if (agent->prog_finalized
      && !destroy_hsa_program (agent))
    return -1;

  GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
  GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
  GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
  /* The "- 2" discounts the _init_array/_fini_array pseudo-kernels, which
     get no table entries.  NOTE(review): this assumes both pseudo-kernels
     are always present in the image; if only one (or neither) were, the
     table would be undersized -- confirm against the GCN mkoffload
     output.  */
  pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
			     * sizeof (struct addr_pair));
  *target_table = pair;
  module = (struct module_info *)
    GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
				+ kernel_count * sizeof (struct kernel_info));
  module->image_desc = image_desc;
  module->kernel_count = kernel_count;
  module->heap = NULL;
  module->constructors_run_p = false;

  kernel = &module->kernels[0];

  /* Allocate memory for kernel dependencies.  Constructor/destructor
     pseudo-kernels are remembered on the module but not reported in the
     table.  */
  for (unsigned i = 0; i < kernel_count; i++)
    {
      struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
      if (!init_basic_kernel_info (kernel, d, agent, module))
	return -1;
      if (strcmp (d->name, "_init_array") == 0)
	module->init_array_func = kernel;
      else if (strcmp (d->name, "_fini_array") == 0)
	module->fini_array_func = kernel;
      else
	{
	  pair->start = (uintptr_t) kernel;
	  pair->end = (uintptr_t) (kernel + 1);
	  pair++;
	}
      kernel++;
    }

  agent->module = module;
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return -1;
    }

  if (!create_and_finalize_hsa_program (agent))
    return -1;

  if (var_count > 0)
    {
      /* Read the per-image global-variable address/size table back from
	 device memory and report each variable's device address range.  */
      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						     ".offload_var_table",
						     agent->id,
						     0, &var_symbol);

      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not find symbol for variable in the code object",
		   status);

      uint64_t var_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
	 &var_table_addr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);

      struct {
	uint64_t addr;
	uint64_t size;
      } var_table[var_count];
      GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
			     (void*)var_table_addr, sizeof (var_table));

      for (unsigned i = 0; i < var_count; i++)
	{
	  pair->start = var_table[i].addr;
	  pair->end = var_table[i].addr + var_table[i].size;
	  GCN_DEBUG ("Found variable at %p with size %lu\n",
		     (void *)var_table[i].addr, var_table[i].size);
	  pair++;
	}
    }

  GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_DEVICE_NUM_VAR));

  /* The device-number variable is optional; if absent, record an empty
     (0, 0) range so the table slot count stays fixed.  */
  hsa_status_t status;
  hsa_executable_symbol_t var_symbol;
  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						 XSTRING (GOMP_DEVICE_NUM_VAR),
						 agent->id, 0, &var_symbol);
  if (status == HSA_STATUS_SUCCESS)
    {
      uint64_t device_num_varptr;
      uint32_t device_num_varsize;

      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
	 &device_num_varptr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
	 &device_num_varsize);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable size from its symbol", status);

      pair->start = device_num_varptr;
      pair->end = device_num_varptr + device_num_varsize;
    }
  else
    /* The 'GOMP_DEVICE_NUM_VAR' variable was not in this image.  */
    pair->start = pair->end = 0;
  pair++;

  /* Ensure that constructors are run first.  */
  struct GOMP_kernel_launch_attributes kla =
    { 3,
      /* Grid size.  */
      { 1, 64, 1 },
      /* Work-group size.  */
      { 1, 64, 1 }
    };

  if (module->init_array_func)
    {
      init_kernel (module->init_array_func);
      run_kernel (module->init_array_func, NULL, &kla, NULL, false);
    }
  module->constructors_run_p = true;

  /* Don't report kernels that libgomp need not know about.  */
  if (module->init_array_func)
    kernel_count--;
  if (module->fini_array_func)
    kernel_count--;

  return kernel_count + var_count + other_count;
}
3516
/* Unload GCN object-code module described by struct gcn_image_desc in
   TARGET_DATA from agent number N.  Return TRUE on success.  */

bool
GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
{
  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
      return false;
    }

  struct agent_info *agent;
  agent = get_agent_info (n);
  if (!agent)
    return false;

  if (pthread_rwlock_wrlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
      return false;
    }

  /* The image pointer must match the one recorded at load time.  */
  if (!agent->module || agent->module->image_desc != target_data)
    {
      GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
			 "loaded before");
      return false;
    }

  /* Tear down the module (running destructors -- second arg true), then
     the finalized HSA program.  */
  if (!destroy_module (agent->module, true))
    return false;
  free (agent->module);
  agent->module = NULL;
  if (!destroy_hsa_program (agent))
    return false;
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return false;
    }
  return true;
}
3562
/* Deinitialize all information and status associated with agent number N.  We
   do not attempt any synchronization, assuming the user and libgomp will not
   attempt deinitialization of a device that is in any way being used at the
   same time.  Return TRUE on success.  */

bool
GOMP_OFFLOAD_fini_device (int n)
{
  struct agent_info *agent = get_agent_info (n);
  if (!agent)
    return false;

  /* Nothing to do if init_device never completed for this agent.  */
  if (!agent->initialized)
    return true;

  /* Tear down resources in reverse order of creation: the OpenMP async
     queue, the loaded module (without running destructors -- second arg
     false), team arenas, the HSA program, the sync queue, and finally the
     agent's locks.  */
  if (agent->omp_async_queue)
    {
      GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
      agent->omp_async_queue = NULL;
    }

  if (agent->module)
    {
      if (!destroy_module (agent->module, false))
	return false;
      free (agent->module);
      agent->module = NULL;
    }

  if (!destroy_team_arenas (agent))
    return false;

  if (!destroy_hsa_program (agent))
    return false;

  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error destroying command queue", status);

  if (pthread_mutex_destroy (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
      return false;
    }
  if (pthread_rwlock_destroy (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
      return false;
    }

  if (pthread_mutex_destroy (&agent->async_queues_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_destroy (&agent->team_arena_write_lock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
      return false;
    }
  agent->initialized = false;
  return true;
}
3626
3627/* Return true if the HSA runtime can run function FN_PTR. */
3628
3629bool
3630GOMP_OFFLOAD_can_run (void *fn_ptr)
3631{
3632 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3633
3634 init_kernel (kernel);
3635 if (kernel->initialization_failed)
3636 goto failure;
3637
3638 return true;
3639
3640failure:
3641 if (suppress_host_fallback)
3642 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3643 GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
3644 return false;
3645}
3646
3647/* Allocate memory on device N. */
3648
3649void *
3650GOMP_OFFLOAD_alloc (int n, size_t size)
3651{
3652 struct agent_info *agent = get_agent_info (n);
3653 return alloc_by_agent (agent, size);
3654}
3655
/* Free memory from device N.  Returns false (after reporting) if the HSA
   runtime rejects the free.  */

bool
GOMP_OFFLOAD_free (int device, void *ptr)
{
  GCN_DEBUG ("Freeing memory on device %d\n", device);

  hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not free device memory", status);
      return false;
    }

  /* If an OpenACC profiling callback is registered on this thread, emit an
     acc_ev_free event for the freed pointer.  */
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_free;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
	= acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = 0;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
					    api_info);
    }

  return true;
}
3701
3702/* Copy data from DEVICE to host. */
3703
3704bool
3705GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
3706{
3707 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
3708 src, dst);
8d2f4ddf
JB
3709 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3710 if (status != HSA_STATUS_SUCCESS)
3711 GOMP_PLUGIN_error ("memory copy failed");
237957cc
AS
3712 return true;
3713}
3714
/* Copy N bytes from host address SRC to DEVICE address DST.
   NOTE(review): unlike dev2host, the copy goes through
   hsa_memory_copy_wrapper and no status is checked here -- presumably the
   wrapper reports/handles failures itself; confirm at its definition.  */

bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
	     device, dst);
  hsa_memory_copy_wrapper (dst, src, n);
  return true;
}
3725
3726/* Copy data within DEVICE. Do the copy asynchronously, if appropriate. */
3727
3728bool
3729GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
3730{
3731 struct gcn_thread *thread_data = gcn_thread ();
3732
3733 if (thread_data && !async_synchronous_p (thread_data->async))
3734 {
3735 struct agent_info *agent = get_agent_info (device);
3736 maybe_init_omp_async (agent);
3737 queue_push_copy (agent->omp_async_queue, dst, src, n, false);
3738 return true;
3739 }
3740
3741 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
3742 device, src, device, dst);
8d2f4ddf
JB
3743 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3744 if (status != HSA_STATUS_SUCCESS)
3745 GOMP_PLUGIN_error ("memory copy failed");
237957cc
AS
3746 return true;
3747}
3748
3749/* }}} */
/* }}} */
/* {{{ OpenMP Plugin API  */

/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
   in VARS as a parameter.  The kernel is identified by FN_PTR which must point
   to a kernel_info structure, and must have previously been loaded to the
   specified device.  */

void
GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
{
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  /* Derive the launch attributes from ARGS (falling back to DEF); a zero
     grid size means there is nothing to launch.  */
  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }
  run_kernel (kernel, vars, kla, NULL, false);
}
3776
/* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
   GOMP_OFFLOAD_run except that the launch is queued and there is a call to
   GOMP_PLUGIN_target_task_completion when it has finished.  */

void
GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
			void **args, void *async_data)
{
  GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }

  /* Queue the launch followed by the completion callback, so libgomp is
     notified once the kernel has finished.  */
  maybe_init_omp_async (agent);
  queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
  queue_push_callback (agent->omp_async_queue,
		       GOMP_PLUGIN_target_task_completion, async_data);
}
3806
3807/* }}} */
3808/* {{{ OpenACC Plugin API */
3809
3810/* Run a synchronous OpenACC kernel. The device number is inferred from the
3811 already-loaded KERNEL. */
3812
3813void
3814GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
3815 void **hostaddrs, void **devaddrs, unsigned *dims,
3816 void *targ_mem_desc)
3817{
3818 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3819
3820 gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, false,
3821 NULL);
3822}
3823
3824/* Run an asynchronous OpenACC kernel on the specified queue. */
3825
f5a70027
JB
/* Entry point for running a synchronous OpenACC kernel via a parameter
   block; this plugin does not implement it and aborts if called.  */

void
GOMP_OFFLOAD_openacc_exec_params (void (*fn_ptr) (void *), size_t mapnum,
				  void **hostaddrs, void **devaddrs,
				  unsigned *dims, void *targ_mem_desc)
{
  GOMP_PLUGIN_fatal ("OpenACC exec params unimplemented.");
}
3833
237957cc
AS
3834void
3835GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
3836 void **hostaddrs, void **devaddrs,
3837 unsigned *dims, void *targ_mem_desc,
3838 struct goacc_asyncqueue *aq)
3839{
3840 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3841
3842 gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, true,
3843 aq);
3844}
3845
3846/* Create a new asynchronous thread and queue for running future kernels. */
3847
f5a70027
JB
/* Entry point for running an asynchronous OpenACC kernel via a parameter
   block; this plugin does not implement it and aborts if called.  */

void
GOMP_OFFLOAD_openacc_async_exec_params (void (*fn) (void *), size_t mapnum,
					void **hostaddrs, void **devaddrs,
					unsigned *dims, void *targ_mem_desc,
					struct goacc_asyncqueue *aq)
{
  GOMP_PLUGIN_fatal ("OpenACC async exec params unimplemented.");
}
3856
237957cc
AS
3857struct goacc_asyncqueue *
3858GOMP_OFFLOAD_openacc_async_construct (int device)
3859{
3860 struct agent_info *agent = get_agent_info (device);
3861
3862 pthread_mutex_lock (&agent->async_queues_mutex);
3863
3864 struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
3865 aq->agent = get_agent_info (device);
3866 aq->prev = NULL;
3867 aq->next = agent->async_queues;
3868 if (aq->next)
3869 {
3870 aq->next->prev = aq;
3871 aq->id = aq->next->id + 1;
3872 }
3873 else
3874 aq->id = 1;
3875 agent->async_queues = aq;
3876
3877 aq->queue_first = 0;
3878 aq->queue_n = 0;
3879 aq->drain_queue_stop = 0;
3880
3881 if (pthread_mutex_init (&aq->mutex, NULL))
3882 {
3883 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3884 return false;
3885 }
3886 if (pthread_cond_init (&aq->queue_cond_in, NULL))
3887 {
3888 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
3889 return false;
3890 }
3891 if (pthread_cond_init (&aq->queue_cond_out, NULL))
3892 {
3893 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
3894 return false;
3895 }
3896
3897 hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
3898 ASYNC_QUEUE_SIZE,
3899 HSA_QUEUE_TYPE_MULTI,
3900 hsa_queue_callback, NULL,
3901 UINT32_MAX, UINT32_MAX,
3902 &aq->hsa_queue);
3903 if (status != HSA_STATUS_SUCCESS)
3904 hsa_fatal ("Error creating command queue", status);
3905
3906 int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
3907 if (err != 0)
3908 GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
3909 strerror (err));
3910 GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
3911 aq->id);
3912
3913 pthread_mutex_unlock (&agent->async_queues_mutex);
3914
3915 return aq;
3916}
3917
93d90219 3918/* Destroy an existing asynchronous thread and queue. Waits for any
237957cc
AS
3919 currently-running task to complete, but cancels any queued tasks. */
3920
/* Destroy the asynchronous queue AQ and its drain thread.  Waits for any
   currently-running task to complete, but cancels any queued tasks.
   Returns TRUE on success.  */

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = aq->agent;

  /* Stop the drain thread before tearing down its synchronization
     objects.  */
  finalize_async_thread (aq);

  pthread_mutex_lock (&agent->async_queues_mutex);

  int err;
  if ((err = pthread_mutex_destroy (&aq->mutex)))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_in))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_out))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Error destroying command queue", status);
      goto fail;
    }

  /* Unlink AQ from the agent's doubly-linked queue list.  */
  if (aq->prev)
    aq->prev->next = aq->next;
  if (aq->next)
    aq->next->prev = aq->prev;
  if (agent->async_queues == aq)
    agent->async_queues = aq->next;

  GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);

  free (aq);
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return true;

fail:
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return false;
}
3970
/* Return non-zero if the specified async queue is currently empty
   (i.e. all its tasks have been drained).  */

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  return queue_empty (aq);
}
3978
/* Block until the specified queue has executed all its tasks and the
   queue is empty.  Always returns TRUE.  */

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  wait_queue (aq);
  return true;
}
3988
3989/* Add a serialization point across two async queues. Any new tasks added to
3990 AQ2, after this call, will not run until all tasks on AQ1, at the time
3991 of this call, have completed. */
3992
3993bool
3994GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
3995 struct goacc_asyncqueue *aq2)
3996{
3997 /* For serialize, stream aq2 waits for aq1 to complete work that has been
3998 scheduled to run on it up to this point. */
3999 if (aq1 != aq2)
4000 {
4001 struct placeholder *placeholderp = queue_push_placeholder (aq1);
4002 queue_push_asyncwait (aq2, placeholderp);
4003 }
4004 return true;
4005}
4006
/* Add an opaque callback FN (called with DATA) to the given async queue;
   it runs on the queue's drain thread in FIFO order.  */

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*fn) (void *), void *data)
{
  queue_push_callback (aq, fn, data);
}
4015
4016/* Queue up an asynchronous data copy from host to DEVICE. */
4017
4018bool
4019GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
0e72dbd0
JB
4020 size_t n, bool ephemeral,
4021 struct goacc_asyncqueue *aq)
237957cc
AS
4022{
4023 struct agent_info *agent = get_agent_info (device);
4024 assert (agent == aq->agent);
0e72dbd0
JB
4025
4026 if (ephemeral)
4027 {
4028 /* The source data is on the stack or otherwise may be deallocated
4029 before the asynchronous copy takes place. Take a copy of the source
4030 data. */
4031 void *src_copy = GOMP_PLUGIN_malloc (n);
4032 memcpy (src_copy, src, n);
4033 src = src_copy;
4034 }
4035 queue_push_copy (aq, dst, src, n, ephemeral);
237957cc
AS
4036 return true;
4037}
4038
4039/* Queue up an asynchronous data copy from DEVICE to host. */
4040
4041bool
4042GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
4043 size_t n, struct goacc_asyncqueue *aq)
4044{
4045 struct agent_info *agent = get_agent_info (device);
4046 assert (agent == aq->agent);
4047 queue_push_copy (aq, dst, src, n, false);
4048 return true;
4049}
4050
6fc0385c
TS
4051union goacc_property_value
4052GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
4053{
2e5ea579
FH
4054 struct agent_info *agent = get_agent_info (device);
4055
4056 union goacc_property_value propval = { .val = 0 };
4057
4058 switch (prop)
4059 {
4060 case GOACC_PROPERTY_FREE_MEMORY:
4061 /* Not supported. */
4062 break;
4063 case GOACC_PROPERTY_MEMORY:
4064 {
4065 size_t size;
4066 hsa_region_t region = agent->data_region;
4067 hsa_status_t status =
4068 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
4069 if (status == HSA_STATUS_SUCCESS)
4070 propval.val = size;
4071 break;
4072 }
4073 case GOACC_PROPERTY_NAME:
4074 propval.ptr = agent->name;
4075 break;
4076 case GOACC_PROPERTY_VENDOR:
4077 propval.ptr = agent->vendor_name;
4078 break;
4079 case GOACC_PROPERTY_DRIVER:
4080 propval.ptr = hsa_context.driver_version_s;
4081 break;
4082 }
6fc0385c 4083
2e5ea579 4084 return propval;
6fc0385c
TS
4085}
4086
237957cc
AS
4087/* Set up plugin-specific thread-local-data (host-side). */
4088
4089void *
4090GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
4091{
4092 struct gcn_thread *thread_data
4093 = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
4094
4095 thread_data->async = GOMP_ASYNC_SYNC;
4096
4097 return (void *) thread_data;
4098}
4099
/* Clean up plugin-specific thread-local-data.  DATA is the struct
   gcn_thread allocated by GOMP_OFFLOAD_openacc_create_thread_data;
   free (NULL) is a harmless no-op.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
4107
f5bfc65f
CLT
/* Indicate which GOMP_REQUIRES_* features are supported.  Currently none
   are, so the call succeeds only when the caller requires nothing
   (MASK is zero).  */

bool
GOMP_OFFLOAD_supported_features (unsigned int *mask)
{
  if (*mask != 0)
    return false;
  return true;
}
4115
237957cc 4116/* }}} */
This page took 0.573291 seconds and 5 git commands to generate.