/* Copyright (C) 2005-2015 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file handles the ORDERED construct.  */

#include "libgomp.h"
#include <stdarg.h>
#include <string.h>
#include "doacross.h"


/* This function is called when first allocating an iteration block.  That
   is, the thread is not currently on the queue.  The work-share lock must
   be held on entry.  */

void
gomp_ordered_first (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned index;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  index = ws->ordered_cur + ws->ordered_num_used;
  if (index >= team->nthreads)
    index -= team->nthreads;
  ws->ordered_team_ids[index] = thr->ts.team_id;

  /* If this is the first and only thread in the queue, then there is
     no one to release us when we get to our ordered section.  Post to
     our own release queue now so that we won't block later.  */
  if (ws->ordered_num_used++ == 0)
    gomp_sem_post (team->ordered_release[thr->ts.team_id]);
}
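
/* For illustration: ordered_team_ids behaves as a circular queue of
   length nthreads.  With a team of 4 threads, ordered_cur == 2 and
   ordered_num_used == 3, a thread entering gomp_ordered_first above
   computes

     index = 2 + 3;                  5, past the end of the 4-entry array
     index -= 4;                     wraps around to slot 1
     ordered_team_ids[1] = thr->ts.team_id;

   i.e. new arrivals are appended behind the current owner, modulo
   nthreads.  */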

/* This function is called when completing the last iteration block.  That
   is, there are no more iterations to perform and so the thread should be
   removed from the queue entirely.  Because of the way ORDERED blocks are
   managed, it follows that we currently own access to the ORDERED block,
   and should now pass it on to the next thread.  The work-share lock must
   be held on entry.  */

void
gomp_ordered_last (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned next_id;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* We're no longer the owner.  */
  ws->ordered_owner = -1;

  /* If we're not the last thread in the queue, then wake the next.  */
  if (--ws->ordered_num_used > 0)
    {
      unsigned next = ws->ordered_cur + 1;
      if (next == team->nthreads)
        next = 0;
      ws->ordered_cur = next;

      next_id = ws->ordered_team_ids[next];
      gomp_sem_post (team->ordered_release[next_id]);
    }
}


/* This function is called when allocating a subsequent iteration block.
   That is, we're done with the current iteration block and we're allocating
   another.  This is the logical combination of a call to gomp_ordered_last
   followed by a call to gomp_ordered_first.  The work-share lock must be
   held on entry.  */

void
gomp_ordered_next (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned index, next_id;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* We're no longer the owner.  */
  ws->ordered_owner = -1;

  /* If there's only one thread in the queue, that must be us.  */
  if (ws->ordered_num_used == 1)
    {
      /* We have a situation similar to the one in gomp_ordered_first,
         where we need to post to our own release semaphore.  */
      gomp_sem_post (team->ordered_release[thr->ts.team_id]);
      return;
    }

  /* If the queue is entirely full, then we move ourselves to the end of
     the queue merely by incrementing ordered_cur.  Only if it's not
     full do we have to write our id.  */
  if (ws->ordered_num_used < team->nthreads)
    {
      index = ws->ordered_cur + ws->ordered_num_used;
      if (index >= team->nthreads)
        index -= team->nthreads;
      ws->ordered_team_ids[index] = thr->ts.team_id;
    }

  index = ws->ordered_cur + 1;
  if (index == team->nthreads)
    index = 0;
  ws->ordered_cur = index;

  next_id = ws->ordered_team_ids[index];
  gomp_sem_post (team->ordered_release[next_id]);
}


/* This function is called when a statically scheduled loop is first
   being created.  */

void
gomp_ordered_static_init (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;

  if (team == NULL || team->nthreads == 1)
    return;

  gomp_sem_post (team->ordered_release[0]);
}

/* This function is called when a statically scheduled loop is moving to
   the next iteration block.  Static schedules are not first come first
   served like the others, so we move to the numerically next thread,
   not the next thread on a list.  The work-share lock should *not* be held
   on entry.  */

void
gomp_ordered_static_next (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned id = thr->ts.team_id;

  if (team == NULL || team->nthreads == 1)
    return;

  ws->ordered_owner = -1;

  /* This thread currently owns the lock.  Increment the owner.  */
  if (++id == team->nthreads)
    id = 0;
  ws->ordered_team_ids[0] = id;
  gomp_sem_post (team->ordered_release[id]);
}

/* This function is called when we need to assert that the thread owns the
   ordered section.  Due to the problem of posted-but-not-waited semaphores,
   this needs to happen before completing a loop iteration.  */

void
gomp_ordered_sync (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;

  /* Work share constructs can be orphaned.  But this clearly means that
     we are the only thread, and so we automatically own the section.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* ??? I believe it to be safe to access this data without taking the
     ws->lock.  The only presumed race condition is with the previous
     thread on the queue incrementing ordered_cur such that it points
     to us, concurrently with our check below.  But our team_id is
     already present in the queue, and the other thread will always
     post to our release semaphore.  So the two cases are that we will
     either win the race and momentarily block on the semaphore, or lose
     the race and find the semaphore already unlocked and so not block.
     Either way we get correct results.
     However, there is an implicit flush on entry to an ordered region,
     so we do need to have a barrier here.  If we were taking a lock
     this could be MEMMODEL_RELEASE since the acquire would be covered
     by the lock.  */

  __atomic_thread_fence (MEMMODEL_ACQ_REL);
  if (ws->ordered_owner != thr->ts.team_id)
    {
      gomp_sem_wait (team->ordered_release[thr->ts.team_id]);
      ws->ordered_owner = thr->ts.team_id;
    }
}

/* This function is called by user code when encountering the start of an
   ORDERED block.  We must check to see if the current thread is at the
   head of the queue, and if not, block.  */

#ifdef HAVE_ATTRIBUTE_ALIAS
extern void GOMP_ordered_start (void)
        __attribute__((alias ("gomp_ordered_sync")));
#else
void
GOMP_ordered_start (void)
{
  gomp_ordered_sync ();
}
#endif

/* This function is called by user code when encountering the end of an
   ORDERED block.  With the current ORDERED implementation there's nothing
   for us to do.

   However, the current implementation has a flaw in that it does not allow
   the next thread into the ORDERED section immediately after the current
   thread exits the ORDERED section in its last iteration.  The existence
   of this function allows the implementation to change.  */

void
GOMP_ordered_end (void)
{
}
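
/* For illustration (sketch only; hypothetical user code, and the exact
   lowering is up to the compiler): a loop such as

     #pragma omp for ordered schedule(dynamic)
     for (i = 0; i < n; i++)
       {
         compute (i);              placeholder user function
         #pragma omp ordered
         emit (i);                 placeholder user function
       }

   reaches the two entry points above: the body of the ordered region is
   bracketed by calls to GOMP_ordered_start and GOMP_ordered_end, so each
   thread computes in parallel but emits in iteration order.  */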

/* DOACROSS initialization.  */

#define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)

void
gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    return;

  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
        return;

      if (num_bits <= MAX_COLLAPSED_BITS)
        {
          unsigned int this_bits;
          if (counts[i] == 1)
            this_bits = 1;
          else
            this_bits = __SIZEOF_LONG__ * __CHAR_BIT__
                        - __builtin_clzl (counts[i] - 1);
          if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
            {
              bits[i] = this_bits;
              num_bits += this_bits;
            }
          else
            num_bits = MAX_COLLAPSED_BITS + 1;
        }
    }

  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      elt_sz = sizeof (unsigned long) * ncounts;
      shift_sz = 0;
    }
  elt_sz = (elt_sz + 63) & ~63UL;

  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
                          + shift_sz);
  doacross->chunk_size = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  doacross->array = (unsigned char *)
                    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
                     & ~(uintptr_t) 63);
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      unsigned int shift_count = 0;
      doacross->flattened = true;
      for (i = ncounts; i > 0; i--)
        {
          doacross->shift_counts[i - 1] = shift_count;
          shift_count += bits[i - 1];
        }
      for (ent = 0; ent < num_ents; ent++)
        *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
              sizeof (unsigned long) * ncounts);
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      unsigned long q = counts[0] / num_ents;
      unsigned long t = counts[0] % num_ents;
      doacross->boundary = t * (q + 1);
      doacross->q = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}
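
/* For illustration of the flattened encoding set up above (numbers
   chosen only as an example): a collapse(2) doacross loop with counts
   { 1000, 20 } gets

     bits[0] = 10        smallest width holding 0 .. 999
     bits[1] = 5         smallest width holding 0 .. 19

   so num_bits == 15 <= MAX_COLLAPSED_BITS, and the loop above assigns
   shift_counts[1] = 0 and shift_counts[0] = 5.  An iteration vector
   (i, j) is then packed into a single unsigned long as (i << 5) | j,
   which is the representation GOMP_doacross_post stores (plus one) and
   GOMP_doacross_wait compares.  */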

/* DOACROSS POST operation.  */

void
GOMP_doacross_post (long *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
                                            + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long flattened
        = (unsigned long) counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
        flattened |= (unsigned long) counts[i]
                     << doacross->shift_counts[i];
      flattened++;
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
        __atomic_thread_fence (MEMMODEL_RELEASE);
      else
        __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  for (i = doacross->ncounts; i-- > 0; )
    {
      if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
        __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
    }
}
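
/* For illustration (sketch of the expected lowering; code generation
   details are the compiler's): in a loop nest declared with
   "#pragma omp for ordered(2)", a source dependence

     #pragma omp ordered depend(source)

   becomes a call of the form GOMP_doacross_post (counts), where
   counts[] holds the zero-based iteration vector of the posting
   thread.  */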

/* DOACROSS WAIT operation.  */

void
GOMP_doacross_wait (long first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size == 0)
        {
          if (first < doacross->boundary)
            ent = first / (doacross->q + 1);
          else
            ent = (first - doacross->boundary) / doacross->q
                  + doacross->t;
        }
      else
        ent = first / ws->chunk_size % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
                                            + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long flattened
        = (unsigned long) first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
        flattened |= (unsigned long) va_arg (ap, long)
                     << doacross->shift_counts[i];
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      if (flattened < cur)
        {
          __atomic_thread_fence (MEMMODEL_RELEASE);
          va_end (ap);
          return;
        }
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  do
    {
      va_start (ap, first);
      for (i = 0; i < doacross->ncounts; i++)
        {
          unsigned long thisv
            = (unsigned long) (i ? va_arg (ap, long) : first) + 1;
          unsigned long cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
          if (thisv < cur)
            {
              i = doacross->ncounts;
              break;
            }
          if (thisv > cur)
            break;
        }
      va_end (ap);
      if (i == doacross->ncounts)
        break;
      cpu_relax ();
    }
  while (1);
  __sync_synchronize ();
}
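
/* For illustration (sketch of the expected lowering): a sink dependence
   such as

     #pragma omp ordered depend(sink: i - 1, j)

   in the same "ordered(2)" loop nest becomes a call of the form
   GOMP_doacross_wait (i - 1, j).  The loop above then spins, via
   doacross_spin or cpu_relax, until the requested iteration has been
   posted.  */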

typedef unsigned long long gomp_ull;

void
gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    return;

  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
        return;

      if (num_bits <= MAX_COLLAPSED_BITS)
        {
          unsigned int this_bits;
          if (counts[i] == 1)
            this_bits = 1;
          else
            this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__
                        - __builtin_clzll (counts[i] - 1);
          if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
            {
              bits[i] = this_bits;
              num_bits += this_bits;
            }
          else
            num_bits = MAX_COLLAPSED_BITS + 1;
        }
    }

  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      if (sizeof (gomp_ull) == sizeof (unsigned long))
        elt_sz = sizeof (gomp_ull) * ncounts;
      else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long))
        elt_sz = sizeof (unsigned long) * 2 * ncounts;
      else
        abort ();
      shift_sz = 0;
    }
  elt_sz = (elt_sz + 63) & ~63UL;

  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
                          + shift_sz);
  doacross->chunk_size_ull = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  doacross->boundary = 0;
  doacross->array = (unsigned char *)
                    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
                     & ~(uintptr_t) 63);
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      unsigned int shift_count = 0;
      doacross->flattened = true;
      for (i = ncounts; i > 0; i--)
        {
          doacross->shift_counts[i - 1] = shift_count;
          shift_count += bits[i - 1];
        }
      for (ent = 0; ent < num_ents; ent++)
        *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
              sizeof (unsigned long) * ncounts);
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      gomp_ull q = counts[0] / num_ents;
      gomp_ull t = counts[0] % num_ents;
      doacross->boundary_ull = t * (q + 1);
      doacross->q_ull = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}

/* DOACROSS POST operation.  */

void
GOMP_doacross_ull_post (gomp_ull *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      gomp_ull flattened
        = counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
        flattened |= counts[i] << doacross->shift_counts[i];
      flattened++;
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
        __atomic_thread_fence (MEMMODEL_RELEASE);
      else
        __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      gomp_ull *array = (gomp_ull *) (doacross->array
                                      + ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
        {
          if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
            __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
        }
    }
  else
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
        {
          gomp_ull cull = counts[i] + 1UL;
          unsigned long c = (unsigned long) cull;
          if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED))
            __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE);
          c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
          if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED))
            __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE);
        }
    }
}
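
/* For illustration of the split layout used above when gomp_ull is
   wider than unsigned long (e.g. a 32-bit target with 64-bit long
   long): each count occupies two consecutive unsigned longs, high half
   first, so a posted value v for dimension i is kept as

     array[2 * i]     = v >> 32;                 high half
     array[2 * i + 1] = (unsigned long) v;       low half

   and GOMP_doacross_ull_wait below compares the halves in the same
   order, high half before low half.  */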

/* DOACROSS WAIT operation.  */

void
GOMP_doacross_ull_wait (gomp_ull first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size_ull == 0)
        {
          if (first < doacross->boundary_ull)
            ent = first / (doacross->q_ull + 1);
          else
            ent = (first - doacross->boundary_ull) / doacross->q_ull
                  + doacross->t;
        }
      else
        ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      gomp_ull flattened = first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
        flattened |= va_arg (ap, gomp_ull)
                     << doacross->shift_counts[i];
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      if (flattened < cur)
        {
          __atomic_thread_fence (MEMMODEL_RELEASE);
          va_end (ap);
          return;
        }
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      gomp_ull *array = (gomp_ull *) (doacross->array
                                      + ent * doacross->elt_sz);
      do
        {
          va_start (ap, first);
          for (i = 0; i < doacross->ncounts; i++)
            {
              gomp_ull thisv
                = (i ? va_arg (ap, gomp_ull) : first) + 1;
              gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
              if (thisv < cur)
                {
                  i = doacross->ncounts;
                  break;
                }
              if (thisv > cur)
                break;
            }
          va_end (ap);
          if (i == doacross->ncounts)
            break;
          cpu_relax ();
        }
      while (1);
    }
  else
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      do
        {
          va_start (ap, first);
          for (i = 0; i < doacross->ncounts; i++)
            {
              gomp_ull thisv
                = (i ? va_arg (ap, gomp_ull) : first) + 1;
              unsigned long t
                = thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
              unsigned long cur
                = __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED);
              if (t < cur)
                {
                  i = doacross->ncounts;
                  break;
                }
              if (t > cur)
                break;
              t = thisv;
              cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED);
              if (t < cur)
                {
                  i = doacross->ncounts;
                  break;
                }
              if (t > cur)
                break;
            }
          va_end (ap);
          if (i == doacross->ncounts)
            break;
          cpu_relax ();
        }
      while (1);
    }
  __sync_synchronize ();
}