HIP: Heterogeneous-computing Interface for Portability
hip_cooperative_groups.h
Go to the documentation of this file.
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
32 #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
33 #define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
34 
35 #if __cplusplus
37 
38 namespace cooperative_groups {
39 
class thread_group {
 protected:
  uint32_t _type;  // thread_group type
  uint32_t _size;  // total number of threads in the thread_group
  uint64_t _mask;  // Lanemask for coalesced and tiled partitioned group types,
                   // LSB represents lane 0, and MSB represents lane 63

  // Construct a thread group, and set thread group type and other essential
  // thread group properties. This generic thread group is directly constructed
  // only when the group is supposed to contain only the calling thread
  // (through the API - `this_thread()`), and in all other cases, this thread
  // group object is a sub-object of some other derived thread group object
  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size,
                                uint64_t mask = (uint64_t)0) {
    _type = type;
    _size = size;
    _mask = mask;
  }

  // Bookkeeping for groups produced by tiled_partition().
  struct _tiled_info {
    bool is_tiled;      // true once the group was created by a tiled partition
    unsigned int size;  // tile size recorded at partition time
  } tiled_info;

  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend class thread_block;

 public:
  // Total number of threads in the thread group, and this serves the purpose
  // for all derived cooperative group types since their `size` is directly
  // saved during the construction
  __CG_QUALIFIER__ uint32_t size() const { return _size; }
  // Concrete cooperative-group type tag stored at construction time
  __CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
  // Rank of the calling thread within [0, size())
  __CG_QUALIFIER__ uint32_t thread_rank() const;
  // Is this cooperative group type valid?
  __CG_QUALIFIER__ bool is_valid() const;
  // synchronize the threads in the thread group
  __CG_QUALIFIER__ void sync() const;
};
86 
93 class multi_grid_group : public thread_group {
94  // Only these friend functions are allowed to construct an object of this class
95  // and access its resources
96  friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
97 
98  protected:
99  // Construct mutli-grid thread group (through the API this_multi_grid())
100  explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
101  : thread_group(internal::cg_multi_grid, size) {}
102 
103  public:
104  // Number of invocations participating in this multi-grid group. In other
105  // words, the number of GPUs
106  __CG_QUALIFIER__ uint32_t num_grids() { return internal::multi_grid::num_grids(); }
107  // Rank of this invocation. In other words, an ID number within the range
108  // [0, num_grids()) of the GPU, this kernel is running on
109  __CG_QUALIFIER__ uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
110  __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); }
111  __CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
112  __CG_QUALIFIER__ void sync() const { internal::multi_grid::sync(); }
113 };
114 
122 __CG_QUALIFIER__ multi_grid_group this_multi_grid() {
123  return multi_grid_group(internal::multi_grid::size());
124 }
125 
132 class grid_group : public thread_group {
133  // Only these friend functions are allowed to construct an object of this class
134  // and access its resources
135  friend __CG_QUALIFIER__ grid_group this_grid();
136 
137  protected:
138  // Construct grid thread group (through the API this_grid())
139  explicit __CG_QUALIFIER__ grid_group(uint32_t size) : thread_group(internal::cg_grid, size) {}
140 
141  public:
142  __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::grid::thread_rank(); }
143  __CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }
144  __CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
145 };
146 
154 __CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::size()); }
155 
163 class thread_block : public thread_group {
164  // Only these friend functions are allowed to construct an object of thi
165  // class and access its resources
166  friend __CG_QUALIFIER__ thread_block this_thread_block();
167  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
168  unsigned int tile_size);
169  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent,
170  unsigned int tile_size);
171 
172  protected:
173  // Construct a workgroup thread group (through the API this_thread_block())
174  explicit __CG_QUALIFIER__ thread_block(uint32_t size)
175  : thread_group(internal::cg_workgroup, size) {}
176 
177  __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
178  const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
179  // Invalid tile size, assert
180  if (!tile_size || (tile_size > WAVEFRONT_SIZE) || !pow2) {
181  assert(false && "invalid tile size");
182  }
183 
184  thread_group tiledGroup = thread_group(internal::cg_tiled_group, tile_size);
185  tiledGroup.tiled_info.size = tile_size;
186  tiledGroup.tiled_info.is_tiled = true;
187  return tiledGroup;
188  }
189 
190  public:
191  // 3-dimensional block index within the grid
192  __CG_QUALIFIER__ dim3 group_index() { return internal::workgroup::group_index(); }
193  // 3-dimensional thread index within the block
194  __CG_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
195  __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::workgroup::thread_rank(); }
196  __CG_QUALIFIER__ bool is_valid() const { return internal::workgroup::is_valid(); }
197  __CG_QUALIFIER__ void sync() const { internal::workgroup::sync(); }
198 };
199 
207 __CG_QUALIFIER__ thread_block this_thread_block() {
208  return thread_block(internal::workgroup::size());
209 }
210 
class tiled_group : public thread_group {
 private:
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent,
                                                      unsigned int tile_size);

  // Re-partition this tile into smaller tiles of `tile_size` threads.
  // `tile_size` must be a non-zero power of two no larger than the wavefront
  // size; anything else trips the assert (debug builds only).
  __CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);

    if (!tile_size || (tile_size > WAVEFRONT_SIZE) || !pow2) {
      assert(false && "invalid tile size");
    }

    // Requesting a tile at least as large as this group is a no-op: hand
    // back the current group unchanged.
    if (size() <= tile_size) {
      return (*this);
    }

    tiled_group tiledGroup = tiled_group(tile_size);
    tiledGroup.tiled_info.is_tiled = true;
    return tiledGroup;
  }

 protected:
  // Construct a tile of `tileSize` threads; the size is recorded both as the
  // base-class _size and in tiled_info.size.
  explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize)
      : thread_group(internal::cg_tiled_group, tileSize) {
    tiled_info.size = tileSize;
    tiled_info.is_tiled = true;
  }

 public:
  // Number of threads in this tile (shadows thread_group::size()).
  __CG_QUALIFIER__ unsigned int size() const { return (tiled_info.size); }

  // Rank of the calling thread within [0, size()). The mask is valid because
  // tiled_info.size is a power of two, so it keeps the low bits of the
  // workgroup-level rank.
  __CG_QUALIFIER__ unsigned int thread_rank() const {
    return (internal::workgroup::thread_rank() & (tiled_info.size - 1));
  }

  __CG_QUALIFIER__ void sync() const {
    // enforce memory ordering for memory instructions.
    __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent");
  }
};
259 
263 __CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
264  switch (this->_type) {
265  case internal::cg_multi_grid: {
266  return (static_cast<const multi_grid_group*>(this)->thread_rank());
267  }
268  case internal::cg_grid: {
269  return (static_cast<const grid_group*>(this)->thread_rank());
270  }
271  case internal::cg_workgroup: {
272  return (static_cast<const thread_block*>(this)->thread_rank());
273  }
274  case internal::cg_tiled_group: {
275  return (static_cast<const tiled_group*>(this)->thread_rank());
276  }
277  default: {
278  assert(false && "invalid cooperative group type");
279  return -1;
280  }
281  }
282 }
283 
284 __CG_QUALIFIER__ bool thread_group::is_valid() const {
285  switch (this->_type) {
286  case internal::cg_multi_grid: {
287  return (static_cast<const multi_grid_group*>(this)->is_valid());
288  }
289  case internal::cg_grid: {
290  return (static_cast<const grid_group*>(this)->is_valid());
291  }
292  case internal::cg_workgroup: {
293  return (static_cast<const thread_block*>(this)->is_valid());
294  }
295  case internal::cg_tiled_group: {
296  return (static_cast<const tiled_group*>(this)->is_valid());
297  }
298  default: {
299  assert(false && "invalid cooperative group type");
300  return false;
301  }
302  }
303 }
304 
305 __CG_QUALIFIER__ void thread_group::sync() const {
306  switch (this->_type) {
307  case internal::cg_multi_grid: {
308  static_cast<const multi_grid_group*>(this)->sync();
309  break;
310  }
311  case internal::cg_grid: {
312  static_cast<const grid_group*>(this)->sync();
313  break;
314  }
315  case internal::cg_workgroup: {
316  static_cast<const thread_block*>(this)->sync();
317  break;
318  }
319  case internal::cg_tiled_group: {
320  static_cast<const tiled_group*>(this)->sync();
321  break;
322  }
323  default: {
324  assert(false && "invalid cooperative group type");
325  }
326  }
327 }
328 
333 template <class CGTy> __CG_QUALIFIER__ uint32_t group_size(CGTy const& g) { return g.size(); }
334 
335 template <class CGTy> __CG_QUALIFIER__ uint32_t thread_rank(CGTy const& g) {
336  return g.thread_rank();
337 }
338 
339 template <class CGTy> __CG_QUALIFIER__ bool is_valid(CGTy const& g) { return g.is_valid(); }
340 
341 template <class CGTy> __CG_QUALIFIER__ void sync(CGTy const& g) { g.sync(); }
342 
343 template <unsigned int tileSize> class tile_base {
344  protected:
345  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
346 
347  public:
348  // Rank of the thread within this tile
349  _CG_STATIC_CONST_DECL_ unsigned int thread_rank() {
350  return (internal::workgroup::thread_rank() & (numThreads - 1));
351  }
352 
353  // Number of threads within this tile
354  __CG_STATIC_QUALIFIER__ unsigned int size() { return numThreads; }
355 };
356 
template <unsigned int size> class thread_block_tile_base : public tile_base<size> {
  static_assert(is_valid_tile_size<size>::value,
                "Tile size is either not a power of 2 or greater than the wavefront size");
  using tile_base<size>::numThreads;

 public:
  __CG_STATIC_QUALIFIER__ void sync() {
    // enforce ordering for memory instructions
    __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent");
  }

  // Broadcast: read `var` from the lane whose tile rank is `srcRank`.
  template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl(var, srcRank, numThreads));
  }

  // Read `var` from the lane `lane_delta` positions higher than the caller.
  template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl_down(var, lane_delta, numThreads));
  }

  // Read `var` from the lane `lane_delta` positions lower than the caller.
  template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl_up(var, lane_delta, numThreads));
  }

  // Read `var` from the lane whose rank is the caller's rank XOR `laneMask`.
  template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl_xor(var, laneMask, numThreads));
  }
};
388 
// Glue type combining the static tile interface (thread_block_tile_base)
// with the dynamic tiled_group representation.
template <unsigned int tileSize, class ParentCGTy = void>
class thread_block_tile_type : public thread_block_tile_base<tileSize>, public tiled_group {
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;

  // NOTE(review): self-referential friend declaration — appears redundant.
  friend class thread_block_tile_type<tileSize, ParentCGTy>;

  typedef thread_block_tile_base<numThreads> tbtBase;

 protected:
  // Initialize the tiled_group base with the compile-time tile size.
  __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
    tiled_info.size = numThreads;
    tiled_info.is_tiled = true;
  }

 public:
  // Prefer the compile-time-sized static implementations from
  // thread_block_tile_base over the runtime tiled_group versions.
  using tbtBase::size;
  using tbtBase::sync;
  using tbtBase::thread_rank;
};
413 
414 
421 __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size) {
422  if (parent.cg_type() == internal::cg_tiled_group) {
423  const tiled_group* cg = static_cast<const tiled_group*>(&parent);
424  return cg->new_tiled_group(tile_size);
425  } else {
426  const thread_block* tb = static_cast<const thread_block*>(&parent);
427  return tb->new_tiled_group(tile_size);
428  }
429 }
430 
431 // Thread block type overload
432 __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent, unsigned int tile_size) {
433  return (parent.new_tiled_group(tile_size));
434 }
435 
436 // Coalesced group type overload
437 __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, unsigned int tile_size) {
438  return (parent.new_tiled_group(tile_size));
439 }
440 
441 template <unsigned int size, class ParentCGTy> class thread_block_tile;
442 
443 namespace impl {
444 template <unsigned int size, class ParentCGTy> class thread_block_tile_internal;
445 
// Implementation core shared by every thread_block_tile instantiation.
// Carries no state of its own; both constructors default-construct the
// thread_block_tile_type base (the group argument is used only for
// overload selection).
template <unsigned int size, class ParentCGTy>
class thread_block_tile_internal : public thread_block_tile_type<size, ParentCGTy> {
 protected:
  // Convert from a tile of possibly different size/parent type.
  template <unsigned int tbtSize, class tbtParentT>
  __CG_QUALIFIER__ thread_block_tile_internal(
      const thread_block_tile_internal<tbtSize, tbtParentT>& g)
      : thread_block_tile_type<size, ParentCGTy>() {}

  // Construct from the thread block being partitioned.
  __CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g)
      : thread_block_tile_type<size, ParentCGTy>() {}
};
457 } // namespace impl
458 
// Statically-sized tile of a parent cooperative group. ParentCGTy records
// the type of the group that was partitioned.
template <unsigned int size, class ParentCGTy>
class thread_block_tile : public impl::thread_block_tile_internal<size, ParentCGTy> {
 protected:
  // Constructed only via tiled_partition<size>(parent).
  __CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g)
      : impl::thread_block_tile_internal<size, ParentCGTy>(g) {}

 public:
  // Allow implicit erasure of the parent type, yielding the parent-agnostic
  // thread_block_tile<size, void> form.
  __CG_QUALIFIER__ operator thread_block_tile<size, void>() const {
    return thread_block_tile<size, void>(*this);
  }
};
470 
471 
// Parent-agnostic tile: the ParentCGTy = void specialization that any
// thread_block_tile<size, ParentCGTy> converts to.
template <unsigned int size>
class thread_block_tile<size, void> : public impl::thread_block_tile_internal<size, void> {
  template <unsigned int, class ParentCGTy> friend class thread_block_tile;

 protected:
 public:
  // Convert from a tile that still remembers its parent group type.
  template <class ParentCGTy>
  __CG_QUALIFIER__ thread_block_tile(const thread_block_tile<size, ParentCGTy>& g)
      : impl::thread_block_tile_internal<size, void>(g) {}
};
482 
483 template <unsigned int size, class ParentCGTy = void> class thread_block_tile;
484 
485 namespace impl {
486 template <unsigned int size, class ParentCGTy = void> struct tiled_partition_internal;
487 
// Specialization for partitioning a thread_block: forwards straight to the
// thread_block_tile constructor.
template <unsigned int size>
struct tiled_partition_internal<size, thread_block> : public thread_block_tile<size, thread_block> {
  __CG_QUALIFIER__ tiled_partition_internal(const thread_block& g)
      : thread_block_tile<size, thread_block>(g) {}
};
493 
494 } // namespace impl
495 
// Create a statically-sized tile of `size` threads from parent group `g`.
// `size` must be a power of two no larger than the wavefront size,
// enforced at compile time by the static_assert below.
template <unsigned int size, class ParentCGTy>
__CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const ParentCGTy& g) {
  static_assert(is_valid_tile_size<size>::value,
                "Tiled partition with size > wavefront size. Currently not supported ");
  return impl::tiled_partition_internal<size, ParentCGTy>(g);
}
507 } // namespace cooperative_groups
508 
509 #endif // __cplusplus
510 #endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
hip_cooperative_groups_helper.h
Device side implementation of cooperative group feature.
dim3
Definition: hip_runtime_api.h:318