31 #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
32 #define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
36 #include <hip/amd_detail/device_functions.h>
38 #if !defined(__align__)
39 #define __align__(x) __attribute__((aligned(x)))
42 #if !defined(__CG_QUALIFIER__)
43 #define __CG_QUALIFIER__ __device__ __forceinline__
46 #if !defined(__CG_STATIC_QUALIFIER__)
47 #define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
50 #if !defined(_CG_STATIC_CONST_DECL_)
51 #define _CG_STATIC_CONST_DECL_ static constexpr
54 #if !defined(WAVEFRONT_SIZE)
55 #if __gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__
56 #define WAVEFRONT_SIZE 32
58 #define WAVEFRONT_SIZE 64
61 namespace cooperative_groups {
64 template <
unsigned int size>
65 using is_power_of_2 = std::integral_constant<bool, (size & (size - 1)) == 0>;
67 template <
unsigned int size>
68 using is_valid_wavefront = std::integral_constant<bool, (size <= WAVEFRONT_SIZE)>;
70 template <
unsigned int size>
71 using is_valid_tile_size =
72 std::integral_constant<bool, is_power_of_2<size>::value && is_valid_wavefront<size>::value>;
76 std::integral_constant<bool, std::is_integral<T>::value || std::is_floating_point<T>::value>;
93 namespace multi_grid {
95 __CG_STATIC_QUALIFIER__ uint32_t num_grids() {
return (uint32_t)__ockl_multi_grid_num_grids(); }
97 __CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
return (uint32_t)__ockl_multi_grid_grid_rank(); }
99 __CG_STATIC_QUALIFIER__ uint32_t size() {
return (uint32_t)__ockl_multi_grid_size(); }
101 __CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
return (uint32_t)__ockl_multi_grid_thread_rank(); }
103 __CG_STATIC_QUALIFIER__
bool is_valid() {
return (
bool)__ockl_multi_grid_is_valid(); }
105 __CG_STATIC_QUALIFIER__
void sync() { __ockl_multi_grid_sync(); }
114 __CG_STATIC_QUALIFIER__ uint32_t size() {
115 return (uint32_t)((hipBlockDim_z * hipGridDim_z) * (hipBlockDim_y * hipGridDim_y) *
116 (hipBlockDim_x * hipGridDim_x));
119 __CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
121 uint32_t blkIdx = (uint32_t)((hipBlockIdx_z * hipGridDim_y * hipGridDim_x) +
122 (hipBlockIdx_y * hipGridDim_x) + (hipBlockIdx_x));
126 uint32_t num_threads_till_current_workgroup =
127 (uint32_t)(blkIdx * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
130 uint32_t local_thread_rank = (uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
131 (hipThreadIdx_y * hipBlockDim_x) + (hipThreadIdx_x));
133 return (num_threads_till_current_workgroup + local_thread_rank);
136 __CG_STATIC_QUALIFIER__
bool is_valid() {
return (
bool)__ockl_grid_is_valid(); }
138 __CG_STATIC_QUALIFIER__
void sync() { __ockl_grid_sync(); }
146 namespace workgroup {
148 __CG_STATIC_QUALIFIER__
dim3 group_index() {
149 return (
dim3((uint32_t)hipBlockIdx_x, (uint32_t)hipBlockIdx_y, (uint32_t)hipBlockIdx_z));
152 __CG_STATIC_QUALIFIER__
dim3 thread_index() {
153 return (
dim3((uint32_t)hipThreadIdx_x, (uint32_t)hipThreadIdx_y, (uint32_t)hipThreadIdx_z));
156 __CG_STATIC_QUALIFIER__ uint32_t size() {
157 return ((uint32_t)(hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
160 __CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
161 return ((uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
162 (hipThreadIdx_y * hipBlockDim_x) + (hipThreadIdx_x)));
165 __CG_STATIC_QUALIFIER__
bool is_valid() {
170 __CG_STATIC_QUALIFIER__
void sync() { __syncthreads(); }
178 #endif // __cplusplus
179 #endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H