29 #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
30 #define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
32 #include <hip/amd_detail/hip_common.h>
48 #ifndef __HIP_ENABLE_DEVICE_MALLOC__
49 #define __HIP_ENABLE_DEVICE_MALLOC__ 0
52 #if __HIP_CLANG_ONLY__
54 #if !defined(__align__)
55 #define __align__(x) __attribute__((aligned(x)))
58 #define CUDA_SUCCESS hipSuccess
62 extern int HIP_TRACE_API;
65 #include <hip/amd_detail/hip_ldg.h>
67 #include <hip/amd_detail/hip_atomic.h>
69 #include <hip/amd_detail/device_functions.h>
70 #include <hip/amd_detail/surface_functions.h>
71 #include <hip/amd_detail/texture_fetch_functions.h>
72 #include <hip/amd_detail/texture_indirect_functions.h>
75 #if defined(__KALMAR_ACCELERATOR__) && !defined(__HCC_ACCELERATOR__)
76 #define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
80 #if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__
84 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
85 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
86 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
87 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
88 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1)
91 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
92 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (1)
95 #define __HIP_ARCH_HAS_DOUBLES__ (1)
98 #define __HIP_ARCH_HAS_WARP_VOTE__ (1)
99 #define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
100 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
101 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
104 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (1)
105 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
108 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
109 #define __HIP_ARCH_HAS_3DGRID__ (1)
110 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
115 #define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
116 __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
117 #define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
118 __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
119 amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
120 #define select_impl_(_1, _2, impl_, ...) impl_
121 #define __launch_bounds__(...) \
122 select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
124 __host__ inline void* __get_dynamicgroupbaseptr() {
return nullptr; }
126 #if __HIP_ARCH_GFX701__ == 0
128 __device__
unsigned __hip_ds_bpermute(
int index,
unsigned src);
129 __device__
float __hip_ds_bpermutef(
int index,
float src);
130 __device__
unsigned __hip_ds_permute(
int index,
unsigned src);
131 __device__
float __hip_ds_permutef(
int index,
float src);
133 template <
int pattern>
134 __device__
unsigned __hip_ds_swizzle_N(
unsigned int src);
135 template <
int pattern>
136 __device__
float __hip_ds_swizzlef_N(
float src);
138 template <
int dpp_ctrl,
int row_mask,
int bank_mask,
bool bound_ctrl>
139 __device__
int __hip_move_dpp_N(
int src);
141 #endif //__HIP_ARCH_GFX803__ == 1
143 #ifndef __OPENMP_AMDGCN__
144 #if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
145 #if __HIP_ENABLE_DEVICE_MALLOC__
146 extern "C" __device__
void* __hip_malloc(
size_t);
147 extern "C" __device__
void* __hip_free(
void* ptr);
148 static inline __device__
void* malloc(
size_t size) {
return __hip_malloc(size); }
149 static inline __device__
void* free(
void* ptr) {
return __hip_free(ptr); }
151 static inline __device__
void* malloc(
size_t size) { __builtin_trap();
return nullptr; }
152 static inline __device__
void* free(
void* ptr) { __builtin_trap();
return nullptr; }
154 #endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
155 #endif // !__OPENMP_AMDGCN__
165 #define HIP_KERNEL_NAME(...) __VA_ARGS__
166 #define HIP_SYMBOL(X) X
168 typedef int hipLaunchParm;
170 template <std::size_t n,
typename... Ts,
171 typename std::enable_if<n ==
sizeof...(Ts)>::type* =
nullptr>
172 void pArgs(
const std::tuple<Ts...>&,
void*) {}
174 template <std::size_t n,
typename... Ts,
175 typename std::enable_if<n !=
sizeof...(Ts)>::type* =
nullptr>
176 void pArgs(
const std::tuple<Ts...>& formals,
void** _vargs) {
177 using T =
typename std::tuple_element<n, std::tuple<Ts...> >::type;
179 static_assert(!std::is_reference<T>{},
180 "A __global__ function cannot have a reference as one of its "
182 #if defined(HIP_STRICT)
183 static_assert(std::is_trivially_copyable<T>{},
184 "Only TriviallyCopyable types can be arguments to a __global__ "
187 _vargs[n] =
const_cast<void*
>(
reinterpret_cast<const void*
>(&std::get<n>(formals)));
188 return pArgs<n + 1>(formals, _vargs);
191 template <
typename... Formals,
typename... Actuals>
192 std::tuple<Formals...> validateArgsCountType(
void (*kernel)(Formals...), std::tuple<Actuals...>(actuals)) {
193 static_assert(
sizeof...(Formals) ==
sizeof...(Actuals),
"Argument Count Mismatch");
194 std::tuple<Formals...> to_formals{std::move(actuals)};
198 #if defined(HIP_TEMPLATE_KERNEL_LAUNCH)
199 template <
typename... Args,
typename F = void (*)(Args...)>
200 void hipLaunchKernelGGL(F kernel,
const dim3& numBlocks,
const dim3& dimBlocks,
201 std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
202 constexpr
size_t count =
sizeof...(Args);
203 auto tup_ = std::tuple<Args...>{args...};
204 auto tup = validateArgsCountType(kernel, tup_);
206 pArgs<0>(tup, _Args);
208 auto k =
reinterpret_cast<void*
>(kernel);
209 hipLaunchKernel(k, numBlocks, dimBlocks, _Args, sharedMemBytes, stream);
212 #define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \
214 kernelName<<<(numBlocks), (numThreads), (memPerBlock), (streamId)>>>(__VA_ARGS__); \
217 #define hipLaunchKernelGGL(kernelName, ...) hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
221 extern "C" __device__ __attribute__((
const)) size_t __ockl_get_local_id(uint);
222 extern "C" __device__ __attribute__((const))
size_t __ockl_get_group_id(uint);
223 extern "C" __device__ __attribute__((const))
size_t __ockl_get_local_size(uint);
224 extern "C" __device__ __attribute__((const))
size_t __ockl_get_num_groups(uint);
225 struct __HIP_BlockIdx {
227 std::uint32_t operator()(std::uint32_t x)
const noexcept {
return __ockl_get_group_id(x); }
229 struct __HIP_BlockDim {
231 std::uint32_t operator()(std::uint32_t x)
const noexcept {
232 return __ockl_get_local_size(x);
235 struct __HIP_GridDim {
237 std::uint32_t operator()(std::uint32_t x)
const noexcept {
238 return __ockl_get_num_groups(x);
241 struct __HIP_ThreadIdx {
243 std::uint32_t operator()(std::uint32_t x)
const noexcept {
244 return __ockl_get_local_id(x);
248 template <
typename F>
249 struct __HIP_Coordinates {
250 using R = decltype(F{}(0));
252 struct X { __device__
operator R() const noexcept {
return F{}(0); } };
253 struct Y { __device__
operator R() const noexcept {
return F{}(1); } };
254 struct Z { __device__
operator R() const noexcept {
return F{}(2); } };
256 static constexpr X x{};
257 static constexpr Y y{};
258 static constexpr Z z{};
260 __device__
operator dim3()
const {
return dim3(x, y, z); }
264 template <
typename F>
265 #if !defined(_MSC_VER)
266 __attribute__((weak))
268 constexpr
typename __HIP_Coordinates<F>::X __HIP_Coordinates<F>::x;
269 template <
typename F>
270 #if !defined(_MSC_VER)
271 __attribute__((weak))
273 constexpr
typename __HIP_Coordinates<F>::Y __HIP_Coordinates<F>::y;
274 template <
typename F>
275 #if !defined(_MSC_VER)
276 __attribute__((weak))
278 constexpr
typename __HIP_Coordinates<F>::Z __HIP_Coordinates<F>::z;
280 extern "C" __device__ __attribute__((
const)) size_t __ockl_get_global_size(uint);
283 std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::X,
284 __HIP_Coordinates<__HIP_BlockDim>::X) noexcept {
285 return __ockl_get_global_size(0);
289 std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::X,
290 __HIP_Coordinates<__HIP_GridDim>::X) noexcept {
291 return __ockl_get_global_size(0);
295 std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::Y,
296 __HIP_Coordinates<__HIP_BlockDim>::Y) noexcept {
297 return __ockl_get_global_size(1);
301 std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::Y,
302 __HIP_Coordinates<__HIP_GridDim>::Y) noexcept {
303 return __ockl_get_global_size(1);
307 std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::Z,
308 __HIP_Coordinates<__HIP_BlockDim>::Z) noexcept {
309 return __ockl_get_global_size(2);
313 std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::Z,
314 __HIP_Coordinates<__HIP_GridDim>::Z) noexcept {
315 return __ockl_get_global_size(2);
318 static constexpr __HIP_Coordinates<__HIP_BlockDim> blockDim{};
319 static constexpr __HIP_Coordinates<__HIP_BlockIdx> blockIdx{};
320 static constexpr __HIP_Coordinates<__HIP_GridDim> gridDim{};
321 static constexpr __HIP_Coordinates<__HIP_ThreadIdx> threadIdx{};
323 extern "C" __device__ __attribute__((
const)) size_t __ockl_get_local_id(uint);
324 #define hipThreadIdx_x (__ockl_get_local_id(0))
325 #define hipThreadIdx_y (__ockl_get_local_id(1))
326 #define hipThreadIdx_z (__ockl_get_local_id(2))
328 extern "C" __device__ __attribute__((
const)) size_t __ockl_get_group_id(uint);
329 #define hipBlockIdx_x (__ockl_get_group_id(0))
330 #define hipBlockIdx_y (__ockl_get_group_id(1))
331 #define hipBlockIdx_z (__ockl_get_group_id(2))
333 extern "C" __device__ __attribute__((
const)) size_t __ockl_get_local_size(uint);
334 #define hipBlockDim_x (__ockl_get_local_size(0))
335 #define hipBlockDim_y (__ockl_get_local_size(1))
336 #define hipBlockDim_z (__ockl_get_local_size(2))
338 extern "C" __device__ __attribute__((
const)) size_t __ockl_get_num_groups(uint);
339 #define hipGridDim_x (__ockl_get_num_groups(0))
340 #define hipGridDim_y (__ockl_get_num_groups(1))
341 #define hipGridDim_z (__ockl_get_num_groups(2))
343 #include <hip/amd_detail/math_functions.h>
345 #if __HIP_HCC_COMPAT_MODE__
347 #pragma push_macro("__DEFINE_HCC_FUNC")
348 #define __DEFINE_HCC_FUNC(hc_fun,hip_var) \
349 inline __device__ __attribute__((always_inline)) uint hc_get_##hc_fun(uint i) { \
358 __DEFINE_HCC_FUNC(workitem_id, threadIdx)
359 __DEFINE_HCC_FUNC(group_id, blockIdx)
360 __DEFINE_HCC_FUNC(group_size, blockDim)
361 __DEFINE_HCC_FUNC(num_groups, gridDim)
362 #pragma pop_macro("__DEFINE_HCC_FUNC")
364 extern "C" __device__ __attribute__((
const)) size_t __ockl_get_global_id(uint);
365 inline __device__ __attribute__((always_inline)) uint
366 hc_get_workitem_absolute_id(
int dim)
368 return (uint)__ockl_get_global_id(dim);
373 #if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
375 #if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
376 #pragma push_macro("__CUDA__")
378 #include <__clang_cuda_math_forward_declares.h>
379 #include <__clang_cuda_complex_builtins.h>
385 #include <include/cuda_wrappers/algorithm>
386 #include <include/cuda_wrappers/complex>
387 #include <include/cuda_wrappers/new>
389 #pragma pop_macro("__CUDA__")
390 #endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
391 #endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
392 #endif // __HIP_CLANG_ONLY__
394 #include <hip/amd_detail/hip_memory.h>
396 #endif // HIP_AMD_DETAIL_RUNTIME_H