HIP: Heterogenous-computing Interface for Portability
hip_runtime.h
Go to the documentation of this file.
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
28 //#pragma once
29 #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
30 #define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
31 
32 #include <hip/amd_detail/hip_common.h>
33 
34 //---
35 // Top part of file can be compiled with any compiler
36 
37 #if !defined(__HIPCC_RTC__)
38 //#include <cstring>
39 #if __cplusplus
40 #include <cmath>
41 #include <cstdint>
42 #else
43 #include <math.h>
44 #include <string.h>
45 #include <stddef.h>
46 #endif // __cplusplus
47 #endif // !defined(__HIPCC_RTC__)
48 
49 // __hip_malloc is not working. Disable it by default.
50 #ifndef __HIP_ENABLE_DEVICE_MALLOC__
51 #define __HIP_ENABLE_DEVICE_MALLOC__ 0
52 #endif
53 
54 #if __HIP_CLANG_ONLY__
55 
56 #if !defined(__align__)
57 #define __align__(x) __attribute__((aligned(x)))
58 #endif
59 
60 #define CUDA_SUCCESS hipSuccess
61 
62 #if !defined(__HIPCC_RTC__)
63 #include <hip/hip_runtime_api.h>
64 extern int HIP_TRACE_API;
65 #endif // !defined(__HIPCC_RTC__)
66 
67 #ifdef __cplusplus
68 #include <hip/amd_detail/hip_ldg.h>
69 #endif
70 #include <hip/amd_detail/hip_atomic.h>
72 #include <hip/amd_detail/device_functions.h>
73 #include <hip/amd_detail/surface_functions.h>
74 #include <hip/amd_detail/texture_fetch_functions.h>
75 #include <hip/amd_detail/texture_indirect_functions.h>
76 
77 // TODO-HCC remove old definitions ; ~1602 hcc supports __HCC_ACCELERATOR__ define.
78 #if defined(__KALMAR_ACCELERATOR__) && !defined(__HCC_ACCELERATOR__)
79 #define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
80 #endif
81 
82 // Feature tests:
83 #if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__
84 // Device compile and not host compile:
85 
86 // 32-bit Atomics:
87 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
88 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
89 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
90 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
91 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1)
92 
93 // 64-bit Atomics:
94 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
95 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (1)
96 
97 // Doubles
98 #define __HIP_ARCH_HAS_DOUBLES__ (1)
99 
100 // warp cross-lane operations:
101 #define __HIP_ARCH_HAS_WARP_VOTE__ (1)
102 #define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
103 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
104 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
105 
106 // sync
107 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (1)
108 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
109 
110 // misc
111 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
112 #define __HIP_ARCH_HAS_3DGRID__ (1)
113 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
114 
115 #endif /* Device feature flags */
116 
117 
118 #define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
119  __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
120 #define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
121  __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
122  amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
123 #define select_impl_(_1, _2, impl_, ...) impl_
124 #define __launch_bounds__(...) \
125  select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
126 
127 #if !defined(__HIPCC_RTC__)
128 __host__ inline void* __get_dynamicgroupbaseptr() { return nullptr; }
129 #endif // !defined(__HIPCC_RTC__)
130 
131 #if __HIP_ARCH_GFX701__ == 0
132 
133 __device__ unsigned __hip_ds_bpermute(int index, unsigned src);
134 __device__ float __hip_ds_bpermutef(int index, float src);
135 __device__ unsigned __hip_ds_permute(int index, unsigned src);
136 __device__ float __hip_ds_permutef(int index, float src);
137 
138 template <int pattern>
139 __device__ unsigned __hip_ds_swizzle_N(unsigned int src);
140 template <int pattern>
141 __device__ float __hip_ds_swizzlef_N(float src);
142 
143 template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
144 __device__ int __hip_move_dpp_N(int src);
145 
146 #endif //__HIP_ARCH_GFX803__ == 1
147 
148 #ifndef __OPENMP_AMDGCN__
149 #if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
150 #if __HIP_ENABLE_DEVICE_MALLOC__
151 extern "C" __device__ void* __hip_malloc(size_t);
152 extern "C" __device__ void* __hip_free(void* ptr);
153 static inline __device__ void* malloc(size_t size) { return __hip_malloc(size); }
154 static inline __device__ void* free(void* ptr) { return __hip_free(ptr); }
155 #else
156 static inline __device__ void* malloc(size_t size) { __builtin_trap(); return nullptr; }
157 static inline __device__ void* free(void* ptr) { __builtin_trap(); return nullptr; }
158 #endif
159 #endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
160 #endif // !__OPENMP_AMDGCN__
161 
162 // End doxygen API:
167 //
168 // hip-clang functions
169 //
170 #if !defined(__HIPCC_RTC__)
171 #define HIP_KERNEL_NAME(...) __VA_ARGS__
172 #define HIP_SYMBOL(X) X
173 
174 typedef int hipLaunchParm;
175 
176 template <std::size_t n, typename... Ts,
177  typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
178 void pArgs(const std::tuple<Ts...>&, void*) {}
179 
180 template <std::size_t n, typename... Ts,
181  typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
182 void pArgs(const std::tuple<Ts...>& formals, void** _vargs) {
183  using T = typename std::tuple_element<n, std::tuple<Ts...> >::type;
184 
185  static_assert(!std::is_reference<T>{},
186  "A __global__ function cannot have a reference as one of its "
187  "arguments.");
188 #if defined(HIP_STRICT)
189  static_assert(std::is_trivially_copyable<T>{},
190  "Only TriviallyCopyable types can be arguments to a __global__ "
191  "function");
192 #endif
193  _vargs[n] = const_cast<void*>(reinterpret_cast<const void*>(&std::get<n>(formals)));
194  return pArgs<n + 1>(formals, _vargs);
195 }
196 
197 template <typename... Formals, typename... Actuals>
198 std::tuple<Formals...> validateArgsCountType(void (*kernel)(Formals...), std::tuple<Actuals...>(actuals)) {
199  static_assert(sizeof...(Formals) == sizeof...(Actuals), "Argument Count Mismatch");
200  std::tuple<Formals...> to_formals{std::move(actuals)};
201  return to_formals;
202 }
203 
204 #if defined(HIP_TEMPLATE_KERNEL_LAUNCH)
205 template <typename... Args, typename F = void (*)(Args...)>
206 void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
207  std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
208  constexpr size_t count = sizeof...(Args);
209  auto tup_ = std::tuple<Args...>{args...};
210  auto tup = validateArgsCountType(kernel, tup_);
211  void* _Args[count];
212  pArgs<0>(tup, _Args);
213 
214  auto k = reinterpret_cast<void*>(kernel);
215  hipLaunchKernel(k, numBlocks, dimBlocks, _Args, sharedMemBytes, stream);
216 }
217 #else
218 #define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \
219  do { \
220  kernelName<<<(numBlocks), (numThreads), (memPerBlock), (streamId)>>>(__VA_ARGS__); \
221  } while (0)
222 
223 #define hipLaunchKernelGGL(kernelName, ...) hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
224 #endif
225 
226 #include <hip/hip_runtime_api.h>
227 #endif // !defined(__HIPCC_RTC__)
228 
229 extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(uint);
230 extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(uint);
231 extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(uint);
232 extern "C" __device__ __attribute__((const)) size_t __ockl_get_num_groups(uint);
233 struct __HIP_BlockIdx {
234  __device__
235  std::uint32_t operator()(std::uint32_t x) const noexcept { return __ockl_get_group_id(x); }
236 };
237 struct __HIP_BlockDim {
238  __device__
239  std::uint32_t operator()(std::uint32_t x) const noexcept {
240  return __ockl_get_local_size(x);
241  }
242 };
243 struct __HIP_GridDim {
244  __device__
245  std::uint32_t operator()(std::uint32_t x) const noexcept {
246  return __ockl_get_num_groups(x);
247  }
248 };
249 struct __HIP_ThreadIdx {
250  __device__
251  std::uint32_t operator()(std::uint32_t x) const noexcept {
252  return __ockl_get_local_id(x);
253  }
254 };
255 
256 #if defined(__HIPCC_RTC__)
257 typedef struct dim3 {
258  uint32_t x;
259  uint32_t y;
260  uint32_t z;
261 #ifdef __cplusplus
262  constexpr __device__ dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1) : x(_x), y(_y), z(_z){};
263 #endif
264 } dim3;
265 #endif // !defined(__HIPCC_RTC__)
266 
267 template <typename F>
268 struct __HIP_Coordinates {
269  using R = decltype(F{}(0));
270 
271  struct __X { __device__ operator R() const noexcept { return F{}(0); } };
272  struct __Y { __device__ operator R() const noexcept { return F{}(1); } };
273  struct __Z { __device__ operator R() const noexcept { return F{}(2); } };
274 
275  static constexpr __X x{};
276  static constexpr __Y y{};
277  static constexpr __Z z{};
278 #ifdef __cplusplus
279  __device__ operator dim3() const { return dim3(x, y, z); }
280 #endif
281 
282 };
283 template <typename F>
284 #if !defined(_MSC_VER)
285 __attribute__((weak))
286 #endif
287 constexpr typename __HIP_Coordinates<F>::__X __HIP_Coordinates<F>::x;
288 template <typename F>
289 #if !defined(_MSC_VER)
290 __attribute__((weak))
291 #endif
292 constexpr typename __HIP_Coordinates<F>::__Y __HIP_Coordinates<F>::y;
293 template <typename F>
294 #if !defined(_MSC_VER)
295 __attribute__((weak))
296 #endif
297 constexpr typename __HIP_Coordinates<F>::__Z __HIP_Coordinates<F>::z;
298 
299 extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_size(uint);
300 inline
301 __device__
302 std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__X,
303  __HIP_Coordinates<__HIP_BlockDim>::__X) noexcept {
304  return __ockl_get_global_size(0);
305 }
306 inline
307 __device__
308 std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__X,
309  __HIP_Coordinates<__HIP_GridDim>::__X) noexcept {
310  return __ockl_get_global_size(0);
311 }
312 inline
313 __device__
314 std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Y,
315  __HIP_Coordinates<__HIP_BlockDim>::__Y) noexcept {
316  return __ockl_get_global_size(1);
317 }
318 inline
319 __device__
320 std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Y,
321  __HIP_Coordinates<__HIP_GridDim>::__Y) noexcept {
322  return __ockl_get_global_size(1);
323 }
324 inline
325 __device__
326 std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Z,
327  __HIP_Coordinates<__HIP_BlockDim>::__Z) noexcept {
328  return __ockl_get_global_size(2);
329 }
330 inline
331 __device__
332 std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Z,
333  __HIP_Coordinates<__HIP_GridDim>::__Z) noexcept {
334  return __ockl_get_global_size(2);
335 }
336 
337 static constexpr __HIP_Coordinates<__HIP_BlockDim> blockDim{};
338 static constexpr __HIP_Coordinates<__HIP_BlockIdx> blockIdx{};
339 static constexpr __HIP_Coordinates<__HIP_GridDim> gridDim{};
340 static constexpr __HIP_Coordinates<__HIP_ThreadIdx> threadIdx{};
341 
342 extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(uint);
343 #define hipThreadIdx_x (__ockl_get_local_id(0))
344 #define hipThreadIdx_y (__ockl_get_local_id(1))
345 #define hipThreadIdx_z (__ockl_get_local_id(2))
346 
347 extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(uint);
348 #define hipBlockIdx_x (__ockl_get_group_id(0))
349 #define hipBlockIdx_y (__ockl_get_group_id(1))
350 #define hipBlockIdx_z (__ockl_get_group_id(2))
351 
352 extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(uint);
353 #define hipBlockDim_x (__ockl_get_local_size(0))
354 #define hipBlockDim_y (__ockl_get_local_size(1))
355 #define hipBlockDim_z (__ockl_get_local_size(2))
356 
357 extern "C" __device__ __attribute__((const)) size_t __ockl_get_num_groups(uint);
358 #define hipGridDim_x (__ockl_get_num_groups(0))
359 #define hipGridDim_y (__ockl_get_num_groups(1))
360 #define hipGridDim_z (__ockl_get_num_groups(2))
361 
362 #include <hip/amd_detail/math_functions.h>
363 
364 #if __HIP_HCC_COMPAT_MODE__
365 // Define HCC work item functions in terms of HIP builtin variables.
366 #pragma push_macro("__DEFINE_HCC_FUNC")
367 #define __DEFINE_HCC_FUNC(hc_fun,hip_var) \
368 inline __device__ __attribute__((always_inline)) uint hc_get_##hc_fun(uint i) { \
369  if (i==0) \
370  return hip_var.x; \
371  else if(i==1) \
372  return hip_var.y; \
373  else \
374  return hip_var.z; \
375 }
376 
377 __DEFINE_HCC_FUNC(workitem_id, threadIdx)
378 __DEFINE_HCC_FUNC(group_id, blockIdx)
379 __DEFINE_HCC_FUNC(group_size, blockDim)
380 __DEFINE_HCC_FUNC(num_groups, gridDim)
381 #pragma pop_macro("__DEFINE_HCC_FUNC")
382 
383 extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_id(uint);
384 inline __device__ __attribute__((always_inline)) uint
385 hc_get_workitem_absolute_id(int dim)
386 {
387  return (uint)__ockl_get_global_id(dim);
388 }
389 
390 #endif
391 
392 #if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
393 #if !defined(__HIPCC_RTC__)
394 // Support std::complex.
395 #if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
396 #pragma push_macro("__CUDA__")
397 #define __CUDA__
398 #include <__clang_cuda_math_forward_declares.h>
399 #include <__clang_cuda_complex_builtins.h>
400 // Workaround for using libc++ with HIP-Clang.
401 // The following headers requires clang include path before standard C++ include path.
402 // However libc++ include path requires to be before clang include path.
403 // To workaround this, we pass -isystem with the parent directory of clang include
404 // path instead of the clang include path itself.
405 #include <include/cuda_wrappers/algorithm>
406 #include <include/cuda_wrappers/complex>
407 #include <include/cuda_wrappers/new>
408 #undef __CUDA__
409 #pragma pop_macro("__CUDA__")
410 #endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
411 #endif // !defined(__HIPCC_RTC__)
412 #endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
413 #endif // __HIP_CLANG_ONLY__
414 
415 #include <hip/amd_detail/hip_memory.h>
416 
417 #endif // HIP_AMD_DETAIL_RUNTIME_H
hip_runtime_api.h
Defines the API signatures for HIP runtime. This file can be compiled with a standard compiler.
hipLaunchKernel
hipError_t hipLaunchKernel(const void *function_address, dim3 numBlocks, dim3 dimBlocks, void **args, size_t sharedMemBytes __dparm(0), hipStream_t stream __dparm(0))
C compliant kernel launch API.
dim3::y
uint32_t y
y
Definition: hip_runtime_api.h:320
dim3::x
uint32_t x
x
Definition: hip_runtime_api.h:319
dim3
struct dim3 dim3
__host__
#define __host__
Definition: host_defines.h:59
host_defines.h
TODO-doc.
dim3
Definition: hip_runtime_api.h:318
dim3::z
uint32_t z
z
Definition: hip_runtime_api.h:321