HIP: Heterogeneous-computing Interface for Portability
hip_runtime.h
Go to the documentation of this file.
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
28 //#pragma once
29 #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
30 #define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
31 
32 #include <hip/amd_detail/hip_common.h>
33 
34 //---
35 // Top part of file can be compiled with any compiler
36 
37 //#include <cstring>
38 #if __cplusplus
39 #include <cmath>
40 #include <cstdint>
41 #else
42 #include <math.h>
43 #include <string.h>
44 #include <stddef.h>
45 #endif //__cplusplus
46 
47 // __hip_malloc is not working. Disable it by default.
48 #ifndef __HIP_ENABLE_DEVICE_MALLOC__
49 #define __HIP_ENABLE_DEVICE_MALLOC__ 0
50 #endif
51 
52 #if __HIP_CLANG_ONLY__
53 
54 #if !defined(__align__)
55 #define __align__(x) __attribute__((aligned(x)))
56 #endif
57 
58 #define CUDA_SUCCESS hipSuccess
59 
60 #include <hip/hip_runtime_api.h>
61 
62 extern int HIP_TRACE_API;
63 
64 #ifdef __cplusplus
65 #include <hip/amd_detail/hip_ldg.h>
66 #endif
67 #include <hip/amd_detail/hip_atomic.h>
69 #include <hip/amd_detail/device_functions.h>
70 #include <hip/amd_detail/surface_functions.h>
71 #include <hip/amd_detail/texture_fetch_functions.h>
72 #include <hip/amd_detail/texture_indirect_functions.h>
73 
74 // TODO-HCC remove old definitions ; ~1602 hcc supports __HCC_ACCELERATOR__ define.
75 #if defined(__KALMAR_ACCELERATOR__) && !defined(__HCC_ACCELERATOR__)
76 #define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
77 #endif
78 
79 // Feature tests:
80 #if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__
81 // Device compile and not host compile:
82 
83 // 32-bit Atomics:
84 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
85 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
86 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
87 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
88 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1)
89 
90 // 64-bit Atomics:
91 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
92 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (1)
93 
94 // Doubles
95 #define __HIP_ARCH_HAS_DOUBLES__ (1)
96 
97 // warp cross-lane operations:
98 #define __HIP_ARCH_HAS_WARP_VOTE__ (1)
99 #define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
100 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
101 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
102 
103 // sync
104 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (1)
105 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
106 
107 // misc
108 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
109 #define __HIP_ARCH_HAS_3DGRID__ (1)
110 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
111 
112 #endif /* Device feature flags */
113 
114 
// __launch_bounds__ dispatches on argument count: one argument caps the flat
// work-group size; two arguments additionally request amdgpu_waves_per_eu
// (HIP's occupancy hint corresponding to CUDA's minBlocksPerMultiprocessor —
// the mapping is by macro parameter name; exact occupancy semantics differ).
#define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
                   amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
// Argument-counting trick: with two user args, impl1 lands in the impl_ slot;
// with one user arg, impl0 does.
#define select_impl_(_1, _2, impl_, ...) impl_
#define __launch_bounds__(...) \
    select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
123 
// Host-side stub of the dynamic group-segment base-pointer query (presumably
// the device-side shared-memory base — confirm against the device overload);
// on the host there is no such segment, so it always yields nullptr.
__host__ inline void* __get_dynamicgroupbaseptr() { return nullptr; }
125 
// Cross-lane data-movement builtins, declared only when NOT targeting gfx701
// (the guard implies gfx701 lacks these instructions — TODO confirm).
#if __HIP_ARCH_GFX701__ == 0

// ds_bpermute/ds_permute wrappers: move a 32-bit value between wavefront
// lanes, with `index` selecting the source (bpermute) or destination
// (permute) lane — NOTE(review): direction semantics inferred from the AMD
// intrinsic names; verify against the ISA documentation.
__device__ unsigned __hip_ds_bpermute(int index, unsigned src);
__device__ float __hip_ds_bpermutef(int index, float src);
__device__ unsigned __hip_ds_permute(int index, unsigned src);
__device__ float __hip_ds_permutef(int index, float src);

// ds_swizzle with a compile-time lane-swizzle pattern.
template <int pattern>
__device__ unsigned __hip_ds_swizzle_N(unsigned int src);
template <int pattern>
__device__ float __hip_ds_swizzlef_N(float src);

// DPP move with compile-time control, row/bank masks and bound control.
template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
__device__ int __hip_move_dpp_N(int src);

#endif  // __HIP_ARCH_GFX701__ == 0
142 
#ifndef __OPENMP_AMDGCN__
#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
#if __HIP_ENABLE_DEVICE_MALLOC__
// Device-heap hooks supplied by the HIP device library.
// NOTE(review): free() returns void* here (mirroring __hip_free's signature)
// rather than the standard void — presumably kept for source compatibility;
// confirm before normalizing the signature.
extern "C" __device__ void* __hip_malloc(size_t);
extern "C" __device__ void* __hip_free(void* ptr);
static inline __device__ void* malloc(size_t size) { return __hip_malloc(size); }
static inline __device__ void* free(void* ptr) { return __hip_free(ptr); }
#else
// Device malloc is disabled by default (see __HIP_ENABLE_DEVICE_MALLOC__
// above): any in-kernel allocation or free traps immediately.
static inline __device__ void* malloc(size_t size) { __builtin_trap(); return nullptr; }
static inline __device__ void* free(void* ptr) { __builtin_trap(); return nullptr; }
#endif
#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
#endif // !__OPENMP_AMDGCN__
156 
157 // End doxygen API:
162 //
163 // hip-clang functions
164 //
// Pass-through wrappers so kernel names that contain commas (template
// instantiations such as Kernel<A, B>) survive expansion inside the
// hipLaunchKernelGGL macro below.
#define HIP_KERNEL_NAME(...) __VA_ARGS__
#define HIP_SYMBOL(X) X

// Placeholder type from the old hipLaunchKernel-style API — appears unused
// by the launch machinery in this file; kept for source compatibility.
typedef int hipLaunchParm;
169 
// Recursion terminator: all sizeof...(Ts) tuple elements have been packed.
template <std::size_t n, typename... Ts,
          typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
void pArgs(const std::tuple<Ts...>&, void*) {}

// Packs the address of each element of `formals` into the _vargs array,
// producing the void** argument block that hipLaunchKernel expects.
// The pointers refer INTO `formals`, which must outlive the launch call.
template <std::size_t n, typename... Ts,
          typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
void pArgs(const std::tuple<Ts...>& formals, void** _vargs) {
    using T = typename std::tuple_element<n, std::tuple<Ts...> >::type;

    static_assert(!std::is_reference<T>{},
                  "A __global__ function cannot have a reference as one of its "
                  "arguments.");
#if defined(HIP_STRICT)
    static_assert(std::is_trivially_copyable<T>{},
                  "Only TriviallyCopyable types can be arguments to a __global__ "
                  "function");
#endif
    // The launch API takes non-const void*, but the tuple is accessed through
    // a const reference — hence the const_cast after erasing the type.
    _vargs[n] = const_cast<void*>(reinterpret_cast<const void*>(&std::get<n>(formals)));
    return pArgs<n + 1>(formals, _vargs);
}
190 
// Compile-time arity check plus conversion of the caller's argument tuple to
// a tuple of the kernel's formal parameter types; implicit conversions are
// applied element-wise by std::tuple's converting constructor.
template <typename... Formals, typename... Actuals>
std::tuple<Formals...> validateArgsCountType(void (*kernel)(Formals...), std::tuple<Actuals...>(actuals)) {
    static_assert(sizeof...(Formals) == sizeof...(Actuals), "Argument Count Mismatch");
    return std::tuple<Formals...>{std::move(actuals)};
}
197 
#if defined(HIP_TEMPLATE_KERNEL_LAUNCH)
// Template form of the launch wrapper: copies the actual arguments, converts
// them to the kernel's formal parameter types (with a compile-time arity
// check), builds the void** argument block, and forwards to hipLaunchKernel.
template <typename... Args, typename F = void (*)(Args...)>
void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
                        std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
    constexpr size_t count = sizeof...(Args);
    auto tup_ = std::tuple<Args...>{args...};
    auto tup = validateArgsCountType(kernel, tup_);
    // NOTE(review): count == 0 makes this a zero-length array, a compiler
    // extension — confirm zero-argument kernels launch through this path.
    void* _Args[count];
    pArgs<0>(tup, _Args);

    auto k = reinterpret_cast<void*>(kernel);
    hipLaunchKernel(k, numBlocks, dimBlocks, _Args, sharedMemBytes, stream);
}
#else
// Macro form: expands directly to the chevron launch syntax that hip-clang
// lowers itself; the do/while(0) makes the expansion a single statement.
#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \
    do {                                                                                          \
        kernelName<<<(numBlocks), (numThreads), (memPerBlock), (streamId)>>>(__VA_ARGS__);        \
    } while (0)

#define hipLaunchKernelGGL(kernelName, ...) hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
#endif
219 
220 #include <hip/hip_runtime_api.h>
// OCKL device-library workitem queries. __attribute__((const)) tells the
// compiler repeated calls with the same argument may be CSE'd.
extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(uint);
extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(uint);
extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(uint);
extern "C" __device__ __attribute__((const)) size_t __ockl_get_num_groups(uint);
// Policy functors mapping a dimension index (0=x, 1=y, 2=z) to the matching
// OCKL query; used as the template parameter of __HIP_Coordinates below.
struct __HIP_BlockIdx {
    __device__
    std::uint32_t operator()(std::uint32_t x) const noexcept { return __ockl_get_group_id(x); }
};
struct __HIP_BlockDim {
    __device__
    std::uint32_t operator()(std::uint32_t x) const noexcept {
        return __ockl_get_local_size(x);
    }
};
struct __HIP_GridDim {
    __device__
    std::uint32_t operator()(std::uint32_t x) const noexcept {
        return __ockl_get_num_groups(x);
    }
};
struct __HIP_ThreadIdx {
    __device__
    std::uint32_t operator()(std::uint32_t x) const noexcept {
        return __ockl_get_local_id(x);
    }
};
247 
// Lazily-evaluated coordinate triple. Each of x/y/z is an empty tag type
// whose conversion operator invokes the policy functor F for its dimension,
// so e.g. threadIdx.x only issues the workitem query when actually read.
template <typename F>
struct __HIP_Coordinates {
    // R is the functor's result type (std::uint32_t for the functors above).
    using R = decltype(F{}(0));

    struct X { __device__ operator R() const noexcept { return F{}(0); } };
    struct Y { __device__ operator R() const noexcept { return F{}(1); } };
    struct Z { __device__ operator R() const noexcept { return F{}(2); } };

    static constexpr X x{};
    static constexpr Y y{};
    static constexpr Z z{};
#ifdef __cplusplus
    // Whole-triple conversion, e.g. `dim3 d = threadIdx;` — queries all three.
    __device__ operator dim3() const { return dim3(x, y, z); }
#endif

};
// Out-of-line definitions for the static constexpr members (needed for
// ODR-uses before C++17 inline variables). Marked weak on non-MSVC so
// instantiations in multiple translation units link without duplicates.
template <typename F>
#if !defined(_MSC_VER)
__attribute__((weak))
#endif
constexpr typename __HIP_Coordinates<F>::X __HIP_Coordinates<F>::x;
template <typename F>
#if !defined(_MSC_VER)
__attribute__((weak))
#endif
constexpr typename __HIP_Coordinates<F>::Y __HIP_Coordinates<F>::y;
template <typename F>
#if !defined(_MSC_VER)
__attribute__((weak))
#endif
constexpr typename __HIP_Coordinates<F>::Z __HIP_Coordinates<F>::z;
279 
extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_size(uint);
// The six overloads below fuse gridDim.<d> * blockDim.<d> (in either operand
// order) into a single __ockl_get_global_size(<d>) call instead of two
// separate queries multiplied together.
inline
__device__
std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::X,
                        __HIP_Coordinates<__HIP_BlockDim>::X) noexcept {
    return __ockl_get_global_size(0);
}
inline
__device__
std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::X,
                        __HIP_Coordinates<__HIP_GridDim>::X) noexcept {
    return __ockl_get_global_size(0);
}
inline
__device__
std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::Y,
                        __HIP_Coordinates<__HIP_BlockDim>::Y) noexcept {
    return __ockl_get_global_size(1);
}
inline
__device__
std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::Y,
                        __HIP_Coordinates<__HIP_GridDim>::Y) noexcept {
    return __ockl_get_global_size(1);
}
inline
__device__
std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::Z,
                        __HIP_Coordinates<__HIP_BlockDim>::Z) noexcept {
    return __ockl_get_global_size(2);
}
inline
__device__
std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::Z,
                        __HIP_Coordinates<__HIP_GridDim>::Z) noexcept {
    return __ockl_get_global_size(2);
}
317 
// The CUDA-style builtin coordinate objects. The coordinate types are empty
// tags, so these constexpr globals carry no runtime state of their own.
static constexpr __HIP_Coordinates<__HIP_BlockDim> blockDim{};
static constexpr __HIP_Coordinates<__HIP_BlockIdx> blockIdx{};
static constexpr __HIP_Coordinates<__HIP_GridDim> gridDim{};
static constexpr __HIP_Coordinates<__HIP_ThreadIdx> threadIdx{};

// hipFoo_x/y/z macro spellings, expressed directly in terms of the OCKL
// queries. Each extern is redeclared so every macro group is self-contained.
extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(uint);
#define hipThreadIdx_x (__ockl_get_local_id(0))
#define hipThreadIdx_y (__ockl_get_local_id(1))
#define hipThreadIdx_z (__ockl_get_local_id(2))

extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(uint);
#define hipBlockIdx_x (__ockl_get_group_id(0))
#define hipBlockIdx_y (__ockl_get_group_id(1))
#define hipBlockIdx_z (__ockl_get_group_id(2))

extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(uint);
#define hipBlockDim_x (__ockl_get_local_size(0))
#define hipBlockDim_y (__ockl_get_local_size(1))
#define hipBlockDim_z (__ockl_get_local_size(2))

extern "C" __device__ __attribute__((const)) size_t __ockl_get_num_groups(uint);
#define hipGridDim_x (__ockl_get_num_groups(0))
#define hipGridDim_y (__ockl_get_num_groups(1))
#define hipGridDim_z (__ockl_get_num_groups(2))
342 
343 #include <hip/amd_detail/math_functions.h>
344 
#if __HIP_HCC_COMPAT_MODE__
// Define HCC work item functions in terms of HIP builtin variables.
// Each generated hc_get_* maps dimension index 0/1/2 to the .x/.y/.z member
// of the corresponding HIP coordinate object (out-of-range i falls through
// to .z).
#pragma push_macro("__DEFINE_HCC_FUNC")
#define __DEFINE_HCC_FUNC(hc_fun,hip_var) \
inline __device__ __attribute__((always_inline)) uint hc_get_##hc_fun(uint i) { \
    if (i==0) \
        return hip_var.x; \
    else if(i==1) \
        return hip_var.y; \
    else \
        return hip_var.z; \
}

__DEFINE_HCC_FUNC(workitem_id, threadIdx)
__DEFINE_HCC_FUNC(group_id, blockIdx)
__DEFINE_HCC_FUNC(group_size, blockDim)
__DEFINE_HCC_FUNC(num_groups, gridDim)
#pragma pop_macro("__DEFINE_HCC_FUNC")

extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_id(uint);
// Global (grid-wide) workitem id for the requested dimension, matching HCC's
// hc_get_workitem_absolute_id; narrowed from size_t to uint.
inline __device__ __attribute__((always_inline)) uint
hc_get_workitem_absolute_id(int dim)
{
    return (uint)__ockl_get_global_id(dim);
}

#endif
372 
373 #if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
374 // Support std::complex.
375 #if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
376 #pragma push_macro("__CUDA__")
377 #define __CUDA__
378 #include <__clang_cuda_math_forward_declares.h>
379 #include <__clang_cuda_complex_builtins.h>
380 // Workaround for using libc++ with HIP-Clang.
381 // The following headers requires clang include path before standard C++ include path.
382 // However libc++ include path requires to be before clang include path.
383 // To workaround this, we pass -isystem with the parent directory of clang include
384 // path instead of the clang include path itself.
385 #include <include/cuda_wrappers/algorithm>
386 #include <include/cuda_wrappers/complex>
387 #include <include/cuda_wrappers/new>
388 #undef __CUDA__
389 #pragma pop_macro("__CUDA__")
390 #endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
391 #endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
392 #endif // __HIP_CLANG_ONLY__
393 
#include <hip/amd_detail/hip_memory.h>

#endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
hip_runtime_api.h
Defines the API signatures for HIP runtime. This file can be compiled with a standard compiler.
hipLaunchKernel
hipError_t hipLaunchKernel(const void *function_address, dim3 numBlocks, dim3 dimBlocks, void **args, size_t sharedMemBytes __dparm(0), hipStream_t stream __dparm(0))
C compliant kernel launch API.
dim3
struct dim3 dim3
__host__
#define __host__
Definition: host_defines.h:59
host_defines.h
TODO-doc.
dim3
Definition: hip_runtime_api.h:318