HIP: Heterogenous-computing Interface for Portability
hip_runtime.h
Go to the documentation of this file.
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
28 //#pragma once
29 #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H
30 #define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H
31 
32 #include <hip/hcc_detail/hip_common.h>
33 
34 //---
35 // Top part of file can be compiled with any compiler
36 
37 //#include <cstring>
38 #if __cplusplus
39 #include <cmath>
40 #else
41 #include <math.h>
42 #include <string.h>
43 #include <stddef.h>
44 #endif //__cplusplus
45 
46 #if __HCC_OR_HIP_CLANG__
47 
48 #if __HIP__
49 #if !defined(__align__)
50 #define __align__(x) __attribute__((aligned(x)))
51 #endif
52 #endif
53 
54 #define CUDA_SUCCESS hipSuccess
55 
56 #include <hip/hip_runtime_api.h>
57 #endif // __HCC_OR_HIP_CLANG__
58 
59 #if __HCC__
60 // define HIP_ENABLE_PRINTF to enable printf
61 #ifdef HIP_ENABLE_PRINTF
62 #define HCC_ENABLE_ACCELERATOR_PRINTF 1
63 #endif
64 
65 //---
66 // Remainder of this file only compiles with HCC
67 #if defined __HCC__
68 #include "grid_launch.h"
69 #include "hc_printf.hpp"
70 // TODO-HCC-GL - change this to typedef.
71 // typedef grid_launch_parm hipLaunchParm ;
72 
73 #if GENERIC_GRID_LAUNCH == 0
74 #define hipLaunchParm grid_launch_parm
75 #else
76 namespace hip_impl {
77 struct Empty_launch_parm {};
78 } // namespace hip_impl
79 #define hipLaunchParm hip_impl::Empty_launch_parm
80 #endif // GENERIC_GRID_LAUNCH
81 
82 #if defined(GRID_LAUNCH_VERSION) and (GRID_LAUNCH_VERSION >= 20) || GENERIC_GRID_LAUNCH == 1
83 #else // Use field names for grid_launch 2.0 structure, if HCC supports GL 2.0.
84 #error(HCC must support GRID_LAUNCH_20)
85 #endif // GRID_LAUNCH_VERSION
86 
87 #endif // HCC
88 
89 #if GENERIC_GRID_LAUNCH == 1 && defined __HCC__
90 #include "grid_launch_GGL.hpp"
91 #endif // GENERIC_GRID_LAUNCH
92 
93 #endif // HCC
94 
95 #if __HCC_OR_HIP_CLANG__
96 extern int HIP_TRACE_API;
97 
98 #ifdef __cplusplus
99 #include <hip/hcc_detail/hip_ldg.h>
100 #endif
101 #include <hip/hcc_detail/hip_atomic.h>
103 #include <hip/hcc_detail/device_functions.h>
104 #include <hip/hcc_detail/surface_functions.h>
105 #include <hip/hcc_detail/texture_functions.h>
106 #if __HCC__
107  #include <hip/hcc_detail/math_functions.h>
108 #endif
109 // TODO-HCC remove old definitions ; ~1602 hcc supports __HCC_ACCELERATOR__ define.
110 #if defined(__KALMAR_ACCELERATOR__) && !defined(__HCC_ACCELERATOR__)
111 #define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
112 #endif
113 
114 // TODO-HCC add a dummy implementation of assert, need to replace with a proper kernel exit call.
115 #if __HIP_DEVICE_COMPILE__ == 1
116 #undef assert
117 #define assert(COND) \
118  { \
119  if (!(COND)) { \
120  abort(); \
121  } \
122  }
123 #endif
124 
125 
126 // Feature tests:
127 #if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__
128 // Device compile and not host compile:
129 
130 // 32-bit Atomics:
131 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
132 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
133 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
134 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
135 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1)
136 
137 // 64-bit Atomics:
138 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
139 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
140 
141 // Doubles
142 #define __HIP_ARCH_HAS_DOUBLES__ (1)
143 
144 // warp cross-lane operations:
145 #define __HIP_ARCH_HAS_WARP_VOTE__ (1)
146 #define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
147 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
148 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
149 
150 // sync
151 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (1)
152 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
153 
154 // misc
155 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
156 #define __HIP_ARCH_HAS_3DGRID__ (1)
157 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
158 
159 #endif /* Device feature flags */
160 
161 
162 #define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
163  __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
164 #define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
165  __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
166  amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
167 #define select_impl_(_1, _2, impl_, ...) impl_
168 #define __launch_bounds__(...) \
169  select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
170 
171 // Detect if we are compiling C++ mode or C mode
172 #if defined(__cplusplus)
173 #define __HCC_CPP__
174 #elif defined(__STDC_VERSION__)
175 #define __HCC_C__
176 #endif
177 
178 __host__ inline void* __get_dynamicgroupbaseptr() { return nullptr; }
179 
180 #if __HIP_ARCH_GFX701__ == 0
181 
182 __device__ unsigned __hip_ds_bpermute(int index, unsigned src);
183 __device__ float __hip_ds_bpermutef(int index, float src);
184 __device__ unsigned __hip_ds_permute(int index, unsigned src);
185 __device__ float __hip_ds_permutef(int index, float src);
186 
187 template <int pattern>
188 __device__ unsigned __hip_ds_swizzle_N(unsigned int src);
189 template <int pattern>
190 __device__ float __hip_ds_swizzlef_N(float src);
191 
192 template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
193 __device__ int __hip_move_dpp_N(int src);
194 
195 #endif //__HIP_ARCH_GFX803__ == 1
196 
197 #endif // __HCC_OR_HIP_CLANG__
198 
199 #if defined __HCC__
200 
201 template <
202  typename std::common_type<decltype(hc_get_group_id), decltype(hc_get_group_size),
203  decltype(hc_get_num_groups), decltype(hc_get_workitem_id)>::type f>
204 class Coordinates {
205  using R = decltype(f(0));
206 
207  struct X {
208  __device__ operator R() const { return f(0); }
209  __device__ uint32_t operator=(R _) { return f(0); }
210  };
211  struct Y {
212  __device__ operator R() const { return f(1); }
213  __device__ uint32_t operator=(R _) { return f(1); }
214  };
215  struct Z {
216  __device__ operator R() const { return f(2); }
217  __device__ uint32_t operator=(R _) { return f(2); }
218  };
219 
220  public:
221  static constexpr X x{};
222  static constexpr Y y{};
223  static constexpr Z z{};
224 };
225 
226 static constexpr Coordinates<hc_get_group_size> blockDim;
227 static constexpr Coordinates<hc_get_group_id> blockIdx;
228 static constexpr Coordinates<hc_get_num_groups> gridDim;
229 static constexpr Coordinates<hc_get_workitem_id> threadIdx;
230 
231 #define hipThreadIdx_x (hc_get_workitem_id(0))
232 #define hipThreadIdx_y (hc_get_workitem_id(1))
233 #define hipThreadIdx_z (hc_get_workitem_id(2))
234 
235 #define hipBlockIdx_x (hc_get_group_id(0))
236 #define hipBlockIdx_y (hc_get_group_id(1))
237 #define hipBlockIdx_z (hc_get_group_id(2))
238 
239 #define hipBlockDim_x (hc_get_group_size(0))
240 #define hipBlockDim_y (hc_get_group_size(1))
241 #define hipBlockDim_z (hc_get_group_size(2))
242 
243 #define hipGridDim_x (hc_get_num_groups(0))
244 #define hipGridDim_y (hc_get_num_groups(1))
245 #define hipGridDim_z (hc_get_num_groups(2))
246 
247 #endif // defined __HCC__
248 #if __HCC_OR_HIP_CLANG__
249 extern "C" __device__ void* __hip_malloc(size_t);
250 extern "C" __device__ void* __hip_free(void* ptr);
251 
252 static inline __device__ void* malloc(size_t size) { return __hip_malloc(size); }
253 static inline __device__ void* free(void* ptr) { return __hip_free(ptr); }
254 
255 #if defined(__HCC_ACCELERATOR__) && defined(HC_FEATURE_PRINTF)
256 template <typename... All>
257 static inline __device__ void printf(const char* format, All... all) {
258  hc::printf(format, all...);
259 }
260 #elif defined(__HCC_ACCELERATOR__) || __HIP__
261 template <typename... All>
262 static inline __device__ void printf(const char* format, All... all) {}
263 #endif
264 
265 #endif //__HCC_OR_HIP_CLANG__
266 
267 #ifdef __HCC__
268 
269 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
270 
271 #define HIP_KERNEL_NAME(...) (__VA_ARGS__)
272 #define HIP_SYMBOL(X) #X
273 
274 #if defined __HCC_CPP__
275 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block,
276  grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0);
277 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block,
278  grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0);
279 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block,
280  grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0);
281 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block,
282  grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0);
283 extern void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launch_parm& lp, bool unlockPostponed = 0);
284 
285 #if GENERIC_GRID_LAUNCH == 0
286 //#warning "Original hipLaunchKernel defined"
287 // Due to multiple overloaded versions of ihipPreLaunchKernel, the numBlocks3D and blockDim3D can be
288 // either size_t or dim3 types
289 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
290  do { \
291  grid_launch_parm lp; \
292  lp.dynamic_group_mem_bytes = _groupMemBytes; \
293  hipStream_t trueStream = \
294  (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp, #_kernelName)); \
295  _kernelName(lp, ##__VA_ARGS__); \
296  ihipPostLaunchKernel(#_kernelName, trueStream, lp); \
297  } while (0)
298 #endif // GENERIC_GRID_LAUNCH
299 
300 #elif defined(__HCC_C__)
301 
302 // TODO - develop C interface.
303 
304 #endif //__HCC_CPP__
305 
310 // extern int HIP_PRINT_ENV ; ///< Print all HIP-related environment variables.
311 // extern int HIP_TRACE_API; ///< Trace HIP APIs.
312 // extern int HIP_LAUNCH_BLOCKING ; ///< Make all HIP APIs host-synchronous
313 
319 // End doxygen API:
324 //
325 // hip-clang functions
326 //
327 #elif defined(__clang__) && defined(__HIP__)
328 
329 #define HIP_KERNEL_NAME(...) __VA_ARGS__
330 #define HIP_SYMBOL(X) #X
331 
332 typedef int hipLaunchParm;
333 
334 #define hipLaunchKernel(kernelName, numblocks, numthreads, memperblock, streamId, ...) \
335  do { \
336  kernelName<<<(numblocks), (numthreads), (memperblock), (streamId)>>>(hipLaunchParm{}, ##__VA_ARGS__); \
337  } while (0)
338 
339 #define hipLaunchKernelGGL(kernelName, numblocks, numthreads, memperblock, streamId, ...) \
340  do { \
341  kernelName<<<(numblocks), (numthreads), (memperblock), (streamId)>>>(__VA_ARGS__); \
342  } while (0)
343 
344 #include <hip/hip_runtime_api.h>
345 
346 #pragma push_macro("__DEVICE__")
347 #define __DEVICE__ static __device__ __forceinline__
348 
349 extern "C" __device__ size_t __ockl_get_local_id(uint);
350 __DEVICE__ uint __hip_get_thread_idx_x() { return __ockl_get_local_id(0); }
351 __DEVICE__ uint __hip_get_thread_idx_y() { return __ockl_get_local_id(1); }
352 __DEVICE__ uint __hip_get_thread_idx_z() { return __ockl_get_local_id(2); }
353 
354 extern "C" __device__ size_t __ockl_get_group_id(uint);
355 __DEVICE__ uint __hip_get_block_idx_x() { return __ockl_get_group_id(0); }
356 __DEVICE__ uint __hip_get_block_idx_y() { return __ockl_get_group_id(1); }
357 __DEVICE__ uint __hip_get_block_idx_z() { return __ockl_get_group_id(2); }
358 
359 extern "C" __device__ size_t __ockl_get_local_size(uint);
360 __DEVICE__ uint __hip_get_block_dim_x() { return __ockl_get_local_size(0); }
361 __DEVICE__ uint __hip_get_block_dim_y() { return __ockl_get_local_size(1); }
362 __DEVICE__ uint __hip_get_block_dim_z() { return __ockl_get_local_size(2); }
363 
364 extern "C" __device__ size_t __ockl_get_num_groups(uint);
365 __DEVICE__ uint __hip_get_grid_dim_x() { return __ockl_get_num_groups(0); }
366 __DEVICE__ uint __hip_get_grid_dim_y() { return __ockl_get_num_groups(1); }
367 __DEVICE__ uint __hip_get_grid_dim_z() { return __ockl_get_num_groups(2); }
368 
369 #define __HIP_DEVICE_BUILTIN(DIMENSION, FUNCTION) \
370  __declspec(property(get = __get_##DIMENSION)) uint DIMENSION; \
371  __DEVICE__ uint __get_##DIMENSION(void) { \
372  return FUNCTION; \
373  }
374 
375 struct __hip_builtin_threadIdx_t {
376  __HIP_DEVICE_BUILTIN(x,__hip_get_thread_idx_x());
377  __HIP_DEVICE_BUILTIN(y,__hip_get_thread_idx_y());
378  __HIP_DEVICE_BUILTIN(z,__hip_get_thread_idx_z());
379 };
380 
381 struct __hip_builtin_blockIdx_t {
382  __HIP_DEVICE_BUILTIN(x,__hip_get_block_idx_x());
383  __HIP_DEVICE_BUILTIN(y,__hip_get_block_idx_y());
384  __HIP_DEVICE_BUILTIN(z,__hip_get_block_idx_z());
385 };
386 
387 struct __hip_builtin_blockDim_t {
388  __HIP_DEVICE_BUILTIN(x,__hip_get_block_dim_x());
389  __HIP_DEVICE_BUILTIN(y,__hip_get_block_dim_y());
390  __HIP_DEVICE_BUILTIN(z,__hip_get_block_dim_z());
391 };
392 
393 struct __hip_builtin_gridDim_t {
394  __HIP_DEVICE_BUILTIN(x,__hip_get_grid_dim_x());
395  __HIP_DEVICE_BUILTIN(y,__hip_get_grid_dim_y());
396  __HIP_DEVICE_BUILTIN(z,__hip_get_grid_dim_z());
397 };
398 
399 #undef __HIP_DEVICE_BUILTIN
400 #pragma pop_macro("__DEVICE__")
401 
402 extern const __device__ __attribute__((weak)) __hip_builtin_threadIdx_t threadIdx;
403 extern const __device__ __attribute__((weak)) __hip_builtin_blockIdx_t blockIdx;
404 extern const __device__ __attribute__((weak)) __hip_builtin_blockDim_t blockDim;
405 extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;
406 
407 
408 #define hipThreadIdx_x threadIdx.x
409 #define hipThreadIdx_y threadIdx.y
410 #define hipThreadIdx_z threadIdx.z
411 
412 #define hipBlockIdx_x blockIdx.x
413 #define hipBlockIdx_y blockIdx.y
414 #define hipBlockIdx_z blockIdx.z
415 
416 #define hipBlockDim_x blockDim.x
417 #define hipBlockDim_y blockDim.y
418 #define hipBlockDim_z blockDim.z
419 
420 #define hipGridDim_x gridDim.x
421 #define hipGridDim_y gridDim.y
422 #define hipGridDim_z gridDim.z
423 
424 #include <hip/hcc_detail/math_functions.h>
425 
426 #if __HIP_HCC_COMPAT_MODE__
427 // Define HCC work item functions in terms of HIP builtin variables.
428 #pragma push_macro("__DEFINE_HCC_FUNC")
429 #define __DEFINE_HCC_FUNC(hc_fun,hip_var) \
430 inline __device__ __attribute__((always_inline)) uint hc_get_##hc_fun(uint i) { \
431  if (i==0) \
432  return hip_var.x; \
433  else if(i==1) \
434  return hip_var.y; \
435  else \
436  return hip_var.z; \
437 }
438 
439 __DEFINE_HCC_FUNC(workitem_id, threadIdx)
440 __DEFINE_HCC_FUNC(group_id, blockIdx)
441 __DEFINE_HCC_FUNC(group_size, blockDim)
442 __DEFINE_HCC_FUNC(num_groups, gridDim)
443 #pragma pop_macro("__DEFINE_HCC_FUNC")
444 
445 extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_id(uint);
446 inline __device__ __attribute__((always_inline)) uint
447 hc_get_workitem_absolute_id(int dim)
448 {
449  return (uint)__ockl_get_global_id(dim);
450 }
451 
452 #endif
453 
454 // Support std::complex.
455 #pragma push_macro("__CUDA__")
456 #define __CUDA__
457 #include <__clang_cuda_math_forward_declares.h>
458 #include <__clang_cuda_complex_builtins.h>
459 #include <cuda_wrappers/algorithm>
460 #include <cuda_wrappers/complex>
461 #include <cuda_wrappers/new>
462 #undef __CUDA__
463 #pragma pop_macro("__CUDA__")
464 
465 hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
466  uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
467  uint32_t localWorkSizeX, uint32_t localWorkSizeY,
468  uint32_t localWorkSizeZ, size_t sharedMemBytes,
469  hipStream_t hStream, void** kernelParams, void** extra,
470  hipEvent_t startEvent = nullptr,
471  hipEvent_t stopEvent = nullptr,
472  uint32_t flags = 0);
473 
474 hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
475  uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
476  uint32_t localWorkSizeX, uint32_t localWorkSizeY,
477  uint32_t localWorkSizeZ, size_t sharedMemBytes,
478  hipStream_t hStream, void** kernelParams, void** extra,
479  hipEvent_t startEvent = nullptr,
480  hipEvent_t stopEvent = nullptr)
481  __attribute__((deprecated("use hipExtModuleLaunchKernel instead")));
482 
483 #endif // defined(__clang__) && defined(__HIP__)
484 
485 #include <hip/hcc_detail/hip_memory.h>
486 
487 #endif // HIP_HCC_DETAIL_RUNTIME_H
TODO-doc.
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Copies the size of symbol symbolName to size.
Definition: hip_fp16_math_fwd.h:53
hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ, size_t sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra, hipEvent_t startEvent=nullptr, hipEvent_t stopEvent=nullptr, uint32_t flags=0)
launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelp...
Definition: hip_module.cpp:293
Definition: hip_module.cpp:106
Definition: hip_runtime_api.h:269
#define __host__
Definition: host_defines.h:41
Definition: grid_launch.h:31
Definition: hip_runtime_api.h:82
Definition: hip_hcc_internal.h:703
Definition: hip_hcc_internal.h:524
Definition: hip_runtime.h:204