HIP: Heterogenous-computing Interface for Portability
hip_runtime.h
Go to the documentation of this file.
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
28 //#pragma once
29 #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H
30 #define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H
31 
32 #if defined(__HCC__)
33 #define __HCC_OR_HIP_CLANG__ 1
34 #define __HCC_ONLY__ 1
35 #define __HIP_CLANG_ONLY__ 0
36 #elif defined(__clang__) && defined(__HIP__)
37 #define __HCC_OR_HIP_CLANG__ 1
38 #define __HCC_ONLY__ 0
39 #define __HIP_CLANG_ONLY__ 1
40 #else
41 #define __HCC_OR_HIP_CLANG__ 0
42 #define __HCC_ONLY__ 0
43 #define __HIP_CLANG_ONLY__ 0
44 #endif
45 
46 //---
47 // Top part of file can be compiled with any compiler
48 
49 //#include <cstring>
50 #if __cplusplus
51 #include <cmath>
52 #else
53 #include <math.h>
54 #include <string.h>
55 #include <stddef.h>
56 #endif //__cplusplus
57 
58 #if __HCC_OR_HIP_CLANG__
59 
60 #define CUDA_SUCCESS hipSuccess
61 
62 #include <hip/hip_runtime_api.h>
63 #endif // __HCC_OR_HIP_CLANG__
64 
65 #if __HCC__
66 // define HIP_ENABLE_PRINTF to enable printf
67 #ifdef HIP_ENABLE_PRINTF
68 #define HCC_ENABLE_ACCELERATOR_PRINTF 1
69 #endif
70 
71 //---
72 // Remainder of this file only compiles with HCC
73 #if defined __HCC__
74 #include "grid_launch.h"
75 #include "hc_printf.hpp"
76 // TODO-HCC-GL - change this to typedef.
77 // typedef grid_launch_parm hipLaunchParm ;
78 
79 #if GENERIC_GRID_LAUNCH == 0
80 #define hipLaunchParm grid_launch_parm
81 #else
82 namespace hip_impl {
83 struct Empty_launch_parm {};
84 } // namespace hip_impl
85 #define hipLaunchParm hip_impl::Empty_launch_parm
86 #endif // GENERIC_GRID_LAUNCH
87 
88 #if defined(GRID_LAUNCH_VERSION) and (GRID_LAUNCH_VERSION >= 20) || GENERIC_GRID_LAUNCH == 1
89 #else // Use field names for grid_launch 2.0 structure, if HCC supports GL 2.0.
90 #error(HCC must support GRID_LAUNCH_20)
91 #endif // GRID_LAUNCH_VERSION
92 
93 #endif // HCC
94 
95 #if GENERIC_GRID_LAUNCH == 1 && defined __HCC__
96 #include "grid_launch_GGL.hpp"
97 #endif // GENERIC_GRID_LAUNCH
98 
99 #endif // HCC
100 
101 #if __HCC_OR_HIP_CLANG__
102 extern int HIP_TRACE_API;
103 
104 #ifdef __cplusplus
105 #include <hip/hcc_detail/hip_ldg.h>
106 #endif
107 #include <hip/hcc_detail/hip_atomic.h>
109 #include <hip/hcc_detail/device_functions.h>
110 #include <hip/hcc_detail/surface_functions.h>
111 #include <hip/hcc_detail/texture_functions.h>
112 #if __HCC__
113  #include <hip/hcc_detail/math_functions.h>
114 #endif
115 // TODO-HCC remove old definitions ; ~1602 hcc supports __HCC_ACCELERATOR__ define.
116 #if defined(__KALMAR_ACCELERATOR__) && !defined(__HCC_ACCELERATOR__)
117 #define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
118 #endif
119 
120 // TODO-HCC add a dummy implementation of assert, need to replace with a proper kernel exit call.
121 #if __HIP_DEVICE_COMPILE__ == 1
122 #undef assert
123 #define assert(COND) \
124  { \
125  if (!(COND)) { \
126  abort(); \
127  } \
128  }
129 #endif
130 
131 
132 // Feature tests:
133 #if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__
134 // Device compile and not host compile:
135 
136 // 32-bit Atomics:
137 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
138 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
139 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
140 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
141 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1)
142 
143 // 64-bit Atomics:
144 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
145 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
146 
147 // Doubles
148 #define __HIP_ARCH_HAS_DOUBLES__ (1)
149 
150 // warp cross-lane operations:
151 #define __HIP_ARCH_HAS_WARP_VOTE__ (1)
152 #define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
153 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
154 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
155 
156 // sync
157 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (1)
158 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
159 
160 // misc
161 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
162 #define __HIP_ARCH_HAS_3DGRID__ (1)
163 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
164 
165 #endif /* Device feature flags */
166 
167 
168 #define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
169  __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
170 #define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
171  __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
172  amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
173 #define select_impl_(_1, _2, impl_, ...) impl_
174 #define __launch_bounds__(...) \
175  select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
176 
177 // Detect if we are compiling C++ mode or C mode
178 #if defined(__cplusplus)
179 #define __HCC_CPP__
180 #elif defined(__STDC_VERSION__)
181 #define __HCC_C__
182 #endif
183 
184 __host__ inline void* __get_dynamicgroupbaseptr() { return nullptr; }
185 
186 #if __HIP_ARCH_GFX701__ == 0
187 
188 __device__ unsigned __hip_ds_bpermute(int index, unsigned src);
189 __device__ float __hip_ds_bpermutef(int index, float src);
190 __device__ unsigned __hip_ds_permute(int index, unsigned src);
191 __device__ float __hip_ds_permutef(int index, float src);
192 
193 template <int pattern>
194 __device__ unsigned __hip_ds_swizzle_N(unsigned int src);
195 template <int pattern>
196 __device__ float __hip_ds_swizzlef_N(float src);
197 
198 template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
199 __device__ int __hip_move_dpp_N(int src);
200 
201 #endif //__HIP_ARCH_GFX803__ == 1
202 
203 #endif // __HCC_OR_HIP_CLANG__
204 
205 #if defined __HCC__
206 
207 template <
208  typename std::common_type<decltype(hc_get_group_id), decltype(hc_get_group_size),
209  decltype(hc_get_num_groups), decltype(hc_get_workitem_id)>::type f>
210 class Coordinates {
211  using R = decltype(f(0));
212 
213  struct X {
214  __device__ operator R() const { return f(0); }
215  };
216  struct Y {
217  __device__ operator R() const { return f(1); }
218  };
219  struct Z {
220  __device__ operator R() const { return f(2); }
221  };
222 
223  public:
224  static constexpr X x{};
225  static constexpr Y y{};
226  static constexpr Z z{};
227 };
228 
229 static constexpr Coordinates<hc_get_group_size> blockDim;
230 static constexpr Coordinates<hc_get_group_id> blockIdx;
231 static constexpr Coordinates<hc_get_num_groups> gridDim;
232 static constexpr Coordinates<hc_get_workitem_id> threadIdx;
233 
234 #define hipThreadIdx_x (hc_get_workitem_id(0))
235 #define hipThreadIdx_y (hc_get_workitem_id(1))
236 #define hipThreadIdx_z (hc_get_workitem_id(2))
237 
238 #define hipBlockIdx_x (hc_get_group_id(0))
239 #define hipBlockIdx_y (hc_get_group_id(1))
240 #define hipBlockIdx_z (hc_get_group_id(2))
241 
242 #define hipBlockDim_x (hc_get_group_size(0))
243 #define hipBlockDim_y (hc_get_group_size(1))
244 #define hipBlockDim_z (hc_get_group_size(2))
245 
246 #define hipGridDim_x (hc_get_num_groups(0))
247 #define hipGridDim_y (hc_get_num_groups(1))
248 #define hipGridDim_z (hc_get_num_groups(2))
249 
250 #endif // defined __HCC__
251 #if __HCC_OR_HIP_CLANG__
252 extern "C" __device__ void* __hip_malloc(size_t);
253 extern "C" __device__ void* __hip_free(void* ptr);
254 
255 static inline __device__ void* malloc(size_t size) { return __hip_malloc(size); }
256 static inline __device__ void* free(void* ptr) { return __hip_free(ptr); }
257 
258 #if defined(__HCC_ACCELERATOR__) && defined(HC_FEATURE_PRINTF)
259 template <typename... All>
260 static inline __device__ void printf(const char* format, All... all) {
261  hc::printf(format, all...);
262 }
263 #elif defined(__HCC_ACCELERATOR__) || __HIP__
264 template <typename... All>
265 static inline __device__ void printf(const char* format, All... all) {}
266 #endif
267 
268 #endif //__HCC_OR_HIP_CLANG__
269 
270 #ifdef __HCC__
271 
272 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
273 
274 #define HIP_KERNEL_NAME(...) (__VA_ARGS__)
275 #define HIP_SYMBOL(X) #X
276 
277 #if defined __HCC_CPP__
278 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block,
279  grid_launch_parm* lp, const char* kernelNameStr);
280 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block,
281  grid_launch_parm* lp, const char* kernelNameStr);
282 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block,
283  grid_launch_parm* lp, const char* kernelNameStr);
284 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block,
285  grid_launch_parm* lp, const char* kernelNameStr);
286 extern void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launch_parm& lp);
287 
288 #if GENERIC_GRID_LAUNCH == 0
289 //#warning "Original hipLaunchKernel defined"
290 // Due to multiple overloaded versions of ihipPreLaunchKernel, the numBlocks3D and blockDim3D can be
291 // either size_t or dim3 types
292 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
293  do { \
294  grid_launch_parm lp; \
295  lp.dynamic_group_mem_bytes = _groupMemBytes; \
296  hipStream_t trueStream = \
297  (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp, #_kernelName)); \
298  _kernelName(lp, ##__VA_ARGS__); \
299  ihipPostLaunchKernel(#_kernelName, trueStream, lp); \
300  } while (0)
301 #endif // GENERIC_GRID_LAUNCH
302 
303 #elif defined(__HCC_C__)
304 
305 // TODO - develop C interface.
306 
307 #endif //__HCC_CPP__
308 
313 // extern int HIP_PRINT_ENV ; ///< Print all HIP-related environment variables.
314 // extern int HIP_TRACE_API; ///< Trace HIP APIs.
315 // extern int HIP_LAUNCH_BLOCKING ; ///< Make all HIP APIs host-synchronous
316 
322 // End doxygen API:
327 //
328 // hip-clang functions
329 //
330 #elif defined(__clang__) && defined(__HIP__)
331 
332 #define HIP_KERNEL_NAME(...) __VA_ARGS__
333 #define HIP_SYMBOL(X) #X
334 
335 typedef int hipLaunchParm;
336 
337 #define hipLaunchKernel(kernelName, numblocks, numthreads, memperblock, streamId, ...) \
338  do { \
339  kernelName<<<(numblocks), (numthreads), (memperblock), (streamId)>>>(hipLaunchParm{}, ##__VA_ARGS__); \
340  } while (0)
341 
342 #define hipLaunchKernelGGL(kernelName, numblocks, numthreads, memperblock, streamId, ...) \
343  do { \
344  kernelName<<<(numblocks), (numthreads), (memperblock), (streamId)>>>(__VA_ARGS__); \
345  } while (0)
346 
347 #include <hip/hip_runtime_api.h>
348 
349 #pragma push_macro("__DEVICE__")
350 #define __DEVICE__ static __device__ __forceinline__
351 
352 extern "C" __device__ size_t __ockl_get_local_id(uint);
353 __DEVICE__ uint __hip_get_thread_idx_x() { return __ockl_get_local_id(0); }
354 __DEVICE__ uint __hip_get_thread_idx_y() { return __ockl_get_local_id(1); }
355 __DEVICE__ uint __hip_get_thread_idx_z() { return __ockl_get_local_id(2); }
356 
357 extern "C" __device__ size_t __ockl_get_group_id(uint);
358 __DEVICE__ uint __hip_get_block_idx_x() { return __ockl_get_group_id(0); }
359 __DEVICE__ uint __hip_get_block_idx_y() { return __ockl_get_group_id(1); }
360 __DEVICE__ uint __hip_get_block_idx_z() { return __ockl_get_group_id(2); }
361 
362 extern "C" __device__ size_t __ockl_get_local_size(uint);
363 __DEVICE__ uint __hip_get_block_dim_x() { return __ockl_get_local_size(0); }
364 __DEVICE__ uint __hip_get_block_dim_y() { return __ockl_get_local_size(1); }
365 __DEVICE__ uint __hip_get_block_dim_z() { return __ockl_get_local_size(2); }
366 
367 extern "C" __device__ size_t __ockl_get_num_groups(uint);
368 __DEVICE__ uint __hip_get_grid_dim_x() { return __ockl_get_num_groups(0); }
369 __DEVICE__ uint __hip_get_grid_dim_y() { return __ockl_get_num_groups(1); }
370 __DEVICE__ uint __hip_get_grid_dim_z() { return __ockl_get_num_groups(2); }
371 
372 #define __HIP_DEVICE_BUILTIN(DIMENSION, FUNCTION) \
373  __declspec(property(get = __get_##DIMENSION)) uint DIMENSION; \
374  __DEVICE__ uint __get_##DIMENSION(void) { \
375  return FUNCTION; \
376  }
377 
378 struct __hip_builtin_threadIdx_t {
379  __HIP_DEVICE_BUILTIN(x,__hip_get_thread_idx_x());
380  __HIP_DEVICE_BUILTIN(y,__hip_get_thread_idx_y());
381  __HIP_DEVICE_BUILTIN(z,__hip_get_thread_idx_z());
382 };
383 
384 struct __hip_builtin_blockIdx_t {
385  __HIP_DEVICE_BUILTIN(x,__hip_get_block_idx_x());
386  __HIP_DEVICE_BUILTIN(y,__hip_get_block_idx_y());
387  __HIP_DEVICE_BUILTIN(z,__hip_get_block_idx_z());
388 };
389 
390 struct __hip_builtin_blockDim_t {
391  __HIP_DEVICE_BUILTIN(x,__hip_get_block_dim_x());
392  __HIP_DEVICE_BUILTIN(y,__hip_get_block_dim_y());
393  __HIP_DEVICE_BUILTIN(z,__hip_get_block_dim_z());
394 };
395 
396 struct __hip_builtin_gridDim_t {
397  __HIP_DEVICE_BUILTIN(x,__hip_get_grid_dim_x());
398  __HIP_DEVICE_BUILTIN(y,__hip_get_grid_dim_y());
399  __HIP_DEVICE_BUILTIN(z,__hip_get_grid_dim_z());
400 };
401 
402 #undef __HIP_DEVICE_BUILTIN
403 #pragma pop_macro("__DEVICE__")
404 
405 extern const __device__ __attribute__((weak)) __hip_builtin_threadIdx_t threadIdx;
406 extern const __device__ __attribute__((weak)) __hip_builtin_blockIdx_t blockIdx;
407 extern const __device__ __attribute__((weak)) __hip_builtin_blockDim_t blockDim;
408 extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;
409 
410 
411 #define hipThreadIdx_x threadIdx.x
412 #define hipThreadIdx_y threadIdx.y
413 #define hipThreadIdx_z threadIdx.z
414 
415 #define hipBlockIdx_x blockIdx.x
416 #define hipBlockIdx_y blockIdx.y
417 #define hipBlockIdx_z blockIdx.z
418 
419 #define hipBlockDim_x blockDim.x
420 #define hipBlockDim_y blockDim.y
421 #define hipBlockDim_z blockDim.z
422 
423 #define hipGridDim_x gridDim.x
424 #define hipGridDim_y gridDim.y
425 #define hipGridDim_z gridDim.z
426 
427 #include <hip/hcc_detail/math_functions.h>
428 
429 #if __HIP_HCC_COMPAT_MODE__
430 // Define HCC work item functions in terms of HIP builtin variables.
431 #pragma push_macro("__DEFINE_HCC_FUNC")
432 #define __DEFINE_HCC_FUNC(hc_fun,hip_var) \
433 inline __device__ __attribute__((always_inline)) uint hc_get_##hc_fun(uint i) { \
434  if (i==0) \
435  return hip_var.x; \
436  else if(i==1) \
437  return hip_var.y; \
438  else \
439  return hip_var.z; \
440 }
441 
442 __DEFINE_HCC_FUNC(workitem_id, threadIdx)
443 __DEFINE_HCC_FUNC(group_id, blockIdx)
444 __DEFINE_HCC_FUNC(group_size, blockDim)
445 __DEFINE_HCC_FUNC(num_groups, gridDim)
446 #pragma pop_macro("__DEFINE_HCC_FUNC")
447 
448 extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_id(uint);
449 inline __device__ __attribute__((always_inline)) uint
450 hc_get_workitem_absolute_id(int dim)
451 {
452  return (uint)__ockl_get_global_id(dim);
453 }
454 
455 #endif
456 
457 // Support std::complex.
458 #pragma push_macro("__CUDA__")
459 #define __CUDA__
460 #include <__clang_cuda_math_forward_declares.h>
461 #include <__clang_cuda_complex_builtins.h>
462 #include <cuda_wrappers/algorithm>
463 #include <cuda_wrappers/complex>
464 #include <cuda_wrappers/new>
465 #undef __CUDA__
466 #pragma pop_macro("__CUDA__")
467 
468 
469 hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
470  uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
471  uint32_t localWorkSizeX, uint32_t localWorkSizeY,
472  uint32_t localWorkSizeZ, size_t sharedMemBytes,
473  hipStream_t hStream, void** kernelParams, void** extra,
474  hipEvent_t startEvent = nullptr,
475  hipEvent_t stopEvent = nullptr);
476 
477 #endif // defined(__clang__) && defined(__HIP__)
478 
479 #include <hip/hcc_detail/hip_memory.h>
480 
481 #endif // HIP_HCC_DETAIL_RUNTIME_H
TODO-doc.
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Copies the memory address of symbol symbolName to devPtr.
Definition: hip_fp16_math_fwd.h:53
Definition: hip_module.cpp:89
Definition: hip_runtime_api.h:265
#define __host__
Definition: host_defines.h:41
Definition: grid_launch.h:31
Definition: hip_runtime_api.h:81
Definition: hip_hcc_internal.h:700
Definition: hip_hcc_internal.h:519
Definition: hip_runtime.h:210