HIP: Heterogeneous-computing Interface for Portability
hip_runtime.h
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

//#pragma once
#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H
#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H

#if defined(__HCC__)
#define __HCC_OR_HIP_CLANG__ 1
#define __HCC_ONLY__ 1
#define __HIP_CLANG_ONLY__ 0
#elif defined(__clang__) && defined(__HIP__)
#define __HCC_OR_HIP_CLANG__ 1
#define __HCC_ONLY__ 0
#define __HIP_CLANG_ONLY__ 1
#else
#define __HCC_OR_HIP_CLANG__ 0
#define __HCC_ONLY__ 0
#define __HIP_CLANG_ONLY__ 0
#endif

//---
// Top part of file can be compiled with any compiler.

//#include <cstring>
#if __cplusplus
#include <cmath>
#else
#include <math.h>
#include <string.h>
#include <stddef.h>
#endif  // __cplusplus

#if __HCC_OR_HIP_CLANG__

#define CUDA_SUCCESS hipSuccess

#include <hip/hip_runtime_api.h>
#endif  // __HCC_OR_HIP_CLANG__

#if __HCC__
// Define HIP_ENABLE_PRINTF to enable printf.
#ifdef HIP_ENABLE_PRINTF
#define HCC_ENABLE_ACCELERATOR_PRINTF 1
#endif

//---
// Remainder of this file only compiles with HCC
#if defined __HCC__
#include <grid_launch.h>
#include "hc_printf.hpp"
// TODO-HCC-GL - change this to typedef.
// typedef grid_launch_parm hipLaunchParm ;

#if GENERIC_GRID_LAUNCH == 0
#define hipLaunchParm grid_launch_parm
#else
namespace hip_impl {
struct Empty_launch_parm {};
}  // namespace hip_impl
#define hipLaunchParm hip_impl::Empty_launch_parm
#endif  // GENERIC_GRID_LAUNCH

#if defined(GRID_LAUNCH_VERSION) and (GRID_LAUNCH_VERSION >= 20) || GENERIC_GRID_LAUNCH == 1
// HCC supports grid_launch 2.0, so field names of the grid_launch_parm structure can be used.
#else
#error(HCC must support GRID_LAUNCH_20)
#endif  // GRID_LAUNCH_VERSION

#endif  // HCC

#if GENERIC_GRID_LAUNCH == 1 && defined __HCC__
#include "grid_launch_GGL.hpp"
#endif  // GENERIC_GRID_LAUNCH

#endif  // HCC

#if __HCC_OR_HIP_CLANG__
extern int HIP_TRACE_API;

#ifdef __cplusplus
#include <hip/hcc_detail/hip_ldg.h>
#endif
#include <hip/hcc_detail/hip_atomic.h>
#include <hip/hcc_detail/device_functions.h>
#include <hip/hcc_detail/surface_functions.h>
#if __HCC__
#include <hip/hcc_detail/math_functions.h>
#include <hip/hcc_detail/texture_functions.h>
#endif  // __HCC__

// TODO-HCC remove old definitions; hcc ~1602 supports the __HCC_ACCELERATOR__ define.
#if defined(__KALMAR_ACCELERATOR__) && !defined(__HCC_ACCELERATOR__)
#define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
#endif

// TODO-HCC this is a dummy implementation of assert; replace it with a proper kernel exit call.
#if __HIP_DEVICE_COMPILE__ == 1
#undef assert
#define assert(COND) \
    do { \
        if (!(COND)) { \
            abort(); \
        } \
    } while (0)
#endif
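// Example (illustrative only; the kernel name below is hypothetical): with the definition
// above, a failed device-side assert simply calls abort() in device code.
//   __global__ void check_kernel(const int* data, int expected) {
//       assert(data[0] == expected);  // aborts the kernel when the condition is false
//   }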


// Feature tests:
#if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__
// Device compile and not host compile:

// 32-bit Atomics:
#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1)

// 64-bit Atomics:
#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)

// Doubles
#define __HIP_ARCH_HAS_DOUBLES__ (1)

// warp cross-lane operations:
#define __HIP_ARCH_HAS_WARP_VOTE__ (1)
#define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)

// sync
#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (1)
#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)

// misc
#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
#define __HIP_ARCH_HAS_3DGRID__ (1)
#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)

#endif /* Device feature flags */


#define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
                   amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
#define select_impl_(_1, _2, impl_, ...) impl_
#define __launch_bounds__(...) \
    select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
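// Example (illustrative only; kernel names below are hypothetical): __launch_bounds__ takes one
// or two arguments. With the macros above, the first argument becomes the upper bound of
// amdgpu_flat_work_group_size and the optional second argument is forwarded to
// amdgpu_waves_per_eu.
//   __global__ void __launch_bounds__(256) k1(float* p);     // at most 256 threads per block
//   __global__ void __launch_bounds__(256, 4) k2(float* p);  // additionally request 4 waves per EU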

// Detect if we are compiling in C++ mode or C mode.
#if defined(__cplusplus)
#define __HCC_CPP__
#elif defined(__STDC_VERSION__)
#define __HCC_C__
#endif

#endif  // __HCC_OR_HIP_CLANG__

#if __HCC_OR_HIP_CLANG__

__host__ inline void* __get_dynamicgroupbaseptr() { return nullptr; }

#if __HIP_ARCH_GFX701__ == 0

__device__ unsigned __hip_ds_bpermute(int index, unsigned src);
__device__ float __hip_ds_bpermutef(int index, float src);
__device__ unsigned __hip_ds_permute(int index, unsigned src);
__device__ float __hip_ds_permutef(int index, float src);

__device__ unsigned __hip_ds_swizzle(unsigned int src, int pattern);
__device__ float __hip_ds_swizzlef(float src, int pattern);

__device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl);

#endif  // __HIP_ARCH_GFX701__ == 0
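// Example (illustrative only; the helper below is a hypothetical sketch, not part of this
// header): __hip_ds_bpermute backs the wavefront shuffle operations. The index operand is a
// byte address, so lane N is typically selected with N * 4.
//   __device__ unsigned broadcast_lane0(unsigned v) {
//       return __hip_ds_bpermute(0 * 4, v);  // every lane reads lane 0's value of v
//   }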

#endif  // __HCC_OR_HIP_CLANG__

#if defined __HCC__

template <
    typename std::common_type<decltype(hc_get_group_id), decltype(hc_get_group_size),
                              decltype(hc_get_num_groups), decltype(hc_get_workitem_id)>::type f>
class Coordinates {
    using R = decltype(f(0));

    struct X {
        __device__ operator R() const { return f(0); }
    };
    struct Y {
        __device__ operator R() const { return f(1); }
    };
    struct Z {
        __device__ operator R() const { return f(2); }
    };

   public:
    static constexpr X x{};
    static constexpr Y y{};
    static constexpr Z z{};
};

static constexpr Coordinates<hc_get_group_size> blockDim;
static constexpr Coordinates<hc_get_group_id> blockIdx;
static constexpr Coordinates<hc_get_num_groups> gridDim;
static constexpr Coordinates<hc_get_workitem_id> threadIdx;

#define hipThreadIdx_x (hc_get_workitem_id(0))
#define hipThreadIdx_y (hc_get_workitem_id(1))
#define hipThreadIdx_z (hc_get_workitem_id(2))

#define hipBlockIdx_x (hc_get_group_id(0))
#define hipBlockIdx_y (hc_get_group_id(1))
#define hipBlockIdx_z (hc_get_group_id(2))

#define hipBlockDim_x (hc_get_group_size(0))
#define hipBlockDim_y (hc_get_group_size(1))
#define hipBlockDim_z (hc_get_group_size(2))

#define hipGridDim_x (hc_get_num_groups(0))
#define hipGridDim_y (hc_get_num_groups(1))
#define hipGridDim_z (hc_get_num_groups(2))
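// Example (illustrative only; the kernel name below is hypothetical): with the macros above, a
// typical global-index computation reads the same as on other HIP compilers. Under
// GENERIC_GRID_LAUNCH == 0 the kernel takes a hipLaunchParm as its first argument.
//   __global__ void scale(hipLaunchParm lp, float* x, float a, size_t n) {
//       size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
//       if (i < n) x[i] *= a;
//   }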

#endif  // defined __HCC__
#if __HCC_OR_HIP_CLANG__
extern "C" __device__ void* __hip_malloc(size_t);
extern "C" __device__ void* __hip_free(void* ptr);

static inline __device__ void* malloc(size_t size) { return __hip_malloc(size); }
static inline __device__ void* free(void* ptr) { return __hip_free(ptr); }
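// Example (illustrative only; the kernel name below is hypothetical): device-side malloc/free
// route through __hip_malloc and __hip_free as wrapped above.
//   __global__ void per_thread_scratch() {
//       int* tmp = (int*)malloc(16 * sizeof(int));  // calls __hip_malloc on the device
//       if (tmp) {
//           tmp[0] = 0;
//           free(tmp);  // calls __hip_free on the device
//       }
//   }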

#ifdef __HCC_ACCELERATOR__

#ifdef HC_FEATURE_PRINTF
template <typename... All>
static inline __device__ void printf(const char* format, All... all) {
    hc::printf(format, all...);
}
#else
template <typename... All>
static inline __device__ void printf(const char* format, All... all) {}
#endif

#endif  // __HCC_ACCELERATOR__
#endif  // __HCC_OR_HIP_CLANG__

#ifdef __HCC__

#define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)

#define HIP_KERNEL_NAME(...) (__VA_ARGS__)
#define HIP_SYMBOL(X) #X

#if defined __HCC_CPP__
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block,
                                       grid_launch_parm* lp, const char* kernelNameStr);
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block,
                                       grid_launch_parm* lp, const char* kernelNameStr);
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block,
                                       grid_launch_parm* lp, const char* kernelNameStr);
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block,
                                       grid_launch_parm* lp, const char* kernelNameStr);
extern void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launch_parm& lp);

#if GENERIC_GRID_LAUNCH == 0
//#warning "Original hipLaunchKernel defined"
// Due to multiple overloaded versions of ihipPreLaunchKernel, the numBlocks3D and blockDim3D can be
// either size_t or dim3 types.
#define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
    do { \
        grid_launch_parm lp; \
        lp.dynamic_group_mem_bytes = _groupMemBytes; \
        hipStream_t trueStream = \
            (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp, #_kernelName)); \
        _kernelName(lp, ##__VA_ARGS__); \
        ihipPostLaunchKernel(#_kernelName, trueStream, lp); \
    } while (0)
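// Example (illustrative only; kernel and variable names below are hypothetical): launching
// through the classic hipLaunchKernel macro on the HCC path. The kernel receives the
// hipLaunchParm as its first argument; the remaining macro arguments are the kernel arguments.
//   // __global__ void scale(hipLaunchParm lp, float* x, float a, size_t n);
//   hipLaunchKernel(scale, dim3(numBlocks), dim3(256), 0 /*dynamic LDS*/, 0 /*stream*/,
//                   d_x, 3.0f, n);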
#endif  // GENERIC_GRID_LAUNCH

#elif defined(__HCC_C__)

// TODO - develop C interface.

#endif  // __HCC_CPP__

// extern int HIP_PRINT_ENV;       ///< Print all HIP-related environment variables.
// extern int HIP_TRACE_API;       ///< Trace HIP APIs.
// extern int HIP_LAUNCH_BLOCKING; ///< Make all HIP APIs host-synchronous.

// End doxygen API:
//
// hip-clang functions
//
#elif defined(__clang__) && defined(__HIP__)

#define HIP_KERNEL_NAME(...) __VA_ARGS__
#define HIP_SYMBOL(X) #X

typedef int hipLaunchParm;

#define hipLaunchKernel(kernelName, numblocks, numthreads, memperblock, streamId, ...) \
    do { \
        kernelName<<<numblocks, numthreads, memperblock, streamId>>>(0, ##__VA_ARGS__); \
    } while (0)

#define hipLaunchKernelGGL(kernelName, numblocks, numthreads, memperblock, streamId, ...) \
    do { \
        kernelName<<<numblocks, numthreads, memperblock, streamId>>>(__VA_ARGS__); \
    } while (0)
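// Example (illustrative only; kernel and variable names below are hypothetical): on the
// hip-clang path hipLaunchKernelGGL expands directly to a <<<...>>> launch, while
// hipLaunchKernel prepends a dummy hipLaunchParm argument of 0.
//   // __global__ void scale(float* x, float a, size_t n);
//   hipLaunchKernelGGL(scale, dim3(numBlocks), dim3(256), 0 /*dynamic LDS*/, 0 /*stream*/,
//                      d_x, 3.0f, n);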

#include <hip/hip_runtime_api.h>

#pragma push_macro("__DEVICE__")
#define __DEVICE__ static __device__ __forceinline__

extern "C" __device__ size_t __ockl_get_local_id(uint);
__DEVICE__ uint __hip_get_thread_idx_x() { return __ockl_get_local_id(0); }
__DEVICE__ uint __hip_get_thread_idx_y() { return __ockl_get_local_id(1); }
__DEVICE__ uint __hip_get_thread_idx_z() { return __ockl_get_local_id(2); }

extern "C" __device__ size_t __ockl_get_group_id(uint);
__DEVICE__ uint __hip_get_block_idx_x() { return __ockl_get_group_id(0); }
__DEVICE__ uint __hip_get_block_idx_y() { return __ockl_get_group_id(1); }
__DEVICE__ uint __hip_get_block_idx_z() { return __ockl_get_group_id(2); }

extern "C" __device__ size_t __ockl_get_local_size(uint);
__DEVICE__ uint __hip_get_block_dim_x() { return __ockl_get_local_size(0); }
__DEVICE__ uint __hip_get_block_dim_y() { return __ockl_get_local_size(1); }
__DEVICE__ uint __hip_get_block_dim_z() { return __ockl_get_local_size(2); }

extern "C" __device__ size_t __ockl_get_num_groups(uint);
__DEVICE__ uint __hip_get_grid_dim_x() { return __ockl_get_num_groups(0); }
__DEVICE__ uint __hip_get_grid_dim_y() { return __ockl_get_num_groups(1); }
__DEVICE__ uint __hip_get_grid_dim_z() { return __ockl_get_num_groups(2); }

#define __HIP_DEVICE_BUILTIN(DIMENSION, FUNCTION) \
    __declspec(property(get = __get_##DIMENSION)) uint DIMENSION; \
    __DEVICE__ uint __get_##DIMENSION(void) { \
        return FUNCTION; \
    }

struct __hip_builtin_threadIdx_t {
    __HIP_DEVICE_BUILTIN(x, __hip_get_thread_idx_x());
    __HIP_DEVICE_BUILTIN(y, __hip_get_thread_idx_y());
    __HIP_DEVICE_BUILTIN(z, __hip_get_thread_idx_z());
};

struct __hip_builtin_blockIdx_t {
    __HIP_DEVICE_BUILTIN(x, __hip_get_block_idx_x());
    __HIP_DEVICE_BUILTIN(y, __hip_get_block_idx_y());
    __HIP_DEVICE_BUILTIN(z, __hip_get_block_idx_z());
};

struct __hip_builtin_blockDim_t {
    __HIP_DEVICE_BUILTIN(x, __hip_get_block_dim_x());
    __HIP_DEVICE_BUILTIN(y, __hip_get_block_dim_y());
    __HIP_DEVICE_BUILTIN(z, __hip_get_block_dim_z());
};

struct __hip_builtin_gridDim_t {
    __HIP_DEVICE_BUILTIN(x, __hip_get_grid_dim_x());
    __HIP_DEVICE_BUILTIN(y, __hip_get_grid_dim_y());
    __HIP_DEVICE_BUILTIN(z, __hip_get_grid_dim_z());
};

#undef __HIP_DEVICE_BUILTIN
#pragma pop_macro("__DEVICE__")

extern const __device__ __attribute__((weak)) __hip_builtin_threadIdx_t threadIdx;
extern const __device__ __attribute__((weak)) __hip_builtin_blockIdx_t blockIdx;
extern const __device__ __attribute__((weak)) __hip_builtin_blockDim_t blockDim;
extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;
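// Example (illustrative only; the kernel name below is hypothetical): with the property-based
// builtins above, reading threadIdx.x in device code invokes __get_x(), which calls
// __ockl_get_local_id(0); blockIdx and blockDim resolve the same way.
//   __global__ void fill(unsigned* out) {
//       out[blockIdx.x * blockDim.x + threadIdx.x] = threadIdx.x;
//   }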


#define hipThreadIdx_x threadIdx.x
#define hipThreadIdx_y threadIdx.y
#define hipThreadIdx_z threadIdx.z

#define hipBlockIdx_x blockIdx.x
#define hipBlockIdx_y blockIdx.y
#define hipBlockIdx_z blockIdx.z

#define hipBlockDim_x blockDim.x
#define hipBlockDim_y blockDim.y
#define hipBlockDim_z blockDim.z

#define hipGridDim_x gridDim.x
#define hipGridDim_y gridDim.y
#define hipGridDim_z gridDim.z

#include <hip/hcc_detail/math_functions.h>

// Support std::complex.
#pragma push_macro("__CUDA__")
#define __CUDA__
#include <__clang_cuda_math_forward_declares.h>
#include <__clang_cuda_complex_builtins.h>
#include <cuda_wrappers/algorithm>
#include <cuda_wrappers/complex>
#undef __CUDA__
#pragma pop_macro("__CUDA__")


#endif  // defined(__clang__) && defined(__HIP__)

#include <hip/hcc_detail/hip_memory.h>

#endif  // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H