HIP: Heterogeneous-computing Interface for Portability
hip_runtime.h
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

//#pragma once
#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H
#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H

#include <hip/hcc_detail/hip_common.h>

//---
// Top part of file can be compiled with any compiler.

//#include <cstring>
#if __cplusplus
#include <cmath>
#include <cstdint>
#else
#include <math.h>
#include <string.h>
#include <stddef.h>
#endif  // __cplusplus

#if __HCC_OR_HIP_CLANG__

#if __HIP__
#if !defined(__align__)
#define __align__(x) __attribute__((aligned(x)))
#endif
#endif

#define CUDA_SUCCESS hipSuccess

#include <hip/hip_runtime_api.h>
#endif  // __HCC_OR_HIP_CLANG__

#if __HCC__
// define HIP_ENABLE_PRINTF to enable printf
#ifdef HIP_ENABLE_PRINTF
#define HCC_ENABLE_ACCELERATOR_PRINTF 1
#endif

//---
// Remainder of this file only compiles with HCC.
#if defined __HCC__
#include "grid_launch.h"
#include "hc_printf.hpp"
// TODO-HCC-GL - change this to typedef.
// typedef grid_launch_parm hipLaunchParm ;

#if GENERIC_GRID_LAUNCH == 0
#define hipLaunchParm grid_launch_parm
#else
namespace hip_impl {
struct Empty_launch_parm {};
}  // namespace hip_impl
#define hipLaunchParm hip_impl::Empty_launch_parm
#endif  // GENERIC_GRID_LAUNCH

#if defined(GRID_LAUNCH_VERSION) and (GRID_LAUNCH_VERSION >= 20) || GENERIC_GRID_LAUNCH == 1
#else  // Use field names for grid_launch 2.0 structure, if HCC supports GL 2.0.
#error(HCC must support GRID_LAUNCH_20)
#endif  // GRID_LAUNCH_VERSION

#endif  // HCC

#if GENERIC_GRID_LAUNCH == 1 && defined __HCC__
#include "grid_launch_GGL.hpp"
#endif  // GENERIC_GRID_LAUNCH

#endif  // HCC

#if __HCC_OR_HIP_CLANG__
extern int HIP_TRACE_API;

#ifdef __cplusplus
#include <hip/hcc_detail/hip_ldg.h>
#endif
#include <hip/hcc_detail/hip_atomic.h>
#include <hip/hcc_detail/device_functions.h>
#include <hip/hcc_detail/surface_functions.h>
#include <hip/hcc_detail/texture_functions.h>
#if __HCC__
#include <hip/hcc_detail/math_functions.h>
#endif
// TODO-HCC remove old definitions; ~1602 hcc supports the __HCC_ACCELERATOR__ define.
#if defined(__KALMAR_ACCELERATOR__) && !defined(__HCC_ACCELERATOR__)
#define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
#endif

// TODO-HCC add a dummy implementation of assert, need to replace with a proper kernel exit call.
#if __HIP_DEVICE_COMPILE__ == 1
#undef assert
#define assert(COND)   \
    {                  \
        if (!(COND)) { \
            abort();   \
        }              \
    }
#endif
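
// Example (illustrative sketch, not part of the original header): with the
// definition above, a failed device-side assert simply calls abort(). The
// kernel and names below are hypothetical.
#if 0
__global__ void checkedCopy(const int* in, int* out, int n) {
    assert(n > 0);  // expands to { if (!(n > 0)) { abort(); } } in device compilation
    int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    if (i < n) out[i] = in[i];
}
#endif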


// Feature tests:
#if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__
// Device compile and not host compile:

// 32-bit Atomics:
#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1)

// 64-bit Atomics:
#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)

// Doubles
#define __HIP_ARCH_HAS_DOUBLES__ (1)

// warp cross-lane operations:
#define __HIP_ARCH_HAS_WARP_VOTE__ (1)
#define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)

// sync
#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (1)
#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)

// misc
#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
#define __HIP_ARCH_HAS_3DGRID__ (1)
#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)

#endif /* Device feature flags */
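
// Example (illustrative sketch, not part of the original header): device code
// can test these feature flags to pick an implementation at compile time. The
// function below is a hypothetical illustration.
#if 0
__device__ float accumulate(float* addr, float val) {
#if __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__
    return atomicAdd(addr, val);  // hardware float atomic add is available
#else
    return *addr + val;           // fallback path (not atomic)
#endif
}
#endif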


#define launch_bounds_impl0(requiredMaxThreadsPerBlock)                             \
    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock),      \
                   amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
#define select_impl_(_1, _2, impl_, ...) impl_
#define __launch_bounds__(...) \
    select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
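
// Example (illustrative sketch, not part of the original header): the variadic
// dispatch above lets __launch_bounds__ take either one or two arguments, as in
// CUDA; on AMD it lowers to amdgpu_flat_work_group_size / amdgpu_waves_per_eu.
// The kernel declarations below are hypothetical.
#if 0
__global__ void __launch_bounds__(256) boundedKernel(float* p);
__global__ void __launch_bounds__(256, 2) occupancyHintKernel(float* p);
#endif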

// Detect if we are compiling C++ mode or C mode
#if defined(__cplusplus)
#define __HCC_CPP__
#elif defined(__STDC_VERSION__)
#define __HCC_C__
#endif

__host__ inline void* __get_dynamicgroupbaseptr() { return nullptr; }

#if __HIP_ARCH_GFX701__ == 0

__device__ unsigned __hip_ds_bpermute(int index, unsigned src);
__device__ float __hip_ds_bpermutef(int index, float src);
__device__ unsigned __hip_ds_permute(int index, unsigned src);
__device__ float __hip_ds_permutef(int index, float src);

template <int pattern>
__device__ unsigned __hip_ds_swizzle_N(unsigned int src);
template <int pattern>
__device__ float __hip_ds_swizzlef_N(float src);

template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
__device__ int __hip_move_dpp_N(int src);

#endif  // __HIP_ARCH_GFX701__ == 0

#endif  // __HCC_OR_HIP_CLANG__

#if defined __HCC__

namespace hip_impl {
    struct GroupId {
        using R = decltype(hc_get_group_id(0));

        __device__
        R operator()(std::uint32_t x) const noexcept { return hc_get_group_id(x); }
    };
    struct GroupSize {
        using R = decltype(hc_get_group_size(0));

        __device__
        R operator()(std::uint32_t x) const noexcept {
            return hc_get_group_size(x);
        }
    };
    struct NumGroups {
        using R = decltype(hc_get_num_groups(0));

        __device__
        R operator()(std::uint32_t x) const noexcept {
            return hc_get_num_groups(x);
        }
    };
    struct WorkitemId {
        using R = decltype(hc_get_workitem_id(0));

        __device__
        R operator()(std::uint32_t x) const noexcept {
            return hc_get_workitem_id(x);
        }
    };
}  // namespace hip_impl

template <typename F>
struct Coordinates {
    using R = decltype(F{}(0));

    struct X { __device__ operator R() const noexcept { return F{}(0); } };
    struct Y { __device__ operator R() const noexcept { return F{}(1); } };
    struct Z { __device__ operator R() const noexcept { return F{}(2); } };

    static constexpr X x{};
    static constexpr Y y{};
    static constexpr Z z{};
};

inline
__device__
std::uint32_t operator*(Coordinates<hip_impl::NumGroups>::X,
                        Coordinates<hip_impl::GroupSize>::X) noexcept {
    return hc_get_grid_size(0);
}
inline
__device__
std::uint32_t operator*(Coordinates<hip_impl::GroupSize>::X,
                        Coordinates<hip_impl::NumGroups>::X) noexcept {
    return hc_get_grid_size(0);
}
inline
__device__
std::uint32_t operator*(Coordinates<hip_impl::NumGroups>::Y,
                        Coordinates<hip_impl::GroupSize>::Y) noexcept {
    return hc_get_grid_size(1);
}
inline
__device__
std::uint32_t operator*(Coordinates<hip_impl::GroupSize>::Y,
                        Coordinates<hip_impl::NumGroups>::Y) noexcept {
    return hc_get_grid_size(1);
}
inline
__device__
std::uint32_t operator*(Coordinates<hip_impl::NumGroups>::Z,
                        Coordinates<hip_impl::GroupSize>::Z) noexcept {
    return hc_get_grid_size(2);
}
inline
__device__
std::uint32_t operator*(Coordinates<hip_impl::GroupSize>::Z,
                        Coordinates<hip_impl::NumGroups>::Z) noexcept {
    return hc_get_grid_size(2);
}

static constexpr Coordinates<hip_impl::GroupSize> blockDim{};
static constexpr Coordinates<hip_impl::GroupId> blockIdx{};
static constexpr Coordinates<hip_impl::NumGroups> gridDim{};
static constexpr Coordinates<hip_impl::WorkitemId> threadIdx{};
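
// Example (illustrative sketch, not part of the original header): the operator*
// overloads above let the common per-dimension "total threads" product collapse
// into a single hc_get_grid_size() call, in either operand order.
#if 0
__device__ std::uint32_t flatThreadsX() {
    return gridDim.x * blockDim.x;  // lowered to hc_get_grid_size(0)
}
#endif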

#define hipThreadIdx_x (hc_get_workitem_id(0))
#define hipThreadIdx_y (hc_get_workitem_id(1))
#define hipThreadIdx_z (hc_get_workitem_id(2))

#define hipBlockIdx_x (hc_get_group_id(0))
#define hipBlockIdx_y (hc_get_group_id(1))
#define hipBlockIdx_z (hc_get_group_id(2))

#define hipBlockDim_x (hc_get_group_size(0))
#define hipBlockDim_y (hc_get_group_size(1))
#define hipBlockDim_z (hc_get_group_size(2))

#define hipGridDim_x (hc_get_num_groups(0))
#define hipGridDim_y (hc_get_num_groups(1))
#define hipGridDim_z (hc_get_num_groups(2))
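
// Example (illustrative sketch, not part of the original header): the hip*_x/y/z
// macros give the usual flat global index. The kernel below is a hypothetical
// illustration (it assumes the legacy hipLaunchParm first parameter used on the
// HCC path).
#if 0
__global__ void scaleArray(hipLaunchParm lp, float* data, float factor, int n) {
    int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    if (i < n) data[i] *= factor;
}
#endif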

#endif  // defined __HCC__
#if __HCC_OR_HIP_CLANG__
extern "C" __device__ void* __hip_malloc(size_t);
extern "C" __device__ void* __hip_free(void* ptr);

static inline __device__ void* malloc(size_t size) { return __hip_malloc(size); }
static inline __device__ void* free(void* ptr) { return __hip_free(ptr); }

#if defined(__HCC_ACCELERATOR__) && defined(HC_FEATURE_PRINTF)
template <typename... All>
static inline __device__ void printf(const char* format, All... all) {
    hc::printf(format, all...);
}
#elif defined(__HCC_ACCELERATOR__) || __HIP__
template <typename... All>
static inline __device__ void printf(const char* format, All... all) {}
#endif
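
// Example (illustrative sketch, not part of the original header): device code can
// call the malloc/free and printf shims declared above; note that printf is a
// no-op unless accelerator printf support is enabled. Hypothetical kernel:
#if 0
__global__ void scratchKernel() {
    int* scratch = (int*)malloc(4 * sizeof(int));  // routed to __hip_malloc
    if (scratch) {
        scratch[0] = 42;
        printf("scratch[0] = %d\n", scratch[0]);
        free(scratch);                             // routed to __hip_free
    }
}
#endif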

#endif  // __HCC_OR_HIP_CLANG__

#ifdef __HCC__

#define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)

#define HIP_KERNEL_NAME(...) (__VA_ARGS__)
#define HIP_SYMBOL(X) #X

#if defined __HCC_CPP__
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block,
                                       grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0);
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block,
                                       grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0);
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block,
                                       grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0);
extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block,
                                       grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0);
extern void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launch_parm& lp, bool unlockPostponed = 0);

#if GENERIC_GRID_LAUNCH == 0
//#warning "Original hipLaunchKernel defined"
// Due to multiple overloaded versions of ihipPreLaunchKernel, the numBlocks3D and blockDim3D can
// be either size_t or dim3 types.
#define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...)  \
    do {                                                                                       \
        grid_launch_parm lp;                                                                   \
        lp.dynamic_group_mem_bytes = _groupMemBytes;                                           \
        hipStream_t trueStream =                                                               \
            (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp, #_kernelName));      \
        _kernelName(lp, ##__VA_ARGS__);                                                        \
        ihipPostLaunchKernel(#_kernelName, trueStream, lp);                                    \
    } while (0)
#endif  // GENERIC_GRID_LAUNCH
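
// Example (illustrative sketch, not part of the original header): with the macro
// above, a kernel whose first parameter is hipLaunchParm can be launched as shown
// below. Kernel and argument names are hypothetical.
#if 0
__global__ void vecAdd(hipLaunchParm lp, const float* a, const float* b, float* c, int n);

void launchVecAdd(const float* a, const float* b, float* c, int n, hipStream_t stream) {
    hipLaunchKernel(vecAdd, dim3(n / 256), dim3(256), 0, stream, a, b, c, n);
}
#endif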

#elif defined(__HCC_C__)

// TODO - develop C interface.

#endif  // __HCC_CPP__

// extern int HIP_PRINT_ENV;       ///< Print all HIP-related environment variables.
// extern int HIP_TRACE_API;       ///< Trace HIP APIs.
// extern int HIP_LAUNCH_BLOCKING; ///< Make all HIP APIs host-synchronous.

// End doxygen API:
//
// hip-clang functions
//
#elif defined(__clang__) && defined(__HIP__)

#define HIP_KERNEL_NAME(...) __VA_ARGS__
#define HIP_SYMBOL(X) #X

typedef int hipLaunchParm;

#define hipLaunchKernelGGL(kernelName, numblocks, numthreads, memperblock, streamId, ...)  \
    do {                                                                                   \
        kernelName<<<(numblocks), (numthreads), (memperblock), (streamId)>>>(__VA_ARGS__); \
    } while (0)

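// Example (illustrative sketch, not part of the original header): under hip-clang
// the macro above simply forwards to the triple-chevron launch syntax. Kernel and
// argument names are hypothetical.
#if 0
__global__ void vecAdd(const float* a, const float* b, float* c, int n);

void launchVecAdd(const float* a, const float* b, float* c, int n, hipStream_t stream) {
    hipLaunchKernelGGL(vecAdd, dim3(n / 256), dim3(256), 0, stream, a, b, c, n);
    // equivalent to: vecAdd<<<dim3(n / 256), dim3(256), 0, stream>>>(a, b, c, n);
}
#endif
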
#include <hip/hip_runtime_api.h>

#pragma push_macro("__DEVICE__")
#define __DEVICE__ static __device__ __forceinline__

extern "C" __device__ size_t __ockl_get_local_id(uint);
__DEVICE__ uint __hip_get_thread_idx_x() { return __ockl_get_local_id(0); }
__DEVICE__ uint __hip_get_thread_idx_y() { return __ockl_get_local_id(1); }
__DEVICE__ uint __hip_get_thread_idx_z() { return __ockl_get_local_id(2); }

extern "C" __device__ size_t __ockl_get_group_id(uint);
__DEVICE__ uint __hip_get_block_idx_x() { return __ockl_get_group_id(0); }
__DEVICE__ uint __hip_get_block_idx_y() { return __ockl_get_group_id(1); }
__DEVICE__ uint __hip_get_block_idx_z() { return __ockl_get_group_id(2); }

extern "C" __device__ size_t __ockl_get_local_size(uint);
__DEVICE__ uint __hip_get_block_dim_x() { return __ockl_get_local_size(0); }
__DEVICE__ uint __hip_get_block_dim_y() { return __ockl_get_local_size(1); }
__DEVICE__ uint __hip_get_block_dim_z() { return __ockl_get_local_size(2); }

extern "C" __device__ size_t __ockl_get_num_groups(uint);
__DEVICE__ uint __hip_get_grid_dim_x() { return __ockl_get_num_groups(0); }
__DEVICE__ uint __hip_get_grid_dim_y() { return __ockl_get_num_groups(1); }
__DEVICE__ uint __hip_get_grid_dim_z() { return __ockl_get_num_groups(2); }

#define __HIP_DEVICE_BUILTIN(DIMENSION, FUNCTION)                 \
    __declspec(property(get = __get_##DIMENSION)) uint DIMENSION; \
    __DEVICE__ uint __get_##DIMENSION(void) {                     \
        return FUNCTION;                                          \
    }

struct __hip_builtin_threadIdx_t {
    __HIP_DEVICE_BUILTIN(x, __hip_get_thread_idx_x());
    __HIP_DEVICE_BUILTIN(y, __hip_get_thread_idx_y());
    __HIP_DEVICE_BUILTIN(z, __hip_get_thread_idx_z());
};

struct __hip_builtin_blockIdx_t {
    __HIP_DEVICE_BUILTIN(x, __hip_get_block_idx_x());
    __HIP_DEVICE_BUILTIN(y, __hip_get_block_idx_y());
    __HIP_DEVICE_BUILTIN(z, __hip_get_block_idx_z());
};

struct __hip_builtin_blockDim_t {
    __HIP_DEVICE_BUILTIN(x, __hip_get_block_dim_x());
    __HIP_DEVICE_BUILTIN(y, __hip_get_block_dim_y());
    __HIP_DEVICE_BUILTIN(z, __hip_get_block_dim_z());
};

struct __hip_builtin_gridDim_t {
    __HIP_DEVICE_BUILTIN(x, __hip_get_grid_dim_x());
    __HIP_DEVICE_BUILTIN(y, __hip_get_grid_dim_y());
    __HIP_DEVICE_BUILTIN(z, __hip_get_grid_dim_z());
};

#undef __HIP_DEVICE_BUILTIN
#pragma pop_macro("__DEVICE__")

extern const __device__ __attribute__((weak)) __hip_builtin_threadIdx_t threadIdx;
extern const __device__ __attribute__((weak)) __hip_builtin_blockIdx_t blockIdx;
extern const __device__ __attribute__((weak)) __hip_builtin_blockDim_t blockDim;
extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;
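
// Example (illustrative sketch, not part of the original header): the property
// wrappers above make threadIdx, blockIdx, blockDim, and gridDim behave like
// CUDA's built-in variables in device code. Hypothetical kernel:
#if 0
__global__ void iota(int* out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;  // each .x access invokes an __ockl_* getter
    if (i < n) out[i] = i;
}
#endif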


#define hipThreadIdx_x threadIdx.x
#define hipThreadIdx_y threadIdx.y
#define hipThreadIdx_z threadIdx.z

#define hipBlockIdx_x blockIdx.x
#define hipBlockIdx_y blockIdx.y
#define hipBlockIdx_z blockIdx.z

#define hipBlockDim_x blockDim.x
#define hipBlockDim_y blockDim.y
#define hipBlockDim_z blockDim.z

#define hipGridDim_x gridDim.x
#define hipGridDim_y gridDim.y
#define hipGridDim_z gridDim.z

#include <hip/hcc_detail/math_functions.h>

#if __HIP_HCC_COMPAT_MODE__
// Define HCC work item functions in terms of HIP builtin variables.
#pragma push_macro("__DEFINE_HCC_FUNC")
#define __DEFINE_HCC_FUNC(hc_fun, hip_var)                                           \
    inline __device__ __attribute__((always_inline)) uint hc_get_##hc_fun(uint i) {  \
        if (i == 0)                                                                  \
            return hip_var.x;                                                        \
        else if (i == 1)                                                             \
            return hip_var.y;                                                        \
        else                                                                         \
            return hip_var.z;                                                        \
    }

__DEFINE_HCC_FUNC(workitem_id, threadIdx)
__DEFINE_HCC_FUNC(group_id, blockIdx)
__DEFINE_HCC_FUNC(group_size, blockDim)
__DEFINE_HCC_FUNC(num_groups, gridDim)
#pragma pop_macro("__DEFINE_HCC_FUNC")

extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_id(uint);
inline __device__ __attribute__((always_inline)) uint
hc_get_workitem_absolute_id(int dim)
{
    return (uint)__ockl_get_global_id(dim);
}

#endif
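
// Example (illustrative sketch, not part of the original header): in HCC
// compatibility mode the legacy hc_get_* entry points defined above resolve to
// the HIP builtins, e.g. hc_get_workitem_id(0) yields the same value as threadIdx.x.
#if 0
__device__ uint legacyThreadIdxX() { return hc_get_workitem_id(0); }
#endif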

// Support std::complex.
#if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
#pragma push_macro("__CUDA__")
#define __CUDA__
#include <__clang_cuda_math_forward_declares.h>
#include <__clang_cuda_complex_builtins.h>
#include <cuda_wrappers/algorithm>
#include <cuda_wrappers/complex>
#include <cuda_wrappers/new>
#undef __CUDA__
#pragma pop_macro("__CUDA__")
#endif  // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
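
// Example (illustrative sketch, not part of the original header): with the CUDA
// wrapper headers pulled in above, std::complex arithmetic becomes usable in
// device code. Hypothetical kernel:
#if 0
#include <complex>
__global__ void complexMul(const std::complex<float>* a, const std::complex<float>* b,
                           std::complex<float>* out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = a[i] * b[i];
}
#endif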

#endif  // defined(__clang__) && defined(__HIP__)

#include <hip/hcc_detail/hip_memory.h>

#endif  // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H