HIP: Heterogenous-computing Interface for Portability
functional_grid_launch.hpp
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
23 #pragma once
24 
25 #include "concepts.hpp"
26 #include "helpers.hpp"
27 #include "program_state.hpp"
28 #include "hip_runtime_api.h"
29 
30 #include <cstdint>
31 #include <cstring>
32 #include <stdexcept>
33 #include <tuple>
34 #include <type_traits>
35 #include <utility>
36 
37 hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
38  unsigned int flags, hip_impl::program_state& ps);
39 
40 hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
41  dim3 blockDim, void** args,
42  size_t sharedMem, hipStream_t stream,
44 
45 hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
46  int numDevices,
47  unsigned int flags,
49 
50 #pragma GCC visibility push(hidden)
51 
52 namespace hip_impl {
53 template <typename T, typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
54 inline T round_up_to_next_multiple_nonnegative(T x, T y) {
55  T tmp = x + y - 1;
56  return tmp - tmp % y;
57 }
58 
59 template <
60  std::size_t n,
61  typename... Ts,
62  typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
63 inline hip_impl::kernarg make_kernarg(
64  const std::tuple<Ts...>&,
65  const kernargs_size_align&,
66  hip_impl::kernarg kernarg) {
67  return kernarg;
68 }
69 
70 template <
71  std::size_t n,
72  typename... Ts,
73  typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
74 inline hip_impl::kernarg make_kernarg(
75  const std::tuple<Ts...>& formals,
76  const kernargs_size_align& size_align,
77  hip_impl::kernarg kernarg) {
78  using T = typename std::tuple_element<n, std::tuple<Ts...>>::type;
79 
80  static_assert(
81  !std::is_reference<T>{},
82  "A __global__ function cannot have a reference as one of its "
83  "arguments.");
84  #if defined(HIP_STRICT)
85  static_assert(
86  std::is_trivially_copyable<T>{},
87  "Only TriviallyCopyable types can be arguments to a __global__ "
88  "function");
89  #endif
90 
91  kernarg.resize(round_up_to_next_multiple_nonnegative(
92  kernarg.size(), size_align.alignment(n)) + size_align.size(n));
93 
94  std::memcpy(
95  kernarg.data() + kernarg.size() - size_align.size(n),
96  &std::get<n>(formals),
97  size_align.size(n));
98  return make_kernarg<n + 1>(formals, size_align, std::move(kernarg));
99 }
100 
101 template <typename... Formals, typename... Actuals>
102 inline hip_impl::kernarg make_kernarg(
103  void (*kernel)(Formals...), std::tuple<Actuals...> actuals) {
104  static_assert(sizeof...(Formals) == sizeof...(Actuals),
105  "The count of formal arguments must match the count of actuals.");
106 
107  if (sizeof...(Formals) == 0) return {};
108 
109  std::tuple<Formals...> to_formals{std::move(actuals)};
110  hip_impl::kernarg kernarg;
111  kernarg.reserve(sizeof(to_formals));
112 
113  auto& ps = hip_impl::get_program_state();
114  return make_kernarg<0>(to_formals,
115  ps.get_kernargs_size_align(
116  reinterpret_cast<std::uintptr_t>(kernel)),
117  std::move(kernarg));
118 }
119 
120 
121 HIP_INTERNAL_EXPORTED_API hsa_agent_t target_agent(hipStream_t stream);
122 
123 inline
124 __attribute__((visibility("hidden")))
125 void hipLaunchKernelGGLImpl(
126  std::uintptr_t function_address,
127  const dim3& numBlocks,
128  const dim3& dimBlocks,
129  std::uint32_t sharedMemBytes,
130  hipStream_t stream,
131  void** kernarg) {
132 
133  const auto& kd = hip_impl::get_program_state().kernel_descriptor(function_address,
134  target_agent(stream));
135 
136  hipModuleLaunchKernel(kd, numBlocks.x, numBlocks.y, numBlocks.z,
137  dimBlocks.x, dimBlocks.y, dimBlocks.z, sharedMemBytes,
138  stream, nullptr, kernarg);
139 }
140 } // Namespace hip_impl.
141 
142 
143 template <class T>
144 inline
145 hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
146  T kernel, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0) {
147 
148  using namespace hip_impl;
149 
150  hip_impl::hip_init();
151  auto f = get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(kernel),
152  target_agent(0));
153 
154  return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, f,
155  dynSharedMemPerBlk, blockSizeLimit);
156 }
157 
158 template <class T>
159 inline
160 hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize,
161  T kernel, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0, unsigned int flags = 0 ) {
162 
163  using namespace hip_impl;
164 
165  hip_impl::hip_init();
166  if(flags != hipOccupancyDefault) return hipErrorNotSupported;
167  auto f = get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(kernel),
168  target_agent(0));
169 
170  return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, f,
171  dynSharedMemPerBlk, blockSizeLimit);
172 }
173 
174 template <typename... Args, typename F = void (*)(Args...)>
175 inline
176 void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
177  std::uint32_t sharedMemBytes, hipStream_t stream,
178  Args... args) {
179  hip_impl::hip_init();
180  auto kernarg = hip_impl::make_kernarg(kernel, std::tuple<Args...>{std::move(args)...});
181  std::size_t kernarg_size = kernarg.size();
182 
183  void* config[]{
184  HIP_LAUNCH_PARAM_BUFFER_POINTER,
185  kernarg.data(),
186  HIP_LAUNCH_PARAM_BUFFER_SIZE,
187  &kernarg_size,
188  HIP_LAUNCH_PARAM_END};
189 
190  hip_impl::hipLaunchKernelGGLImpl(reinterpret_cast<std::uintptr_t>(kernel),
191  numBlocks, dimBlocks, sharedMemBytes,
192  stream, &config[0]);
193 }
194 
195 template <typename F>
196 inline
197 __attribute__((visibility("hidden")))
198 hipError_t hipLaunchCooperativeKernel(F f, dim3 gridDim, dim3 blockDim,
199  void** args, size_t sharedMem,
200  hipStream_t stream) {
201  hip_impl::hip_init();
202  auto& ps = hip_impl::get_program_state();
203  return hipLaunchCooperativeKernel(reinterpret_cast<void*>(f), gridDim,
204  blockDim, args, sharedMem, stream, ps);
205 }
206 
207 inline
208 __attribute__((visibility("hidden")))
209 hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
210  int numDevices,
211  unsigned int flags) {
212 
213  hip_impl::hip_init();
214  auto& ps = hip_impl::get_program_state();
215  return hipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags, ps);
216 }
217 
218 #pragma GCC visibility pop
hipModuleOccupancyMaxPotentialBlockSize
hipError_t hipModuleOccupancyMaxPotentialBlockSize(int *gridSize, int *blockSize, hipFunction_t f, size_t dynSharedMemPerBlk, int blockSizeLimit)
determine the grid and block sizes to achieves maximum occupancy for a kernel
hipLaunchCooperativeKernelMultiDevice
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams *launchParamsList, int numDevices, unsigned int flags)
Launches kernels on multiple devices where thread blocks can cooperate and synchronize as they execut...
hipLaunchCooperativeKernel
hipError_t hipLaunchCooperativeKernel(const void *f, dim3 gridDim, dim3 blockDimX, void **kernelParams, unsigned int sharedMemBytes, hipStream_t stream)
launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelp...
hip_impl::kernarg
Definition: program_state.hpp:48
dim3
Definition: hip_runtime_api.h:318
hip_impl::program_state
Definition: program_state.hpp:63
hipLaunchParams_t
Definition: hip_runtime_api.h:327
hipModuleLaunchKernel
hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, hipStream_t stream, void **kernelParams, void **extra)
launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelp...
hipErrorNotSupported
hipErrorNotSupported
Produced when the hip API is not supported/implemented.
Definition: hip_runtime_api.h:282
hipOccupancyMaxPotentialBlockSize
hipError_t hipOccupancyMaxPotentialBlockSize(int *gridSize, int *blockSize, const void *f, size_t dynSharedMemPerBlk, int blockSizeLimit)
determine the grid and block sizes to achieves maximum occupancy for a kernel