HIP: Heterogeneous-Computing Interface for Portability
hip_hcc_internal.h
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
23 #ifndef HIP_SRC_HIP_HCC_INTERNAL_H
24 #define HIP_SRC_HIP_HCC_INTERNAL_H
25 
26 #include <hc.hpp>
27 #include <hsa/hsa.h>
28 #include <unordered_map>
29 #include <stack>
30 
31 #include "hsa/hsa_ext_amd.h"
32 #include "hip/hip_runtime.h"
33 #include "hip_prof_api.h"
34 #include "hip_util.h"
35 #include "env.h"
36 
37 
38 #if (__hcc_workweek__ < 16354)
39 #error("This version of HIP requires a newer version of HCC.");
40 #endif
41 
42 // Use the __appPtr field in the am memtracker to store the context.
43 // Requires a bug fix in HCC
44 #if defined(__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE) and \
45  (__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE != 0)
46 #define USE_APP_PTR_FOR_CTX 1
47 #endif
48 
49 
50 #define USE_IPC 1
51 
52 //---
53 // Environment variables:
54 
55 // Intended to distinguish whether an environment variable should be visible only in debug mode, or
56 // in debug+release.
57 // static const int debug = 0;
58 extern const int release;
59 
60 // TODO - this blocks both kernels and memory ops. Perhaps should have separate env var for
61 // kernels?
62 extern int HIP_LAUNCH_BLOCKING;
63 extern int HIP_API_BLOCKING;
64 
65 extern int HIP_PRINT_ENV;
66 extern int HIP_PROFILE_API;
67 // extern int HIP_TRACE_API;
68 extern int HIP_ATP;
69 extern int HIP_DB;
70 extern int HIP_STAGING_SIZE; /* size of staging buffers, in KB */
71 extern int HIP_STREAM_SIGNALS; /* number of signals to allocate at stream creation */
72 extern int HIP_VISIBLE_DEVICES; /* Contains a comma-separated sequence of GPU identifiers */
73 extern int HIP_FORCE_P2P_HOST;
74 
75 extern int HIP_HOST_COHERENT;
76 
77 extern int HIP_HIDDEN_FREE_MEM;
78 //---
79 // Chicken bits for disabling functionality to work around potential issues:
80 extern int HIP_SYNC_HOST_ALLOC;
81 extern int HIP_SYNC_STREAM_WAIT;
82 
83 extern int HIP_SYNC_NULL_STREAM;
84 extern int HIP_INIT_ALLOC;
85 extern int HIP_FORCE_NULL_STREAM;
86 
87 extern int HIP_SYNC_FREE;
88 
89 extern int HIP_DUMP_CODE_OBJECT;
90 
91 // TODO - remove when this is standard behavior.
92 extern int HCC_OPT_FLUSH;
93 
94 #define IMAGE_PITCH_ALIGNMENT 256
95 template <typename T> inline T alignDown(T value, size_t alignment) {
96  return (T)(value & ~(alignment - 1));
97 }
98 
99 template <typename T> inline T* alignDown(T* value, size_t alignment) {
100  return (T*)alignDown((intptr_t)value, alignment);
101 }
102 
103 template <typename T> inline T alignUp(T value, size_t alignment) {
104  return alignDown((T)(value + alignment - 1), alignment);
105 }
106 
107 template <typename T> inline T* alignUp(T* value, size_t alignment) {
108 return (T*)alignDown((intptr_t)value + alignment - 1, alignment);  // cast first so the offset is in bytes, not sizeof(T) units
109 }
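// Editor's note (illustrative, not from the original header): the helpers above assume 'alignment'
// is a power of two, since they mask with ~(alignment - 1). With IMAGE_PITCH_ALIGNMENT == 256:
//   alignUp(1000, 256)   == 1024
//   alignDown(1000, 256) == 768
// e.g. a row pitch could be computed as alignUp(width * elementSize, IMAGE_PITCH_ALIGNMENT),
// where 'width' and 'elementSize' are hypothetical caller-provided values.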
110 
111 inline size_t getNumChannels(hsa_ext_image_channel_order_t channelOrder) {
112  switch (channelOrder) {
113  case HSA_EXT_IMAGE_CHANNEL_ORDER_RG:
114  return 2;
115  case HSA_EXT_IMAGE_CHANNEL_ORDER_RGB:
116  return 3;
117  case HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA:
118  return 4;
119  case HSA_EXT_IMAGE_CHANNEL_ORDER_R:
120  default:
121  return 1;
122  }
123 }
124 
125 inline size_t getElementSize(hsa_ext_image_channel_order_t channelOrder, hsa_ext_image_channel_type_t channelType) {
126  size_t bytesPerPixel = getNumChannels(channelOrder);
127  switch (channelType) {
128  case HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8:
129  case HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8:
130  break;
131 
132  case HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32:
133  case HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32:
134  case HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT:
135  bytesPerPixel *= 4;
136  break;
137 
138  default:
139  bytesPerPixel *= 2;
140  break;
141  }
142  return bytesPerPixel;
143 }
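// Editor's note (illustrative): the element size is channels * bytes-per-channel, e.g.
//   getElementSize(HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA, HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT)         == 16
//   getElementSize(HSA_EXT_IMAGE_CHANNEL_ORDER_R,    HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8) == 1
// 8-bit channel types use 1 byte per channel, 32-bit types use 4, and all remaining types fall
// through to 2 bytes per channel.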
144 
145 // Class to assign a short TID to each new thread, for HIP debugging purposes.
146 class TidInfo {
147  public:
148  TidInfo();
149 
150  int tid() const { return _shortTid; };
151  pid_t pid() const { return _pid; };
152  uint64_t incApiSeqNum() { return ++_apiSeqNum; };
153  uint64_t apiSeqNum() const { return _apiSeqNum; };
154 
155  private:
156  int _shortTid;
157  pid_t _pid;
158 
159 // monotonically increasing API sequence number for this thread.
160  uint64_t _apiSeqNum;
161 };
162 
163 struct ProfTrigger {
164  static const uint64_t MAX_TRIGGER = std::numeric_limits<uint64_t>::max();
165 
166  void print(int tid) {
167  std::cout << "Enabling tracing for ";
168  for (auto iter = _profTrigger.begin(); iter != _profTrigger.end(); iter++) {
169  std::cout << "tid:" << tid << "." << *iter << ",";
170  }
171  std::cout << "\n";
172  };
173 
174  uint64_t nextTrigger() { return _profTrigger.empty() ? MAX_TRIGGER : _profTrigger.back(); };
175  void add(uint64_t trigger) { _profTrigger.push_back(trigger); };
176 void sort() { std::sort(_profTrigger.begin(), _profTrigger.end(), std::greater<uint64_t>()); };
177 
178  private:
179  std::vector<uint64_t> _profTrigger;
180 };
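// Editor's sketch (illustrative, not from the original header): triggers are API sequence numbers;
// they are added, then sorted in descending order so that nextTrigger() (which reads back())
// returns the smallest remaining trigger:
//   ProfTrigger t;
//   t.add(100); t.add(50);            // trigger tracing at API calls #50 and #100
//   t.sort();                         // descending order, so back() == 50
//   uint64_t next = t.nextTrigger();  // == 50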
181 
182 
183 //---
184 // Extern TLS
185 // Use a single struct to hold all TLS data. Attempt to reduce TLS accesses.
186 struct TlsData {
187  explicit TlsData() {
188  lastHipError = hipSuccess;
189  getPrimaryCtx = true;
190  defaultCtx = nullptr;
191  }
192 
193  hipError_t lastHipError;
194  TidInfo tidInfo;
195  // This is the implicit context used by all HIP commands.
196  // It can be set by hipSetDevice or by the CTX manipulation commands:
197  ihipCtx_t* defaultCtx;
198  // Stack of contexts
199  std::stack<ihipCtx_t*> ctxStack;
200  bool getPrimaryCtx;
201 };
202 TlsData* tls_get_ptr();
203 #define GET_TLS() TlsData *tls = tls_get_ptr()
204 
205 extern std::vector<ProfTrigger> g_dbStartTriggers;
206 extern std::vector<ProfTrigger> g_dbStopTriggers;
207 
208 //---
209 // Forward defs:
210 class ihipStream_t;
211 class ihipDevice_t;
212 class ihipCtx_t;
213 struct ihipEventData_t;
214 
215 // Color defs for debug messages:
216 #define KNRM "\x1B[0m"
217 #define KRED "\x1B[31m"
218 #define KGRN "\x1B[32m"
219 #define KYEL "\x1B[33m"
220 #define KBLU "\x1B[34m"
221 #define KMAG "\x1B[35m"
222 #define KCYN "\x1B[36m"
223 #define KWHT "\x1B[37m"
224 
225 extern const char* API_COLOR;
226 extern const char* API_COLOR_END;
227 
228 
229 // If set, thread-safety is enforced on all event/stream/ctx/device functions.
230 // Can disable for performance or functional experiments - in this case
231 // the code uses a dummy "no-op" mutex.
232 #define EVENT_THREAD_SAFE 1
233 
234 #define STREAM_THREAD_SAFE 1
235 
236 #define CTX_THREAD_SAFE 1
237 
238 #define DEVICE_THREAD_SAFE 1
239 
240 
241 // Compile debug trace mode - this prints debug messages to stderr when env var HIP_DB is set.
242 // May be set to 0 to remove the debug "if" checks - possible code size and performance difference?
243 #define COMPILE_HIP_DB 1
244 
245 
246 // Compile HIP tracing capability.
247 // 0x1 = print a string at function entry with arguments.
248 // 0x2 = print a simple message with function name + return code when the function exits.
249 // 0x3 = print both.
250 // Must be enabled at runtime with HIP_TRACE_API
251 #define COMPILE_HIP_TRACE_API 0x3
252 
253 
254 // Compile code that generates trace markers for CodeXL ATP at HIP function begin/end.
255 // ATP is standard CodeXL format that includes timestamps for kernels, HSA RT APIs, and HIP APIs.
256 #ifndef COMPILE_HIP_ATP_MARKER
257 #define COMPILE_HIP_ATP_MARKER 0
258 #endif
259 
260 
261 // Compile support for trace markers that are displayed on CodeXL GUI at start/stop of each function
262 // boundary.
263 // TODO - currently we print the trace message at the beginning. If we waited, we could also
264 // include return codes, and any values returned through ptr-to-args (i.e. the pointers allocated
265 // by hipMalloc).
266 #if COMPILE_HIP_ATP_MARKER
267 #include "CXLActivityLogger.h"
268 #define MARKER_BEGIN(markerName, group) amdtBeginMarker(markerName, group, nullptr);
269 #define MARKER_END() amdtEndMarker();
270 #define RESUME_PROFILING amdtResumeProfiling(AMDT_ALL_PROFILING);
271 #define STOP_PROFILING amdtStopProfiling(AMDT_ALL_PROFILING);
272 #else
273 // Swallow scoped markers:
274 #define MARKER_BEGIN(markerName, group)
275 #define MARKER_END()
276 #define RESUME_PROFILING
277 #define STOP_PROFILING
278 #endif
279 
280 
281 //---
282 // HIP Trace modes - use with HIP_TRACE_API=...
283 #define TRACE_ALL 0 // 0x01
284 #define TRACE_KCMD 1 // 0x02, kernel command
285 #define TRACE_MCMD 2 // 0x04, memory command
286 #define TRACE_MEM 3 // 0x08, memory allocation or deallocation.
287 #define TRACE_SYNC 4 // 0x10, synchronization (host or hipStreamWaitEvent)
288 #define TRACE_QUERY 5 // 0x20, hipEventRecord, hipEventQuery, hipStreamQuery
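// Editor's note (illustrative): each TRACE_* value is a bit position, so modes can be combined;
// e.g. HIP_TRACE_API=0x6 enables kernel-command (0x02) plus memory-command (0x04) tracing, while
// HIP_TRACE_API=0x1 (TRACE_ALL) traces every API call.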
289 
290 
291 //---
292 // HIP_DB Debug flags:
293 #define DB_API 0 /* 0x01 - shortcut to enable HIP_TRACE_API on single switch */
294 #define DB_SYNC 1 /* 0x02 - trace synchronization pieces */
295 #define DB_MEM 2 /* 0x04 - trace memory allocation / deallocation */
296 #define DB_COPY 3 /* 0x08 - trace memory copy and peer commands. */
297 #define DB_WARN 4 /* 0x10 - warn about sub-optimal or shady behavior */
298 #define DB_FB 5 /* 0x20 - trace loading fat binary */
299 #define DB_MAX_FLAG 6
300 // When adding a new debug flag, also add to the char name table below.
301 //
302 //
303 
304 struct DbName {
305  const char* _color;
306  const char* _shortName;
307 };
308 
309 // This table must be kept in-sync with the defines above.
310 static const DbName dbName[] = {
311  {KGRN, "api"}, // not used,
312  {KYEL, "sync"}, {KCYN, "mem"}, {KMAG, "copy"}, {KRED, "warn"},
313  {KBLU, "fatbin"},
314 };
315 
316 
317 #if COMPILE_HIP_DB
318 #define tprintf(trace_level, ...) \
319  { \
320  if (HIP_DB & (1 << (trace_level))) { \
321  GET_TLS(); \
322  char msgStr[1000]; \
323  snprintf(msgStr, sizeof(msgStr), __VA_ARGS__); \
324  fprintf(stderr, " %ship-%s pid:%d tid:%d:%s%s", dbName[trace_level]._color, \
325  dbName[trace_level]._shortName, tls->tidInfo.pid(), tls->tidInfo.tid(), msgStr, KNRM); \
326  } \
327  }
328 #else
329 /* Compile to empty code */
330 #define tprintf(trace_level, ...)
331 #endif
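// Editor's sketch (illustrative, not from the original header): tprintf takes one of the DB_*
// flags above and a printf-style format string; the message is printed only when the matching bit
// is set in HIP_DB, e.g.
//   tprintf(DB_MEM, "allocated %zu bytes at %p\n", sizeBytes, ptr);
// where 'sizeBytes' (a size_t) and 'ptr' (a void*) are hypothetical locals at the call site.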
332 
333 
334 static inline uint64_t getTicks() { return hc::get_system_ticks(); }
335 
336 //---
337 extern uint64_t recordApiTrace(TlsData *tls, std::string* fullStr, const std::string& apiStr);
338 
339 #if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1)
340 #define API_TRACE(forceTrace, ...) \
341  GET_TLS(); \
342  uint64_t hipApiStartTick = 0; \
343  { \
344  tls->tidInfo.incApiSeqNum(); \
345  if (forceTrace || \
346  (HIP_PROFILE_API || (COMPILE_HIP_DB && (HIP_TRACE_API & (1 << TRACE_ALL))))) { \
347  std::string apiStr = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')'; \
348  std::string fullStr; \
349  hipApiStartTick = recordApiTrace(tls, &fullStr, apiStr); \
350  if (HIP_PROFILE_API == 0x1) { \
351  MARKER_BEGIN(__func__, "HIP") \
352  } else if (HIP_PROFILE_API == 0x2) { \
353  MARKER_BEGIN(fullStr.c_str(), "HIP"); \
354  } \
355  } \
356  }
357 
358 #else
359 // Swallow API_TRACE
360 #define API_TRACE(IS_CMD, ...) GET_TLS(); tls->tidInfo.incApiSeqNum();
361 #endif
362 
363 #define ihipGetTlsDefaultCtx() iihipGetTlsDefaultCtx(tls)
364 #define ihipSetTlsDefaultCtx(ctx) tls->defaultCtx = ctx
365 
366 #define HIP_SET_DEVICE() ihipDeviceSetState(tls);
367 
368 // This macro should be called at the beginning of every HIP API.
369 // It initializes the hip runtime (exactly once), and
370 // generates a trace string that can be output to stderr or to ATP file.
371 #define HIP_INIT_API(cid, ...) \
372  hip_impl::hip_init(); \
373  API_TRACE(0, __VA_ARGS__); \
374  HIP_CB_SPAWNER_OBJECT(cid);
375 
376 
377 // Like above, but will trace with a specified "special" bit.
378 // Replace HIP_INIT_API with this call inside HIP APIs that launch work on the GPU:
379 // kernel launches, copy commands, memory sets, etc.
380 #define HIP_INIT_SPECIAL_API(cid, tbit, ...) \
381  hip_impl::hip_init(); \
382  API_TRACE((HIP_TRACE_API & (1 << tbit)), __VA_ARGS__); \
383  HIP_CB_SPAWNER_OBJECT(cid);
384 
385 
386 // This macro should be called at the end of every HIP API, and only at the end of top-level HIP
387 // APIs (not internal hip functions). It has a dual function: it logs the last error returned for use
388 // by hipGetLastError, and also prints the closing message when the debug trace is enabled.
389 #define ihipLogStatus(hipStatus) \
390  ({ \
391  hipError_t localHipStatus = hipStatus; /*local copy so hipStatus only evaluated once*/ \
392  tls->lastHipError = localHipStatus; \
393  \
394  if ((COMPILE_HIP_TRACE_API & 0x2) && HIP_TRACE_API & (1 << TRACE_ALL)) { \
395  auto ticks = getTicks() - hipApiStartTick; \
396  fprintf(stderr, " %ship-api pid:%d tid:%d.%lu %-30s ret=%2d (%s)>> +%lu ns%s\n", \
397  (localHipStatus == 0) ? API_COLOR : KRED, tls->tidInfo.pid(), tls->tidInfo.tid(), \
398  tls->tidInfo.apiSeqNum(), __func__, localHipStatus, \
399  ihipErrorString(localHipStatus), ticks, API_COLOR_END); \
400  } \
401  if (HIP_PROFILE_API) { \
402  MARKER_END(); \
403  } \
404  localHipStatus; \
405  })
406 
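// Editor's sketch (illustrative, not from the original header) of how the macros above combine in
// a top-level HIP API implementation; 'hipSomeApi' and 'someCid' are hypothetical names:
//
//   hipError_t hipSomeApi(int* outValue) {
//       HIP_INIT_API(someCid, outValue);  // init runtime once, bump API seq num, trace entry
//       hipError_t e = hipSuccess;
//       // ... do the work; 'tls' is already in scope via the GET_TLS() inside API_TRACE ...
//       return ihipLogStatus(e);          // record tls->lastHipError and trace the exit
//   }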
407 
408 class ihipException : public std::exception {
409  public:
410  explicit ihipException(hipError_t e) : _code(e){};
411 
412  hipError_t _code;
413 };
414 
415 
416 #ifdef __cplusplus
417 extern "C" {
418 #endif
419 
420 
421 #ifdef __cplusplus
422 }
423 #endif
424 
425 const hipStream_t hipStreamNull = 0x0;
426 
427 
431 #define HIP_IPC_RESERVED_SIZE 24
432 class ihipIpcMemHandle_t {
433  public:
434 #if USE_IPC
435 hsa_amd_ipc_memory_t ipc_handle;  ///< ipc memory handle on ROCr
436 #endif
437  size_t psize;
438  char reserved[HIP_IPC_RESERVED_SIZE];
439 };
440 
441 
442 struct ihipModule_t {
443  std::string fileName;
444  hsa_executable_t executable = {};
445  hsa_code_object_reader_t coReader = {};
446  std::string hash;
447  std::unordered_map<
448  std::string, std::vector<std::pair<std::size_t, std::size_t>>> kernargs;
449 
450  ~ihipModule_t() {
451  if (executable.handle) hsa_executable_destroy(executable);
452  if (coReader.handle) hsa_code_object_reader_destroy(coReader);
453  }
454 };
455 
456 
457 //---
458 // Used to remove locking, either for performance experiments or to stimulate bugs.
459 class FakeMutex {
460  public:
461  void lock() {}
462  bool try_lock() { return true; }
463  void unlock() {}
464 };
465 
466 #if EVENT_THREAD_SAFE
467 typedef std::mutex EventMutex;
468 #else
469 #warning "Event thread-safe disabled"
470 typedef FakeMutex EventMutex;
471 #endif
472 
473 #if STREAM_THREAD_SAFE
474 typedef std::mutex StreamMutex;
475 #else
476 #warning "Stream thread-safe disabled"
477 typedef FakeMutex StreamMutex;
478 #endif
479 
480 // Pair Device and Ctx together, these could also be toggled separately if desired.
481 #if CTX_THREAD_SAFE
482 typedef std::mutex CtxMutex;
483 #else
484 typedef FakeMutex CtxMutex;
485 #warning "Ctx thread-safe disabled"
486 #endif
487 
488 #if DEVICE_THREAD_SAFE
489 typedef std::mutex DeviceMutex;
490 #else
491 typedef FakeMutex DeviceMutex;
492 #warning "Device thread-safe disabled"
493 #endif
494 
495 //
496 //---
497 // Protects access to the wrapped critical data with a lock acquired on construction and released on destruction.
498 // T must contain a _mutex field which meets the BasicLockable requirements (lock/unlock).
499 template <typename T>
500 class LockedAccessor {
501  public:
502  LockedAccessor(T& criticalData, bool autoUnlock = true)
503  : _criticalData(&criticalData),
504  _autoUnlock(autoUnlock)
505 
506  {
507  tprintf(DB_SYNC, "locking criticalData=%p for %s..\n", _criticalData,
508  ToString(_criticalData->_parent).c_str());
509  _criticalData->_mutex.lock();
510  };
511 
512  ~LockedAccessor() {
513  if (_autoUnlock) {
514  tprintf(DB_SYNC, "auto-unlocking criticalData=%p for %s...\n", _criticalData,
515  ToString(_criticalData->_parent).c_str());
516  _criticalData->_mutex.unlock();
517  }
518  }
519 
520  void unlock() {
521  tprintf(DB_SYNC, "unlocking criticalData=%p for %s...\n", _criticalData,
522  ToString(_criticalData->_parent).c_str());
523  _criticalData->_mutex.unlock();
524  }
525 
526  // Syntactic sugar so -> can be used to get the underlying type.
527  T* operator->() { return _criticalData; };
528 
529  private:
530  T* _criticalData;
531  bool _autoUnlock;
532 };
533 
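// Editor's sketch (illustrative): typical use is a scoped accessor over one of the *Critical_t
// types defined below, e.g. for a stream's critical data ('stream' is a hypothetical ihipStream_t*):
//
//   {
//       LockedAccessor_StreamCrit_t crit(stream->criticalData());  // locks _mutex
//       crit->_last_op_was_a_copy = true;                          // access members through ->
//   }   // destructor unlocks, since autoUnlock defaulted to true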
534 
535 template <typename MUTEX_TYPE>
536 struct LockedBase {
537  // Experts-only interface for explicit locking.
538  // Most uses should use the lock-accessor.
539  void lock() { _mutex.lock(); }
540  void unlock() { _mutex.unlock(); }
541  bool try_lock() { return _mutex.try_lock(); }
542 
543  MUTEX_TYPE _mutex;
544 };
545 
546 
547 template <typename MUTEX_TYPE>
548 class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE> {
549 public:
550  ihipStreamCriticalBase_t(ihipStream_t* parentStream, hc::accelerator_view av)
551  : _parent{parentStream}, _av{av}, _last_op_was_a_copy{false}
552  {}
553 
554  ~ihipStreamCriticalBase_t() {}
555 
556  ihipStreamCriticalBase_t<StreamMutex>* mlock() {
557  LockedBase<MUTEX_TYPE>::lock();
558  return this;
559  };
560 
561  void munlock() {
562  tprintf(DB_SYNC, "munlocking criticalData=%p for %s...\n", this,
563  ToString(this->_parent).c_str());
564  LockedBase<MUTEX_TYPE>::unlock();
565  };
566 
567  ihipStreamCriticalBase_t<StreamMutex>* mtry_lock() {
568  bool gotLock = LockedBase<MUTEX_TYPE>::try_lock();
569  tprintf(DB_SYNC, "mtry_locking=%d criticalData=%p for %s...\n", gotLock, this,
570  ToString(this->_parent).c_str());
571  return gotLock ? this : nullptr;
572  };
573 
574  ihipStream_t* _parent;
575  hc::accelerator_view _av;
576  bool _last_op_was_a_copy;
577 };
578 
579 
580 // If HIP code needs to acquire locks for both ihipCtx_t and ihipStream_t, it should first acquire
581 // the lock for the ihipCtx_t and then for the individual streams. The locks should not be acquired
582 // in reverse order or deadlock may occur. In some cases, it may be possible to reduce the range
583 // where the locks must be held. HIP routines should avoid acquiring and releasing the same lock
584 // during the execution of a single HIP API. Another option is to use try_lock in the innermost lock
585 // query.
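// Editor's sketch (illustrative) of the safe ordering described above ('ctx' and 'stream' are
// hypothetical ihipCtx_t* / ihipStream_t* pointers):
//   LockedAccessor_CtxCrit_t ctxCrit(ctx->criticalData());           // 1) ctx lock first
//   LockedAccessor_StreamCrit_t streamCrit(stream->criticalData());  // 2) then the stream lock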
586 
587 
588 typedef ihipStreamCriticalBase_t<StreamMutex> ihipStreamCritical_t;
589 typedef LockedAccessor<ihipStreamCritical_t> LockedAccessor_StreamCrit_t;
590 
591 //---
592 // Internal stream structure.
593 class ihipStream_t {
594  public:
595  enum ScheduleMode { Auto, Spin, Yield };
596  typedef uint64_t SeqNum_t;
597 
598 // TODO - make av a reference to avoid shared_ptr overhead?
599  ihipStream_t(ihipCtx_t* ctx, hc::accelerator_view av, unsigned int flags);
600  ~ihipStream_t();
601 
602  // kind is hipMemcpyKind
603  void locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind,
604  bool resolveOn = true);
605 
606  bool locked_copy2DSync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind,
607  bool resolveOn = true);
608 
609  void locked_copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind);
610 
611  bool locked_copy2DAsync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind);
612 
613  void lockedSymbolCopySync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes,
614  size_t offset, unsigned kind);
615  void lockedSymbolCopyAsync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes,
616  size_t offset, unsigned kind);
617 
618  //---
619  // Member functions that begin with locked_ are thread-safe accessors - these acquire / release
620  // the critical mutex.
621  LockedAccessor_StreamCrit_t lockopen_preKernelCommand();
622  void lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av, bool unlockNotNeeded = 0);
623 
624  void locked_wait(bool& waited);
625  void locked_wait();
626 
627  hc::accelerator_view* locked_getAv() {
628  LockedAccessor_StreamCrit_t crit(_criticalData);
629  return &(crit->_av);
630  };
631 
632  void locked_streamWaitEvent(ihipEventData_t& event);
633  hc::completion_future locked_recordEvent(hipEvent_t event);
634 
635  ihipStreamCritical_t& criticalData() { return _criticalData; };
636 
637  //---
638  hc::hcWaitMode waitMode() const;
639 
640  // Use this if we already have the stream critical data mutex:
641  void wait(LockedAccessor_StreamCrit_t& crit);
642 
643  void launchModuleKernel(hc::accelerator_view av, hsa_signal_t signal, uint32_t blockDimX,
644  uint32_t blockDimY, uint32_t blockDimZ, uint32_t gridDimX,
645  uint32_t gridDimY, uint32_t gridDimZ, uint32_t groupSegmentSize,
646  uint32_t sharedMemBytes, void* kernarg, size_t kernSize,
647  uint64_t kernel);
648 
649 
650  //-- Non-racy accessors:
651  // These functions access fields set at initialization time and are non-racy (so do not acquire
652  // mutex)
653  const ihipDevice_t* getDevice() const;
654  ihipCtx_t* getCtx() const;
655 
656  // Before calling this function, stream must be resolved from "0" to the actual stream:
657  bool isDefaultStream() const { return _id == 0; };
658 
659  public:
660  //---
661  // Public member vars - these are set at initialization and never change:
662  SeqNum_t _id; // monotonic sequence ID. 0 is the default stream.
663  unsigned _flags;
664 
665 
666  private:
667  // The unsigned return is hipMemcpyKind
668  unsigned resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem);
669  void resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPointerInfo* dstPtrInfo,
670  const hc::AmPointerInfo* srcPtrInfo, hc::hcCommandKind* hcCopyDir,
671  ihipCtx_t** copyDevice, bool* forceUnpinnedCopy);
672 
673  bool canSeeMemory(const ihipCtx_t* thisCtx, const hc::AmPointerInfo* dstInfo,
674  const hc::AmPointerInfo* srcInfo);
675 
676  void addSymbolPtrToTracker(hc::accelerator& acc, void* ptr, size_t sizeBytes);
677 
678  private: // Data
679  // Critical Data - MUST be accessed through LockedAccessor_StreamCrit_t
680  ihipStreamCritical_t _criticalData;
681 
682  std::mutex _hasQueueLock;
683 
684  ihipCtx_t* _ctx; // parent context that owns this stream.
685 
686  // Friends:
687  friend std::ostream& operator<<(std::ostream& os, const ihipStream_t& s);
688  friend hipError_t hipStreamQuery(hipStream_t);
689 
690  ScheduleMode _scheduleMode;
691 };
692 
693 
694 //----
695 // Internal structure for stream callback handler
696 class ihipStreamCallback_t {
697  public:
698  ihipStreamCallback_t(hipStream_t stream, hipStreamCallback_t callback, void* userData)
699  : _stream(stream), _callback(callback), _userData(userData) {
700  };
701  hipStream_t _stream;
702  hipStreamCallback_t _callback;
703  void* _userData;
704 };
705 
706 
707 //----
708 // Internal event structure:
709 enum hipEventStatus_t {
710  hipEventStatusUnitialized = 0, // event is uninitialized, must be "Created" before use.
711  hipEventStatusCreated = 1, // event created, but not yet Recorded
712  hipEventStatusRecording = 2, // event has been recorded into a stream but not completed yet.
713  hipEventStatusComplete = 3, // event has been recorded - timestamps are valid.
714 };
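// Editor's note (illustrative): the usual lifecycle implied by the comments above is
//   hipEventCreate -> hipEventStatusCreated
//   hipEventRecord -> hipEventStatusRecording (while the recorded work is still in flight)
//   completion     -> hipEventStatusComplete  (timestamps become valid)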
715 
716 // TODO - rename to ihip type of some kind
717 enum ihipEventType_t {
718  hipEventTypeIndependent,
719  hipEventTypeStartCommand,
720  hipEventTypeStopCommand,
721 };
722 
723 
724 struct ihipEventData_t {
725  ihipEventData_t() {
726  _state = hipEventStatusCreated;
727  _stream = NULL;
728  _timestamp = 0;
729  _type = hipEventTypeIndependent;
730  };
731 
732  void marker(const hc::completion_future& marker) { _marker = marker; };
733  hc::completion_future& marker() { return _marker; }
734  uint64_t timestamp() const { return _timestamp; };
735  ihipEventType_t type() const { return _type; };
736 
737  ihipEventType_t _type;
738  hipEventStatus_t _state;
739  hipStream_t _stream; // Stream where the event is recorded. Null stream is resolved to actual
740  // stream when recorded
741  uint64_t _timestamp; // store timestamp, may be set on host or by marker.
742  private:
743  hc::completion_future _marker;
744 };
745 
746 
747 //=============================================================================
748 // class ihipEventCriticalBase_t
749 template <typename MUTEX_TYPE>
750 class ihipEventCriticalBase_t : LockedBase<MUTEX_TYPE> {
751  public:
752  explicit ihipEventCriticalBase_t(const ihipEvent_t* parentEvent) : _parent(parentEvent) {}
754 
755  // Keep data in structure so it can be easily copied into snapshots
756  // (used to reduce lock contention and preserve correct lock order)
757  ihipEventData_t _eventData;
758 
759  private:
760  const ihipEvent_t* _parent;
761  friend class LockedAccessor<ihipEventCriticalBase_t>;
762 };
763 
764 typedef ihipEventCriticalBase_t<EventMutex> ihipEventCritical_t;
765 
766 typedef LockedAccessor<ihipEventCritical_t> LockedAccessor_EventCrit_t;
767 
768 // internal hip event structure.
769 class ihipEvent_t {
770  public:
771  explicit ihipEvent_t(unsigned flags);
772  void attachToCompletionFuture(const hc::completion_future* cf, hipStream_t stream,
773  ihipEventType_t eventType);
774 
775  // Return a copy of the critical state. The critical data is locked during the copy.
776  ihipEventData_t locked_copyCrit() {
777  LockedAccessor_EventCrit_t crit(_criticalData);
778  return _criticalData._eventData;
779  };
780 
781  ihipEventCritical_t& criticalData() { return _criticalData; };
782 
783  public:
784  unsigned _flags;
785 
786  private:
787  ihipEventCritical_t _criticalData;
788 
789  friend hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream);
790 };
791 
792 
793 //=============================================================================
794 // class ihipDeviceCriticalBase_t
795 template <typename MUTEX_TYPE>
796 class ihipDeviceCriticalBase_t : LockedBase<MUTEX_TYPE> {
797  public:
798  explicit ihipDeviceCriticalBase_t(ihipDevice_t* parentDevice)
799  : _parent(parentDevice), _ctxCount(0){};
800 
802 
803  // Contexts:
804  void addContext(ihipCtx_t* ctx);
805  void removeContext(ihipCtx_t* ctx);
806  std::list<ihipCtx_t*>& ctxs() { return _ctxs; };
807  const std::list<ihipCtx_t*>& const_ctxs() const { return _ctxs; };
808  int getcount() { return _ctxCount; };
809  friend class LockedAccessor<ihipDeviceCriticalBase_t>;
810 
811  private:
812  ihipDevice_t* _parent;
813 
814  //--- Context Tracker:
815  std::list<ihipCtx_t*> _ctxs; // contexts associated with this device across all threads.
816 
817  int _ctxCount;
818 };
819 
820 typedef ihipDeviceCriticalBase_t<DeviceMutex> ihipDeviceCritical_t;
821 
822 typedef LockedAccessor<ihipDeviceCritical_t> LockedAccessor_DeviceCrit_t;
823 
824 //----
825 // Properties of the HIP device.
826 // Multiple contexts can point to the same device.
827 class ihipDevice_t {
828  public:
829  ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerator& acc);
830  ~ihipDevice_t();
831 
832  // Accessors:
833  ihipCtx_t* getPrimaryCtx() const { return _primaryCtx; };
834  void locked_removeContext(ihipCtx_t* c);
835  void locked_reset();
836  ihipDeviceCritical_t& criticalData() { return _criticalData; };
837 
838  public:
839  unsigned _deviceId; // device ID
840 
841  hc::accelerator _acc;
842  hsa_agent_t _hsaAgent; // hsa agent handle
843 
843 
844  //! Number of compute units supported by the device:
845  unsigned _computeUnits;
846  hipDeviceProp_t _props; // saved device properties.
847 
848 // TODO - report this through device properties, based on an HCC API call.
849  int _isLargeBar;
850 
851  ihipCtx_t* _primaryCtx;
852 
853 int _state; // 1 if the device is set, otherwise 0
854 
855  private:
856  hipError_t initProperties(hipDeviceProp_t* prop);
857 
858  private:
859  ihipDeviceCritical_t _criticalData;
860 };
861 //=============================================================================
862 
863 
864 //---
865 //
866 struct ihipExec_t {
867  dim3 _gridDim;
868  dim3 _blockDim;
869  size_t _sharedMem;
870  hipStream_t _hStream;
871  std::vector<char> _arguments;
872 };
873 
874 //=============================================================================
875 // class ihipCtxCriticalBase_t
876 template <typename MUTEX_TYPE>
877 class ihipCtxCriticalBase_t : LockedBase<MUTEX_TYPE> {
878  public:
879  ihipCtxCriticalBase_t(ihipCtx_t* parentCtx, unsigned deviceCnt)
880  : _parent(parentCtx), _peerCnt(0) {
881  _peerAgents = new hsa_agent_t[deviceCnt];
882  };
883 
884  ~ihipCtxCriticalBase_t() {
885  if (_peerAgents != nullptr) {
886  delete[] _peerAgents;  // allocated with new[] in the constructor
887  _peerAgents = nullptr;
888  }
889  _peerCnt = 0;
890  }
891 
892  // Streams:
893  void addStream(ihipStream_t* stream);
894  std::list<ihipStream_t*>& streams() { return _streams; };
895  const std::list<ihipStream_t*>& const_streams() const { return _streams; };
896 
897 
898  // Peer Accessor classes:
899  bool isPeerWatcher(const ihipCtx_t* peer); // returns True if peer has access to memory
900  // physically located on this device.
901  bool addPeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer);
902  bool removePeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer);
903  void resetPeerWatchers(ihipCtx_t* thisDevice);
904  void printPeerWatchers(FILE* f) const;
905 
906  uint32_t peerCnt() const { return _peerCnt; };
907  hsa_agent_t* peerAgents() const { return _peerAgents; };
908 
909 
910  // TODO - move private
911  std::list<ihipCtx_t*> _peers; // list of enabled peer devices.
912  //--- Execution stack:
913  std::stack<ihipExec_t> _execStack; // Execution stack for this device.
914 
915  friend class LockedAccessor<ihipCtxCriticalBase_t>;
916 
917  private:
918  ihipCtx_t* _parent;
919 
920  //--- Stream Tracker:
921  std::list<ihipStream_t*> _streams; // streams associated with this device.
922 
923 
924  //--- Peer Tracker:
925  // These reflect the currently Enabled set of peers for this GPU:
926  // Enabled peers have permissions to access the memory physically allocated on this device.
927  // Note the peers always contain the self agent for easy interfacing with HSA APIs.
928  uint32_t _peerCnt; // number of enabled peers
929  hsa_agent_t* _peerAgents; // efficient packed array of enabled agents (to use for allocations.)
930  private:
931  void recomputePeerAgents();
932 };
933 // Note Mutex type Real/Fake selected based on CtxMutex
934 typedef ihipCtxCriticalBase_t<CtxMutex> ihipCtxCritical_t;
935 
936 // This type is used by functions that need access to the critical device structures.
937 typedef LockedAccessor<ihipCtxCritical_t> LockedAccessor_CtxCrit_t;
938 //=============================================================================
939 
940 
941 //=============================================================================
942 // class ihipCtx_t:
943 // A HIP CTX (context) points at one of the existing devices and contains the streams,
944 // peer-to-peer mappings, creation flags. Multiple contexts can point to the same
945 // device.
946 //
947 class ihipCtx_t {
948  public: // Functions:
949  ihipCtx_t(ihipDevice_t* device, unsigned deviceCnt,
950  unsigned flags); // note: calls constructor for _criticalData
951  ~ihipCtx_t();
952 
953  // Functions which read or write the critical data are named locked_.
954 // (might be better called "locking_").
955 // ihipCtx_t does not use recursive locks so the ihip implementation must avoid calling a
956 // locked_ function from within a locked_ function. External functions which call several
957 // locked_ functions will acquire and release the lock for each function. If this occurs in
958  // performance-sensitive code we may want to refactor by adding non-locked functions and
959  // creating a new locked_ member function to call them all.
960  void locked_removeStream(ihipStream_t* s);
961  void locked_reset();
962  void locked_waitAllStreams();
963  void locked_syncDefaultStream(bool waitOnSelf, bool syncHost);
964 
965  ihipCtxCritical_t& criticalData() { return _criticalData; };
966 
967  const ihipDevice_t* getDevice() const { return _device; };
968  int getDeviceNum() const { return _device->_deviceId; };
969 
970  // TODO - review uses of getWriteableDevice(), can these be converted to getDevice()
971  ihipDevice_t* getWriteableDevice() const { return _device; };
972 
973  std::string toString() const;
974 
975  public: // Data
976  // The NULL stream is used if no other stream is specified.
977  // Default stream has special synchronization properties with other streams.
978  ihipStream_t* _defaultStream;
979 
980  // Flags specified when the context is created:
981  unsigned _ctxFlags;
982 
983  private:
984  ihipDevice_t* _device;
985 
986 
987  private: // Critical data, protected with locked access:
988  // Members of _protected data MUST be accessed through the LockedAccessor.
989  // Search for LockedAccessor<ihipCtxCritical_t> for examples; do not access _criticalData
990  // directly.
991  ihipCtxCritical_t _criticalData;
992 };
993 
994 
995 //=================================================================================================
996 // Global variable definition:
997 extern unsigned g_deviceCnt;
998 extern hsa_agent_t g_cpu_agent; // the CPU agent.
999 extern hsa_agent_t* g_allAgents; // CPU agents + all the visible GPU agents.
1000 
1001 //=================================================================================================
1002 // Extern functions:
1003 extern void ihipInit();
1004 extern const char* ihipErrorString(hipError_t);
1005 extern hipError_t ihipSynchronize(TlsData *tls);
1006 extern void ihipCtxStackUpdate();
1007 extern hipError_t ihipDeviceSetState(TlsData *tls);
1008 
1009 extern ihipDevice_t* ihipGetDevice(int);
1010 ihipCtx_t* ihipGetPrimaryCtx(unsigned deviceIndex);
1011 hipError_t hipModuleGetFunctionEx(hipFunction_t* hfunc, hipModule_t hmod,
1012  const char* name, hsa_agent_t *agent);
1013 
1014 
1015 hipStream_t ihipSyncAndResolveStream(hipStream_t, bool lockAcquired = 0);
1016 hipError_t ihipStreamSynchronize(TlsData *tls, hipStream_t stream);
1017 void ihipStreamCallbackHandler(ihipStreamCallback_t* cb);
1018 
1019 // Stream printf functions:
1020 inline std::ostream& operator<<(std::ostream& os, const ihipStream_t& s) {
1021  os << "stream:";
1022  os << s.getDevice()->_deviceId;
1024  os << '.';
1025  os << s._id;
1026  return os;
1027 }
1028 
1029 inline std::ostream& operator<<(std::ostream& os, const dim3& s) {
1030  os << '{';
1031  os << s.x;
1032  os << ',';
1033  os << s.y;
1034  os << ',';
1035  os << s.z;
1036  os << '}';
1037  return os;
1038 }
1039 
1040 inline std::ostream& operator<<(std::ostream& os, const gl_dim3& s) {
1041  os << '{';
1042  os << s.x;
1043  os << ',';
1044  os << s.y;
1045  os << ',';
1046  os << s.z;
1047  os << '}';
1048  return os;
1049 }
1050 
1051 // Stream printf functions:
1052 inline std::ostream& operator<<(std::ostream& os, const hipEvent_t& e) {
1053  os << "event:" << std::hex << static_cast<void*>(e);
1054  return os;
1055 }
1056 
1057 inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) {
1058  os << "ctx:" << static_cast<const void*>(c) << ".dev:" << c->getDevice()->_deviceId;
1059  return os;
1060 }
1061 
1062 
1063 // Helper functions that are used across src files:
1064 namespace hip_internal {
1065 hipError_t memcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
1066  hipStream_t stream);
1067 
1068 hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags);
1069 
1070 hipError_t ihipHostFree(TlsData *tls, void* ptr);
1071 
1072 }  // namespace hip_internal
1073 
1074 #define MAX_COOPERATIVE_GPUs 255
1075 
1076 // do not change these two structs without changing the device library
1077 struct mg_sync {
1078  uint w0;
1079  uint w1;
1080 };
1081 
1082 struct mg_info {
1083  struct mg_sync *mgs;
1084  uint grid_id;
1085  uint num_grids;
1086  ulong prev_sum;
1087  ulong all_sum;
1088 };
1089 
1090 //---
1091 // TODO - review the context creation strategy here. Really should be:
1092 // - first "non-device" runtime call creates the context for this thread. Allowed to call
1093 // setDevice first.
1094 // - hipDeviceReset destroys the primary context for device?
1095 // - Then context is created again for next usage.
1096 static inline ihipCtx_t* iihipGetTlsDefaultCtx(TlsData* tls) {
1097  // Per-thread initialization of the TLS:
1098  if ((tls->defaultCtx == nullptr) && (g_deviceCnt > 0)) {
1099  tls->defaultCtx = ihipGetPrimaryCtx(0);
1100  }
1101  return tls->defaultCtx;
1102 }
1103 
1104 #endif  // HIP_SRC_HIP_HCC_INTERNAL_H