HIP: Heterogeneous-computing Interface for Portability
hip_hcc_internal.h
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
23 #ifndef HIP_SRC_HIP_HCC_INTERNAL_H
24 #define HIP_SRC_HIP_HCC_INTERNAL_H
25 
26 #include <hc.hpp>
27 #include <hsa/hsa.h>
28 #include <unordered_map>
29 #include <stack>
30 
31 #include "hsa/hsa_ext_amd.h"
32 #include "hip/hip_runtime.h"
33 #include "hip_util.h"
34 #include "env.h"
35 
36 
37 #if (__hcc_workweek__ < 16354)
38 #error("This version of HIP requires a newer version of HCC.");
39 #endif
40 
41 // Use the __appPtr field in the am memtracker to store the context.
42 // Requires a bug fix in HCC
43 #if defined(__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE) and \
44  (__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE != 0)
45 #define USE_APP_PTR_FOR_CTX 1
46 #endif
47 
48 
49 #define USE_IPC 1
50 
51 //---
52 // Environment variables:
53 
54 // Intended to distinguish whether an environment variable should be visible only in debug mode, or
55 // in debug+release.
56 // static const int debug = 0;
57 extern const int release;
58 
59 // TODO - this blocks both kernels and memory ops. Perhaps should have separate env var for
60 // kernels?
61 extern int HIP_LAUNCH_BLOCKING;
62 extern int HIP_API_BLOCKING;
63 
64 extern int HIP_PRINT_ENV;
65 extern int HIP_PROFILE_API;
66 // extern int HIP_TRACE_API;
67 extern int HIP_ATP;
68 extern int HIP_DB;
69 extern int HIP_STAGING_SIZE; /* size of staging buffers, in KB */
70 extern int HIP_STREAM_SIGNALS; /* number of signals to allocate at stream creation */
71 extern int HIP_VISIBLE_DEVICES; /* Contains a comma-separated sequence of GPU identifiers */
72 extern int HIP_FORCE_P2P_HOST;
73 
74 extern int HIP_HOST_COHERENT;
75 
76 extern int HIP_HIDDEN_FREE_MEM;
77 //---
78 // Chicken bits for disabling functionality to work around potential issues:
79 extern int HIP_SYNC_HOST_ALLOC;
80 extern int HIP_SYNC_STREAM_WAIT;
81 
82 extern int HIP_SYNC_NULL_STREAM;
83 extern int HIP_INIT_ALLOC;
84 extern int HIP_FORCE_NULL_STREAM;
85 
86 extern int HIP_DUMP_CODE_OBJECT;
87 
88 // TODO - remove when this is standard behavior.
89 extern int HCC_OPT_FLUSH;
90 
91 // Class to assign a short TID to each new thread, for HIP debugging purposes.
// Per-thread debug bookkeeping: a short thread id, the process id, and a
// counter of HIP API calls issued by the thread. Used for HIP debugging output.
class TidInfo {
  public:
    // Defined out of line (not visible in this header).
    TidInfo();

    // Short thread id printed in debug/trace messages.
    int tid() const { return _shortTid; }

    // Process id printed in debug/trace messages.
    pid_t pid() const { return _pid; }

    // Advances the API sequence number by one and returns the new value.
    uint64_t incApiSeqNum() {
        _apiSeqNum += 1;
        return _apiSeqNum;
    }

    // Returns the current API sequence number without modifying it.
    uint64_t apiSeqNum() const { return _apiSeqNum; }

  private:
    int _shortTid;
    pid_t _pid;

    // Monotonically increasing API sequence number for this thread.
    uint64_t _apiSeqNum;
};
108 
// Collection of API sequence numbers ("triggers") for one thread.
// After sort() the triggers are held in descending order, so the smallest
// pending trigger is always at the back of the vector.
struct ProfTrigger {
    // Sentinel returned by nextTrigger() when no trigger is registered.
    static const uint64_t MAX_TRIGGER = std::numeric_limits<uint64_t>::max();

    // Writes every registered trigger to stdout, each prefixed with `tid`.
    void print(int tid) {
        std::cout << "Enabling tracing for ";
        for (const uint64_t trigger : _profTrigger) {
            std::cout << "tid:" << tid << "." << trigger << ",";
        }
        std::cout << "\n";
    }

    // Returns the trigger stored at the back, or MAX_TRIGGER when empty.
    // Written with plain returns (not a ternary) so MAX_TRIGGER is only read
    // as a value and never needs an out-of-class definition.
    uint64_t nextTrigger() {
        if (_profTrigger.empty()) {
            return MAX_TRIGGER;
        }
        return _profTrigger.back();
    }

    // Registers one more trigger; call sort() once all triggers are added.
    void add(uint64_t trigger) { _profTrigger.push_back(trigger); }

    // Orders the triggers from largest to smallest. The comparator must work
    // on the full 64-bit value: comparing as int truncates large sequence
    // numbers and produces a wrong order.
    void sort() {
        std::sort(_profTrigger.begin(), _profTrigger.end(), std::greater<uint64_t>());
    }

  private:
    std::vector<uint64_t> _profTrigger;
};
127 
128 
129 //---
130 // Extern tls
131 extern thread_local hipError_t tls_lastHipError;
132 extern thread_local TidInfo tls_tidInfo;
133 extern thread_local bool tls_getPrimaryCtx;
134 
135 extern std::vector<ProfTrigger> g_dbStartTriggers;
136 extern std::vector<ProfTrigger> g_dbStopTriggers;
137 
138 //---
139 // Forward defs:
140 class ihipStream_t;
141 class ihipDevice_t;
142 class ihipCtx_t;
143 struct ihipEventData_t;
144 
145 // Color defs for debug messages:
146 #define KNRM "\x1B[0m"
147 #define KRED "\x1B[31m"
148 #define KGRN "\x1B[32m"
149 #define KYEL "\x1B[33m"
150 #define KBLU "\x1B[34m"
151 #define KMAG "\x1B[35m"
152 #define KCYN "\x1B[36m"
153 #define KWHT "\x1B[37m"
154 
155 extern const char* API_COLOR;
156 extern const char* API_COLOR_END;
157 
158 
159 // If set, thread-safety is enforced on all event/stream/ctx/device functions.
160 // Can disable for performance or functional experiments - in this case
161 // the code uses a dummy "no-op" mutex.
162 #define EVENT_THREAD_SAFE 1
163 
164 #define STREAM_THREAD_SAFE 1
165 
166 #define CTX_THREAD_SAFE 1
167 
168 #define DEVICE_THREAD_SAFE 1
169 
170 
171 // Compile debug trace mode - this prints debug messages to stderr when env var HIP_DB is set.
172 // May be set to 0 to remove debug if checks - possible code size and performance difference?
173 #define COMPILE_HIP_DB 1
174 
175 
176 // Compile HIP tracing capability.
177 // 0x1 = print a string at function entry with arguments.
178 // 0x2 = prints a simple message with function name + return code when function exits.
179 // 0x3 = print both.
180 // Must be enabled at runtime with HIP_TRACE_API
181 #define COMPILE_HIP_TRACE_API 0x3
182 
183 
184 // Compile code that generates trace markers for CodeXL ATP at HIP function begin/end.
185 // ATP is standard CodeXL format that includes timestamps for kernels, HSA RT APIs, and HIP APIs.
186 #ifndef COMPILE_HIP_ATP_MARKER
187 #define COMPILE_HIP_ATP_MARKER 0
188 #endif
189 
190 
191 // Compile support for trace markers that are displayed on CodeXL GUI at start/stop of each function
192 // boundary.
193 // TODO - currently we print the trace message at the beginning. If we waited, we could also
194 // print return codes, and any values returned through ptr-to-args (i.e. the pointers allocated
195 // by hipMalloc).
196 #if COMPILE_HIP_ATP_MARKER
197 #include "CXLActivityLogger.h"
198 #define MARKER_BEGIN(markerName, group) amdtBeginMarker(markerName, group, nullptr);
199 #define MARKER_END() amdtEndMarker();
200 #define RESUME_PROFILING amdtResumeProfiling(AMDT_ALL_PROFILING);
201 #define STOP_PROFILING amdtStopProfiling(AMDT_ALL_PROFILING);
202 #else
203 // Swallow scoped markers:
204 #define MARKER_BEGIN(markerName, group)
205 #define MARKER_END()
206 #define RESUME_PROFILING
207 #define STOP_PROFILING
208 #endif
209 
210 
211 //---
212 // HIP Trace modes - use with HIP_TRACE_API=...
213 #define TRACE_ALL 0 // 0x01
214 #define TRACE_KCMD 1 // 0x02, kernel command
215 #define TRACE_MCMD 2 // 0x04, memory command
216 #define TRACE_MEM 3 // 0x08, memory allocation or deallocation.
217 #define TRACE_SYNC 4 // 0x10, synchronization (host or hipStreamWaitEvent)
218 #define TRACE_QUERY 5 // 0x20, hipEventRecord, hipEventQuery, hipStreamQuery
219 
220 
221 //---
222 // HIP_DB Debug flags:
223 #define DB_API 0 /* 0x01 - shortcut to enable HIP_TRACE_API on single switch */
224 #define DB_SYNC 1 /* 0x02 - trace synchronization pieces */
225 #define DB_MEM 2 /* 0x04 - trace memory allocation / deallocation */
226 #define DB_COPY 3 /* 0x08 - trace memory copy and peer commands. . */
227 #define DB_WARN 4 /* 0x10 - warn about sub-optimal or shady behavior */
228 #define DB_FB 5 /* 0x20 - trace loading fat binary */
229 #define DB_MAX_FLAG 6
230 // When adding a new debug flag, also add to the char name table below.
231 //
232 //
233 
234 struct DbName {
235  const char* _color;
236  const char* _shortName;
237 };
238 
239 // This table must be kept in-sync with the defines above.
240 static const DbName dbName[] = {
241  {KGRN, "api"}, // not used,
242  {KYEL, "sync"}, {KCYN, "mem"}, {KMAG, "copy"}, {KRED, "warn"},
243  {KBLU, "fatbin"},
244 };
245 
246 
247 #if COMPILE_HIP_DB
248 #define tprintf(trace_level, ...) \
249  { \
250  if (HIP_DB & (1 << (trace_level))) { \
251  char msgStr[1000]; \
252  snprintf(msgStr, sizeof(msgStr), __VA_ARGS__); \
253  fprintf(stderr, " %ship-%s pid:%d tid:%d:%s%s", dbName[trace_level]._color, \
254  dbName[trace_level]._shortName, tls_tidInfo.pid(), tls_tidInfo.tid(), msgStr, KNRM); \
255  } \
256  }
257 #else
258 /* Compile to empty code */
259 #define tprintf(trace_level, ...)
260 #endif
261 
262 
263 static inline uint64_t getTicks() { return hc::get_system_ticks(); }
264 
265 //---
266 extern uint64_t recordApiTrace(std::string* fullStr, const std::string& apiStr);
267 
// API_TRACE(forceTrace, args...)
// Declares `hipApiStartTick` in the calling scope, bumps the per-thread API
// sequence number and, when tracing or profiling is active (or `forceTrace`
// is non-zero), records the call through recordApiTrace() and opens a marker.
// `hipApiStartTick` is declared by BOTH variants because ihipLogStatus always
// references it, whichever variant is compiled in.
#if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1)
#define API_TRACE(forceTrace, ...) \
    uint64_t hipApiStartTick = 0; \
    { \
        tls_tidInfo.incApiSeqNum(); \
        if ((forceTrace) || \
            (HIP_PROFILE_API || (COMPILE_HIP_DB && (HIP_TRACE_API & (1 << TRACE_ALL))))) { \
            std::string apiStr = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')'; \
            std::string fullStr; \
            hipApiStartTick = recordApiTrace(&fullStr, apiStr); \
            if (HIP_PROFILE_API == 0x1) { \
                MARKER_BEGIN(__func__, "HIP") \
            } else if (HIP_PROFILE_API == 0x2) { \
                MARKER_BEGIN(fullStr.c_str(), "HIP"); \
            } \
        } \
    }

#else
// Swallow API_TRACE: only the sequence number is maintained, no trace output.
#define API_TRACE(forceTrace, ...) \
    uint64_t hipApiStartTick = 0; \
    tls_tidInfo.incApiSeqNum();
#endif
290 
291 
292 // Just initialize the HIP runtime, but don't log any trace information.
293 #define HIP_INIT() \
294  std::call_once(hip_initialized, ihipInit); \
295  ihipCtxStackUpdate();
296 #define HIP_SET_DEVICE() ihipDeviceSetState();
297 
298 
299 // This macro should be called at the beginning of every HIP API.
300 // It initializes the hip runtime (exactly once), and
301 // generates a trace string that can be output to stderr or to ATP file.
302 #define HIP_INIT_API(cid, ...) \
303  HIP_INIT() \
304  API_TRACE(0, __VA_ARGS__); \
305  HIP_CB_SPAWNER_OBJECT(cid);
306 
307 
308 // Like above, but will trace with a specified "special" bit.
309 // Replace HIP_INIT_API with this call inside HIP APIs that launch work on the GPU:
310 // kernel launches, copy commands, memory sets, etc.
// Initializes the runtime and traces the call when bit `tbit` of HIP_TRACE_API
// is set. `tbit` is parenthesized so that an expression argument (for example
// `A | B`) is shifted as a whole instead of being split by operator precedence.
#define HIP_INIT_SPECIAL_API(cid, tbit, ...) \
    HIP_INIT() \
    API_TRACE((HIP_TRACE_API & (1 << (tbit))), __VA_ARGS__); \
    HIP_CB_SPAWNER_OBJECT(cid);
315 
316 
317 // This macro should be called at the end of every HIP API, and only at the end of top-level HIP
318 // APIs (not internal HIP functions). It has a dual function: it logs the last error returned, for
319 // use by hipGetLastError, and also prints the closing message when the debug trace is enabled.
320 #define ihipLogStatus(hipStatus) \
321  ({ \
322  hipError_t localHipStatus = hipStatus; /*local copy so hipStatus only evaluated once*/ \
323  tls_lastHipError = localHipStatus; \
324  \
325  if ((COMPILE_HIP_TRACE_API & 0x2) && HIP_TRACE_API & (1 << TRACE_ALL)) { \
326  auto ticks = getTicks() - hipApiStartTick; \
327  fprintf(stderr, " %ship-api pid:%d tid:%d.%lu %-30s ret=%2d (%s)>> +%lu ns%s\n", \
328  (localHipStatus == 0) ? API_COLOR : KRED, tls_tidInfo.pid(), tls_tidInfo.tid(), \
329  tls_tidInfo.apiSeqNum(), __func__, localHipStatus, \
330  ihipErrorString(localHipStatus), ticks, API_COLOR_END); \
331  } \
332  if (HIP_PROFILE_API) { \
333  MARKER_END(); \
334  } \
335  localHipStatus; \
336  })
337 
338 
339 class ihipException : public std::exception {
340  public:
341  explicit ihipException(hipError_t e) : _code(e){};
342 
343  hipError_t _code;
344 };
345 
346 
347 #ifdef __cplusplus
348 extern "C" {
349 #endif
350 
351 
352 #ifdef __cplusplus
353 }
354 #endif
355 
356 const hipStream_t hipStreamNull = 0x0;
357 
358 
362 #define HIP_IPC_RESERVED_SIZE 24
364  public:
365 #if USE_IPC
366  hsa_amd_ipc_memory_t ipc_handle;
367 #endif
368  size_t psize;
369  char reserved[HIP_IPC_RESERVED_SIZE];
370 };
371 
372 
373 struct ihipModule_t {
374  std::string fileName;
375  hsa_executable_t executable = {};
376  hsa_code_object_reader_t coReader = {};
377  std::string hash;
378 
379  ~ihipModule_t() {
380  if (executable.handle) hsa_executable_destroy(executable);
381  if (coReader.handle) hsa_code_object_reader_destroy(coReader);
382  }
383 };
384 
385 
386 //---
387 // Used to remove lock, for performance or stimulating bugs.
// Drop-in stand-in for a mutex that performs no locking at all.
class FakeMutex {
  public:
    // Intentionally empty: there is nothing to acquire.
    void lock() {}

    // Intentionally empty: there is nothing to release.
    void unlock() {}

    // Always reports success, since no lock is ever held.
    bool try_lock() {
        return true;
    }
};
394 
395 #if EVENT_THREAD_SAFE
396 typedef std::mutex EventMutex;
397 #else
398 #warning "Stream thread-safe disabled"
399 typedef FakeMutex EventMutex;
400 #endif
401 
402 #if STREAM_THREAD_SAFE
403 typedef std::mutex StreamMutex;
404 #else
405 #warning "Stream thread-safe disabled"
406 typedef FakeMutex StreamMutex;
407 #endif
408 
409 // Pair Device and Ctx together, these could also be toggled separately if desired.
410 #if CTX_THREAD_SAFE
411 typedef std::mutex CtxMutex;
412 #else
413 typedef FakeMutex CtxMutex;
414 #warning "Ctx thread-safe disabled"
415 #endif
416 
417 #if DEVICE_THREAD_SAFE
418 typedef std::mutex DeviceMutex;
419 #else
420 typedef FakeMutex DeviceMutex;
421 #warning "Device thread-safe disabled"
422 #endif
423 
424 //
425 //---
426 // Protects access to the member _data with a lock acquired on construction/destruction.
427 // T must contain a _mutex field which meets the BasicLockable requirements (lock/unlock)
428 template <typename T>
430  public:
431  LockedAccessor(T& criticalData, bool autoUnlock = true)
432  : _criticalData(&criticalData),
433  _autoUnlock(autoUnlock)
434 
435  {
436  tprintf(DB_SYNC, "locking criticalData=%p for %s..\n", _criticalData,
437  ToString(_criticalData->_parent).c_str());
438  _criticalData->_mutex.lock();
439  };
440 
441  ~LockedAccessor() {
442  if (_autoUnlock) {
443  tprintf(DB_SYNC, "auto-unlocking criticalData=%p for %s...\n", _criticalData,
444  ToString(_criticalData->_parent).c_str());
445  _criticalData->_mutex.unlock();
446  }
447  }
448 
449  void unlock() {
450  tprintf(DB_SYNC, "unlocking criticalData=%p for %s...\n", _criticalData,
451  ToString(_criticalData->_parent).c_str());
452  _criticalData->_mutex.unlock();
453  }
454 
455  // Syntactic sugar so -> can be used to get the underlying type.
456  T* operator->() { return _criticalData; };
457 
458  private:
459  T* _criticalData;
460  bool _autoUnlock;
461 };
462 
463 
// Base for "critical data" structures: owns the mutex and forwards explicit
// lock operations to it.
template <typename MUTEX_TYPE>
struct LockedBase {
    MUTEX_TYPE _mutex;

    // Experts-only interface for explicit locking.
    // Most uses should go through the lock-accessor instead.
    void lock() {
        _mutex.lock();
    }

    void unlock() {
        _mutex.unlock();
    }

    // Returns whatever the underlying mutex reports for try_lock().
    bool try_lock() {
        const bool acquired = _mutex.try_lock();
        return acquired;
    }
};
474 
475 
476 template <typename MUTEX_TYPE>
477 class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE> {
478  public:
479  ihipStreamCriticalBase_t(ihipStream_t* parentStream, hc::accelerator_view av)
480  : _kernelCnt(0), _av(av), _parent(parentStream){};
481 
483 
486  return this;
487  };
488 
489  void munlock() {
490  tprintf(DB_SYNC, "munlocking criticalData=%p for %s...\n", this,
491  ToString(this->_parent).c_str());
493  };
494 
496  bool gotLock = LockedBase<MUTEX_TYPE>::try_lock();
497  tprintf(DB_SYNC, "mtry_locking=%d criticalData=%p for %s...\n", gotLock, this,
498  ToString(this->_parent).c_str());
499  return gotLock ? this : nullptr;
500  };
501 
502  public:
503  ihipStream_t* _parent;
504  uint32_t _kernelCnt; // Count of inflight kernels in this stream. Reset at ::wait().
505 
506  hc::accelerator_view _av;
507 
508  private:
509 };
510 
511 
512 // if HIP code needs to acquire locks for both ihipCtx_t and ihipStream_t, it should first acquire
513 // the lock for the ihipCtx_t and then for the individual streams. The locks should not be acquired
514 // in reverse order or deadlock may occur. In some cases, it may be possible to reduce the range
515 // where the locks must be held. HIP routines should avoid acquiring and releasing the same lock
516 // during the execution of a single HIP API. Another option is to use try_lock in the innermost lock
517 // query.
518 
519 
522 
523 //---
524 // Internal stream structure.
526  public:
527  enum ScheduleMode { Auto, Spin, Yield };
528  typedef uint64_t SeqNum_t;
529 
530  // TODO - make av a reference to avoid shared_ptr overhead?
531  ihipStream_t(ihipCtx_t* ctx, hc::accelerator_view av, unsigned int flags);
532  ~ihipStream_t();
533 
534  // kind is hipMemcpyKind
535  void locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind,
536  bool resolveOn = true);
537 
538  void locked_copy2DSync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind,
539  bool resolveOn = true);
540 
541  void locked_copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind);
542 
543  void locked_copy2DAsync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind);
544 
545  void lockedSymbolCopySync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes,
546  size_t offset, unsigned kind);
547  void lockedSymbolCopyAsync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes,
548  size_t offset, unsigned kind);
549 
550  //---
551  // Member functions that begin with locked_ are thread-safe accessors - these acquire / release
552  // the critical mutex.
553  LockedAccessor_StreamCrit_t lockopen_preKernelCommand();
554  void lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av);
555 
556 
557  void locked_wait();
558 
559  hc::accelerator_view* locked_getAv() {
560  LockedAccessor_StreamCrit_t crit(_criticalData);
561  return &(crit->_av);
562  };
563 
564  void locked_streamWaitEvent(ihipEventData_t& event);
565  hc::completion_future locked_recordEvent(hipEvent_t event);
566 
567  bool locked_eventIsReady(hipEvent_t event);
568  void locked_eventWaitComplete(hc::completion_future& marker, hc::hcWaitMode waitMode);
569 
570  ihipStreamCritical_t& criticalData() { return _criticalData; };
571 
572  //---
573  hc::hcWaitMode waitMode() const;
574 
575  // Use this if we already have the stream critical data mutex:
576  void wait(LockedAccessor_StreamCrit_t& crit);
577 
578  void launchModuleKernel(hc::accelerator_view av, hsa_signal_t signal, uint32_t blockDimX,
579  uint32_t blockDimY, uint32_t blockDimZ, uint32_t gridDimX,
580  uint32_t gridDimY, uint32_t gridDimZ, uint32_t groupSegmentSize,
581  uint32_t sharedMemBytes, void* kernarg, size_t kernSize,
582  uint64_t kernel);
583 
584 
585  //-- Non-racy accessors:
586  // These functions access fields set at initialization time and are non-racy (so do not acquire
587  // mutex)
588  const ihipDevice_t* getDevice() const;
589  ihipCtx_t* getCtx() const;
590 
591  // Before calling this function, stream must be resolved from "0" to the actual stream:
592  bool isDefaultStream() const { return _id == 0; };
593 
594  public:
595  //---
596  // Public member vars - these are set at initialization and never change:
597  SeqNum_t _id; // monotonic sequence ID. 0 is the default stream.
598  unsigned _flags;
599 
600 
601  private:
602  // The unsigned return is hipMemcpyKind
603  unsigned resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem);
604  void resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPointerInfo* dstPtrInfo,
605  const hc::AmPointerInfo* srcPtrInfo, hc::hcCommandKind* hcCopyDir,
606  ihipCtx_t** copyDevice, bool* forceUnpinnedCopy);
607 
608  bool canSeeMemory(const ihipCtx_t* thisCtx, const hc::AmPointerInfo* dstInfo,
609  const hc::AmPointerInfo* srcInfo);
610 
611  void addSymbolPtrToTracker(hc::accelerator& acc, void* ptr, size_t sizeBytes);
612 
613 
614  public: // TODO - move private
615  // Critical Data - MUST be accessed through LockedAccessor_StreamCrit_t
616  ihipStreamCritical_t _criticalData;
617 
618  private: // Data
619  std::mutex _hasQueueLock;
620 
621  ihipCtx_t* _ctx; // parent context that owns this stream.
622 
623  // Friends:
624  friend std::ostream& operator<<(std::ostream& os, const ihipStream_t& s);
625  friend hipError_t hipStreamQuery(hipStream_t);
626 
627  ScheduleMode _scheduleMode;
628 };
629 
630 
631 //----
632 // Internal structure for stream callback handler
634  public:
635  ihipStreamCallback_t(hipStream_t stream, hipStreamCallback_t callback, void* userData)
636  : _stream(stream), _callback(callback), _userData(userData) {
637  };
638  hipStream_t _stream;
639  hipStreamCallback_t _callback;
640  void* _userData;
641 };
642 
643 
644 //----
645 // Internal event structure:
// Lifecycle states of an internal event.
enum hipEventStatus_t {
    // Event is uninitialized, must be "Created" before use.
    hipEventStatusUnitialized = 0,
    // Event created, but not yet Recorded.
    hipEventStatusCreated = 1,
    // Event has been recorded into a stream but not completed yet.
    hipEventStatusRecording = 2,
    // Event has been recorded - timestamps are valid.
    hipEventStatusComplete = 3,
};
652 
653 // TODO - rename to ihip type of some kind
// Kind of event; the numeric values are spelled out but unchanged (0, 1, 2).
enum ihipEventType_t {
    hipEventTypeIndependent = 0,
    hipEventTypeStartCommand = 1,
    hipEventTypeStopCommand = 2,
};
659 
660 
662  ihipEventData_t() {
663  _state = hipEventStatusCreated;
664  _stream = NULL;
665  _timestamp = 0;
666  _type = hipEventTypeIndependent;
667  };
668 
669  void marker(const hc::completion_future& marker) { _marker = marker; };
670  hc::completion_future& marker() { return _marker; }
671  uint64_t timestamp() const { return _timestamp; };
672  ihipEventType_t type() const { return _type; };
673 
674  ihipEventType_t _type;
675  hipEventStatus_t _state;
676  hipStream_t _stream; // Stream where the event is recorded. Null stream is resolved to actual
677  // stream when recorded
678  uint64_t _timestamp; // store timestamp, may be set on host or by marker.
679  private:
680  hc::completion_future _marker;
681 };
682 
683 
684 //=============================================================================
685 // class ihipEventCriticalBase_t
686 template <typename MUTEX_TYPE>
687 class ihipEventCriticalBase_t : LockedBase<MUTEX_TYPE> {
688  public:
689  explicit ihipEventCriticalBase_t(const ihipEvent_t* parentEvent) : _parent(parentEvent) {}
691 
692  // Keep data in structure so it can be easily copied into snapshots
693  // (used to reduce lock contention and preserve correct lock order)
694  ihipEventData_t _eventData;
695 
696  private:
697  const ihipEvent_t* _parent;
698  friend class LockedAccessor<ihipEventCriticalBase_t>;
699 };
700 
702 
704 
705 // internal hip event structure.
706 class ihipEvent_t {
707  public:
708  explicit ihipEvent_t(unsigned flags);
709  void attachToCompletionFuture(const hc::completion_future* cf, hipStream_t stream,
710  ihipEventType_t eventType);
711  std::pair<hipEventStatus_t, uint64_t> refreshEventStatus(); // returns pair <state, timestamp>
712 
713 
714  // Return a copy of the critical state. The critical data is locked during the copy.
715  ihipEventData_t locked_copyCrit() {
716  LockedAccessor_EventCrit_t crit(_criticalData);
717  return _criticalData._eventData;
718  };
719 
720  ihipEventCritical_t& criticalData() { return _criticalData; };
721 
722  public:
723  unsigned _flags;
724 
725  private:
726  ihipEventCritical_t _criticalData;
727 
728  friend hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream);
729 };
730 
731 
732 //=============================================================================
733 // class ihipDeviceCriticalBase_t
734 template <typename MUTEX_TYPE>
736  public:
737  explicit ihipDeviceCriticalBase_t(ihipDevice_t* parentDevice)
738  : _parent(parentDevice), _ctxCount(0){};
739 
741 
742  // Contexts:
743  void addContext(ihipCtx_t* ctx);
744  void removeContext(ihipCtx_t* ctx);
745  std::list<ihipCtx_t*>& ctxs() { return _ctxs; };
746  const std::list<ihipCtx_t*>& const_ctxs() const { return _ctxs; };
747  int getcount() { return _ctxCount; };
748  friend class LockedAccessor<ihipDeviceCriticalBase_t>;
749 
750  private:
751  ihipDevice_t* _parent;
752 
753  //--- Context Tracker:
754  std::list<ihipCtx_t*> _ctxs; // contexts associated with this device across all threads.
755 
756  int _ctxCount;
757 };
758 
760 
762 
763 //----
764 // Properties of the HIP device.
765 // Multiple contexts can point to same device.
767  public:
768  ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerator& acc);
769  ~ihipDevice_t();
770 
771  // Accessors:
772  ihipCtx_t* getPrimaryCtx() const { return _primaryCtx; };
773  void locked_removeContext(ihipCtx_t* c);
774  void locked_reset();
775  ihipDeviceCritical_t& criticalData() { return _criticalData; };
776 
777  public:
778  unsigned _deviceId; // device ID
779 
780  hc::accelerator _acc;
781  hsa_agent_t _hsaAgent; // hsa agent handle
782 
784  unsigned _computeUnits;
785  hipDeviceProp_t _props; // saved device properties.
786 
787  // TODO - report this through device properties, base on HCC API call.
788  int _isLargeBar;
789 
790  ihipCtx_t* _primaryCtx;
791 
792  int _state; // 1 if device is set otherwise 0
793 
794  private:
795  hipError_t initProperties(hipDeviceProp_t* prop);
796 
797  private:
798  ihipDeviceCritical_t _criticalData;
799 };
800 //=============================================================================
801 
802 
803 //---
804 //
805 struct ihipExec_t {
806  dim3 _gridDim;
807  dim3 _blockDim;
808  size_t _sharedMem;
809  hipStream_t _hStream;
810  std::vector<char> _arguments;
811 };
812 
813 //=============================================================================
814 // class ihipCtxCriticalBase_t
815 template <typename MUTEX_TYPE>
816 class ihipCtxCriticalBase_t : LockedBase<MUTEX_TYPE> {
817  public:
818  ihipCtxCriticalBase_t(ihipCtx_t* parentCtx, unsigned deviceCnt)
819  : _parent(parentCtx), _peerCnt(0) {
820  _peerAgents = new hsa_agent_t[deviceCnt];
821  };
822 
824  if (_peerAgents != nullptr) {
825  delete _peerAgents;
826  _peerAgents = nullptr;
827  }
828  _peerCnt = 0;
829  }
830 
831  // Streams:
832  void addStream(ihipStream_t* stream);
833  std::list<ihipStream_t*>& streams() { return _streams; };
834  const std::list<ihipStream_t*>& const_streams() const { return _streams; };
835 
836 
837  // Peer Accessor classes:
838  bool isPeerWatcher(const ihipCtx_t* peer); // returns True if peer has access to memory
839  // physically located on this device.
840  bool addPeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer);
841  bool removePeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer);
842  void resetPeerWatchers(ihipCtx_t* thisDevice);
843  void printPeerWatchers(FILE* f) const;
844 
845  uint32_t peerCnt() const { return _peerCnt; };
846  hsa_agent_t* peerAgents() const { return _peerAgents; };
847 
848 
849  // TODO - move private
850  std::list<ihipCtx_t*> _peers; // list of enabled peer devices.
851  //--- Execution stack:
852  std::stack<ihipExec_t> _execStack; // Execution stack for this device.
853 
854  friend class LockedAccessor<ihipCtxCriticalBase_t>;
855 
856  private:
857  ihipCtx_t* _parent;
858 
859  //--- Stream Tracker:
860  std::list<ihipStream_t*> _streams; // streams associated with this device.
861 
862 
863  //--- Peer Tracker:
864  // These reflect the currently Enabled set of peers for this GPU:
865  // Enabled peers have permissions to access the memory physically allocated on this device.
866  // Note the peers always contain the self agent for easy interfacing with HSA APIs.
867  uint32_t _peerCnt; // number of enabled peers
868  hsa_agent_t* _peerAgents; // efficient packed array of enabled agents (to use for allocations.)
869  private:
870  void recomputePeerAgents();
871 };
872 // Note Mutex type Real/Fake selected based on CtxMutex
874 
875 // This type is used by functions that need access to the critical device structures.
877 //=============================================================================
878 
879 
880 //=============================================================================
881 // class ihipCtx_t:
882 // A HIP CTX (context) points at one of the existing devices and contains the streams,
883 // peer-to-peer mappings, creation flags. Multiple contexts can point to the same
884 // device.
885 //
886 class ihipCtx_t {
887  public: // Functions:
888  ihipCtx_t(ihipDevice_t* device, unsigned deviceCnt,
889  unsigned flags); // note: calls constructor for _criticalData
890  ~ihipCtx_t();
891 
892  // Functions which read or write the critical data are named locked_.
893  // (might be better called "locking_"
894  // ihipCtx_t does not use recursive locks so the ihip implementation must avoid calling a
895  // locked_ function from within a locked_ function. External functions which call several
896  // locked_ functions will acquire and release the lock for each function. if this occurs in
897  // performance-sensitive code we may want to refactor by adding non-locked functions and
898  // creating a new locked_ member function to call them all.
899  void locked_removeStream(ihipStream_t* s);
900  void locked_reset();
901  void locked_waitAllStreams();
902  void locked_syncDefaultStream(bool waitOnSelf, bool syncHost);
903 
904  ihipCtxCritical_t& criticalData() { return _criticalData; };
905 
906  const ihipDevice_t* getDevice() const { return _device; };
907  int getDeviceNum() const { return _device->_deviceId; };
908 
909  // TODO - review uses of getWriteableDevice(), can these be converted to getDevice()
910  ihipDevice_t* getWriteableDevice() const { return _device; };
911 
912  std::string toString() const;
913 
914  public: // Data
915  // The NULL stream is used if no other stream is specified.
916  // Default stream has special synchronization properties with other streams.
917  ihipStream_t* _defaultStream;
918 
919  // Flags specified when the context is created:
920  unsigned _ctxFlags;
921 
922  private:
923  ihipDevice_t* _device;
924 
925 
926  private: // Critical data, protected with locked access:
927  // Members of _protected data MUST be accessed through the LockedAccessor.
928  // Search for LockedAccessor<ihipCtxCritical_t> for examples; do not access _criticalData
929  // directly.
930  ihipCtxCritical_t _criticalData;
931 };
932 
933 
934 //=================================================================================================
935 // Global variable definition:
936 extern std::once_flag hip_initialized;
937 extern unsigned g_deviceCnt;
938 extern hsa_agent_t g_cpu_agent; // the CPU agent.
939 extern hsa_agent_t* g_allAgents; // CPU agents + all the visible GPU agents.
940 
941 //=================================================================================================
942 // Extern functions:
943 extern void ihipInit();
944 extern const char* ihipErrorString(hipError_t);
945 extern ihipCtx_t* ihipGetTlsDefaultCtx();
946 extern void ihipSetTlsDefaultCtx(ihipCtx_t* ctx);
947 extern hipError_t ihipSynchronize(void);
948 extern void ihipCtxStackUpdate();
949 extern hipError_t ihipDeviceSetState();
950 
951 extern ihipDevice_t* ihipGetDevice(int);
952 ihipCtx_t* ihipGetPrimaryCtx(unsigned deviceIndex);
953 
954 
955 hipStream_t ihipSyncAndResolveStream(hipStream_t);
956 hipError_t ihipStreamSynchronize(hipStream_t stream);
957 void ihipStreamCallbackHandler(ihipStreamCallback_t* cb);
958 
959 // Stream printf functions:
960 inline std::ostream& operator<<(std::ostream& os, const ihipStream_t& s) {
961  os << "stream:";
962  os << s.getDevice()->_deviceId;
963  ;
964  os << '.';
965  os << s._id;
966  return os;
967 }
968 
969 inline std::ostream& operator<<(std::ostream& os, const dim3& s) {
970  os << '{';
971  os << s.x;
972  os << ',';
973  os << s.y;
974  os << ',';
975  os << s.z;
976  os << '}';
977  return os;
978 }
979 
980 inline std::ostream& operator<<(std::ostream& os, const gl_dim3& s) {
981  os << '{';
982  os << s.x;
983  os << ',';
984  os << s.y;
985  os << ',';
986  os << s.z;
987  os << '}';
988  return os;
989 }
990 
991 // Stream printf functions:
992 inline std::ostream& operator<<(std::ostream& os, const hipEvent_t& e) {
993  os << "event:" << std::hex << static_cast<void*>(e);
994  return os;
995 }
996 
997 inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) {
998  os << "ctx:" << static_cast<const void*>(c) << ".dev:" << c->getDevice()->_deviceId;
999  return os;
1000 }
1001 
1002 
1003 // Helper functions that are used across src files:
1004 namespace hip_internal {
1005 hipError_t memcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
1006  hipStream_t stream);
1007 };
1008 
1009 
1010 #endif
Definition: hip_hcc_internal.h:234
Definition: hip_hcc_internal.h:766
Definition: hip_hcc_internal.h:465
Definition: hip_hcc_internal.h:109
Definition: hip_hcc_internal.h:388
Definition: hip_hcc_internal.h:363
uint32_t x
x
Definition: hip_runtime_api.h:246
Definition: grid_launch.h:17
Definition: hip_hcc_internal.h:805
Definition: hip_hcc_internal.h:886
Definition: hip_runtime_api.h:245
uint32_t y
y
Definition: hip_runtime_api.h:247
void(* hipStreamCallback_t)(hipStream_t stream, hipError_t status, void *userData)
Definition: hip_runtime_api.h:796
Definition: hip_hcc_internal.h:633
Definition: hip_hcc_internal.h:735
unsigned _computeUnits
Number of compute units supported by the device:
Definition: hip_hcc_internal.h:784
uint32_t z
z
Definition: hip_runtime_api.h:248
Definition: hip_runtime_api.h:83
Definition: hip_hcc_internal.h:661
Definition: hip_hcc_internal.h:687
Definition: hip_hcc_internal.h:1004
Definition: hip_hcc_internal.h:339
Definition: hip_hcc_internal.h:706
Definition: hip_hcc_internal.h:816
Definition: hip_hcc_internal.h:525
Definition: hip_hcc_internal.h:477
hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
Record an event in the specified stream.
Definition: hip_event.cpp:110
hsa_amd_ipc_memory_t ipc_handle
ipc memory handle on ROCr
Definition: hip_hcc_internal.h:366
Definition: hip_hcc_internal.h:429
Definition: hip_hcc_internal.h:373
hipError_t hipStreamQuery(hipStream_t stream)
Return #hipSuccess if all of the operations in the specified stream have completed, or #hipErrorNotReady if not.
Definition: hip_stream.cpp:154
Definition: hip_hcc_internal.h:92