HIP: Heterogenous-computing Interface for Portability
hip_hcc_internal.h
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
23 #ifndef HIP_SRC_HIP_HCC_INTERNAL_H
24 #define HIP_SRC_HIP_HCC_INTERNAL_H
25 
26 #include <hc.hpp>
27 #include <hsa/hsa.h>
28 #include <unordered_map>
29 #include <stack>
30 
31 #include "hsa/hsa_ext_amd.h"
32 #include "hip/hip_runtime.h"
33 #include "hip_util.h"
34 #include "env.h"
35 
36 
37 #if (__hcc_workweek__ < 16354)
38 #error("This version of HIP requires a newer version of HCC.");
39 #endif
40 
41 // Use the __appPtr field in the am memtracker to store the context.
42 // Requires a bug fix in HCC
43 #if defined(__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE) and \
44  (__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE != 0)
45 #define USE_APP_PTR_FOR_CTX 1
46 #endif
47 
48 
49 #define USE_IPC 1
50 
51 //---
52 // Environment variables:
53 
54 // Intended to distinguish whether an environment variable should be visible only in debug mode, or
55 // in debug+release.
56 // static const int debug = 0;
57 extern const int release;
58 
59 // TODO - this blocks both kernels and memory ops. Perhaps should have separate env var for
60 // kernels?
61 extern int HIP_LAUNCH_BLOCKING;
62 extern int HIP_API_BLOCKING;
63 
64 extern int HIP_PRINT_ENV;
65 extern int HIP_PROFILE_API;
66 // extern int HIP_TRACE_API;
67 extern int HIP_ATP;
68 extern int HIP_DB;
69 extern int HIP_STAGING_SIZE; /* size of staging buffers, in KB */
70 extern int HIP_STREAM_SIGNALS; /* number of signals to allocate at stream creation */
71 extern int HIP_VISIBLE_DEVICES; /* Contains a comma-separated sequence of GPU identifiers */
72 extern int HIP_FORCE_P2P_HOST;
73 
74 extern int HIP_HOST_COHERENT;
75 
76 extern int HIP_HIDDEN_FREE_MEM;
77 //---
78 // Chicken bits for disabling functionality to work around potential issues:
79 extern int HIP_SYNC_HOST_ALLOC;
80 extern int HIP_SYNC_STREAM_WAIT;
81 
82 extern int HIP_SYNC_NULL_STREAM;
83 extern int HIP_INIT_ALLOC;
84 extern int HIP_FORCE_NULL_STREAM;
85 
86 
87 // TODO - remove when this is standard behavior.
88 extern int HCC_OPT_FLUSH;
89 
90 
// Assigns a short TID to each new thread and tracks that thread's HIP API sequence number,
// for HIP debugging purposes.
class TidInfo {
   public:
    // Defined out of line (implementation is not part of this header).
    TidInfo();

    // Short thread id held by this object.
    int tid() const { return _shortTid; }

    // Advances the API sequence number by one and returns the updated value.
    uint64_t incApiSeqNum() {
        _apiSeqNum += 1;
        return _apiSeqNum;
    }

    // Current API sequence number; does not modify it.
    uint64_t apiSeqNum() const { return _apiSeqNum; }

   private:
    int _shortTid;

    // Monotonically increasing API sequence number for this thread.
    uint64_t _apiSeqNum;
};
106 
// Collection of API sequence numbers ("triggers") registered through add().
//
// sort() orders the triggers in DESCENDING order, so after sorting the smallest
// registered trigger sits at the back of the vector, which is what nextTrigger() returns.
struct ProfTrigger {
    // Sentinel returned by nextTrigger() when no trigger is registered.
    static const uint64_t MAX_TRIGGER = std::numeric_limits<uint64_t>::max();

    // Prints every registered trigger to stdout as "tid:<tid>.<trigger>,".
    void print(int tid) const {
        std::cout << "Enabling tracing for ";
        for (const uint64_t trigger : _profTrigger) {
            std::cout << "tid:" << tid << "." << trigger << ",";
        }
        std::cout << "\n";
    }

    // Returns the trigger at the back of the list, or MAX_TRIGGER when the list is empty.
    // Written as two returns (not a ?: mixing MAX_TRIGGER with a reference) so the in-class
    // constant is only read by value and needs no out-of-class definition.
    uint64_t nextTrigger() const {
        if (_profTrigger.empty()) {
            return MAX_TRIGGER;
        }
        return _profTrigger.back();
    }

    // Registers a trigger; call sort() afterwards to restore the descending order.
    void add(uint64_t trigger) { _profTrigger.push_back(trigger); }

    // Sorts the triggers in descending order. The comparator must operate on uint64_t:
    // comparing through int would truncate triggers above INT_MAX and mis-order them.
    void sort() { std::sort(_profTrigger.begin(), _profTrigger.end(), std::greater<uint64_t>()); }

   private:
    std::vector<uint64_t> _profTrigger;
};
125 
126 
127 //---
128 // Extern tls
129 extern thread_local hipError_t tls_lastHipError;
130 extern thread_local TidInfo tls_tidInfo;
131 extern thread_local bool tls_getPrimaryCtx;
132 
133 extern std::vector<ProfTrigger> g_dbStartTriggers;
134 extern std::vector<ProfTrigger> g_dbStopTriggers;
135 
136 //---
137 // Forward defs:
138 class ihipStream_t;
139 class ihipDevice_t;
140 class ihipCtx_t;
141 struct ihipEventData_t;
142 
143 // Color defs for debug messages:
144 #define KNRM "\x1B[0m"
145 #define KRED "\x1B[31m"
146 #define KGRN "\x1B[32m"
147 #define KYEL "\x1B[33m"
148 #define KBLU "\x1B[34m"
149 #define KMAG "\x1B[35m"
150 #define KCYN "\x1B[36m"
151 #define KWHT "\x1B[37m"
152 
153 extern const char* API_COLOR;
154 extern const char* API_COLOR_END;
155 
156 
157 // If set, thread-safety is enforced on all event/stream/ctx/device functions.
158 // Can disable for performance or functional experiments - in this case
159 // the code uses a dummy "no-op" mutex.
160 #define EVENT_THREAD_SAFE 1
161 
162 #define STREAM_THREAD_SAFE 1
163 
164 #define CTX_THREAD_SAFE 1
165 
166 #define DEVICE_THREAD_SAFE 1
167 
168 
169 // Compile debug trace mode - this prints debug messages to stderr when env var HIP_DB is set.
170 // May be set to 0 to remove debug if checks - possible code size and performance difference?
171 #define COMPILE_HIP_DB 1
172 
173 
174 // Compile HIP tracing capability.
175 // 0x1 = print a string at function entry with arguments.
176 // 0x2 = prints a simple message with function name + return code when function exits.
177 // 0x3 = print both.
178 // Must be enabled at runtime with HIP_TRACE_API
179 #define COMPILE_HIP_TRACE_API 0x3
180 
181 
182 // Compile code that generates trace markers for CodeXL ATP at HIP function begin/end.
183 // ATP is standard CodeXL format that includes timestamps for kernels, HSA RT APIs, and HIP APIs.
184 #ifndef COMPILE_HIP_ATP_MARKER
185 #define COMPILE_HIP_ATP_MARKER 0
186 #endif
187 
188 
189 // Compile support for trace markers that are displayed on CodeXL GUI at start/stop of each function
190 // boundary.
// TODO - currently we print the trace message at the beginning. If we waited, we could also
// print return codes, and any values returned through ptr-to-args (i.e. the pointers allocated
// by hipMalloc).
194 #if COMPILE_HIP_ATP_MARKER
195 #include "CXLActivityLogger.h"
196 #define MARKER_BEGIN(markerName, group) amdtBeginMarker(markerName, group, nullptr);
197 #define MARKER_END() amdtEndMarker();
198 #define RESUME_PROFILING amdtResumeProfiling(AMDT_ALL_PROFILING);
199 #define STOP_PROFILING amdtStopProfiling(AMDT_ALL_PROFILING);
200 #else
201 // Swallow scoped markers:
202 #define MARKER_BEGIN(markerName, group)
203 #define MARKER_END()
204 #define RESUME_PROFILING
205 #define STOP_PROFILING
206 #endif
207 
208 
209 //---
210 // HIP Trace modes - use with HIP_TRACE_API=...
211 #define TRACE_ALL 0 // 0x01
212 #define TRACE_KCMD 1 // 0x02, kernel command
213 #define TRACE_MCMD 2 // 0x04, memory command
214 #define TRACE_MEM 3 // 0x08, memory allocation or deallocation.
215 #define TRACE_SYNC 4 // 0x10, synchronization (host or hipStreamWaitEvent)
216 #define TRACE_QUERY 5 // 0x20, hipEventRecord, hipEventQuery, hipStreamQuery
217 
218 
219 //---
220 // HIP_DB Debug flags:
221 #define DB_API 0 /* 0x01 - shortcut to enable HIP_TRACE_API on single switch */
222 #define DB_SYNC 1 /* 0x02 - trace synchronization pieces */
223 #define DB_MEM 2 /* 0x04 - trace memory allocation / deallocation */
224 #define DB_COPY 3 /* 0x08 - trace memory copy and peer commands. . */
225 #define DB_WARN 4 /* 0x10 - warn about sub-optimal or shady behavior */
226 #define DB_MAX_FLAG 5
227 // When adding a new debug flag, also add to the char name table below.
228 //
229 //
230 
231 struct DbName {
232  const char* _color;
233  const char* _shortName;
234 };
235 
236 // This table must be kept in-sync with the defines above.
237 static const DbName dbName[] = {
238  {KGRN, "api"}, // not used,
239  {KYEL, "sync"}, {KCYN, "mem"}, {KMAG, "copy"}, {KRED, "warn"},
240 };
241 
242 
// tprintf(trace_level, fmt, ...): printf-style debug trace written to stderr.
// The message is emitted only when bit `trace_level` is set in HIP_DB. It is prefixed with the
// color and short name taken from dbName[trace_level] and the calling thread's short tid, and
// followed by KNRM. The text is formatted with snprintf into a 1000-byte local buffer, so longer
// messages are truncated.
// NOTE(review): the enabled form expands to a bare { ... } block rather than do { } while (0),
// so `if (c) tprintf(...); else ...` does not compile - check call sites before changing it.
#if COMPILE_HIP_DB
#define tprintf(trace_level, ...) \
    { \
        if (HIP_DB & (1 << (trace_level))) { \
            char msgStr[1000]; \
            snprintf(msgStr, sizeof(msgStr), __VA_ARGS__); \
            fprintf(stderr, " %ship-%s tid:%d:%s%s", dbName[trace_level]._color, \
                    dbName[trace_level]._shortName, tls_tidInfo.tid(), msgStr, KNRM); \
        } \
    }
#else
/* Compile to empty code */
#define tprintf(trace_level, ...)
#endif
257 
258 
259 static inline uint64_t getTicks() { return hc::get_system_ticks(); }
260 
261 //---
262 extern uint64_t recordApiTrace(std::string* fullStr, const std::string& apiStr);
263 
// API_TRACE(forceTrace, args...): bookkeeping performed at the entry of a HIP API.
//
// Both variants declare `uint64_t hipApiStartTick` in the caller's scope and bump the calling
// thread's API sequence number. ihipLogStatus reads hipApiStartTick unconditionally, so the
// variable must exist even when tracing is compiled out.
//
// Tracing variant: when forceTrace is non-zero, or HIP_PROFILE_API is set, or the TRACE_ALL bit
// of HIP_TRACE_API is set (and COMPILE_HIP_DB is enabled), it builds "<func> (<args>)" via
// ToString, passes it to recordApiTrace (whose return value becomes hipApiStartTick), and opens
// a marker: the function name for HIP_PROFILE_API == 1, the full trace string for == 2.
#if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1)
#define API_TRACE(forceTrace, ...) \
    uint64_t hipApiStartTick = 0; \
    { \
        tls_tidInfo.incApiSeqNum(); \
        if ((forceTrace) || \
            (HIP_PROFILE_API || (COMPILE_HIP_DB && (HIP_TRACE_API & (1 << TRACE_ALL))))) { \
            std::string apiStr = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')'; \
            std::string fullStr; \
            hipApiStartTick = recordApiTrace(&fullStr, apiStr); \
            if (HIP_PROFILE_API == 0x1) { \
                MARKER_BEGIN(__func__, "HIP") \
            } else if (HIP_PROFILE_API == 0x2) { \
                MARKER_BEGIN(fullStr.c_str(), "HIP"); \
            } \
        } \
    }

#else
// Swallow API_TRACE: no trace output, but still declare hipApiStartTick (required by
// ihipLogStatus) and advance the API sequence number. The (void) cast keeps callers that never
// use ihipLogStatus free of unused-variable warnings.
#define API_TRACE(IS_CMD, ...) \
    uint64_t hipApiStartTick = 0; \
    (void)hipApiStartTick; \
    tls_tidInfo.incApiSeqNum();
#endif
286 
287 
// HIP_INIT(): just initialize the HIP runtime, but don't log any trace information.
// Runs ihipInit through std::call_once on hip_initialized, then calls ihipCtxStackUpdate() on
// every invocation.
// NOTE(review): expands to two statements, so it must not be used as the sole body of an
// unbraced if/else.
#define HIP_INIT() \
    std::call_once(hip_initialized, ihipInit); \
    ihipCtxStackUpdate();
// HIP_SET_DEVICE(): calls ihipDeviceSetState(); the returned hipError_t is discarded.
#define HIP_SET_DEVICE() ihipDeviceSetState();
293 
294 
// This macro should be called at the beginning of every HIP API.
// It initializes the hip runtime (exactly once), and
// generates a trace string that can be output to stderr or to ATP file.
// Expands to HIP_INIT() followed by API_TRACE(0, ...): tracing is not forced, so it happens
// only when enabled through the runtime settings checked inside API_TRACE.
#define HIP_INIT_API(...) \
    HIP_INIT() \
    API_TRACE(0, __VA_ARGS__);
301 
302 
// Like HIP_INIT_API, but additionally forces the trace when bit `tbit` (one of the TRACE_* bit
// indices) is set in HIP_TRACE_API.
// Replace HIP_INIT_API with this call inside HIP APIs that launch work on the GPU:
// kernel launches, copy commands, memory sets, etc.
// `tbit` is parenthesized so that an expression argument cannot change the meaning of the shift.
#define HIP_INIT_SPECIAL_API(tbit, ...) \
    HIP_INIT() \
    API_TRACE((HIP_TRACE_API & (1 << (tbit))), __VA_ARGS__);
309 
310 
// This macro should be called at the end of every HIP API, and only at the end of top-level hip
// APIs (not internal hip). It has dual function: logs the last error returned for use by
// hipGetLastError, and also prints the closing message when the debug trace is enabled.
//
// Written as a statement-expression `({ ... })` (a compiler extension), so the macro evaluates to
// the status value and can be used directly in `return ihipLogStatus(e);`.
// Requires `hipApiStartTick` to be in scope (declared by API_TRACE): the elapsed value printed
// is getTicks() - hipApiStartTick. MARKER_END() is invoked whenever HIP_PROFILE_API is non-zero.
#define ihipLogStatus(hipStatus) \
    ({ \
        hipError_t localHipStatus = hipStatus; /*local copy so hipStatus only evaluated once*/ \
        tls_lastHipError = localHipStatus; \
        \
        if ((COMPILE_HIP_TRACE_API & 0x2) && HIP_TRACE_API & (1 << TRACE_ALL)) { \
            auto ticks = getTicks() - hipApiStartTick; \
            fprintf(stderr, " %ship-api tid:%d.%lu %-30s ret=%2d (%s)>> +%lu ns%s\n", \
                    (localHipStatus == 0) ? API_COLOR : KRED, tls_tidInfo.tid(), \
                    tls_tidInfo.apiSeqNum(), __func__, localHipStatus, \
                    ihipErrorString(localHipStatus), ticks, API_COLOR_END); \
        } \
        if (HIP_PROFILE_API) { \
            MARKER_END(); \
        } \
        localHipStatus; \
    })
331 
332 
333 class ihipException : public std::exception {
334  public:
335  explicit ihipException(hipError_t e) : _code(e){};
336 
337  hipError_t _code;
338 };
339 
340 
341 #ifdef __cplusplus
342 extern "C" {
343 #endif
344 
345 
346 #ifdef __cplusplus
347 }
348 #endif
349 
350 const hipStream_t hipStreamNull = 0x0;
351 
352 
356 #define HIP_IPC_RESERVED_SIZE 24
358  public:
359 #if USE_IPC
360  hsa_amd_ipc_memory_t ipc_handle;
361 #endif
362  size_t psize;
363  char reserved[HIP_IPC_RESERVED_SIZE];
364 };
365 
366 
367 struct ihipModule_t {
368  std::string fileName;
369  hsa_executable_t executable = {};
370  hsa_code_object_reader_t coReader = {};
371 
372  ~ihipModule_t() {
373  if (executable.handle) hsa_executable_destroy(executable);
374  if (coReader.handle) hsa_code_object_reader_destroy(coReader);
375  }
376 };
377 
378 
379 //---
// No-op stand-in for a mutex: lock/unlock do nothing and try_lock always reports success.
// Used to remove lock, for performance or stimulating bugs.
class FakeMutex {
   public:
    void lock() { /* intentionally empty */ }
    void unlock() { /* intentionally empty */ }

    bool try_lock() {
        // Nothing is ever held, so acquisition cannot fail.
        const bool acquired = true;
        return acquired;
    }
};
387 
388 #if EVENT_THREAD_SAFE
389 typedef std::mutex EventMutex;
390 #else
391 #warning "Stream thread-safe disabled"
392 typedef FakeMutex EventMutex;
393 #endif
394 
395 #if STREAM_THREAD_SAFE
396 typedef std::mutex StreamMutex;
397 #else
398 #warning "Stream thread-safe disabled"
399 typedef FakeMutex StreamMutex;
400 #endif
401 
402 // Pair Device and Ctx together, these could also be toggled separately if desired.
403 #if CTX_THREAD_SAFE
404 typedef std::mutex CtxMutex;
405 #else
406 typedef FakeMutex CtxMutex;
407 #warning "Ctx thread-safe disabled"
408 #endif
409 
410 #if DEVICE_THREAD_SAFE
411 typedef std::mutex DeviceMutex;
412 #else
413 typedef FakeMutex DeviceMutex;
414 #warning "Device thread-safe disabled"
415 #endif
416 
417 //
418 //---
// Protects access to the member _criticalData with a lock acquired on construction and released
// on destruction (when autoUnlock is set).
420 // T must contain a _mutex field which meets the BasicLockable requirements (lock/unlock)
421 template <typename T>
423  public:
424  LockedAccessor(T& criticalData, bool autoUnlock = true)
425  : _criticalData(&criticalData),
426  _autoUnlock(autoUnlock)
427 
428  {
429  tprintf(DB_SYNC, "locking criticalData=%p for %s..\n", _criticalData,
430  ToString(_criticalData->_parent).c_str());
431  _criticalData->_mutex.lock();
432  };
433 
434  ~LockedAccessor() {
435  if (_autoUnlock) {
436  tprintf(DB_SYNC, "auto-unlocking criticalData=%p for %s...\n", _criticalData,
437  ToString(_criticalData->_parent).c_str());
438  _criticalData->_mutex.unlock();
439  }
440  }
441 
442  void unlock() {
443  tprintf(DB_SYNC, "unlocking criticalData=%p for %s...\n", _criticalData,
444  ToString(_criticalData->_parent).c_str());
445  _criticalData->_mutex.unlock();
446  }
447 
448  // Syntactic sugar so -> can be used to get the underlying type.
449  T* operator->() { return _criticalData; };
450 
451  private:
452  T* _criticalData;
453  bool _autoUnlock;
454 };
455 
456 
// Base that owns a mutex of type MUTEX_TYPE and forwards lock/unlock/try_lock to it.
template <typename MUTEX_TYPE>
struct LockedBase {
    MUTEX_TYPE _mutex;

    // Experts-only interface for explicit locking.
    // Most uses should use the lock-accessor.
    bool try_lock() { return _mutex.try_lock(); }
    void lock() { _mutex.lock(); }
    void unlock() { _mutex.unlock(); }
};
467 
468 
469 template <typename MUTEX_TYPE>
470 class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE> {
471  public:
472  ihipStreamCriticalBase_t(ihipStream_t* parentStream, hc::accelerator_view av)
473  : _kernelCnt(0), _av(av), _parent(parentStream){};
474 
476 
479  return this;
480  };
481 
482  void munlock() {
483  tprintf(DB_SYNC, "munlocking criticalData=%p for %s...\n", this,
484  ToString(this->_parent).c_str());
486  };
487 
489  bool gotLock = LockedBase<MUTEX_TYPE>::try_lock();
490  tprintf(DB_SYNC, "mtry_locking=%d criticalData=%p for %s...\n", gotLock, this,
491  ToString(this->_parent).c_str());
492  return gotLock ? this : nullptr;
493  };
494 
495  public:
496  ihipStream_t* _parent;
497  uint32_t _kernelCnt; // Count of inflight kernels in this stream. Reset at ::wait().
498 
499  hc::accelerator_view _av;
500 
501  private:
502 };
503 
504 
505 // if HIP code needs to acquire locks for both ihipCtx_t and ihipStream_t, it should first acquire
506 // the lock for the ihipCtx_t and then for the individual streams. The locks should not be acquired
507 // in reverse order or deadlock may occur. In some cases, it may be possible to reduce the range
508 // where the locks must be held. HIP routines should avoid acquiring and releasing the same lock
509 // during the execution of a single HIP API. Another option is to use try_lock in the innermost lock
510 // query.
511 
512 
515 
516 //---
517 // Internal stream structure.
519  public:
520  enum ScheduleMode { Auto, Spin, Yield };
521  typedef uint64_t SeqNum_t;
522 
// TODO - make av a reference to avoid shared_ptr overhead?
524  ihipStream_t(ihipCtx_t* ctx, hc::accelerator_view av, unsigned int flags);
525  ~ihipStream_t();
526 
527  // kind is hipMemcpyKind
528  void locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind,
529  bool resolveOn = true);
530 
531  void locked_copy2DSync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch,
532  size_t dstPitch, unsigned kind, bool resolveOn = true);
533 
534  void locked_copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind);
535 
536  void locked_copy2DAsync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch,
537  size_t dstPitch, unsigned kind);
538 
539  void lockedSymbolCopySync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes,
540  size_t offset, unsigned kind);
541  void lockedSymbolCopyAsync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes,
542  size_t offset, unsigned kind);
543 
544  //---
545  // Member functions that begin with locked_ are thread-safe accessors - these acquire / release
546  // the critical mutex.
547  LockedAccessor_StreamCrit_t lockopen_preKernelCommand();
548  void lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av);
549 
550 
551  void locked_wait();
552 
553  hc::accelerator_view* locked_getAv() {
554  LockedAccessor_StreamCrit_t crit(_criticalData);
555  return &(crit->_av);
556  };
557 
558  void locked_streamWaitEvent(ihipEventData_t& event);
559  hc::completion_future locked_recordEvent(hipEvent_t event);
560 
561  bool locked_eventIsReady(hipEvent_t event);
562  void locked_eventWaitComplete(hc::completion_future& marker, hc::hcWaitMode waitMode);
563 
564  ihipStreamCritical_t& criticalData() { return _criticalData; };
565 
566  //---
567  hc::hcWaitMode waitMode() const;
568 
569  // Use this if we already have the stream critical data mutex:
570  void wait(LockedAccessor_StreamCrit_t& crit);
571 
572  void launchModuleKernel(hc::accelerator_view av, hsa_signal_t signal, uint32_t blockDimX,
573  uint32_t blockDimY, uint32_t blockDimZ, uint32_t gridDimX,
574  uint32_t gridDimY, uint32_t gridDimZ, uint32_t groupSegmentSize,
575  uint32_t sharedMemBytes, void* kernarg, size_t kernSize,
576  uint64_t kernel);
577 
578 
579  //-- Non-racy accessors:
580  // These functions access fields set at initialization time and are non-racy (so do not acquire
581  // mutex)
582  const ihipDevice_t* getDevice() const;
583  ihipCtx_t* getCtx() const;
584 
585  // Before calling this function, stream must be resolved from "0" to the actual stream:
586  bool isDefaultStream() const { return _id == 0; };
587 
588  public:
589  //---
590  // Public member vars - these are set at initialization and never change:
591  SeqNum_t _id; // monotonic sequence ID. 0 is the default stream.
592  unsigned _flags;
593 
594 
595  private:
596  // The unsigned return is hipMemcpyKind
597  unsigned resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem);
598  void resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPointerInfo* dstPtrInfo,
599  const hc::AmPointerInfo* srcPtrInfo, hc::hcCommandKind* hcCopyDir,
600  ihipCtx_t** copyDevice, bool* forceUnpinnedCopy);
601 
602  bool canSeeMemory(const ihipCtx_t* thisCtx, const hc::AmPointerInfo* dstInfo,
603  const hc::AmPointerInfo* srcInfo);
604 
605  void addSymbolPtrToTracker(hc::accelerator& acc, void* ptr, size_t sizeBytes);
606 
607 
608  public: // TODO - move private
609  // Critical Data - MUST be accessed through LockedAccessor_StreamCrit_t
610  ihipStreamCritical_t _criticalData;
611 
612  private: // Data
613  std::mutex _hasQueueLock;
614 
615  ihipCtx_t* _ctx; // parent context that owns this stream.
616 
617  // Friends:
618  friend std::ostream& operator<<(std::ostream& os, const ihipStream_t& s);
619  friend hipError_t hipStreamQuery(hipStream_t);
620 
621  ScheduleMode _scheduleMode;
622 };
623 
624 
625 //----
626 // Internal structure for stream callback handler
628  public:
629  ihipStreamCallback_t(hipStream_t stream, hipStreamCallback_t callback, void* userData)
630  : _stream(stream), _callback(callback), _userData(userData) {
631  };
632  hipStream_t _stream;
633  hipStreamCallback_t _callback;
634  void* _userData;
635 };
636 
637 
638 //----
639 // Internal event structure:
enum hipEventStatus_t {
    // Event is uninitialized, must be "Created" before use.
    hipEventStatusUnitialized = 0,
    // Event created, but not yet Recorded.
    hipEventStatusCreated = 1,
    // Event has been recorded into a stream but not completed yet.
    hipEventStatusRecording = 2,
    // Event has been recorded - timestamps are valid.
    hipEventStatusComplete = 3,
};
646 
// Kind of an event; the enumerators keep their natural 0..2 numbering.
// TODO - rename to ihip type of some kind
enum ihipEventType_t {
    hipEventTypeIndependent = 0,
    hipEventTypeStartCommand = 1,
    hipEventTypeStopCommand = 2,
};
653 
654 
656  ihipEventData_t() {
657  _state = hipEventStatusCreated;
658  _stream = NULL;
659  _timestamp = 0;
660  _type = hipEventTypeIndependent;
661  };
662 
663  void marker(const hc::completion_future& marker) { _marker = marker; };
664  hc::completion_future& marker() { return _marker; }
665  uint64_t timestamp() const { return _timestamp; };
666  ihipEventType_t type() const { return _type; };
667 
668  ihipEventType_t _type;
669  hipEventStatus_t _state;
670  hipStream_t _stream; // Stream where the event is recorded. Null stream is resolved to actual
671  // stream when recorded
672  uint64_t _timestamp; // store timestamp, may be set on host or by marker.
673  private:
674  hc::completion_future _marker;
675 };
676 
677 
678 //=============================================================================
679 // class ihipEventCriticalBase_t
680 template <typename MUTEX_TYPE>
681 class ihipEventCriticalBase_t : LockedBase<MUTEX_TYPE> {
682  public:
683  explicit ihipEventCriticalBase_t(const ihipEvent_t* parentEvent) : _parent(parentEvent) {}
685 
686  // Keep data in structure so it can be easily copied into snapshots
687  // (used to reduce lock contention and preserve correct lock order)
688  ihipEventData_t _eventData;
689 
690  private:
691  const ihipEvent_t* _parent;
692  friend class LockedAccessor<ihipEventCriticalBase_t>;
693 };
694 
696 
698 
699 // internal hip event structure.
700 class ihipEvent_t {
701  public:
702  explicit ihipEvent_t(unsigned flags);
703  void attachToCompletionFuture(const hc::completion_future* cf, hipStream_t stream,
704  ihipEventType_t eventType);
705  std::pair<hipEventStatus_t, uint64_t> refreshEventStatus(); // returns pair <state, timestamp>
706 
707 
708  // Return a copy of the critical state. The critical data is locked during the copy.
709  ihipEventData_t locked_copyCrit() {
710  LockedAccessor_EventCrit_t crit(_criticalData);
711  return _criticalData._eventData;
712  };
713 
714  ihipEventCritical_t& criticalData() { return _criticalData; };
715 
716  public:
717  unsigned _flags;
718 
719  private:
720  ihipEventCritical_t _criticalData;
721 
722  friend hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream);
723 };
724 
725 
726 //=============================================================================
727 // class ihipDeviceCriticalBase_t
728 template <typename MUTEX_TYPE>
730  public:
731  explicit ihipDeviceCriticalBase_t(ihipDevice_t* parentDevice)
732  : _parent(parentDevice), _ctxCount(0){};
733 
735 
736  // Contexts:
737  void addContext(ihipCtx_t* ctx);
738  void removeContext(ihipCtx_t* ctx);
739  std::list<ihipCtx_t*>& ctxs() { return _ctxs; };
740  const std::list<ihipCtx_t*>& const_ctxs() const { return _ctxs; };
741  int getcount() { return _ctxCount; };
742  friend class LockedAccessor<ihipDeviceCriticalBase_t>;
743 
744  private:
745  ihipDevice_t* _parent;
746 
747  //--- Context Tracker:
748  std::list<ihipCtx_t*> _ctxs; // contexts associated with this device across all threads.
749 
750  int _ctxCount;
751 };
752 
754 
756 
757 //----
758 // Properties of the HIP device.
759 // Multiple contexts can point to same device.
761  public:
762  ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerator& acc);
763  ~ihipDevice_t();
764 
765  // Accessors:
766  ihipCtx_t* getPrimaryCtx() const { return _primaryCtx; };
767  void locked_removeContext(ihipCtx_t* c);
768  void locked_reset();
769  ihipDeviceCritical_t& criticalData() { return _criticalData; };
770 
771  public:
772  unsigned _deviceId; // device ID
773 
774  hc::accelerator _acc;
775  hsa_agent_t _hsaAgent; // hsa agent handle
776 
778  unsigned _computeUnits;
779  hipDeviceProp_t _props; // saved device properties.
780 
781  // TODO - report this through device properties, base on HCC API call.
782  int _isLargeBar;
783 
784  ihipCtx_t* _primaryCtx;
785 
786  int _state; // 1 if device is set otherwise 0
787 
788  private:
789  hipError_t initProperties(hipDeviceProp_t* prop);
790 
791  private:
792  ihipDeviceCritical_t _criticalData;
793 };
794 //=============================================================================
795 
796 
//---
// One entry of the per-context execution stack (see _execStack in ihipCtxCriticalBase_t).
// NOTE(review): field meanings below are inferred from their names - confirm against the
// code that pushes and pops these entries.
struct ihipExec_t {
    dim3 _gridDim;                 // presumably the grid dimensions of the pending launch
    dim3 _blockDim;                // presumably the block dimensions of the pending launch
    size_t _sharedMem;             // presumably the dynamic shared memory size, in bytes
    hipStream_t _hStream;          // presumably the stream the launch is issued on
    std::vector<char> _arguments;  // raw byte buffer; presumably the packed kernel arguments
};
806 
807 //=============================================================================
808 // class ihipCtxCriticalBase_t
809 template <typename MUTEX_TYPE>
810 class ihipCtxCriticalBase_t : LockedBase<MUTEX_TYPE> {
811  public:
812  ihipCtxCriticalBase_t(ihipCtx_t* parentCtx, unsigned deviceCnt)
813  : _parent(parentCtx), _peerCnt(0) {
814  _peerAgents = new hsa_agent_t[deviceCnt];
815  };
816 
818  if (_peerAgents != nullptr) {
819  delete _peerAgents;
820  _peerAgents = nullptr;
821  }
822  _peerCnt = 0;
823  }
824 
825  // Streams:
826  void addStream(ihipStream_t* stream);
827  std::list<ihipStream_t*>& streams() { return _streams; };
828  const std::list<ihipStream_t*>& const_streams() const { return _streams; };
829 
830 
831  // Peer Accessor classes:
832  bool isPeerWatcher(const ihipCtx_t* peer); // returns True if peer has access to memory
833  // physically located on this device.
834  bool addPeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer);
835  bool removePeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer);
836  void resetPeerWatchers(ihipCtx_t* thisDevice);
837  void printPeerWatchers(FILE* f) const;
838 
839  uint32_t peerCnt() const { return _peerCnt; };
840  hsa_agent_t* peerAgents() const { return _peerAgents; };
841 
842 
843  // TODO - move private
844  std::list<ihipCtx_t*> _peers; // list of enabled peer devices.
845  //--- Execution stack:
846  std::stack<ihipExec_t> _execStack; // Execution stack for this device.
847 
848  friend class LockedAccessor<ihipCtxCriticalBase_t>;
849 
850  private:
851  ihipCtx_t* _parent;
852 
853  //--- Stream Tracker:
854  std::list<ihipStream_t*> _streams; // streams associated with this device.
855 
856 
857  //--- Peer Tracker:
858  // These reflect the currently Enabled set of peers for this GPU:
859  // Enabled peers have permissions to access the memory physically allocated on this device.
860  // Note the peers always contain the self agent for easy interfacing with HSA APIs.
861  uint32_t _peerCnt; // number of enabled peers
862  hsa_agent_t* _peerAgents; // efficient packed array of enabled agents (to use for allocations.)
863  private:
864  void recomputePeerAgents();
865 };
866 // Note Mutex type Real/Fake selected based on CtxMutex
868 
869 // This type is used by functions that need access to the critical device structures.
871 //=============================================================================
872 
873 
874 //=============================================================================
875 // class ihipCtx_t:
876 // A HIP CTX (context) points at one of the existing devices and contains the streams,
877 // peer-to-peer mappings, creation flags. Multiple contexts can point to the same
878 // device.
879 //
880 class ihipCtx_t {
881  public: // Functions:
882  ihipCtx_t(ihipDevice_t* device, unsigned deviceCnt,
883  unsigned flags); // note: calls constructor for _criticalData
884  ~ihipCtx_t();
885 
886  // Functions which read or write the critical data are named locked_.
887  // (might be better called "locking_"
888  // ihipCtx_t does not use recursive locks so the ihip implementation must avoid calling a
889  // locked_ function from within a locked_ function. External functions which call several
890  // locked_ functions will acquire and release the lock for each function. if this occurs in
891  // performance-sensitive code we may want to refactor by adding non-locked functions and
892  // creating a new locked_ member function to call them all.
893  void locked_removeStream(ihipStream_t* s);
894  void locked_reset();
895  void locked_waitAllStreams();
896  void locked_syncDefaultStream(bool waitOnSelf, bool syncHost);
897 
898  ihipCtxCritical_t& criticalData() { return _criticalData; };
899 
900  const ihipDevice_t* getDevice() const { return _device; };
901  int getDeviceNum() const { return _device->_deviceId; };
902 
903  // TODO - review uses of getWriteableDevice(), can these be converted to getDevice()
904  ihipDevice_t* getWriteableDevice() const { return _device; };
905 
906  std::string toString() const;
907 
908  public: // Data
909  // The NULL stream is used if no other stream is specified.
910  // Default stream has special synchronization properties with other streams.
911  ihipStream_t* _defaultStream;
912 
913  // Flags specified when the context is created:
914  unsigned _ctxFlags;
915 
916  private:
917  ihipDevice_t* _device;
918 
919 
920  private: // Critical data, protected with locked access:
921  // Members of _protected data MUST be accessed through the LockedAccessor.
922  // Search for LockedAccessor<ihipCtxCritical_t> for examples; do not access _criticalData
923  // directly.
924  ihipCtxCritical_t _criticalData;
925 };
926 
927 
928 //=================================================================================================
929 // Global variable definition:
930 extern std::once_flag hip_initialized;
931 extern unsigned g_deviceCnt;
932 extern hsa_agent_t g_cpu_agent; // the CPU agent.
933 extern hsa_agent_t* g_allAgents; // CPU agents + all the visible GPU agents.
934 
935 //=================================================================================================
936 // Extern functions:
937 extern void ihipInit();
938 extern const char* ihipErrorString(hipError_t);
939 extern ihipCtx_t* ihipGetTlsDefaultCtx();
940 extern void ihipSetTlsDefaultCtx(ihipCtx_t* ctx);
941 extern hipError_t ihipSynchronize(void);
942 extern void ihipCtxStackUpdate();
943 extern hipError_t ihipDeviceSetState();
944 
945 extern ihipDevice_t* ihipGetDevice(int);
946 ihipCtx_t* ihipGetPrimaryCtx(unsigned deviceIndex);
947 
948 
949 hipStream_t ihipSyncAndResolveStream(hipStream_t);
950 hipError_t ihipStreamSynchronize(hipStream_t stream);
951 void ihipStreamCallbackHandler(ihipStreamCallback_t* cb);
952 
953 // Stream printf functions:
954 inline std::ostream& operator<<(std::ostream& os, const ihipStream_t& s) {
955  os << "stream:";
956  os << s.getDevice()->_deviceId;
957  ;
958  os << '.';
959  os << s._id;
960  return os;
961 }
962 
963 inline std::ostream& operator<<(std::ostream& os, const dim3& s) {
964  os << '{';
965  os << s.x;
966  os << ',';
967  os << s.y;
968  os << ',';
969  os << s.z;
970  os << '}';
971  return os;
972 }
973 
974 inline std::ostream& operator<<(std::ostream& os, const gl_dim3& s) {
975  os << '{';
976  os << s.x;
977  os << ',';
978  os << s.y;
979  os << ',';
980  os << s.z;
981  os << '}';
982  return os;
983 }
984 
985 // Stream printf functions:
986 inline std::ostream& operator<<(std::ostream& os, const hipEvent_t& e) {
987  os << "event:" << std::hex << static_cast<void*>(e);
988  return os;
989 }
990 
991 inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) {
992  os << "ctx:" << static_cast<const void*>(c) << ".dev:" << c->getDevice()->_deviceId;
993  return os;
994 }
995 
996 
997 // Helper functions that are used across src files:
998 namespace hip_internal {
999 hipError_t memcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
1000  hipStream_t stream);
1001 };
1002 
1003 
1004 #endif
Definition: hip_hcc_internal.h:231
Definition: hip_hcc_internal.h:760
Definition: hip_hcc_internal.h:458
Definition: hip_hcc_internal.h:107
Definition: hip_hcc_internal.h:381
Definition: hip_hcc_internal.h:357
uint32_t x
x
Definition: hip_runtime_api.h:241
Definition: hip_hcc_internal.h:799
Definition: hip_hcc_internal.h:880
Definition: hip_runtime_api.h:240
uint32_t y
y
Definition: hip_runtime_api.h:242
void(* hipStreamCallback_t)(hipStream_t stream, hipError_t status, void *userData)
Definition: hip_runtime_api.h:738
Definition: hip_hcc_internal.h:627
Definition: hip_hcc_internal.h:729
unsigned _computeUnits
Number of compute units supported by the device:
Definition: hip_hcc_internal.h:778
uint32_t z
z
Definition: hip_runtime_api.h:243
Definition: hip_runtime_api.h:83
Definition: hip_hcc_internal.h:655
Definition: hip_hcc_internal.h:681
Definition: hip_hcc_internal.h:998
Definition: hip_hcc_internal.h:333
Definition: hip_hcc_internal.h:700
Definition: hip_hcc_internal.h:810
Definition: hip_hcc_internal.h:518
Definition: hip_hcc_internal.h:470
hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
Record an event in the specified stream.
Definition: hip_event.cpp:110
hsa_amd_ipc_memory_t ipc_handle
ipc memory handle on ROCr
Definition: hip_hcc_internal.h:360
Definition: hip_hcc_internal.h:422
Definition: hip_hcc_internal.h:367
hipError_t hipStreamQuery(hipStream_t stream)
Return #hipSuccess if all of the operations in the specified stream have completed, or #hipErrorNotReady if not.
Definition: hip_stream.cpp:118
Definition: hip_hcc_internal.h:92