HIP: Heterogeneous-computing Interface for Portability
hip_hcc_internal.h
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
23 #ifndef HIP_SRC_HIP_HCC_INTERNAL_H
24 #define HIP_SRC_HIP_HCC_INTERNAL_H
25 
26 #include <hc.hpp>
27 #include <hsa/hsa.h>
28 #include <unordered_map>
29 #include <stack>
30 
31 #include "hsa/hsa_ext_amd.h"
32 #include "hip/hip_runtime.h"
33 #include "hip_prof_api.h"
34 #include "hip_util.h"
35 #include "env.h"
36 
37 
38 #if (__hcc_workweek__ < 16354)
39 #error("This version of HIP requires a newer version of HCC.");
40 #endif
41 
42 // Use the __appPtr field in the am memtracker to store the context.
43 // Requires a bug fix in HCC
44 #if defined(__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE) and \
45  (__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE != 0)
46 #define USE_APP_PTR_FOR_CTX 1
47 #endif
48 
49 
50 #define USE_IPC 1
51 
52 //---
53 // Environment variables:
54 
55 // Intended to distinguish whether an environment variable should be visible only in debug mode, or
56 // in debug+release.
57 // static const int debug = 0;
58 extern const int release;
59 
60 // TODO - this blocks both kernels and memory ops. Perhaps should have separate env var for
61 // kernels?
62 extern int HIP_LAUNCH_BLOCKING;
63 extern int HIP_API_BLOCKING;
64 
65 extern int HIP_PRINT_ENV;
66 // extern int HIP_TRACE_API;
67 extern int HIP_ATP;
68 extern int HIP_DB;
69 extern int HIP_STAGING_SIZE; /* size of staging buffers, in KB */
70 extern int HIP_STREAM_SIGNALS; /* number of signals to allocate at stream creation */
71 extern int HIP_VISIBLE_DEVICES; /* Contains a comma-separated sequence of GPU identifiers */
72 extern int HIP_FORCE_P2P_HOST;
73 
74 extern int HIP_HOST_COHERENT;
75 
76 extern int HIP_HIDDEN_FREE_MEM;
77 //---
78 // Chicken bits for disabling functionality to work around potential issues:
79 extern int HIP_SYNC_HOST_ALLOC;
80 extern int HIP_SYNC_STREAM_WAIT;
81 
82 extern int HIP_SYNC_NULL_STREAM;
83 extern int HIP_INIT_ALLOC;
84 extern int HIP_FORCE_NULL_STREAM;
85 
86 extern int HIP_SYNC_FREE;
87 
88 extern int HIP_DUMP_CODE_OBJECT;
89 
90 // TODO - remove when this is standard behavior.
91 extern int HCC_OPT_FLUSH;
92 
93 #define IMAGE_PITCH_ALIGNMENT 256
94 template <typename T> inline T alignDown(T value, size_t alignment) {
95  return (T)(value & ~(alignment - 1));
96 }
97 
98 template <typename T> inline T* alignDown(T* value, size_t alignment) {
99  return (T*)alignDown((intptr_t)value, alignment);
100 }
101 
102 template <typename T> inline T alignUp(T value, size_t alignment) {
103  return alignDown((T)(value + alignment - 1), alignment);
104 }
105 
106 template <typename T> inline T* alignUp(T* value, size_t alignment) {
107  return (T*)alignUp((intptr_t)value, alignment);
108 }
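// Worked example (illustrative): for a power-of-two alignment of 256 bytes,
// alignDown((size_t)1000, 256) yields 768 and alignUp((size_t)1000, 256) yields 1024;
// the pointer overloads round addresses the same way. All of these helpers assume
// 'alignment' is a power of two.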
109 
110 inline size_t getNumChannels(hsa_ext_image_channel_order_t channelOrder) {
111  switch (channelOrder) {
112  case HSA_EXT_IMAGE_CHANNEL_ORDER_RG:
113  return 2;
114  case HSA_EXT_IMAGE_CHANNEL_ORDER_RGB:
115  return 3;
116  case HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA:
117  return 4;
118  case HSA_EXT_IMAGE_CHANNEL_ORDER_R:
119  default:
120  return 1;
121  }
122 }
123 
124 inline size_t getElementSize(hsa_ext_image_channel_order_t channelOrder, hsa_ext_image_channel_type_t channelType) {
125  size_t bytesPerPixel = getNumChannels(channelOrder);
126  switch (channelType) {
127  case HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8:
128  case HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8:
129  break;
130 
131  case HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32:
132  case HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32:
133  case HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT:
134  bytesPerPixel *= 4;
135  break;
136 
137  default:
138  bytesPerPixel *= 2;
139  break;
140  }
141  return bytesPerPixel;
142 }
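// Worked example (illustrative): an RGBA image with HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT is
// 4 channels x 4 bytes = 16 bytes per pixel, while an R image with
// HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 is 1 byte per pixel.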
143 
144 // Class to assign a short TID to each new thread, for HIP debugging purposes.
145 class TidInfo {
146  public:
147  TidInfo();
148 
149  int tid() const { return _shortTid; };
150  pid_t pid() const { return _pid; };
151  uint64_t incApiSeqNum() { return ++_apiSeqNum; };
152  uint64_t apiSeqNum() const { return _apiSeqNum; };
153 
154  private:
155  int _shortTid;
156  pid_t _pid;
157 
158  // monotonically increasing API sequence number for this thread.
159  uint64_t _apiSeqNum;
160 };
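// Illustrative note: the short tid and the per-thread API sequence number show up in the trace
// output below; for example a line tagged "pid:4242 tid:2.17" marks the 17th traced API call
// issued by the thread with short tid 2 (see the fprintf format used by ihipLogStatus).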
161 
162 struct ProfTrigger {
163  static const uint64_t MAX_TRIGGER = std::numeric_limits<uint64_t>::max();
164 
165  void print(int tid) {
166  std::cout << "Enabling tracing for ";
167  for (auto iter = _profTrigger.begin(); iter != _profTrigger.end(); iter++) {
168  std::cout << "tid:" << tid << "." << *iter << ",";
169  }
170  std::cout << "\n";
171  };
172 
173  uint64_t nextTrigger() { return _profTrigger.empty() ? MAX_TRIGGER : _profTrigger.back(); };
174  void add(uint64_t trigger) { _profTrigger.push_back(trigger); };
175  void sort() { std::sort(_profTrigger.begin(), _profTrigger.end(), std::greater<uint64_t>()); };
176 
177  private:
178  std::vector<uint64_t> _profTrigger;
179 };
180 
181 
182 //---
183 // Extern TLS
184 // Use a single struct to hold all TLS data. Attempt to reduce TLS accesses.
185 struct TlsData {
186  explicit TlsData() {
187  lastHipError = hipSuccess;
188  getPrimaryCtx = true;
189  defaultCtx = nullptr;
190  }
191 
192  hipError_t lastHipError;
193  TidInfo tidInfo;
194  // This is the implicit context used by all HIP commands.
195  // It can be set by hipSetDevice or by the CTX manipulation commands:
196  ihipCtx_t* defaultCtx;
197  // Stack of contexts
198  std::stack<ihipCtx_t*> ctxStack;
199  bool getPrimaryCtx;
200 };
201 TlsData* tls_get_ptr();
202 #define GET_TLS() TlsData *tls = tls_get_ptr()
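// Illustrative sketch (hypothetical helper, not part of this header): code that needs the
// per-thread state starts with GET_TLS() and then reaches the fields through 'tls':
//
//   static hipError_t exampleLastError() {
//       GET_TLS();                  // expands to: TlsData* tls = tls_get_ptr();
//       return tls->lastHipError;   // per-thread error slot consumed by hipGetLastError
//   }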
203 
204 extern std::vector<ProfTrigger> g_dbStartTriggers;
205 extern std::vector<ProfTrigger> g_dbStopTriggers;
206 
207 //---
208 // Forward defs:
209 class ihipStream_t;
210 class ihipDevice_t;
211 class ihipCtx_t;
212 struct ihipEventData_t;
213 
214 // Color defs for debug messages:
215 #define KNRM "\x1B[0m"
216 #define KRED "\x1B[31m"
217 #define KGRN "\x1B[32m"
218 #define KYEL "\x1B[33m"
219 #define KBLU "\x1B[34m"
220 #define KMAG "\x1B[35m"
221 #define KCYN "\x1B[36m"
222 #define KWHT "\x1B[37m"
223 
224 extern const char* API_COLOR;
225 extern const char* API_COLOR_END;
226 
227 
228 // If set, thread-safety is enforced on all event/stream/ctx/device functions.
229 // Can disable for performance or functional experiments - in this case
230 // the code uses a dummy "no-op" mutex.
231 #define EVENT_THREAD_SAFE 1
232 
233 #define STREAM_THREAD_SAFE 1
234 
235 #define CTX_THREAD_SAFE 1
236 
237 #define DEVICE_THREAD_SAFE 1
238 
239 
240 // Compile debug trace mode - this prints debug messages to stderr when env var HIP_DB is set.
241 // May be set to 0 to compile out the debug checks - possibly reducing code size and improving performance.
242 #define COMPILE_HIP_DB 1
243 
244 
245 // Compile HIP tracing capability.
246 // 0x1 = print a string at function entry with arguments.
247 // 0x2 = prints a simple message with function name + return code when function exits.
248 // 0x3 = print both.
249 // Must be enabled at runtime with HIP_TRACE_API
250 #define COMPILE_HIP_TRACE_API 0x3
251 
252 //---
253 // HIP Trace modes - use with HIP_TRACE_API=...
254 #define TRACE_ALL 0 // 0x01
255 #define TRACE_KCMD 1 // 0x02, kernel command
256 #define TRACE_MCMD 2 // 0x04, memory command
257 #define TRACE_MEM 3 // 0x08, memory allocation or deallocation.
258 #define TRACE_SYNC 4 // 0x10, synchronization (host or hipStreamWaitEvent)
259 #define TRACE_QUERY 5 // 0x20, hipEventRecord, hipEventQuery, hipStreamQuery
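// Example (illustrative): HIP_TRACE_API=0x1 traces every API call, while HIP_TRACE_API=0x6
// traces only kernel commands and memory commands (0x02 | 0x04).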
260 
261 
262 //---
263 // HIP_DB Debug flags:
264 #define DB_API 0 /* 0x01 - shortcut to enable HIP_TRACE_API on single switch */
265 #define DB_SYNC 1 /* 0x02 - trace synchronization pieces */
266 #define DB_MEM 2 /* 0x04 - trace memory allocation / deallocation */
267 #define DB_COPY 3 /* 0x08 - trace memory copy and peer commands. */
268 #define DB_WARN 4 /* 0x10 - warn about sub-optimal or shady behavior */
269 #define DB_FB 5 /* 0x20 - trace loading fat binary */
270 #define DB_MAX_FLAG 6
271 // When adding a new debug flag, also add to the char name table below.
272 //
273 //
274 
275 struct DbName {
276  const char* _color;
277  const char* _shortName;
278 };
279 
280 // This table must be kept in-sync with the defines above.
281 static const DbName dbName[] = {
282  {KGRN, "api"}, // not used,
283  {KYEL, "sync"}, {KCYN, "mem"}, {KMAG, "copy"}, {KRED, "warn"},
284  {KBLU, "fatbin"},
285 };
286 
287 
288 #if COMPILE_HIP_DB
289 #define tprintf(trace_level, ...) \
290  { \
291  if (HIP_DB & (1 << (trace_level))) { \
292  GET_TLS(); \
293  char msgStr[1000]; \
294  snprintf(msgStr, sizeof(msgStr), __VA_ARGS__); \
295  fprintf(stderr, " %ship-%s pid:%d tid:%d:%s%s", dbName[trace_level]._color, \
296  dbName[trace_level]._shortName, tls->tidInfo.pid(), tls->tidInfo.tid(), msgStr, KNRM); \
297  } \
298  }
299 #else
300 /* Compile to empty code */
301 #define tprintf(trace_level, ...)
302 #endif
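// Example (illustrative): with HIP_DB=0x5 (api | mem) set in the environment, a call such as
//   tprintf(DB_MEM, "allocated %zu bytes at %p\n", sizeBytes, ptr);
// prints a colorized "hip-mem" line to stderr, while DB_SYNC messages stay silent.
// ('sizeBytes' and 'ptr' are hypothetical locals used only for this sketch.)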
303 
304 
305 static inline uint64_t getTicks() { return hc::get_system_ticks(); }
306 
307 //---
308 extern uint64_t recordApiTrace(TlsData *tls, std::string* fullStr, const std::string& apiStr);
309 
310 #if (COMPILE_HIP_TRACE_API & 0x1)
311 #define API_TRACE(forceTrace, ...) \
312  GET_TLS(); \
313  uint64_t hipApiStartTick = 0; \
314  { \
315  tls->tidInfo.incApiSeqNum(); \
316  if (forceTrace || \
317  (COMPILE_HIP_DB && (HIP_TRACE_API & (1 << TRACE_ALL)))) { \
318  std::string apiStr = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')'; \
319  std::string fullStr; \
320  hipApiStartTick = recordApiTrace(tls, &fullStr, apiStr); \
321  } \
322  }
323 
324 #else
325 // Swallow API_TRACE
326 #define API_TRACE(IS_CMD, ...) GET_TLS(); tls->tidInfo.incApiSeqNum();
327 #endif
328 
329 #define ihipGetTlsDefaultCtx() iihipGetTlsDefaultCtx(tls)
330 #define ihipSetTlsDefaultCtx(ctx) tls->defaultCtx = ctx
331 
332 #define HIP_SET_DEVICE() ihipDeviceSetState(tls);
333 
334 // This macro should be called at the beginning of every HIP API.
335 // It initializes the hip runtime (exactly once), and
336 // generates a trace string that can be output to stderr or to ATP file.
337 #define HIP_INIT_API(cid, ...) \
338  hip_impl::hip_init(); \
339  API_TRACE(0, __VA_ARGS__); \
340  HIP_CB_SPAWNER_OBJECT(cid);
341 
342 
343 // Like above, but will trace with a specified "special" bit.
344 // Replace HIP_INIT_API with this call inside HIP APIs that launch work on the GPU:
345 // kernel launches, copy commands, memory sets, etc.
346 #define HIP_INIT_SPECIAL_API(cid, tbit, ...) \
347  hip_impl::hip_init(); \
348  API_TRACE((HIP_TRACE_API & (1 << tbit)), __VA_ARGS__); \
349  HIP_CB_SPAWNER_OBJECT(cid);
350 
351 
352 // This macro should be called at the end of every HIP API, and only at the end of top-level HIP
353 // APIs (not internal HIP functions). It has a dual function: it logs the last error returned for use by
354 // hipGetLastError, and it prints the closing message when the debug trace is enabled.
355 #define ihipLogStatus(hipStatus) \
356  ({ \
357  hipError_t localHipStatus = hipStatus; /*local copy so hipStatus only evaluated once*/ \
358  tls->lastHipError = localHipStatus; \
359  \
360  if ((COMPILE_HIP_TRACE_API & 0x2) && HIP_TRACE_API & (1 << TRACE_ALL)) { \
361  auto ticks = getTicks() - hipApiStartTick; \
362  fprintf(stderr, " %ship-api pid:%d tid:%d.%lu %-30s ret=%2d (%s)>> +%lu ns%s\n", \
363  (localHipStatus == 0) ? API_COLOR : KRED, tls->tidInfo.pid(), tls->tidInfo.tid(), \
364  tls->tidInfo.apiSeqNum(), __func__, localHipStatus, \
365  ihipErrorString(localHipStatus), ticks, API_COLOR_END); \
366  } \
367  localHipStatus; \
368  })
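// Illustrative sketch of how a top-level HIP API combines these macros (hipExampleQuery is
// hypothetical; real entry points follow the same shape):
//
//   hipError_t hipExampleQuery(int* value) {
//       HIP_INIT_API(hipExampleQuery, value);   // one-time runtime init + optional entry trace
//       hipError_t e = (value != nullptr) ? hipSuccess : hipErrorInvalidValue;
//       return ihipLogStatus(e);                // records lastHipError + optional exit trace
//   }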
369 
370 
371 class ihipException : public std::exception {
372  public:
373  explicit ihipException(hipError_t e) : _code(e){};
374 
375  hipError_t _code;
376 };
377 
378 
379 #ifdef __cplusplus
380 extern "C" {
381 #endif
382 
383 
384 #ifdef __cplusplus
385 }
386 #endif
387 
388 const hipStream_t hipStreamNull = 0x0;
389 
390 
394 #define HIP_IPC_RESERVED_SIZE 24
395 class ihipIpcMemHandle_t {
396  public:
397 #if USE_IPC
398  hsa_amd_ipc_memory_t ipc_handle;  ///< ipc memory handle on ROCr
399 #endif
400  size_t psize;
401  char reserved[HIP_IPC_RESERVED_SIZE];
402 };
403 
404 
405 struct ihipModule_t {
406  std::string fileName;
407  hsa_executable_t executable = {};
408  hsa_code_object_reader_t coReader = {};
409  std::string hash;
410  std::unordered_map<
411  std::string, std::vector<std::pair<std::size_t, std::size_t>>> kernargs;
412 
413  ~ihipModule_t() {
414  if (executable.handle) hsa_executable_destroy(executable);
415  if (coReader.handle) hsa_code_object_reader_destroy(coReader);
416  }
417 };
418 
419 
420 //---
421 // Used to remove the lock, for performance experiments or to stimulate latent bugs.
422 class FakeMutex {
423  public:
424  void lock() {}
425  bool try_lock() { return true; }
426  void unlock() {}
427 };
428 
429 #if EVENT_THREAD_SAFE
430 typedef std::mutex EventMutex;
431 #else
432 #warning "Event thread-safe disabled"
433 typedef FakeMutex EventMutex;
434 #endif
435 
436 #if STREAM_THREAD_SAFE
437 typedef std::mutex StreamMutex;
438 #else
439 #warning "Stream thread-safe disabled"
440 typedef FakeMutex StreamMutex;
441 #endif
442 
443 // Pair Device and Ctx together, these could also be toggled separately if desired.
444 #if CTX_THREAD_SAFE
445 typedef std::mutex CtxMutex;
446 #else
447 typedef FakeMutex CtxMutex;
448 #warning "Ctx thread-safe disabled"
449 #endif
450 
451 #if DEVICE_THREAD_SAFE
452 typedef std::mutex DeviceMutex;
453 #else
454 typedef FakeMutex DeviceMutex;
455 #warning "Device thread-safe disabled"
456 #endif
457 
458 //
459 //---
460 // Protects access to the wrapped critical data with a lock acquired on construction and released on
461 // destruction. T must contain a _mutex field which meets the BasicLockable requirements (lock/unlock).
462 template <typename T>
463 class LockedAccessor {
464  public:
465  LockedAccessor(T& criticalData, bool autoUnlock = true)
466  : _criticalData(&criticalData),
467  _autoUnlock(autoUnlock)
468 
469  {
470  tprintf(DB_SYNC, "locking criticalData=%p for %s..\n", _criticalData,
471  ToString(_criticalData->_parent).c_str());
472  _criticalData->_mutex.lock();
473  };
474 
475  ~LockedAccessor() {
476  if (_autoUnlock) {
477  tprintf(DB_SYNC, "auto-unlocking criticalData=%p for %s...\n", _criticalData,
478  ToString(_criticalData->_parent).c_str());
479  _criticalData->_mutex.unlock();
480  }
481  }
482 
483  void unlock() {
484  tprintf(DB_SYNC, "unlocking criticalData=%p for %s...\n", _criticalData,
485  ToString(_criticalData->_parent).c_str());
486  _criticalData->_mutex.unlock();
487  }
488 
489  // Syntactic sugar so -> can be used to get the underlying type.
490  T* operator->() { return _criticalData; };
491 
492  private:
493  T* _criticalData;
494  bool _autoUnlock;
495 };
496 
497 
498 template <typename MUTEX_TYPE>
499 struct LockedBase {
500  // Experts-only interface for explicit locking.
501  // Most uses should use the lock-accessor.
502  void lock() { _mutex.lock(); }
503  void unlock() { _mutex.unlock(); }
504  bool try_lock() { return _mutex.try_lock(); }
505 
506  MUTEX_TYPE _mutex;
507 };
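// Illustrative sketch (the 'ExampleCritical' type is hypothetical): a structure that derives from
// LockedBase<> and carries a _parent field for the debug prints can be guarded with LockedAccessor,
// which takes the lock on construction and, by default, releases it on destruction:
//
//   struct ExampleCritical : public LockedBase<std::mutex> {
//       void* _parent = nullptr;   // referenced by the tprintf calls in LockedAccessor
//       int _counter = 0;
//   };
//
//   ExampleCritical exampleCrit;
//   {
//       LockedAccessor<ExampleCritical> crit(exampleCrit);  // locks exampleCrit._mutex
//       crit->_counter++;                                   // members reached via operator->
//   }                                                       // unlocked when 'crit' goes out of scope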
508 
509 
510 template <typename MUTEX_TYPE>
511 class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE> {
512 public:
513  ihipStreamCriticalBase_t(ihipStream_t* parentStream, hc::accelerator_view av)
514  : _parent{parentStream}, _av{av}, _last_op_was_a_copy{false}
515  {}
516 
518 
521  return this;
522  };
523 
524  void munlock() {
525  tprintf(DB_SYNC, "munlocking criticalData=%p for %s...\n", this,
526  ToString(this->_parent).c_str());
527  LockedBase<MUTEX_TYPE>::unlock();
528  };
529 
530  ihipStreamCriticalBase_t<MUTEX_TYPE>* mtry_lock() {
531  bool gotLock = LockedBase<MUTEX_TYPE>::try_lock();
532  tprintf(DB_SYNC, "mtry_locking=%d criticalData=%p for %s...\n", gotLock, this,
533  ToString(this->_parent).c_str());
534  return gotLock ? this : nullptr;
535  };
536 
537  ihipStream_t* _parent;
538  hc::accelerator_view _av;
539  bool _last_op_was_a_copy;
540 };
541 
542 
543 // If HIP code needs to acquire locks for both an ihipCtx_t and an ihipStream_t, it should first acquire
544 // the lock for the ihipCtx_t and then the lock for the individual stream. The locks should not be acquired
545 // in the reverse order or deadlock may occur. In some cases, it may be possible to reduce the range
546 // over which the locks must be held. HIP routines should avoid repeatedly acquiring and releasing the
547 // same lock during the execution of a single HIP API. Another option is to use try_lock in the innermost
548 // lock query.
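// Illustrative sketch of the ordering rule above (the accessor typedefs appear just below; the
// surrounding scope is hypothetical):
//
//   LockedAccessor_CtxCrit_t ctxCrit(ctx->criticalData());          // 1. ctx lock first
//   LockedAccessor_StreamCrit_t streamCrit(stream->criticalData()); // 2. then the stream lock
//   // ...never take the ctx lock while already holding a stream lock.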
549 
550 
551 typedef ihipStreamCriticalBase_t<StreamMutex> ihipStreamCritical_t;
552 typedef LockedAccessor<ihipStreamCritical_t> LockedAccessor_StreamCrit_t;
553 
554 //---
555 // Internal stream structure.
556 class ihipStream_t {
557  public:
558  enum ScheduleMode { Auto, Spin, Yield };
559  typedef uint64_t SeqNum_t;
560 
561  // TODO - make av a reference to avoid shared_ptr overhead?
562  ihipStream_t(ihipCtx_t* ctx, hc::accelerator_view av, unsigned int flags);
563  ~ihipStream_t();
564 
565  // kind is hipMemcpyKind
566  void locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind,
567  bool resolveOn = true);
568 
569  bool locked_copy2DSync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind,
570  bool resolveOn = true);
571 
572  void locked_copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind);
573 
574  bool locked_copy2DAsync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind);
575 
576  void lockedSymbolCopySync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes,
577  size_t offset, unsigned kind);
578  void lockedSymbolCopyAsync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes,
579  size_t offset, unsigned kind);
580 
581  //---
582  // Member functions that begin with locked_ are thread-safe accessors - these acquire / release
583  // the critical mutex.
584  LockedAccessor_StreamCrit_t lockopen_preKernelCommand();
585  void lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av, bool unlockNotNeeded = 0);
586 
587  void locked_wait(bool& waited);
588  void locked_wait();
589 
590  hc::accelerator_view* locked_getAv() {
591  LockedAccessor_StreamCrit_t crit(_criticalData);
592  return &(crit->_av);
593  };
594 
595  void locked_streamWaitEvent(ihipEventData_t& event);
596  hc::completion_future locked_recordEvent(hipEvent_t event);
597 
598  ihipStreamCritical_t& criticalData() { return _criticalData; };
599 
600  //---
601  hc::hcWaitMode waitMode() const;
602 
603  // Use this if we already have the stream critical data mutex:
604  void wait(LockedAccessor_StreamCrit_t& crit);
605 
606  void launchModuleKernel(hc::accelerator_view av, hsa_signal_t signal, uint32_t blockDimX,
607  uint32_t blockDimY, uint32_t blockDimZ, uint32_t gridDimX,
608  uint32_t gridDimY, uint32_t gridDimZ, uint32_t groupSegmentSize,
609  uint32_t sharedMemBytes, void* kernarg, size_t kernSize,
610  uint64_t kernel);
611 
612 
613  //-- Non-racy accessors:
614  // These functions access fields set at initialization time and are non-racy (so do not acquire
615  // mutex)
616  const ihipDevice_t* getDevice() const;
617  ihipCtx_t* getCtx() const;
618 
619  // Before calling this function, stream must be resolved from "0" to the actual stream:
620  bool isDefaultStream() const { return _id == 0; };
621 
622  public:
623  //---
624  // Public member vars - these are set at initialization and never change:
625  SeqNum_t _id; // monotonic sequence ID. 0 is the default stream.
626  unsigned _flags;
627 
628 
629  private:
630  // The unsigned return is hipMemcpyKind
631  unsigned resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem);
632  void resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPointerInfo* dstPtrInfo,
633  const hc::AmPointerInfo* srcPtrInfo, hc::hcCommandKind* hcCopyDir,
634  ihipCtx_t** copyDevice, bool* forceUnpinnedCopy);
635 
636  bool canSeeMemory(const ihipCtx_t* thisCtx, const hc::AmPointerInfo* dstInfo,
637  const hc::AmPointerInfo* srcInfo);
638 
639  void addSymbolPtrToTracker(hc::accelerator& acc, void* ptr, size_t sizeBytes);
640 
641  private: // Data
642  // Critical Data - MUST be accessed through LockedAccessor_StreamCrit_t
643  ihipStreamCritical_t _criticalData;
644 
645  std::mutex _hasQueueLock;
646 
647  ihipCtx_t* _ctx; // parent context that owns this stream.
648 
649  // Friends:
650  friend std::ostream& operator<<(std::ostream& os, const ihipStream_t& s);
651  friend hipError_t hipStreamQuery(hipStream_t);
652 
653  ScheduleMode _scheduleMode;
654 };
655 
656 
657 //----
658 // Internal structure for stream callback handler
659 class ihipStreamCallback_t {
660  public:
661  ihipStreamCallback_t(hipStream_t stream, hipStreamCallback_t callback, void* userData)
662  : _stream(stream), _callback(callback), _userData(userData) {
663  };
664  hipStream_t _stream;
665  hipStreamCallback_t _callback;
666  void* _userData;
667 };
668 
669 
670 //----
671 // Internal event structure:
672 enum hipEventStatus_t {
673  hipEventStatusUnitialized = 0, // event is uninitialized, must be "Created" before use.
674  hipEventStatusCreated = 1, // event created, but not yet Recorded
675  hipEventStatusRecording = 2, // event has been recorded into a stream but not completed yet.
676  hipEventStatusComplete = 3, // event has been recorded - timestamps are valid.
677 };
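// Illustrative lifecycle (restating the comments above): hipEventCreate leaves an event in
// hipEventStatusCreated, hipEventRecord moves it to hipEventStatusRecording, and once the
// underlying marker completes it is treated as hipEventStatusComplete with valid timestamps.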
678 
679 // TODO - rename to ihip type of some kind
680 enum ihipEventType_t {
681  hipEventTypeIndependent,
682  hipEventTypeStartCommand,
683  hipEventTypeStopCommand,
684 };
685 
686 
687 struct ihipEventData_t {
688  ihipEventData_t() {
689  _state = hipEventStatusCreated;
690  _stream = NULL;
691  _timestamp = 0;
692  _type = hipEventTypeIndependent;
693  };
694 
695  void marker(const hc::completion_future& marker) { _marker = marker; };
696  hc::completion_future& marker() { return _marker; }
697  uint64_t timestamp() const { return _timestamp; };
698  ihipEventType_t type() const { return _type; };
699 
700  ihipEventType_t _type;
701  hipEventStatus_t _state;
702  hipStream_t _stream; // Stream where the event is recorded. Null stream is resolved to actual
703  // stream when recorded
704  uint64_t _timestamp; // store timestamp, may be set on host or by marker.
705  private:
706  hc::completion_future _marker;
707 };
708 
709 
710 //=============================================================================
711 // class ihipEventCriticalBase_t
712 template <typename MUTEX_TYPE>
713 class ihipEventCriticalBase_t : LockedBase<MUTEX_TYPE> {
714  public:
715  explicit ihipEventCriticalBase_t(const ihipEvent_t* parentEvent) : _parent(parentEvent) {}
717 
718  // Keep data in structure so it can be easily copied into snapshots
719  // (used to reduce lock contention and preserve correct lock order)
720  ihipEventData_t _eventData;
721 
722  private:
723  const ihipEvent_t* _parent;
724  friend class LockedAccessor<ihipEventCriticalBase_t>;
725 };
726 
727 typedef ihipEventCriticalBase_t<EventMutex> ihipEventCritical_t;
728 
729 typedef LockedAccessor<ihipEventCritical_t> LockedAccessor_EventCrit_t;
730 
731 // internal hip event structure.
732 class ihipEvent_t {
733  public:
734  explicit ihipEvent_t(unsigned flags);
735  void attachToCompletionFuture(const hc::completion_future* cf, hipStream_t stream,
736  ihipEventType_t eventType);
737 
738  // Return a copy of the critical state. The critical data is locked during the copy.
739  ihipEventData_t locked_copyCrit() {
740  LockedAccessor_EventCrit_t crit(_criticalData);
741  return _criticalData._eventData;
742  };
743 
744  ihipEventCritical_t& criticalData() { return _criticalData; };
745 
746  public:
747  unsigned _flags;
748 
749  private:
750  ihipEventCritical_t _criticalData;
751 
752  friend hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream);
753 };
754 
755 
756 //=============================================================================
757 // class ihipDeviceCriticalBase_t
758 template <typename MUTEX_TYPE>
759 class ihipDeviceCriticalBase_t : public LockedBase<MUTEX_TYPE> {
760  public:
761  explicit ihipDeviceCriticalBase_t(ihipDevice_t* parentDevice)
762  : _parent(parentDevice), _ctxCount(0){};
763 
765 
766  // Contexts:
767  void addContext(ihipCtx_t* ctx);
768  void removeContext(ihipCtx_t* ctx);
769  std::list<ihipCtx_t*>& ctxs() { return _ctxs; };
770  const std::list<ihipCtx_t*>& const_ctxs() const { return _ctxs; };
771  int getcount() { return _ctxCount; };
772  friend class LockedAccessor<ihipDeviceCriticalBase_t>;
773 
774  private:
775  ihipDevice_t* _parent;
776 
777  //--- Context Tracker:
778  std::list<ihipCtx_t*> _ctxs; // contexts associated with this device across all threads.
779 
780  int _ctxCount;
781 };
782 
783 typedef ihipDeviceCriticalBase_t<DeviceMutex> ihipDeviceCritical_t;
784 
785 typedef LockedAccessor<ihipDeviceCritical_t> LockedAccessor_DeviceCrit_t;
786 
787 //----
788 // Properties of the HIP device.
789 // Multiple contexts can point to same device.
790 class ihipDevice_t {
791  public:
792  ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerator& acc);
793  ~ihipDevice_t();
794 
795  // Accessors:
796  ihipCtx_t* getPrimaryCtx() const { return _primaryCtx; };
797  void locked_removeContext(ihipCtx_t* c);
798  void locked_reset();
799  ihipDeviceCritical_t& criticalData() { return _criticalData; };
800 
801  public:
802  unsigned _deviceId; // device ID
803 
804  hc::accelerator _acc;
805  hsa_agent_t _hsaAgent; // hsa agent handle
806 
807  //! Number of compute units supported by the device:
808  unsigned _computeUnits;
809  hipDeviceProp_t _props; // saved device properties.
810 
811  // TODO - report this through device properties, based on HCC API call.
812  int _isLargeBar;
813 
814  ihipCtx_t* _primaryCtx;
815 
816  int _state; // 1 if device is set otherwise 0
817 
818  private:
819  hipError_t initProperties(hipDeviceProp_t* prop);
820 
821  private:
822  ihipDeviceCritical_t _criticalData;
823 };
824 //=============================================================================
825 
826 
827 //---
828 //
829 struct ihipExec_t {
830  dim3 _gridDim;
831  dim3 _blockDim;
832  size_t _sharedMem;
833  hipStream_t _hStream;
834  std::vector<char> _arguments;
835 };
836 
837 //=============================================================================
838 // class ihipCtxCriticalBase_t
839 template <typename MUTEX_TYPE>
840 class ihipCtxCriticalBase_t : LockedBase<MUTEX_TYPE> {
841  public:
842  ihipCtxCriticalBase_t(ihipCtx_t* parentCtx, unsigned deviceCnt)
843  : _parent(parentCtx), _peerCnt(0) {
844  _peerAgents = new hsa_agent_t[deviceCnt];
845  };
846 
847  ~ihipCtxCriticalBase_t() {
848  if (_peerAgents != nullptr) {
849  delete[] _peerAgents;
850  _peerAgents = nullptr;
851  }
852  _peerCnt = 0;
853  }
854 
855  // Streams:
856  void addStream(ihipStream_t* stream);
857  std::list<ihipStream_t*>& streams() { return _streams; };
858  const std::list<ihipStream_t*>& const_streams() const { return _streams; };
859 
860 
861  // Peer Accessor classes:
862  bool isPeerWatcher(const ihipCtx_t* peer); // returns True if peer has access to memory
863  // physically located on this device.
864  bool addPeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer);
865  bool removePeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer);
866  void resetPeerWatchers(ihipCtx_t* thisDevice);
867  void printPeerWatchers(FILE* f) const;
868 
869  uint32_t peerCnt() const { return _peerCnt; };
870  hsa_agent_t* peerAgents() const { return _peerAgents; };
871 
872 
873  // TODO - move private
874  std::list<ihipCtx_t*> _peers; // list of enabled peer devices.
875  //--- Execution stack:
876  std::stack<ihipExec_t> _execStack; // Execution stack for this device.
877 
878  friend class LockedAccessor<ihipCtxCriticalBase_t>;
879 
880  private:
881  ihipCtx_t* _parent;
882 
883  //--- Stream Tracker:
884  std::list<ihipStream_t*> _streams; // streams associated with this device.
885 
886 
887  //--- Peer Tracker:
888  // These reflect the currently Enabled set of peers for this GPU:
889  // Enabled peers have permissions to access the memory physically allocated on this device.
890  // Note the peers always contain the self agent for easy interfacing with HSA APIs.
891  uint32_t _peerCnt; // number of enabled peers
892  hsa_agent_t* _peerAgents; // efficient packed array of enabled agents (to use for allocations.)
893  private:
894  void recomputePeerAgents();
895 };
896 // Note Mutex type Real/Fake selected based on CtxMutex
897 typedef ihipCtxCriticalBase_t<CtxMutex> ihipCtxCritical_t;
898 
899 // This type is used by functions that need access to the critical device structures.
900 typedef LockedAccessor<ihipCtxCritical_t> LockedAccessor_CtxCrit_t;
901 //=============================================================================
902 
903 
904 //=============================================================================
905 // class ihipCtx_t:
906 // A HIP CTX (context) points at one of the existing devices and contains the streams,
907 // peer-to-peer mappings, creation flags. Multiple contexts can point to the same
908 // device.
909 //
910 class ihipCtx_t {
911  public: // Functions:
912  ihipCtx_t(ihipDevice_t* device, unsigned deviceCnt,
913  unsigned flags); // note: calls constructor for _criticalData
914  ~ihipCtx_t();
915 
916  // Functions which read or write the critical data are named locked_.
917  // (might be better called "locking_").
918  // ihipCtx_t does not use recursive locks so the ihip implementation must avoid calling a
919  // locked_ function from within a locked_ function. External functions which call several
920  // locked_ functions will acquire and release the lock for each function. if this occurs in
921  // performance-sensitive code we may want to refactor by adding non-locked functions and
922  // creating a new locked_ member function to call them all.
923  void locked_removeStream(ihipStream_t* s);
924  void locked_reset();
925  void locked_waitAllStreams();
926  void locked_syncDefaultStream(bool waitOnSelf, bool syncHost);
927 
928  ihipCtxCritical_t& criticalData() { return _criticalData; };
929 
930  const ihipDevice_t* getDevice() const { return _device; };
931  int getDeviceNum() const { return _device->_deviceId; };
932 
933  // TODO - review uses of getWriteableDevice(), can these be converted to getDevice()
934  ihipDevice_t* getWriteableDevice() const { return _device; };
935 
936  std::string toString() const;
937 
938  public: // Data
939  // The NULL stream is used if no other stream is specified.
940  // Default stream has special synchronization properties with other streams.
941  ihipStream_t* _defaultStream;
942 
943  // Flags specified when the context is created:
944  unsigned _ctxFlags;
945 
946  private:
947  ihipDevice_t* _device;
948 
949 
950  private: // Critical data, protected with locked access:
951  // Members of _protected data MUST be accessed through the LockedAccessor.
952  // Search for LockedAccessor<ihipCtxCritical_t> for examples; do not access _criticalData
953  // directly.
954  ihipCtxCritical_t _criticalData;
955 };
956 
957 
958 //=================================================================================================
959 // Global variable definition:
960 extern unsigned g_deviceCnt;
961 extern hsa_agent_t g_cpu_agent; // the CPU agent.
962 extern hsa_agent_t* g_allAgents; // CPU agents + all the visible GPU agents.
963 
964 //=================================================================================================
965 // Extern functions:
966 extern void ihipInit();
967 extern const char* ihipErrorString(hipError_t);
968 extern hipError_t ihipSynchronize(TlsData *tls);
969 extern void ihipCtxStackUpdate();
970 extern hipError_t ihipDeviceSetState(TlsData *tls);
971 
972 extern ihipDevice_t* ihipGetDevice(int);
973 ihipCtx_t* ihipGetPrimaryCtx(unsigned deviceIndex);
974 hipError_t hipModuleGetFunctionEx(hipFunction_t* hfunc, hipModule_t hmod,
975  const char* name, hsa_agent_t *agent);
976 
977 
978 hipStream_t ihipSyncAndResolveStream(hipStream_t, bool lockAcquired = 0);
979 hipError_t ihipStreamSynchronize(TlsData *tls, hipStream_t stream);
980 void ihipStreamCallbackHandler(ihipStreamCallback_t* cb);
981 
982 // Stream printf functions:
983 inline std::ostream& operator<<(std::ostream& os, const ihipStream_t& s) {
984  os << "stream:";
985  os << s.getDevice()->_deviceId;
986 
987  os << '.';
988  os << s._id;
989  return os;
990 }
991 
992 inline std::ostream& operator<<(std::ostream& os, const dim3& s) {
993  os << '{';
994  os << s.x;
995  os << ',';
996  os << s.y;
997  os << ',';
998  os << s.z;
999  os << '}';
1000  return os;
1001 }
1002 
1003 inline std::ostream& operator<<(std::ostream& os, const gl_dim3& s) {
1004  os << '{';
1005  os << s.x;
1006  os << ',';
1007  os << s.y;
1008  os << ',';
1009  os << s.z;
1010  os << '}';
1011  return os;
1012 }
1013 
1014 // Stream printf functions:
1015 inline std::ostream& operator<<(std::ostream& os, const hipEvent_t& e) {
1016  os << "event:" << std::hex << static_cast<void*>(e);
1017  return os;
1018 }
1019 
1020 inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) {
1021  os << "ctx:" << static_cast<const void*>(c) << ".dev:" << c->getDevice()->_deviceId;
1022  return os;
1023 }
1024 
1025 
1026 // Helper functions that are used across src files:
1027 namespace hip_internal {
1028 hipError_t memcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
1029  hipStream_t stream);
1030 
1031 hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags);
1032 
1033 hipError_t ihipHostFree(TlsData *tls, void* ptr);
1034 
1035 }  // namespace hip_internal
1036 
1037 #define MAX_COOPERATIVE_GPUs 255
1038 
1039 // do not change these two structs without changing the device library
1040 struct mg_sync {
1041  uint w0;
1042  uint w1;
1043 };
1044 
1045 struct mg_info {
1046  struct mg_sync *mgs;
1047  uint grid_id;
1048  uint num_grids;
1049  ulong prev_sum;
1050  ulong all_sum;
1051 };
1052 
1053 //---
1054 // TODO - review the context creation strategy here. Really should be:
1055 // - first "non-device" runtime call creates the context for this thread. Allowed to call
1056 // setDevice first.
1057 // - hipDeviceReset destroys the primary context for device?
1058 // - Then context is created again for next usage.
1059 static inline ihipCtx_t* iihipGetTlsDefaultCtx(TlsData* tls) {
1060  // Per-thread initialization of the TLS:
1061  if ((tls->defaultCtx == nullptr) && (g_deviceCnt > 0)) {
1062  tls->defaultCtx = ihipGetPrimaryCtx(0);
1063  }
1064  return tls->defaultCtx;
1065 }
1066 
1067 #endif