23 #ifndef HIP_SRC_HIP_HCC_INTERNAL_H 24 #define HIP_SRC_HIP_HCC_INTERNAL_H 28 #include <unordered_map> 31 #include "hsa/hsa_ext_amd.h" 32 #include "hip/hip_runtime.h" 33 #include "hip_prof_api.h" 36 #include <unordered_map> 38 #if (__hcc_workweek__ < 16354) 39 #error("This version of HIP requires a newer version of HCC."); 44 #if defined(__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE) and \ 45 (__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE != 0) 46 #define USE_APP_PTR_FOR_CTX 1 58 extern const int release;
62 extern int HIP_LAUNCH_BLOCKING;
63 extern int HIP_API_BLOCKING;
65 extern int HIP_PRINT_ENV;
69 extern int HIP_STAGING_SIZE;
70 extern int HIP_STREAM_SIGNALS;
71 extern int HIP_VISIBLE_DEVICES;
72 extern int HIP_FORCE_P2P_HOST;
74 extern int HIP_HOST_COHERENT;
76 extern int HIP_HIDDEN_FREE_MEM;
79 extern int HIP_SYNC_HOST_ALLOC;
80 extern int HIP_SYNC_STREAM_WAIT;
82 extern int HIP_SYNC_NULL_STREAM;
83 extern int HIP_INIT_ALLOC;
84 extern int HIP_FORCE_NULL_STREAM;
86 extern int HIP_SYNC_FREE;
88 extern int HIP_DUMP_CODE_OBJECT;
91 extern int HCC_OPT_FLUSH;
#define IMAGE_PITCH_ALIGNMENT 256

/// Rounds an integral @p value down to the nearest multiple of @p alignment.
/// The mask arithmetic is only meaningful when @p alignment is a power of two.
template <typename T>
inline T alignDown(T value, size_t alignment) {
    return static_cast<T>(value & ~(alignment - 1));
}

/// Rounds the address held in @p value down to the nearest @p alignment-byte
/// boundary (power-of-two @p alignment).
template <typename T>
inline T* alignDown(T* value, size_t alignment) {
    return reinterpret_cast<T*>(alignDown(reinterpret_cast<uintptr_t>(value), alignment));
}

/// Rounds an integral @p value up to the nearest multiple of @p alignment
/// (power-of-two @p alignment).
template <typename T>
inline T alignUp(T value, size_t alignment) {
    return alignDown(static_cast<T>(value + alignment - 1), alignment);
}

/// Rounds the address held in @p value up to the nearest @p alignment-byte
/// boundary (power-of-two @p alignment).
///
/// The rounding is done on the integer value of the address. Adding
/// `alignment - 1` directly to a `T*` would advance by
/// `(alignment - 1) * sizeof(T)` bytes and overshoot the closest boundary for
/// any T wider than one byte.
template <typename T>
inline T* alignUp(T* value, size_t alignment) {
    return reinterpret_cast<T*>(alignUp(reinterpret_cast<uintptr_t>(value), alignment));
}
110 size_t getNumChannels(hsa_ext_image_channel_order_t channelOrder) {
111 switch (channelOrder) {
112 case HSA_EXT_IMAGE_CHANNEL_ORDER_RG:
114 case HSA_EXT_IMAGE_CHANNEL_ORDER_RGB:
116 case HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA:
118 case HSA_EXT_IMAGE_CHANNEL_ORDER_R:
124 size_t getElementSize(hsa_ext_image_channel_order_t channelOrder, hsa_ext_image_channel_type_t channelType) {
125 size_t bytesPerPixel = getNumChannels(channelOrder);
126 switch (channelType) {
127 case HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8:
128 case HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8:
131 case HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32:
132 case HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32:
133 case HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT:
141 return bytesPerPixel;
149 int tid()
const {
return _shortTid; };
150 pid_t pid()
const {
return _pid; };
151 uint64_t incApiSeqNum() {
return ++_apiSeqNum; };
152 uint64_t apiSeqNum()
const {
return _apiSeqNum; };
163 static const uint64_t MAX_TRIGGER = std::numeric_limits<uint64_t>::max();
165 void print(
int tid) {
166 std::cout <<
"Enabling tracing for ";
167 for (
auto iter = _profTrigger.begin(); iter != _profTrigger.end(); iter++) {
168 std::cout <<
"tid:" << tid <<
"." << *iter <<
",";
173 uint64_t nextTrigger() {
return _profTrigger.empty() ? MAX_TRIGGER : _profTrigger.back(); };
174 void add(uint64_t trigger) { _profTrigger.push_back(trigger); };
175 void sort() { std::sort(_profTrigger.begin(), _profTrigger.end(), std::greater<int>()); };
178 std::vector<uint64_t> _profTrigger;
187 lastHipError = hipSuccess;
188 getPrimaryCtx =
true;
189 defaultCtx =
nullptr;
192 hipError_t lastHipError;
198 std::stack<ihipCtx_t*> ctxStack;
202 #define GET_TLS() TlsData *tls = tls_get_ptr() 204 extern std::vector<ProfTrigger> g_dbStartTriggers;
205 extern std::vector<ProfTrigger> g_dbStopTriggers;
215 #define KNRM "\x1B[0m" 216 #define KRED "\x1B[31m" 217 #define KGRN "\x1B[32m" 218 #define KYEL "\x1B[33m" 219 #define KBLU "\x1B[34m" 220 #define KMAG "\x1B[35m" 221 #define KCYN "\x1B[36m" 222 #define KWHT "\x1B[37m" 224 extern const char* API_COLOR;
225 extern const char* API_COLOR_END;
231 #define EVENT_THREAD_SAFE 1 233 #define STREAM_THREAD_SAFE 1 235 #define CTX_THREAD_SAFE 1 237 #define DEVICE_THREAD_SAFE 1 242 #define COMPILE_HIP_DB 1 250 #define COMPILE_HIP_TRACE_API 0x3 254 #define TRACE_ALL 0 // 0x01 255 #define TRACE_KCMD 1 // 0x02, kernel command 256 #define TRACE_MCMD 2 // 0x04, memory command 257 #define TRACE_MEM 3 // 0x08, memory allocation or deallocation. 258 #define TRACE_SYNC 4 // 0x10, synchronization (host or hipStreamWaitEvent) 259 #define TRACE_QUERY 5 // 0x20, hipEventRecord, hipEventQuery, hipStreamQuery 270 #define DB_MAX_FLAG 6 277 const char* _shortName;
281 static const DbName dbName[] = {
283 {KYEL,
"sync"}, {KCYN,
"mem"}, {KMAG,
"copy"}, {KRED,
"warn"},
289 #define tprintf(trace_level, ...) \ 291 if (HIP_DB & (1 << (trace_level))) { \ 294 snprintf(msgStr, sizeof(msgStr), __VA_ARGS__); \ 295 fprintf(stderr, " %ship-%s pid:%d tid:%d:%s%s", dbName[trace_level]._color, \ 296 dbName[trace_level]._shortName, tls->tidInfo.pid(), tls->tidInfo.tid(), msgStr, KNRM); \ 301 #define tprintf(trace_level, ...) 305 static inline uint64_t getTicks() {
return hc::get_system_ticks(); }
308 extern uint64_t recordApiTrace(
TlsData *tls, std::string* fullStr,
const std::string& apiStr);
310 #if (COMPILE_HIP_TRACE_API & 0x1) 311 #define API_TRACE(forceTrace, ...) \ 313 uint64_t hipApiStartTick = 0; \ 315 tls->tidInfo.incApiSeqNum(); \ 317 (COMPILE_HIP_DB && (HIP_TRACE_API & (1 << TRACE_ALL)))) { \ 318 std::string apiStr = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')'; \ 319 std::string fullStr; \ 320 hipApiStartTick = recordApiTrace(tls, &fullStr, apiStr); \ 326 #define API_TRACE(IS_CMD, ...) GET_TLS(); tls->tidInfo.incApiSeqNum(); 329 #define ihipGetTlsDefaultCtx() iihipGetTlsDefaultCtx(tls) 330 #define ihipSetTlsDefaultCtx(ctx) tls->defaultCtx = ctx 332 #define HIP_SET_DEVICE() ihipDeviceSetState(tls); 337 #define HIP_INIT_API(cid, ...) \ 338 hip_impl::hip_init(); \ 339 API_TRACE(0, __VA_ARGS__); \ 340 HIP_CB_SPAWNER_OBJECT(cid); 346 #define HIP_INIT_SPECIAL_API(cid, tbit, ...) \ 347 hip_impl::hip_init(); \ 348 API_TRACE((HIP_TRACE_API & (1 << tbit)), __VA_ARGS__); \ 349 HIP_CB_SPAWNER_OBJECT(cid); 355 #define ihipLogStatus(hipStatus) \ 357 hipError_t localHipStatus = hipStatus; \ 358 tls->lastHipError = localHipStatus; \ 360 if ((COMPILE_HIP_TRACE_API & 0x2) && HIP_TRACE_API & (1 << TRACE_ALL)) { \ 361 auto ticks = getTicks() - hipApiStartTick; \ 362 fprintf(stderr, " %ship-api pid:%d tid:%d.%lu %-30s ret=%2d (%s)>> +%lu ns%s\n", \ 363 (localHipStatus == 0) ? API_COLOR : KRED, tls->tidInfo.pid(), tls->tidInfo.tid(), \ 364 tls->tidInfo.apiSeqNum(), __func__, localHipStatus, \ 365 ihipErrorString(localHipStatus), ticks, API_COLOR_END); \ 394 #define HIP_IPC_MEM_RESERVED_SIZE 24 401 char reserved[HIP_IPC_MEM_RESERVED_SIZE];
407 #define HIP_IPC_EVENT_RESERVED_SIZE 32 411 char shmem_name[HIP_IPC_HANDLE_SIZE];
416 std::string fileName;
417 hsa_executable_t executable = {};
418 hsa_code_object_reader_t coReader = {};
421 std::string, std::vector<std::pair<std::size_t, std::size_t>>> kernargs;
424 if (executable.handle) hsa_executable_destroy(executable);
425 if (coReader.handle) hsa_code_object_reader_destroy(coReader);
435 bool try_lock() {
return true; }
439 #if EVENT_THREAD_SAFE 440 typedef std::mutex EventMutex;
442 #warning "Stream thread-safe disabled" 446 #if STREAM_THREAD_SAFE 447 typedef std::mutex StreamMutex;
449 #warning "Stream thread-safe disabled" 455 typedef std::mutex CtxMutex;
458 #warning "Ctx thread-safe disabled" 461 #if DEVICE_THREAD_SAFE 462 typedef std::mutex DeviceMutex;
465 #warning "Device thread-safe disabled" 472 template <
typename T>
476 : _criticalData(&criticalData),
477 _autoUnlock(autoUnlock)
480 tprintf(DB_SYNC,
"locking criticalData=%p for %s..\n", _criticalData,
481 ToString(_criticalData->_parent).c_str());
482 _criticalData->_mutex.lock();
487 tprintf(DB_SYNC,
"auto-unlocking criticalData=%p for %s...\n", _criticalData,
488 ToString(_criticalData->_parent).c_str());
489 _criticalData->_mutex.unlock();
494 tprintf(DB_SYNC,
"unlocking criticalData=%p for %s...\n", _criticalData,
495 ToString(_criticalData->_parent).c_str());
496 _criticalData->_mutex.unlock();
500 T* operator->() {
return _criticalData; };
508 template <
typename MUTEX_TYPE>
512 void lock() { _mutex.lock(); }
513 void unlock() { _mutex.unlock(); }
514 bool try_lock() {
return _mutex.try_lock(); }
520 template <
typename MUTEX_TYPE>
524 : _parent{parentStream}, _av{av}, _last_op_was_a_copy{
false}
535 tprintf(DB_SYNC,
"munlocking criticalData=%p for %s...\n",
this,
536 ToString(this->_parent).c_str());
542 tprintf(DB_SYNC,
"mtry_locking=%d criticalData=%p for %s...\n", gotLock,
this,
543 ToString(this->_parent).c_str());
544 return gotLock ?
this :
nullptr;
548 hc::accelerator_view _av;
549 bool _last_op_was_a_copy;
582 enum ScheduleMode { Auto, Spin, Yield };
583 typedef uint64_t SeqNum_t;
590 void locked_copySync(
void* dst,
const void* src,
size_t sizeBytes,
unsigned kind,
591 bool resolveOn =
true);
593 bool locked_copy2DSync(
void* dst,
const void* src,
size_t width,
size_t height,
size_t srcPitch,
size_t dstPitch,
unsigned kind,
594 bool resolveOn =
true);
596 void locked_copyAsync(
void* dst,
const void* src,
size_t sizeBytes,
unsigned kind);
598 bool locked_copy2DAsync(
void* dst,
const void* src,
size_t width,
size_t height,
size_t srcPitch,
size_t dstPitch,
unsigned kind);
600 void lockedSymbolCopySync(hc::accelerator& acc,
void* dst,
void* src,
size_t sizeBytes,
601 size_t offset,
unsigned kind);
602 void lockedSymbolCopyAsync(hc::accelerator& acc,
void* dst,
void* src,
size_t sizeBytes,
603 size_t offset,
unsigned kind);
608 LockedAccessor_StreamCrit_t lockopen_preKernelCommand();
609 void lockclose_postKernelCommand(
const char* kernelName, hc::accelerator_view* av,
bool unlockNotNeeded = 0);
611 void locked_wait(
bool& waited);
614 hc::accelerator_view* locked_getAv() {
615 LockedAccessor_StreamCrit_t crit(_criticalData);
620 hc::completion_future locked_recordEvent(
hipEvent_t event);
622 ihipStreamCritical_t& criticalData() {
return _criticalData; };
625 hc::hcWaitMode waitMode()
const;
628 void wait(LockedAccessor_StreamCrit_t& crit);
630 void launchModuleKernel(hc::accelerator_view av, hsa_signal_t signal, uint32_t blockDimX,
631 uint32_t blockDimY, uint32_t blockDimZ, uint32_t gridDimX,
632 uint32_t gridDimY, uint32_t gridDimZ, uint32_t groupSegmentSize,
633 uint32_t sharedMemBytes,
void* kernarg,
size_t kernSize,
644 bool isDefaultStream()
const {
return _id == 0; };
646 std::vector<mg_info*> coopMemsTracker;
657 unsigned resolveMemcpyDirection(
bool srcInDeviceMem,
bool dstInDeviceMem);
658 void resolveHcMemcpyDirection(
unsigned hipMemKind,
const hc::AmPointerInfo* dstPtrInfo,
659 const hc::AmPointerInfo* srcPtrInfo, hc::hcCommandKind* hcCopyDir,
660 ihipCtx_t** copyDevice,
bool* forceUnpinnedCopy);
662 bool canSeeMemory(
const ihipCtx_t* thisCtx,
const hc::AmPointerInfo* dstInfo,
663 const hc::AmPointerInfo* srcInfo);
665 void addSymbolPtrToTracker(hc::accelerator& acc,
void* ptr,
size_t sizeBytes);
669 ihipStreamCritical_t _criticalData;
671 std::mutex _hasQueueLock;
676 friend std::ostream& operator<<(std::ostream& os,
const ihipStream_t& s);
679 ScheduleMode _scheduleMode;
685 enum hipEventStatus_t {
686 hipEventStatusUnitialized = 0,
687 hipEventStatusCreated = 1,
688 hipEventStatusRecording = 2,
689 hipEventStatusComplete = 3,
693 enum ihipEventType_t {
694 hipEventTypeIndependent,
695 hipEventTypeStartCommand,
696 hipEventTypeStopCommand,
699 #define IPC_SIGNALS_PER_EVENT 32 701 std::atomic<int> owners;
702 std::atomic<int> read_index;
703 std::atomic<int> write_index;
704 std::atomic<int> signal[IPC_SIGNALS_PER_EVENT];
710 _state = hipEventStatusCreated;
713 _type = hipEventTypeIndependent;
719 void marker(
const hc::completion_future& marker) { _marker = marker; }
720 hc::completion_future& marker() {
return _marker; }
721 uint64_t timestamp()
const {
return _timestamp; }
722 ihipEventType_t type()
const {
return _type; }
724 ihipEventType_t _type;
725 hipEventStatus_t _state;
729 std::string _ipc_name;
733 hc::completion_future _marker;
739 template <
typename MUTEX_TYPE>
762 void attachToCompletionFuture(
const hc::completion_future* cf,
hipStream_t stream,
763 ihipEventType_t eventType);
767 LockedAccessor_EventCrit_t crit(_criticalData);
768 return _criticalData._eventData;
771 ihipEventCritical_t& criticalData() {
return _criticalData; };
778 ihipEventCritical_t _criticalData;
786 template <
typename MUTEX_TYPE>
790 : _parent(parentDevice), _ctxCount(0){};
797 std::list<ihipCtx_t*>& ctxs() {
return _ctxs; };
798 const std::list<ihipCtx_t*>& const_ctxs()
const {
return _ctxs; };
799 int getcount() {
return _ctxCount; };
806 std::list<ihipCtx_t*> _ctxs;
820 ihipDevice_t(
unsigned deviceId,
unsigned deviceCnt, hc::accelerator& acc);
824 ihipCtx_t* getPrimaryCtx()
const {
return _primaryCtx; };
827 ihipDeviceCritical_t& criticalData() {
return _criticalData; };
832 hc::accelerator _acc;
833 hsa_agent_t _hsaAgent;
840 uint32_t _driver_node_id;
850 ihipDeviceCritical_t _criticalData;
862 std::vector<char> _arguments;
867 template <
typename MUTEX_TYPE>
871 : _parent(parentCtx), _peerCnt(0) {
872 _peerAgents =
new hsa_agent_t[deviceCnt];
876 if (_peerAgents !=
nullptr) {
878 _peerAgents =
nullptr;
885 std::list<ihipStream_t*>& streams() {
return _streams; };
886 const std::list<ihipStream_t*>& const_streams()
const {
return _streams; };
890 bool isPeerWatcher(
const ihipCtx_t* peer);
894 void resetPeerWatchers(
ihipCtx_t* thisDevice);
895 void printPeerWatchers(FILE* f)
const;
897 uint32_t peerCnt()
const {
return _peerCnt; };
898 hsa_agent_t* peerAgents()
const {
return _peerAgents; };
902 std::list<ihipCtx_t*> _peers;
904 std::stack<ihipExec_t> _execStack;
912 std::list<ihipStream_t*> _streams;
920 hsa_agent_t* _peerAgents;
922 void recomputePeerAgents();
953 void locked_waitAllStreams();
954 void locked_syncDefaultStream(
bool waitOnSelf,
bool syncHost);
956 ihipCtxCritical_t& criticalData() {
return _criticalData; };
958 const ihipDevice_t* getDevice()
const {
return _device; };
959 int getDeviceNum()
const {
return _device->_deviceId; };
962 ihipDevice_t* getWriteableDevice()
const {
return _device; };
964 std::string toString()
const;
982 ihipCtxCritical_t _criticalData;
988 extern unsigned g_deviceCnt;
989 extern hsa_agent_t g_cpu_agent;
990 extern hsa_agent_t* g_allAgents;
994 extern void ihipInit();
995 extern const char* ihipErrorString(hipError_t);
996 extern hipError_t ihipSynchronize(
TlsData *tls);
997 extern void ihipCtxStackUpdate();
998 extern hipError_t ihipDeviceSetState(
TlsData *tls);
1001 ihipCtx_t* ihipGetPrimaryCtx(
unsigned deviceIndex);
1003 const char* name, hsa_agent_t *agent);
1018 hipError_t ihipGetGlobalVar(hipDeviceptr_t* dev_ptr,
size_t* size_ptr,
const char* hostVar,
1022 inline std::ostream& operator<<(std::ostream& os,
const ihipStream_t& s) {
1024 os << s.getDevice()->_deviceId;
1031 inline std::ostream& operator<<(std::ostream& os,
const dim3& s) {
1042 inline std::ostream& operator<<(std::ostream& os,
const gl_dim3& s) {
1054 inline std::ostream& operator<<(std::ostream& os,
const hipEvent_t& e) {
1055 os <<
"event:" << std::hex << static_cast<void*>(e);
1059 inline std::ostream& operator<<(std::ostream& os,
const ihipCtx_t* c) {
1060 os <<
"ctx:" <<
static_cast<const void*
>(c) <<
".dev:" << c->getDevice()->_deviceId;
1067 hipError_t memcpyAsync(
void* dst,
const void* src,
size_t sizeBytes, hipMemcpyKind kind,
1070 hipError_t ihipHostMalloc(
TlsData *tls,
void** ptr,
size_t sizeBytes,
unsigned int flags,
bool noSync = 0);
1072 hipError_t ihipHostFree(
TlsData *tls,
void* ptr);
1076 #define MAX_COOPERATIVE_GPUs 255 1086 if ((tls->defaultCtx ==
nullptr) && (g_deviceCnt > 0)) {
1087 tls->defaultCtx = ihipGetPrimaryCtx(0);
1089 return tls->defaultCtx;
1100 hipFunction_t ihipGetDeviceFunction(
const void *hostFunction);
Definition: hip_hcc_internal.h:275
Definition: hip_hcc_internal.h:565
Definition: hip_hcc_internal.h:408
Definition: hip_hcc_internal.h:818
Definition: hip_hcc_internal.h:509
Definition: hip_hcc_internal.h:700
Definition: hip_hcc_internal.h:162
Definition: hip_hcc_internal.h:432
Definition: hip_hcc_internal.h:395
uint32_t x
x
Definition: hip_runtime_api.h:274
Definition: grid_launch.h:17
Definition: hip_hcc_internal.h:570
Definition: hip_module.cpp:108
Definition: hip_hcc_internal.h:185
Definition: hip_hcc_internal.h:857
Definition: hip_hcc_internal.h:938
Definition: hip_runtime_api.h:273
uint32_t y
y
Definition: hip_runtime_api.h:275
Definition: hip_hcc_internal.h:787
unsigned _computeUnits
Number of compute units supported by the device:
Definition: hip_hcc_internal.h:836
uint32_t z
z
Definition: hip_runtime_api.h:276
Definition: hip_runtime_api.h:83
Definition: hip_hcc_internal.h:708
Definition: hip_hcc_internal.h:740
Definition: hip_hcc_internal.h:1066
Definition: hip_hcc_internal.h:371
Definition: hip_hcc_internal.h:759
Definition: hip_hcc_internal.h:868
Definition: hip_hcc_internal.h:580
Definition: hip_hcc_internal.h:521
hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
Record an event in the specified stream.
Definition: hip_event.cpp:213
hsa_amd_ipc_memory_t ipc_handle
ipc memory handle on ROCr
Definition: hip_hcc_internal.h:398
Definition: hip_hcc_internal.h:473
Definition: hip_hcc_internal.h:415
hipError_t hipStreamQuery(hipStream_t stream)
Return #hipSuccess if all of the operations in the specified stream have completed, or #hipErrorNotReady if not.
Definition: hip_stream.cpp:161
Definition: hip_hcc_internal.h:145