#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H

#if !defined(__HIPCC_RTC__)
#endif // !defined(__HIPCC_RTC__)

#include <hip/hip_vector_types.h>
#if __HIP_CLANG_ONLY__
extern "C" __device__ int printf(const char* fmt, ...);
#else
template <typename... All>
static inline __device__ void printf(const char* format, All... all) {}
#endif // __HIP_CLANG_ONLY__
__device__
static inline unsigned int __popc(unsigned int input) {
    return __builtin_popcount(input);
}
__device__
static inline unsigned int __popcll(unsigned long long int input) {
    return __builtin_popcountll(input);
}
__device__
static inline int __clz(int input) {
    return __ockl_clz_u32((uint)input);
}
__device__
static inline int __clzll(long long int input) {
    return __ockl_clz_u64((ullong)input);
}
__device__
static inline unsigned int __ffs(unsigned int input) {
    return (input == 0 ? -1 : __builtin_ctz(input)) + 1;
}
__device__
static inline unsigned int __ffsll(unsigned long long int input) {
    return (input == 0 ? -1 : __builtin_ctzll(input)) + 1;
}
__device__
static inline unsigned int __ffs(int input) {
    return (input == 0 ? -1 : __builtin_ctz(input)) + 1;
}
__device__
static inline unsigned int __ffsll(long long int input) {
    return (input == 0 ? -1 : __builtin_ctzll(input)) + 1;
}
__device__
static inline unsigned int __brev(unsigned int input) {
    return __builtin_bitreverse32(input);
}
__device__
static inline unsigned long long int __brevll(unsigned long long int input) {
    return __builtin_bitreverse64(input);
}
__device__
static inline unsigned int __lastbit_u32_u64(uint64_t input) {
    return input == 0 ? -1 : __builtin_ctzl(input);
}
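// Examples (these follow directly from the definitions above):
//   __popc(0x000000FFu) == 8     eight set bits
//   __ffs(0x00000008)   == 4     1-based index of the lowest set bit
//   __ffs(0)            == 0     by convention
//   __brev(0x80000000u) == 1u    bit order reversed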
__device__
static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
    uint32_t offset = src1 & 31;
    uint32_t width = src2 & 31;
    return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
}
__device__
static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
    uint64_t offset = src1 & 63;
    uint64_t width = src2 & 63;
    return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
}
__device__
static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
    uint32_t offset = src2 & 31;
    uint32_t width = src3 & 31;
    uint32_t mask = (1 << width) - 1;
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}
__device__
static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
    uint64_t offset = src2 & 63;
    uint64_t width = src3 & 63;
    uint64_t mask = (1ULL << width) - 1;
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}
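// Worked example for the extract/insert pair (illustrative sketch, not part
// of the header): extracting a field and writing it back is the identity as
// long as offset + width stays within the word.
#if 0
__device__ inline void __bitfield_roundtrip_example() {
    unsigned int v = 0xABCD1234u;
    unsigned int f = __bitextract_u32(v, 8, 8);   // bits 15:8 -> 0x12
    unsigned int w = __bitinsert_u32(v, f, 8, 8); // == v
    (void)w;
}
#endif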
__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
__device__ static unsigned int __hadd(int x, int y);
__device__ static int __mul24(int x, int y);
__device__ static long long int __mul64hi(long long int x, long long int y);
__device__ static int __mulhi(int x, int y);
__device__ static int __rhadd(int x, int y);
__device__ static unsigned int __sad(int x, int y, unsigned int z);
__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
__device__ static int __umul24(unsigned int x, unsigned int y);
__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
struct ucharHolder {
    union {
        unsigned char c[4];
        unsigned int ui;
    };
} __attribute__((aligned(4)));

struct uchar2Holder {
    union {
        unsigned int ui[2];
        unsigned char c[8];
    };
} __attribute__((aligned(8)));

__device__
static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
    struct uchar2Holder cHoldVal;
    struct ucharHolder cHoldKey;
    cHoldKey.ui = s;
    cHoldVal.ui[0] = x;
    cHoldVal.ui[1] = y;
    unsigned int result;
    result = cHoldVal.c[cHoldKey.c[0] & 0x07];
    result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);
    result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);
    result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);
    return result;
}
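// Selector semantics (illustrative sketch, not part of the header): each
// selector nibble of s picks one of the eight source bytes, x's bytes first.
#if 0
__device__ inline void __byte_perm_example() {
    unsigned int lo = __byte_perm(0x11223344u, 0x55667788u, 0x3210u); // 0x11223344
    unsigned int hi = __byte_perm(0x11223344u, 0x55667788u, 0x7654u); // 0x55667788
    (void)lo; (void)hi;
}
#endif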
__device__
static inline unsigned int __hadd(int x, int y) {
    int z = x + y;
    int sign = z & 0x80000000;
    int value = z & 0x7FFFFFFF;
    // Restore both high bits that an arithmetic right shift of z would set.
    return (value >> 1) | sign | ((unsigned)sign >> 1);
}
__device__
static inline int __mul24(int x, int y) {
    return __ockl_mul24_i32(x, y);
}
__device__
static inline long long __mul64hi(long long int x, long long int y) {
    ulong x0 = (ulong)x & 0xffffffffUL;
    long x1 = x >> 32;
    ulong y0 = (ulong)y & 0xffffffffUL;
    long y1 = y >> 32;
    ulong z0 = x0 * y0;
    long t = x1 * y0 + (z0 >> 32);
    long z1 = t & 0xffffffffL;
    long z2 = t >> 32;
    z1 = x0 * y1 + z1;
    return x1 * y1 + z2 + (z1 >> 32);
}
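// The high half of the 64x64-bit product is built from 32-bit halves: with
// x = x1*2^32 + x0 and y = y1*2^32 + y0,
//   x*y = x1*y1*2^64 + (x1*y0 + x0*y1)*2^32 + x0*y0,
// so the top 64 bits are x1*y1 plus the carries out of the cross terms.
// For example, __mul64hi(1LL << 40, 1LL << 40) == 1LL << 16, since the full
// product is 2^80.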
__device__
static inline int __mulhi(int x, int y) {
    return __ockl_mul_hi_i32(x, y);
}
__device__
static inline int __rhadd(int x, int y) {
    int z = x + y + 1;
    int sign = z & 0x80000000;
    int value = z & 0x7FFFFFFF;
    // Restore both high bits that an arithmetic right shift of z would set.
    return (value >> 1) | sign | ((unsigned)sign >> 1);
}
__device__
static inline unsigned int __sad(int x, int y, unsigned int z) {
    return x > y ? x - y + z : y - x + z;
}
__device__
static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
    return (x + y) >> 1;
}
__device__
static inline int __umul24(unsigned int x, unsigned int y) {
    return __ockl_mul24_u32(x, y);
}
__device__
static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
    ulong x0 = x & 0xffffffffUL;
    ulong x1 = x >> 32;
    ulong y0 = y & 0xffffffffUL;
    ulong y1 = y >> 32;
    ulong z0 = x0 * y0;
    ulong t = x1 * y0 + (z0 >> 32);
    ulong z1 = t & 0xffffffffUL;
    ulong z2 = t >> 32;
    z1 = x0 * y1 + z1;
    return x1 * y1 + z2 + (z1 >> 32);
}
__device__
static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
    return __ockl_mul_hi_u32(x, y);
}
__device__
static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
    return (x + y + 1) >> 1;
}
__device__
static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
    return __ockl_sadd_u32(x, y, z);
}
__device__
static inline unsigned int __lane_id() {
    return __builtin_amdgcn_mbcnt_hi(-1, __builtin_amdgcn_mbcnt_lo(-1, 0));
}

__device__
static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {
    return __builtin_amdgcn_mbcnt_lo(x, y);
}

__device__
static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {
    return __builtin_amdgcn_mbcnt_hi(x, y);
}
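// __lane_id composes the two mbcnt halves: each counts the set bits of a
// 64-bit mask that fall below the current lane, so an all-ones mask yields
// the lane's index (0..warpSize-1) within the wavefront.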
__device__
static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
    return tmp.u;
}

__device__
static inline float __hip_ds_bpermutef(int index, float src) {
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
    tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
    return tmp.f;
}

__device__
static inline unsigned __hip_ds_permute(int index, unsigned src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
    return tmp.u;
}

__device__
static inline float __hip_ds_permutef(int index, float src) {
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
    tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
    return tmp.f;
}
#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))

template <int pattern>
__device__
static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
    return tmp.u;
}

template <int pattern>
__device__
static inline float __hip_ds_swizzlef_N(float src) {
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
    tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
    return tmp.f;
}
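// Usage sketch (illustrative, not part of the header): the swizzle pattern is
// a compile-time immediate; in the quad-permute encoding from the AMD GCN ISA
// (0x8000 | sel3<<6 | sel2<<4 | sel1<<2 | sel0), 0x801B reverses each group
// of four lanes.
#if 0
__device__ inline unsigned __reverse_quad(unsigned src) {
    return __hip_ds_swizzle(src, 0x801B); // 0x8000 | 0b00011011
}
#endif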
#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
  __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))

template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
__device__
static inline int __hip_move_dpp_N(int src) {
    return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
                                    bound_ctrl);
}

static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
__device__
static inline
int __shfl(int var, int src_lane, int width = warpSize) {
    int self = __lane_id();
    int index = src_lane + (self & ~(width - 1));
    return __builtin_amdgcn_ds_bpermute(index << 2, var);
}

__device__
static inline
unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl(tmp.i, src_lane, width);
    return tmp.u;
}

__device__
static inline
float __shfl(float var, int src_lane, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl(tmp.i, src_lane, width);
    return tmp.f;
}
__device__
static inline
double __shfl(double var, int src_lane, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 =
        (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
static inline
long __shfl(long var, int src_lane, int width = warpSize) {
#ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 =
        (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
#else
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
#endif
}
__device__
static inline
unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
#ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 =
        (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
#else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
#endif
}
__device__
static inline
long long __shfl(long long var, int src_lane, int width = warpSize) {
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 =
        (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
static inline
unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 =
        (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
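// Usage sketch (illustrative, not part of the header): broadcast lane 0's
// value to every lane of the wavefront.
#if 0
__device__ inline int __broadcast_from_lane0(int v) {
    return __shfl(v, 0, warpSize);
}
#endif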
__device__
static inline
int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
    int self = __lane_id();
    int index = self - lane_delta;
    index = (index < (self & ~(width - 1))) ? self : index;
    return __builtin_amdgcn_ds_bpermute(index << 2, var);
}

__device__
static inline
unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_up(tmp.i, lane_delta, width);
    return tmp.u;
}

__device__
static inline
float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_up(tmp.i, lane_delta, width);
    return tmp.f;
}
__device__
static inline
double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 =
        (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
static inline
long __shfl_up(long var, unsigned int lane_delta, int width = warpSize) {
#ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 =
        (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
#else
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
#endif
}
__device__
static inline
unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize) {
#ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 =
        (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
#else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
#endif
}
__device__
static inline
long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 =
        (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
static inline
unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 =
        (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
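// Usage sketch (illustrative, not part of the header): an inclusive prefix
// sum across the wavefront. Lanes whose source would cross the lower segment
// boundary get their own value back, hence the lane-id guard.
#if 0
__device__ inline int __wave_inclusive_scan(int v) {
    for (int d = 1; d < warpSize; d *= 2) {
        int n = __shfl_up(v, d, warpSize);
        if ((int)__lane_id() >= d) v += n;
    }
    return v;
}
#endif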
__device__
static inline
int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
    int self = __lane_id();
    int index = self + lane_delta;
    index = (int)((self & (width - 1)) + lane_delta) >= width ? self : index;
    return __builtin_amdgcn_ds_bpermute(index << 2, var);
}

__device__
static inline
unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_down(tmp.i, lane_delta, width);
    return tmp.u;
}

__device__
static inline
float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_down(tmp.i, lane_delta, width);
    return tmp.f;
}
__device__
static inline
double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 =
        (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
static inline
long __shfl_down(long var, unsigned int lane_delta, int width = warpSize) {
#ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 =
        (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
#else
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
#endif
}
__device__
static inline
unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize) {
#ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 =
        (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
#else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
#endif
}
__device__
static inline
long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 =
        (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
static inline
unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 =
        (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
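// Usage sketch (illustrative, not part of the header): a wavefront-wide sum
// reduction; after the loop, lane 0 holds the total.
#if 0
__device__ inline int __wave_reduce_sum(int v) {
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
        v += __shfl_down(v, offset, warpSize);
    return v; // total is valid in lane 0
}
#endif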
__device__
static inline
int __shfl_xor(int var, int lane_mask, int width = warpSize) {
    int self = __lane_id();
    int index = self ^ lane_mask;
    index = index >= ((self + width) & ~(width - 1)) ? self : index;
    return __builtin_amdgcn_ds_bpermute(index << 2, var);
}

__device__
static inline
unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
    return tmp.u;
}

__device__
static inline
float __shfl_xor(float var, int lane_mask, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
    return tmp.f;
}
__device__
static inline
double __shfl_xor(double var, int lane_mask, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 =
        (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
static inline
long __shfl_xor(long var, int lane_mask, int width = warpSize) {
#ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 =
        (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
#else
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
#endif
}
__device__
static inline
unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize) {
#ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 =
        (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
#else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
#endif
}
__device__
static inline
long long __shfl_xor(long long var, int lane_mask, int width = warpSize) {
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 =
        (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
static inline
unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize) {
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 =
        (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
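// Usage sketch (illustrative, not part of the header): the butterfly variant
// with __shfl_xor leaves the total in every lane, not just lane 0.
#if 0
__device__ inline int __wave_allreduce_sum(int v) {
    for (int mask = warpSize / 2; mask > 0; mask /= 2)
        v += __shfl_xor(v, mask, warpSize);
    return v; // same total in all lanes
}
#endif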
#define MASK1 0x00ff00ff
#define MASK2 0xff00ff00

__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 + one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 + one2) & MASK2);
    return out;
}

__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 - one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 - one2) & MASK2);
    return out;
}

__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 * one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 * one2) & MASK2);
    return out;
}
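// The MASK1/MASK2 split implements byte-wise SIMD within a word: even and odd
// bytes are processed in separate passes so a carry out of one byte cannot
// ripple into its neighbour, then the two results are recombined. For example,
// adding 0x01FF01FF and 0x00010001 yields 0x01000100: each 0xFF + 0x01 wraps
// to 0x00 within its own byte instead of carrying out.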
__device__ static inline float __double2float_rd(double x) { return (float)x; }
__device__ static inline float __double2float_rn(double x) { return (float)x; }
__device__ static inline float __double2float_ru(double x) { return (float)x; }
__device__ static inline float __double2float_rz(double x) { return (float)x; }
__device__ static inline int __double2hiint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[1];
}

__device__ static inline int __double2loint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[0];
}
__device__ static inline int __double2int_rd(double x) { return (int)__ocml_floor_f64(x); }
__device__ static inline int __double2int_rn(double x) { return (int)__ocml_rint_f64(x); }
__device__ static inline int __double2int_ru(double x) { return (int)__ocml_ceil_f64(x); }
__device__ static inline int __double2int_rz(double x) { return (int)x; }

__device__ static inline long long int __double2ll_rd(double x) {
    return (long long)__ocml_floor_f64(x);
}
__device__ static inline long long int __double2ll_rn(double x) {
    return (long long)__ocml_rint_f64(x);
}
__device__ static inline long long int __double2ll_ru(double x) {
    return (long long)__ocml_ceil_f64(x);
}
__device__ static inline long long int __double2ll_rz(double x) { return (long long)x; }
__device__ static inline unsigned int __double2uint_rd(double x) {
    return (unsigned int)__ocml_floor_f64(x);
}
__device__ static inline unsigned int __double2uint_rn(double x) {
    return (unsigned int)__ocml_rint_f64(x);
}
__device__ static inline unsigned int __double2uint_ru(double x) {
    return (unsigned int)__ocml_ceil_f64(x);
}
__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }

__device__ static inline unsigned long long int __double2ull_rd(double x) {
    return (unsigned long long int)__ocml_floor_f64(x);
}
__device__ static inline unsigned long long int __double2ull_rn(double x) {
    return (unsigned long long int)__ocml_rint_f64(x);
}
__device__ static inline unsigned long long int __double2ull_ru(double x) {
    return (unsigned long long int)__ocml_ceil_f64(x);
}
__device__ static inline unsigned long long int __double2ull_rz(double x) {
    return (unsigned long long int)x;
}
__device__ static inline long long int __double_as_longlong(double x) {
    static_assert(sizeof(long long) == sizeof(double), "");

    long long tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
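// Bit-pattern round trip (illustrative sketch, not part of the header):
// memcpy-based punning preserves the exact bits, so the two casts compose to
// the identity.
#if 0
__device__ inline void __double_punning_example() {
    long long bits = __double_as_longlong(1.5); // 0x3FF8000000000000
    double back = __longlong_as_double(bits);   // == 1.5 exactly
    (void)back;
}
#endif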
__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }

__device__ static inline long long int __float2ll_rd(float x) {
    return (long long int)__ocml_floor_f32(x);
}
__device__ static inline long long int __float2ll_rn(float x) {
    return (long long int)__ocml_rint_f32(x);
}
__device__ static inline long long int __float2ll_ru(float x) {
    return (long long int)__ocml_ceil_f32(x);
}
__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }

__device__ static inline unsigned int __float2uint_rd(float x) {
    return (unsigned int)__ocml_floor_f32(x);
}
__device__ static inline unsigned int __float2uint_rn(float x) {
    return (unsigned int)__ocml_rint_f32(x);
}
__device__ static inline unsigned int __float2uint_ru(float x) {
    return (unsigned int)__ocml_ceil_f32(x);
}
__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }

__device__ static inline unsigned long long int __float2ull_rd(float x) {
    return (unsigned long long int)__ocml_floor_f32(x);
}
__device__ static inline unsigned long long int __float2ull_rn(float x) {
    return (unsigned long long int)__ocml_rint_f32(x);
}
__device__ static inline unsigned long long int __float2ull_ru(float x) {
    return (unsigned long long int)__ocml_ceil_f32(x);
}
__device__ static inline unsigned long long int __float2ull_rz(float x) {
    return (unsigned long long int)x;
}
__device__ static inline int __float_as_int(float x) {
    static_assert(sizeof(int) == sizeof(float), "");

    int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__ static inline unsigned int __float_as_uint(float x) {
    static_assert(sizeof(unsigned int) == sizeof(float), "");

    unsigned int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
__device__ static inline double __hiloint2double(int hi, int lo) {
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    double tmp1;
    __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));

    return tmp1;
}
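// Round trip (illustrative sketch, not part of the header): __hiloint2double
// inverts the __double2hiint/__double2loint pair.
#if 0
__device__ inline double __hilo_roundtrip(double d) {
    return __hiloint2double(__double2hiint(d), __double2loint(d)); // == d
}
#endif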
__device__ static inline double __int2double_rn(int x) { return (double)x; }

__device__ static inline float __int2float_rd(int x) { return (float)x; }
__device__ static inline float __int2float_rn(int x) { return (float)x; }
__device__ static inline float __int2float_ru(int x) { return (float)x; }
__device__ static inline float __int2float_rz(int x) { return (float)x; }

__device__ static inline float __int_as_float(int x) {
    static_assert(sizeof(float) == sizeof(int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
__device__ static inline double __ll2double_rd(long long int x) { return (double)x; }
__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
__device__ static inline double __ll2double_ru(long long int x) { return (double)x; }
__device__ static inline double __ll2double_rz(long long int x) { return (double)x; }

__device__ static inline float __ll2float_rd(long long int x) { return (float)x; }
__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
__device__ static inline float __ll2float_ru(long long int x) { return (float)x; }
__device__ static inline float __ll2float_rz(long long int x) { return (float)x; }

__device__ static inline double __longlong_as_double(long long int x) {
    static_assert(sizeof(double) == sizeof(long long), "");

    double tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
__device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }
__device__ static inline float __uint2float_rd(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_ru(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_rz(unsigned int x) { return (float)x; }

__device__ static inline float __uint_as_float(unsigned int x) {
    static_assert(sizeof(float) == sizeof(unsigned int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
__device__ static inline double __ull2double_rd(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_ru(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_rz(unsigned long long int x) { return (double)x; }

__device__ static inline float __ull2float_rd(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_ru(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_rz(unsigned long long int x) { return (float)x; }
#if __HIP_CLANG_ONLY__

// Clock functions
__device__ long long int __clock64();
__device__ long long int __clock();
__device__ long long int clock64();
__device__ long long int clock();
// hip.amdgcn.bc - named sync
__device__ void __named_sync(int a, int b);

#ifdef __HIP_DEVICE_COMPILE__

__device__
inline __attribute__((always_inline))
long long int __clock64() {
    return (long long int)__builtin_readcyclecounter();
}

__device__
inline __attribute__((always_inline))
long long int __clock() { return __clock64(); }

__device__
inline __attribute__((always_inline))
long long int clock64() { return __clock64(); }

__device__
inline __attribute__((always_inline))
long long int clock() { return __clock(); }

// hip.amdgcn.bc - named sync
__device__
inline
void __named_sync(int a, int b) { __builtin_amdgcn_s_barrier(); }

#endif // __HIP_DEVICE_COMPILE__
__device__
inline
int __all(int predicate) {
    return __ockl_wfall_i32(predicate);
}

__device__
inline
int __any(int predicate) {
    return __ockl_wfany_i32(predicate);
}

// From llvm/include/llvm/IR/InstrTypes.h
#define ICMP_NE 33

__device__
inline
unsigned long long int __ballot(int predicate) {
    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}

__device__
inline
unsigned long long int __ballot64(int predicate) {
    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}
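// Usage sketch (illustrative, not part of the header): count the lanes for
// which a predicate holds by combining ballot with popcount.
#if 0
__device__ inline unsigned int __count_votes(int predicate) {
    return __popcll(__ballot64(predicate));
}
#endif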
__device__
inline
uint64_t __lanemask_gt()
{
    uint32_t lane = __ockl_lane_u32();
    if (lane == 63)
        return 0;
    uint64_t ballot = __ballot64(1);
    uint64_t mask = (~((uint64_t)0)) << (lane + 1);
    return mask & ballot;
}

__device__
inline
uint64_t __lanemask_lt()
{
    uint32_t lane = __ockl_lane_u32();
    uint64_t ballot = __ballot64(1);
    uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
    return mask & ballot;
}

__device__
inline
uint64_t __lanemask_eq()
{
    uint32_t lane = __ockl_lane_u32();
    uint64_t mask = ((uint64_t)1 << lane);
    return mask;
}
__device__ inline void* __local_to_generic(void* p) { return p; }

#ifdef __HIP_DEVICE_COMPILE__
__device__
inline
void* __get_dynamicgroupbaseptr()
{
    // Get group segment base pointer.
    return (char*)__local_to_generic((void*)__to_local(__llvm_amdgcn_groupstaticsize()));
}
#else
__device__
void* __get_dynamicgroupbaseptr();
#endif // __HIP_DEVICE_COMPILE__

__device__
inline
void* __amdgcn_get_dynamicgroupbaseptr() {
    return __get_dynamicgroupbaseptr();
}
// Memory fence functions
__device__
inline
static void __threadfence()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
}

__device__
inline
static void __threadfence_block()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
}

__device__
inline
static void __threadfence_system()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
}
// abort
__device__
inline
__attribute__((weak))
void abort() {
    return __builtin_trap();
}
// The noinline attribute helps encapsulate the printf expansion, which
// otherwise has a performance impact even if the assert never fires.
#if defined(_WIN32) || defined(_WIN64)
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void _wassert(const wchar_t* _msg, const wchar_t* _file, unsigned _line) {
    // FIXME: Need `wchar_t` support to generate the assertion message.
    __builtin_trap();
}
#else /* defined(_WIN32) || defined(_WIN64) */
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void __assert_fail(const char* assertion,
                   const char* file,
                   unsigned int line,
                   const char* function)
{
    printf("%s:%u: %s: Device-side assertion `%s' failed.\n", file, line,
           function, assertion);
    __builtin_trap();
}

extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void __assertfail(const char* assertion,
                  const char* file,
                  unsigned int line,
                  const char* function,
                  size_t charsize)
{
    // Ignore all the arguments for now.
    __builtin_trap();
}
#endif /* defined(_WIN32) || defined(_WIN64) */
__device__
inline
static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
{
    if (flags) {
        __atomic_work_item_fence(flags, __memory_order_release, scope);
        __builtin_amdgcn_s_barrier();
        __atomic_work_item_fence(flags, __memory_order_acquire, scope);
    } else {
        __builtin_amdgcn_s_barrier();
    }
}

__device__
inline
static void __barrier(int n)
{
    __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
}

__device__
inline
__attribute__((convergent))
void __syncthreads()
{
    __barrier(__CLK_LOCAL_MEM_FENCE);
}

__device__
inline
__attribute__((convergent))
int __syncthreads_count(int predicate)
{
    return __ockl_wgred_add_i32(!!predicate);
}

__device__
inline
__attribute__((convergent))
int __syncthreads_and(int predicate)
{
    return __ockl_wgred_and_i32(!!predicate);
}

__device__
inline
__attribute__((convergent))
int __syncthreads_or(int predicate)
{
    return __ockl_wgred_or_i32(!!predicate);
}
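// Usage sketch (illustrative, not part of the header): every thread in the
// block receives the number of threads whose predicate was non-zero.
#if 0
__device__ inline int __count_small_x() {
    return __syncthreads_count(threadIdx.x < 7);
}
#endif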
// HW_ID register layout (see the AMD GCN ISA documentation).
#define HW_ID               4

#define HW_ID_CU_ID_SIZE    4
#define HW_ID_CU_ID_OFFSET  8

#define HW_ID_SE_ID_SIZE    2
#define HW_ID_SE_ID_OFFSET  13

// Encode the s_getreg immediate from a field size (1-based), offset, and
// register id.
#define GETREG_IMMED(SZ, OFF, REG) (((SZ) << 11) | ((OFF) << 6) | (REG))

/*
  __smid returns the wave's assigned Compute Unit and Shader Engine.
  The Compute Unit, CU_ID, is returned in bits 3:0, and the Shader Engine,
  SE_ID, in bits 5:4. Note: the result can vary over time.
  SZ minus 1 since SIZE is 1-based.
*/
__device__
inline
unsigned __smid(void)
{
    unsigned cu_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
    unsigned se_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_SE_ID_SIZE - 1, HW_ID_SE_ID_OFFSET, HW_ID));

    // Each shader engine exposes 16 compute units.
    return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
}
#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
#define HIP_DYNAMIC_SHARED_ATTRIBUTE

#endif // defined(__clang__) && defined(__HIP__)
static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);
    auto srcPtr = static_cast<const unsigned char*>(src);

    while (size >= 4u) {
        dstPtr[0] = srcPtr[0];
        dstPtr[1] = srcPtr[1];
        dstPtr[2] = srcPtr[2];
        dstPtr[3] = srcPtr[3];

        size -= 4u;
        srcPtr += 4;
        dstPtr += 4;
    }
    // Copy the remaining 0-3 tail bytes; the cases deliberately fall through.
    switch (size) {
        case 3:
            dstPtr[2] = srcPtr[2];
        case 2:
            dstPtr[1] = srcPtr[1];
        case 1:
            dstPtr[0] = srcPtr[0];
    }

    return dst;
}
static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);

    while (size >= 4u) {
        dstPtr[0] = val;
        dstPtr[1] = val;
        dstPtr[2] = val;
        dstPtr[3] = val;

        size -= 4u;
        dstPtr += 4;
    }
    // Set the remaining 0-3 tail bytes; the cases deliberately fall through.
    switch (size) {
        case 3:
            dstPtr[2] = val;
        case 2:
            dstPtr[1] = val;
        case 1:
            dstPtr[0] = val;
    }

    return dst;
}
#ifndef __OPENMP_AMDGCN__
static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
    return __hip_hc_memcpy(dst, src, size);
}

static inline __device__ void* memset(void* ptr, int val, size_t size) {
    unsigned char val8 = static_cast<unsigned char>(val);
    return __hip_hc_memset(ptr, val8, size);
}
#endif // !__OPENMP_AMDGCN__

#endif // HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H