23 #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
24 #define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
33 #include <hip/hip_vector_types.h>
37 #if __HIP_CLANG_ONLY__
38 extern "C" __device__
int printf(
const char *fmt, ...);
40 template <
typename... All>
41 static inline __device__
void printf(
const char* format, All... all) {}
42 #endif // __HIP_CLANG_ONLY__
49 __device__
static inline unsigned int __popc(
unsigned int input) {
50 return __builtin_popcount(input);
52 __device__
static inline unsigned int __popcll(
unsigned long long int input) {
53 return __builtin_popcountll(input);
56 __device__
static inline int __clz(
int input) {
57 return __ockl_clz_u32((uint)input);
60 __device__
static inline int __clzll(
long long int input) {
61 return __ockl_clz_u64((ullong)input);
64 __device__
static inline unsigned int __ffs(
unsigned int input) {
65 return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
68 __device__
static inline unsigned int __ffsll(
unsigned long long int input) {
69 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
72 __device__
static inline unsigned int __ffs(
int input) {
73 return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
76 __device__
static inline unsigned int __ffsll(
long long int input) {
77 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
80 __device__
static inline unsigned int __brev(
unsigned int input) {
81 return __builtin_bitreverse32(input);
84 __device__
static inline unsigned long long int __brevll(
unsigned long long int input) {
85 return __builtin_bitreverse64(input);
88 __device__
static inline unsigned int __lastbit_u32_u64(uint64_t input) {
89 return input == 0 ? -1 : __builtin_ctzl(input);
92 __device__
static inline unsigned int __bitextract_u32(
unsigned int src0,
unsigned int src1,
unsigned int src2) {
93 uint32_t offset = src1 & 31;
94 uint32_t width = src2 & 31;
95 return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
98 __device__
static inline uint64_t __bitextract_u64(uint64_t src0,
unsigned int src1,
unsigned int src2) {
99 uint64_t offset = src1 & 63;
100 uint64_t width = src2 & 63;
101 return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
104 __device__
static inline unsigned int __bitinsert_u32(
unsigned int src0,
unsigned int src1,
unsigned int src2,
unsigned int src3) {
105 uint32_t offset = src2 & 31;
106 uint32_t width = src3 & 31;
107 uint32_t mask = (1 << width) - 1;
108 return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
111 __device__
static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1,
unsigned int src2,
unsigned int src3) {
112 uint64_t offset = src2 & 63;
113 uint64_t width = src3 & 63;
114 uint64_t mask = (1ULL << width) - 1;
115 return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
118 __device__
static unsigned int __byte_perm(
unsigned int x,
unsigned int y,
unsigned int s);
119 __device__
static unsigned int __hadd(
int x,
int y);
120 __device__
static int __mul24(
int x,
int y);
121 __device__
static long long int __mul64hi(
long long int x,
long long int y);
122 __device__
static int __mulhi(
int x,
int y);
123 __device__
static int __rhadd(
int x,
int y);
124 __device__
static unsigned int __sad(
int x,
int y,
unsigned int z);
125 __device__
static unsigned int __uhadd(
unsigned int x,
unsigned int y);
126 __device__
static int __umul24(
unsigned int x,
unsigned int y);
127 __device__
static unsigned long long int __umul64hi(
unsigned long long int x,
unsigned long long int y);
128 __device__
static unsigned int __umulhi(
unsigned int x,
unsigned int y);
129 __device__
static unsigned int __urhadd(
unsigned int x,
unsigned int y);
130 __device__
static unsigned int __usad(
unsigned int x,
unsigned int y,
unsigned int z);
137 } __attribute__((aligned(4)));
144 } __attribute__((aligned(8)));
147 static inline unsigned int __byte_perm(
unsigned int x,
unsigned int y,
unsigned int s) {
154 cHoldOut.c[0] = cHoldVal.c[cHoldKey.c[0]];
155 cHoldOut.c[1] = cHoldVal.c[cHoldKey.c[1]];
156 cHoldOut.c[2] = cHoldVal.c[cHoldKey.c[2]];
157 cHoldOut.c[3] = cHoldVal.c[cHoldKey.c[3]];
__device__
// Signed average of two ints, rounding toward negative infinity
// (CUDA __hadd semantics). Return type is unsigned int to match the
// forward declaration earlier in this file.
//
// Fixes in this revision:
//  - the old code masked the sign with 0x8000000 (bit 27) instead of
//    0x80000000 (bit 31);
//  - it combined the halves with logical `||` instead of bitwise `|`,
//    so the function returned only 0 or 1;
//  - the intermediate x + y could overflow signed int.
// (x & y) + ((x ^ y) >> 1) computes (x + y) >> 1 without overflow:
// bits common to both operands are added at full weight, differing bits
// at half weight; the arithmetic shift gives floor() for negative sums.
static inline unsigned int __hadd(
    int x,
    int y) {
    return (x & y) + ((x ^ y) >> 1);
}
168 __device__
static inline int __mul24(
int x,
int y) {
169 return __ockl_mul24_i32(x, y);
172 __device__
static inline long long __mul64hi(
long long int x,
long long int y) {
173 ulong x0 = (ulong)x & 0xffffffffUL;
175 ulong y0 = (ulong)y & 0xffffffffUL;
178 long t = x1*y0 + (z0 >> 32);
179 long z1 = t & 0xffffffffL;
182 return x1*y1 + z2 + (z1 >> 32);
185 __device__
static inline int __mulhi(
int x,
int y) {
186 return __ockl_mul_hi_i32(x, y);
__device__
// Signed average of two ints, rounding toward positive infinity
// (CUDA __rhadd semantics), i.e. (x + y + 1) >> 1 without overflow.
//
// Fixes in this revision:
//  - the old code masked the sign with 0x8000000 (bit 27) instead of
//    0x80000000 (bit 31);
//  - it combined the halves with logical `||` instead of bitwise `|`,
//    so the function returned only 0 or 1;
//  - the intermediate x + y could overflow signed int.
// (x | y) - ((x ^ y) >> 1) is the overflow-free rounded-up average:
// it is the rounded-down average plus one whenever the low bits differ.
static inline int __rhadd(
    int x,
    int y) {
    return (x | y) - ((x ^ y) >> 1);
}
195 __device__
static inline unsigned int __sad(
int x,
int y,
unsigned int z) {
196 return x > y ? x - y + z : y - x + z;
198 __device__
static inline unsigned int __uhadd(
unsigned int x,
unsigned int y) {
201 __device__
static inline int __umul24(
unsigned int x,
unsigned int y) {
202 return __ockl_mul24_u32(x, y);
206 static inline unsigned long long __umul64hi(
unsigned long long int x,
unsigned long long int y) {
207 ulong x0 = x & 0xffffffffUL;
209 ulong y0 = y & 0xffffffffUL;
212 ulong t = x1*y0 + (z0 >> 32);
213 ulong z1 = t & 0xffffffffUL;
216 return x1*y1 + z2 + (z1 >> 32);
219 __device__
static inline unsigned int __umulhi(
unsigned int x,
unsigned int y) {
220 return __ockl_mul_hi_u32(x, y);
222 __device__
static inline unsigned int __urhadd(
unsigned int x,
unsigned int y) {
223 return (x + y + 1) >> 1;
225 __device__
static inline unsigned int __usad(
unsigned int x,
unsigned int y,
unsigned int z) {
226 return __ockl_sadd_u32(x, y, z);
229 __device__
static inline unsigned int __lane_id() {
230 return __builtin_amdgcn_mbcnt_hi(
231 -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
// Thin wrapper over the amdgcn mbcnt_lo builtin; both operands are
// forwarded unchanged.
static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y)
{
    return __builtin_amdgcn_mbcnt_lo(x, y);
}
// Thin wrapper over the amdgcn mbcnt_hi builtin; both operands are
// forwarded unchanged.
static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y)
{
    return __builtin_amdgcn_mbcnt_hi(x, y);
}
244 __device__
static inline unsigned __hip_ds_bpermute(
int index,
unsigned src) {
245 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
246 tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
250 __device__
static inline float __hip_ds_bpermutef(
int index,
float src) {
251 union {
int i;
unsigned u;
float f; } tmp; tmp.f = src;
252 tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
256 __device__
static inline unsigned __hip_ds_permute(
int index,
unsigned src) {
257 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
258 tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
262 __device__
static inline float __hip_ds_permutef(
int index,
float src) {
263 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
264 tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
268 #define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
269 #define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
271 template <
int pattern>
272 __device__
static inline unsigned __hip_ds_swizzle_N(
unsigned int src) {
273 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
274 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
278 template <
int pattern>
279 __device__
static inline float __hip_ds_swizzlef_N(
float src) {
280 union {
int i;
unsigned u;
float f; } tmp; tmp.f = src;
281 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
285 #define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
286 __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
288 template <
int dpp_ctrl,
int row_mask,
int bank_mask,
bool bound_ctrl>
289 __device__
static inline int __hip_move_dpp_N(
int src) {
290 return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
297 #ifndef __AMDGCN_WAVEFRONT_SIZE
298 #if __gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__
299 #define __AMDGCN_WAVEFRONT_SIZE 32
301 #define __AMDGCN_WAVEFRONT_SIZE 64
304 static constexpr
int warpSize = __AMDGCN_WAVEFRONT_SIZE;
308 int __shfl(
int var,
int src_lane,
int width = warpSize) {
309 int self = __lane_id();
310 int index = src_lane + (
self & ~(width-1));
311 return __builtin_amdgcn_ds_bpermute(index<<2, var);
315 unsigned int __shfl(
unsigned int var,
int src_lane,
int width = warpSize) {
316 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
317 tmp.i = __shfl(tmp.i, src_lane, width);
322 float __shfl(
float var,
int src_lane,
int width = warpSize) {
323 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
324 tmp.i = __shfl(tmp.i, src_lane, width);
329 double __shfl(
double var,
int src_lane,
int width = warpSize) {
330 static_assert(
sizeof(
double) == 2 *
sizeof(
int),
"");
331 static_assert(
sizeof(
double) ==
sizeof(uint64_t),
"");
333 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
334 tmp[0] = __shfl(tmp[0], src_lane, width);
335 tmp[1] = __shfl(tmp[1], src_lane, width);
337 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
338 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
343 long __shfl(
long var,
int src_lane,
int width = warpSize)
346 static_assert(
sizeof(
long) == 2 *
sizeof(
int),
"");
347 static_assert(
sizeof(
long) ==
sizeof(uint64_t),
"");
349 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
350 tmp[0] = __shfl(tmp[0], src_lane, width);
351 tmp[1] = __shfl(tmp[1], src_lane, width);
353 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
354 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
357 static_assert(
sizeof(
long) ==
sizeof(
int),
"");
358 return static_cast<long>(__shfl(
static_cast<int>(var), src_lane, width));
363 unsigned long __shfl(
unsigned long var,
int src_lane,
int width = warpSize) {
365 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
366 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
368 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
369 tmp[0] = __shfl(tmp[0], src_lane, width);
370 tmp[1] = __shfl(tmp[1], src_lane, width);
372 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
373 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
376 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
377 return static_cast<unsigned long>(__shfl(
static_cast<unsigned int>(var), src_lane, width));
382 long long __shfl(
long long var,
int src_lane,
int width = warpSize)
384 static_assert(
sizeof(
long long) == 2 *
sizeof(
int),
"");
385 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
387 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
388 tmp[0] = __shfl(tmp[0], src_lane, width);
389 tmp[1] = __shfl(tmp[1], src_lane, width);
391 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
392 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
397 unsigned long long __shfl(
unsigned long long var,
int src_lane,
int width = warpSize) {
398 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
399 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
401 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
402 tmp[0] = __shfl(tmp[0], src_lane, width);
403 tmp[1] = __shfl(tmp[1], src_lane, width);
405 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
406 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
412 int __shfl_up(
int var,
unsigned int lane_delta,
int width = warpSize) {
413 int self = __lane_id();
414 int index =
self - lane_delta;
415 index = (index < (
self & ~(width-1)))?
self:index;
416 return __builtin_amdgcn_ds_bpermute(index<<2, var);
420 unsigned int __shfl_up(
unsigned int var,
unsigned int lane_delta,
int width = warpSize) {
421 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
422 tmp.i = __shfl_up(tmp.i, lane_delta, width);
427 float __shfl_up(
float var,
unsigned int lane_delta,
int width = warpSize) {
428 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
429 tmp.i = __shfl_up(tmp.i, lane_delta, width);
434 double __shfl_up(
double var,
unsigned int lane_delta,
int width = warpSize) {
435 static_assert(
sizeof(
double) == 2 *
sizeof(
int),
"");
436 static_assert(
sizeof(
double) ==
sizeof(uint64_t),
"");
438 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
439 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
440 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
442 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
443 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
448 long __shfl_up(
long var,
unsigned int lane_delta,
int width = warpSize)
451 static_assert(
sizeof(
long) == 2 *
sizeof(
int),
"");
452 static_assert(
sizeof(
long) ==
sizeof(uint64_t),
"");
454 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
455 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
456 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
458 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
459 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
462 static_assert(
sizeof(
long) ==
sizeof(
int),
"");
463 return static_cast<long>(__shfl_up(
static_cast<int>(var), lane_delta, width));
469 unsigned long __shfl_up(
unsigned long var,
unsigned int lane_delta,
int width = warpSize)
472 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
473 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
475 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
476 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
477 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
479 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
480 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
483 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
484 return static_cast<unsigned long>(__shfl_up(
static_cast<unsigned int>(var), lane_delta, width));
490 long long __shfl_up(
long long var,
unsigned int lane_delta,
int width = warpSize)
492 static_assert(
sizeof(
long long) == 2 *
sizeof(
int),
"");
493 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
494 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
495 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
496 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
497 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
498 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
504 unsigned long long __shfl_up(
unsigned long long var,
unsigned int lane_delta,
int width = warpSize)
506 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
507 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
508 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
509 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
510 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
511 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
512 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
518 int __shfl_down(
int var,
unsigned int lane_delta,
int width = warpSize) {
519 int self = __lane_id();
520 int index =
self + lane_delta;
521 index = (int)((
self&(width-1))+lane_delta) >= width?
self:index;
522 return __builtin_amdgcn_ds_bpermute(index<<2, var);
526 unsigned int __shfl_down(
unsigned int var,
unsigned int lane_delta,
int width = warpSize) {
527 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
528 tmp.i = __shfl_down(tmp.i, lane_delta, width);
533 float __shfl_down(
float var,
unsigned int lane_delta,
int width = warpSize) {
534 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
535 tmp.i = __shfl_down(tmp.i, lane_delta, width);
540 double __shfl_down(
double var,
unsigned int lane_delta,
int width = warpSize) {
541 static_assert(
sizeof(
double) == 2 *
sizeof(
int),
"");
542 static_assert(
sizeof(
double) ==
sizeof(uint64_t),
"");
544 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
545 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
546 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
548 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
549 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
554 long __shfl_down(
long var,
unsigned int lane_delta,
int width = warpSize)
557 static_assert(
sizeof(
long) == 2 *
sizeof(
int),
"");
558 static_assert(
sizeof(
long) ==
sizeof(uint64_t),
"");
560 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
561 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
562 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
564 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
565 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
568 static_assert(
sizeof(
long) ==
sizeof(
int),
"");
569 return static_cast<long>(__shfl_down(
static_cast<int>(var), lane_delta, width));
574 unsigned long __shfl_down(
unsigned long var,
unsigned int lane_delta,
int width = warpSize)
577 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
578 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
580 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
581 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
582 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
584 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
585 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
588 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
589 return static_cast<unsigned long>(__shfl_down(
static_cast<unsigned int>(var), lane_delta, width));
594 long long __shfl_down(
long long var,
unsigned int lane_delta,
int width = warpSize)
596 static_assert(
sizeof(
long long) == 2 *
sizeof(
int),
"");
597 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
598 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
599 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
600 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
601 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
602 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
607 unsigned long long __shfl_down(
unsigned long long var,
unsigned int lane_delta,
int width = warpSize)
609 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
610 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
611 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
612 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
613 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
614 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
615 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
621 int __shfl_xor(
int var,
int lane_mask,
int width = warpSize) {
622 int self = __lane_id();
623 int index =
self^lane_mask;
624 index = index >= ((
self+width)&~(width-1))?
self:index;
625 return __builtin_amdgcn_ds_bpermute(index<<2, var);
629 unsigned int __shfl_xor(
unsigned int var,
int lane_mask,
int width = warpSize) {
630 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
631 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
636 float __shfl_xor(
float var,
int lane_mask,
int width = warpSize) {
637 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
638 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
643 double __shfl_xor(
double var,
int lane_mask,
int width = warpSize) {
644 static_assert(
sizeof(
double) == 2 *
sizeof(
int),
"");
645 static_assert(
sizeof(
double) ==
sizeof(uint64_t),
"");
647 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
648 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
649 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
651 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
652 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
657 long __shfl_xor(
long var,
int lane_mask,
int width = warpSize)
660 static_assert(
sizeof(
long) == 2 *
sizeof(
int),
"");
661 static_assert(
sizeof(
long) ==
sizeof(uint64_t),
"");
663 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
664 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
665 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
667 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
668 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
671 static_assert(
sizeof(
long) ==
sizeof(
int),
"");
672 return static_cast<long>(__shfl_xor(
static_cast<int>(var), lane_mask, width));
677 unsigned long __shfl_xor(
unsigned long var,
int lane_mask,
int width = warpSize)
680 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
681 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
683 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
684 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
685 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
687 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
688 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
691 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
692 return static_cast<unsigned long>(__shfl_xor(
static_cast<unsigned int>(var), lane_mask, width));
697 long long __shfl_xor(
long long var,
int lane_mask,
int width = warpSize)
699 static_assert(
sizeof(
long long) == 2 *
sizeof(
int),
"");
700 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
701 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
702 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
703 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
704 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
705 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
710 unsigned long long __shfl_xor(
unsigned long long var,
int lane_mask,
int width = warpSize)
712 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
713 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
714 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
715 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
716 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
717 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
718 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
721 #define MASK1 0x00ff00ff
722 #define MASK2 0xff00ff00
726 unsigned one1 = in1.w & MASK1;
727 unsigned one2 = in2.w & MASK1;
728 out.w = (one1 + one2) & MASK1;
729 one1 = in1.w & MASK2;
730 one2 = in2.w & MASK2;
731 out.w = out.w | ((one1 + one2) & MASK2);
737 unsigned one1 = in1.w & MASK1;
738 unsigned one2 = in2.w & MASK1;
739 out.w = (one1 - one2) & MASK1;
740 one1 = in1.w & MASK2;
741 one2 = in2.w & MASK2;
742 out.w = out.w | ((one1 - one2) & MASK2);
748 unsigned one1 = in1.w & MASK1;
749 unsigned one2 = in2.w & MASK1;
750 out.w = (one1 * one2) & MASK1;
751 one1 = in1.w & MASK2;
752 one2 = in2.w & MASK2;
753 out.w = out.w | ((one1 * one2) & MASK2);
// double -> float conversions. The _rd/_rn/_ru/_rz suffixes request a
// rounding direction (down / nearest / up / toward zero), but all four
// variants perform the same default conversion here.
// NOTE(review): directed rounding is not honored by these stubs.
// The cast is now written as (float): the previous (double) cast was a
// no-op and the narrowing happened implicitly at the return statement,
// so behavior is unchanged — the code just says what it does.
__device__
static inline float __double2float_rd(double x) { return (float)x; }
__device__
static inline float __double2float_rn(double x) { return (float)x; }
__device__
static inline float __double2float_ru(double x) { return (float)x; }
__device__
static inline float __double2float_rz(double x) { return (float)x; }
767 __device__
static inline int __double2hiint(
double x) {
768 static_assert(
sizeof(
double) == 2 *
sizeof(
int),
"");
771 __builtin_memcpy(tmp, &x,
sizeof(tmp));
775 __device__
static inline int __double2loint(
double x) {
776 static_assert(
sizeof(
double) == 2 *
sizeof(
int),
"");
779 __builtin_memcpy(tmp, &x,
sizeof(tmp));
// double -> int conversions that honor the requested rounding mode by
// rounding in floating point first and then truncating, mirroring the
// __float2int_* implementations below (which use __ocml_*_f32).
// Previously all four variants used a bare cast, so _rd/_rn/_ru behaved
// like _rz for non-integral inputs.
// OCML f64 helpers re-declared so this block is self-contained.
extern "C" __device__ double __ocml_floor_f64(double);
extern "C" __device__ double __ocml_rint_f64(double);
extern "C" __device__ double __ocml_ceil_f64(double);
__device__
static inline int __double2int_rd(double x) { return (int)__ocml_floor_f64(x); }
__device__
static inline int __double2int_rn(double x) { return (int)__ocml_rint_f64(x); }
__device__
static inline int __double2int_ru(double x) { return (int)__ocml_ceil_f64(x); }
__device__
// Toward-zero is exactly what the C++ cast does; no pre-rounding needed.
static inline int __double2int_rz(double x) { return (int)x; }
// double -> long long conversions that honor the requested rounding mode,
// consistent with the __float2int_* pattern used elsewhere in this file.
// Previously all four variants used a bare (truncating) cast.
// OCML f64 helpers re-declared so this block is self-contained.
extern "C" __device__ double __ocml_floor_f64(double);
extern "C" __device__ double __ocml_rint_f64(double);
extern "C" __device__ double __ocml_ceil_f64(double);
__device__
static inline long long int __double2ll_rd(double x) {
    return (long long int)__ocml_floor_f64(x);
}
__device__
static inline long long int __double2ll_rn(double x) {
    return (long long int)__ocml_rint_f64(x);
}
__device__
static inline long long int __double2ll_ru(double x) {
    return (long long int)__ocml_ceil_f64(x);
}
__device__
// Toward-zero matches the cast's own truncation.
static inline long long int __double2ll_rz(double x) {
    return (long long int)x;
}
// double -> unsigned int conversions that honor the requested rounding
// mode, consistent with the __float2int_* pattern used in this file.
// Previously all four variants used a bare (truncating) cast.
// Out-of-range / negative inputs are as (un)defined as they were before.
// OCML f64 helpers re-declared so this block is self-contained.
extern "C" __device__ double __ocml_floor_f64(double);
extern "C" __device__ double __ocml_rint_f64(double);
extern "C" __device__ double __ocml_ceil_f64(double);
__device__
static inline unsigned int __double2uint_rd(double x) {
    return (unsigned int)__ocml_floor_f64(x);
}
__device__
static inline unsigned int __double2uint_rn(double x) {
    return (unsigned int)__ocml_rint_f64(x);
}
__device__
static inline unsigned int __double2uint_ru(double x) {
    return (unsigned int)__ocml_ceil_f64(x);
}
__device__
// Toward-zero matches the cast's own truncation.
static inline unsigned int __double2uint_rz(double x) {
    return (unsigned int)x;
}
799 __device__
static inline unsigned long long int __double2ull_rd(
double x) {
800 return (
unsigned long long int)x;
802 __device__
static inline unsigned long long int __double2ull_rn(
double x) {
803 return (
unsigned long long int)x;
805 __device__
static inline unsigned long long int __double2ull_ru(
double x) {
806 return (
unsigned long long int)x;
808 __device__
static inline unsigned long long int __double2ull_rz(
double x) {
809 return (
unsigned long long int)x;
812 __device__
static inline long long int __double_as_longlong(
double x) {
813 static_assert(
sizeof(
long long) ==
sizeof(
double),
"");
816 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
// float -> int conversions with an explicit rounding step before the
// truncating cast: _rd floors, _rn rounds to nearest (rint), _ru ceils,
// _rz truncates toward zero.
__device__
static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
__device__
static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
__device__
static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
__device__
static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
// float -> long long conversions that honor the requested rounding mode,
// consistent with the __float2int_* implementations above which use the
// same __ocml_*_f32 calls. Previously all four variants used a bare
// (truncating) cast, so _rd/_rn/_ru behaved like _rz.
__device__
static inline long long int __float2ll_rd(float x) {
    return (long long int)__ocml_floor_f32(x);
}
__device__
static inline long long int __float2ll_rn(float x) {
    return (long long int)__ocml_rint_f32(x);
}
__device__
static inline long long int __float2ll_ru(float x) {
    return (long long int)__ocml_ceil_f32(x);
}
__device__
// Toward-zero matches the cast's own truncation.
static inline long long int __float2ll_rz(float x) {
    return (long long int)x;
}
// float -> unsigned int conversions that honor the requested rounding
// mode, consistent with the __float2int_* implementations above.
// Previously all four variants used a bare (truncating) cast.
// Out-of-range / negative inputs are as (un)defined as they were before.
__device__
static inline unsigned int __float2uint_rd(float x) {
    return (unsigned int)__ocml_floor_f32(x);
}
__device__
static inline unsigned int __float2uint_rn(float x) {
    return (unsigned int)__ocml_rint_f32(x);
}
__device__
static inline unsigned int __float2uint_ru(float x) {
    return (unsigned int)__ocml_ceil_f32(x);
}
__device__
// Toward-zero matches the cast's own truncation.
static inline unsigned int __float2uint_rz(float x) {
    return (unsigned int)x;
}
850 __device__
static inline unsigned long long int __float2ull_rd(
float x) {
851 return (
unsigned long long int)x;
853 __device__
static inline unsigned long long int __float2ull_rn(
float x) {
854 return (
unsigned long long int)x;
856 __device__
static inline unsigned long long int __float2ull_ru(
float x) {
857 return (
unsigned long long int)x;
859 __device__
static inline unsigned long long int __float2ull_rz(
float x) {
860 return (
unsigned long long int)x;
863 __device__
static inline int __float_as_int(
float x) {
864 static_assert(
sizeof(
int) ==
sizeof(
float),
"");
867 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
872 __device__
static inline unsigned int __float_as_uint(
float x) {
873 static_assert(
sizeof(
unsigned int) ==
sizeof(
float),
"");
876 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
881 __device__
static inline double __hiloint2double(
int hi,
int lo) {
882 static_assert(
sizeof(
double) ==
sizeof(uint64_t),
"");
884 uint64_t tmp0 = (
static_cast<uint64_t
>(hi) << 32ull) |
static_cast<uint32_t
>(lo);
886 __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
// int -> double. Every 32-bit int is exactly representable in a double,
// so the _rn (round-to-nearest) suffix never matters.
__device__
static inline double __int2double_rn(int x) { return (double)x; }
// int -> float conversions. NOTE(review): the _rd/_rn/_ru/_rz suffixes
// request directed rounding, but all four variants use the plain cast
// (default rounding); inputs with more than 24 significant bits may not
// round as the suffix implies. Behavior kept identical to the original.
__device__
static inline float __int2float_rd(int x) { return (float)x; }
__device__
static inline float __int2float_rn(int x) { return (float)x; }
__device__
static inline float __int2float_ru(int x) { return (float)x; }
__device__
static inline float __int2float_rz(int x) { return (float)x; }
898 __device__
static inline float __int_as_float(
int x) {
899 static_assert(
sizeof(
float) ==
sizeof(
int),
"");
902 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
// long long -> double conversions. NOTE(review): all four rounding-mode
// variants use the plain cast (default rounding); values with more than
// 53 significant bits may not round as the suffix implies. Behavior kept
// identical to the original.
__device__
static inline double __ll2double_rd(long long int x) { return (double)x; }
__device__
static inline double __ll2double_rn(long long int x) { return (double)x; }
__device__
static inline double __ll2double_ru(long long int x) { return (double)x; }
__device__
static inline double __ll2double_rz(long long int x) { return (double)x; }
// long long -> float conversions. NOTE(review): all four rounding-mode
// variants use the plain cast (default rounding); values with more than
// 24 significant bits may not round as the suffix implies. Behavior kept
// identical to the original.
__device__
static inline float __ll2float_rd(long long int x) { return (float)x; }
__device__
static inline float __ll2float_rn(long long int x) { return (float)x; }
__device__
static inline float __ll2float_ru(long long int x) { return (float)x; }
__device__
static inline float __ll2float_rz(long long int x) { return (float)x; }
917 __device__
static inline double __longlong_as_double(
long long int x) {
918 static_assert(
sizeof(
double) ==
sizeof(
long long),
"");
921 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
// unsigned int -> double (exact: every 32-bit value fits in a double's
// 53-bit mantissa, so _rn never has anything to round).
// Fixed: the parameter was declared `int`, so inputs >= 2^31 were
// reinterpreted as negative and produced negative doubles despite the
// "uint2double" name. Callers passing unsigned values are unaffected
// except that large values now convert correctly.
__device__
static inline double __uint2double_rn(unsigned int x) { return (double)x; }
// unsigned int -> float conversions. NOTE(review): all four
// rounding-mode variants use the plain cast (default rounding); values
// with more than 24 significant bits may not round as the suffix
// implies. Behavior kept identical to the original.
__device__
static inline float __uint2float_rd(unsigned int x) { return (float)x; }
__device__
static inline float __uint2float_rn(unsigned int x) { return (float)x; }
__device__
static inline float __uint2float_ru(unsigned int x) { return (float)x; }
__device__
static inline float __uint2float_rz(unsigned int x) { return (float)x; }
933 __device__
static inline float __uint_as_float(
unsigned int x) {
934 static_assert(
sizeof(
float) ==
sizeof(
unsigned int),
"");
937 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
// unsigned long long -> double conversions. NOTE(review): all four
// rounding-mode variants use the plain cast (default rounding); values
// with more than 53 significant bits may not round as the suffix
// implies. Behavior kept identical to the original.
__device__
static inline double __ull2double_rd(unsigned long long int x) { return (double)x; }
__device__
static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
__device__
static inline double __ull2double_ru(unsigned long long int x) { return (double)x; }
__device__
static inline double __ull2double_rz(unsigned long long int x) { return (double)x; }
// unsigned long long -> float conversions. NOTE(review): all four
// rounding-mode variants use the plain cast (default rounding); values
// with more than 24 significant bits may not round as the suffix
// implies. Behavior kept identical to the original.
__device__
static inline float __ull2float_rd(unsigned long long int x) { return (float)x; }
__device__
static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
__device__
static inline float __ull2float_ru(unsigned long long int x) { return (float)x; }
__device__
static inline float __ull2float_rz(unsigned long long int x) { return (float)x; }
952 #if __HIP_CLANG_ONLY__
955 __device__
long long int __clock64();
956 __device__
long long int __clock();
957 __device__
long long int clock64();
958 __device__
long long int clock();
960 __device__
void __named_sync(
int a,
int b);
962 #ifdef __HIP_DEVICE_COMPILE__
966 inline __attribute((always_inline))
967 long long int __clock64() {
968 return (
long long int) __builtin_readcyclecounter();
// Device-side clock alias: defers to __clock64() (cycle counter).
// Spelling normalized from `__attribute` to the documented
// `__attribute__` form, matching clock64()/clock() below.
inline __attribute__((always_inline))
long long int __clock() { return __clock64(); }
// Public clock64(): forwards to the __clock64() implementation above.
inline __attribute__((always_inline))
long long int clock64() { return __clock64(); }
// Public clock(): forwards to __clock(), and through it to __clock64().
inline __attribute__((always_inline))
long long int clock() { return __clock(); }
// Named-barrier stub: the `a` and `b` arguments are accepted for
// interface compatibility but ignored; a full workgroup barrier is
// issued instead.
void __named_sync(int a, int b)
{
    (void)a;
    (void)b;
    __builtin_amdgcn_s_barrier();
}
988 #endif // __HIP_DEVICE_COMPILE__
993 int __all(
int predicate) {
994 return __ockl_wfall_i32(predicate);
999 int __any(
int predicate) {
1000 return __ockl_wfany_i32(predicate);
1008 unsigned long long int __ballot(
int predicate) {
1009 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
1014 unsigned long long int __ballot64(
int predicate) {
1015 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
1021 uint64_t __lanemask_gt()
1023 uint32_t lane = __ockl_lane_u32();
1026 uint64_t ballot = __ballot64(1);
1027 uint64_t mask = (~((uint64_t)0)) << (lane + 1);
1028 return mask & ballot;
1033 uint64_t __lanemask_lt()
1035 uint32_t lane = __ockl_lane_u32();
1036 int64_t ballot = __ballot64(1);
1037 uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
1038 return mask & ballot;
1043 uint64_t __lanemask_eq()
1045 uint32_t lane = __ockl_lane_u32();
1046 int64_t mask = ((uint64_t)1 << lane);
1051 __device__
inline void* __local_to_generic(
void* p) {
return p; }
1053 #ifdef __HIP_DEVICE_COMPILE__
1056 void* __get_dynamicgroupbaseptr()
1059 return (
char*)__local_to_generic((
void*)__to_local(__llvm_amdgcn_groupstaticsize()));
1063 void* __get_dynamicgroupbaseptr();
1064 #endif // __HIP_DEVICE_COMPILE__
1068 void *__amdgcn_get_dynamicgroupbaseptr() {
1069 return __get_dynamicgroupbaseptr();
1075 static void __threadfence()
1077 __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
1082 static void __threadfence_block()
1084 __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
1089 static void __threadfence_system()
1091 __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
1097 __attribute__((weak))
1099 return __builtin_trap();
#if defined(_WIN32) || defined(_WIN64)
// Device-side handler for MSVC's wide-character assert macro. There is
// no device-side wchar_t printf support, so no message is produced —
// the wavefront just traps. (Body lost in extraction; restored.)
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void _wassert(
    const wchar_t *_msg,
    const wchar_t *_file,
    unsigned _line) {
    __builtin_trap();
}
#endif // defined(_WIN32) || defined(_WIN64)
1114 extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
1115 void __assert_fail(
const char *assertion,
1118 const char *
function)
1120 printf(
"%s:%u: %s: Device-side assertion `%s' failed.\n", file, line,
1121 function, assertion);
1125 extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
1126 void __assertfail(
const char *assertion,
1129 const char *
function,
1139 static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
1142 __atomic_work_item_fence(flags, __memory_order_release, scope);
1143 __builtin_amdgcn_s_barrier();
1144 __atomic_work_item_fence(flags, __memory_order_acquire, scope);
1146 __builtin_amdgcn_s_barrier();
1152 static void __barrier(
int n)
1154 __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
1159 __attribute__((convergent))
1160 void __syncthreads()
1162 __barrier(__CLK_LOCAL_MEM_FENCE);
1167 __attribute__((convergent))
1168 int __syncthreads_count(
int predicate)
1170 return __ockl_wgred_add_i32(!!predicate);
1175 __attribute__((convergent))
1176 int __syncthreads_and(
int predicate)
1178 return __ockl_wgred_and_i32(!!predicate);
1183 __attribute__((convergent))
1184 int __syncthreads_or(
int predicate)
1186 return __ockl_wgred_or_i32(!!predicate);
// Hardware-register id of HW_ID for s_getreg. (This define is consumed
// by __smid below but was lost in extraction; restored. Redefinition
// with an identical token sequence is harmless if the original survives
// elsewhere in the file.)
#define HW_ID               4

// CU (compute unit) id bit-field inside HW_ID.
#define HW_ID_CU_ID_SIZE    4
#define HW_ID_CU_ID_OFFSET  8

// SE (shader engine) id bit-field inside HW_ID.
#define HW_ID_SE_ID_SIZE    2
#define HW_ID_SE_ID_OFFSET  13

// Packs (size, offset, register) into the s_getreg immediate:
// size in bits [15:11], offset in [10:6], register id in [5:0].
#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
1230 unsigned __smid(
void)
1232 unsigned cu_id = __builtin_amdgcn_s_getreg(
1233 GETREG_IMMED(HW_ID_CU_ID_SIZE-1, HW_ID_CU_ID_OFFSET, HW_ID));
1234 unsigned se_id = __builtin_amdgcn_s_getreg(
1235 GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
1238 return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
// Declares a dynamically-sized __shared__ (LDS) array; the actual size
// comes from the kernel-launch shared-memory argument.
#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
1248 #endif //defined(__clang__) && defined(__HIP__)
1252 static inline __device__
void* __hip_hc_memcpy(
void* dst,
const void* src,
size_t size) {
1253 auto dstPtr =
static_cast<unsigned char*
>(dst);
1254 auto srcPtr =
static_cast<const unsigned char*
>(src);
1256 while (size >= 4u) {
1257 dstPtr[0] = srcPtr[0];
1258 dstPtr[1] = srcPtr[1];
1259 dstPtr[2] = srcPtr[2];
1260 dstPtr[3] = srcPtr[3];
1268 dstPtr[2] = srcPtr[2];
1270 dstPtr[1] = srcPtr[1];
1272 dstPtr[0] = srcPtr[0];
1278 static inline __device__
void* __hip_hc_memset(
void* dst,
unsigned char val,
size_t size) {
1279 auto dstPtr =
static_cast<unsigned char*
>(dst);
1281 while (size >= 4u) {
1301 #ifndef __OPENMP_AMDGCN__
1302 static inline __device__
void* memcpy(
void* dst,
const void* src,
size_t size) {
1303 return __hip_hc_memcpy(dst, src, size);
1306 static inline __device__
void* memset(
void* ptr,
int val,
size_t size) {
1307 unsigned char val8 =
static_cast<unsigned char>(val);
1308 return __hip_hc_memset(ptr, val8, size);
1310 #endif // !__OPENMP_AMDGCN__