23 #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H 24 #define HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H 29 #include <hip/hip_runtime_api.h> 33 #include <hip/hip_vector_types.h> 41 __device__
static inline unsigned int __popc(
unsigned int input) {
42 return __builtin_popcount(input);
44 __device__
static inline unsigned int __popcll(
unsigned long long int input) {
45 return __builtin_popcountll(input);
48 __device__
static inline int __clz(
int input) {
49 return __ockl_clz_u32((uint)input);
52 __device__
static inline int __clzll(
long long int input) {
53 return __ockl_clz_u64((ullong)input);
56 __device__
static inline unsigned int __ffs(
unsigned int input) {
57 return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
60 __device__
static inline unsigned int __ffsll(
unsigned long long int input) {
61 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
64 __device__
static inline unsigned int __ffs(
int input) {
65 return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
68 __device__
static inline unsigned int __ffsll(
long long int input) {
69 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
72 __device__
static inline unsigned int __brev(
unsigned int input) {
73 return __llvm_bitrev_b32(input);
76 __device__
static inline unsigned long long int __brevll(
unsigned long long int input) {
77 return __llvm_bitrev_b64(input);
80 __device__
static inline unsigned int __lastbit_u32_u64(uint64_t input) {
81 return input == 0 ? -1 : __builtin_ctzl(input);
84 __device__
static inline unsigned int __bitextract_u32(
unsigned int src0,
unsigned int src1,
unsigned int src2) {
85 uint32_t offset = src1 & 31;
86 uint32_t width = src2 & 31;
87 return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
90 __device__
static inline uint64_t __bitextract_u64(uint64_t src0,
unsigned int src1,
unsigned int src2) {
91 uint64_t offset = src1 & 63;
92 uint64_t width = src2 & 63;
93 return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
96 __device__
static inline unsigned int __bitinsert_u32(
unsigned int src0,
unsigned int src1,
unsigned int src2,
unsigned int src3) {
97 uint32_t offset = src2 & 31;
98 uint32_t width = src3 & 31;
99 uint32_t mask = (1 << width) - 1;
100 return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
103 __device__
static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1,
unsigned int src2,
unsigned int src3) {
104 uint64_t offset = src2 & 63;
105 uint64_t width = src3 & 63;
106 uint64_t mask = (1ULL << width) - 1;
107 return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
110 __device__
static unsigned int __byte_perm(
unsigned int x,
unsigned int y,
unsigned int s);
111 __device__
static unsigned int __hadd(
int x,
int y);
112 __device__
static int __mul24(
int x,
int y);
113 __device__
static long long int __mul64hi(
long long int x,
long long int y);
114 __device__
static int __mulhi(
int x,
int y);
115 __device__
static int __rhadd(
int x,
int y);
116 __device__
static unsigned int __sad(
int x,
int y,
int z);
117 __device__
static unsigned int __uhadd(
unsigned int x,
unsigned int y);
118 __device__
static int __umul24(
unsigned int x,
unsigned int y);
119 __device__
static unsigned long long int __umul64hi(
unsigned long long int x,
unsigned long long int y);
120 __device__
static unsigned int __umulhi(
unsigned int x,
unsigned int y);
121 __device__
static unsigned int __urhadd(
unsigned int x,
unsigned int y);
122 __device__
static unsigned int __usad(
unsigned int x,
unsigned int y,
unsigned int z);
129 } __attribute__((aligned(4)));
136 } __attribute__((aligned(8)));
139 static inline unsigned int __byte_perm(
unsigned int x,
unsigned int y,
unsigned int s) {
146 cHoldOut.c[0] = cHoldVal.c[cHoldKey.c[0]];
147 cHoldOut.c[1] = cHoldVal.c[cHoldKey.c[1]];
148 cHoldOut.c[2] = cHoldVal.c[cHoldKey.c[2]];
149 cHoldOut.c[3] = cHoldVal.c[cHoldKey.c[3]];
153 __device__
static inline unsigned int __hadd(
int x,
int y) {
155 int sign = z & 0x8000000;
156 int value = z & 0x7FFFFFFF;
157 return ((value) >> 1 || sign);
160 __device__
static inline int __mul24(
int x,
int y) {
161 return __ockl_mul24_i32(x, y);
164 __device__
static inline long long __mul64hi(
long long int x,
long long int y) {
165 ulong x0 = (ulong)x & 0xffffffffUL;
167 ulong y0 = (ulong)y & 0xffffffffUL;
170 long t = x1*y0 + (z0 >> 32);
171 long z1 = t & 0xffffffffL;
174 return x1*y1 + z2 + (z1 >> 32);
177 __device__
static inline int __mulhi(
int x,
int y) {
178 return __ockl_mul_hi_i32(x, y);
181 __device__
static inline int __rhadd(
int x,
int y) {
183 int sign = z & 0x8000000;
184 int value = z & 0x7FFFFFFF;
185 return ((value) >> 1 || sign);
187 __device__
static inline unsigned int __sad(
int x,
int y,
int z) {
188 return x > y ? x - y + z : y - x + z;
190 __device__
static inline unsigned int __uhadd(
unsigned int x,
unsigned int y) {
193 __device__
static inline int __umul24(
unsigned int x,
unsigned int y) {
194 return __ockl_mul24_u32(x, y);
198 static inline unsigned long long __umul64hi(
unsigned long long int x,
unsigned long long int y) {
199 ulong x0 = x & 0xffffffffUL;
201 ulong y0 = y & 0xffffffffUL;
204 ulong t = x1*y0 + (z0 >> 32);
205 ulong z1 = t & 0xffffffffUL;
208 return x1*y1 + z2 + (z1 >> 32);
211 __device__
static inline unsigned int __umulhi(
unsigned int x,
unsigned int y) {
212 return __ockl_mul_hi_u32(x, y);
214 __device__
static inline unsigned int __urhadd(
unsigned int x,
unsigned int y) {
215 return (x + y + 1) >> 1;
217 __device__
static inline unsigned int __usad(
unsigned int x,
unsigned int y,
unsigned int z) {
218 return __ockl_sad_u32(x, y, z);
221 __device__
static inline unsigned int __lane_id() {
return __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); }
227 __device__
static inline unsigned __hip_ds_bpermute(
int index,
unsigned src) {
228 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
229 tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
233 __device__
static inline float __hip_ds_bpermutef(
int index,
float src) {
234 union {
int i;
unsigned u;
float f; } tmp; tmp.f = src;
235 tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
239 __device__
static inline unsigned __hip_ds_permute(
int index,
unsigned src) {
240 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
241 tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
245 __device__
static inline float __hip_ds_permutef(
int index,
float src) {
246 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
247 tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
251 #define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src)) 252 #define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src)) 254 template <
int pattern>
255 __device__
static inline unsigned __hip_ds_swizzle_N(
unsigned int src) {
256 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
258 tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
260 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
265 template <
int pattern>
266 __device__
static inline float __hip_ds_swizzlef_N(
float src) {
267 union {
int i;
unsigned u;
float f; } tmp; tmp.f = src;
269 tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
271 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
276 #define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \ 277 __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src)) 279 template <
int dpp_ctrl,
int row_mask,
int bank_mask,
bool bound_ctrl>
280 __device__
static inline int __hip_move_dpp_N(
int src) {
281 return __llvm_amdgcn_move_dpp(src, dpp_ctrl, row_mask, bank_mask,
285 static constexpr
int warpSize = 64;
289 int __shfl(
int var,
int src_lane,
int width = warpSize) {
290 int self = __lane_id();
291 int index = src_lane + (
self & ~(width-1));
292 return __llvm_amdgcn_ds_bpermute(index<<2, var);
296 unsigned int __shfl(
unsigned int var,
int src_lane,
int width = warpSize) {
297 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
298 tmp.i = __shfl(tmp.i, src_lane, width);
303 float __shfl(
float var,
int src_lane,
int width = warpSize) {
304 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
305 tmp.i = __shfl(tmp.i, src_lane, width);
310 double __shfl(
double var,
int src_lane,
int width = warpSize) {
311 static_assert(
sizeof(
double) == 2 *
sizeof(
int),
"");
312 static_assert(
sizeof(
double) ==
sizeof(uint64_t),
"");
314 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
315 tmp[0] = __shfl(tmp[0], src_lane, width);
316 tmp[1] = __shfl(tmp[1], src_lane, width);
318 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
319 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
324 long __shfl(
long var,
int src_lane,
int width = warpSize)
327 static_assert(
sizeof(
long) == 2 *
sizeof(
int),
"");
328 static_assert(
sizeof(
long) ==
sizeof(uint64_t),
"");
330 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
331 tmp[0] = __shfl(tmp[0], src_lane, width);
332 tmp[1] = __shfl(tmp[1], src_lane, width);
334 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
335 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
338 static_assert(
sizeof(
long) ==
sizeof(
int),
"");
339 return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
344 long long __shfl(
long long var,
int src_lane,
int width = warpSize)
346 static_assert(
sizeof(
long long) == 2 *
sizeof(
int),
"");
347 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
349 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
350 tmp[0] = __shfl(tmp[0], src_lane, width);
351 tmp[1] = __shfl(tmp[1], src_lane, width);
353 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
354 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
360 int __shfl_up(
int var,
unsigned int lane_delta,
int width = warpSize) {
361 int self = __lane_id();
362 int index =
self - lane_delta;
363 index = (index < (
self & ~(width-1)))?
self:index;
364 return __llvm_amdgcn_ds_bpermute(index<<2, var);
368 unsigned int __shfl_up(
unsigned int var,
unsigned int lane_delta,
int width = warpSize) {
369 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
370 tmp.i = __shfl_up(tmp.i, lane_delta, width);
375 float __shfl_up(
float var,
unsigned int lane_delta,
int width = warpSize) {
376 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
377 tmp.i = __shfl_up(tmp.i, lane_delta, width);
382 double __shfl_up(
double var,
unsigned int lane_delta,
int width = warpSize) {
383 static_assert(
sizeof(
double) == 2 *
sizeof(
int),
"");
384 static_assert(
sizeof(
double) ==
sizeof(uint64_t),
"");
386 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
387 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
388 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
390 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
391 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
396 long __shfl_up(
long var,
unsigned int lane_delta,
int width = warpSize)
399 static_assert(
sizeof(
long) == 2 *
sizeof(
int),
"");
400 static_assert(
sizeof(
long) ==
sizeof(uint64_t),
"");
402 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
403 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
404 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
406 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
407 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
410 static_assert(
sizeof(
long) ==
sizeof(
int),
"");
411 return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
416 long long __shfl_up(
long long var,
unsigned int lane_delta,
int width = warpSize)
418 static_assert(
sizeof(
long long) == 2 *
sizeof(
int),
"");
419 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
420 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
421 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
422 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
423 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
424 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
430 int __shfl_down(
int var,
unsigned int lane_delta,
int width = warpSize) {
431 int self = __lane_id();
432 int index =
self + lane_delta;
433 index = (int)((
self&(width-1))+lane_delta) >= width?
self:index;
434 return __llvm_amdgcn_ds_bpermute(index<<2, var);
438 unsigned int __shfl_down(
unsigned int var,
unsigned int lane_delta,
int width = warpSize) {
439 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
440 tmp.i = __shfl_down(tmp.i, lane_delta, width);
445 float __shfl_down(
float var,
unsigned int lane_delta,
int width = warpSize) {
446 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
447 tmp.i = __shfl_down(tmp.i, lane_delta, width);
452 double __shfl_down(
double var,
unsigned int lane_delta,
int width = warpSize) {
453 static_assert(
sizeof(
double) == 2 *
sizeof(
int),
"");
454 static_assert(
sizeof(
double) ==
sizeof(uint64_t),
"");
456 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
457 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
458 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
460 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
461 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
466 long __shfl_down(
long var,
unsigned int lane_delta,
int width = warpSize)
469 static_assert(
sizeof(
long) == 2 *
sizeof(
int),
"");
470 static_assert(
sizeof(
long) ==
sizeof(uint64_t),
"");
472 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
473 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
474 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
476 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
477 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
480 static_assert(
sizeof(
long) ==
sizeof(
int),
"");
481 return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
486 long long __shfl_down(
long long var,
unsigned int lane_delta,
int width = warpSize)
488 static_assert(
sizeof(
long long) == 2 *
sizeof(
int),
"");
489 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
490 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
491 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
492 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
493 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
494 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
500 int __shfl_xor(
int var,
int lane_mask,
int width = warpSize) {
501 int self = __lane_id();
502 int index =
self^lane_mask;
503 index = index >= ((
self+width)&~(width-1))?
self:index;
504 return __llvm_amdgcn_ds_bpermute(index<<2, var);
508 unsigned int __shfl_xor(
unsigned int var,
int lane_mask,
int width = warpSize) {
509 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
510 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
515 float __shfl_xor(
float var,
int lane_mask,
int width = warpSize) {
516 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
517 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
522 double __shfl_xor(
double var,
int lane_mask,
int width = warpSize) {
523 static_assert(
sizeof(
double) == 2 *
sizeof(
int),
"");
524 static_assert(
sizeof(
double) ==
sizeof(uint64_t),
"");
526 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
527 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
528 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
530 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
531 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
536 long __shfl_xor(
long var,
int lane_mask,
int width = warpSize)
539 static_assert(
sizeof(
long) == 2 *
sizeof(
int),
"");
540 static_assert(
sizeof(
long) ==
sizeof(uint64_t),
"");
542 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
543 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
544 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
546 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
547 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
550 static_assert(
sizeof(
long) ==
sizeof(
int),
"");
551 return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
556 long long __shfl_xor(
long long var,
int lane_mask,
int width = warpSize)
558 static_assert(
sizeof(
long long) == 2 *
sizeof(
int),
"");
559 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
560 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
561 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
562 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
563 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
564 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
568 #define MASK1 0x00ff00ff 569 #define MASK2 0xff00ff00 571 __device__
static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
573 unsigned one1 = in1.w & MASK1;
574 unsigned one2 = in2.w & MASK1;
575 out.w = (one1 + one2) & MASK1;
576 one1 = in1.w & MASK2;
577 one2 = in2.w & MASK2;
578 out.w = out.w | ((one1 + one2) & MASK2);
582 __device__
static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
584 unsigned one1 = in1.w & MASK1;
585 unsigned one2 = in2.w & MASK1;
586 out.w = (one1 - one2) & MASK1;
587 one1 = in1.w & MASK2;
588 one2 = in2.w & MASK2;
589 out.w = out.w | ((one1 - one2) & MASK2);
593 __device__
static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
595 unsigned one1 = in1.w & MASK1;
596 unsigned one2 = in2.w & MASK1;
597 out.w = (one1 * one2) & MASK1;
598 one1 = in1.w & MASK2;
599 one2 = in2.w & MASK2;
600 out.w = out.w | ((one1 * one2) & MASK2);
609 __device__
static inline float __double2float_rd(
double x) {
return (
double)x; }
610 __device__
static inline float __double2float_rn(
double x) {
return (
double)x; }
611 __device__
static inline float __double2float_ru(
double x) {
return (
double)x; }
612 __device__
static inline float __double2float_rz(
double x) {
return (
double)x; }
614 __device__
static inline int __double2hiint(
double x) {
615 static_assert(
sizeof(
double) == 2 *
sizeof(
int),
"");
618 __builtin_memcpy(tmp, &x,
sizeof(tmp));
622 __device__
static inline int __double2loint(
double x) {
623 static_assert(
sizeof(
double) == 2 *
sizeof(
int),
"");
626 __builtin_memcpy(tmp, &x,
sizeof(tmp));
631 __device__
static inline int __double2int_rd(
double x) {
return (
int)x; }
632 __device__
static inline int __double2int_rn(
double x) {
return (
int)x; }
633 __device__
static inline int __double2int_ru(
double x) {
return (
int)x; }
634 __device__
static inline int __double2int_rz(
double x) {
return (
int)x; }
636 __device__
static inline long long int __double2ll_rd(
double x) {
return (
long long int)x; }
637 __device__
static inline long long int __double2ll_rn(
double x) {
return (
long long int)x; }
638 __device__
static inline long long int __double2ll_ru(
double x) {
return (
long long int)x; }
639 __device__
static inline long long int __double2ll_rz(
double x) {
return (
long long int)x; }
641 __device__
static inline unsigned int __double2uint_rd(
double x) {
return (
unsigned int)x; }
642 __device__
static inline unsigned int __double2uint_rn(
double x) {
return (
unsigned int)x; }
643 __device__
static inline unsigned int __double2uint_ru(
double x) {
return (
unsigned int)x; }
644 __device__
static inline unsigned int __double2uint_rz(
double x) {
return (
unsigned int)x; }
646 __device__
static inline unsigned long long int __double2ull_rd(
double x) {
647 return (
unsigned long long int)x;
649 __device__
static inline unsigned long long int __double2ull_rn(
double x) {
650 return (
unsigned long long int)x;
652 __device__
static inline unsigned long long int __double2ull_ru(
double x) {
653 return (
unsigned long long int)x;
655 __device__
static inline unsigned long long int __double2ull_rz(
double x) {
656 return (
unsigned long long int)x;
659 __device__
static inline long long int __double_as_longlong(
double x) {
660 static_assert(
sizeof(
long long) ==
sizeof(
double),
"");
663 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
682 __device__
static inline int __float2int_rd(
float x) {
return (
int)__ocml_floor_f32(x); }
683 __device__
static inline int __float2int_rn(
float x) {
return (
int)__ocml_rint_f32(x); }
684 __device__
static inline int __float2int_ru(
float x) {
return (
int)__ocml_ceil_f32(x); }
685 __device__
static inline int __float2int_rz(
float x) {
return (
int)__ocml_trunc_f32(x); }
687 __device__
static inline long long int __float2ll_rd(
float x) {
return (
long long int)x; }
688 __device__
static inline long long int __float2ll_rn(
float x) {
return (
long long int)x; }
689 __device__
static inline long long int __float2ll_ru(
float x) {
return (
long long int)x; }
690 __device__
static inline long long int __float2ll_rz(
float x) {
return (
long long int)x; }
692 __device__
static inline unsigned int __float2uint_rd(
float x) {
return (
unsigned int)x; }
693 __device__
static inline unsigned int __float2uint_rn(
float x) {
return (
unsigned int)x; }
694 __device__
static inline unsigned int __float2uint_ru(
float x) {
return (
unsigned int)x; }
695 __device__
static inline unsigned int __float2uint_rz(
float x) {
return (
unsigned int)x; }
697 __device__
static inline unsigned long long int __float2ull_rd(
float x) {
698 return (
unsigned long long int)x;
700 __device__
static inline unsigned long long int __float2ull_rn(
float x) {
701 return (
unsigned long long int)x;
703 __device__
static inline unsigned long long int __float2ull_ru(
float x) {
704 return (
unsigned long long int)x;
706 __device__
static inline unsigned long long int __float2ull_rz(
float x) {
707 return (
unsigned long long int)x;
710 __device__
static inline int __float_as_int(
float x) {
711 static_assert(
sizeof(
int) ==
sizeof(
float),
"");
714 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
719 __device__
static inline unsigned int __float_as_uint(
float x) {
720 static_assert(
sizeof(
unsigned int) ==
sizeof(
float),
"");
723 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
728 __device__
static inline double __hiloint2double(
int hi,
int lo) {
729 static_assert(
sizeof(
double) ==
sizeof(uint64_t),
"");
731 uint64_t tmp0 = (
static_cast<uint64_t
>(hi) << 32ull) |
static_cast<uint32_t
>(lo);
733 __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
738 __device__
static inline double __int2double_rn(
int x) {
return (
double)x; }
740 __device__
static inline float __int2float_rd(
int x) {
return (
float)x; }
741 __device__
static inline float __int2float_rn(
int x) {
return (
float)x; }
742 __device__
static inline float __int2float_ru(
int x) {
return (
float)x; }
743 __device__
static inline float __int2float_rz(
int x) {
return (
float)x; }
745 __device__
static inline float __int_as_float(
int x) {
746 static_assert(
sizeof(
float) ==
sizeof(
int),
"");
749 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
754 __device__
static inline double __ll2double_rd(
long long int x) {
return (
double)x; }
755 __device__
static inline double __ll2double_rn(
long long int x) {
return (
double)x; }
756 __device__
static inline double __ll2double_ru(
long long int x) {
return (
double)x; }
757 __device__
static inline double __ll2double_rz(
long long int x) {
return (
double)x; }
759 __device__
static inline float __ll2float_rd(
long long int x) {
return (
float)x; }
760 __device__
static inline float __ll2float_rn(
long long int x) {
return (
float)x; }
761 __device__
static inline float __ll2float_ru(
long long int x) {
return (
float)x; }
762 __device__
static inline float __ll2float_rz(
long long int x) {
return (
float)x; }
764 __device__
static inline double __longlong_as_double(
long long int x) {
765 static_assert(
sizeof(
double) ==
sizeof(
long long),
"");
768 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
773 __device__
static inline double __uint2double_rn(
int x) {
return (
double)x; }
775 __device__
static inline float __uint2float_rd(
unsigned int x) {
return (
float)x; }
776 __device__
static inline float __uint2float_rn(
unsigned int x) {
return (
float)x; }
777 __device__
static inline float __uint2float_ru(
unsigned int x) {
return (
float)x; }
778 __device__
static inline float __uint2float_rz(
unsigned int x) {
return (
float)x; }
780 __device__
static inline float __uint_as_float(
unsigned int x) {
781 static_assert(
sizeof(
float) ==
sizeof(
unsigned int),
"");
784 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
789 __device__
static inline double __ull2double_rd(
unsigned long long int x) {
return (
double)x; }
790 __device__
static inline double __ull2double_rn(
unsigned long long int x) {
return (
double)x; }
791 __device__
static inline double __ull2double_ru(
unsigned long long int x) {
return (
double)x; }
792 __device__
static inline double __ull2double_rz(
unsigned long long int x) {
return (
double)x; }
794 __device__
static inline float __ull2float_rd(
unsigned long long int x) {
return (
float)x; }
795 __device__
static inline float __ull2float_rn(
unsigned long long int x) {
return (
float)x; }
796 __device__
static inline float __ull2float_ru(
unsigned long long int x) {
return (
float)x; }
797 __device__
static inline float __ull2float_rz(
unsigned long long int x) {
return (
float)x; }
800 #define __HCC_OR_HIP_CLANG__ 1 801 #elif defined(__clang__) && defined(__HIP__) 802 #define __HCC_OR_HIP_CLANG__ 1 804 #define __HCC_OR_HIP_CLANG__ 0 807 #ifdef __HCC_OR_HIP_CLANG__ 810 __device__
long long int __clock64();
811 __device__
long long int __clock();
812 __device__
long long int clock64();
813 __device__
long long int clock();
815 __device__
void __named_sync(
int a,
int b);
817 #ifdef __HIP_DEVICE_COMPILE__ 821 extern "C" uint64_t __clock_u64() __HC__;
825 inline __attribute((always_inline))
826 long long int __clock64() {
827 return (
long long int) __builtin_readcyclecounter();
831 inline __attribute((always_inline))
832 long long int __clock() {
return __clock64(); }
835 inline __attribute__((always_inline))
836 long long int clock64() {
return __clock64(); }
839 inline __attribute__((always_inline))
840 long long int clock() {
return __clock(); }
845 void __named_sync(
int a,
int b) { __builtin_amdgcn_s_barrier(); }
847 #endif // __HIP_DEVICE_COMPILE__ 852 int __all(
int predicate) {
853 return __ockl_wfall_i32(predicate);
858 int __any(
int predicate) {
859 return __ockl_wfany_i32(predicate);
867 unsigned long long int __ballot(
int predicate) {
868 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
873 unsigned long long int __ballot64(
int predicate) {
874 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
880 uint64_t __lanemask_gt()
882 uint32_t lane = __ockl_lane_u32();
885 uint64_t ballot = __ballot64(1);
886 uint64_t mask = (~((uint64_t)0)) << (lane + 1);
887 return mask & ballot;
892 uint64_t __lanemask_lt()
894 uint32_t lane = __ockl_lane_u32();
895 int64_t ballot = __ballot64(1);
896 uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
897 return mask & ballot;
902 uint64_t __lanemask_eq()
904 uint32_t lane = __ockl_lane_u32();
905 int64_t mask = ((uint64_t)1 << lane);
910 __device__
inline void* __local_to_generic(
void* p) {
return p; }
912 #ifdef __HIP_DEVICE_COMPILE__ 915 void* __get_dynamicgroupbaseptr()
918 return (
char*)__local_to_generic((
void*)__to_local(__llvm_amdgcn_groupstaticsize()));
922 void* __get_dynamicgroupbaseptr();
923 #endif // __HIP_DEVICE_COMPILE__ 927 void *__amdgcn_get_dynamicgroupbaseptr() {
928 return __get_dynamicgroupbaseptr();
931 #if defined(__HCC__) && (__hcc_major__ < 3) && (__hcc_minor__ < 3) 933 #define __CLK_LOCAL_MEM_FENCE 0x01 934 typedef unsigned __cl_mem_fence_flags;
936 typedef enum __memory_scope {
937 __memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
938 __memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
939 __memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
940 __memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
941 __memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
945 typedef enum __memory_order
947 __memory_order_relaxed = __ATOMIC_RELAXED,
948 __memory_order_acquire = __ATOMIC_ACQUIRE,
949 __memory_order_release = __ATOMIC_RELEASE,
950 __memory_order_acq_rel = __ATOMIC_ACQ_REL,
951 __memory_order_seq_cst = __ATOMIC_SEQ_CST
957 __atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __memory_scope scope)
960 if (order != __memory_order_relaxed) {
962 case __memory_scope_work_item:
964 case __memory_scope_sub_group:
966 case __memory_order_relaxed:
break;
967 case __memory_order_acquire: __llvm_fence_acq_sg();
break;
968 case __memory_order_release: __llvm_fence_rel_sg();
break;
969 case __memory_order_acq_rel: __llvm_fence_ar_sg();
break;
970 case __memory_order_seq_cst: __llvm_fence_sc_sg();
break;
973 case __memory_scope_work_group:
975 case __memory_order_relaxed:
break;
976 case __memory_order_acquire: __llvm_fence_acq_wg();
break;
977 case __memory_order_release: __llvm_fence_rel_wg();
break;
978 case __memory_order_acq_rel: __llvm_fence_ar_wg();
break;
979 case __memory_order_seq_cst: __llvm_fence_sc_wg();
break;
982 case __memory_scope_device:
984 case __memory_order_relaxed:
break;
985 case __memory_order_acquire: __llvm_fence_acq_dev();
break;
986 case __memory_order_release: __llvm_fence_rel_dev();
break;
987 case __memory_order_acq_rel: __llvm_fence_ar_dev();
break;
988 case __memory_order_seq_cst: __llvm_fence_sc_dev();
break;
991 case __memory_scope_all_svm_devices:
993 case __memory_order_relaxed:
break;
994 case __memory_order_acquire: __llvm_fence_acq_sys();
break;
995 case __memory_order_release: __llvm_fence_rel_sys();
break;
996 case __memory_order_acq_rel: __llvm_fence_ar_sys();
break;
997 case __memory_order_seq_cst: __llvm_fence_sc_sys();
break;
1008 static void __threadfence()
1010 __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
1015 static void __threadfence_block()
1017 __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
1022 static void __threadfence_system()
1024 __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
1030 __attribute__((weak))
1032 return __builtin_trap();
1036 #endif // __HCC_OR_HIP_CLANG__ 1046 #define HIP_DYNAMIC_SHARED(type, var) type* var = (type*)__get_dynamicgroupbaseptr(); 1048 #define HIP_DYNAMIC_SHARED_ATTRIBUTE 1051 #elif defined(__clang__) && defined(__HIP__) 1053 #pragma push_macro("__DEVICE__") 1054 #define __DEVICE__ extern "C" __device__ __attribute__((always_inline)) \ 1055 __attribute__((weak)) 1059 void __assert_fail(
const char * __assertion,
1061 unsigned int __line,
1062 const char *__function)
1070 void __assertfail(
const char * __assertion,
1072 unsigned int __line,
1073 const char *__function,
1082 static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
1085 __atomic_work_item_fence(flags, __memory_order_release, scope);
1086 __builtin_amdgcn_s_barrier();
1087 __atomic_work_item_fence(flags, __memory_order_acquire, scope);
1089 __builtin_amdgcn_s_barrier();
1095 static void __barrier(
int n)
1097 __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
1102 __attribute__((convergent))
1103 void __syncthreads()
1105 __barrier(__CLK_LOCAL_MEM_FENCE);
1126 #define HW_ID_CU_ID_SIZE 4 1127 #define HW_ID_CU_ID_OFFSET 8 1129 #define HW_ID_SE_ID_SIZE 2 1130 #define HW_ID_SE_ID_OFFSET 13 1139 #define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG)) 1149 unsigned __smid(
void)
1151 unsigned cu_id = __builtin_amdgcn_s_getreg(
1152 GETREG_IMMED(HW_ID_CU_ID_SIZE-1, HW_ID_CU_ID_OFFSET, HW_ID));
1153 unsigned se_id = __builtin_amdgcn_s_getreg(
1154 GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
1157 return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
1160 #pragma push_macro("__DEVICE__") 1164 #define HIP_DYNAMIC_SHARED(type, var) \ 1165 type* var = (type*)__amdgcn_get_dynamicgroupbaseptr(); 1167 #define HIP_DYNAMIC_SHARED_ATTRIBUTE 1170 #endif //defined(__clang__) && defined(__HIP__) 1174 static inline __device__
void* __hip_hc_memcpy(
void* dst,
const void* src,
size_t size) {
1175 auto dstPtr =
static_cast<unsigned char*
>(dst);
1176 auto srcPtr =
static_cast<const unsigned char*
>(src);
1178 while (size >= 4u) {
1179 dstPtr[0] = srcPtr[0];
1180 dstPtr[1] = srcPtr[1];
1181 dstPtr[2] = srcPtr[2];
1182 dstPtr[3] = srcPtr[3];
1190 dstPtr[2] = srcPtr[2];
1192 dstPtr[1] = srcPtr[1];
1194 dstPtr[0] = srcPtr[0];
1200 static inline __device__
void* __hip_hc_memset(
void* dst,
unsigned char val,
size_t size) {
1201 auto dstPtr =
static_cast<unsigned char*
>(dst);
1203 while (size >= 4u) {
1223 static inline __device__
void* memcpy(
void* dst,
const void* src,
size_t size) {
1224 return __hip_hc_memcpy(dst, src, size);
1227 static inline __device__
void* memset(
void* ptr,
int val,
size_t size) {
1228 unsigned char val8 =
static_cast<unsigned char>(val);
1229 return __hip_hc_memset(ptr, val8, size);
Contains declarations for types and functions in device library.
Definition: device_functions.h:124
Definition: device_functions.h:131
Contains declarations for wrapper functions for llvm intrinsics like llvm.amdgcn.s.barrier.