23 #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
24 #define HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
33 #include <hip/hip_vector_types.h>
37 #if __HIP_CLANG_ONLY__ && __HIP_ROCclr__ && !_WIN32
38 extern "C" __device__
int printf(
const char *fmt, ...);
41 template <
typename... All>
42 static inline __device__
void printf(
const char* format, All... all) {
43 hc::printf(format, all...);
46 template <
typename... All>
47 static inline __device__
void printf(
const char* format, All... all) {}
48 #endif // HC_FEATURE_PRINTF
49 #endif // __HIP_CLANG_ONLY__ && __HIP_ROCclr__
56 __device__
static inline unsigned int __popc(
unsigned int input) {
57 return __builtin_popcount(input);
59 __device__
static inline unsigned int __popcll(
unsigned long long int input) {
60 return __builtin_popcountll(input);
63 __device__
static inline int __clz(
int input) {
64 return __ockl_clz_u32((uint)input);
67 __device__
static inline int __clzll(
long long int input) {
68 return __ockl_clz_u64((ullong)input);
71 __device__
static inline unsigned int __ffs(
unsigned int input) {
72 return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
75 __device__
static inline unsigned int __ffsll(
unsigned long long int input) {
76 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
79 __device__
static inline unsigned int __ffs(
int input) {
80 return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
83 __device__
static inline unsigned int __ffsll(
long long int input) {
84 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
87 __device__
static inline unsigned int __brev(
unsigned int input) {
88 return __builtin_bitreverse32(input);
91 __device__
static inline unsigned long long int __brevll(
unsigned long long int input) {
92 return __builtin_bitreverse64(input);
95 __device__
static inline unsigned int __lastbit_u32_u64(uint64_t input) {
96 return input == 0 ? -1 : __builtin_ctzl(input);
99 __device__
static inline unsigned int __bitextract_u32(
unsigned int src0,
unsigned int src1,
unsigned int src2) {
100 uint32_t offset = src1 & 31;
101 uint32_t width = src2 & 31;
102 return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
105 __device__
static inline uint64_t __bitextract_u64(uint64_t src0,
unsigned int src1,
unsigned int src2) {
106 uint64_t offset = src1 & 63;
107 uint64_t width = src2 & 63;
108 return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
111 __device__
static inline unsigned int __bitinsert_u32(
unsigned int src0,
unsigned int src1,
unsigned int src2,
unsigned int src3) {
112 uint32_t offset = src2 & 31;
113 uint32_t width = src3 & 31;
114 uint32_t mask = (1 << width) - 1;
115 return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
118 __device__
static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1,
unsigned int src2,
unsigned int src3) {
119 uint64_t offset = src2 & 63;
120 uint64_t width = src3 & 63;
121 uint64_t mask = (1ULL << width) - 1;
122 return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
125 __device__
static unsigned int __byte_perm(
unsigned int x,
unsigned int y,
unsigned int s);
126 __device__
static unsigned int __hadd(
int x,
int y);
127 __device__
static int __mul24(
int x,
int y);
128 __device__
static long long int __mul64hi(
long long int x,
long long int y);
129 __device__
static int __mulhi(
int x,
int y);
130 __device__
static int __rhadd(
int x,
int y);
131 __device__
static unsigned int __sad(
int x,
int y,
unsigned int z);
132 __device__
static unsigned int __uhadd(
unsigned int x,
unsigned int y);
133 __device__
static int __umul24(
unsigned int x,
unsigned int y);
134 __device__
static unsigned long long int __umul64hi(
unsigned long long int x,
unsigned long long int y);
135 __device__
static unsigned int __umulhi(
unsigned int x,
unsigned int y);
136 __device__
static unsigned int __urhadd(
unsigned int x,
unsigned int y);
137 __device__
static unsigned int __usad(
unsigned int x,
unsigned int y,
unsigned int z);
144 } __attribute__((aligned(4)));
151 } __attribute__((aligned(8)));
154 static inline unsigned int __byte_perm(
unsigned int x,
unsigned int y,
unsigned int s) {
161 cHoldOut.c[0] = cHoldVal.c[cHoldKey.c[0]];
162 cHoldOut.c[1] = cHoldVal.c[cHoldKey.c[1]];
163 cHoldOut.c[2] = cHoldVal.c[cHoldKey.c[2]];
164 cHoldOut.c[3] = cHoldVal.c[cHoldKey.c[3]];
168 __device__
static inline unsigned int __hadd(
int x,
int y) {
170 int sign = z & 0x8000000;
171 int value = z & 0x7FFFFFFF;
172 return ((value) >> 1 || sign);
175 __device__
static inline int __mul24(
int x,
int y) {
176 return __ockl_mul24_i32(x, y);
179 __device__
static inline long long __mul64hi(
long long int x,
long long int y) {
180 ulong x0 = (ulong)x & 0xffffffffUL;
182 ulong y0 = (ulong)y & 0xffffffffUL;
185 long t = x1*y0 + (z0 >> 32);
186 long z1 = t & 0xffffffffL;
189 return x1*y1 + z2 + (z1 >> 32);
192 __device__
static inline int __mulhi(
int x,
int y) {
193 return __ockl_mul_hi_i32(x, y);
196 __device__
static inline int __rhadd(
int x,
int y) {
198 int sign = z & 0x8000000;
199 int value = z & 0x7FFFFFFF;
200 return ((value) >> 1 || sign);
202 __device__
static inline unsigned int __sad(
int x,
int y,
unsigned int z) {
203 return x > y ? x - y + z : y - x + z;
205 __device__
static inline unsigned int __uhadd(
unsigned int x,
unsigned int y) {
208 __device__
static inline int __umul24(
unsigned int x,
unsigned int y) {
209 return __ockl_mul24_u32(x, y);
213 static inline unsigned long long __umul64hi(
unsigned long long int x,
unsigned long long int y) {
214 ulong x0 = x & 0xffffffffUL;
216 ulong y0 = y & 0xffffffffUL;
219 ulong t = x1*y0 + (z0 >> 32);
220 ulong z1 = t & 0xffffffffUL;
223 return x1*y1 + z2 + (z1 >> 32);
226 __device__
static inline unsigned int __umulhi(
unsigned int x,
unsigned int y) {
227 return __ockl_mul_hi_u32(x, y);
229 __device__
static inline unsigned int __urhadd(
unsigned int x,
unsigned int y) {
230 return (x + y + 1) >> 1;
232 __device__
static inline unsigned int __usad(
unsigned int x,
unsigned int y,
unsigned int z) {
233 return __ockl_sadd_u32(x, y, z);
236 __device__
static inline unsigned int __lane_id() {
237 return __builtin_amdgcn_mbcnt_hi(
238 -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
242 static inline unsigned int __mbcnt_lo(
unsigned int x,
unsigned int y) {
return __builtin_amdgcn_mbcnt_lo(x,y);};
245 static inline unsigned int __mbcnt_hi(
unsigned int x,
unsigned int y) {
return __builtin_amdgcn_mbcnt_hi(x,y);};
251 __device__
static inline unsigned __hip_ds_bpermute(
int index,
unsigned src) {
252 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
253 tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
257 __device__
static inline float __hip_ds_bpermutef(
int index,
float src) {
258 union {
int i;
unsigned u;
float f; } tmp; tmp.f = src;
259 tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
263 __device__
static inline unsigned __hip_ds_permute(
int index,
unsigned src) {
264 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
265 tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
269 __device__
static inline float __hip_ds_permutef(
int index,
float src) {
270 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
271 tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
275 #define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
276 #define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
278 template <
int pattern>
279 __device__
static inline unsigned __hip_ds_swizzle_N(
unsigned int src) {
280 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
282 tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
284 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
289 template <
int pattern>
290 __device__
static inline float __hip_ds_swizzlef_N(
float src) {
291 union {
int i;
unsigned u;
float f; } tmp; tmp.f = src;
293 tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
295 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
300 #define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
301 __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
303 template <
int dpp_ctrl,
int row_mask,
int bank_mask,
bool bound_ctrl>
304 __device__
static inline int __hip_move_dpp_N(
int src) {
305 return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
312 #ifndef __AMDGCN_WAVEFRONT_SIZE
313 #if __gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__
314 #define __AMDGCN_WAVEFRONT_SIZE 32
316 #define __AMDGCN_WAVEFRONT_SIZE 64
319 static constexpr
int warpSize = __AMDGCN_WAVEFRONT_SIZE;
// Warp shuffle: each lane reads 'var' from lane 'src_lane' of its
// width-sized subgroup. NOTE(review): legacy mask-less shuffle form;
// assumes wavefront-synchronous execution.
323 int __shfl(
int var,
int src_lane,
int width = warpSize) {
// Absolute source lane = this thread's subgroup base (self rounded down
// to a multiple of width) plus the requested lane within the subgroup.
324 int self = __lane_id();
325 int index = src_lane + (
self & ~(width-1));
// ds_bpermute addresses lanes by byte offset, hence index << 2.
326 return __builtin_amdgcn_ds_bpermute(index<<2, var);
330 unsigned int __shfl(
unsigned int var,
int src_lane,
int width = warpSize) {
331 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
332 tmp.i = __shfl(tmp.i, src_lane, width);
337 float __shfl(
float var,
int src_lane,
int width = warpSize) {
338 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
339 tmp.i = __shfl(tmp.i, src_lane, width);
344 double __shfl(
double var,
int src_lane,
int width = warpSize) {
345 static_assert(
sizeof(
double) == 2 *
sizeof(
int),
"");
346 static_assert(
sizeof(
double) ==
sizeof(uint64_t),
"");
348 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
349 tmp[0] = __shfl(tmp[0], src_lane, width);
350 tmp[1] = __shfl(tmp[1], src_lane, width);
352 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
353 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
358 long __shfl(
long var,
int src_lane,
int width = warpSize)
361 static_assert(
sizeof(
long) == 2 *
sizeof(
int),
"");
362 static_assert(
sizeof(
long) ==
sizeof(uint64_t),
"");
364 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
365 tmp[0] = __shfl(tmp[0], src_lane, width);
366 tmp[1] = __shfl(tmp[1], src_lane, width);
368 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
369 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
372 static_assert(
sizeof(
long) ==
sizeof(
int),
"");
373 return static_cast<long>(__shfl(
static_cast<int>(var), src_lane, width));
378 unsigned long __shfl(
unsigned long var,
int src_lane,
int width = warpSize) {
380 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
381 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
383 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
384 tmp[0] = __shfl(tmp[0], src_lane, width);
385 tmp[1] = __shfl(tmp[1], src_lane, width);
387 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
388 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
391 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
392 return static_cast<unsigned long>(__shfl(
static_cast<unsigned int>(var), src_lane, width));
397 long long __shfl(
long long var,
int src_lane,
int width = warpSize)
399 static_assert(
sizeof(
long long) == 2 *
sizeof(
int),
"");
400 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
402 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
403 tmp[0] = __shfl(tmp[0], src_lane, width);
404 tmp[1] = __shfl(tmp[1], src_lane, width);
406 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
407 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
412 unsigned long long __shfl(
unsigned long long var,
int src_lane,
int width = warpSize) {
413 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
414 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
416 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
417 tmp[0] = __shfl(tmp[0], src_lane, width);
418 tmp[1] = __shfl(tmp[1], src_lane, width);
420 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
421 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
// Warp shuffle-up: read 'var' from the lane 'lane_delta' below this one
// within the width-sized subgroup. NOTE(review): legacy mask-less form.
427 int __shfl_up(
int var,
unsigned int lane_delta,
int width = warpSize) {
428 int self = __lane_id();
429 int index =
self - lane_delta;
// Clamp: if the source lane falls before the subgroup's first lane
// (self & ~(width-1)), the lane keeps its own value.
430 index = (index < (
self & ~(width-1)))?
self:index;
// ds_bpermute addresses lanes by byte offset, hence index << 2.
431 return __builtin_amdgcn_ds_bpermute(index<<2, var);
435 unsigned int __shfl_up(
unsigned int var,
unsigned int lane_delta,
int width = warpSize) {
436 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
437 tmp.i = __shfl_up(tmp.i, lane_delta, width);
442 float __shfl_up(
float var,
unsigned int lane_delta,
int width = warpSize) {
443 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
444 tmp.i = __shfl_up(tmp.i, lane_delta, width);
449 double __shfl_up(
double var,
unsigned int lane_delta,
int width = warpSize) {
450 static_assert(
sizeof(
double) == 2 *
sizeof(
int),
"");
451 static_assert(
sizeof(
double) ==
sizeof(uint64_t),
"");
453 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
454 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
455 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
457 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
458 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
463 long __shfl_up(
long var,
unsigned int lane_delta,
int width = warpSize)
466 static_assert(
sizeof(
long) == 2 *
sizeof(
int),
"");
467 static_assert(
sizeof(
long) ==
sizeof(uint64_t),
"");
469 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
470 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
471 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
473 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
474 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
477 static_assert(
sizeof(
long) ==
sizeof(
int),
"");
478 return static_cast<long>(__shfl_up(
static_cast<int>(var), lane_delta, width));
484 unsigned long __shfl_up(
unsigned long var,
unsigned int lane_delta,
int width = warpSize)
487 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
488 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
490 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
491 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
492 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
494 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
495 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
498 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
499 return static_cast<unsigned long>(__shfl_up(
static_cast<unsigned int>(var), lane_delta, width));
505 long long __shfl_up(
long long var,
unsigned int lane_delta,
int width = warpSize)
507 static_assert(
sizeof(
long long) == 2 *
sizeof(
int),
"");
508 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
509 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
510 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
511 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
512 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
513 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
519 unsigned long long __shfl_up(
unsigned long long var,
unsigned int lane_delta,
int width = warpSize)
521 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
522 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
523 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
524 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
525 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
526 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
527 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
// Warp shuffle-down: read 'var' from the lane 'lane_delta' above this
// one within the width-sized subgroup. NOTE(review): legacy mask-less form.
533 int __shfl_down(
int var,
unsigned int lane_delta,
int width = warpSize) {
534 int self = __lane_id();
535 int index =
self + lane_delta;
// Clamp: if the source would cross the subgroup's upper boundary
// (position within subgroup + delta >= width), keep the lane's own value.
536 index = (int)((
self&(width-1))+lane_delta) >= width?
self:index;
// ds_bpermute addresses lanes by byte offset, hence index << 2.
537 return __builtin_amdgcn_ds_bpermute(index<<2, var);
541 unsigned int __shfl_down(
unsigned int var,
unsigned int lane_delta,
int width = warpSize) {
542 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
543 tmp.i = __shfl_down(tmp.i, lane_delta, width);
548 float __shfl_down(
float var,
unsigned int lane_delta,
int width = warpSize) {
549 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
550 tmp.i = __shfl_down(tmp.i, lane_delta, width);
555 double __shfl_down(
double var,
unsigned int lane_delta,
int width = warpSize) {
556 static_assert(
sizeof(
double) == 2 *
sizeof(
int),
"");
557 static_assert(
sizeof(
double) ==
sizeof(uint64_t),
"");
559 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
560 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
561 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
563 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
564 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
569 long __shfl_down(
long var,
unsigned int lane_delta,
int width = warpSize)
572 static_assert(
sizeof(
long) == 2 *
sizeof(
int),
"");
573 static_assert(
sizeof(
long) ==
sizeof(uint64_t),
"");
575 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
576 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
577 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
579 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
580 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
583 static_assert(
sizeof(
long) ==
sizeof(
int),
"");
584 return static_cast<long>(__shfl_down(
static_cast<int>(var), lane_delta, width));
589 unsigned long __shfl_down(
unsigned long var,
unsigned int lane_delta,
int width = warpSize)
592 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
593 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
595 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
596 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
597 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
599 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
600 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
603 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
604 return static_cast<unsigned long>(__shfl_down(
static_cast<unsigned int>(var), lane_delta, width));
609 long long __shfl_down(
long long var,
unsigned int lane_delta,
int width = warpSize)
611 static_assert(
sizeof(
long long) == 2 *
sizeof(
int),
"");
612 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
613 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
614 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
615 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
616 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
617 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
622 unsigned long long __shfl_down(
unsigned long long var,
unsigned int lane_delta,
int width = warpSize)
624 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
625 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
626 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
627 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
628 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
629 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
630 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
// Warp shuffle-xor (butterfly): read 'var' from the lane whose id is
// self ^ lane_mask. NOTE(review): legacy mask-less form.
636 int __shfl_xor(
int var,
int lane_mask,
int width = warpSize) {
637 int self = __lane_id();
638 int index =
self^lane_mask;
// Clamp: if the XOR'd lane lands past the end of this subgroup
// ((self+width) & ~(width-1)), keep the lane's own value.
639 index = index >= ((
self+width)&~(width-1))?
self:index;
// ds_bpermute addresses lanes by byte offset, hence index << 2.
640 return __builtin_amdgcn_ds_bpermute(index<<2, var);
644 unsigned int __shfl_xor(
unsigned int var,
int lane_mask,
int width = warpSize) {
645 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
646 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
651 float __shfl_xor(
float var,
int lane_mask,
int width = warpSize) {
652 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
653 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
658 double __shfl_xor(
double var,
int lane_mask,
int width = warpSize) {
659 static_assert(
sizeof(
double) == 2 *
sizeof(
int),
"");
660 static_assert(
sizeof(
double) ==
sizeof(uint64_t),
"");
662 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
663 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
664 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
666 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
667 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
672 long __shfl_xor(
long var,
int lane_mask,
int width = warpSize)
675 static_assert(
sizeof(
long) == 2 *
sizeof(
int),
"");
676 static_assert(
sizeof(
long) ==
sizeof(uint64_t),
"");
678 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
679 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
680 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
682 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
683 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
686 static_assert(
sizeof(
long) ==
sizeof(
int),
"");
687 return static_cast<long>(__shfl_xor(
static_cast<int>(var), lane_mask, width));
692 unsigned long __shfl_xor(
unsigned long var,
int lane_mask,
int width = warpSize)
695 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
696 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
698 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
699 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
700 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
702 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
703 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
706 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
707 return static_cast<unsigned long>(__shfl_xor(
static_cast<unsigned int>(var), lane_mask, width));
712 long long __shfl_xor(
long long var,
int lane_mask,
int width = warpSize)
714 static_assert(
sizeof(
long long) == 2 *
sizeof(
int),
"");
715 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
716 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
717 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
718 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
719 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
720 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
725 unsigned long long __shfl_xor(
unsigned long long var,
int lane_mask,
int width = warpSize)
727 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
728 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
729 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
730 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
731 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
732 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
733 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
736 #define MASK1 0x00ff00ff
737 #define MASK2 0xff00ff00
741 unsigned one1 = in1.w & MASK1;
742 unsigned one2 = in2.w & MASK1;
743 out.w = (one1 + one2) & MASK1;
744 one1 = in1.w & MASK2;
745 one2 = in2.w & MASK2;
746 out.w = out.w | ((one1 + one2) & MASK2);
752 unsigned one1 = in1.w & MASK1;
753 unsigned one2 = in2.w & MASK1;
754 out.w = (one1 - one2) & MASK1;
755 one1 = in1.w & MASK2;
756 one2 = in2.w & MASK2;
757 out.w = out.w | ((one1 - one2) & MASK2);
763 unsigned one1 = in1.w & MASK1;
764 unsigned one2 = in2.w & MASK1;
765 out.w = (one1 * one2) & MASK1;
766 one1 = in1.w & MASK2;
767 one2 = in2.w & MASK2;
768 out.w = out.w | ((one1 * one2) & MASK2);
777 __device__
static inline float __double2float_rd(
double x) {
return (
double)x; }
778 __device__
static inline float __double2float_rn(
double x) {
return (
double)x; }
779 __device__
static inline float __double2float_ru(
double x) {
return (
double)x; }
780 __device__
static inline float __double2float_rz(
double x) {
return (
double)x; }
782 __device__
static inline int __double2hiint(
double x) {
783 static_assert(
sizeof(
double) == 2 *
sizeof(
int),
"");
786 __builtin_memcpy(tmp, &x,
sizeof(tmp));
790 __device__
static inline int __double2loint(
double x) {
791 static_assert(
sizeof(
double) == 2 *
sizeof(
int),
"");
794 __builtin_memcpy(tmp, &x,
sizeof(tmp));
799 __device__
static inline int __double2int_rd(
double x) {
return (
int)x; }
800 __device__
static inline int __double2int_rn(
double x) {
return (
int)x; }
801 __device__
static inline int __double2int_ru(
double x) {
return (
int)x; }
802 __device__
static inline int __double2int_rz(
double x) {
return (
int)x; }
804 __device__
static inline long long int __double2ll_rd(
double x) {
return (
long long int)x; }
805 __device__
static inline long long int __double2ll_rn(
double x) {
return (
long long int)x; }
806 __device__
static inline long long int __double2ll_ru(
double x) {
return (
long long int)x; }
807 __device__
static inline long long int __double2ll_rz(
double x) {
return (
long long int)x; }
809 __device__
static inline unsigned int __double2uint_rd(
double x) {
return (
unsigned int)x; }
810 __device__
static inline unsigned int __double2uint_rn(
double x) {
return (
unsigned int)x; }
811 __device__
static inline unsigned int __double2uint_ru(
double x) {
return (
unsigned int)x; }
812 __device__
static inline unsigned int __double2uint_rz(
double x) {
return (
unsigned int)x; }
814 __device__
static inline unsigned long long int __double2ull_rd(
double x) {
815 return (
unsigned long long int)x;
817 __device__
static inline unsigned long long int __double2ull_rn(
double x) {
818 return (
unsigned long long int)x;
820 __device__
static inline unsigned long long int __double2ull_ru(
double x) {
821 return (
unsigned long long int)x;
823 __device__
static inline unsigned long long int __double2ull_rz(
double x) {
824 return (
unsigned long long int)x;
827 __device__
static inline long long int __double_as_longlong(
double x) {
828 static_assert(
sizeof(
long long) ==
sizeof(
double),
"");
831 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
850 __device__
static inline int __float2int_rd(
float x) {
return (
int)__ocml_floor_f32(x); }
851 __device__
static inline int __float2int_rn(
float x) {
return (
int)__ocml_rint_f32(x); }
852 __device__
static inline int __float2int_ru(
float x) {
return (
int)__ocml_ceil_f32(x); }
853 __device__
static inline int __float2int_rz(
float x) {
return (
int)__ocml_trunc_f32(x); }
855 __device__
static inline long long int __float2ll_rd(
float x) {
return (
long long int)x; }
856 __device__
static inline long long int __float2ll_rn(
float x) {
return (
long long int)x; }
857 __device__
static inline long long int __float2ll_ru(
float x) {
return (
long long int)x; }
858 __device__
static inline long long int __float2ll_rz(
float x) {
return (
long long int)x; }
860 __device__
static inline unsigned int __float2uint_rd(
float x) {
return (
unsigned int)x; }
861 __device__
static inline unsigned int __float2uint_rn(
float x) {
return (
unsigned int)x; }
862 __device__
static inline unsigned int __float2uint_ru(
float x) {
return (
unsigned int)x; }
863 __device__
static inline unsigned int __float2uint_rz(
float x) {
return (
unsigned int)x; }
865 __device__
static inline unsigned long long int __float2ull_rd(
float x) {
866 return (
unsigned long long int)x;
868 __device__
static inline unsigned long long int __float2ull_rn(
float x) {
869 return (
unsigned long long int)x;
871 __device__
static inline unsigned long long int __float2ull_ru(
float x) {
872 return (
unsigned long long int)x;
874 __device__
static inline unsigned long long int __float2ull_rz(
float x) {
875 return (
unsigned long long int)x;
878 __device__
static inline int __float_as_int(
float x) {
879 static_assert(
sizeof(
int) ==
sizeof(
float),
"");
882 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
887 __device__
static inline unsigned int __float_as_uint(
float x) {
888 static_assert(
sizeof(
unsigned int) ==
sizeof(
float),
"");
891 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
896 __device__
static inline double __hiloint2double(
int hi,
int lo) {
897 static_assert(
sizeof(
double) ==
sizeof(uint64_t),
"");
899 uint64_t tmp0 = (
static_cast<uint64_t
>(hi) << 32ull) |
static_cast<uint32_t
>(lo);
901 __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
906 __device__
static inline double __int2double_rn(
int x) {
return (
double)x; }
908 __device__
static inline float __int2float_rd(
int x) {
return (
float)x; }
909 __device__
static inline float __int2float_rn(
int x) {
return (
float)x; }
910 __device__
static inline float __int2float_ru(
int x) {
return (
float)x; }
911 __device__
static inline float __int2float_rz(
int x) {
return (
float)x; }
913 __device__
static inline float __int_as_float(
int x) {
914 static_assert(
sizeof(
float) ==
sizeof(
int),
"");
917 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
922 __device__
static inline double __ll2double_rd(
long long int x) {
return (
double)x; }
923 __device__
static inline double __ll2double_rn(
long long int x) {
return (
double)x; }
924 __device__
static inline double __ll2double_ru(
long long int x) {
return (
double)x; }
925 __device__
static inline double __ll2double_rz(
long long int x) {
return (
double)x; }
927 __device__
static inline float __ll2float_rd(
long long int x) {
return (
float)x; }
928 __device__
static inline float __ll2float_rn(
long long int x) {
return (
float)x; }
929 __device__
static inline float __ll2float_ru(
long long int x) {
return (
float)x; }
930 __device__
static inline float __ll2float_rz(
long long int x) {
return (
float)x; }
932 __device__
static inline double __longlong_as_double(
long long int x) {
933 static_assert(
sizeof(
double) ==
sizeof(
long long),
"");
936 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
941 __device__
static inline double __uint2double_rn(
int x) {
return (
double)x; }
943 __device__
static inline float __uint2float_rd(
unsigned int x) {
return (
float)x; }
944 __device__
static inline float __uint2float_rn(
unsigned int x) {
return (
float)x; }
945 __device__
static inline float __uint2float_ru(
unsigned int x) {
return (
float)x; }
946 __device__
static inline float __uint2float_rz(
unsigned int x) {
return (
float)x; }
948 __device__
static inline float __uint_as_float(
unsigned int x) {
949 static_assert(
sizeof(
float) ==
sizeof(
unsigned int),
"");
952 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
957 __device__
static inline double __ull2double_rd(
unsigned long long int x) {
return (
double)x; }
958 __device__
static inline double __ull2double_rn(
unsigned long long int x) {
return (
double)x; }
959 __device__
static inline double __ull2double_ru(
unsigned long long int x) {
return (
double)x; }
960 __device__
static inline double __ull2double_rz(
unsigned long long int x) {
return (
double)x; }
962 __device__
static inline float __ull2float_rd(
unsigned long long int x) {
return (
float)x; }
963 __device__
static inline float __ull2float_rn(
unsigned long long int x) {
return (
float)x; }
964 __device__
static inline float __ull2float_ru(
unsigned long long int x) {
return (
float)x; }
965 __device__
static inline float __ull2float_rz(
unsigned long long int x) {
return (
float)x; }
968 #define __HCC_OR_HIP_CLANG__ 1
969 #elif defined(__clang__) && defined(__HIP__)
970 #define __HCC_OR_HIP_CLANG__ 1
972 #define __HCC_OR_HIP_CLANG__ 0
975 #ifdef __HCC_OR_HIP_CLANG__
// Forward declarations of the clock and named-sync device routines.
// Definitions appear below under __HIP_DEVICE_COMPILE__.
__device__ long long int __clock64();
__device__ long long int __clock();
__device__ long long int clock64();
__device__ long long int clock();
__device__ void __named_sync(int a, int b);
#ifdef __HIP_DEVICE_COMPILE__

// Device-side definitions; only compiled when targeting the device.
extern "C" uint64_t __clock_u64() __HC__;

// Raw GPU cycle counter (via the compiler's cycle-counter builtin).
__device__
inline __attribute((always_inline))
long long int __clock64() {
    return (long long int) __builtin_readcyclecounter();
}

__device__
inline __attribute((always_inline))
long long int __clock() { return __clock64(); }

__device__
inline __attribute__((always_inline))
long long int clock64() { return __clock64(); }

__device__
inline __attribute__((always_inline))
long long int clock() { return __clock(); }

// hip.amdgcn.bc - named sync. Arguments are currently unused; this lowers
// to a plain workgroup barrier.
__device__
inline
void __named_sync(int a, int b) { __builtin_amdgcn_s_barrier(); }

#endif // __HIP_DEVICE_COMPILE__
// Returns non-zero iff `predicate` is non-zero on all active lanes of the
// wavefront (OCKL wavefront-all reduction).
__device__
inline
int __all(int predicate) {
    return __ockl_wfall_i32(predicate);
}
// Returns non-zero iff `predicate` is non-zero on at least one active lane
// of the wavefront (OCKL wavefront-any reduction).
__device__
inline
int __any(int predicate) {
    return __ockl_wfany_i32(predicate);
}
// Per-lane predicate mask: bit i is set iff `predicate` is non-zero on
// lane i (compare-not-equal across the wavefront).
__device__
inline
unsigned long long int __ballot(int predicate) {
    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}
// 64-bit variant of __ballot: one bit per lane of the 64-wide wavefront.
__device__
inline
unsigned long long int __ballot64(int predicate) {
    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}
// Mask of active lanes with an id strictly greater than the caller's.
__device__
inline
uint64_t __lanemask_gt()
{
    uint32_t lane = __ockl_lane_u32();
    // Restored guard: without it the shift below would be by 64 (UB) for
    // the last lane, which by definition has no greater lanes.
    if (lane == 63)
        return 0;
    uint64_t ballot = __ballot64(1);
    uint64_t mask = (~((uint64_t)0)) << (lane + 1);
    return mask & ballot;
}
// Mask of active lanes with an id strictly less than the caller's.
__device__
inline
uint64_t __lanemask_lt()
{
    uint32_t lane = __ockl_lane_u32();
    int64_t ballot = __ballot64(1);
    uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
    return mask & ballot;
}
// Mask with only the caller's own lane bit set.
__device__
inline
uint64_t __lanemask_eq()
{
    uint32_t lane = __ockl_lane_u32();
    int64_t mask = ((uint64_t)1 << lane);
    return mask;
}
// Address-space bridge: convert an LDS (local) pointer to a generic
// pointer. The representations coincide here, so this is an identity.
__device__
inline void* __local_to_generic(void* p) { return p; }
#ifdef __HIP_DEVICE_COMPILE__
// Base pointer of dynamically allocated shared (LDS) memory: it begins
// immediately after the statically allocated group segment.
__device__
inline
void* __get_dynamicgroupbaseptr()
{
    return (char*)__local_to_generic((void*)__to_local(__llvm_amdgcn_groupstaticsize()));
}
#else
// Host/pass-through compilation only sees the declaration.
__device__
void* __get_dynamicgroupbaseptr();
#endif // __HIP_DEVICE_COMPILE__
// Public wrapper used by the HIP_DYNAMIC_SHARED macro below.
__device__
inline
void *__amdgcn_get_dynamicgroupbaseptr() {
    return __get_dynamicgroupbaseptr();
}
#if defined(__HCC__) && (__hcc_major__ < 3) && (__hcc_minor__ < 3)
// Fence flag / scope / order types for the pre-3.3 HCC path.
#define __CLK_LOCAL_MEM_FENCE    0x01
typedef unsigned __cl_mem_fence_flags;

// Memory scopes mapped onto the OpenCL scope constants.
typedef enum __memory_scope {
    __memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
    __memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
    __memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
    __memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
    __memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
} __memory_scope;

// Memory orders mapped onto the compiler's __ATOMIC_* orderings.
typedef enum __memory_order
{
    __memory_order_relaxed = __ATOMIC_RELAXED,
    __memory_order_acquire = __ATOMIC_ACQUIRE,
    __memory_order_release = __ATOMIC_RELEASE,
    __memory_order_acq_rel = __ATOMIC_ACQ_REL,
    __memory_order_seq_cst = __ATOMIC_SEQ_CST
} __memory_order;
// hip.amdgcn.bc - memory fence. Dispatches to the LLVM fence intrinsic
// matching the requested (order, scope) pair; relaxed ordering needs no
// fence at all. Global- and local-happens-before are tied together, as HSA
// does: a single fence covers both.
__device__
inline
static void
__atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __memory_scope scope)
{
    if (order != __memory_order_relaxed) {
        switch (scope) {
        case __memory_scope_work_item:
            // A single work-item needs no fence with itself.
            break;
        case __memory_scope_sub_group:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_sg(); break;
            case __memory_order_release: __llvm_fence_rel_sg(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_sg(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_sg(); break;
            }
            break;
        case __memory_scope_work_group:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_wg(); break;
            case __memory_order_release: __llvm_fence_rel_wg(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_wg(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_wg(); break;
            }
            break;
        case __memory_scope_device:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_dev(); break;
            case __memory_order_release: __llvm_fence_rel_dev(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_dev(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_dev(); break;
            }
            break;
        case __memory_scope_all_svm_devices:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_sys(); break;
            case __memory_order_release: __llvm_fence_rel_sys(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_sys(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_sys(); break;
            }
            break;
        }
    }
}
// Device-scope seq_cst memory fence (CUDA __threadfence analogue).
__device__
inline
static void __threadfence()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
}
// Workgroup-scope seq_cst memory fence (CUDA __threadfence_block analogue).
__device__
inline
static void __threadfence_block()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
}
// System-scope seq_cst memory fence (CUDA __threadfence_system analogue).
__device__
inline
static void __threadfence_system()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
}
// Device-side abort: traps the wavefront. Declared weak so an application
// may provide its own definition.
extern "C" __device__
__attribute__((weak))
void abort() {
    return __builtin_trap();
}
1204 #endif // __HCC_OR_HIP_CLANG__
// Legacy HCC path: dynamic shared memory begins at the group-segment base.
#define HIP_DYNAMIC_SHARED(type, var) type* var = (type*)__get_dynamicgroupbaseptr();
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
1219 #elif defined(__clang__) && defined(__HIP__)
// Device-side assert failure handler: reports location and message, then
// traps. Weak and noinline so it can be overridden and kept out of hot code.
// Restored the `__file` parameter (lost in extraction) that the body uses.
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void __assert_fail(const char * __assertion,
                   const char *__file,
                   unsigned int __line,
                   const char *__function)
{
    printf("%s:%u: %s: Device-side assertion `%s' failed.\n", __file, __line,
           __function, __assertion);
    __builtin_trap();
}
// Alternate device assert entry point (wide-char capable signature).
// Arguments are ignored for now; the wavefront simply traps.
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void __assertfail(const char * __assertion,
                  const char *__file,
                  unsigned int __line,
                  const char *__function,
                  size_t charsize)
{
    // ignore all the args for now.
    __builtin_trap();
}
// Workgroup barrier with optional release/acquire fencing: when `flags`
// request a memory fence, publish before the barrier (release) and
// re-acquire after; otherwise a plain execution barrier suffices.
__device__
inline
static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
{
    if (flags) {
        __atomic_work_item_fence(flags, __memory_order_release, scope);
        __builtin_amdgcn_s_barrier();
        __atomic_work_item_fence(flags, __memory_order_acquire, scope);
    } else {
        __builtin_amdgcn_s_barrier();
    }
}
// Workgroup barrier taking raw fence flags as an int.
__device__
inline
static void __barrier(int n)
{
    __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
}
// Block-wide barrier with local-memory fence (CUDA __syncthreads).
// Convergent: must be reached by all threads of the block together.
__device__
inline
__attribute__((convergent))
void __syncthreads()
{
    __barrier(__CLK_LOCAL_MEM_FENCE);
}
// Barrier + reduction: number of threads in the block whose predicate is
// non-zero (!! normalizes to 0/1 before the workgroup add-reduce).
__device__
inline
__attribute__((convergent))
int __syncthreads_count(int predicate)
{
    return __ockl_wgred_add_i32(!!predicate);
}
// Barrier + reduction: non-zero iff the predicate is non-zero on every
// thread of the block.
__device__
inline
__attribute__((convergent))
int __syncthreads_and(int predicate)
{
    return __ockl_wgred_and_i32(!!predicate);
}
// Barrier + reduction: non-zero iff the predicate is non-zero on at least
// one thread of the block.
__device__
inline
__attribute__((convergent))
int __syncthreads_or(int predicate)
{
    return __ockl_wgred_or_i32(!!predicate);
}
// HW_ID hardware register index and the bit fields read by __smid below
// (CU_ID bits 11:8, SE_ID bits 14:13). `#define HW_ID 4` restored: it was
// lost in extraction but is referenced by __smid's GETREG_IMMED calls.
#define HW_ID               4

#define HW_ID_CU_ID_SIZE    4
#define HW_ID_CU_ID_OFFSET  8

#define HW_ID_SE_ID_SIZE    2
#define HW_ID_SE_ID_OFFSET  13

// Encode (size-1, offset, register) into the immediate operand expected by
// s_getreg: SIZE in bits 15:11, OFFSET in bits 10:6, register id in 5:0.
#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
// Returns a linear id of the compute unit executing the wave, built from
// the SE_ID and CU_ID fields of the HW_ID register.
__device__
inline
unsigned __smid(void)
{
    unsigned cu_id = __builtin_amdgcn_s_getreg(
            GETREG_IMMED(HW_ID_CU_ID_SIZE-1, HW_ID_CU_ID_OFFSET, HW_ID));
    unsigned se_id = __builtin_amdgcn_s_getreg(
            GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));

    // CU_ID spans HW_ID_CU_ID_SIZE bits, so shift SE_ID past it.
    return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
}
// HIP-Clang path: dynamic shared memory begins at the group-segment base.
#define HIP_DYNAMIC_SHARED(type, var) \
    type* var = (type*)__amdgcn_get_dynamicgroupbaseptr();
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
1360 #endif //defined(__clang__) && defined(__HIP__)
// Byte-wise device memcpy: copies `size` bytes from `src` to `dst` in
// 4-byte unrolled chunks, then handles the 0-3 byte tail. Regions must not
// overlap. Returns `dst` (libc memcpy convention).
static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);
    auto srcPtr = static_cast<const unsigned char*>(src);

    while (size >= 4u) {
        dstPtr[0] = srcPtr[0];
        dstPtr[1] = srcPtr[1];
        dstPtr[2] = srcPtr[2];
        dstPtr[3] = srcPtr[3];

        size -= 4u;
        srcPtr += 4;
        dstPtr += 4;
    }
    // Remaining 0-3 bytes; deliberate case fallthrough copies exactly `size`.
    switch (size) {
        case 3:
            dstPtr[2] = srcPtr[2];
        case 2:
            dstPtr[1] = srcPtr[1];
        case 1:
            dstPtr[0] = srcPtr[0];
    }

    return dst;
}
// Byte-wise device memset: writes `val` into the first `size` bytes of
// `dst`, 4 bytes per iteration plus a 0-3 byte tail. Returns `dst`.
static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);

    while (size >= 4u) {
        dstPtr[0] = val;
        dstPtr[1] = val;
        dstPtr[2] = val;
        dstPtr[3] = val;

        size -= 4u;
        dstPtr += 4;
    }
    // Remaining 0-3 bytes; deliberate case fallthrough.
    switch (size) {
        case 3:
            dstPtr[2] = val;
        case 2:
            dstPtr[1] = val;
        case 1:
            dstPtr[0] = val;
    }

    return dst;
}
// Device-side memcpy overload; forwards to the byte-wise helper above.
static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
    return __hip_hc_memcpy(dst, src, size);
}
1417 static inline __device__
void* memset(
void* ptr,
int val,
size_t size) {
1418 unsigned char val8 =
static_cast<unsigned char>(val);
1419 return __hip_hc_memset(ptr, val8, size);