HIP: Heterogeneous-computing Interface for Portability
device_functions.h
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
#define HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H

#include "host_defines.h"
#include "math_fwd.h"

#include <hip/hip_runtime_api.h>
#include <stddef.h>


#include <hip/hip_vector_types.h>

#if __HIP_CLANG_ONLY__ && __HIP_ROCclr__ && !_WIN32
extern "C" __device__ int printf(const char *fmt, ...);
#else
#if HC_FEATURE_PRINTF
template <typename... All>
static inline __device__ void printf(const char* format, All... all) {
    hc::printf(format, all...);
}
#else
template <typename... All>
static inline __device__ void printf(const char* format, All... all) {}
#endif // HC_FEATURE_PRINTF
#endif // __HIP_CLANG_ONLY__ && __HIP_ROCclr__

/*
Integer Intrinsics
*/
// integer intrinsic functions __popc __clz __ffs __brev
__device__ static inline unsigned int __popc(unsigned int input) {
    return __builtin_popcount(input);
}
__device__ static inline unsigned int __popcll(unsigned long long int input) {
    return __builtin_popcountll(input);
}

__device__ static inline int __clz(int input) {
    return __ockl_clz_u32((uint)input);
}

__device__ static inline int __clzll(long long int input) {
    return __ockl_clz_u64((ullong)input);
}

__device__ static inline unsigned int __ffs(unsigned int input) {
    return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}

__device__ static inline unsigned int __ffsll(unsigned long long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}

__device__ static inline unsigned int __ffs(int input) {
    return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}

__device__ static inline unsigned int __ffsll(long long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}

__device__ static inline unsigned int __brev(unsigned int input) {
    return __llvm_bitrev_b32(input);
}

__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
    return __llvm_bitrev_b64(input);
}

__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
    return input == 0 ? -1 : __builtin_ctzl(input);
}
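
/*
 Illustrative example (not part of the header): expected results of the
 bit-counting intrinsics above. Note that __ffs is 1-based and returns 0
 for a zero input, matching the CUDA convention.

     __device__ void bit_intrinsics_demo() {
         unsigned a = __popc(0xF0u);   // 4: four bits set
         unsigned b = __ffs(0x8u);     // 4: lowest set bit is bit 3, 1-based
         unsigned c = __ffs(0u);       // 0: no bit set
         unsigned d = __brev(0x1u);    // 0x80000000u: bit 0 moves to bit 31
         (void)a; (void)b; (void)c; (void)d;
     }
*/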

__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
    uint32_t offset = src1 & 31;
    uint32_t width = src2 & 31;
    return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
}

__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
    uint64_t offset = src1 & 63;
    uint64_t width = src2 & 63;
    return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
}

__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
    uint32_t offset = src2 & 31;
    uint32_t width = src3 & 31;
    uint32_t mask = (1u << width) - 1u;  // unsigned shift avoids overflow when width == 31
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}

__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
    uint64_t offset = src2 & 63;
    uint64_t width = src3 & 63;
    uint64_t mask = (1ULL << width) - 1;
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}
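
/*
 Illustrative example (not part of the header): __bitextract_u32 reads a
 bitfield of `width` bits starting at bit `offset`, and __bitinsert_u32
 writes one.

     __device__ void bitfield_demo() {
         unsigned v = 0xABCD1234u;
         unsigned f = __bitextract_u32(v, 8, 8);        // 0x12: bits 15:8 of v
         unsigned w = __bitinsert_u32(v, 0xFFu, 8, 8);  // 0xABCDFF34: bits 15:8 replaced
         (void)f; (void)w;
     }
*/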

__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
__device__ static unsigned int __hadd(int x, int y);
__device__ static int __mul24(int x, int y);
__device__ static long long int __mul64hi(long long int x, long long int y);
__device__ static int __mulhi(int x, int y);
__device__ static int __rhadd(int x, int y);
__device__ static unsigned int __sad(int x, int y, unsigned int z);
__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
__device__ static int __umul24(unsigned int x, unsigned int y);
__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);

struct ucharHolder {
    union {
        unsigned char c[4];
        unsigned int ui;
    };
} __attribute__((aligned(4)));

struct uchar2Holder {
    union {
        unsigned int ui[2];
        unsigned char c[8];
    };
} __attribute__((aligned(8)));

__device__
static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
    struct uchar2Holder cHoldVal;
    struct ucharHolder cHoldKey;
    struct ucharHolder cHoldOut;
    cHoldKey.ui = s;
    cHoldVal.ui[0] = x;
    cHoldVal.ui[1] = y;
    cHoldOut.c[0] = cHoldVal.c[cHoldKey.c[0]];
    cHoldOut.c[1] = cHoldVal.c[cHoldKey.c[1]];
    cHoldOut.c[2] = cHoldVal.c[cHoldKey.c[2]];
    cHoldOut.c[3] = cHoldVal.c[cHoldKey.c[3]];
    return cHoldOut.ui;
}
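
/*
 Illustrative example (not part of the header): as written above, each
 selector *byte* of `s` (least-significant first) picks one of the eight
 bytes of the {x, y} pair, 0-3 from x and 4-7 from y, and must stay in the
 range 0-7. (CUDA's __byte_perm packs 4-bit selectors instead.)

     __device__ void byte_perm_demo() {
         // Selector bytes 0x04,0x05,0x06,0x07 pick y's bytes in order:
         unsigned r = __byte_perm(0x33221100u, 0x77665544u, 0x07060504u);
         // r == 0x77665544u
         (void)r;
     }
*/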

__device__ static inline unsigned int __hadd(int x, int y) {
    // Halving add: (x + y) >> 1, computed without overflowing the
    // intermediate sum; the shared low bit supplies the carry.
    return (x >> 1) + (y >> 1) + (x & y & 1);
}

__device__ static inline int __mul24(int x, int y) {
    return __ockl_mul24_i32(x, y);
}

__device__ static inline long long __mul64hi(long long int x, long long int y) {
    ulong x0 = (ulong)x & 0xffffffffUL;
    long x1 = x >> 32;
    ulong y0 = (ulong)y & 0xffffffffUL;
    long y1 = y >> 32;
    ulong z0 = x0*y0;
    long t = x1*y0 + (z0 >> 32);
    long z1 = t & 0xffffffffL;
    long z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}

__device__ static inline int __mulhi(int x, int y) {
    return __ockl_mul_hi_i32(x, y);
}

__device__ static inline int __rhadd(int x, int y) {
    // Rounded halving add: (x + y + 1) >> 1, computed without overflowing
    // the intermediate sum; either low bit supplies the rounding carry.
    return (x >> 1) + (y >> 1) + ((x | y) & 1);
}
__device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
    return x > y ? x - y + z : y - x + z;
}
__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
    // (x + y) >> 1 without losing the carry out of bit 31.
    return (x >> 1) + (y >> 1) + (x & y & 1);
}
__device__ static inline int __umul24(unsigned int x, unsigned int y) {
    return __ockl_mul24_u32(x, y);
}

__device__
static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
    ulong x0 = x & 0xffffffffUL;
    ulong x1 = x >> 32;
    ulong y0 = y & 0xffffffffUL;
    ulong y1 = y >> 32;
    ulong z0 = x0*y0;
    ulong t = x1*y0 + (z0 >> 32);
    ulong z1 = t & 0xffffffffUL;
    ulong z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}
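
/*
 Illustrative example (not part of the header): __umul64hi returns the high
 64 bits of the full 128-bit product, assembled above from four 32x32->64
 partial products.

     __device__ void mul64hi_demo() {
         // (2^32 + 1) * (2^32 + 1) = 2^64 + 2^33 + 1, so the high word is 1.
         unsigned long long hi = __umul64hi(0x100000001ULL, 0x100000001ULL);
         // hi == 1ULL
         (void)hi;
     }
*/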

__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
    return __ockl_mul_hi_u32(x, y);
}
__device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
    // (x + y + 1) >> 1 without losing the carry out of bit 31.
    return (x >> 1) + (y >> 1) + ((x | y) & 1);
}
__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
    return __ockl_sadd_u32(x, y, z);
}

__device__ static inline unsigned int __lane_id() { return __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); }

/*
HIP specific device functions
*/

__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
    return tmp.u;
}

__device__ static inline float __hip_ds_bpermutef(int index, float src) {
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
    tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
    return tmp.f;
}

__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
    return tmp.u;
}

__device__ static inline float __hip_ds_permutef(int index, float src) {
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
    tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
    return tmp.f;
}
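
/*
 Illustrative example (not part of the header): ds_bpermute is a "backward"
 permute, i.e. each lane *reads* from the lane addressed by `index`. The
 index is a byte address into the register lane array, hence the shift by 2.

     __device__ float broadcast_lane0(float v) {
         // Every lane fetches lane 0's value of v.
         return __hip_ds_bpermutef(0 << 2, v);
     }
*/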

#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))

template <int pattern>
__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
#if defined(__HCC__)
    tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
#else
    tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
#endif
    return tmp.u;
}

template <int pattern>
__device__ static inline float __hip_ds_swizzlef_N(float src) {
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
#if defined(__HCC__)
    tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
#else
    tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
#endif
    return tmp.f;
}
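
/*
 Illustrative example (not part of the header). In the GCN bit-masked
 swizzle mode (pattern bit 15 clear), the source lane within each group of
 32 is ((lane & and_mask) | or_mask) ^ xor_mask, with and_mask in
 pattern[4:0], or_mask in pattern[9:5], and xor_mask in pattern[14:10];
 consult the AMD GCN ISA documentation for the authoritative encoding.

     __device__ unsigned swap_neighbor(unsigned v) {
         // and_mask = 0x1F, or_mask = 0, xor_mask = 1: exchange lanes 2k and 2k+1.
         return __hip_ds_swizzle(v, 0x041F);
     }
*/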

#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
    __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))

template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
__device__ static inline int __hip_move_dpp_N(int src) {
    return __llvm_amdgcn_move_dpp(src, dpp_ctrl, row_mask, bank_mask,
                                  bound_ctrl);
}

static constexpr int warpSize = 64;

__device__
inline
int __shfl(int var, int src_lane, int width = warpSize) {
    int self = __lane_id();
    int index = src_lane + (self & ~(width-1));
    return __llvm_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl(tmp.i, src_lane, width);
    return tmp.u;
}
__device__
inline
float __shfl(float var, int src_lane, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl(tmp.i, src_lane, width);
    return tmp.f;
}
__device__
inline
double __shfl(double var, int src_lane, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
long __shfl(long var, int src_lane, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
    #endif
}
__device__
inline
long long __shfl(long long var, int src_lane, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}

__device__
inline
int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
    int self = __lane_id();
    int index = self - lane_delta;
    index = (index < (self & ~(width-1)))?self:index;
    return __llvm_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_up(tmp.i, lane_delta, width);
    return tmp.u;
}
__device__
inline
float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_up(tmp.i, lane_delta, width);
    return tmp.f;
}
__device__
inline
double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
    #endif
}
__device__
inline
long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");
    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}

__device__
inline
int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
    int self = __lane_id();
    int index = self + lane_delta;
    index = (int)((self&(width-1))+lane_delta) >= width?self:index;
    return __llvm_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_down(tmp.i, lane_delta, width);
    return tmp.u;
}
__device__
inline
float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_down(tmp.i, lane_delta, width);
    return tmp.f;
}
__device__
inline
double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
    #endif
}
__device__
inline
long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");
    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
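
/*
 Illustrative example (not part of the header): tree-style warp sum using
 __shfl_down. With warpSize == 64, halving the delta each step leaves the
 wavefront total in lane 0.

     __device__ int warp_reduce_sum(int v) {
         for (int delta = warpSize / 2; delta > 0; delta >>= 1)
             v += __shfl_down(v, delta);
         return v;  // valid in lane 0
     }
*/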

__device__
inline
int __shfl_xor(int var, int lane_mask, int width = warpSize) {
    int self = __lane_id();
    int index = self^lane_mask;
    index = index >= ((self+width)&~(width-1))?self:index;
    return __llvm_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
    return tmp.u;
}
__device__
inline
float __shfl_xor(float var, int lane_mask, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
    return tmp.f;
}
__device__
inline
double __shfl_xor(double var, int lane_mask, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
long __shfl_xor(long var, int lane_mask, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
    #endif
}
__device__
inline
long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");
    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}

#define MASK1 0x00ff00ff
#define MASK2 0xff00ff00

__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
    // Packed 8-bit add: each byte lane wraps independently.
    char4 out;
    out.x = (char)(in1.x + in2.x);
    out.y = (char)(in1.y + in2.y);
    out.z = (char)(in1.z + in2.z);
    out.w = (char)(in1.w + in2.w);
    return out;
}

__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
    // Packed 8-bit subtract: each byte lane wraps independently.
    char4 out;
    out.x = (char)(in1.x - in2.x);
    out.y = (char)(in1.y - in2.y);
    out.z = (char)(in1.z - in2.z);
    out.w = (char)(in1.w - in2.w);
    return out;
}

__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
    // Packed 8-bit multiply: each byte lane keeps the low 8 bits of its product.
    char4 out;
    out.x = (char)(in1.x * in2.x);
    out.y = (char)(in1.y * in2.y);
    out.z = (char)(in1.z * in2.z);
    out.w = (char)(in1.w * in2.w);
    return out;
}

/*
 * Rounding modes are not yet supported in HIP
 * TODO: Conversion functions are not correct, need to fix when BE is ready
*/

__device__ static inline float __double2float_rd(double x) { return (float)x; }
__device__ static inline float __double2float_rn(double x) { return (float)x; }
__device__ static inline float __double2float_ru(double x) { return (float)x; }
__device__ static inline float __double2float_rz(double x) { return (float)x; }

__device__ static inline int __double2hiint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[1];
}
__device__ static inline int __double2loint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[0];
}

__device__ static inline int __double2int_rd(double x) { return (int)x; }
__device__ static inline int __double2int_rn(double x) { return (int)x; }
__device__ static inline int __double2int_ru(double x) { return (int)x; }
__device__ static inline int __double2int_rz(double x) { return (int)x; }

__device__ static inline long long int __double2ll_rd(double x) { return (long long int)x; }
__device__ static inline long long int __double2ll_rn(double x) { return (long long int)x; }
__device__ static inline long long int __double2ll_ru(double x) { return (long long int)x; }
__device__ static inline long long int __double2ll_rz(double x) { return (long long int)x; }

__device__ static inline unsigned int __double2uint_rd(double x) { return (unsigned int)x; }
__device__ static inline unsigned int __double2uint_rn(double x) { return (unsigned int)x; }
__device__ static inline unsigned int __double2uint_ru(double x) { return (unsigned int)x; }
__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }

__device__ static inline unsigned long long int __double2ull_rd(double x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __double2ull_rn(double x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __double2ull_ru(double x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __double2ull_rz(double x) {
    return (unsigned long long int)x;
}

__device__ static inline long long int __double_as_longlong(double x) {
    static_assert(sizeof(long long) == sizeof(double), "");

    long long tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

/*
__device__ unsigned short __float2half_rn(float x);
__device__ float __half2float(unsigned short);

The above device functions are not valid.
Use
__device__ __half __float2half_rn(float x);
__device__ float __half2float(__half);
from hip_fp16.h instead.

CUDA implements half as unsigned short, whereas HIP does not.
*/

__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }

__device__ static inline long long int __float2ll_rd(float x) { return (long long int)x; }
__device__ static inline long long int __float2ll_rn(float x) { return (long long int)x; }
__device__ static inline long long int __float2ll_ru(float x) { return (long long int)x; }
__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }

__device__ static inline unsigned int __float2uint_rd(float x) { return (unsigned int)x; }
__device__ static inline unsigned int __float2uint_rn(float x) { return (unsigned int)x; }
__device__ static inline unsigned int __float2uint_ru(float x) { return (unsigned int)x; }
__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }

__device__ static inline unsigned long long int __float2ull_rd(float x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __float2ull_rn(float x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __float2ull_ru(float x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __float2ull_rz(float x) {
    return (unsigned long long int)x;
}

__device__ static inline int __float_as_int(float x) {
    static_assert(sizeof(int) == sizeof(float), "");

    int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__ static inline unsigned int __float_as_uint(float x) {
    static_assert(sizeof(unsigned int) == sizeof(float), "");

    unsigned int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__ static inline double __hiloint2double(int hi, int lo) {
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    double tmp1;
    __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));

    return tmp1;
}
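
/*
 Illustrative example (not part of the header): __double2hiint/__double2loint
 and __hiloint2double round-trip the two 32-bit halves of a double bit-exactly.

     __device__ double split_and_rejoin(double x) {
         int hi = __double2hiint(x);
         int lo = __double2loint(x);
         return __hiloint2double(hi, lo);  // == x, bit for bit
     }
*/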

__device__ static inline double __int2double_rn(int x) { return (double)x; }

__device__ static inline float __int2float_rd(int x) { return (float)x; }
__device__ static inline float __int2float_rn(int x) { return (float)x; }
__device__ static inline float __int2float_ru(int x) { return (float)x; }
__device__ static inline float __int2float_rz(int x) { return (float)x; }

__device__ static inline float __int_as_float(int x) {
    static_assert(sizeof(float) == sizeof(int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__ static inline double __ll2double_rd(long long int x) { return (double)x; }
__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
__device__ static inline double __ll2double_ru(long long int x) { return (double)x; }
__device__ static inline double __ll2double_rz(long long int x) { return (double)x; }

__device__ static inline float __ll2float_rd(long long int x) { return (float)x; }
__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
__device__ static inline float __ll2float_ru(long long int x) { return (float)x; }
__device__ static inline float __ll2float_rz(long long int x) { return (float)x; }

__device__ static inline double __longlong_as_double(long long int x) {
    static_assert(sizeof(double) == sizeof(long long), "");

    double tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }

__device__ static inline float __uint2float_rd(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_ru(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_rz(unsigned int x) { return (float)x; }

__device__ static inline float __uint_as_float(unsigned int x) {
    static_assert(sizeof(float) == sizeof(unsigned int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__ static inline double __ull2double_rd(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_ru(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_rz(unsigned long long int x) { return (double)x; }

__device__ static inline float __ull2float_rd(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_ru(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_rz(unsigned long long int x) { return (float)x; }

#if defined(__HCC__)
#define __HCC_OR_HIP_CLANG__ 1
#elif defined(__clang__) && defined(__HIP__)
#define __HCC_OR_HIP_CLANG__ 1
#else
#define __HCC_OR_HIP_CLANG__ 0
#endif

#if __HCC_OR_HIP_CLANG__

// Clock functions
__device__ long long int __clock64();
__device__ long long int __clock();
__device__ long long int clock64();
__device__ long long int clock();
// hip.amdgcn.bc - named sync
__device__ void __named_sync(int a, int b);

#ifdef __HIP_DEVICE_COMPILE__

// Clock functions
#if __HCC__
extern "C" uint64_t __clock_u64() __HC__;
#endif

__device__
inline __attribute((always_inline))
long long int __clock64() {
    return (long long int) __builtin_readcyclecounter();
}

__device__
inline __attribute((always_inline))
long long int __clock() { return __clock64(); }

__device__
inline __attribute__((always_inline))
long long int clock64() { return __clock64(); }

__device__
inline __attribute__((always_inline))
long long int clock() { return __clock(); }
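
/*
 Illustrative example (not part of the header): clock64() reads the
 free-running per-device cycle counter, useful for coarse intra-kernel
 timing.

     __device__ long long time_region() {
         long long t0 = clock64();
         // ... code under measurement ...
         return clock64() - t0;  // elapsed cycles on this wave
     }
*/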

// hip.amdgcn.bc - named sync
__device__
inline
void __named_sync(int a, int b) { __builtin_amdgcn_s_barrier(); }

#endif // __HIP_DEVICE_COMPILE__

// warp vote functions __all __any __ballot
__device__
inline
int __all(int predicate) {
    return __ockl_wfall_i32(predicate);
}

__device__
inline
int __any(int predicate) {
    return __ockl_wfany_i32(predicate);
}

// XXX from llvm/include/llvm/IR/InstrTypes.h
#define ICMP_NE 33

__device__
inline
unsigned long long int __ballot(int predicate) {
    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}

__device__
inline
unsigned long long int __ballot64(int predicate) {
    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}

// hip.amdgcn.bc - lanemask
__device__
inline
uint64_t __lanemask_gt()
{
    uint32_t lane = __ockl_lane_u32();
    if (lane == 63)
        return 0;
    uint64_t ballot = __ballot64(1);
    uint64_t mask = (~((uint64_t)0)) << (lane + 1);
    return mask & ballot;
}

__device__
inline
uint64_t __lanemask_lt()
{
    uint32_t lane = __ockl_lane_u32();
    uint64_t ballot = __ballot64(1);
    uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
    return mask & ballot;
}

__device__
inline
uint64_t __lanemask_eq()
{
    uint32_t lane = __ockl_lane_u32();
    uint64_t mask = ((uint64_t)1 << lane);
    return mask;
}
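
/*
 Illustrative example (not part of the header): the classic stream-compaction
 rank, i.e. how many lower-numbered active lanes also hold the predicate.

     __device__ unsigned warp_rank(int predicate) {
         unsigned long long votes = __ballot(predicate);
         return __popcll(votes & __lanemask_lt());
     }
*/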


__device__ inline void* __local_to_generic(void* p) { return p; }

#ifdef __HIP_DEVICE_COMPILE__
__device__
inline
void* __get_dynamicgroupbaseptr()
{
    // Get group segment base pointer.
    return (char*)__local_to_generic((void*)__to_local(__llvm_amdgcn_groupstaticsize()));
}
#else
__device__
void* __get_dynamicgroupbaseptr();
#endif // __HIP_DEVICE_COMPILE__

__device__
inline
void *__amdgcn_get_dynamicgroupbaseptr() {
    return __get_dynamicgroupbaseptr();
}

#if defined(__HCC__) && (__hcc_major__ < 3) && (__hcc_minor__ < 3)
// hip.amdgcn.bc - sync threads
#define __CLK_LOCAL_MEM_FENCE 0x01
typedef unsigned __cl_mem_fence_flags;

typedef enum __memory_scope {
    __memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
    __memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
    __memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
    __memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
    __memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
} __memory_scope;

// enum values aligned with what clang uses in EmitAtomicExpr()
typedef enum __memory_order
{
    __memory_order_relaxed = __ATOMIC_RELAXED,
    __memory_order_acquire = __ATOMIC_ACQUIRE,
    __memory_order_release = __ATOMIC_RELEASE,
    __memory_order_acq_rel = __ATOMIC_ACQ_REL,
    __memory_order_seq_cst = __ATOMIC_SEQ_CST
} __memory_order;

__device__
inline
static void
__atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __memory_scope scope)
{
    // We're tying global-happens-before and local-happens-before together as does HSA
    if (order != __memory_order_relaxed) {
        switch (scope) {
        case __memory_scope_work_item:
            break;
        case __memory_scope_sub_group:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_sg(); break;
            case __memory_order_release: __llvm_fence_rel_sg(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_sg(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_sg(); break;
            }
            break;
        case __memory_scope_work_group:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_wg(); break;
            case __memory_order_release: __llvm_fence_rel_wg(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_wg(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_wg(); break;
            }
            break;
        case __memory_scope_device:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_dev(); break;
            case __memory_order_release: __llvm_fence_rel_dev(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_dev(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_dev(); break;
            }
            break;
        case __memory_scope_all_svm_devices:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_sys(); break;
            case __memory_order_release: __llvm_fence_rel_sys(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_sys(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_sys(); break;
            }
            break;
        }
    }
}
#endif

// Memory Fence Functions
__device__
inline
static void __threadfence()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
}

__device__
inline
static void __threadfence_block()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
}

__device__
inline
static void __threadfence_system()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
}
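
/*
 Illustrative example (not part of the header), a simplified publish idiom:
 the device-scope fence orders the data store before the flag store, so a
 reader in another block that observes flag == 1 (and fences before loading)
 also sees the data.

     __device__ void publish(int* data, int* flag, int value) {
         *data = value;
         __threadfence();           // make data visible device-wide first
         *(volatile int*)flag = 1;  // then signal readiness
     }
*/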

// abort
__device__
inline
__attribute__((weak))
void abort() {
    return __builtin_trap();
}


#endif // __HCC_OR_HIP_CLANG__

#ifdef __HCC__

// Macro to replace extern __shared__ declarations
// with local variable definitions
#define HIP_DYNAMIC_SHARED(type, var) type* var = (type*)__get_dynamicgroupbaseptr();

#define HIP_DYNAMIC_SHARED_ATTRIBUTE


#elif defined(__clang__) && defined(__HIP__)

#pragma push_macro("__DEVICE__")
#define __DEVICE__ extern "C" __device__ __attribute__((always_inline)) \
    __attribute__((weak))

__DEVICE__
inline
void __assert_fail(const char * __assertion,
                   const char *__file,
                   unsigned int __line,
                   const char *__function)
{
    printf("%s:%u: %s: Device-side assertion `%s' failed.\n", __file, __line,
           __function, __assertion);
    __builtin_trap();
}

__DEVICE__
inline
void __assertfail(const char * __assertion,
                  const char *__file,
                  unsigned int __line,
                  const char *__function,
                  size_t charsize)
{
    // Ignore all the args for now.
    __builtin_trap();
}

__device__
inline
static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
{
    if (flags) {
        __atomic_work_item_fence(flags, __memory_order_release, scope);
        __builtin_amdgcn_s_barrier();
        __atomic_work_item_fence(flags, __memory_order_acquire, scope);
    } else {
        __builtin_amdgcn_s_barrier();
    }
}

__device__
inline
static void __barrier(int n)
{
    __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
}

__device__
inline
__attribute__((convergent))
void __syncthreads()
{
    __barrier(__CLK_LOCAL_MEM_FENCE);
}

// hip.amdgcn.bc - device routine
/*
   HW_ID Register bit structure
   WAVE_ID     3:0     Wave buffer slot number. 0-9.
   SIMD_ID     5:4     SIMD which the wave is assigned to within the CU.
   PIPE_ID     7:6     Pipeline from which the wave was dispatched.
   CU_ID       11:8    Compute Unit the wave is assigned to.
   SH_ID       12      Shader Array (within an SE) the wave is assigned to.
   SE_ID       14:13   Shader Engine the wave is assigned to.
   TG_ID       19:16   Thread-group ID
   VM_ID       23:20   Virtual Memory ID
   QUEUE_ID    26:24   Queue from which this wave was dispatched.
   STATE_ID    29:27   State ID (graphics only, not compute).
   ME_ID       31:30   Micro-engine ID.
 */

#define HW_ID 4

#define HW_ID_CU_ID_SIZE 4
#define HW_ID_CU_ID_OFFSET 8

#define HW_ID_SE_ID_SIZE 2
#define HW_ID_SE_ID_OFFSET 13

/*
   Encoding of parameter bitmask
   HW_ID        5:0     HW_ID
   OFFSET       10:6    Range: 0..31
   SIZE         15:11   Range: 1..32
 */

#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))

/*
  __smid returns the wave's assigned Compute Unit and Shader Engine.
  The Compute Unit, CU_ID, is returned in bits 3:0, and the Shader Engine,
  SE_ID, in bits 5:4. Note: the results vary over time.
  SZ minus 1 since SIZE is 1-based.
*/
__device__
inline
unsigned __smid(void)
{
    unsigned cu_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_CU_ID_SIZE-1, HW_ID_CU_ID_OFFSET, HW_ID));
    unsigned se_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));

    /* Each shader engine has 16 CU */
    return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
}
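
/*
 Illustrative example (not part of the header): tag each block with the
 CU/SE it ran on; as noted above, the value can change over a kernel's
 lifetime.

     __global__ void where_am_i(unsigned* out) {
         if (threadIdx.x == 0)
             out[blockIdx.x] = __smid();
     }
*/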

#pragma pop_macro("__DEVICE__")

// Macro to replace extern __shared__ declarations
// with local variable definitions
#define HIP_DYNAMIC_SHARED(type, var) \
    type* var = (type*)__amdgcn_get_dynamicgroupbaseptr();

#define HIP_DYNAMIC_SHARED_ATTRIBUTE


#endif //defined(__clang__) && defined(__HIP__)
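
/*
 Illustrative example (not part of the header): HIP_DYNAMIC_SHARED replaces
 CUDA's `extern __shared__ T buf[];` declaration for dynamically sized LDS.

     __global__ void scale(float* out, float f) {
         HIP_DYNAMIC_SHARED(float, buf)  // size fixed at launch time
         buf[threadIdx.x] = f * threadIdx.x;
         __syncthreads();
         out[threadIdx.x] = buf[threadIdx.x];
     }

 launched as, e.g., hipLaunchKernelGGL(scale, dim3(1), dim3(64),
 64 * sizeof(float), 0, out_ptr, 2.0f);
*/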


// loop unrolling
static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);
    auto srcPtr = static_cast<const unsigned char*>(src);

    while (size >= 4u) {
        dstPtr[0] = srcPtr[0];
        dstPtr[1] = srcPtr[1];
        dstPtr[2] = srcPtr[2];
        dstPtr[3] = srcPtr[3];

        size -= 4u;
        srcPtr += 4u;
        dstPtr += 4u;
    }
    switch (size) {
        case 3:
            dstPtr[2] = srcPtr[2];
        case 2:
            dstPtr[1] = srcPtr[1];
        case 1:
            dstPtr[0] = srcPtr[0];
    }

    return dst;
}

static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);

    while (size >= 4u) {
        dstPtr[0] = val;
        dstPtr[1] = val;
        dstPtr[2] = val;
        dstPtr[3] = val;

        size -= 4u;
        dstPtr += 4u;
    }
    switch (size) {
        case 3:
            dstPtr[2] = val;
        case 2:
            dstPtr[1] = val;
        case 1:
            dstPtr[0] = val;
    }

    return dst;
}
static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
    return __hip_hc_memcpy(dst, src, size);
}

static inline __device__ void* memset(void* ptr, int val, size_t size) {
    unsigned char val8 = static_cast<unsigned char>(val);
    return __hip_hc_memset(ptr, val8, size);
}

#endif