HIP: Heterogeneous-computing Interface for Portability
device_functions.h
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
#define HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H

#include "host_defines.h"
#include "math_fwd.h"

#include <hip/hip_runtime_api.h>
#include <stddef.h>

#include <hip/hip_vector_types.h>
#if __HIP_CLANG_ONLY__ && __HIP_ROCclr__ && !_WIN32
extern "C" __device__ int printf(const char *fmt, ...);
#else
#if HC_FEATURE_PRINTF
template <typename... All>
static inline __device__ void printf(const char* format, All... all) {
    hc::printf(format, all...);
}
#else
template <typename... All>
static inline __device__ void printf(const char* format, All... all) {}
#endif // HC_FEATURE_PRINTF
#endif // __HIP_CLANG_ONLY__ && __HIP_ROCclr__ && !_WIN32

/*
Integer Intrinsics
*/

// Integer intrinsic functions: __popc, __clz, __ffs, __brev
__device__ static inline unsigned int __popc(unsigned int input) {
    return __builtin_popcount(input);
}
__device__ static inline unsigned int __popcll(unsigned long long int input) {
    return __builtin_popcountll(input);
}

__device__ static inline int __clz(int input) {
    return __ockl_clz_u32((uint)input);
}

__device__ static inline int __clzll(long long int input) {
    return __ockl_clz_u64((ullong)input);
}

__device__ static inline unsigned int __ffs(unsigned int input) {
    return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}

__device__ static inline unsigned int __ffsll(unsigned long long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}

__device__ static inline unsigned int __ffs(int input) {
    return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}

__device__ static inline unsigned int __ffsll(long long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}

__device__ static inline unsigned int __brev(unsigned int input) {
    return __builtin_bitreverse32(input);
}

__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
    return __builtin_bitreverse64(input);
}
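
/*
 Illustrative values for the intrinsics above (a sketch, not part of the
 original header):

   __popc(0x0000000Fu)  == 4            // four set bits
   __clz(1)             == 31           // leading zeros in a 32-bit int
   __ffs(0x00000008u)   == 4            // 1-based position of LSB; __ffs(0) == 0
   __brev(0x00000001u)  == 0x80000000u  // bit order reversed
*/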

__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
    // __builtin_ctzll matches the 64-bit operand on all targets
    // (__builtin_ctzl takes a long, which is only 32 bits on LLP64 platforms).
    return input == 0 ? -1 : __builtin_ctzll(input);
}

__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
    uint32_t offset = src1 & 31;
    uint32_t width = src2 & 31;
    return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
}

__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
    uint64_t offset = src1 & 63;
    uint64_t width = src2 & 63;
    return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
}

__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
    uint32_t offset = src2 & 31;
    uint32_t width = src3 & 31;
    uint32_t mask = (1u << width) - 1;  // unsigned shift avoids signed overflow when width == 31
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}

__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
    uint64_t offset = src2 & 63;
    uint64_t width = src3 & 63;
    uint64_t mask = (1ULL << width) - 1;
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}
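
/*
 Example semantics (sketch): __bitextract returns the 'width'-bit field of
 src0 starting at bit 'offset'; __bitinsert writes the low 'width' bits of
 src1 into src0 at 'offset'.

   __bitextract_u32(0x12345678u, 8, 8)       == 0x56u
   __bitinsert_u32(0x12345678u, 0xAAu, 8, 8) == 0x1234AA78u
*/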

__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
__device__ static unsigned int __hadd(int x, int y);
__device__ static int __mul24(int x, int y);
__device__ static long long int __mul64hi(long long int x, long long int y);
__device__ static int __mulhi(int x, int y);
__device__ static int __rhadd(int x, int y);
__device__ static unsigned int __sad(int x, int y, unsigned int z);
__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
__device__ static int __umul24(unsigned int x, unsigned int y);
__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);

struct ucharHolder {
    union {
        unsigned char c[4];
        unsigned int ui;
    };
} __attribute__((aligned(4)));

struct uchar2Holder {
    union {
        unsigned int ui[2];
        unsigned char c[8];
    };
} __attribute__((aligned(8)));

__device__
static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
    struct uchar2Holder cHoldVal;
    struct ucharHolder cHoldKey;
    struct ucharHolder cHoldOut;
    cHoldKey.ui = s;
    cHoldVal.ui[0] = x;
    cHoldVal.ui[1] = y;
    cHoldOut.c[0] = cHoldVal.c[cHoldKey.c[0]];
    cHoldOut.c[1] = cHoldVal.c[cHoldKey.c[1]];
    cHoldOut.c[2] = cHoldVal.c[cHoldKey.c[2]];
    cHoldOut.c[3] = cHoldVal.c[cHoldKey.c[3]];
    return cHoldOut.ui;
}
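
/*
 Example (sketch): each byte of the selector s holds an index 0-7 into the
 eight source bytes, 0-3 from x (least significant first) and 4-7 from y.
 Note this implementation consumes one selector byte, not a 4-bit nibble,
 per output byte.

   __byte_perm(0x11223344u, 0x55667788u, 0x00010203u) == 0x44332211u  // x reversed
   __byte_perm(0x11223344u, 0x55667788u, 0x07060504u) == 0x55667788u  // y
*/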

__device__ static inline unsigned int __hadd(int x, int y) {
    // Average rounded toward negative infinity, i.e. (x + y) >> 1 computed
    // without overflowing the intermediate sum.
    return (x >> 1) + (y >> 1) + (x & y & 1);
}

__device__ static inline int __mul24(int x, int y) {
    return __ockl_mul24_i32(x, y);
}

__device__ static inline long long __mul64hi(long long int x, long long int y) {
    ulong x0 = (ulong)x & 0xffffffffUL;
    long x1 = x >> 32;
    ulong y0 = (ulong)y & 0xffffffffUL;
    long y1 = y >> 32;
    ulong z0 = x0*y0;
    long t = x1*y0 + (z0 >> 32);
    long z1 = t & 0xffffffffL;
    long z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}

__device__ static inline int __mulhi(int x, int y) {
    return __ockl_mul_hi_i32(x, y);
}

__device__ static inline int __rhadd(int x, int y) {
    // Average rounded toward positive infinity, i.e. (x + y + 1) >> 1,
    // again without overflowing the intermediate sum.
    return (x >> 1) + (y >> 1) + ((x | y) & 1);
}
__device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
    return x > y ? x - y + z : y - x + z;
}
__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
    // (x + y) >> 1 without losing the carry when x + y wraps past 2^32.
    return (x >> 1) + (y >> 1) + (x & y & 1);
}
__device__ static inline int __umul24(unsigned int x, unsigned int y) {
    return __ockl_mul24_u32(x, y);
}

__device__
static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
    ulong x0 = x & 0xffffffffUL;
    ulong x1 = x >> 32;
    ulong y0 = y & 0xffffffffUL;
    ulong y1 = y >> 32;
    ulong z0 = x0*y0;
    ulong t = x1*y0 + (z0 >> 32);
    ulong z1 = t & 0xffffffffUL;
    ulong z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}

__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
    return __ockl_mul_hi_u32(x, y);
}
__device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
    // (x + y + 1) >> 1 without losing the carry.
    return (x >> 1) + (y >> 1) + ((x | y) & 1);
}
__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
    return __ockl_sadd_u32(x, y, z);
}

__device__ static inline unsigned int __lane_id() {
    return __builtin_amdgcn_mbcnt_hi(
        -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
}

__device__
static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) { return __builtin_amdgcn_mbcnt_lo(x, y); }

__device__
static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) { return __builtin_amdgcn_mbcnt_hi(x, y); }

/*
HIP specific device functions
*/

__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
    return tmp.u;
}

__device__ static inline float __hip_ds_bpermutef(int index, float src) {
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
    tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
    return tmp.f;
}

__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
    return tmp.u;
}

__device__ static inline float __hip_ds_permutef(int index, float src) {
    // Move the float's bit pattern through the union (the original read and
    // returned tmp.u, which converted the value instead of moving its bits).
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
    tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
    return tmp.f;
}
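
/*
 Sketch of the two data-share permute directions; the index is a byte
 address, hence the index << 2 used by the __shfl family below.

   unsigned got  = __hip_ds_bpermute(src_lane << 2, x);  // "pull" x from src_lane
   unsigned sent = __hip_ds_permute(dst_lane << 2, x);   // "push" x to dst_lane
*/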

#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))

template <int pattern>
__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
#if defined(__HCC__)
    tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
#else
    tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
#endif
    return tmp.u;
}

template <int pattern>
__device__ static inline float __hip_ds_swizzlef_N(float src) {
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
#if defined(__HCC__)
    tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
#else
    tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
#endif
    return tmp.f;
}

#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
    __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))

template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
__device__ static inline int __hip_move_dpp_N(int src) {
    return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
                                    bound_ctrl);
}

static constexpr int warpSize = 64;

__device__
inline
int __shfl(int var, int src_lane, int width = warpSize) {
    int self = __lane_id();
    int index = src_lane + (self & ~(width-1));
    return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl(tmp.i, src_lane, width);
    return tmp.u;
}
__device__
inline
float __shfl(float var, int src_lane, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl(tmp.i, src_lane, width);
    return tmp.f;
}
__device__
inline
double __shfl(double var, int src_lane, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
long __shfl(long var, int src_lane, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
    #endif
}
__device__
inline
long long __shfl(long long var, int src_lane, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
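
/*
 Usage sketch (illustrative, not part of the original header): broadcast
 lane 0's value to the whole wavefront with the int overload above.

   __device__ int wave_bcast(int v) {
       return __shfl(v, 0);   // every lane receives lane 0's v
   }
*/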

__device__
inline
int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
    int self = __lane_id();
    int index = self - lane_delta;
    index = (index < (self & ~(width-1))) ? self : index;
    return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_up(tmp.i, lane_delta, width);
    return tmp.u;
}
__device__
inline
float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_up(tmp.i, lane_delta, width);
    return tmp.f;
}
__device__
inline
double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
    #endif
}
__device__
inline
long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");
    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}

__device__
inline
int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
    int self = __lane_id();
    int index = self + lane_delta;
    index = (int)((self&(width-1))+lane_delta) >= width ? self : index;
    return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_down(tmp.i, lane_delta, width);
    return tmp.u;
}
__device__
inline
float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_down(tmp.i, lane_delta, width);
    return tmp.f;
}
__device__
inline
double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
    #endif
}
__device__
inline
long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");
    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
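
/*
 Usage sketch (illustrative): wavefront-wide sum reduction built on
 __shfl_down; after the loop, lane 0 holds the sum over all 64 lanes.

   __device__ int wave_reduce_sum(int v) {
       for (int offset = warpSize / 2; offset > 0; offset /= 2)
           v += __shfl_down(v, offset);
       return v;   // meaningful in lane 0
   }
*/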

__device__
inline
int __shfl_xor(int var, int lane_mask, int width = warpSize) {
    int self = __lane_id();
    int index = self^lane_mask;
    index = index >= ((self+width)&~(width-1)) ? self : index;
    return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
    return tmp.u;
}
__device__
inline
float __shfl_xor(float var, int lane_mask, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
    return tmp.f;
}
__device__
inline
double __shfl_xor(double var, int lane_mask, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
long __shfl_xor(long var, int lane_mask, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
    #endif
}
__device__
inline
long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");
    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
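
/*
 Usage sketch (illustrative): the XOR variant gives a butterfly pattern, so
 every lane, not just lane 0, ends up with the full reduction.

   __device__ int wave_allreduce_sum(int v) {
       for (int mask = warpSize / 2; mask > 0; mask /= 2)
           v += __shfl_xor(v, mask);
       return v;   // identical in every lane
   }
*/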

#define MASK1 0x00ff00ff
#define MASK2 0xff00ff00

__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 + one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 + one2) & MASK2);
    return out;
}

__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 - one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 - one2) & MASK2);
    return out;
}

__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 * one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 * one2) & MASK2);
    return out;
}

/*
 * Rounding modes are not yet supported in HIP.
 * TODO: These conversion functions are not correct; fix when the backend is ready.
*/

__device__ static inline float __double2float_rd(double x) { return (float)x; }
__device__ static inline float __double2float_rn(double x) { return (float)x; }
__device__ static inline float __double2float_ru(double x) { return (float)x; }
__device__ static inline float __double2float_rz(double x) { return (float)x; }

__device__ static inline int __double2hiint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[1];
}
__device__ static inline int __double2loint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[0];
}

__device__ static inline int __double2int_rd(double x) { return (int)x; }
__device__ static inline int __double2int_rn(double x) { return (int)x; }
__device__ static inline int __double2int_ru(double x) { return (int)x; }
__device__ static inline int __double2int_rz(double x) { return (int)x; }

__device__ static inline long long int __double2ll_rd(double x) { return (long long int)x; }
__device__ static inline long long int __double2ll_rn(double x) { return (long long int)x; }
__device__ static inline long long int __double2ll_ru(double x) { return (long long int)x; }
__device__ static inline long long int __double2ll_rz(double x) { return (long long int)x; }

__device__ static inline unsigned int __double2uint_rd(double x) { return (unsigned int)x; }
__device__ static inline unsigned int __double2uint_rn(double x) { return (unsigned int)x; }
__device__ static inline unsigned int __double2uint_ru(double x) { return (unsigned int)x; }
__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }

__device__ static inline unsigned long long int __double2ull_rd(double x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __double2ull_rn(double x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __double2ull_ru(double x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __double2ull_rz(double x) {
    return (unsigned long long int)x;
}

__device__ static inline long long int __double_as_longlong(double x) {
    static_assert(sizeof(long long) == sizeof(double), "");

    long long tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

/*
__device__ unsigned short __float2half_rn(float x);
__device__ float __half2float(unsigned short);

The above device functions are not valid in HIP. Use

__device__ __half __float2half_rn(float x);
__device__ float __half2float(__half);

from hip_fp16.h instead.

CUDA implements half as unsigned short, whereas HIP does not.
*/

__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }

__device__ static inline long long int __float2ll_rd(float x) { return (long long int)x; }
__device__ static inline long long int __float2ll_rn(float x) { return (long long int)x; }
__device__ static inline long long int __float2ll_ru(float x) { return (long long int)x; }
__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }

__device__ static inline unsigned int __float2uint_rd(float x) { return (unsigned int)x; }
__device__ static inline unsigned int __float2uint_rn(float x) { return (unsigned int)x; }
__device__ static inline unsigned int __float2uint_ru(float x) { return (unsigned int)x; }
__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }

__device__ static inline unsigned long long int __float2ull_rd(float x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __float2ull_rn(float x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __float2ull_ru(float x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __float2ull_rz(float x) {
    return (unsigned long long int)x;
}

__device__ static inline int __float_as_int(float x) {
    static_assert(sizeof(int) == sizeof(float), "");

    int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__ static inline unsigned int __float_as_uint(float x) {
    static_assert(sizeof(unsigned int) == sizeof(float), "");

    unsigned int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__ static inline double __hiloint2double(int hi, int lo) {
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    double tmp1;
    __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));

    return tmp1;
}
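
/*
 Round-trip sketch (illustrative): __double2hiint/__double2loint split a
 double into its 32-bit halves and __hiloint2double reassembles it.

   double d = 3.5;
   int hi = __double2hiint(d);
   int lo = __double2loint(d);
   // __hiloint2double(hi, lo) == 3.5 again, bit for bit
*/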

__device__ static inline double __int2double_rn(int x) { return (double)x; }

__device__ static inline float __int2float_rd(int x) { return (float)x; }
__device__ static inline float __int2float_rn(int x) { return (float)x; }
__device__ static inline float __int2float_ru(int x) { return (float)x; }
__device__ static inline float __int2float_rz(int x) { return (float)x; }

__device__ static inline float __int_as_float(int x) {
    static_assert(sizeof(float) == sizeof(int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__ static inline double __ll2double_rd(long long int x) { return (double)x; }
__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
__device__ static inline double __ll2double_ru(long long int x) { return (double)x; }
__device__ static inline double __ll2double_rz(long long int x) { return (double)x; }

__device__ static inline float __ll2float_rd(long long int x) { return (float)x; }
__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
__device__ static inline float __ll2float_ru(long long int x) { return (float)x; }
__device__ static inline float __ll2float_rz(long long int x) { return (float)x; }

__device__ static inline double __longlong_as_double(long long int x) {
    static_assert(sizeof(double) == sizeof(long long), "");

    double tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }

__device__ static inline float __uint2float_rd(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_ru(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_rz(unsigned int x) { return (float)x; }

__device__ static inline float __uint_as_float(unsigned int x) {
    static_assert(sizeof(float) == sizeof(unsigned int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__ static inline double __ull2double_rd(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_ru(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_rz(unsigned long long int x) { return (double)x; }

__device__ static inline float __ull2float_rd(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_ru(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_rz(unsigned long long int x) { return (float)x; }

#if defined(__HCC__)
#define __HCC_OR_HIP_CLANG__ 1
#elif defined(__clang__) && defined(__HIP__)
#define __HCC_OR_HIP_CLANG__ 1
#else
#define __HCC_OR_HIP_CLANG__ 0
#endif

#if __HCC_OR_HIP_CLANG__  // #ifdef would always hold: the macro is defined as 0 or 1

// Clock functions
__device__ long long int __clock64();
__device__ long long int __clock();
__device__ long long int clock64();
__device__ long long int clock();
// hip.amdgcn.bc - named sync
__device__ void __named_sync(int a, int b);

#ifdef __HIP_DEVICE_COMPILE__

// Clock functions
#if __HCC__
extern "C" uint64_t __clock_u64() __HC__;
#endif

__device__
inline __attribute__((always_inline))
long long int __clock64() {
    return (long long int) __builtin_readcyclecounter();
}

__device__
inline __attribute__((always_inline))
long long int __clock() { return __clock64(); }

__device__
inline __attribute__((always_inline))
long long int clock64() { return __clock64(); }

__device__
inline __attribute__((always_inline))
long long int clock() { return __clock(); }

// hip.amdgcn.bc - named sync
__device__
inline
void __named_sync(int a, int b) { __builtin_amdgcn_s_barrier(); }

#endif // __HIP_DEVICE_COMPILE__

// Warp vote functions: __all, __any, __ballot
__device__
inline
int __all(int predicate) {
    return __ockl_wfall_i32(predicate);
}

__device__
inline
int __any(int predicate) {
    return __ockl_wfany_i32(predicate);
}

// XXX from llvm/include/llvm/IR/InstrTypes.h
#define ICMP_NE 33

__device__
inline
unsigned long long int __ballot(int predicate) {
    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}

__device__
inline
unsigned long long int __ballot64(int predicate) {
    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}
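
/*
 Usage sketch (illustrative): combine __ballot with __popcll to count the
 lanes of the wavefront for which a predicate holds.

   unsigned long long vote = __ballot(threadIdx.x % 2 == 0);
   unsigned int even_lanes = __popcll(vote);   // one mask bit per passing lane
*/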

// hip.amdgcn.bc - lanemask
__device__
inline
uint64_t __lanemask_gt()
{
    uint32_t lane = __ockl_lane_u32();
    if (lane == 63)
        return 0;
    uint64_t ballot = __ballot64(1);
    uint64_t mask = (~((uint64_t)0)) << (lane + 1);
    return mask & ballot;
}

__device__
inline
uint64_t __lanemask_lt()
{
    uint32_t lane = __ockl_lane_u32();
    uint64_t ballot = __ballot64(1);
    uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
    return mask & ballot;
}

__device__
inline
uint64_t __lanemask_eq()
{
    uint32_t lane = __ockl_lane_u32();
    uint64_t mask = ((uint64_t)1 << lane);
    return mask;
}
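
/*
 Invariants of the lanemask helpers above (sketch, active lanes only):

   __lanemask_eq() == (uint64_t)1 << __lane_id();
   // __lanemask_lt() | __lanemask_eq() | __lanemask_gt() covers every
   // active lane exactly once.
*/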


__device__ inline void* __local_to_generic(void* p) { return p; }

#ifdef __HIP_DEVICE_COMPILE__
__device__
inline
void* __get_dynamicgroupbaseptr()
{
    // Get group segment base pointer.
    return (char*)__local_to_generic((void*)__to_local(__llvm_amdgcn_groupstaticsize()));
}
#else
__device__
void* __get_dynamicgroupbaseptr();
#endif // __HIP_DEVICE_COMPILE__

__device__
inline
void *__amdgcn_get_dynamicgroupbaseptr() {
    return __get_dynamicgroupbaseptr();
}

#if defined(__HCC__) && (__hcc_major__ < 3) && (__hcc_minor__ < 3)
// hip.amdgcn.bc - sync threads
#define __CLK_LOCAL_MEM_FENCE 0x01
typedef unsigned __cl_mem_fence_flags;

typedef enum __memory_scope {
    __memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
    __memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
    __memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
    __memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
    __memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
} __memory_scope;

// enum values aligned with what clang uses in EmitAtomicExpr()
typedef enum __memory_order
{
    __memory_order_relaxed = __ATOMIC_RELAXED,
    __memory_order_acquire = __ATOMIC_ACQUIRE,
    __memory_order_release = __ATOMIC_RELEASE,
    __memory_order_acq_rel = __ATOMIC_ACQ_REL,
    __memory_order_seq_cst = __ATOMIC_SEQ_CST
} __memory_order;

__device__
inline
static void
__atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __memory_scope scope)
{
    // We're tying global-happens-before and local-happens-before together, as HSA does
    if (order != __memory_order_relaxed) {
        switch (scope) {
        case __memory_scope_work_item:
            break;
        case __memory_scope_sub_group:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_sg(); break;
            case __memory_order_release: __llvm_fence_rel_sg(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_sg(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_sg(); break;
            }
            break;
        case __memory_scope_work_group:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_wg(); break;
            case __memory_order_release: __llvm_fence_rel_wg(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_wg(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_wg(); break;
            }
            break;
        case __memory_scope_device:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_dev(); break;
            case __memory_order_release: __llvm_fence_rel_dev(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_dev(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_dev(); break;
            }
            break;
        case __memory_scope_all_svm_devices:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_sys(); break;
            case __memory_order_release: __llvm_fence_rel_sys(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_sys(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_sys(); break;
            }
            break;
        }
    }
}
#endif

// Memory Fence Functions
__device__
inline
static void __threadfence()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
}

__device__
inline
static void __threadfence_block()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
}

__device__
inline
static void __threadfence_system()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
}

// abort
__device__
inline
__attribute__((weak))
void abort() {
    __builtin_trap();
}

#endif // __HCC_OR_HIP_CLANG__

#ifdef __HCC__

// Macro to replace extern __shared__ declarations
// with local variable definitions
#define HIP_DYNAMIC_SHARED(type, var) type* var = (type*)__get_dynamicgroupbaseptr();

#define HIP_DYNAMIC_SHARED_ATTRIBUTE

#elif defined(__clang__) && defined(__HIP__)

#pragma push_macro("__DEVICE__")
#define __DEVICE__ extern "C" __device__ __attribute__((always_inline)) \
    __attribute__((weak))

__DEVICE__
inline
void __assert_fail(const char * __assertion,
                   const char *__file,
                   unsigned int __line,
                   const char *__function)
{
    printf("%s:%u: %s: Device-side assertion `%s' failed.\n", __file, __line,
           __function, __assertion);
    __builtin_trap();
}

__DEVICE__
inline
void __assertfail(const char * __assertion,
                  const char *__file,
                  unsigned int __line,
                  const char *__function,
                  size_t charsize)
{
    // Ignore all the args for now.
    __builtin_trap();
}

__device__
inline
static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
{
    if (flags) {
        __atomic_work_item_fence(flags, __memory_order_release, scope);
        __builtin_amdgcn_s_barrier();
        __atomic_work_item_fence(flags, __memory_order_acquire, scope);
    } else {
        __builtin_amdgcn_s_barrier();
    }
}

__device__
inline
static void __barrier(int n)
{
    __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
}

__device__
inline
__attribute__((convergent))
void __syncthreads()
{
    __barrier(__CLK_LOCAL_MEM_FENCE);
}

__device__
inline
__attribute__((convergent))
int __syncthreads_count(int predicate)
{
    return __ockl_wgred_add_i32(!!predicate);
}

__device__
inline
__attribute__((convergent))
int __syncthreads_and(int predicate)
{
    return __ockl_wgred_and_i32(!!predicate);
}

__device__
inline
__attribute__((convergent))
int __syncthreads_or(int predicate)
{
    return __ockl_wgred_or_i32(!!predicate);
}
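
/*
 Usage sketch (illustrative): block-wide vote. __syncthreads_count returns,
 to every thread, how many threads of the block passed a non-zero predicate.

   int ready = (data[threadIdx.x] != 0);       // 'data' is hypothetical
   int n_ready = __syncthreads_count(ready);
   if (n_ready == blockDim.x) {
       // every thread in the block saw non-zero data
   }
*/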

// hip.amdgcn.bc - device routine
/*
  HW_ID Register bit structure
  WAVE_ID  3:0   Wave buffer slot number. 0-9.
  SIMD_ID  5:4   SIMD which the wave is assigned to within the CU.
  PIPE_ID  7:6   Pipeline from which the wave was dispatched.
  CU_ID    11:8  Compute Unit the wave is assigned to.
  SH_ID    12    Shader Array (within an SE) the wave is assigned to.
  SE_ID    14:13 Shader Engine the wave is assigned to.
  TG_ID    19:16 Thread-group ID
  VM_ID    23:20 Virtual Memory ID
  QUEUE_ID 26:24 Queue from which this wave was dispatched.
  STATE_ID 29:27 State ID (graphics only, not compute).
  ME_ID    31:30 Micro-engine ID.
*/

#define HW_ID 4

#define HW_ID_CU_ID_SIZE 4
#define HW_ID_CU_ID_OFFSET 8

#define HW_ID_SE_ID_SIZE 2
#define HW_ID_SE_ID_OFFSET 13

/*
  Encoding of parameter bitmask
  HW_ID  5:0   HW_ID
  OFFSET 10:6  Range: 0..31
  SIZE   15:11 Range: 1..32
*/

#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))

/*
  __smid returns the wave's assigned Compute Unit and Shader Engine.
  The Compute Unit, CU_ID, is returned in bits 3:0, and the Shader Engine,
  SE_ID, in bits 5:4. Note: the results vary over time.
  SZ minus 1 since SIZE is 1-based.
*/
__device__
inline
unsigned __smid(void)
{
    unsigned cu_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_CU_ID_SIZE-1, HW_ID_CU_ID_OFFSET, HW_ID));
    unsigned se_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));

    /* Each shader engine has 16 CU */
    return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
}
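
/*
 Usage sketch (illustrative): label output with the producing compute unit.
 As noted above, the assignment can change over time, so __smid is only a
 momentary tag, useful for occupancy and scheduling experiments.

   unsigned id = __smid();   // SE_ID in bits 5:4, CU_ID in bits 3:0
   printf("block %d ran on CU %u of SE %u\n",
          blockIdx.x, id & 0xFu, id >> HW_ID_CU_ID_SIZE);
*/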

#pragma pop_macro("__DEVICE__")  // restore __DEVICE__ pushed above

// Macro to replace extern __shared__ declarations
// with local variable definitions
#define HIP_DYNAMIC_SHARED(type, var) \
    type* var = (type*)__amdgcn_get_dynamicgroupbaseptr();

#define HIP_DYNAMIC_SHARED_ATTRIBUTE
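
/*
 Usage sketch (illustrative): HIP_DYNAMIC_SHARED stands in for CUDA's
 "extern __shared__" declaration; the actual size comes from the launch.

   __global__ void scale(float* out, float k) {
       HIP_DYNAMIC_SHARED(float, buf)          // float* buf -> dynamic LDS
       buf[threadIdx.x] = k * threadIdx.x;
       __syncthreads();
       out[threadIdx.x] = buf[threadIdx.x];
   }

   // hipLaunchKernelGGL(scale, grid, block, bytes_of_LDS, stream, out, k);
*/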


#endif // defined(__clang__) && defined(__HIP__)

// Loop-unrolled byte copy; the trailing switch intentionally falls through.
static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);
    auto srcPtr = static_cast<const unsigned char*>(src);

    while (size >= 4u) {
        dstPtr[0] = srcPtr[0];
        dstPtr[1] = srcPtr[1];
        dstPtr[2] = srcPtr[2];
        dstPtr[3] = srcPtr[3];

        size -= 4u;
        srcPtr += 4u;
        dstPtr += 4u;
    }
    switch (size) {
    case 3:
        dstPtr[2] = srcPtr[2];
        // fall through
    case 2:
        dstPtr[1] = srcPtr[1];
        // fall through
    case 1:
        dstPtr[0] = srcPtr[0];
    }

    return dst;
}

static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);

    while (size >= 4u) {
        dstPtr[0] = val;
        dstPtr[1] = val;
        dstPtr[2] = val;
        dstPtr[3] = val;

        size -= 4u;
        dstPtr += 4u;
    }
    switch (size) {
    case 3:
        dstPtr[2] = val;
        // fall through
    case 2:
        dstPtr[1] = val;
        // fall through
    case 1:
        dstPtr[0] = val;
    }

    return dst;
}
static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
    return __hip_hc_memcpy(dst, src, size);
}

static inline __device__ void* memset(void* ptr, int val, size_t size) {
    unsigned char val8 = static_cast<unsigned char>(val);
    return __hip_hc_memset(ptr, val8, size);
}

#endif // HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H