HIP: Heterogeneous-computing Interface for Portability
device_functions.h
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
23 #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
24 #define HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
25 
26 #include "host_defines.h"
27 #include "math_fwd.h"
28 
29 #include <hip/hip_runtime_api.h>
30 #include <stddef.h>
31 
32 
33 #include <hip/hip_vector_types.h>
36 /*
37 Integer Intrinsics
38 */
39 
// integer intrinsic functions: __popc __clz __ffs __brev
// Number of set bits in a 32-bit value.
__device__ static inline unsigned int __popc(unsigned int input) {
    return __builtin_popcount(input);
}
// Number of set bits in a 64-bit value.
__device__ static inline unsigned int __popcll(unsigned long long int input) {
    return __builtin_popcountll(input);
}

// Count of leading zero bits. NOTE(review): the result for input == 0 is
// whatever __ockl_clz_u32 returns (expected 32, matching CUDA __clz) --
// confirm against the OCKL library before relying on the zero case.
__device__ static inline int __clz(int input) {
    return __ockl_clz_u32((uint)input);
}

__device__ static inline int __clzll(long long int input) {
    return __ockl_clz_u64((ullong)input);
}

// Position of the least-significant set bit, 1-based; returns 0 for input 0
// (the ternary's -1 plus the trailing +1 collapses to 0 in that case).
__device__ static inline unsigned int __ffs(unsigned int input) {
    return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}

__device__ static inline unsigned int __ffsll(unsigned long long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}

// Signed overloads mirror the unsigned ones above.
__device__ static inline unsigned int __ffs(int input) {
    return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}

__device__ static inline unsigned int __ffsll(long long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}

// Bit reversal of a 32-bit value.
__device__ static inline unsigned int __brev(unsigned int input) {
    return __llvm_bitrev_b32(input);
}

// Bit reversal of a 64-bit value.
__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
    return __llvm_bitrev_b64(input);
}
79 
// Zero-based index of the least-significant set bit of a 64-bit value, or
// (unsigned)-1 when input == 0.
// Fix: use __builtin_ctzll. __builtin_ctzl operates on 'long', which is only
// 32 bits on LLP64 targets (e.g. Windows), silently truncating the argument.
__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
    return input == 0 ? -1 : __builtin_ctzll(input);
}
83 
// Extract 'width' bits of src0 starting at bit 'offset' (HSAIL-style
// bitextract). Offset and width are taken modulo the word size.
// NOTE(review): when offset + width exceeds 32 the left shift below is out of
// range (undefined behaviour); callers must pass fields that fit in the word.
__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
    uint32_t offset = src1 & 31;
    uint32_t width = src2 & 31;
    return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
}

// 64-bit variant; same contract and same offset + width caveat (modulo 64).
__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
    uint64_t offset = src1 & 63;
    uint64_t width = src2 & 63;
    return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
}
95 
// Insert the low 'width' bits of src1 into src0 at bit 'offset' (HSAIL-style
// bitinsert). Offset and width are taken modulo the word size.
// Fix: build the 32-bit mask from an unsigned literal -- '1 << 31' on a
// signed int is undefined behaviour (signed overflow); '1u << 31' is not.
__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
    uint32_t offset = src2 & 31;
    uint32_t width = src3 & 31;
    uint32_t mask = (1u << width) - 1u;
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}

// 64-bit variant (the mask literal here was already unsigned).
__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
    uint64_t offset = src2 & 63;
    uint64_t width = src3 & 63;
    uint64_t mask = (1ULL << width) - 1;
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}
109 
// Forward declarations of the integer helpers defined later in this header.
__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
__device__ static unsigned int __hadd(int x, int y);
__device__ static int __mul24(int x, int y);
__device__ static long long int __mul64hi(long long int x, long long int y);
__device__ static int __mulhi(int x, int y);
__device__ static int __rhadd(int x, int y);
__device__ static unsigned int __sad(int x, int y, int z);
__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
__device__ static int __umul24(unsigned int x, unsigned int y);
__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);

// Byte-lane view of one 32-bit word: c[0..3] alias the bytes of ui.
struct ucharHolder {
    union {
        unsigned char c[4];
        unsigned int ui;
    };
} __attribute__((aligned(4)));

// Byte-lane view of two 32-bit words: c[0..7] alias the bytes of ui[0..1].
struct uchar2Holder {
    union {
        unsigned int ui[2];
        unsigned char c[8];
    };
} __attribute__((aligned(8)));
137 
// CUDA-style __byte_perm: builds a 32-bit result whose byte i is picked from
// the 8-byte pool {x, y} by the i-th 4-bit selector in s (s[3:0] selects byte
// 0 of the result, s[7:4] byte 1, s[11:8] byte 2, s[15:12] byte 3; only the
// low 3 bits of each selector choose a pool byte).
// Fix: the previous code indexed the 8-byte pool with whole selector *bytes*
// (range 0..255), reading out of bounds for any normally packed selector such
// as 0x3210. The msb-replication mode of each selector nibble (bit 3) is
// still not implemented.
__device__
static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
    struct uchar2Holder cHoldVal;
    struct ucharHolder cHoldOut;
    cHoldVal.ui[0] = x;
    cHoldVal.ui[1] = y;
    cHoldOut.c[0] = cHoldVal.c[(s) & 0x7];
    cHoldOut.c[1] = cHoldVal.c[(s >> 4) & 0x7];
    cHoldOut.c[2] = cHoldVal.c[(s >> 8) & 0x7];
    cHoldOut.c[3] = cHoldVal.c[(s >> 12) & 0x7];
    return cHoldOut.ui;
}
152 
// Signed halving add: floor((x + y) / 2) without overflowing the intermediate
// sum (CUDA __hadd semantics). The identity (x & y) + ((x ^ y) >> 1) keeps
// the common bits whole and adds half of the differing bits; the arithmetic
// right shift on the signed xor preserves the sign.
// Fix: the previous code used a sign mask of 0x8000000 (one zero short of
// 0x80000000) and a *logical* OR ('||'), so it returned only 0 or 1, and the
// direct x + y could overflow.
__device__ static inline unsigned int __hadd(int x, int y) {
    return (x & y) + ((x ^ y) >> 1);
}
159 
// 24-bit signed multiply: low 32 bits of the product of the low 24 bits.
__device__ static inline int __mul24(int x, int y) {
    return __ockl_mul24_i32(x, y);
}

// High 64 bits of the full 128-bit signed product, computed by schoolbook
// multiplication on 32-bit halves (x = x1:x0, y = y1:y0).
__device__ static inline long long __mul64hi(long long int x, long long int y) {
    ulong x0 = (ulong)x & 0xffffffffUL;
    long x1 = x >> 32;
    ulong y0 = (ulong)y & 0xffffffffUL;
    long y1 = y >> 32;
    ulong z0 = x0*y0;
    long t = x1*y0 + (z0 >> 32);
    long z1 = t & 0xffffffffL;
    long z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}

// High 32 bits of the 64-bit signed product.
__device__ static inline int __mulhi(int x, int y) {
    return __ockl_mul_hi_i32(x, y);
}
180 
// Signed rounded halving add: (x + y + 1) >> 1 without intermediate overflow
// (CUDA __rhadd semantics), via the identity (x | y) - ((x ^ y) >> 1).
// Fix: the previous code used a sign mask of 0x8000000 (one zero short of
// 0x80000000) and a *logical* OR ('||'), so it returned only 0 or 1, and the
// direct x + y + 1 could overflow.
__device__ static inline int __rhadd(int x, int y) {
    return (x | y) - ((x ^ y) >> 1);
}
// |x - y| + z on signed operands (sum of absolute difference).
__device__ static inline unsigned int __sad(int x, int y, int z) {
    return x > y ? x - y + z : y - x + z;
}
// Unsigned halving add: (x + y) >> 1 (CUDA __uhadd semantics).
// Fix: computed as (x & y) + ((x ^ y) >> 1) so the intermediate x + y cannot
// wrap and drop the 33rd (carry) bit for large operands.
__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
    return (x & y) + ((x ^ y) >> 1);
}
// 24-bit unsigned multiply: low 32 bits of the product of the low 24 bits.
// NOTE(review): CUDA declares __umul24 as returning unsigned int; this header
// declares int -- kept as-is to match the forward declaration above.
__device__ static inline int __umul24(unsigned int x, unsigned int y) {
    return __ockl_mul24_u32(x, y);
}
196 
// High 64 bits of the full 128-bit unsigned product, computed by schoolbook
// multiplication on 32-bit halves (x = x1:x0, y = y1:y0).
__device__
static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
    ulong x0 = x & 0xffffffffUL;
    ulong x1 = x >> 32;
    ulong y0 = y & 0xffffffffUL;
    ulong y1 = y >> 32;
    ulong z0 = x0*y0;
    ulong t = x1*y0 + (z0 >> 32);
    ulong z1 = t & 0xffffffffUL;
    ulong z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}
210 
// High 32 bits of the 64-bit unsigned product.
__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
    return __ockl_mul_hi_u32(x, y);
}
// Unsigned rounded halving add: (x + y + 1) >> 1 (CUDA __urhadd semantics).
// Fix: computed as (x | y) - ((x ^ y) >> 1) so the intermediate x + y + 1
// cannot wrap and lose the carry for large operands.
__device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
    return (x | y) - ((x ^ y) >> 1);
}
// |x - y| + z on unsigned operands.
__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
    return __ockl_sad_u32(x, y, z);
}
220 
// Zero-based lane id within the wavefront (counts earlier lanes via mbcnt).
__device__ static inline unsigned int __lane_id() { return __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); }

/*
HIP specific device functions
*/

// Backward ("pull") cross-lane shuffle of a 32-bit value; 'index' is passed
// straight through to ds_bpermute (callers supply the byte-scaled index).
__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
    return tmp.u;
}

// Float variant: bits travel through the union, no value conversion occurs.
__device__ static inline float __hip_ds_bpermutef(int index, float src) {
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
    tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
    return tmp.f;
}

// Forward ("push") cross-lane shuffle of a 32-bit value.
__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
    return tmp.u;
}
244 
// Forward ("push") cross-lane shuffle of a float value.
// Fix: the previous code assigned the float to the *unsigned* union member
// (a value conversion, not a bit copy) and returned tmp.u (converted back),
// corrupting the payload. Mirror __hip_ds_bpermutef: move the bits through
// the float member.
__device__ static inline float __hip_ds_permutef(int index, float src) {
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
    tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
    return tmp.f;
}
250 
// ds_swizzle requires its pattern as a compile-time immediate, so these
// macros forward to template functions carrying the pattern as a parameter.
#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))

// Wavefront lane swizzle of a 32-bit value with compile-time 'pattern'.
template <int pattern>
__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
#if defined(__HCC__)
    tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
#else
    tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
#endif
    return tmp.u;
}

// Float variant: bits travel through the union, no value conversion occurs.
template <int pattern>
__device__ static inline float __hip_ds_swizzlef_N(float src) {
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
#if defined(__HCC__)
    tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
#else
    tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
#endif
    return tmp.f;
}

// DPP move; all four controls must be compile-time immediates, hence the
// template indirection.
#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
  __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))

template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
__device__ static inline int __hip_move_dpp_N(int src) {
    return __llvm_amdgcn_move_dpp(src, dpp_ctrl, row_mask, bank_mask,
                                  bound_ctrl);
}
284 
// Wavefront size on AMD GCN hardware.
static constexpr int warpSize = 64;

// __shfl: every lane reads 'var' from lane 'src_lane' of its own
// 'width'-wide sub-group. 'width' must be a power of two.
__device__
inline
int __shfl(int var, int src_lane, int width = warpSize) {
    int self = __lane_id();
    // Offset the requested lane into this thread's sub-group.
    int index = src_lane + (self & ~(width-1));
    // ds_bpermute addresses lanes in bytes, hence the << 2.
    return __llvm_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
    // Bits travel through the union; the int overload does the shuffle.
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl(tmp.i, src_lane, width);
    return tmp.u;
}
__device__
inline
float __shfl(float var, int src_lane, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl(tmp.i, src_lane, width);
    return tmp.f;
}
// 64-bit types are shuffled as two independent 32-bit halves and
// reassembled (low word in tmp[0], high word in tmp[1]).
__device__
inline
double __shfl(double var, int src_lane, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
long __shfl(long var, int src_lane, int width = warpSize)
{
    #ifndef _MSC_VER
    // LP64: long is 64-bit, use the two-halves path.
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    // LLP64 (Windows): long is 32-bit, delegate to the int overload.
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
    #endif
}
__device__
inline
long long __shfl(long long var, int src_lane, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
357 
// __shfl_up: every lane reads 'var' from the lane 'lane_delta' positions
// below it within its 'width'-wide sub-group; lanes whose source would fall
// outside the sub-group keep their own value.
__device__
inline
int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
    int self = __lane_id();
    int index = self - lane_delta;
    // Clamp: if the source precedes this sub-group, read the lane's own slot.
    index = (index < (self & ~(width-1)))?self:index;
    // ds_bpermute addresses lanes in bytes, hence the << 2.
    return __llvm_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    // Bits travel through the union; the int overload does the shuffle.
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_up(tmp.i, lane_delta, width);
    return tmp.u;
}
__device__
inline
float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_up(tmp.i, lane_delta, width);
    return tmp.f;
}
// 64-bit types are shuffled as two independent 32-bit halves.
__device__
inline
double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
{
    #ifndef _MSC_VER
    // LP64: long is 64-bit, use the two-halves path.
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    // LLP64 (Windows): long is 32-bit, delegate to the int overload.
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
    #endif
}
__device__
inline
long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");
    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
427 
// __shfl_down: every lane reads 'var' from the lane 'lane_delta' positions
// above it within its 'width'-wide sub-group; lanes whose source would fall
// outside the sub-group keep their own value.
__device__
inline
int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
    int self = __lane_id();
    int index = self + lane_delta;
    // Clamp: if the source passes the end of the sub-group, read own slot.
    index = (int)((self&(width-1))+lane_delta) >= width?self:index;
    // ds_bpermute addresses lanes in bytes, hence the << 2.
    return __llvm_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    // Bits travel through the union; the int overload does the shuffle.
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_down(tmp.i, lane_delta, width);
    return tmp.u;
}
__device__
inline
float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_down(tmp.i, lane_delta, width);
    return tmp.f;
}
// 64-bit types are shuffled as two independent 32-bit halves.
__device__
inline
double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
{
    #ifndef _MSC_VER
    // LP64: long is 64-bit, use the two-halves path.
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    // LLP64 (Windows): long is 32-bit, delegate to the int overload.
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
    #endif
}
__device__
inline
long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");
    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
497 
// __shfl_xor: every lane reads 'var' from lane (self ^ lane_mask) within its
// 'width'-wide sub-group; lanes whose source would fall outside keep their
// own value.
__device__
inline
int __shfl_xor(int var, int lane_mask, int width = warpSize) {
    int self = __lane_id();
    int index = self^lane_mask;
    // Clamp: if the xor'd lane lands past this sub-group, read own slot.
    index = index >= ((self+width)&~(width-1))?self:index;
    // ds_bpermute addresses lanes in bytes, hence the << 2.
    return __llvm_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
    // Bits travel through the union; the int overload does the shuffle.
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
    return tmp.u;
}
__device__
inline
float __shfl_xor(float var, int lane_mask, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
    return tmp.f;
}
// 64-bit types are shuffled as two independent 32-bit halves.
__device__
inline
double __shfl_xor(double var, int lane_mask, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
// __shfl_xor for 'long': two-halves path on LP64, int delegation on LLP64.
// Fix: the MSC_VER branch previously called __shfl_down with the undeclared
// identifier 'lane_delta' (this function's parameter is 'lane_mask'), which
// does not compile and is the wrong operation; it now delegates to the int
// __shfl_xor overload with 'lane_mask'.
__device__
inline
long __shfl_xor(long var, int lane_mask, int width = warpSize)
{
    #ifndef _MSC_VER
    // LP64: long is 64-bit, shuffle the two 32-bit halves independently.
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    // LLP64 (Windows): long is 32-bit, delegate to the int overload.
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
    #endif
}
// __shfl_xor for 'long long': shuffled as two independent 32-bit halves.
__device__
inline
long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");
    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
567 
// Even-byte / odd-byte masks used to keep per-byte carries from crossing
// byte lanes in the packed helpers below.
#define MASK1 0x00ff00ff
#define MASK2 0xff00ff00

// Packed per-byte add of the .w members of two char4 values.
// NOTE(review): only .w is read and only out.w is written -- out.x/out.y/
// out.z are left uninitialized, and 32-bit masks on a (presumably 1-byte)
// .w component look inconsistent. This likely predates the current char4
// layout; confirm against hip_vector_types.h before relying on these.
__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 + one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 + one2) & MASK2);
    return out;
}

// Packed per-byte subtract; same caveats as __hip_hc_add8pk.
__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 - one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 - one2) & MASK2);
    return out;
}

// Packed per-byte multiply; same caveats as __hip_hc_add8pk.
__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 * one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 * one2) & MASK2);
    return out;
}
603 
604 /*
605  * Rounding modes are not yet supported in HIP
606  * TODO: Conversion functions are not correct, need to fix when BE is ready
607 */
608 
// Double-to-float conversions. Rounding-mode suffixes are not honoured yet
// (see the note above); all four variants narrow via the implicit conversion.
// Fix: the bodies cast to '(double)x' -- a no-op on a double argument. The
// narrowing happened anyway at the return; the cast now states what occurs.
__device__ static inline float __double2float_rd(double x) { return (float)x; }
__device__ static inline float __double2float_rn(double x) { return (float)x; }
__device__ static inline float __double2float_ru(double x) { return (float)x; }
__device__ static inline float __double2float_rz(double x) { return (float)x; }
613 
// High 32 bits of the bit representation of x (element 1 after the copy --
// assumes little-endian word order; TODO confirm for all targets).
__device__ static inline int __double2hiint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[1];
}
// Low 32 bits of the bit representation of x.
__device__ static inline int __double2loint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[0];
}

// Double-to-integer conversions. All four variants are plain casts, so the
// rounding-mode suffixes are not honoured yet (see the TODO above).
__device__ static inline int __double2int_rd(double x) { return (int)x; }
__device__ static inline int __double2int_rn(double x) { return (int)x; }
__device__ static inline int __double2int_ru(double x) { return (int)x; }
__device__ static inline int __double2int_rz(double x) { return (int)x; }

__device__ static inline long long int __double2ll_rd(double x) { return (long long int)x; }
__device__ static inline long long int __double2ll_rn(double x) { return (long long int)x; }
__device__ static inline long long int __double2ll_ru(double x) { return (long long int)x; }
__device__ static inline long long int __double2ll_rz(double x) { return (long long int)x; }

__device__ static inline unsigned int __double2uint_rd(double x) { return (unsigned int)x; }
__device__ static inline unsigned int __double2uint_rn(double x) { return (unsigned int)x; }
__device__ static inline unsigned int __double2uint_ru(double x) { return (unsigned int)x; }
__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }

__device__ static inline unsigned long long int __double2ull_rd(double x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __double2ull_rn(double x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __double2ull_ru(double x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __double2ull_rz(double x) {
    return (unsigned long long int)x;
}

// Bit-pattern reinterpretation of a double as a 64-bit integer.
__device__ static inline long long int __double_as_longlong(double x) {
    static_assert(sizeof(long long) == sizeof(double), "");

    long long tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
667 
/*
__device__ unsigned short __float2half_rn(float x);
__device__ float __half2float(unsigned short);

The above device function signatures are not valid here.
Use
__device__ __half __float2half_rn(float x);
__device__ float __half2float(__half);
from hip_fp16.h instead.

CUDA implements half as unsigned short, whereas HIP does not.
*/
681 
// Float-to-int conversions with explicit rounding via OCML:
// rd = floor (toward -inf), rn = rint (to nearest even),
// ru = ceil (toward +inf), rz = trunc (toward zero).
__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }

// Remaining float-to-integer conversions are plain casts; the rounding-mode
// suffixes are not honoured yet (see the TODO above).
__device__ static inline long long int __float2ll_rd(float x) { return (long long int)x; }
__device__ static inline long long int __float2ll_rn(float x) { return (long long int)x; }
__device__ static inline long long int __float2ll_ru(float x) { return (long long int)x; }
__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }

__device__ static inline unsigned int __float2uint_rd(float x) { return (unsigned int)x; }
__device__ static inline unsigned int __float2uint_rn(float x) { return (unsigned int)x; }
__device__ static inline unsigned int __float2uint_ru(float x) { return (unsigned int)x; }
__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }

__device__ static inline unsigned long long int __float2ull_rd(float x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __float2ull_rn(float x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __float2ull_ru(float x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __float2ull_rz(float x) {
    return (unsigned long long int)x;
}

// Bit-pattern reinterpretation of a float as a signed int.
__device__ static inline int __float_as_int(float x) {
    static_assert(sizeof(int) == sizeof(float), "");

    int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

// Bit-pattern reinterpretation of a float as an unsigned int.
__device__ static inline unsigned int __float_as_uint(float x) {
    static_assert(sizeof(unsigned int) == sizeof(float), "");

    unsigned int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
727 
// Assemble a double from its high and low 32-bit words.
__device__ static inline double __hiloint2double(int hi, int lo) {
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    double tmp1;
    __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));

    return tmp1;
}

// int-to-double is exact; no rounding variants are needed.
__device__ static inline double __int2double_rn(int x) { return (double)x; }

// int-to-float conversions; rounding-mode suffixes not honoured yet.
__device__ static inline float __int2float_rd(int x) { return (float)x; }
__device__ static inline float __int2float_rn(int x) { return (float)x; }
__device__ static inline float __int2float_ru(int x) { return (float)x; }
__device__ static inline float __int2float_rz(int x) { return (float)x; }

// Bit-pattern reinterpretation of an int as a float.
__device__ static inline float __int_as_float(int x) {
    static_assert(sizeof(float) == sizeof(int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

// 64-bit to floating conversions; rounding-mode suffixes not honoured yet.
__device__ static inline double __ll2double_rd(long long int x) { return (double)x; }
__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
__device__ static inline double __ll2double_ru(long long int x) { return (double)x; }
__device__ static inline double __ll2double_rz(long long int x) { return (double)x; }

__device__ static inline float __ll2float_rd(long long int x) { return (float)x; }
__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
__device__ static inline float __ll2float_ru(long long int x) { return (float)x; }
__device__ static inline float __ll2float_rz(long long int x) { return (float)x; }

// Bit-pattern reinterpretation of a 64-bit integer as a double.
__device__ static inline double __longlong_as_double(long long int x) {
    static_assert(sizeof(double) == sizeof(long long), "");

    double tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
772 
// Unsigned-to-floating conversions; rounding-mode suffixes not honoured yet.
// Fix: __uint2double_rn previously took a *signed* int parameter, so inputs
// above INT_MAX were passed through a wrapped negative value and converted to
// a negative double; the parameter is now unsigned, matching the function's
// name and CUDA's signature (backward compatible for all intended callers).
__device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }

__device__ static inline float __uint2float_rd(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_ru(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_rz(unsigned int x) { return (float)x; }
779 
// Bit-pattern reinterpretation of an unsigned int as a float.
__device__ static inline float __uint_as_float(unsigned int x) {
    static_assert(sizeof(float) == sizeof(unsigned int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

// 64-bit unsigned to floating conversions; rounding-mode suffixes are not
// honoured yet (plain casts; see the TODO near the top of this section).
__device__ static inline double __ull2double_rd(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_ru(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_rz(unsigned long long int x) { return (double)x; }

__device__ static inline float __ull2float_rd(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_ru(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_rz(unsigned long long int x) { return (float)x; }
798 
#if defined(__HCC__)
#define __HCC_OR_HIP_CLANG__ 1
#elif defined(__clang__) && defined(__HIP__)
#define __HCC_OR_HIP_CLANG__ 1
#else
#define __HCC_OR_HIP_CLANG__ 0
#endif

// Fix: this guard used '#ifdef', which is always true because the macro is
// *defined* (possibly to 0) in every branch above; '#if' tests its value, so
// the device-only section below is skipped for non-HCC, non-HIP compilers.
#if __HCC_OR_HIP_CLANG__
808 
809 // Clock functions
// Clock functions (defined below for device compilation, otherwise provided
// by the runtime).
__device__ long long int __clock64();
__device__ long long int __clock();
__device__ long long int clock64();
__device__ long long int clock();
// hip.amdgcn.bc - named sync
__device__ void __named_sync(int a, int b);
816 
817 #ifdef __HIP_DEVICE_COMPILE__
818 
819 // Clock functions
820 #if __HCC__
821 extern "C" uint64_t __clock_u64() __HC__;
822 #endif
823 
// Raw cycle counter via the compiler builtin; NOTE(review): presumably
// per-compute-unit and not synchronized across the device -- confirm against
// the ISA documentation before using for cross-workgroup timing.
__device__
inline __attribute((always_inline))
long long int __clock64() {
    return (long long int) __builtin_readcyclecounter();
}

__device__
inline __attribute((always_inline))
long long int __clock() { return __clock64(); }

// CUDA-compatible spellings of the clock functions above.
__device__
inline __attribute__((always_inline))
long long int clock64() { return __clock64(); }

__device__
inline __attribute__((always_inline))
long long int clock() { return __clock(); }

// hip.amdgcn.bc - named sync
// NOTE(review): both arguments are ignored; every call degenerates to a full
// workgroup barrier.
__device__
inline
void __named_sync(int a, int b) { __builtin_amdgcn_s_barrier(); }
846 
847 #endif // __HIP_DEVICE_COMPILE__
848 
849 // warp vote function __all __any __ballot
// Warp-vote "all": forwards to the OCKL wavefront-all intrinsic; presumably
// non-zero iff `predicate` is non-zero on every active lane — behavior is
// defined by __ockl_wfall_i32 (device library).
__device__
inline
int __all(int predicate) {
 return __ockl_wfall_i32(predicate);
}
855 
// Warp-vote "any": forwards to the OCKL wavefront-any intrinsic; presumably
// non-zero iff `predicate` is non-zero on at least one active lane — behavior
// is defined by __ockl_wfany_i32 (device library).
__device__
inline
int __any(int predicate) {
 return __ockl_wfany_i32(predicate);
}
861 
// XXX from llvm/include/llvm/IR/InstrTypes.h
// Integer-compare predicate code for "not equal", as used by amdgcn uicmp.
#define ICMP_NE 33

// Warp-vote ballot: returns a lane mask with a bit set for each active lane
// whose `predicate` compares != 0 (via the amdgcn unsigned-icmp builtin).
__device__
inline
unsigned long long int __ballot(int predicate) {
 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}
870 
// 64-bit ballot — identical implementation to __ballot(); the separate name
// exists for callers that want an explicit wave64-width mask.
__device__
inline
unsigned long long int __ballot64(int predicate) {
 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}
876 
// hip.amdgcn.bc - lanemask
// Mask of ACTIVE lanes with an id strictly greater than the calling lane's.
// Assumes a 64-wide wavefront (the lane==63 early-out). The result is
// intersected with __ballot64(1), i.e. restricted to currently active lanes.
__device__
inline
uint64_t __lanemask_gt()
{
 uint32_t lane = __ockl_lane_u32();
 if (lane == 63)
 return 0;
 uint64_t ballot = __ballot64(1);
 uint64_t mask = (~((uint64_t)0)) << (lane + 1);
 return mask & ballot;
}
889 
890 __device__
891 inline
892 uint64_t __lanemask_lt()
893 {
894  uint32_t lane = __ockl_lane_u32();
895  int64_t ballot = __ballot64(1);
896  uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
897  return mask & ballot;
898 }
899 
900 __device__
901 inline
902 uint64_t __lanemask_eq()
903 {
904  uint32_t lane = __ockl_lane_u32();
905  int64_t mask = ((uint64_t)1 << lane);
906  return mask;
907 }
908 
909 
// Identity cast from LDS (local) address space to a generic pointer; the
// conversion itself is implicit here, so this is effectively a no-op marker.
__device__ inline void* __local_to_generic(void* p) { return p; }
911 
#ifdef __HIP_DEVICE_COMPILE__
// Returns a generic pointer to the start of the DYNAMIC portion of the group
// (LDS) segment: the static group size is used as the byte offset past the
// statically-allocated shared variables.
__device__
inline
void* __get_dynamicgroupbaseptr()
{
 // Get group segment base pointer.
 return (char*)__local_to_generic((void*)__to_local(__llvm_amdgcn_groupstaticsize()));
}
#else
// Host-compile pass: declaration only; the definition is device-side.
__device__
void* __get_dynamicgroupbaseptr();
#endif // __HIP_DEVICE_COMPILE__
924 
// Public alias for __get_dynamicgroupbaseptr(); used by the
// HIP_DYNAMIC_SHARED macro below.
__device__
inline
void *__amdgcn_get_dynamicgroupbaseptr() {
 return __get_dynamicgroupbaseptr();
}
930 
931 #if defined(__HCC__) && (__hcc_major__ < 3) && (__hcc_minor__ < 3)
932 // hip.amdgcn.bc - sync threads
933 #define __CLK_LOCAL_MEM_FENCE 0x01
934 typedef unsigned __cl_mem_fence_flags;
935 
// Memory scopes, mirrored from the OpenCL scope constants so the values agree
// with what the device library expects.
typedef enum __memory_scope {
 __memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
 __memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
 __memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
 __memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
 __memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
} __memory_scope;
943 
// enum values aligned with what clang uses in EmitAtomicExpr()
// (i.e. the __ATOMIC_* builtin constants), so these can be passed straight
// through to atomic builtins.
typedef enum __memory_order
{
 __memory_order_relaxed = __ATOMIC_RELAXED,
 __memory_order_acquire = __ATOMIC_ACQUIRE,
 __memory_order_release = __ATOMIC_RELEASE,
 __memory_order_acq_rel = __ATOMIC_ACQ_REL,
 __memory_order_seq_cst = __ATOMIC_SEQ_CST
} __memory_order;
953 
// Emit a memory fence of the given `order` at the given `scope`.
// NOTE(review): `flags` is currently unused — global and local fences are
// deliberately tied together (see comment below). The inner
// `__memory_order_relaxed` cases are unreachable given the outer guard; they
// are kept so each switch is exhaustive over __memory_order.
__device__
inline
static void
__atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __memory_scope scope)
{
 // We're tying global-happens-before and local-happens-before together as does HSA
 if (order != __memory_order_relaxed) {
 switch (scope) {
 case __memory_scope_work_item:
 break;
 case __memory_scope_sub_group:
 switch (order) {
 case __memory_order_relaxed: break;
 case __memory_order_acquire: __llvm_fence_acq_sg(); break;
 case __memory_order_release: __llvm_fence_rel_sg(); break;
 case __memory_order_acq_rel: __llvm_fence_ar_sg(); break;
 case __memory_order_seq_cst: __llvm_fence_sc_sg(); break;
 }
 break;
 case __memory_scope_work_group:
 switch (order) {
 case __memory_order_relaxed: break;
 case __memory_order_acquire: __llvm_fence_acq_wg(); break;
 case __memory_order_release: __llvm_fence_rel_wg(); break;
 case __memory_order_acq_rel: __llvm_fence_ar_wg(); break;
 case __memory_order_seq_cst: __llvm_fence_sc_wg(); break;
 }
 break;
 case __memory_scope_device:
 switch (order) {
 case __memory_order_relaxed: break;
 case __memory_order_acquire: __llvm_fence_acq_dev(); break;
 case __memory_order_release: __llvm_fence_rel_dev(); break;
 case __memory_order_acq_rel: __llvm_fence_ar_dev(); break;
 case __memory_order_seq_cst: __llvm_fence_sc_dev(); break;
 }
 break;
 case __memory_scope_all_svm_devices:
 switch (order) {
 case __memory_order_relaxed: break;
 case __memory_order_acquire: __llvm_fence_acq_sys(); break;
 case __memory_order_release: __llvm_fence_rel_sys(); break;
 case __memory_order_acq_rel: __llvm_fence_ar_sys(); break;
 case __memory_order_seq_cst: __llvm_fence_sc_sys(); break;
 }
 break;
 }
 }
}
1003 #endif
1004 
// Memory Fence Functions
// CUDA-compatible __threadfence(): device-scope seq_cst fence.
__device__
inline
static void __threadfence()
{
 __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
}
1012 
// CUDA-compatible __threadfence_block(): workgroup-scope seq_cst fence.
__device__
inline
static void __threadfence_block()
{
 __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
}
1019 
// CUDA-compatible __threadfence_system(): system-wide (all SVM devices)
// seq_cst fence.
__device__
inline
static void __threadfence_system()
{
 __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
}
1026 
1027 // abort
1028 __device__
1029 inline
1030 __attribute__((weak))
1031 void abort() {
1032  return __builtin_trap();
1033 }
1034 
1035 
1036 #endif // __HCC_OR_HIP_CLANG__
1037 
1038 #ifdef __HCC__
1039 
1044 // Macro to replace extern __shared__ declarations
1045 // to local variable definitions
1046 #define HIP_DYNAMIC_SHARED(type, var) type* var = (type*)__get_dynamicgroupbaseptr();
1047 
1048 #define HIP_DYNAMIC_SHARED_ATTRIBUTE
1049 
1050 
1051 #elif defined(__clang__) && defined(__HIP__)
1052 
1053 #pragma push_macro("__DEVICE__")
1054 #define __DEVICE__ extern "C" __device__ __attribute__((always_inline)) \
1055  __attribute__((weak))
1056 
// Device-side hook for glibc-style assert() failures. All arguments
// (message, file, line, function) are currently discarded; the wave simply traps.
__DEVICE__
inline
void __assert_fail(const char * __assertion,
 const char *__file,
 unsigned int __line,
 const char *__function)
{
 // Ignore all the args for now.
 __builtin_trap();
}
1067 
// Device-side hook matching the CUDA __assertfail signature (extra charsize
// parameter). As with __assert_fail, all arguments are discarded and the wave traps.
__DEVICE__
inline
void __assertfail(const char * __assertion,
 const char *__file,
 unsigned int __line,
 const char *__function,
 size_t charsize)
{
 // ignore all the args for now.
 __builtin_trap();
}
1079 
// Workgroup barrier with optional memory ordering: when `flags` request a
// fence, release-fence before the barrier and acquire-fence after it, so
// memory operations on either side are ordered across the whole group.
__device__
inline
static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
{
 if (flags) {
 __atomic_work_item_fence(flags, __memory_order_release, scope);
 __builtin_amdgcn_s_barrier();
 __atomic_work_item_fence(flags, __memory_order_acquire, scope);
 } else {
 __builtin_amdgcn_s_barrier();
 }
}
1092 
// Thin wrapper: treat `n` as the fence-flag bitmask and synchronize at
// workgroup scope.
__device__
inline
static void __barrier(int n)
{
 __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
}
1099 
// CUDA-compatible __syncthreads(): workgroup barrier with a local-memory
// fence. Marked convergent — must be reached by all threads of the block.
__device__
inline
__attribute__((convergent))
void __syncthreads()
{
 __barrier(__CLK_LOCAL_MEM_FENCE);
}
1107 
1108 // hip.amdgcn.bc - device routine
1109 /*
1110  HW_ID Register bit structure
1111  WAVE_ID 3:0 Wave buffer slot number. 0-9.
1112  SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
1113  PIPE_ID 7:6 Pipeline from which the wave was dispatched.
1114  CU_ID 11:8 Compute Unit the wave is assigned to.
1115  SH_ID 12 Shader Array (within an SE) the wave is assigned to.
1116  SE_ID 14:13 Shader Engine the wave is assigned to.
1117  TG_ID 19:16 Thread-group ID
1118  VM_ID 23:20 Virtual Memory ID
1119  QUEUE_ID 26:24 Queue from which this wave was dispatched.
1120  STATE_ID 29:27 State ID (graphics only, not compute).
1121  ME_ID 31:30 Micro-engine ID.
1122  */
1123 
1124 #define HW_ID 4
1125 
1126 #define HW_ID_CU_ID_SIZE 4
1127 #define HW_ID_CU_ID_OFFSET 8
1128 
1129 #define HW_ID_SE_ID_SIZE 2
1130 #define HW_ID_SE_ID_OFFSET 13
1131 
1132 /*
1133  Encoding of parameter bitmask
1134  HW_ID 5:0 HW_ID
1135  OFFSET 10:6 Range: 0..31
1136  SIZE 15:11 Range: 1..32
1137  */
1138 
1139 #define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
1140 
1141 /*
1142  __smid returns the wave's assigned Compute Unit and Shader Engine.
1143  The Compute Unit, CU_ID returned in bits 3:0, and Shader Engine, SE_ID in bits 5:4.
1144  Note: the results vary over time.
1145  SZ minus 1 since SIZE is 1-based.
1146 */
// Reads CU_ID and SE_ID out of the HW_ID register via s_getreg and packs them
// as (SE_ID << 4) | CU_ID. The value identifies where the wave is currently
// scheduled and can change over the kernel's lifetime (see comment above).
__device__
inline
unsigned __smid(void)
{
 unsigned cu_id = __builtin_amdgcn_s_getreg(
 GETREG_IMMED(HW_ID_CU_ID_SIZE-1, HW_ID_CU_ID_OFFSET, HW_ID));
 unsigned se_id = __builtin_amdgcn_s_getreg(
 GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));

 /* Each shader engine has 16 CU */
 return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
}
1159 
// Fix: balance the push_macro("__DEVICE__") at the top of this region — the
// original pushed a second time here, leaking the __DEVICE__ definition into
// every translation unit that includes this header.
#pragma pop_macro("__DEVICE__")
1161 
1162 // Macro to replace extern __shared__ declarations
1163 // to local variable definitions
1164 #define HIP_DYNAMIC_SHARED(type, var) \
1165  type* var = (type*)__amdgcn_get_dynamicgroupbaseptr();
1166 
1167 #define HIP_DYNAMIC_SHARED_ATTRIBUTE
1168 
1169 
1170 #endif //defined(__clang__) && defined(__HIP__)
1171 
1172 
1173 // loop unrolling
1174 static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
1175  auto dstPtr = static_cast<unsigned char*>(dst);
1176  auto srcPtr = static_cast<const unsigned char*>(src);
1177 
1178  while (size >= 4u) {
1179  dstPtr[0] = srcPtr[0];
1180  dstPtr[1] = srcPtr[1];
1181  dstPtr[2] = srcPtr[2];
1182  dstPtr[3] = srcPtr[3];
1183 
1184  size -= 4u;
1185  srcPtr += 4u;
1186  dstPtr += 4u;
1187  }
1188  switch (size) {
1189  case 3:
1190  dstPtr[2] = srcPtr[2];
1191  case 2:
1192  dstPtr[1] = srcPtr[1];
1193  case 1:
1194  dstPtr[0] = srcPtr[0];
1195  }
1196 
1197  return dst;
1198 }
1199 
1200 static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
1201  auto dstPtr = static_cast<unsigned char*>(dst);
1202 
1203  while (size >= 4u) {
1204  dstPtr[0] = val;
1205  dstPtr[1] = val;
1206  dstPtr[2] = val;
1207  dstPtr[3] = val;
1208 
1209  size -= 4u;
1210  dstPtr += 4u;
1211  }
1212  switch (size) {
1213  case 3:
1214  dstPtr[2] = val;
1215  case 2:
1216  dstPtr[1] = val;
1217  case 1:
1218  dstPtr[0] = val;
1219  }
1220 
1221  return dst;
1222 }
// Device-side memcpy with the standard signature; forwards to the unrolled
// byte-copy helper above.
static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
 return __hip_hc_memcpy(dst, src, size);
}
1226 
1227 static inline __device__ void* memset(void* ptr, int val, size_t size) {
1228  unsigned char val8 = static_cast<unsigned char>(val);
1229  return __hip_hc_memset(ptr, val8, size);
1230 }
1231 
1232 #endif
// Doxygen extraction residue (kept as comments so the header stays compilable):
// TODO-doc.
// Contains declarations for types and functions in the device library.
// (Definition: device_functions.h:124)
// (Definition: device_functions.h:131)
// Contains declarations for wrapper functions for llvm intrinsics like llvm.amdgcn.s.barrier.