HIP: Heterogeneous-computing Interface for Portability
device_functions.h
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
23 #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
24 #define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
25 
26 #include "host_defines.h"
27 #include "math_fwd.h"
28 
29 #if !defined(__HIPCC_RTC__)
30 #include <hip/hip_runtime_api.h>
31 #include <stddef.h>
32 #endif // !defined(__HIPCC_RTC__)
33 
34 #include <hip/hip_vector_types.h>
37 
// Device-side printf. When compiling with the HIP-aware clang the compiler
// supplies a real device printf; otherwise fall back to a variadic no-op
// template so code that calls printf still parses and compiles.
#if __HIP_CLANG_ONLY__
extern "C" __device__ int printf(const char *fmt, ...);
#else
template <typename... All>
static inline __device__ void printf(const char* format, All... all) {}
#endif // __HIP_CLANG_ONLY__
44 
45 /*
46 Integer Intrinsics
47 */
48 
// Integer intrinsics: __popc, __clz, __ffs, __brev families.

// Number of set bits in a 32-bit value.
__device__ static inline unsigned int __popc(unsigned int input) {
    return __builtin_popcount(input);
}
// Number of set bits in a 64-bit value.
__device__ static inline unsigned int __popcll(unsigned long long int input) {
    return __builtin_popcountll(input);
}

// Count of leading zero bits. NOTE(review): the result for input 0 is
// whatever the OCKL helper defines — presumably 32/64 as in CUDA; confirm
// against the ROCm device-libs implementation.
__device__ static inline int __clz(int input) {
    return __ockl_clz_u32((uint)input);
}

__device__ static inline int __clzll(long long int input) {
    return __ockl_clz_u64((ullong)input);
}
64 
65 __device__ static inline unsigned int __ffs(unsigned int input) {
66  return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
67 }
68 
69 __device__ static inline unsigned int __ffsll(unsigned long long int input) {
70  return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
71 }
72 
73 __device__ static inline unsigned int __ffs(int input) {
74  return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
75 }
76 
77 __device__ static inline unsigned int __ffsll(long long int input) {
78  return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
79 }
80 
// Bit reversal: swaps bit 0 with bit 31 (resp. bit 63), bit 1 with
// bit 30 (resp. 62), and so on.
__device__ static inline unsigned int __brev(unsigned int input) {
    return __builtin_bitreverse32(input);
}

__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
    return __builtin_bitreverse64(input);
}
88 
89 __device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
90  return input == 0 ? -1 : __builtin_ctzl(input);
91 }
92 
// Extract `width` bits of src0 starting at bit `offset` (offset and width
// taken mod 32 / mod 64). Width 0 yields 0.
// NOTE(review): when offset + width exceeds the word size, the left-shift
// count below goes out of range — presumably callers guarantee
// offset + width <= 32 (resp. 64); confirm.
__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
    uint32_t offset = src1 & 31;
    uint32_t width = src2 & 31;
    return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
}

__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
    uint64_t offset = src1 & 63;
    uint64_t width = src2 & 63;
    return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
}
104 
105 __device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
106  uint32_t offset = src2 & 31;
107  uint32_t width = src3 & 31;
108  uint32_t mask = (1 << width) - 1;
109  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
110 }
111 
112 __device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
113  uint64_t offset = src2 & 63;
114  uint64_t width = src3 & 63;
115  uint64_t mask = (1ULL << width) - 1;
116  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
117 }
118 
// Forward declarations of the integer intrinsics defined later in this file.
__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
__device__ static unsigned int __hadd(int x, int y);
__device__ static int __mul24(int x, int y);
__device__ static long long int __mul64hi(long long int x, long long int y);
__device__ static int __mulhi(int x, int y);
__device__ static int __rhadd(int x, int y);
__device__ static unsigned int __sad(int x, int y,unsigned int z);
__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
__device__ static int __umul24(unsigned int x, unsigned int y);
__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
132 
// Helper unions used by __byte_perm: view one 32-bit word as 4 bytes, and
// a pair of 32-bit words as 8 contiguous bytes.
struct ucharHolder {
    union {
        unsigned char c[4];
        unsigned int ui;
    };
} __attribute__((aligned(4)));

struct uchar2Holder {
    union {
        unsigned int ui[2];
        unsigned char c[8];
    };
} __attribute__((aligned(8)));
146 
// Build a 32-bit result by selecting four bytes from the 8-byte
// concatenation {y:x}. Selector s: bits 2:0 pick result byte 0, bits 6:4
// byte 1, bits 10:8 byte 2, bits 14:12 byte 3 (only three bits of each
// nibble are honored). NOTE(review): CUDA's __byte_perm also defines an
// msb "replicate/sign" mode per selector nibble, which this
// implementation ignores — confirm whether that matters to callers.
__device__
static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
    struct uchar2Holder cHoldVal;
    struct ucharHolder cHoldKey;
    cHoldKey.ui = s;
    cHoldVal.ui[0] = x;  // bytes 0-3 of the source pool
    cHoldVal.ui[1] = y;  // bytes 4-7 of the source pool
    unsigned int result;
    result = cHoldVal.c[cHoldKey.c[0] & 0x07];
    result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);
    result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);
    result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);
    return result;
}
161 
162 __device__ static inline unsigned int __hadd(int x, int y) {
163  int z = x + y;
164  int sign = z & 0x8000000;
165  int value = z & 0x7FFFFFFF;
166  return ((value) >> 1 || sign);
167 }
168 
// 24-bit signed multiply via OCKL. NOTE(review): presumably matches CUDA
// __mul24 (low 24 bits of each operand) — confirm against device-libs.
__device__ static inline int __mul24(int x, int y) {
    return __ockl_mul24_i32(x, y);
}

// High 64 bits of the full 128-bit signed product, built from 32-bit limb
// products. The mixed signed/unsigned locals are deliberate: the low limbs
// are unsigned magnitudes while the high limbs carry the sign.
__device__ static inline long long __mul64hi(long long int x, long long int y) {
    ulong x0 = (ulong)x & 0xffffffffUL;
    long x1 = x >> 32;
    ulong y0 = (ulong)y & 0xffffffffUL;
    long y1 = y >> 32;
    ulong z0 = x0*y0;               // low x low
    long t = x1*y0 + (z0 >> 32);    // first cross term plus carry from z0
    long z1 = t & 0xffffffffL;
    long z2 = t >> 32;
    z1 = x0*y1 + z1;                // second cross term
    return x1*y1 + z2 + (z1 >> 32); // high term plus accumulated carries
}

// High 32 bits of the 64-bit signed product.
__device__ static inline int __mulhi(int x, int y) {
    return __ockl_mul_hi_i32(x, y);
}
189 
190 __device__ static inline int __rhadd(int x, int y) {
191  int z = x + y + 1;
192  int sign = z & 0x8000000;
193  int value = z & 0x7FFFFFFF;
194  return ((value) >> 1 || sign);
195 }
196 __device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
197  return x > y ? x - y + z : y - x + z;
198 }
199 __device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
200  return (x + y) >> 1;
201 }
// 24-bit unsigned multiply via OCKL. NOTE(review): the int return type
// mirrors the forward declaration above; CUDA declares __umul24 as
// returning unsigned int — confirm before changing either.
__device__ static inline int __umul24(unsigned int x, unsigned int y) {
    return __ockl_mul24_u32(x, y);
}
205 
206 __device__
207 static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
208  ulong x0 = x & 0xffffffffUL;
209  ulong x1 = x >> 32;
210  ulong y0 = y & 0xffffffffUL;
211  ulong y1 = y >> 32;
212  ulong z0 = x0*y0;
213  ulong t = x1*y0 + (z0 >> 32);
214  ulong z1 = t & 0xffffffffUL;
215  ulong z2 = t >> 32;
216  z1 = x0*y1 + z1;
217  return x1*y1 + z2 + (z1 >> 32);
218 }
219 
// High 32 bits of the 64-bit unsigned product.
__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
    return __ockl_mul_hi_u32(x, y);
}
223 __device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
224  return (x + y + 1) >> 1;
225 }
__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
    // NOTE(review): presumably |x - y| + z — confirm against the ROCm
    // device-libs definition of __ockl_sadd_u32.
    return __ockl_sadd_u32(x, y, z);
}

// Lane index of the calling thread within its wavefront. mbcnt counts the
// set bits of the mask that lie below this lane; with an all-ones mask
// that count is exactly the lane id.
__device__ static inline unsigned int __lane_id() {
    return __builtin_amdgcn_mbcnt_hi(
        -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
}

// Thin wrappers over the mbcnt builtins: count set bits of x strictly
// below the calling lane, accumulated onto y (_lo covers lanes 0-31,
// _hi covers lanes 32-63).
__device__
static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_lo(x,y);};

__device__
static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_hi(x,y);};
240 
241 /*
242 HIP specific device functions
243 */
244 
// ds_bpermute ("backward" permute): each lane READS the value held by the
// lane addressed by `index`. The index is a byte address (lane id << 2).
__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
    return tmp.u;
}

// Float variant: bit-cast through the union's float member (no value
// conversion).
__device__ static inline float __hip_ds_bpermutef(int index, float src) {
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
    tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
    return tmp.f;
}

// ds_permute ("forward" permute): each lane WRITES its value to the lane
// addressed by `index` (byte address, lane id << 2).
__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
    return tmp.u;
}
262 
263 __device__ static inline float __hip_ds_permutef(int index, float src) {
264  union { int i; unsigned u; float f; } tmp; tmp.u = src;
265  tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
266  return tmp.u;
267 }
268 
// ds_swizzle: intra-wavefront lane swizzle with a compile-time pattern.
// The builtin requires an immediate operand, so the pattern travels as a
// template argument; the macros preserve the CUDA-style call syntax.
#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))

template <int pattern>
__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
    return tmp.u;
}

template <int pattern>
__device__ static inline float __hip_ds_swizzlef_N(float src) {
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
    tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
    return tmp.f;
}

// DPP move: cross-lane copy controlled by the immediate dpp_ctrl /
// row_mask / bank_mask / bound_ctrl operands, again routed through a
// template so the builtin sees compile-time constants.
#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
    __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))

template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
__device__ static inline int __hip_move_dpp_N(int src) {
    return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
                                    bound_ctrl);
}
294 
// Wavefront size of the target AMD GPU (32 or 64), fixed at compile time.
static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
296 
/*
 * __shfl: each lane reads `var` from lane `src_lane`. Legacy mask-less
 * HIP variant implemented with ds_bpermute (the lane index becomes a byte
 * address via `<< 2`). `width` partitions the wavefront into equal
 * sections and src_lane is taken relative to the calling lane's section.
 * NOTE(review): width is presumably required to be a power of two (as in
 * CUDA) for the `& ~(width-1)` arithmetic to be meaningful — confirm.
 * 64-bit types are shuffled as two independent 32-bit halves.
 */
__device__
inline
int __shfl(int var, int src_lane, int width = warpSize) {
    int self = __lane_id();
    // Base lane of this lane's section plus the requested lane within it.
    int index = src_lane + (self & ~(width-1));
    return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
    // Reinterpret the bits and defer to the int overload.
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl(tmp.i, src_lane, width);
    return tmp.u;
}
__device__
inline
float __shfl(float var, int src_lane, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl(tmp.i, src_lane, width);
    return tmp.f;
}
__device__
inline
double __shfl(double var, int src_lane, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    // Shuffle low and high 32-bit halves separately, then reassemble.
    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
long __shfl(long var, int src_lane, int width = warpSize)
{
    #ifndef _MSC_VER
    // LP64: long is 64-bit — shuffle as two 32-bit halves.
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    // LLP64 (MSVC): long is 32-bit — a single int shuffle suffices.
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
    #endif
}
__device__
inline
unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
    #ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
    #endif
}
__device__
inline
long long __shfl(long long var, int src_lane, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
400 
/*
 * __shfl_up: each lane reads `var` from the lane `lane_delta` positions
 * below it; lanes whose source would fall before the start of their
 * width-sized section keep their own value. Legacy mask-less variant;
 * 64-bit types move as two independent 32-bit halves.
 */
__device__
inline
int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
    int self = __lane_id();
    // NOTE(review): self - lane_delta mixes int and unsigned; for
    // lane_delta > self this relies on wraparound plus the clamp below.
    int index = self - lane_delta;
    index = (index < (self & ~(width-1)))?self:index;
    return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_up(tmp.i, lane_delta, width);
    return tmp.u;
}
__device__
inline
float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_up(tmp.i, lane_delta, width);
    return tmp.f;
}
__device__
inline
double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
{
    #ifndef _MSC_VER
    // LP64: long is 64-bit.
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    // LLP64 (MSVC): long is 32-bit.
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
    #endif
}

__device__
inline
unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
    #endif
}

__device__
inline
long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");
    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}

__device__
inline
unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
506 
/*
 * __shfl_down: each lane reads `var` from the lane `lane_delta` positions
 * above it; lanes whose source would run past the end of their width-sized
 * section keep their own value. Legacy mask-less variant; 64-bit types
 * move as two independent 32-bit halves.
 */
__device__
inline
int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
    int self = __lane_id();
    int index = self + lane_delta;
    // Clamp: if the offset within the section overruns it, keep self.
    index = (int)((self&(width-1))+lane_delta) >= width?self:index;
    return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_down(tmp.i, lane_delta, width);
    return tmp.u;
}
__device__
inline
float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_down(tmp.i, lane_delta, width);
    return tmp.f;
}
__device__
inline
double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
{
    #ifndef _MSC_VER
    // LP64: long is 64-bit.
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    // LLP64 (MSVC): long is 32-bit.
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
    #endif
}
__device__
inline
unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
    #endif
}
__device__
inline
long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");
    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
609 
/*
 * __shfl_xor: each lane reads `var` from the lane whose id is
 * self ^ lane_mask; if the partner lies beyond the end of the caller's
 * width-sized section, the lane keeps its own value. Legacy mask-less
 * variant; 64-bit types move as two independent 32-bit halves.
 */
__device__
inline
int __shfl_xor(int var, int lane_mask, int width = warpSize) {
    int self = __lane_id();
    int index = self^lane_mask;
    // Clamp to self when the xor partner falls outside this section.
    index = index >= ((self+width)&~(width-1))?self:index;
    return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
    return tmp.u;
}
__device__
inline
float __shfl_xor(float var, int lane_mask, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
    return tmp.f;
}
__device__
inline
double __shfl_xor(double var, int lane_mask, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
long __shfl_xor(long var, int lane_mask, int width = warpSize)
{
    #ifndef _MSC_VER
    // LP64: long is 64-bit.
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    // LLP64 (MSVC): long is 32-bit.
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
    #endif
}
__device__
inline
unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
    #endif
}
__device__
inline
long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");
    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
{
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
// Byte masks for the packed-char helpers below: MASK1 covers bytes 0 and
// 2, MASK2 covers bytes 1 and 3, so each add/sub/mul can run two bytes at
// a time without cross-byte carry contamination.
#define MASK1 0x00ff00ff
#define MASK2 0xff00ff00

// Packed per-byte add. NOTE(review): only the .w member is read and
// written and out.x/y/z are left uninitialized — this appears to assume
// char4's storage aliases a single 32-bit word; confirm against
// hip_vector_types' char4 definition before relying on x/y/z.
__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 + one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 + one2) & MASK2);
    return out;
}

// Packed per-byte subtract (same caveat as __hip_hc_add8pk).
__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 - one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 - one2) & MASK2);
    return out;
}

// Packed per-byte multiply (same caveat as __hip_hc_add8pk).
__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 * one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 * one2) & MASK2);
    return out;
}
747 
748 /*
749  * Rounding modes are not yet supported in HIP
750  * TODO: Conversion functions are not correct, need to fix when BE is ready
751 */
752 
753 __device__ static inline float __double2float_rd(double x) { return (double)x; }
754 __device__ static inline float __double2float_rn(double x) { return (double)x; }
755 __device__ static inline float __double2float_ru(double x) { return (double)x; }
756 __device__ static inline float __double2float_rz(double x) { return (double)x; }
757 
// High 32 bits (bits 63..32) of a double's bit pattern.
__device__ static inline int __double2hiint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    // tmp[1] is the high word on little-endian targets.
    return tmp[1];
}
// Low 32 bits (bits 31..0) of a double's bit pattern.
__device__ static inline int __double2loint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[0];
}
774 
// double -> signed/unsigned 32- and 64-bit integer conversions. The
// _rd/_rn/_ru variants round with floor / round-to-nearest-even / ceil in
// double precision before casting; _rz relies on the cast's truncation
// toward zero. Out-of-range inputs follow the underlying cast's behavior.
__device__ static inline int __double2int_rd(double x) { return (int)__ocml_floor_f64(x); }
__device__ static inline int __double2int_rn(double x) { return (int)__ocml_rint_f64(x); }
__device__ static inline int __double2int_ru(double x) { return (int)__ocml_ceil_f64(x); }
__device__ static inline int __double2int_rz(double x) { return (int)x; }

__device__ static inline long long int __double2ll_rd(double x) {
    return (long long)__ocml_floor_f64(x);
}
__device__ static inline long long int __double2ll_rn(double x) {
    return (long long)__ocml_rint_f64(x);
}
__device__ static inline long long int __double2ll_ru(double x) {
    return (long long)__ocml_ceil_f64(x);
}
__device__ static inline long long int __double2ll_rz(double x) { return (long long)x; }

__device__ static inline unsigned int __double2uint_rd(double x) {
    return (unsigned int)__ocml_floor_f64(x);
}
__device__ static inline unsigned int __double2uint_rn(double x) {
    return (unsigned int)__ocml_rint_f64(x);
}
__device__ static inline unsigned int __double2uint_ru(double x) {
    return (unsigned int)__ocml_ceil_f64(x);
}
__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }

__device__ static inline unsigned long long int __double2ull_rd(double x) {
    return (unsigned long long int)__ocml_floor_f64(x);
}
__device__ static inline unsigned long long int __double2ull_rn(double x) {
    return (unsigned long long int)__ocml_rint_f64(x);
}
__device__ static inline unsigned long long int __double2ull_ru(double x) {
    return (unsigned long long int)__ocml_ceil_f64(x);
}
__device__ static inline unsigned long long int __double2ull_rz(double x) {
    return (unsigned long long int)x;
}
814 
815 __device__ static inline long long int __double_as_longlong(double x) {
816  static_assert(sizeof(long long) == sizeof(double), "");
817 
818  long long tmp;
819  __builtin_memcpy(&tmp, &x, sizeof(tmp));
820 
821  return tmp;
822 }
823 
824 /*
825 __device__ unsigned short __float2half_rn(float x);
826 __device__ float __half2float(unsigned short);
827 
The above device functions are not valid.
829 Use
830 __device__ __half __float2half_rn(float x);
831 __device__ float __half2float(__half);
832 from hip_fp16.h
833 
CUDA implements half as unsigned short, whereas HIP does not.
835 
836 */
837 
838 __device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
839 __device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
840 __device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
841 __device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
842 
843 __device__ static inline long long int __float2ll_rd(float x) {
844  return (long long int)__ocml_floor_f32(x);
845 }
846 __device__ static inline long long int __float2ll_rn(float x) {
847  return (long long int)__ocml_rint_f32(x);
848 }
849 __device__ static inline long long int __float2ll_ru(float x) {
850  return (long long int)__ocml_ceil_f32(x);
851 }
852 __device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }
853 
854 __device__ static inline unsigned int __float2uint_rd(float x) {
855  return (unsigned int)__ocml_floor_f32(x);
856 }
857 __device__ static inline unsigned int __float2uint_rn(float x) {
858  return (unsigned int)__ocml_rint_f32(x);
859 }
860 __device__ static inline unsigned int __float2uint_ru(float x) {
861  return (unsigned int)__ocml_ceil_f32(x);
862 }
863 __device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
864 
865 __device__ static inline unsigned long long int __float2ull_rd(float x) {
866  return (unsigned long long int)__ocml_floor_f32(x);
867 }
868 __device__ static inline unsigned long long int __float2ull_rn(float x) {
869  return (unsigned long long int)__ocml_rint_f32(x);
870 }
871 __device__ static inline unsigned long long int __float2ull_ru(float x) {
872  return (unsigned long long int)__ocml_ceil_f32(x);
873 }
874 __device__ static inline unsigned long long int __float2ull_rz(float x) {
875  return (unsigned long long int)x;
876 }
877 
878 __device__ static inline int __float_as_int(float x) {
879  static_assert(sizeof(int) == sizeof(float), "");
880 
881  int tmp;
882  __builtin_memcpy(&tmp, &x, sizeof(tmp));
883 
884  return tmp;
885 }
886 
887 __device__ static inline unsigned int __float_as_uint(float x) {
888  static_assert(sizeof(unsigned int) == sizeof(float), "");
889 
890  unsigned int tmp;
891  __builtin_memcpy(&tmp, &x, sizeof(tmp));
892 
893  return tmp;
894 }
895 
896 __device__ static inline double __hiloint2double(int hi, int lo) {
897  static_assert(sizeof(double) == sizeof(uint64_t), "");
898 
899  uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
900  double tmp1;
901  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
902 
903  return tmp1;
904 }
905 
906 __device__ static inline double __int2double_rn(int x) { return (double)x; }
907 
908 __device__ static inline float __int2float_rd(int x) { return (float)x; }
909 __device__ static inline float __int2float_rn(int x) { return (float)x; }
910 __device__ static inline float __int2float_ru(int x) { return (float)x; }
911 __device__ static inline float __int2float_rz(int x) { return (float)x; }
912 
913 __device__ static inline float __int_as_float(int x) {
914  static_assert(sizeof(float) == sizeof(int), "");
915 
916  float tmp;
917  __builtin_memcpy(&tmp, &x, sizeof(tmp));
918 
919  return tmp;
920 }
921 
922 __device__ static inline double __ll2double_rd(long long int x) { return (double)x; }
923 __device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
924 __device__ static inline double __ll2double_ru(long long int x) { return (double)x; }
925 __device__ static inline double __ll2double_rz(long long int x) { return (double)x; }
926 
927 __device__ static inline float __ll2float_rd(long long int x) { return (float)x; }
928 __device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
929 __device__ static inline float __ll2float_ru(long long int x) { return (float)x; }
930 __device__ static inline float __ll2float_rz(long long int x) { return (float)x; }
931 
932 __device__ static inline double __longlong_as_double(long long int x) {
933  static_assert(sizeof(double) == sizeof(long long), "");
934 
935  double tmp;
936  __builtin_memcpy(&tmp, &x, sizeof(tmp));
937 
938  return tmp;
939 }
940 
941 __device__ static inline double __uint2double_rn(int x) { return (double)x; }
942 
943 __device__ static inline float __uint2float_rd(unsigned int x) { return (float)x; }
944 __device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
945 __device__ static inline float __uint2float_ru(unsigned int x) { return (float)x; }
946 __device__ static inline float __uint2float_rz(unsigned int x) { return (float)x; }
947 
948 __device__ static inline float __uint_as_float(unsigned int x) {
949  static_assert(sizeof(float) == sizeof(unsigned int), "");
950 
951  float tmp;
952  __builtin_memcpy(&tmp, &x, sizeof(tmp));
953 
954  return tmp;
955 }
956 
957 __device__ static inline double __ull2double_rd(unsigned long long int x) { return (double)x; }
958 __device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
959 __device__ static inline double __ull2double_ru(unsigned long long int x) { return (double)x; }
960 __device__ static inline double __ull2double_rz(unsigned long long int x) { return (double)x; }
961 
962 __device__ static inline float __ull2float_rd(unsigned long long int x) { return (float)x; }
963 __device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
964 __device__ static inline float __ull2float_ru(unsigned long long int x) { return (float)x; }
965 __device__ static inline float __ull2float_rz(unsigned long long int x) { return (float)x; }
966 
967 #if __HIP_CLANG_ONLY__
968 
// Clock functions: forward declarations. The definitions appear below
// under __HIP_DEVICE_COMPILE__; for host passes these resolve at link time.
__device__ long long int __clock64();
__device__ long long int __clock();
__device__ long long int clock64();
__device__ long long int clock();
// hip.amdgcn.bc - named sync
// NOTE: the definition below ignores both arguments; see __named_sync.
__device__ void __named_sync(int a, int b);
976 
977 #ifdef __HIP_DEVICE_COMPILE__
978 
979 // Clock functions
980 __device__
981 inline __attribute((always_inline))
982 long long int __clock64() {
983 return (long long int) __builtin_readcyclecounter();
984 }
985 
986 __device__
987 inline __attribute((always_inline))
988 long long int __clock() { return __clock64(); }
989 
990 __device__
991 inline __attribute__((always_inline))
992 long long int clock64() { return __clock64(); }
993 
994 __device__
995 inline __attribute__((always_inline))
996 long long int clock() { return __clock(); }
997 
// hip.amdgcn.bc - named sync
// The name arguments a and b are ignored; this lowers to a plain
// work-group barrier (s_barrier).
__device__
inline
void __named_sync(int a, int b) { __builtin_amdgcn_s_barrier(); }
1002 
1003 #endif // __HIP_DEVICE_COMPILE__
1004 
// warp vote function __all __any __ballot
// __all: CUDA-style warp vote, delegated to __ockl_wfall_i32
// (wavefront-wide "all lanes" reduction of the predicate).
__device__
inline
int __all(int predicate) {
    return __ockl_wfall_i32(predicate);
}

// __any: CUDA-style warp vote, delegated to __ockl_wfany_i32
// (wavefront-wide "any lane" reduction of the predicate).
__device__
inline
int __any(int predicate) {
    return __ockl_wfany_i32(predicate);
}
1017 
1018 // XXX from llvm/include/llvm/IR/InstrTypes.h
1019 #define ICMP_NE 33
1020 
// __ballot: one bit per lane, set where `predicate` is nonzero,
// computed via a wave-wide unsigned compare (predicate != 0).
__device__
inline
unsigned long long int __ballot(int predicate) {
    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}

// __ballot64: 64-wide variant; identical implementation to __ballot here.
__device__
inline
unsigned long long int __ballot64(int predicate) {
    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}
1032 
// hip.amdgcn.bc - lanemask
// Mask of active lanes whose index is strictly greater than this lane's.
// Lane 63 short-circuits to 0 since no higher lane exists.
__device__
inline
uint64_t __lanemask_gt()
{
    uint32_t lane = __ockl_lane_u32();
    if (lane == 63)
        return 0;
    uint64_t ballot = __ballot64(1);  // currently active lanes
    uint64_t mask = (~((uint64_t)0)) << (lane + 1);  // lanes above this one
    return mask & ballot;
}
1045 
1046 __device__
1047 inline
1048 uint64_t __lanemask_lt()
1049 {
1050  uint32_t lane = __ockl_lane_u32();
1051  int64_t ballot = __ballot64(1);
1052  uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
1053  return mask & ballot;
1054 }
1055 
1056 __device__
1057 inline
1058 uint64_t __lanemask_eq()
1059 {
1060  uint32_t lane = __ockl_lane_u32();
1061  int64_t mask = ((uint64_t)1 << lane);
1062  return mask;
1063 }
1064 
1065 
// Identity conversion: local (LDS) pointers are returned unchanged here.
__device__ inline void* __local_to_generic(void* p) { return p; }
1067 
#ifdef __HIP_DEVICE_COMPILE__
// Returns a generic pointer to the start of the dynamic group (LDS)
// segment, which begins right after the statically allocated group
// memory (offset __llvm_amdgcn_groupstaticsize()).
__device__
inline
void* __get_dynamicgroupbaseptr()
{
    // Get group segment base pointer.
    return (char*)__local_to_generic((void*)__to_local(__llvm_amdgcn_groupstaticsize()));
}
#else
// Host compilation pass: declaration only; resolved for device code.
__device__
void* __get_dynamicgroupbaseptr();
#endif // __HIP_DEVICE_COMPILE__
1080 
// Compatibility alias; forwards to __get_dynamicgroupbaseptr().
__device__
inline
void *__amdgcn_get_dynamicgroupbaseptr() {
    return __get_dynamicgroupbaseptr();
}
1086 
// Memory Fence Functions
// All three issue a seq_cst work-item fence; they differ only in scope.
// __threadfence: fence at device scope.
__device__
inline
static void __threadfence()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
}

// __threadfence_block: fence at work-group (block) scope.
__device__
inline
static void __threadfence_block()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
}

// __threadfence_system: fence across all SVM devices (system-wide).
__device__
inline
static void __threadfence_system()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
}
1108 
// abort
// Device-side abort: traps execution via __builtin_trap. Declared weak
// so a stronger definition elsewhere can override it.
__device__
inline
__attribute__((weak))
void abort() {
    return __builtin_trap();
}
1116 
1117 // The noinline attribute helps encapsulate the printf expansion,
1118 // which otherwise has a performance impact just by increasing the
1119 // size of the calling function. Additionally, the weak attribute
1120 // allows the function to exist as a global although its definition is
1121 // included in every compilation unit.
1122 #if defined(_WIN32) || defined(_WIN64)
// Windows wide-character assert handler; currently traps without
// printing because wchar_t formatting is unsupported (see FIXME).
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void _wassert(const wchar_t *_msg, const wchar_t *_file, unsigned _line) {
    // FIXME: Need `wchar_t` support to generate assertion message.
    __builtin_trap();
}
1128 #else /* defined(_WIN32) || defined(_WIN64) */
// glibc-style device assert handler: prints the failing location and
// expression via device printf, then traps the wave.
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void __assert_fail(const char *assertion,
                   const char *file,
                   unsigned int line,
                   const char *function)
{
    printf("%s:%u: %s: Device-side assertion `%s' failed.\n", file, line,
           function, assertion);
    __builtin_trap();
}
1139 
// CUDA-compatible assert entry point (extra charsize parameter).
// Traps immediately; no message is produced on this path.
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void __assertfail(const char *assertion,
                  const char *file,
                  unsigned int line,
                  const char *function,
                  size_t charsize)
{
    // ignore all the args for now.
    __builtin_trap();
}
1150 #endif /* defined(_WIN32) || defined(_WIN64) */
1151 
// Work-group barrier. With nonzero fence flags, the hardware barrier is
// bracketed by release (before) and acquire (after) fences at the given
// scope so memory writes made before the barrier are visible after it.
// With zero flags it is a bare execution barrier.
__device__
inline
static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
{
    if (flags) {
        __atomic_work_item_fence(flags, __memory_order_release, scope);
        __builtin_amdgcn_s_barrier();
        __atomic_work_item_fence(flags, __memory_order_acquire, scope);
    } else {
        __builtin_amdgcn_s_barrier();
    }
}
1164 
// Convenience wrapper: barrier at work-group scope with fence flags
// passed as an int (e.g. __CLK_LOCAL_MEM_FENCE).
__device__
inline
static void __barrier(int n)
{
    __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
}
1171 
// CUDA-style block barrier with a local-memory fence. Convergent: must
// be reached by all threads of the block.
__device__
inline
__attribute__((convergent))
void __syncthreads()
{
    __barrier(__CLK_LOCAL_MEM_FENCE);
}
1179 
// Barrier-with-reduction variants of __syncthreads. Each normalizes the
// predicate to 0/1 (!!predicate) and delegates to a work-group
// reduction helper (__ockl_wgred_*). NOTE(review): assumed to
// synchronize the block like __syncthreads — the barrier semantics live
// inside the __ockl_wgred_* implementations; confirm against the OCKL
// device library.
// __syncthreads_count: sum of nonzero predicates across the block.
__device__
inline
__attribute__((convergent))
int __syncthreads_count(int predicate)
{
    return __ockl_wgred_add_i32(!!predicate);
}

// __syncthreads_and: nonzero iff the predicate is nonzero on all threads.
__device__
inline
__attribute__((convergent))
int __syncthreads_and(int predicate)
{
    return __ockl_wgred_and_i32(!!predicate);
}

// __syncthreads_or: nonzero iff the predicate is nonzero on any thread.
__device__
inline
__attribute__((convergent))
int __syncthreads_or(int predicate)
{
    return __ockl_wgred_or_i32(!!predicate);
}
1203 
1204 // hip.amdgcn.bc - device routine
1205 /*
1206  HW_ID Register bit structure
1207  WAVE_ID 3:0 Wave buffer slot number. 0-9.
1208  SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
1209  PIPE_ID 7:6 Pipeline from which the wave was dispatched.
1210  CU_ID 11:8 Compute Unit the wave is assigned to.
1211  SH_ID 12 Shader Array (within an SE) the wave is assigned to.
1212  SE_ID 14:13 Shader Engine the wave is assigned to.
1213  TG_ID 19:16 Thread-group ID
1214  VM_ID 23:20 Virtual Memory ID
1215  QUEUE_ID 26:24 Queue from which this wave was dispatched.
1216  STATE_ID 29:27 State ID (graphics only, not compute).
1217  ME_ID 31:30 Micro-engine ID.
1218  */
1219 
1220 #define HW_ID 4
1221 
1222 #define HW_ID_CU_ID_SIZE 4
1223 #define HW_ID_CU_ID_OFFSET 8
1224 
1225 #define HW_ID_SE_ID_SIZE 2
1226 #define HW_ID_SE_ID_OFFSET 13
1227 
1228 /*
1229  Encoding of parameter bitmask
1230  HW_ID 5:0 HW_ID
1231  OFFSET 10:6 Range: 0..31
1232  SIZE 15:11 Range: 1..32
1233  */
1234 
1235 #define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
1236 
1237 /*
1238  __smid returns the wave's assigned Compute Unit and Shader Engine.
1239  The Compute Unit, CU_ID returned in bits 3:0, and Shader Engine, SE_ID in bits 5:4.
1240  Note: the results vary over time.
1241  SZ minus 1 since SIZE is 1-based.
1242 */
__device__
inline
unsigned __smid(void)
{
    // Read CU_ID and SE_ID out of the HW_ID register with s_getreg.
    // The SIZE field of the immediate is 1-based, hence SIZE-1.
    unsigned cu_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_CU_ID_SIZE-1, HW_ID_CU_ID_OFFSET, HW_ID));
    unsigned se_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));

    /* Each shader engine has 16 CU */
    // Pack as (SE_ID << 4) | CU_ID, giving a device-unique-ish CU index.
    return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
}
1255 
1260 #define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
1261 #define HIP_DYNAMIC_SHARED_ATTRIBUTE
1262 
1263 #endif //defined(__clang__) && defined(__HIP__)
1264 
1265 
// loop unrolling
// Byte-wise device memcpy, manually unrolled 4 bytes per iteration; the
// switch handles the 1-3 byte tail via deliberate case fallthrough.
// Returns dst, matching the standard memcpy contract. Regions must not
// overlap (same precondition as memcpy).
static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);
    auto srcPtr = static_cast<const unsigned char*>(src);

    while (size >= 4u) {
        dstPtr[0] = srcPtr[0];
        dstPtr[1] = srcPtr[1];
        dstPtr[2] = srcPtr[2];
        dstPtr[3] = srcPtr[3];

        size -= 4u;
        srcPtr += 4u;
        dstPtr += 4u;
    }
    switch (size) {
        case 3:
            dstPtr[2] = srcPtr[2];
            // intentional fallthrough
        case 2:
            dstPtr[1] = srcPtr[1];
            // intentional fallthrough
        case 1:
            dstPtr[0] = srcPtr[0];
    }

    return dst;
}
1292 
// Byte-wise device memset, manually unrolled 4 bytes per iteration; the
// switch handles the 1-3 byte tail via deliberate case fallthrough.
// Returns dst, matching the standard memset contract.
static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);

    while (size >= 4u) {
        dstPtr[0] = val;
        dstPtr[1] = val;
        dstPtr[2] = val;
        dstPtr[3] = val;

        size -= 4u;
        dstPtr += 4u;
    }
    switch (size) {
        case 3:
            dstPtr[2] = val;
            // intentional fallthrough
        case 2:
            dstPtr[1] = val;
            // intentional fallthrough
        case 1:
            dstPtr[0] = val;
    }

    return dst;
}
1316 #ifndef __OPENMP_AMDGCN__
// Standard-named device memcpy; delegates to __hip_hc_memcpy.
static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
    return __hip_hc_memcpy(dst, src, size);
}

// Standard-named device memset; narrows val to a byte (as memset does)
// and delegates to __hip_hc_memset.
static inline __device__ void* memset(void* ptr, int val, size_t size) {
    unsigned char val8 = static_cast<unsigned char>(val);
    return __hip_hc_memset(ptr, val8, size);
}
1325 #endif // !__OPENMP_AMDGCN__
1326 #endif
hip_runtime_api.h
Defines the API signatures for HIP runtime. This file can be compiled with a standard compiler.
char4
Definition: hip_vector_types.h:1513
uchar2Holder
Definition: device_functions.h:140
llvm_intrinsics.h
Contains declarations for wrapper functions for llvm intrinsics like llvm.amdgcn.s....
host_defines.h
TODO-doc.
ucharHolder
Definition: device_functions.h:133
device_library_decls.h
Contains declarations for types and functions in device library.