HIP: Heterogeneous-computing Interface for Portability
device_functions.h
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
23 #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
24 #define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
25 
26 #include "host_defines.h"
27 #include "math_fwd.h"
28 
29 #include <hip/hip_runtime_api.h>
30 #include <stddef.h>
31 
32 
33 #include <hip/hip_vector_types.h>
36 
#if __HIP_CLANG_ONLY__
// HIP clang toolchain: device-side printf is supplied by the device library.
extern "C" __device__ int printf(const char *fmt, ...);
#else
// Other compilers: no-op stub so device code that calls printf still builds.
template <typename... All>
static inline __device__ void printf(const char* format, All... all) {}
#endif // __HIP_CLANG_ONLY__
43 
44 /*
45 Integer Intrinsics
46 */
47 
// integer intrinsic function __popc __clz __ffs __brev
49 __device__ static inline unsigned int __popc(unsigned int input) {
50  return __builtin_popcount(input);
51 }
52 __device__ static inline unsigned int __popcll(unsigned long long int input) {
53  return __builtin_popcountll(input);
54 }
55 
// Count of leading zero bits in the 32-bit value, via the OCML/OCKL device
// library. NOTE(review): the result for input 0 is whatever __ockl_clz_u32
// returns (presumably 32) -- confirm against the device library.
__device__ static inline int __clz(int input) {
    return __ockl_clz_u32((uint)input);
}
59 
// Count of leading zero bits in the 64-bit value, via the OCKL device
// library. NOTE(review): the result for input 0 is whatever __ockl_clz_u64
// returns (presumably 64) -- confirm against the device library.
__device__ static inline int __clzll(long long int input) {
    return __ockl_clz_u64((ullong)input);
}
63 
64 __device__ static inline unsigned int __ffs(unsigned int input) {
65  return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
66 }
67 
68 __device__ static inline unsigned int __ffsll(unsigned long long int input) {
69  return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
70 }
71 
72 __device__ static inline unsigned int __ffs(int input) {
73  return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
74 }
75 
76 __device__ static inline unsigned int __ffsll(long long int input) {
77  return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
78 }
79 
// Reverse the bit order of a 32-bit value (bit 0 <-> bit 31, etc.).
// __builtin_bitreverse32 is a clang builtin.
__device__ static inline unsigned int __brev(unsigned int input) {
    return __builtin_bitreverse32(input);
}
83 
// Reverse the bit order of a 64-bit value (bit 0 <-> bit 63, etc.).
// __builtin_bitreverse64 is a clang builtin.
__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
    return __builtin_bitreverse64(input);
}
87 
88 __device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
89  return input == 0 ? -1 : __builtin_ctzl(input);
90 }
91 
// Extract a bit field of `width` bits starting at bit `offset` from src0.
// Both offset and width are taken modulo 32.
// NOTE(review): when offset + width > 32 the shift count goes negative
// (undefined behavior in C++); callers presumably guarantee
// offset + width <= 32 -- confirm.
__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
    uint32_t offset = src1 & 31;
    uint32_t width = src2 & 31;
    // width == 0 must be special-cased: a 32-bit shift by 32 is undefined.
    return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
}
97 
// Extract a bit field of `width` bits starting at bit `offset` from src0.
// Both offset and width are taken modulo 64.
// NOTE(review): when offset + width > 64 the shift count goes negative
// (undefined behavior); callers presumably guarantee
// offset + width <= 64 -- confirm.
__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
    uint64_t offset = src1 & 63;
    uint64_t width = src2 & 63;
    // width == 0 must be special-cased: a 64-bit shift by 64 is undefined.
    return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
}
103 
104 __device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
105  uint32_t offset = src2 & 31;
106  uint32_t width = src3 & 31;
107  uint32_t mask = (1 << width) - 1;
108  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
109 }
110 
111 __device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
112  uint64_t offset = src2 & 63;
113  uint64_t width = src3 & 63;
114  uint64_t mask = (1ULL << width) - 1;
115  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
116 }
117 
// Forward declarations for the integer intrinsics defined later in this
// header.
// NOTE(review): several return types differ from the CUDA equivalents
// (e.g. __hadd returns unsigned int here while CUDA's returns int, and
// __umul24 returns int while CUDA's returns unsigned int) -- kept as-is
// for source compatibility with existing callers.
__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
__device__ static unsigned int __hadd(int x, int y);
__device__ static int __mul24(int x, int y);
__device__ static long long int __mul64hi(long long int x, long long int y);
__device__ static int __mulhi(int x, int y);
__device__ static int __rhadd(int x, int y);
__device__ static unsigned int __sad(int x, int y,unsigned int z);
__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
__device__ static int __umul24(unsigned int x, unsigned int y);
__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
131 
// Punning helper: view one 32-bit word either as 4 bytes or as an
// unsigned int (used by __byte_perm to address individual bytes).
struct ucharHolder {
    union {
        unsigned char c[4];
        unsigned int ui;
    };
} __attribute__((aligned(4)));
138 
// Punning helper: view two 32-bit words either as 8 bytes or as two
// unsigned ints (the 64-bit byte source for __byte_perm).
struct uchar2Holder {
    union {
        unsigned int ui[2];
        unsigned char c[8];
    };
} __attribute__((aligned(8)));
145 
146 __device__
147 static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
148  struct uchar2Holder cHoldVal;
149  struct ucharHolder cHoldKey;
150  struct ucharHolder cHoldOut;
151  cHoldKey.ui = s;
152  cHoldVal.ui[0] = x;
153  cHoldVal.ui[1] = y;
154  cHoldOut.c[0] = cHoldVal.c[cHoldKey.c[0]];
155  cHoldOut.c[1] = cHoldVal.c[cHoldKey.c[1]];
156  cHoldOut.c[2] = cHoldVal.c[cHoldKey.c[2]];
157  cHoldOut.c[3] = cHoldVal.c[cHoldKey.c[3]];
158  return cHoldOut.ui;
159 }
160 
161 __device__ static inline unsigned int __hadd(int x, int y) {
162  int z = x + y;
163  int sign = z & 0x8000000;
164  int value = z & 0x7FFFFFFF;
165  return ((value) >> 1 || sign);
166 }
167 
// 24-bit signed multiply via the OCKL device library (presumably uses the
// low 24 bits of each operand, like CUDA __mul24 -- confirm against OCKL).
__device__ static inline int __mul24(int x, int y) {
    return __ockl_mul24_i32(x, y);
}
171 
/*
 * Signed 64x64 multiply, returning the high 64 bits of the 128-bit
 * product, assembled from four 32x32 partial products. The low halves
 * are kept unsigned and the high halves signed so the sign propagates
 * only through the cross terms and the final accumulation.
 * NOTE(review): assumes LP64 'long'/'ulong' are 64-bit; not true on
 * LLP64 targets (e.g. Windows) -- confirm intended platforms.
 */
__device__ static inline long long __mul64hi(long long int x, long long int y) {
    ulong x0 = (ulong)x & 0xffffffffUL;
    long x1 = x >> 32;
    ulong y0 = (ulong)y & 0xffffffffUL;
    long y1 = y >> 32;
    ulong z0 = x0*y0;
    long t = x1*y0 + (z0 >> 32);
    long z1 = t & 0xffffffffL;
    long z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}
184 
// High 32 bits of the signed 32x32 -> 64-bit product, via the OCKL
// device library.
__device__ static inline int __mulhi(int x, int y) {
    return __ockl_mul_hi_i32(x, y);
}
188 
189 __device__ static inline int __rhadd(int x, int y) {
190  int z = x + y + 1;
191  int sign = z & 0x8000000;
192  int value = z & 0x7FFFFFFF;
193  return ((value) >> 1 || sign);
194 }
195 __device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
196  return x > y ? x - y + z : y - x + z;
197 }
198 __device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
199  return (x + y) >> 1;
200 }
// 24-bit unsigned multiply via the OCKL device library (presumably uses
// the low 24 bits of each operand, like CUDA __umul24 -- confirm).
// NOTE(review): returns int while CUDA's __umul24 returns unsigned int;
// kept for source compatibility.
__device__ static inline int __umul24(unsigned int x, unsigned int y) {
    return __ockl_mul24_u32(x, y);
}
204 
__device__
/*
 * Unsigned 64x64 multiply, returning the high 64 bits of the 128-bit
 * product, assembled from four 32x32 partial products.
 * NOTE(review): assumes LP64 'ulong' is 64-bit; not true on LLP64
 * targets (e.g. Windows) -- confirm intended platforms.
 */
static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
    ulong x0 = x & 0xffffffffUL;
    ulong x1 = x >> 32;
    ulong y0 = y & 0xffffffffUL;
    ulong y1 = y >> 32;
    ulong z0 = x0*y0;
    ulong t = x1*y0 + (z0 >> 32);
    ulong z1 = t & 0xffffffffUL;
    ulong z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}
218 
// High 32 bits of the unsigned 32x32 -> 64-bit product, via the OCKL
// device library.
__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
    return __ockl_mul_hi_u32(x, y);
}
222 __device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
223  return (x + y + 1) >> 1;
224 }
// Unsigned sum of absolute difference: |x - y| + z, via the OCKL device
// library (presumably __ockl_sadd_u32 implements sad, not saturating add
// -- confirm against the device library).
__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
    return __ockl_sadd_u32(x, y, z);
}
228 
// Index of the calling lane within its wavefront: the two mbcnt halves
// together count how many lanes below this one are set in the all-ones
// mask, which equals the lane id.
__device__ static inline unsigned int __lane_id() {
    return __builtin_amdgcn_mbcnt_hi(
        -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
}
233 
__device__
// Count of set bits in x among lanes 0-31 below the caller, added to y
// (thin wrapper over the amdgcn mbcnt_lo builtin).
static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_lo(x,y);};

__device__
// Count of set bits in x among lanes 32-63 below the caller, added to y
// (thin wrapper over the amdgcn mbcnt_hi builtin).
static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_hi(x,y);};
239 
240 /*
241 HIP specific device functions
242 */
243 
// Backward permute: each lane *pulls* a 32-bit value from the lane
// addressed by `index`. The index is passed straight to the builtin,
// which presumably expects a byte address (lane * 4) -- confirm callers
// pre-scale it (the __shfl implementations below shift by 2 themselves).
__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
    return tmp.u;
}
249 
// Float flavor of __hip_ds_bpermute: the payload is bit-reinterpreted
// through the union (tmp.f write / tmp.f read), never value-converted.
__device__ static inline float __hip_ds_bpermutef(int index, float src) {
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
    tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
    return tmp.f;
}
255 
// Forward permute: each lane *pushes* its 32-bit value to the lane
// addressed by `index` (see __hip_ds_bpermute for the index convention).
__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
    return tmp.u;
}
261 
262 __device__ static inline float __hip_ds_permutef(int index, float src) {
263  union { int i; unsigned u; float f; } tmp; tmp.u = src;
264  tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
265  return tmp.u;
266 }
267 
// ds_swizzle requires its pattern as a compile-time immediate, so the
// user-facing macros forward it as a template argument.
#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))

// Intra-wavefront lane swizzle of a 32-bit payload, pattern fixed at
// compile time.
template <int pattern>
__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
    return tmp.u;
}

// Float flavor: the payload is bit-reinterpreted through the union, not
// value-converted.
template <int pattern>
__device__ static inline float __hip_ds_swizzlef_N(float src) {
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
    tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
    return tmp.f;
}
284 
// mov_dpp needs all control operands as compile-time immediates, so the
// user-facing macro forwards them as template arguments.
#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
  __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))

// Data-parallel-primitive move: cross-lane move of src controlled by the
// dpp_ctrl/row_mask/bank_mask/bound_ctrl immediates.
template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
__device__ static inline int __hip_move_dpp_N(int src) {
    return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
                                    bound_ctrl);
}
293 
294 // FIXME: Remove the following workaround once the clang change is released.
295 // This is for backward compatibility with older clang which does not define
296 // __AMDGCN_WAVEFRONT_SIZE. It does not consider -mwavefrontsize64.
#ifndef __AMDGCN_WAVEFRONT_SIZE
// gfx10 (RDNA) parts default to wave32; everything else here is wave64.
#if __gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__
#define __AMDGCN_WAVEFRONT_SIZE 32
#else
#define __AMDGCN_WAVEFRONT_SIZE 64
#endif
#endif
// Lanes per wavefront ("warp" in CUDA terminology); used as the default
// width for the __shfl* families below.
static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
305 
/*
 * __shfl overload set: every lane reads `var` from lane `src_lane` of its
 * width-sized sub-group (width must be a power of two, <= warpSize).
 * 64-bit types are split into two 32-bit shuffles and reassembled.
 * NOTE(review): unlike CUDA's __shfl_sync there is no participant mask;
 * all lanes of the wavefront are presumed active -- confirm callers.
 */
__device__
inline
int __shfl(int var, int src_lane, int width = warpSize) {
    int self = __lane_id();
    // Clamp the source into this lane's sub-group, then scale to the
    // byte address ds_bpermute expects (lane * 4).
    int index = src_lane + (self & ~(width-1));
    return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
    // Bit-reinterpret through the union and defer to the int overload.
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl(tmp.i, src_lane, width);
    return tmp.u;
}
__device__
inline
float __shfl(float var, int src_lane, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl(tmp.i, src_lane, width);
    return tmp.f;
}
__device__
inline
double __shfl(double var, int src_lane, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    // Shuffle the two 32-bit halves separately, then reassemble.
    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
long __shfl(long var, int src_lane, int width = warpSize)
{
    // long is 64-bit on LP64 targets but 32-bit under MSVC (LLP64).
    #ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
    #endif
}
__device__
inline
unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
    #ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
    #endif
}
__device__
inline
long long __shfl(long long var, int src_lane, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
409 
/*
 * __shfl_up overload set: every lane reads `var` from the lane
 * `lane_delta` positions below it within its width-sized sub-group;
 * lanes whose source would fall below the sub-group keep their own value.
 * 64-bit types are split into two 32-bit shuffles and reassembled.
 */
__device__
inline
int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
    int self = __lane_id();
    int index = self - lane_delta;
    // Underflowing the sub-group boundary reads the lane's own value.
    index = (index < (self & ~(width-1)))?self:index;
    return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_up(tmp.i, lane_delta, width);
    return tmp.u;
}
__device__
inline
float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_up(tmp.i, lane_delta, width);
    return tmp.f;
}
__device__
inline
double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
{
    // long is 64-bit on LP64 targets but 32-bit under MSVC (LLP64).
    #ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
    #endif
}

__device__
inline
unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
    #endif
}

__device__
inline
long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");
    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}

__device__
inline
unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
515 
/*
 * __shfl_down overload set: every lane reads `var` from the lane
 * `lane_delta` positions above it within its width-sized sub-group;
 * lanes whose source would pass the sub-group's end keep their own value.
 * 64-bit types are split into two 32-bit shuffles and reassembled.
 */
__device__
inline
int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
    int self = __lane_id();
    int index = self + lane_delta;
    // Overrunning the sub-group boundary reads the lane's own value.
    index = (int)((self&(width-1))+lane_delta) >= width?self:index;
    return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_down(tmp.i, lane_delta, width);
    return tmp.u;
}
__device__
inline
float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_down(tmp.i, lane_delta, width);
    return tmp.f;
}
__device__
inline
double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
{
    // long is 64-bit on LP64 targets but 32-bit under MSVC (LLP64).
    #ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
    #endif
}
__device__
inline
unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
    #endif
}
__device__
inline
long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");
    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
618 
/*
 * __shfl_xor overload set: every lane reads `var` from the lane whose id
 * is its own XOR `lane_mask`, within its width-sized sub-group; lanes
 * whose partner lies outside the sub-group keep their own value.
 * 64-bit types are split into two 32-bit shuffles and reassembled.
 */
__device__
inline
int __shfl_xor(int var, int lane_mask, int width = warpSize) {
    int self = __lane_id();
    int index = self^lane_mask;
    // Partners past the end of this lane's sub-group read the lane's
    // own value.
    index = index >= ((self+width)&~(width-1))?self:index;
    return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
    return tmp.u;
}
__device__
inline
float __shfl_xor(float var, int lane_mask, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
    return tmp.f;
}
__device__
inline
double __shfl_xor(double var, int lane_mask, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
long __shfl_xor(long var, int lane_mask, int width = warpSize)
{
    // long is 64-bit on LP64 targets but 32-bit under MSVC (LLP64).
    #ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
    #endif
}
__device__
inline
unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
    #else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
    #endif
}
__device__
inline
long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");
    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
__device__
inline
unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
{
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
// SWAR lane masks: MASK1 selects bytes 0 and 2, MASK2 selects bytes 1
// and 3, so alternating bytes can be combined without carries crossing
// between adjacent selected bytes.
#define MASK1 0x00ff00ff
#define MASK2 0xff00ff00

// Packed per-byte add of the .w components, two interleaved byte pairs
// at a time (carry out of each byte is discarded by the mask).
// NOTE(review): only .w is read and written; out.x/.y/.z are left
// uninitialized, and masking a (promoted) char .w with 32-bit masks only
// makes sense if .w actually carries 4 packed bytes -- confirm the char4
// layout this was written against.
__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 + one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 + one2) & MASK2);
    return out;
}
734 
// Packed per-byte subtract of the .w components (borrow out of each byte
// is discarded by the mask). See the review note on __hip_hc_add8pk:
// only .w is processed and out.x/.y/.z stay uninitialized.
__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 - one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 - one2) & MASK2);
    return out;
}
745 
// Packed per-byte multiply of the .w components.
// NOTE(review): unlike add/sub, the masked multiply's cross terms can
// contaminate the upper selected byte of each pair (e.g. byte0*byte2
// products land in byte 2's lane), so this does not compute independent
// per-byte products -- confirm intended semantics. Only .w is processed;
// out.x/.y/.z stay uninitialized (see __hip_hc_add8pk).
__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 * one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 * one2) & MASK2);
    return out;
}
756 
757 /*
758  * Rounding modes are not yet supported in HIP
759  * TODO: Conversion functions are not correct, need to fix when BE is ready
760 */
761 
762 __device__ static inline float __double2float_rd(double x) { return (double)x; }
763 __device__ static inline float __double2float_rn(double x) { return (double)x; }
764 __device__ static inline float __double2float_ru(double x) { return (double)x; }
765 __device__ static inline float __double2float_rz(double x) { return (double)x; }
766 
767 __device__ static inline int __double2hiint(double x) {
768  static_assert(sizeof(double) == 2 * sizeof(int), "");
769 
770  int tmp[2];
771  __builtin_memcpy(tmp, &x, sizeof(tmp));
772 
773  return tmp[1];
774 }
775 __device__ static inline int __double2loint(double x) {
776  static_assert(sizeof(double) == 2 * sizeof(int), "");
777 
778  int tmp[2];
779  __builtin_memcpy(tmp, &x, sizeof(tmp));
780 
781  return tmp[0];
782 }
783 
784 __device__ static inline int __double2int_rd(double x) { return (int)x; }
785 __device__ static inline int __double2int_rn(double x) { return (int)x; }
786 __device__ static inline int __double2int_ru(double x) { return (int)x; }
787 __device__ static inline int __double2int_rz(double x) { return (int)x; }
788 
789 __device__ static inline long long int __double2ll_rd(double x) { return (long long int)x; }
790 __device__ static inline long long int __double2ll_rn(double x) { return (long long int)x; }
791 __device__ static inline long long int __double2ll_ru(double x) { return (long long int)x; }
792 __device__ static inline long long int __double2ll_rz(double x) { return (long long int)x; }
793 
794 __device__ static inline unsigned int __double2uint_rd(double x) { return (unsigned int)x; }
795 __device__ static inline unsigned int __double2uint_rn(double x) { return (unsigned int)x; }
796 __device__ static inline unsigned int __double2uint_ru(double x) { return (unsigned int)x; }
797 __device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
798 
799 __device__ static inline unsigned long long int __double2ull_rd(double x) {
800  return (unsigned long long int)x;
801 }
802 __device__ static inline unsigned long long int __double2ull_rn(double x) {
803  return (unsigned long long int)x;
804 }
805 __device__ static inline unsigned long long int __double2ull_ru(double x) {
806  return (unsigned long long int)x;
807 }
808 __device__ static inline unsigned long long int __double2ull_rz(double x) {
809  return (unsigned long long int)x;
810 }
811 
812 __device__ static inline long long int __double_as_longlong(double x) {
813  static_assert(sizeof(long long) == sizeof(double), "");
814 
815  long long tmp;
816  __builtin_memcpy(&tmp, &x, sizeof(tmp));
817 
818  return tmp;
819 }
820 
821 /*
822 __device__ unsigned short __float2half_rn(float x);
823 __device__ float __half2float(unsigned short);
824 
The above device functions are not valid.
826 Use
827 __device__ __half __float2half_rn(float x);
828 __device__ float __half2float(__half);
829 from hip_fp16.h
830 
831 CUDA implements half as unsigned short whereas, HIP doesn't.
832 
833 */
834 
// Float -> int conversions with explicit rounding: round in floating point
// with the matching __ocml primitive (floor = toward -inf, rint = to nearest
// even, ceil = toward +inf, trunc = toward zero), then convert.
__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
839 
840 __device__ static inline long long int __float2ll_rd(float x) { return (long long int)x; }
841 __device__ static inline long long int __float2ll_rn(float x) { return (long long int)x; }
842 __device__ static inline long long int __float2ll_ru(float x) { return (long long int)x; }
843 __device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }
844 
845 __device__ static inline unsigned int __float2uint_rd(float x) { return (unsigned int)x; }
846 __device__ static inline unsigned int __float2uint_rn(float x) { return (unsigned int)x; }
847 __device__ static inline unsigned int __float2uint_ru(float x) { return (unsigned int)x; }
848 __device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
849 
850 __device__ static inline unsigned long long int __float2ull_rd(float x) {
851  return (unsigned long long int)x;
852 }
853 __device__ static inline unsigned long long int __float2ull_rn(float x) {
854  return (unsigned long long int)x;
855 }
856 __device__ static inline unsigned long long int __float2ull_ru(float x) {
857  return (unsigned long long int)x;
858 }
859 __device__ static inline unsigned long long int __float2ull_rz(float x) {
860  return (unsigned long long int)x;
861 }
862 
// Reinterprets the bits of a float as a signed int (no numeric conversion).
__device__ static inline int __float_as_int(float x) {
    static_assert(sizeof(int) == sizeof(float), "");

    int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

// Reinterprets the bits of a float as an unsigned int.
__device__ static inline unsigned int __float_as_uint(float x) {
    static_assert(sizeof(unsigned int) == sizeof(float), "");

    unsigned int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
880 
// Builds a double whose IEEE-754 bit pattern has `hi` in bits 63:32 and
// `lo` in bits 31:0.
__device__ static inline double __hiloint2double(int hi, int lo) {
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    // `lo` goes through uint32_t first so a negative value is not
    // sign-extended into the high half.
    uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    double tmp1;
    __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));

    return tmp1;
}
890 
// int -> double is exact (every int32 is representable), so the single
// round-to-nearest variant suffices.
__device__ static inline double __int2double_rn(int x) { return (double)x; }

// NOTE(review): int -> float is inexact for |x| > 2^24, so the four rounding
// variants could differ; all currently use the cast's default rounding.
// Confirm whether mode-specific conversions are required here.
__device__ static inline float __int2float_rd(int x) { return (float)x; }
__device__ static inline float __int2float_rn(int x) { return (float)x; }
__device__ static inline float __int2float_ru(int x) { return (float)x; }
__device__ static inline float __int2float_rz(int x) { return (float)x; }

// Reinterprets the bits of an int as a float (no numeric conversion).
__device__ static inline float __int_as_float(int x) {
    static_assert(sizeof(float) == sizeof(int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
906 
// NOTE(review): long long -> double/float is inexact for large magnitudes
// (> 2^53 for double, > 2^24 for float), so the rd/rn/ru/rz variants could
// differ; all currently use the cast's default rounding.
__device__ static inline double __ll2double_rd(long long int x) { return (double)x; }
__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
__device__ static inline double __ll2double_ru(long long int x) { return (double)x; }
__device__ static inline double __ll2double_rz(long long int x) { return (double)x; }

__device__ static inline float __ll2float_rd(long long int x) { return (float)x; }
__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
__device__ static inline float __ll2float_ru(long long int x) { return (float)x; }
__device__ static inline float __ll2float_rz(long long int x) { return (float)x; }

// Reinterprets the bits of a long long as a double (no numeric conversion).
__device__ static inline double __longlong_as_double(long long int x) {
    static_assert(sizeof(double) == sizeof(long long), "");

    double tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
925 
926 __device__ static inline double __uint2double_rn(int x) { return (double)x; }
927 
// NOTE(review): unsigned int -> float is inexact for x > 2^24, so the
// rd/rn/ru/rz variants could differ; all currently use the cast's default
// rounding.
__device__ static inline float __uint2float_rd(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_ru(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_rz(unsigned int x) { return (float)x; }

// Reinterprets the bits of an unsigned int as a float (no numeric conversion).
__device__ static inline float __uint_as_float(unsigned int x) {
    static_assert(sizeof(float) == sizeof(unsigned int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
941 
// NOTE(review): unsigned long long -> double/float is inexact for large
// magnitudes (> 2^53 for double, > 2^24 for float), so the rd/rn/ru/rz
// variants could differ; all currently use the cast's default rounding.
__device__ static inline double __ull2double_rd(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_ru(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_rz(unsigned long long int x) { return (double)x; }

__device__ static inline float __ull2float_rd(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_ru(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_rz(unsigned long long int x) { return (float)x; }
951 
952 #if __HIP_CLANG_ONLY__
953 
// Clock functions (defined below when compiling for the device).
__device__ long long int __clock64();
__device__ long long int __clock();
__device__ long long int clock64();
__device__ long long int clock();
// hip.amdgcn.bc - named sync
// NOTE(review): the device definition ignores both arguments and issues a
// plain s_barrier -- see below.
__device__ void __named_sync(int a, int b);
961 
962 #ifdef __HIP_DEVICE_COMPILE__
963 
// Clock functions
// Raw hardware cycle counter (__builtin_readcyclecounter).
__device__
inline __attribute((always_inline))
long long int __clock64() {
    return (long long int) __builtin_readcyclecounter();
}

__device__
inline __attribute((always_inline))
long long int __clock() { return __clock64(); }

// CUDA-compatible spellings of the same counters.
__device__
inline __attribute__((always_inline))
long long int clock64() { return __clock64(); }

__device__
inline __attribute__((always_inline))
long long int clock() { return __clock(); }

// hip.amdgcn.bc - named sync
// Both arguments are ignored; this is a plain work-group barrier.
__device__
inline
void __named_sync(int a, int b) { __builtin_amdgcn_s_barrier(); }
987 
988 #endif // __HIP_DEVICE_COMPILE__
989 
// warp vote function __all __any __ballot
// Non-zero iff `predicate` is non-zero on every participating lane of the
// wavefront (wavefront-wide AND reduction via OCKL).
__device__
inline
int __all(int predicate) {
    return __ockl_wfall_i32(predicate);
}

// Non-zero iff `predicate` is non-zero on at least one participating lane
// (wavefront-wide OR reduction via OCKL).
__device__
inline
int __any(int predicate) {
    return __ockl_wfany_i32(predicate);
}
1002 
1003 // XXX from llvm/include/llvm/IR/InstrTypes.h
1004 #define ICMP_NE 33
1005 
// Returns a mask with one bit per lane, set where `predicate` is non-zero.
// Implemented as a wavefront-wide unsigned "compare not equal to 0"
// (ICMP_NE, the LLVM predicate value defined above).
__device__
inline
unsigned long long int __ballot(int predicate) {
    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}

// 64-bit-named variant with an identical body; the lane-mask helpers below
// assume a 64-lane wavefront.
__device__
inline
unsigned long long int __ballot64(int predicate) {
    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}
1017 
1018 // hip.amdgcn.bc - lanemask
// Mask of active lanes whose index is strictly greater than the caller's.
__device__
inline
uint64_t __lanemask_gt()
{
    uint32_t lane = __ockl_lane_u32();
    // Lane 63 is the highest lane: result is empty, and returning early also
    // avoids an out-of-range shift (lane + 1 == 64) below.
    if (lane == 63)
        return 0;
    // Ballot of 1 yields the set of currently active lanes.
    uint64_t ballot = __ballot64(1);
    uint64_t mask = (~((uint64_t)0)) << (lane + 1);
    return mask & ballot;
}
1030 
1031 __device__
1032 inline
1033 uint64_t __lanemask_lt()
1034 {
1035  uint32_t lane = __ockl_lane_u32();
1036  int64_t ballot = __ballot64(1);
1037  uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
1038  return mask & ballot;
1039 }
1040 
1041 __device__
1042 inline
1043 uint64_t __lanemask_eq()
1044 {
1045  uint32_t lane = __ockl_lane_u32();
1046  int64_t mask = ((uint64_t)1 << lane);
1047  return mask;
1048 }
1049 
1050 
1051 __device__ inline void* __local_to_generic(void* p) { return p; }
1052 
1053 #ifdef __HIP_DEVICE_COMPILE__
// Returns the base pointer of the dynamically allocated group (shared)
// memory segment.
__device__
inline
void* __get_dynamicgroupbaseptr()
{
    // Get group segment base pointer.
    // The dynamic region starts right after the statically allocated group
    // segment, hence the offset by the compile-time group static size.
    return (char*)__local_to_generic((void*)__to_local(__llvm_amdgcn_groupstaticsize()));
}
1061 #else
1062 __device__
1063 void* __get_dynamicgroupbaseptr();
1064 #endif // __HIP_DEVICE_COMPILE__
1065 
// Thin alias for __get_dynamicgroupbaseptr() under the amdgcn-prefixed name.
__device__
inline
void *__amdgcn_get_dynamicgroupbaseptr() {
    return __get_dynamicgroupbaseptr();
}
1071 
1072 // Memory Fence Functions
// CUDA __threadfence: sequentially-consistent memory fence at device
// (agent) scope.
__device__
inline
static void __threadfence()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
}

// __threadfence_block: same fence, visible only within the work-group.
__device__
inline
static void __threadfence_block()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
}

// __threadfence_system: same fence at system scope (all SVM devices).
__device__
inline
static void __threadfence_system()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
}
1093 
// abort
// Device-side abort: traps execution.  Declared weak so another definition
// can override it.
__device__
inline
__attribute__((weak))
void abort() {
    return __builtin_trap();
}
1101 
1102 // The noinline attribute helps encapsulate the printf expansion,
1103 // which otherwise has a performance impact just by increasing the
1104 // size of the calling function. Additionally, the weak attribute
1105 // allows the function to exist as a global although its definition is
1106 // included in every compilation unit.
1107 #if defined(_WIN32) || defined(_WIN64)
// MSVC-style device assertion handler: traps without printing a message
// (no wide-string formatting support on the device yet).
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void _wassert(const wchar_t *_msg, const wchar_t *_file, unsigned _line) {
    // FIXME: Need `wchar_t` support to generate assertion message.
    __builtin_trap();
}
1113 #else /* defined(_WIN32) || defined(_WIN64) */
// glibc-style device assertion handler: prints the failure location and
// message, then traps.
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void __assert_fail(const char *assertion,
                   const char *file,
                   unsigned int line,
                   const char *function)
{
    printf("%s:%u: %s: Device-side assertion `%s' failed.\n", file, line,
           function, assertion);
    __builtin_trap();
}

// Alternate assertion entry point.  NOTE(review): `charsize` is presumably
// the character width of `assertion`; unused here.
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void __assertfail(const char *assertion,
                  const char *file,
                  unsigned int line,
                  const char *function,
                  size_t charsize)
{
    // ignore all the args for now.
    __builtin_trap();
}
1135 #endif /* defined(_WIN32) || defined(_WIN64) */
1136 
// Work-group barrier with optional memory fencing, following OpenCL
// work_group_barrier semantics.
__device__
inline
static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
{
    if (flags) {
        // Release fence before the barrier and acquire fence after it, so
        // memory operations on the fenced segments are ordered across the
        // barrier at `scope`.
        __atomic_work_item_fence(flags, __memory_order_release, scope);
        __builtin_amdgcn_s_barrier();
        __atomic_work_item_fence(flags, __memory_order_acquire, scope);
    } else {
        // No fencing requested: execution barrier only.
        __builtin_amdgcn_s_barrier();
    }
}
1149 
// Work-group barrier with caller-supplied fence flags.
__device__
inline
static void __barrier(int n)
{
    __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
}

// CUDA __syncthreads: block-wide barrier that also fences local (shared)
// memory.  Marked convergent: must be reached by all threads of the block.
__device__
inline
__attribute__((convergent))
void __syncthreads()
{
    __barrier(__CLK_LOCAL_MEM_FENCE);
}
1164 
// Barrier variants that additionally reduce a predicate across the block.
// Returns the number of threads in the block with a non-zero predicate.
__device__
inline
__attribute__((convergent))
int __syncthreads_count(int predicate)
{
    // !! normalizes the predicate to 0/1 before the work-group add-reduction.
    return __ockl_wgred_add_i32(!!predicate);
}

// Non-zero iff the predicate is non-zero on every thread in the block.
__device__
inline
__attribute__((convergent))
int __syncthreads_and(int predicate)
{
    return __ockl_wgred_and_i32(!!predicate);
}

// Non-zero iff the predicate is non-zero on any thread in the block.
__device__
inline
__attribute__((convergent))
int __syncthreads_or(int predicate)
{
    return __ockl_wgred_or_i32(!!predicate);
}
1188 
1189 // hip.amdgcn.bc - device routine
1190 /*
1191  HW_ID Register bit structure
1192  WAVE_ID 3:0 Wave buffer slot number. 0-9.
1193  SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
1194  PIPE_ID 7:6 Pipeline from which the wave was dispatched.
1195  CU_ID 11:8 Compute Unit the wave is assigned to.
1196  SH_ID 12 Shader Array (within an SE) the wave is assigned to.
1197  SE_ID 14:13 Shader Engine the wave is assigned to.
1198  TG_ID 19:16 Thread-group ID
1199  VM_ID 23:20 Virtual Memory ID
1200  QUEUE_ID 26:24 Queue from which this wave was dispatched.
1201  STATE_ID 29:27 State ID (graphics only, not compute).
1202  ME_ID 31:30 Micro-engine ID.
1203  */
1204 
1205 #define HW_ID 4
1206 
1207 #define HW_ID_CU_ID_SIZE 4
1208 #define HW_ID_CU_ID_OFFSET 8
1209 
1210 #define HW_ID_SE_ID_SIZE 2
1211 #define HW_ID_SE_ID_OFFSET 13
1212 
1213 /*
1214  Encoding of parameter bitmask
1215  HW_ID 5:0 HW_ID
1216  OFFSET 10:6 Range: 0..31
1217  SIZE 15:11 Range: 1..32
1218  */
1219 
1220 #define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
1221 
1222 /*
1223  __smid returns the wave's assigned Compute Unit and Shader Engine.
1224  The Compute Unit, CU_ID returned in bits 3:0, and Shader Engine, SE_ID in bits 5:4.
1225  Note: the results vary over time.
1226  SZ minus 1 since SIZE is 1-based.
1227 */
// Reads CU_ID and SE_ID out of the HW_ID register (bit layout documented
// above) and packs them as (SE_ID << 4) | CU_ID.  The result can change
// over the wave's lifetime.
__device__
inline
unsigned __smid(void)
{
    // GETREG_IMMED takes SIZE-1 because the SIZE field is 1-based.
    unsigned cu_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_CU_ID_SIZE-1, HW_ID_CU_ID_OFFSET, HW_ID));
    unsigned se_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));

    /* Each shader engine has 16 CU */
    return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
}
1240 
1245 #define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
1246 #define HIP_DYNAMIC_SHARED_ATTRIBUTE
1247 
#endif // __HIP_CLANG_ONLY__
1249 
1250 
1251 // loop unrolling
// Byte-wise memcpy, manually unrolled four bytes per iteration.  Standard
// memcpy contract: buffers must not overlap.  Returns `dst`.
static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);
    auto srcPtr = static_cast<const unsigned char*>(src);

    while (size >= 4u) {
        dstPtr[0] = srcPtr[0];
        dstPtr[1] = srcPtr[1];
        dstPtr[2] = srcPtr[2];
        dstPtr[3] = srcPtr[3];

        size -= 4u;
        srcPtr += 4u;
        dstPtr += 4u;
    }
    // Tail of 0-3 bytes; the cases deliberately fall through.
    switch (size) {
        case 3:
            dstPtr[2] = srcPtr[2];  // fallthrough
        case 2:
            dstPtr[1] = srcPtr[1];  // fallthrough
        case 1:
            dstPtr[0] = srcPtr[0];
    }

    return dst;
}
1277 
// Byte-wise memset, manually unrolled four bytes per iteration.
// Returns `dst`.
static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);

    while (size >= 4u) {
        dstPtr[0] = val;
        dstPtr[1] = val;
        dstPtr[2] = val;
        dstPtr[3] = val;

        size -= 4u;
        dstPtr += 4u;
    }
    // Tail of 0-3 bytes; the cases deliberately fall through.
    switch (size) {
        case 3:
            dstPtr[2] = val;  // fallthrough
        case 2:
            dstPtr[1] = val;  // fallthrough
        case 1:
            dstPtr[0] = val;
    }

    return dst;
}
1301 #ifndef __OPENMP_AMDGCN__
// Device memcpy: forwards to the unrolled byte-copy helper above.
static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
    return __hip_hc_memcpy(dst, src, size);
}
1305 
1306 static inline __device__ void* memset(void* ptr, int val, size_t size) {
1307  unsigned char val8 = static_cast<unsigned char>(val);
1308  return __hip_hc_memset(ptr, val8, size);
1309 }
1310 #endif // !__OPENMP_AMDGCN__
1311 #endif
hip_runtime_api.h
Defines the API signatures for HIP runtime. This file can be compiled with a standard compiler.
char4
Definition: hip_vector_types.h:1505
uchar2Holder
Definition: device_functions.h:139
llvm_intrinsics.h
Contains declarations for wrapper functions for llvm intrinsics like llvm.amdgcn.s....
host_defines.h
TODO-doc.
ucharHolder
Definition: device_functions.h:132
device_library_decls.h
Contains declarations for types and functions in device library.