HIP: Heterogeneous-computing Interface for Portability
device_functions.h
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
23 #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
24 #define HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
25 
26 #include "host_defines.h"
27 #include "math_fwd.h"
28 
29 #include <hip/hip_runtime_api.h>
30 #include <stddef.h>
31 
32 
33 #include <hip/hip_vector_types.h>
34 #include "device_library_decls.h"
35 #include "llvm_intrinsics.h"
36 
37 #if __HIP_CLANG_ONLY__ && __HIP_ROCclr__ && !_WIN32
38 extern "C" __device__ int printf(const char *fmt, ...);
39 #else
40 #if HC_FEATURE_PRINTF
41 template <typename... All>
42 static inline __device__ void printf(const char* format, All... all) {
43  hc::printf(format, all...);
44 }
45 #else
46 template <typename... All>
47 static inline __device__ void printf(const char* format, All... all) {}
48 #endif // HC_FEATURE_PRINTF
49 #endif // __HIP_CLANG_ONLY__ && __HIP_ROCclr__
50 
51 /*
52 Integer Intrinsics
53 */
54 
55 // integer intrinsic functions __popc __clz __ffs __brev
56 __device__ static inline unsigned int __popc(unsigned int input) {
57  return __builtin_popcount(input);
58 }
59 __device__ static inline unsigned int __popcll(unsigned long long int input) {
60  return __builtin_popcountll(input);
61 }
62 
63 __device__ static inline int __clz(int input) {
64  return __ockl_clz_u32((uint)input);
65 }
66 
67 __device__ static inline int __clzll(long long int input) {
68  return __ockl_clz_u64((ullong)input);
69 }
70 
71 __device__ static inline unsigned int __ffs(unsigned int input) {
72  return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
73 }
74 
75 __device__ static inline unsigned int __ffsll(unsigned long long int input) {
76  return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
77 }
78 
79 __device__ static inline unsigned int __ffs(int input) {
80  return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
81 }
82 
83 __device__ static inline unsigned int __ffsll(long long int input) {
84  return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
85 }
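
// Illustrative note (not part of the original header): as in CUDA, __ffs
// returns the 1-based position of the least-significant set bit, or 0 when
// the input is 0. Sample values:
//   __ffs(0x00000001u) == 1
//   __ffs(0x00000008u) == 4
//   __ffs(0u)          == 0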
86 
87 __device__ static inline unsigned int __brev(unsigned int input) {
88  return __builtin_bitreverse32(input);
89 }
90 
91 __device__ static inline unsigned long long int __brevll(unsigned long long int input) {
92  return __builtin_bitreverse64(input);
93 }
94 
95 __device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
96  return input == 0 ? -1 : __builtin_ctzll(input);
97 }
98 
99 __device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
100  uint32_t offset = src1 & 31;
101  uint32_t width = src2 & 31;
102  return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
103 }
104 
105 __device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
106  uint64_t offset = src1 & 63;
107  uint64_t width = src2 & 63;
108  return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
109 }
110 
111 __device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
112  uint32_t offset = src2 & 31;
113  uint32_t width = src3 & 31;
114  uint32_t mask = (1u << width) - 1;
115  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
116 }
117 
118 __device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
119  uint64_t offset = src2 & 63;
120  uint64_t width = src3 & 63;
121  uint64_t mask = (1ULL << width) - 1;
122  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
123 }
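
// Illustrative note (not part of the original header): __bitextract_* pulls
// a field of `width` bits starting at bit `offset` out of src0, and
// __bitinsert_* writes the low `width` bits of src1 into src0 at `offset`.
// For example:
//   __bitextract_u32(0x12345678u, 8, 8)       == 0x56
//   __bitinsert_u32(0x12345678u, 0xAAu, 8, 8) == 0x1234AA78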
124 
125 __device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
126 __device__ static unsigned int __hadd(int x, int y);
127 __device__ static int __mul24(int x, int y);
128 __device__ static long long int __mul64hi(long long int x, long long int y);
129 __device__ static int __mulhi(int x, int y);
130 __device__ static int __rhadd(int x, int y);
131 __device__ static unsigned int __sad(int x, int y, unsigned int z);
132 __device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
133 __device__ static int __umul24(unsigned int x, unsigned int y);
134 __device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
135 __device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
136 __device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
137 __device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
138 
139 struct ucharHolder {
140  union {
141  unsigned char c[4];
142  unsigned int ui;
143  };
144 } __attribute__((aligned(4)));
145 
146 struct uchar2Holder {
147  union {
148  unsigned int ui[2];
149  unsigned char c[8];
150  };
151 } __attribute__((aligned(8)));
152 
153 __device__
154 static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
155  struct uchar2Holder cHoldVal;
156  struct ucharHolder cHoldKey;
157  struct ucharHolder cHoldOut;
158  cHoldKey.ui = s;
159  cHoldVal.ui[0] = x;
160  cHoldVal.ui[1] = y;
161  cHoldOut.c[0] = cHoldVal.c[cHoldKey.c[0]];
162  cHoldOut.c[1] = cHoldVal.c[cHoldKey.c[1]];
163  cHoldOut.c[2] = cHoldVal.c[cHoldKey.c[2]];
164  cHoldOut.c[3] = cHoldVal.c[cHoldKey.c[3]];
165  return cHoldOut.ui;
166 }
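
// Illustrative note (not part of the original header): this implementation
// reads one selector per *byte* of s. Selector values 0-3 pick bytes of x
// and 4-7 pick bytes of y, least-significant byte first. For example:
//   __byte_perm(0x33221100u, 0x77665544u, 0x07050301u) == 0x77553311
// Note that CUDA packs the four selectors into the low 16 bits of s as
// nibbles, so CUDA-style selector constants are not directly portable here.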
167 
168 __device__ static inline unsigned int __hadd(int x, int y) {
169  // Signed average rounded toward negative infinity, computed without
170  // overflowing the intermediate sum: (x + y) >> 1 as if in 33-bit math.
171  int z = (x & y) + ((x ^ y) >> 1);
172  return z;
173 }
174 
175 __device__ static inline int __mul24(int x, int y) {
176  return __ockl_mul24_i32(x, y);
177 }
178 
179 __device__ static inline long long __mul64hi(long long int x, long long int y) {
180  ulong x0 = (ulong)x & 0xffffffffUL;
181  long x1 = x >> 32;
182  ulong y0 = (ulong)y & 0xffffffffUL;
183  long y1 = y >> 32;
184  ulong z0 = x0*y0;
185  long t = x1*y0 + (z0 >> 32);
186  long z1 = t & 0xffffffffL;
187  long z2 = t >> 32;
188  z1 = x0*y1 + z1;
189  return x1*y1 + z2 + (z1 >> 32);
190 }
191 
192 __device__ static inline int __mulhi(int x, int y) {
193  return __ockl_mul_hi_i32(x, y);
194 }
195 
196 __device__ static inline int __rhadd(int x, int y) {
197  // Signed average rounded toward positive infinity, computed without
198  // overflowing the intermediate sum: (x + y + 1) >> 1 as if in 33-bit math.
199  int z = (x | y) - ((x ^ y) >> 1);
200  return z;
201 }
202 __device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
203  return x > y ? x - y + z : y - x + z;
204 }
205 __device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
206  return (x + y) >> 1;
207 }
208 __device__ static inline int __umul24(unsigned int x, unsigned int y) {
209  return __ockl_mul24_u32(x, y);
210 }
211 
212 __device__
213 static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
214  ulong x0 = x & 0xffffffffUL;
215  ulong x1 = x >> 32;
216  ulong y0 = y & 0xffffffffUL;
217  ulong y1 = y >> 32;
218  ulong z0 = x0*y0;
219  ulong t = x1*y0 + (z0 >> 32);
220  ulong z1 = t & 0xffffffffUL;
221  ulong z2 = t >> 32;
222  z1 = x0*y1 + z1;
223  return x1*y1 + z2 + (z1 >> 32);
224 }
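
// Annotation (not part of the original header): __(u)mul64hi splits each
// operand into 32-bit limbs, x = x1*2^32 + x0 and y = y1*2^32 + y0, so that
// x*y = x1*y1*2^64 + (x1*y0 + x0*y1)*2^32 + x0*y0; the code folds the carry
// of the cross terms and the high half of x0*y0 into z2 and z1 >> 32 to
// produce bits [127:64] of the full product. A host-side reference (a
// sketch, assuming a compiler with __int128 support) would be:
//   unsigned long long ref_umul64hi(unsigned long long a, unsigned long long b) {
//       return (unsigned long long)(((unsigned __int128)a * b) >> 64);
//   }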
225 
226 __device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
227  return __ockl_mul_hi_u32(x, y);
228 }
229 __device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
230  return (x + y + 1) >> 1;
231 }
232 __device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
233  return __ockl_sadd_u32(x, y, z);
234 }
235 
236 __device__ static inline unsigned int __lane_id() {
237  return __builtin_amdgcn_mbcnt_hi(
238  -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
239 }
240 
241 __device__
242 static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_lo(x,y);};
243 
244 __device__
245 static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_hi(x,y);};
246 
247 /*
248 HIP specific device functions
249 */
250 
251 __device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
252  union { int i; unsigned u; float f; } tmp; tmp.u = src;
253  tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
254  return tmp.u;
255 }
256 
257 __device__ static inline float __hip_ds_bpermutef(int index, float src) {
258  union { int i; unsigned u; float f; } tmp; tmp.f = src;
259  tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
260  return tmp.f;
261 }
262 
263 __device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
264  union { int i; unsigned u; float f; } tmp; tmp.u = src;
265  tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
266  return tmp.u;
267 }
268 
269 __device__ static inline float __hip_ds_permutef(int index, float src) {
270  union { int i; unsigned u; float f; } tmp; tmp.f = src;
271  tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
272  return tmp.f;
273 }
274 
275 #define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
276 #define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
277 
278 template <int pattern>
279 __device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
280  union { int i; unsigned u; float f; } tmp; tmp.u = src;
281 #if defined(__HCC__)
282  tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
283 #else
284  tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
285 #endif
286  return tmp.u;
287 }
288 
289 template <int pattern>
290 __device__ static inline float __hip_ds_swizzlef_N(float src) {
291  union { int i; unsigned u; float f; } tmp; tmp.f = src;
292 #if defined(__HCC__)
293  tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
294 #else
295  tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
296 #endif
297  return tmp.f;
298 }
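
// Illustrative example (not part of the original header): in the 32-lane
// bitmask swizzle mode (bit 15 of the pattern clear), the pattern encodes
// an and_mask (bits 4:0), or_mask (bits 9:5) and xor_mask (bits 14:10) that
// are applied to the lane id. Assuming that encoding, pattern 0x041F
// (and_mask = 0x1F, or_mask = 0, xor_mask = 1) exchanges adjacent lanes:
//   unsigned partner = __hip_ds_swizzle(value, 0x041F); // reads lane ^ 1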
299 
300 #define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
301  __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
302 
303 template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
304 __device__ static inline int __hip_move_dpp_N(int src) {
305  return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
306  bound_ctrl);
307 }
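
// Illustrative usage (not part of the original header): dpp_ctrl selects the
// data-movement pattern. Assuming the GFX9 encoding used by LLVM, in which
// 0x110 | n means row_shr:n (shift right by n lanes within each row of 16),
// the following reads each lane's lower-numbered neighbour, with all rows
// and banks enabled and bound_ctrl left off:
//   int left = __hip_move_dpp(v, 0x111, 0xF, 0xF, false); // row_shr:1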
308 
309 // FIXME: Remove the following workaround once the clang change is released.
310 // This is for backward compatibility with older clang which does not define
311 // __AMDGCN_WAVEFRONT_SIZE. It does not consider -mwavefrontsize64.
312 #ifndef __AMDGCN_WAVEFRONT_SIZE
313 #if __gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__
314 #define __AMDGCN_WAVEFRONT_SIZE 32
315 #else
316 #define __AMDGCN_WAVEFRONT_SIZE 64
317 #endif
318 #endif
319 static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
320 
321 __device__
322 inline
323 int __shfl(int var, int src_lane, int width = warpSize) {
324  int self = __lane_id();
325  int index = src_lane + (self & ~(width-1));
326  return __builtin_amdgcn_ds_bpermute(index<<2, var);
327 }
328 __device__
329 inline
330 unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
331  union { int i; unsigned u; float f; } tmp; tmp.u = var;
332  tmp.i = __shfl(tmp.i, src_lane, width);
333  return tmp.u;
334 }
335 __device__
336 inline
337 float __shfl(float var, int src_lane, int width = warpSize) {
338  union { int i; unsigned u; float f; } tmp; tmp.f = var;
339  tmp.i = __shfl(tmp.i, src_lane, width);
340  return tmp.f;
341 }
342 __device__
343 inline
344 double __shfl(double var, int src_lane, int width = warpSize) {
345  static_assert(sizeof(double) == 2 * sizeof(int), "");
346  static_assert(sizeof(double) == sizeof(uint64_t), "");
347 
348  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
349  tmp[0] = __shfl(tmp[0], src_lane, width);
350  tmp[1] = __shfl(tmp[1], src_lane, width);
351 
352  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
353  double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
354  return tmp1;
355 }
356 __device__
357 inline
358 long __shfl(long var, int src_lane, int width = warpSize)
359 {
360  #ifndef _MSC_VER
361  static_assert(sizeof(long) == 2 * sizeof(int), "");
362  static_assert(sizeof(long) == sizeof(uint64_t), "");
363 
364  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
365  tmp[0] = __shfl(tmp[0], src_lane, width);
366  tmp[1] = __shfl(tmp[1], src_lane, width);
367 
368  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
369  long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
370  return tmp1;
371  #else
372  static_assert(sizeof(long) == sizeof(int), "");
373  return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
374  #endif
375 }
376 __device__
377 inline
378 unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
379  #ifndef _MSC_VER
380  static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
381  static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
382 
383  unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
384  tmp[0] = __shfl(tmp[0], src_lane, width);
385  tmp[1] = __shfl(tmp[1], src_lane, width);
386 
387  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
388  unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
389  return tmp1;
390  #else
391  static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
392  return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
393  #endif
394 }
395 __device__
396 inline
397 long long __shfl(long long var, int src_lane, int width = warpSize)
398 {
399  static_assert(sizeof(long long) == 2 * sizeof(int), "");
400  static_assert(sizeof(long long) == sizeof(uint64_t), "");
401 
402  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
403  tmp[0] = __shfl(tmp[0], src_lane, width);
404  tmp[1] = __shfl(tmp[1], src_lane, width);
405 
406  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
407  long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
408  return tmp1;
409 }
410 __device__
411 inline
412 unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
413  static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
414  static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
415 
416  unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
417  tmp[0] = __shfl(tmp[0], src_lane, width);
418  tmp[1] = __shfl(tmp[1], src_lane, width);
419 
420  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
421  unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
422  return tmp1;
423 }
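
// Illustrative usage (not part of the original header): __shfl with a fixed
// source lane broadcasts that lane's value to the whole wavefront. The
// helper name is hypothetical:
__device__ static inline float __hip_example_wave_bcast(float v) {
 return __shfl(v, 0, warpSize); // every lane receives lane 0's v
}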
424 
425 __device__
426 inline
427 int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
428  int self = __lane_id();
429  int index = self - lane_delta;
430  index = (index < (self & ~(width-1)))?self:index;
431  return __builtin_amdgcn_ds_bpermute(index<<2, var);
432 }
433 __device__
434 inline
435 unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
436  union { int i; unsigned u; float f; } tmp; tmp.u = var;
437  tmp.i = __shfl_up(tmp.i, lane_delta, width);
438  return tmp.u;
439 }
440 __device__
441 inline
442 float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
443  union { int i; unsigned u; float f; } tmp; tmp.f = var;
444  tmp.i = __shfl_up(tmp.i, lane_delta, width);
445  return tmp.f;
446 }
447 __device__
448 inline
449 double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
450  static_assert(sizeof(double) == 2 * sizeof(int), "");
451  static_assert(sizeof(double) == sizeof(uint64_t), "");
452 
453  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
454  tmp[0] = __shfl_up(tmp[0], lane_delta, width);
455  tmp[1] = __shfl_up(tmp[1], lane_delta, width);
456 
457  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
458  double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
459  return tmp1;
460 }
461 __device__
462 inline
463 long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
464 {
465  #ifndef _MSC_VER
466  static_assert(sizeof(long) == 2 * sizeof(int), "");
467  static_assert(sizeof(long) == sizeof(uint64_t), "");
468 
469  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
470  tmp[0] = __shfl_up(tmp[0], lane_delta, width);
471  tmp[1] = __shfl_up(tmp[1], lane_delta, width);
472 
473  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
474  long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
475  return tmp1;
476  #else
477  static_assert(sizeof(long) == sizeof(int), "");
478  return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
479  #endif
480 }
481 
482 __device__
483 inline
484 unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
485 {
486  #ifndef _MSC_VER
487  static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
488  static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
489 
490  unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
491  tmp[0] = __shfl_up(tmp[0], lane_delta, width);
492  tmp[1] = __shfl_up(tmp[1], lane_delta, width);
493 
494  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
495  unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
496  return tmp1;
497  #else
498  static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
499  return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
500  #endif
501 }
502 
503 __device__
504 inline
505 long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
506 {
507  static_assert(sizeof(long long) == 2 * sizeof(int), "");
508  static_assert(sizeof(long long) == sizeof(uint64_t), "");
509  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
510  tmp[0] = __shfl_up(tmp[0], lane_delta, width);
511  tmp[1] = __shfl_up(tmp[1], lane_delta, width);
512  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
513  long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
514  return tmp1;
515 }
516 
517 __device__
518 inline
519 unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
520 {
521  static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
522  static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
523  unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
524  tmp[0] = __shfl_up(tmp[0], lane_delta, width);
525  tmp[1] = __shfl_up(tmp[1], lane_delta, width);
526  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
527  unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
528  return tmp1;
529 }
530 
531 __device__
532 inline
533 int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
534  int self = __lane_id();
535  int index = self + lane_delta;
536  index = (int)((self&(width-1))+lane_delta) >= width?self:index;
537  return __builtin_amdgcn_ds_bpermute(index<<2, var);
538 }
539 __device__
540 inline
541 unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
542  union { int i; unsigned u; float f; } tmp; tmp.u = var;
543  tmp.i = __shfl_down(tmp.i, lane_delta, width);
544  return tmp.u;
545 }
546 __device__
547 inline
548 float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
549  union { int i; unsigned u; float f; } tmp; tmp.f = var;
550  tmp.i = __shfl_down(tmp.i, lane_delta, width);
551  return tmp.f;
552 }
553 __device__
554 inline
555 double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
556  static_assert(sizeof(double) == 2 * sizeof(int), "");
557  static_assert(sizeof(double) == sizeof(uint64_t), "");
558 
559  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
560  tmp[0] = __shfl_down(tmp[0], lane_delta, width);
561  tmp[1] = __shfl_down(tmp[1], lane_delta, width);
562 
563  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
564  double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
565  return tmp1;
566 }
567 __device__
568 inline
569 long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
570 {
571  #ifndef _MSC_VER
572  static_assert(sizeof(long) == 2 * sizeof(int), "");
573  static_assert(sizeof(long) == sizeof(uint64_t), "");
574 
575  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
576  tmp[0] = __shfl_down(tmp[0], lane_delta, width);
577  tmp[1] = __shfl_down(tmp[1], lane_delta, width);
578 
579  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
580  long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
581  return tmp1;
582  #else
583  static_assert(sizeof(long) == sizeof(int), "");
584  return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
585  #endif
586 }
587 __device__
588 inline
589 unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
590 {
591  #ifndef _MSC_VER
592  static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
593  static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
594 
595  unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
596  tmp[0] = __shfl_down(tmp[0], lane_delta, width);
597  tmp[1] = __shfl_down(tmp[1], lane_delta, width);
598 
599  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
600  unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
601  return tmp1;
602  #else
603  static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
604  return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
605  #endif
606 }
607 __device__
608 inline
609 long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
610 {
611  static_assert(sizeof(long long) == 2 * sizeof(int), "");
612  static_assert(sizeof(long long) == sizeof(uint64_t), "");
613  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
614  tmp[0] = __shfl_down(tmp[0], lane_delta, width);
615  tmp[1] = __shfl_down(tmp[1], lane_delta, width);
616  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
617  long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
618  return tmp1;
619 }
620 __device__
621 inline
622 unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
623 {
624  static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
625  static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
626  unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
627  tmp[0] = __shfl_down(tmp[0], lane_delta, width);
628  tmp[1] = __shfl_down(tmp[1], lane_delta, width);
629  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
630  unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
631  return tmp1;
632 }
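
// Illustrative usage (not part of the original header): a wavefront-wide sum
// built on __shfl_down; assumes every lane of the wavefront is active. The
// helper name is hypothetical:
__device__ static inline float __hip_example_wave_reduce_sum(float v) {
 for (int offset = warpSize / 2; offset > 0; offset >>= 1)
 v += __shfl_down(v, offset, warpSize);
 return v; // lane 0 holds the total
}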
633 
634 __device__
635 inline
636 int __shfl_xor(int var, int lane_mask, int width = warpSize) {
637  int self = __lane_id();
638  int index = self^lane_mask;
639  index = index >= ((self+width)&~(width-1))?self:index;
640  return __builtin_amdgcn_ds_bpermute(index<<2, var);
641 }
642 __device__
643 inline
644 unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
645  union { int i; unsigned u; float f; } tmp; tmp.u = var;
646  tmp.i = __shfl_xor(tmp.i, lane_mask, width);
647  return tmp.u;
648 }
649 __device__
650 inline
651 float __shfl_xor(float var, int lane_mask, int width = warpSize) {
652  union { int i; unsigned u; float f; } tmp; tmp.f = var;
653  tmp.i = __shfl_xor(tmp.i, lane_mask, width);
654  return tmp.f;
655 }
656 __device__
657 inline
658 double __shfl_xor(double var, int lane_mask, int width = warpSize) {
659  static_assert(sizeof(double) == 2 * sizeof(int), "");
660  static_assert(sizeof(double) == sizeof(uint64_t), "");
661 
662  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
663  tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
664  tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
665 
666  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
667  double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
668  return tmp1;
669 }
670 __device__
671 inline
672 long __shfl_xor(long var, int lane_mask, int width = warpSize)
673 {
674  #ifndef _MSC_VER
675  static_assert(sizeof(long) == 2 * sizeof(int), "");
676  static_assert(sizeof(long) == sizeof(uint64_t), "");
677 
678  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
679  tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
680  tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
681 
682  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
683  long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
684  return tmp1;
685  #else
686  static_assert(sizeof(long) == sizeof(int), "");
687  return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
688  #endif
689 }
690 __device__
691 inline
692 unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
693 {
694  #ifndef _MSC_VER
695  static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
696  static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
697 
698  unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
699  tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
700  tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
701 
702  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
703  unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
704  return tmp1;
705  #else
706  static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
707  return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
708  #endif
709 }
710 __device__
711 inline
712 long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
713 {
714  static_assert(sizeof(long long) == 2 * sizeof(int), "");
715  static_assert(sizeof(long long) == sizeof(uint64_t), "");
716  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
717  tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
718  tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
719  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
720  long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
721  return tmp1;
722 }
723 __device__
724 inline
725 unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
726 {
727  static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
728  static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
729  unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
730  tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
731  tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
732  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
733  unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
734  return tmp1;
735 }
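
// Illustrative usage (not part of the original header): a butterfly
// reduction with __shfl_xor leaves the result in every lane rather than
// just lane 0; assumes every lane of the wavefront is active. The helper
// name is hypothetical:
__device__ static inline int __hip_example_wave_allreduce_sum(int v) {
 for (int mask = warpSize / 2; mask > 0; mask >>= 1)
 v += __shfl_xor(v, mask, warpSize);
 return v;
}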
736 #define MASK1 0x00ff00ff
737 #define MASK2 0xff00ff00
738 
739 __device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
740  // Packed byte-wise addition: each 8-bit lane is added independently,
741  // with wraparound, so carries never spill into the neighbouring byte.
742  char4 out;
743  out.x = in1.x + in2.x;
744  out.y = in1.y + in2.y;
745  out.z = in1.z + in2.z;
746  out.w = in1.w + in2.w;
747  return out;
748 }
749 
750 __device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
751  // Packed byte-wise subtraction: each 8-bit lane is subtracted
752  // independently, with wraparound and no inter-byte borrows.
753  char4 out;
754  out.x = in1.x - in2.x;
755  out.y = in1.y - in2.y;
756  out.z = in1.z - in2.z;
757  out.w = in1.w - in2.w;
758  return out;
759 }
760 
761 __device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
762  // Packed byte-wise multiplication: each 8-bit lane is multiplied
763  // independently and truncated to its low 8 bits.
764  char4 out;
765  out.x = in1.x * in2.x;
766  out.y = in1.y * in2.y;
767  out.z = in1.z * in2.z;
768  out.w = in1.w * in2.w;
769  return out;
770 }
771 
772 /*
773  * Rounding modes are not yet supported in HIP
774  * TODO: Conversion functions are not correct, need to fix when BE is ready
775 */
776 
777 __device__ static inline float __double2float_rd(double x) { return (float)x; }
778 __device__ static inline float __double2float_rn(double x) { return (float)x; }
779 __device__ static inline float __double2float_ru(double x) { return (float)x; }
780 __device__ static inline float __double2float_rz(double x) { return (float)x; }
781 
782 __device__ static inline int __double2hiint(double x) {
783  static_assert(sizeof(double) == 2 * sizeof(int), "");
784 
785  int tmp[2];
786  __builtin_memcpy(tmp, &x, sizeof(tmp));
787 
788  return tmp[1];
789 }
790 __device__ static inline int __double2loint(double x) {
791  static_assert(sizeof(double) == 2 * sizeof(int), "");
792 
793  int tmp[2];
794  __builtin_memcpy(tmp, &x, sizeof(tmp));
795 
796  return tmp[0];
797 }
798 
799 __device__ static inline int __double2int_rd(double x) { return (int)x; }
800 __device__ static inline int __double2int_rn(double x) { return (int)x; }
801 __device__ static inline int __double2int_ru(double x) { return (int)x; }
802 __device__ static inline int __double2int_rz(double x) { return (int)x; }
803 
804 __device__ static inline long long int __double2ll_rd(double x) { return (long long int)x; }
805 __device__ static inline long long int __double2ll_rn(double x) { return (long long int)x; }
806 __device__ static inline long long int __double2ll_ru(double x) { return (long long int)x; }
807 __device__ static inline long long int __double2ll_rz(double x) { return (long long int)x; }
808 
809 __device__ static inline unsigned int __double2uint_rd(double x) { return (unsigned int)x; }
810 __device__ static inline unsigned int __double2uint_rn(double x) { return (unsigned int)x; }
811 __device__ static inline unsigned int __double2uint_ru(double x) { return (unsigned int)x; }
812 __device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
813 
814 __device__ static inline unsigned long long int __double2ull_rd(double x) {
815  return (unsigned long long int)x;
816 }
817 __device__ static inline unsigned long long int __double2ull_rn(double x) {
818  return (unsigned long long int)x;
819 }
820 __device__ static inline unsigned long long int __double2ull_ru(double x) {
821  return (unsigned long long int)x;
822 }
823 __device__ static inline unsigned long long int __double2ull_rz(double x) {
824  return (unsigned long long int)x;
825 }
826 
827 __device__ static inline long long int __double_as_longlong(double x) {
828  static_assert(sizeof(long long) == sizeof(double), "");
829 
830  long long tmp;
831  __builtin_memcpy(&tmp, &x, sizeof(tmp));
832 
833  return tmp;
834 }
835 
836 /*
837 __device__ unsigned short __float2half_rn(float x);
838 __device__ float __half2float(unsigned short);
839 
840 The above device functions are not valid.
841 Use
842 __device__ __half __float2half_rn(float x);
843 __device__ float __half2float(__half);
844 from hip_fp16.h
845 
846 CUDA implements half as unsigned short, whereas HIP does not.
847 
848 */
849 
850 __device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
851 __device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
852 __device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
853 __device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
854 
855 __device__ static inline long long int __float2ll_rd(float x) { return (long long int)x; }
856 __device__ static inline long long int __float2ll_rn(float x) { return (long long int)x; }
857 __device__ static inline long long int __float2ll_ru(float x) { return (long long int)x; }
858 __device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }
859 
860 __device__ static inline unsigned int __float2uint_rd(float x) { return (unsigned int)x; }
861 __device__ static inline unsigned int __float2uint_rn(float x) { return (unsigned int)x; }
862 __device__ static inline unsigned int __float2uint_ru(float x) { return (unsigned int)x; }
863 __device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
864 
865 __device__ static inline unsigned long long int __float2ull_rd(float x) {
866  return (unsigned long long int)x;
867 }
868 __device__ static inline unsigned long long int __float2ull_rn(float x) {
869  return (unsigned long long int)x;
870 }
871 __device__ static inline unsigned long long int __float2ull_ru(float x) {
872  return (unsigned long long int)x;
873 }
874 __device__ static inline unsigned long long int __float2ull_rz(float x) {
875  return (unsigned long long int)x;
876 }
877 
878 __device__ static inline int __float_as_int(float x) {
879  static_assert(sizeof(int) == sizeof(float), "");
880 
881  int tmp;
882  __builtin_memcpy(&tmp, &x, sizeof(tmp));
883 
884  return tmp;
885 }
886 
887 __device__ static inline unsigned int __float_as_uint(float x) {
888  static_assert(sizeof(unsigned int) == sizeof(float), "");
889 
890  unsigned int tmp;
891  __builtin_memcpy(&tmp, &x, sizeof(tmp));
892 
893  return tmp;
894 }
895 
896 __device__ static inline double __hiloint2double(int hi, int lo) {
897  static_assert(sizeof(double) == sizeof(uint64_t), "");
898 
899  uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
900  double tmp1;
901  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
902 
903  return tmp1;
904 }
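
// Illustrative note (not part of the original header): __double2hiint,
// __double2loint and __hiloint2double move raw bit patterns, so they
// round-trip exactly:
//   double d = 3.5;
//   double e = __hiloint2double(__double2hiint(d), __double2loint(d));
//   // e is bit-identical to d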
905 
906 __device__ static inline double __int2double_rn(int x) { return (double)x; }
907 
908 __device__ static inline float __int2float_rd(int x) { return (float)x; }
909 __device__ static inline float __int2float_rn(int x) { return (float)x; }
910 __device__ static inline float __int2float_ru(int x) { return (float)x; }
911 __device__ static inline float __int2float_rz(int x) { return (float)x; }
912 
913 __device__ static inline float __int_as_float(int x) {
914  static_assert(sizeof(float) == sizeof(int), "");
915 
916  float tmp;
917  __builtin_memcpy(&tmp, &x, sizeof(tmp));
918 
919  return tmp;
920 }
921 
922 __device__ static inline double __ll2double_rd(long long int x) { return (double)x; }
923 __device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
924 __device__ static inline double __ll2double_ru(long long int x) { return (double)x; }
925 __device__ static inline double __ll2double_rz(long long int x) { return (double)x; }
926 
927 __device__ static inline float __ll2float_rd(long long int x) { return (float)x; }
928 __device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
929 __device__ static inline float __ll2float_ru(long long int x) { return (float)x; }
930 __device__ static inline float __ll2float_rz(long long int x) { return (float)x; }
931 
932 __device__ static inline double __longlong_as_double(long long int x) {
933  static_assert(sizeof(double) == sizeof(long long), "");
934 
935  double tmp;
936  __builtin_memcpy(&tmp, &x, sizeof(tmp));
937 
938  return tmp;
939 }
940 
941 __device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }
942 
943 __device__ static inline float __uint2float_rd(unsigned int x) { return (float)x; }
944 __device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
945 __device__ static inline float __uint2float_ru(unsigned int x) { return (float)x; }
946 __device__ static inline float __uint2float_rz(unsigned int x) { return (float)x; }
947 
948 __device__ static inline float __uint_as_float(unsigned int x) {
949  static_assert(sizeof(float) == sizeof(unsigned int), "");
950 
951  float tmp;
952  __builtin_memcpy(&tmp, &x, sizeof(tmp));
953 
954  return tmp;
955 }
956 
957 __device__ static inline double __ull2double_rd(unsigned long long int x) { return (double)x; }
958 __device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
959 __device__ static inline double __ull2double_ru(unsigned long long int x) { return (double)x; }
960 __device__ static inline double __ull2double_rz(unsigned long long int x) { return (double)x; }
961 
962 __device__ static inline float __ull2float_rd(unsigned long long int x) { return (float)x; }
963 __device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
964 __device__ static inline float __ull2float_ru(unsigned long long int x) { return (float)x; }
965 __device__ static inline float __ull2float_rz(unsigned long long int x) { return (float)x; }
966 
967 #if defined(__HCC__)
968 #define __HCC_OR_HIP_CLANG__ 1
969 #elif defined(__clang__) && defined(__HIP__)
970 #define __HCC_OR_HIP_CLANG__ 1
971 #else
972 #define __HCC_OR_HIP_CLANG__ 0
973 #endif
974 
975 #if __HCC_OR_HIP_CLANG__
976 
977 // Clock functions
978 __device__ long long int __clock64();
979 __device__ long long int __clock();
980 __device__ long long int clock64();
981 __device__ long long int clock();
982 // hip.amdgcn.bc - named sync
983 __device__ void __named_sync(int a, int b);
984 
985 #ifdef __HIP_DEVICE_COMPILE__
986 
987 // Clock functions
988 #if __HCC__
989 extern "C" uint64_t __clock_u64() __HC__;
990 #endif
991 
992 __device__
993 inline __attribute__((always_inline))
994 long long int __clock64() {
995 return (long long int) __builtin_readcyclecounter();
996 }
997 
998 __device__
999 inline __attribute__((always_inline))
1000 long long int __clock() { return __clock64(); }
1001 
1002 __device__
1003 inline __attribute__((always_inline))
1004 long long int clock64() { return __clock64(); }
1005 
1006 __device__
1007 inline __attribute__((always_inline))
1008 long long int clock() { return __clock(); }
1009 
1010 // hip.amdgcn.bc - named sync
1011 __device__
1012 inline
1013 void __named_sync(int a, int b) { __builtin_amdgcn_s_barrier(); }
1014 
1015 #endif // __HIP_DEVICE_COMPILE__
1016 
1017 // warp vote function __all __any __ballot
1018 __device__
1019 inline
1020 int __all(int predicate) {
1021  return __ockl_wfall_i32(predicate);
1022 }
1023 
1024 __device__
1025 inline
1026 int __any(int predicate) {
1027  return __ockl_wfany_i32(predicate);
1028 }
1029 
1030 // XXX from llvm/include/llvm/IR/InstrTypes.h
1031 #define ICMP_NE 33
1032 
1033 __device__
1034 inline
1035 unsigned long long int __ballot(int predicate) {
1036  return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
1037 }
1038 
1039 __device__
1040 inline
1041 unsigned long long int __ballot64(int predicate) {
1042  return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
1043 }
1044 
1045 // hip.amdgcn.bc - lanemask
1046 __device__
1047 inline
1048 uint64_t __lanemask_gt()
1049 {
1050  uint32_t lane = __ockl_lane_u32();
1051  if (lane == 63)
1052  return 0;
1053  uint64_t ballot = __ballot64(1);
1054  uint64_t mask = (~((uint64_t)0)) << (lane + 1);
1055  return mask & ballot;
1056 }
1057 
1058 __device__
1059 inline
1060 uint64_t __lanemask_lt()
1061 {
1062  uint32_t lane = __ockl_lane_u32();
1063  uint64_t ballot = __ballot64(1);
1064  uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
1065  return mask & ballot;
1066 }
1067 
1068 __device__
1069 inline
1070 uint64_t __lanemask_eq()
1071 {
1072  uint32_t lane = __ockl_lane_u32();
1073  uint64_t mask = ((uint64_t)1 << lane);
1074  return mask;
1075 }
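
// Illustrative usage (not part of the original header): combining __ballot64
// with __lanemask_lt gives a lane's rank among the active lanes whose
// predicate is true, which is useful for compacted writes. The helper name
// is hypothetical:
__device__ static inline unsigned int __hip_example_active_rank(int pred) {
 return __popcll(__ballot64(pred) & __lanemask_lt());
}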
1076 
1077 
1078 __device__ inline void* __local_to_generic(void* p) { return p; }
1079 
1080 #ifdef __HIP_DEVICE_COMPILE__
1081 __device__
1082 inline
1083 void* __get_dynamicgroupbaseptr()
1084 {
1085  // Get group segment base pointer.
1086  return (char*)__local_to_generic((void*)__to_local(__llvm_amdgcn_groupstaticsize()));
1087 }
1088 #else
1089 __device__
1090 void* __get_dynamicgroupbaseptr();
1091 #endif // __HIP_DEVICE_COMPILE__
1092 
1093 __device__
1094 inline
1095 void *__amdgcn_get_dynamicgroupbaseptr() {
1096  return __get_dynamicgroupbaseptr();
1097 }
1098 
1099 #if defined(__HCC__) && (__hcc_major__ < 3) && (__hcc_minor__ < 3)
1100 // hip.amdgcn.bc - sync threads
1101 #define __CLK_LOCAL_MEM_FENCE 0x01
1102 typedef unsigned __cl_mem_fence_flags;
1103 
1104 typedef enum __memory_scope {
1105  __memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
1106  __memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
1107  __memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
1108  __memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
1109  __memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
1110 } __memory_scope;
1111 
1112 // enum values aligned with what clang uses in EmitAtomicExpr()
1113 typedef enum __memory_order
1114 {
1115  __memory_order_relaxed = __ATOMIC_RELAXED,
1116  __memory_order_acquire = __ATOMIC_ACQUIRE,
1117  __memory_order_release = __ATOMIC_RELEASE,
1118  __memory_order_acq_rel = __ATOMIC_ACQ_REL,
1119  __memory_order_seq_cst = __ATOMIC_SEQ_CST
1120 } __memory_order;
1121 
1122 __device__
1123 inline
1124 static void
1125 __atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __memory_scope scope)
1126 {
1127  // We're tying global-happens-before and local-happens-before together as does HSA
1128  if (order != __memory_order_relaxed) {
1129  switch (scope) {
1130  case __memory_scope_work_item:
1131  break;
1132  case __memory_scope_sub_group:
1133  switch (order) {
1134  case __memory_order_relaxed: break;
1135  case __memory_order_acquire: __llvm_fence_acq_sg(); break;
1136  case __memory_order_release: __llvm_fence_rel_sg(); break;
1137  case __memory_order_acq_rel: __llvm_fence_ar_sg(); break;
1138  case __memory_order_seq_cst: __llvm_fence_sc_sg(); break;
1139  }
1140  break;
1141  case __memory_scope_work_group:
1142  switch (order) {
1143  case __memory_order_relaxed: break;
1144  case __memory_order_acquire: __llvm_fence_acq_wg(); break;
1145  case __memory_order_release: __llvm_fence_rel_wg(); break;
1146  case __memory_order_acq_rel: __llvm_fence_ar_wg(); break;
1147  case __memory_order_seq_cst: __llvm_fence_sc_wg(); break;
1148  }
1149  break;
1150  case __memory_scope_device:
1151  switch (order) {
1152  case __memory_order_relaxed: break;
1153  case __memory_order_acquire: __llvm_fence_acq_dev(); break;
1154  case __memory_order_release: __llvm_fence_rel_dev(); break;
1155  case __memory_order_acq_rel: __llvm_fence_ar_dev(); break;
1156  case __memory_order_seq_cst: __llvm_fence_sc_dev(); break;
1157  }
1158  break;
1159  case __memory_scope_all_svm_devices:
1160  switch (order) {
1161  case __memory_order_relaxed: break;
1162  case __memory_order_acquire: __llvm_fence_acq_sys(); break;
1163  case __memory_order_release: __llvm_fence_rel_sys(); break;
1164  case __memory_order_acq_rel: __llvm_fence_ar_sys(); break;
1165  case __memory_order_seq_cst: __llvm_fence_sc_sys(); break;
1166  }
1167  break;
1168  }
1169  }
1170 }
1171 #endif
1172 
1173 // Memory Fence Functions
1174 __device__
1175 inline
1176 static void __threadfence()
1177 {
1178  __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
1179 }
1180 
1181 __device__
1182 inline
1183 static void __threadfence_block()
1184 {
1185  __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
1186 }
1187 
1188 __device__
1189 inline
1190 static void __threadfence_system()
1191 {
1192  __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
1193 }
1194 
1195 // abort
1196 __device__
1197 inline
1198 __attribute__((weak))
1199 void abort() {
1200  return __builtin_trap();
1201 }
1202 
1203 
1204 #endif // __HCC_OR_HIP_CLANG__
1205 
1206 #ifdef __HCC__
1207 
1212 // Macro to replace extern __shared__ declarations
1213 // to local variable definitions
1214 #define HIP_DYNAMIC_SHARED(type, var) type* var = (type*)__get_dynamicgroupbaseptr();
1215 
1216 #define HIP_DYNAMIC_SHARED_ATTRIBUTE
1217 
1218 
1219 #elif defined(__clang__) && defined(__HIP__)
1220 
1221 // The noinline attribute helps encapsulate the printf expansion,
1222 // which otherwise has a performance impact just by increasing the
1223 // size of the calling function. Additionally, the weak attribute
1224 // allows the function to exist as a global although its definition is
1225 // included in every compilation unit.
1226 #if defined(_WIN32) || defined(_WIN64)
1227 extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
1228 void _wassert(const wchar_t *_msg, const wchar_t *_file, unsigned _line) {
1229  // FIXME: Need `wchar_t` support to generate assertion message.
1230  __builtin_trap();
1231 }
1232 #else /* defined(_WIN32) || defined(_WIN64) */
1233 extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
1234 void __assert_fail(const char * __assertion,
1235  const char *__file,
1236  unsigned int __line,
1237  const char *__function)
1238 {
1239  printf("%s:%u: %s: Device-side assertion `%s' failed.\n", __file, __line,
1240  __function, __assertion);
1241  __builtin_trap();
1242 }
1243 
1244 extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
1245 void __assertfail(const char * __assertion,
1246  const char *__file,
1247  unsigned int __line,
1248  const char *__function,
1249  size_t charsize)
1250 {
1251  // ignore all the args for now.
1252  __builtin_trap();
1253 }
1254 #endif /* defined(_WIN32) || defined(_WIN64) */
1255 
1256 __device__
1257 inline
1258 static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
1259 {
1260  if (flags) {
1261  __atomic_work_item_fence(flags, __memory_order_release, scope);
1262  __builtin_amdgcn_s_barrier();
1263  __atomic_work_item_fence(flags, __memory_order_acquire, scope);
1264  } else {
1265  __builtin_amdgcn_s_barrier();
1266  }
1267 }
1268 
1269 __device__
1270 inline
1271 static void __barrier(int n)
1272 {
1273  __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
1274 }
1275 
1276 __device__
1277 inline
1278 __attribute__((convergent))
1279 void __syncthreads()
1280 {
1281  __barrier(__CLK_LOCAL_MEM_FENCE);
1282 }
1283 
1284 __device__
1285 inline
1286 __attribute__((convergent))
1287 int __syncthreads_count(int predicate)
1288 {
1289  return __ockl_wgred_add_i32(!!predicate);
1290 }
1291 
1292 __device__
1293 inline
1294 __attribute__((convergent))
1295 int __syncthreads_and(int predicate)
1296 {
1297  return __ockl_wgred_and_i32(!!predicate);
1298 }
1299 
1300 __device__
1301 inline
1302 __attribute__((convergent))
1303 int __syncthreads_or(int predicate)
1304 {
1305  return __ockl_wgred_or_i32(!!predicate);
1306 }
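
// Illustrative usage (not part of the original header): each predicate
// variant pairs a barrier with a block-wide reduction over !!predicate, and
// every thread receives the same result:
//   int n_hits = __syncthreads_count(hit); // number of threads with hit != 0
//   int all_ok = __syncthreads_and(ok);    // non-zero if ok != 0 in all threads
//   int any_ok = __syncthreads_or(ok);     // non-zero if ok != 0 in any thread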
1307 
1308 // hip.amdgcn.bc - device routine
1309 /*
1310  HW_ID Register bit structure
1311  WAVE_ID 3:0 Wave buffer slot number. 0-9.
1312  SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
1313  PIPE_ID 7:6 Pipeline from which the wave was dispatched.
1314  CU_ID 11:8 Compute Unit the wave is assigned to.
1315  SH_ID 12 Shader Array (within an SE) the wave is assigned to.
1316  SE_ID 14:13 Shader Engine the wave is assigned to.
1317  TG_ID 19:16 Thread-group ID
1318  VM_ID 23:20 Virtual Memory ID
1319  QUEUE_ID 26:24 Queue from which this wave was dispatched.
1320  STATE_ID 29:27 State ID (graphics only, not compute).
1321  ME_ID 31:30 Micro-engine ID.
1322  */
1323 
1324 #define HW_ID 4
1325 
1326 #define HW_ID_CU_ID_SIZE 4
1327 #define HW_ID_CU_ID_OFFSET 8
1328 
1329 #define HW_ID_SE_ID_SIZE 2
1330 #define HW_ID_SE_ID_OFFSET 13
1331 
1332 /*
1333  Encoding of parameter bitmask
1334  HW_ID 5:0 HW_ID
1335  OFFSET 10:6 Range: 0..31
1336  SIZE 15:11 Range: 1..32
1337  */
1338 
1339 #define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
1340 
1341 /*
1342  __smid returns the wave's assigned Compute Unit and Shader Engine.
1343  The Compute Unit, CU_ID returned in bits 3:0, and Shader Engine, SE_ID in bits 5:4.
1344  Note: the results vary over time.
1345  SZ minus 1 since SIZE is 1-based.
1346 */
1347 __device__
1348 inline
1349 unsigned __smid(void)
1350 {
1351  unsigned cu_id = __builtin_amdgcn_s_getreg(
1352  GETREG_IMMED(HW_ID_CU_ID_SIZE-1, HW_ID_CU_ID_OFFSET, HW_ID));
1353  unsigned se_id = __builtin_amdgcn_s_getreg(
1354  GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
1355 
1356  /* Each shader engine has 16 CU */
1357  return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
1358 }
1359 
1360 // Macro to replace extern __shared__ declarations
1361 // to local variable definitions
1362 #define HIP_DYNAMIC_SHARED(type, var) \
1363  type* var = (type*)__amdgcn_get_dynamicgroupbaseptr();
1364 
1365 #define HIP_DYNAMIC_SHARED_ATTRIBUTE
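
// Illustrative usage (not part of the original header): inside a kernel,
// HIP_DYNAMIC_SHARED declares a typed view of the dynamic LDS allocation
// sized at launch time, replacing CUDA's `extern __shared__` pattern:
//   __global__ void copy_via_lds(const float* in, float* out) {
//       HIP_DYNAMIC_SHARED(float, sdata)
//       sdata[threadIdx.x] = in[threadIdx.x];
//       __syncthreads();
//       out[threadIdx.x] = sdata[threadIdx.x];
//   }
// with the dynamic LDS byte count passed as the third launch parameter.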
1366 
1367 
1368 #endif //defined(__clang__) && defined(__HIP__)
1369 
1370 
1371 // loop unrolling
1372 static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
1373  auto dstPtr = static_cast<unsigned char*>(dst);
1374  auto srcPtr = static_cast<const unsigned char*>(src);
1375 
1376  while (size >= 4u) {
1377  dstPtr[0] = srcPtr[0];
1378  dstPtr[1] = srcPtr[1];
1379  dstPtr[2] = srcPtr[2];
1380  dstPtr[3] = srcPtr[3];
1381 
1382  size -= 4u;
1383  srcPtr += 4u;
1384  dstPtr += 4u;
1385  }
1386  switch (size) {
1387  case 3:
1388  dstPtr[2] = srcPtr[2];
1389  case 2:
1390  dstPtr[1] = srcPtr[1];
1391  case 1:
1392  dstPtr[0] = srcPtr[0];
1393  }
1394 
1395  return dst;
1396 }
1397 
1398 static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
1399  auto dstPtr = static_cast<unsigned char*>(dst);
1400 
1401  while (size >= 4u) {
1402  dstPtr[0] = val;
1403  dstPtr[1] = val;
1404  dstPtr[2] = val;
1405  dstPtr[3] = val;
1406 
1407  size -= 4u;
1408  dstPtr += 4u;
1409  }
1410  switch (size) {
1411  case 3:
1412  dstPtr[2] = val;
1413  case 2:
1414  dstPtr[1] = val;
1415  case 1:
1416  dstPtr[0] = val;
1417  }
1418 
1419  return dst;
1420 }
1421 #ifndef __OPENMP_AMDGCN__
1422 static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
1423  return __hip_hc_memcpy(dst, src, size);
1424 }
1425 
1426 static inline __device__ void* memset(void* ptr, int val, size_t size) {
1427  unsigned char val8 = static_cast<unsigned char>(val);
1428  return __hip_hc_memset(ptr, val8, size);
1429 }
1430 #endif // !__OPENMP_AMDGCN__
1431 #endif // HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H