HIP: Heterogenous-computing Interface for Portability
device_functions.h
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
23 #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
24 #define HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
25 
26 #include "host_defines.h"
27 #include "math_fwd.h"
28 
29 #include <hip/hip_runtime_api.h>
30 #include <stddef.h>
31 
32 
33 #include <hip/hip_vector_types.h>
36 /*
37 Integer Intrinsics
38 */
39 
40 // integer intrinsic function __poc __clz __ffs __brev
41 __device__ static inline unsigned int __popc(unsigned int input) {
42  return __builtin_popcount(input);
43 }
44 __device__ static inline unsigned int __popcll(unsigned long long int input) {
45  return __builtin_popcountl(input);
46 }
47 
48 __device__ static inline int __clz(int input) {
49  return __ockl_clz_u32((uint)input);
50 }
51 
52 __device__ static inline int __clzll(long long int input) {
53  return __ockl_clz_u64((ulong)input);
54 }
55 
56 __device__ static inline unsigned int __ffs(unsigned int input) {
57  return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
58 }
59 
60 __device__ static inline unsigned int __ffsll(unsigned long long int input) {
61  return ( input == 0 ? -1 : __builtin_ctzl(input) ) + 1;
62 }
63 
64 __device__ static inline unsigned int __ffs(int input) {
65  return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
66 }
67 
68 __device__ static inline unsigned int __ffsll(long long int input) {
69  return ( input == 0 ? -1 : __builtin_ctzl(input) ) + 1;
70 }
71 
72 __device__ static inline unsigned int __brev(unsigned int input) {
73  return __llvm_bitrev_b32(input);
74 }
75 
76 __device__ static inline unsigned long long int __brevll(unsigned long long int input) {
77  return __llvm_bitrev_b64(input);
78 }
79 
80 __device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
81  return input == 0 ? -1 : __builtin_ctzl(input);
82 }
83 
84 __device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
85  uint32_t offset = src1 & 31;
86  uint32_t width = src2 & 31;
87  return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
88 }
89 
90 __device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
91  uint64_t offset = src1 & 63;
92  uint64_t width = src2 & 63;
93  return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
94 }
95 
96 __device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
97  uint32_t offset = src2 & 31;
98  uint32_t width = src3 & 31;
99  uint32_t mask = (1 << width) - 1;
100  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
101 }
102 
103 __device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
104  uint64_t offset = src2 & 63;
105  uint64_t width = src3 & 63;
106  uint64_t mask = (1 << width) - 1;
107  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
108 }
109 
110 __device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
111 __device__ static unsigned int __hadd(int x, int y);
112 __device__ static int __mul24(int x, int y);
113 __device__ static long long int __mul64hi(long long int x, long long int y);
114 __device__ static int __mulhi(int x, int y);
115 __device__ static int __rhadd(int x, int y);
116 __device__ static unsigned int __sad(int x, int y, int z);
117 __device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
118 __device__ static int __umul24(unsigned int x, unsigned int y);
119 __device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
120 __device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
121 __device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
122 __device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
123 
124 struct ucharHolder {
125  union {
126  unsigned char c[4];
127  unsigned int ui;
128  };
129 } __attribute__((aligned(4)));
130 
131 struct uchar2Holder {
132  union {
133  unsigned int ui[2];
134  unsigned char c[8];
135  };
136 } __attribute__((aligned(8)));
137 
138 __device__
139 static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
140  struct uchar2Holder cHoldVal;
141  struct ucharHolder cHoldKey;
142  struct ucharHolder cHoldOut;
143  cHoldKey.ui = s;
144  cHoldVal.ui[0] = x;
145  cHoldVal.ui[1] = y;
146  cHoldOut.c[0] = cHoldVal.c[cHoldKey.c[0]];
147  cHoldOut.c[1] = cHoldVal.c[cHoldKey.c[1]];
148  cHoldOut.c[2] = cHoldVal.c[cHoldKey.c[2]];
149  cHoldOut.c[3] = cHoldVal.c[cHoldKey.c[3]];
150  return cHoldOut.ui;
151 }
152 
153 __device__ static inline unsigned int __hadd(int x, int y) {
154  int z = x + y;
155  int sign = z & 0x8000000;
156  int value = z & 0x7FFFFFFF;
157  return ((value) >> 1 || sign);
158 }
159 
160 __device__ static inline int __mul24(int x, int y) {
161  return __ockl_mul24_i32(x, y);
162 }
163 
164 __device__ static inline long long __mul64hi(long long int x, long long int y) {
165  ulong x0 = (ulong)x & 0xffffffffUL;
166  long x1 = x >> 32;
167  ulong y0 = (ulong)y & 0xffffffffUL;
168  long y1 = y >> 32;
169  ulong z0 = x0*y0;
170  long t = x1*y0 + (z0 >> 32);
171  long z1 = t & 0xffffffffL;
172  long z2 = t >> 32;
173  z1 = x0*y1 + z1;
174  return x1*y1 + z2 + (z1 >> 32);
175 }
176 
177 __device__ static inline int __mulhi(int x, int y) {
178  return __ockl_mul_hi_i32(x, y);
179 }
180 
181 __device__ static inline int __rhadd(int x, int y) {
182  int z = x + y + 1;
183  int sign = z & 0x8000000;
184  int value = z & 0x7FFFFFFF;
185  return ((value) >> 1 || sign);
186 }
187 __device__ static inline unsigned int __sad(int x, int y, int z) {
188  return x > y ? x - y + z : y - x + z;
189 }
190 __device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
191  return (x + y) >> 1;
192 }
193 __device__ static inline int __umul24(unsigned int x, unsigned int y) {
194  return __ockl_mul24_u32(x, y);
195 }
196 
197 __device__
198 static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
199  ulong x0 = x & 0xffffffffUL;
200  ulong x1 = x >> 32;
201  ulong y0 = y & 0xffffffffUL;
202  ulong y1 = y >> 32;
203  ulong z0 = x0*y0;
204  ulong t = x1*y0 + (z0 >> 32);
205  ulong z1 = t & 0xffffffffUL;
206  ulong z2 = t >> 32;
207  z1 = x0*y1 + z1;
208  return x1*y1 + z2 + (z1 >> 32);
209 }
210 
211 __device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
212  return __ockl_mul_hi_u32(x, y);
213 }
214 __device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
215  return (x + y + 1) >> 1;
216 }
217 __device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
218  return __ockl_sad_u32(x, y, z);
219 }
220 
221 __device__ static inline unsigned int __lane_id() { return __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); }
222 
223 /*
224 HIP specific device functions
225 */
226 
227 // utility union type
228 union __u {
229  int i;
230  unsigned int u;
231  float f;
232 };
233 
234 __device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
235  __u tmp; tmp.u = src;
236  tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
237  return tmp.u;
238 }
239 
240 __device__ static inline float __hip_ds_bpermutef(int index, float src) {
241  __u tmp; tmp.f = src;
242  tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
243  return tmp.f;
244 }
245 
246 __device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
247  __u tmp; tmp.u = src;
248  tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
249  return tmp.u;
250 }
251 
252 __device__ static inline float __hip_ds_permutef(int index, float src) {
253  __u tmp; tmp.u = src;
254  tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
255  return tmp.u;
256 }
257 
258 __device__ static inline unsigned __hip_ds_swizzle(unsigned int src, int pattern) {
259  __u tmp; tmp.u = src;
260  tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
261  return tmp.u;
262 }
263 __device__ static inline float __hip_ds_swizzlef(float src, int pattern) {
264  __u tmp; tmp.f = src;
265  tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
266  return tmp.f;
267 }
268 
269 __device__ static inline int __hip_move_dpp(int src, int dpp_ctrl, int row_mask,
270  int bank_mask, bool bound_ctrl) {
271  return __llvm_amdgcn_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
272 }
273 
274 static constexpr int warpSize = 64;
275 
276  __device__
277 inline
278 int __shfl(int var, int src_lane, int width = warpSize) {
279  int self = __lane_id();
280  int index = src_lane + (self & ~(width-1));
281  return __llvm_amdgcn_ds_bpermute(index<<2, var);
282 }
283 __device__
284 inline
285 unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
286  __u tmp; tmp.u = var;
287  tmp.i = __shfl(tmp.i, src_lane, width);
288  return tmp.u;
289 }
290 __device__
291 inline
292 float __shfl(float var, int src_lane, int width = warpSize) {
293  __u tmp; tmp.f = var;
294  tmp.i = __shfl(tmp.i, src_lane, width);
295  return tmp.f;
296 }
297 __device__
298 inline
299 double __shfl(double var, int src_lane, int width = warpSize) {
300  static_assert(sizeof(double) == 2 * sizeof(int), "");
301  static_assert(sizeof(double) == sizeof(uint64_t), "");
302 
303  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
304  tmp[0] = __shfl(tmp[0], src_lane, width);
305  tmp[1] = __shfl(tmp[1], src_lane, width);
306 
307  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
308  double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
309  return tmp1;
310 }
311 
312  __device__
313 inline
314 int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
315  int self = __lane_id();
316  int index = self - lane_delta;
317  index = (index < (self & ~(width-1)))?self:index;
318  return __llvm_amdgcn_ds_bpermute(index<<2, var);
319 }
320 __device__
321 inline
322 unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
323  __u tmp; tmp.u = var;
324  tmp.i = __shfl_up(tmp.i, lane_delta, width);
325  return tmp.u;
326 }
327 __device__
328 inline
329 float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
330  __u tmp; tmp.f = var;
331  tmp.i = __shfl_up(tmp.i, lane_delta, width);
332  return tmp.f;
333 }
334 __device__
335 inline
336 double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
337  static_assert(sizeof(double) == 2 * sizeof(int), "");
338  static_assert(sizeof(double) == sizeof(uint64_t), "");
339 
340  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
341  tmp[0] = __shfl_up(tmp[0], lane_delta, width);
342  tmp[1] = __shfl_up(tmp[1], lane_delta, width);
343 
344  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
345  double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
346  return tmp1;
347 }
348 
349 __device__
350 inline
351 int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
352  int self = __lane_id();
353  int index = self + lane_delta;
354  index = (int)((self&(width-1))+lane_delta) >= width?self:index;
355  return __llvm_amdgcn_ds_bpermute(index<<2, var);
356 }
357 __device__
358 inline
359 unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
360  __u tmp; tmp.u = var;
361  tmp.i = __shfl_down(tmp.i, lane_delta, width);
362  return tmp.u;
363 }
364 __device__
365 inline
366 float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
367  __u tmp; tmp.f = var;
368  tmp.i = __shfl_down(tmp.i, lane_delta, width);
369  return tmp.f;
370 }
371 __device__
372 inline
373 double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
374  static_assert(sizeof(double) == 2 * sizeof(int), "");
375  static_assert(sizeof(double) == sizeof(uint64_t), "");
376 
377  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
378  tmp[0] = __shfl_down(tmp[0], lane_delta, width);
379  tmp[1] = __shfl_down(tmp[1], lane_delta, width);
380 
381  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
382  double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
383  return tmp1;
384 }
385 
386 __device__
387 inline
388 int __shfl_xor(int var, int lane_mask, int width = warpSize) {
389  int self = __lane_id();
390  int index = self^lane_mask;
391  index = index >= ((self+width)&~(width-1))?self:index;
392  return __llvm_amdgcn_ds_bpermute(index<<2, var);
393 }
394 __device__
395 inline
396 unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
397  __u tmp; tmp.u = var;
398  tmp.i = __shfl_xor(tmp.i, lane_mask, width);
399  return tmp.u;
400 }
401 __device__
402 inline
403 float __shfl_xor(float var, int lane_mask, int width = warpSize) {
404  __u tmp; tmp.f = var;
405  tmp.i = __shfl_xor(tmp.i, lane_mask, width);
406  return tmp.f;
407 }
408 __device__
409 inline
410 double __shfl_xor(double var, int lane_mask, int width = warpSize) {
411  static_assert(sizeof(double) == 2 * sizeof(int), "");
412  static_assert(sizeof(double) == sizeof(uint64_t), "");
413 
414  int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
415  tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
416  tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
417 
418  uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
419  double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
420  return tmp1;
421 }
422 
423 #define MASK1 0x00ff00ff
424 #define MASK2 0xff00ff00
425 
426 __device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
427  char4 out;
428  unsigned one1 = in1.w & MASK1;
429  unsigned one2 = in2.w & MASK1;
430  out.w = (one1 + one2) & MASK1;
431  one1 = in1.w & MASK2;
432  one2 = in2.w & MASK2;
433  out.w = out.w | ((one1 + one2) & MASK2);
434  return out;
435 }
436 
437 __device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
438  char4 out;
439  unsigned one1 = in1.w & MASK1;
440  unsigned one2 = in2.w & MASK1;
441  out.w = (one1 - one2) & MASK1;
442  one1 = in1.w & MASK2;
443  one2 = in2.w & MASK2;
444  out.w = out.w | ((one1 - one2) & MASK2);
445  return out;
446 }
447 
448 __device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
449  char4 out;
450  unsigned one1 = in1.w & MASK1;
451  unsigned one2 = in2.w & MASK1;
452  out.w = (one1 * one2) & MASK1;
453  one1 = in1.w & MASK2;
454  one2 = in2.w & MASK2;
455  out.w = out.w | ((one1 * one2) & MASK2);
456  return out;
457 }
458 
459 /*
460  * Rounding modes are not yet supported in HIP
461  * TODO: Conversion functions are not correct, need to fix when BE is ready
462 */
463 
464 __device__ static inline float __double2float_rd(double x) { return (double)x; }
465 __device__ static inline float __double2float_rn(double x) { return (double)x; }
466 __device__ static inline float __double2float_ru(double x) { return (double)x; }
467 __device__ static inline float __double2float_rz(double x) { return (double)x; }
468 
469 __device__ static inline int __double2hiint(double x) {
470  static_assert(sizeof(double) == 2 * sizeof(int), "");
471 
472  int tmp[2];
473  __builtin_memcpy(tmp, &x, sizeof(tmp));
474 
475  return tmp[1];
476 }
477 __device__ static inline int __double2loint(double x) {
478  static_assert(sizeof(double) == 2 * sizeof(int), "");
479 
480  int tmp[2];
481  __builtin_memcpy(tmp, &x, sizeof(tmp));
482 
483  return tmp[0];
484 }
485 
486 __device__ static inline int __double2int_rd(double x) { return (int)x; }
487 __device__ static inline int __double2int_rn(double x) { return (int)x; }
488 __device__ static inline int __double2int_ru(double x) { return (int)x; }
489 __device__ static inline int __double2int_rz(double x) { return (int)x; }
490 
491 __device__ static inline long long int __double2ll_rd(double x) { return (long long int)x; }
492 __device__ static inline long long int __double2ll_rn(double x) { return (long long int)x; }
493 __device__ static inline long long int __double2ll_ru(double x) { return (long long int)x; }
494 __device__ static inline long long int __double2ll_rz(double x) { return (long long int)x; }
495 
496 __device__ static inline unsigned int __double2uint_rd(double x) { return (unsigned int)x; }
497 __device__ static inline unsigned int __double2uint_rn(double x) { return (unsigned int)x; }
498 __device__ static inline unsigned int __double2uint_ru(double x) { return (unsigned int)x; }
499 __device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
500 
501 __device__ static inline unsigned long long int __double2ull_rd(double x) {
502  return (unsigned long long int)x;
503 }
504 __device__ static inline unsigned long long int __double2ull_rn(double x) {
505  return (unsigned long long int)x;
506 }
507 __device__ static inline unsigned long long int __double2ull_ru(double x) {
508  return (unsigned long long int)x;
509 }
510 __device__ static inline unsigned long long int __double2ull_rz(double x) {
511  return (unsigned long long int)x;
512 }
513 
514 __device__ static inline long long int __double_as_longlong(double x) {
515  static_assert(sizeof(long long) == sizeof(double), "");
516 
517  long long tmp;
518  __builtin_memcpy(&tmp, &x, sizeof(tmp));
519 
520  return tmp;
521 }
522 
523 /*
524 __device__ unsigned short __float2half_rn(float x);
525 __device__ float __half2float(unsigned short);
526 
527 The above device function are not a valid .
528 Use
529 __device__ __half __float2half_rn(float x);
530 __device__ float __half2float(__half);
531 from hip_fp16.h
532 
533 CUDA implements half as unsigned short whereas, HIP doesn't.
534 
535 */
536 
537 __device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
538 __device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
539 __device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
540 __device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
541 
542 __device__ static inline long long int __float2ll_rd(float x) { return (long long int)x; }
543 __device__ static inline long long int __float2ll_rn(float x) { return (long long int)x; }
544 __device__ static inline long long int __float2ll_ru(float x) { return (long long int)x; }
545 __device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }
546 
547 __device__ static inline unsigned int __float2uint_rd(float x) { return (unsigned int)x; }
548 __device__ static inline unsigned int __float2uint_rn(float x) { return (unsigned int)x; }
549 __device__ static inline unsigned int __float2uint_ru(float x) { return (unsigned int)x; }
550 __device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
551 
552 __device__ static inline unsigned long long int __float2ull_rd(float x) {
553  return (unsigned long long int)x;
554 }
555 __device__ static inline unsigned long long int __float2ull_rn(float x) {
556  return (unsigned long long int)x;
557 }
558 __device__ static inline unsigned long long int __float2ull_ru(float x) {
559  return (unsigned long long int)x;
560 }
561 __device__ static inline unsigned long long int __float2ull_rz(float x) {
562  return (unsigned long long int)x;
563 }
564 
565 __device__ static inline int __float_as_int(float x) {
566  static_assert(sizeof(int) == sizeof(float), "");
567 
568  int tmp;
569  __builtin_memcpy(&tmp, &x, sizeof(tmp));
570 
571  return tmp;
572 }
573 
574 __device__ static inline unsigned int __float_as_uint(float x) {
575  static_assert(sizeof(unsigned int) == sizeof(float), "");
576 
577  unsigned int tmp;
578  __builtin_memcpy(&tmp, &x, sizeof(tmp));
579 
580  return tmp;
581 }
582 
583 __device__ static inline double __hiloint2double(int hi, int lo) {
584  static_assert(sizeof(double) == sizeof(uint64_t), "");
585 
586  uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
587  double tmp1;
588  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
589 
590  return tmp1;
591 }
592 
593 __device__ static inline double __int2double_rn(int x) { return (double)x; }
594 
595 __device__ static inline float __int2float_rd(int x) { return (float)x; }
596 __device__ static inline float __int2float_rn(int x) { return (float)x; }
597 __device__ static inline float __int2float_ru(int x) { return (float)x; }
598 __device__ static inline float __int2float_rz(int x) { return (float)x; }
599 
600 __device__ static inline float __int_as_float(int x) {
601  static_assert(sizeof(float) == sizeof(int), "");
602 
603  float tmp;
604  __builtin_memcpy(&tmp, &x, sizeof(tmp));
605 
606  return tmp;
607 }
608 
609 __device__ static inline double __ll2double_rd(long long int x) { return (double)x; }
610 __device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
611 __device__ static inline double __ll2double_ru(long long int x) { return (double)x; }
612 __device__ static inline double __ll2double_rz(long long int x) { return (double)x; }
613 
614 __device__ static inline float __ll2float_rd(long long int x) { return (float)x; }
615 __device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
616 __device__ static inline float __ll2float_ru(long long int x) { return (float)x; }
617 __device__ static inline float __ll2float_rz(long long int x) { return (float)x; }
618 
619 __device__ static inline double __longlong_as_double(long long int x) {
620  static_assert(sizeof(double) == sizeof(long long), "");
621 
622  double tmp;
623  __builtin_memcpy(&tmp, &x, sizeof(tmp));
624 
625  return tmp;
626 }
627 
628 __device__ static inline double __uint2double_rn(int x) { return (double)x; }
629 
630 __device__ static inline float __uint2float_rd(unsigned int x) { return (float)x; }
631 __device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
632 __device__ static inline float __uint2float_ru(unsigned int x) { return (float)x; }
633 __device__ static inline float __uint2float_rz(unsigned int x) { return (float)x; }
634 
635 __device__ static inline float __uint_as_float(unsigned int x) {
636  static_assert(sizeof(float) == sizeof(unsigned int), "");
637 
638  float tmp;
639  __builtin_memcpy(&tmp, &x, sizeof(tmp));
640 
641  return tmp;
642 }
643 
644 __device__ static inline double __ull2double_rd(unsigned long long int x) { return (double)x; }
645 __device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
646 __device__ static inline double __ull2double_ru(unsigned long long int x) { return (double)x; }
647 __device__ static inline double __ull2double_rz(unsigned long long int x) { return (double)x; }
648 
649 __device__ static inline float __ull2float_rd(unsigned long long int x) { return (float)x; }
650 __device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
651 __device__ static inline float __ull2float_ru(unsigned long long int x) { return (float)x; }
652 __device__ static inline float __ull2float_rz(unsigned long long int x) { return (float)x; }
653 
654 #if defined(__HCC__)
655 #define __HCC_OR_HIP_CLANG__ 1
656 #elif defined(__clang__) && defined(__HIP__)
657 #define __HCC_OR_HIP_CLANG__ 1
658 #else
659 #define __HCC_OR_HIP_CLANG__ 0
660 #endif
661 
662 #ifdef __HCC_OR_HIP_CLANG__
663 
664 // Clock functions
665 __device__ long long int __clock64();
666 __device__ long long int __clock();
667 __device__ long long int clock64();
668 __device__ long long int clock();
669 // hip.amdgcn.bc - named sync
670 __device__ void __named_sync(int a, int b);
671 
672 #ifdef __HIP_DEVICE_COMPILE__
673 
674 // Clock functions
675 #if __HCC__
676 extern "C" uint64_t __clock_u64() __HC__;
677 #endif
678 
679 __device__
680 inline __attribute((always_inline))
681 long long int __clock64() {
682 // ToDo: Unify HCC and HIP implementation.
683 #if __HCC__
684  return (long long int) __clock_u64();
685 #else
686  return (long long int) __builtin_amdgcn_s_memrealtime();
687 #endif
688 }
689 
690 __device__
691 inline __attribute((always_inline))
692 long long int __clock() { return __clock64(); }
693 
694 __device__
695 inline __attribute__((always_inline))
696 long long int clock64() { return __clock64(); }
697 
698 __device__
699 inline __attribute__((always_inline))
700 long long int clock() { return __clock(); }
701 
702 // hip.amdgcn.bc - named sync
703 __device__
704 inline
705 void __named_sync(int a, int b) { __builtin_amdgcn_s_barrier(); }
706 
707 #endif // __HIP_DEVICE_COMPILE__
708 
709 // warp vote function __all __any __ballot
710 __device__
711 inline
712 int __all(int predicate) {
713  return __ockl_wfall_i32(predicate);
714 }
715 
716 __device__
717 inline
718 int __any(int predicate) {
719  return __ockl_wfany_i32(predicate);
720 }
721 
722 // XXX from llvm/include/llvm/IR/InstrTypes.h
723 #define ICMP_NE 33
724 
725 __device__
726 inline
727 unsigned long long int __ballot(int predicate) {
728  return __llvm_amdgcn_icmp_i32(predicate, 0, ICMP_NE);
729 }
730 
731 __device__
732 inline
733 unsigned long long int __ballot64(int predicate) {
734  return __llvm_amdgcn_icmp_i32(predicate, 0, ICMP_NE);
735 }
736 
737 // hip.amdgcn.bc - lanemask
738 __device__
739 inline
740 int64_t __lanemask_gt()
741 {
742  int32_t activelane = __ockl_activelane_u32();
743  int64_t ballot = __ballot64(1);
744  if (activelane != 63) {
745  int64_t tmp = (~0UL) << (activelane + 1);
746  return tmp & ballot;
747  }
748  return 0;
749 }
750 
751 __device__
752 inline
753 int64_t __lanemask_lt()
754 {
755  int32_t activelane = __ockl_activelane_u32();
756  int64_t ballot = __ballot64(1);
757  if (activelane == 0)
758  return 0;
759  return ballot;
760 }
761 
762 __device__ inline void* __local_to_generic(void* p) { return p; }
763 
764 #ifdef __HIP_DEVICE_COMPILE__
765 __device__
766 inline
767 void* __get_dynamicgroupbaseptr()
768 {
769  // Get group segment base pointer.
770  return (char*)__local_to_generic((void*)__to_local(__llvm_amdgcn_groupstaticsize()));
771 }
772 #else
773 __device__
774 void* __get_dynamicgroupbaseptr();
775 #endif // __HIP_DEVICE_COMPILE__
776 
777 __device__
778 inline
779 void *__amdgcn_get_dynamicgroupbaseptr() {
780  return __get_dynamicgroupbaseptr();
781 }
782 
783 #if defined(__HCC__) && (__hcc_minor__ < 3)
784 // hip.amdgcn.bc - sync threads
785 #define __CLK_LOCAL_MEM_FENCE 0x01
786 typedef unsigned __cl_mem_fence_flags;
787 
788 typedef enum __memory_scope {
789  __memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
790  __memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
791  __memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
792  __memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
793  __memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
794 } __memory_scope;
795 
796 // enum values aligned with what clang uses in EmitAtomicExpr()
797 typedef enum __memory_order
798 {
799  __memory_order_relaxed = __ATOMIC_RELAXED,
800  __memory_order_acquire = __ATOMIC_ACQUIRE,
801  __memory_order_release = __ATOMIC_RELEASE,
802  __memory_order_acq_rel = __ATOMIC_ACQ_REL,
803  __memory_order_seq_cst = __ATOMIC_SEQ_CST
804 } __memory_order;
805 
806 __device__
807 inline
808 static void
809 __atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __memory_scope scope)
810 {
811  // We're tying global-happens-before and local-happens-before together as does HSA
812  if (order != __memory_order_relaxed) {
813  switch (scope) {
814  case __memory_scope_work_item:
815  break;
816  case __memory_scope_sub_group:
817  switch (order) {
818  case __memory_order_relaxed: break;
819  case __memory_order_acquire: __llvm_fence_acq_sg(); break;
820  case __memory_order_release: __llvm_fence_rel_sg(); break;
821  case __memory_order_acq_rel: __llvm_fence_ar_sg(); break;
822  case __memory_order_seq_cst: __llvm_fence_sc_sg(); break;
823  }
824  break;
825  case __memory_scope_work_group:
826  switch (order) {
827  case __memory_order_relaxed: break;
828  case __memory_order_acquire: __llvm_fence_acq_wg(); break;
829  case __memory_order_release: __llvm_fence_rel_wg(); break;
830  case __memory_order_acq_rel: __llvm_fence_ar_wg(); break;
831  case __memory_order_seq_cst: __llvm_fence_sc_wg(); break;
832  }
833  break;
834  case __memory_scope_device:
835  switch (order) {
836  case __memory_order_relaxed: break;
837  case __memory_order_acquire: __llvm_fence_acq_dev(); break;
838  case __memory_order_release: __llvm_fence_rel_dev(); break;
839  case __memory_order_acq_rel: __llvm_fence_ar_dev(); break;
840  case __memory_order_seq_cst: __llvm_fence_sc_dev(); break;
841  }
842  break;
843  case __memory_scope_all_svm_devices:
844  switch (order) {
845  case __memory_order_relaxed: break;
846  case __memory_order_acquire: __llvm_fence_acq_sys(); break;
847  case __memory_order_release: __llvm_fence_rel_sys(); break;
848  case __memory_order_acq_rel: __llvm_fence_ar_sys(); break;
849  case __memory_order_seq_cst: __llvm_fence_sc_sys(); break;
850  }
851  break;
852  }
853  }
854 }
855 #endif
856 
857 // Memory Fence Functions
858 __device__
859 inline
860 static void __threadfence()
861 {
862  __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
863 }
864 
865 __device__
866 inline
867 static void __threadfence_block()
868 {
869  __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
870 }
871 
872 __device__
873 inline
874 static void __threadfence_system()
875 {
876  __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
877 }
878 
879 // abort
880 __device__
881 inline
882 __attribute__((weak))
883 void abort() {
884  return __builtin_trap();
885 }
886 
887 
888 #endif // __HCC_OR_HIP_CLANG__
889 
890 #ifdef __HCC__
891 
896 // Macro to replace extern __shared__ declarations
897 // to local variable definitions
898 #define HIP_DYNAMIC_SHARED(type, var) type* var = (type*)__get_dynamicgroupbaseptr();
899 
900 #define HIP_DYNAMIC_SHARED_ATTRIBUTE
901 
902 
903 #elif defined(__clang__) && defined(__HIP__)
904 
905 #pragma push_macro("__DEVICE__")
906 #define __DEVICE__ extern "C" __device__ __attribute__((always_inline)) \
907  __attribute__((weak))
908 
909 __DEVICE__
910 inline
911 void __assert_fail(const char * __assertion,
912  const char *__file,
913  unsigned int __line,
914  const char *__function)
915 {
916  // Ignore all the args for now.
917  __builtin_trap();
918 }
919 
920 __DEVICE__
921 inline
922 void __assertfail(const char * __assertion,
923  const char *__file,
924  unsigned int __line,
925  const char *__function,
926  size_t charsize)
927 {
928  // ignore all the args for now.
929  __builtin_trap();
930 }
931 
932 __device__
933 inline
934 static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
935 {
936  if (flags) {
937  __atomic_work_item_fence(flags, __memory_order_release, scope);
938  __builtin_amdgcn_s_barrier();
939  __atomic_work_item_fence(flags, __memory_order_acquire, scope);
940  } else {
941  __builtin_amdgcn_s_barrier();
942  }
943 }
944 
945 __device__
946 inline
947 static void __barrier(int n)
948 {
949  __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
950 }
951 
952 __device__
953 inline
954 __attribute__((noduplicate))
955 void __syncthreads()
956 {
957  __barrier(__CLK_LOCAL_MEM_FENCE);
958 }
959 
960 // hip.amdgcn.bc - device routine
961 /*
962  HW_ID Register bit structure
963  WAVE_ID 3:0 Wave buffer slot number. 0-9.
964  SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
965  PIPE_ID 7:6 Pipeline from which the wave was dispatched.
966  CU_ID 11:8 Compute Unit the wave is assigned to.
967  SH_ID 12 Shader Array (within an SE) the wave is assigned to.
968  SE_ID 14:13 Shader Engine the wave is assigned to.
969  TG_ID 19:16 Thread-group ID
970  VM_ID 23:20 Virtual Memory ID
971  QUEUE_ID 26:24 Queue from which this wave was dispatched.
972  STATE_ID 29:27 State ID (graphics only, not compute).
973  ME_ID 31:30 Micro-engine ID.
974  */
975 
976 #define HW_ID 4
977 
978 #define HW_ID_CU_ID_SIZE 4
979 #define HW_ID_CU_ID_OFFSET 8
980 
981 #define HW_ID_SE_ID_SIZE 2
982 #define HW_ID_SE_ID_OFFSET 13
983 
984 /*
985  Encoding of parameter bitmask
986  HW_ID 5:0 HW_ID
987  OFFSET 10:6 Range: 0..31
988  SIZE 15:11 Range: 1..32
989  */
990 
991 #define GETREG_IMMED(SZ,OFF,REG) (SZ << 11) | (OFF << 6) | REG
992 
993 __device__
994 inline
995 unsigned __smid(void)
996 {
997  unsigned cu_id = __builtin_amdgcn_s_getreg(
998  GETREG_IMMED(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID));
999  unsigned se_id = __builtin_amdgcn_s_getreg(
1000  GETREG_IMMED(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID));
1001 
1002  /* Each shader engine has 16 CU */
1003  return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
1004 }
1005 
1006 #pragma push_macro("__DEVICE__")
1007 
1008 // Macro to replace extern __shared__ declarations
1009 // to local variable definitions
1010 #define HIP_DYNAMIC_SHARED(type, var) \
1011  type* var = (type*)__amdgcn_get_dynamicgroupbaseptr();
1012 
1013 #define HIP_DYNAMIC_SHARED_ATTRIBUTE
1014 
1015 
1016 #endif //defined(__clang__) && defined(__HIP__)
1017 
1018 
1019 // loop unrolling
1020 static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
1021  auto dstPtr = static_cast<unsigned char*>(dst);
1022  auto srcPtr = static_cast<const unsigned char*>(src);
1023 
1024  while (size >= 4u) {
1025  dstPtr[0] = srcPtr[0];
1026  dstPtr[1] = srcPtr[1];
1027  dstPtr[2] = srcPtr[2];
1028  dstPtr[3] = srcPtr[3];
1029 
1030  size -= 4u;
1031  srcPtr += 4u;
1032  dstPtr += 4u;
1033  }
1034  switch (size) {
1035  case 3:
1036  dstPtr[2] = srcPtr[2];
1037  case 2:
1038  dstPtr[1] = srcPtr[1];
1039  case 1:
1040  dstPtr[0] = srcPtr[0];
1041  }
1042 
1043  return dst;
1044 }
1045 
1046 static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
1047  auto dstPtr = static_cast<unsigned char*>(dst);
1048 
1049  while (size >= 4u) {
1050  dstPtr[0] = val;
1051  dstPtr[1] = val;
1052  dstPtr[2] = val;
1053  dstPtr[3] = val;
1054 
1055  size -= 4u;
1056  dstPtr += 4u;
1057  }
1058  switch (size) {
1059  case 3:
1060  dstPtr[2] = val;
1061  case 2:
1062  dstPtr[1] = val;
1063  case 1:
1064  dstPtr[0] = val;
1065  }
1066 
1067  return dst;
1068 }
1069 static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
1070  return __hip_hc_memcpy(dst, src, size);
1071 }
1072 
1073 static inline __device__ void* memset(void* ptr, int val, size_t size) {
1074  unsigned char val8 = static_cast<unsigned char>(val);
1075  return __hip_hc_memset(ptr, val8, size);
1076 }
1077 
1078 #endif
TODO-doc.
Contains declarations for types and functions in device library.
Definition: device_functions.h:124
Definition: device_functions.h:228
Definition: device_functions.h:131
Contains declarations for wrapper functions for llvm intrinsics like llvm.amdgcn.s.barrier.