HIP: Heterogenous-computing Interface for Portability
device_functions.h
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
23 #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
24 #define HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
25 
26 #include "host_defines.h"
27 #include "math_fwd.h"
28 
29 #include <hip/hip_runtime_api.h>
30 #include <stddef.h>
31 
32 
33 #include <hip/hip_vector_types.h>
36 /*
37 Integer Intrinsics
38 */
39 
40 // integer intrinsic function __poc __clz __ffs __brev
41 __device__ static inline unsigned int __popc(unsigned int input) {
42  return __builtin_popcount(input);
43 }
44 __device__ static inline unsigned int __popcll(unsigned long long int input) {
45  return __builtin_popcountl(input);
46 }
47 
48 __device__ static inline int __clz(int input) {
49  return __ockl_clz_u32((uint)input);
50 }
51 
52 __device__ static inline int __clzll(long long int input) {
53  return __ockl_clz_u64((ulong)input);
54 }
55 
56 __device__ static inline unsigned int __ffs(unsigned int input) {
57  return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
58 }
59 
60 __device__ static inline unsigned int __ffsll(unsigned long long int input) {
61  return ( input == 0 ? -1 : __builtin_ctzl(input) ) + 1;
62 }
63 
64 __device__ static inline unsigned int __ffs(int input) {
65  return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
66 }
67 
68 __device__ static inline unsigned int __ffsll(long long int input) {
69  return ( input == 0 ? -1 : __builtin_ctzl(input) ) + 1;
70 }
71 
72 __device__ static inline unsigned int __brev(unsigned int input) {
73  return __llvm_bitrev_b32(input);
74 }
75 
76 __device__ static inline unsigned long long int __brevll(unsigned long long int input) {
77  return __llvm_bitrev_b64(input);
78 }
79 
80 __device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
81  return input == 0 ? -1 : __builtin_ctzl(input);
82 }
83 
84 __device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
85  uint32_t offset = src1 & 31;
86  uint32_t width = src2 & 31;
87  return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
88 }
89 
90 __device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
91  uint64_t offset = src1 & 63;
92  uint64_t width = src2 & 63;
93  return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
94 }
95 
96 __device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
97  uint32_t offset = src2 & 31;
98  uint32_t width = src3 & 31;
99  uint32_t mask = (1 << width) - 1;
100  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
101 }
102 
103 __device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
104  uint64_t offset = src2 & 63;
105  uint64_t width = src3 & 63;
106  uint64_t mask = (1 << width) - 1;
107  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
108 }
109 
110 __device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
111 __device__ static unsigned int __hadd(int x, int y);
112 __device__ static int __mul24(int x, int y);
113 __device__ static long long int __mul64hi(long long int x, long long int y);
114 __device__ static int __mulhi(int x, int y);
115 __device__ static int __rhadd(int x, int y);
116 __device__ static unsigned int __sad(int x, int y, int z);
117 __device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
118 __device__ static int __umul24(unsigned int x, unsigned int y);
119 __device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
120 __device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
121 __device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
122 __device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
123 
124 struct ucharHolder {
125  union {
126  unsigned char c[4];
127  unsigned int ui;
128  };
129 } __attribute__((aligned(4)));
130 
131 struct uchar2Holder {
132  union {
133  unsigned int ui[2];
134  unsigned char c[8];
135  };
136 } __attribute__((aligned(8)));
137 
138 __device__
139 static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
140  struct uchar2Holder cHoldVal;
141  struct ucharHolder cHoldKey;
142  struct ucharHolder cHoldOut;
143  cHoldKey.ui = s;
144  cHoldVal.ui[0] = x;
145  cHoldVal.ui[1] = y;
146  cHoldOut.c[0] = cHoldVal.c[cHoldKey.c[0]];
147  cHoldOut.c[1] = cHoldVal.c[cHoldKey.c[1]];
148  cHoldOut.c[2] = cHoldVal.c[cHoldKey.c[2]];
149  cHoldOut.c[3] = cHoldVal.c[cHoldKey.c[3]];
150  return cHoldOut.ui;
151 }
152 
153 __device__ static inline unsigned int __hadd(int x, int y) {
154  int z = x + y;
155  int sign = z & 0x8000000;
156  int value = z & 0x7FFFFFFF;
157  return ((value) >> 1 || sign);
158 }
159 
160 __device__ static inline int __mul24(int x, int y) {
161  return __ockl_mul24_i32(x, y);
162 }
163 
164 __device__ static inline long long __mul64hi(long long int x, long long int y) {
165  ulong x0 = (ulong)x & 0xffffffffUL;
166  long x1 = x >> 32;
167  ulong y0 = (ulong)y & 0xffffffffUL;
168  long y1 = y >> 32;
169  ulong z0 = x0*y0;
170  long t = x1*y0 + (z0 >> 32);
171  long z1 = t & 0xffffffffL;
172  long z2 = t >> 32;
173  z1 = x0*y1 + z1;
174  return x1*y1 + z2 + (z1 >> 32);
175 }
176 
177 __device__ static inline int __mulhi(int x, int y) {
178  return __ockl_mul_hi_i32(x, y);
179 }
180 
181 __device__ static inline int __rhadd(int x, int y) {
182  int z = x + y + 1;
183  int sign = z & 0x8000000;
184  int value = z & 0x7FFFFFFF;
185  return ((value) >> 1 || sign);
186 }
187 __device__ static inline unsigned int __sad(int x, int y, int z) {
188  return x > y ? x - y + z : y - x + z;
189 }
190 __device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
191  return (x + y) >> 1;
192 }
193 __device__ static inline int __umul24(unsigned int x, unsigned int y) {
194  return __ockl_mul24_u32(x, y);
195 }
196 
197 __device__
198 static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
199  ulong x0 = x & 0xffffffffUL;
200  ulong x1 = x >> 32;
201  ulong y0 = y & 0xffffffffUL;
202  ulong y1 = y >> 32;
203  ulong z0 = x0*y0;
204  ulong t = x1*y0 + (z0 >> 32);
205  ulong z1 = t & 0xffffffffUL;
206  ulong z2 = t >> 32;
207  z1 = x0*y1 + z1;
208  return x1*y1 + z2 + (z1 >> 32);
209 }
210 
211 __device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
212  return __ockl_mul_hi_u32(x, y);
213 }
214 __device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
215  return (x + y + 1) >> 1;
216 }
217 __device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
218  return __ockl_sad_u32(x, y, z);
219 }
220 
221 __device__ static inline unsigned int __lane_id() { return __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); }
222 
223 /*
224 HIP specific device functions
225 */
226 
227 // utility union type
228 union __u {
229  int i;
230  unsigned int u;
231  float f;
232 };
233 
234 __device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
235  __u tmp; tmp.u = src;
236  tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
237  return tmp.u;
238 }
239 
240 __device__ static inline float __hip_ds_bpermutef(int index, float src) {
241  __u tmp; tmp.f = src;
242  tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
243  return tmp.f;
244 }
245 
246 __device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
247  __u tmp; tmp.u = src;
248  tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
249  return tmp.u;
250 }
251 
252 __device__ static inline float __hip_ds_permutef(int index, float src) {
253  __u tmp; tmp.u = src;
254  tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
255  return tmp.u;
256 }
257 
258 __device__ static inline unsigned __hip_ds_swizzle(unsigned int src, int pattern) {
259  __u tmp; tmp.u = src;
260  tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
261  return tmp.u;
262 }
263 __device__ static inline float __hip_ds_swizzlef(float src, int pattern) {
264  __u tmp; tmp.f = src;
265  tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
266  return tmp.f;
267 }
268 
269 __device__ static inline int __hip_move_dpp(int src, int dpp_ctrl, int row_mask,
270  int bank_mask, bool bound_ctrl) {
271  return __llvm_amdgcn_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
272 }
273 
274 static constexpr int warpSize = 64;
275 
276  __device__
277 inline
278 int __shfl(int var, int src_lane, int width = warpSize) {
279  int self = __lane_id();
280  int index = src_lane + (self & ~(width-1));
281  return __llvm_amdgcn_ds_bpermute(index<<2, var);
282 }
283 __device__
284 inline
285 unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
286  __u tmp; tmp.u = var;
287  tmp.i = __shfl(tmp.i, src_lane, width);
288  return tmp.u;
289 }
290 __device__
291 inline
292 float __shfl(float var, int src_lane, int width = warpSize) {
293  __u tmp; tmp.f = var;
294  tmp.i = __shfl(tmp.i, src_lane, width);
295  return tmp.f;
296 }
297 __device__
298 inline
299 double __shfl(double var, int src_lane, int width = warpSize) {
300  __u tmp; tmp.f = (float) var;
301  tmp.i = __shfl(tmp.i, src_lane, width);
302  return (double) tmp.f;
303 }
304 
305  __device__
306 inline
307 int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
308  int self = __lane_id();
309  int index = self - lane_delta;
310  index = (index < (self & ~(width-1)))?self:index;
311  return __llvm_amdgcn_ds_bpermute(index<<2, var);
312 }
313 __device__
314 inline
315 unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
316  __u tmp; tmp.u = var;
317  tmp.i = __shfl_up(tmp.i, lane_delta, width);
318  return tmp.u;
319 }
320 __device__
321 inline
322 float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
323  __u tmp; tmp.f = var;
324  tmp.i = __shfl_up(tmp.i, lane_delta, width);
325  return tmp.f;
326 }
327 __device__
328 inline
329 double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
330  __u tmp; tmp.f = (float) var;
331  tmp.i = __shfl_up(tmp.i, lane_delta, width);
332  return (double) tmp.f;
333 }
334 
335 __device__
336 inline
337 int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
338  int self = __lane_id();
339  int index = self + lane_delta;
340  index = (int)((self&(width-1))+lane_delta) >= width?self:index;
341  return __llvm_amdgcn_ds_bpermute(index<<2, var);
342 }
343 __device__
344 inline
345 unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
346  __u tmp; tmp.u = var;
347  tmp.i = __shfl_down(tmp.i, lane_delta, width);
348  return tmp.u;
349 }
350 __device__
351 inline
352 float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
353  __u tmp; tmp.f = var;
354  tmp.i = __shfl_down(tmp.i, lane_delta, width);
355  return tmp.f;
356 }
357 __device__
358 inline
359 double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
360  __u tmp; tmp.f = (float) var;
361  tmp.i = __shfl_down(tmp.i, lane_delta, width);
362  return (double) tmp.f;
363 }
364 
365 __device__
366 inline
367 int __shfl_xor(int var, int lane_mask, int width = warpSize) {
368  int self = __lane_id();
369  int index = self^lane_mask;
370  index = index >= ((self+width)&~(width-1))?self:index;
371  return __llvm_amdgcn_ds_bpermute(index<<2, var);
372 }
373 __device__
374 inline
375 unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
376  __u tmp; tmp.u = var;
377  tmp.i = __shfl_xor(tmp.i, lane_mask, width);
378  return tmp.u;
379 }
380 __device__
381 inline
382 float __shfl_xor(float var, int lane_mask, int width = warpSize) {
383  __u tmp; tmp.f = var;
384  tmp.i = __shfl_xor(tmp.i, lane_mask, width);
385  return tmp.f;
386 }
387 __device__
388 inline
389 double __shfl_xor(double var, int lane_mask, int width = warpSize) {
390  __u tmp; tmp.f = (float) var;
391  tmp.i = __shfl_xor(tmp.i, lane_mask, width);
392  return (double) tmp.f;
393 }
394 
395 #define MASK1 0x00ff00ff
396 #define MASK2 0xff00ff00
397 
398 __device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
399  char4 out;
400  unsigned one1 = in1.a & MASK1;
401  unsigned one2 = in2.a & MASK1;
402  out.a = (one1 + one2) & MASK1;
403  one1 = in1.a & MASK2;
404  one2 = in2.a & MASK2;
405  out.a = out.a | ((one1 + one2) & MASK2);
406  return out;
407 }
408 
409 __device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
410  char4 out;
411  unsigned one1 = in1.a & MASK1;
412  unsigned one2 = in2.a & MASK1;
413  out.a = (one1 - one2) & MASK1;
414  one1 = in1.a & MASK2;
415  one2 = in2.a & MASK2;
416  out.a = out.a | ((one1 - one2) & MASK2);
417  return out;
418 }
419 
420 __device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
421  char4 out;
422  unsigned one1 = in1.a & MASK1;
423  unsigned one2 = in2.a & MASK1;
424  out.a = (one1 * one2) & MASK1;
425  one1 = in1.a & MASK2;
426  one2 = in2.a & MASK2;
427  out.a = out.a | ((one1 * one2) & MASK2);
428  return out;
429 }
430 
431 /*
432  * Rounding modes are not yet supported in HIP
433  * TODO: Conversion functions are not correct, need to fix when BE is ready
434 */
435 
436 __device__ static inline float __double2float_rd(double x) { return (double)x; }
437 __device__ static inline float __double2float_rn(double x) { return (double)x; }
438 __device__ static inline float __double2float_ru(double x) { return (double)x; }
439 __device__ static inline float __double2float_rz(double x) { return (double)x; }
440 
441 __device__ static inline int __double2hiint(double x) {
442  static_assert(sizeof(double) == 2 * sizeof(int), "");
443 
444  int tmp[2];
445  __builtin_memcpy(tmp, &x, sizeof(tmp));
446 
447  return tmp[1];
448 }
449 __device__ static inline int __double2loint(double x) {
450  static_assert(sizeof(double) == 2 * sizeof(int), "");
451 
452  int tmp[2];
453  __builtin_memcpy(tmp, &x, sizeof(tmp));
454 
455  return tmp[0];
456 }
457 
458 __device__ static inline int __double2int_rd(double x) { return (int)x; }
459 __device__ static inline int __double2int_rn(double x) { return (int)x; }
460 __device__ static inline int __double2int_ru(double x) { return (int)x; }
461 __device__ static inline int __double2int_rz(double x) { return (int)x; }
462 
463 __device__ static inline long long int __double2ll_rd(double x) { return (long long int)x; }
464 __device__ static inline long long int __double2ll_rn(double x) { return (long long int)x; }
465 __device__ static inline long long int __double2ll_ru(double x) { return (long long int)x; }
466 __device__ static inline long long int __double2ll_rz(double x) { return (long long int)x; }
467 
468 __device__ static inline unsigned int __double2uint_rd(double x) { return (unsigned int)x; }
469 __device__ static inline unsigned int __double2uint_rn(double x) { return (unsigned int)x; }
470 __device__ static inline unsigned int __double2uint_ru(double x) { return (unsigned int)x; }
471 __device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
472 
473 __device__ static inline unsigned long long int __double2ull_rd(double x) {
474  return (unsigned long long int)x;
475 }
476 __device__ static inline unsigned long long int __double2ull_rn(double x) {
477  return (unsigned long long int)x;
478 }
479 __device__ static inline unsigned long long int __double2ull_ru(double x) {
480  return (unsigned long long int)x;
481 }
482 __device__ static inline unsigned long long int __double2ull_rz(double x) {
483  return (unsigned long long int)x;
484 }
485 
486 __device__ static inline long long int __double_as_longlong(double x) {
487  static_assert(sizeof(long long) == sizeof(double), "");
488 
489  long long tmp;
490  __builtin_memcpy(&tmp, &x, sizeof(tmp));
491 
492  return tmp;
493 }
494 
495 /*
496 __device__ unsigned short __float2half_rn(float x);
497 __device__ float __half2float(unsigned short);
498 
499 The above device function are not a valid .
500 Use
501 __device__ __half __float2half_rn(float x);
502 __device__ float __half2float(__half);
503 from hip_fp16.h
504 
505 CUDA implements half as unsigned short whereas, HIP doesn't.
506 
507 */
508 
509 __device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
510 __device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
511 __device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
512 __device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
513 
514 __device__ static inline long long int __float2ll_rd(float x) { return (long long int)x; }
515 __device__ static inline long long int __float2ll_rn(float x) { return (long long int)x; }
516 __device__ static inline long long int __float2ll_ru(float x) { return (long long int)x; }
517 __device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }
518 
519 __device__ static inline unsigned int __float2uint_rd(float x) { return (unsigned int)x; }
520 __device__ static inline unsigned int __float2uint_rn(float x) { return (unsigned int)x; }
521 __device__ static inline unsigned int __float2uint_ru(float x) { return (unsigned int)x; }
522 __device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
523 
524 __device__ static inline unsigned long long int __float2ull_rd(float x) {
525  return (unsigned long long int)x;
526 }
527 __device__ static inline unsigned long long int __float2ull_rn(float x) {
528  return (unsigned long long int)x;
529 }
530 __device__ static inline unsigned long long int __float2ull_ru(float x) {
531  return (unsigned long long int)x;
532 }
533 __device__ static inline unsigned long long int __float2ull_rz(float x) {
534  return (unsigned long long int)x;
535 }
536 
537 __device__ static inline int __float_as_int(float x) {
538  static_assert(sizeof(int) == sizeof(float), "");
539 
540  int tmp;
541  __builtin_memcpy(&tmp, &x, sizeof(tmp));
542 
543  return tmp;
544 }
545 
546 __device__ static inline unsigned int __float_as_uint(float x) {
547  static_assert(sizeof(unsigned int) == sizeof(float), "");
548 
549  unsigned int tmp;
550  __builtin_memcpy(&tmp, &x, sizeof(tmp));
551 
552  return tmp;
553 }
554 
555 __device__ static inline double __hiloint2double(int hi, int lo) {
556  static_assert(sizeof(double) == sizeof(uint64_t), "");
557 
558  uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
559  double tmp1;
560  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
561 
562  return tmp1;
563 }
564 
565 __device__ static inline double __int2double_rn(int x) { return (double)x; }
566 
567 __device__ static inline float __int2float_rd(int x) { return (float)x; }
568 __device__ static inline float __int2float_rn(int x) { return (float)x; }
569 __device__ static inline float __int2float_ru(int x) { return (float)x; }
570 __device__ static inline float __int2float_rz(int x) { return (float)x; }
571 
572 __device__ static inline float __int_as_float(int x) {
573  static_assert(sizeof(float) == sizeof(int), "");
574 
575  float tmp;
576  __builtin_memcpy(&tmp, &x, sizeof(tmp));
577 
578  return tmp;
579 }
580 
581 __device__ static inline double __ll2double_rd(long long int x) { return (double)x; }
582 __device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
583 __device__ static inline double __ll2double_ru(long long int x) { return (double)x; }
584 __device__ static inline double __ll2double_rz(long long int x) { return (double)x; }
585 
586 __device__ static inline float __ll2float_rd(long long int x) { return (float)x; }
587 __device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
588 __device__ static inline float __ll2float_ru(long long int x) { return (float)x; }
589 __device__ static inline float __ll2float_rz(long long int x) { return (float)x; }
590 
591 __device__ static inline double __longlong_as_double(long long int x) {
592  static_assert(sizeof(double) == sizeof(long long), "");
593 
594  double tmp;
595  __builtin_memcpy(&tmp, &x, sizeof(tmp));
596 
597  return tmp;
598 }
599 
600 __device__ static inline double __uint2double_rn(int x) { return (double)x; }
601 
602 __device__ static inline float __uint2float_rd(unsigned int x) { return (float)x; }
603 __device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
604 __device__ static inline float __uint2float_ru(unsigned int x) { return (float)x; }
605 __device__ static inline float __uint2float_rz(unsigned int x) { return (float)x; }
606 
607 __device__ static inline float __uint_as_float(unsigned int x) {
608  static_assert(sizeof(float) == sizeof(unsigned int), "");
609 
610  float tmp;
611  __builtin_memcpy(&tmp, &x, sizeof(tmp));
612 
613  return tmp;
614 }
615 
616 __device__ static inline double __ull2double_rd(unsigned long long int x) { return (double)x; }
617 __device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
618 __device__ static inline double __ull2double_ru(unsigned long long int x) { return (double)x; }
619 __device__ static inline double __ull2double_rz(unsigned long long int x) { return (double)x; }
620 
621 __device__ static inline float __ull2float_rd(unsigned long long int x) { return (float)x; }
622 __device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
623 __device__ static inline float __ull2float_ru(unsigned long long int x) { return (float)x; }
624 __device__ static inline float __ull2float_rz(unsigned long long int x) { return (float)x; }
625 
626 #if defined(__HCC__)
627 #define __HCC_OR_HIP_CLANG__ 1
628 #elif defined(__clang__) && defined(__HIP__)
629 #define __HCC_OR_HIP_CLANG__ 1
630 #else
631 #define __HCC_OR_HIP_CLANG__ 0
632 #endif
633 
634 #ifdef __HCC_OR_HIP_CLANG__
635 
636 // Clock functions
637 __device__ long long int __clock64();
638 __device__ long long int __clock();
639 __device__ long long int clock64();
640 __device__ long long int clock();
641 // hip.amdgcn.bc - named sync
642 __device__ void __named_sync(int a, int b);
643 
644 #ifdef __HIP_DEVICE_COMPILE__
645 
646 // Clock functions
647 #if __HCC__
648 extern "C" uint64_t __clock_u64() __HC__;
649 #endif
650 
651 __device__
652 inline __attribute((always_inline))
653 long long int __clock64() {
654 // ToDo: Unify HCC and HIP implementation.
655 #if __HCC__
656  return (long long int) __clock_u64();
657 #else
658  return (long long int) __builtin_amdgcn_s_memrealtime();
659 #endif
660 }
661 
662 __device__
663 inline __attribute((always_inline))
664 long long int __clock() { return __clock64(); }
665 
666 __device__
667 inline __attribute__((always_inline))
668 long long int clock64() { return __clock64(); }
669 
670 __device__
671 inline __attribute__((always_inline))
672 long long int clock() { return __clock(); }
673 
674 // hip.amdgcn.bc - named sync
675 __device__
676 inline
677 void __named_sync(int a, int b) { __builtin_amdgcn_s_barrier(); }
678 
679 #endif // __HIP_DEVICE_COMPILE__
680 
681 // warp vote function __all __any __ballot
682 __device__
683 inline
684 int __all(int predicate) {
685  return __ockl_wfall_i32(predicate);
686 }
687 
688 __device__
689 inline
690 int __any(int predicate) {
691  return __ockl_wfany_i32(predicate);
692 }
693 
694 // XXX from llvm/include/llvm/IR/InstrTypes.h
695 #define ICMP_NE 33
696 
697 __device__
698 inline
699 unsigned long long int __ballot(int predicate) {
700  return __llvm_amdgcn_icmp_i32(predicate, 0, ICMP_NE);
701 }
702 
703 __device__
704 inline
705 unsigned long long int __ballot64(int predicate) {
706  return __llvm_amdgcn_icmp_i32(predicate, 0, ICMP_NE);
707 }
708 
709 // hip.amdgcn.bc - lanemask
710 __device__
711 inline
712 int64_t __lanemask_gt()
713 {
714  int32_t activelane = __ockl_activelane_u32();
715  int64_t ballot = __ballot64(1);
716  if (activelane != 63) {
717  int64_t tmp = (~0UL) << (activelane + 1);
718  return tmp & ballot;
719  }
720  return 0;
721 }
722 
723 __device__
724 inline
725 int64_t __lanemask_lt()
726 {
727  int32_t activelane = __ockl_activelane_u32();
728  int64_t ballot = __ballot64(1);
729  if (activelane == 0)
730  return 0;
731  return ballot;
732 }
733 
734 __device__ inline void* __local_to_generic(void* p) { return p; }
735 
736 #ifdef __HIP_DEVICE_COMPILE__
737 __device__
738 inline
739 void* __get_dynamicgroupbaseptr()
740 {
741  // Get group segment base pointer.
742  return (char*)__local_to_generic((void*)__to_local(__llvm_amdgcn_groupstaticsize()));
743 }
744 #else
745 __device__
746 void* __get_dynamicgroupbaseptr();
747 #endif // __HIP_DEVICE_COMPILE__
748 
749 __device__
750 inline
751 void *__amdgcn_get_dynamicgroupbaseptr() {
752  return __get_dynamicgroupbaseptr();
753 }
754 
755 
756 
757 // hip.amdgcn.bc - sync threads
758 #define __CLK_LOCAL_MEM_FENCE 0x01
759 typedef unsigned __cl_mem_fence_flags;
760 
761 typedef enum __memory_scope {
762  __memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
763  __memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
764  __memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
765  __memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
766  __memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
767 } __memory_scope;
768 
769 // enum values aligned with what clang uses in EmitAtomicExpr()
770 typedef enum __memory_order
771 {
772  __memory_order_relaxed = __ATOMIC_RELAXED,
773  __memory_order_acquire = __ATOMIC_ACQUIRE,
774  __memory_order_release = __ATOMIC_RELEASE,
775  __memory_order_acq_rel = __ATOMIC_ACQ_REL,
776  __memory_order_seq_cst = __ATOMIC_SEQ_CST
777 } __memory_order;
778 
779 __device__
780 inline
781 static void
782 __atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __memory_scope scope)
783 {
784  // We're tying global-happens-before and local-happens-before together as does HSA
785  if (order != __memory_order_relaxed) {
786  switch (scope) {
787  case __memory_scope_work_item:
788  break;
789  case __memory_scope_sub_group:
790  switch (order) {
791  case __memory_order_relaxed: break;
792  case __memory_order_acquire: __llvm_fence_acq_sg(); break;
793  case __memory_order_release: __llvm_fence_rel_sg(); break;
794  case __memory_order_acq_rel: __llvm_fence_ar_sg(); break;
795  case __memory_order_seq_cst: __llvm_fence_sc_sg(); break;
796  }
797  break;
798  case __memory_scope_work_group:
799  switch (order) {
800  case __memory_order_relaxed: break;
801  case __memory_order_acquire: __llvm_fence_acq_wg(); break;
802  case __memory_order_release: __llvm_fence_rel_wg(); break;
803  case __memory_order_acq_rel: __llvm_fence_ar_wg(); break;
804  case __memory_order_seq_cst: __llvm_fence_sc_wg(); break;
805  }
806  break;
807  case __memory_scope_device:
808  switch (order) {
809  case __memory_order_relaxed: break;
810  case __memory_order_acquire: __llvm_fence_acq_dev(); break;
811  case __memory_order_release: __llvm_fence_rel_dev(); break;
812  case __memory_order_acq_rel: __llvm_fence_ar_dev(); break;
813  case __memory_order_seq_cst: __llvm_fence_sc_dev(); break;
814  }
815  break;
816  case __memory_scope_all_svm_devices:
817  switch (order) {
818  case __memory_order_relaxed: break;
819  case __memory_order_acquire: __llvm_fence_acq_sys(); break;
820  case __memory_order_release: __llvm_fence_rel_sys(); break;
821  case __memory_order_acq_rel: __llvm_fence_ar_sys(); break;
822  case __memory_order_seq_cst: __llvm_fence_sc_sys(); break;
823  }
824  break;
825  }
826  }
827 }
828 
829 // Memory Fence Functions
830 __device__
831 inline
832 static void __threadfence()
833 {
834  __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
835 }
836 
837 __device__
838 inline
839 static void __threadfence_block()
840 {
841  __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
842 }
843 
844 __device__
845 inline
846 static void __threadfence_system()
847 {
848  __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
849 }
850 
851 // abort
852 __device__
853 inline
854 __attribute__((weak))
855 void abort() {
856  return __builtin_trap();
857 }
858 
859 
860 #endif // __HCC_OR_HIP_CLANG__
861 
862 #ifdef __HCC__
863 
868 // Macro to replace extern __shared__ declarations
869 // to local variable definitions
870 #define HIP_DYNAMIC_SHARED(type, var) type* var = (type*)__get_dynamicgroupbaseptr();
871 
872 #define HIP_DYNAMIC_SHARED_ATTRIBUTE
873 
874 
875 #elif defined(__clang__) && defined(__HIP__)
876 
877 #pragma push_macro("__DEVICE__")
878 #define __DEVICE__ extern "C" __device__ __attribute__((always_inline)) \
879  __attribute__((weak))
880 
881 __DEVICE__
882 inline
883 void __assert_fail(const char * __assertion,
884  const char *__file,
885  unsigned int __line,
886  const char *__function)
887 {
888  // Ignore all the args for now.
889  __builtin_trap();
890 }
891 
892 __DEVICE__
893 inline
894 void __assertfail(const char * __assertion,
895  const char *__file,
896  unsigned int __line,
897  const char *__function,
898  size_t charsize)
899 {
900  // ignore all the args for now.
901  __builtin_trap();
902 }
903 
904 __device__
905 inline
906 static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
907 {
908  if (flags) {
909  __atomic_work_item_fence(flags, __memory_order_release, scope);
910  __builtin_amdgcn_s_barrier();
911  __atomic_work_item_fence(flags, __memory_order_acquire, scope);
912  } else {
913  __builtin_amdgcn_s_barrier();
914  }
915 }
916 
917 __device__
918 inline
919 static void __barrier(int n)
920 {
921  __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
922 }
923 
924 __device__
925 inline
926 __attribute__((noduplicate))
927 void __syncthreads()
928 {
929  __barrier(__CLK_LOCAL_MEM_FENCE);
930 }
931 
932 // hip.amdgcn.bc - device routine
933 /*
934  HW_ID Register bit structure
935  WAVE_ID 3:0 Wave buffer slot number. 0-9.
936  SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
937  PIPE_ID 7:6 Pipeline from which the wave was dispatched.
938  CU_ID 11:8 Compute Unit the wave is assigned to.
939  SH_ID 12 Shader Array (within an SE) the wave is assigned to.
940  SE_ID 14:13 Shader Engine the wave is assigned to.
941  TG_ID 19:16 Thread-group ID
942  VM_ID 23:20 Virtual Memory ID
943  QUEUE_ID 26:24 Queue from which this wave was dispatched.
944  STATE_ID 29:27 State ID (graphics only, not compute).
945  ME_ID 31:30 Micro-engine ID.
946  */
947 
948 #define HW_ID 4
949 
950 #define HW_ID_CU_ID_SIZE 4
951 #define HW_ID_CU_ID_OFFSET 8
952 
953 #define HW_ID_SE_ID_SIZE 2
954 #define HW_ID_SE_ID_OFFSET 13
955 
956 /*
957  Encoding of parameter bitmask
958  HW_ID 5:0 HW_ID
959  OFFSET 10:6 Range: 0..31
960  SIZE 15:11 Range: 1..32
961  */
962 
963 #define GETREG_IMMED(SZ,OFF,REG) (SZ << 11) | (OFF << 6) | REG
964 
965 __device__
966 inline
967 unsigned __smid(void)
968 {
969  unsigned cu_id = __builtin_amdgcn_s_getreg(
970  GETREG_IMMED(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID));
971  unsigned se_id = __builtin_amdgcn_s_getreg(
972  GETREG_IMMED(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID));
973 
974  /* Each shader engine has 16 CU */
975  return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
976 }
977 
978 #pragma push_macro("__DEVICE__")
979 
980 // Macro to replace extern __shared__ declarations
981 // to local variable definitions
982 #define HIP_DYNAMIC_SHARED(type, var) \
983  type* var = (type*)__amdgcn_get_dynamicgroupbaseptr();
984 
985 #define HIP_DYNAMIC_SHARED_ATTRIBUTE
986 
987 
988 #endif //defined(__clang__) && defined(__HIP__)
989 
990 
991 // loop unrolling
992 static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
993  auto dstPtr = static_cast<unsigned char*>(dst);
994  auto srcPtr = static_cast<const unsigned char*>(src);
995 
996  while (size >= 4u) {
997  dstPtr[0] = srcPtr[0];
998  dstPtr[1] = srcPtr[1];
999  dstPtr[2] = srcPtr[2];
1000  dstPtr[3] = srcPtr[3];
1001 
1002  size -= 4u;
1003  srcPtr += 4u;
1004  dstPtr += 4u;
1005  }
1006  switch (size) {
1007  case 3:
1008  dstPtr[2] = srcPtr[2];
1009  case 2:
1010  dstPtr[1] = srcPtr[1];
1011  case 1:
1012  dstPtr[0] = srcPtr[0];
1013  }
1014 
1015  return dst;
1016 }
1017 
1018 static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
1019  auto dstPtr = static_cast<unsigned char*>(dst);
1020 
1021  while (size >= 4u) {
1022  dstPtr[0] = val;
1023  dstPtr[1] = val;
1024  dstPtr[2] = val;
1025  dstPtr[3] = val;
1026 
1027  size -= 4u;
1028  dstPtr += 4u;
1029  }
1030  switch (size) {
1031  case 3:
1032  dstPtr[2] = val;
1033  case 2:
1034  dstPtr[1] = val;
1035  case 1:
1036  dstPtr[0] = val;
1037  }
1038 
1039  return dst;
1040 }
1041 static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
1042  return __hip_hc_memcpy(dst, src, size);
1043 }
1044 
1045 static inline __device__ void* memset(void* ptr, int val, size_t size) {
1046  unsigned char val8 = static_cast<unsigned char>(val);
1047  return __hip_hc_memset(ptr, val8, size);
1048 }
1049 
1050 #endif
TODO-doc.
Contains declarations for types and functions in device library.
Definition: device_functions.h:124
Definition: hip_vector_types.h:236
Definition: device_functions.h:228
Definition: device_functions.h:131
Contains declarations for wrapper functions for llvm intrinsics like llvm.amdgcn.s.barrier.