HIP: Heterogeneous-computing Interface for Portability
device_functions.h
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
23 #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
24 #define HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
25 
26 #include "host_defines.h"
27 #include "math_fwd.h"
28 
29 #include <hip/hip_runtime_api.h>
30 #include <stddef.h>
31 
32 
33 #include <hip/hip_vector_types.h>
36 /*
37 Integer Intrinsics
38 */
39 
// Integer intrinsics: __popc/__popcll (population count), __clz/__clzll
// (count leading zeros), __ffs/__ffsll (find first set, 1-based) and
// __brev/__brevll (bit reverse).
__device__ static inline unsigned int __popc(unsigned int input) {
    return __builtin_popcount(input);
}
__device__ static inline unsigned int __popcll(unsigned long long int input) {
    return __builtin_popcountll(input);
}

// NOTE(review): behaviour for input == 0 is whatever __ockl_clz_u32 returns
// (presumably 32, matching CUDA) — confirm against the OCKL spec.
__device__ static inline int __clz(int input) {
    return __ockl_clz_u32((uint)input);
}

__device__ static inline int __clzll(long long int input) {
    return __ockl_clz_u64((ullong)input);
}

// __ffs returns 1 + index of the least-significant set bit, or 0 when the
// input is 0 (the ternary's -1 cancels against the trailing +1).
__device__ static inline unsigned int __ffs(unsigned int input) {
    return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}

__device__ static inline unsigned int __ffsll(unsigned long long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}

__device__ static inline unsigned int __ffs(int input) {
    return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}

__device__ static inline unsigned int __ffsll(long long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}

__device__ static inline unsigned int __brev(unsigned int input) {
    return __llvm_bitrev_b32(input);
}

__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
    return __llvm_bitrev_b64(input);
}
79 
80 __device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
81  return input == 0 ? -1 : __builtin_ctzl(input);
82 }
83 
// Extract a `width`-bit field starting at bit `offset` from src0.
// offset and width are taken modulo 32; width == 0 yields 0.
// NOTE(review): if offset + width > 32 the left shift count is out of range
// (undefined behaviour) — assumes callers keep offset + width <= 32.
__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
    uint32_t offset = src1 & 31;
    uint32_t width = src2 & 31;
    return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
}

// 64-bit variant: offset/width modulo 64; same offset + width <= 64 assumption.
__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
    uint64_t offset = src1 & 63;
    uint64_t width = src2 & 63;
    return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
}
95 
96 __device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
97  uint32_t offset = src2 & 31;
98  uint32_t width = src3 & 31;
99  uint32_t mask = (1 << width) - 1;
100  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
101 }
102 
// Insert the low `width` bits of src1 into src0 at bit `offset` (modulo 64).
// NOTE(review): width == 0 (src3 a multiple of 64) yields mask 0 and returns
// src0 unchanged — a full 64-bit insert cannot be expressed.
__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
    uint64_t offset = src2 & 63;
    uint64_t width = src3 & 63;
    uint64_t mask = (1ULL << width) - 1;
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}
109 
// Forward declarations of the integer arithmetic intrinsics defined below.
__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
__device__ static unsigned int __hadd(int x, int y);
__device__ static int __mul24(int x, int y);
__device__ static long long int __mul64hi(long long int x, long long int y);
__device__ static int __mulhi(int x, int y);
__device__ static int __rhadd(int x, int y);
__device__ static unsigned int __sad(int x, int y, int z);
__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
__device__ static int __umul24(unsigned int x, unsigned int y);
__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
123 
// Helper unions for byte-level access to 32- and 64-bit values
// (used by __byte_perm below).
struct ucharHolder {
    union {
        unsigned char c[4];
        unsigned int ui;
    };
} __attribute__((aligned(4)));

struct uchar2Holder {
    union {
        unsigned int ui[2];
        unsigned char c[8];
    };
} __attribute__((aligned(8)));
137 
// Build a 32-bit result by selecting four bytes out of the 8-byte pair
// {x, y}: selector byte i of `s` picks the output's byte i.
// NOTE(review): selector byte values above 7 index past the 8-byte union —
// CUDA's __byte_perm defines only the low nibble of each selector; callers
// must keep each selector byte in [0, 7].
__device__
static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
    struct uchar2Holder cHoldVal;
    struct ucharHolder cHoldKey;
    struct ucharHolder cHoldOut;
    cHoldKey.ui = s;        // four selector bytes
    cHoldVal.ui[0] = x;     // bytes 0..3
    cHoldVal.ui[1] = y;     // bytes 4..7
    cHoldOut.c[0] = cHoldVal.c[cHoldKey.c[0]];
    cHoldOut.c[1] = cHoldVal.c[cHoldKey.c[1]];
    cHoldOut.c[2] = cHoldVal.c[cHoldKey.c[2]];
    cHoldOut.c[3] = cHoldVal.c[cHoldKey.c[3]];
    return cHoldOut.ui;
}
152 
153 __device__ static inline unsigned int __hadd(int x, int y) {
154  int z = x + y;
155  int sign = z & 0x8000000;
156  int value = z & 0x7FFFFFFF;
157  return ((value) >> 1 || sign);
158 }
159 
// 24-bit signed multiply (uses only the operands' low 24 bits).
__device__ static inline int __mul24(int x, int y) {
    return __ockl_mul24_i32(x, y);
}

// High 64 bits of the 128-bit signed product x * y, computed by schoolbook
// decomposition into 32-bit halves.
__device__ static inline long long __mul64hi(long long int x, long long int y) {
    ulong x0 = (ulong)x & 0xffffffffUL;
    long x1 = x >> 32;      // arithmetic shift keeps the sign
    ulong y0 = (ulong)y & 0xffffffffUL;
    long y1 = y >> 32;
    ulong z0 = x0*y0;
    long t = x1*y0 + (z0 >> 32);
    long z1 = t & 0xffffffffL;
    long z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}

// High 32 bits of the 64-bit signed product x * y.
__device__ static inline int __mulhi(int x, int y) {
    return __ockl_mul_hi_i32(x, y);
}
180 
181 __device__ static inline int __rhadd(int x, int y) {
182  int z = x + y + 1;
183  int sign = z & 0x8000000;
184  int value = z & 0x7FFFFFFF;
185  return ((value) >> 1 || sign);
186 }
// Sum of absolute difference: |x - y| + z on signed operands, returned unsigned.
__device__ static inline unsigned int __sad(int x, int y, int z) {
    return x > y ? x - y + z : y - x + z;
}
190 __device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
191  return (x + y) >> 1;
192 }
// 24-bit unsigned multiply.
// NOTE(review): return type is `int`, matching the forward declaration above,
// although CUDA's __umul24 returns unsigned — confirm before changing.
__device__ static inline int __umul24(unsigned int x, unsigned int y) {
    return __ockl_mul24_u32(x, y);
}
196 
// High 64 bits of the 128-bit unsigned product x * y (schoolbook
// decomposition into 32-bit halves).
__device__
static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
    ulong x0 = x & 0xffffffffUL;
    ulong x1 = x >> 32;
    ulong y0 = y & 0xffffffffUL;
    ulong y1 = y >> 32;
    ulong z0 = x0*y0;
    ulong t = x1*y0 + (z0 >> 32);
    ulong z1 = t & 0xffffffffUL;
    ulong z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}

// High 32 bits of the 64-bit unsigned product x * y.
__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
    return __ockl_mul_hi_u32(x, y);
}
214 __device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
215  return (x + y + 1) >> 1;
216 }
// Unsigned sum of absolute difference: |x - y| + z.
__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
    return __ockl_sad_u32(x, y, z);
}

// Lane index of the calling thread within its wavefront, derived from the
// masked bit counts of a full execution mask.
__device__ static inline unsigned int __lane_id() { return __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); }
222 
/*
HIP specific device functions
*/

// Backward (pull) lane permute: each lane reads the value held by the lane
// selected by `index` (a byte address, i.e. lane * 4). The union bit-casts
// between the caller's payload type and the int the intrinsic operates on.
__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
    return tmp.u;
}

__device__ static inline float __hip_ds_bpermutef(int index, float src) {
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
    tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
    return tmp.f;
}

// Forward (push) lane permute: each lane writes its value to the lane
// selected by `index`.
__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
    return tmp.u;
}
244 
245 __device__ static inline float __hip_ds_permutef(int index, float src) {
246  union { int i; unsigned u; float f; } tmp; tmp.u = src;
247  tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
248  return tmp.u;
249 }
250 
// ds_swizzle requires a compile-time pattern, so the public spellings forward
// to templates carrying the pattern as a non-type template argument.
#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))

// Intra-wavefront lane swizzle of a 32-bit value.
template <int pattern>
__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
#if defined(__HCC__)
    tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
#else
    tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
#endif
    return tmp.u;
}

// Float variant: the payload is bit-cast through the union, not converted.
template <int pattern>
__device__ static inline float __hip_ds_swizzlef_N(float src) {
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
#if defined(__HCC__)
    tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
#else
    tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
#endif
    return tmp.f;
}

// DPP cross-lane move; all control operands must be compile-time constants,
// hence the template indirection.
#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
    __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))

template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
__device__ static inline int __hip_move_dpp_N(int src) {
    return __llvm_amdgcn_move_dpp(src, dpp_ctrl, row_mask, bank_mask,
                                  bound_ctrl);
}
284 
// AMD GCN wavefronts are 64 lanes wide.
static constexpr int warpSize = 64;

// Read `var` from lane `src_lane` within the caller's width-sized segment of
// the wavefront. `width` must be a power of two.
__device__
inline
int __shfl(int var, int src_lane, int width = warpSize) {
    int self = __lane_id();
    // Base of this lane's width-segment plus the requested source lane.
    int index = src_lane + (self & ~(width-1));
    return __llvm_amdgcn_ds_bpermute(index<<2, var);  // bpermute is byte-addressed
}
294 __device__
295 inline
296 unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
297  union { int i; unsigned u; float f; } tmp; tmp.u = var;
298  tmp.i = __shfl(tmp.i, src_lane, width);
299  return tmp.u;
300 }
301 __device__
302 inline
303 float __shfl(float var, int src_lane, int width = warpSize) {
304  union { int i; unsigned u; float f; } tmp; tmp.f = var;
305  tmp.i = __shfl(tmp.i, src_lane, width);
306  return tmp.f;
307 }
// Double overload: split into two 32-bit halves, shuffle each, reassemble.
__device__
inline
double __shfl(double var, int src_lane, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    // tmp[1] is the high word, tmp[0] the low word.
    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
322 
// Read `var` from the lane `lane_delta` positions below the caller; lanes
// whose source would fall outside their width-segment keep their own value.
__device__
inline
int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
    int self = __lane_id();
    int index = self - lane_delta;
    // Clamp: never read below the start of this lane's width-segment.
    index = (index < (self & ~(width-1)))?self:index;
    return __llvm_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_up(tmp.i, lane_delta, width);
    return tmp.u;
}
__device__
inline
float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_up(tmp.i, lane_delta, width);
    return tmp.f;
}
// Double variant: shuffle the two 32-bit halves independently, reassemble.
__device__
inline
double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
359 
// Read `var` from the lane `lane_delta` positions above the caller; lanes
// whose source would fall outside their width-segment keep their own value.
__device__
inline
int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
    int self = __lane_id();
    int index = self + lane_delta;
    // Clamp: never read past the end of this lane's width-segment.
    index = (int)((self&(width-1))+lane_delta) >= width?self:index;
    return __llvm_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_down(tmp.i, lane_delta, width);
    return tmp.u;
}
__device__
inline
float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_down(tmp.i, lane_delta, width);
    return tmp.f;
}
// Double variant: shuffle the two 32-bit halves independently, reassemble.
__device__
inline
double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
396 
// Read `var` from the lane whose index is the caller's XOR `lane_mask`;
// lanes whose partner falls outside their width-segment keep their own value.
__device__
inline
int __shfl_xor(int var, int lane_mask, int width = warpSize) {
    int self = __lane_id();
    int index = self^lane_mask;
    // Clamp: partner must stay within this lane's width-segment.
    index = index >= ((self+width)&~(width-1))?self:index;
    return __llvm_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
    return tmp.u;
}
__device__
inline
float __shfl_xor(float var, int lane_mask, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
    return tmp.f;
}
// Double variant: shuffle the two 32-bit halves independently, reassemble.
__device__
inline
double __shfl_xor(double var, int lane_mask, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
433 
// Byte masks for the packed 8-bit SIMD emulation below: MASK1 selects bytes
// 0 and 2, MASK2 selects bytes 1 and 3 of a 32-bit word.
#define MASK1 0x00ff00ff
#define MASK2 0xff00ff00

// Packed add of four 8-bit lanes: even and odd bytes are handled in separate
// passes so carries cannot cross a byte boundary within a pass.
// NOTE(review): only the .w component of the char4 participates — this relies
// on hip_vector_types' char4 treating .w as carrying the packed 32-bit
// payload; confirm against that header.
__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 + one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 + one2) & MASK2);
    return out;
}

// Packed subtract of four 8-bit lanes (same two-pass even/odd scheme).
__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 - one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 - one2) & MASK2);
    return out;
}

// Packed multiply of four 8-bit lanes (same two-pass even/odd scheme).
__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 * one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 * one2) & MASK2);
    return out;
}
469 
470 /*
471  * Rounding modes are not yet supported in HIP
472  * TODO: Conversion functions are not correct, need to fix when BE is ready
473 */
474 
475 __device__ static inline float __double2float_rd(double x) { return (double)x; }
476 __device__ static inline float __double2float_rn(double x) { return (double)x; }
477 __device__ static inline float __double2float_ru(double x) { return (double)x; }
478 __device__ static inline float __double2float_rz(double x) { return (double)x; }
479 
// High 32-bit word of a double's bit pattern (index 1 on little-endian).
__device__ static inline int __double2hiint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[1];
}
// Low 32-bit word of a double's bit pattern.
__device__ static inline int __double2loint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[0];
}

// Double -> integer conversions. The requested rounding mode is ignored;
// every variant truncates via the plain C cast (see the TODO above).
__device__ static inline int __double2int_rd(double x) { return (int)x; }
__device__ static inline int __double2int_rn(double x) { return (int)x; }
__device__ static inline int __double2int_ru(double x) { return (int)x; }
__device__ static inline int __double2int_rz(double x) { return (int)x; }

__device__ static inline long long int __double2ll_rd(double x) { return (long long int)x; }
__device__ static inline long long int __double2ll_rn(double x) { return (long long int)x; }
__device__ static inline long long int __double2ll_ru(double x) { return (long long int)x; }
__device__ static inline long long int __double2ll_rz(double x) { return (long long int)x; }

__device__ static inline unsigned int __double2uint_rd(double x) { return (unsigned int)x; }
__device__ static inline unsigned int __double2uint_rn(double x) { return (unsigned int)x; }
__device__ static inline unsigned int __double2uint_ru(double x) { return (unsigned int)x; }
__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }

__device__ static inline unsigned long long int __double2ull_rd(double x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __double2ull_rn(double x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __double2ull_ru(double x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __double2ull_rz(double x) {
    return (unsigned long long int)x;
}

// Reinterpret the bits of a double as a long long (no value conversion).
__device__ static inline long long int __double_as_longlong(double x) {
    static_assert(sizeof(long long) == sizeof(double), "");

    long long tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
533 
534 /*
535 __device__ unsigned short __float2half_rn(float x);
536 __device__ float __half2float(unsigned short);
537 
The above device functions are not valid here.
539 Use
540 __device__ __half __float2half_rn(float x);
541 __device__ float __half2float(__half);
542 from hip_fp16.h
543 
544 CUDA implements half as unsigned short whereas, HIP doesn't.
545 
546 */
547 
// Float -> int conversions with the rounding mode honoured via OCML:
// floor (rd), rint (rn), ceil (ru), trunc (rz).
__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }

// Remaining float -> integer conversions ignore the requested rounding mode
// and truncate via the plain C cast (see the TODO above).
__device__ static inline long long int __float2ll_rd(float x) { return (long long int)x; }
__device__ static inline long long int __float2ll_rn(float x) { return (long long int)x; }
__device__ static inline long long int __float2ll_ru(float x) { return (long long int)x; }
__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }

__device__ static inline unsigned int __float2uint_rd(float x) { return (unsigned int)x; }
__device__ static inline unsigned int __float2uint_rn(float x) { return (unsigned int)x; }
__device__ static inline unsigned int __float2uint_ru(float x) { return (unsigned int)x; }
__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }

__device__ static inline unsigned long long int __float2ull_rd(float x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __float2ull_rn(float x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __float2ull_ru(float x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __float2ull_rz(float x) {
    return (unsigned long long int)x;
}

// Reinterpret a float's bits as a (signed/unsigned) 32-bit integer.
__device__ static inline int __float_as_int(float x) {
    static_assert(sizeof(int) == sizeof(float), "");

    int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__ static inline unsigned int __float_as_uint(float x) {
    static_assert(sizeof(unsigned int) == sizeof(float), "");

    unsigned int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

// Assemble a double from its high (hi) and low (lo) 32-bit words.
__device__ static inline double __hiloint2double(int hi, int lo) {
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    double tmp1;
    __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));

    return tmp1;
}
603 
// int -> double is always exact; a single round-to-nearest variant suffices.
__device__ static inline double __int2double_rn(int x) { return (double)x; }

// int/long long -> float conversions; the rounding-mode suffix is ignored
// and the default conversion is used (see the TODO above).
__device__ static inline float __int2float_rd(int x) { return (float)x; }
__device__ static inline float __int2float_rn(int x) { return (float)x; }
__device__ static inline float __int2float_ru(int x) { return (float)x; }
__device__ static inline float __int2float_rz(int x) { return (float)x; }

// Reinterpret a 32-bit integer's bits as a float.
__device__ static inline float __int_as_float(int x) {
    static_assert(sizeof(float) == sizeof(int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__ static inline double __ll2double_rd(long long int x) { return (double)x; }
__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
__device__ static inline double __ll2double_ru(long long int x) { return (double)x; }
__device__ static inline double __ll2double_rz(long long int x) { return (double)x; }

__device__ static inline float __ll2float_rd(long long int x) { return (float)x; }
__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
__device__ static inline float __ll2float_ru(long long int x) { return (float)x; }
__device__ static inline float __ll2float_rz(long long int x) { return (float)x; }

// Reinterpret a long long's bits as a double (no value conversion).
__device__ static inline double __longlong_as_double(long long int x) {
    static_assert(sizeof(double) == sizeof(long long), "");

    double tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
638 
639 __device__ static inline double __uint2double_rn(int x) { return (double)x; }
640 
// Unsigned -> float conversions; the rounding-mode suffix is ignored and the
// default conversion is used (see the TODO above).
__device__ static inline float __uint2float_rd(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_ru(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_rz(unsigned int x) { return (float)x; }

// Reinterpret a 32-bit unsigned integer's bits as a float.
__device__ static inline float __uint_as_float(unsigned int x) {
    static_assert(sizeof(float) == sizeof(unsigned int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__ static inline double __ull2double_rd(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_ru(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_rz(unsigned long long int x) { return (double)x; }

__device__ static inline float __ull2float_rd(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_ru(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_rz(unsigned long long int x) { return (float)x; }
664 
665 #if defined(__HCC__)
666 #define __HCC_OR_HIP_CLANG__ 1
667 #elif defined(__clang__) && defined(__HIP__)
668 #define __HCC_OR_HIP_CLANG__ 1
669 #else
670 #define __HCC_OR_HIP_CLANG__ 0
671 #endif
672 
673 #ifdef __HCC_OR_HIP_CLANG__
674 
// Clock functions (declarations; device definitions follow under
// __HIP_DEVICE_COMPILE__).
__device__ long long int __clock64();
__device__ long long int __clock();
__device__ long long int clock64();
__device__ long long int clock();
// hip.amdgcn.bc - named sync
__device__ void __named_sync(int a, int b);
682 
#ifdef __HIP_DEVICE_COMPILE__

// Clock functions
#if __HCC__
extern "C" uint64_t __clock_u64() __HC__;
#endif

// Free-running hardware cycle counter; units are GPU clock cycles.
__device__
inline __attribute((always_inline))
long long int __clock64() {
    return (long long int) __builtin_readcyclecounter();
}

__device__
inline __attribute((always_inline))
long long int __clock() { return __clock64(); }

__device__
inline __attribute__((always_inline))
long long int clock64() { return __clock64(); }

__device__
inline __attribute__((always_inline))
long long int clock() { return __clock(); }

// hip.amdgcn.bc - named sync
// NOTE(review): the a/b arguments are ignored here; this degenerates to a
// plain workgroup barrier.
__device__
inline
void __named_sync(int a, int b) { __builtin_amdgcn_s_barrier(); }

#endif // __HIP_DEVICE_COMPILE__
714 
// warp vote function __all __any __ballot
// Non-zero iff `predicate` is non-zero on every active lane of the wavefront.
__device__
inline
int __all(int predicate) {
    return __ockl_wfall_i32(predicate);
}

// Non-zero iff `predicate` is non-zero on at least one active lane.
__device__
inline
int __any(int predicate) {
    return __ockl_wfany_i32(predicate);
}

// XXX from llvm/include/llvm/IR/InstrTypes.h
#define ICMP_NE 33

// 64-bit mask with one bit per lane, set where `predicate` is non-zero.
__device__
inline
unsigned long long int __ballot(int predicate) {
    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}

__device__
inline
unsigned long long int __ballot64(int predicate) {
    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}
742 
// hip.amdgcn.bc - lanemask
// Mask of active lanes with an index strictly greater than the caller's.
__device__
inline
uint64_t __lanemask_gt()
{
    uint32_t lane = __ockl_lane_u32();
    if (lane == 63)
        return 0;  // top lane: no higher lane exists
    uint64_t ballot = __ballot64(1);  // currently-active lanes
    uint64_t mask = (~((uint64_t)0)) << (lane + 1);
    return mask & ballot;
}
755 
756 __device__
757 inline
758 uint64_t __lanemask_lt()
759 {
760  uint32_t lane = __ockl_lane_u32();
761  int64_t ballot = __ballot64(1);
762  uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
763  return mask & ballot;
764 }
765 
766 __device__
767 inline
768 uint64_t __lanemask_eq()
769 {
770  uint32_t lane = __ockl_lane_u32();
771  int64_t mask = ((uint64_t)1 << lane);
772  return mask;
773 }
774 
775 
// LDS (local) pointers are directly usable as generic pointers here; identity cast.
__device__ inline void* __local_to_generic(void* p) { return p; }
777 
#ifdef __HIP_DEVICE_COMPILE__
// Base pointer of the dynamically-allocated portion of the workgroup's LDS,
// which begins immediately after the statically-allocated portion (whose size
// is __llvm_amdgcn_groupstaticsize()).
__device__
inline
void* __get_dynamicgroupbaseptr()
{
    // Get group segment base pointer.
    return (char*)__local_to_generic((void*)__to_local(__llvm_amdgcn_groupstaticsize()));
}
#else
__device__
void* __get_dynamicgroupbaseptr();
#endif // __HIP_DEVICE_COMPILE__

// Alias kept for the amdgcn bitcode interface.
__device__
inline
void *__amdgcn_get_dynamicgroupbaseptr() {
    return __get_dynamicgroupbaseptr();
}
796 
#if defined(__HCC__) && (__hcc_minor__ < 3)
// hip.amdgcn.bc - sync threads
// Older HCC does not provide the OpenCL fence machinery; define it here.
#define __CLK_LOCAL_MEM_FENCE 0x01
typedef unsigned __cl_mem_fence_flags;

// OpenCL-style memory scopes, mapped onto the compiler's scope constants.
typedef enum __memory_scope {
    __memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
    __memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
    __memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
    __memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
    __memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
} __memory_scope;

// enum values aligned with what clang uses in EmitAtomicExpr()
typedef enum __memory_order
{
    __memory_order_relaxed = __ATOMIC_RELAXED,
    __memory_order_acquire = __ATOMIC_ACQUIRE,
    __memory_order_release = __ATOMIC_RELEASE,
    __memory_order_acq_rel = __ATOMIC_ACQ_REL,
    __memory_order_seq_cst = __ATOMIC_SEQ_CST
} __memory_order;

// Emit the fence matching the requested (order, scope) pair. Relaxed order
// and work-item scope require no fence at all; the flags argument is unused.
__device__
inline
static void
__atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __memory_scope scope)
{
    // We're tying global-happens-before and local-happens-before together as does HSA
    if (order != __memory_order_relaxed) {
        switch (scope) {
        case __memory_scope_work_item:
            break;
        case __memory_scope_sub_group:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_sg(); break;
            case __memory_order_release: __llvm_fence_rel_sg(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_sg(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_sg(); break;
            }
            break;
        case __memory_scope_work_group:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_wg(); break;
            case __memory_order_release: __llvm_fence_rel_wg(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_wg(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_wg(); break;
            }
            break;
        case __memory_scope_device:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_dev(); break;
            case __memory_order_release: __llvm_fence_rel_dev(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_dev(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_dev(); break;
            }
            break;
        case __memory_scope_all_svm_devices:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_sys(); break;
            case __memory_order_release: __llvm_fence_rel_sys(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_sys(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_sys(); break;
            }
            break;
        }
    }
}
#endif
870 
// Memory Fence Functions
// Order memory operations as seen by all threads on the device.
__device__
inline
static void __threadfence()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
}

// Order memory operations within the workgroup (thread block).
__device__
inline
static void __threadfence_block()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
}

// Order memory operations system-wide (all devices and the host).
__device__
inline
static void __threadfence_system()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
}

// abort
// Device-side abort: traps the wavefront. Weak so user code may override it.
__device__
inline
__attribute__((weak))
void abort() {
    return __builtin_trap();
}
900 
901 
902 #endif // __HCC_OR_HIP_CLANG__
903 
904 #ifdef __HCC__
905 
// Macro to replace extern __shared__ declarations
// to local variable definitions.
// Expands to a pointer of the requested type aliasing the kernel's
// dynamic group-segment (dynamic LDS) base returned by
// __get_dynamicgroupbaseptr().
#define HIP_DYNAMIC_SHARED(type, var) type* var = (type*)__get_dynamicgroupbaseptr();

// No extra declaration attribute is required on the HCC path; the macro is
// kept empty for source compatibility with the clang/__HIP__ path below.
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
915 
916 
917 #elif defined(__clang__) && defined(__HIP__)
918 
919 #pragma push_macro("__DEVICE__")
920 #define __DEVICE__ extern "C" __device__ __attribute__((always_inline)) \
921  __attribute__((weak))
922 
923 __DEVICE__
924 inline
925 void __assert_fail(const char * __assertion,
926  const char *__file,
927  unsigned int __line,
928  const char *__function)
929 {
930  // Ignore all the args for now.
931  __builtin_trap();
932 }
933 
934 __DEVICE__
935 inline
936 void __assertfail(const char * __assertion,
937  const char *__file,
938  unsigned int __line,
939  const char *__function,
940  size_t charsize)
941 {
942  // ignore all the args for now.
943  __builtin_trap();
944 }
945 
946 __device__
947 inline
948 static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
949 {
950  if (flags) {
951  __atomic_work_item_fence(flags, __memory_order_release, scope);
952  __builtin_amdgcn_s_barrier();
953  __atomic_work_item_fence(flags, __memory_order_acquire, scope);
954  } else {
955  __builtin_amdgcn_s_barrier();
956  }
957 }
958 
959 __device__
960 inline
961 static void __barrier(int n)
962 {
963  __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
964 }
965 
966 __device__
967 inline
968 __attribute__((convergent))
969 void __syncthreads()
970 {
971  __barrier(__CLK_LOCAL_MEM_FENCE);
972 }
973 
974 // hip.amdgcn.bc - device routine
975 /*
976  HW_ID Register bit structure
977  WAVE_ID 3:0 Wave buffer slot number. 0-9.
978  SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
979  PIPE_ID 7:6 Pipeline from which the wave was dispatched.
980  CU_ID 11:8 Compute Unit the wave is assigned to.
981  SH_ID 12 Shader Array (within an SE) the wave is assigned to.
982  SE_ID 14:13 Shader Engine the wave is assigned to.
983  TG_ID 19:16 Thread-group ID
984  VM_ID 23:20 Virtual Memory ID
985  QUEUE_ID 26:24 Queue from which this wave was dispatched.
986  STATE_ID 29:27 State ID (graphics only, not compute).
987  ME_ID 31:30 Micro-engine ID.
988  */
989 
// s_getreg hardware-register index: 4 selects HW_ID (bit layout documented
// in the comment above).
#define HW_ID 4

// CU_ID field of HW_ID: bits 11:8 (4 bits wide).
#define HW_ID_CU_ID_SIZE 4
#define HW_ID_CU_ID_OFFSET 8

// SE_ID field of HW_ID: bits 14:13 (2 bits wide).
#define HW_ID_SE_ID_SIZE 2
#define HW_ID_SE_ID_OFFSET 13

/*
 Encoding of parameter bitmask
 HW_ID 5:0 HW_ID
 OFFSET 10:6 Range: 0..31
 SIZE 15:11 Range: 1..32
 */

// Packs (size, offset, register) into the s_getreg immediate operand.
// Callers pass SZ as (field width - 1) because the hardware SIZE field is
// 1-based (see the __smid comment below).
#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
1006 
1007 /*
1008  __smid returns the wave's assigned Compute Unit and Shader Engine.
1009  The Compute Unit, CU_ID returned in bits 3:0, and Shader Engine, SE_ID in bits 5:4.
1010  Note: the results vary over time.
1011  SZ minus 1 since SIZE is 1-based.
1012 */
1013 __device__
1014 inline
1015 unsigned __smid(void)
1016 {
1017  unsigned cu_id = __builtin_amdgcn_s_getreg(
1018  GETREG_IMMED(HW_ID_CU_ID_SIZE-1, HW_ID_CU_ID_OFFSET, HW_ID));
1019  unsigned se_id = __builtin_amdgcn_s_getreg(
1020  GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
1021 
1022  /* Each shader engine has 16 CU */
1023  return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
1024 }
1025 
// Restore the caller's definition of __DEVICE__ (pushed at the top of this
// __HIP__ section). This was previously a second push_macro, which left the
// helper macro defined -- and pushed twice -- in every including translation
// unit; pop_macro is the balancing operation.
#pragma pop_macro("__DEVICE__")

// Macro to replace extern __shared__ declarations
// to local variable definitions.
// Expands to a pointer of the requested type aliasing the kernel's
// dynamic group-segment (dynamic LDS) base.
#define HIP_DYNAMIC_SHARED(type, var) \
    type* var = (type*)__amdgcn_get_dynamicgroupbaseptr();

// No extra declaration attribute is required on this path; kept for source
// compatibility with code written for other HIP targets.
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
1034 
1035 
1036 #endif //defined(__clang__) && defined(__HIP__)
1037 
1038 
1039 // loop unrolling
1040 static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
1041  auto dstPtr = static_cast<unsigned char*>(dst);
1042  auto srcPtr = static_cast<const unsigned char*>(src);
1043 
1044  while (size >= 4u) {
1045  dstPtr[0] = srcPtr[0];
1046  dstPtr[1] = srcPtr[1];
1047  dstPtr[2] = srcPtr[2];
1048  dstPtr[3] = srcPtr[3];
1049 
1050  size -= 4u;
1051  srcPtr += 4u;
1052  dstPtr += 4u;
1053  }
1054  switch (size) {
1055  case 3:
1056  dstPtr[2] = srcPtr[2];
1057  case 2:
1058  dstPtr[1] = srcPtr[1];
1059  case 1:
1060  dstPtr[0] = srcPtr[0];
1061  }
1062 
1063  return dst;
1064 }
1065 
1066 static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
1067  auto dstPtr = static_cast<unsigned char*>(dst);
1068 
1069  while (size >= 4u) {
1070  dstPtr[0] = val;
1071  dstPtr[1] = val;
1072  dstPtr[2] = val;
1073  dstPtr[3] = val;
1074 
1075  size -= 4u;
1076  dstPtr += 4u;
1077  }
1078  switch (size) {
1079  case 3:
1080  dstPtr[2] = val;
1081  case 2:
1082  dstPtr[1] = val;
1083  case 1:
1084  dstPtr[0] = val;
1085  }
1086 
1087  return dst;
1088 }
// Device-side memcpy overload: delegates to the unrolled byte copier.
// Regions must not overlap; returns dst.
static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
    return __hip_hc_memcpy(dst, src, size);
}
1092 
1093 static inline __device__ void* memset(void* ptr, int val, size_t size) {
1094  unsigned char val8 = static_cast<unsigned char>(val);
1095  return __hip_hc_memset(ptr, val8, size);
1096 }
1097 
1098 #endif
TODO-doc.
Contains declarations for types and functions in device library.
Definition: device_functions.h:124
Definition: device_functions.h:131
Contains declarations for wrapper functions for llvm intrinsics like llvm.amdgcn.s.barrier.