HIP: Heterogeneous-computing Interface for Portability
device_functions.h
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
23 #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
24 #define HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
25 
26 #include "host_defines.h"
27 #include "math_fwd.h"
28 
29 #include <hip/hip_runtime_api.h>
30 #include <stddef.h>
31 
32 
33 #include <hip/hip_vector_types.h>
36 /*
37 Integer Intrinsics
38 */
39 
40 // integer intrinsic function __poc __clz __ffs __brev
// Count the number of set bits in a 32-bit word (CUDA __popc equivalent).
__device__ static inline unsigned int __popc(unsigned int input) {
    return __builtin_popcount(input);
}
44 __device__ static inline unsigned int __popcll(unsigned long long int input) {
45  return __builtin_popcountl(input);
46 }
47 
// Count leading zero bits of a 32-bit word (CUDA __clz equivalent), via the
// OCKL device library.
// NOTE(review): CUDA defines __clz(0) == 32 — presumably __ockl_clz_u32
// matches; confirm against the ROCm device-libs.
__device__ static inline int __clz(int input) {
    return __ockl_clz_u32((uint)input);
}

// Count leading zero bits of a 64-bit word (CUDA __clzll equivalent).
__device__ static inline int __clzll(long long int input) {
    return __ockl_clz_u64((ulong)input);
}
55 
// 1-based position of the least-significant set bit; 0 when input is 0
// (CUDA __ffs semantics: the -1 sentinel plus 1 yields 0).
__device__ static inline unsigned int __ffs(unsigned int input) {
    return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}
59 
60 __device__ static inline unsigned int __ffsll(unsigned long long int input) {
61  return ( input == 0 ? -1 : __builtin_ctzl(input) ) + 1;
62 }
63 
// Signed-int overload of __ffs; same contract as the unsigned version.
__device__ static inline unsigned int __ffs(int input) {
    return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}
67 
68 __device__ static inline unsigned int __ffsll(long long int input) {
69  return ( input == 0 ? -1 : __builtin_ctzl(input) ) + 1;
70 }
71 
// Reverse the bit order of a 32-bit word (CUDA __brev equivalent).
__device__ static inline unsigned int __brev(unsigned int input) {
    return __llvm_bitrev_b32(input);
}

// Reverse the bit order of a 64-bit word (CUDA __brevll equivalent).
__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
    return __llvm_bitrev_b64(input);
}
79 
80 __device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
81  return input == 0 ? -1 : __builtin_ctzl(input);
82 }
83 
// Extract 'src2'-wide bit-field of src0 starting at bit 'src1'
// (HSAIL-style bitextract; offset and width are taken modulo 32).
// NOTE(review): when offset + width > 32 the left-shift count goes out of
// range, which is undefined behaviour in C++ — callers are expected to
// request a field that fits in the word.
__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
    uint32_t offset = src1 & 31;
    uint32_t width = src2 & 31;
    return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
}

// 64-bit variant of __bitextract_u32; offset and width taken modulo 64.
__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
    uint64_t offset = src1 & 63;
    uint64_t width = src2 & 63;
    return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
}
95 
96 __device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
97  uint32_t offset = src2 & 31;
98  uint32_t width = src3 & 31;
99  uint32_t mask = (1 << width) - 1;
100  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
101 }
102 
103 __device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
104  uint64_t offset = src2 & 63;
105  uint64_t width = src3 & 63;
106  uint64_t mask = (1 << width) - 1;
107  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
108 }
109 
110 __device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
111 __device__ static unsigned int __hadd(int x, int y);
112 __device__ static int __mul24(int x, int y);
113 __device__ static long long int __mul64hi(long long int x, long long int y);
114 __device__ static int __mulhi(int x, int y);
115 __device__ static int __rhadd(int x, int y);
116 __device__ static unsigned int __sad(int x, int y, int z);
117 __device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
118 __device__ static int __umul24(unsigned int x, unsigned int y);
119 __device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
120 __device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
121 __device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
122 __device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
123 
// Byte-addressable view of one 32-bit word; used by __byte_perm for
// byte selection (union type punning, supported by clang/hcc).
struct ucharHolder {
    union {
        unsigned char c[4];
        unsigned int ui;
    };
} __attribute__((aligned(4)));

// Byte-addressable view of two 32-bit words (8 selectable bytes);
// used by __byte_perm as the {x, y} source pool.
struct uchar2Holder {
    union {
        unsigned int ui[2];
        unsigned char c[8];
    };
} __attribute__((aligned(8)));
137 
138 __device__
139 static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
140  struct uchar2Holder cHoldVal;
141  struct ucharHolder cHoldKey;
142  struct ucharHolder cHoldOut;
143  cHoldKey.ui = s;
144  cHoldVal.ui[0] = x;
145  cHoldVal.ui[1] = y;
146  cHoldOut.c[0] = cHoldVal.c[cHoldKey.c[0]];
147  cHoldOut.c[1] = cHoldVal.c[cHoldKey.c[1]];
148  cHoldOut.c[2] = cHoldVal.c[cHoldKey.c[2]];
149  cHoldOut.c[3] = cHoldVal.c[cHoldKey.c[3]];
150  return cHoldOut.ui;
151 }
152 
153 __device__ static inline unsigned int __hadd(int x, int y) {
154  int z = x + y;
155  int sign = z & 0x8000000;
156  int value = z & 0x7FFFFFFF;
157  return ((value) >> 1 || sign);
158 }
159 
// Multiply the low 24 bits of x and y (CUDA __mul24), via the OCKL
// device library.
__device__ static inline int __mul24(int x, int y) {
    return __ockl_mul24_i32(x, y);
}
163 
// High 64 bits of the full 128-bit product of two signed 64-bit values
// (CUDA __mul64hi), built from four 32x32 partial products (schoolbook
// long multiplication). The high halves stay signed so the final result
// carries the correct sign.
__device__ static inline long long __mul64hi(long long int x, long long int y) {
    ulong x0 = (ulong)x & 0xffffffffUL;
    long x1 = x >> 32;
    ulong y0 = (ulong)y & 0xffffffffUL;
    long y1 = y >> 32;
    ulong z0 = x0*y0;
    // t accumulates the first cross product plus the carry out of z0.
    long t = x1*y0 + (z0 >> 32);
    long z1 = t & 0xffffffffL;
    long z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}
176 
// High 32 bits of the 64-bit product of two signed 32-bit values
// (CUDA __mulhi), via the OCKL device library.
__device__ static inline int __mulhi(int x, int y) {
    return __ockl_mul_hi_i32(x, y);
}
180 
181 __device__ static inline int __rhadd(int x, int y) {
182  int z = x + y + 1;
183  int sign = z & 0x8000000;
184  int value = z & 0x7FFFFFFF;
185  return ((value) >> 1 || sign);
186 }
187 __device__ static inline unsigned int __sad(int x, int y, int z) {
188  return x > y ? x - y + z : y - x + z;
189 }
190 __device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
191  return (x + y) >> 1;
192 }
// Multiply the low 24 bits of x and y, unsigned (CUDA __umul24), via the
// OCKL device library.
// NOTE(review): CUDA declares the return type as unsigned int; this file
// declares int (see the forward declaration above) — kept as-is for
// consistency with that declaration.
__device__ static inline int __umul24(unsigned int x, unsigned int y) {
    return __ockl_mul24_u32(x, y);
}
196 
// High 64 bits of the full 128-bit product of two unsigned 64-bit values
// (CUDA __umul64hi), built from four 32x32 partial products.
__device__
static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
    ulong x0 = x & 0xffffffffUL;
    ulong x1 = x >> 32;
    ulong y0 = y & 0xffffffffUL;
    ulong y1 = y >> 32;
    ulong z0 = x0*y0;
    // t accumulates the first cross product plus the carry out of z0.
    ulong t = x1*y0 + (z0 >> 32);
    ulong z1 = t & 0xffffffffUL;
    ulong z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}
210 
// High 32 bits of the 64-bit product of two unsigned 32-bit values
// (CUDA __umulhi), via the OCKL device library.
__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
    return __ockl_mul_hi_u32(x, y);
}
214 __device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
215  return (x + y + 1) >> 1;
216 }
// Unsigned sum of absolute difference: |x - y| + z (CUDA __usad), via the
// OCKL device library.
__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
    return __ockl_sad_u32(x, y, z);
}
220 
// This lane's index within the wavefront: counts the set bits of the full
// 64-bit execution mask below this lane (mbcnt lo then hi).
__device__ static inline unsigned int __lane_id() { return __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); }
222 
223 /*
224 HIP specific device functions
225 */
226 
// Backward lane permute (gather): each lane reads the value that lane
// 'index' holds. 'index' is a byte address — callers pass lane_id << 2,
// as required by the DS instruction. The union forwards the payload
// through the intrinsic's int operand bit-for-bit.
__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
    return tmp.u;
}

// Float variant of __hip_ds_bpermute: the float's bits travel through the
// int lane exchange unchanged.
__device__ static inline float __hip_ds_bpermutef(int index, float src) {
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
    tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
    return tmp.f;
}

// Forward lane permute (scatter): each lane pushes its value to lane
// 'index' (byte address, lane_id << 2).
__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
    return tmp.u;
}
244 
245 __device__ static inline float __hip_ds_permutef(int index, float src) {
246  union { int i; unsigned u; float f; } tmp; tmp.u = src;
247  tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
248  return tmp.u;
249 }
250 
251 #define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
252 #define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
253 
// Compile-time-pattern lane swizzle: exchanges 'src' across lanes per the
// ds_swizzle_b32 offset encoding. 'pattern' must be a constant expression,
// hence the template + macro (__hip_ds_swizzle) front end.
template <int pattern>
__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
#if defined(__HCC__)
    tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
#else
    tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
#endif
    return tmp.u;
}

// Float variant: the float's bits travel through the int swizzle unchanged.
template <int pattern>
__device__ static inline float __hip_ds_swizzlef_N(float src) {
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
#if defined(__HCC__)
    tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
#else
    tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
#endif
    return tmp.f;
}
275 
276 #define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
277  __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
278 
// DPP (data-parallel primitives) cross-lane move with compile-time control
// word, row/bank masks and bound-ctrl flag; fronted by the __hip_move_dpp
// macro so all operands stay constant expressions.
template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
__device__ static inline int __hip_move_dpp_N(int src) {
    return __llvm_amdgcn_move_dpp(src, dpp_ctrl, row_mask, bank_mask,
                                  bound_ctrl);
}
284 
285 static constexpr int warpSize = 64;
286 
// Pre-Volta-style warp shuffle (no mask argument): each lane reads 'var'
// from lane 'src_lane' of its own 'width'-wide sub-group. All lanes that
// participate are assumed active. width must be a power of two <= warpSize.
__device__
inline
int __shfl(int var, int src_lane, int width = warpSize) {
    int self = __lane_id();
    // Sub-group base lane comes from 'self'; src_lane indexes inside it.
    int index = src_lane + (self & ~(width-1));
    // <<2 converts the lane id to the byte address ds_bpermute expects.
    return __llvm_amdgcn_ds_bpermute(index<<2, var);
}
// Unsigned overload: bits are forwarded through the int path unchanged.
__device__
inline
unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl(tmp.i, src_lane, width);
    return tmp.u;
}
// Float overload: bits are forwarded through the int path unchanged.
__device__
inline
float __shfl(float var, int src_lane, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl(tmp.i, src_lane, width);
    return tmp.f;
}
// Double overload: shuffled as two independent 32-bit halves, then
// reassembled (high word in tmp[1], low word in tmp[0]).
__device__
inline
double __shfl(double var, int src_lane, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl(tmp[0], src_lane, width);
    tmp[1] = __shfl(tmp[1], src_lane, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
322 
// Shuffle up: each lane reads 'var' from the lane 'lane_delta' positions
// below it; lanes that would cross their sub-group's lower boundary keep
// their own value (read from self).
__device__
inline
int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
    int self = __lane_id();
    int index = self - lane_delta;
    // Clamp to self when the source falls below this sub-group's base lane.
    index = (index < (self & ~(width-1)))?self:index;
    return __llvm_amdgcn_ds_bpermute(index<<2, var);
}
// Unsigned overload: bits are forwarded through the int path unchanged.
__device__
inline
unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_up(tmp.i, lane_delta, width);
    return tmp.u;
}
// Float overload: bits are forwarded through the int path unchanged.
__device__
inline
float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_up(tmp.i, lane_delta, width);
    return tmp.f;
}
// Double overload: shuffled as two 32-bit halves, then reassembled.
__device__
inline
double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
    tmp[1] = __shfl_up(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
359 
// Shuffle down: each lane reads 'var' from the lane 'lane_delta' positions
// above it; lanes that would cross their sub-group's upper boundary keep
// their own value (read from self).
__device__
inline
int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
    int self = __lane_id();
    int index = self + lane_delta;
    // Clamp to self when the source lands past this sub-group's last lane.
    index = (int)((self&(width-1))+lane_delta) >= width?self:index;
    return __llvm_amdgcn_ds_bpermute(index<<2, var);
}
// Unsigned overload: bits are forwarded through the int path unchanged.
__device__
inline
unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_down(tmp.i, lane_delta, width);
    return tmp.u;
}
// Float overload: bits are forwarded through the int path unchanged.
__device__
inline
float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_down(tmp.i, lane_delta, width);
    return tmp.f;
}
// Double overload: shuffled as two 32-bit halves, then reassembled.
__device__
inline
double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
    tmp[1] = __shfl_down(tmp[1], lane_delta, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
396 
// Butterfly shuffle: each lane reads 'var' from the lane whose id is
// self XOR lane_mask; exchanges that would leave the sub-group keep the
// lane's own value.
__device__
inline
int __shfl_xor(int var, int lane_mask, int width = warpSize) {
    int self = __lane_id();
    int index = self^lane_mask;
    // Clamp to self when the XOR partner lies beyond this sub-group.
    index = index >= ((self+width)&~(width-1))?self:index;
    return __llvm_amdgcn_ds_bpermute(index<<2, var);
}
// Unsigned overload: bits are forwarded through the int path unchanged.
__device__
inline
unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.u = var;
    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
    return tmp.u;
}
// Float overload: bits are forwarded through the int path unchanged.
__device__
inline
float __shfl_xor(float var, int lane_mask, int width = warpSize) {
    union { int i; unsigned u; float f; } tmp; tmp.f = var;
    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
    return tmp.f;
}
// Double overload: shuffled as two 32-bit halves, then reassembled.
__device__
inline
double __shfl_xor(double var, int lane_mask, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
    double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
433 
434 #define MASK1 0x00ff00ff
435 #define MASK2 0xff00ff00
436 
// Packed byte-wise add: processes bytes 0/2 (MASK1) and bytes 1/3 (MASK2)
// in separate masked passes so a carry cannot cross a byte boundary.
// NOTE(review): only the .w component of char4 is read/written — this
// covers all four bytes only if char4's layout makes .w the packed word;
// verify against hip_vector_types.h.
__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 + one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 + one2) & MASK2);
    return out;
}
447 
// Packed byte-wise subtract; same two-pass masking scheme as
// __hip_hc_add8pk so borrows cannot cross byte boundaries.
// NOTE(review): operates on .w only — see the layout caveat on
// __hip_hc_add8pk.
__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 - one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 - one2) & MASK2);
    return out;
}
458 
// Packed byte-wise multiply; same two-pass masking scheme as
// __hip_hc_add8pk (products are truncated to their byte lanes).
// NOTE(review): operates on .w only — see the layout caveat on
// __hip_hc_add8pk.
__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 * one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 * one2) & MASK2);
    return out;
}
469 
470 /*
471  * Rounding modes are not yet supported in HIP
472  * TODO: Conversion functions are not correct, need to fix when BE is ready
473 */
474 
475 __device__ static inline float __double2float_rd(double x) { return (double)x; }
476 __device__ static inline float __double2float_rn(double x) { return (double)x; }
477 __device__ static inline float __double2float_ru(double x) { return (double)x; }
478 __device__ static inline float __double2float_rz(double x) { return (double)x; }
479 
// High 32 bits of a double's bit pattern (CUDA __double2hiint), extracted
// via memcpy — the well-defined way to type-pun in C++.
__device__ static inline int __double2hiint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[1];
}
// Low 32 bits of a double's bit pattern (CUDA __double2loint).
__device__ static inline int __double2loint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[0];
}
496 
// double -> integer conversions. Rounding-mode suffixes are nominal: every
// variant below currently truncates via the C cast (see the TODO above),
// so _rd/_rn/_ru presently behave like _rz.
__device__ static inline int __double2int_rd(double x) { return (int)x; }
__device__ static inline int __double2int_rn(double x) { return (int)x; }
__device__ static inline int __double2int_ru(double x) { return (int)x; }
__device__ static inline int __double2int_rz(double x) { return (int)x; }

__device__ static inline long long int __double2ll_rd(double x) { return (long long int)x; }
__device__ static inline long long int __double2ll_rn(double x) { return (long long int)x; }
__device__ static inline long long int __double2ll_ru(double x) { return (long long int)x; }
__device__ static inline long long int __double2ll_rz(double x) { return (long long int)x; }

__device__ static inline unsigned int __double2uint_rd(double x) { return (unsigned int)x; }
__device__ static inline unsigned int __double2uint_rn(double x) { return (unsigned int)x; }
__device__ static inline unsigned int __double2uint_ru(double x) { return (unsigned int)x; }
__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }

__device__ static inline unsigned long long int __double2ull_rd(double x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __double2ull_rn(double x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __double2ull_ru(double x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __double2ull_rz(double x) {
    return (unsigned long long int)x;
}

// Bit-reinterpret a double as long long (CUDA __double_as_longlong),
// via memcpy — the well-defined way to type-pun in C++.
__device__ static inline long long int __double_as_longlong(double x) {
    static_assert(sizeof(long long) == sizeof(double), "");

    long long tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
533 
534 /*
535 __device__ unsigned short __float2half_rn(float x);
536 __device__ float __half2float(unsigned short);
537 
538 The above device function are not a valid .
539 Use
540 __device__ __half __float2half_rn(float x);
541 __device__ float __half2float(__half);
542 from hip_fp16.h
543 
544 CUDA implements half as unsigned short whereas, HIP doesn't.
545 
546 */
547 
// float -> int conversions: these four DO honour their rounding modes by
// rounding in float first (OCML floor/rint/ceil/trunc), then truncating.
__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }

// float -> ll/uint/ull conversions: rounding-mode suffixes are nominal
// here — all variants truncate via the C cast (see the TODO above).
__device__ static inline long long int __float2ll_rd(float x) { return (long long int)x; }
__device__ static inline long long int __float2ll_rn(float x) { return (long long int)x; }
__device__ static inline long long int __float2ll_ru(float x) { return (long long int)x; }
__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }

__device__ static inline unsigned int __float2uint_rd(float x) { return (unsigned int)x; }
__device__ static inline unsigned int __float2uint_rn(float x) { return (unsigned int)x; }
__device__ static inline unsigned int __float2uint_ru(float x) { return (unsigned int)x; }
__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }

__device__ static inline unsigned long long int __float2ull_rd(float x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __float2ull_rn(float x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __float2ull_ru(float x) {
    return (unsigned long long int)x;
}
__device__ static inline unsigned long long int __float2ull_rz(float x) {
    return (unsigned long long int)x;
}
575 
// Bit-reinterpret a float as int (CUDA __float_as_int), via memcpy.
__device__ static inline int __float_as_int(float x) {
    static_assert(sizeof(int) == sizeof(float), "");

    int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

// Bit-reinterpret a float as unsigned int (CUDA __float_as_uint).
__device__ static inline unsigned int __float_as_uint(float x) {
    static_assert(sizeof(unsigned int) == sizeof(float), "");

    unsigned int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
593 
// Assemble a double from its high and low 32-bit words (CUDA
// __hiloint2double): hi goes to bits 63-32, lo to bits 31-0.
__device__ static inline double __hiloint2double(int hi, int lo) {
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    // static_cast<uint32_t>(lo) prevents sign-extension of the low word.
    uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    double tmp1;
    __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));

    return tmp1;
}
603 
// int -> double is exact for all 32-bit values; the _rn suffix is nominal.
__device__ static inline double __int2double_rn(int x) { return (double)x; }

// int -> float conversions: rounding-mode suffixes are nominal — all
// variants use the default conversion (see the TODO above).
__device__ static inline float __int2float_rd(int x) { return (float)x; }
__device__ static inline float __int2float_rn(int x) { return (float)x; }
__device__ static inline float __int2float_ru(int x) { return (float)x; }
__device__ static inline float __int2float_rz(int x) { return (float)x; }

// Bit-reinterpret an int as float (CUDA __int_as_float), via memcpy.
__device__ static inline float __int_as_float(int x) {
    static_assert(sizeof(float) == sizeof(int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
619 
// long long -> double/float conversions: rounding-mode suffixes are
// nominal — all variants use the default conversion (see the TODO above).
__device__ static inline double __ll2double_rd(long long int x) { return (double)x; }
__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
__device__ static inline double __ll2double_ru(long long int x) { return (double)x; }
__device__ static inline double __ll2double_rz(long long int x) { return (double)x; }

__device__ static inline float __ll2float_rd(long long int x) { return (float)x; }
__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
__device__ static inline float __ll2float_ru(long long int x) { return (float)x; }
__device__ static inline float __ll2float_rz(long long int x) { return (float)x; }

// Bit-reinterpret a long long as double (CUDA __longlong_as_double).
__device__ static inline double __longlong_as_double(long long int x) {
    static_assert(sizeof(double) == sizeof(long long), "");

    double tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
638 
639 __device__ static inline double __uint2double_rn(int x) { return (double)x; }
640 
// unsigned int -> float conversions: rounding-mode suffixes are nominal —
// all variants use the default conversion (see the TODO above).
__device__ static inline float __uint2float_rd(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_ru(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_rz(unsigned int x) { return (float)x; }

// Bit-reinterpret an unsigned int as float (CUDA __uint_as_float).
__device__ static inline float __uint_as_float(unsigned int x) {
    static_assert(sizeof(float) == sizeof(unsigned int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
654 
// unsigned long long -> double/float conversions: rounding-mode suffixes
// are nominal — all variants use the default conversion (see TODO above).
__device__ static inline double __ull2double_rd(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_ru(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_rz(unsigned long long int x) { return (double)x; }

__device__ static inline float __ull2float_rd(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_ru(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_rz(unsigned long long int x) { return (float)x; }
664 
665 #if defined(__HCC__)
666 #define __HCC_OR_HIP_CLANG__ 1
667 #elif defined(__clang__) && defined(__HIP__)
668 #define __HCC_OR_HIP_CLANG__ 1
669 #else
670 #define __HCC_OR_HIP_CLANG__ 0
671 #endif
672 
673 #ifdef __HCC_OR_HIP_CLANG__
674 
675 // Clock functions
676 __device__ long long int __clock64();
677 __device__ long long int __clock();
678 __device__ long long int clock64();
679 __device__ long long int clock();
680 // hip.amdgcn.bc - named sync
681 __device__ void __named_sync(int a, int b);
682 
683 #ifdef __HIP_DEVICE_COMPILE__
684 
685 // Clock functions
686 #if __HCC__
687 extern "C" uint64_t __clock_u64() __HC__;
688 #endif
689 
// 64-bit device timestamp (CUDA clock64 analogue).
// HCC reads the device-lib clock; HIP-Clang reads s_memrealtime, the
// constant-rate real-time counter.
__device__
inline __attribute((always_inline))
long long int __clock64() {
// ToDo: Unify HCC and HIP implementation.
#if __HCC__
    return (long long int) __clock_u64();
#else
    return (long long int) __builtin_amdgcn_s_memrealtime();
#endif
}
700 
// All clock entry points funnel into __clock64; only the 64-bit counter
// is implemented.
__device__
inline __attribute((always_inline))
long long int __clock() { return __clock64(); }

__device__
inline __attribute__((always_inline))
long long int clock64() { return __clock64(); }

__device__
inline __attribute__((always_inline))
long long int clock() { return __clock(); }

// hip.amdgcn.bc - named sync
// Named barriers are not implemented: 'a' and 'b' are ignored and a plain
// work-group barrier is issued instead.
__device__
inline
void __named_sync(int a, int b) { __builtin_amdgcn_s_barrier(); }
717 
718 #endif // __HIP_DEVICE_COMPILE__
719 
720 // warp vote function __all __any __ballot
// Warp vote: non-zero iff 'predicate' is non-zero on every active lane of
// the wavefront (pre-Volta CUDA __all; no mask argument).
__device__
inline
int __all(int predicate) {
    return __ockl_wfall_i32(predicate);
}

// Warp vote: non-zero iff 'predicate' is non-zero on any active lane.
__device__
inline
int __any(int predicate) {
    return __ockl_wfany_i32(predicate);
}
732 
733 // XXX from llvm/include/llvm/IR/InstrTypes.h
734 #define ICMP_NE 33
735 
// Ballot: one bit per lane, set where 'predicate' is non-zero, gathered by
// a wave-wide icmp-ne against 0 (ICMP_NE is LLVM's not-equal predicate
// code, defined above). Returns the full 64-lane mask.
__device__
inline
unsigned long long int __ballot(int predicate) {
    return __llvm_amdgcn_icmp_i32(predicate, 0, ICMP_NE);
}

// 64-bit-named alias of __ballot; identical implementation.
__device__
inline
unsigned long long int __ballot64(int predicate) {
    return __llvm_amdgcn_icmp_i32(predicate, 0, ICMP_NE);
}
747 
748 // hip.amdgcn.bc - lanemask
749 __device__
750 inline
751 int64_t __lanemask_gt()
752 {
753  int32_t activelane = __ockl_activelane_u32();
754  int64_t ballot = __ballot64(1);
755  if (activelane != 63) {
756  int64_t tmp = (~0UL) << (activelane + 1);
757  return tmp & ballot;
758  }
759  return 0;
760 }
761 
762 __device__
763 inline
764 int64_t __lanemask_lt()
765 {
766  int32_t activelane = __ockl_activelane_u32();
767  int64_t ballot = __ballot64(1);
768  if (activelane == 0)
769  return 0;
770  return ballot;
771 }
772 
// Address-space cast helper: LDS (local) pointer to generic. A plain
// pass-through here; the implicit conversion performs the cast.
__device__ inline void* __local_to_generic(void* p) { return p; }
774 
#ifdef __HIP_DEVICE_COMPILE__
// Base pointer of the dynamically-allocated part of the group (LDS)
// segment: the static LDS size is the offset at which dynamic shared
// memory begins.
__device__
inline
void* __get_dynamicgroupbaseptr()
{
    // Get group segment base pointer.
    return (char*)__local_to_generic((void*)__to_local(__llvm_amdgcn_groupstaticsize()));
}
#else
// Host pass: declaration only; resolved in the device library.
__device__
void* __get_dynamicgroupbaseptr();
#endif // __HIP_DEVICE_COMPILE__

// Public alias kept for the HIP_DYNAMIC_SHARED machinery.
__device__
inline
void *__amdgcn_get_dynamicgroupbaseptr() {
    return __get_dynamicgroupbaseptr();
}
793 
794 #if defined(__HCC__) && (__hcc_minor__ < 3)
795 // hip.amdgcn.bc - sync threads
796 #define __CLK_LOCAL_MEM_FENCE 0x01
797 typedef unsigned __cl_mem_fence_flags;
798 
799 typedef enum __memory_scope {
800  __memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
801  __memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
802  __memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
803  __memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
804  __memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
805 } __memory_scope;
806 
807 // enum values aligned with what clang uses in EmitAtomicExpr()
808 typedef enum __memory_order
809 {
810  __memory_order_relaxed = __ATOMIC_RELAXED,
811  __memory_order_acquire = __ATOMIC_ACQUIRE,
812  __memory_order_release = __ATOMIC_RELEASE,
813  __memory_order_acq_rel = __ATOMIC_ACQ_REL,
814  __memory_order_seq_cst = __ATOMIC_SEQ_CST
815 } __memory_order;
816 
// Map an OpenCL-style fence request (flags, order, scope) onto the LLVM
// fence builtins. 'flags' is accepted for API compatibility; only order
// and scope select the fence actually emitted.
__device__
inline
static void
__atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __memory_scope scope)
{
    // We're tying global-happens-before and local-happens-before together as does HSA
    if (order != __memory_order_relaxed) {
        switch (scope) {
        case __memory_scope_work_item:
            // A single work-item needs no fence.
            break;
        case __memory_scope_sub_group:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_sg(); break;
            case __memory_order_release: __llvm_fence_rel_sg(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_sg(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_sg(); break;
            }
            break;
        case __memory_scope_work_group:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_wg(); break;
            case __memory_order_release: __llvm_fence_rel_wg(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_wg(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_wg(); break;
            }
            break;
        case __memory_scope_device:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_dev(); break;
            case __memory_order_release: __llvm_fence_rel_dev(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_dev(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_dev(); break;
            }
            break;
        case __memory_scope_all_svm_devices:
            switch (order) {
            case __memory_order_relaxed: break;
            case __memory_order_acquire: __llvm_fence_acq_sys(); break;
            case __memory_order_release: __llvm_fence_rel_sys(); break;
            case __memory_order_acq_rel: __llvm_fence_ar_sys(); break;
            case __memory_order_seq_cst: __llvm_fence_sc_sys(); break;
            }
            break;
        }
    }
}
866 #endif
867 
868 // Memory Fence Functions
// CUDA __threadfence: seq_cst fence at device scope.
__device__
inline
static void __threadfence()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
}

// CUDA __threadfence_block: seq_cst fence at work-group (block) scope.
__device__
inline
static void __threadfence_block()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
}

// CUDA __threadfence_system: seq_cst fence visible to all SVM devices
// (GPU + host).
__device__
inline
static void __threadfence_system()
{
    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
}
889 
890 // abort
// Device-side abort: traps the wavefront. Declared weak so an application
// or runtime may provide its own handler.
__device__
inline
__attribute__((weak))
void abort() {
    return __builtin_trap();
}
897 
898 
899 #endif // __HCC_OR_HIP_CLANG__
900 
901 #ifdef __HCC__
902 
907 // Macro to replace extern __shared__ declarations
908 // to local variable definitions
909 #define HIP_DYNAMIC_SHARED(type, var) type* var = (type*)__get_dynamicgroupbaseptr();
910 
911 #define HIP_DYNAMIC_SHARED_ATTRIBUTE
912 
913 
914 #elif defined(__clang__) && defined(__HIP__)
915 
916 #pragma push_macro("__DEVICE__")
917 #define __DEVICE__ extern "C" __device__ __attribute__((always_inline)) \
918  __attribute__((weak))
919 
// Device-side hook for the glibc-style assert failure handler.
// All arguments (assertion text, file, line, function) are currently ignored;
// the implementation simply traps the wavefront instead of printing a message.
__DEVICE__
inline
void __assert_fail(const char * __assertion,
                   const char *__file,
                   unsigned int __line,
                   const char *__function)
{
    // Ignore all the args for now.
    __builtin_trap();
}
930 
// Device-side hook for the CUDA-style assert failure handler (note the extra
// charsize parameter). As with __assert_fail, all arguments are ignored and
// the wavefront is trapped.
__DEVICE__
inline
void __assertfail(const char * __assertion,
                  const char *__file,
                  unsigned int __line,
                  const char *__function,
                  size_t charsize)
{
    // ignore all the args for now.
    __builtin_trap();
}
942 
// Work-group execution barrier with optional memory ordering.
// flags: OpenCL-style fence flags; when non-zero, memory ordering at `scope`
// is enforced around the hardware barrier. When zero, this is an execution
// barrier only (no fence).
__device__
inline
static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
{
    if (flags) {
        // Release this thread's writes before the barrier, then acquire other
        // threads' writes after it — together equivalent to a full fence
        // around the rendezvous point.
        __atomic_work_item_fence(flags, __memory_order_release, scope);
        __builtin_amdgcn_s_barrier();
        __atomic_work_item_fence(flags, __memory_order_acquire, scope);
    } else {
        __builtin_amdgcn_s_barrier();
    }
}
955 
// Thin wrapper: interprets `n` as OpenCL fence flags and performs a
// work-group barrier with the corresponding memory ordering.
__device__
inline
static void __barrier(int n)
{
    __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
}
962 
// Block-wide barrier (CUDA-compatible __syncthreads): all threads in the
// work-group rendezvous, with local (shared) memory fenced across the barrier.
// noduplicate tells the optimizer it must not clone this call, which would
// otherwise risk threads reaching different barrier instances.
__device__
inline
__attribute__((noduplicate))
void __syncthreads()
{
    __barrier(__CLK_LOCAL_MEM_FENCE);
}
970 
971 // hip.amdgcn.bc - device routine
972 /*
973  HW_ID Register bit structure
974  WAVE_ID 3:0 Wave buffer slot number. 0-9.
975  SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
976  PIPE_ID 7:6 Pipeline from which the wave was dispatched.
977  CU_ID 11:8 Compute Unit the wave is assigned to.
978  SH_ID 12 Shader Array (within an SE) the wave is assigned to.
979  SE_ID 14:13 Shader Engine the wave is assigned to.
980  TG_ID 19:16 Thread-group ID
981  VM_ID 23:20 Virtual Memory ID
982  QUEUE_ID 26:24 Queue from which this wave was dispatched.
983  STATE_ID 29:27 State ID (graphics only, not compute).
984  ME_ID 31:30 Micro-engine ID.
985  */
986 
987 #define HW_ID 4
988 
989 #define HW_ID_CU_ID_SIZE 4
990 #define HW_ID_CU_ID_OFFSET 8
991 
992 #define HW_ID_SE_ID_SIZE 2
993 #define HW_ID_SE_ID_OFFSET 13
994 
995 /*
996  Encoding of parameter bitmask
997  HW_ID 5:0 HW_ID
998  OFFSET 10:6 Range: 0..31
999  SIZE 15:11 Range: 1..32
1000  */
1001 
// Builds the immediate operand for s_getreg from the bitmask encoding listed
// above: SIZE in bits 15:11, OFFSET in bits 10:6, register id in bits 5:0.
// Each argument and the whole expansion are parenthesized so the macro stays
// correct with non-trivial argument expressions (e.g. `a ? b : c` as REG,
// which the previous unparenthesized form silently miscomputed) and when the
// expansion itself appears inside a larger expression.
#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
1003 
// Returns a device-wide compute-unit index for the calling wave (analogous to
// CUDA's %smid), composed from the SE_ID and CU_ID fields of the HW_ID
// hardware register (see the bit layout table above).
__device__
inline
unsigned __smid(void)
{
    // s_getreg extracts the requested bitfield from HW_ID.
    unsigned cu_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID));
    unsigned se_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID));

    /* Each shader engine has 16 CU */
    return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
}
1016 
1017 #pragma push_macro("__DEVICE__")
1018 
1019 // Macro to replace extern __shared__ declarations
1020 // to local variable definitions
1021 #define HIP_DYNAMIC_SHARED(type, var) \
1022  type* var = (type*)__amdgcn_get_dynamicgroupbaseptr();
1023 
1024 #define HIP_DYNAMIC_SHARED_ATTRIBUTE
1025 
1026 
1027 #endif //defined(__clang__) && defined(__HIP__)
1028 
1029 
1030 // loop unrolling
1031 static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
1032  auto dstPtr = static_cast<unsigned char*>(dst);
1033  auto srcPtr = static_cast<const unsigned char*>(src);
1034 
1035  while (size >= 4u) {
1036  dstPtr[0] = srcPtr[0];
1037  dstPtr[1] = srcPtr[1];
1038  dstPtr[2] = srcPtr[2];
1039  dstPtr[3] = srcPtr[3];
1040 
1041  size -= 4u;
1042  srcPtr += 4u;
1043  dstPtr += 4u;
1044  }
1045  switch (size) {
1046  case 3:
1047  dstPtr[2] = srcPtr[2];
1048  case 2:
1049  dstPtr[1] = srcPtr[1];
1050  case 1:
1051  dstPtr[0] = srcPtr[0];
1052  }
1053 
1054  return dst;
1055 }
1056 
1057 static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
1058  auto dstPtr = static_cast<unsigned char*>(dst);
1059 
1060  while (size >= 4u) {
1061  dstPtr[0] = val;
1062  dstPtr[1] = val;
1063  dstPtr[2] = val;
1064  dstPtr[3] = val;
1065 
1066  size -= 4u;
1067  dstPtr += 4u;
1068  }
1069  switch (size) {
1070  case 3:
1071  dstPtr[2] = val;
1072  case 2:
1073  dstPtr[1] = val;
1074  case 1:
1075  dstPtr[0] = val;
1076  }
1077 
1078  return dst;
1079 }
// Device-side memcpy: forwards to the unrolled byte copier __hip_hc_memcpy.
static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
    return __hip_hc_memcpy(dst, src, size);
}
1083 
1084 static inline __device__ void* memset(void* ptr, int val, size_t size) {
1085  unsigned char val8 = static_cast<unsigned char>(val);
1086  return __hip_hc_memset(ptr, val8, size);
1087 }
1088 
1089 #endif
/* Documentation-generator cross-reference residue (preserved as a comment,
   not compilable source):
   - `_Float16 __2f16 __attribute__((ext_vector_type(2)))` — declared in
     hip_fp16_math_fwd.h:53; copies the memory address of symbol symbolName
     to devPtr.
   - Declarations for device-library types and functions: see
     device_functions.h:124 and device_functions.h:131.
   - Wrapper declarations for llvm intrinsics such as llvm.amdgcn.s.barrier
     live in the corresponding forward-declaration header. */