HIP: Heterogeneous-computing Interface for Portability
Loading...
Searching...
No Matches
amd_device_functions.h
1/*
2Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
3
4Permission is hereby granted, free of charge, to any person obtaining a copy
5of this software and associated documentation files (the "Software"), to deal
6in the Software without restriction, including without limitation the rights
7to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8copies of the Software, and to permit persons to whom the Software is
9furnished to do so, subject to the following conditions:
10
11The above copyright notice and this permission notice shall be included in
12all copies or substantial portions of the Software.
13
14THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20THE SOFTWARE.
21*/
22
23#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
24#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
25
26#if !defined(__HIPCC_RTC__)
27#include <hip/amd_detail/amd_hip_common.h>
28#include "host_defines.h"
29#include "math_fwd.h"
30#include <hip/hip_runtime_api.h>
31#include <stddef.h>
32#include <hip/hip_vector_types.h>
34#endif // !defined(__HIPCC_RTC__)
35
36#if defined(__clang__) && defined(__HIP__)
37extern "C" __device__ int printf(const char *fmt, ...);
38#else
39template <typename... All>
40static inline __device__ void printf(const char* format, All... all) {}
#endif // defined(__clang__) && defined(__HIP__)
42
43extern "C" __device__ unsigned long long __ockl_steadyctr_u64();
44
45/*
46Integer Intrinsics
47*/
48
// integer intrinsic functions: __popc __clz __ffs __brev
50__device__ static inline unsigned int __popc(unsigned int input) {
51 return __builtin_popcount(input);
52}
53__device__ static inline unsigned int __popcll(unsigned long long int input) {
54 return __builtin_popcountll(input);
55}
56
57__device__ static inline int __clz(int input) {
58 return __ockl_clz_u32((uint)input);
59}
60
61__device__ static inline int __clzll(long long int input) {
62 return __ockl_clz_u64((uint64_t)input);
63}
64
65__device__ static inline unsigned int __ffs(unsigned int input) {
66 return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
67}
68
69__device__ static inline unsigned int __ffsll(unsigned long long int input) {
70 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
71}
72
73__device__ static inline unsigned int __ffsll(unsigned long int input) {
74 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
75}
76
77__device__ static inline unsigned int __ffs(int input) {
78 return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
79}
80
81__device__ static inline unsigned int __ffsll(long long int input) {
82 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
83}
84
85__device__ static inline unsigned int __ffsll(long int input) {
86 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
87}
88
89// Given a 32/64-bit value exec mask and an integer value base (between 0 and WAVEFRONT_SIZE),
90// find the n-th (given by offset) set bit in the exec mask from the base bit, and return the bit position.
91// If not found, return -1.
92__device__ static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) {
93 uint64_t temp_mask = mask;
94 int32_t temp_offset = offset;
95
96 if (offset == 0) {
97 temp_mask &= (1 << base);
98 temp_offset = 1;
99 }
100 else if (offset < 0) {
101 temp_mask = __builtin_bitreverse64(mask);
102 base = 63 - base;
103 temp_offset = -offset;
104 }
105
106 temp_mask = temp_mask & ((~0ULL) << base);
107 if (__builtin_popcountll(temp_mask) < temp_offset)
108 return -1;
109 int32_t total = 0;
110 for (int i = 0x20; i > 0; i >>= 1) {
111 uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
112 int32_t pcnt = __builtin_popcountll(temp_mask_lo);
113 if (pcnt < temp_offset) {
114 temp_mask = temp_mask >> i;
115 temp_offset -= pcnt;
116 total += i;
117 }
118 else {
119 temp_mask = temp_mask_lo;
120 }
121 }
122 if (offset < 0)
123 return 63 - total;
124 else
125 return total;
126}
127
128__device__ static int32_t __fns32(uint64_t mask, uint32_t base, int32_t offset) {
129 uint64_t temp_mask = mask;
130 int32_t temp_offset = offset;
131 if (offset == 0) {
132 temp_mask &= (1 << base);
133 temp_offset = 1;
134 }
135 else if (offset < 0) {
136 temp_mask = __builtin_bitreverse64(mask);
137 base = 63 - base;
138 temp_offset = -offset;
139 }
140 temp_mask = temp_mask & ((~0ULL) << base);
141 if (__builtin_popcountll(temp_mask) < temp_offset)
142 return -1;
143 int32_t total = 0;
144 for (int i = 0x20; i > 0; i >>= 1) {
145 uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
146 int32_t pcnt = __builtin_popcountll(temp_mask_lo);
147 if (pcnt < temp_offset) {
148 temp_mask = temp_mask >> i;
149 temp_offset -= pcnt;
150 total += i;
151 }
152 else {
153 temp_mask = temp_mask_lo;
154 }
155 }
156 if (offset < 0)
157 return 63 - total;
158 else
159 return total;
160}
161__device__ static inline unsigned int __brev(unsigned int input) {
162 return __builtin_bitreverse32(input);
163}
164
165__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
166 return __builtin_bitreverse64(input);
167}
168
169__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
170 return input == 0 ? -1 : __builtin_ctzl(input);
171}
172
173__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
174 uint32_t offset = src1 & 31;
175 uint32_t width = src2 & 31;
176 return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
177}
178
179__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
180 uint64_t offset = src1 & 63;
181 uint64_t width = src2 & 63;
182 return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
183}
184
185__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
186 uint32_t offset = src2 & 31;
187 uint32_t width = src3 & 31;
188 uint32_t mask = (1 << width) - 1;
189 return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
190}
191
192__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
193 uint64_t offset = src2 & 63;
194 uint64_t width = src3 & 63;
195 uint64_t mask = (1ULL << width) - 1;
196 return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
197}
198
199__device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)
200{
201 uint32_t mask_shift = shift & 31;
202 return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
203}
204
205__device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)
206{
207 uint32_t min_shift = shift >= 32 ? 32 : shift;
208 return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
209}
210
211__device__ inline unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)
212{
213 return __builtin_amdgcn_alignbit(hi, lo, shift);
214}
215
216__device__ inline unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)
217{
218 return shift >= 32 ? hi : __builtin_amdgcn_alignbit(hi, lo, shift);
219}
220
221__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
222__device__ static unsigned int __hadd(int x, int y);
223__device__ static int __mul24(int x, int y);
224__device__ static long long int __mul64hi(long long int x, long long int y);
225__device__ static int __mulhi(int x, int y);
226__device__ static int __rhadd(int x, int y);
227__device__ static unsigned int __sad(int x, int y,unsigned int z);
228__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
229__device__ static int __umul24(unsigned int x, unsigned int y);
230__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
231__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
232__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
233__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
234
// 4-byte view: lets __byte_perm address an unsigned int as four bytes.
// (Restores the 'struct ucharHolder {' opening line lost in extraction;
// the name matches its use in __byte_perm below.)
struct ucharHolder {
    union {
        unsigned char c[4];
        unsigned int ui;
    };
} __attribute__((aligned(4)));
241
// 8-byte view: two unsigned ints (x in ui[0], y in ui[1]) addressable as
// eight bytes c[0..7] for __byte_perm selection.
// (Restores the 'struct uchar2Holder {' opening line lost in extraction;
// the name matches its use in __byte_perm below.)
struct uchar2Holder {
    union {
        unsigned int ui[2];
        unsigned char c[8];
    };
} __attribute__((aligned(8)));
248
// Byte permute: builds a 32-bit result from four bytes selected out of
// the 8-byte concatenation of x (bytes 0-3) and y (bytes 4-7), directed
// by the four selector nibbles of s (CUDA __byte_perm semantics).
// Only the low 3 bits of each selector nibble are honored — the
// sign/replicate modes of the hardware PRMT encoding are not implemented.
__device__
static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
    struct uchar2Holder cHoldVal;   // x:y viewed as 8 addressable bytes
    struct ucharHolder cHoldKey;    // s viewed as 4 selector bytes
    cHoldKey.ui = s;
    cHoldVal.ui[0] = x;
    cHoldVal.ui[1] = y;
    unsigned int result;
    result = cHoldVal.c[cHoldKey.c[0] & 0x07];                  // nibble 0 -> result byte 0
    result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);   // nibble 1 -> result byte 1
    result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);         // nibble 2 -> result byte 2
    result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);  // nibble 3 -> result byte 3
    return result;
}
263
264__device__ static inline unsigned int __hadd(int x, int y) {
265 int z = x + y;
266 int sign = z & 0x8000000;
267 int value = z & 0x7FFFFFFF;
268 return ((value) >> 1 || sign);
269}
270
271__device__ static inline int __mul24(int x, int y) {
272 return __ockl_mul24_i32(x, y);
273}
274
// High 64 bits of the signed 128-bit product x * y, computed by splitting
// each operand into 32-bit halves (schoolbook multiplication with carry
// propagation through the middle partial products).
__device__ static inline long long __mul64hi(long long int x, long long int y) {
    ulong x0 = (ulong)x & 0xffffffffUL;  // low half of x, zero-extended
    long x1 = x >> 32;                   // high half of x, sign-carrying
    ulong y0 = (ulong)y & 0xffffffffUL;  // low half of y, zero-extended
    long y1 = y >> 32;                   // high half of y, sign-carrying
    ulong z0 = x0*y0;                    // low x low partial product
    long t = x1*y0 + (z0 >> 32);         // mid product + carry from z0
    long z1 = t & 0xffffffffL;
    long z2 = t >> 32;
    z1 = x0*y1 + z1;                     // second mid product accumulated
    return x1*y1 + z2 + (z1 >> 32);      // high product + all carries
}
287
288__device__ static inline int __mulhi(int x, int y) {
289 return __ockl_mul_hi_i32(x, y);
290}
291
292__device__ static inline int __rhadd(int x, int y) {
293 int z = x + y + 1;
294 int sign = z & 0x8000000;
295 int value = z & 0x7FFFFFFF;
296 return ((value) >> 1 || sign);
297}
298__device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
299 return x > y ? x - y + z : y - x + z;
300}
301__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
302 return (x + y) >> 1;
303}
304__device__ static inline int __umul24(unsigned int x, unsigned int y) {
305 return __ockl_mul24_u32(x, y);
306}
307
// High 64 bits of the unsigned 128-bit product x * y, computed by
// splitting each operand into 32-bit halves (schoolbook multiplication
// with carry propagation through the middle partial products).
__device__
static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
    ulong x0 = x & 0xffffffffUL;   // low half of x
    ulong x1 = x >> 32;            // high half of x
    ulong y0 = y & 0xffffffffUL;   // low half of y
    ulong y1 = y >> 32;            // high half of y
    ulong z0 = x0*y0;              // low x low partial product
    ulong t = x1*y0 + (z0 >> 32);  // mid product + carry from z0
    ulong z1 = t & 0xffffffffUL;
    ulong z2 = t >> 32;
    z1 = x0*y1 + z1;               // second mid product accumulated
    return x1*y1 + z2 + (z1 >> 32);// high product + all carries
}
321
322__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
323 return __ockl_mul_hi_u32(x, y);
324}
325__device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
326 return (x + y + 1) >> 1;
327}
328__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
329 return __ockl_sadd_u32(x, y, z);
330}
331
332__device__ static inline unsigned int __lane_id() {
333 return __builtin_amdgcn_mbcnt_hi(
334 -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
335}
336
337__device__
338static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_lo(x,y);};
339
340__device__
341static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_hi(x,y);};
342
343/*
344HIP specific device functions
345*/
346
347#if !defined(__HIPCC_RTC__)
348#include "amd_warp_functions.h"
349#endif
350
351#define MASK1 0x00ff00ff
352#define MASK2 0xff00ff00
353
// Packed 8-bit add using even/odd byte masks (MASK1/MASK2) so per-byte
// carries do not propagate between lanes.
// NOTE(review): only in1.w/in2.w are read and only out.w is written —
// out.x/out.y/out.z are left uninitialized, and .w is a single char, so
// masking it with 0x00ff00ff looks suspect. Confirm whether the packed
// value was meant to live in all four components; matches __hip_hc_sub8pk
// and __hip_hc_mul8pk below, so left as-is.
__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;   // even bytes
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 + one2) & MASK1;
    one1 = in1.w & MASK2;            // odd bytes
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 + one2) & MASK2);
    return out;
}
364
365__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
366 char4 out;
367 unsigned one1 = in1.w & MASK1;
368 unsigned one2 = in2.w & MASK1;
369 out.w = (one1 - one2) & MASK1;
370 one1 = in1.w & MASK2;
371 one2 = in2.w & MASK2;
372 out.w = out.w | ((one1 - one2) & MASK2);
373 return out;
374}
375
376__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
377 char4 out;
378 unsigned one1 = in1.w & MASK1;
379 unsigned one2 = in2.w & MASK1;
380 out.w = (one1 * one2) & MASK1;
381 one1 = in1.w & MASK2;
382 one2 = in2.w & MASK2;
383 out.w = out.w | ((one1 * one2) & MASK2);
384 return out;
385}
386
387__device__ static inline float __double2float_rd(double x) {
388 return __ocml_cvtrtn_f32_f64(x);
389}
390__device__ static inline float __double2float_rn(double x) { return x; }
391__device__ static inline float __double2float_ru(double x) {
392 return __ocml_cvtrtp_f32_f64(x);
393}
394__device__ static inline float __double2float_rz(double x) {
395 return __ocml_cvtrtz_f32_f64(x);
396}
397
398__device__ static inline int __double2hiint(double x) {
399 static_assert(sizeof(double) == 2 * sizeof(int), "");
400
401 int tmp[2];
402 __builtin_memcpy(tmp, &x, sizeof(tmp));
403
404 return tmp[1];
405}
406__device__ static inline int __double2loint(double x) {
407 static_assert(sizeof(double) == 2 * sizeof(int), "");
408
409 int tmp[2];
410 __builtin_memcpy(tmp, &x, sizeof(tmp));
411
412 return tmp[0];
413}
414
415__device__ static inline int __double2int_rd(double x) { return (int)__ocml_floor_f64(x); }
416__device__ static inline int __double2int_rn(double x) { return (int)__ocml_rint_f64(x); }
417__device__ static inline int __double2int_ru(double x) { return (int)__ocml_ceil_f64(x); }
418__device__ static inline int __double2int_rz(double x) { return (int)x; }
419
420__device__ static inline long long int __double2ll_rd(double x) {
421 return (long long)__ocml_floor_f64(x);
422}
423__device__ static inline long long int __double2ll_rn(double x) {
424 return (long long)__ocml_rint_f64(x);
425}
426__device__ static inline long long int __double2ll_ru(double x) {
427 return (long long)__ocml_ceil_f64(x);
428}
429__device__ static inline long long int __double2ll_rz(double x) { return (long long)x; }
430
431__device__ static inline unsigned int __double2uint_rd(double x) {
432 return (unsigned int)__ocml_floor_f64(x);
433}
434__device__ static inline unsigned int __double2uint_rn(double x) {
435 return (unsigned int)__ocml_rint_f64(x);
436}
437__device__ static inline unsigned int __double2uint_ru(double x) {
438 return (unsigned int)__ocml_ceil_f64(x);
439}
440__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
441
442__device__ static inline unsigned long long int __double2ull_rd(double x) {
443 return (unsigned long long int)__ocml_floor_f64(x);
444}
445__device__ static inline unsigned long long int __double2ull_rn(double x) {
446 return (unsigned long long int)__ocml_rint_f64(x);
447}
448__device__ static inline unsigned long long int __double2ull_ru(double x) {
449 return (unsigned long long int)__ocml_ceil_f64(x);
450}
451__device__ static inline unsigned long long int __double2ull_rz(double x) {
452 return (unsigned long long int)x;
453}
454__device__ static inline long long int __double_as_longlong(double x) {
455 static_assert(sizeof(long long) == sizeof(double), "");
456
457 long long tmp;
458 __builtin_memcpy(&tmp, &x, sizeof(tmp));
459
460 return tmp;
461}
462
463/*
464__device__ unsigned short __float2half_rn(float x);
465__device__ float __half2float(unsigned short);
466
467The above device function are not a valid .
468Use
469__device__ __half __float2half_rn(float x);
470__device__ float __half2float(__half);
471from hip_fp16.h
472
473CUDA implements half as unsigned short whereas, HIP doesn't.
474
475*/
476
477__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
478__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
479__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
480__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
481
482__device__ static inline long long int __float2ll_rd(float x) {
483 return (long long int)__ocml_floor_f32(x);
484}
485__device__ static inline long long int __float2ll_rn(float x) {
486 return (long long int)__ocml_rint_f32(x);
487}
488__device__ static inline long long int __float2ll_ru(float x) {
489 return (long long int)__ocml_ceil_f32(x);
490}
491__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }
492
493__device__ static inline unsigned int __float2uint_rd(float x) {
494 return (unsigned int)__ocml_floor_f32(x);
495}
496__device__ static inline unsigned int __float2uint_rn(float x) {
497 return (unsigned int)__ocml_rint_f32(x);
498}
499__device__ static inline unsigned int __float2uint_ru(float x) {
500 return (unsigned int)__ocml_ceil_f32(x);
501}
502__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
503
504__device__ static inline unsigned long long int __float2ull_rd(float x) {
505 return (unsigned long long int)__ocml_floor_f32(x);
506}
507__device__ static inline unsigned long long int __float2ull_rn(float x) {
508 return (unsigned long long int)__ocml_rint_f32(x);
509}
510__device__ static inline unsigned long long int __float2ull_ru(float x) {
511 return (unsigned long long int)__ocml_ceil_f32(x);
512}
513__device__ static inline unsigned long long int __float2ull_rz(float x) {
514 return (unsigned long long int)x;
515}
516
517__device__ static inline int __float_as_int(float x) {
518 static_assert(sizeof(int) == sizeof(float), "");
519
520 int tmp;
521 __builtin_memcpy(&tmp, &x, sizeof(tmp));
522
523 return tmp;
524}
525
526__device__ static inline unsigned int __float_as_uint(float x) {
527 static_assert(sizeof(unsigned int) == sizeof(float), "");
528
529 unsigned int tmp;
530 __builtin_memcpy(&tmp, &x, sizeof(tmp));
531
532 return tmp;
533}
534
535__device__ static inline double __hiloint2double(int hi, int lo) {
536 static_assert(sizeof(double) == sizeof(uint64_t), "");
537
538 uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
539 double tmp1;
540 __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
541
542 return tmp1;
543}
544
545__device__ static inline double __int2double_rn(int x) { return (double)x; }
546
547__device__ static inline float __int2float_rd(int x) {
548 return __ocml_cvtrtn_f32_s32(x);
549}
550__device__ static inline float __int2float_rn(int x) { return (float)x; }
551__device__ static inline float __int2float_ru(int x) {
552 return __ocml_cvtrtp_f32_s32(x);
553}
554__device__ static inline float __int2float_rz(int x) {
555 return __ocml_cvtrtz_f32_s32(x);
556}
557
558__device__ static inline float __int_as_float(int x) {
559 static_assert(sizeof(float) == sizeof(int), "");
560
561 float tmp;
562 __builtin_memcpy(&tmp, &x, sizeof(tmp));
563
564 return tmp;
565}
566
567__device__ static inline double __ll2double_rd(long long int x) {
568 return __ocml_cvtrtn_f64_s64(x);
569}
570__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
571__device__ static inline double __ll2double_ru(long long int x) {
572 return __ocml_cvtrtp_f64_s64(x);
573}
574__device__ static inline double __ll2double_rz(long long int x) {
575 return __ocml_cvtrtz_f64_s64(x);
576}
577
578__device__ static inline float __ll2float_rd(long long int x) {
579 return __ocml_cvtrtn_f32_s64(x);
580}
581__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
582__device__ static inline float __ll2float_ru(long long int x) {
583 return __ocml_cvtrtp_f32_s64(x);
584}
585__device__ static inline float __ll2float_rz(long long int x) {
586 return __ocml_cvtrtz_f32_s64(x);
587}
588
589__device__ static inline double __longlong_as_double(long long int x) {
590 static_assert(sizeof(double) == sizeof(long long), "");
591
592 double tmp;
593 __builtin_memcpy(&tmp, &x, sizeof(tmp));
594
595 return tmp;
596}
597
598__device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }
599
600__device__ static inline float __uint2float_rd(unsigned int x) {
601 return __ocml_cvtrtn_f32_u32(x);
602}
603__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
604__device__ static inline float __uint2float_ru(unsigned int x) {
605 return __ocml_cvtrtp_f32_u32(x);
606}
607__device__ static inline float __uint2float_rz(unsigned int x) {
608 return __ocml_cvtrtz_f32_u32(x);
609}
610
611__device__ static inline float __uint_as_float(unsigned int x) {
612 static_assert(sizeof(float) == sizeof(unsigned int), "");
613
614 float tmp;
615 __builtin_memcpy(&tmp, &x, sizeof(tmp));
616
617 return tmp;
618}
619
620__device__ static inline double __ull2double_rd(unsigned long long int x) {
621 return __ocml_cvtrtn_f64_u64(x);
622}
623__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
624__device__ static inline double __ull2double_ru(unsigned long long int x) {
625 return __ocml_cvtrtp_f64_u64(x);
626}
627__device__ static inline double __ull2double_rz(unsigned long long int x) {
628 return __ocml_cvtrtz_f64_u64(x);
629}
630
631__device__ static inline float __ull2float_rd(unsigned long long int x) {
632 return __ocml_cvtrtn_f32_u64(x);
633}
634__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
635__device__ static inline float __ull2float_ru(unsigned long long int x) {
636 return __ocml_cvtrtp_f32_u64(x);
637}
638__device__ static inline float __ull2float_rz(unsigned long long int x) {
639 return __ocml_cvtrtz_f32_u64(x);
640}
641
642#if defined(__clang__) && defined(__HIP__)
643
644// Clock functions
645__device__ long long int __clock64();
646__device__ long long int __clock();
647__device__ long long int clock64();
648__device__ long long int clock();
649__device__ long long int wall_clock64();
650// hip.amdgcn.bc - named sync
651__device__ void __named_sync();
652
653#ifdef __HIP_DEVICE_COMPILE__
654
655// Clock function to return GPU core cycle count.
656// GPU can change its core clock frequency at runtime. The maximum frequency can be queried
657// through hipDeviceAttributeClockRate attribute.
658__device__
659inline __attribute((always_inline))
660long long int __clock64() {
661#if __has_builtin(__builtin_amdgcn_s_memtime)
662 // Exists on gfx8, gfx9, gfx10.1, gfx10.2, gfx10.3
663 return (long long int) __builtin_amdgcn_s_memtime();
664#else
665 // Subject to change when better solution available
666 return (long long int) __builtin_readcyclecounter();
667#endif
668}
669
670__device__
671inline __attribute((always_inline))
672long long int __clock() { return __clock64(); }
673
674// Clock function to return wall clock count at a constant frequency that can be queried
675// through hipDeviceAttributeWallClockRate attribute.
676__device__
677inline __attribute__((always_inline))
678long long int wall_clock64() {
679 return (long long int) __ockl_steadyctr_u64();
680}
681
682__device__
683inline __attribute__((always_inline))
684long long int clock64() { return __clock64(); }
685
686__device__
687inline __attribute__((always_inline))
688long long int clock() { return __clock(); }
689
690// hip.amdgcn.bc - named sync
691__device__
692inline
693void __named_sync() { __builtin_amdgcn_s_barrier(); }
694
695#endif // __HIP_DEVICE_COMPILE__
696
697// warp vote function __all __any __ballot
698__device__
699inline
700int __all(int predicate) {
701 return __ockl_wfall_i32(predicate);
702}
703
704__device__
705inline
706int __any(int predicate) {
707 return __ockl_wfany_i32(predicate);
708}
709
710// XXX from llvm/include/llvm/IR/InstrTypes.h
711#define ICMP_NE 33
712
713__device__
714inline
715unsigned long long int __ballot(int predicate) {
716 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
717}
718
719__device__
720inline
721unsigned long long int __ballot64(int predicate) {
722 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
723}
724
725// hip.amdgcn.bc - lanemask
726__device__
727inline
728uint64_t __lanemask_gt()
729{
730 uint32_t lane = __ockl_lane_u32();
731 if (lane == 63)
732 return 0;
733 uint64_t ballot = __ballot64(1);
734 uint64_t mask = (~((uint64_t)0)) << (lane + 1);
735 return mask & ballot;
736}
737
738__device__
739inline
740uint64_t __lanemask_lt()
741{
742 uint32_t lane = __ockl_lane_u32();
743 int64_t ballot = __ballot64(1);
744 uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
745 return mask & ballot;
746}
747
748__device__
749inline
750uint64_t __lanemask_eq()
751{
752 uint32_t lane = __ockl_lane_u32();
753 int64_t mask = ((uint64_t)1 << lane);
754 return mask;
755}
756
757
758__device__ inline void* __local_to_generic(void* p) { return p; }
759
760#ifdef __HIP_DEVICE_COMPILE__
761__device__
762inline
763void* __get_dynamicgroupbaseptr()
764{
765 // Get group segment base pointer.
766 return (char*)__local_to_generic((void*)__to_local(__builtin_amdgcn_groupstaticsize()));
767}
768#else
769__device__
770void* __get_dynamicgroupbaseptr();
771#endif // __HIP_DEVICE_COMPILE__
772
773__device__
774inline
775void *__amdgcn_get_dynamicgroupbaseptr() {
776 return __get_dynamicgroupbaseptr();
777}
778
779// Memory Fence Functions
780__device__
781inline
782static void __threadfence()
783{
784 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
785}
786
787__device__
788inline
789static void __threadfence_block()
790{
791 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
792}
793
794__device__
795inline
796static void __threadfence_system()
797{
798 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
799}
800
801// abort
802__device__
803inline
804__attribute__((weak))
805void abort() {
806 return __builtin_trap();
807}
808
809// The noinline attribute helps encapsulate the printf expansion,
810// which otherwise has a performance impact just by increasing the
811// size of the calling function. Additionally, the weak attribute
812// allows the function to exist as a global although its definition is
813// included in every compilation unit.
814#if defined(_WIN32) || defined(_WIN64)
815extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
816void _wassert(const wchar_t *_msg, const wchar_t *_file, unsigned _line) {
817 // FIXME: Need `wchar_t` support to generate assertion message.
818 __builtin_trap();
819}
820#else /* defined(_WIN32) || defined(_WIN64) */
821extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
822void __assert_fail(const char *assertion,
823 const char *file,
824 unsigned int line,
825 const char *function)
826{
827 const char fmt[] = "%s:%u: %s: Device-side assertion `%s' failed.\n";
828
829 // strlen is not available as a built-in yet, so we create our own
830 // loop in a macro. With a string literal argument, the compiler
831 // usually manages to replace the loop with a constant.
832 //
833 // The macro does not check for null pointer, since all the string
834 // arguments are defined to be constant literals when called from
835 // the assert() macro.
836 //
837 // NOTE: The loop below includes the null terminator in the length
838 // as required by append_string_n().
839#define __hip_get_string_length(LEN, STR) \
840 do { \
841 const char *tmp = STR; \
842 while (*tmp++); \
843 LEN = tmp - STR; \
844 } while (0)
845
846 auto msg = __ockl_fprintf_stderr_begin();
847 int len = 0;
848 __hip_get_string_length(len, fmt);
849 msg = __ockl_fprintf_append_string_n(msg, fmt, len, 0);
850 __hip_get_string_length(len, file);
851 msg = __ockl_fprintf_append_string_n(msg, file, len, 0);
852 msg = __ockl_fprintf_append_args(msg, 1, line, 0, 0, 0, 0, 0, 0, 0);
853 __hip_get_string_length(len, function);
854 msg = __ockl_fprintf_append_string_n(msg, function, len, 0);
855 __hip_get_string_length(len, assertion);
856 __ockl_fprintf_append_string_n(msg, assertion, len, /* is_last = */ 1);
857
858#undef __hip_get_string_length
859
860 __builtin_trap();
861}
862
863extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
864void __assertfail()
865{
866 // ignore all the args for now.
867 __builtin_trap();
868}
869#endif /* defined(_WIN32) || defined(_WIN64) */
870
// Workgroup barrier with optional memory fencing. When 'flags' is
// non-zero, a release fence before the barrier and an acquire fence after
// it make writes performed before the barrier visible to every thread in
// the workgroup afterwards; with flags == 0 only execution is synchronized.
__device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
  if (flags) {
    __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
    __builtin_amdgcn_s_barrier();
    __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
  } else {
    __builtin_amdgcn_s_barrier();
  }
}
880
881__device__
882inline
883static void __barrier(int n)
884{
885 __work_group_barrier((__cl_mem_fence_flags)n);
886}
887
888__device__
889inline
890__attribute__((convergent))
891void __syncthreads()
892{
893 __barrier(__CLK_LOCAL_MEM_FENCE);
894}
895
896__device__
897inline
898__attribute__((convergent))
899int __syncthreads_count(int predicate)
900{
901 return __ockl_wgred_add_i32(!!predicate);
902}
903
904__device__
905inline
906__attribute__((convergent))
907int __syncthreads_and(int predicate)
908{
909 return __ockl_wgred_and_i32(!!predicate);
910}
911
912__device__
913inline
914__attribute__((convergent))
915int __syncthreads_or(int predicate)
916{
917 return __ockl_wgred_or_i32(!!predicate);
918}
919
920// hip.amdgcn.bc - device routine
921/*
922 HW_ID Register bit structure for RDNA2 & RDNA3
923 WAVE_ID 4:0 Wave id within the SIMD.
924 SIMD_ID 9:8 SIMD_ID within the WGP: [0] = row, [1] = column.
925 WGP_ID 13:10 Physical WGP ID.
926 SA_ID 16 Shader Array ID
927 SE_ID 20:18 Shader Engine the wave is assigned to for gfx11
928 SE_ID 19:18 Shader Engine the wave is assigned to for gfx10
929 DP_RATE 31:29 Number of double-precision float units per SIMD
930
931 HW_ID Register bit structure for GCN and CDNA
932 WAVE_ID 3:0 Wave buffer slot number. 0-9.
933 SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
934 PIPE_ID 7:6 Pipeline from which the wave was dispatched.
935 CU_ID 11:8 Compute Unit the wave is assigned to.
936 SH_ID 12 Shader Array (within an SE) the wave is assigned to.
937 SE_ID 15:13 Shader Engine the wave is assigned to for gfx908, gfx90a, gfx940-942
938 14:13 Shader Engine the wave is assigned to for Vega.
939 TG_ID 19:16 Thread-group ID
940 VM_ID 23:20 Virtual Memory ID
941 QUEUE_ID 26:24 Queue from which this wave was dispatched.
942 STATE_ID 29:27 State ID (graphics only, not compute).
943 ME_ID 31:30 Micro-engine ID.
944
945 XCC_ID Register bit structure for gfx940
946 XCC_ID 3:0 XCC the wave is assigned to.
947 */
948
949#if (defined (__GFX10__) || defined (__GFX11__))
950 #define HW_ID 23
951#else
952 #define HW_ID 4
953#endif
954
955#if (defined(__GFX10__) || defined(__GFX11__))
956 #define HW_ID_WGP_ID_SIZE 4
957 #define HW_ID_WGP_ID_OFFSET 10
958#else
959 #define HW_ID_CU_ID_SIZE 4
960 #define HW_ID_CU_ID_OFFSET 8
961#endif
962
963#if (defined(__gfx908__) || defined(__gfx90a__) || \
964 defined(__GFX11__))
965 #define HW_ID_SE_ID_SIZE 3
966#else //4 SEs/XCC for gfx940-942
967 #define HW_ID_SE_ID_SIZE 2
968#endif
969#if (defined(__GFX10__) || defined(__GFX11__))
970 #define HW_ID_SE_ID_OFFSET 18
971 #define HW_ID_SA_ID_OFFSET 16
972 #define HW_ID_SA_ID_SIZE 1
973#else
974 #define HW_ID_SE_ID_OFFSET 13
975#endif
976
977#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
978 #define XCC_ID 20
979 #define XCC_ID_XCC_ID_SIZE 4
980 #define XCC_ID_XCC_ID_OFFSET 0
981#endif
982
983#if (!defined(__HIP_NO_IMAGE_SUPPORT) && \
984 (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)))
985 #define __HIP_NO_IMAGE_SUPPORT 1
986#endif
987
988/*
989 Encoding of parameter bitmask
990 HW_ID 5:0 HW_ID
991 OFFSET 10:6 Range: 0..31
992 SIZE 15:11 Range: 1..32
993 */
994
995#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
996
997/*
998 __smid returns the wave's assigned Compute Unit and Shader Engine.
999 The Compute Unit, CU_ID returned in bits 3:0, and Shader Engine, SE_ID in bits 5:4.
1000 Note: the results vary over time.
1001 SZ minus 1 since SIZE is 1-based.
1002*/
1003__device__
1004inline
1005unsigned __smid(void)
1006{
1007 unsigned se_id = __builtin_amdgcn_s_getreg(
1008 GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
1009 #if (defined(__GFX10__) || defined(__GFX11__))
1010 unsigned wgp_id = __builtin_amdgcn_s_getreg(
1011 GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
1012 unsigned sa_id = __builtin_amdgcn_s_getreg(
1013 GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
1014 #else
1015 #if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
1016 unsigned xcc_id = __builtin_amdgcn_s_getreg(
1017 GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
1018 #endif
1019 unsigned cu_id = __builtin_amdgcn_s_getreg(
1020 GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
1021 #endif
1022 #if (defined(__GFX10__) || defined(__GFX11__))
1023 unsigned temp = se_id;
1024 temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
1025 temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
1026 return temp;
1027 //TODO : CU Mode impl
1028 #elif (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
1029 unsigned temp = xcc_id;
1030 temp = (temp << HW_ID_SE_ID_SIZE) | se_id;
1031 temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
1032 return temp;
1033 #else
1034 return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
1035 #endif
1036}
1037
1042#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
1043#define HIP_DYNAMIC_SHARED_ATTRIBUTE
1044
1045#endif //defined(__clang__) && defined(__HIP__)
1046
1047
1048// loop unrolling
// Byte-wise memcpy for device code, manually unrolled four bytes at a
// time; the trailing 1-3 bytes are handled by a fall-through switch.
// Returns dst, like standard memcpy. No overlap handling (like memcpy).
static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);
    auto srcPtr = static_cast<const unsigned char*>(src);

    while (size >= 4u) {
        dstPtr[0] = srcPtr[0];
        dstPtr[1] = srcPtr[1];
        dstPtr[2] = srcPtr[2];
        dstPtr[3] = srcPtr[3];

        size -= 4u;
        srcPtr += 4u;
        dstPtr += 4u;
    }
    // Copy the remaining tail; each case intentionally falls through.
    switch (size) {
        case 3:
            dstPtr[2] = srcPtr[2];  // fall through
        case 2:
            dstPtr[1] = srcPtr[1];  // fall through
        case 1:
            dstPtr[0] = srcPtr[0];
    }

    return dst;
}
1074
// Byte-wise memset for device code, manually unrolled four bytes at a
// time; the trailing 1-3 bytes are handled by a fall-through switch.
// Returns dst, like standard memset.
static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);

    while (size >= 4u) {
        dstPtr[0] = val;
        dstPtr[1] = val;
        dstPtr[2] = val;
        dstPtr[3] = val;

        size -= 4u;
        dstPtr += 4u;
    }
    // Fill the remaining tail; each case intentionally falls through.
    switch (size) {
        case 3:
            dstPtr[2] = val;  // fall through
        case 2:
            dstPtr[1] = val;  // fall through
        case 1:
            dstPtr[0] = val;
    }

    return dst;
}
1098#ifndef __OPENMP_AMDGCN__
1099static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
1100 return __hip_hc_memcpy(dst, src, size);
1101}
1102
1103static inline __device__ void* memset(void* ptr, int val, size_t size) {
1104 unsigned char val8 = static_cast<unsigned char>(val);
1105 return __hip_hc_memset(ptr, val8, size);
1106}
1107#endif // !__OPENMP_AMDGCN__
1108
1109#endif
Contains declarations for types and functions in device library. Uses int64_t and uint64_t instead of...
Definition amd_device_functions.h:235
Definition amd_device_functions.h:242
Definition amd_hip_vector_types.h:1623