23#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
24#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
26#if !defined(__HIPCC_RTC__)
27#include <hip/amd_detail/amd_hip_common.h>
30#include <hip/hip_runtime_api.h>
32#include <hip/hip_vector_types.h>
36#if defined(__clang__) && defined(__HIP__)
37extern "C" __device__
int printf(
const char *fmt, ...);
39template <
typename... All>
40static inline __device__
void printf(
const char* format, All... all) {}
43extern "C" __device__
unsigned long long __ockl_steadyctr_u64();
50__device__
static inline unsigned int __popc(
unsigned int input) {
51 return __builtin_popcount(input);
53__device__
static inline unsigned int __popcll(
unsigned long long int input) {
54 return __builtin_popcountll(input);
57__device__
static inline int __clz(
int input) {
58 return __ockl_clz_u32((uint)input);
61__device__
static inline int __clzll(
long long int input) {
62 return __ockl_clz_u64((uint64_t)input);
65__device__
static inline unsigned int __ffs(
unsigned int input) {
66 return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
69__device__
static inline unsigned int __ffsll(
unsigned long long int input) {
70 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
73__device__
static inline unsigned int __ffsll(
unsigned long int input) {
74 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
77__device__
static inline unsigned int __ffs(
int input) {
78 return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
81__device__
static inline unsigned int __ffsll(
long long int input) {
82 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
85__device__
static inline unsigned int __ffsll(
long int input) {
86 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
92__device__
static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) {
93 uint64_t temp_mask = mask;
94 int32_t temp_offset = offset;
97 temp_mask &= (1 << base);
100 else if (offset < 0) {
101 temp_mask = __builtin_bitreverse64(mask);
103 temp_offset = -offset;
106 temp_mask = temp_mask & ((~0ULL) << base);
107 if (__builtin_popcountll(temp_mask) < temp_offset)
110 for (
int i = 0x20; i > 0; i >>= 1) {
111 uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
112 int32_t pcnt = __builtin_popcountll(temp_mask_lo);
113 if (pcnt < temp_offset) {
114 temp_mask = temp_mask >> i;
119 temp_mask = temp_mask_lo;
128__device__
static int32_t __fns32(uint64_t mask, uint32_t base, int32_t offset) {
129 uint64_t temp_mask = mask;
130 int32_t temp_offset = offset;
132 temp_mask &= (1 << base);
135 else if (offset < 0) {
136 temp_mask = __builtin_bitreverse64(mask);
138 temp_offset = -offset;
140 temp_mask = temp_mask & ((~0ULL) << base);
141 if (__builtin_popcountll(temp_mask) < temp_offset)
144 for (
int i = 0x20; i > 0; i >>= 1) {
145 uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
146 int32_t pcnt = __builtin_popcountll(temp_mask_lo);
147 if (pcnt < temp_offset) {
148 temp_mask = temp_mask >> i;
153 temp_mask = temp_mask_lo;
161__device__
static inline unsigned int __brev(
unsigned int input) {
162 return __builtin_bitreverse32(input);
165__device__
static inline unsigned long long int __brevll(
unsigned long long int input) {
166 return __builtin_bitreverse64(input);
169__device__
static inline unsigned int __lastbit_u32_u64(uint64_t input) {
170 return input == 0 ? -1 : __builtin_ctzl(input);
173__device__
static inline unsigned int __bitextract_u32(
unsigned int src0,
unsigned int src1,
unsigned int src2) {
174 uint32_t offset = src1 & 31;
175 uint32_t width = src2 & 31;
176 return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
179__device__
static inline uint64_t __bitextract_u64(uint64_t src0,
unsigned int src1,
unsigned int src2) {
180 uint64_t offset = src1 & 63;
181 uint64_t width = src2 & 63;
182 return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
185__device__
static inline unsigned int __bitinsert_u32(
unsigned int src0,
unsigned int src1,
unsigned int src2,
unsigned int src3) {
186 uint32_t offset = src2 & 31;
187 uint32_t width = src3 & 31;
188 uint32_t mask = (1 << width) - 1;
189 return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
192__device__
static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1,
unsigned int src2,
unsigned int src3) {
193 uint64_t offset = src2 & 63;
194 uint64_t width = src3 & 63;
195 uint64_t mask = (1ULL << width) - 1;
196 return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
199__device__
inline unsigned int __funnelshift_l(
unsigned int lo,
unsigned int hi,
unsigned int shift)
201 uint32_t mask_shift = shift & 31;
202 return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
205__device__
inline unsigned int __funnelshift_lc(
unsigned int lo,
unsigned int hi,
unsigned int shift)
207 uint32_t min_shift = shift >= 32 ? 32 : shift;
208 return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
211__device__
inline unsigned int __funnelshift_r(
unsigned int lo,
unsigned int hi,
unsigned int shift)
213 return __builtin_amdgcn_alignbit(hi, lo, shift);
216__device__
inline unsigned int __funnelshift_rc(
unsigned int lo,
unsigned int hi,
unsigned int shift)
218 return shift >= 32 ? hi : __builtin_amdgcn_alignbit(hi, lo, shift);
221__device__
static unsigned int __byte_perm(
unsigned int x,
unsigned int y,
unsigned int s);
222__device__
static unsigned int __hadd(
int x,
int y);
223__device__
static int __mul24(
int x,
int y);
224__device__
static long long int __mul64hi(
long long int x,
long long int y);
225__device__
static int __mulhi(
int x,
int y);
226__device__
static int __rhadd(
int x,
int y);
227__device__
static unsigned int __sad(
int x,
int y,
unsigned int z);
228__device__
static unsigned int __uhadd(
unsigned int x,
unsigned int y);
229__device__
static int __umul24(
unsigned int x,
unsigned int y);
230__device__
static unsigned long long int __umul64hi(
unsigned long long int x,
unsigned long long int y);
231__device__
static unsigned int __umulhi(
unsigned int x,
unsigned int y);
232__device__
static unsigned int __urhadd(
unsigned int x,
unsigned int y);
233__device__
static unsigned int __usad(
unsigned int x,
unsigned int y,
unsigned int z);
240} __attribute__((aligned(4)));
247} __attribute__((aligned(8)));
250static inline unsigned int __byte_perm(
unsigned int x,
unsigned int y,
unsigned int s) {
257 result = cHoldVal.c[cHoldKey.c[0] & 0x07];
258 result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);
259 result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);
260 result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);
264__device__
static inline unsigned int __hadd(
int x,
int y) {
266 int sign = z & 0x8000000;
267 int value = z & 0x7FFFFFFF;
268 return ((value) >> 1 || sign);
271__device__
static inline int __mul24(
int x,
int y) {
272 return __ockl_mul24_i32(x, y);
275__device__
static inline long long __mul64hi(
long long int x,
long long int y) {
276 ulong x0 = (ulong)x & 0xffffffffUL;
278 ulong y0 = (ulong)y & 0xffffffffUL;
281 long t = x1*y0 + (z0 >> 32);
282 long z1 = t & 0xffffffffL;
285 return x1*y1 + z2 + (z1 >> 32);
288__device__
static inline int __mulhi(
int x,
int y) {
289 return __ockl_mul_hi_i32(x, y);
292__device__
static inline int __rhadd(
int x,
int y) {
294 int sign = z & 0x8000000;
295 int value = z & 0x7FFFFFFF;
296 return ((value) >> 1 || sign);
298__device__
static inline unsigned int __sad(
int x,
int y,
unsigned int z) {
299 return x > y ? x - y + z : y - x + z;
301__device__
static inline unsigned int __uhadd(
unsigned int x,
unsigned int y) {
304__device__
static inline int __umul24(
unsigned int x,
unsigned int y) {
305 return __ockl_mul24_u32(x, y);
309static inline unsigned long long __umul64hi(
unsigned long long int x,
unsigned long long int y) {
310 ulong x0 = x & 0xffffffffUL;
312 ulong y0 = y & 0xffffffffUL;
315 ulong t = x1*y0 + (z0 >> 32);
316 ulong z1 = t & 0xffffffffUL;
319 return x1*y1 + z2 + (z1 >> 32);
322__device__
static inline unsigned int __umulhi(
unsigned int x,
unsigned int y) {
323 return __ockl_mul_hi_u32(x, y);
325__device__
static inline unsigned int __urhadd(
unsigned int x,
unsigned int y) {
326 return (x + y + 1) >> 1;
328__device__
static inline unsigned int __usad(
unsigned int x,
unsigned int y,
unsigned int z) {
329 return __ockl_sadd_u32(x, y, z);
332__device__
static inline unsigned int __lane_id() {
333 return __builtin_amdgcn_mbcnt_hi(
334 -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
338static inline unsigned int __mbcnt_lo(
unsigned int x,
unsigned int y) {
return __builtin_amdgcn_mbcnt_lo(x,y);};
341static inline unsigned int __mbcnt_hi(
unsigned int x,
unsigned int y) {
return __builtin_amdgcn_mbcnt_hi(x,y);};
347#if !defined(__HIPCC_RTC__)
348#include "amd_warp_functions.h"
351#define MASK1 0x00ff00ff
352#define MASK2 0xff00ff00
356 unsigned one1 = in1.w & MASK1;
357 unsigned one2 = in2.w & MASK1;
358 out.w = (one1 + one2) & MASK1;
359 one1 = in1.w & MASK2;
360 one2 = in2.w & MASK2;
361 out.w = out.w | ((one1 + one2) & MASK2);
367 unsigned one1 = in1.w & MASK1;
368 unsigned one2 = in2.w & MASK1;
369 out.w = (one1 - one2) & MASK1;
370 one1 = in1.w & MASK2;
371 one2 = in2.w & MASK2;
372 out.w = out.w | ((one1 - one2) & MASK2);
378 unsigned one1 = in1.w & MASK1;
379 unsigned one2 = in2.w & MASK1;
380 out.w = (one1 * one2) & MASK1;
381 one1 = in1.w & MASK2;
382 one2 = in2.w & MASK2;
383 out.w = out.w | ((one1 * one2) & MASK2);
387__device__
static inline float __double2float_rd(
double x) {
388 return __ocml_cvtrtn_f32_f64(x);
390__device__
static inline float __double2float_rn(
double x) {
return x; }
391__device__
static inline float __double2float_ru(
double x) {
392 return __ocml_cvtrtp_f32_f64(x);
394__device__
static inline float __double2float_rz(
double x) {
395 return __ocml_cvtrtz_f32_f64(x);
398__device__
static inline int __double2hiint(
double x) {
399 static_assert(
sizeof(double) == 2 *
sizeof(
int),
"");
402 __builtin_memcpy(tmp, &x,
sizeof(tmp));
406__device__
static inline int __double2loint(
double x) {
407 static_assert(
sizeof(double) == 2 *
sizeof(
int),
"");
410 __builtin_memcpy(tmp, &x,
sizeof(tmp));
415__device__
static inline int __double2int_rd(
double x) {
return (
int)__ocml_floor_f64(x); }
416__device__
static inline int __double2int_rn(
double x) {
return (
int)__ocml_rint_f64(x); }
417__device__
static inline int __double2int_ru(
double x) {
return (
int)__ocml_ceil_f64(x); }
418__device__
static inline int __double2int_rz(
double x) {
return (
int)x; }
420__device__
static inline long long int __double2ll_rd(
double x) {
421 return (
long long)__ocml_floor_f64(x);
423__device__
static inline long long int __double2ll_rn(
double x) {
424 return (
long long)__ocml_rint_f64(x);
426__device__
static inline long long int __double2ll_ru(
double x) {
427 return (
long long)__ocml_ceil_f64(x);
429__device__
static inline long long int __double2ll_rz(
double x) {
return (
long long)x; }
431__device__
static inline unsigned int __double2uint_rd(
double x) {
432 return (
unsigned int)__ocml_floor_f64(x);
434__device__
static inline unsigned int __double2uint_rn(
double x) {
435 return (
unsigned int)__ocml_rint_f64(x);
437__device__
static inline unsigned int __double2uint_ru(
double x) {
438 return (
unsigned int)__ocml_ceil_f64(x);
440__device__
static inline unsigned int __double2uint_rz(
double x) {
return (
unsigned int)x; }
442__device__
static inline unsigned long long int __double2ull_rd(
double x) {
443 return (
unsigned long long int)__ocml_floor_f64(x);
445__device__
static inline unsigned long long int __double2ull_rn(
double x) {
446 return (
unsigned long long int)__ocml_rint_f64(x);
448__device__
static inline unsigned long long int __double2ull_ru(
double x) {
449 return (
unsigned long long int)__ocml_ceil_f64(x);
451__device__
static inline unsigned long long int __double2ull_rz(
double x) {
452 return (
unsigned long long int)x;
454__device__
static inline long long int __double_as_longlong(
double x) {
455 static_assert(
sizeof(
long long) ==
sizeof(
double),
"");
458 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
477__device__
static inline int __float2int_rd(
float x) {
return (
int)__ocml_floor_f32(x); }
478__device__
static inline int __float2int_rn(
float x) {
return (
int)__ocml_rint_f32(x); }
479__device__
static inline int __float2int_ru(
float x) {
return (
int)__ocml_ceil_f32(x); }
480__device__
static inline int __float2int_rz(
float x) {
return (
int)__ocml_trunc_f32(x); }
482__device__
static inline long long int __float2ll_rd(
float x) {
483 return (
long long int)__ocml_floor_f32(x);
485__device__
static inline long long int __float2ll_rn(
float x) {
486 return (
long long int)__ocml_rint_f32(x);
488__device__
static inline long long int __float2ll_ru(
float x) {
489 return (
long long int)__ocml_ceil_f32(x);
491__device__
static inline long long int __float2ll_rz(
float x) {
return (
long long int)x; }
493__device__
static inline unsigned int __float2uint_rd(
float x) {
494 return (
unsigned int)__ocml_floor_f32(x);
496__device__
static inline unsigned int __float2uint_rn(
float x) {
497 return (
unsigned int)__ocml_rint_f32(x);
499__device__
static inline unsigned int __float2uint_ru(
float x) {
500 return (
unsigned int)__ocml_ceil_f32(x);
502__device__
static inline unsigned int __float2uint_rz(
float x) {
return (
unsigned int)x; }
504__device__
static inline unsigned long long int __float2ull_rd(
float x) {
505 return (
unsigned long long int)__ocml_floor_f32(x);
507__device__
static inline unsigned long long int __float2ull_rn(
float x) {
508 return (
unsigned long long int)__ocml_rint_f32(x);
510__device__
static inline unsigned long long int __float2ull_ru(
float x) {
511 return (
unsigned long long int)__ocml_ceil_f32(x);
513__device__
static inline unsigned long long int __float2ull_rz(
float x) {
514 return (
unsigned long long int)x;
517__device__
static inline int __float_as_int(
float x) {
518 static_assert(
sizeof(int) ==
sizeof(
float),
"");
521 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
526__device__
static inline unsigned int __float_as_uint(
float x) {
527 static_assert(
sizeof(
unsigned int) ==
sizeof(
float),
"");
530 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
535__device__
static inline double __hiloint2double(
int hi,
int lo) {
536 static_assert(
sizeof(double) ==
sizeof(uint64_t),
"");
538 uint64_t tmp0 = (
static_cast<uint64_t
>(hi) << 32ull) |
static_cast<uint32_t
>(lo);
540 __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
545__device__
static inline double __int2double_rn(
int x) {
return (
double)x; }
547__device__
static inline float __int2float_rd(
int x) {
548 return __ocml_cvtrtn_f32_s32(x);
550__device__
static inline float __int2float_rn(
int x) {
return (
float)x; }
551__device__
static inline float __int2float_ru(
int x) {
552 return __ocml_cvtrtp_f32_s32(x);
554__device__
static inline float __int2float_rz(
int x) {
555 return __ocml_cvtrtz_f32_s32(x);
558__device__
static inline float __int_as_float(
int x) {
559 static_assert(
sizeof(float) ==
sizeof(
int),
"");
562 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
567__device__
static inline double __ll2double_rd(
long long int x) {
568 return __ocml_cvtrtn_f64_s64(x);
570__device__
static inline double __ll2double_rn(
long long int x) {
return (
double)x; }
571__device__
static inline double __ll2double_ru(
long long int x) {
572 return __ocml_cvtrtp_f64_s64(x);
574__device__
static inline double __ll2double_rz(
long long int x) {
575 return __ocml_cvtrtz_f64_s64(x);
578__device__
static inline float __ll2float_rd(
long long int x) {
579 return __ocml_cvtrtn_f32_s64(x);
581__device__
static inline float __ll2float_rn(
long long int x) {
return (
float)x; }
582__device__
static inline float __ll2float_ru(
long long int x) {
583 return __ocml_cvtrtp_f32_s64(x);
585__device__
static inline float __ll2float_rz(
long long int x) {
586 return __ocml_cvtrtz_f32_s64(x);
589__device__
static inline double __longlong_as_double(
long long int x) {
590 static_assert(
sizeof(double) ==
sizeof(
long long),
"");
593 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
598__device__
static inline double __uint2double_rn(
unsigned int x) {
return (
double)x; }
600__device__
static inline float __uint2float_rd(
unsigned int x) {
601 return __ocml_cvtrtn_f32_u32(x);
603__device__
static inline float __uint2float_rn(
unsigned int x) {
return (
float)x; }
604__device__
static inline float __uint2float_ru(
unsigned int x) {
605 return __ocml_cvtrtp_f32_u32(x);
607__device__
static inline float __uint2float_rz(
unsigned int x) {
608 return __ocml_cvtrtz_f32_u32(x);
611__device__
static inline float __uint_as_float(
unsigned int x) {
612 static_assert(
sizeof(float) ==
sizeof(
unsigned int),
"");
615 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
620__device__
static inline double __ull2double_rd(
unsigned long long int x) {
621 return __ocml_cvtrtn_f64_u64(x);
623__device__
static inline double __ull2double_rn(
unsigned long long int x) {
return (
double)x; }
624__device__
static inline double __ull2double_ru(
unsigned long long int x) {
625 return __ocml_cvtrtp_f64_u64(x);
627__device__
static inline double __ull2double_rz(
unsigned long long int x) {
628 return __ocml_cvtrtz_f64_u64(x);
631__device__
static inline float __ull2float_rd(
unsigned long long int x) {
632 return __ocml_cvtrtn_f32_u64(x);
634__device__
static inline float __ull2float_rn(
unsigned long long int x) {
return (
float)x; }
635__device__
static inline float __ull2float_ru(
unsigned long long int x) {
636 return __ocml_cvtrtp_f32_u64(x);
638__device__
static inline float __ull2float_rz(
unsigned long long int x) {
639 return __ocml_cvtrtz_f32_u64(x);
642#if defined(__clang__) && defined(__HIP__)
645__device__
long long int __clock64();
646__device__
long long int __clock();
647__device__
long long int clock64();
648__device__
long long int clock();
649__device__
long long int wall_clock64();
651__device__
void __named_sync();
653#ifdef __HIP_DEVICE_COMPILE__
659inline __attribute((always_inline))
660long long int __clock64() {
661#if __has_builtin(__builtin_amdgcn_s_memtime)
663 return (
long long int) __builtin_amdgcn_s_memtime();
666 return (
long long int) __builtin_readcyclecounter();
671inline __attribute((always_inline))
672long long int __clock() {
return __clock64(); }
677inline __attribute__((always_inline))
678long long int wall_clock64() {
679 return (
long long int) __ockl_steadyctr_u64();
683inline __attribute__((always_inline))
684long long int clock64() {
return __clock64(); }
687inline __attribute__((always_inline))
688long long int clock() {
return __clock(); }
693void __named_sync() { __builtin_amdgcn_s_barrier(); }
700int __all(
int predicate) {
701 return __ockl_wfall_i32(predicate);
706int __any(
int predicate) {
707 return __ockl_wfany_i32(predicate);
715unsigned long long int __ballot(
int predicate) {
716 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
721unsigned long long int __ballot64(
int predicate) {
722 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
728uint64_t __lanemask_gt()
730 uint32_t lane = __ockl_lane_u32();
733 uint64_t ballot = __ballot64(1);
734 uint64_t mask = (~((uint64_t)0)) << (lane + 1);
735 return mask & ballot;
740uint64_t __lanemask_lt()
742 uint32_t lane = __ockl_lane_u32();
743 int64_t ballot = __ballot64(1);
744 uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
745 return mask & ballot;
750uint64_t __lanemask_eq()
752 uint32_t lane = __ockl_lane_u32();
753 int64_t mask = ((uint64_t)1 << lane);
758__device__
inline void* __local_to_generic(
void* p) {
return p; }
760#ifdef __HIP_DEVICE_COMPILE__
763void* __get_dynamicgroupbaseptr()
766 return (
char*)__local_to_generic((
void*)__to_local(__builtin_amdgcn_groupstaticsize()));
770void* __get_dynamicgroupbaseptr();
775void *__amdgcn_get_dynamicgroupbaseptr() {
776 return __get_dynamicgroupbaseptr();
782static void __threadfence()
784 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST,
"agent");
789static void __threadfence_block()
791 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST,
"workgroup");
796static void __threadfence_system()
798 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST,
"");
806 return __builtin_trap();
814#if defined(_WIN32) || defined(_WIN64)
815extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
816void _wassert(
const wchar_t *_msg,
const wchar_t *_file,
unsigned _line) {
821extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
822void __assert_fail(
const char *assertion,
825 const char *function)
827 const char fmt[] =
"%s:%u: %s: Device-side assertion `%s' failed.\n";
839#define __hip_get_string_length(LEN, STR) \
841 const char *tmp = STR; \
846 auto msg = __ockl_fprintf_stderr_begin();
848 __hip_get_string_length(len, fmt);
849 msg = __ockl_fprintf_append_string_n(msg, fmt, len, 0);
850 __hip_get_string_length(len, file);
851 msg = __ockl_fprintf_append_string_n(msg, file, len, 0);
852 msg = __ockl_fprintf_append_args(msg, 1, line, 0, 0, 0, 0, 0, 0, 0);
853 __hip_get_string_length(len, function);
854 msg = __ockl_fprintf_append_string_n(msg, function, len, 0);
855 __hip_get_string_length(len, assertion);
856 __ockl_fprintf_append_string_n(msg, assertion, len, 1);
858#undef __hip_get_string_length
863extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
871__device__
inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
873 __builtin_amdgcn_fence(__ATOMIC_RELEASE,
"workgroup");
874 __builtin_amdgcn_s_barrier();
875 __builtin_amdgcn_fence(__ATOMIC_ACQUIRE,
"workgroup");
877 __builtin_amdgcn_s_barrier();
883static void __barrier(
int n)
885 __work_group_barrier((__cl_mem_fence_flags)n);
890__attribute__((convergent))
893 __barrier(__CLK_LOCAL_MEM_FENCE);
898__attribute__((convergent))
899int __syncthreads_count(
int predicate)
901 return __ockl_wgred_add_i32(!!predicate);
906__attribute__((convergent))
907int __syncthreads_and(
int predicate)
909 return __ockl_wgred_and_i32(!!predicate);
914__attribute__((convergent))
915int __syncthreads_or(
int predicate)
917 return __ockl_wgred_or_i32(!!predicate);
949#if (defined (__GFX10__) || defined (__GFX11__))
955#if (defined(__GFX10__) || defined(__GFX11__))
956 #define HW_ID_WGP_ID_SIZE 4
957 #define HW_ID_WGP_ID_OFFSET 10
959 #define HW_ID_CU_ID_SIZE 4
960 #define HW_ID_CU_ID_OFFSET 8
963#if (defined(__gfx908__) || defined(__gfx90a__) || \
965 #define HW_ID_SE_ID_SIZE 3
967 #define HW_ID_SE_ID_SIZE 2
969#if (defined(__GFX10__) || defined(__GFX11__))
970 #define HW_ID_SE_ID_OFFSET 18
971 #define HW_ID_SA_ID_OFFSET 16
972 #define HW_ID_SA_ID_SIZE 1
974 #define HW_ID_SE_ID_OFFSET 13
977#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
979 #define XCC_ID_XCC_ID_SIZE 4
980 #define XCC_ID_XCC_ID_OFFSET 0
983#if (!defined(__HIP_NO_IMAGE_SUPPORT) && \
984 (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)))
985 #define __HIP_NO_IMAGE_SUPPORT 1
995#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
1005unsigned __smid(
void)
1007 unsigned se_id = __builtin_amdgcn_s_getreg(
1008 GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
1009 #if (defined(__GFX10__) || defined(__GFX11__))
1010 unsigned wgp_id = __builtin_amdgcn_s_getreg(
1011 GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
1012 unsigned sa_id = __builtin_amdgcn_s_getreg(
1013 GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
1015 #if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
1016 unsigned xcc_id = __builtin_amdgcn_s_getreg(
1017 GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
1019 unsigned cu_id = __builtin_amdgcn_s_getreg(
1020 GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
1022 #if (defined(__GFX10__) || defined(__GFX11__))
1023 unsigned temp = se_id;
1024 temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
1025 temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
1028 #elif (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
1029 unsigned temp = xcc_id;
1030 temp = (temp << HW_ID_SE_ID_SIZE) | se_id;
1031 temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
1034 return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
1042#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
1043#define HIP_DYNAMIC_SHARED_ATTRIBUTE
1049static inline __device__
void* __hip_hc_memcpy(
void* dst,
const void* src,
size_t size) {
1050 auto dstPtr =
static_cast<unsigned char*
>(dst);
1051 auto srcPtr =
static_cast<const unsigned char*
>(src);
1053 while (size >= 4u) {
1054 dstPtr[0] = srcPtr[0];
1055 dstPtr[1] = srcPtr[1];
1056 dstPtr[2] = srcPtr[2];
1057 dstPtr[3] = srcPtr[3];
1065 dstPtr[2] = srcPtr[2];
1067 dstPtr[1] = srcPtr[1];
1069 dstPtr[0] = srcPtr[0];
1075static inline __device__
void* __hip_hc_memset(
void* dst,
unsigned char val,
size_t size) {
1076 auto dstPtr =
static_cast<unsigned char*
>(dst);
1078 while (size >= 4u) {
1098#ifndef __OPENMP_AMDGCN__
1099static inline __device__
void* memcpy(
void* dst,
const void* src,
size_t size) {
1100 return __hip_hc_memcpy(dst, src, size);
1103static inline __device__
void* memset(
void* ptr,
int val,
size_t size) {
1104 unsigned char val8 =
static_cast<unsigned char>(val);
1105 return __hip_hc_memset(ptr, val8, size);
Contains declarations for types and functions in device library. Uses int64_t and uint64_t instead of...
Definition amd_device_functions.h:235
Definition amd_device_functions.h:242
Definition amd_hip_vector_types.h:1623