Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
14 | pmbaty | 1 | /*===---- __clang_hip_math.h - Device-side HIP math support ----------------=== |
2 | * |
||
3 | * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
||
4 | * See https://llvm.org/LICENSE.txt for license information. |
||
5 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
||
6 | * |
||
7 | *===-----------------------------------------------------------------------=== |
||
8 | */ |
||
9 | #ifndef __CLANG_HIP_MATH_H__ |
||
10 | #define __CLANG_HIP_MATH_H__ |
||
11 | |||
12 | #if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__) |
||
13 | #error "This file is for HIP and OpenMP AMDGCN device compilation only." |
||
14 | #endif |
||
15 | |||
16 | #if !defined(__HIPCC_RTC__) |
||
17 | #if defined(__cplusplus) |
||
18 | #include <algorithm> |
||
19 | #endif |
||
20 | #include <limits.h> |
||
21 | #include <stdint.h> |
||
22 | #ifdef __OPENMP_AMDGCN__ |
||
23 | #include <omp.h> |
||
24 | #endif |
||
25 | #endif // !defined(__HIPCC_RTC__) |
||
26 | |||
27 | #pragma push_macro("__DEVICE__") |
||
28 | |||
29 | #ifdef __OPENMP_AMDGCN__ |
||
30 | #define __DEVICE__ static inline __attribute__((always_inline, nothrow)) |
||
31 | #else |
||
32 | #define __DEVICE__ static __device__ inline __attribute__((always_inline)) |
||
33 | #endif |
||
34 | |||
35 | // A few functions return bool type starting only in C++11. |
||
36 | #pragma push_macro("__RETURN_TYPE") |
||
37 | #ifdef __OPENMP_AMDGCN__ |
||
38 | #define __RETURN_TYPE int |
||
39 | #else |
||
40 | #if defined(__cplusplus) |
||
41 | #define __RETURN_TYPE bool |
||
42 | #else |
||
43 | #define __RETURN_TYPE int |
||
44 | #endif |
||
45 | #endif // __OPENMP_AMDGCN__ |
||
46 | |||
47 | #if defined (__cplusplus) && __cplusplus < 201103L |
||
48 | // emulate static_assert on type sizes |
||
49 | template<bool> |
||
50 | struct __compare_result{}; |
||
51 | template<> |
||
52 | struct __compare_result<true> { |
||
53 | static const __device__ bool valid; |
||
54 | }; |
||
55 | |||
56 | __DEVICE__ |
||
57 | void __suppress_unused_warning(bool b){}; |
||
58 | template <unsigned int S, unsigned int T> |
||
59 | __DEVICE__ void __static_assert_equal_size() { |
||
60 | __suppress_unused_warning(__compare_result<S == T>::valid); |
||
61 | } |
||
62 | |||
63 | #define __static_assert_type_size_equal(A, B) \ |
||
64 | __static_assert_equal_size<A,B>() |
||
65 | |||
66 | #else |
||
67 | #define __static_assert_type_size_equal(A,B) \ |
||
68 | static_assert((A) == (B), "") |
||
69 | |||
70 | #endif |
||
71 | |||
72 | __DEVICE__ |
||
73 | uint64_t __make_mantissa_base8(const char *__tagp __attribute__((nonnull))) { |
||
74 | uint64_t __r = 0; |
||
75 | while (*__tagp != '\0') { |
||
76 | char __tmp = *__tagp; |
||
77 | |||
78 | if (__tmp >= '0' && __tmp <= '7') |
||
79 | __r = (__r * 8u) + __tmp - '0'; |
||
80 | else |
||
81 | return 0; |
||
82 | |||
83 | ++__tagp; |
||
84 | } |
||
85 | |||
86 | return __r; |
||
87 | } |
||
88 | |||
89 | __DEVICE__ |
||
90 | uint64_t __make_mantissa_base10(const char *__tagp __attribute__((nonnull))) { |
||
91 | uint64_t __r = 0; |
||
92 | while (*__tagp != '\0') { |
||
93 | char __tmp = *__tagp; |
||
94 | |||
95 | if (__tmp >= '0' && __tmp <= '9') |
||
96 | __r = (__r * 10u) + __tmp - '0'; |
||
97 | else |
||
98 | return 0; |
||
99 | |||
100 | ++__tagp; |
||
101 | } |
||
102 | |||
103 | return __r; |
||
104 | } |
||
105 | |||
106 | __DEVICE__ |
||
107 | uint64_t __make_mantissa_base16(const char *__tagp __attribute__((nonnull))) { |
||
108 | uint64_t __r = 0; |
||
109 | while (*__tagp != '\0') { |
||
110 | char __tmp = *__tagp; |
||
111 | |||
112 | if (__tmp >= '0' && __tmp <= '9') |
||
113 | __r = (__r * 16u) + __tmp - '0'; |
||
114 | else if (__tmp >= 'a' && __tmp <= 'f') |
||
115 | __r = (__r * 16u) + __tmp - 'a' + 10; |
||
116 | else if (__tmp >= 'A' && __tmp <= 'F') |
||
117 | __r = (__r * 16u) + __tmp - 'A' + 10; |
||
118 | else |
||
119 | return 0; |
||
120 | |||
121 | ++__tagp; |
||
122 | } |
||
123 | |||
124 | return __r; |
||
125 | } |
||
126 | |||
127 | __DEVICE__ |
||
128 | uint64_t __make_mantissa(const char *__tagp __attribute__((nonnull))) { |
||
129 | if (*__tagp == '0') { |
||
130 | ++__tagp; |
||
131 | |||
132 | if (*__tagp == 'x' || *__tagp == 'X') |
||
133 | return __make_mantissa_base16(__tagp); |
||
134 | else |
||
135 | return __make_mantissa_base8(__tagp); |
||
136 | } |
||
137 | |||
138 | return __make_mantissa_base10(__tagp); |
||
139 | } |
||
140 | |||
141 | // BEGIN FLOAT |
||
142 | #if defined(__cplusplus) |
||
143 | __DEVICE__ |
||
144 | int abs(int __x) { |
||
145 | int __sgn = __x >> (sizeof(int) * CHAR_BIT - 1); |
||
146 | return (__x ^ __sgn) - __sgn; |
||
147 | } |
||
148 | __DEVICE__ |
||
149 | long labs(long __x) { |
||
150 | long __sgn = __x >> (sizeof(long) * CHAR_BIT - 1); |
||
151 | return (__x ^ __sgn) - __sgn; |
||
152 | } |
||
153 | __DEVICE__ |
||
154 | long long llabs(long long __x) { |
||
155 | long long __sgn = __x >> (sizeof(long long) * CHAR_BIT - 1); |
||
156 | return (__x ^ __sgn) - __sgn; |
||
157 | } |
||
158 | #endif |
||
159 | |||
160 | __DEVICE__ |
||
161 | float acosf(float __x) { return __ocml_acos_f32(__x); } |
||
162 | |||
163 | __DEVICE__ |
||
164 | float acoshf(float __x) { return __ocml_acosh_f32(__x); } |
||
165 | |||
166 | __DEVICE__ |
||
167 | float asinf(float __x) { return __ocml_asin_f32(__x); } |
||
168 | |||
169 | __DEVICE__ |
||
170 | float asinhf(float __x) { return __ocml_asinh_f32(__x); } |
||
171 | |||
172 | __DEVICE__ |
||
173 | float atan2f(float __x, float __y) { return __ocml_atan2_f32(__x, __y); } |
||
174 | |||
175 | __DEVICE__ |
||
176 | float atanf(float __x) { return __ocml_atan_f32(__x); } |
||
177 | |||
178 | __DEVICE__ |
||
179 | float atanhf(float __x) { return __ocml_atanh_f32(__x); } |
||
180 | |||
181 | __DEVICE__ |
||
182 | float cbrtf(float __x) { return __ocml_cbrt_f32(__x); } |
||
183 | |||
184 | __DEVICE__ |
||
185 | float ceilf(float __x) { return __ocml_ceil_f32(__x); } |
||
186 | |||
187 | __DEVICE__ |
||
188 | float copysignf(float __x, float __y) { return __ocml_copysign_f32(__x, __y); } |
||
189 | |||
190 | __DEVICE__ |
||
191 | float cosf(float __x) { return __ocml_cos_f32(__x); } |
||
192 | |||
193 | __DEVICE__ |
||
194 | float coshf(float __x) { return __ocml_cosh_f32(__x); } |
||
195 | |||
196 | __DEVICE__ |
||
197 | float cospif(float __x) { return __ocml_cospi_f32(__x); } |
||
198 | |||
199 | __DEVICE__ |
||
200 | float cyl_bessel_i0f(float __x) { return __ocml_i0_f32(__x); } |
||
201 | |||
202 | __DEVICE__ |
||
203 | float cyl_bessel_i1f(float __x) { return __ocml_i1_f32(__x); } |
||
204 | |||
205 | __DEVICE__ |
||
206 | float erfcf(float __x) { return __ocml_erfc_f32(__x); } |
||
207 | |||
208 | __DEVICE__ |
||
209 | float erfcinvf(float __x) { return __ocml_erfcinv_f32(__x); } |
||
210 | |||
211 | __DEVICE__ |
||
212 | float erfcxf(float __x) { return __ocml_erfcx_f32(__x); } |
||
213 | |||
214 | __DEVICE__ |
||
215 | float erff(float __x) { return __ocml_erf_f32(__x); } |
||
216 | |||
217 | __DEVICE__ |
||
218 | float erfinvf(float __x) { return __ocml_erfinv_f32(__x); } |
||
219 | |||
220 | __DEVICE__ |
||
221 | float exp10f(float __x) { return __ocml_exp10_f32(__x); } |
||
222 | |||
223 | __DEVICE__ |
||
224 | float exp2f(float __x) { return __ocml_exp2_f32(__x); } |
||
225 | |||
226 | __DEVICE__ |
||
227 | float expf(float __x) { return __ocml_exp_f32(__x); } |
||
228 | |||
229 | __DEVICE__ |
||
230 | float expm1f(float __x) { return __ocml_expm1_f32(__x); } |
||
231 | |||
232 | __DEVICE__ |
||
233 | float fabsf(float __x) { return __builtin_fabsf(__x); } |
||
234 | |||
235 | __DEVICE__ |
||
236 | float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); } |
||
237 | |||
238 | __DEVICE__ |
||
239 | float fdividef(float __x, float __y) { return __x / __y; } |
||
240 | |||
241 | __DEVICE__ |
||
242 | float floorf(float __x) { return __ocml_floor_f32(__x); } |
||
243 | |||
244 | __DEVICE__ |
||
245 | float fmaf(float __x, float __y, float __z) { |
||
246 | return __ocml_fma_f32(__x, __y, __z); |
||
247 | } |
||
248 | |||
249 | __DEVICE__ |
||
250 | float fmaxf(float __x, float __y) { return __ocml_fmax_f32(__x, __y); } |
||
251 | |||
252 | __DEVICE__ |
||
253 | float fminf(float __x, float __y) { return __ocml_fmin_f32(__x, __y); } |
||
254 | |||
255 | __DEVICE__ |
||
256 | float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); } |
||
257 | |||
258 | __DEVICE__ |
||
259 | float frexpf(float __x, int *__nptr) { |
||
260 | int __tmp; |
||
261 | #ifdef __OPENMP_AMDGCN__ |
||
262 | #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) |
||
263 | #endif |
||
264 | float __r = |
||
265 | __ocml_frexp_f32(__x, (__attribute__((address_space(5))) int *)&__tmp); |
||
266 | *__nptr = __tmp; |
||
267 | |||
268 | return __r; |
||
269 | } |
||
270 | |||
271 | __DEVICE__ |
||
272 | float hypotf(float __x, float __y) { return __ocml_hypot_f32(__x, __y); } |
||
273 | |||
274 | __DEVICE__ |
||
275 | int ilogbf(float __x) { return __ocml_ilogb_f32(__x); } |
||
276 | |||
277 | __DEVICE__ |
||
278 | __RETURN_TYPE __finitef(float __x) { return __ocml_isfinite_f32(__x); } |
||
279 | |||
280 | __DEVICE__ |
||
281 | __RETURN_TYPE __isinff(float __x) { return __ocml_isinf_f32(__x); } |
||
282 | |||
283 | __DEVICE__ |
||
284 | __RETURN_TYPE __isnanf(float __x) { return __ocml_isnan_f32(__x); } |
||
285 | |||
286 | __DEVICE__ |
||
287 | float j0f(float __x) { return __ocml_j0_f32(__x); } |
||
288 | |||
289 | __DEVICE__ |
||
290 | float j1f(float __x) { return __ocml_j1_f32(__x); } |
||
291 | |||
292 | __DEVICE__ |
||
293 | float jnf(int __n, float __x) { // TODO: we could use Ahmes multiplication |
||
294 | // and the Miller & Brown algorithm |
||
295 | // for linear recurrences to get O(log n) steps, but it's unclear if |
||
296 | // it'd be beneficial in this case. |
||
297 | if (__n == 0) |
||
298 | return j0f(__x); |
||
299 | if (__n == 1) |
||
300 | return j1f(__x); |
||
301 | |||
302 | float __x0 = j0f(__x); |
||
303 | float __x1 = j1f(__x); |
||
304 | for (int __i = 1; __i < __n; ++__i) { |
||
305 | float __x2 = (2 * __i) / __x * __x1 - __x0; |
||
306 | __x0 = __x1; |
||
307 | __x1 = __x2; |
||
308 | } |
||
309 | |||
310 | return __x1; |
||
311 | } |
||
312 | |||
313 | __DEVICE__ |
||
314 | float ldexpf(float __x, int __e) { return __ocml_ldexp_f32(__x, __e); } |
||
315 | |||
316 | __DEVICE__ |
||
317 | float lgammaf(float __x) { return __ocml_lgamma_f32(__x); } |
||
318 | |||
319 | __DEVICE__ |
||
320 | long long int llrintf(float __x) { return __ocml_rint_f32(__x); } |
||
321 | |||
322 | __DEVICE__ |
||
323 | long long int llroundf(float __x) { return __ocml_round_f32(__x); } |
||
324 | |||
325 | __DEVICE__ |
||
326 | float log10f(float __x) { return __ocml_log10_f32(__x); } |
||
327 | |||
328 | __DEVICE__ |
||
329 | float log1pf(float __x) { return __ocml_log1p_f32(__x); } |
||
330 | |||
331 | __DEVICE__ |
||
332 | float log2f(float __x) { return __ocml_log2_f32(__x); } |
||
333 | |||
334 | __DEVICE__ |
||
335 | float logbf(float __x) { return __ocml_logb_f32(__x); } |
||
336 | |||
337 | __DEVICE__ |
||
338 | float logf(float __x) { return __ocml_log_f32(__x); } |
||
339 | |||
340 | __DEVICE__ |
||
341 | long int lrintf(float __x) { return __ocml_rint_f32(__x); } |
||
342 | |||
343 | __DEVICE__ |
||
344 | long int lroundf(float __x) { return __ocml_round_f32(__x); } |
||
345 | |||
346 | __DEVICE__ |
||
347 | float modff(float __x, float *__iptr) { |
||
348 | float __tmp; |
||
349 | #ifdef __OPENMP_AMDGCN__ |
||
350 | #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) |
||
351 | #endif |
||
352 | float __r = |
||
353 | __ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); |
||
354 | *__iptr = __tmp; |
||
355 | return __r; |
||
356 | } |
||
357 | |||
358 | __DEVICE__ |
||
359 | float nanf(const char *__tagp __attribute__((nonnull))) { |
||
360 | union { |
||
361 | float val; |
||
362 | struct ieee_float { |
||
363 | unsigned int mantissa : 22; |
||
364 | unsigned int quiet : 1; |
||
365 | unsigned int exponent : 8; |
||
366 | unsigned int sign : 1; |
||
367 | } bits; |
||
368 | } __tmp; |
||
369 | __static_assert_type_size_equal(sizeof(__tmp.val), sizeof(__tmp.bits)); |
||
370 | |||
371 | __tmp.bits.sign = 0u; |
||
372 | __tmp.bits.exponent = ~0u; |
||
373 | __tmp.bits.quiet = 1u; |
||
374 | __tmp.bits.mantissa = __make_mantissa(__tagp); |
||
375 | |||
376 | return __tmp.val; |
||
377 | } |
||
378 | |||
379 | __DEVICE__ |
||
380 | float nearbyintf(float __x) { return __ocml_nearbyint_f32(__x); } |
||
381 | |||
382 | __DEVICE__ |
||
383 | float nextafterf(float __x, float __y) { |
||
384 | return __ocml_nextafter_f32(__x, __y); |
||
385 | } |
||
386 | |||
387 | __DEVICE__ |
||
388 | float norm3df(float __x, float __y, float __z) { |
||
389 | return __ocml_len3_f32(__x, __y, __z); |
||
390 | } |
||
391 | |||
392 | __DEVICE__ |
||
393 | float norm4df(float __x, float __y, float __z, float __w) { |
||
394 | return __ocml_len4_f32(__x, __y, __z, __w); |
||
395 | } |
||
396 | |||
397 | __DEVICE__ |
||
398 | float normcdff(float __x) { return __ocml_ncdf_f32(__x); } |
||
399 | |||
400 | __DEVICE__ |
||
401 | float normcdfinvf(float __x) { return __ocml_ncdfinv_f32(__x); } |
||
402 | |||
403 | __DEVICE__ |
||
404 | float normf(int __dim, |
||
405 | const float *__a) { // TODO: placeholder until OCML adds support. |
||
406 | float __r = 0; |
||
407 | while (__dim--) { |
||
408 | __r += __a[0] * __a[0]; |
||
409 | ++__a; |
||
410 | } |
||
411 | |||
412 | return __ocml_sqrt_f32(__r); |
||
413 | } |
||
414 | |||
415 | __DEVICE__ |
||
416 | float powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); } |
||
417 | |||
418 | __DEVICE__ |
||
419 | float powif(float __x, int __y) { return __ocml_pown_f32(__x, __y); } |
||
420 | |||
421 | __DEVICE__ |
||
422 | float rcbrtf(float __x) { return __ocml_rcbrt_f32(__x); } |
||
423 | |||
424 | __DEVICE__ |
||
425 | float remainderf(float __x, float __y) { |
||
426 | return __ocml_remainder_f32(__x, __y); |
||
427 | } |
||
428 | |||
429 | __DEVICE__ |
||
430 | float remquof(float __x, float __y, int *__quo) { |
||
431 | int __tmp; |
||
432 | #ifdef __OPENMP_AMDGCN__ |
||
433 | #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) |
||
434 | #endif |
||
435 | float __r = __ocml_remquo_f32( |
||
436 | __x, __y, (__attribute__((address_space(5))) int *)&__tmp); |
||
437 | *__quo = __tmp; |
||
438 | |||
439 | return __r; |
||
440 | } |
||
441 | |||
442 | __DEVICE__ |
||
443 | float rhypotf(float __x, float __y) { return __ocml_rhypot_f32(__x, __y); } |
||
444 | |||
445 | __DEVICE__ |
||
446 | float rintf(float __x) { return __ocml_rint_f32(__x); } |
||
447 | |||
448 | __DEVICE__ |
||
449 | float rnorm3df(float __x, float __y, float __z) { |
||
450 | return __ocml_rlen3_f32(__x, __y, __z); |
||
451 | } |
||
452 | |||
453 | __DEVICE__ |
||
454 | float rnorm4df(float __x, float __y, float __z, float __w) { |
||
455 | return __ocml_rlen4_f32(__x, __y, __z, __w); |
||
456 | } |
||
457 | |||
458 | __DEVICE__ |
||
459 | float rnormf(int __dim, |
||
460 | const float *__a) { // TODO: placeholder until OCML adds support. |
||
461 | float __r = 0; |
||
462 | while (__dim--) { |
||
463 | __r += __a[0] * __a[0]; |
||
464 | ++__a; |
||
465 | } |
||
466 | |||
467 | return __ocml_rsqrt_f32(__r); |
||
468 | } |
||
469 | |||
470 | __DEVICE__ |
||
471 | float roundf(float __x) { return __ocml_round_f32(__x); } |
||
472 | |||
473 | __DEVICE__ |
||
474 | float rsqrtf(float __x) { return __ocml_rsqrt_f32(__x); } |
||
475 | |||
476 | __DEVICE__ |
||
477 | float scalblnf(float __x, long int __n) { |
||
478 | return (__n < INT_MAX) ? __ocml_scalbn_f32(__x, __n) |
||
479 | : __ocml_scalb_f32(__x, __n); |
||
480 | } |
||
481 | |||
482 | __DEVICE__ |
||
483 | float scalbnf(float __x, int __n) { return __ocml_scalbn_f32(__x, __n); } |
||
484 | |||
485 | __DEVICE__ |
||
486 | __RETURN_TYPE __signbitf(float __x) { return __ocml_signbit_f32(__x); } |
||
487 | |||
488 | __DEVICE__ |
||
489 | void sincosf(float __x, float *__sinptr, float *__cosptr) { |
||
490 | float __tmp; |
||
491 | #ifdef __OPENMP_AMDGCN__ |
||
492 | #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) |
||
493 | #endif |
||
494 | *__sinptr = |
||
495 | __ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); |
||
496 | *__cosptr = __tmp; |
||
497 | } |
||
498 | |||
499 | __DEVICE__ |
||
500 | void sincospif(float __x, float *__sinptr, float *__cosptr) { |
||
501 | float __tmp; |
||
502 | #ifdef __OPENMP_AMDGCN__ |
||
503 | #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) |
||
504 | #endif |
||
505 | *__sinptr = __ocml_sincospi_f32( |
||
506 | __x, (__attribute__((address_space(5))) float *)&__tmp); |
||
507 | *__cosptr = __tmp; |
||
508 | } |
||
509 | |||
510 | __DEVICE__ |
||
511 | float sinf(float __x) { return __ocml_sin_f32(__x); } |
||
512 | |||
513 | __DEVICE__ |
||
514 | float sinhf(float __x) { return __ocml_sinh_f32(__x); } |
||
515 | |||
516 | __DEVICE__ |
||
517 | float sinpif(float __x) { return __ocml_sinpi_f32(__x); } |
||
518 | |||
519 | __DEVICE__ |
||
520 | float sqrtf(float __x) { return __ocml_sqrt_f32(__x); } |
||
521 | |||
522 | __DEVICE__ |
||
523 | float tanf(float __x) { return __ocml_tan_f32(__x); } |
||
524 | |||
525 | __DEVICE__ |
||
526 | float tanhf(float __x) { return __ocml_tanh_f32(__x); } |
||
527 | |||
528 | __DEVICE__ |
||
529 | float tgammaf(float __x) { return __ocml_tgamma_f32(__x); } |
||
530 | |||
531 | __DEVICE__ |
||
532 | float truncf(float __x) { return __ocml_trunc_f32(__x); } |
||
533 | |||
534 | __DEVICE__ |
||
535 | float y0f(float __x) { return __ocml_y0_f32(__x); } |
||
536 | |||
537 | __DEVICE__ |
||
538 | float y1f(float __x) { return __ocml_y1_f32(__x); } |
||
539 | |||
540 | __DEVICE__ |
||
541 | float ynf(int __n, float __x) { // TODO: we could use Ahmes multiplication |
||
542 | // and the Miller & Brown algorithm |
||
543 | // for linear recurrences to get O(log n) steps, but it's unclear if |
||
544 | // it'd be beneficial in this case. Placeholder until OCML adds |
||
545 | // support. |
||
546 | if (__n == 0) |
||
547 | return y0f(__x); |
||
548 | if (__n == 1) |
||
549 | return y1f(__x); |
||
550 | |||
551 | float __x0 = y0f(__x); |
||
552 | float __x1 = y1f(__x); |
||
553 | for (int __i = 1; __i < __n; ++__i) { |
||
554 | float __x2 = (2 * __i) / __x * __x1 - __x0; |
||
555 | __x0 = __x1; |
||
556 | __x1 = __x2; |
||
557 | } |
||
558 | |||
559 | return __x1; |
||
560 | } |
||
561 | |||
562 | // BEGIN INTRINSICS |
||
563 | |||
564 | __DEVICE__ |
||
565 | float __cosf(float __x) { return __ocml_native_cos_f32(__x); } |
||
566 | |||
567 | __DEVICE__ |
||
568 | float __exp10f(float __x) { return __ocml_native_exp10_f32(__x); } |
||
569 | |||
570 | __DEVICE__ |
||
571 | float __expf(float __x) { return __ocml_native_exp_f32(__x); } |
||
572 | |||
573 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
||
574 | __DEVICE__ |
||
575 | float __fadd_rd(float __x, float __y) { return __ocml_add_rtn_f32(__x, __y); } |
||
576 | __DEVICE__ |
||
577 | float __fadd_rn(float __x, float __y) { return __ocml_add_rte_f32(__x, __y); } |
||
578 | __DEVICE__ |
||
579 | float __fadd_ru(float __x, float __y) { return __ocml_add_rtp_f32(__x, __y); } |
||
580 | __DEVICE__ |
||
581 | float __fadd_rz(float __x, float __y) { return __ocml_add_rtz_f32(__x, __y); } |
||
582 | #else |
||
583 | __DEVICE__ |
||
584 | float __fadd_rn(float __x, float __y) { return __x + __y; } |
||
585 | #endif |
||
586 | |||
587 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
||
588 | __DEVICE__ |
||
589 | float __fdiv_rd(float __x, float __y) { return __ocml_div_rtn_f32(__x, __y); } |
||
590 | __DEVICE__ |
||
591 | float __fdiv_rn(float __x, float __y) { return __ocml_div_rte_f32(__x, __y); } |
||
592 | __DEVICE__ |
||
593 | float __fdiv_ru(float __x, float __y) { return __ocml_div_rtp_f32(__x, __y); } |
||
594 | __DEVICE__ |
||
595 | float __fdiv_rz(float __x, float __y) { return __ocml_div_rtz_f32(__x, __y); } |
||
596 | #else |
||
597 | __DEVICE__ |
||
598 | float __fdiv_rn(float __x, float __y) { return __x / __y; } |
||
599 | #endif |
||
600 | |||
601 | __DEVICE__ |
||
602 | float __fdividef(float __x, float __y) { return __x / __y; } |
||
603 | |||
604 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
||
605 | __DEVICE__ |
||
606 | float __fmaf_rd(float __x, float __y, float __z) { |
||
607 | return __ocml_fma_rtn_f32(__x, __y, __z); |
||
608 | } |
||
609 | __DEVICE__ |
||
610 | float __fmaf_rn(float __x, float __y, float __z) { |
||
611 | return __ocml_fma_rte_f32(__x, __y, __z); |
||
612 | } |
||
613 | __DEVICE__ |
||
614 | float __fmaf_ru(float __x, float __y, float __z) { |
||
615 | return __ocml_fma_rtp_f32(__x, __y, __z); |
||
616 | } |
||
617 | __DEVICE__ |
||
618 | float __fmaf_rz(float __x, float __y, float __z) { |
||
619 | return __ocml_fma_rtz_f32(__x, __y, __z); |
||
620 | } |
||
621 | #else |
||
622 | __DEVICE__ |
||
623 | float __fmaf_rn(float __x, float __y, float __z) { |
||
624 | return __ocml_fma_f32(__x, __y, __z); |
||
625 | } |
||
626 | #endif |
||
627 | |||
628 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
||
629 | __DEVICE__ |
||
630 | float __fmul_rd(float __x, float __y) { return __ocml_mul_rtn_f32(__x, __y); } |
||
631 | __DEVICE__ |
||
632 | float __fmul_rn(float __x, float __y) { return __ocml_mul_rte_f32(__x, __y); } |
||
633 | __DEVICE__ |
||
634 | float __fmul_ru(float __x, float __y) { return __ocml_mul_rtp_f32(__x, __y); } |
||
635 | __DEVICE__ |
||
636 | float __fmul_rz(float __x, float __y) { return __ocml_mul_rtz_f32(__x, __y); } |
||
637 | #else |
||
638 | __DEVICE__ |
||
639 | float __fmul_rn(float __x, float __y) { return __x * __y; } |
||
640 | #endif |
||
641 | |||
642 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
||
643 | __DEVICE__ |
||
644 | float __frcp_rd(float __x) { return __ocml_div_rtn_f32(1.0f, __x); } |
||
645 | __DEVICE__ |
||
646 | float __frcp_rn(float __x) { return __ocml_div_rte_f32(1.0f, __x); } |
||
647 | __DEVICE__ |
||
648 | float __frcp_ru(float __x) { return __ocml_div_rtp_f32(1.0f, __x); } |
||
649 | __DEVICE__ |
||
650 | float __frcp_rz(float __x) { return __ocml_div_rtz_f32(1.0f, __x); } |
||
651 | #else |
||
652 | __DEVICE__ |
||
653 | float __frcp_rn(float __x) { return 1.0f / __x; } |
||
654 | #endif |
||
655 | |||
656 | __DEVICE__ |
||
657 | float __frsqrt_rn(float __x) { return __llvm_amdgcn_rsq_f32(__x); } |
||
658 | |||
659 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
||
660 | __DEVICE__ |
||
661 | float __fsqrt_rd(float __x) { return __ocml_sqrt_rtn_f32(__x); } |
||
662 | __DEVICE__ |
||
663 | float __fsqrt_rn(float __x) { return __ocml_sqrt_rte_f32(__x); } |
||
664 | __DEVICE__ |
||
665 | float __fsqrt_ru(float __x) { return __ocml_sqrt_rtp_f32(__x); } |
||
666 | __DEVICE__ |
||
667 | float __fsqrt_rz(float __x) { return __ocml_sqrt_rtz_f32(__x); } |
||
668 | #else |
||
669 | __DEVICE__ |
||
670 | float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); } |
||
671 | #endif |
||
672 | |||
673 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
||
674 | __DEVICE__ |
||
675 | float __fsub_rd(float __x, float __y) { return __ocml_sub_rtn_f32(__x, __y); } |
||
676 | __DEVICE__ |
||
677 | float __fsub_rn(float __x, float __y) { return __ocml_sub_rte_f32(__x, __y); } |
||
678 | __DEVICE__ |
||
679 | float __fsub_ru(float __x, float __y) { return __ocml_sub_rtp_f32(__x, __y); } |
||
680 | __DEVICE__ |
||
681 | float __fsub_rz(float __x, float __y) { return __ocml_sub_rtz_f32(__x, __y); } |
||
682 | #else |
||
683 | __DEVICE__ |
||
684 | float __fsub_rn(float __x, float __y) { return __x - __y; } |
||
685 | #endif |
||
686 | |||
687 | __DEVICE__ |
||
688 | float __log10f(float __x) { return __ocml_native_log10_f32(__x); } |
||
689 | |||
690 | __DEVICE__ |
||
691 | float __log2f(float __x) { return __ocml_native_log2_f32(__x); } |
||
692 | |||
693 | __DEVICE__ |
||
694 | float __logf(float __x) { return __ocml_native_log_f32(__x); } |
||
695 | |||
696 | __DEVICE__ |
||
697 | float __powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); } |
||
698 | |||
699 | __DEVICE__ |
||
700 | float __saturatef(float __x) { return (__x < 0) ? 0 : ((__x > 1) ? 1 : __x); } |
||
701 | |||
702 | __DEVICE__ |
||
703 | void __sincosf(float __x, float *__sinptr, float *__cosptr) { |
||
704 | *__sinptr = __ocml_native_sin_f32(__x); |
||
705 | *__cosptr = __ocml_native_cos_f32(__x); |
||
706 | } |
||
707 | |||
708 | __DEVICE__ |
||
709 | float __sinf(float __x) { return __ocml_native_sin_f32(__x); } |
||
710 | |||
711 | __DEVICE__ |
||
712 | float __tanf(float __x) { return __ocml_tan_f32(__x); } |
||
713 | // END INTRINSICS |
||
714 | // END FLOAT |
||
715 | |||
716 | // BEGIN DOUBLE |
||
717 | __DEVICE__ |
||
718 | double acos(double __x) { return __ocml_acos_f64(__x); } |
||
719 | |||
720 | __DEVICE__ |
||
721 | double acosh(double __x) { return __ocml_acosh_f64(__x); } |
||
722 | |||
723 | __DEVICE__ |
||
724 | double asin(double __x) { return __ocml_asin_f64(__x); } |
||
725 | |||
726 | __DEVICE__ |
||
727 | double asinh(double __x) { return __ocml_asinh_f64(__x); } |
||
728 | |||
729 | __DEVICE__ |
||
730 | double atan(double __x) { return __ocml_atan_f64(__x); } |
||
731 | |||
732 | __DEVICE__ |
||
733 | double atan2(double __x, double __y) { return __ocml_atan2_f64(__x, __y); } |
||
734 | |||
735 | __DEVICE__ |
||
736 | double atanh(double __x) { return __ocml_atanh_f64(__x); } |
||
737 | |||
738 | __DEVICE__ |
||
739 | double cbrt(double __x) { return __ocml_cbrt_f64(__x); } |
||
740 | |||
741 | __DEVICE__ |
||
742 | double ceil(double __x) { return __ocml_ceil_f64(__x); } |
||
743 | |||
744 | __DEVICE__ |
||
745 | double copysign(double __x, double __y) { |
||
746 | return __ocml_copysign_f64(__x, __y); |
||
747 | } |
||
748 | |||
749 | __DEVICE__ |
||
750 | double cos(double __x) { return __ocml_cos_f64(__x); } |
||
751 | |||
752 | __DEVICE__ |
||
753 | double cosh(double __x) { return __ocml_cosh_f64(__x); } |
||
754 | |||
755 | __DEVICE__ |
||
756 | double cospi(double __x) { return __ocml_cospi_f64(__x); } |
||
757 | |||
758 | __DEVICE__ |
||
759 | double cyl_bessel_i0(double __x) { return __ocml_i0_f64(__x); } |
||
760 | |||
761 | __DEVICE__ |
||
762 | double cyl_bessel_i1(double __x) { return __ocml_i1_f64(__x); } |
||
763 | |||
764 | __DEVICE__ |
||
765 | double erf(double __x) { return __ocml_erf_f64(__x); } |
||
766 | |||
767 | __DEVICE__ |
||
768 | double erfc(double __x) { return __ocml_erfc_f64(__x); } |
||
769 | |||
770 | __DEVICE__ |
||
771 | double erfcinv(double __x) { return __ocml_erfcinv_f64(__x); } |
||
772 | |||
773 | __DEVICE__ |
||
774 | double erfcx(double __x) { return __ocml_erfcx_f64(__x); } |
||
775 | |||
776 | __DEVICE__ |
||
777 | double erfinv(double __x) { return __ocml_erfinv_f64(__x); } |
||
778 | |||
779 | __DEVICE__ |
||
780 | double exp(double __x) { return __ocml_exp_f64(__x); } |
||
781 | |||
782 | __DEVICE__ |
||
783 | double exp10(double __x) { return __ocml_exp10_f64(__x); } |
||
784 | |||
785 | __DEVICE__ |
||
786 | double exp2(double __x) { return __ocml_exp2_f64(__x); } |
||
787 | |||
788 | __DEVICE__ |
||
789 | double expm1(double __x) { return __ocml_expm1_f64(__x); } |
||
790 | |||
791 | __DEVICE__ |
||
792 | double fabs(double __x) { return __builtin_fabs(__x); } |
||
793 | |||
794 | __DEVICE__ |
||
795 | double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); } |
||
796 | |||
797 | __DEVICE__ |
||
798 | double floor(double __x) { return __ocml_floor_f64(__x); } |
||
799 | |||
800 | __DEVICE__ |
||
801 | double fma(double __x, double __y, double __z) { |
||
802 | return __ocml_fma_f64(__x, __y, __z); |
||
803 | } |
||
804 | |||
805 | __DEVICE__ |
||
806 | double fmax(double __x, double __y) { return __ocml_fmax_f64(__x, __y); } |
||
807 | |||
808 | __DEVICE__ |
||
809 | double fmin(double __x, double __y) { return __ocml_fmin_f64(__x, __y); } |
||
810 | |||
811 | __DEVICE__ |
||
812 | double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); } |
||
813 | |||
814 | __DEVICE__ |
||
815 | double frexp(double __x, int *__nptr) { |
||
816 | int __tmp; |
||
817 | #ifdef __OPENMP_AMDGCN__ |
||
818 | #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) |
||
819 | #endif |
||
820 | double __r = |
||
821 | __ocml_frexp_f64(__x, (__attribute__((address_space(5))) int *)&__tmp); |
||
822 | *__nptr = __tmp; |
||
823 | return __r; |
||
824 | } |
||
825 | |||
826 | __DEVICE__ |
||
827 | double hypot(double __x, double __y) { return __ocml_hypot_f64(__x, __y); } |
||
828 | |||
829 | __DEVICE__ |
||
830 | int ilogb(double __x) { return __ocml_ilogb_f64(__x); } |
||
831 | |||
832 | __DEVICE__ |
||
833 | __RETURN_TYPE __finite(double __x) { return __ocml_isfinite_f64(__x); } |
||
834 | |||
835 | __DEVICE__ |
||
836 | __RETURN_TYPE __isinf(double __x) { return __ocml_isinf_f64(__x); } |
||
837 | |||
838 | __DEVICE__ |
||
839 | __RETURN_TYPE __isnan(double __x) { return __ocml_isnan_f64(__x); } |
||
840 | |||
841 | __DEVICE__ |
||
842 | double j0(double __x) { return __ocml_j0_f64(__x); } |
||
843 | |||
844 | __DEVICE__ |
||
845 | double j1(double __x) { return __ocml_j1_f64(__x); } |
||
846 | |||
847 | __DEVICE__ |
||
848 | double jn(int __n, double __x) { // TODO: we could use Ahmes multiplication |
||
849 | // and the Miller & Brown algorithm |
||
850 | // for linear recurrences to get O(log n) steps, but it's unclear if |
||
851 | // it'd be beneficial in this case. Placeholder until OCML adds |
||
852 | // support. |
||
853 | if (__n == 0) |
||
854 | return j0(__x); |
||
855 | if (__n == 1) |
||
856 | return j1(__x); |
||
857 | |||
858 | double __x0 = j0(__x); |
||
859 | double __x1 = j1(__x); |
||
860 | for (int __i = 1; __i < __n; ++__i) { |
||
861 | double __x2 = (2 * __i) / __x * __x1 - __x0; |
||
862 | __x0 = __x1; |
||
863 | __x1 = __x2; |
||
864 | } |
||
865 | return __x1; |
||
866 | } |
||
867 | |||
868 | __DEVICE__ |
||
869 | double ldexp(double __x, int __e) { return __ocml_ldexp_f64(__x, __e); } |
||
870 | |||
871 | __DEVICE__ |
||
872 | double lgamma(double __x) { return __ocml_lgamma_f64(__x); } |
||
873 | |||
874 | __DEVICE__ |
||
875 | long long int llrint(double __x) { return __ocml_rint_f64(__x); } |
||
876 | |||
877 | __DEVICE__ |
||
878 | long long int llround(double __x) { return __ocml_round_f64(__x); } |
||
879 | |||
880 | __DEVICE__ |
||
881 | double log(double __x) { return __ocml_log_f64(__x); } |
||
882 | |||
883 | __DEVICE__ |
||
884 | double log10(double __x) { return __ocml_log10_f64(__x); } |
||
885 | |||
886 | __DEVICE__ |
||
887 | double log1p(double __x) { return __ocml_log1p_f64(__x); } |
||
888 | |||
889 | __DEVICE__ |
||
890 | double log2(double __x) { return __ocml_log2_f64(__x); } |
||
891 | |||
892 | __DEVICE__ |
||
893 | double logb(double __x) { return __ocml_logb_f64(__x); } |
||
894 | |||
895 | __DEVICE__ |
||
896 | long int lrint(double __x) { return __ocml_rint_f64(__x); } |
||
897 | |||
898 | __DEVICE__ |
||
899 | long int lround(double __x) { return __ocml_round_f64(__x); } |
||
900 | |||
901 | __DEVICE__ |
||
902 | double modf(double __x, double *__iptr) { |
||
903 | double __tmp; |
||
904 | #ifdef __OPENMP_AMDGCN__ |
||
905 | #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) |
||
906 | #endif |
||
907 | double __r = |
||
908 | __ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp); |
||
909 | *__iptr = __tmp; |
||
910 | |||
911 | return __r; |
||
912 | } |
||
913 | |||
914 | __DEVICE__ |
||
915 | double nan(const char *__tagp) { |
||
916 | #if !_WIN32 |
||
917 | union { |
||
918 | double val; |
||
919 | struct ieee_double { |
||
920 | uint64_t mantissa : 51; |
||
921 | uint32_t quiet : 1; |
||
922 | uint32_t exponent : 11; |
||
923 | uint32_t sign : 1; |
||
924 | } bits; |
||
925 | } __tmp; |
||
926 | __static_assert_type_size_equal(sizeof(__tmp.val), sizeof(__tmp.bits)); |
||
927 | |||
928 | __tmp.bits.sign = 0u; |
||
929 | __tmp.bits.exponent = ~0u; |
||
930 | __tmp.bits.quiet = 1u; |
||
931 | __tmp.bits.mantissa = __make_mantissa(__tagp); |
||
932 | |||
933 | return __tmp.val; |
||
934 | #else |
||
935 | __static_assert_type_size_equal(sizeof(uint64_t), sizeof(double)); |
||
936 | uint64_t __val = __make_mantissa(__tagp); |
||
937 | __val |= 0xFFF << 51; |
||
938 | return *reinterpret_cast<double *>(&__val); |
||
939 | #endif |
||
940 | } |
||
941 | |||
942 | __DEVICE__ |
||
943 | double nearbyint(double __x) { return __ocml_nearbyint_f64(__x); } |
||
944 | |||
945 | __DEVICE__ |
||
946 | double nextafter(double __x, double __y) { |
||
947 | return __ocml_nextafter_f64(__x, __y); |
||
948 | } |
||
949 | |||
950 | __DEVICE__ |
||
951 | double norm(int __dim, |
||
952 | const double *__a) { // TODO: placeholder until OCML adds support. |
||
953 | double __r = 0; |
||
954 | while (__dim--) { |
||
955 | __r += __a[0] * __a[0]; |
||
956 | ++__a; |
||
957 | } |
||
958 | |||
959 | return __ocml_sqrt_f64(__r); |
||
960 | } |
||
961 | |||
962 | __DEVICE__ |
||
963 | double norm3d(double __x, double __y, double __z) { |
||
964 | return __ocml_len3_f64(__x, __y, __z); |
||
965 | } |
||
966 | |||
967 | __DEVICE__ |
||
968 | double norm4d(double __x, double __y, double __z, double __w) { |
||
969 | return __ocml_len4_f64(__x, __y, __z, __w); |
||
970 | } |
||
971 | |||
972 | __DEVICE__ |
||
973 | double normcdf(double __x) { return __ocml_ncdf_f64(__x); } |
||
974 | |||
975 | __DEVICE__ |
||
976 | double normcdfinv(double __x) { return __ocml_ncdfinv_f64(__x); } |
||
977 | |||
978 | __DEVICE__ |
||
979 | double pow(double __x, double __y) { return __ocml_pow_f64(__x, __y); } |
||
980 | |||
981 | __DEVICE__ |
||
982 | double powi(double __x, int __y) { return __ocml_pown_f64(__x, __y); } |
||
983 | |||
984 | __DEVICE__ |
||
985 | double rcbrt(double __x) { return __ocml_rcbrt_f64(__x); } |
||
986 | |||
987 | __DEVICE__ |
||
988 | double remainder(double __x, double __y) { |
||
989 | return __ocml_remainder_f64(__x, __y); |
||
990 | } |
||
991 | |||
992 | __DEVICE__ |
||
993 | double remquo(double __x, double __y, int *__quo) { |
||
994 | int __tmp; |
||
995 | #ifdef __OPENMP_AMDGCN__ |
||
996 | #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) |
||
997 | #endif |
||
998 | double __r = __ocml_remquo_f64( |
||
999 | __x, __y, (__attribute__((address_space(5))) int *)&__tmp); |
||
1000 | *__quo = __tmp; |
||
1001 | |||
1002 | return __r; |
||
1003 | } |
||
1004 | |||
1005 | __DEVICE__ |
||
1006 | double rhypot(double __x, double __y) { return __ocml_rhypot_f64(__x, __y); } |
||
1007 | |||
1008 | __DEVICE__ |
||
1009 | double rint(double __x) { return __ocml_rint_f64(__x); } |
||
1010 | |||
1011 | __DEVICE__ |
||
1012 | double rnorm(int __dim, |
||
1013 | const double *__a) { // TODO: placeholder until OCML adds support. |
||
1014 | double __r = 0; |
||
1015 | while (__dim--) { |
||
1016 | __r += __a[0] * __a[0]; |
||
1017 | ++__a; |
||
1018 | } |
||
1019 | |||
1020 | return __ocml_rsqrt_f64(__r); |
||
1021 | } |
||
1022 | |||
1023 | __DEVICE__ |
||
1024 | double rnorm3d(double __x, double __y, double __z) { |
||
1025 | return __ocml_rlen3_f64(__x, __y, __z); |
||
1026 | } |
||
1027 | |||
1028 | __DEVICE__ |
||
1029 | double rnorm4d(double __x, double __y, double __z, double __w) { |
||
1030 | return __ocml_rlen4_f64(__x, __y, __z, __w); |
||
1031 | } |
||
1032 | |||
1033 | __DEVICE__ |
||
1034 | double round(double __x) { return __ocml_round_f64(__x); } |
||
1035 | |||
1036 | __DEVICE__ |
||
1037 | double rsqrt(double __x) { return __ocml_rsqrt_f64(__x); } |
||
1038 | |||
1039 | __DEVICE__ |
||
1040 | double scalbln(double __x, long int __n) { |
||
1041 | return (__n < INT_MAX) ? __ocml_scalbn_f64(__x, __n) |
||
1042 | : __ocml_scalb_f64(__x, __n); |
||
1043 | } |
||
1044 | __DEVICE__ |
||
1045 | double scalbn(double __x, int __n) { return __ocml_scalbn_f64(__x, __n); } |
||
1046 | |||
1047 | __DEVICE__ |
||
1048 | __RETURN_TYPE __signbit(double __x) { return __ocml_signbit_f64(__x); } |
||
1049 | |||
1050 | __DEVICE__ |
||
1051 | double sin(double __x) { return __ocml_sin_f64(__x); } |
||
1052 | |||
1053 | __DEVICE__ |
||
1054 | void sincos(double __x, double *__sinptr, double *__cosptr) { |
||
1055 | double __tmp; |
||
1056 | #ifdef __OPENMP_AMDGCN__ |
||
1057 | #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) |
||
1058 | #endif |
||
1059 | *__sinptr = __ocml_sincos_f64( |
||
1060 | __x, (__attribute__((address_space(5))) double *)&__tmp); |
||
1061 | *__cosptr = __tmp; |
||
1062 | } |
||
1063 | |||
1064 | __DEVICE__ |
||
1065 | void sincospi(double __x, double *__sinptr, double *__cosptr) { |
||
1066 | double __tmp; |
||
1067 | #ifdef __OPENMP_AMDGCN__ |
||
1068 | #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) |
||
1069 | #endif |
||
1070 | *__sinptr = __ocml_sincospi_f64( |
||
1071 | __x, (__attribute__((address_space(5))) double *)&__tmp); |
||
1072 | *__cosptr = __tmp; |
||
1073 | } |
||
1074 | |||
1075 | __DEVICE__ |
||
1076 | double sinh(double __x) { return __ocml_sinh_f64(__x); } |
||
1077 | |||
1078 | __DEVICE__ |
||
1079 | double sinpi(double __x) { return __ocml_sinpi_f64(__x); } |
||
1080 | |||
1081 | __DEVICE__ |
||
1082 | double sqrt(double __x) { return __ocml_sqrt_f64(__x); } |
||
1083 | |||
1084 | __DEVICE__ |
||
1085 | double tan(double __x) { return __ocml_tan_f64(__x); } |
||
1086 | |||
1087 | __DEVICE__ |
||
1088 | double tanh(double __x) { return __ocml_tanh_f64(__x); } |
||
1089 | |||
1090 | __DEVICE__ |
||
1091 | double tgamma(double __x) { return __ocml_tgamma_f64(__x); } |
||
1092 | |||
1093 | __DEVICE__ |
||
1094 | double trunc(double __x) { return __ocml_trunc_f64(__x); } |
||
1095 | |||
1096 | __DEVICE__ |
||
1097 | double y0(double __x) { return __ocml_y0_f64(__x); } |
||
1098 | |||
1099 | __DEVICE__ |
||
1100 | double y1(double __x) { return __ocml_y1_f64(__x); } |
||
1101 | |||
1102 | __DEVICE__ |
||
1103 | double yn(int __n, double __x) { // TODO: we could use Ahmes multiplication |
||
1104 | // and the Miller & Brown algorithm |
||
1105 | // for linear recurrences to get O(log n) steps, but it's unclear if |
||
1106 | // it'd be beneficial in this case. Placeholder until OCML adds |
||
1107 | // support. |
||
1108 | if (__n == 0) |
||
1109 | return y0(__x); |
||
1110 | if (__n == 1) |
||
1111 | return y1(__x); |
||
1112 | |||
1113 | double __x0 = y0(__x); |
||
1114 | double __x1 = y1(__x); |
||
1115 | for (int __i = 1; __i < __n; ++__i) { |
||
1116 | double __x2 = (2 * __i) / __x * __x1 - __x0; |
||
1117 | __x0 = __x1; |
||
1118 | __x1 = __x2; |
||
1119 | } |
||
1120 | |||
1121 | return __x1; |
||
1122 | } |
||
1123 | |||
1124 | // BEGIN INTRINSICS |
||
1125 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
||
1126 | __DEVICE__ |
||
1127 | double __dadd_rd(double __x, double __y) { |
||
1128 | return __ocml_add_rtn_f64(__x, __y); |
||
1129 | } |
||
1130 | __DEVICE__ |
||
1131 | double __dadd_rn(double __x, double __y) { |
||
1132 | return __ocml_add_rte_f64(__x, __y); |
||
1133 | } |
||
1134 | __DEVICE__ |
||
1135 | double __dadd_ru(double __x, double __y) { |
||
1136 | return __ocml_add_rtp_f64(__x, __y); |
||
1137 | } |
||
1138 | __DEVICE__ |
||
1139 | double __dadd_rz(double __x, double __y) { |
||
1140 | return __ocml_add_rtz_f64(__x, __y); |
||
1141 | } |
||
1142 | #else |
||
1143 | __DEVICE__ |
||
1144 | double __dadd_rn(double __x, double __y) { return __x + __y; } |
||
1145 | #endif |
||
1146 | |||
1147 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
||
1148 | __DEVICE__ |
||
1149 | double __ddiv_rd(double __x, double __y) { |
||
1150 | return __ocml_div_rtn_f64(__x, __y); |
||
1151 | } |
||
1152 | __DEVICE__ |
||
1153 | double __ddiv_rn(double __x, double __y) { |
||
1154 | return __ocml_div_rte_f64(__x, __y); |
||
1155 | } |
||
1156 | __DEVICE__ |
||
1157 | double __ddiv_ru(double __x, double __y) { |
||
1158 | return __ocml_div_rtp_f64(__x, __y); |
||
1159 | } |
||
1160 | __DEVICE__ |
||
1161 | double __ddiv_rz(double __x, double __y) { |
||
1162 | return __ocml_div_rtz_f64(__x, __y); |
||
1163 | } |
||
1164 | #else |
||
1165 | __DEVICE__ |
||
1166 | double __ddiv_rn(double __x, double __y) { return __x / __y; } |
||
1167 | #endif |
||
1168 | |||
1169 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
||
1170 | __DEVICE__ |
||
1171 | double __dmul_rd(double __x, double __y) { |
||
1172 | return __ocml_mul_rtn_f64(__x, __y); |
||
1173 | } |
||
1174 | __DEVICE__ |
||
1175 | double __dmul_rn(double __x, double __y) { |
||
1176 | return __ocml_mul_rte_f64(__x, __y); |
||
1177 | } |
||
1178 | __DEVICE__ |
||
1179 | double __dmul_ru(double __x, double __y) { |
||
1180 | return __ocml_mul_rtp_f64(__x, __y); |
||
1181 | } |
||
1182 | __DEVICE__ |
||
1183 | double __dmul_rz(double __x, double __y) { |
||
1184 | return __ocml_mul_rtz_f64(__x, __y); |
||
1185 | } |
||
1186 | #else |
||
1187 | __DEVICE__ |
||
1188 | double __dmul_rn(double __x, double __y) { return __x * __y; } |
||
1189 | #endif |
||
1190 | |||
1191 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
||
1192 | __DEVICE__ |
||
1193 | double __drcp_rd(double __x) { return __ocml_div_rtn_f64(1.0, __x); } |
||
1194 | __DEVICE__ |
||
1195 | double __drcp_rn(double __x) { return __ocml_div_rte_f64(1.0, __x); } |
||
1196 | __DEVICE__ |
||
1197 | double __drcp_ru(double __x) { return __ocml_div_rtp_f64(1.0, __x); } |
||
1198 | __DEVICE__ |
||
1199 | double __drcp_rz(double __x) { return __ocml_div_rtz_f64(1.0, __x); } |
||
1200 | #else |
||
1201 | __DEVICE__ |
||
1202 | double __drcp_rn(double __x) { return 1.0 / __x; } |
||
1203 | #endif |
||
1204 | |||
1205 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
||
1206 | __DEVICE__ |
||
1207 | double __dsqrt_rd(double __x) { return __ocml_sqrt_rtn_f64(__x); } |
||
1208 | __DEVICE__ |
||
1209 | double __dsqrt_rn(double __x) { return __ocml_sqrt_rte_f64(__x); } |
||
1210 | __DEVICE__ |
||
1211 | double __dsqrt_ru(double __x) { return __ocml_sqrt_rtp_f64(__x); } |
||
1212 | __DEVICE__ |
||
1213 | double __dsqrt_rz(double __x) { return __ocml_sqrt_rtz_f64(__x); } |
||
1214 | #else |
||
1215 | __DEVICE__ |
||
1216 | double __dsqrt_rn(double __x) { return __ocml_sqrt_f64(__x); } |
||
1217 | #endif |
||
1218 | |||
1219 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
||
1220 | __DEVICE__ |
||
1221 | double __dsub_rd(double __x, double __y) { |
||
1222 | return __ocml_sub_rtn_f64(__x, __y); |
||
1223 | } |
||
1224 | __DEVICE__ |
||
1225 | double __dsub_rn(double __x, double __y) { |
||
1226 | return __ocml_sub_rte_f64(__x, __y); |
||
1227 | } |
||
1228 | __DEVICE__ |
||
1229 | double __dsub_ru(double __x, double __y) { |
||
1230 | return __ocml_sub_rtp_f64(__x, __y); |
||
1231 | } |
||
1232 | __DEVICE__ |
||
1233 | double __dsub_rz(double __x, double __y) { |
||
1234 | return __ocml_sub_rtz_f64(__x, __y); |
||
1235 | } |
||
1236 | #else |
||
1237 | __DEVICE__ |
||
1238 | double __dsub_rn(double __x, double __y) { return __x - __y; } |
||
1239 | #endif |
||
1240 | |||
1241 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
||
1242 | __DEVICE__ |
||
1243 | double __fma_rd(double __x, double __y, double __z) { |
||
1244 | return __ocml_fma_rtn_f64(__x, __y, __z); |
||
1245 | } |
||
1246 | __DEVICE__ |
||
1247 | double __fma_rn(double __x, double __y, double __z) { |
||
1248 | return __ocml_fma_rte_f64(__x, __y, __z); |
||
1249 | } |
||
1250 | __DEVICE__ |
||
1251 | double __fma_ru(double __x, double __y, double __z) { |
||
1252 | return __ocml_fma_rtp_f64(__x, __y, __z); |
||
1253 | } |
||
1254 | __DEVICE__ |
||
1255 | double __fma_rz(double __x, double __y, double __z) { |
||
1256 | return __ocml_fma_rtz_f64(__x, __y, __z); |
||
1257 | } |
||
1258 | #else |
||
1259 | __DEVICE__ |
||
1260 | double __fma_rn(double __x, double __y, double __z) { |
||
1261 | return __ocml_fma_f64(__x, __y, __z); |
||
1262 | } |
||
1263 | #endif |
||
1264 | // END INTRINSICS |
||
1265 | // END DOUBLE |
||
1266 | |||
1267 | // C only macros |
||
1268 | #if !defined(__cplusplus) && __STDC_VERSION__ >= 201112L |
||
1269 | #define isfinite(__x) _Generic((__x), float : __finitef, double : __finite)(__x) |
||
1270 | #define isinf(__x) _Generic((__x), float : __isinff, double : __isinf)(__x) |
||
1271 | #define isnan(__x) _Generic((__x), float : __isnanf, double : __isnan)(__x) |
||
1272 | #define signbit(__x) \ |
||
1273 | _Generic((__x), float : __signbitf, double : __signbit)(__x) |
||
1274 | #endif // !defined(__cplusplus) && __STDC_VERSION__ >= 201112L |
||
1275 | |||
1276 | #if defined(__cplusplus) |
||
1277 | template <class T> __DEVICE__ T min(T __arg1, T __arg2) { |
||
1278 | return (__arg1 < __arg2) ? __arg1 : __arg2; |
||
1279 | } |
||
1280 | |||
1281 | template <class T> __DEVICE__ T max(T __arg1, T __arg2) { |
||
1282 | return (__arg1 > __arg2) ? __arg1 : __arg2; |
||
1283 | } |
||
1284 | |||
1285 | __DEVICE__ int min(int __arg1, int __arg2) { |
||
1286 | return (__arg1 < __arg2) ? __arg1 : __arg2; |
||
1287 | } |
||
1288 | __DEVICE__ int max(int __arg1, int __arg2) { |
||
1289 | return (__arg1 > __arg2) ? __arg1 : __arg2; |
||
1290 | } |
||
1291 | |||
1292 | __DEVICE__ |
||
1293 | float max(float __x, float __y) { return fmaxf(__x, __y); } |
||
1294 | |||
1295 | __DEVICE__ |
||
1296 | double max(double __x, double __y) { return fmax(__x, __y); } |
||
1297 | |||
1298 | __DEVICE__ |
||
1299 | float min(float __x, float __y) { return fminf(__x, __y); } |
||
1300 | |||
1301 | __DEVICE__ |
||
1302 | double min(double __x, double __y) { return fmin(__x, __y); } |
||
1303 | |||
1304 | #if !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) |
||
1305 | __host__ inline static int min(int __arg1, int __arg2) { |
||
1306 | return std::min(__arg1, __arg2); |
||
1307 | } |
||
1308 | |||
1309 | __host__ inline static int max(int __arg1, int __arg2) { |
||
1310 | return std::max(__arg1, __arg2); |
||
1311 | } |
||
1312 | #endif // !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) |
||
1313 | #endif |
||
1314 | |||
1315 | #pragma pop_macro("__DEVICE__") |
||
1316 | #pragma pop_macro("__RETURN_TYPE") |
||
1317 | |||
1318 | #endif // __CLANG_HIP_MATH_H__ |