Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. /*===---- __clang_cuda_device_functions.h - CUDA runtime support -----------===
  2.  *
  3.  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4.  * See https://llvm.org/LICENSE.txt for license information.
  5.  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6.  *
  7.  *===-----------------------------------------------------------------------===
  8.  */
  9.  
  10. #ifndef __CLANG_CUDA_DEVICE_FUNCTIONS_H__
  11. #define __CLANG_CUDA_DEVICE_FUNCTIONS_H__
  12.  
  13. #ifndef __OPENMP_NVPTX__
  14. #if CUDA_VERSION < 9000
  15. #error This file is intended to be used with CUDA-9+ only.
  16. #endif
  17. #endif
  18.  
  19. // __DEVICE__ is a helper macro with common set of attributes for the wrappers
  20. // we implement in this file. We need static in order to avoid emitting unused
  21. // functions and __forceinline__ helps inlining these wrappers at -O1.
  22. #pragma push_macro("__DEVICE__")
  23. #ifdef __OPENMP_NVPTX__
  24. #define __DEVICE__ static __attribute__((always_inline, nothrow))
  25. #else
  26. #define __DEVICE__ static __device__ __forceinline__
  27. #endif
  28.  
  29. __DEVICE__ int __all(int __a) { return __nvvm_vote_all(__a); }
  30. __DEVICE__ int __any(int __a) { return __nvvm_vote_any(__a); }
  31. __DEVICE__ unsigned int __ballot(int __a) { return __nvvm_vote_ballot(__a); }
  32. __DEVICE__ unsigned int __brev(unsigned int __a) { return __nv_brev(__a); }
  33. __DEVICE__ unsigned long long __brevll(unsigned long long __a) {
  34.   return __nv_brevll(__a);
  35. }
  36. #if defined(__cplusplus)
  37. __DEVICE__ void __brkpt() { __asm__ __volatile__("brkpt;"); }
  38. __DEVICE__ void __brkpt(int __a) { __brkpt(); }
  39. #else
  40. __DEVICE__ void __attribute__((overloadable)) __brkpt(void) {
  41.   __asm__ __volatile__("brkpt;");
  42. }
  43. __DEVICE__ void __attribute__((overloadable)) __brkpt(int __a) { __brkpt(); }
  44. #endif
  45. __DEVICE__ unsigned int __byte_perm(unsigned int __a, unsigned int __b,
  46.                                     unsigned int __c) {
  47.   return __nv_byte_perm(__a, __b, __c);
  48. }
  49. __DEVICE__ int __clz(int __a) { return __nv_clz(__a); }
  50. __DEVICE__ int __clzll(long long __a) { return __nv_clzll(__a); }
  51. __DEVICE__ float __cosf(float __a) { return __nv_fast_cosf(__a); }
  52. __DEVICE__ double __dAtomicAdd(double *__p, double __v) {
  53.   return __nvvm_atom_add_gen_d(__p, __v);
  54. }
  55. __DEVICE__ double __dAtomicAdd_block(double *__p, double __v) {
  56.   return __nvvm_atom_cta_add_gen_d(__p, __v);
  57. }
  58. __DEVICE__ double __dAtomicAdd_system(double *__p, double __v) {
  59.   return __nvvm_atom_sys_add_gen_d(__p, __v);
  60. }
  61. __DEVICE__ double __dadd_rd(double __a, double __b) {
  62.   return __nv_dadd_rd(__a, __b);
  63. }
  64. __DEVICE__ double __dadd_rn(double __a, double __b) {
  65.   return __nv_dadd_rn(__a, __b);
  66. }
  67. __DEVICE__ double __dadd_ru(double __a, double __b) {
  68.   return __nv_dadd_ru(__a, __b);
  69. }
  70. __DEVICE__ double __dadd_rz(double __a, double __b) {
  71.   return __nv_dadd_rz(__a, __b);
  72. }
  73. __DEVICE__ double __ddiv_rd(double __a, double __b) {
  74.   return __nv_ddiv_rd(__a, __b);
  75. }
  76. __DEVICE__ double __ddiv_rn(double __a, double __b) {
  77.   return __nv_ddiv_rn(__a, __b);
  78. }
  79. __DEVICE__ double __ddiv_ru(double __a, double __b) {
  80.   return __nv_ddiv_ru(__a, __b);
  81. }
  82. __DEVICE__ double __ddiv_rz(double __a, double __b) {
  83.   return __nv_ddiv_rz(__a, __b);
  84. }
  85. __DEVICE__ double __dmul_rd(double __a, double __b) {
  86.   return __nv_dmul_rd(__a, __b);
  87. }
  88. __DEVICE__ double __dmul_rn(double __a, double __b) {
  89.   return __nv_dmul_rn(__a, __b);
  90. }
  91. __DEVICE__ double __dmul_ru(double __a, double __b) {
  92.   return __nv_dmul_ru(__a, __b);
  93. }
  94. __DEVICE__ double __dmul_rz(double __a, double __b) {
  95.   return __nv_dmul_rz(__a, __b);
  96. }
  97. __DEVICE__ float __double2float_rd(double __a) {
  98.   return __nv_double2float_rd(__a);
  99. }
  100. __DEVICE__ float __double2float_rn(double __a) {
  101.   return __nv_double2float_rn(__a);
  102. }
  103. __DEVICE__ float __double2float_ru(double __a) {
  104.   return __nv_double2float_ru(__a);
  105. }
  106. __DEVICE__ float __double2float_rz(double __a) {
  107.   return __nv_double2float_rz(__a);
  108. }
  109. __DEVICE__ int __double2hiint(double __a) { return __nv_double2hiint(__a); }
  110. __DEVICE__ int __double2int_rd(double __a) { return __nv_double2int_rd(__a); }
  111. __DEVICE__ int __double2int_rn(double __a) { return __nv_double2int_rn(__a); }
  112. __DEVICE__ int __double2int_ru(double __a) { return __nv_double2int_ru(__a); }
  113. __DEVICE__ int __double2int_rz(double __a) { return __nv_double2int_rz(__a); }
  114. __DEVICE__ long long __double2ll_rd(double __a) {
  115.   return __nv_double2ll_rd(__a);
  116. }
  117. __DEVICE__ long long __double2ll_rn(double __a) {
  118.   return __nv_double2ll_rn(__a);
  119. }
  120. __DEVICE__ long long __double2ll_ru(double __a) {
  121.   return __nv_double2ll_ru(__a);
  122. }
  123. __DEVICE__ long long __double2ll_rz(double __a) {
  124.   return __nv_double2ll_rz(__a);
  125. }
  126. __DEVICE__ int __double2loint(double __a) { return __nv_double2loint(__a); }
  127. __DEVICE__ unsigned int __double2uint_rd(double __a) {
  128.   return __nv_double2uint_rd(__a);
  129. }
  130. __DEVICE__ unsigned int __double2uint_rn(double __a) {
  131.   return __nv_double2uint_rn(__a);
  132. }
  133. __DEVICE__ unsigned int __double2uint_ru(double __a) {
  134.   return __nv_double2uint_ru(__a);
  135. }
  136. __DEVICE__ unsigned int __double2uint_rz(double __a) {
  137.   return __nv_double2uint_rz(__a);
  138. }
  139. __DEVICE__ unsigned long long __double2ull_rd(double __a) {
  140.   return __nv_double2ull_rd(__a);
  141. }
  142. __DEVICE__ unsigned long long __double2ull_rn(double __a) {
  143.   return __nv_double2ull_rn(__a);
  144. }
  145. __DEVICE__ unsigned long long __double2ull_ru(double __a) {
  146.   return __nv_double2ull_ru(__a);
  147. }
  148. __DEVICE__ unsigned long long __double2ull_rz(double __a) {
  149.   return __nv_double2ull_rz(__a);
  150. }
  151. __DEVICE__ long long __double_as_longlong(double __a) {
  152.   return __nv_double_as_longlong(__a);
  153. }
  154. __DEVICE__ double __drcp_rd(double __a) { return __nv_drcp_rd(__a); }
  155. __DEVICE__ double __drcp_rn(double __a) { return __nv_drcp_rn(__a); }
  156. __DEVICE__ double __drcp_ru(double __a) { return __nv_drcp_ru(__a); }
  157. __DEVICE__ double __drcp_rz(double __a) { return __nv_drcp_rz(__a); }
  158. __DEVICE__ double __dsqrt_rd(double __a) { return __nv_dsqrt_rd(__a); }
  159. __DEVICE__ double __dsqrt_rn(double __a) { return __nv_dsqrt_rn(__a); }
  160. __DEVICE__ double __dsqrt_ru(double __a) { return __nv_dsqrt_ru(__a); }
  161. __DEVICE__ double __dsqrt_rz(double __a) { return __nv_dsqrt_rz(__a); }
  162. __DEVICE__ double __dsub_rd(double __a, double __b) {
  163.   return __nv_dsub_rd(__a, __b);
  164. }
  165. __DEVICE__ double __dsub_rn(double __a, double __b) {
  166.   return __nv_dsub_rn(__a, __b);
  167. }
  168. __DEVICE__ double __dsub_ru(double __a, double __b) {
  169.   return __nv_dsub_ru(__a, __b);
  170. }
  171. __DEVICE__ double __dsub_rz(double __a, double __b) {
  172.   return __nv_dsub_rz(__a, __b);
  173. }
  174. __DEVICE__ float __exp10f(float __a) { return __nv_fast_exp10f(__a); }
  175. __DEVICE__ float __expf(float __a) { return __nv_fast_expf(__a); }
  176. __DEVICE__ float __fAtomicAdd(float *__p, float __v) {
  177.   return __nvvm_atom_add_gen_f(__p, __v);
  178. }
  179. __DEVICE__ float __fAtomicAdd_block(float *__p, float __v) {
  180.   return __nvvm_atom_cta_add_gen_f(__p, __v);
  181. }
  182. __DEVICE__ float __fAtomicAdd_system(float *__p, float __v) {
  183.   return __nvvm_atom_sys_add_gen_f(__p, __v);
  184. }
  185. __DEVICE__ float __fAtomicExch(float *__p, float __v) {
  186.   return __nv_int_as_float(
  187.       __nvvm_atom_xchg_gen_i((int *)__p, __nv_float_as_int(__v)));
  188. }
  189. __DEVICE__ float __fAtomicExch_block(float *__p, float __v) {
  190.   return __nv_int_as_float(
  191.       __nvvm_atom_cta_xchg_gen_i((int *)__p, __nv_float_as_int(__v)));
  192. }
  193. __DEVICE__ float __fAtomicExch_system(float *__p, float __v) {
  194.   return __nv_int_as_float(
  195.       __nvvm_atom_sys_xchg_gen_i((int *)__p, __nv_float_as_int(__v)));
  196. }
  197. __DEVICE__ float __fadd_rd(float __a, float __b) {
  198.   return __nv_fadd_rd(__a, __b);
  199. }
  200. __DEVICE__ float __fadd_rn(float __a, float __b) {
  201.   return __nv_fadd_rn(__a, __b);
  202. }
  203. __DEVICE__ float __fadd_ru(float __a, float __b) {
  204.   return __nv_fadd_ru(__a, __b);
  205. }
  206. __DEVICE__ float __fadd_rz(float __a, float __b) {
  207.   return __nv_fadd_rz(__a, __b);
  208. }
  209. __DEVICE__ float __fdiv_rd(float __a, float __b) {
  210.   return __nv_fdiv_rd(__a, __b);
  211. }
  212. __DEVICE__ float __fdiv_rn(float __a, float __b) {
  213.   return __nv_fdiv_rn(__a, __b);
  214. }
  215. __DEVICE__ float __fdiv_ru(float __a, float __b) {
  216.   return __nv_fdiv_ru(__a, __b);
  217. }
  218. __DEVICE__ float __fdiv_rz(float __a, float __b) {
  219.   return __nv_fdiv_rz(__a, __b);
  220. }
  221. __DEVICE__ float __fdividef(float __a, float __b) {
  222.   return __nv_fast_fdividef(__a, __b);
  223. }
  224. __DEVICE__ int __ffs(int __a) { return __nv_ffs(__a); }
  225. __DEVICE__ int __ffsll(long long __a) { return __nv_ffsll(__a); }
  226. __DEVICE__ int __finite(double __a) { return __nv_isfinited(__a); }
  227. __DEVICE__ int __finitef(float __a) { return __nv_finitef(__a); }
  228. #ifdef _MSC_VER
  229. __DEVICE__ int __finitel(long double __a);
  230. #endif
  231. __DEVICE__ int __float2int_rd(float __a) { return __nv_float2int_rd(__a); }
  232. __DEVICE__ int __float2int_rn(float __a) { return __nv_float2int_rn(__a); }
  233. __DEVICE__ int __float2int_ru(float __a) { return __nv_float2int_ru(__a); }
  234. __DEVICE__ int __float2int_rz(float __a) { return __nv_float2int_rz(__a); }
  235. __DEVICE__ long long __float2ll_rd(float __a) { return __nv_float2ll_rd(__a); }
  236. __DEVICE__ long long __float2ll_rn(float __a) { return __nv_float2ll_rn(__a); }
  237. __DEVICE__ long long __float2ll_ru(float __a) { return __nv_float2ll_ru(__a); }
  238. __DEVICE__ long long __float2ll_rz(float __a) { return __nv_float2ll_rz(__a); }
  239. __DEVICE__ unsigned int __float2uint_rd(float __a) {
  240.   return __nv_float2uint_rd(__a);
  241. }
  242. __DEVICE__ unsigned int __float2uint_rn(float __a) {
  243.   return __nv_float2uint_rn(__a);
  244. }
  245. __DEVICE__ unsigned int __float2uint_ru(float __a) {
  246.   return __nv_float2uint_ru(__a);
  247. }
  248. __DEVICE__ unsigned int __float2uint_rz(float __a) {
  249.   return __nv_float2uint_rz(__a);
  250. }
  251. __DEVICE__ unsigned long long __float2ull_rd(float __a) {
  252.   return __nv_float2ull_rd(__a);
  253. }
  254. __DEVICE__ unsigned long long __float2ull_rn(float __a) {
  255.   return __nv_float2ull_rn(__a);
  256. }
  257. __DEVICE__ unsigned long long __float2ull_ru(float __a) {
  258.   return __nv_float2ull_ru(__a);
  259. }
  260. __DEVICE__ unsigned long long __float2ull_rz(float __a) {
  261.   return __nv_float2ull_rz(__a);
  262. }
  263. __DEVICE__ int __float_as_int(float __a) { return __nv_float_as_int(__a); }
  264. __DEVICE__ unsigned int __float_as_uint(float __a) {
  265.   return __nv_float_as_uint(__a);
  266. }
  267. __DEVICE__ double __fma_rd(double __a, double __b, double __c) {
  268.   return __nv_fma_rd(__a, __b, __c);
  269. }
  270. __DEVICE__ double __fma_rn(double __a, double __b, double __c) {
  271.   return __nv_fma_rn(__a, __b, __c);
  272. }
  273. __DEVICE__ double __fma_ru(double __a, double __b, double __c) {
  274.   return __nv_fma_ru(__a, __b, __c);
  275. }
  276. __DEVICE__ double __fma_rz(double __a, double __b, double __c) {
  277.   return __nv_fma_rz(__a, __b, __c);
  278. }
  279. __DEVICE__ float __fmaf_ieee_rd(float __a, float __b, float __c) {
  280.   return __nv_fmaf_ieee_rd(__a, __b, __c);
  281. }
  282. __DEVICE__ float __fmaf_ieee_rn(float __a, float __b, float __c) {
  283.   return __nv_fmaf_ieee_rn(__a, __b, __c);
  284. }
  285. __DEVICE__ float __fmaf_ieee_ru(float __a, float __b, float __c) {
  286.   return __nv_fmaf_ieee_ru(__a, __b, __c);
  287. }
  288. __DEVICE__ float __fmaf_ieee_rz(float __a, float __b, float __c) {
  289.   return __nv_fmaf_ieee_rz(__a, __b, __c);
  290. }
  291. __DEVICE__ float __fmaf_rd(float __a, float __b, float __c) {
  292.   return __nv_fmaf_rd(__a, __b, __c);
  293. }
  294. __DEVICE__ float __fmaf_rn(float __a, float __b, float __c) {
  295.   return __nv_fmaf_rn(__a, __b, __c);
  296. }
  297. __DEVICE__ float __fmaf_ru(float __a, float __b, float __c) {
  298.   return __nv_fmaf_ru(__a, __b, __c);
  299. }
  300. __DEVICE__ float __fmaf_rz(float __a, float __b, float __c) {
  301.   return __nv_fmaf_rz(__a, __b, __c);
  302. }
  303. __DEVICE__ float __fmul_rd(float __a, float __b) {
  304.   return __nv_fmul_rd(__a, __b);
  305. }
  306. __DEVICE__ float __fmul_rn(float __a, float __b) {
  307.   return __nv_fmul_rn(__a, __b);
  308. }
  309. __DEVICE__ float __fmul_ru(float __a, float __b) {
  310.   return __nv_fmul_ru(__a, __b);
  311. }
  312. __DEVICE__ float __fmul_rz(float __a, float __b) {
  313.   return __nv_fmul_rz(__a, __b);
  314. }
  315. __DEVICE__ float __frcp_rd(float __a) { return __nv_frcp_rd(__a); }
  316. __DEVICE__ float __frcp_rn(float __a) { return __nv_frcp_rn(__a); }
  317. __DEVICE__ float __frcp_ru(float __a) { return __nv_frcp_ru(__a); }
  318. __DEVICE__ float __frcp_rz(float __a) { return __nv_frcp_rz(__a); }
  319. __DEVICE__ float __frsqrt_rn(float __a) { return __nv_frsqrt_rn(__a); }
  320. __DEVICE__ float __fsqrt_rd(float __a) { return __nv_fsqrt_rd(__a); }
  321. __DEVICE__ float __fsqrt_rn(float __a) { return __nv_fsqrt_rn(__a); }
  322. __DEVICE__ float __fsqrt_ru(float __a) { return __nv_fsqrt_ru(__a); }
  323. __DEVICE__ float __fsqrt_rz(float __a) { return __nv_fsqrt_rz(__a); }
  324. __DEVICE__ float __fsub_rd(float __a, float __b) {
  325.   return __nv_fsub_rd(__a, __b);
  326. }
  327. __DEVICE__ float __fsub_rn(float __a, float __b) {
  328.   return __nv_fsub_rn(__a, __b);
  329. }
  330. __DEVICE__ float __fsub_ru(float __a, float __b) {
  331.   return __nv_fsub_ru(__a, __b);
  332. }
  333. __DEVICE__ float __fsub_rz(float __a, float __b) {
  334.   return __nv_fsub_rz(__a, __b);
  335. }
  336. __DEVICE__ int __hadd(int __a, int __b) { return __nv_hadd(__a, __b); }
  337. __DEVICE__ double __hiloint2double(int __a, int __b) {
  338.   return __nv_hiloint2double(__a, __b);
  339. }
  340. __DEVICE__ int __iAtomicAdd(int *__p, int __v) {
  341.   return __nvvm_atom_add_gen_i(__p, __v);
  342. }
  343. __DEVICE__ int __iAtomicAdd_block(int *__p, int __v) {
  344.   return __nvvm_atom_cta_add_gen_i(__p, __v);
  345. }
  346. __DEVICE__ int __iAtomicAdd_system(int *__p, int __v) {
  347.   return __nvvm_atom_sys_add_gen_i(__p, __v);
  348. }
  349. __DEVICE__ int __iAtomicAnd(int *__p, int __v) {
  350.   return __nvvm_atom_and_gen_i(__p, __v);
  351. }
  352. __DEVICE__ int __iAtomicAnd_block(int *__p, int __v) {
  353.   return __nvvm_atom_cta_and_gen_i(__p, __v);
  354. }
  355. __DEVICE__ int __iAtomicAnd_system(int *__p, int __v) {
  356.   return __nvvm_atom_sys_and_gen_i(__p, __v);
  357. }
  358. __DEVICE__ int __iAtomicCAS(int *__p, int __cmp, int __v) {
  359.   return __nvvm_atom_cas_gen_i(__p, __cmp, __v);
  360. }
  361. __DEVICE__ int __iAtomicCAS_block(int *__p, int __cmp, int __v) {
  362.   return __nvvm_atom_cta_cas_gen_i(__p, __cmp, __v);
  363. }
  364. __DEVICE__ int __iAtomicCAS_system(int *__p, int __cmp, int __v) {
  365.   return __nvvm_atom_sys_cas_gen_i(__p, __cmp, __v);
  366. }
  367. __DEVICE__ int __iAtomicExch(int *__p, int __v) {
  368.   return __nvvm_atom_xchg_gen_i(__p, __v);
  369. }
  370. __DEVICE__ int __iAtomicExch_block(int *__p, int __v) {
  371.   return __nvvm_atom_cta_xchg_gen_i(__p, __v);
  372. }
  373. __DEVICE__ int __iAtomicExch_system(int *__p, int __v) {
  374.   return __nvvm_atom_sys_xchg_gen_i(__p, __v);
  375. }
  376. __DEVICE__ int __iAtomicMax(int *__p, int __v) {
  377.   return __nvvm_atom_max_gen_i(__p, __v);
  378. }
  379. __DEVICE__ int __iAtomicMax_block(int *__p, int __v) {
  380.   return __nvvm_atom_cta_max_gen_i(__p, __v);
  381. }
  382. __DEVICE__ int __iAtomicMax_system(int *__p, int __v) {
  383.   return __nvvm_atom_sys_max_gen_i(__p, __v);
  384. }
  385. __DEVICE__ int __iAtomicMin(int *__p, int __v) {
  386.   return __nvvm_atom_min_gen_i(__p, __v);
  387. }
  388. __DEVICE__ int __iAtomicMin_block(int *__p, int __v) {
  389.   return __nvvm_atom_cta_min_gen_i(__p, __v);
  390. }
  391. __DEVICE__ int __iAtomicMin_system(int *__p, int __v) {
  392.   return __nvvm_atom_sys_min_gen_i(__p, __v);
  393. }
  394. __DEVICE__ int __iAtomicOr(int *__p, int __v) {
  395.   return __nvvm_atom_or_gen_i(__p, __v);
  396. }
  397. __DEVICE__ int __iAtomicOr_block(int *__p, int __v) {
  398.   return __nvvm_atom_cta_or_gen_i(__p, __v);
  399. }
  400. __DEVICE__ int __iAtomicOr_system(int *__p, int __v) {
  401.   return __nvvm_atom_sys_or_gen_i(__p, __v);
  402. }
  403. __DEVICE__ int __iAtomicXor(int *__p, int __v) {
  404.   return __nvvm_atom_xor_gen_i(__p, __v);
  405. }
  406. __DEVICE__ int __iAtomicXor_block(int *__p, int __v) {
  407.   return __nvvm_atom_cta_xor_gen_i(__p, __v);
  408. }
  409. __DEVICE__ int __iAtomicXor_system(int *__p, int __v) {
  410.   return __nvvm_atom_sys_xor_gen_i(__p, __v);
  411. }
  412. __DEVICE__ long long __illAtomicMax(long long *__p, long long __v) {
  413.   return __nvvm_atom_max_gen_ll(__p, __v);
  414. }
  415. __DEVICE__ long long __illAtomicMax_block(long long *__p, long long __v) {
  416.   return __nvvm_atom_cta_max_gen_ll(__p, __v);
  417. }
  418. __DEVICE__ long long __illAtomicMax_system(long long *__p, long long __v) {
  419.   return __nvvm_atom_sys_max_gen_ll(__p, __v);
  420. }
  421. __DEVICE__ long long __illAtomicMin(long long *__p, long long __v) {
  422.   return __nvvm_atom_min_gen_ll(__p, __v);
  423. }
  424. __DEVICE__ long long __illAtomicMin_block(long long *__p, long long __v) {
  425.   return __nvvm_atom_cta_min_gen_ll(__p, __v);
  426. }
  427. __DEVICE__ long long __illAtomicMin_system(long long *__p, long long __v) {
  428.   return __nvvm_atom_sys_min_gen_ll(__p, __v);
  429. }
  430. __DEVICE__ double __int2double_rn(int __a) { return __nv_int2double_rn(__a); }
  431. __DEVICE__ float __int2float_rd(int __a) { return __nv_int2float_rd(__a); }
  432. __DEVICE__ float __int2float_rn(int __a) { return __nv_int2float_rn(__a); }
  433. __DEVICE__ float __int2float_ru(int __a) { return __nv_int2float_ru(__a); }
  434. __DEVICE__ float __int2float_rz(int __a) { return __nv_int2float_rz(__a); }
  435. __DEVICE__ float __int_as_float(int __a) { return __nv_int_as_float(__a); }
  436. __DEVICE__ int __isfinited(double __a) { return __nv_isfinited(__a); }
  437. __DEVICE__ int __isinf(double __a) { return __nv_isinfd(__a); }
  438. __DEVICE__ int __isinff(float __a) { return __nv_isinff(__a); }
  439. #ifdef _MSC_VER
  440. __DEVICE__ int __isinfl(long double __a);
  441. #endif
  442. __DEVICE__ int __isnan(double __a) { return __nv_isnand(__a); }
  443. __DEVICE__ int __isnanf(float __a) { return __nv_isnanf(__a); }
  444. #ifdef _MSC_VER
  445. __DEVICE__ int __isnanl(long double __a);
  446. #endif
  447. __DEVICE__ double __ll2double_rd(long long __a) {
  448.   return __nv_ll2double_rd(__a);
  449. }
  450. __DEVICE__ double __ll2double_rn(long long __a) {
  451.   return __nv_ll2double_rn(__a);
  452. }
  453. __DEVICE__ double __ll2double_ru(long long __a) {
  454.   return __nv_ll2double_ru(__a);
  455. }
  456. __DEVICE__ double __ll2double_rz(long long __a) {
  457.   return __nv_ll2double_rz(__a);
  458. }
  459. __DEVICE__ float __ll2float_rd(long long __a) { return __nv_ll2float_rd(__a); }
  460. __DEVICE__ float __ll2float_rn(long long __a) { return __nv_ll2float_rn(__a); }
  461. __DEVICE__ float __ll2float_ru(long long __a) { return __nv_ll2float_ru(__a); }
  462. __DEVICE__ float __ll2float_rz(long long __a) { return __nv_ll2float_rz(__a); }
  463. __DEVICE__ long long __llAtomicAnd(long long *__p, long long __v) {
  464.   return __nvvm_atom_and_gen_ll(__p, __v);
  465. }
  466. __DEVICE__ long long __llAtomicAnd_block(long long *__p, long long __v) {
  467.   return __nvvm_atom_cta_and_gen_ll(__p, __v);
  468. }
  469. __DEVICE__ long long __llAtomicAnd_system(long long *__p, long long __v) {
  470.   return __nvvm_atom_sys_and_gen_ll(__p, __v);
  471. }
  472. __DEVICE__ long long __llAtomicOr(long long *__p, long long __v) {
  473.   return __nvvm_atom_or_gen_ll(__p, __v);
  474. }
  475. __DEVICE__ long long __llAtomicOr_block(long long *__p, long long __v) {
  476.   return __nvvm_atom_cta_or_gen_ll(__p, __v);
  477. }
  478. __DEVICE__ long long __llAtomicOr_system(long long *__p, long long __v) {
  479.   return __nvvm_atom_sys_or_gen_ll(__p, __v);
  480. }
  481. __DEVICE__ long long __llAtomicXor(long long *__p, long long __v) {
  482.   return __nvvm_atom_xor_gen_ll(__p, __v);
  483. }
  484. __DEVICE__ long long __llAtomicXor_block(long long *__p, long long __v) {
  485.   return __nvvm_atom_cta_xor_gen_ll(__p, __v);
  486. }
  487. __DEVICE__ long long __llAtomicXor_system(long long *__p, long long __v) {
  488.   return __nvvm_atom_sys_xor_gen_ll(__p, __v);
  489. }
  490. __DEVICE__ float __log10f(float __a) { return __nv_fast_log10f(__a); }
  491. __DEVICE__ float __log2f(float __a) { return __nv_fast_log2f(__a); }
  492. __DEVICE__ float __logf(float __a) { return __nv_fast_logf(__a); }
  493. __DEVICE__ double __longlong_as_double(long long __a) {
  494.   return __nv_longlong_as_double(__a);
  495. }
  496. __DEVICE__ int __mul24(int __a, int __b) { return __nv_mul24(__a, __b); }
  497. __DEVICE__ long long __mul64hi(long long __a, long long __b) {
  498.   return __nv_mul64hi(__a, __b);
  499. }
  500. __DEVICE__ int __mulhi(int __a, int __b) { return __nv_mulhi(__a, __b); }
  501. __DEVICE__ unsigned int __pm0(void) { return __nvvm_read_ptx_sreg_pm0(); }
  502. __DEVICE__ unsigned int __pm1(void) { return __nvvm_read_ptx_sreg_pm1(); }
  503. __DEVICE__ unsigned int __pm2(void) { return __nvvm_read_ptx_sreg_pm2(); }
  504. __DEVICE__ unsigned int __pm3(void) { return __nvvm_read_ptx_sreg_pm3(); }
  505. __DEVICE__ int __popc(int __a) { return __nv_popc(__a); }
  506. __DEVICE__ int __popcll(long long __a) { return __nv_popcll(__a); }
  507. __DEVICE__ float __powf(float __a, float __b) {
  508.   return __nv_fast_powf(__a, __b);
  509. }
  510.  
  511. // Parameter must have a known integer value.
  512. #define __prof_trigger(__a) __asm__ __volatile__("pmevent \t%0;" ::"i"(__a))
  513. __DEVICE__ int __rhadd(int __a, int __b) { return __nv_rhadd(__a, __b); }
  514. __DEVICE__ unsigned int __sad(int __a, int __b, unsigned int __c) {
  515.   return __nv_sad(__a, __b, __c);
  516. }
  517. __DEVICE__ float __saturatef(float __a) { return __nv_saturatef(__a); }
  518. __DEVICE__ int __signbitd(double __a) { return __nv_signbitd(__a); }
  519. __DEVICE__ int __signbitf(float __a) { return __nv_signbitf(__a); }
  520. __DEVICE__ void __sincosf(float __a, float *__s, float *__c) {
  521.   return __nv_fast_sincosf(__a, __s, __c);
  522. }
  523. __DEVICE__ float __sinf(float __a) { return __nv_fast_sinf(__a); }
  524. __DEVICE__ int __syncthreads_and(int __a) { return __nvvm_bar0_and(__a); }
  525. __DEVICE__ int __syncthreads_count(int __a) { return __nvvm_bar0_popc(__a); }
  526. __DEVICE__ int __syncthreads_or(int __a) { return __nvvm_bar0_or(__a); }
  527. __DEVICE__ float __tanf(float __a) { return __nv_fast_tanf(__a); }
  528. __DEVICE__ void __threadfence(void) { __nvvm_membar_gl(); }
  529. __DEVICE__ void __threadfence_block(void) { __nvvm_membar_cta(); };
  530. __DEVICE__ void __threadfence_system(void) { __nvvm_membar_sys(); };
  531. __DEVICE__ void __trap(void) { __asm__ __volatile__("trap;"); }
  532. __DEVICE__ unsigned int __uAtomicAdd(unsigned int *__p, unsigned int __v) {
  533.   return __nvvm_atom_add_gen_i((int *)__p, __v);
  534. }
  535. __DEVICE__ unsigned int __uAtomicAdd_block(unsigned int *__p,
  536.                                            unsigned int __v) {
  537.   return __nvvm_atom_cta_add_gen_i((int *)__p, __v);
  538. }
  539. __DEVICE__ unsigned int __uAtomicAdd_system(unsigned int *__p,
  540.                                             unsigned int __v) {
  541.   return __nvvm_atom_sys_add_gen_i((int *)__p, __v);
  542. }
  543. __DEVICE__ unsigned int __uAtomicAnd(unsigned int *__p, unsigned int __v) {
  544.   return __nvvm_atom_and_gen_i((int *)__p, __v);
  545. }
  546. __DEVICE__ unsigned int __uAtomicAnd_block(unsigned int *__p,
  547.                                            unsigned int __v) {
  548.   return __nvvm_atom_cta_and_gen_i((int *)__p, __v);
  549. }
  550. __DEVICE__ unsigned int __uAtomicAnd_system(unsigned int *__p,
  551.                                             unsigned int __v) {
  552.   return __nvvm_atom_sys_and_gen_i((int *)__p, __v);
  553. }
  554. __DEVICE__ unsigned int __uAtomicCAS(unsigned int *__p, unsigned int __cmp,
  555.                                      unsigned int __v) {
  556.   return __nvvm_atom_cas_gen_i((int *)__p, __cmp, __v);
  557. }
  558. __DEVICE__ unsigned int
  559. __uAtomicCAS_block(unsigned int *__p, unsigned int __cmp, unsigned int __v) {
  560.   return __nvvm_atom_cta_cas_gen_i((int *)__p, __cmp, __v);
  561. }
  562. __DEVICE__ unsigned int
  563. __uAtomicCAS_system(unsigned int *__p, unsigned int __cmp, unsigned int __v) {
  564.   return __nvvm_atom_sys_cas_gen_i((int *)__p, __cmp, __v);
  565. }
  566. __DEVICE__ unsigned int __uAtomicDec(unsigned int *__p, unsigned int __v) {
  567.   return __nvvm_atom_dec_gen_ui(__p, __v);
  568. }
  569. __DEVICE__ unsigned int __uAtomicDec_block(unsigned int *__p,
  570.                                            unsigned int __v) {
  571.   return __nvvm_atom_cta_dec_gen_ui(__p, __v);
  572. }
  573. __DEVICE__ unsigned int __uAtomicDec_system(unsigned int *__p,
  574.                                             unsigned int __v) {
  575.   return __nvvm_atom_sys_dec_gen_ui(__p, __v);
  576. }
  577. __DEVICE__ unsigned int __uAtomicExch(unsigned int *__p, unsigned int __v) {
  578.   return __nvvm_atom_xchg_gen_i((int *)__p, __v);
  579. }
  580. __DEVICE__ unsigned int __uAtomicExch_block(unsigned int *__p,
  581.                                             unsigned int __v) {
  582.   return __nvvm_atom_cta_xchg_gen_i((int *)__p, __v);
  583. }
  584. __DEVICE__ unsigned int __uAtomicExch_system(unsigned int *__p,
  585.                                              unsigned int __v) {
  586.   return __nvvm_atom_sys_xchg_gen_i((int *)__p, __v);
  587. }
  588. __DEVICE__ unsigned int __uAtomicInc(unsigned int *__p, unsigned int __v) {
  589.   return __nvvm_atom_inc_gen_ui(__p, __v);
  590. }
  591. __DEVICE__ unsigned int __uAtomicInc_block(unsigned int *__p,
  592.                                            unsigned int __v) {
  593.   return __nvvm_atom_cta_inc_gen_ui(__p, __v);
  594. }
  595. __DEVICE__ unsigned int __uAtomicInc_system(unsigned int *__p,
  596.                                             unsigned int __v) {
  597.   return __nvvm_atom_sys_inc_gen_ui(__p, __v);
  598. }
  599. __DEVICE__ unsigned int __uAtomicMax(unsigned int *__p, unsigned int __v) {
  600.   return __nvvm_atom_max_gen_ui(__p, __v);
  601. }
  602. __DEVICE__ unsigned int __uAtomicMax_block(unsigned int *__p,
  603.                                            unsigned int __v) {
  604.   return __nvvm_atom_cta_max_gen_ui(__p, __v);
  605. }
  606. __DEVICE__ unsigned int __uAtomicMax_system(unsigned int *__p,
  607.                                             unsigned int __v) {
  608.   return __nvvm_atom_sys_max_gen_ui(__p, __v);
  609. }
  610. __DEVICE__ unsigned int __uAtomicMin(unsigned int *__p, unsigned int __v) {
  611.   return __nvvm_atom_min_gen_ui(__p, __v);
  612. }
  613. __DEVICE__ unsigned int __uAtomicMin_block(unsigned int *__p,
  614.                                            unsigned int __v) {
  615.   return __nvvm_atom_cta_min_gen_ui(__p, __v);
  616. }
  617. __DEVICE__ unsigned int __uAtomicMin_system(unsigned int *__p,
  618.                                             unsigned int __v) {
  619.   return __nvvm_atom_sys_min_gen_ui(__p, __v);
  620. }
  621. __DEVICE__ unsigned int __uAtomicOr(unsigned int *__p, unsigned int __v) {
  622.   return __nvvm_atom_or_gen_i((int *)__p, __v);
  623. }
  624. __DEVICE__ unsigned int __uAtomicOr_block(unsigned int *__p, unsigned int __v) {
  625.   return __nvvm_atom_cta_or_gen_i((int *)__p, __v);
  626. }
  627. __DEVICE__ unsigned int __uAtomicOr_system(unsigned int *__p,
  628.                                            unsigned int __v) {
  629.   return __nvvm_atom_sys_or_gen_i((int *)__p, __v);
  630. }
  631. __DEVICE__ unsigned int __uAtomicXor(unsigned int *__p, unsigned int __v) {
  632.   return __nvvm_atom_xor_gen_i((int *)__p, __v);
  633. }
  634. __DEVICE__ unsigned int __uAtomicXor_block(unsigned int *__p,
  635.                                            unsigned int __v) {
  636.   return __nvvm_atom_cta_xor_gen_i((int *)__p, __v);
  637. }
  638. __DEVICE__ unsigned int __uAtomicXor_system(unsigned int *__p,
  639.                                             unsigned int __v) {
  640.   return __nvvm_atom_sys_xor_gen_i((int *)__p, __v);
  641. }
  642. __DEVICE__ unsigned int __uhadd(unsigned int __a, unsigned int __b) {
  643.   return __nv_uhadd(__a, __b);
  644. }
  645. __DEVICE__ double __uint2double_rn(unsigned int __a) {
  646.   return __nv_uint2double_rn(__a);
  647. }
  648. __DEVICE__ float __uint2float_rd(unsigned int __a) {
  649.   return __nv_uint2float_rd(__a);
  650. }
  651. __DEVICE__ float __uint2float_rn(unsigned int __a) {
  652.   return __nv_uint2float_rn(__a);
  653. }
  654. __DEVICE__ float __uint2float_ru(unsigned int __a) {
  655.   return __nv_uint2float_ru(__a);
  656. }
  657. __DEVICE__ float __uint2float_rz(unsigned int __a) {
  658.   return __nv_uint2float_rz(__a);
  659. }
  660. __DEVICE__ float __uint_as_float(unsigned int __a) {
  661.   return __nv_uint_as_float(__a);
  662. } //
  663. __DEVICE__ double __ull2double_rd(unsigned long long __a) {
  664.   return __nv_ull2double_rd(__a);
  665. }
  666. __DEVICE__ double __ull2double_rn(unsigned long long __a) {
  667.   return __nv_ull2double_rn(__a);
  668. }
  669. __DEVICE__ double __ull2double_ru(unsigned long long __a) {
  670.   return __nv_ull2double_ru(__a);
  671. }
  672. __DEVICE__ double __ull2double_rz(unsigned long long __a) {
  673.   return __nv_ull2double_rz(__a);
  674. }
  675. __DEVICE__ float __ull2float_rd(unsigned long long __a) {
  676.   return __nv_ull2float_rd(__a);
  677. }
  678. __DEVICE__ float __ull2float_rn(unsigned long long __a) {
  679.   return __nv_ull2float_rn(__a);
  680. }
  681. __DEVICE__ float __ull2float_ru(unsigned long long __a) {
  682.   return __nv_ull2float_ru(__a);
  683. }
  684. __DEVICE__ float __ull2float_rz(unsigned long long __a) {
  685.   return __nv_ull2float_rz(__a);
  686. }
  687. __DEVICE__ unsigned long long __ullAtomicAdd(unsigned long long *__p,
  688.                                              unsigned long long __v) {
  689.   return __nvvm_atom_add_gen_ll((long long *)__p, __v);
  690. }
  691. __DEVICE__ unsigned long long __ullAtomicAdd_block(unsigned long long *__p,
  692.                                                    unsigned long long __v) {
  693.   return __nvvm_atom_cta_add_gen_ll((long long *)__p, __v);
  694. }
  695. __DEVICE__ unsigned long long __ullAtomicAdd_system(unsigned long long *__p,
  696.                                                     unsigned long long __v) {
  697.   return __nvvm_atom_sys_add_gen_ll((long long *)__p, __v);
  698. }
  699. __DEVICE__ unsigned long long __ullAtomicAnd(unsigned long long *__p,
  700.                                              unsigned long long __v) {
  701.   return __nvvm_atom_and_gen_ll((long long *)__p, __v);
  702. }
  703. __DEVICE__ unsigned long long __ullAtomicAnd_block(unsigned long long *__p,
  704.                                                    unsigned long long __v) {
  705.   return __nvvm_atom_cta_and_gen_ll((long long *)__p, __v);
  706. }
  707. __DEVICE__ unsigned long long __ullAtomicAnd_system(unsigned long long *__p,
  708.                                                     unsigned long long __v) {
  709.   return __nvvm_atom_sys_and_gen_ll((long long *)__p, __v);
  710. }
  711. __DEVICE__ unsigned long long __ullAtomicCAS(unsigned long long *__p,
  712.                                              unsigned long long __cmp,
  713.                                              unsigned long long __v) {
  714.   return __nvvm_atom_cas_gen_ll((long long *)__p, __cmp, __v);
  715. }
  716. __DEVICE__ unsigned long long __ullAtomicCAS_block(unsigned long long *__p,
  717.                                                    unsigned long long __cmp,
  718.                                                    unsigned long long __v) {
  719.   return __nvvm_atom_cta_cas_gen_ll((long long *)__p, __cmp, __v);
  720. }
  721. __DEVICE__ unsigned long long __ullAtomicCAS_system(unsigned long long *__p,
  722.                                                     unsigned long long __cmp,
  723.                                                     unsigned long long __v) {
  724.   return __nvvm_atom_sys_cas_gen_ll((long long *)__p, __cmp, __v);
  725. }
  726. __DEVICE__ unsigned long long __ullAtomicExch(unsigned long long *__p,
  727.                                               unsigned long long __v) {
  728.   return __nvvm_atom_xchg_gen_ll((long long *)__p, __v);
  729. }
  730. __DEVICE__ unsigned long long __ullAtomicExch_block(unsigned long long *__p,
  731.                                                     unsigned long long __v) {
  732.   return __nvvm_atom_cta_xchg_gen_ll((long long *)__p, __v);
  733. }
  734. __DEVICE__ unsigned long long __ullAtomicExch_system(unsigned long long *__p,
  735.                                                      unsigned long long __v) {
  736.   return __nvvm_atom_sys_xchg_gen_ll((long long *)__p, __v);
  737. }
  738. __DEVICE__ unsigned long long __ullAtomicMax(unsigned long long *__p,
  739.                                              unsigned long long __v) {
  740.   return __nvvm_atom_max_gen_ull(__p, __v);
  741. }
  742. __DEVICE__ unsigned long long __ullAtomicMax_block(unsigned long long *__p,
  743.                                                    unsigned long long __v) {
  744.   return __nvvm_atom_cta_max_gen_ull(__p, __v);
  745. }
  746. __DEVICE__ unsigned long long __ullAtomicMax_system(unsigned long long *__p,
  747.                                                     unsigned long long __v) {
  748.   return __nvvm_atom_sys_max_gen_ull(__p, __v);
  749. }
  750. __DEVICE__ unsigned long long __ullAtomicMin(unsigned long long *__p,
  751.                                              unsigned long long __v) {
  752.   return __nvvm_atom_min_gen_ull(__p, __v);
  753. }
  754. __DEVICE__ unsigned long long __ullAtomicMin_block(unsigned long long *__p,
  755.                                                    unsigned long long __v) {
  756.   return __nvvm_atom_cta_min_gen_ull(__p, __v);
  757. }
  758. __DEVICE__ unsigned long long __ullAtomicMin_system(unsigned long long *__p,
  759.                                                     unsigned long long __v) {
  760.   return __nvvm_atom_sys_min_gen_ull(__p, __v);
  761. }
  762. __DEVICE__ unsigned long long __ullAtomicOr(unsigned long long *__p,
  763.                                             unsigned long long __v) {
  764.   return __nvvm_atom_or_gen_ll((long long *)__p, __v);
  765. }
  766. __DEVICE__ unsigned long long __ullAtomicOr_block(unsigned long long *__p,
  767.                                                   unsigned long long __v) {
  768.   return __nvvm_atom_cta_or_gen_ll((long long *)__p, __v);
  769. }
  770. __DEVICE__ unsigned long long __ullAtomicOr_system(unsigned long long *__p,
  771.                                                    unsigned long long __v) {
  772.   return __nvvm_atom_sys_or_gen_ll((long long *)__p, __v);
  773. }
  774. __DEVICE__ unsigned long long __ullAtomicXor(unsigned long long *__p,
  775.                                              unsigned long long __v) {
  776.   return __nvvm_atom_xor_gen_ll((long long *)__p, __v);
  777. }
  778. __DEVICE__ unsigned long long __ullAtomicXor_block(unsigned long long *__p,
  779.                                                    unsigned long long __v) {
  780.   return __nvvm_atom_cta_xor_gen_ll((long long *)__p, __v);
  781. }
  782. __DEVICE__ unsigned long long __ullAtomicXor_system(unsigned long long *__p,
  783.                                                     unsigned long long __v) {
  784.   return __nvvm_atom_sys_xor_gen_ll((long long *)__p, __v);
  785. }
  786. __DEVICE__ unsigned int __umul24(unsigned int __a, unsigned int __b) {
  787.   return __nv_umul24(__a, __b);
  788. }
  789. __DEVICE__ unsigned long long __umul64hi(unsigned long long __a,
  790.                                          unsigned long long __b) {
  791.   return __nv_umul64hi(__a, __b);
  792. }
  793. __DEVICE__ unsigned int __umulhi(unsigned int __a, unsigned int __b) {
  794.   return __nv_umulhi(__a, __b);
  795. }
  796. __DEVICE__ unsigned int __urhadd(unsigned int __a, unsigned int __b) {
  797.   return __nv_urhadd(__a, __b);
  798. }
  799. __DEVICE__ unsigned int __usad(unsigned int __a, unsigned int __b,
  800.                                unsigned int __c) {
  801.   return __nv_usad(__a, __b, __c);
  802. }
  803.  
  804. #if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020
  805. __DEVICE__ unsigned int __vabs2(unsigned int __a) { return __nv_vabs2(__a); }
  806. __DEVICE__ unsigned int __vabs4(unsigned int __a) { return __nv_vabs4(__a); }
  807. __DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) {
  808.   return __nv_vabsdiffs2(__a, __b);
  809. }
  810. __DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) {
  811.   return __nv_vabsdiffs4(__a, __b);
  812. }
  813. __DEVICE__ unsigned int __vabsdiffu2(unsigned int __a, unsigned int __b) {
  814.   return __nv_vabsdiffu2(__a, __b);
  815. }
  816. __DEVICE__ unsigned int __vabsdiffu4(unsigned int __a, unsigned int __b) {
  817.   return __nv_vabsdiffu4(__a, __b);
  818. }
  819. __DEVICE__ unsigned int __vabsss2(unsigned int __a) {
  820.   return __nv_vabsss2(__a);
  821. }
  822. __DEVICE__ unsigned int __vabsss4(unsigned int __a) {
  823.   return __nv_vabsss4(__a);
  824. }
  825. __DEVICE__ unsigned int __vadd2(unsigned int __a, unsigned int __b) {
  826.   return __nv_vadd2(__a, __b);
  827. }
  828. __DEVICE__ unsigned int __vadd4(unsigned int __a, unsigned int __b) {
  829.   return __nv_vadd4(__a, __b);
  830. }
  831. __DEVICE__ unsigned int __vaddss2(unsigned int __a, unsigned int __b) {
  832.   return __nv_vaddss2(__a, __b);
  833. }
  834. __DEVICE__ unsigned int __vaddss4(unsigned int __a, unsigned int __b) {
  835.   return __nv_vaddss4(__a, __b);
  836. }
  837. __DEVICE__ unsigned int __vaddus2(unsigned int __a, unsigned int __b) {
  838.   return __nv_vaddus2(__a, __b);
  839. }
  840. __DEVICE__ unsigned int __vaddus4(unsigned int __a, unsigned int __b) {
  841.   return __nv_vaddus4(__a, __b);
  842. }
  843. __DEVICE__ unsigned int __vavgs2(unsigned int __a, unsigned int __b) {
  844.   return __nv_vavgs2(__a, __b);
  845. }
  846. __DEVICE__ unsigned int __vavgs4(unsigned int __a, unsigned int __b) {
  847.   return __nv_vavgs4(__a, __b);
  848. }
  849. __DEVICE__ unsigned int __vavgu2(unsigned int __a, unsigned int __b) {
  850.   return __nv_vavgu2(__a, __b);
  851. }
  852. __DEVICE__ unsigned int __vavgu4(unsigned int __a, unsigned int __b) {
  853.   return __nv_vavgu4(__a, __b);
  854. }
  855. __DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) {
  856.   return __nv_vcmpeq2(__a, __b);
  857. }
  858. __DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) {
  859.   return __nv_vcmpeq4(__a, __b);
  860. }
  861. __DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) {
  862.   return __nv_vcmpges2(__a, __b);
  863. }
  864. __DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) {
  865.   return __nv_vcmpges4(__a, __b);
  866. }
  867. __DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) {
  868.   return __nv_vcmpgeu2(__a, __b);
  869. }
  870. __DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) {
  871.   return __nv_vcmpgeu4(__a, __b);
  872. }
  873. __DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) {
  874.   return __nv_vcmpgts2(__a, __b);
  875. }
  876. __DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) {
  877.   return __nv_vcmpgts4(__a, __b);
  878. }
  879. __DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) {
  880.   return __nv_vcmpgtu2(__a, __b);
  881. }
  882. __DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) {
  883.   return __nv_vcmpgtu4(__a, __b);
  884. }
  885. __DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) {
  886.   return __nv_vcmples2(__a, __b);
  887. }
  888. __DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) {
  889.   return __nv_vcmples4(__a, __b);
  890. }
  891. __DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) {
  892.   return __nv_vcmpleu2(__a, __b);
  893. }
  894. __DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) {
  895.   return __nv_vcmpleu4(__a, __b);
  896. }
  897. __DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) {
  898.   return __nv_vcmplts2(__a, __b);
  899. }
  900. __DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) {
  901.   return __nv_vcmplts4(__a, __b);
  902. }
  903. __DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) {
  904.   return __nv_vcmpltu2(__a, __b);
  905. }
  906. __DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) {
  907.   return __nv_vcmpltu4(__a, __b);
  908. }
  909. __DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) {
  910.   return __nv_vcmpne2(__a, __b);
  911. }
  912. __DEVICE__ unsigned int __vcmpne4(unsigned int __a, unsigned int __b) {
  913.   return __nv_vcmpne4(__a, __b);
  914. }
  915. __DEVICE__ unsigned int __vhaddu2(unsigned int __a, unsigned int __b) {
  916.   return __nv_vhaddu2(__a, __b);
  917. }
  918. __DEVICE__ unsigned int __vhaddu4(unsigned int __a, unsigned int __b) {
  919.   return __nv_vhaddu4(__a, __b);
  920. }
  921. __DEVICE__ unsigned int __vmaxs2(unsigned int __a, unsigned int __b) {
  922.   return __nv_vmaxs2(__a, __b);
  923. }
  924. __DEVICE__ unsigned int __vmaxs4(unsigned int __a, unsigned int __b) {
  925.   return __nv_vmaxs4(__a, __b);
  926. }
  927. __DEVICE__ unsigned int __vmaxu2(unsigned int __a, unsigned int __b) {
  928.   return __nv_vmaxu2(__a, __b);
  929. }
  930. __DEVICE__ unsigned int __vmaxu4(unsigned int __a, unsigned int __b) {
  931.   return __nv_vmaxu4(__a, __b);
  932. }
  933. __DEVICE__ unsigned int __vmins2(unsigned int __a, unsigned int __b) {
  934.   return __nv_vmins2(__a, __b);
  935. }
  936. __DEVICE__ unsigned int __vmins4(unsigned int __a, unsigned int __b) {
  937.   return __nv_vmins4(__a, __b);
  938. }
  939. __DEVICE__ unsigned int __vminu2(unsigned int __a, unsigned int __b) {
  940.   return __nv_vminu2(__a, __b);
  941. }
  942. __DEVICE__ unsigned int __vminu4(unsigned int __a, unsigned int __b) {
  943.   return __nv_vminu4(__a, __b);
  944. }
  945. __DEVICE__ unsigned int __vneg2(unsigned int __a) { return __nv_vneg2(__a); }
  946. __DEVICE__ unsigned int __vneg4(unsigned int __a) { return __nv_vneg4(__a); }
  947. __DEVICE__ unsigned int __vnegss2(unsigned int __a) {
  948.   return __nv_vnegss2(__a);
  949. }
  950. __DEVICE__ unsigned int __vnegss4(unsigned int __a) {
  951.   return __nv_vnegss4(__a);
  952. }
  953. __DEVICE__ unsigned int __vsads2(unsigned int __a, unsigned int __b) {
  954.   return __nv_vsads2(__a, __b);
  955. }
  956. __DEVICE__ unsigned int __vsads4(unsigned int __a, unsigned int __b) {
  957.   return __nv_vsads4(__a, __b);
  958. }
  959. __DEVICE__ unsigned int __vsadu2(unsigned int __a, unsigned int __b) {
  960.   return __nv_vsadu2(__a, __b);
  961. }
  962. __DEVICE__ unsigned int __vsadu4(unsigned int __a, unsigned int __b) {
  963.   return __nv_vsadu4(__a, __b);
  964. }
  965. __DEVICE__ unsigned int __vseteq2(unsigned int __a, unsigned int __b) {
  966.   return __nv_vseteq2(__a, __b);
  967. }
  968. __DEVICE__ unsigned int __vseteq4(unsigned int __a, unsigned int __b) {
  969.   return __nv_vseteq4(__a, __b);
  970. }
  971. __DEVICE__ unsigned int __vsetges2(unsigned int __a, unsigned int __b) {
  972.   return __nv_vsetges2(__a, __b);
  973. }
  974. __DEVICE__ unsigned int __vsetges4(unsigned int __a, unsigned int __b) {
  975.   return __nv_vsetges4(__a, __b);
  976. }
  977. __DEVICE__ unsigned int __vsetgeu2(unsigned int __a, unsigned int __b) {
  978.   return __nv_vsetgeu2(__a, __b);
  979. }
  980. __DEVICE__ unsigned int __vsetgeu4(unsigned int __a, unsigned int __b) {
  981.   return __nv_vsetgeu4(__a, __b);
  982. }
  983. __DEVICE__ unsigned int __vsetgts2(unsigned int __a, unsigned int __b) {
  984.   return __nv_vsetgts2(__a, __b);
  985. }
  986. __DEVICE__ unsigned int __vsetgts4(unsigned int __a, unsigned int __b) {
  987.   return __nv_vsetgts4(__a, __b);
  988. }
  989. __DEVICE__ unsigned int __vsetgtu2(unsigned int __a, unsigned int __b) {
  990.   return __nv_vsetgtu2(__a, __b);
  991. }
  992. __DEVICE__ unsigned int __vsetgtu4(unsigned int __a, unsigned int __b) {
  993.   return __nv_vsetgtu4(__a, __b);
  994. }
  995. __DEVICE__ unsigned int __vsetles2(unsigned int __a, unsigned int __b) {
  996.   return __nv_vsetles2(__a, __b);
  997. }
  998. __DEVICE__ unsigned int __vsetles4(unsigned int __a, unsigned int __b) {
  999.   return __nv_vsetles4(__a, __b);
  1000. }
  1001. __DEVICE__ unsigned int __vsetleu2(unsigned int __a, unsigned int __b) {
  1002.   return __nv_vsetleu2(__a, __b);
  1003. }
  1004. __DEVICE__ unsigned int __vsetleu4(unsigned int __a, unsigned int __b) {
  1005.   return __nv_vsetleu4(__a, __b);
  1006. }
  1007. __DEVICE__ unsigned int __vsetlts2(unsigned int __a, unsigned int __b) {
  1008.   return __nv_vsetlts2(__a, __b);
  1009. }
  1010. __DEVICE__ unsigned int __vsetlts4(unsigned int __a, unsigned int __b) {
  1011.   return __nv_vsetlts4(__a, __b);
  1012. }
  1013. __DEVICE__ unsigned int __vsetltu2(unsigned int __a, unsigned int __b) {
  1014.   return __nv_vsetltu2(__a, __b);
  1015. }
  1016. __DEVICE__ unsigned int __vsetltu4(unsigned int __a, unsigned int __b) {
  1017.   return __nv_vsetltu4(__a, __b);
  1018. }
  1019. __DEVICE__ unsigned int __vsetne2(unsigned int __a, unsigned int __b) {
  1020.   return __nv_vsetne2(__a, __b);
  1021. }
  1022. __DEVICE__ unsigned int __vsetne4(unsigned int __a, unsigned int __b) {
  1023.   return __nv_vsetne4(__a, __b);
  1024. }
  1025. __DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) {
  1026.   return __nv_vsub2(__a, __b);
  1027. }
  1028. __DEVICE__ unsigned int __vsub4(unsigned int __a, unsigned int __b) {
  1029.   return __nv_vsub4(__a, __b);
  1030. }
  1031. __DEVICE__ unsigned int __vsubss2(unsigned int __a, unsigned int __b) {
  1032.   return __nv_vsubss2(__a, __b);
  1033. }
  1034. __DEVICE__ unsigned int __vsubss4(unsigned int __a, unsigned int __b) {
  1035.   return __nv_vsubss4(__a, __b);
  1036. }
  1037. __DEVICE__ unsigned int __vsubus2(unsigned int __a, unsigned int __b) {
  1038.   return __nv_vsubus2(__a, __b);
  1039. }
  1040. __DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) {
  1041.   return __nv_vsubus4(__a, __b);
  1042. }
  1043. #else // CUDA_VERSION >= 9020
  1044. // CUDA no longer provides inline assembly (or bitcode) implementation of these
  1045. // functions, so we have to reimplment them. The implementation is naive and is
  1046. // not optimized for performance.
  1047.  
  1048. // Helper function to convert N-bit boolean subfields into all-0 or all-1.
  1049. // E.g. __bool2mask(0x01000100,8) -> 0xff00ff00
  1050. //      __bool2mask(0x00010000,16) -> 0xffff0000
  1051. __DEVICE__ unsigned int __bool2mask(unsigned int __a, int shift) {
  1052.   return (__a << shift) - __a;
  1053. }
  1054. __DEVICE__ unsigned int __vabs2(unsigned int __a) {
  1055.   unsigned int r;
  1056.   __asm__("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;"
  1057.           : "=r"(r)
  1058.           : "r"(__a), "r"(0), "r"(0));
  1059.   return r;
  1060. }
  1061. __DEVICE__ unsigned int __vabs4(unsigned int __a) {
  1062.   unsigned int r;
  1063.   __asm__("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;"
  1064.           : "=r"(r)
  1065.           : "r"(__a), "r"(0), "r"(0));
  1066.   return r;
  1067. }
  1068. __DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) {
  1069.   unsigned int r;
  1070.   __asm__("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;"
  1071.           : "=r"(r)
  1072.           : "r"(__a), "r"(__b), "r"(0));
  1073.   return r;
  1074. }
  1075.  
  1076. __DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) {
  1077.   unsigned int r;
  1078.   __asm__("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;"
  1079.           : "=r"(r)
  1080.           : "r"(__a), "r"(__b), "r"(0));
  1081.   return r;
  1082. }
  1083. __DEVICE__ unsigned int __vabsdiffu2(unsigned int __a, unsigned int __b) {
  1084.   unsigned int r;
  1085.   __asm__("vabsdiff2.u32.u32.u32 %0,%1,%2,%3;"
  1086.           : "=r"(r)
  1087.           : "r"(__a), "r"(__b), "r"(0));
  1088.   return r;
  1089. }
  1090. __DEVICE__ unsigned int __vabsdiffu4(unsigned int __a, unsigned int __b) {
  1091.   unsigned int r;
  1092.   __asm__("vabsdiff4.u32.u32.u32 %0,%1,%2,%3;"
  1093.           : "=r"(r)
  1094.           : "r"(__a), "r"(__b), "r"(0));
  1095.   return r;
  1096. }
  1097. __DEVICE__ unsigned int __vabsss2(unsigned int __a) {
  1098.   unsigned int r;
  1099.   __asm__("vabsdiff2.s32.s32.s32.sat %0,%1,%2,%3;"
  1100.           : "=r"(r)
  1101.           : "r"(__a), "r"(0), "r"(0));
  1102.   return r;
  1103. }
  1104. __DEVICE__ unsigned int __vabsss4(unsigned int __a) {
  1105.   unsigned int r;
  1106.   __asm__("vabsdiff4.s32.s32.s32.sat %0,%1,%2,%3;"
  1107.           : "=r"(r)
  1108.           : "r"(__a), "r"(0), "r"(0));
  1109.   return r;
  1110. }
  1111. __DEVICE__ unsigned int __vadd2(unsigned int __a, unsigned int __b) {
  1112.   unsigned int r;
  1113.   __asm__("vadd2.u32.u32.u32 %0,%1,%2,%3;"
  1114.           : "=r"(r)
  1115.           : "r"(__a), "r"(__b), "r"(0));
  1116.   return r;
  1117. }
  1118. __DEVICE__ unsigned int __vadd4(unsigned int __a, unsigned int __b) {
  1119.   unsigned int r;
  1120.   __asm__("vadd4.u32.u32.u32 %0,%1,%2,%3;"
  1121.           : "=r"(r)
  1122.           : "r"(__a), "r"(__b), "r"(0));
  1123.   return r;
  1124. }
  1125. __DEVICE__ unsigned int __vaddss2(unsigned int __a, unsigned int __b) {
  1126.   unsigned int r;
  1127.   __asm__("vadd2.s32.s32.s32.sat %0,%1,%2,%3;"
  1128.           : "=r"(r)
  1129.           : "r"(__a), "r"(__b), "r"(0));
  1130.   return r;
  1131. }
  1132. __DEVICE__ unsigned int __vaddss4(unsigned int __a, unsigned int __b) {
  1133.   unsigned int r;
  1134.   __asm__("vadd4.s32.s32.s32.sat %0,%1,%2,%3;"
  1135.           : "=r"(r)
  1136.           : "r"(__a), "r"(__b), "r"(0));
  1137.   return r;
  1138. }
  1139. __DEVICE__ unsigned int __vaddus2(unsigned int __a, unsigned int __b) {
  1140.   unsigned int r;
  1141.   __asm__("vadd2.u32.u32.u32.sat %0,%1,%2,%3;"
  1142.           : "=r"(r)
  1143.           : "r"(__a), "r"(__b), "r"(0));
  1144.   return r;
  1145. }
  1146. __DEVICE__ unsigned int __vaddus4(unsigned int __a, unsigned int __b) {
  1147.   unsigned int r;
  1148.   __asm__("vadd4.u32.u32.u32.sat %0,%1,%2,%3;"
  1149.           : "=r"(r)
  1150.           : "r"(__a), "r"(__b), "r"(0));
  1151.   return r;
  1152. }
  1153. __DEVICE__ unsigned int __vavgs2(unsigned int __a, unsigned int __b) {
  1154.   unsigned int r;
  1155.   __asm__("vavrg2.s32.s32.s32 %0,%1,%2,%3;"
  1156.           : "=r"(r)
  1157.           : "r"(__a), "r"(__b), "r"(0));
  1158.   return r;
  1159. }
  1160. __DEVICE__ unsigned int __vavgs4(unsigned int __a, unsigned int __b) {
  1161.   unsigned int r;
  1162.   __asm__("vavrg4.s32.s32.s32 %0,%1,%2,%3;"
  1163.           : "=r"(r)
  1164.           : "r"(__a), "r"(__b), "r"(0));
  1165.   return r;
  1166. }
  1167. __DEVICE__ unsigned int __vavgu2(unsigned int __a, unsigned int __b) {
  1168.   unsigned int r;
  1169.   __asm__("vavrg2.u32.u32.u32 %0,%1,%2,%3;"
  1170.           : "=r"(r)
  1171.           : "r"(__a), "r"(__b), "r"(0));
  1172.   return r;
  1173. }
  1174. __DEVICE__ unsigned int __vavgu4(unsigned int __a, unsigned int __b) {
  1175.   unsigned int r;
  1176.   __asm__("vavrg4.u32.u32.u32 %0,%1,%2,%3;"
  1177.           : "=r"(r)
  1178.           : "r"(__a), "r"(__b), "r"(0));
  1179.   return r;
  1180. }
  1181. __DEVICE__ unsigned int __vseteq2(unsigned int __a, unsigned int __b) {
  1182.   unsigned int r;
  1183.   __asm__("vset2.u32.u32.eq %0,%1,%2,%3;"
  1184.           : "=r"(r)
  1185.           : "r"(__a), "r"(__b), "r"(0));
  1186.   return r;
  1187. }
  1188. __DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) {
  1189.   return __bool2mask(__vseteq2(__a, __b), 16);
  1190. }
  1191. __DEVICE__ unsigned int __vseteq4(unsigned int __a, unsigned int __b) {
  1192.   unsigned int r;
  1193.   __asm__("vset4.u32.u32.eq %0,%1,%2,%3;"
  1194.           : "=r"(r)
  1195.           : "r"(__a), "r"(__b), "r"(0));
  1196.   return r;
  1197. }
  1198. __DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) {
  1199.   return __bool2mask(__vseteq4(__a, __b), 8);
  1200. }
  1201. __DEVICE__ unsigned int __vsetges2(unsigned int __a, unsigned int __b) {
  1202.   unsigned int r;
  1203.   __asm__("vset2.s32.s32.ge %0,%1,%2,%3;"
  1204.           : "=r"(r)
  1205.           : "r"(__a), "r"(__b), "r"(0));
  1206.   return r;
  1207. }
  1208. __DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) {
  1209.   return __bool2mask(__vsetges2(__a, __b), 16);
  1210. }
  1211. __DEVICE__ unsigned int __vsetges4(unsigned int __a, unsigned int __b) {
  1212.   unsigned int r;
  1213.   __asm__("vset4.s32.s32.ge %0,%1,%2,%3;"
  1214.           : "=r"(r)
  1215.           : "r"(__a), "r"(__b), "r"(0));
  1216.   return r;
  1217. }
  1218. __DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) {
  1219.   return __bool2mask(__vsetges4(__a, __b), 8);
  1220. }
  1221. __DEVICE__ unsigned int __vsetgeu2(unsigned int __a, unsigned int __b) {
  1222.   unsigned int r;
  1223.   __asm__("vset2.u32.u32.ge %0,%1,%2,%3;"
  1224.           : "=r"(r)
  1225.           : "r"(__a), "r"(__b), "r"(0));
  1226.   return r;
  1227. }
  1228. __DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) {
  1229.   return __bool2mask(__vsetgeu2(__a, __b), 16);
  1230. }
  1231. __DEVICE__ unsigned int __vsetgeu4(unsigned int __a, unsigned int __b) {
  1232.   unsigned int r;
  1233.   __asm__("vset4.u32.u32.ge %0,%1,%2,%3;"
  1234.           : "=r"(r)
  1235.           : "r"(__a), "r"(__b), "r"(0));
  1236.   return r;
  1237. }
  1238. __DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) {
  1239.   return __bool2mask(__vsetgeu4(__a, __b), 8);
  1240. }
  1241. __DEVICE__ unsigned int __vsetgts2(unsigned int __a, unsigned int __b) {
  1242.   unsigned int r;
  1243.   __asm__("vset2.s32.s32.gt %0,%1,%2,%3;"
  1244.           : "=r"(r)
  1245.           : "r"(__a), "r"(__b), "r"(0));
  1246.   return r;
  1247. }
  1248. __DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) {
  1249.   return __bool2mask(__vsetgts2(__a, __b), 16);
  1250. }
  1251. __DEVICE__ unsigned int __vsetgts4(unsigned int __a, unsigned int __b) {
  1252.   unsigned int r;
  1253.   __asm__("vset4.s32.s32.gt %0,%1,%2,%3;"
  1254.           : "=r"(r)
  1255.           : "r"(__a), "r"(__b), "r"(0));
  1256.   return r;
  1257. }
  1258. __DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) {
  1259.   return __bool2mask(__vsetgts4(__a, __b), 8);
  1260. }
  1261. __DEVICE__ unsigned int __vsetgtu2(unsigned int __a, unsigned int __b) {
  1262.   unsigned int r;
  1263.   __asm__("vset2.u32.u32.gt %0,%1,%2,%3;"
  1264.           : "=r"(r)
  1265.           : "r"(__a), "r"(__b), "r"(0));
  1266.   return r;
  1267. }
  1268. __DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) {
  1269.   return __bool2mask(__vsetgtu2(__a, __b), 16);
  1270. }
  1271. __DEVICE__ unsigned int __vsetgtu4(unsigned int __a, unsigned int __b) {
  1272.   unsigned int r;
  1273.   __asm__("vset4.u32.u32.gt %0,%1,%2,%3;"
  1274.           : "=r"(r)
  1275.           : "r"(__a), "r"(__b), "r"(0));
  1276.   return r;
  1277. }
  1278. __DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) {
  1279.   return __bool2mask(__vsetgtu4(__a, __b), 8);
  1280. }
  1281. __DEVICE__ unsigned int __vsetles2(unsigned int __a, unsigned int __b) {
  1282.   unsigned int r;
  1283.   __asm__("vset2.s32.s32.le %0,%1,%2,%3;"
  1284.           : "=r"(r)
  1285.           : "r"(__a), "r"(__b), "r"(0));
  1286.   return r;
  1287. }
  1288. __DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) {
  1289.   return __bool2mask(__vsetles2(__a, __b), 16);
  1290. }
  1291. __DEVICE__ unsigned int __vsetles4(unsigned int __a, unsigned int __b) {
  1292.   unsigned int r;
  1293.   __asm__("vset4.s32.s32.le %0,%1,%2,%3;"
  1294.           : "=r"(r)
  1295.           : "r"(__a), "r"(__b), "r"(0));
  1296.   return r;
  1297. }
  1298. __DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) {
  1299.   return __bool2mask(__vsetles4(__a, __b), 8);
  1300. }
  1301. __DEVICE__ unsigned int __vsetleu2(unsigned int __a, unsigned int __b) {
  1302.   unsigned int r;
  1303.   __asm__("vset2.u32.u32.le %0,%1,%2,%3;"
  1304.           : "=r"(r)
  1305.           : "r"(__a), "r"(__b), "r"(0));
  1306.   return r;
  1307. }
  1308. __DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) {
  1309.   return __bool2mask(__vsetleu2(__a, __b), 16);
  1310. }
  1311. __DEVICE__ unsigned int __vsetleu4(unsigned int __a, unsigned int __b) {
  1312.   unsigned int r;
  1313.   __asm__("vset4.u32.u32.le %0,%1,%2,%3;"
  1314.           : "=r"(r)
  1315.           : "r"(__a), "r"(__b), "r"(0));
  1316.   return r;
  1317. }
  1318. __DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) {
  1319.   return __bool2mask(__vsetleu4(__a, __b), 8);
  1320. }
  1321. __DEVICE__ unsigned int __vsetlts2(unsigned int __a, unsigned int __b) {
  1322.   unsigned int r;
  1323.   __asm__("vset2.s32.s32.lt %0,%1,%2,%3;"
  1324.           : "=r"(r)
  1325.           : "r"(__a), "r"(__b), "r"(0));
  1326.   return r;
  1327. }
  1328. __DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) {
  1329.   return __bool2mask(__vsetlts2(__a, __b), 16);
  1330. }
  1331. __DEVICE__ unsigned int __vsetlts4(unsigned int __a, unsigned int __b) {
  1332.   unsigned int r;
  1333.   __asm__("vset4.s32.s32.lt %0,%1,%2,%3;"
  1334.           : "=r"(r)
  1335.           : "r"(__a), "r"(__b), "r"(0));
  1336.   return r;
  1337. }
  1338. __DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) {
  1339.   return __bool2mask(__vsetlts4(__a, __b), 8);
  1340. }
  1341. __DEVICE__ unsigned int __vsetltu2(unsigned int __a, unsigned int __b) {
  1342.   unsigned int r;
  1343.   __asm__("vset2.u32.u32.lt %0,%1,%2,%3;"
  1344.           : "=r"(r)
  1345.           : "r"(__a), "r"(__b), "r"(0));
  1346.   return r;
  1347. }
  1348. __DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) {
  1349.   return __bool2mask(__vsetltu2(__a, __b), 16);
  1350. }
  1351. __DEVICE__ unsigned int __vsetltu4(unsigned int __a, unsigned int __b) {
  1352.   unsigned int r;
  1353.   __asm__("vset4.u32.u32.lt %0,%1,%2,%3;"
  1354.           : "=r"(r)
  1355.           : "r"(__a), "r"(__b), "r"(0));
  1356.   return r;
  1357. }
  1358. __DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) {
  1359.   return __bool2mask(__vsetltu4(__a, __b), 8);
  1360. }
  1361. __DEVICE__ unsigned int __vsetne2(unsigned int __a, unsigned int __b) {
  1362.   unsigned int r;
  1363.   __asm__("vset2.u32.u32.ne %0,%1,%2,%3;"
  1364.           : "=r"(r)
  1365.           : "r"(__a), "r"(__b), "r"(0));
  1366.   return r;
  1367. }
  1368. __DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) {
  1369.   return __bool2mask(__vsetne2(__a, __b), 16);
  1370. }
  1371. __DEVICE__ unsigned int __vsetne4(unsigned int __a, unsigned int __b) {
  1372.   unsigned int r;
  1373.   __asm__("vset4.u32.u32.ne %0,%1,%2,%3;"
  1374.           : "=r"(r)
  1375.           : "r"(__a), "r"(__b), "r"(0));
  1376.   return r;
  1377. }
  1378. __DEVICE__ unsigned int __vcmpne4(unsigned int __a, unsigned int __b) {
  1379.   return __bool2mask(__vsetne4(__a, __b), 8);
  1380. }
  1381.  
  1382. // Based on ITEM 23 in AIM-239: http://dspace.mit.edu/handle/1721.1/6086
  1383. // (a & b) + (a | b) = a + b = (a ^ b) + 2 * (a & b) =>
  1384. // (a + b) / 2 = ((a ^ b) >> 1) + (a & b)
  1385. // To operate on multiple sub-elements we need to make sure to mask out bits
  1386. // that crossed over into adjacent elements during the shift.
  1387. __DEVICE__ unsigned int __vhaddu2(unsigned int __a, unsigned int __b) {
  1388.   return (((__a ^ __b) >> 1) & ~0x80008000u) + (__a & __b);
  1389. }
  1390. __DEVICE__ unsigned int __vhaddu4(unsigned int __a, unsigned int __b) {
  1391.   return (((__a ^ __b) >> 1) & ~0x80808080u) + (__a & __b);
  1392. }
  1393.  
  1394. __DEVICE__ unsigned int __vmaxs2(unsigned int __a, unsigned int __b) {
  1395.   unsigned int r;
  1396.   if ((__a & 0x8000) && (__b & 0x8000)) {
  1397.     // Work around a bug in ptxas which produces invalid result if low element
  1398.     // is negative.
  1399.     unsigned mask = __vcmpgts2(__a, __b);
  1400.     r = (__a & mask) | (__b & ~mask);
  1401.   } else {
  1402.     __asm__("vmax2.s32.s32.s32 %0,%1,%2,%3;"
  1403.             : "=r"(r)
  1404.             : "r"(__a), "r"(__b), "r"(0));
  1405.   }
  1406.   return r;
  1407. }
  1408. __DEVICE__ unsigned int __vmaxs4(unsigned int __a, unsigned int __b) {
  1409.   unsigned int r;
  1410.   __asm__("vmax4.s32.s32.s32 %0,%1,%2,%3;"
  1411.           : "=r"(r)
  1412.           : "r"(__a), "r"(__b), "r"(0));
  1413.   return r;
  1414. }
  1415. __DEVICE__ unsigned int __vmaxu2(unsigned int __a, unsigned int __b) {
  1416.   unsigned int r;
  1417.   __asm__("vmax2.u32.u32.u32 %0,%1,%2,%3;"
  1418.           : "=r"(r)
  1419.           : "r"(__a), "r"(__b), "r"(0));
  1420.   return r;
  1421. }
  1422. __DEVICE__ unsigned int __vmaxu4(unsigned int __a, unsigned int __b) {
  1423.   unsigned int r;
  1424.   __asm__("vmax4.u32.u32.u32 %0,%1,%2,%3;"
  1425.           : "=r"(r)
  1426.           : "r"(__a), "r"(__b), "r"(0));
  1427.   return r;
  1428. }
  1429. __DEVICE__ unsigned int __vmins2(unsigned int __a, unsigned int __b) {
  1430.   unsigned int r;
  1431.   __asm__("vmin2.s32.s32.s32 %0,%1,%2,%3;"
  1432.           : "=r"(r)
  1433.           : "r"(__a), "r"(__b), "r"(0));
  1434.   return r;
  1435. }
  1436. __DEVICE__ unsigned int __vmins4(unsigned int __a, unsigned int __b) {
  1437.   unsigned int r;
  1438.   __asm__("vmin4.s32.s32.s32 %0,%1,%2,%3;"
  1439.           : "=r"(r)
  1440.           : "r"(__a), "r"(__b), "r"(0));
  1441.   return r;
  1442. }
  1443. __DEVICE__ unsigned int __vminu2(unsigned int __a, unsigned int __b) {
  1444.   unsigned int r;
  1445.   __asm__("vmin2.u32.u32.u32 %0,%1,%2,%3;"
  1446.           : "=r"(r)
  1447.           : "r"(__a), "r"(__b), "r"(0));
  1448.   return r;
  1449. }
  1450. __DEVICE__ unsigned int __vminu4(unsigned int __a, unsigned int __b) {
  1451.   unsigned int r;
  1452.   __asm__("vmin4.u32.u32.u32 %0,%1,%2,%3;"
  1453.           : "=r"(r)
  1454.           : "r"(__a), "r"(__b), "r"(0));
  1455.   return r;
  1456. }
  1457. __DEVICE__ unsigned int __vsads2(unsigned int __a, unsigned int __b) {
  1458.   unsigned int r;
  1459.   __asm__("vabsdiff2.s32.s32.s32.add %0,%1,%2,%3;"
  1460.           : "=r"(r)
  1461.           : "r"(__a), "r"(__b), "r"(0));
  1462.   return r;
  1463. }
  1464. __DEVICE__ unsigned int __vsads4(unsigned int __a, unsigned int __b) {
  1465.   unsigned int r;
  1466.   __asm__("vabsdiff4.s32.s32.s32.add %0,%1,%2,%3;"
  1467.           : "=r"(r)
  1468.           : "r"(__a), "r"(__b), "r"(0));
  1469.   return r;
  1470. }
  1471. __DEVICE__ unsigned int __vsadu2(unsigned int __a, unsigned int __b) {
  1472.   unsigned int r;
  1473.   __asm__("vabsdiff2.u32.u32.u32.add %0,%1,%2,%3;"
  1474.           : "=r"(r)
  1475.           : "r"(__a), "r"(__b), "r"(0));
  1476.   return r;
  1477. }
  1478. __DEVICE__ unsigned int __vsadu4(unsigned int __a, unsigned int __b) {
  1479.   unsigned int r;
  1480.   __asm__("vabsdiff4.u32.u32.u32.add %0,%1,%2,%3;"
  1481.           : "=r"(r)
  1482.           : "r"(__a), "r"(__b), "r"(0));
  1483.   return r;
  1484. }
  1485.  
  1486. __DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) {
  1487.   unsigned int r;
  1488.   __asm__("vsub2.u32.u32.u32 %0,%1,%2,%3;"
  1489.           : "=r"(r)
  1490.           : "r"(__a), "r"(__b), "r"(0));
  1491.   return r;
  1492. }
  1493. __DEVICE__ unsigned int __vneg2(unsigned int __a) { return __vsub2(0, __a); }
  1494.  
  1495. __DEVICE__ unsigned int __vsub4(unsigned int __a, unsigned int __b) {
  1496.   unsigned int r;
  1497.   __asm__("vsub4.u32.u32.u32 %0,%1,%2,%3;"
  1498.           : "=r"(r)
  1499.           : "r"(__a), "r"(__b), "r"(0));
  1500.   return r;
  1501. }
  1502. __DEVICE__ unsigned int __vneg4(unsigned int __a) { return __vsub4(0, __a); }
  1503. __DEVICE__ unsigned int __vsubss2(unsigned int __a, unsigned int __b) {
  1504.   unsigned int r;
  1505.   __asm__("vsub2.s32.s32.s32.sat %0,%1,%2,%3;"
  1506.           : "=r"(r)
  1507.           : "r"(__a), "r"(__b), "r"(0));
  1508.   return r;
  1509. }
  1510. __DEVICE__ unsigned int __vnegss2(unsigned int __a) {
  1511.   return __vsubss2(0, __a);
  1512. }
  1513. __DEVICE__ unsigned int __vsubss4(unsigned int __a, unsigned int __b) {
  1514.   unsigned int r;
  1515.   __asm__("vsub4.s32.s32.s32.sat %0,%1,%2,%3;"
  1516.           : "=r"(r)
  1517.           : "r"(__a), "r"(__b), "r"(0));
  1518.   return r;
  1519. }
  1520. __DEVICE__ unsigned int __vnegss4(unsigned int __a) {
  1521.   return __vsubss4(0, __a);
  1522. }
  1523. __DEVICE__ unsigned int __vsubus2(unsigned int __a, unsigned int __b) {
  1524.   unsigned int r;
  1525.   __asm__("vsub2.u32.u32.u32.sat %0,%1,%2,%3;"
  1526.           : "=r"(r)
  1527.           : "r"(__a), "r"(__b), "r"(0));
  1528.   return r;
  1529. }
  1530. __DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) {
  1531.   unsigned int r;
  1532.   __asm__("vsub4.u32.u32.u32.sat %0,%1,%2,%3;"
  1533.           : "=r"(r)
  1534.           : "r"(__a), "r"(__b), "r"(0));
  1535.   return r;
  1536. }
  1537. #endif // CUDA_VERSION >= 9020
  1538.  
  1539. // For OpenMP we require the user to include <time.h> as we need to know what
  1540. // clock_t is on the system.
  1541. #ifndef __OPENMP_NVPTX__
  1542. __DEVICE__ /* clock_t= */ int clock() { return __nvvm_read_ptx_sreg_clock(); }
  1543. #endif
  1544. __DEVICE__ long long clock64() { return __nvvm_read_ptx_sreg_clock64(); }
  1545.  
  1546. // These functions shouldn't be declared when including this header
  1547. // for math function resolution purposes.
  1548. #ifndef __OPENMP_NVPTX__
  1549. __DEVICE__ void *memcpy(void *__a, const void *__b, size_t __c) {
  1550.   return __builtin_memcpy(__a, __b, __c);
  1551. }
  1552. __DEVICE__ void *memset(void *__a, int __b, size_t __c) {
  1553.   return __builtin_memset(__a, __b, __c);
  1554. }
  1555. #endif
  1556.  
  1557. #pragma pop_macro("__DEVICE__")
  1558. #endif // __CLANG_CUDA_DEVICE_FUNCTIONS_H__
  1559.