Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. /*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
  2.  *
  3.  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4.  * See https://llvm.org/LICENSE.txt for license information.
  5.  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6.  *
  7.  *===-----------------------------------------------------------------------===
  8.  */
  9.  
  10. #ifndef __PMMINTRIN_H
  11. #define __PMMINTRIN_H
  12.  
  13. #if !defined(__i386__) && !defined(__x86_64__)
  14. #error "This header is only meant to be used on x86 and x64 architecture"
  15. #endif
  16.  
  17. #include <emmintrin.h>
  18.  
  /* Attribute set applied to the inline functions defined in this file:
   * always inline, no debug info, SSE3 target feature enabled, and a minimum
   * vector width of 128 bits. */
  #define __DEFAULT_FN_ATTRS                                                     \
    __attribute__((__always_inline__, __nodebug__, __target__("sse3"),           \
                   __min_vector_width__(128)))
  22.  
  23. /// Loads data from an unaligned memory location to elements in a 128-bit
  24. ///    vector.
  25. ///
  26. ///    If the address of the data is not 16-byte aligned, the instruction may
  27. ///    read two adjacent aligned blocks of memory to retrieve the requested
  28. ///    data.
  29. ///
  30. /// \headerfile <x86intrin.h>
  31. ///
  32. /// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
  33. ///
  34. /// \param __p
  35. ///    A pointer to a 128-bit integer vector containing integer values.
  36. /// \returns A 128-bit vector containing the moved values.
  37. static __inline__ __m128i __DEFAULT_FN_ATTRS
  38. _mm_lddqu_si128(__m128i_u const *__p)
  39. {
  40.   return (__m128i)__builtin_ia32_lddqu((char const *)__p);
  41. }
  42.  
  43. /// Adds the even-indexed values and subtracts the odd-indexed values of
  44. ///    two 128-bit vectors of [4 x float].
  45. ///
  46. /// \headerfile <x86intrin.h>
  47. ///
  48. /// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
  49. ///
  50. /// \param __a
  51. ///    A 128-bit vector of [4 x float] containing the left source operand.
  52. /// \param __b
  53. ///    A 128-bit vector of [4 x float] containing the right source operand.
  54. /// \returns A 128-bit vector of [4 x float] containing the alternating sums and
  55. ///    differences of both operands.
  56. static __inline__ __m128 __DEFAULT_FN_ATTRS
  57. _mm_addsub_ps(__m128 __a, __m128 __b)
  58. {
  59.   return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
  60. }
  61.  
  62. /// Horizontally adds the adjacent pairs of values contained in two
  63. ///    128-bit vectors of [4 x float].
  64. ///
  65. /// \headerfile <x86intrin.h>
  66. ///
  67. /// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
  68. ///
  69. /// \param __a
  70. ///    A 128-bit vector of [4 x float] containing one of the source operands.
  71. ///    The horizontal sums of the values are stored in the lower bits of the
  72. ///    destination.
  73. /// \param __b
  74. ///    A 128-bit vector of [4 x float] containing one of the source operands.
  75. ///    The horizontal sums of the values are stored in the upper bits of the
  76. ///    destination.
  77. /// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
  78. ///    both operands.
  79. static __inline__ __m128 __DEFAULT_FN_ATTRS
  80. _mm_hadd_ps(__m128 __a, __m128 __b)
  81. {
  82.   return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
  83. }
  84.  
  85. /// Horizontally subtracts the adjacent pairs of values contained in two
  86. ///    128-bit vectors of [4 x float].
  87. ///
  88. /// \headerfile <x86intrin.h>
  89. ///
  90. /// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
  91. ///
  92. /// \param __a
  93. ///    A 128-bit vector of [4 x float] containing one of the source operands.
  94. ///    The horizontal differences between the values are stored in the lower
  95. ///    bits of the destination.
  96. /// \param __b
  97. ///    A 128-bit vector of [4 x float] containing one of the source operands.
  98. ///    The horizontal differences between the values are stored in the upper
  99. ///    bits of the destination.
  100. /// \returns A 128-bit vector of [4 x float] containing the horizontal
  101. ///    differences of both operands.
  102. static __inline__ __m128 __DEFAULT_FN_ATTRS
  103. _mm_hsub_ps(__m128 __a, __m128 __b)
  104. {
  105.   return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
  106. }
  107.  
  108. /// Moves and duplicates odd-indexed values from a 128-bit vector
  109. ///    of [4 x float] to float values stored in a 128-bit vector of
  110. ///    [4 x float].
  111. ///
  112. /// \headerfile <x86intrin.h>
  113. ///
  114. /// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
  115. ///
  116. /// \param __a
  117. ///    A 128-bit vector of [4 x float]. \n
  118. ///    Bits [127:96] of the source are written to bits [127:96] and [95:64] of
  119. ///    the destination. \n
  120. ///    Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
  121. ///    destination.
  122. /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
  123. ///    values.
  124. static __inline__ __m128 __DEFAULT_FN_ATTRS
  125. _mm_movehdup_ps(__m128 __a)
  126. {
  127.   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
  128. }
  129.  
  130. /// Duplicates even-indexed values from a 128-bit vector of
  131. ///    [4 x float] to float values stored in a 128-bit vector of [4 x float].
  132. ///
  133. /// \headerfile <x86intrin.h>
  134. ///
  135. /// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
  136. ///
  137. /// \param __a
  138. ///    A 128-bit vector of [4 x float] \n
  139. ///    Bits [95:64] of the source are written to bits [127:96] and [95:64] of
  140. ///    the destination. \n
  141. ///    Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
  142. ///    destination.
  143. /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
  144. ///    values.
  145. static __inline__ __m128 __DEFAULT_FN_ATTRS
  146. _mm_moveldup_ps(__m128 __a)
  147. {
  148.   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
  149. }
  150.  
  151. /// Adds the even-indexed values and subtracts the odd-indexed values of
  152. ///    two 128-bit vectors of [2 x double].
  153. ///
  154. /// \headerfile <x86intrin.h>
  155. ///
  156. /// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
  157. ///
  158. /// \param __a
  159. ///    A 128-bit vector of [2 x double] containing the left source operand.
  160. /// \param __b
  161. ///    A 128-bit vector of [2 x double] containing the right source operand.
  162. /// \returns A 128-bit vector of [2 x double] containing the alternating sums
  163. ///    and differences of both operands.
  164. static __inline__ __m128d __DEFAULT_FN_ATTRS
  165. _mm_addsub_pd(__m128d __a, __m128d __b)
  166. {
  167.   return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
  168. }
  169.  
  170. /// Horizontally adds the pairs of values contained in two 128-bit
  171. ///    vectors of [2 x double].
  172. ///
  173. /// \headerfile <x86intrin.h>
  174. ///
  175. /// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
  176. ///
  177. /// \param __a
  178. ///    A 128-bit vector of [2 x double] containing one of the source operands.
  179. ///    The horizontal sum of the values is stored in the lower bits of the
  180. ///    destination.
  181. /// \param __b
  182. ///    A 128-bit vector of [2 x double] containing one of the source operands.
  183. ///    The horizontal sum of the values is stored in the upper bits of the
  184. ///    destination.
  185. /// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
  186. ///    both operands.
  187. static __inline__ __m128d __DEFAULT_FN_ATTRS
  188. _mm_hadd_pd(__m128d __a, __m128d __b)
  189. {
  190.   return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
  191. }
  192.  
  193. /// Horizontally subtracts the pairs of values contained in two 128-bit
  194. ///    vectors of [2 x double].
  195. ///
  196. /// \headerfile <x86intrin.h>
  197. ///
  198. /// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
  199. ///
  200. /// \param __a
  201. ///    A 128-bit vector of [2 x double] containing one of the source operands.
  202. ///    The horizontal difference of the values is stored in the lower bits of
  203. ///    the destination.
  204. /// \param __b
  205. ///    A 128-bit vector of [2 x double] containing one of the source operands.
  206. ///    The horizontal difference of the values is stored in the upper bits of
  207. ///    the destination.
  208. /// \returns A 128-bit vector of [2 x double] containing the horizontal
  209. ///    differences of both operands.
  210. static __inline__ __m128d __DEFAULT_FN_ATTRS
  211. _mm_hsub_pd(__m128d __a, __m128d __b)
  212. {
  213.   return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
  214. }
  215.  
  /// Loads one double-precision value from memory and duplicates it into both
  ///    elements of a 128-bit vector of [2 x double].
  ///
  ///    Implemented as a macro that forwards its argument to _mm_load1_pd().
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// __m128d _mm_loaddup_pd(double const *dp);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
  ///
  /// \param dp
  ///    A pointer to the double-precision value to be loaded and duplicated.
  /// \returns A 128-bit vector of [2 x double] holding the duplicated value.
  #define _mm_loaddup_pd(dp) _mm_load1_pd(dp)
  232.  
  233. /// Moves and duplicates the double-precision value in the lower bits of
  234. ///    a 128-bit vector of [2 x double] to double-precision values stored in a
  235. ///    128-bit vector of [2 x double].
  236. ///
  237. /// \headerfile <x86intrin.h>
  238. ///
  239. /// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
  240. ///
  241. /// \param __a
  242. ///    A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
  243. ///    [127:64] and [63:0] of the destination.
  244. /// \returns A 128-bit vector of [2 x double] containing the moved and
  245. ///    duplicated values.
  246. static __inline__ __m128d __DEFAULT_FN_ATTRS
  247. _mm_movedup_pd(__m128d __a)
  248. {
  249.   return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
  250. }
  251.  
  252. /// Establishes a linear address memory range to be monitored and puts
  253. ///    the processor in the monitor event pending state. Data stored in the
  254. ///    monitored address range causes the processor to exit the pending state.
  255. ///
  256. /// \headerfile <x86intrin.h>
  257. ///
  258. /// This intrinsic corresponds to the <c> MONITOR </c> instruction.
  259. ///
  260. /// \param __p
  261. ///    The memory range to be monitored. The size of the range is determined by
  262. ///    CPUID function 0000_0005h.
  263. /// \param __extensions
  264. ///    Optional extensions for the monitoring state.
  265. /// \param __hints
  266. ///    Optional hints for the monitoring state.
  267. static __inline__ void __DEFAULT_FN_ATTRS
  268. _mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
  269. {
  270.   __builtin_ia32_monitor(__p, __extensions, __hints);
  271. }
  272.  
  273. /// Used with the MONITOR instruction to wait while the processor is in
  274. ///    the monitor event pending state. Data stored in the monitored address
  275. ///    range causes the processor to exit the pending state.
  276. ///
  277. /// \headerfile <x86intrin.h>
  278. ///
  279. /// This intrinsic corresponds to the <c> MWAIT </c> instruction.
  280. ///
  281. /// \param __extensions
  282. ///    Optional extensions for the monitoring state, which may vary by
  283. ///    processor.
  284. /// \param __hints
  285. ///    Optional hints for the monitoring state, which may vary by processor.
  286. static __inline__ void __DEFAULT_FN_ATTRS
  287. _mm_mwait(unsigned __extensions, unsigned __hints)
  288. {
  289.   __builtin_ia32_mwait(__extensions, __hints);
  290. }
  291.  
  292. #undef __DEFAULT_FN_ATTRS
  293.  
  294. #endif /* __PMMINTRIN_H */
  295.