Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. /*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
  2.  *
  3.  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4.  * See https://llvm.org/LICENSE.txt for license information.
  5.  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6.  *
  7.  *===-----------------------------------------------------------------------===
  8.  */
  9.  
  10. #ifndef __TMMINTRIN_H
  11. #define __TMMINTRIN_H
  12.  
  13. #if !defined(__i386__) && !defined(__x86_64__)
  14. #error "This header is only meant to be used on x86 and x64 architecture"
  15. #endif
  16.  
  17. #include <pmmintrin.h>
  18.  
  /* Attribute sets applied to the intrinsic functions defined in this file.
   *
   *   __DEFAULT_FN_ATTRS     - targets the "ssse3" feature only.
   *   __DEFAULT_FN_ATTRS_MMX - targets both the "mmx" and "ssse3" features.
   *
   * Both sets also request always-inline expansion, suppress debug info and
   * declare a minimum vector width of 64. */
  #define __DEFAULT_FN_ATTRS \
    __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), \
                   __min_vector_width__(64)))
  #define __DEFAULT_FN_ATTRS_MMX \
    __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"), \
                   __min_vector_width__(64)))
  22.  
  23. /// Computes the absolute value of each of the packed 8-bit signed
  24. ///    integers in the source operand and stores the 8-bit unsigned integer
  25. ///    results in the destination.
  26. ///
  27. /// \headerfile <x86intrin.h>
  28. ///
  29. /// This intrinsic corresponds to the \c PABSB instruction.
  30. ///
  31. /// \param __a
  32. ///    A 64-bit vector of [8 x i8].
  33. /// \returns A 64-bit integer vector containing the absolute values of the
  34. ///    elements in the operand.
  35. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  36. _mm_abs_pi8(__m64 __a)
  37. {
  38.     return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
  39. }
  40.  
  41. /// Computes the absolute value of each of the packed 8-bit signed
  42. ///    integers in the source operand and stores the 8-bit unsigned integer
  43. ///    results in the destination.
  44. ///
  45. /// \headerfile <x86intrin.h>
  46. ///
  47. /// This intrinsic corresponds to the \c VPABSB instruction.
  48. ///
  49. /// \param __a
  50. ///    A 128-bit vector of [16 x i8].
  51. /// \returns A 128-bit integer vector containing the absolute values of the
  52. ///    elements in the operand.
  53. static __inline__ __m128i __DEFAULT_FN_ATTRS
  54. _mm_abs_epi8(__m128i __a)
  55. {
  56.     return (__m128i)__builtin_elementwise_abs((__v16qs)__a);
  57. }
  58.  
  59. /// Computes the absolute value of each of the packed 16-bit signed
  60. ///    integers in the source operand and stores the 16-bit unsigned integer
  61. ///    results in the destination.
  62. ///
  63. /// \headerfile <x86intrin.h>
  64. ///
  65. /// This intrinsic corresponds to the \c PABSW instruction.
  66. ///
  67. /// \param __a
  68. ///    A 64-bit vector of [4 x i16].
  69. /// \returns A 64-bit integer vector containing the absolute values of the
  70. ///    elements in the operand.
  71. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  72. _mm_abs_pi16(__m64 __a)
  73. {
  74.     return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
  75. }
  76.  
  77. /// Computes the absolute value of each of the packed 16-bit signed
  78. ///    integers in the source operand and stores the 16-bit unsigned integer
  79. ///    results in the destination.
  80. ///
  81. /// \headerfile <x86intrin.h>
  82. ///
  83. /// This intrinsic corresponds to the \c VPABSW instruction.
  84. ///
  85. /// \param __a
  86. ///    A 128-bit vector of [8 x i16].
  87. /// \returns A 128-bit integer vector containing the absolute values of the
  88. ///    elements in the operand.
  89. static __inline__ __m128i __DEFAULT_FN_ATTRS
  90. _mm_abs_epi16(__m128i __a)
  91. {
  92.     return (__m128i)__builtin_elementwise_abs((__v8hi)__a);
  93. }
  94.  
  95. /// Computes the absolute value of each of the packed 32-bit signed
  96. ///    integers in the source operand and stores the 32-bit unsigned integer
  97. ///    results in the destination.
  98. ///
  99. /// \headerfile <x86intrin.h>
  100. ///
  101. /// This intrinsic corresponds to the \c PABSD instruction.
  102. ///
  103. /// \param __a
  104. ///    A 64-bit vector of [2 x i32].
  105. /// \returns A 64-bit integer vector containing the absolute values of the
  106. ///    elements in the operand.
  107. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  108. _mm_abs_pi32(__m64 __a)
  109. {
  110.     return (__m64)__builtin_ia32_pabsd((__v2si)__a);
  111. }
  112.  
  113. /// Computes the absolute value of each of the packed 32-bit signed
  114. ///    integers in the source operand and stores the 32-bit unsigned integer
  115. ///    results in the destination.
  116. ///
  117. /// \headerfile <x86intrin.h>
  118. ///
  119. /// This intrinsic corresponds to the \c VPABSD instruction.
  120. ///
  121. /// \param __a
  122. ///    A 128-bit vector of [4 x i32].
  123. /// \returns A 128-bit integer vector containing the absolute values of the
  124. ///    elements in the operand.
  125. static __inline__ __m128i __DEFAULT_FN_ATTRS
  126. _mm_abs_epi32(__m128i __a)
  127. {
  128.     return (__m128i)__builtin_elementwise_abs((__v4si)__a);
  129. }
  130.  
  /// Joins the two 128-bit integer vector operands into a single value and
  ///    shifts that value to the right by the byte count given in the
  ///    immediate operand.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
  /// \endcode
  ///
  /// This intrinsic corresponds to the \c PALIGNR instruction.
  ///
  /// \param a
  ///    A 128-bit vector of [16 x i8]; one of the two source operands.
  /// \param b
  ///    A 128-bit vector of [16 x i8]; the other source operand.
  /// \param n
  ///    An immediate operand giving the number of bytes by which the
  ///    concatenated value is right-shifted.
  /// \returns A 128-bit integer vector holding the concatenated value after
  ///    the right shift.
  #define _mm_alignr_epi8(a, b, n) \
    ((__m128i)__builtin_ia32_palignr128( \
        (__v16qi)(__m128i)(a), \
        (__v16qi)(__m128i)(b), \
        (n)))
  154.  
  /// Joins the two 64-bit integer vector operands into a single value and
  ///    shifts that value to the right by the byte count given in the
  ///    immediate operand.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
  /// \endcode
  ///
  /// This intrinsic corresponds to the \c PALIGNR instruction.
  ///
  /// \param a
  ///    A 64-bit vector of [8 x i8]; one of the two source operands.
  /// \param b
  ///    A 64-bit vector of [8 x i8]; the other source operand.
  /// \param n
  ///    An immediate operand giving the number of bytes by which the
  ///    concatenated value is right-shifted.
  /// \returns A 64-bit integer vector holding the concatenated value after
  ///    the right shift.
  #define _mm_alignr_pi8(a, b, n) \
    ((__m64)__builtin_ia32_palignr( \
        (__v8qi)(__m64)(a), \
        (__v8qi)(__m64)(b), \
        (n)))
  176.  
  177. /// Horizontally adds the adjacent pairs of values contained in 2 packed
  178. ///    128-bit vectors of [8 x i16].
  179. ///
  180. /// \headerfile <x86intrin.h>
  181. ///
  182. /// This intrinsic corresponds to the \c VPHADDW instruction.
  183. ///
  184. /// \param __a
  185. ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
  186. ///    horizontal sums of the values are stored in the lower bits of the
  187. ///    destination.
  188. /// \param __b
  189. ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
  190. ///    horizontal sums of the values are stored in the upper bits of the
  191. ///    destination.
  192. /// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
  193. ///    both operands.
  194. static __inline__ __m128i __DEFAULT_FN_ATTRS
  195. _mm_hadd_epi16(__m128i __a, __m128i __b)
  196. {
  197.     return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
  198. }
  199.  
  200. /// Horizontally adds the adjacent pairs of values contained in 2 packed
  201. ///    128-bit vectors of [4 x i32].
  202. ///
  203. /// \headerfile <x86intrin.h>
  204. ///
  205. /// This intrinsic corresponds to the \c VPHADDD instruction.
  206. ///
  207. /// \param __a
  208. ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
  209. ///    horizontal sums of the values are stored in the lower bits of the
  210. ///    destination.
  211. /// \param __b
  212. ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
  213. ///    horizontal sums of the values are stored in the upper bits of the
  214. ///    destination.
  215. /// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
  216. ///    both operands.
  217. static __inline__ __m128i __DEFAULT_FN_ATTRS
  218. _mm_hadd_epi32(__m128i __a, __m128i __b)
  219. {
  220.     return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
  221. }
  222.  
  223. /// Horizontally adds the adjacent pairs of values contained in 2 packed
  224. ///    64-bit vectors of [4 x i16].
  225. ///
  226. /// \headerfile <x86intrin.h>
  227. ///
  228. /// This intrinsic corresponds to the \c PHADDW instruction.
  229. ///
  230. /// \param __a
  231. ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
  232. ///    horizontal sums of the values are stored in the lower bits of the
  233. ///    destination.
  234. /// \param __b
  235. ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
  236. ///    horizontal sums of the values are stored in the upper bits of the
  237. ///    destination.
  238. /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
  239. ///    operands.
  240. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  241. _mm_hadd_pi16(__m64 __a, __m64 __b)
  242. {
  243.     return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
  244. }
  245.  
  246. /// Horizontally adds the adjacent pairs of values contained in 2 packed
  247. ///    64-bit vectors of [2 x i32].
  248. ///
  249. /// \headerfile <x86intrin.h>
  250. ///
  251. /// This intrinsic corresponds to the \c PHADDD instruction.
  252. ///
  253. /// \param __a
  254. ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
  255. ///    horizontal sums of the values are stored in the lower bits of the
  256. ///    destination.
  257. /// \param __b
  258. ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
  259. ///    horizontal sums of the values are stored in the upper bits of the
  260. ///    destination.
  261. /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
  262. ///    operands.
  263. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  264. _mm_hadd_pi32(__m64 __a, __m64 __b)
  265. {
  266.     return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
  267. }
  268.  
  269. /// Horizontally adds the adjacent pairs of values contained in 2 packed
  270. ///    128-bit vectors of [8 x i16]. Positive sums greater than 0x7FFF are
  271. ///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
  272. ///    0x8000.
  273. ///
  274. /// \headerfile <x86intrin.h>
  275. ///
  276. /// This intrinsic corresponds to the \c VPHADDSW instruction.
  277. ///
  278. /// \param __a
  279. ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
  280. ///    horizontal sums of the values are stored in the lower bits of the
  281. ///    destination.
  282. /// \param __b
  283. ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
  284. ///    horizontal sums of the values are stored in the upper bits of the
  285. ///    destination.
  286. /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
  287. ///    sums of both operands.
  288. static __inline__ __m128i __DEFAULT_FN_ATTRS
  289. _mm_hadds_epi16(__m128i __a, __m128i __b)
  290. {
  291.     return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
  292. }
  293.  
  294. /// Horizontally adds the adjacent pairs of values contained in 2 packed
  295. ///    64-bit vectors of [4 x i16]. Positive sums greater than 0x7FFF are
  296. ///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
  297. ///    0x8000.
  298. ///
  299. /// \headerfile <x86intrin.h>
  300. ///
  301. /// This intrinsic corresponds to the \c PHADDSW instruction.
  302. ///
  303. /// \param __a
  304. ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
  305. ///    horizontal sums of the values are stored in the lower bits of the
  306. ///    destination.
  307. /// \param __b
  308. ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
  309. ///    horizontal sums of the values are stored in the upper bits of the
  310. ///    destination.
  311. /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
  312. ///    sums of both operands.
  313. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  314. _mm_hadds_pi16(__m64 __a, __m64 __b)
  315. {
  316.     return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
  317. }
  318.  
  319. /// Horizontally subtracts the adjacent pairs of values contained in 2
  320. ///    packed 128-bit vectors of [8 x i16].
  321. ///
  322. /// \headerfile <x86intrin.h>
  323. ///
  324. /// This intrinsic corresponds to the \c VPHSUBW instruction.
  325. ///
  326. /// \param __a
  327. ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
  328. ///    horizontal differences between the values are stored in the lower bits of
  329. ///    the destination.
  330. /// \param __b
  331. ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
  332. ///    horizontal differences between the values are stored in the upper bits of
  333. ///    the destination.
  334. /// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
  335. ///    of both operands.
  336. static __inline__ __m128i __DEFAULT_FN_ATTRS
  337. _mm_hsub_epi16(__m128i __a, __m128i __b)
  338. {
  339.     return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
  340. }
  341.  
  342. /// Horizontally subtracts the adjacent pairs of values contained in 2
  343. ///    packed 128-bit vectors of [4 x i32].
  344. ///
  345. /// \headerfile <x86intrin.h>
  346. ///
  347. /// This intrinsic corresponds to the \c VPHSUBD instruction.
  348. ///
  349. /// \param __a
  350. ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
  351. ///    horizontal differences between the values are stored in the lower bits of
  352. ///    the destination.
  353. /// \param __b
  354. ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
  355. ///    horizontal differences between the values are stored in the upper bits of
  356. ///    the destination.
  357. /// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
  358. ///    of both operands.
  359. static __inline__ __m128i __DEFAULT_FN_ATTRS
  360. _mm_hsub_epi32(__m128i __a, __m128i __b)
  361. {
  362.     return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
  363. }
  364.  
  365. /// Horizontally subtracts the adjacent pairs of values contained in 2
  366. ///    packed 64-bit vectors of [4 x i16].
  367. ///
  368. /// \headerfile <x86intrin.h>
  369. ///
  370. /// This intrinsic corresponds to the \c PHSUBW instruction.
  371. ///
  372. /// \param __a
  373. ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
  374. ///    horizontal differences between the values are stored in the lower bits of
  375. ///    the destination.
  376. /// \param __b
  377. ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
  378. ///    horizontal differences between the values are stored in the upper bits of
  379. ///    the destination.
  380. /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
  381. ///    of both operands.
  382. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  383. _mm_hsub_pi16(__m64 __a, __m64 __b)
  384. {
  385.     return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
  386. }
  387.  
  388. /// Horizontally subtracts the adjacent pairs of values contained in 2
  389. ///    packed 64-bit vectors of [2 x i32].
  390. ///
  391. /// \headerfile <x86intrin.h>
  392. ///
  393. /// This intrinsic corresponds to the \c PHSUBD instruction.
  394. ///
  395. /// \param __a
  396. ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
  397. ///    horizontal differences between the values are stored in the lower bits of
  398. ///    the destination.
  399. /// \param __b
  400. ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
  401. ///    horizontal differences between the values are stored in the upper bits of
  402. ///    the destination.
  403. /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
  404. ///    of both operands.
  405. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  406. _mm_hsub_pi32(__m64 __a, __m64 __b)
  407. {
  408.     return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
  409. }
  410.  
  411. /// Horizontally subtracts the adjacent pairs of values contained in 2
  412. ///    packed 128-bit vectors of [8 x i16]. Positive differences greater than
  413. ///    0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
  414. ///    saturated to 0x8000.
  415. ///
  416. /// \headerfile <x86intrin.h>
  417. ///
  418. /// This intrinsic corresponds to the \c VPHSUBSW instruction.
  419. ///
  420. /// \param __a
  421. ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
  422. ///    horizontal differences between the values are stored in the lower bits of
  423. ///    the destination.
  424. /// \param __b
  425. ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
  426. ///    horizontal differences between the values are stored in the upper bits of
  427. ///    the destination.
  428. /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
  429. ///    differences of both operands.
  430. static __inline__ __m128i __DEFAULT_FN_ATTRS
  431. _mm_hsubs_epi16(__m128i __a, __m128i __b)
  432. {
  433.     return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
  434. }
  435.  
  436. /// Horizontally subtracts the adjacent pairs of values contained in 2
  437. ///    packed 64-bit vectors of [4 x i16]. Positive differences greater than
  438. ///    0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
  439. ///    saturated to 0x8000.
  440. ///
  441. /// \headerfile <x86intrin.h>
  442. ///
  443. /// This intrinsic corresponds to the \c PHSUBSW instruction.
  444. ///
  445. /// \param __a
  446. ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
  447. ///    horizontal differences between the values are stored in the lower bits of
  448. ///    the destination.
  449. /// \param __b
  450. ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
  451. ///    horizontal differences between the values are stored in the upper bits of
  452. ///    the destination.
  453. /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
  454. ///    differences of both operands.
  455. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  456. _mm_hsubs_pi16(__m64 __a, __m64 __b)
  457. {
  458.     return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
  459. }
  460.  
  461. /// Multiplies corresponding pairs of packed 8-bit unsigned integer
  462. ///    values contained in the first source operand and packed 8-bit signed
  463. ///    integer values contained in the second source operand, adds pairs of
  464. ///    contiguous products with signed saturation, and writes the 16-bit sums to
  465. ///    the corresponding bits in the destination.
  466. ///
  467. ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
  468. ///    both operands are multiplied, and the sum of both results is written to
  469. ///    bits [15:0] of the destination.
  470. ///
  471. /// \headerfile <x86intrin.h>
  472. ///
  473. /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
  474. ///
  475. /// \param __a
  476. ///    A 128-bit integer vector containing the first source operand.
  477. /// \param __b
  478. ///    A 128-bit integer vector containing the second source operand.
  479. /// \returns A 128-bit integer vector containing the sums of products of both
  480. ///    operands: \n
  481. ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
  482. ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
  483. ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
  484. ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
  485. ///    \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
  486. ///    \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
  487. ///    \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
  488. ///    \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
  489. static __inline__ __m128i __DEFAULT_FN_ATTRS
  490. _mm_maddubs_epi16(__m128i __a, __m128i __b)
  491. {
  492.     return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
  493. }
  494.  
  495. /// Multiplies corresponding pairs of packed 8-bit unsigned integer
  496. ///    values contained in the first source operand and packed 8-bit signed
  497. ///    integer values contained in the second source operand, adds pairs of
  498. ///    contiguous products with signed saturation, and writes the 16-bit sums to
  499. ///    the corresponding bits in the destination.
  500. ///
  501. ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
  502. ///    both operands are multiplied, and the sum of both results is written to
  503. ///    bits [15:0] of the destination.
  504. ///
  505. /// \headerfile <x86intrin.h>
  506. ///
  507. /// This intrinsic corresponds to the \c PMADDUBSW instruction.
  508. ///
  509. /// \param __a
  510. ///    A 64-bit integer vector containing the first source operand.
  511. /// \param __b
  512. ///    A 64-bit integer vector containing the second source operand.
  513. /// \returns A 64-bit integer vector containing the sums of products of both
  514. ///    operands: \n
  515. ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
  516. ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
  517. ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
  518. ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
  519. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  520. _mm_maddubs_pi16(__m64 __a, __m64 __b)
  521. {
  522.     return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
  523. }
  524.  
  525. /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
  526. ///    products to the 18 most significant bits by right-shifting, rounds the
  527. ///    truncated value by adding 1, and writes bits [16:1] to the destination.
  528. ///
  529. /// \headerfile <x86intrin.h>
  530. ///
  531. /// This intrinsic corresponds to the \c VPMULHRSW instruction.
  532. ///
  533. /// \param __a
  534. ///    A 128-bit vector of [8 x i16] containing one of the source operands.
  535. /// \param __b
  536. ///    A 128-bit vector of [8 x i16] containing one of the source operands.
  537. /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
  538. ///    products of both operands.
  539. static __inline__ __m128i __DEFAULT_FN_ATTRS
  540. _mm_mulhrs_epi16(__m128i __a, __m128i __b)
  541. {
  542.     return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
  543. }
  544.  
  545. /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
  546. ///    products to the 18 most significant bits by right-shifting, rounds the
  547. ///    truncated value by adding 1, and writes bits [16:1] to the destination.
  548. ///
  549. /// \headerfile <x86intrin.h>
  550. ///
  551. /// This intrinsic corresponds to the \c PMULHRSW instruction.
  552. ///
  553. /// \param __a
  554. ///    A 64-bit vector of [4 x i16] containing one of the source operands.
  555. /// \param __b
  556. ///    A 64-bit vector of [4 x i16] containing one of the source operands.
  557. /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
  558. ///    products of both operands.
  559. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  560. _mm_mulhrs_pi16(__m64 __a, __m64 __b)
  561. {
  562.     return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
  563. }
  564.  
  565. /// Copies the 8-bit integers from a 128-bit integer vector to the
  566. ///    destination or clears 8-bit values in the destination, as specified by
  567. ///    the second source operand.
  568. ///
  569. /// \headerfile <x86intrin.h>
  570. ///
  571. /// This intrinsic corresponds to the \c VPSHUFB instruction.
  572. ///
  573. /// \param __a
  574. ///    A 128-bit integer vector containing the values to be copied.
  575. /// \param __b
  576. ///    A 128-bit integer vector containing control bytes corresponding to
  577. ///    positions in the destination:
  578. ///    Bit 7: \n
  579. ///    1: Clear the corresponding byte in the destination. \n
  580. ///    0: Copy the selected source byte to the corresponding byte in the
  581. ///    destination. \n
  582. ///    Bits [6:4] Reserved.  \n
  583. ///    Bits [3:0] select the source byte to be copied.
  584. /// \returns A 128-bit integer vector containing the copied or cleared values.
  585. static __inline__ __m128i __DEFAULT_FN_ATTRS
  586. _mm_shuffle_epi8(__m128i __a, __m128i __b)
  587. {
  588.     return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
  589. }
  590.  
  591. /// Copies the 8-bit integers from a 64-bit integer vector to the
  592. ///    destination or clears 8-bit values in the destination, as specified by
  593. ///    the second source operand.
  594. ///
  595. /// \headerfile <x86intrin.h>
  596. ///
  597. /// This intrinsic corresponds to the \c PSHUFB instruction.
  598. ///
  599. /// \param __a
  600. ///    A 64-bit integer vector containing the values to be copied.
  601. /// \param __b
  602. ///    A 64-bit integer vector containing control bytes corresponding to
  603. ///    positions in the destination:
  604. ///    Bit 7: \n
  605. ///    1: Clear the corresponding byte in the destination. \n
  606. ///    0: Copy the selected source byte to the corresponding byte in the
  607. ///    destination. \n
  608. ///    Bits [3:0] select the source byte to be copied.
  609. /// \returns A 64-bit integer vector containing the copied or cleared values.
  610. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  611. _mm_shuffle_pi8(__m64 __a, __m64 __b)
  612. {
  613.     return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
  614. }
  615.  
  616. /// For each 8-bit integer in the first source operand, perform one of
  617. ///    the following actions as specified by the second source operand.
  618. ///
  619. ///    If the byte in the second source is negative, calculate the two's
  620. ///    complement of the corresponding byte in the first source, and write that
  621. ///    value to the destination. If the byte in the second source is positive,
  622. ///    copy the corresponding byte from the first source to the destination. If
  623. ///    the byte in the second source is zero, clear the corresponding byte in
  624. ///    the destination.
  625. ///
  626. /// \headerfile <x86intrin.h>
  627. ///
  628. /// This intrinsic corresponds to the \c VPSIGNB instruction.
  629. ///
  630. /// \param __a
  631. ///    A 128-bit integer vector containing the values to be copied.
  632. /// \param __b
  633. ///    A 128-bit integer vector containing control bytes corresponding to
  634. ///    positions in the destination.
  635. /// \returns A 128-bit integer vector containing the resultant values.
  636. static __inline__ __m128i __DEFAULT_FN_ATTRS
  637. _mm_sign_epi8(__m128i __a, __m128i __b)
  638. {
  639.     return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
  640. }
  641.  
  642. /// For each 16-bit integer in the first source operand, perform one of
  643. ///    the following actions as specified by the second source operand.
  644. ///
  645. ///    If the word in the second source is negative, calculate the two's
  646. ///    complement of the corresponding word in the first source, and write that
  647. ///    value to the destination. If the word in the second source is positive,
  648. ///    copy the corresponding word from the first source to the destination. If
  649. ///    the word in the second source is zero, clear the corresponding word in
  650. ///    the destination.
  651. ///
  652. /// \headerfile <x86intrin.h>
  653. ///
  654. /// This intrinsic corresponds to the \c VPSIGNW instruction.
  655. ///
  656. /// \param __a
  657. ///    A 128-bit integer vector containing the values to be copied.
  658. /// \param __b
  659. ///    A 128-bit integer vector containing control words corresponding to
  660. ///    positions in the destination.
  661. /// \returns A 128-bit integer vector containing the resultant values.
  662. static __inline__ __m128i __DEFAULT_FN_ATTRS
  663. _mm_sign_epi16(__m128i __a, __m128i __b)
  664. {
  665.     return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
  666. }
  667.  
  668. /// For each 32-bit integer in the first source operand, perform one of
  669. ///    the following actions as specified by the second source operand.
  670. ///
  671. ///    If the doubleword in the second source is negative, calculate the two's
  672. ///    complement of the corresponding word in the first source, and write that
  673. ///    value to the destination. If the doubleword in the second source is
  674. ///    positive, copy the corresponding word from the first source to the
  675. ///    destination. If the doubleword in the second source is zero, clear the
  676. ///    corresponding word in the destination.
  677. ///
  678. /// \headerfile <x86intrin.h>
  679. ///
  680. /// This intrinsic corresponds to the \c VPSIGND instruction.
  681. ///
  682. /// \param __a
  683. ///    A 128-bit integer vector containing the values to be copied.
  684. /// \param __b
  685. ///    A 128-bit integer vector containing control doublewords corresponding to
  686. ///    positions in the destination.
  687. /// \returns A 128-bit integer vector containing the resultant values.
  688. static __inline__ __m128i __DEFAULT_FN_ATTRS
  689. _mm_sign_epi32(__m128i __a, __m128i __b)
  690. {
  691.     return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
  692. }
  693.  
  694. /// For each 8-bit integer in the first source operand, perform one of
  695. ///    the following actions as specified by the second source operand.
  696. ///
  697. ///    If the byte in the second source is negative, calculate the two's
  698. ///    complement of the corresponding byte in the first source, and write that
  699. ///    value to the destination. If the byte in the second source is positive,
  700. ///    copy the corresponding byte from the first source to the destination. If
  701. ///    the byte in the second source is zero, clear the corresponding byte in
  702. ///    the destination.
  703. ///
  704. /// \headerfile <x86intrin.h>
  705. ///
  706. /// This intrinsic corresponds to the \c PSIGNB instruction.
  707. ///
  708. /// \param __a
  709. ///    A 64-bit integer vector containing the values to be copied.
  710. /// \param __b
  711. ///    A 64-bit integer vector containing control bytes corresponding to
  712. ///    positions in the destination.
  713. /// \returns A 64-bit integer vector containing the resultant values.
  714. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  715. _mm_sign_pi8(__m64 __a, __m64 __b)
  716. {
  717.     return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
  718. }
  719.  
  720. /// For each 16-bit integer in the first source operand, perform one of
  721. ///    the following actions as specified by the second source operand.
  722. ///
  723. ///    If the word in the second source is negative, calculate the two's
  724. ///    complement of the corresponding word in the first source, and write that
  725. ///    value to the destination. If the word in the second source is positive,
  726. ///    copy the corresponding word from the first source to the destination. If
  727. ///    the word in the second source is zero, clear the corresponding word in
  728. ///    the destination.
  729. ///
  730. /// \headerfile <x86intrin.h>
  731. ///
  732. /// This intrinsic corresponds to the \c PSIGNW instruction.
  733. ///
  734. /// \param __a
  735. ///    A 64-bit integer vector containing the values to be copied.
  736. /// \param __b
  737. ///    A 64-bit integer vector containing control words corresponding to
  738. ///    positions in the destination.
  739. /// \returns A 64-bit integer vector containing the resultant values.
  740. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  741. _mm_sign_pi16(__m64 __a, __m64 __b)
  742. {
  743.     return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
  744. }
  745.  
  746. /// For each 32-bit integer in the first source operand, perform one of
  747. ///    the following actions as specified by the second source operand.
  748. ///
  749. ///    If the doubleword in the second source is negative, calculate the two's
  750. ///    complement of the corresponding doubleword in the first source, and
  751. ///    write that value to the destination. If the doubleword in the second
  752. ///    source is positive, copy the corresponding doubleword from the first
  753. ///    source to the destination. If the doubleword in the second source is
  754. ///    zero, clear the corresponding doubleword in the destination.
  755. ///
  756. /// \headerfile <x86intrin.h>
  757. ///
  758. /// This intrinsic corresponds to the \c PSIGND instruction.
  759. ///
  760. /// \param __a
  761. ///    A 64-bit integer vector containing the values to be copied.
  762. /// \param __b
  763. ///    A 64-bit integer vector containing two control doublewords corresponding
  764. ///    to positions in the destination.
  765. /// \returns A 64-bit integer vector containing the resultant values.
  766. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  767. _mm_sign_pi32(__m64 __a, __m64 __b)
  768. {
  769.     return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
  770. }
  771.  
  772. #undef __DEFAULT_FN_ATTRS
  773. #undef __DEFAULT_FN_ATTRS_MMX
  774.  
  775. #endif /* __TMMINTRIN_H */
  776.