WebSVN – QNX 8.QNX8 LLVM/Clang compiler suite – Blame – //llvm-build/x86_64/lib/clang/16/include/ppc_wrappers/xmmintrin.h

Rev	Author	Line No.	Line
14	pmbaty	1	/*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===
		2	*
		3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
		4	* See https://llvm.org/LICENSE.txt for license information.
		5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
		6	*
		7	*===-----------------------------------------------------------------------===
		8	*/
		9
		10	/* Implemented from the specification included in the Intel C++ Compiler
		11	User Guide and Reference, version 9.0. */
		12
		13	#ifndef NO_WARN_X86_INTRINSICS
		14	/* This header file is to help porting code using Intel intrinsics
		15	explicitly from x86_64 to powerpc64/powerpc64le.
		16
		17	Since X86 SSE intrinsics mainly handles __m128 type, PowerPC
		18	VMX/VSX ISA is a good match for vector float SIMD operations.
		19	However scalar float operations in vector (XMM) registers require
		20	the POWER8 VSX ISA (2.07) level. There are differences for data
		21	format and placement of float scalars in the vector register, which
		22	require extra steps to match SSE scalar float semantics on POWER.
		23
		24	It should be noted that there are significant differences between X86_64's
		25	MXCSR and PowerISA's FPSCR/VSCR registers. It's recommended to use the
		26	portable <fenv.h> interface instead of accessing MXCSR directly.
		27
		28	Most SSE scalar float intrinsic operations can be performed more
		29	efficiently as C language float scalar operations or optimized to
		30	use vector SIMD operations. We recommend this for new applications. */
		31	#error \
		32	"Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
		33	#endif
		34
		35	#ifndef XMMINTRIN_H_
		36	#define XMMINTRIN_H_
		37
		38	#if defined(__powerpc64__) && \
		39	(defined(__linux__) \|\| defined(__FreeBSD__) \|\| defined(_AIX))
		40
		/* Build a four-field permute mask: w goes to bits 7:6, x to bits 5:4,
		   y to bits 3:2 and z to bits 1:0. */
		#define _MM_SHUFFLE(w, x, y, z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
		43
		44	#include <altivec.h>
		45
		/* Avoid collisions between altivec.h and strict adherence to C++ and
		   C11 standards. This should eventually be done inside altivec.h itself,
		   but only after testing a full distro build. */
		#if defined(__STRICT_ANSI__) &&                                                \
		    (defined(__cplusplus) ||                                                   \
		     (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L))
		#undef vector
		#undef pixel
		#undef bool
		#endif
		56
		57	/* We need type definitions from the MMX header file. */
		58	#include <mmintrin.h>
		59
		60	/* Get _mm_malloc () and _mm_free (). */
		61	#if __STDC_HOSTED__
		62	#include <mm_malloc.h>
		63	#endif
		64
		65	/* The Intel API is flexible enough that we must allow aliasing with other
		66	vector types, and their scalar components. */
		67	typedef vector float __m128 __attribute__((__may_alias__));
		68
		69	/* Unaligned version of the same type. */
		70	typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1)));
		71
		72	/* Internal data types for implementing the intrinsics. */
		73	typedef vector float __v4sf;
		74
		75	/* Create an undefined vector. */
		76	extern __inline __m128
		77	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		78	_mm_undefined_ps(void) {
		79	__m128 __Y = __Y;
		80	return __Y;
		81	}
		82
		83	/* Create a vector of zeros. */
		84	extern __inline __m128
		85	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		86	_mm_setzero_ps(void) {
		87	return __extension__(__m128){0.0f, 0.0f, 0.0f, 0.0f};
		88	}
		89
		90	/* Load four SPFP values from P. The address must be 16-byte aligned. */
		91	extern __inline __m128
		92	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		93	_mm_load_ps(float const *__P) {
		94	return ((__m128)vec_ld(0, (__v4sf *)__P));
		95	}
		96
		97	/* Load four SPFP values from P. The address need not be 16-byte aligned. */
		98	extern __inline __m128
		99	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		100	_mm_loadu_ps(float const *__P) {
		101	return (vec_vsx_ld(0, __P));
		102	}
		103
		104	/* Load four SPFP values in reverse order. The address must be aligned. */
		105	extern __inline __m128
		106	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		107	_mm_loadr_ps(float const *__P) {
		108	__v4sf __tmp;
		109	__m128 __result;
		110	static const __vector unsigned char __permute_vector = {
		111	0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
		112	0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};
		113
		114	__tmp = vec_ld(0, (__v4sf *)__P);
		115	__result = (__m128)vec_perm(__tmp, __tmp, __permute_vector);
		116	return __result;
		117	}
		118
		119	/* Create a vector with all four elements equal to F. */
		120	extern __inline __m128
		121	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		122	_mm_set1_ps(float __F) {
		123	return __extension__(__m128)(__v4sf){__F, __F, __F, __F};
		124	}
		125
		126	extern __inline __m128
		127	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		128	_mm_set_ps1(float __F) {
		129	return _mm_set1_ps(__F);
		130	}
		131
		132	/* Create the vector [Z Y X W]. */
		133	extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
		134	__artificial__))
		135	_mm_set_ps(const float __Z, const float __Y, const float __X, const float __W) {
		136	return __extension__(__m128)(__v4sf){__W, __X, __Y, __Z};
		137	}
		138
		139	/* Create the vector [W X Y Z]. */
		140	extern __inline __m128
		141	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		142	_mm_setr_ps(float __Z, float __Y, float __X, float __W) {
		143	return __extension__(__m128)(__v4sf){__Z, __Y, __X, __W};
		144	}
		145
		146	/* Store four SPFP values. The address must be 16-byte aligned. */
		147	extern __inline void
		148	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		149	_mm_store_ps(float *__P, __m128 __A) {
		150	vec_st((__v4sf)__A, 0, (__v4sf *)__P);
		151	}
		152
		153	/* Store four SPFP values. The address need not be 16-byte aligned. */
		154	extern __inline void
		155	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		156	_mm_storeu_ps(float *__P, __m128 __A) {
		157	(__m128_u )__P = __A;
		158	}
		159
		160	/* Store four SPFP values in reverse order. The address must be aligned. */
		161	extern __inline void
		162	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		163	_mm_storer_ps(float *__P, __m128 __A) {
		164	__v4sf __tmp;
		165	static const __vector unsigned char __permute_vector = {
		166	0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
		167	0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};
		168
		169	__tmp = (__m128)vec_perm(__A, __A, __permute_vector);
		170
		171	_mm_store_ps(__P, __tmp);
		172	}
		173
		174	/* Store the lower SPFP value across four words. */
		175	extern __inline void
		176	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		177	_mm_store1_ps(float *__P, __m128 __A) {
		178	__v4sf __va = vec_splat((__v4sf)__A, 0);
		179	_mm_store_ps(__P, __va);
		180	}
		181
		182	extern __inline void
		183	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		184	_mm_store_ps1(float *__P, __m128 __A) {
		185	_mm_store1_ps(__P, __A);
		186	}
		187
		188	/* Create a vector with element 0 as F and the rest zero. */
		189	extern __inline __m128
		190	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		191	_mm_set_ss(float __F) {
		192	return __extension__(__m128)(__v4sf){__F, 0.0f, 0.0f, 0.0f};
		193	}
		194
		195	/* Sets the low SPFP value of A from the low value of B. */
		196	extern __inline __m128
		197	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		198	_mm_move_ss(__m128 __A, __m128 __B) {
		199	static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
		200
		201	return (vec_sel((__v4sf)__A, (__v4sf)__B, __mask));
		202	}
		203
		204	/* Create a vector with element 0 as P and the rest zero. /
		205	extern __inline __m128
		206	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		207	_mm_load_ss(float const *__P) {
		208	return _mm_set_ss(*__P);
		209	}
		210
		211	/* Stores the lower SPFP value. */
		212	extern __inline void
		213	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		214	_mm_store_ss(float *__P, __m128 __A) {
		215	*__P = ((__v4sf)__A)[0];
		216	}
		217
		218	/* Perform the respective operation on the lower SPFP (single-precision
		219	floating-point) values of A and B; the upper three SPFP values are
		220	passed through from A. */
		221
		222	extern __inline __m128
		223	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		224	_mm_add_ss(__m128 __A, __m128 __B) {
		225	#ifdef _ARCH_PWR7
		226	__m128 __a, __b, __c;
		227	static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
		228	/* PowerISA VSX does not allow partial (for just lower double)
		229	results. So to insure we don't generate spurious exceptions
		230	(from the upper double values) we splat the lower double
		231	before we to the operation. */
		232	__a = vec_splat(__A, 0);
		233	__b = vec_splat(__B, 0);
		234	__c = __a + __b;
		235	/* Then we merge the lower float result with the original upper
		236	float elements from __A. */
		237	return (vec_sel(__A, __c, __mask));
		238	#else
		239	__A[0] = __A[0] + __B[0];
		240	return (__A);
		241	#endif
		242	}
		243
		244	extern __inline __m128
		245	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		246	_mm_sub_ss(__m128 __A, __m128 __B) {
		247	#ifdef _ARCH_PWR7
		248	__m128 __a, __b, __c;
		249	static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
		250	/* PowerISA VSX does not allow partial (for just lower double)
		251	results. So to insure we don't generate spurious exceptions
		252	(from the upper double values) we splat the lower double
		253	before we to the operation. */
		254	__a = vec_splat(__A, 0);
		255	__b = vec_splat(__B, 0);
		256	__c = __a - __b;
		257	/* Then we merge the lower float result with the original upper
		258	float elements from __A. */
		259	return (vec_sel(__A, __c, __mask));
		260	#else
		261	__A[0] = __A[0] - __B[0];
		262	return (__A);
		263	#endif
		264	}
		265
		266	extern __inline __m128
		267	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		268	_mm_mul_ss(__m128 __A, __m128 __B) {
		269	#ifdef _ARCH_PWR7
		270	__m128 __a, __b, __c;
		271	static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
		272	/* PowerISA VSX does not allow partial (for just lower double)
		273	results. So to insure we don't generate spurious exceptions
		274	(from the upper double values) we splat the lower double
		275	before we to the operation. */
		276	__a = vec_splat(__A, 0);
		277	__b = vec_splat(__B, 0);
		278	__c = __a * __b;
		279	/* Then we merge the lower float result with the original upper
		280	float elements from __A. */
		281	return (vec_sel(__A, __c, __mask));
		282	#else
		283	__A[0] = __A[0] * __B[0];
		284	return (__A);
		285	#endif
		286	}
		287
		288	extern __inline __m128
		289	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		290	_mm_div_ss(__m128 __A, __m128 __B) {
		291	#ifdef _ARCH_PWR7
		292	__m128 __a, __b, __c;
		293	static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
		294	/* PowerISA VSX does not allow partial (for just lower double)
		295	results. So to insure we don't generate spurious exceptions
		296	(from the upper double values) we splat the lower double
		297	before we to the operation. */
		298	__a = vec_splat(__A, 0);
		299	__b = vec_splat(__B, 0);
		300	__c = __a / __b;
		301	/* Then we merge the lower float result with the original upper
		302	float elements from __A. */
		303	return (vec_sel(__A, __c, __mask));
		304	#else
		305	__A[0] = __A[0] / __B[0];
		306	return (__A);
		307	#endif
		308	}
		309
		310	extern __inline __m128
		311	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		312	_mm_sqrt_ss(__m128 __A) {
		313	__m128 __a, __c;
		314	static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
		315	/* PowerISA VSX does not allow partial (for just lower double)
		316	* results. So to insure we don't generate spurious exceptions
		317	* (from the upper double values) we splat the lower double
		318	* before we to the operation. */
		319	__a = vec_splat(__A, 0);
		320	__c = vec_sqrt(__a);
		321	/* Then we merge the lower float result with the original upper
		322	* float elements from __A. */
		323	return (vec_sel(__A, __c, __mask));
		324	}
		325
		326	/* Perform the respective operation on the four SPFP values in A and B. */
		327	extern __inline __m128
		328	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		329	_mm_add_ps(__m128 __A, __m128 __B) {
		330	return (__m128)((__v4sf)__A + (__v4sf)__B);
		331	}
		332
		333	extern __inline __m128
		334	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		335	_mm_sub_ps(__m128 __A, __m128 __B) {
		336	return (__m128)((__v4sf)__A - (__v4sf)__B);
		337	}
		338
		339	extern __inline __m128
		340	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		341	_mm_mul_ps(__m128 __A, __m128 __B) {
		342	return (__m128)((__v4sf)__A * (__v4sf)__B);
		343	}
		344
		345	extern __inline __m128
		346	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		347	_mm_div_ps(__m128 __A, __m128 __B) {
		348	return (__m128)((__v4sf)__A / (__v4sf)__B);
		349	}
		350
		351	extern __inline __m128
		352	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		353	_mm_sqrt_ps(__m128 __A) {
		354	return (vec_sqrt((__v4sf)__A));
		355	}
		356
		357	extern __inline __m128
		358	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		359	_mm_rcp_ps(__m128 __A) {
		360	return (vec_re((__v4sf)__A));
		361	}
		362
		363	extern __inline __m128
		364	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		365	_mm_rsqrt_ps(__m128 __A) {
		366	return (vec_rsqrte(__A));
		367	}
		368
		369	extern __inline __m128
		370	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		371	_mm_rcp_ss(__m128 __A) {
		372	__m128 __a, __c;
		373	static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
		374	/* PowerISA VSX does not allow partial (for just lower double)
		375	* results. So to insure we don't generate spurious exceptions
		376	* (from the upper double values) we splat the lower double
		377	* before we to the operation. */
		378	__a = vec_splat(__A, 0);
		379	__c = _mm_rcp_ps(__a);
		380	/* Then we merge the lower float result with the original upper
		381	* float elements from __A. */
		382	return (vec_sel(__A, __c, __mask));
		383	}
		384
		385	extern __inline __m128
		386	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		387	_mm_rsqrt_ss(__m128 __A) {
		388	__m128 __a, __c;
		389	static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
		390	/* PowerISA VSX does not allow partial (for just lower double)
		391	* results. So to insure we don't generate spurious exceptions
		392	* (from the upper double values) we splat the lower double
		393	* before we to the operation. */
		394	__a = vec_splat(__A, 0);
		395	__c = vec_rsqrte(__a);
		396	/* Then we merge the lower float result with the original upper
		397	* float elements from __A. */
		398	return (vec_sel(__A, __c, __mask));
		399	}
		400
		401	extern __inline __m128
		402	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		403	_mm_min_ss(__m128 __A, __m128 __B) {
		404	__v4sf __a, __b, __c;
		405	static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
		406	/* PowerISA VSX does not allow partial (for just lower float)
		407	* results. So to insure we don't generate spurious exceptions
		408	* (from the upper float values) we splat the lower float
		409	* before we to the operation. */
		410	__a = vec_splat((__v4sf)__A, 0);
		411	__b = vec_splat((__v4sf)__B, 0);
		412	__c = vec_min(__a, __b);
		413	/* Then we merge the lower float result with the original upper
		414	* float elements from __A. */
		415	return (vec_sel((__v4sf)__A, __c, __mask));
		416	}
		417
		418	extern __inline __m128
		419	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		420	_mm_max_ss(__m128 __A, __m128 __B) {
		421	__v4sf __a, __b, __c;
		422	static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
		423	/* PowerISA VSX does not allow partial (for just lower float)
		424	* results. So to insure we don't generate spurious exceptions
		425	* (from the upper float values) we splat the lower float
		426	* before we to the operation. */
		427	__a = vec_splat(__A, 0);
		428	__b = vec_splat(__B, 0);
		429	__c = vec_max(__a, __b);
		430	/* Then we merge the lower float result with the original upper
		431	* float elements from __A. */
		432	return (vec_sel((__v4sf)__A, __c, __mask));
		433	}
		434
		435	extern __inline __m128
		436	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		437	_mm_min_ps(__m128 __A, __m128 __B) {
		438	__vector __bool int __m = vec_cmpgt((__v4sf)__B, (__v4sf)__A);
		439	return vec_sel(__B, __A, __m);
		440	}
		441
		442	extern __inline __m128
		443	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		444	_mm_max_ps(__m128 __A, __m128 __B) {
		445	__vector __bool int __m = vec_cmpgt((__v4sf)__A, (__v4sf)__B);
		446	return vec_sel(__B, __A, __m);
		447	}
		448
		449	/* Perform logical bit-wise operations on 128-bit values. */
		450	extern __inline __m128
		451	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		452	_mm_and_ps(__m128 __A, __m128 __B) {
		453	return ((__m128)vec_and((__v4sf)__A, (__v4sf)__B));
		454	// return __builtin_ia32_andps (__A, __B);
		455	}
		456
		457	extern __inline __m128
		458	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		459	_mm_andnot_ps(__m128 __A, __m128 __B) {
		460	return ((__m128)vec_andc((__v4sf)__B, (__v4sf)__A));
		461	}
		462
		463	extern __inline __m128
		464	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		465	_mm_or_ps(__m128 __A, __m128 __B) {
		466	return ((__m128)vec_or((__v4sf)__A, (__v4sf)__B));
		467	}
		468
		469	extern __inline __m128
		470	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		471	_mm_xor_ps(__m128 __A, __m128 __B) {
		472	return ((__m128)vec_xor((__v4sf)__A, (__v4sf)__B));
		473	}
		474
		475	/* Perform a comparison on the four SPFP values of A and B. For each
		476	element, if the comparison is true, place a mask of all ones in the
		477	result, otherwise a mask of zeros. */
		478	extern __inline __m128
		479	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		480	_mm_cmpeq_ps(__m128 __A, __m128 __B) {
		481	return ((__m128)vec_cmpeq((__v4sf)__A, (__v4sf)__B));
		482	}
		483
		484	extern __inline __m128
		485	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		486	_mm_cmplt_ps(__m128 __A, __m128 __B) {
		487	return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B));
		488	}
		489
		490	extern __inline __m128
		491	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		492	_mm_cmple_ps(__m128 __A, __m128 __B) {
		493	return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B));
		494	}
		495
		496	extern __inline __m128
		497	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		498	_mm_cmpgt_ps(__m128 __A, __m128 __B) {
		499	return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B));
		500	}
		501
		502	extern __inline __m128
		503	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		504	_mm_cmpge_ps(__m128 __A, __m128 __B) {
		505	return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B));
		506	}
		507
		508	extern __inline __m128
		509	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		510	_mm_cmpneq_ps(__m128 __A, __m128 __B) {
		511	__v4sf __temp = (__v4sf)vec_cmpeq((__v4sf)__A, (__v4sf)__B);
		512	return ((__m128)vec_nor(__temp, __temp));
		513	}
		514
		515	extern __inline __m128
		516	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		517	_mm_cmpnlt_ps(__m128 __A, __m128 __B) {
		518	return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B));
		519	}
		520
		521	extern __inline __m128
		522	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		523	_mm_cmpnle_ps(__m128 __A, __m128 __B) {
		524	return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B));
		525	}
		526
		527	extern __inline __m128
		528	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		529	_mm_cmpngt_ps(__m128 __A, __m128 __B) {
		530	return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B));
		531	}
		532
		533	extern __inline __m128
		534	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		535	_mm_cmpnge_ps(__m128 __A, __m128 __B) {
		536	return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B));
		537	}
		538
		539	extern __inline __m128
		540	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		541	_mm_cmpord_ps(__m128 __A, __m128 __B) {
		542	__vector unsigned int __a, __b;
		543	__vector unsigned int __c, __d;
		544	static const __vector unsigned int __float_exp_mask = {
		545	0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
		546
		547	__a = (__vector unsigned int)vec_abs((__v4sf)__A);
		548	__b = (__vector unsigned int)vec_abs((__v4sf)__B);
		549	__c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a);
		550	__d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b);
		551	return ((__m128)vec_and(__c, __d));
		552	}
		553
		554	extern __inline __m128
		555	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		556	_mm_cmpunord_ps(__m128 __A, __m128 __B) {
		557	__vector unsigned int __a, __b;
		558	__vector unsigned int __c, __d;
		559	static const __vector unsigned int __float_exp_mask = {
		560	0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
		561
		562	__a = (__vector unsigned int)vec_abs((__v4sf)__A);
		563	__b = (__vector unsigned int)vec_abs((__v4sf)__B);
		564	__c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask);
		565	__d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask);
		566	return ((__m128)vec_or(__c, __d));
		567	}
		568
		569	/* Perform a comparison on the lower SPFP values of A and B. If the
		570	comparison is true, place a mask of all ones in the result, otherwise a
		571	mask of zeros. The upper three SPFP values are passed through from A. */
		572	extern __inline __m128
		573	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		574	_mm_cmpeq_ss(__m128 __A, __m128 __B) {
		575	static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
		576	__v4sf __a, __b, __c;
		577	/* PowerISA VMX does not allow partial (for just element 0)
		578	* results. So to insure we don't generate spurious exceptions
		579	* (from the upper elements) we splat the lower float
		580	* before we to the operation. */
		581	__a = vec_splat((__v4sf)__A, 0);
		582	__b = vec_splat((__v4sf)__B, 0);
		583	__c = (__v4sf)vec_cmpeq(__a, __b);
		584	/* Then we merge the lower float result with the original upper
		585	* float elements from __A. */
		586	return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
		587	}
		588
		589	extern __inline __m128
		590	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		591	_mm_cmplt_ss(__m128 __A, __m128 __B) {
		592	static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
		593	__v4sf __a, __b, __c;
		594	/* PowerISA VMX does not allow partial (for just element 0)
		595	* results. So to insure we don't generate spurious exceptions
		596	* (from the upper elements) we splat the lower float
		597	* before we to the operation. */
		598	__a = vec_splat((__v4sf)__A, 0);
		599	__b = vec_splat((__v4sf)__B, 0);
		600	__c = (__v4sf)vec_cmplt(__a, __b);
		601	/* Then we merge the lower float result with the original upper
		602	* float elements from __A. */
		603	return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
		604	}
		605
		606	extern __inline __m128
		607	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		608	_mm_cmple_ss(__m128 __A, __m128 __B) {
		609	static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
		610	__v4sf __a, __b, __c;
		611	/* PowerISA VMX does not allow partial (for just element 0)
		612	* results. So to insure we don't generate spurious exceptions
		613	* (from the upper elements) we splat the lower float
		614	* before we to the operation. */
		615	__a = vec_splat((__v4sf)__A, 0);
		616	__b = vec_splat((__v4sf)__B, 0);
		617	__c = (__v4sf)vec_cmple(__a, __b);
		618	/* Then we merge the lower float result with the original upper
		619	* float elements from __A. */
		620	return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
		621	}
		622
		623	extern __inline __m128
		624	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		625	_mm_cmpgt_ss(__m128 __A, __m128 __B) {
		626	static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
		627	__v4sf __a, __b, __c;
		628	/* PowerISA VMX does not allow partial (for just element 0)
		629	* results. So to insure we don't generate spurious exceptions
		630	* (from the upper elements) we splat the lower float
		631	* before we to the operation. */
		632	__a = vec_splat((__v4sf)__A, 0);
		633	__b = vec_splat((__v4sf)__B, 0);
		634	__c = (__v4sf)vec_cmpgt(__a, __b);
		635	/* Then we merge the lower float result with the original upper
		636	* float elements from __A. */
		637	return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
		638	}
		639
		640	extern __inline __m128
		641	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		642	_mm_cmpge_ss(__m128 __A, __m128 __B) {
		643	static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
		644	__v4sf __a, __b, __c;
		645	/* PowerISA VMX does not allow partial (for just element 0)
		646	* results. So to insure we don't generate spurious exceptions
		647	* (from the upper elements) we splat the lower float
		648	* before we to the operation. */
		649	__a = vec_splat((__v4sf)__A, 0);
		650	__b = vec_splat((__v4sf)__B, 0);
		651	__c = (__v4sf)vec_cmpge(__a, __b);
		652	/* Then we merge the lower float result with the original upper
		653	* float elements from __A. */
		654	return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
		655	}
		656
		657	extern __inline __m128
		658	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		659	_mm_cmpneq_ss(__m128 __A, __m128 __B) {
		660	static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
		661	__v4sf __a, __b, __c;
		662	/* PowerISA VMX does not allow partial (for just element 0)
		663	* results. So to insure we don't generate spurious exceptions
		664	* (from the upper elements) we splat the lower float
		665	* before we to the operation. */
		666	__a = vec_splat((__v4sf)__A, 0);
		667	__b = vec_splat((__v4sf)__B, 0);
		668	__c = (__v4sf)vec_cmpeq(__a, __b);
		669	__c = vec_nor(__c, __c);
		670	/* Then we merge the lower float result with the original upper
		671	* float elements from __A. */
		672	return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
		673	}
		674
		675	extern __inline __m128
		676	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		677	_mm_cmpnlt_ss(__m128 __A, __m128 __B) {
		678	static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
		679	__v4sf __a, __b, __c;
		680	/* PowerISA VMX does not allow partial (for just element 0)
		681	* results. So to insure we don't generate spurious exceptions
		682	* (from the upper elements) we splat the lower float
		683	* before we to the operation. */
		684	__a = vec_splat((__v4sf)__A, 0);
		685	__b = vec_splat((__v4sf)__B, 0);
		686	__c = (__v4sf)vec_cmpge(__a, __b);
		687	/* Then we merge the lower float result with the original upper
		688	* float elements from __A. */
		689	return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
		690	}
		691
		692	extern __inline __m128
		693	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		694	_mm_cmpnle_ss(__m128 __A, __m128 __B) {
		695	static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
		696	__v4sf __a, __b, __c;
		697	/* PowerISA VMX does not allow partial (for just element 0)
		698	* results. So to insure we don't generate spurious exceptions
		699	* (from the upper elements) we splat the lower float
		700	* before we to the operation. */
		701	__a = vec_splat((__v4sf)__A, 0);
		702	__b = vec_splat((__v4sf)__B, 0);
		703	__c = (__v4sf)vec_cmpgt(__a, __b);
		704	/* Then we merge the lower float result with the original upper
		705	* float elements from __A. */
		706	return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
		707	}
		708
		709	extern __inline __m128
		710	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		711	_mm_cmpngt_ss(__m128 __A, __m128 __B) {
		712	static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
		713	__v4sf __a, __b, __c;
		714	/* PowerISA VMX does not allow partial (for just element 0)
		715	* results. So to insure we don't generate spurious exceptions
		716	* (from the upper elements) we splat the lower float
		717	* before we to the operation. */
		718	__a = vec_splat((__v4sf)__A, 0);
		719	__b = vec_splat((__v4sf)__B, 0);
		720	__c = (__v4sf)vec_cmple(__a, __b);
		721	/* Then we merge the lower float result with the original upper
		722	* float elements from __A. */
		723	return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
		724	}
		725
		726	extern __inline __m128
		727	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		728	_mm_cmpnge_ss(__m128 __A, __m128 __B) {
		729	static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
		730	__v4sf __a, __b, __c;
		731	/* PowerISA VMX does not allow partial (for just element 0)
		732	* results. So to insure we don't generate spurious exceptions
		733	* (from the upper elements) we splat the lower float
		734	* before we do the operation. */
		735	__a = vec_splat((__v4sf)__A, 0);
		736	__b = vec_splat((__v4sf)__B, 0);
		737	__c = (__v4sf)vec_cmplt(__a, __b);
		738	/* Then we merge the lower float result with the original upper
		739	* float elements from __A. */
		740	return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
		741	}
		742
		743	extern __inline __m128
		744	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		745	_mm_cmpord_ss(__m128 __A, __m128 __B) {
		746	__vector unsigned int __a, __b;
		747	__vector unsigned int __c, __d;
		748	static const __vector unsigned int __float_exp_mask = {
		749	0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
		750	static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
		751
		752	__a = (__vector unsigned int)vec_abs((__v4sf)__A);
		753	__b = (__vector unsigned int)vec_abs((__v4sf)__B);
		754	__c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a);
		755	__d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b);
		756	__c = vec_and(__c, __d);
		757	/* Then we merge the lower float result with the original upper
		758	* float elements from __A. */
		759	return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask));
		760	}
		761
		762	extern __inline __m128
		763	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		764	_mm_cmpunord_ss(__m128 __A, __m128 __B) {
		765	__vector unsigned int __a, __b;
		766	__vector unsigned int __c, __d;
		767	static const __vector unsigned int __float_exp_mask = {
		768	0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
		769	static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
		770
		771	__a = (__vector unsigned int)vec_abs((__v4sf)__A);
		772	__b = (__vector unsigned int)vec_abs((__v4sf)__B);
		773	__c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask);
		774	__d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask);
		775	__c = vec_or(__c, __d);
		776	/* Then we merge the lower float result with the original upper
		777	* float elements from __A. */
		778	return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask));
		779	}
		780
		781	/* Compare the lower SPFP values of A and B and return 1 if true
		782	and 0 if false. */
		783	extern __inline int
		784	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		785	_mm_comieq_ss(__m128 __A, __m128 __B) {
		786	return (__A[0] == __B[0]);
		787	}
		788
		789	extern __inline int
		790	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		791	_mm_comilt_ss(__m128 __A, __m128 __B) {
		792	return (__A[0] < __B[0]);
		793	}
		794
		795	extern __inline int
		796	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		797	_mm_comile_ss(__m128 __A, __m128 __B) {
		798	return (__A[0] <= __B[0]);
		799	}
		800
		801	extern __inline int
		802	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		803	_mm_comigt_ss(__m128 __A, __m128 __B) {
		804	return (__A[0] > __B[0]);
		805	}
		806
		807	extern __inline int
		808	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		809	_mm_comige_ss(__m128 __A, __m128 __B) {
		810	return (__A[0] >= __B[0]);
		811	}
		812
		813	extern __inline int
		814	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		815	_mm_comineq_ss(__m128 __A, __m128 __B) {
		816	return (__A[0] != __B[0]);
		817	}
		818
/* FIXME
 * The _mm_ucomi??_ss implementations below are exactly the same as
 * _mm_comi??_ss because GCC for PowerPC only generates unordered
 * compares (scalar and vector).
 * Technically _mm_comieq_ss et al. should be using the ordered
 * compare and signal for QNaNs.
 * The _mm_ucomieq_sd et al. should be OK, as is.
 */
		827	extern __inline int
		828	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		829	_mm_ucomieq_ss(__m128 __A, __m128 __B) {
		830	return (__A[0] == __B[0]);
		831	}
		832
		833	extern __inline int
		834	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		835	_mm_ucomilt_ss(__m128 __A, __m128 __B) {
		836	return (__A[0] < __B[0]);
		837	}
		838
		839	extern __inline int
		840	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		841	_mm_ucomile_ss(__m128 __A, __m128 __B) {
		842	return (__A[0] <= __B[0]);
		843	}
		844
		845	extern __inline int
		846	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		847	_mm_ucomigt_ss(__m128 __A, __m128 __B) {
		848	return (__A[0] > __B[0]);
		849	}
		850
		851	extern __inline int
		852	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		853	_mm_ucomige_ss(__m128 __A, __m128 __B) {
		854	return (__A[0] >= __B[0]);
		855	}
		856
		857	extern __inline int
		858	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		859	_mm_ucomineq_ss(__m128 __A, __m128 __B) {
		860	return (__A[0] != __B[0]);
		861	}
		862
		863	extern __inline float
		864	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		865	_mm_cvtss_f32(__m128 __A) {
		866	return ((__v4sf)__A)[0];
		867	}
		868
		869	/* Convert the lower SPFP value to a 32-bit integer according to the current
		870	rounding mode. */
		871	extern __inline int
		872	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		873	_mm_cvtss_si32(__m128 __A) {
		874	int __res;
		875	#ifdef _ARCH_PWR8
		876	double __dtmp;
		877	__asm__(
		878	#ifdef __LITTLE_ENDIAN__
		879	"xxsldwi %x0,%x0,%x0,3;\n"
		880	#endif
		881	"xscvspdp %x2,%x0;\n"
		882	"fctiw %2,%2;\n"
		883	"mfvsrd %1,%x2;\n"
		884	: "+wa"(__A), "=r"(__res), "=f"(__dtmp)
		885	:);
		886	#else
		887	__res = __builtin_rint(__A[0]);
		888	#endif
		889	return __res;
		890	}
		891
		892	extern __inline int
		893	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		894	_mm_cvt_ss2si(__m128 __A) {
		895	return _mm_cvtss_si32(__A);
		896	}
		897
/* Convert the lower SPFP value to a 64-bit integer according to the
   current rounding mode. */
		900
		901	/* Intel intrinsic. */
		902	extern __inline long long
		903	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		904	_mm_cvtss_si64(__m128 __A) {
		905	long long __res;
		906	#if defined(_ARCH_PWR8) && defined(__powerpc64__)
		907	double __dtmp;
		908	__asm__(
		909	#ifdef __LITTLE_ENDIAN__
		910	"xxsldwi %x0,%x0,%x0,3;\n"
		911	#endif
		912	"xscvspdp %x2,%x0;\n"
		913	"fctid %2,%2;\n"
		914	"mfvsrd %1,%x2;\n"
		915	: "+wa"(__A), "=r"(__res), "=f"(__dtmp)
		916	:);
		917	#else
		918	__res = __builtin_llrint(__A[0]);
		919	#endif
		920	return __res;
		921	}
		922
		923	/* Microsoft intrinsic. */
		924	extern __inline long long
		925	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		926	_mm_cvtss_si64x(__m128 __A) {
		927	return _mm_cvtss_si64((__v4sf)__A);
		928	}
		929
		930	/* Constants for use with _mm_prefetch. */
		931	enum _mm_hint {
		932	/* _MM_HINT_ET is _MM_HINT_T with set 3rd bit. */
		933	_MM_HINT_ET0 = 7,
		934	_MM_HINT_ET1 = 6,
		935	_MM_HINT_T0 = 3,
		936	_MM_HINT_T1 = 2,
		937	_MM_HINT_T2 = 1,
		938	_MM_HINT_NTA = 0
		939	};
		940
		941	/* Loads one cache line from address P to a location "closer" to the
		942	processor. The selector I specifies the type of prefetch operation. */
		943	extern __inline void
		944	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		945	_mm_prefetch(const void *__P, enum _mm_hint __I) {
		946	/* Current PowerPC will ignores the hint parameters. */
		947	__builtin_prefetch(__P);
		948	}
		949
		950	/* Convert the two lower SPFP values to 32-bit integers according to the
		951	current rounding mode. Return the integers in packed form. */
		952	extern __inline __m64
		953	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		954	_mm_cvtps_pi32(__m128 __A) {
		955	/* Splat two lower SPFP values to both halves. */
		956	__v4sf __temp, __rounded;
		957	__vector unsigned long long __result;
		958
		959	/* Splat two lower SPFP values to both halves. */
		960	__temp = (__v4sf)vec_splat((__vector long long)__A, 0);
		961	__rounded = vec_rint(__temp);
		962	__result = (__vector unsigned long long)vec_cts(__rounded, 0);
		963
		964	return (__m64)((__vector long long)__result)[0];
		965	}
		966
		967	extern __inline __m64
		968	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		969	_mm_cvt_ps2pi(__m128 __A) {
		970	return _mm_cvtps_pi32(__A);
		971	}
		972
		973	/* Truncate the lower SPFP value to a 32-bit integer. */
		974	extern __inline int
		975	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		976	_mm_cvttss_si32(__m128 __A) {
		977	/* Extract the lower float element. */
		978	float __temp = __A[0];
		979	/* truncate to 32-bit integer and return. */
		980	return __temp;
		981	}
		982
		983	extern __inline int
		984	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		985	_mm_cvtt_ss2si(__m128 __A) {
		986	return _mm_cvttss_si32(__A);
		987	}
		988
		989	/* Intel intrinsic. */
		990	extern __inline long long
		991	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		992	_mm_cvttss_si64(__m128 __A) {
		993	/* Extract the lower float element. */
		994	float __temp = __A[0];
		995	/* truncate to 32-bit integer and return. */
		996	return __temp;
		997	}
		998
		999	/* Microsoft intrinsic. */
		1000	extern __inline long long
		1001	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1002	_mm_cvttss_si64x(__m128 __A) {
		1003	/* Extract the lower float element. */
		1004	float __temp = __A[0];
		1005	/* truncate to 32-bit integer and return. */
		1006	return __temp;
		1007	}
		1008
		1009	/* Truncate the two lower SPFP values to 32-bit integers. Return the
		1010	integers in packed form. */
		1011	extern __inline __m64
		1012	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1013	_mm_cvttps_pi32(__m128 __A) {
		1014	__v4sf __temp;
		1015	__vector unsigned long long __result;
		1016
		1017	/* Splat two lower SPFP values to both halves. */
		1018	__temp = (__v4sf)vec_splat((__vector long long)__A, 0);
		1019	__result = (__vector unsigned long long)vec_cts(__temp, 0);
		1020
		1021	return (__m64)((__vector long long)__result)[0];
		1022	}
		1023
		1024	extern __inline __m64
		1025	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1026	_mm_cvtt_ps2pi(__m128 __A) {
		1027	return _mm_cvttps_pi32(__A);
		1028	}
		1029
		1030	/* Convert B to a SPFP value and insert it as element zero in A. */
		1031	extern __inline __m128
		1032	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1033	_mm_cvtsi32_ss(__m128 __A, int __B) {
		1034	float __temp = __B;
		1035	__A[0] = __temp;
		1036
		1037	return __A;
		1038	}
		1039
		1040	extern __inline __m128
		1041	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1042	_mm_cvt_si2ss(__m128 __A, int __B) {
		1043	return _mm_cvtsi32_ss(__A, __B);
		1044	}
		1045
		1046	/* Convert B to a SPFP value and insert it as element zero in A. */
		1047	/* Intel intrinsic. */
		1048	extern __inline __m128
		1049	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1050	_mm_cvtsi64_ss(__m128 __A, long long __B) {
		1051	float __temp = __B;
		1052	__A[0] = __temp;
		1053
		1054	return __A;
		1055	}
		1056
		1057	/* Microsoft intrinsic. */
		1058	extern __inline __m128
		1059	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1060	_mm_cvtsi64x_ss(__m128 __A, long long __B) {
		1061	return _mm_cvtsi64_ss(__A, __B);
		1062	}
		1063
		1064	/* Convert the two 32-bit values in B to SPFP form and insert them
		1065	as the two lower elements in A. */
		1066	extern __inline __m128
		1067	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1068	_mm_cvtpi32_ps(__m128 __A, __m64 __B) {
		1069	__vector signed int __vm1;
		1070	__vector float __vf1;
		1071
		1072	__vm1 = (__vector signed int)(__vector unsigned long long){__B, __B};
		1073	__vf1 = (__vector float)vec_ctf(__vm1, 0);
		1074
		1075	return ((__m128)(__vector unsigned long long){
		1076	((__vector unsigned long long)__vf1)[0],
		1077	((__vector unsigned long long)__A)[1]});
		1078	}
		1079
		1080	extern __inline __m128
		1081	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1082	_mm_cvt_pi2ps(__m128 __A, __m64 __B) {
		1083	return _mm_cvtpi32_ps(__A, __B);
		1084	}
		1085
		1086	/* Convert the four signed 16-bit values in A to SPFP form. */
		1087	extern __inline __m128
		1088	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1089	_mm_cvtpi16_ps(__m64 __A) {
		1090	__vector signed short __vs8;
		1091	__vector signed int __vi4;
		1092	__vector float __vf1;
		1093
		1094	__vs8 = (__vector signed short)(__vector unsigned long long){__A, __A};
		1095	__vi4 = vec_vupklsh(__vs8);
		1096	__vf1 = (__vector float)vec_ctf(__vi4, 0);
		1097
		1098	return (__m128)__vf1;
		1099	}
		1100
		1101	/* Convert the four unsigned 16-bit values in A to SPFP form. */
		1102	extern __inline __m128
		1103	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1104	_mm_cvtpu16_ps(__m64 __A) {
		1105	const __vector unsigned short __zero = {0, 0, 0, 0, 0, 0, 0, 0};
		1106	__vector unsigned short __vs8;
		1107	__vector unsigned int __vi4;
		1108	__vector float __vf1;
		1109
		1110	__vs8 = (__vector unsigned short)(__vector unsigned long long){__A, __A};
		1111	__vi4 = (__vector unsigned int)vec_mergel
		1112	#ifdef __LITTLE_ENDIAN__
		1113	(__vs8, __zero);
		1114	#else
		1115	(__zero, __vs8);
		1116	#endif
		1117	__vf1 = (__vector float)vec_ctf(__vi4, 0);
		1118
		1119	return (__m128)__vf1;
		1120	}
		1121
		1122	/* Convert the low four signed 8-bit values in A to SPFP form. */
		1123	extern __inline __m128
		1124	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1125	_mm_cvtpi8_ps(__m64 __A) {
		1126	__vector signed char __vc16;
		1127	__vector signed short __vs8;
		1128	__vector signed int __vi4;
		1129	__vector float __vf1;
		1130
		1131	__vc16 = (__vector signed char)(__vector unsigned long long){__A, __A};
		1132	__vs8 = vec_vupkhsb(__vc16);
		1133	__vi4 = vec_vupkhsh(__vs8);
		1134	__vf1 = (__vector float)vec_ctf(__vi4, 0);
		1135
		1136	return (__m128)__vf1;
		1137	}
		1138
		1139	/* Convert the low four unsigned 8-bit values in A to SPFP form. */
		1140	extern __inline __m128
		1141	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1142
		1143	_mm_cvtpu8_ps(__m64 __A) {
		1144	const __vector unsigned char __zero = {0, 0, 0, 0, 0, 0, 0, 0};
		1145	__vector unsigned char __vc16;
		1146	__vector unsigned short __vs8;
		1147	__vector unsigned int __vi4;
		1148	__vector float __vf1;
		1149
		1150	__vc16 = (__vector unsigned char)(__vector unsigned long long){__A, __A};
		1151	#ifdef __LITTLE_ENDIAN__
		1152	__vs8 = (__vector unsigned short)vec_mergel(__vc16, __zero);
		1153	__vi4 =
		1154	(__vector unsigned int)vec_mergeh(__vs8, (__vector unsigned short)__zero);
		1155	#else
		1156	__vs8 = (__vector unsigned short)vec_mergel(__zero, __vc16);
		1157	__vi4 =
		1158	(__vector unsigned int)vec_mergeh((__vector unsigned short)__zero, __vs8);
		1159	#endif
		1160	__vf1 = (__vector float)vec_ctf(__vi4, 0);
		1161
		1162	return (__m128)__vf1;
		1163	}
		1164
		1165	/* Convert the four signed 32-bit values in A and B to SPFP form. */
		1166	extern __inline __m128
		1167	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1168	_mm_cvtpi32x2_ps(__m64 __A, __m64 __B) {
		1169	__vector signed int __vi4;
		1170	__vector float __vf4;
		1171
		1172	__vi4 = (__vector signed int)(__vector unsigned long long){__A, __B};
		1173	__vf4 = (__vector float)vec_ctf(__vi4, 0);
		1174	return (__m128)__vf4;
		1175	}
		1176
		1177	/* Convert the four SPFP values in A to four signed 16-bit integers. */
		1178	extern __inline __m64
		1179	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1180	_mm_cvtps_pi16(__m128 __A) {
		1181	__v4sf __rounded;
		1182	__vector signed int __temp;
		1183	__vector unsigned long long __result;
		1184
		1185	__rounded = vec_rint(__A);
		1186	__temp = vec_cts(__rounded, 0);
		1187	__result = (__vector unsigned long long)vec_pack(__temp, __temp);
		1188
		1189	return (__m64)((__vector long long)__result)[0];
		1190	}
		1191
		1192	/* Convert the four SPFP values in A to four signed 8-bit integers. */
		1193	extern __inline __m64
		1194	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1195	_mm_cvtps_pi8(__m128 __A) {
		1196	__v4sf __rounded;
		1197	__vector signed int __tmp_i;
		1198	static const __vector signed int __zero = {0, 0, 0, 0};
		1199	__vector signed short __tmp_s;
		1200	__vector signed char __res_v;
		1201
		1202	__rounded = vec_rint(__A);
		1203	__tmp_i = vec_cts(__rounded, 0);
		1204	__tmp_s = vec_pack(__tmp_i, __zero);
		1205	__res_v = vec_pack(__tmp_s, __tmp_s);
		1206	return (__m64)((__vector long long)__res_v)[0];
		1207	}
		1208
		1209	/* Selects four specific SPFP values from A and B based on MASK. */
		1210	extern __inline __m128
		1211	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1212
		1213	_mm_shuffle_ps(__m128 __A, __m128 __B, int const __mask) {
		1214	unsigned long __element_selector_10 = __mask & 0x03;
		1215	unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
		1216	unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
		1217	unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
		1218	static const unsigned int __permute_selectors[4] = {
		1219	#ifdef __LITTLE_ENDIAN__
		1220	0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
		1221	#else
		1222	0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
		1223	#endif
		1224	};
		1225	__vector unsigned int __t;
		1226
		1227	__t[0] = __permute_selectors[__element_selector_10];
		1228	__t[1] = __permute_selectors[__element_selector_32];
		1229	__t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
		1230	__t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
		1231	return vec_perm((__v4sf)__A, (__v4sf)__B, (__vector unsigned char)__t);
		1232	}
		1233
		1234	/* Selects and interleaves the upper two SPFP values from A and B. */
		1235	extern __inline __m128
		1236	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1237	_mm_unpackhi_ps(__m128 __A, __m128 __B) {
		1238	return (__m128)vec_vmrglw((__v4sf)__A, (__v4sf)__B);
		1239	}
		1240
		1241	/* Selects and interleaves the lower two SPFP values from A and B. */
		1242	extern __inline __m128
		1243	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1244	_mm_unpacklo_ps(__m128 __A, __m128 __B) {
		1245	return (__m128)vec_vmrghw((__v4sf)__A, (__v4sf)__B);
		1246	}
		1247
		1248	/* Sets the upper two SPFP values with 64-bits of data loaded from P;
		1249	the lower two values are passed through from A. */
		1250	extern __inline __m128
		1251	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1252	_mm_loadh_pi(__m128 __A, __m64 const *__P) {
		1253	__vector unsigned long long __a = (__vector unsigned long long)__A;
		1254	__vector unsigned long long __p = vec_splats(*__P);
		1255	__a[1] = __p[1];
		1256
		1257	return (__m128)__a;
		1258	}
		1259
		1260	/* Stores the upper two SPFP values of A into P. */
		1261	extern __inline void
		1262	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1263	_mm_storeh_pi(__m64 *__P, __m128 __A) {
		1264	__vector unsigned long long __a = (__vector unsigned long long)__A;
		1265
		1266	*__P = __a[1];
		1267	}
		1268
		1269	/* Moves the upper two values of B into the lower two values of A. */
		1270	extern __inline __m128
		1271	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1272	_mm_movehl_ps(__m128 __A, __m128 __B) {
		1273	return (__m128)vec_mergel((__vector unsigned long long)__B,
		1274	(__vector unsigned long long)__A);
		1275	}
		1276
		1277	/* Moves the lower two values of B into the upper two values of A. */
		1278	extern __inline __m128
		1279	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1280	_mm_movelh_ps(__m128 __A, __m128 __B) {
		1281	return (__m128)vec_mergeh((__vector unsigned long long)__A,
		1282	(__vector unsigned long long)__B);
		1283	}
		1284
		1285	/* Sets the lower two SPFP values with 64-bits of data loaded from P;
		1286	the upper two values are passed through from A. */
		1287	extern __inline __m128
		1288	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1289	_mm_loadl_pi(__m128 __A, __m64 const *__P) {
		1290	__vector unsigned long long __a = (__vector unsigned long long)__A;
		1291	__vector unsigned long long __p = vec_splats(*__P);
		1292	__a[0] = __p[0];
		1293
		1294	return (__m128)__a;
		1295	}
		1296
		1297	/* Stores the lower two SPFP values of A into P. */
		1298	extern __inline void
		1299	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1300	_mm_storel_pi(__m64 *__P, __m128 __A) {
		1301	__vector unsigned long long __a = (__vector unsigned long long)__A;
		1302
		1303	*__P = __a[0];
		1304	}
		1305
		1306	#ifdef _ARCH_PWR8
		1307	/* Intrinsic functions that require PowerISA 2.07 minimum. */
		1308
		1309	/* Creates a 4-bit mask from the most significant bits of the SPFP values. */
		1310	extern __inline int
		1311	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1312	_mm_movemask_ps(__m128 __A) {
		1313	#ifdef _ARCH_PWR10
		1314	return vec_extractm((__vector unsigned int)__A);
		1315	#else
		1316	__vector unsigned long long __result;
		1317	static const __vector unsigned int __perm_mask = {
		1318	#ifdef __LITTLE_ENDIAN__
		1319	0x00204060, 0x80808080, 0x80808080, 0x80808080
		1320	#else
		1321	0x80808080, 0x80808080, 0x80808080, 0x00204060
		1322	#endif
		1323	};
		1324
		1325	__result = ((__vector unsigned long long)vec_vbpermq(
		1326	(__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
		1327
		1328	#ifdef __LITTLE_ENDIAN__
		1329	return __result[1];
		1330	#else
		1331	return __result[0];
		1332	#endif
		1333	#endif /* !_ARCH_PWR10 */
		1334	}
		1335	#endif /* _ARCH_PWR8 */
		1336
		1337	/* Create a vector with all four elements equal to P. /
		1338	extern __inline __m128
		1339	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1340	_mm_load1_ps(float const *__P) {
		1341	return _mm_set1_ps(*__P);
		1342	}
		1343
		1344	extern __inline __m128
		1345	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1346	_mm_load_ps1(float const *__P) {
		1347	return _mm_load1_ps(__P);
		1348	}
		1349
		1350	/* Extracts one of the four words of A. The selector N must be immediate. */
		1351	extern __inline int
		1352	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1353	_mm_extract_pi16(__m64 const __A, int const __N) {
		1354	unsigned int __shiftr = __N & 3;
		1355	#ifdef __BIG_ENDIAN__
		1356	__shiftr = 3 - __shiftr;
		1357	#endif
		1358
		1359	return ((__A >> (__shiftr * 16)) & 0xffff);
		1360	}
		1361
		1362	extern __inline int
		1363	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1364	_m_pextrw(__m64 const __A, int const __N) {
		1365	return _mm_extract_pi16(__A, __N);
		1366	}
		1367
		1368	/* Inserts word D into one of four words of A. The selector N must be
		1369	immediate. */
		1370	extern __inline __m64
		1371	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1372	_mm_insert_pi16(__m64 const __A, int const __D, int const __N) {
		1373	const int __shiftl = (__N & 3) * 16;
		1374	const __m64 __shiftD = (const __m64)__D << __shiftl;
		1375	const __m64 __mask = 0xffffUL << __shiftl;
		1376	__m64 __result = (__A & (~__mask)) \| (__shiftD & __mask);
		1377
		1378	return __result;
		1379	}
		1380
		1381	extern __inline __m64
		1382	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1383	_m_pinsrw(__m64 const __A, int const __D, int const __N) {
		1384	return _mm_insert_pi16(__A, __D, __N);
		1385	}
		1386
		1387	/* Compute the element-wise maximum of signed 16-bit values. */
		1388	extern __inline __m64
		1389	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1390
		1391	_mm_max_pi16(__m64 __A, __m64 __B) {
		1392	#if _ARCH_PWR8
		1393	__vector signed short __a, __b, __r;
		1394	__vector __bool short __c;
		1395
		1396	__a = (__vector signed short)vec_splats(__A);
		1397	__b = (__vector signed short)vec_splats(__B);
		1398	__c = (__vector __bool short)vec_cmpgt(__a, __b);
		1399	__r = vec_sel(__b, __a, __c);
		1400	return (__m64)((__vector long long)__r)[0];
		1401	#else
		1402	__m64_union __m1, __m2, __res;
		1403
		1404	__m1.as_m64 = __A;
		1405	__m2.as_m64 = __B;
		1406
		1407	__res.as_short[0] = (__m1.as_short[0] > __m2.as_short[0]) ? __m1.as_short[0]
		1408	: __m2.as_short[0];
		1409	__res.as_short[1] = (__m1.as_short[1] > __m2.as_short[1]) ? __m1.as_short[1]
		1410	: __m2.as_short[1];
		1411	__res.as_short[2] = (__m1.as_short[2] > __m2.as_short[2]) ? __m1.as_short[2]
		1412	: __m2.as_short[2];
		1413	__res.as_short[3] = (__m1.as_short[3] > __m2.as_short[3]) ? __m1.as_short[3]
		1414	: __m2.as_short[3];
		1415
		1416	return (__m64)__res.as_m64;
		1417	#endif
		1418	}
		1419
		1420	extern __inline __m64
		1421	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1422	_m_pmaxsw(__m64 __A, __m64 __B) {
		1423	return _mm_max_pi16(__A, __B);
		1424	}
		1425
		1426	/* Compute the element-wise maximum of unsigned 8-bit values. */
		1427	extern __inline __m64
		1428	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1429	_mm_max_pu8(__m64 __A, __m64 __B) {
		1430	#if _ARCH_PWR8
		1431	__vector unsigned char __a, __b, __r;
		1432	__vector __bool char __c;
		1433
		1434	__a = (__vector unsigned char)vec_splats(__A);
		1435	__b = (__vector unsigned char)vec_splats(__B);
		1436	__c = (__vector __bool char)vec_cmpgt(__a, __b);
		1437	__r = vec_sel(__b, __a, __c);
		1438	return (__m64)((__vector long long)__r)[0];
		1439	#else
		1440	__m64_union __m1, __m2, __res;
		1441	long __i;
		1442
		1443	__m1.as_m64 = __A;
		1444	__m2.as_m64 = __B;
		1445
		1446	for (__i = 0; __i < 8; __i++)
		1447	__res.as_char[__i] =
		1448	((unsigned char)__m1.as_char[__i] > (unsigned char)__m2.as_char[__i])
		1449	? __m1.as_char[__i]
		1450	: __m2.as_char[__i];
		1451
		1452	return (__m64)__res.as_m64;
		1453	#endif
		1454	}
		1455
		1456	extern __inline __m64
		1457	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1458	_m_pmaxub(__m64 __A, __m64 __B) {
		1459	return _mm_max_pu8(__A, __B);
		1460	}
		1461
		1462	/* Compute the element-wise minimum of signed 16-bit values. */
		1463	extern __inline __m64
		1464	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1465	_mm_min_pi16(__m64 __A, __m64 __B) {
		1466	#if _ARCH_PWR8
		1467	__vector signed short __a, __b, __r;
		1468	__vector __bool short __c;
		1469
		1470	__a = (__vector signed short)vec_splats(__A);
		1471	__b = (__vector signed short)vec_splats(__B);
		1472	__c = (__vector __bool short)vec_cmplt(__a, __b);
		1473	__r = vec_sel(__b, __a, __c);
		1474	return (__m64)((__vector long long)__r)[0];
		1475	#else
		1476	__m64_union __m1, __m2, __res;
		1477
		1478	__m1.as_m64 = __A;
		1479	__m2.as_m64 = __B;
		1480
		1481	__res.as_short[0] = (__m1.as_short[0] < __m2.as_short[0]) ? __m1.as_short[0]
		1482	: __m2.as_short[0];
		1483	__res.as_short[1] = (__m1.as_short[1] < __m2.as_short[1]) ? __m1.as_short[1]
		1484	: __m2.as_short[1];
		1485	__res.as_short[2] = (__m1.as_short[2] < __m2.as_short[2]) ? __m1.as_short[2]
		1486	: __m2.as_short[2];
		1487	__res.as_short[3] = (__m1.as_short[3] < __m2.as_short[3]) ? __m1.as_short[3]
		1488	: __m2.as_short[3];
		1489
		1490	return (__m64)__res.as_m64;
		1491	#endif
		1492	}
		1493
		1494	extern __inline __m64
		1495	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1496	_m_pminsw(__m64 __A, __m64 __B) {
		1497	return _mm_min_pi16(__A, __B);
		1498	}
		1499
		1500	/* Compute the element-wise minimum of unsigned 8-bit values. */
		1501	extern __inline __m64
		1502	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1503	_mm_min_pu8(__m64 __A, __m64 __B) {
		1504	#if _ARCH_PWR8
		1505	__vector unsigned char __a, __b, __r;
		1506	__vector __bool char __c;
		1507
		1508	__a = (__vector unsigned char)vec_splats(__A);
		1509	__b = (__vector unsigned char)vec_splats(__B);
		1510	__c = (__vector __bool char)vec_cmplt(__a, __b);
		1511	__r = vec_sel(__b, __a, __c);
		1512	return (__m64)((__vector long long)__r)[0];
		1513	#else
		1514	__m64_union __m1, __m2, __res;
		1515	long __i;
		1516
		1517	__m1.as_m64 = __A;
		1518	__m2.as_m64 = __B;
		1519
		1520	for (__i = 0; __i < 8; __i++)
		1521	__res.as_char[__i] =
		1522	((unsigned char)__m1.as_char[__i] < (unsigned char)__m2.as_char[__i])
		1523	? __m1.as_char[__i]
		1524	: __m2.as_char[__i];
		1525
		1526	return (__m64)__res.as_m64;
		1527	#endif
		1528	}
		1529
		1530	extern __inline __m64
		1531	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1532	_m_pminub(__m64 __A, __m64 __B) {
		1533	return _mm_min_pu8(__A, __B);
		1534	}
		1535
		1536	/* Create an 8-bit mask of the signs of 8-bit values. */
		1537	extern __inline int
		1538	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1539	_mm_movemask_pi8(__m64 __A) {
		1540	#ifdef __powerpc64__
		1541	unsigned long long __p =
		1542	#ifdef __LITTLE_ENDIAN__
		1543	0x0008101820283038UL; // permute control for sign bits
		1544	#else
		1545	0x3830282018100800UL; // permute control for sign bits
		1546	#endif
		1547	return __builtin_bpermd(__p, __A);
		1548	#else
		1549	#ifdef __LITTLE_ENDIAN__
		1550	unsigned int __mask = 0x20283038UL;
		1551	unsigned int __r1 = __builtin_bpermd(__mask, __A) & 0xf;
		1552	unsigned int __r2 = __builtin_bpermd(__mask, __A >> 32) & 0xf;
		1553	#else
		1554	unsigned int __mask = 0x38302820UL;
		1555	unsigned int __r1 = __builtin_bpermd(__mask, __A >> 32) & 0xf;
		1556	unsigned int __r2 = __builtin_bpermd(__mask, __A) & 0xf;
		1557	#endif
		1558	return (__r2 << 4) \| __r1;
		1559	#endif
		1560	}
		1561
		1562	extern __inline int
		1563	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1564	_m_pmovmskb(__m64 __A) {
		1565	return _mm_movemask_pi8(__A);
		1566	}
		1567
		1568	/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
		1569	in B and produce the high 16 bits of the 32-bit results. */
		1570	extern __inline __m64
		1571	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1572	_mm_mulhi_pu16(__m64 __A, __m64 __B) {
		1573	__vector unsigned short __a, __b;
		1574	__vector unsigned short __c;
		1575	__vector unsigned int __w0, __w1;
		1576	__vector unsigned char __xform1 = {
		1577	#ifdef __LITTLE_ENDIAN__
		1578	0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
		1579	0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
		1580	#else
		1581	0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
		1582	0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
		1583	#endif
		1584	};
		1585
		1586	__a = (__vector unsigned short)vec_splats(__A);
		1587	__b = (__vector unsigned short)vec_splats(__B);
		1588
		1589	__w0 = vec_vmuleuh(__a, __b);
		1590	__w1 = vec_vmulouh(__a, __b);
		1591	__c = (__vector unsigned short)vec_perm(__w0, __w1, __xform1);
		1592
		1593	return (__m64)((__vector long long)__c)[0];
		1594	}
		1595
		1596	extern __inline __m64
		1597	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1598	_m_pmulhuw(__m64 __A, __m64 __B) {
		1599	return _mm_mulhi_pu16(__A, __B);
		1600	}
		1601
		1602	/* Return a combination of the four 16-bit values in A. The selector
		1603	must be an immediate. */
		1604	extern __inline __m64
		1605	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1606	_mm_shuffle_pi16(__m64 __A, int const __N) {
		1607	unsigned long __element_selector_10 = __N & 0x03;
		1608	unsigned long __element_selector_32 = (__N >> 2) & 0x03;
		1609	unsigned long __element_selector_54 = (__N >> 4) & 0x03;
		1610	unsigned long __element_selector_76 = (__N >> 6) & 0x03;
		1611	static const unsigned short __permute_selectors[4] = {
		1612	#ifdef __LITTLE_ENDIAN__
		1613	0x0908, 0x0B0A, 0x0D0C, 0x0F0E
		1614	#else
		1615	0x0607, 0x0405, 0x0203, 0x0001
		1616	#endif
		1617	};
		1618	__m64_union __t;
		1619	__vector unsigned long long __a, __p, __r;
		1620
		1621	#ifdef __LITTLE_ENDIAN__
		1622	__t.as_short[0] = __permute_selectors[__element_selector_10];
		1623	__t.as_short[1] = __permute_selectors[__element_selector_32];
		1624	__t.as_short[2] = __permute_selectors[__element_selector_54];
		1625	__t.as_short[3] = __permute_selectors[__element_selector_76];
		1626	#else
		1627	__t.as_short[3] = __permute_selectors[__element_selector_10];
		1628	__t.as_short[2] = __permute_selectors[__element_selector_32];
		1629	__t.as_short[1] = __permute_selectors[__element_selector_54];
		1630	__t.as_short[0] = __permute_selectors[__element_selector_76];
		1631	#endif
		1632	__p = vec_splats(__t.as_m64);
		1633	__a = vec_splats(__A);
		1634	__r = vec_perm(__a, __a, (__vector unsigned char)__p);
		1635	return (__m64)((__vector long long)__r)[0];
		1636	}
		1637
		1638	extern __inline __m64
		1639	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1640	_m_pshufw(__m64 __A, int const __N) {
		1641	return _mm_shuffle_pi16(__A, __N);
		1642	}
		1643
		1644	/* Conditionally store byte elements of A into P. The high bit of each
		1645	byte in the selector N determines whether the corresponding byte from
		1646	A is stored.
			This implementation is a read-modify-write of the full eight bytes at P:
			it loads the current contents, replaces the selected bytes with those of
			A, and stores all eight bytes back. Bytes whose selector bit is clear are
			therefore rewritten with their previous value rather than left untouched.
			NOTE(review): P is accessed through an __m64 pointer; presumably the
			caller must supply storage that is valid for an 8-byte access -- confirm. */
		1647	extern __inline void
		1648	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1649	_mm_maskmove_si64(__m64 __A, __m64 __N, char *__P) {
		1650	__m64 __hibit = 0x8080808080808080UL;
		1651	__m64 __mask, __tmp;
			/* View the destination as one 64-bit element so it can be loaded and
			   stored as a whole. */
		1652	__m64 *__p = (__m64 *)__P;
		1653	
		1654	__tmp = *__p;
			/* Isolate the high bit of every selector byte and compare it against
			   0x80; the blend below relies on _mm_cmpeq_pi8 yielding an all-ones
			   byte for each match and zero otherwise (presumed -- defined elsewhere
			   in this header). */
		1655	__mask = _mm_cmpeq_pi8((__N & __hibit), __hibit);
			/* Keep the old bytes where the mask is clear, take A where it is set. */
		1656	__tmp = (__tmp & (~__mask)) | (__A & __mask);
		1657	*__p = __tmp;
		1658	}
		1659
		1660	extern __inline void
		1661	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1662	_m_maskmovq(__m64 __A, __m64 __N, char *__P) {
			/* Alternate name for _mm_maskmove_si64: forwards all three arguments
			   unchanged. */
		1663	_mm_maskmove_si64(__A, __N, __P);
		1664	}
		1665
		1666	/* Compute the rounded averages of the unsigned 8-bit values in A and B.
			Each 64-bit operand is splatted across a full vector, the vectors are
			averaged element-wise as unsigned chars with vec_avg, and element 0 of
			the result, viewed as __vector long long, is returned as the __m64. */
		1667	extern __inline __m64
		1668	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1669	_mm_avg_pu8(__m64 __A, __m64 __B) {
		1670	__vector unsigned char __a, __b, __c;
		1671	
			/* Splatting puts the same 64 bits in both halves of the vector, so
			   either half of the averaged result holds the answer. */
		1672	__a = (__vector unsigned char)vec_splats(__A);
		1673	__b = (__vector unsigned char)vec_splats(__B);
		1674	__c = vec_avg(__a, __b);
		1675	return (__m64)((__vector long long)__c)[0];
		1676	}
		1677
		1678	extern __inline __m64
		1679	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1680	_m_pavgb(__m64 __A, __m64 __B) {
			/* Alternate name for _mm_avg_pu8: forwards both operands and returns
			   that call's result. */
		1681	return _mm_avg_pu8(__A, __B);
		1682	}
		1683
		1684	/* Compute the rounded averages of the unsigned 16-bit values in A and B.
			Each 64-bit operand is splatted across a full vector, the vectors are
			averaged element-wise as unsigned shorts with vec_avg, and element 0 of
			the result, viewed as __vector long long, is returned as the __m64. */
		1685	extern __inline __m64
		1686	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1687	_mm_avg_pu16(__m64 __A, __m64 __B) {
		1688	__vector unsigned short __a, __b, __c;
		1689	
			/* Splatting puts the same 64 bits in both halves of the vector, so
			   either half of the averaged result holds the answer. */
		1690	__a = (__vector unsigned short)vec_splats(__A);
		1691	__b = (__vector unsigned short)vec_splats(__B);
		1692	__c = vec_avg(__a, __b);
		1693	return (__m64)((__vector long long)__c)[0];
		1694	}
		1695
		1696	extern __inline __m64
		1697	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1698	_m_pavgw(__m64 __A, __m64 __B) {
			/* Alternate name for _mm_avg_pu16: forwards both operands and returns
			   that call's result. */
		1699	return _mm_avg_pu16(__A, __B);
		1700	}
		1701
		1702	/* Compute the sum of the absolute differences of the unsigned 8-bit
		1703	values in A and B. Return the value in the lower 16-bit word; the
		1704	upper words are cleared.
			The absolute difference is formed as max(a, b) - min(a, b) on unsigned
			bytes, then reduced with vec_sum4s followed by vec_sums. */
		1705	extern __inline __m64
		1706	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1707	_mm_sad_pu8(__m64 __A, __m64 __B) {
		1708	__vector unsigned char __a, __b;
		1709	__vector unsigned char __vmin, __vmax, __vabsdiff;
		1710	__vector signed int __vsum;
		1711	const __vector unsigned int __zero = {0, 0, 0, 0};
			/* Start from an all-zero result so only the element written below is
			   non-zero. */
		1712	__m64_union __result = {0};
		1713	
			/* Place each operand in element 1 of a two-element 64-bit vector with
			   element 0 set to zero; the zero halves are equal in both vectors and
			   so add nothing to the sum. */
		1714	__a = (__vector unsigned char)(__vector unsigned long long){0UL, __A};
		1715	__b = (__vector unsigned char)(__vector unsigned long long){0UL, __B};
			/* max - min gives |a - b| per byte without leaving the unsigned range. */
		1716	__vmin = vec_min(__a, __b);
		1717	__vmax = vec_max(__a, __b);
		1718	__vabsdiff = vec_sub(__vmax, __vmin);
		1719	/* Sum four groups of bytes into integers. */
		1720	__vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
		1721	/* Sum across four integers with integer result. */
		1722	__vsum = vec_sums(__vsum, (__vector signed int)__zero);
		1723	/* The sum is in the right most 32-bits of the vector result.
		1724	Transfer to a GPR and truncate to 16 bits.
			NOTE(review): as_short[0] is written on every target; whether that is the
			"lower 16-bit word" on big-endian depends on the __m64_union layout,
			which is declared outside this section -- confirm. */
		1725	__result.as_short[0] = __vsum[3];
		1726	return __result.as_m64;
		1727	}
		1728
		1729	extern __inline __m64
		1730	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1731	_m_psadbw(__m64 __A, __m64 __B) {
			/* Alternate name for _mm_sad_pu8: forwards both operands and returns
			   that call's result. */
		1732	return _mm_sad_pu8(__A, __B);
		1733	}
		1734
		1735	/* Stores the data in A to the address P without polluting the caches.
			Implemented as a dcbtstt cache hint on P followed by an ordinary store
			of A through P. */
		1736	extern __inline void
		1737	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1738	_mm_stream_pi(__m64 *__P, __m64 __A) {
		1739	/* Use the data cache block touch for store transient. */
			/* The asm has no outputs; it takes __P as its only input and lists
			   "memory" as clobbered. */
		1740	__asm__(" dcbtstt 0,%0" : : "b"(__P) : "memory");
		1741	*__P = __A;
		1742	}
		1743
		1744	/* Likewise. The address must be 16-byte aligned.
			Issues a dcbtstt cache hint on P and then stores A with _mm_store_ps. */
		1745	extern __inline void
		1746	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1747	_mm_stream_ps(float *__P, __m128 __A) {
		1748	/* Use the data cache block touch for store transient. */
			/* The asm has no outputs; it takes __P as its only input and lists
			   "memory" as clobbered. */
		1749	__asm__(" dcbtstt 0,%0" : : "b"(__P) : "memory");
		1750	_mm_store_ps(__P, __A);
		1751	}
		1752
		1753	/* Guarantees that every preceding store is globally visible before
		1754	any subsequent store. Implemented here as a release thread fence. */
		1755	extern __inline void
		1756	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1757	_mm_sfence(void) {
		1758	/* Generate a light weight sync. */
		1759	__atomic_thread_fence(__ATOMIC_RELEASE);
		1760	}
		1761
		1762	/* The execution of the next instruction is delayed by an implementation
		1763	specific amount of time. The instruction does not modify the
		1764	architectural state. This is after the pop_options pragma because
		1765	it does not require SSE support in the processor--the encoding is a
		1766	nop on processors that do not support it.
			NOTE(review): no pop_options pragma is visible near this definition; the
			sentence above may be carried over from the x86 header -- confirm.
			When _ARCH_PWR8 is set the function runs a priority-lowering asm
			sequence; otherwise it issues a sequentially consistent thread fence. */
		1767	extern __inline void
		1768	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		1769	_mm_pause(void) {
		1770	/* There is no exact match with this construct, but the following is
		1771	close to the desired effect. */
		1772	#if _ARCH_PWR8
		1773	/* On power8 and later processors we can depend on Program Priority
		1774	(PRI) and associated "very low" PPI setting. Since we don't know
		1775	what PPI this thread is running at we: 1) save the current PRI
		1776	from the PPR SPR into a local GPR, 2) set the PRI to "very low"
		1777	via the special or 31,31,31 encoding. 3) issue an "isync" to
		1778	ensure the PRI change takes effect before we execute any more
		1779	instructions.
		1780	Now we can execute a lwsync (release barrier) while we execute
		1781	this thread at "very low" PRI. Finally we restore the original
		1782	PRI and continue execution. */
			/* Written by mfppr and read back by mtppr inside the asm below. */
		1783	unsigned long __PPR;
		1784	
		1785	__asm__ volatile(" mfppr %0;" /* 1) save the current PPR */
		1786	" or 31,31,31;" /* 2) drop to "very low" priority */
		1787	" isync;" /* 3) make the priority change take effect */
		1788	" lwsync;" /* barrier executed at low priority */
		1789	" isync;"
		1790	" mtppr %0;" /* restore the saved PPR */
		1791	: "=r"(__PPR)
		1792	:
		1793	: "memory");
		1794	#else
		1795	/* For older processor where we may not even have Program Priority
		1796	controls we can only depend on Heavy Weight Sync. */
		1797	__atomic_thread_fence(__ATOMIC_SEQ_CST);
		1798	#endif
		1799	}
		1800
		1801	/* Transpose the 4x4 matrix composed of row[0-3].
			Each argument is read once into a local __v4sf and later assigned the
			corresponding transposed row, so all four arguments must be assignable
			expressions. The rows are first interleaved pairwise with
			vec_vmrghw/vec_vmrglw, and the intermediate results are then combined
			as __vector long long with vec_mergeh/vec_mergel. The body is wrapped
			in do { } while (0) so the expansion forms a single statement. */
		1802	#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
		1803	do { \
		1804	__v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
		1805	__v4sf __t0 = vec_vmrghw(__r0, __r1); \
		1806	__v4sf __t1 = vec_vmrghw(__r2, __r3); \
		1807	__v4sf __t2 = vec_vmrglw(__r0, __r1); \
		1808	__v4sf __t3 = vec_vmrglw(__r2, __r3); \
		1809	(row0) = (__v4sf)vec_mergeh((__vector long long)__t0, \
		1810	(__vector long long)__t1); \
		1811	(row1) = (__v4sf)vec_mergel((__vector long long)__t0, \
		1812	(__vector long long)__t1); \
		1813	(row2) = (__v4sf)vec_mergeh((__vector long long)__t2, \
		1814	(__vector long long)__t3); \
		1815	(row3) = (__v4sf)vec_mergel((__vector long long)__t2, \
		1816	(__vector long long)__t3); \
		1817	} while (0)
		1818
		1819	/* For backward source compatibility. */
		1820	//# include <emmintrin.h>
		1821
		1822	#else
		1823	#include_next <xmmintrin.h>
		1824	#endif /* defined(__powerpc64__) && \
		1825	* (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
		1826
		1827	#endif /* XMMINTRIN_H_ */

Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

QNX 8.QNX8 LLVM/Clang compiler suite//llvm-build/x86_64/lib/clang/16/include/ppc_wrappers/xmmintrin.h – Rev 14