/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
2
 *
3
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
 * See https://llvm.org/LICENSE.txt for license information.
5
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
 *
7
 *===-----------------------------------------------------------------------===
8
 */
9
 
10
/* Implemented from the specification included in the Intel C++ Compiler
11
   User Guide and Reference, version 9.0.  */
12
 
13
#ifndef NO_WARN_X86_INTRINSICS
14
/* This header file is intended to help port code that uses Intel intrinsics
15
   explicitly from x86_64 to powerpc64/powerpc64le.
16
 
17
   Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d types,
18
   PowerPC VMX/VSX ISA is a good match for vector float SIMD operations.
19
   However, scalar float operations in vector (XMM) registers require
20
   the POWER8 VSX ISA (2.07) level. There are differences in data
21
   format and placement of float scalars in the vector register, which
22
   require extra steps to match SSE2 scalar float semantics on POWER.
23
 
24
   It should be noted that there are significant differences between X86_64's
25
   MXCSR and PowerISA's FPSCR/VSCR registers. It is recommended to use the
26
   portable <fenv.h> interface instead of accessing the MXCSR directly.
27
 
28
   Most SSE2 scalar float intrinsic operations can be performed more
29
   efficiently as C language float scalar operations or optimized to
30
   use vector SIMD operations. We recommend this for new applications.
31
*/
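/* As a minimal illustration of the recommendation above, a sequence such as

     __m128d __t = _mm_add_sd (_mm_set_sd (__x), _mm_set_sd (__y));
     double __r = _mm_cvtsd_f64 (__t);

   is usually better written as the plain C expression (__x + __y), which
   lets the compiler avoid the splat/merge steps these wrappers perform.  */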
32
#error                                                                         \
33
    "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
34
#endif
35
 
36
#ifndef EMMINTRIN_H_
37
#define EMMINTRIN_H_
38
 
39
#if defined(__powerpc64__) &&                                                  \
40
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
41
 
42
#include <altivec.h>
43
 
44
/* We need definitions from the SSE header files.  */
45
#include <xmmintrin.h>
46
 
47
/* SSE2 */
48
typedef __vector double __v2df;
49
typedef __vector long long __v2di;
50
typedef __vector unsigned long long __v2du;
51
typedef __vector int __v4si;
52
typedef __vector unsigned int __v4su;
53
typedef __vector short __v8hi;
54
typedef __vector unsigned short __v8hu;
55
typedef __vector signed char __v16qi;
56
typedef __vector unsigned char __v16qu;
57
 
58
/* The Intel API is flexible enough that we must allow aliasing with other
59
   vector types, and their scalar components.  */
60
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
61
typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));
62
 
63
/* Unaligned version of the same types.  */
64
typedef long long __m128i_u
65
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
66
typedef double __m128d_u
67
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
68
 
69
/* Define two value permute mask.  */
70
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
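/* Example: _MM_SHUFFLE2 (1, 0) == 2.  When used with _mm_shuffle_pd (below),
   bit 0 of the mask selects which element of the first operand becomes the
   low result and bit 1 selects which element of the second operand becomes
   the high result.  */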
71
 
72
/* Create a vector with element 0 as F and the rest zero.  */
73
extern __inline __m128d
74
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
75
    _mm_set_sd(double __F) {
76
  return __extension__(__m128d){__F, 0.0};
77
}
78
 
79
/* Create a vector with both elements equal to F.  */
80
extern __inline __m128d
81
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
82
    _mm_set1_pd(double __F) {
83
  return __extension__(__m128d){__F, __F};
84
}
85
 
86
extern __inline __m128d
87
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
88
    _mm_set_pd1(double __F) {
89
  return _mm_set1_pd(__F);
90
}
91
 
92
/* Create a vector with the lower value X and upper value W.  */
93
extern __inline __m128d
94
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
95
    _mm_set_pd(double __W, double __X) {
96
  return __extension__(__m128d){__X, __W};
97
}
98
 
99
/* Create a vector with the lower value W and upper value X.  */
100
extern __inline __m128d
101
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
102
    _mm_setr_pd(double __W, double __X) {
103
  return __extension__(__m128d){__W, __X};
104
}
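/* Note the element order: _mm_set_pd (1.0, 2.0) and _mm_setr_pd (2.0, 1.0)
   both produce the vector with element [0] == 2.0 and element [1] == 1.0.  */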
105
 
106
/* Create an undefined vector.  */
107
extern __inline __m128d
108
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
109
    _mm_undefined_pd(void) {
110
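  /* Self-initialization is deliberate: it produces an indeterminate value
     while keeping the compiler from warning about an uninitialized
     variable.  */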
  __m128d __Y = __Y;
111
  return __Y;
112
}
113
 
114
/* Create a vector of zeros.  */
115
extern __inline __m128d
116
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
117
    _mm_setzero_pd(void) {
118
  return (__m128d)vec_splats(0);
119
}
120
 
121
/* Sets the low DPFP value of A from the low value of B.  */
122
extern __inline __m128d
123
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
124
    _mm_move_sd(__m128d __A, __m128d __B) {
125
  __v2df __result = (__v2df)__A;
126
  __result[0] = ((__v2df)__B)[0];
127
  return (__m128d)__result;
128
}
129
 
130
/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
131
extern __inline __m128d
132
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
133
    _mm_load_pd(double const *__P) {
134
  return ((__m128d)vec_ld(0, (__v16qu *)__P));
135
}
136
 
137
/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
138
extern __inline __m128d
139
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
140
    _mm_loadu_pd(double const *__P) {
141
  return (vec_vsx_ld(0, __P));
142
}
143
 
144
/* Create a vector with both elements equal to *P.  */
145
extern __inline __m128d
146
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
147
    _mm_load1_pd(double const *__P) {
148
  return (vec_splats(*__P));
149
}
150
 
151
/* Create a vector with element 0 as *P and the rest zero.  */
152
extern __inline __m128d
153
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
154
    _mm_load_sd(double const *__P) {
155
  return _mm_set_sd(*__P);
156
}
157
 
158
extern __inline __m128d
159
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
160
    _mm_load_pd1(double const *__P) {
161
  return _mm_load1_pd(__P);
162
}
163
 
164
/* Load two DPFP values in reverse order.  The address must be aligned.  */
165
extern __inline __m128d
166
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
167
    _mm_loadr_pd(double const *__P) {
168
  __v2df __tmp = _mm_load_pd(__P);
169
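  /* The xxpermdi with a control of 2 swaps the two doublewords here, so the
     two values are returned in reverse order.  */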
  return (__m128d)vec_xxpermdi(__tmp, __tmp, 2);
170
}
171
 
172
/* Store two DPFP values.  The address must be 16-byte aligned.  */
173
extern __inline void
174
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
175
    _mm_store_pd(double *__P, __m128d __A) {
176
  vec_st((__v16qu)__A, 0, (__v16qu *)__P);
177
}
178
 
179
/* Store two DPFP values.  The address need not be 16-byte aligned.  */
180
extern __inline void
181
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
182
    _mm_storeu_pd(double *__P, __m128d __A) {
183
  *(__m128d_u *)__P = __A;
184
}
185
 
186
/* Stores the lower DPFP value.  */
187
extern __inline void
188
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
189
    _mm_store_sd(double *__P, __m128d __A) {
190
  *__P = ((__v2df)__A)[0];
191
}
192
 
193
extern __inline double
194
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
195
    _mm_cvtsd_f64(__m128d __A) {
196
  return ((__v2df)__A)[0];
197
}
198
 
199
extern __inline void
200
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
201
    _mm_storel_pd(double *__P, __m128d __A) {
202
  _mm_store_sd(__P, __A);
203
}
204
 
205
/* Stores the upper DPFP value.  */
206
extern __inline void
207
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
208
    _mm_storeh_pd(double *__P, __m128d __A) {
209
  *__P = ((__v2df)__A)[1];
210
}
211
/* Store the lower DPFP value across two words.
212
   The address must be 16-byte aligned.  */
213
extern __inline void
214
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
215
    _mm_store1_pd(double *__P, __m128d __A) {
216
  _mm_store_pd(__P, vec_splat(__A, 0));
217
}
218
 
219
extern __inline void
220
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
221
    _mm_store_pd1(double *__P, __m128d __A) {
222
  _mm_store1_pd(__P, __A);
223
}
224
 
225
/* Store two DPFP values in reverse order.  The address must be aligned.  */
226
extern __inline void
227
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
228
    _mm_storer_pd(double *__P, __m128d __A) {
229
  _mm_store_pd(__P, vec_xxpermdi(__A, __A, 2));
230
}
231
 
232
/* Intel intrinsic.  */
233
extern __inline long long
234
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
235
    _mm_cvtsi128_si64(__m128i __A) {
236
  return ((__v2di)__A)[0];
237
}
238
 
239
/* Microsoft intrinsic.  */
240
extern __inline long long
241
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
242
    _mm_cvtsi128_si64x(__m128i __A) {
243
  return ((__v2di)__A)[0];
244
}
245
 
246
extern __inline __m128d
247
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
248
    _mm_add_pd(__m128d __A, __m128d __B) {
249
  return (__m128d)((__v2df)__A + (__v2df)__B);
250
}
251
 
252
/* Add the lower double-precision (64-bit) floating-point element in
253
   a and b, store the result in the lower element of dst, and copy
254
   the upper element from a to the upper element of dst. */
255
extern __inline __m128d
256
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
257
    _mm_add_sd(__m128d __A, __m128d __B) {
258
  __A[0] = __A[0] + __B[0];
259
  return (__A);
260
}
261
 
262
extern __inline __m128d
263
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
264
    _mm_sub_pd(__m128d __A, __m128d __B) {
265
  return (__m128d)((__v2df)__A - (__v2df)__B);
266
}
267
 
268
extern __inline __m128d
269
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
270
    _mm_sub_sd(__m128d __A, __m128d __B) {
271
  __A[0] = __A[0] - __B[0];
272
  return (__A);
273
}
274
 
275
extern __inline __m128d
276
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
277
    _mm_mul_pd(__m128d __A, __m128d __B) {
278
  return (__m128d)((__v2df)__A * (__v2df)__B);
279
}
280
 
281
extern __inline __m128d
282
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
283
    _mm_mul_sd(__m128d __A, __m128d __B) {
284
  __A[0] = __A[0] * __B[0];
285
  return (__A);
286
}
287
 
288
extern __inline __m128d
289
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
290
    _mm_div_pd(__m128d __A, __m128d __B) {
291
  return (__m128d)((__v2df)__A / (__v2df)__B);
292
}
293
 
294
extern __inline __m128d
295
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
296
    _mm_div_sd(__m128d __A, __m128d __B) {
297
  __A[0] = __A[0] / __B[0];
298
  return (__A);
299
}
300
 
301
extern __inline __m128d
302
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
303
    _mm_sqrt_pd(__m128d __A) {
304
  return (vec_sqrt(__A));
305
}
306
 
307
/* Return pair {sqrt (B[0]), A[1]}.  */
308
extern __inline __m128d
309
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
310
    _mm_sqrt_sd(__m128d __A, __m128d __B) {
311
  __v2df __c;
312
  __c = vec_sqrt((__v2df)_mm_set1_pd(__B[0]));
313
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
314
}
315
 
316
extern __inline __m128d
317
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
318
    _mm_min_pd(__m128d __A, __m128d __B) {
319
  return (vec_min(__A, __B));
320
}
321
 
322
extern __inline __m128d
323
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
324
    _mm_min_sd(__m128d __A, __m128d __B) {
325
  __v2df __a, __b, __c;
326
  __a = vec_splats(__A[0]);
327
  __b = vec_splats(__B[0]);
328
  __c = vec_min(__a, __b);
329
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
330
}
331
 
332
extern __inline __m128d
333
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
334
    _mm_max_pd(__m128d __A, __m128d __B) {
335
  return (vec_max(__A, __B));
336
}
337
 
338
extern __inline __m128d
339
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
340
    _mm_max_sd(__m128d __A, __m128d __B) {
341
  __v2df __a, __b, __c;
342
  __a = vec_splats(__A[0]);
343
  __b = vec_splats(__B[0]);
344
  __c = vec_max(__a, __b);
345
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
346
}
347
 
348
extern __inline __m128d
349
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
350
    _mm_cmpeq_pd(__m128d __A, __m128d __B) {
351
  return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B));
352
}
353
 
354
extern __inline __m128d
355
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
356
    _mm_cmplt_pd(__m128d __A, __m128d __B) {
357
  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
358
}
359
 
360
extern __inline __m128d
361
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
362
    _mm_cmple_pd(__m128d __A, __m128d __B) {
363
  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
364
}
365
 
366
extern __inline __m128d
367
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
368
    _mm_cmpgt_pd(__m128d __A, __m128d __B) {
369
  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
370
}
371
 
372
extern __inline __m128d
373
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
374
    _mm_cmpge_pd(__m128d __A, __m128d __B) {
375
  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
376
}
377
 
378
extern __inline __m128d
379
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
380
    _mm_cmpneq_pd(__m128d __A, __m128d __B) {
381
  __v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B);
382
  return ((__m128d)vec_nor(__temp, __temp));
383
}
384
 
385
extern __inline __m128d
386
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
387
    _mm_cmpnlt_pd(__m128d __A, __m128d __B) {
388
  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
389
}
390
 
391
extern __inline __m128d
392
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
393
    _mm_cmpnle_pd(__m128d __A, __m128d __B) {
394
  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
395
}
396
 
397
extern __inline __m128d
398
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
399
    _mm_cmpngt_pd(__m128d __A, __m128d __B) {
400
  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
401
}
402
 
403
extern __inline __m128d
404
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
405
    _mm_cmpnge_pd(__m128d __A, __m128d __B) {
406
  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
407
}
408
 
409
extern __inline __m128d
410
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
411
    _mm_cmpord_pd(__m128d __A, __m128d __B) {
412
  __v2du __c, __d;
413
  /* Compare against self will return false (0's) if NAN.  */
414
  __c = (__v2du)vec_cmpeq(__A, __A);
415
  __d = (__v2du)vec_cmpeq(__B, __B);
416
  /* A != NAN and B != NAN.  */
417
  return ((__m128d)vec_and(__c, __d));
418
}
419
 
420
extern __inline __m128d
421
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
422
    _mm_cmpunord_pd(__m128d __A, __m128d __B) {
423
#if _ARCH_PWR8
424
  __v2du __c, __d;
425
  /* Compare against self will return false (0's) if NAN.  */
426
  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
427
  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
428
  /* A == NAN OR B == NAN converts to:
429
     NOT(A != NAN) OR NOT(B != NAN).  */
430
  __c = vec_nor(__c, __c);
431
  return ((__m128d)vec_orc(__c, __d));
432
#else
433
  __v2du __c, __d;
434
  /* Compare against self will return false (0's) if NAN.  */
435
  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
436
  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
437
  /* Invert the results so that true ('1's) indicates NAN.  */
438
  __c = vec_nor(__c, __c);
439
  __d = vec_nor(__d, __d);
440
  return ((__m128d)vec_or(__c, __d));
441
#endif
442
}
443
 
444
extern __inline __m128d
445
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
446
    _mm_cmpeq_sd(__m128d __A, __m128d __B) {
447
  __v2df __a, __b, __c;
448
  /* PowerISA VSX does not allow partial (for just lower double)
449
     results. So to ensure we don't generate spurious exceptions
450
     (from the upper double values) we splat the lower double
451
     before we do the operation. */
452
  __a = vec_splats(__A[0]);
453
  __b = vec_splats(__B[0]);
454
  __c = (__v2df)vec_cmpeq(__a, __b);
455
  /* Then we merge the lower double result with the original upper
456
     double from __A.  */
457
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
458
}
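/* The remaining scalar (_sd) compares below follow the same pattern: splat
   the low doubles, compare the full vector, then merge the low result with
   the original upper double of __A.  */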
459
 
460
extern __inline __m128d
461
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
462
    _mm_cmplt_sd(__m128d __A, __m128d __B) {
463
  __v2df __a, __b, __c;
464
  __a = vec_splats(__A[0]);
465
  __b = vec_splats(__B[0]);
466
  __c = (__v2df)vec_cmplt(__a, __b);
467
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
468
}
469
 
470
extern __inline __m128d
471
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
472
    _mm_cmple_sd(__m128d __A, __m128d __B) {
473
  __v2df __a, __b, __c;
474
  __a = vec_splats(__A[0]);
475
  __b = vec_splats(__B[0]);
476
  __c = (__v2df)vec_cmple(__a, __b);
477
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
478
}
479
 
480
extern __inline __m128d
481
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
482
    _mm_cmpgt_sd(__m128d __A, __m128d __B) {
483
  __v2df __a, __b, __c;
484
  __a = vec_splats(__A[0]);
485
  __b = vec_splats(__B[0]);
486
  __c = (__v2df)vec_cmpgt(__a, __b);
487
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
488
}
489
 
490
extern __inline __m128d
491
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
492
    _mm_cmpge_sd(__m128d __A, __m128d __B) {
493
  __v2df __a, __b, __c;
494
  __a = vec_splats(__A[0]);
495
  __b = vec_splats(__B[0]);
496
  __c = (__v2df)vec_cmpge(__a, __b);
497
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
498
}
499
 
500
extern __inline __m128d
501
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
502
    _mm_cmpneq_sd(__m128d __A, __m128d __B) {
503
  __v2df __a, __b, __c;
504
  __a = vec_splats(__A[0]);
505
  __b = vec_splats(__B[0]);
506
  __c = (__v2df)vec_cmpeq(__a, __b);
507
  __c = vec_nor(__c, __c);
508
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
509
}
510
 
511
extern __inline __m128d
512
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
513
    _mm_cmpnlt_sd(__m128d __A, __m128d __B) {
514
  __v2df __a, __b, __c;
515
  __a = vec_splats(__A[0]);
516
  __b = vec_splats(__B[0]);
517
  /* Not less than is just greater than or equal.  */
518
  __c = (__v2df)vec_cmpge(__a, __b);
519
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
520
}
521
 
522
extern __inline __m128d
523
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
524
    _mm_cmpnle_sd(__m128d __A, __m128d __B) {
525
  __v2df __a, __b, __c;
526
  __a = vec_splats(__A[0]);
527
  __b = vec_splats(__B[0]);
528
  /* Not less than or equal is just greater than.  */
529
  __c = (__v2df)vec_cmpgt(__a, __b);
530
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
531
}
532
 
533
extern __inline __m128d
534
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
535
    _mm_cmpngt_sd(__m128d __A, __m128d __B) {
536
  __v2df __a, __b, __c;
537
  __a = vec_splats(__A[0]);
538
  __b = vec_splats(__B[0]);
539
  /* Not greater than is just less than or equal.  */
540
  __c = (__v2df)vec_cmple(__a, __b);
541
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
542
}
543
 
544
extern __inline __m128d
545
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
546
    _mm_cmpnge_sd(__m128d __A, __m128d __B) {
547
  __v2df __a, __b, __c;
548
  __a = vec_splats(__A[0]);
549
  __b = vec_splats(__B[0]);
550
  /* Not greater than or equal is just less than.  */
551
  __c = (__v2df)vec_cmplt(__a, __b);
552
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
553
}
554
 
555
extern __inline __m128d
556
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
557
    _mm_cmpord_sd(__m128d __A, __m128d __B) {
558
  __v2df __r;
559
  __r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
560
  return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]);
561
}
562
 
563
extern __inline __m128d
564
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
565
    _mm_cmpunord_sd(__m128d __A, __m128d __B) {
566
  __v2df __r;
567
  __r = _mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
568
  return (__m128d)_mm_setr_pd(__r[0], __A[1]);
569
}
570
 
571
/* FIXME
572
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
573
   exactly the same because GCC for PowerPC only generates unordered
574
   compares (scalar and vector).
575
   Technically _mm_comieq_sd et al. should be using the ordered
576
   compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
577
   be OK.   */
578
extern __inline int
579
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
580
    _mm_comieq_sd(__m128d __A, __m128d __B) {
581
  return (__A[0] == __B[0]);
582
}
583
 
584
extern __inline int
585
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
586
    _mm_comilt_sd(__m128d __A, __m128d __B) {
587
  return (__A[0] < __B[0]);
588
}
589
 
590
extern __inline int
591
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
592
    _mm_comile_sd(__m128d __A, __m128d __B) {
593
  return (__A[0] <= __B[0]);
594
}
595
 
596
extern __inline int
597
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
598
    _mm_comigt_sd(__m128d __A, __m128d __B) {
599
  return (__A[0] > __B[0]);
600
}
601
 
602
extern __inline int
603
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
604
    _mm_comige_sd(__m128d __A, __m128d __B) {
605
  return (__A[0] >= __B[0]);
606
}
607
 
608
extern __inline int
609
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
610
    _mm_comineq_sd(__m128d __A, __m128d __B) {
611
  return (__A[0] != __B[0]);
612
}
613
 
614
extern __inline int
615
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
616
    _mm_ucomieq_sd(__m128d __A, __m128d __B) {
617
  return (__A[0] == __B[0]);
618
}
619
 
620
extern __inline int
621
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
622
    _mm_ucomilt_sd(__m128d __A, __m128d __B) {
623
  return (__A[0] < __B[0]);
624
}
625
 
626
extern __inline int
627
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
628
    _mm_ucomile_sd(__m128d __A, __m128d __B) {
629
  return (__A[0] <= __B[0]);
630
}
631
 
632
extern __inline int
633
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
634
    _mm_ucomigt_sd(__m128d __A, __m128d __B) {
635
  return (__A[0] > __B[0]);
636
}
637
 
638
extern __inline int
639
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
640
    _mm_ucomige_sd(__m128d __A, __m128d __B) {
641
  return (__A[0] >= __B[0]);
642
}
643
 
644
extern __inline int
645
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
646
    _mm_ucomineq_sd(__m128d __A, __m128d __B) {
647
  return (__A[0] != __B[0]);
648
}
649
 
650
/* Create a vector of Qi, where i is the element number.  */
651
extern __inline __m128i
652
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
653
    _mm_set_epi64x(long long __q1, long long __q0) {
654
  return __extension__(__m128i)(__v2di){__q0, __q1};
655
}
656
 
657
extern __inline __m128i
658
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
659
    _mm_set_epi64(__m64 __q1, __m64 __q0) {
660
  return _mm_set_epi64x((long long)__q1, (long long)__q0);
661
}
662
 
663
extern __inline __m128i
664
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
665
    _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) {
666
  return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3};
667
}
668
 
669
extern __inline __m128i
670
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
671
    _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3,
672
                  short __q2, short __q1, short __q0) {
673
  return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3,
674
                                        __q4, __q5, __q6, __q7};
675
}
676
 
677
extern __inline __m128i
678
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
679
    _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11,
680
                 char __q10, char __q09, char __q08, char __q07, char __q06,
681
                 char __q05, char __q04, char __q03, char __q02, char __q01,
682
                 char __q00) {
683
  return __extension__(__m128i)(__v16qi){
684
      __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
685
      __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15};
686
}
687
 
688
/* Set all of the elements of the vector to A.  */
689
extern __inline __m128i
690
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
691
    _mm_set1_epi64x(long long __A) {
692
  return _mm_set_epi64x(__A, __A);
693
}
694
 
695
extern __inline __m128i
696
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
697
    _mm_set1_epi64(__m64 __A) {
698
  return _mm_set_epi64(__A, __A);
699
}
700
 
701
extern __inline __m128i
702
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
703
    _mm_set1_epi32(int __A) {
704
  return _mm_set_epi32(__A, __A, __A, __A);
705
}
706
 
707
extern __inline __m128i
708
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
709
    _mm_set1_epi16(short __A) {
710
  return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A);
711
}
712
 
713
extern __inline __m128i
714
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
715
    _mm_set1_epi8(char __A) {
716
  return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A,
717
                      __A, __A, __A, __A, __A);
718
}
719
 
720
/* Create a vector of Qi, where i is the element number.
721
   The parameter order is reversed from the _mm_set_epi* functions.  */
722
extern __inline __m128i
723
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
724
    _mm_setr_epi64(__m64 __q0, __m64 __q1) {
725
  return _mm_set_epi64(__q1, __q0);
726
}
727
 
728
extern __inline __m128i
729
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
730
    _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
731
  return _mm_set_epi32(__q3, __q2, __q1, __q0);
732
}
733
 
734
extern __inline __m128i
735
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
736
    _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4,
737
                   short __q5, short __q6, short __q7) {
738
  return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
739
}
740
 
741
extern __inline __m128i
742
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
743
    _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04,
744
                  char __q05, char __q06, char __q07, char __q08, char __q09,
745
                  char __q10, char __q11, char __q12, char __q13, char __q14,
746
                  char __q15) {
747
  return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
748
                      __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
749
}
750
 
751
/* Load 128 bits of integer data.  The address must be 16-byte aligned.  */
752
extern __inline __m128i
753
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
754
    _mm_load_si128(__m128i const *__P) {
755
  return *__P;
756
}
757
 
758
extern __inline __m128i
759
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
760
    _mm_loadu_si128(__m128i_u const *__P) {
761
  return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
762
}
763
 
764
extern __inline __m128i
765
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
766
    _mm_loadl_epi64(__m128i_u const *__P) {
767
  return _mm_set_epi64((__m64)0LL, *(__m64 *)__P);
768
}
769
 
770
extern __inline void
771
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
772
    _mm_store_si128(__m128i *__P, __m128i __B) {
773
  vec_st((__v16qu)__B, 0, (__v16qu *)__P);
774
}
775
 
776
extern __inline void
777
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
778
    _mm_storeu_si128(__m128i_u *__P, __m128i __B) {
779
  *__P = __B;
780
}
781
 
782
extern __inline void
783
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
784
    _mm_storel_epi64(__m128i_u *__P, __m128i __B) {
785
  *(long long *)__P = ((__v2di)__B)[0];
786
}
787
 
788
extern __inline __m64
789
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
790
    _mm_movepi64_pi64(__m128i_u __B) {
791
  return (__m64)((__v2di)__B)[0];
792
}
793
 
794
extern __inline __m128i
795
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
796
    _mm_movpi64_epi64(__m64 __A) {
797
  return _mm_set_epi64((__m64)0LL, __A);
798
}
799
 
800
extern __inline __m128i
801
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
802
    _mm_move_epi64(__m128i __A) {
803
  return _mm_set_epi64((__m64)0LL, (__m64)__A[0]);
804
}
805
 
806
/* Create an undefined vector.  */
807
extern __inline __m128i
808
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
809
    _mm_undefined_si128(void) {
810
  __m128i __Y = __Y;
811
  return __Y;
812
}
813
 
814
/* Create a vector of zeros.  */
815
extern __inline __m128i
816
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
817
    _mm_setzero_si128(void) {
818
  return __extension__(__m128i)(__v4si){0, 0, 0, 0};
819
}
820
 
821
#ifdef _ARCH_PWR8
822
extern __inline __m128d
823
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
824
    _mm_cvtepi32_pd(__m128i __A) {
825
  __v2di __val;
826
  /* For LE we need to generate Vector Unpack Low Signed Word,
827
     which vec_unpackh produces here.  */
828
  __val = (__v2di)vec_unpackh((__v4si)__A);
829
 
830
  return (__m128d)vec_ctf(__val, 0);
831
}
832
#endif
833
 
834
extern __inline __m128
835
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
836
    _mm_cvtepi32_ps(__m128i __A) {
837
  return ((__m128)vec_ctf((__v4si)__A, 0));
838
}
839
 
840
extern __inline __m128i
841
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
842
    _mm_cvtpd_epi32(__m128d __A) {
843
  __v2df __rounded = vec_rint(__A);
844
  __v4si __result, __temp;
845
  const __v4si __vzero = {0, 0, 0, 0};
846
 
847
  /* VSX Vector truncate Double-Precision to integer and Convert to
848
   Signed Integer Word format with Saturate.  */
849
  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :);
850
 
851
#ifdef _ARCH_PWR8
852
#ifdef __LITTLE_ENDIAN__
853
  __temp = vec_mergeo(__temp, __temp);
854
#else
855
  __temp = vec_mergee(__temp, __temp);
856
#endif
857
  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
858
                                 (__vector long long)__vzero);
859
#else
860
  {
861
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
862
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
863
    __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
864
  }
865
#endif
866
  return (__m128i)__result;
867
}
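/* Note that xvcvdpsxws leaves the two converted words in non-adjacent word
   elements of the VSX result, so the merge/pack (or permute) step above is
   needed to place them in elements 0-1 with zeros in elements 2-3, matching
   the SSE2 result layout.  */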
868
 
869
extern __inline __m64
870
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
871
    _mm_cvtpd_pi32(__m128d __A) {
872
  __m128i __result = _mm_cvtpd_epi32(__A);
873
 
874
  return (__m64)__result[0];
875
}
876
 
877
extern __inline __m128
878
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
879
    _mm_cvtpd_ps(__m128d __A) {
880
  __v4sf __result;
881
  __v4si __temp;
882
  const __v4si __vzero = {0, 0, 0, 0};
883
 
884
  __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
885
 
886
#ifdef _ARCH_PWR8
887
#ifdef __LITTLE_ENDIAN__
888
  __temp = vec_mergeo(__temp, __temp);
889
#else
890
  __temp = vec_mergee(__temp, __temp);
891
#endif
892
  __result = (__v4sf)vec_vpkudum((__vector long long)__temp,
893
                                 (__vector long long)__vzero);
894
#else
895
  {
896
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
897
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
898
    __result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
899
  }
900
#endif
901
  return ((__m128)__result);
902
}
903
 
904
extern __inline __m128i
905
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
906
    _mm_cvttpd_epi32(__m128d __A) {
907
  __v4si __result;
908
  __v4si __temp;
909
  const __v4si __vzero = {0, 0, 0, 0};
910
 
911
  /* VSX Vector truncate Double-Precision to integer and Convert to
912
   Signed Integer Word format with Saturate.  */
913
  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
914
 
915
#ifdef _ARCH_PWR8
916
#ifdef __LITTLE_ENDIAN__
917
  __temp = vec_mergeo(__temp, __temp);
918
#else
919
  __temp = vec_mergee(__temp, __temp);
920
#endif
921
  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
922
                                 (__vector long long)__vzero);
923
#else
924
  {
925
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
926
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
927
    __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
928
  }
929
#endif
930
 
931
  return ((__m128i)__result);
932
}
933
 
934
extern __inline __m64
935
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
936
    _mm_cvttpd_pi32(__m128d __A) {
937
  __m128i __result = _mm_cvttpd_epi32(__A);
938
 
939
  return (__m64)__result[0];
940
}
941
 
942
extern __inline int
943
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
944
    _mm_cvtsi128_si32(__m128i __A) {
945
  return ((__v4si)__A)[0];
946
}
947
 
948
#ifdef _ARCH_PWR8
949
extern __inline __m128d
950
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
951
    _mm_cvtpi32_pd(__m64 __A) {
952
  __v4si __temp;
953
  __v2di __tmp2;
954
  __v2df __result;
955
 
956
  __temp = (__v4si)vec_splats(__A);
957
  __tmp2 = (__v2di)vec_unpackl(__temp);
958
  __result = vec_ctf((__vector signed long long)__tmp2, 0);
959
  return (__m128d)__result;
960
}
961
#endif
962
 
963
extern __inline __m128i
964
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
965
    _mm_cvtps_epi32(__m128 __A) {
966
  __v4sf __rounded;
967
  __v4si __result;
968
 
969
  __rounded = vec_rint((__v4sf)__A);
970
  __result = vec_cts(__rounded, 0);
971
  return (__m128i)__result;
972
}
973
 
974
extern __inline __m128i
975
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
976
    _mm_cvttps_epi32(__m128 __A) {
977
  __v4si __result;
978
 
979
  __result = vec_cts((__v4sf)__A, 0);
980
  return (__m128i)__result;
981
}
982
 
983
extern __inline __m128d
984
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
985
    _mm_cvtps_pd(__m128 __A) {
986
  /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
987
#ifdef vec_doubleh
988
  return (__m128d)vec_doubleh((__v4sf)__A);
989
#else
990
  /* Otherwise the compiler is not current, so we need to generate the
991
     equivalent code.  */
992
  __v4sf __a = (__v4sf)__A;
993
  __v4sf __temp;
994
  __v2df __result;
995
#ifdef __LITTLE_ENDIAN__
996
  /* The input float values are in elements {[0], [1]} but the convert
997
     instruction needs them in elements {[1], [3]}, so we use two
998
     shift left double vector word immediates to get the elements
999
     lined up.  */
1000
  __temp = __builtin_vsx_xxsldwi(__a, __a, 3);
1001
  __temp = __builtin_vsx_xxsldwi(__a, __temp, 2);
1002
#else
1003
  /* The input float values are in elements {[0], [1]} but the convert
1004
     instruction needs them in elements {[0], [2]}, so we use two
1005
     shift left double vector word immediates to get the elements
1006
     lined up.  */
1007
  __temp = vec_vmrghw(__a, __a);
1008
#endif
1009
  __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :);
1010
  return (__m128d)__result;
1011
#endif
1012
}
1013
 
1014
extern __inline int
1015
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1016
    _mm_cvtsd_si32(__m128d __A) {
1017
  __v2df __rounded = vec_rint((__v2df)__A);
1018
  int __result = ((__v2df)__rounded)[0];
1019
 
1020
  return __result;
1021
}
1022
/* Intel intrinsic.  */
1023
extern __inline long long
1024
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1025
    _mm_cvtsd_si64(__m128d __A) {
1026
  __v2df __rounded = vec_rint((__v2df)__A);
1027
  long long __result = ((__v2df)__rounded)[0];
1028
 
1029
  return __result;
1030
}
1031
 
1032
/* Microsoft intrinsic.  */
1033
extern __inline long long
1034
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1035
    _mm_cvtsd_si64x(__m128d __A) {
1036
  return _mm_cvtsd_si64((__v2df)__A);
1037
}
1038
 
1039
extern __inline int
1040
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1041
    _mm_cvttsd_si32(__m128d __A) {
1042
  int __result = ((__v2df)__A)[0];
1043
 
1044
  return __result;
1045
}
1046
 
1047
/* Intel intrinsic.  */
1048
extern __inline long long
1049
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1050
    _mm_cvttsd_si64(__m128d __A) {
1051
  long long __result = ((__v2df)__A)[0];
1052
 
1053
  return __result;
1054
}
1055
 
1056
/* Microsoft intrinsic.  */
1057
extern __inline long long
1058
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1059
    _mm_cvttsd_si64x(__m128d __A) {
1060
  return _mm_cvttsd_si64(__A);
1061
}
1062
 
1063
extern __inline __m128
1064
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1065
    _mm_cvtsd_ss(__m128 __A, __m128d __B) {
1066
  __v4sf __result = (__v4sf)__A;
1067
 
1068
#ifdef __LITTLE_ENDIAN__
1069
  __v4sf __temp_s;
1070
  /* Copy double element[0] to element [1] for conversion.  */
1071
  __v2df __temp_b = vec_splat((__v2df)__B, 0);
1072
 
1073
  /* Pre-rotate __A left 3 (logically right 1) elements.  */
1074
  __result = __builtin_vsx_xxsldwi(__result, __result, 3);
1075
  /* Convert double to single float scalar in a vector.  */
1076
  __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :);
1077
  /* Shift the resulting scalar into vector element [0].  */
1078
  __result = __builtin_vsx_xxsldwi(__result, __temp_s, 1);
1079
#else
1080
  __result[0] = ((__v2df)__B)[0];
1081
#endif
1082
  return (__m128)__result;
1083
}
1084
 
1085
extern __inline __m128d
1086
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1087
    _mm_cvtsi32_sd(__m128d __A, int __B) {
1088
  __v2df __result = (__v2df)__A;
1089
  double __db = __B;
1090
  __result[0] = __db;
1091
  return (__m128d)__result;
1092
}
1093
 
1094
/* Intel intrinsic.  */
1095
extern __inline __m128d
1096
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1097
    _mm_cvtsi64_sd(__m128d __A, long long __B) {
1098
  __v2df __result = (__v2df)__A;
1099
  double __db = __B;
1100
  __result[0] = __db;
1101
  return (__m128d)__result;
1102
}
1103
 
1104
/* Microsoft intrinsic.  */
1105
extern __inline __m128d
1106
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1107
    _mm_cvtsi64x_sd(__m128d __A, long long __B) {
1108
  return _mm_cvtsi64_sd(__A, __B);
1109
}
1110
 
1111
extern __inline __m128d
1112
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1113
    _mm_cvtss_sd(__m128d __A, __m128 __B) {
1114
#ifdef __LITTLE_ENDIAN__
1115
  /* Use splat to move element [0] into position for the convert. */
1116
  __v4sf __temp = vec_splat((__v4sf)__B, 0);
1117
  __v2df __res;
1118
  /* Convert single float scalar to double in a vector.  */
1119
  __asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :);
1120
  return (__m128d)vec_mergel(__res, (__v2df)__A);
1121
#else
1122
  __v2df __res = (__v2df)__A;
1123
  __res[0] = ((__v4sf)__B)[0];
1124
  return (__m128d)__res;
1125
#endif
1126
}
1127
 
1128
extern __inline __m128d
1129
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1130
    _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) {
1131
  __vector double __result;
1132
  const int __litmsk = __mask & 0x3;
1133
 
1134
  if (__litmsk == 0)
1135
    __result = vec_mergeh(__A, __B);
1136
#if __GNUC__ < 6
1137
  else if (__litmsk == 1)
1138
    __result = vec_xxpermdi(__B, __A, 2);
1139
  else if (__litmsk == 2)
1140
    __result = vec_xxpermdi(__B, __A, 1);
1141
#else
1142
  else if (__litmsk == 1)
1143
    __result = vec_xxpermdi(__A, __B, 2);
1144
  else if (__litmsk == 2)
1145
    __result = vec_xxpermdi(__A, __B, 1);
1146
#endif
1147
  else
1148
    __result = vec_mergel(__A, __B);
1149
 
1150
  return __result;
1151
}
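/* For reference, the mask semantics implemented above are:
     result [0] = (__mask & 1) ? __A[1] : __A[0];
     result [1] = (__mask & 2) ? __B[1] : __B[0];
   e.g. _mm_shuffle_pd (__A, __B, _MM_SHUFFLE2 (0, 1)) yields {__A[1], __B[0]}.  */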
1152
 
1153
extern __inline __m128d
1154
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1155
    _mm_unpackhi_pd(__m128d __A, __m128d __B) {
1156
  return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B);
1157
}
1158
 
1159
extern __inline __m128d
1160
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1161
    _mm_unpacklo_pd(__m128d __A, __m128d __B) {
1162
  return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B);
1163
}
1164
 
1165
extern __inline __m128d
1166
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1167
    _mm_loadh_pd(__m128d __A, double const *__B) {
1168
  __v2df __result = (__v2df)__A;
1169
  __result[1] = *__B;
1170
  return (__m128d)__result;
1171
}
1172
 
1173
extern __inline __m128d
1174
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1175
    _mm_loadl_pd(__m128d __A, double const *__B) {
1176
  __v2df __result = (__v2df)__A;
1177
  __result[0] = *__B;
1178
  return (__m128d)__result;
1179
}
1180
 
1181
#ifdef _ARCH_PWR8
1182
/* Intrinsic functions that require PowerISA 2.07 minimum.  */
1183
 
1184
/* Creates a 2-bit mask from the most significant bits of the DPFP values.  */
1185
extern __inline int
1186
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1187
    _mm_movemask_pd(__m128d __A) {
1188
#ifdef _ARCH_PWR10
1189
  return vec_extractm((__v2du)__A);
1190
#else
1191
  __vector unsigned long long __result;
1192
  static const __vector unsigned int __perm_mask = {
1193
#ifdef __LITTLE_ENDIAN__
1194
      0x80800040, 0x80808080, 0x80808080, 0x80808080
1195
#else
1196
      0x80808080, 0x80808080, 0x80808080, 0x80804000
1197
#endif
1198
  };
1199
 
1200
  __result = ((__vector unsigned long long)vec_vbpermq(
1201
      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1202
 
1203
#ifdef __LITTLE_ENDIAN__
1204
  return __result[1];
1205
#else
1206
  return __result[0];
1207
#endif
1208
#endif /* !_ARCH_PWR10 */
1209
}
1210
#endif /* _ARCH_PWR8 */
1211
 
1212
extern __inline __m128i
1213
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1214
    _mm_packs_epi16(__m128i __A, __m128i __B) {
1215
  return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B);
1216
}
1217
 
1218
extern __inline __m128i
1219
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1220
    _mm_packs_epi32(__m128i __A, __m128i __B) {
1221
  return (__m128i)vec_packs((__v4si)__A, (__v4si)__B);
1222
}
1223
 
1224
extern __inline __m128i
1225
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1226
    _mm_packus_epi16(__m128i __A, __m128i __B) {
1227
  return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B);
1228
}
1229
 
1230
extern __inline __m128i
1231
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1232
    _mm_unpackhi_epi8(__m128i __A, __m128i __B) {
1233
  return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B);
1234
}
1235
 
1236
extern __inline __m128i
1237
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1238
    _mm_unpackhi_epi16(__m128i __A, __m128i __B) {
1239
  return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B);
1240
}
1241
 
1242
extern __inline __m128i
1243
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1244
    _mm_unpackhi_epi32(__m128i __A, __m128i __B) {
1245
  return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B);
1246
}
1247
 
1248
extern __inline __m128i
1249
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1250
    _mm_unpackhi_epi64(__m128i __A, __m128i __B) {
1251
  return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B);
1252
}
1253
 
1254
extern __inline __m128i
1255
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1256
    _mm_unpacklo_epi8(__m128i __A, __m128i __B) {
1257
  return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B);
1258
}
1259
 
1260
extern __inline __m128i
1261
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1262
    _mm_unpacklo_epi16(__m128i __A, __m128i __B) {
1263
  return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B);
1264
}
1265
 
1266
extern __inline __m128i
1267
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1268
    _mm_unpacklo_epi32(__m128i __A, __m128i __B) {
1269
  return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B);
1270
}
1271
 
1272
extern __inline __m128i
1273
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1274
    _mm_unpacklo_epi64(__m128i __A, __m128i __B) {
1275
  return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B);
1276
}
1277
 
1278
extern __inline __m128i
1279
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1280
    _mm_add_epi8(__m128i __A, __m128i __B) {
1281
  return (__m128i)((__v16qu)__A + (__v16qu)__B);
1282
}
1283
 
1284
extern __inline __m128i
1285
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1286
    _mm_add_epi16(__m128i __A, __m128i __B) {
1287
  return (__m128i)((__v8hu)__A + (__v8hu)__B);
1288
}
1289
 
1290
extern __inline __m128i
1291
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1292
    _mm_add_epi32(__m128i __A, __m128i __B) {
1293
  return (__m128i)((__v4su)__A + (__v4su)__B);
1294
}
1295
 
1296
extern __inline __m128i
1297
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1298
    _mm_add_epi64(__m128i __A, __m128i __B) {
1299
  return (__m128i)((__v2du)__A + (__v2du)__B);
1300
}
1301
 
1302
extern __inline __m128i
1303
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1304
    _mm_adds_epi8(__m128i __A, __m128i __B) {
1305
  return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B);
1306
}
1307
 
1308
extern __inline __m128i
1309
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1310
    _mm_adds_epi16(__m128i __A, __m128i __B) {
1311
  return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B);
1312
}
1313
 
1314
extern __inline __m128i
1315
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1316
    _mm_adds_epu8(__m128i __A, __m128i __B) {
1317
  return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B);
1318
}
1319
 
1320
extern __inline __m128i
1321
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1322
    _mm_adds_epu16(__m128i __A, __m128i __B) {
1323
  return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B);
1324
}
1325
 
1326
extern __inline __m128i
1327
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1328
    _mm_sub_epi8(__m128i __A, __m128i __B) {
1329
  return (__m128i)((__v16qu)__A - (__v16qu)__B);
1330
}
1331
 
1332
extern __inline __m128i
1333
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1334
    _mm_sub_epi16(__m128i __A, __m128i __B) {
1335
  return (__m128i)((__v8hu)__A - (__v8hu)__B);
1336
}
1337
 
1338
extern __inline __m128i
1339
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1340
    _mm_sub_epi32(__m128i __A, __m128i __B) {
1341
  return (__m128i)((__v4su)__A - (__v4su)__B);
1342
}
1343
 
1344
extern __inline __m128i
1345
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1346
    _mm_sub_epi64(__m128i __A, __m128i __B) {
1347
  return (__m128i)((__v2du)__A - (__v2du)__B);
1348
}
1349
 
1350
extern __inline __m128i
1351
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1352
    _mm_subs_epi8(__m128i __A, __m128i __B) {
1353
  return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B);
1354
}
1355
 
1356
extern __inline __m128i
1357
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1358
    _mm_subs_epi16(__m128i __A, __m128i __B) {
1359
  return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B);
1360
}
1361
 
1362
extern __inline __m128i
1363
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1364
    _mm_subs_epu8(__m128i __A, __m128i __B) {
1365
  return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B);
1366
}
1367
 
1368
extern __inline __m128i
1369
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1370
    _mm_subs_epu16(__m128i __A, __m128i __B) {
1371
  return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B);
1372
}
1373
 
1374
extern __inline __m128i
1375
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1376
    _mm_madd_epi16(__m128i __A, __m128i __B) {
1377
  __vector signed int __zero = {0, 0, 0, 0};
1378
 
1379
  return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero);
1380
}
1381
 
1382
extern __inline __m128i
1383
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1384
    _mm_mulhi_epi16(__m128i __A, __m128i __B) {
1385
  __vector signed int __w0, __w1;
1386
 
1387
  __vector unsigned char __xform1 = {
1388
#ifdef __LITTLE_ENDIAN__
1389
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1390
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1391
#else
1392
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1393
      0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1394
#endif
1395
  };
1396
 
1397
  __w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B);
1398
  __w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B);
1399
  return (__m128i)vec_perm(__w0, __w1, __xform1);
1400
}
1401
 
1402
extern __inline __m128i
1403
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1404
    _mm_mullo_epi16(__m128i __A, __m128i __B) {
1405
  return (__m128i)((__v8hi)__A * (__v8hi)__B);
1406
}
1407
 
1408
extern __inline __m64
1409
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1410
    _mm_mul_su32(__m64 __A, __m64 __B) {
1411
  unsigned int __a = __A;
1412
  unsigned int __b = __B;
1413
 
1414
  return ((__m64)__a * (__m64)__b);
1415
}
1416
 
1417
#ifdef _ARCH_PWR8
1418
extern __inline __m128i
1419
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1420
    _mm_mul_epu32(__m128i __A, __m128i __B) {
1421
#if __GNUC__ < 8
1422
  __v2du __result;
1423
 
1424
#ifdef __LITTLE_ENDIAN__
1425
  /* VMX Vector Multiply Odd Unsigned Word.  */
1426
  __asm__("vmulouw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
1427
#else
1428
  /* VMX Vector Multiply Even Unsigned Word.  */
1429
  __asm__("vmuleuw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
1430
#endif
1431
  return (__m128i)__result;
1432
#else
1433
  return (__m128i)vec_mule((__v4su)__A, (__v4su)__B);
1434
#endif
1435
}
1436
#endif
1437
 
1438
extern __inline __m128i
1439
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1440
    _mm_slli_epi16(__m128i __A, int __B) {
1441
  __v8hu __lshift;
1442
  __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1443
 
1444
  if (__B >= 0 && __B < 16) {
1445
    if (__builtin_constant_p(__B))
1446
      __lshift = (__v8hu)vec_splat_s16(__B);
1447
    else
1448
      __lshift = vec_splats((unsigned short)__B);
1449
 
1450
    __result = vec_sl((__v8hi)__A, __lshift);
1451
  }
1452
 
1453
  return (__m128i)__result;
1454
}
1455
 
1456
extern __inline __m128i
1457
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1458
    _mm_slli_epi32(__m128i __A, int __B) {
1459
  __v4su __lshift;
1460
  __v4si __result = {0, 0, 0, 0};
1461
 
1462
  if (__B >= 0 && __B < 32) {
1463
    if (__builtin_constant_p(__B) && __B < 16)
1464
      __lshift = (__v4su)vec_splat_s32(__B);
1465
    else
1466
      __lshift = vec_splats((unsigned int)__B);
1467
 
1468
    __result = vec_sl((__v4si)__A, __lshift);
1469
  }
1470
 
1471
  return (__m128i)__result;
1472
}
1473
 
1474
#ifdef _ARCH_PWR8
1475
extern __inline __m128i
1476
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1477
    _mm_slli_epi64(__m128i __A, int __B) {
1478
  __v2du __lshift;
1479
  __v2di __result = {0, 0};
1480
 
1481
  if (__B >= 0 && __B < 64) {
1482
    if (__builtin_constant_p(__B) && __B < 16)
1483
      __lshift = (__v2du)vec_splat_s32(__B);
1484
    else
1485
      __lshift = (__v2du)vec_splats((unsigned int)__B);
1486
 
1487
    __result = vec_sl((__v2di)__A, __lshift);
1488
  }
1489
 
1490
  return (__m128i)__result;
1491
}
1492
#endif
1493
 
1494
extern __inline __m128i
1495
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1496
    _mm_srai_epi16(__m128i __A, int __B) {
1497
  __v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15};
1498
  __v8hi __result;
1499
 
1500
  if (__B < 16) {
1501
    if (__builtin_constant_p(__B))
1502
      __rshift = (__v8hu)vec_splat_s16(__B);
1503
    else
1504
      __rshift = vec_splats((unsigned short)__B);
1505
  }
1506
  __result = vec_sra((__v8hi)__A, __rshift);
1507
 
1508
  return (__m128i)__result;
1509
}
1510
 
1511
extern __inline __m128i
1512
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1513
    _mm_srai_epi32(__m128i __A, int __B) {
1514
  __v4su __rshift = {31, 31, 31, 31};
1515
  __v4si __result;
1516
 
1517
  if (__B < 32) {
1518
    if (__builtin_constant_p(__B)) {
1519
      if (__B < 16)
1520
        __rshift = (__v4su)vec_splat_s32(__B);
1521
      else
1522
        __rshift = (__v4su)vec_splats((unsigned int)__B);
1523
    } else
1524
      __rshift = vec_splats((unsigned int)__B);
1525
  }
1526
  __result = vec_sra((__v4si)__A, __rshift);
1527
 
1528
  return (__m128i)__result;
1529
}
1530
 
1531
extern __inline __m128i
1532
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1533
    _mm_bslli_si128(__m128i __A, const int __N) {
1534
  __v16qu __result;
1535
  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1536
 
1537
  if (__N < 16)
1538
    __result = vec_sld((__v16qu)__A, __zeros, __N);
1539
  else
1540
    __result = __zeros;
1541
 
1542
  return (__m128i)__result;
1543
}
1544
 
1545
extern __inline __m128i
1546
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1547
    _mm_bsrli_si128(__m128i __A, const int __N) {
1548
  __v16qu __result;
1549
  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1550
 
1551
  if (__N < 16)
1552
#ifdef __LITTLE_ENDIAN__
1553
    if (__builtin_constant_p(__N))
1554
      /* Would like to use Vector Shift Left Double by Octet
1555
         Immediate here to use the immediate form and avoid
1556
         load of __N * 8 value into a separate VR.  */
1557
      __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N));
1558
    else
1559
#endif
1560
    {
1561
      __v16qu __shift = vec_splats((unsigned char)(__N * 8));
1562
#ifdef __LITTLE_ENDIAN__
1563
      __result = vec_sro((__v16qu)__A, __shift);
1564
#else
1565
    __result = vec_slo((__v16qu)__A, __shift);
1566
#endif
1567
    }
1568
  else
1569
    __result = __zeros;
1570
 
1571
  return (__m128i)__result;
1572
}
1573
 
1574
extern __inline __m128i
1575
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1576
    _mm_srli_si128(__m128i __A, const int __N) {
1577
  return _mm_bsrli_si128(__A, __N);
1578
}
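/* Note (editorial): _mm_bslli_si128 and _mm_bsrli_si128 shift the whole
   128-bit value by a byte count, independent of element type, and a count
   of 16 or more produces zero; _mm_srli_si128 is simply an alias for
   _mm_bsrli_si128.  For example, _mm_bsrli_si128(__v, 4) discards the low
   32-bit lane, shifts the remaining lanes down by one, and fills the top
   lane with zero.  */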
1579
 
1580
extern __inline __m128i
1581
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1582
    _mm_slli_si128(__m128i __A, const int _imm5) {
1583
  __v16qu __result;
1584
  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1585
 
1586
  if (_imm5 < 16)
1587
#ifdef __LITTLE_ENDIAN__
1588
    __result = vec_sld((__v16qu)__A, __zeros, _imm5);
1589
#else
1590
    __result = vec_sld(__zeros, (__v16qu)__A, (16 - _imm5));
1591
#endif
1592
  else
1593
    __result = __zeros;
1594
 
1595
  return (__m128i)__result;
1596
}
1597
 
1598
extern __inline __m128i
1599
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1600
 
1601
    _mm_srli_epi16(__m128i __A, int __B) {
1602
  __v8hu __rshift;
1603
  __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1604
 
1605
  if (__B < 16) {
1606
    if (__builtin_constant_p(__B))
1607
      __rshift = (__v8hu)vec_splat_s16(__B);
1608
    else
1609
      __rshift = vec_splats((unsigned short)__B);
1610
 
1611
    __result = vec_sr((__v8hi)__A, __rshift);
1612
  }
1613
 
1614
  return (__m128i)__result;
1615
}
1616
 
1617
extern __inline __m128i
1618
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1619
    _mm_srli_epi32(__m128i __A, int __B) {
1620
  __v4su __rshift;
1621
  __v4si __result = {0, 0, 0, 0};
1622
 
1623
  if (__B < 32) {
1624
    if (__builtin_constant_p(__B)) {
1625
      if (__B < 16)
1626
        __rshift = (__v4su)vec_splat_s32(__B);
1627
      else
1628
        __rshift = (__v4su)vec_splats((unsigned int)__B);
1629
    } else
1630
      __rshift = vec_splats((unsigned int)__B);
1631
 
1632
    __result = vec_sr((__v4si)__A, __rshift);
1633
  }
1634
 
1635
  return (__m128i)__result;
1636
}
1637
 
1638
#ifdef _ARCH_PWR8
1639
extern __inline __m128i
1640
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1641
    _mm_srli_epi64(__m128i __A, int __B) {
1642
  __v2du __rshift;
1643
  __v2di __result = {0, 0};
1644
 
1645
  if (__B < 64) {
1646
    if (__builtin_constant_p(__B)) {
1647
      if (__B < 16)
1648
        __rshift = (__v2du)vec_splat_s32(__B);
1649
      else
1650
        __rshift = (__v2du)vec_splats((unsigned long long)__B);
1651
    } else
1652
      __rshift = (__v2du)vec_splats((unsigned int)__B);
1653
 
1654
    __result = vec_sr((__v2di)__A, __rshift);
1655
  }
1656
 
1657
  return (__m128i)__result;
1658
}
1659
#endif
1660
 
1661
extern __inline __m128i
1662
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1663
    _mm_sll_epi16(__m128i __A, __m128i __B) {
1664
  __v8hu __lshift;
1665
  __vector __bool short __shmask;
1666
  const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1667
  __v8hu __result;
1668
 
1669
#ifdef __LITTLE_ENDIAN__
1670
  __lshift = vec_splat((__v8hu)__B, 0);
1671
#else
1672
  __lshift = vec_splat((__v8hu)__B, 3);
1673
#endif
1674
  __shmask = vec_cmple(__lshift, __shmax);
1675
  __result = vec_sl((__v8hu)__A, __lshift);
1676
  __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1677
 
1678
  return (__m128i)__result;
1679
}
1680
 
1681
extern __inline __m128i
1682
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1683
    _mm_sll_epi32(__m128i __A, __m128i __B) {
1684
  __v4su __lshift;
1685
  __vector __bool int __shmask;
1686
  const __v4su __shmax = {32, 32, 32, 32};
1687
  __v4su __result;
1688
#ifdef __LITTLE_ENDIAN__
1689
  __lshift = vec_splat((__v4su)__B, 0);
1690
#else
1691
  __lshift = vec_splat((__v4su)__B, 1);
1692
#endif
1693
  __shmask = vec_cmplt(__lshift, __shmax);
1694
  __result = vec_sl((__v4su)__A, __lshift);
1695
  __result = vec_sel((__v4su)__shmask, __result, __shmask);
1696
 
1697
  return (__m128i)__result;
1698
}
1699
 
1700
#ifdef _ARCH_PWR8
1701
extern __inline __m128i
1702
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1703
    _mm_sll_epi64(__m128i __A, __m128i __B) {
1704
  __v2du __lshift;
1705
  __vector __bool long long __shmask;
1706
  const __v2du __shmax = {64, 64};
1707
  __v2du __result;
1708
 
1709
  __lshift = vec_splat((__v2du)__B, 0);
1710
  __shmask = vec_cmplt(__lshift, __shmax);
1711
  __result = vec_sl((__v2du)__A, __lshift);
1712
  __result = vec_sel((__v2du)__shmask, __result, __shmask);
1713
 
1714
  return (__m128i)__result;
1715
}
1716
#endif
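/* Usage sketch (editorial, not part of the original header): the
   _mm_sll_epi16/32/64 forms take a run-time shift count from the low-order
   element of __B, and counts at or beyond the element width give zero.
   The helper name below is hypothetical.

     static inline __m128i __example_shift_by_count(__m128i __v, int __n) {
       __m128i __count = _mm_cvtsi32_si128(__n);  // count in the low element
       return _mm_sll_epi32(__v, __count);
     }
*/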
1717
 
1718
extern __inline __m128i
1719
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1720
    _mm_sra_epi16(__m128i __A, __m128i __B) {
1721
  const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15};
1722
  __v8hu __rshift;
1723
  __v8hi __result;
1724
 
1725
#ifdef __LITTLE_ENDIAN__
1726
  __rshift = vec_splat((__v8hu)__B, 0);
1727
#else
1728
  __rshift = vec_splat((__v8hu)__B, 3);
1729
#endif
1730
  __rshift = vec_min(__rshift, __rshmax);
1731
  __result = vec_sra((__v8hi)__A, __rshift);
1732
 
1733
  return (__m128i)__result;
1734
}
1735
 
1736
extern __inline __m128i
1737
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1738
    _mm_sra_epi32(__m128i __A, __m128i __B) {
1739
  const __v4su __rshmax = {31, 31, 31, 31};
1740
  __v4su __rshift;
1741
  __v4si __result;
1742
 
1743
#ifdef __LITTLE_ENDIAN__
1744
  __rshift = vec_splat((__v4su)__B, 0);
1745
#else
1746
  __rshift = vec_splat((__v4su)__B, 1);
1747
#endif
1748
  __rshift = vec_min(__rshift, __rshmax);
1749
  __result = vec_sra((__v4si)__A, __rshift);
1750
 
1751
  return (__m128i)__result;
1752
}
1753
 
1754
extern __inline __m128i
1755
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1756
    _mm_srl_epi16(__m128i __A, __m128i __B) {
1757
  __v8hu __rshift;
1758
  __vector __bool short __shmask;
1759
  const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1760
  __v8hu __result;
1761
 
1762
#ifdef __LITTLE_ENDIAN__
1763
  __rshift = vec_splat((__v8hu)__B, 0);
1764
#else
1765
  __rshift = vec_splat((__v8hu)__B, 3);
1766
#endif
1767
  __shmask = vec_cmple(__rshift, __shmax);
1768
  __result = vec_sr((__v8hu)__A, __rshift);
1769
  __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1770
 
1771
  return (__m128i)__result;
1772
}
1773
 
1774
extern __inline __m128i
1775
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1776
    _mm_srl_epi32(__m128i __A, __m128i __B) {
1777
  __v4su __rshift;
1778
  __vector __bool int __shmask;
1779
  const __v4su __shmax = {32, 32, 32, 32};
1780
  __v4su __result;
1781
 
1782
#ifdef __LITTLE_ENDIAN__
1783
  __rshift = vec_splat((__v4su)__B, 0);
1784
#else
1785
  __rshift = vec_splat((__v4su)__B, 1);
1786
#endif
1787
  __shmask = vec_cmplt(__rshift, __shmax);
1788
  __result = vec_sr((__v4su)__A, __rshift);
1789
  __result = vec_sel((__v4su)__shmask, __result, __shmask);
1790
 
1791
  return (__m128i)__result;
1792
}
1793
 
1794
#ifdef _ARCH_PWR8
1795
extern __inline __m128i
1796
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1797
    _mm_srl_epi64(__m128i __A, __m128i __B) {
1798
  __v2du __rshift;
1799
  __vector __bool long long __shmask;
1800
  const __v2du __shmax = {64, 64};
1801
  __v2du __result;
1802
 
1803
  __rshift = vec_splat((__v2du)__B, 0);
1804
  __shmask = vec_cmplt(__rshift, __shmax);
1805
  __result = vec_sr((__v2du)__A, __rshift);
1806
  __result = vec_sel((__v2du)__shmask, __result, __shmask);
1807
 
1808
  return (__m128i)__result;
1809
}
1810
#endif
1811
 
1812
extern __inline __m128d
1813
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1814
    _mm_and_pd(__m128d __A, __m128d __B) {
1815
  return (vec_and((__v2df)__A, (__v2df)__B));
1816
}
1817
 
1818
extern __inline __m128d
1819
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1820
    _mm_andnot_pd(__m128d __A, __m128d __B) {
1821
  return (vec_andc((__v2df)__B, (__v2df)__A));
1822
}
1823
 
1824
extern __inline __m128d
1825
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1826
    _mm_or_pd(__m128d __A, __m128d __B) {
1827
  return (vec_or((__v2df)__A, (__v2df)__B));
1828
}
1829
 
1830
extern __inline __m128d
1831
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1832
    _mm_xor_pd(__m128d __A, __m128d __B) {
1833
  return (vec_xor((__v2df)__A, (__v2df)__B));
1834
}
1835
 
1836
extern __inline __m128i
1837
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1838
    _mm_and_si128(__m128i __A, __m128i __B) {
1839
  return (__m128i)vec_and((__v2di)__A, (__v2di)__B);
1840
}
1841
 
1842
extern __inline __m128i
1843
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1844
    _mm_andnot_si128(__m128i __A, __m128i __B) {
1845
  return (__m128i)vec_andc((__v2di)__B, (__v2di)__A);
1846
}
1847
 
1848
extern __inline __m128i
1849
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1850
    _mm_or_si128(__m128i __A, __m128i __B) {
1851
  return (__m128i)vec_or((__v2di)__A, (__v2di)__B);
1852
}
1853
 
1854
extern __inline __m128i
1855
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1856
    _mm_xor_si128(__m128i __A, __m128i __B) {
1857
  return (__m128i)vec_xor((__v2di)__A, (__v2di)__B);
1858
}
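/* Usage sketch (editorial, not part of the original header): the andnot
   forms compute (~first & second), which is convenient for clearing bits
   through a constant mask.  The helper name is hypothetical; _mm_set1_pd is
   assumed to be provided earlier in this header.

     static inline __m128d __example_fabs_pd(__m128d __x) {
       // clear the sign bit of both doubles: ~(-0.0) & x
       return _mm_andnot_pd(_mm_set1_pd(-0.0), __x);
     }
*/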
1859
 
1860
extern __inline __m128i
1861
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1862
    _mm_cmpeq_epi8(__m128i __A, __m128i __B) {
1863
  return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B);
1864
}
1865
 
1866
extern __inline __m128i
1867
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1868
    _mm_cmpeq_epi16(__m128i __A, __m128i __B) {
1869
  return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B);
1870
}
1871
 
1872
extern __inline __m128i
1873
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1874
    _mm_cmpeq_epi32(__m128i __A, __m128i __B) {
1875
  return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B);
1876
}
1877
 
1878
extern __inline __m128i
1879
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1880
    _mm_cmplt_epi8(__m128i __A, __m128i __B) {
1881
  return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B);
1882
}
1883
 
1884
extern __inline __m128i
1885
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1886
    _mm_cmplt_epi16(__m128i __A, __m128i __B) {
1887
  return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B);
1888
}
1889
 
1890
extern __inline __m128i
1891
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1892
    _mm_cmplt_epi32(__m128i __A, __m128i __B) {
1893
  return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B);
1894
}
1895
 
1896
extern __inline __m128i
1897
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1898
    _mm_cmpgt_epi8(__m128i __A, __m128i __B) {
1899
  return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B);
1900
}
1901
 
1902
extern __inline __m128i
1903
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1904
    _mm_cmpgt_epi16(__m128i __A, __m128i __B) {
1905
  return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B);
1906
}
1907
 
1908
extern __inline __m128i
1909
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1910
    _mm_cmpgt_epi32(__m128i __A, __m128i __B) {
1911
  return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B);
1912
}
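/* Usage sketch (editorial, not part of the original header): the integer
   compares return all-ones per element for true and all-zeros for false,
   so they combine with the bitwise operations above into a branchless
   select.  The helper name is hypothetical.

     static inline __m128i __example_max_epi32(__m128i __a, __m128i __b) {
       __m128i __gt = _mm_cmpgt_epi32(__a, __b);
       return _mm_or_si128(_mm_and_si128(__gt, __a),
                           _mm_andnot_si128(__gt, __b));
     }
*/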
1913
 
1914
extern __inline int
1915
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1916
    _mm_extract_epi16(__m128i const __A, int const __N) {
1917
  return (unsigned short)((__v8hi)__A)[__N & 7];
1918
}
1919
 
1920
extern __inline __m128i
1921
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1922
    _mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
1923
  __v8hi __result = (__v8hi)__A;
1924
 
1925
  __result[(__N & 7)] = __D;
1926
 
1927
  return (__m128i)__result;
1928
}
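/* Note (editorial): _mm_extract_epi16 zero-extends the selected 16-bit lane
   into an int and _mm_insert_epi16 returns a copy of __A with one lane
   replaced; both use only the low three bits of __N, so e.g.
   _mm_extract_epi16(__v, 9) reads lane 1.  */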
1929
 
1930
extern __inline __m128i
1931
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1932
    _mm_max_epi16(__m128i __A, __m128i __B) {
1933
  return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B);
1934
}
1935
 
1936
extern __inline __m128i
1937
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1938
    _mm_max_epu8(__m128i __A, __m128i __B) {
1939
  return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B);
1940
}
1941
 
1942
extern __inline __m128i
1943
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1944
    _mm_min_epi16(__m128i __A, __m128i __B) {
1945
  return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B);
1946
}
1947
 
1948
extern __inline __m128i
1949
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1950
    _mm_min_epu8(__m128i __A, __m128i __B) {
1951
  return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B);
1952
}
1953
 
1954
#ifdef _ARCH_PWR8
1955
/* Intrinsic functions that require PowerISA 2.07 minimum.  */
1956
 
1957
/* Return a mask created from the most significant bit of each 8-bit
1958
   element in A.  */
1959
extern __inline int
1960
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1961
    _mm_movemask_epi8(__m128i __A) {
1962
#ifdef _ARCH_PWR10
1963
  return vec_extractm((__v16qu)__A);
1964
#else
1965
  __vector unsigned long long __result;
1966
  static const __vector unsigned char __perm_mask = {
1967
      0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
1968
      0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};
1969
 
1970
  __result = ((__vector unsigned long long)vec_vbpermq(
1971
      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1972
 
1973
#ifdef __LITTLE_ENDIAN__
1974
  return __result[1];
1975
#else
1976
  return __result[0];
1977
#endif
1978
#endif /* !_ARCH_PWR10 */
1979
}
1980
#endif /* _ARCH_PWR8 */
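/* Usage sketch (editorial, not part of the original header; on PowerPC this
   needs the _ARCH_PWR8 guard above): a common movemask idiom is testing
   whether any byte of a vector matches a value.  The helper name is
   hypothetical; _mm_set1_epi8 is assumed to be provided earlier in this
   header.

     static inline int __example_contains_byte(__m128i __v, char __c) {
       __m128i __eq = _mm_cmpeq_epi8(__v, _mm_set1_epi8(__c));
       return _mm_movemask_epi8(__eq) != 0;  // nonzero if any lane matched
     }
*/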
1981
 
1982
extern __inline __m128i
1983
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1984
    _mm_mulhi_epu16(__m128i __A, __m128i __B) {
1985
  __v4su __w0, __w1;
1986
  __v16qu __xform1 = {
1987
#ifdef __LITTLE_ENDIAN__
1988
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1989
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1990
#else
1991
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1992
      0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1993
#endif
1994
  };
1995
 
1996
  __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B);
1997
  __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B);
1998
  return (__m128i)vec_perm(__w0, __w1, __xform1);
1999
}
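/* Note (editorial): _mm_mulhi_epu16 keeps the high half of each unsigned
   16 x 16 -> 32-bit product; combined with _mm_mullo_epi16 (also provided
   by this header) the full 32-bit products can be reassembled.  */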
2000
 
2001
extern __inline __m128i
2002
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2003
    _mm_shufflehi_epi16(__m128i __A, const int __mask) {
2004
  unsigned long __element_selector_98 = __mask & 0x03;
2005
  unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
2006
  unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
2007
  unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
2008
  static const unsigned short __permute_selectors[4] = {
2009
#ifdef __LITTLE_ENDIAN__
2010
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
2011
#else
2012
      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
2013
#endif
2014
  };
2015
  __v2du __pmask =
2016
#ifdef __LITTLE_ENDIAN__
2017
      {0x1716151413121110UL, 0UL};
2018
#else
2019
      {0x1011121314151617UL, 0UL};
2020
#endif
2021
  __m64_union __t;
2022
  __v2du __a, __r;
2023
 
2024
  __t.as_short[0] = __permute_selectors[__element_selector_98];
2025
  __t.as_short[1] = __permute_selectors[__element_selector_BA];
2026
  __t.as_short[2] = __permute_selectors[__element_selector_DC];
2027
  __t.as_short[3] = __permute_selectors[__element_selector_FE];
2028
  __pmask[1] = __t.as_m64;
2029
  __a = (__v2du)__A;
2030
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
2031
  return (__m128i)__r;
2032
}
2033
 
2034
extern __inline __m128i
2035
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2036
    _mm_shufflelo_epi16(__m128i __A, const int __mask) {
2037
  unsigned long __element_selector_10 = __mask & 0x03;
2038
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
2039
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
2040
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
2041
  static const unsigned short __permute_selectors[4] = {
2042
#ifdef __LITTLE_ENDIAN__
2043
      0x0100, 0x0302, 0x0504, 0x0706
2044
#else
2045
      0x0001, 0x0203, 0x0405, 0x0607
2046
#endif
2047
  };
2048
  __v2du __pmask =
2049
#ifdef __LITTLE_ENDIAN__
2050
      {0UL, 0x1f1e1d1c1b1a1918UL};
2051
#else
2052
      {0UL, 0x18191a1b1c1d1e1fUL};
2053
#endif
2054
  __m64_union __t;
2055
  __v2du __a, __r;
2056
  __t.as_short[0] = __permute_selectors[__element_selector_10];
2057
  __t.as_short[1] = __permute_selectors[__element_selector_32];
2058
  __t.as_short[2] = __permute_selectors[__element_selector_54];
2059
  __t.as_short[3] = __permute_selectors[__element_selector_76];
2060
  __pmask[0] = __t.as_m64;
2061
  __a = (__v2du)__A;
2062
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
2063
  return (__m128i)__r;
2064
}
2065
 
2066
extern __inline __m128i
2067
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2068
    _mm_shuffle_epi32(__m128i __A, const int __mask) {
2069
  unsigned long __element_selector_10 = __mask & 0x03;
2070
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
2071
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
2072
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
2073
  static const unsigned int __permute_selectors[4] = {
2074
#ifdef __LITTLE_ENDIAN__
2075
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
2076
#else
2077
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
2078
#endif
2079
  };
2080
  __v4su __t;
2081
 
2082
  __t[0] = __permute_selectors[__element_selector_10];
2083
  __t[1] = __permute_selectors[__element_selector_32];
2084
  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
2085
  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
2086
  return (__m128i)vec_perm((__v4si)__A, (__v4si)__A,
2087
                           (__vector unsigned char)__t);
2088
}
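/* Usage sketch (editorial, not part of the original header): the 8-bit mask
   packs four 2-bit source-lane selectors, lowest destination lane first.

     // __m128i __b = _mm_shuffle_epi32(__v, 0x00);  // broadcast lane 0
     // __m128i __r = _mm_shuffle_epi32(__v, 0x1B);  // reverse the four lanes
*/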
2089
 
2090
extern __inline void
2091
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2092
    _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
2093
  __v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
2094
  __v16qu __mask, __tmp;
2095
  __m128i_u *__p = (__m128i_u *)__C;
2096
 
2097
  __tmp = (__v16qu)_mm_loadu_si128(__p);
2098
  __mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit);
2099
  __tmp = vec_sel(__tmp, (__v16qu)__A, __mask);
2100
  _mm_storeu_si128(__p, (__m128i)__tmp);
2101
}
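/* Note (editorial): unlike the x86 instruction, this emulation performs a
   full 16-byte load, byte select, and 16-byte store at __C, so __C must
   point to 16 readable and writable bytes even when the mask disables some
   lanes.  */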
2102
 
2103
extern __inline __m128i
2104
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2105
    _mm_avg_epu8(__m128i __A, __m128i __B) {
2106
  return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B);
2107
}
2108
 
2109
extern __inline __m128i
2110
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2111
    _mm_avg_epu16(__m128i __A, __m128i __B) {
2112
  return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B);
2113
}
2114
 
2115
extern __inline __m128i
2116
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2117
    _mm_sad_epu8(__m128i __A, __m128i __B) {
2118
  __v16qu __a, __b;
2119
  __v16qu __vabsdiff;
2120
  __v4si __vsum;
2121
  const __v4su __zero = {0, 0, 0, 0};
2122
  __v4si __result;
2123
 
2124
  __a = (__v16qu)__A;
2125
  __b = (__v16qu)__B;
2126
#ifndef _ARCH_PWR9
2127
  __v16qu __vmin = vec_min(__a, __b);
2128
  __v16qu __vmax = vec_max(__a, __b);
2129
  __vabsdiff = vec_sub(__vmax, __vmin);
2130
#else
2131
  __vabsdiff = vec_absd(__a, __b);
2132
#endif
2133
  /* Sum four groups of bytes into integers.  */
2134
  __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
2135
#ifdef __LITTLE_ENDIAN__
2136
  /* Sum across four integers with two integer results.  */
2137
  __asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero));
2138
  /* Note: vec_sum2s could be used here, but on little-endian, vector
2139
     shifts are added that are not needed for this use-case.
2140
     A vector shift that repositions the 32-bit integer results
2141
     (currently at [0] and [2]) to [1] and [3] would then have to be
2142
     undone, since the desired results are two 64-bit integers
2143
     ([1]|[0] and [3]|[2]).  Thus, no shift is performed.  */
2144
#else
2145
  /* Sum across four integers with two integer results.  */
2146
  __result = vec_sum2s(__vsum, (__vector signed int)__zero);
2147
  /* Rotate the sums into the correct position.  */
2148
  __result = vec_sld(__result, __result, 6);
2149
#endif
2150
  return (__m128i)__result;
2151
}
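/* Usage sketch (editorial, not part of the original header): the result
   holds two byte-sum partial totals, zero-extended into the two 64-bit
   lanes (bytes 0-7 and 8-15).  The helper name is hypothetical.

     static inline int __example_sad16(__m128i __a, __m128i __b) {
       __m128i __s = _mm_sad_epu8(__a, __b);
       return _mm_extract_epi16(__s, 0) + _mm_extract_epi16(__s, 4);
     }
*/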
2152
 
2153
extern __inline void
2154
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2155
    _mm_stream_si32(int *__A, int __B) {
2156
  /* Use the data cache block touch for store transient.  */
2157
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2158
  *__A = __B;
2159
}
2160
 
2161
extern __inline void
2162
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2163
    _mm_stream_si64(long long int *__A, long long int __B) {
2164
  /* Use the data cache block touch for store transient.  */
2165
  __asm__("     dcbtstt 0,%0" : : "b"(__A) : "memory");
2166
  *__A = __B;
2167
}
2168
 
2169
extern __inline void
2170
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2171
    _mm_stream_si128(__m128i *__A, __m128i __B) {
2172
  /* Use the data cache block touch for store transient.  */
2173
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2174
  *__A = __B;
2175
}
2176
 
2177
extern __inline void
2178
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2179
    _mm_stream_pd(double *__A, __m128d __B) {
2180
  /* Use the data cache block touch for store transient.  */
2181
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2182
  *(__m128d *)__A = __B;
2183
}
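/* Note (editorial): on PowerPC the _mm_stream_* operations above are plain
   stores preceded by a dcbtstt (data cache block touch for store,
   transient) hint, so they are ordinary cacheable stores under the normal
   PowerPC memory model rather than x86-style non-temporal stores.  */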
2184
 
2185
extern __inline void
2186
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2187
    _mm_clflush(void const *__A) {
2188
  /* Use the data cache block flush.  */
2189
  __asm__("dcbf 0,%0" : : "b"(__A) : "memory");
2190
}
2191
 
2192
extern __inline void
2193
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2194
    _mm_lfence(void) {
2195
  /* Use light weight sync for load to load ordering.  */
2196
  __atomic_thread_fence(__ATOMIC_RELEASE);
2197
}
2198
 
2199
extern __inline void
2200
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2201
    _mm_mfence(void) {
2202
  /* Use heavy weight sync for any to any ordering.  */
2203
  __atomic_thread_fence(__ATOMIC_SEQ_CST);
2204
}
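/* Note (editorial): the fences above use the compiler's atomic builtins; on
   POWER a release fence is typically emitted as lwsync and a sequentially
   consistent fence as hwsync (sync), which is what the comments refer to as
   light and heavy weight sync.  */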
2205
 
2206
extern __inline __m128i
2207
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2208
    _mm_cvtsi32_si128(int __A) {
2209
  return _mm_set_epi32(0, 0, 0, __A);
2210
}
2211
 
2212
extern __inline __m128i
2213
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2214
    _mm_cvtsi64_si128(long long __A) {
2215
  return __extension__(__m128i)(__v2di){__A, 0LL};
2216
}
2217
 
2218
/* Microsoft intrinsic.  */
2219
extern __inline __m128i
2220
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2221
    _mm_cvtsi64x_si128(long long __A) {
2222
  return __extension__(__m128i)(__v2di){__A, 0LL};
2223
}
2224
 
2225
/* Casts between various SP, DP, INT vector types.  Note that these do no
2226
   conversion of values; they just change the type.  */
2227
extern __inline __m128
2228
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2229
    _mm_castpd_ps(__m128d __A) {
2230
  return (__m128)__A;
2231
}
2232
 
2233
extern __inline __m128i
2234
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2235
    _mm_castpd_si128(__m128d __A) {
2236
  return (__m128i)__A;
2237
}
2238
 
2239
extern __inline __m128d
2240
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2241
    _mm_castps_pd(__m128 __A) {
2242
  return (__m128d)__A;
2243
}
2244
 
2245
extern __inline __m128i
2246
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2247
    _mm_castps_si128(__m128 __A) {
2248
  return (__m128i)__A;
2249
}
2250
 
2251
extern __inline __m128
2252
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2253
    _mm_castsi128_ps(__m128i __A) {
2254
  return (__m128)__A;
2255
}
2256
 
2257
extern __inline __m128d
2258
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2259
    _mm_castsi128_pd(__m128i __A) {
2260
  return (__m128d)__A;
2261
}
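/* Usage sketch (editorial, not part of the original header): the casts
   reinterpret the 128 bits without any value conversion, e.g. to inspect
   the raw bit pattern of packed doubles.  The helper name is hypothetical.

     static inline __m128i __example_double_bits(__m128d __x) {
       return _mm_castpd_si128(__x);  // same bits, viewed as integer lanes
     }
*/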
2262
 
2263
#else
2264
#include_next <emmintrin.h>
2265
#endif /* defined(__powerpc64__) &&                                            \
2266
        *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
2267
 
2268
#endif /* EMMINTRIN_H_ */