/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0. */

#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since the PowerPC target doesn't support a native 64-bit vector type, we
   typedef __m64 to 64-bit unsigned long long in MMX intrinsics, which
   works well for _si64 and some _pi32 operations.

   For _pi16 and _pi8 operations, it's better to transfer __m64 into a
   128-bit PowerPC vector first. Power8 introduced direct register move
   instructions, which help make the implementation more efficient.

   It's the user's responsibility to determine whether the results of such
   a port are acceptable or further changes are needed. Please note that
   much code using Intel intrinsics CAN BE REWRITTEN in more portable and
   efficient standard C or GNU C extensions, with 64-bit scalar operations
   or 128-bit SSE/Altivec operations, which are recommended instead. */
#error \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
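
/* Illustrative rewrite sketch (not part of this header): a paired 16-bit
   add written against this header as
     __m64 __c = _mm_add_pi16(__a, __b);
   can often be expressed directly with GNU C vector extensions instead.
   The __v4hi typedef below is hypothetical and assumes GCC/Clang
   vector-extension support:
     typedef short __v4hi __attribute__((__vector_size__(8)));
     __v4hi __va, __vb, __vc;
     __vc = __va + __vb;  // element-wise 16-bit add, no MMX emulation
*/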

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components. */
typedef __attribute__((__aligned__(8))) unsigned long long __m64;

typedef __attribute__((__aligned__(8))) union {
  __m64 as_m64;
  char as_char[8];
  signed char as_signed_char[8];
  short as_short[4];
  int as_int[2];
  long long as_long_long;
  float as_float[2];
  double as_double;
} __m64_union;
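
/* Layout sketch (illustrative only): the union members alias the same
   eight bytes, so on powerpc64le as_short[0] is the least-significant
   16 bits of as_m64:
     __m64_union __u;
     __u.as_m64 = 0x0004000300020001ULL;
     // little-endian: __u.as_short[0] == 1, __u.as_short[3] == 4
*/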

/* Empty the multimedia state. */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_empty(void) {
  /* nothing to do on PowerPC. */
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_empty(void) {
  /* nothing to do on PowerPC. */
}

/* Convert I to a __m64 object. The integer is zero-extended to 64 bits. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_si64(int __i) {
  return (__m64)(unsigned int)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int(int __i) {
  return _mm_cvtsi32_si64(__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer. */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si32(__m64 __i) {
  return ((int)__i);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int(__m64 __i) {
  return _mm_cvtsi64_si32(__i);
}

/* Convert I to a __m64 object. */

/* Intel intrinsic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_m64(long long __i) {
  return (__m64)__i;
}

/* Microsoft intrinsic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_si64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi64x(long long __i) {
  return (__m64)__i;
}

/* Convert the __m64 object to a 64-bit integer. */

/* Intel intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int64(__m64 __i) {
  return (long long)__i;
}

extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtm64_si64(__m64 __i) {
  return (long long)__i;
}

/* Microsoft intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si64x(__m64 __i) {
  return (long long)__i;
}
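
/* Conversion sketch (illustrative only): the 32-bit source is
   zero-extended, so the sign bit does not propagate upward:
     _mm_cvtsi32_si64(-1)                     == (__m64)0x00000000ffffffffULL
     _mm_cvtsi64_si32(_mm_cvtsi64_m64(-1LL))  == -1
*/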

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __vm1;
  __vector signed char __vresult;

  __vm1 = (__vector signed short)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  __vresult = vec_packs(__vm1, __vm1);
  return (__m64)((__vector long long)__vresult)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packsswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi16(__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi32(__m64 __m1, __m64 __m2) {
  __vector signed int __vm1;
  __vector signed short __vresult;

  __vm1 = (__vector signed int)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  __vresult = vec_packs(__vm1, __vm1);
  return (__m64)((__vector long long)__vresult)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packssdw(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi32(__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned char __r;
  __vector signed short __vm1 = (__vector signed short)(__vector long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  const __vector signed short __zero = {0};
  __vector __bool short __select = vec_cmplt(__vm1, __zero);
  __r =
      vec_packs((__vector unsigned short)__vm1, (__vector unsigned short)__vm1);
  __vector __bool char __packsel = vec_pack(__select, __select);
  __r = vec_sel(__r, (const __vector unsigned char)__zero, __packsel);
  return (__m64)((__vector long long)__r)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packuswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pu16(__m1, __m2);
}
#endif /* end ARCH_PWR8 */
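
/* Saturation sketch (illustrative only): each 16-bit lane is clamped to
   the 8-bit range before packing, e.g.
     _mm_packs_pi16(_mm_set_pi16(300, -300, 5, -5), _mm_setzero_si64())
   puts -5, 5, -128, 127 (least-significant byte first) in the low four
   bytes of the result.
*/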

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_mergel(__a, __b);
  return (__m64)((__vector long long)__c)[1];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[4];
  __res.as_char[1] = __mu2.as_char[4];
  __res.as_char[2] = __mu1.as_char[5];
  __res.as_char[3] = __mu2.as_char[5];
  __res.as_char[4] = __mu1.as_char[6];
  __res.as_char[5] = __mu2.as_char[6];
  __res.as_char[6] = __mu1.as_char[7];
  __res.as_char[7] = __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhbw(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi8(__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[2];
  __res.as_short[1] = __mu2.as_short[2];
  __res.as_short[2] = __mu1.as_short[3];
  __res.as_short[3] = __mu2.as_short[3];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhwd(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi16(__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[1];
  __res.as_int[1] = __mu2.as_int[1];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhdq(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi32(__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_mergel(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0];
  __res.as_char[1] = __mu2.as_char[0];
  __res.as_char[2] = __mu1.as_char[1];
  __res.as_char[3] = __mu2.as_char[1];
  __res.as_char[4] = __mu1.as_char[2];
  __res.as_char[5] = __mu2.as_char[2];
  __res.as_char[6] = __mu1.as_char[3];
  __res.as_char[7] = __mu2.as_char[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklbw(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi8(__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0];
  __res.as_short[1] = __mu2.as_short[0];
  __res.as_short[2] = __mu1.as_short[1];
  __res.as_short[3] = __mu2.as_short[1];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklwd(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi16(__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0];
  __res.as_int[1] = __mu2.as_int[0];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckldq(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi32(__m1, __m2);
}
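
/* Interleave sketch (illustrative only): numbering bytes from the least
   significant end,
     _mm_unpacklo_pi8(m1, m2) -> m1[0], m2[0], m1[1], m2[1], ..., m1[3], m2[3]
     _mm_unpackhi_pi8(m1, m2) -> m1[4], m2[4], m1[5], m2[5], ..., m1[7], m2[7]
*/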

/* Add the 8-bit values in M1 to the 8-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddb(__m64 __m1, __m64 __m2) {
  return _mm_add_pi8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddw(__m64 __m1, __m64 __m2) {
  return _mm_add_pi16(__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddd(__m64 __m1, __m64 __m2) {
  return _mm_add_pi32(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubb(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubw(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi16(__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubd(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi32(__m1, __m2);
}
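
/* These element-wise adds and subtracts wrap on overflow (illustrative
   only):
     _mm_add_pi8(_mm_set1_pi8(127), _mm_set1_pi8(1))
   yields -128 in every byte lane; see the saturating _mm_adds_* and
   _mm_subs_* forms below when clamping is wanted.
*/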

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_si64(__m64 __m1, __m64 __m2) {
  return (__m1 + __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_si64(__m64 __m1, __m64 __m2) {
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_si64(__m64 __m, __m64 __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllq(__m64 __m, __m64 __count) {
  return _mm_sll_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_si64(__m64 __m, const int __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllqi(__m64 __m, const int __count) {
  return _mm_slli_si64(__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_si64(__m64 __m, __m64 __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlq(__m64 __m, __m64 __count) {
  return _mm_srl_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_si64(__m64 __m, const int __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlqi(__m64 __m, const int __count) {
  return _mm_srli_si64(__m, __count);
}
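
/* Because __m64 is a 64-bit unsigned long long here, these compile to
   plain C shifts (illustrative only):
     _mm_slli_si64((__m64)1, 8)     == (__m64)0x100
     _mm_srli_si64((__m64)0x100, 8) == (__m64)1
*/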

/* Bit-wise AND the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_si64(__m64 __m1, __m64 __m2) {
  return (__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pand(__m64 __m1, __m64 __m2) {
  return _mm_and_si64(__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_si64(__m64 __m1, __m64 __m2) {
  return (~__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pandn(__m64 __m1, __m64 __m2) {
  return _mm_andnot_si64(__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_si64(__m64 __m1, __m64 __m2) {
  return (__m1 | __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_por(__m64 __m1, __m64 __m2) {
  return _mm_or_si64(__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_si64(__m64 __m1, __m64 __m2) {
  return (__m1 ^ __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pxor(__m64 __m1, __m64 __m2) {
  return _mm_xor_si64(__m1, __m2);
}
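
/* Note the operand order of _mm_andnot_si64: the FIRST operand is
   complemented (illustrative only):
     _mm_andnot_si64(__mask, __x) == (~__mask & __x)
*/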

/* Creates a 64-bit zero. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_si64(void) {
  return (__m64)0;
}

/* Compare eight 8-bit values. The result of the comparison is 0xFF if the
   test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 __res;
  __asm__("cmpb %0,%1,%2;\n" : "=r"(__res) : "r"(__m1), "r"(__m2) :);
  return (__res);
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0]) ? -1 : 0;
  __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1]) ? -1 : 0;
  __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2]) ? -1 : 0;
  __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3]) ? -1 : 0;
  __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4]) ? -1 : 0;
  __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5]) ? -1 : 0;
  __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6]) ? -1 : 0;
  __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqb(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi8(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = (__vector signed char)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0]) ? -1 : 0;
  __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1]) ? -1 : 0;
  __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2]) ? -1 : 0;
  __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3]) ? -1 : 0;
  __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4]) ? -1 : 0;
  __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5]) ? -1 : 0;
  __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6]) ? -1 : 0;
  __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtb(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi8(__m1, __m2);
}

/* Compare four 16-bit values. The result of the comparison is 0xFFFF if
   the test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = (__vector signed short)vec_cmpeq(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0]) ? -1 : 0;
  __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1]) ? -1 : 0;
  __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2]) ? -1 : 0;
  __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqw(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi16(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = (__vector signed short)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0]) ? -1 : 0;
  __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1]) ? -1 : 0;
  __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2]) ? -1 : 0;
  __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtw(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi16(__m1, __m2);
}

/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = (__vector signed int)vec_cmpeq(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0]) ? -1 : 0;
  __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqd(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = (__vector signed int)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0]) ? -1 : 0;
  __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtd(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi32(__m1, __m2);
}
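
/* The comparisons return all-ones lanes where the test is true, so the
   result can serve directly as a select mask; a per-lane signed max
   sketch (illustrative only):
     __m64 __gt  = _mm_cmpgt_pi16(__a, __b);
     __m64 __max = _mm_or_si64(_mm_and_si64(__gt, __a),
                               _mm_andnot_si64(__gt, __b));
*/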

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi16(__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats(__m1);
  __b = (__vector unsigned short)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu16(__m1, __m2);
}
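
/* Saturating adds clamp instead of wrapping (illustrative only):
     _mm_adds_pu8(_mm_set1_pi8((signed char)250), _mm_set1_pi8(10))
   yields 255 in every byte lane, where _mm_add_pi8 would wrap to 4.
*/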

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi16(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats(__m1);
  __b = (__vector unsigned short)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu16(__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_madd_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b;
  __vector signed int __c;
  __vector signed int __zero = {0, 0, 0, 0};

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_vmsumshm(__a, __b, __zero);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmaddwd(__m64 __m1, __m64 __m2) {
  return _mm_madd_pi16(__m1, __m2);
}
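
/* _mm_madd_pi16 arithmetic sketch (illustrative only): with 16-bit lanes
   a0..a3 and b0..b3, least significant first, the result holds two 32-bit
   sums:
     low 32 bits  = a0*b0 + a1*b1
     high 32 bits = a2*b2 + a3*b3
*/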

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b;
  __vector signed short __c;
  __vector signed int __w0, __w1;
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
      0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
  };

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);

  __w0 = vec_vmulesh(__a, __b);
  __w1 = vec_vmulosh(__a, __b);
  __c = (__vector signed short)vec_perm(__w0, __w1, __xform1);

  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmulhw(__m64 __m1, __m64 __m2) {
  return _mm_mulhi_pi16(__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = __a * __b;
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmullw(__m64 __m1, __m64 __m2) {
  return _mm_mullo_pi16(__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi16(__m64 __m, __m64 __count) {
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector signed short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sl(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllw(__m64 __m, __m64 __count) {
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke _mm_sll_pi16. */
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllwi(__m64 __m, int __count) {
  return _mm_slli_pi16(__m, __count);
}

/* Shift two 32-bit values in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] << __count;
  __res.as_int[1] = __res.as_int[1] << __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslld(__m64 __m, __m64 __count) {
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke _mm_sll_pi32. */
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslldi(__m64 __m, int __count) {
  return _mm_slli_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi16(__m64 __m, __m64 __count) {
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector signed short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sra(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psraw(__m64 __m, __m64 __count) {
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke _mm_sra_pi16. */
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrawi(__m64 __m, int __count) {
  return _mm_srai_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] >> __count;
  __res.as_int[1] = __res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrad(__m64 __m, __m64 __count) {
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke _mm_sra_pi32. */
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psradi(__m64 __m, int __count) {
  return _mm_srai_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi16(__m64 __m, __m64 __count) {
  __vector unsigned short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector unsigned short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sr(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlw(__m64 __m, __m64 __count) {
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke _mm_srl_pi16. */
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlwi(__m64 __m, int __count) {
  return _mm_srli_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count;
  __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrld(__m64 __m, __m64 __count) {
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke _mm_srl_pi32. */
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrldi(__m64 __m, int __count) {
  return _mm_srli_pi32(__m, __count);
}
#endif /* _ARCH_PWR8 */
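
/* MMX defines a shift by a count larger than the element width to yield
   zero; the 16-bit forms above implement that with an explicit guard
   (illustrative only; the 32-bit forms rely on plain C shifts instead):
     _mm_slli_pi16(_mm_set1_pi16(1), 16) == _mm_setzero_si64()
*/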

/* Creates a vector of two 32-bit values; I0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi32(int __i1, int __i0) {
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
  __m64_union __res;

  __res.as_short[0] = __w0;
  __res.as_short[1] = __w1;
  __res.as_short[2] = __w2;
  __res.as_short[3] = __w3;
  return (__res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
                char __b2, char __b1, char __b0) {
  __m64_union __res;

  __res.as_char[0] = __b0;
  __res.as_char[1] = __b1;
  __res.as_char[2] = __b2;
  __res.as_char[3] = __b3;
  __res.as_char[4] = __b4;
  __res.as_char[5] = __b5;
  __res.as_char[6] = __b6;
  __res.as_char[7] = __b7;
  return (__res.as_m64);
}

/* Similar, but with the arguments in reverse order. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi32(int __i0, int __i1) {
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
  return _mm_set_pi16(__w3, __w2, __w1, __w0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
                 char __b5, char __b6, char __b7) {
  return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}
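
/* Argument-order sketch (illustrative only): _mm_set_* takes the most
   significant element first and _mm_setr_* the least significant first,
   so
     _mm_set_pi32(0x22222222, 0x11111111) ==
         _mm_setr_pi32(0x11111111, 0x22222222)
   and both leave 0x11111111 in the low 32 bits.
*/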
1392 | |||
1393 | /* Creates a vector of two 32-bit values, both elements containing I. */ |
||
1394 | extern __inline __m64 |
||
1395 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1396 | _mm_set1_pi32(int __i) { |
||
1397 | __m64_union __res; |
||
1398 | |||
1399 | __res.as_int[0] = __i; |
||
1400 | __res.as_int[1] = __i; |
||
1401 | return (__res.as_m64); |
||
1402 | } |
||
1403 | |||
1404 | /* Creates a vector of four 16-bit values, all elements containing W. */ |
||
1405 | extern __inline __m64 |
||
1406 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1407 | _mm_set1_pi16(short __w) { |
||
1408 | #if _ARCH_PWR9 |
||
1409 | __vector signed short w; |
||
1410 | |||
1411 | w = (__vector signed short)vec_splats(__w); |
||
1412 | return (__m64)((__vector long long)w)[0]; |
||
1413 | #else |
||
1414 | __m64_union __res; |
||
1415 | |||
1416 | __res.as_short[0] = __w; |
||
1417 | __res.as_short[1] = __w; |
||
1418 | __res.as_short[2] = __w; |
||
1419 | __res.as_short[3] = __w; |
||
1420 | return (__res.as_m64); |
||
1421 | #endif |
||
1422 | } |
||
1423 | |||
1424 | /* Creates a vector of eight 8-bit values, all elements containing B. */ |
||
1425 | extern __inline __m64 |
||
1426 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1427 | _mm_set1_pi8(signed char __b) { |
||
1428 | #if _ARCH_PWR8 |
||
1429 | __vector signed char __res; |
||
1430 | |||
1431 | __res = (__vector signed char)vec_splats(__b); |
||
1432 | return (__m64)((__vector long long)__res)[0]; |
||
1433 | #else |
||
1434 | __m64_union __res; |
||
1435 | |||
1436 | __res.as_char[0] = __b; |
||
1437 | __res.as_char[1] = __b; |
||
1438 | __res.as_char[2] = __b; |
||
1439 | __res.as_char[3] = __b; |
||
1440 | __res.as_char[4] = __b; |
||
1441 | __res.as_char[5] = __b; |
||
1442 | __res.as_char[6] = __b; |
||
1443 | __res.as_char[7] = __b; |
||
1444 | return (__res.as_m64); |
||
1445 | #endif |
||
1446 | } |
||
1447 | |||
1448 | #else |
||
1449 | #include_next <mmintrin.h> |
||
1450 | #endif /* defined(__powerpc64__) && \ |
||
1451 | * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */ |
||
1452 | |||
1453 | #endif /* _MMINTRIN_H_INCLUDED */ |