core/stdarch/crates/core_arch/src/x86/sse2.rs

//! Streaming SIMD Extensions 2 (SSE2)

#[cfg(test)]
use stdarch_test::assert_instr;

use crate::{
    core_arch::{simd::*, x86::*},
    intrinsics::simd::*,
    intrinsics::sqrtf64,
    mem, ptr,
};

/// Provides a hint to the processor that the code sequence is a spin-wait loop.
///
/// This can help improve the performance and power consumption of spin-wait
/// loops.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause)
#[inline]
#[cfg_attr(all(test, target_feature = "sse2"), assert_instr(pause))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_pause() {
    // note: `pause` is guaranteed to be interpreted as a `nop` by CPUs without
    // the SSE2 target-feature - therefore it does not require any target features
    pause()
}

/// Invalidates and flushes the cache line that contains `p` from all levels of
/// the cache hierarchy.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(clflush))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_clflush(p: *const u8) {
    clflush(p)
}

/// Performs a serializing operation on all load-from-memory instructions
/// that were issued prior to this instruction.
///
/// Guarantees that every load instruction that precedes, in program order, the
/// load fence instruction is globally visible before any load instruction
/// which follows the fence in program order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(lfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_lfence() {
    lfence()
}

/// Performs a serializing operation on all load-from-memory and store-to-memory
/// instructions that were issued prior to this instruction.
///
/// Guarantees that every memory access that precedes, in program order, the
/// memory fence instruction is globally visible before any memory instruction
/// which follows the fence in program order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(mfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mfence() {
    mfence()
}

/// Adds packed 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i8x16(), b.as_i8x16())) }
}

/// Adds packed 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i16x8(), b.as_i16x8())) }
}

/// Adds packed 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i32x4(), b.as_i32x4())) }
}

/// Adds packed 64-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_add(a.as_i64x2(), b.as_i64x2())) }
}

/// Adds packed 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8)
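///
/// # Examples
///
/// A minimal usage sketch, not taken from the original documentation; it
/// assumes the `std::arch::x86_64` re-export and runtime feature detection:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("sse2") {
///         // SAFETY: SSE2 support was just verified at runtime.
///         unsafe {
///             let a = _mm_set1_epi8(120);
///             let b = _mm_set1_epi8(120);
///             // 120 + 120 = 240 saturates to i8::MAX = 127 in every lane.
///             let r = _mm_adds_epi8(a, b);
///             let mut out = [0i8; 16];
///             _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
///             assert!(out.iter().all(|&x| x == 127));
///         }
///     }
/// }
/// ```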
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_i8x16(), b.as_i8x16())) }
}

/// Adds packed 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_i16x8(), b.as_i16x8())) }
}

/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_u8x16(), b.as_u8x16())) }
}

/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_add(a.as_u16x8(), b.as_u16x8())) }
}

/// Averages packed unsigned 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8)
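///
/// # Examples
///
/// A short sketch of the rounding behavior, `(a + b + 1) >> 1` per lane
/// (illustrative, not from the original docs):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("sse2") {
///         // SAFETY: SSE2 support was just verified at runtime.
///         unsafe {
///             let a = _mm_set1_epi8(1);
///             let b = _mm_set1_epi8(2);
///             // (1 + 2 + 1) >> 1 = 2: the average rounds upwards.
///             let r = _mm_avg_epu8(a, b);
///             let mut out = [0u8; 16];
///             _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
///             assert!(out.iter().all(|&x| x == 2));
///         }
///     }
/// }
/// ```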
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pavgb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, u16x16>(a.as_u8x16());
        let b = simd_cast::<_, u16x16>(b.as_u8x16());
        let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1));
        transmute(simd_cast::<_, u8x16>(r))
    }
}

/// Averages packed unsigned 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pavgw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, u32x8>(a.as_u16x8());
        let b = simd_cast::<_, u32x8>(b.as_u16x8());
        let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1));
        transmute(simd_cast::<_, u16x8>(r))
    }
}

/// Multiplies and then horizontally adds signed 16-bit integers in `a` and `b`.
///
/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Horizontally adds adjacent pairs of
/// intermediate 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16)
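///
/// # Examples
///
/// A worked sketch of the pairwise multiply-add (illustrative, not from the
/// original docs):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("sse2") {
///         // SAFETY: SSE2 support was just verified at runtime.
///         unsafe {
///             let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
///             let b = _mm_setr_epi16(10, 20, 30, 40, 50, 60, 70, 80);
///             // Lane 0 of the result is 1*10 + 2*20 = 50, and so on per pair.
///             let r = _mm_madd_epi16(a, b);
///             let mut out = [0i32; 4];
///             _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
///             assert_eq!(out, [50, 250, 610, 1130]);
///         }
///     }
/// }
/// ```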
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaddwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) }
}

/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
/// maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaxsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i16x8();
        let b = b.as_i16x8();
        transmute(simd_select::<i16x8, _>(simd_gt(a, b), a, b))
    }
}

/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
/// packed maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaxub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u8x16();
        let b = b.as_u8x16();
        transmute(simd_select::<i8x16, _>(simd_gt(a, b), a, b))
    }
}

/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
/// minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pminsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i16x8();
        let b = b.as_i16x8();
        transmute(simd_select::<i16x8, _>(simd_lt(a, b), a, b))
    }
}

/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the
/// packed minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pminub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u8x16();
        let b = b.as_u8x16();
        transmute(simd_select::<i8x16, _>(simd_lt(a, b), a, b))
    }
}

/// Multiplies the packed 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// high 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmulhw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, i32x8>(a.as_i16x8());
        let b = simd_cast::<_, i32x8>(b.as_i16x8());
        let r = simd_shr(simd_mul(a, b), i32x8::splat(16));
        transmute(simd_cast::<i32x8, i16x8>(r))
    }
}

/// Multiplies the packed unsigned 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// high 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmulhuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, u32x8>(a.as_u16x8());
        let b = simd_cast::<_, u32x8>(b.as_u16x8());
        let r = simd_shr(simd_mul(a, b), u32x8::splat(16));
        transmute(simd_cast::<u32x8, u16x8>(r))
    }
}

/// Multiplies the packed 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// low 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmullw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) }
}

/// Multiplies the low unsigned 32-bit integers from each packed 64-bit element
/// in `a` and `b`.
///
/// Returns the unsigned 64-bit results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32)
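///
/// # Examples
///
/// A small sketch showing that only the low 32 bits of each 64-bit lane
/// participate (illustrative, not from the original docs):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("sse2") {
///         // SAFETY: SSE2 support was just verified at runtime.
///         unsafe {
///             // Odd lanes (-1 here) are the high halves and are ignored.
///             let a = _mm_setr_epi32(5, -1, 7, -1);
///             let b = _mm_setr_epi32(6, -1, 8, -1);
///             let r = _mm_mul_epu32(a, b);
///             let mut out = [0u64; 2];
///             _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
///             assert_eq!(out, [5 * 6, 7 * 8]);
///         }
///     }
/// }
/// ```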
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmuludq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u64x2();
        let b = b.as_u64x2();
        let mask = u64x2::splat(u32::MAX.into());
        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
    }
}

/// Sums the absolute differences of packed unsigned 8-bit integers.
///
/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
/// and `b`, then horizontally sums each consecutive 8 differences to produce
/// two unsigned 16-bit integers, and packs these unsigned 16-bit integers in
/// the low 16 bits of the two returned 64-bit elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8)
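///
/// # Examples
///
/// A minimal sketch (illustrative, not from the original docs):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("sse2") {
///         // SAFETY: SSE2 support was just verified at runtime.
///         unsafe {
///             let a = _mm_set1_epi8(2);
///             let b = _mm_set1_epi8(5);
///             // Each half sums eight |2 - 5| = 3 differences into 24.
///             let r = _mm_sad_epu8(a, b);
///             let mut out = [0u64; 2];
///             _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
///             assert_eq!(out, [24, 24]);
///         }
///     }
/// }
/// ```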
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psadbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(psadbw(a.as_u8x16(), b.as_u8x16())) }
}

/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) }
}

/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) }
}

/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i32x4(), b.as_i32x4())) }
}

/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_sub(a.as_i64x2(), b.as_i64x2())) }
}

/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_i8x16(), b.as_i8x16())) }
}

/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_i16x8(), b.as_i16x8())) }
}

/// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
/// integers in `a` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_u8x16(), b.as_u8x16())) }
}

/// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned 16-bit
/// integers in `a` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_saturating_sub(a.as_u16x8(), b.as_u16x8())) }
}

/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128)
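///
/// # Examples
///
/// A byte-granularity shift, sketched on recognizable lane values
/// (illustrative, not from the original docs):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("sse2") {
///         // SAFETY: SSE2 support was just verified at runtime.
///         unsafe {
///             let a = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
///             // Shift the whole vector up by 4 bytes; zeros fill the low end.
///             let r = _mm_slli_si128::<4>(a);
///             let mut out = [0i8; 16];
///             _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
///             assert_eq!(out, [0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]);
///         }
///     }
/// }
/// ```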
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_slli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { _mm_slli_si128_impl::<IMM8>(a) }
}

/// Implementation detail: converts the immediate argument of the
/// `_mm_slli_si128` intrinsic into a compile-time constant.
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn _mm_slli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
    const fn mask(shift: i32, i: u32) -> u32 {
        let shift = shift as u32 & 0xff;
        if shift > 15 { i } else { 16 - shift + i }
    }
    transmute::<i8x16, _>(simd_shuffle!(
        i8x16::ZERO,
        a.as_i8x16(),
        [
            mask(IMM8, 0),
            mask(IMM8, 1),
            mask(IMM8, 2),
            mask(IMM8, 3),
            mask(IMM8, 4),
            mask(IMM8, 5),
            mask(IMM8, 6),
            mask(IMM8, 7),
            mask(IMM8, 8),
            mask(IMM8, 9),
            mask(IMM8, 10),
            mask(IMM8, 11),
            mask(IMM8, 12),
            mask(IMM8, 13),
            mask(IMM8, 14),
            mask(IMM8, 15),
        ],
    ))
}

/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_bslli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        _mm_slli_si128_impl::<IMM8>(a)
    }
}

/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_bsrli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        _mm_srli_si128_impl::<IMM8>(a)
    }
}

/// Shifts packed 16-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllw, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_slli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 16 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
        }
    }
}

/// Shifts packed 16-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psllw(a.as_i16x8(), count.as_i16x8())) }
}

/// Shifts packed 32-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslld, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_slli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 32 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shl(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
        }
    }
}

/// Shifts packed 32-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(pslld(a.as_i32x4(), count.as_i32x4())) }
}

/// Shifts packed 64-bit integers in `a` left by `IMM8` while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllq, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_slli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 64 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
        }
    }
}

/// Shifts packed 64-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psllq(a.as_i64x2(), count.as_i64x2())) }
}

/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16)
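///
/// # Examples
///
/// A short sketch of the sign-preserving shift (illustrative, not from the
/// original docs):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("sse2") {
///         // SAFETY: SSE2 support was just verified at runtime.
///         unsafe {
///             let a = _mm_set1_epi16(-16);
///             // Sign bits shift in from the left: -16 >> 2 = -4 in each lane.
///             let r = _mm_srai_epi16::<2>(a);
///             let mut out = [0i16; 8];
///             _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
///             assert!(out.iter().all(|&x| x == -4));
///         }
///     }
/// }
/// ```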
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psraw, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srai_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { transmute(simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16))) }
}

/// Shifts packed 16-bit integers in `a` right by `count` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psraw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psraw(a.as_i16x8(), count.as_i16x8())) }
}

/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrad, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srai_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { transmute(simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31)))) }
}

/// Shifts packed 32-bit integers in `a` right by `count` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrad))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrad(a.as_i32x4(), count.as_i32x4())) }
}

/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { _mm_srli_si128_impl::<IMM8>(a) }
}

/// Implementation detail: converts the immediate argument of the
/// `_mm_srli_si128` intrinsic into a compile-time constant.
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn _mm_srli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
    const fn mask(shift: i32, i: u32) -> u32 {
        if (shift as u32) > 15 {
            i + 16
        } else {
            i + (shift as u32)
        }
    }
    let x: i8x16 = simd_shuffle!(
        a.as_i8x16(),
        i8x16::ZERO,
        [
            mask(IMM8, 0),
            mask(IMM8, 1),
            mask(IMM8, 2),
            mask(IMM8, 3),
            mask(IMM8, 4),
            mask(IMM8, 5),
            mask(IMM8, 6),
            mask(IMM8, 7),
            mask(IMM8, 8),
            mask(IMM8, 9),
            mask(IMM8, 10),
            mask(IMM8, 11),
            mask(IMM8, 12),
            mask(IMM8, 13),
            mask(IMM8, 14),
            mask(IMM8, 15),
        ],
    );
    transmute(x)
}

/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlw, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 16 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shr(a.as_u16x8(), u16x8::splat(IMM8 as u16)))
        }
    }
}

/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrlw(a.as_i16x8(), count.as_i16x8())) }
}

/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrld, IMM8 = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 32 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shr(a.as_u32x4(), u32x4::splat(IMM8 as u32)))
        }
    }
}

/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrld(a.as_i32x4(), count.as_i32x4())) }
}

/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 64 {
            _mm_setzero_si128()
        } else {
            transmute(simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64)))
        }
    }
}

/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i {
    unsafe { transmute(psrlq(a.as_i64x2(), count.as_i64x2())) }
}

/// Computes the bitwise AND of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_and(a, b) }
}

/// Computes the bitwise NOT of 128 bits (representing integer data) in `a` and
/// then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_and(simd_xor(_mm_set1_epi8(-1), a), b) }
}

/// Computes the bitwise OR of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(orps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_or(a, b) }
}

/// Computes the bitwise XOR of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
    unsafe { simd_xor(a, b) }
}

/// Compares packed 8-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_eq(a.as_i8x16(), b.as_i8x16())) }
}

/// Compares packed 16-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_eq(a.as_i16x8(), b.as_i16x8())) }
}

/// Compares packed 32-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_eq(a.as_i32x4(), b.as_i32x4())) }
}

/// Compares packed 8-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_gt(a.as_i8x16(), b.as_i8x16())) }
}

/// Compares packed 16-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_gt(a.as_i16x8(), b.as_i16x8())) }
}

/// Compares packed 32-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_gt(a.as_i32x4(), b.as_i32x4())) }
}

/// Compares packed 8-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i8x16, _>(simd_lt(a.as_i8x16(), b.as_i8x16())) }
}

/// Compares packed 16-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i16x8, _>(simd_lt(a.as_i16x8(), b.as_i16x8())) }
}

/// Compares packed 32-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute::<i32x4, _>(simd_lt(a.as_i32x4(), b.as_i32x4())) }
}

/// Converts the lower two packed 32-bit integers in `a` to packed
/// double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtdq2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi32_pd(a: __m128i) -> __m128d {
    unsafe {
        let a = a.as_i32x4();
        simd_cast::<i32x2, __m128d>(simd_shuffle!(a, a, [0, 1]))
    }
}

/// Returns `a` with its lower element replaced by `b` after converting it to
/// an `f64`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsi2sd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d {
    unsafe { simd_insert!(a, 0, b as f64) }
}

/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi32_ps(a: __m128i) -> __m128 {
    unsafe { transmute(simd_cast::<_, f32x4>(a.as_i32x4())) }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtps_epi32(a: __m128) -> __m128i {
    unsafe { transmute(cvtps2dq(a)) }
}

/// Returns a vector whose lowest element is `a` and all higher elements are
/// `0`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsi32_si128(a: i32) -> __m128i {
    unsafe { transmute(i32x4::new(a, 0, 0, 0)) }
}

/// Returns the lowest element of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsi128_si32(a: __m128i) -> i32 {
    unsafe { simd_extract!(a.as_i32x4(), 0) }
}

/// Sets packed 64-bit integers with the supplied values, from highest to
/// lowest.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i {
    unsafe { transmute(i64x2::new(e0, e1)) }
}

/// Sets packed 32-bit integers with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
    unsafe { transmute(i32x4::new(e0, e1, e2, e3)) }
}

/// Sets packed 16-bit integers with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_epi16(
    e7: i16,
    e6: i16,
    e5: i16,
    e4: i16,
    e3: i16,
    e2: i16,
    e1: i16,
    e0: i16,
) -> __m128i {
    unsafe { transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) }
}

/// Sets packed 8-bit integers with the supplied values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_epi8(
    e15: i8,
    e14: i8,
    e13: i8,
    e12: i8,
    e11: i8,
    e10: i8,
    e9: i8,
    e8: i8,
    e7: i8,
    e6: i8,
    e5: i8,
    e4: i8,
    e3: i8,
    e2: i8,
    e1: i8,
    e0: i8,
) -> __m128i {
    unsafe {
        #[rustfmt::skip]
        transmute(i8x16::new(
            e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
        ))
    }
}

/// Broadcasts 64-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set1_epi64x(a: i64) -> __m128i {
    _mm_set_epi64x(a, a)
}

/// Broadcasts 32-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set1_epi32(a: i32) -> __m128i {
    _mm_set_epi32(a, a, a, a)
}

/// Broadcasts 16-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set1_epi16(a: i16) -> __m128i {
    _mm_set_epi16(a, a, a, a, a, a, a, a)
}

/// Broadcasts 8-bit integer `a` to all elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set1_epi8(a: i8) -> __m128i {
    _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
}

/// Sets packed 32-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32)
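///
/// # Examples
///
/// A small sketch of the argument order (illustrative, not from the original
/// docs): `_mm_setr_epi32` lists lanes in memory order, `_mm_set_epi32` lists
/// them highest lane first.
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("sse2") {
///         // SAFETY: SSE2 support was just verified at runtime.
///         unsafe {
///             let r = _mm_setr_epi32(1, 2, 3, 4);
///             let s = _mm_set_epi32(4, 3, 2, 1);
///             let mut out = [0i32; 4];
///             _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
///             assert_eq!(out, [1, 2, 3, 4]);
///             // The reversed argument order produces the same lanes.
///             _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, s);
///             assert_eq!(out, [1, 2, 3, 4]);
///         }
///     }
/// }
/// ```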
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
    _mm_set_epi32(e0, e1, e2, e3)
}

/// Sets packed 16-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setr_epi16(
    e7: i16,
    e6: i16,
    e5: i16,
    e4: i16,
    e3: i16,
    e2: i16,
    e1: i16,
    e0: i16,
) -> __m128i {
    _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7)
}

/// Sets packed 8-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setr_epi8(
    e15: i8,
    e14: i8,
    e13: i8,
    e12: i8,
    e11: i8,
    e10: i8,
    e9: i8,
    e8: i8,
    e7: i8,
    e6: i8,
    e5: i8,
    e4: i8,
    e3: i8,
    e2: i8,
    e1: i8,
    e0: i8,
) -> __m128i {
    #[rustfmt::skip]
    _mm_set_epi8(
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    )
}

/// Returns a vector with all elements set to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setzero_si128() -> __m128i {
    const { unsafe { mem::zeroed() } }
}

/// Loads a 64-bit integer from memory into the first element of the returned
/// vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i {
    _mm_set_epi64x(0, ptr::read_unaligned(mem_addr as *const i64))
}

/// Loads 128 bits of integer data from memory into a new vector.
///
/// `mem_addr` must be aligned on a 16-byte boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i {
    *mem_addr
}

/// Loads 128 bits of integer data from memory into a new vector.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
    let mut dst: __m128i = _mm_undefined_si128();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m128i>(),
    );
    dst
}

/// Conditionally stores 8-bit integer elements from `a` into memory using
/// `mask`.
///
/// Elements are not stored when the highest bit is not set in the
/// corresponding element of `mask`.
///
/// `mem_addr` should correspond to a 128-bit memory location and does not need
/// to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(maskmovdqu))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) {
    maskmovdqu(a.as_i8x16(), mask.as_i8x16(), mem_addr)
}

/// Stores 128 bits of integer data from `a` into memory.
///
/// `mem_addr` must be aligned on a 16-byte boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) {
    *mem_addr = a;
}

/// Stores 128 bits of integer data from `a` into memory.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))] // FIXME movdqu expected
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
    mem_addr.write_unaligned(a);
}

/// Stores the lower 64-bit integer `a` to a memory location.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
    ptr::copy_nonoverlapping(ptr::addr_of!(a) as *const u8, mem_addr as *mut u8, 8);
}

/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
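///
/// # Examples
///
/// A minimal sketch of the store-then-fence pattern described above; the
/// 16-byte-aligned wrapper type exists only for this example:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("sse2") {
///         #[repr(align(16))]
///         struct Aligned([i32; 4]);
///         let mut buf = Aligned([0; 4]);
///         // SAFETY: SSE2 was verified and `buf` is 16-byte aligned.
///         unsafe {
///             _mm_stream_si128(buf.0.as_mut_ptr() as *mut __m128i, _mm_set1_epi32(7));
///             // Required before the stored memory is accessed again.
///             _mm_sfence();
///         }
///         assert_eq!(buf.0, [7; 4]);
///     }
/// }
/// ```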
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movntdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
    crate::arch::asm!(
        vps!("movntdq", ",{a}"),
        p = in(reg) mem_addr,
        a = in(xmm_reg) a,
        options(nostack, preserves_flags),
    );
}

/// Stores a 32-bit integer value in the specified memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movnti))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
    crate::arch::asm!(
        vps!("movnti", ",{a:e}"), // `:e` for 32bit value
        p = in(reg) mem_addr,
        a = in(reg) a,
        options(nostack, preserves_flags),
    );
}

/// Returns a vector where the low element is extracted from `a` and its upper
/// element is zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64)
#[inline]
#[target_feature(enable = "sse2")]
// FIXME movd on msvc, movd on i686
#[cfg_attr(
    all(test, not(target_env = "msvc"), target_arch = "x86_64"),
    assert_instr(movq)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_move_epi64(a: __m128i) -> __m128i {
    unsafe {
        let r: i64x2 = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 2]);
        transmute(r)
    }
}

/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using signed saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16)
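///
/// # Examples
///
/// A short sketch of the saturating narrowing (illustrative, not from the
/// original docs):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("sse2") {
///         // SAFETY: SSE2 support was just verified at runtime.
///         unsafe {
///             let a = _mm_setr_epi16(0, 1, -1, 300, -300, 127, -128, 128);
///             let b = _mm_set1_epi16(0);
///             // Out-of-range lanes saturate: 300 -> 127, -300 -> -128.
///             let r = _mm_packs_epi16(a, b);
///             let mut out = [0i8; 16];
///             _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
///             assert_eq!(&out[..8], &[0, 1, -1, 127, -128, 127, -128, 127]);
///         }
///     }
/// }
/// ```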
1404#[inline]
1405#[target_feature(enable = "sse2")]
1406#[cfg_attr(test, assert_instr(packsswb))]
1407#[stable(feature = "simd_x86", since = "1.27.0")]
1408pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i {
1409    unsafe { transmute(packsswb(a.as_i16x8(), b.as_i16x8())) }
1410}
1411
1412/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
1413/// using signed saturation.
1414///
1415/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32)
1416#[inline]
1417#[target_feature(enable = "sse2")]
1418#[cfg_attr(test, assert_instr(packssdw))]
1419#[stable(feature = "simd_x86", since = "1.27.0")]
1420pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i {
1421    unsafe { transmute(packssdw(a.as_i32x4(), b.as_i32x4())) }
1422}
1423
1424/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
1425/// using unsigned saturation.
1426///
1427/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16)
1428#[inline]
1429#[target_feature(enable = "sse2")]
1430#[cfg_attr(test, assert_instr(packuswb))]
1431#[stable(feature = "simd_x86", since = "1.27.0")]
1432pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i {
1433    unsafe { transmute(packuswb(a.as_i16x8(), b.as_i16x8())) }
1434}
1435
1436/// Returns the element of `a` specified by `IMM8`.
1437///
1438/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16)
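///
/// # Examples
///
/// A small sketch, assuming an x86_64 target; the selected 16-bit lane is
/// zero-extended into the `i32` result:
///
/// ```
/// # #[cfg(target_arch = "x86_64")] {
/// use std::arch::x86_64::*;
///
/// let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, -1);
/// assert_eq!(_mm_extract_epi16::<7>(a), 0xFFFF); // -1 zero-extends as a u16
/// # }
/// ```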
1439#[inline]
1440#[target_feature(enable = "sse2")]
1441#[cfg_attr(test, assert_instr(pextrw, IMM8 = 7))]
1442#[rustc_legacy_const_generics(1)]
1443#[stable(feature = "simd_x86", since = "1.27.0")]
1444pub fn _mm_extract_epi16<const IMM8: i32>(a: __m128i) -> i32 {
1445    static_assert_uimm_bits!(IMM8, 3);
1446    unsafe { simd_extract!(a.as_u16x8(), IMM8 as u32, u16) as i32 }
1447}
1448
1449/// Returns a new vector where the element of `a` selected by `IMM8` is replaced with `i`.
1450///
1451/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16)
1452#[inline]
1453#[target_feature(enable = "sse2")]
1454#[cfg_attr(test, assert_instr(pinsrw, IMM8 = 7))]
1455#[rustc_legacy_const_generics(2)]
1456#[stable(feature = "simd_x86", since = "1.27.0")]
1457pub fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
1458    static_assert_uimm_bits!(IMM8, 3);
1459    unsafe { transmute(simd_insert!(a.as_i16x8(), IMM8 as u32, i as i16)) }
1460}
1461
1462/// Returns a mask of the most significant bit of each element in `a`.
1463///
1464/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8)
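///
/// # Examples
///
/// An illustrative sketch, assuming an x86_64 target; bit `i` of the result
/// is the sign bit of byte lane `i`:
///
/// ```
/// # #[cfg(target_arch = "x86_64")] {
/// use std::arch::x86_64::*;
///
/// let a = _mm_setr_epi8(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -128);
/// assert_eq!(_mm_movemask_epi8(a), 0b1000_0000_0000_0001);
/// # }
/// ```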
1465#[inline]
1466#[target_feature(enable = "sse2")]
1467#[cfg_attr(test, assert_instr(pmovmskb))]
1468#[stable(feature = "simd_x86", since = "1.27.0")]
1469pub fn _mm_movemask_epi8(a: __m128i) -> i32 {
1470    unsafe {
1471        let z = i8x16::ZERO;
1472        let m: i8x16 = simd_lt(a.as_i8x16(), z);
1473        simd_bitmask::<_, u16>(m) as u32 as i32
1474    }
1475}
1476
1477/// Shuffles 32-bit integers in `a` using the control in `IMM8`.
1478///
1479/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32)
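///
/// # Examples
///
/// A sketch of how the control byte is decoded, assuming an x86_64 target.
/// Each 2-bit field of `IMM8`, starting from the least significant bits,
/// selects the source lane for the corresponding result lane:
///
/// ```
/// # #[cfg(target_arch = "x86_64")] {
/// use std::arch::x86_64::*;
///
/// let a = _mm_setr_epi32(10, 20, 30, 40);
/// // 0b00_01_10_11 picks lanes [3, 2, 1, 0]: a full reversal.
/// let r = _mm_shuffle_epi32::<0b00_01_10_11>(a);
/// let expected = _mm_setr_epi32(40, 30, 20, 10);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi32(r, expected)), 0xFFFF);
/// # }
/// ```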
1480#[inline]
1481#[target_feature(enable = "sse2")]
1482#[cfg_attr(test, assert_instr(pshufd, IMM8 = 9))]
1483#[rustc_legacy_const_generics(1)]
1484#[stable(feature = "simd_x86", since = "1.27.0")]
1485pub fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
1486    static_assert_uimm_bits!(IMM8, 8);
1487    unsafe {
1488        let a = a.as_i32x4();
1489        let x: i32x4 = simd_shuffle!(
1490            a,
1491            a,
1492            [
1493                IMM8 as u32 & 0b11,
1494                (IMM8 as u32 >> 2) & 0b11,
1495                (IMM8 as u32 >> 4) & 0b11,
1496                (IMM8 as u32 >> 6) & 0b11,
1497            ],
1498        );
1499        transmute(x)
1500    }
1501}
1502
1503/// Shuffles 16-bit integers in the high 64 bits of `a` using the control in
1504/// `IMM8`.
1505///
1506/// Puts the results in the high 64 bits of the returned vector, with the low 64
1507/// bits being copied from `a`.
1508///
1509/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16)
1510#[inline]
1511#[target_feature(enable = "sse2")]
1512#[cfg_attr(test, assert_instr(pshufhw, IMM8 = 9))]
1513#[rustc_legacy_const_generics(1)]
1514#[stable(feature = "simd_x86", since = "1.27.0")]
1515pub fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
1516    static_assert_uimm_bits!(IMM8, 8);
1517    unsafe {
1518        let a = a.as_i16x8();
1519        let x: i16x8 = simd_shuffle!(
1520            a,
1521            a,
1522            [
1523                0,
1524                1,
1525                2,
1526                3,
1527                (IMM8 as u32 & 0b11) + 4,
1528                ((IMM8 as u32 >> 2) & 0b11) + 4,
1529                ((IMM8 as u32 >> 4) & 0b11) + 4,
1530                ((IMM8 as u32 >> 6) & 0b11) + 4,
1531            ],
1532        );
1533        transmute(x)
1534    }
1535}
1536
1537/// Shuffles 16-bit integers in the low 64 bits of `a` using the control in
1538/// `IMM8`.
1539///
1540/// Puts the results in the low 64 bits of the returned vector, with the high 64
1541/// bits being copied from `a`.
1542///
1543/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16)
1544#[inline]
1545#[target_feature(enable = "sse2")]
1546#[cfg_attr(test, assert_instr(pshuflw, IMM8 = 9))]
1547#[rustc_legacy_const_generics(1)]
1548#[stable(feature = "simd_x86", since = "1.27.0")]
1549pub fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
1550    static_assert_uimm_bits!(IMM8, 8);
1551    unsafe {
1552        let a = a.as_i16x8();
1553        let x: i16x8 = simd_shuffle!(
1554            a,
1555            a,
1556            [
1557                IMM8 as u32 & 0b11,
1558                (IMM8 as u32 >> 2) & 0b11,
1559                (IMM8 as u32 >> 4) & 0b11,
1560                (IMM8 as u32 >> 6) & 0b11,
1561                4,
1562                5,
1563                6,
1564                7,
1565            ],
1566        );
1567        transmute(x)
1568    }
1569}
1570
1571/// Unpacks and interleaves 8-bit integers from the high half of `a` and `b`.
1572///
1573/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8)
1574#[inline]
1575#[target_feature(enable = "sse2")]
1576#[cfg_attr(test, assert_instr(punpckhbw))]
1577#[stable(feature = "simd_x86", since = "1.27.0")]
1578pub fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
1579    unsafe {
1580        transmute::<i8x16, _>(simd_shuffle!(
1581            a.as_i8x16(),
1582            b.as_i8x16(),
1583            [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
1584        ))
1585    }
1586}
1587
1588/// Unpacks and interleaves 16-bit integers from the high half of `a` and `b`.
1589///
1590/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16)
1591#[inline]
1592#[target_feature(enable = "sse2")]
1593#[cfg_attr(test, assert_instr(punpckhwd))]
1594#[stable(feature = "simd_x86", since = "1.27.0")]
1595pub fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
1596    unsafe {
1597        let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]);
1598        transmute::<i16x8, _>(x)
1599    }
1600}
1601
1602/// Unpacks and interleaves 32-bit integers from the high half of `a` and `b`.
1603///
1604/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32)
1605#[inline]
1606#[target_feature(enable = "sse2")]
1607#[cfg_attr(test, assert_instr(unpckhps))]
1608#[stable(feature = "simd_x86", since = "1.27.0")]
1609pub fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
1610    unsafe { transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7])) }
1611}
1612
1613/// Unpacks and interleaves 64-bit integers from the high half of `a` and `b`.
1614///
1615/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64)
1616#[inline]
1617#[target_feature(enable = "sse2")]
1618#[cfg_attr(test, assert_instr(unpckhpd))]
1619#[stable(feature = "simd_x86", since = "1.27.0")]
1620pub fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
1621    unsafe { transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [1, 3])) }
1622}
1623
1624/// Unpacks and interleaves 8-bit integers from the low half of `a` and `b`.
1625///
1626/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8)
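///
/// # Examples
///
/// A small sketch of the interleaving, assuming an x86_64 target:
///
/// ```
/// # #[cfg(target_arch = "x86_64")] {
/// use std::arch::x86_64::*;
///
/// let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
/// let b = _mm_set1_epi8(-1);
/// // The low halves interleave as a0, b0, a1, b1, ...
/// let r = _mm_unpacklo_epi8(a, b);
/// let expected = _mm_setr_epi8(0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi8(r, expected)), 0xFFFF);
/// # }
/// ```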
1627#[inline]
1628#[target_feature(enable = "sse2")]
1629#[cfg_attr(test, assert_instr(punpcklbw))]
1630#[stable(feature = "simd_x86", since = "1.27.0")]
1631pub fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
1632    unsafe {
1633        transmute::<i8x16, _>(simd_shuffle!(
1634            a.as_i8x16(),
1635            b.as_i8x16(),
1636            [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
1637        ))
1638    }
1639}
1640
1641/// Unpacks and interleaves 16-bit integers from the low half of `a` and `b`.
1642///
1643/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16)
1644#[inline]
1645#[target_feature(enable = "sse2")]
1646#[cfg_attr(test, assert_instr(punpcklwd))]
1647#[stable(feature = "simd_x86", since = "1.27.0")]
1648pub fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
1649    unsafe {
1650        let x = simd_shuffle!(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
1651        transmute::<i16x8, _>(x)
1652    }
1653}
1654
1655/// Unpacks and interleaves 32-bit integers from the low half of `a` and `b`.
1656///
1657/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32)
1658#[inline]
1659#[target_feature(enable = "sse2")]
1660#[cfg_attr(test, assert_instr(unpcklps))]
1661#[stable(feature = "simd_x86", since = "1.27.0")]
1662pub fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
1663    unsafe { transmute::<i32x4, _>(simd_shuffle!(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5])) }
1664}
1665
1666/// Unpacks and interleaves 64-bit integers from the low half of `a` and `b`.
1667///
1668/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64)
1669#[inline]
1670#[target_feature(enable = "sse2")]
1671#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(movlhps))]
1672#[stable(feature = "simd_x86", since = "1.27.0")]
1673pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i {
1674    unsafe { transmute::<i64x2, _>(simd_shuffle!(a.as_i64x2(), b.as_i64x2(), [0, 2])) }
1675}
1676
1677/// Returns a new vector with the low element of `a` replaced by the sum of the
1678/// low elements of `a` and `b`.
1679///
1680/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd)
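///
/// # Examples
///
/// A minimal sketch, assuming an x86_64 target; only the low lane is
/// computed, while the high lane is carried over from `a`:
///
/// ```
/// # #[cfg(target_arch = "x86_64")] {
/// use std::arch::x86_64::*;
///
/// let a = _mm_setr_pd(1.0, 10.0);
/// let b = _mm_setr_pd(2.0, 20.0);
/// let r = _mm_add_sd(a, b);
/// assert_eq!(_mm_cvtsd_f64(r), 3.0); // low lane: 1.0 + 2.0
/// let mut hi = 0.0;
/// unsafe { _mm_storeh_pd(&mut hi, r) };
/// assert_eq!(hi, 10.0); // high lane: copied from `a`
/// # }
/// ```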
1681#[inline]
1682#[target_feature(enable = "sse2")]
1683#[cfg_attr(test, assert_instr(addsd))]
1684#[stable(feature = "simd_x86", since = "1.27.0")]
1685pub fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d {
1686    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b)) }
1687}
1688
1689/// Adds packed double-precision (64-bit) floating-point elements in `a` and
1690/// `b`.
1691///
1692/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd)
1693#[inline]
1694#[target_feature(enable = "sse2")]
1695#[cfg_attr(test, assert_instr(addpd))]
1696#[stable(feature = "simd_x86", since = "1.27.0")]
1697pub fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d {
1698    unsafe { simd_add(a, b) }
1699}
1700
1701/// Returns a new vector with the low element of `a` replaced by the result of
1702/// dividing the lower element of `a` by the lower element of `b`.
1703///
1704/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd)
1705#[inline]
1706#[target_feature(enable = "sse2")]
1707#[cfg_attr(test, assert_instr(divsd))]
1708#[stable(feature = "simd_x86", since = "1.27.0")]
1709pub fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d {
1710    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b)) }
1711}
1712
1713/// Divides packed double-precision (64-bit) floating-point elements in `a` by
1714/// packed elements in `b`.
1715///
1716/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd)
1717#[inline]
1718#[target_feature(enable = "sse2")]
1719#[cfg_attr(test, assert_instr(divpd))]
1720#[stable(feature = "simd_x86", since = "1.27.0")]
1721pub fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d {
1722    unsafe { simd_div(a, b) }
1723}
1724
1725/// Returns a new vector with the low element of `a` replaced by the maximum
1726/// of the lower elements of `a` and `b`.
1727///
1728/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd)
1729#[inline]
1730#[target_feature(enable = "sse2")]
1731#[cfg_attr(test, assert_instr(maxsd))]
1732#[stable(feature = "simd_x86", since = "1.27.0")]
1733pub fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d {
1734    unsafe { maxsd(a, b) }
1735}
1736
1737/// Returns a new vector with the maximum values from corresponding elements in
1738/// `a` and `b`.
1739///
1740/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd)
1741#[inline]
1742#[target_feature(enable = "sse2")]
1743#[cfg_attr(test, assert_instr(maxpd))]
1744#[stable(feature = "simd_x86", since = "1.27.0")]
1745pub fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d {
1746    unsafe { maxpd(a, b) }
1747}
1748
1749/// Returns a new vector with the low element of `a` replaced by the minimum
1750/// of the lower elements of `a` and `b`.
1751///
1752/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd)
1753#[inline]
1754#[target_feature(enable = "sse2")]
1755#[cfg_attr(test, assert_instr(minsd))]
1756#[stable(feature = "simd_x86", since = "1.27.0")]
1757pub fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d {
1758    unsafe { minsd(a, b) }
1759}
1760
1761/// Returns a new vector with the minimum values from corresponding elements in
1762/// `a` and `b`.
1763///
1764/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd)
1765#[inline]
1766#[target_feature(enable = "sse2")]
1767#[cfg_attr(test, assert_instr(minpd))]
1768#[stable(feature = "simd_x86", since = "1.27.0")]
1769pub fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d {
1770    unsafe { minpd(a, b) }
1771}
1772
1773/// Returns a new vector with the low element of `a` replaced by multiplying the
1774/// low elements of `a` and `b`.
1775///
1776/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_sd)
1777#[inline]
1778#[target_feature(enable = "sse2")]
1779#[cfg_attr(test, assert_instr(mulsd))]
1780#[stable(feature = "simd_x86", since = "1.27.0")]
1781pub fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d {
1782    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b)) }
1783}
1784
1785/// Multiplies packed double-precision (64-bit) floating-point elements in `a`
1786/// and `b`.
1787///
1788/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd)
1789#[inline]
1790#[target_feature(enable = "sse2")]
1791#[cfg_attr(test, assert_instr(mulpd))]
1792#[stable(feature = "simd_x86", since = "1.27.0")]
1793pub fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d {
1794    unsafe { simd_mul(a, b) }
1795}
1796
1797/// Returns a new vector with the low element of `a` replaced by the square
1798/// root of the lower element of `b`.
1799///
1800/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd)
1801#[inline]
1802#[target_feature(enable = "sse2")]
1803#[cfg_attr(test, assert_instr(sqrtsd))]
1804#[stable(feature = "simd_x86", since = "1.27.0")]
1805pub fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d {
1806    unsafe { simd_insert!(a, 0, sqrtf64(_mm_cvtsd_f64(b))) }
1807}
1808
1809/// Returns a new vector with the square root of each of the values in `a`.
1810///
1811/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd)
1812#[inline]
1813#[target_feature(enable = "sse2")]
1814#[cfg_attr(test, assert_instr(sqrtpd))]
1815#[stable(feature = "simd_x86", since = "1.27.0")]
1816pub fn _mm_sqrt_pd(a: __m128d) -> __m128d {
1817    unsafe { simd_fsqrt(a) }
1818}
1819
1820/// Returns a new vector with the low element of `a` replaced by subtracting
1821/// the low element of `b` from the low element of `a`.
1822///
1823/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd)
1824#[inline]
1825#[target_feature(enable = "sse2")]
1826#[cfg_attr(test, assert_instr(subsd))]
1827#[stable(feature = "simd_x86", since = "1.27.0")]
1828pub fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d {
1829    unsafe { simd_insert!(a, 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b)) }
1830}
1831
1832/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
1833/// from `a`.
1834///
1835/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_pd)
1836#[inline]
1837#[target_feature(enable = "sse2")]
1838#[cfg_attr(test, assert_instr(subpd))]
1839#[stable(feature = "simd_x86", since = "1.27.0")]
1840pub fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d {
1841    unsafe { simd_sub(a, b) }
1842}
1843
1844/// Computes the bitwise AND of packed double-precision (64-bit) floating-point
1845/// elements in `a` and `b`.
1846///
1847/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd)
1848#[inline]
1849#[target_feature(enable = "sse2")]
1850#[cfg_attr(test, assert_instr(andps))]
1851#[stable(feature = "simd_x86", since = "1.27.0")]
1852pub fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d {
1853    unsafe {
1854        let a: __m128i = transmute(a);
1855        let b: __m128i = transmute(b);
1856        transmute(_mm_and_si128(a, b))
1857    }
1858}
1859
1860/// Computes the bitwise NOT of `a` and then AND with `b`.
1861///
1862/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd)
1863#[inline]
1864#[target_feature(enable = "sse2")]
1865#[cfg_attr(test, assert_instr(andnps))]
1866#[stable(feature = "simd_x86", since = "1.27.0")]
1867pub fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d {
1868    unsafe {
1869        let a: __m128i = transmute(a);
1870        let b: __m128i = transmute(b);
1871        transmute(_mm_andnot_si128(a, b))
1872    }
1873}
1874
1875/// Computes the bitwise OR of `a` and `b`.
1876///
1877/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_pd)
1878#[inline]
1879#[target_feature(enable = "sse2")]
1880#[cfg_attr(test, assert_instr(orps))]
1881#[stable(feature = "simd_x86", since = "1.27.0")]
1882pub fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d {
1883    unsafe {
1884        let a: __m128i = transmute(a);
1885        let b: __m128i = transmute(b);
1886        transmute(_mm_or_si128(a, b))
1887    }
1888}
1889
1890/// Computes the bitwise XOR of `a` and `b`.
1891///
1892/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd)
1893#[inline]
1894#[target_feature(enable = "sse2")]
1895#[cfg_attr(test, assert_instr(xorps))]
1896#[stable(feature = "simd_x86", since = "1.27.0")]
1897pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d {
1898    unsafe {
1899        let a: __m128i = transmute(a);
1900        let b: __m128i = transmute(b);
1901        transmute(_mm_xor_si128(a, b))
1902    }
1903}
1904
1905/// Returns a new vector with the low element of `a` replaced by the equality
1906/// comparison of the lower elements of `a` and `b`.
1907///
1908/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd)
1909#[inline]
1910#[target_feature(enable = "sse2")]
1911#[cfg_attr(test, assert_instr(cmpeqsd))]
1912#[stable(feature = "simd_x86", since = "1.27.0")]
1913pub fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d {
1914    unsafe { cmpsd(a, b, 0) }
1915}
1916
1917/// Returns a new vector with the low element of `a` replaced by the less-than
1918/// comparison of the lower elements of `a` and `b`.
1919///
1920/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd)
1921#[inline]
1922#[target_feature(enable = "sse2")]
1923#[cfg_attr(test, assert_instr(cmpltsd))]
1924#[stable(feature = "simd_x86", since = "1.27.0")]
1925pub fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d {
1926    unsafe { cmpsd(a, b, 1) }
1927}
1928
1929/// Returns a new vector with the low element of `a` replaced by the
1930/// less-than-or-equal comparison of the lower elements of `a` and `b`.
1931///
1932/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd)
1933#[inline]
1934#[target_feature(enable = "sse2")]
1935#[cfg_attr(test, assert_instr(cmplesd))]
1936#[stable(feature = "simd_x86", since = "1.27.0")]
1937pub fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d {
1938    unsafe { cmpsd(a, b, 2) }
1939}
1940
1941/// Returns a new vector with the low element of `a` replaced by the
1942/// greater-than comparison of the lower elements of `a` and `b`.
1943///
1944/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd)
1945#[inline]
1946#[target_feature(enable = "sse2")]
1947#[cfg_attr(test, assert_instr(cmpltsd))]
1948#[stable(feature = "simd_x86", since = "1.27.0")]
1949pub fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d {
1950    unsafe { simd_insert!(_mm_cmplt_sd(b, a), 1, simd_extract!(a, 1, f64)) }
1951}
1952
1953/// Returns a new vector with the low element of `a` replaced by the
1954/// greater-than-or-equal comparison of the lower elements of `a` and `b`.
1955///
1956/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd)
1957#[inline]
1958#[target_feature(enable = "sse2")]
1959#[cfg_attr(test, assert_instr(cmplesd))]
1960#[stable(feature = "simd_x86", since = "1.27.0")]
1961pub fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d {
1962    unsafe { simd_insert!(_mm_cmple_sd(b, a), 1, simd_extract!(a, 1, f64)) }
1963}
1964
1965/// Returns a new vector with the low element of `a` replaced by the result
1966/// of checking whether the lower elements of `a` and `b` are both non-`NaN`
1967/// (an "ordered" comparison). If neither is `NaN` then `0xFFFFFFFFFFFFFFFF`
1968/// is used, and `0` otherwise.
1969///
1970/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd)
1971#[inline]
1972#[target_feature(enable = "sse2")]
1973#[cfg_attr(test, assert_instr(cmpordsd))]
1974#[stable(feature = "simd_x86", since = "1.27.0")]
1975pub fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d {
1976    unsafe { cmpsd(a, b, 7) }
1977}
1978
1979/// Returns a new vector with the low element of `a` replaced by the result of
1980/// checking whether either of the lower elements of `a` and `b` is `NaN`. If
1981/// either is `NaN` then `0xFFFFFFFFFFFFFFFF` is used, and `0` otherwise.
1982///
1983/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd)
1984#[inline]
1985#[target_feature(enable = "sse2")]
1986#[cfg_attr(test, assert_instr(cmpunordsd))]
1987#[stable(feature = "simd_x86", since = "1.27.0")]
1988pub fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d {
1989    unsafe { cmpsd(a, b, 3) }
1990}
1991
1992/// Returns a new vector with the low element of `a` replaced by the not-equal
1993/// comparison of the lower elements of `a` and `b`.
1994///
1995/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd)
1996#[inline]
1997#[target_feature(enable = "sse2")]
1998#[cfg_attr(test, assert_instr(cmpneqsd))]
1999#[stable(feature = "simd_x86", since = "1.27.0")]
2000pub fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d {
2001    unsafe { cmpsd(a, b, 4) }
2002}
2003
2004/// Returns a new vector with the low element of `a` replaced by the
2005/// not-less-than comparison of the lower elements of `a` and `b`.
2006///
2007/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd)
2008#[inline]
2009#[target_feature(enable = "sse2")]
2010#[cfg_attr(test, assert_instr(cmpnltsd))]
2011#[stable(feature = "simd_x86", since = "1.27.0")]
2012pub fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d {
2013    unsafe { cmpsd(a, b, 5) }
2014}
2015
2016/// Returns a new vector with the low element of `a` replaced by the
2017/// not-less-than-or-equal comparison of the lower elements of `a` and `b`.
2018///
2019/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd)
2020#[inline]
2021#[target_feature(enable = "sse2")]
2022#[cfg_attr(test, assert_instr(cmpnlesd))]
2023#[stable(feature = "simd_x86", since = "1.27.0")]
2024pub fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d {
2025    unsafe { cmpsd(a, b, 6) }
2026}
2027
2028/// Returns a new vector with the low element of `a` replaced by the
2029/// not-greater-than comparison of the lower elements of `a` and `b`.
2030///
2031/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd)
2032#[inline]
2033#[target_feature(enable = "sse2")]
2034#[cfg_attr(test, assert_instr(cmpnltsd))]
2035#[stable(feature = "simd_x86", since = "1.27.0")]
2036pub fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d {
2037    unsafe { simd_insert!(_mm_cmpnlt_sd(b, a), 1, simd_extract!(a, 1, f64)) }
2038}
2039
2040/// Returns a new vector with the low element of `a` replaced by the
2041/// not-greater-than-or-equal comparison of the lower elements of `a` and `b`.
2042///
2043/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd)
2044#[inline]
2045#[target_feature(enable = "sse2")]
2046#[cfg_attr(test, assert_instr(cmpnlesd))]
2047#[stable(feature = "simd_x86", since = "1.27.0")]
2048pub fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d {
2049    unsafe { simd_insert!(_mm_cmpnle_sd(b, a), 1, simd_extract!(a, 1, f64)) }
2050}
2051
2052/// Compares corresponding elements in `a` and `b` for equality.
2053///
2054/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd)
2055#[inline]
2056#[target_feature(enable = "sse2")]
2057#[cfg_attr(test, assert_instr(cmpeqpd))]
2058#[stable(feature = "simd_x86", since = "1.27.0")]
2059pub fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d {
2060    unsafe { cmppd(a, b, 0) }
2061}
2062
2063/// Compares corresponding elements in `a` and `b` for less-than.
2064///
2065/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd)
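///
/// # Examples
///
/// A sketch of the all-ones/all-zeros mask convention, assuming an x86_64
/// target; `_mm_movemask_pd` condenses the per-lane masks into bits:
///
/// ```
/// # #[cfg(target_arch = "x86_64")] {
/// use std::arch::x86_64::*;
///
/// let a = _mm_setr_pd(1.0, 5.0);
/// let b = _mm_setr_pd(2.0, 4.0);
/// let r = _mm_cmplt_pd(a, b);
/// assert_eq!(_mm_movemask_pd(r), 0b01); // only the low lane has a < b
/// # }
/// ```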
2066#[inline]
2067#[target_feature(enable = "sse2")]
2068#[cfg_attr(test, assert_instr(cmpltpd))]
2069#[stable(feature = "simd_x86", since = "1.27.0")]
2070pub fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d {
2071    unsafe { cmppd(a, b, 1) }
2072}
2073
2074/// Compares corresponding elements in `a` and `b` for less-than-or-equal.
2075///
2076/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd)
2077#[inline]
2078#[target_feature(enable = "sse2")]
2079#[cfg_attr(test, assert_instr(cmplepd))]
2080#[stable(feature = "simd_x86", since = "1.27.0")]
2081pub fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d {
2082    unsafe { cmppd(a, b, 2) }
2083}
2084
2085/// Compares corresponding elements in `a` and `b` for greater-than.
2086///
2087/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd)
2088#[inline]
2089#[target_feature(enable = "sse2")]
2090#[cfg_attr(test, assert_instr(cmpltpd))]
2091#[stable(feature = "simd_x86", since = "1.27.0")]
2092pub fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d {
2093    _mm_cmplt_pd(b, a)
2094}
2095
2096/// Compares corresponding elements in `a` and `b` for greater-than-or-equal.
2097///
2098/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd)
2099#[inline]
2100#[target_feature(enable = "sse2")]
2101#[cfg_attr(test, assert_instr(cmplepd))]
2102#[stable(feature = "simd_x86", since = "1.27.0")]
2103pub fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d {
2104    _mm_cmple_pd(b, a)
2105}
2106
2107/// Compares corresponding elements in `a` and `b` to see if neither is `NaN`.
2108///
2109/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd)
2110#[inline]
2111#[target_feature(enable = "sse2")]
2112#[cfg_attr(test, assert_instr(cmpordpd))]
2113#[stable(feature = "simd_x86", since = "1.27.0")]
2114pub fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d {
2115    unsafe { cmppd(a, b, 7) }
2116}
2117
2118/// Compares corresponding elements in `a` and `b` to see if either is `NaN`.
2119///
2120/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd)
2121#[inline]
2122#[target_feature(enable = "sse2")]
2123#[cfg_attr(test, assert_instr(cmpunordpd))]
2124#[stable(feature = "simd_x86", since = "1.27.0")]
2125pub fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d {
2126    unsafe { cmppd(a, b, 3) }
2127}
2128
2129/// Compares corresponding elements in `a` and `b` for not-equal.
2130///
2131/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd)
2132#[inline]
2133#[target_feature(enable = "sse2")]
2134#[cfg_attr(test, assert_instr(cmpneqpd))]
2135#[stable(feature = "simd_x86", since = "1.27.0")]
2136pub fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d {
2137    unsafe { cmppd(a, b, 4) }
2138}
2139
2140/// Compares corresponding elements in `a` and `b` for not-less-than.
2141///
2142/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd)
2143#[inline]
2144#[target_feature(enable = "sse2")]
2145#[cfg_attr(test, assert_instr(cmpnltpd))]
2146#[stable(feature = "simd_x86", since = "1.27.0")]
2147pub fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d {
2148    unsafe { cmppd(a, b, 5) }
2149}
2150
2151/// Compares corresponding elements in `a` and `b` for not-less-than-or-equal.
2152///
2153/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd)
2154#[inline]
2155#[target_feature(enable = "sse2")]
2156#[cfg_attr(test, assert_instr(cmpnlepd))]
2157#[stable(feature = "simd_x86", since = "1.27.0")]
2158pub fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d {
2159    unsafe { cmppd(a, b, 6) }
2160}
2161
2162/// Compares corresponding elements in `a` and `b` for not-greater-than.
2163///
2164/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd)
2165#[inline]
2166#[target_feature(enable = "sse2")]
2167#[cfg_attr(test, assert_instr(cmpnltpd))]
2168#[stable(feature = "simd_x86", since = "1.27.0")]
2169pub fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d {
2170    _mm_cmpnlt_pd(b, a)
2171}
2172
2173/// Compares corresponding elements in `a` and `b` for
2174/// not-greater-than-or-equal.
2175///
2176/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd)
2177#[inline]
2178#[target_feature(enable = "sse2")]
2179#[cfg_attr(test, assert_instr(cmpnlepd))]
2180#[stable(feature = "simd_x86", since = "1.27.0")]
2181pub fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d {
2182    _mm_cmpnle_pd(b, a)
2183}
2184
2185/// Compares the lower element of `a` and `b` for equality.
2186///
2187/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd)
2188#[inline]
2189#[target_feature(enable = "sse2")]
2190#[cfg_attr(test, assert_instr(comisd))]
2191#[stable(feature = "simd_x86", since = "1.27.0")]
2192pub fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32 {
2193    unsafe { comieqsd(a, b) }
2194}
2195
2196/// Compares the lower element of `a` and `b` for less-than.
2197///
2198/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd)
2199#[inline]
2200#[target_feature(enable = "sse2")]
2201#[cfg_attr(test, assert_instr(comisd))]
2202#[stable(feature = "simd_x86", since = "1.27.0")]
2203pub fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32 {
2204    unsafe { comiltsd(a, b) }
2205}
2206
2207/// Compares the lower element of `a` and `b` for less-than-or-equal.
2208///
2209/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd)
2210#[inline]
2211#[target_feature(enable = "sse2")]
2212#[cfg_attr(test, assert_instr(comisd))]
2213#[stable(feature = "simd_x86", since = "1.27.0")]
2214pub fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32 {
2215    unsafe { comilesd(a, b) }
2216}
2217
2218/// Compares the lower element of `a` and `b` for greater-than.
2219///
2220/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd)
2221#[inline]
2222#[target_feature(enable = "sse2")]
2223#[cfg_attr(test, assert_instr(comisd))]
2224#[stable(feature = "simd_x86", since = "1.27.0")]
2225pub fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32 {
2226    unsafe { comigtsd(a, b) }
2227}
2228
2229/// Compares the lower element of `a` and `b` for greater-than-or-equal.
2230///
2231/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd)
2232#[inline]
2233#[target_feature(enable = "sse2")]
2234#[cfg_attr(test, assert_instr(comisd))]
2235#[stable(feature = "simd_x86", since = "1.27.0")]
2236pub fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32 {
2237    unsafe { comigesd(a, b) }
2238}
2239
2240/// Compares the lower element of `a` and `b` for not-equal.
2241///
2242/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd)
2243#[inline]
2244#[target_feature(enable = "sse2")]
2245#[cfg_attr(test, assert_instr(comisd))]
2246#[stable(feature = "simd_x86", since = "1.27.0")]
2247pub fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32 {
2248    unsafe { comineqsd(a, b) }
2249}
2250
2251/// Compares the lower element of `a` and `b` for equality.
2252///
2253/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_sd)
2254#[inline]
2255#[target_feature(enable = "sse2")]
2256#[cfg_attr(test, assert_instr(ucomisd))]
2257#[stable(feature = "simd_x86", since = "1.27.0")]
2258pub fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32 {
2259    unsafe { ucomieqsd(a, b) }
2260}
2261
2262/// Compares the lower element of `a` and `b` for less-than.
2263///
2264/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_sd)
2265#[inline]
2266#[target_feature(enable = "sse2")]
2267#[cfg_attr(test, assert_instr(ucomisd))]
2268#[stable(feature = "simd_x86", since = "1.27.0")]
2269pub fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32 {
2270    unsafe { ucomiltsd(a, b) }
2271}
2272
2273/// Compares the lower element of `a` and `b` for less-than-or-equal.
2274///
2275/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_sd)
2276#[inline]
2277#[target_feature(enable = "sse2")]
2278#[cfg_attr(test, assert_instr(ucomisd))]
2279#[stable(feature = "simd_x86", since = "1.27.0")]
2280pub fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32 {
2281    unsafe { ucomilesd(a, b) }
2282}
2283
2284/// Compares the lower element of `a` and `b` for greater-than.
2285///
2286/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_sd)
2287#[inline]
2288#[target_feature(enable = "sse2")]
2289#[cfg_attr(test, assert_instr(ucomisd))]
2290#[stable(feature = "simd_x86", since = "1.27.0")]
2291pub fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32 {
2292    unsafe { ucomigtsd(a, b) }
2293}
2294
2295/// Compares the lower element of `a` and `b` for greater-than-or-equal.
2296///
2297/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_sd)
2298#[inline]
2299#[target_feature(enable = "sse2")]
2300#[cfg_attr(test, assert_instr(ucomisd))]
2301#[stable(feature = "simd_x86", since = "1.27.0")]
2302pub fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32 {
2303    unsafe { ucomigesd(a, b) }
2304}
2305
2306/// Compares the lower element of `a` and `b` for not-equal.
2307///
2308/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_sd)
2309#[inline]
2310#[target_feature(enable = "sse2")]
2311#[cfg_attr(test, assert_instr(ucomisd))]
2312#[stable(feature = "simd_x86", since = "1.27.0")]
2313pub fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 {
2314    unsafe { ucomineqsd(a, b) }
2315}
2316
2317/// Converts packed double-precision (64-bit) floating-point elements in `a` to
2318/// packed single-precision (32-bit) floating-point elements.
2319///
2320/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps)
2321#[inline]
2322#[target_feature(enable = "sse2")]
2323#[cfg_attr(test, assert_instr(cvtpd2ps))]
2324#[stable(feature = "simd_x86", since = "1.27.0")]
2325pub fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
2326    unsafe {
2327        let r = simd_cast::<_, f32x2>(a.as_f64x2());
2328        let zero = f32x2::ZERO;
2329        transmute::<f32x4, _>(simd_shuffle!(r, zero, [0, 1, 2, 3]))
2330    }
2331}
2332
2333/// Converts packed single-precision (32-bit) floating-point elements in `a` to
2334/// packed double-precision (64-bit) floating-point elements.
2336///
2337/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd)
2338#[inline]
2339#[target_feature(enable = "sse2")]
2340#[cfg_attr(test, assert_instr(cvtps2pd))]
2341#[stable(feature = "simd_x86", since = "1.27.0")]
2342pub fn _mm_cvtps_pd(a: __m128) -> __m128d {
2343    unsafe {
2344        let a = a.as_f32x4();
2345        transmute(simd_cast::<f32x2, f64x2>(simd_shuffle!(a, a, [0, 1])))
2346    }
2347}
2348
2349/// Converts packed double-precision (64-bit) floating-point elements in `a` to
2350/// packed 32-bit integers.
2351///
2352/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32)
2353#[inline]
2354#[target_feature(enable = "sse2")]
2355#[cfg_attr(test, assert_instr(cvtpd2dq))]
2356#[stable(feature = "simd_x86", since = "1.27.0")]
2357pub fn _mm_cvtpd_epi32(a: __m128d) -> __m128i {
2358    unsafe { transmute(cvtpd2dq(a)) }
2359}
2360
2361/// Converts the lower double-precision (64-bit) floating-point element in `a`
2362/// to a 32-bit integer.
2363///
2364/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32)
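///
/// # Examples
///
/// A small sketch, assuming an x86_64 target running with the default MXCSR
/// rounding mode (round to nearest, ties to even):
///
/// ```
/// # #[cfg(target_arch = "x86_64")] {
/// use std::arch::x86_64::*;
///
/// assert_eq!(_mm_cvtsd_si32(_mm_set_sd(2.5)), 2); // tie rounds to even
/// assert_eq!(_mm_cvtsd_si32(_mm_set_sd(3.5)), 4);
/// # }
/// ```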
2365#[inline]
2366#[target_feature(enable = "sse2")]
2367#[cfg_attr(test, assert_instr(cvtsd2si))]
2368#[stable(feature = "simd_x86", since = "1.27.0")]
2369pub fn _mm_cvtsd_si32(a: __m128d) -> i32 {
2370    unsafe { cvtsd2si(a) }
2371}
2372
2373/// Converts the lower double-precision (64-bit) floating-point element in `b`
2374/// to a single-precision (32-bit) floating-point element, stores the result in
2375/// the lower element of the return value, and copies the upper three elements
2376/// from `a` to the upper elements of the return value.
2377///
2378/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss)
2379#[inline]
2380#[target_feature(enable = "sse2")]
2381#[cfg_attr(test, assert_instr(cvtsd2ss))]
2382#[stable(feature = "simd_x86", since = "1.27.0")]
2383pub fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 {
2384    unsafe { cvtsd2ss(a, b) }
2385}
2386
2387/// Returns the lower double-precision (64-bit) floating-point element of `a`.
2388///
2389/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64)
2390#[inline]
2391#[target_feature(enable = "sse2")]
2392#[stable(feature = "simd_x86", since = "1.27.0")]
2393pub fn _mm_cvtsd_f64(a: __m128d) -> f64 {
2394    unsafe { simd_extract!(a, 0) }
2395}
2396
2397/// Converts the lower single-precision (32-bit) floating-point element in `b`
2398/// to a double-precision (64-bit) floating-point element, stores the result in
2399/// the lower element of the return value, and copies the upper element from
2400/// `a` to the upper element of the return value.
2401///
2402/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd)
2403#[inline]
2404#[target_feature(enable = "sse2")]
2405#[cfg_attr(test, assert_instr(cvtss2sd))]
2406#[stable(feature = "simd_x86", since = "1.27.0")]
2407pub fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d {
2408    unsafe { cvtss2sd(a, b) }
2409}
2410
2411/// Converts packed double-precision (64-bit) floating-point elements in `a` to
2412/// packed 32-bit integers with truncation.
2413///
2414/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32)
2415#[inline]
2416#[target_feature(enable = "sse2")]
2417#[cfg_attr(test, assert_instr(cvttpd2dq))]
2418#[stable(feature = "simd_x86", since = "1.27.0")]
2419pub fn _mm_cvttpd_epi32(a: __m128d) -> __m128i {
2420    unsafe { transmute(cvttpd2dq(a)) }
2421}
2422
2423/// Converts the lower double-precision (64-bit) floating-point element in `a`
2424/// to a 32-bit integer with truncation.
2425///
2426/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32)
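///
/// # Examples
///
/// A small sketch, assuming an x86_64 target; truncation always rounds
/// toward zero, regardless of the MXCSR rounding mode:
///
/// ```
/// # #[cfg(target_arch = "x86_64")] {
/// use std::arch::x86_64::*;
///
/// assert_eq!(_mm_cvttsd_si32(_mm_set_sd(2.9)), 2);
/// assert_eq!(_mm_cvttsd_si32(_mm_set_sd(-2.9)), -2);
/// # }
/// ```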
2427#[inline]
2428#[target_feature(enable = "sse2")]
2429#[cfg_attr(test, assert_instr(cvttsd2si))]
2430#[stable(feature = "simd_x86", since = "1.27.0")]
2431pub fn _mm_cvttsd_si32(a: __m128d) -> i32 {
2432    unsafe { cvttsd2si(a) }
2433}
2434
2435/// Converts packed single-precision (32-bit) floating-point elements in `a` to
2436/// packed 32-bit integers with truncation.
2437///
2438/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32)
2439#[inline]
2440#[target_feature(enable = "sse2")]
2441#[cfg_attr(test, assert_instr(cvttps2dq))]
2442#[stable(feature = "simd_x86", since = "1.27.0")]
2443pub fn _mm_cvttps_epi32(a: __m128) -> __m128i {
2444    unsafe { transmute(cvttps2dq(a)) }
2445}
2446
2447/// Copies double-precision (64-bit) floating-point element `a` to the lower
2448/// element of the return value, and sets the upper element to zero.
2449///
2450/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd)
2451#[inline]
2452#[target_feature(enable = "sse2")]
2453#[stable(feature = "simd_x86", since = "1.27.0")]
2454pub fn _mm_set_sd(a: f64) -> __m128d {
2455    _mm_set_pd(0.0, a)
2456}
2457
2458/// Broadcasts double-precision (64-bit) floating-point value `a` to all elements
2459/// of the return value.
2460///
2461/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd)
2462#[inline]
2463#[target_feature(enable = "sse2")]
2464#[stable(feature = "simd_x86", since = "1.27.0")]
2465pub fn _mm_set1_pd(a: f64) -> __m128d {
2466    _mm_set_pd(a, a)
2467}
2468
2469/// Broadcasts double-precision (64-bit) floating-point value `a` to all elements
2470/// of the return value.
2471///
2472/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1)
2473#[inline]
2474#[target_feature(enable = "sse2")]
2475#[stable(feature = "simd_x86", since = "1.27.0")]
2476pub fn _mm_set_pd1(a: f64) -> __m128d {
2477    _mm_set_pd(a, a)
2478}
2479
2480/// Sets packed double-precision (64-bit) floating-point elements in the return
2481/// value with the supplied values.
2482///
2483/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd)
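///
/// # Examples
///
/// A minimal sketch of the argument order, assuming an x86_64 target; the
/// first argument becomes the high element:
///
/// ```
/// # #[cfg(target_arch = "x86_64")] {
/// use std::arch::x86_64::*;
///
/// let v = _mm_set_pd(2.0, 1.0);
/// assert_eq!(_mm_cvtsd_f64(v), 1.0); // the low element is the second argument
/// # }
/// ```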
2484#[inline]
2485#[target_feature(enable = "sse2")]
2486#[stable(feature = "simd_x86", since = "1.27.0")]
2487pub fn _mm_set_pd(a: f64, b: f64) -> __m128d {
2488    __m128d([b, a])
2489}
2490
2491/// Sets packed double-precision (64-bit) floating-point elements in the return
2492/// value with the supplied values in reverse order.
2493///
2494/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd)
2495#[inline]
2496#[target_feature(enable = "sse2")]
2497#[stable(feature = "simd_x86", since = "1.27.0")]
2498pub fn _mm_setr_pd(a: f64, b: f64) -> __m128d {
2499    _mm_set_pd(b, a)
2500}
2501
2502/// Returns packed double-precision (64-bit) floating-point elements with all
2503/// zeros.
2504///
2505/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd)
2506#[inline]
2507#[target_feature(enable = "sse2")]
2508#[cfg_attr(test, assert_instr(xorp))]
2509#[stable(feature = "simd_x86", since = "1.27.0")]
2510pub fn _mm_setzero_pd() -> __m128d {
2511    const { unsafe { mem::zeroed() } }
2512}
2513
2514/// Returns a mask of the most significant bit of each element in `a`.
2515///
2516/// The mask is stored in the 2 least significant bits of the return value.
2517/// All other bits are set to `0`.
2518///
2519/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd)
2520#[inline]
2521#[target_feature(enable = "sse2")]
2522#[cfg_attr(test, assert_instr(movmskpd))]
2523#[stable(feature = "simd_x86", since = "1.27.0")]
2524pub fn _mm_movemask_pd(a: __m128d) -> i32 {
2525    // Propagate the highest bit to the rest, because simd_bitmask
2526    // requires all-1 or all-0.
2527    unsafe {
2528        let mask: i64x2 = simd_lt(transmute(a), i64x2::ZERO);
2529        simd_bitmask::<i64x2, u8>(mask).into()
2530    }
2531}
2532
2533/// Loads 128 bits (composed of 2 packed double-precision (64-bit)
2534/// floating-point elements) from memory into the returned vector.
2535/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
2536/// exception may be generated.
2537///
2538/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd)
2539#[inline]
2540#[target_feature(enable = "sse2")]
2541#[cfg_attr(test, assert_instr(movaps))]
2542#[stable(feature = "simd_x86", since = "1.27.0")]
2543#[allow(clippy::cast_ptr_alignment)]
2544pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d {
2545    *(mem_addr as *const __m128d)
2546}
2547
2548/// Loads a 64-bit double-precision value to the low element of a
2549/// 128-bit vector of `[2 x double]` and clears the upper element.
2550///
2551/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd)
2552#[inline]
2553#[target_feature(enable = "sse2")]
2554#[cfg_attr(test, assert_instr(movsd))]
2555#[stable(feature = "simd_x86", since = "1.27.0")]
2556pub unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d {
2557    _mm_setr_pd(*mem_addr, 0.)
2558}
2559
2560/// Loads a double-precision value into the high-order bits of a 128-bit
2561/// vector of `[2 x double]`. The low-order bits are copied from the low-order
2562/// bits of the first operand.
2563///
2564/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd)
2565#[inline]
2566#[target_feature(enable = "sse2")]
2567#[cfg_attr(test, assert_instr(movhps))]
2568#[stable(feature = "simd_x86", since = "1.27.0")]
2569pub unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
2570    _mm_setr_pd(simd_extract!(a, 0), *mem_addr)
2571}
2572
2573/// Loads a double-precision value into the low-order bits of a 128-bit
2574/// vector of `[2 x double]`. The high-order bits are copied from the
2575/// high-order bits of the first operand.
2576///
2577/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd)
2578#[inline]
2579#[target_feature(enable = "sse2")]
2580#[cfg_attr(test, assert_instr(movlps))]
2581#[stable(feature = "simd_x86", since = "1.27.0")]
2582pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
2583    _mm_setr_pd(*mem_addr, simd_extract!(a, 1))
2584}
2585
2586/// Stores a 128-bit floating point vector of `[2 x double]` to a 128-bit
2587/// aligned memory location.
2588/// To minimize caching, the data is flagged as non-temporal (unlikely to be
2589/// used again soon).
2590///
2591/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd)
2592///
2593/// # Safety of non-temporal stores
2594///
2595/// After using this intrinsic, but before any other access to the memory that this intrinsic
2596/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
2597/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
2598/// return.
2599///
2600/// See [`_mm_sfence`] for details.
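///
/// # Examples
///
/// A minimal sketch, assuming an x86_64 target; note both the 16-byte
/// alignment of the destination and the trailing `_mm_sfence`:
///
/// ```
/// # #[cfg(target_arch = "x86_64")] {
/// use std::arch::x86_64::*;
///
/// #[repr(align(16))]
/// struct Aligned([f64; 2]);
///
/// let mut out = Aligned([0.0; 2]);
/// unsafe {
///     _mm_stream_pd(out.0.as_mut_ptr(), _mm_setr_pd(1.0, 2.0));
///     _mm_sfence();
/// }
/// assert_eq!(out.0, [1.0, 2.0]);
/// # }
/// ```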
2601#[inline]
2602#[target_feature(enable = "sse2")]
2603#[cfg_attr(test, assert_instr(movntpd))]
2604#[stable(feature = "simd_x86", since = "1.27.0")]
2605#[allow(clippy::cast_ptr_alignment)]
2606pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
2607    crate::arch::asm!(
2608        vps!("movntpd", ",{a}"),
2609        p = in(reg) mem_addr,
2610        a = in(xmm_reg) a,
2611        options(nostack, preserves_flags),
2612    );
2613}
2614
2615/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
2616/// memory location.
2617///
2618/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_sd)
2619#[inline]
2620#[target_feature(enable = "sse2")]
2621#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(movlps))]
2622#[stable(feature = "simd_x86", since = "1.27.0")]
2623pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) {
2624    *mem_addr = simd_extract!(a, 0)
2625}
2626
2627/// Stores 128 bits (composed of 2 packed double-precision (64-bit)
2628/// floating-point elements) from `a` into memory. `mem_addr` must be aligned
2629/// on a 16-byte boundary or a general-protection exception may be generated.
2630///
2631/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd)
2632#[inline]
2633#[target_feature(enable = "sse2")]
2634#[cfg_attr(test, assert_instr(movaps))]
2635#[stable(feature = "simd_x86", since = "1.27.0")]
2636#[allow(clippy::cast_ptr_alignment)]
2637pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) {
2638    *(mem_addr as *mut __m128d) = a;
2639}
2640
2641/// Stores 128 bits (composed of 2 packed double-precision (64-bit)
2642/// floating-point elements) from `a` into memory.
2643/// `mem_addr` does not need to be aligned on any particular boundary.
2644///
2645/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd)
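///
/// # Examples
///
/// A minimal sketch, assuming an x86_64 target; unlike `_mm_store_pd`, the
/// destination needs no particular alignment:
///
/// ```
/// # #[cfg(target_arch = "x86_64")] {
/// use std::arch::x86_64::*;
///
/// let mut buf = [0.0f64; 2];
/// unsafe { _mm_storeu_pd(buf.as_mut_ptr(), _mm_setr_pd(1.0, 2.0)) };
/// assert_eq!(buf, [1.0, 2.0]);
/// # }
/// ```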
2646#[inline]
2647#[target_feature(enable = "sse2")]
2648#[cfg_attr(test, assert_instr(movups))] // FIXME movupd expected
2649#[stable(feature = "simd_x86", since = "1.27.0")]
2650pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) {
2651    mem_addr.cast::<__m128d>().write_unaligned(a);
2652}
2653
2654/// Stores a 16-bit integer from the first element of `a` into memory.
2655///
2656/// `mem_addr` does not need to be aligned on any particular boundary.
2657///
2658/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16)
2659#[inline]
2660#[target_feature(enable = "sse2")]
2661#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2662pub unsafe fn _mm_storeu_si16(mem_addr: *mut u8, a: __m128i) {
2663    ptr::write_unaligned(mem_addr as *mut i16, simd_extract(a.as_i16x8(), 0))
2664}
2665
2666/// Stores a 32-bit integer from the first element of `a` into memory.
2667///
2668/// `mem_addr` does not need to be aligned on any particular boundary.
2669///
2670/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32)
2671#[inline]
2672#[target_feature(enable = "sse2")]
2673#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2674pub unsafe fn _mm_storeu_si32(mem_addr: *mut u8, a: __m128i) {
2675    ptr::write_unaligned(mem_addr as *mut i32, simd_extract(a.as_i32x4(), 0))
2676}
2677
2678/// Stores a 64-bit integer from the first element of `a` into memory.
2679///
2680/// `mem_addr` does not need to be aligned on any particular boundary.
2681///
2682/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64)
2683#[inline]
2684#[target_feature(enable = "sse2")]
2685#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2686pub unsafe fn _mm_storeu_si64(mem_addr: *mut u8, a: __m128i) {
2687    ptr::write_unaligned(mem_addr as *mut i64, simd_extract(a.as_i64x2(), 0))
2688}
2689
2690/// Stores the lower double-precision (64-bit) floating-point element from `a`
2691/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
2692/// 16-byte boundary or a general-protection exception may be generated.
2693///
2694/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_pd)
2695#[inline]
2696#[target_feature(enable = "sse2")]
2697#[stable(feature = "simd_x86", since = "1.27.0")]
2698#[allow(clippy::cast_ptr_alignment)]
2699pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) {
2700    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
2701    *(mem_addr as *mut __m128d) = b;
2702}
2703
2704/// Stores the lower double-precision (64-bit) floating-point element from `a`
2705/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
2706/// 16-byte boundary or a general-protection exception may be generated.
2707///
2708/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1)
2709#[inline]
2710#[target_feature(enable = "sse2")]
2711#[stable(feature = "simd_x86", since = "1.27.0")]
2712#[allow(clippy::cast_ptr_alignment)]
2713pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) {
2714    let b: __m128d = simd_shuffle!(a, a, [0, 0]);
2715    *(mem_addr as *mut __m128d) = b;
2716}
2717
2718/// Stores 2 double-precision (64-bit) floating-point elements from `a` into
2719/// memory in reverse order.
2720/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
2721/// exception may be generated.
2722///
2723/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd)
2724#[inline]
2725#[target_feature(enable = "sse2")]
2726#[stable(feature = "simd_x86", since = "1.27.0")]
2727#[allow(clippy::cast_ptr_alignment)]
2728pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) {
2729    let b: __m128d = simd_shuffle!(a, a, [1, 0]);
2730    *(mem_addr as *mut __m128d) = b;
2731}
2732
2733/// Stores the upper 64 bits of a 128-bit vector of `[2 x double]` to a
2734/// memory location.
2735///
2736/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd)
2737#[inline]
2738#[target_feature(enable = "sse2")]
2739#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(movhps))]
2740#[stable(feature = "simd_x86", since = "1.27.0")]
2741pub unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) {
2742    *mem_addr = simd_extract!(a, 1);
2743}
2744
2745/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
2746/// memory location.
2747///
2748/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd)
2749#[inline]
2750#[target_feature(enable = "sse2")]
2751#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(movlps))]
2752#[stable(feature = "simd_x86", since = "1.27.0")]
2753pub unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) {
2754    *mem_addr = simd_extract!(a, 0);
2755}
2756
/// Loads a double-precision (64-bit) floating-point element from memory
/// into both elements of the returned vector.
2759///
2760/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd)
2761#[inline]
2762#[target_feature(enable = "sse2")]
2763// #[cfg_attr(test, assert_instr(movapd))] // FIXME LLVM uses different codegen
2764#[stable(feature = "simd_x86", since = "1.27.0")]
2765pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d {
2766    let d = *mem_addr;
2767    _mm_setr_pd(d, d)
2768}
2769
/// Loads a double-precision (64-bit) floating-point element from memory
/// into both elements of the returned vector.
2772///
2773/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1)
2774#[inline]
2775#[target_feature(enable = "sse2")]
2776// #[cfg_attr(test, assert_instr(movapd))] // FIXME same as _mm_load1_pd
2777#[stable(feature = "simd_x86", since = "1.27.0")]
2778pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d {
2779    _mm_load1_pd(mem_addr)
2780}
2781
2782/// Loads 2 double-precision (64-bit) floating-point elements from memory into
2783/// the returned vector in reverse order. `mem_addr` must be aligned on a
2784/// 16-byte boundary or a general-protection exception may be generated.
2785///
2786/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd)
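///
/// A short sketch of the reversed load (illustrative, assuming `x86_64`):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     #[repr(align(16))]
///     struct Aligned([f64; 2]);
///     let mem = Aligned([1.0, 2.0]);
///     unsafe {
///         // r = [mem[1], mem[0]]
///         let r = _mm_loadr_pd(mem.0.as_ptr());
///         let mut out = [0.0f64; 2];
///         _mm_storeu_pd(out.as_mut_ptr(), r);
///         assert_eq!(out, [2.0, 1.0]);
///     }
/// }
/// ```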
2787#[inline]
2788#[target_feature(enable = "sse2")]
2789#[cfg_attr(test, assert_instr(movaps))]
2790#[stable(feature = "simd_x86", since = "1.27.0")]
2791pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d {
2792    let a = _mm_load_pd(mem_addr);
2793    simd_shuffle!(a, a, [1, 0])
2794}
2795
2796/// Loads 128-bits (composed of 2 packed double-precision (64-bit)
2797/// floating-point elements) from memory into the returned vector.
2798/// `mem_addr` does not need to be aligned on any particular boundary.
2799///
2800/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd)
2801#[inline]
2802#[target_feature(enable = "sse2")]
2803#[cfg_attr(test, assert_instr(movups))]
2804#[stable(feature = "simd_x86", since = "1.27.0")]
2805pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d {
2806    let mut dst = _mm_undefined_pd();
2807    ptr::copy_nonoverlapping(
2808        mem_addr as *const u8,
2809        ptr::addr_of_mut!(dst) as *mut u8,
2810        mem::size_of::<__m128d>(),
2811    );
2812    dst
2813}
2814
/// Loads unaligned 16 bits of integer data from memory into a new vector.
2816///
2817/// `mem_addr` does not need to be aligned on any particular boundary.
2818///
2819/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16)
2820#[inline]
2821#[target_feature(enable = "sse2")]
2822#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2823pub unsafe fn _mm_loadu_si16(mem_addr: *const u8) -> __m128i {
2824    transmute(i16x8::new(
2825        ptr::read_unaligned(mem_addr as *const i16),
2826        0,
2827        0,
2828        0,
2829        0,
2830        0,
2831        0,
2832        0,
2833    ))
2834}
2835
/// Loads unaligned 32 bits of integer data from memory into a new vector.
2837///
2838/// `mem_addr` does not need to be aligned on any particular boundary.
2839///
2840/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32)
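///
/// For example (an illustrative sketch, assuming `x86_64`), the loaded value
/// fills the low 32 bits and the remaining lanes are zeroed:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     let bytes = 7i32.to_ne_bytes();
///     unsafe {
///         let v = _mm_loadu_si32(bytes.as_ptr());
///         assert_eq!(_mm_cvtsi128_si32(v), 7);
///     }
/// }
/// ```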
2841#[inline]
2842#[target_feature(enable = "sse2")]
2843#[stable(feature = "simd_x86_updates", since = "1.82.0")]
2844pub unsafe fn _mm_loadu_si32(mem_addr: *const u8) -> __m128i {
2845    transmute(i32x4::new(
2846        ptr::read_unaligned(mem_addr as *const i32),
2847        0,
2848        0,
2849        0,
2850    ))
2851}
2852
/// Loads unaligned 64 bits of integer data from memory into a new vector.
2854///
2855/// `mem_addr` does not need to be aligned on any particular boundary.
2856///
2857/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64)
2858#[inline]
2859#[target_feature(enable = "sse2")]
2860#[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
2861pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i {
2862    transmute(i64x2::new(ptr::read_unaligned(mem_addr as *const i64), 0))
2863}
2864
2865/// Constructs a 128-bit floating-point vector of `[2 x double]` from two
2866/// 128-bit vector parameters of `[2 x double]`, using the immediate-value
2867/// parameter as a specifier.
2868///
2869/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd)
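///
/// Bit 0 of `MASK` selects the element taken from `a` and bit 1 the element
/// taken from `b`. A quick sketch (illustrative, assuming `x86_64`):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         let a = _mm_setr_pd(1.0, 2.0);
///         let b = _mm_setr_pd(3.0, 4.0);
///         // MASK = 0b10: take a[0] for the low lane and b[1] for the high lane.
///         let r = _mm_shuffle_pd::<0b10>(a, b);
///         let mut out = [0.0f64; 2];
///         _mm_storeu_pd(out.as_mut_ptr(), r);
///         assert_eq!(out, [1.0, 4.0]);
///     }
/// }
/// ```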
2870#[inline]
2871#[target_feature(enable = "sse2")]
2872#[cfg_attr(test, assert_instr(shufps, MASK = 2))]
2873#[rustc_legacy_const_generics(2)]
2874#[stable(feature = "simd_x86", since = "1.27.0")]
2875pub fn _mm_shuffle_pd<const MASK: i32>(a: __m128d, b: __m128d) -> __m128d {
2876    static_assert_uimm_bits!(MASK, 8);
2877    unsafe { simd_shuffle!(a, b, [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2]) }
2878}
2879
2880/// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower
2881/// 64 bits are set to the lower 64 bits of the second parameter. The upper
2882/// 64 bits are set to the upper 64 bits of the first parameter.
2883///
2884/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd)
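///
/// Equivalently, `r = [b[0], a[1]]`; a tiny sketch (illustrative, assuming
/// `x86_64`):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         let a = _mm_setr_pd(1.0, 2.0);
///         let b = _mm_setr_pd(3.0, 4.0);
///         let r = _mm_move_sd(a, b);
///         let mut out = [0.0f64; 2];
///         _mm_storeu_pd(out.as_mut_ptr(), r);
///         assert_eq!(out, [3.0, 2.0]);
///     }
/// }
/// ```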
2885#[inline]
2886#[target_feature(enable = "sse2")]
2887#[cfg_attr(test, assert_instr(movsd))]
2888#[stable(feature = "simd_x86", since = "1.27.0")]
2889pub fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d {
2890    unsafe { _mm_setr_pd(simd_extract!(b, 0), simd_extract!(a, 1)) }
2891}
2892
2893/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
2894/// floating-point vector of `[4 x float]`.
2895///
2896/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps)
2897#[inline]
2898#[target_feature(enable = "sse2")]
2899#[stable(feature = "simd_x86", since = "1.27.0")]
2900pub fn _mm_castpd_ps(a: __m128d) -> __m128 {
2901    unsafe { transmute(a) }
2902}
2903
2904/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
2905/// integer vector.
2906///
2907/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128)
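///
/// The cast is a bitwise reinterpretation, not a value conversion; e.g.
/// (illustrative sketch, assuming `x86_64`):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         let bits = _mm_castpd_si128(_mm_set1_pd(1.0));
///         // 1.0f64 has the IEEE-754 bit pattern 0x3FF0_0000_0000_0000.
///         assert_eq!(_mm_cvtsi128_si64(bits), 0x3FF0_0000_0000_0000);
///     }
/// }
/// ```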
2908#[inline]
2909#[target_feature(enable = "sse2")]
2910#[stable(feature = "simd_x86", since = "1.27.0")]
2911pub fn _mm_castpd_si128(a: __m128d) -> __m128i {
2912    unsafe { transmute(a) }
2913}
2914
2915/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
2916/// floating-point vector of `[2 x double]`.
2917///
2918/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd)
2919#[inline]
2920#[target_feature(enable = "sse2")]
2921#[stable(feature = "simd_x86", since = "1.27.0")]
2922pub fn _mm_castps_pd(a: __m128) -> __m128d {
2923    unsafe { transmute(a) }
2924}
2925
2926/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
2927/// integer vector.
2928///
2929/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128)
2930#[inline]
2931#[target_feature(enable = "sse2")]
2932#[stable(feature = "simd_x86", since = "1.27.0")]
2933pub fn _mm_castps_si128(a: __m128) -> __m128i {
2934    unsafe { transmute(a) }
2935}
2936
2937/// Casts a 128-bit integer vector into a 128-bit floating-point vector
2938/// of `[2 x double]`.
2939///
2940/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd)
2941#[inline]
2942#[target_feature(enable = "sse2")]
2943#[stable(feature = "simd_x86", since = "1.27.0")]
2944pub fn _mm_castsi128_pd(a: __m128i) -> __m128d {
2945    unsafe { transmute(a) }
2946}
2947
2948/// Casts a 128-bit integer vector into a 128-bit floating-point vector
2949/// of `[4 x float]`.
2950///
2951/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps)
2952#[inline]
2953#[target_feature(enable = "sse2")]
2954#[stable(feature = "simd_x86", since = "1.27.0")]
2955pub fn _mm_castsi128_ps(a: __m128i) -> __m128 {
2956    unsafe { transmute(a) }
2957}
2958
/// Returns a vector of type `__m128d` with indeterminate elements.
2960/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
2961/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
2962/// In practice, this is typically equivalent to [`mem::zeroed`].
2963///
2964/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd)
2965#[inline]
2966#[target_feature(enable = "sse2")]
2967#[stable(feature = "simd_x86", since = "1.27.0")]
2968pub fn _mm_undefined_pd() -> __m128d {
2969    const { unsafe { mem::zeroed() } }
2970}
2971
/// Returns a vector of type `__m128i` with indeterminate elements.
2973/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
2974/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
2975/// In practice, this is typically equivalent to [`mem::zeroed`].
2976///
2977/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128)
2978#[inline]
2979#[target_feature(enable = "sse2")]
2980#[stable(feature = "simd_x86", since = "1.27.0")]
2981pub fn _mm_undefined_si128() -> __m128i {
2982    const { unsafe { mem::zeroed() } }
2983}
2984
/// The resulting `__m128d` element is composed of the high-order values of
/// the two interleaved `__m128d` input elements, i.e.:
2987///
2988/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second input
2989/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first input
2990///
2991/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd)
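///
/// In other words `r = [a[1], b[1]]`; a minimal sketch (illustrative,
/// assuming `x86_64`):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         let r = _mm_unpackhi_pd(_mm_setr_pd(1.0, 2.0), _mm_setr_pd(3.0, 4.0));
///         let mut out = [0.0f64; 2];
///         _mm_storeu_pd(out.as_mut_ptr(), r);
///         assert_eq!(out, [2.0, 4.0]);
///     }
/// }
/// ```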
2992#[inline]
2993#[target_feature(enable = "sse2")]
2994#[cfg_attr(test, assert_instr(unpckhpd))]
2995#[stable(feature = "simd_x86", since = "1.27.0")]
2996pub fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
2997    unsafe { simd_shuffle!(a, b, [1, 3]) }
2998}
2999
/// The resulting `__m128d` element is composed of the low-order values of
/// the two interleaved `__m128d` input elements, i.e.:
3002///
3003/// * The `[127:64]` bits are copied from the `[63:0]` bits of the second input
3004/// * The `[63:0]` bits are copied from the `[63:0]` bits of the first input
3005///
3006/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd)
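///
/// In other words `r = [a[0], b[0]]`; a minimal sketch (illustrative,
/// assuming `x86_64`):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     unsafe {
///         let r = _mm_unpacklo_pd(_mm_setr_pd(1.0, 2.0), _mm_setr_pd(3.0, 4.0));
///         let mut out = [0.0f64; 2];
///         _mm_storeu_pd(out.as_mut_ptr(), r);
///         assert_eq!(out, [1.0, 3.0]);
///     }
/// }
/// ```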
3007#[inline]
3008#[target_feature(enable = "sse2")]
3009#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(movlhps))]
3010#[stable(feature = "simd_x86", since = "1.27.0")]
3011pub fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d {
3012    unsafe { simd_shuffle!(a, b, [0, 2]) }
3013}
3014
3015#[allow(improper_ctypes)]
3016unsafe extern "C" {
3017    #[link_name = "llvm.x86.sse2.pause"]
3018    fn pause();
3019    #[link_name = "llvm.x86.sse2.clflush"]
3020    fn clflush(p: *const u8);
3021    #[link_name = "llvm.x86.sse2.lfence"]
3022    fn lfence();
3023    #[link_name = "llvm.x86.sse2.mfence"]
3024    fn mfence();
3025    #[link_name = "llvm.x86.sse2.pmadd.wd"]
3026    fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
3027    #[link_name = "llvm.x86.sse2.psad.bw"]
3028    fn psadbw(a: u8x16, b: u8x16) -> u64x2;
3029    #[link_name = "llvm.x86.sse2.psll.w"]
3030    fn psllw(a: i16x8, count: i16x8) -> i16x8;
3031    #[link_name = "llvm.x86.sse2.psll.d"]
3032    fn pslld(a: i32x4, count: i32x4) -> i32x4;
3033    #[link_name = "llvm.x86.sse2.psll.q"]
3034    fn psllq(a: i64x2, count: i64x2) -> i64x2;
3035    #[link_name = "llvm.x86.sse2.psra.w"]
3036    fn psraw(a: i16x8, count: i16x8) -> i16x8;
3037    #[link_name = "llvm.x86.sse2.psra.d"]
3038    fn psrad(a: i32x4, count: i32x4) -> i32x4;
3039    #[link_name = "llvm.x86.sse2.psrl.w"]
3040    fn psrlw(a: i16x8, count: i16x8) -> i16x8;
3041    #[link_name = "llvm.x86.sse2.psrl.d"]
3042    fn psrld(a: i32x4, count: i32x4) -> i32x4;
3043    #[link_name = "llvm.x86.sse2.psrl.q"]
3044    fn psrlq(a: i64x2, count: i64x2) -> i64x2;
3045    #[link_name = "llvm.x86.sse2.cvtps2dq"]
3046    fn cvtps2dq(a: __m128) -> i32x4;
3047    #[link_name = "llvm.x86.sse2.maskmov.dqu"]
3048    fn maskmovdqu(a: i8x16, mask: i8x16, mem_addr: *mut i8);
3049    #[link_name = "llvm.x86.sse2.packsswb.128"]
3050    fn packsswb(a: i16x8, b: i16x8) -> i8x16;
3051    #[link_name = "llvm.x86.sse2.packssdw.128"]
3052    fn packssdw(a: i32x4, b: i32x4) -> i16x8;
3053    #[link_name = "llvm.x86.sse2.packuswb.128"]
3054    fn packuswb(a: i16x8, b: i16x8) -> u8x16;
3055    #[link_name = "llvm.x86.sse2.max.sd"]
3056    fn maxsd(a: __m128d, b: __m128d) -> __m128d;
3057    #[link_name = "llvm.x86.sse2.max.pd"]
3058    fn maxpd(a: __m128d, b: __m128d) -> __m128d;
3059    #[link_name = "llvm.x86.sse2.min.sd"]
3060    fn minsd(a: __m128d, b: __m128d) -> __m128d;
3061    #[link_name = "llvm.x86.sse2.min.pd"]
3062    fn minpd(a: __m128d, b: __m128d) -> __m128d;
3063    #[link_name = "llvm.x86.sse2.cmp.sd"]
3064    fn cmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3065    #[link_name = "llvm.x86.sse2.cmp.pd"]
3066    fn cmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3067    #[link_name = "llvm.x86.sse2.comieq.sd"]
3068    fn comieqsd(a: __m128d, b: __m128d) -> i32;
3069    #[link_name = "llvm.x86.sse2.comilt.sd"]
3070    fn comiltsd(a: __m128d, b: __m128d) -> i32;
3071    #[link_name = "llvm.x86.sse2.comile.sd"]
3072    fn comilesd(a: __m128d, b: __m128d) -> i32;
3073    #[link_name = "llvm.x86.sse2.comigt.sd"]
3074    fn comigtsd(a: __m128d, b: __m128d) -> i32;
3075    #[link_name = "llvm.x86.sse2.comige.sd"]
3076    fn comigesd(a: __m128d, b: __m128d) -> i32;
3077    #[link_name = "llvm.x86.sse2.comineq.sd"]
3078    fn comineqsd(a: __m128d, b: __m128d) -> i32;
3079    #[link_name = "llvm.x86.sse2.ucomieq.sd"]
3080    fn ucomieqsd(a: __m128d, b: __m128d) -> i32;
3081    #[link_name = "llvm.x86.sse2.ucomilt.sd"]
3082    fn ucomiltsd(a: __m128d, b: __m128d) -> i32;
3083    #[link_name = "llvm.x86.sse2.ucomile.sd"]
3084    fn ucomilesd(a: __m128d, b: __m128d) -> i32;
3085    #[link_name = "llvm.x86.sse2.ucomigt.sd"]
3086    fn ucomigtsd(a: __m128d, b: __m128d) -> i32;
3087    #[link_name = "llvm.x86.sse2.ucomige.sd"]
3088    fn ucomigesd(a: __m128d, b: __m128d) -> i32;
3089    #[link_name = "llvm.x86.sse2.ucomineq.sd"]
3090    fn ucomineqsd(a: __m128d, b: __m128d) -> i32;
3091    #[link_name = "llvm.x86.sse2.cvtpd2dq"]
3092    fn cvtpd2dq(a: __m128d) -> i32x4;
3093    #[link_name = "llvm.x86.sse2.cvtsd2si"]
3094    fn cvtsd2si(a: __m128d) -> i32;
3095    #[link_name = "llvm.x86.sse2.cvtsd2ss"]
3096    fn cvtsd2ss(a: __m128, b: __m128d) -> __m128;
3097    #[link_name = "llvm.x86.sse2.cvtss2sd"]
3098    fn cvtss2sd(a: __m128d, b: __m128) -> __m128d;
3099    #[link_name = "llvm.x86.sse2.cvttpd2dq"]
3100    fn cvttpd2dq(a: __m128d) -> i32x4;
3101    #[link_name = "llvm.x86.sse2.cvttsd2si"]
3102    fn cvttsd2si(a: __m128d) -> i32;
3103    #[link_name = "llvm.x86.sse2.cvttps2dq"]
3104    fn cvttps2dq(a: __m128) -> i32x4;
3105}
3106
3107#[cfg(test)]
3108mod tests {
3109    use crate::{
3110        core_arch::{simd::*, x86::*},
3111        hint::black_box,
3112    };
3113    use std::{
3114        boxed, f32, f64,
3115        mem::{self, transmute},
3116        ptr,
3117    };
3118    use stdarch_test::simd_test;
3119
3120    const NAN: f64 = f64::NAN;
3121
3122    #[test]
3123    fn test_mm_pause() {
3124        unsafe { _mm_pause() }
3125    }
3126
3127    #[simd_test(enable = "sse2")]
3128    unsafe fn test_mm_clflush() {
3129        let x = 0_u8;
3130        _mm_clflush(ptr::addr_of!(x));
3131    }
3132
3133    #[simd_test(enable = "sse2")]
3134    // Miri cannot support this until it is clear how it fits in the Rust memory model
3135    #[cfg_attr(miri, ignore)]
3136    unsafe fn test_mm_lfence() {
3137        _mm_lfence();
3138    }
3139
3140    #[simd_test(enable = "sse2")]
3141    // Miri cannot support this until it is clear how it fits in the Rust memory model
3142    #[cfg_attr(miri, ignore)]
3143    unsafe fn test_mm_mfence() {
3144        _mm_mfence();
3145    }
3146
3147    #[simd_test(enable = "sse2")]
3148    unsafe fn test_mm_add_epi8() {
3149        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3150        #[rustfmt::skip]
3151        let b = _mm_setr_epi8(
3152            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3153        );
3154        let r = _mm_add_epi8(a, b);
3155        #[rustfmt::skip]
3156        let e = _mm_setr_epi8(
3157            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3158        );
3159        assert_eq_m128i(r, e);
3160    }
3161
3162    #[simd_test(enable = "sse2")]
3163    unsafe fn test_mm_add_epi8_overflow() {
3164        let a = _mm_set1_epi8(0x7F);
3165        let b = _mm_set1_epi8(1);
3166        let r = _mm_add_epi8(a, b);
3167        assert_eq_m128i(r, _mm_set1_epi8(-128));
3168    }
3169
3170    #[simd_test(enable = "sse2")]
3171    unsafe fn test_mm_add_epi16() {
3172        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3173        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3174        let r = _mm_add_epi16(a, b);
3175        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3176        assert_eq_m128i(r, e);
3177    }
3178
3179    #[simd_test(enable = "sse2")]
3180    unsafe fn test_mm_add_epi32() {
3181        let a = _mm_setr_epi32(0, 1, 2, 3);
3182        let b = _mm_setr_epi32(4, 5, 6, 7);
3183        let r = _mm_add_epi32(a, b);
3184        let e = _mm_setr_epi32(4, 6, 8, 10);
3185        assert_eq_m128i(r, e);
3186    }
3187
3188    #[simd_test(enable = "sse2")]
3189    unsafe fn test_mm_add_epi64() {
3190        let a = _mm_setr_epi64x(0, 1);
3191        let b = _mm_setr_epi64x(2, 3);
3192        let r = _mm_add_epi64(a, b);
3193        let e = _mm_setr_epi64x(2, 4);
3194        assert_eq_m128i(r, e);
3195    }
3196
3197    #[simd_test(enable = "sse2")]
3198    unsafe fn test_mm_adds_epi8() {
3199        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3200        #[rustfmt::skip]
3201        let b = _mm_setr_epi8(
3202            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3203        );
3204        let r = _mm_adds_epi8(a, b);
3205        #[rustfmt::skip]
3206        let e = _mm_setr_epi8(
3207            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3208        );
3209        assert_eq_m128i(r, e);
3210    }
3211
3212    #[simd_test(enable = "sse2")]
3213    unsafe fn test_mm_adds_epi8_saturate_positive() {
3214        let a = _mm_set1_epi8(0x7F);
3215        let b = _mm_set1_epi8(1);
3216        let r = _mm_adds_epi8(a, b);
3217        assert_eq_m128i(r, a);
3218    }
3219
3220    #[simd_test(enable = "sse2")]
3221    unsafe fn test_mm_adds_epi8_saturate_negative() {
3222        let a = _mm_set1_epi8(-0x80);
3223        let b = _mm_set1_epi8(-1);
3224        let r = _mm_adds_epi8(a, b);
3225        assert_eq_m128i(r, a);
3226    }
3227
3228    #[simd_test(enable = "sse2")]
3229    unsafe fn test_mm_adds_epi16() {
3230        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3231        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3232        let r = _mm_adds_epi16(a, b);
3233        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3234        assert_eq_m128i(r, e);
3235    }
3236
3237    #[simd_test(enable = "sse2")]
3238    unsafe fn test_mm_adds_epi16_saturate_positive() {
3239        let a = _mm_set1_epi16(0x7FFF);
3240        let b = _mm_set1_epi16(1);
3241        let r = _mm_adds_epi16(a, b);
3242        assert_eq_m128i(r, a);
3243    }
3244
3245    #[simd_test(enable = "sse2")]
3246    unsafe fn test_mm_adds_epi16_saturate_negative() {
3247        let a = _mm_set1_epi16(-0x8000);
3248        let b = _mm_set1_epi16(-1);
3249        let r = _mm_adds_epi16(a, b);
3250        assert_eq_m128i(r, a);
3251    }
3252
3253    #[simd_test(enable = "sse2")]
3254    unsafe fn test_mm_adds_epu8() {
3255        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3256        #[rustfmt::skip]
3257        let b = _mm_setr_epi8(
3258            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3259        );
3260        let r = _mm_adds_epu8(a, b);
3261        #[rustfmt::skip]
3262        let e = _mm_setr_epi8(
3263            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
3264        );
3265        assert_eq_m128i(r, e);
3266    }
3267
3268    #[simd_test(enable = "sse2")]
3269    unsafe fn test_mm_adds_epu8_saturate() {
3270        let a = _mm_set1_epi8(!0);
3271        let b = _mm_set1_epi8(1);
3272        let r = _mm_adds_epu8(a, b);
3273        assert_eq_m128i(r, a);
3274    }
3275
3276    #[simd_test(enable = "sse2")]
3277    unsafe fn test_mm_adds_epu16() {
3278        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3279        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
3280        let r = _mm_adds_epu16(a, b);
3281        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
3282        assert_eq_m128i(r, e);
3283    }
3284
3285    #[simd_test(enable = "sse2")]
3286    unsafe fn test_mm_adds_epu16_saturate() {
3287        let a = _mm_set1_epi16(!0);
3288        let b = _mm_set1_epi16(1);
3289        let r = _mm_adds_epu16(a, b);
3290        assert_eq_m128i(r, a);
3291    }
3292
3293    #[simd_test(enable = "sse2")]
3294    unsafe fn test_mm_avg_epu8() {
3295        let (a, b) = (_mm_set1_epi8(3), _mm_set1_epi8(9));
3296        let r = _mm_avg_epu8(a, b);
3297        assert_eq_m128i(r, _mm_set1_epi8(6));
3298    }
3299
3300    #[simd_test(enable = "sse2")]
3301    unsafe fn test_mm_avg_epu16() {
3302        let (a, b) = (_mm_set1_epi16(3), _mm_set1_epi16(9));
3303        let r = _mm_avg_epu16(a, b);
3304        assert_eq_m128i(r, _mm_set1_epi16(6));
3305    }
3306
3307    #[simd_test(enable = "sse2")]
3308    unsafe fn test_mm_madd_epi16() {
3309        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
3310        let b = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
3311        let r = _mm_madd_epi16(a, b);
3312        let e = _mm_setr_epi32(29, 81, 149, 233);
3313        assert_eq_m128i(r, e);
3314
3315        // Test large values.
3316        // MIN*MIN+MIN*MIN will overflow into i32::MIN.
3317        let a = _mm_setr_epi16(
3318            i16::MAX,
3319            i16::MAX,
3320            i16::MIN,
3321            i16::MIN,
3322            i16::MIN,
3323            i16::MAX,
3324            0,
3325            0,
3326        );
3327        let b = _mm_setr_epi16(
3328            i16::MAX,
3329            i16::MAX,
3330            i16::MIN,
3331            i16::MIN,
3332            i16::MAX,
3333            i16::MIN,
3334            0,
3335            0,
3336        );
3337        let r = _mm_madd_epi16(a, b);
3338        let e = _mm_setr_epi32(0x7FFE0002, i32::MIN, -0x7FFF0000, 0);
3339        assert_eq_m128i(r, e);
3340    }
3341
3342    #[simd_test(enable = "sse2")]
3343    unsafe fn test_mm_max_epi16() {
3344        let a = _mm_set1_epi16(1);
3345        let b = _mm_set1_epi16(-1);
3346        let r = _mm_max_epi16(a, b);
3347        assert_eq_m128i(r, a);
3348    }
3349
3350    #[simd_test(enable = "sse2")]
3351    unsafe fn test_mm_max_epu8() {
3352        let a = _mm_set1_epi8(1);
3353        let b = _mm_set1_epi8(!0);
3354        let r = _mm_max_epu8(a, b);
3355        assert_eq_m128i(r, b);
3356    }
3357
3358    #[simd_test(enable = "sse2")]
3359    unsafe fn test_mm_min_epi16() {
3360        let a = _mm_set1_epi16(1);
3361        let b = _mm_set1_epi16(-1);
3362        let r = _mm_min_epi16(a, b);
3363        assert_eq_m128i(r, b);
3364    }
3365
3366    #[simd_test(enable = "sse2")]
3367    unsafe fn test_mm_min_epu8() {
3368        let a = _mm_set1_epi8(1);
3369        let b = _mm_set1_epi8(!0);
3370        let r = _mm_min_epu8(a, b);
3371        assert_eq_m128i(r, a);
3372    }
3373
3374    #[simd_test(enable = "sse2")]
3375    unsafe fn test_mm_mulhi_epi16() {
3376        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
3377        let r = _mm_mulhi_epi16(a, b);
3378        assert_eq_m128i(r, _mm_set1_epi16(-16));
3379    }
3380
3381    #[simd_test(enable = "sse2")]
3382    unsafe fn test_mm_mulhi_epu16() {
3383        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(1001));
3384        let r = _mm_mulhi_epu16(a, b);
3385        assert_eq_m128i(r, _mm_set1_epi16(15));
3386    }
3387
3388    #[simd_test(enable = "sse2")]
3389    unsafe fn test_mm_mullo_epi16() {
3390        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
3391        let r = _mm_mullo_epi16(a, b);
3392        assert_eq_m128i(r, _mm_set1_epi16(-17960));
3393    }
3394
3395    #[simd_test(enable = "sse2")]
3396    unsafe fn test_mm_mul_epu32() {
3397        let a = _mm_setr_epi64x(1_000_000_000, 1 << 34);
3398        let b = _mm_setr_epi64x(1_000_000_000, 1 << 35);
3399        let r = _mm_mul_epu32(a, b);
3400        let e = _mm_setr_epi64x(1_000_000_000 * 1_000_000_000, 0);
3401        assert_eq_m128i(r, e);
3402    }
3403
3404    #[simd_test(enable = "sse2")]
3405    unsafe fn test_mm_sad_epu8() {
3406        #[rustfmt::skip]
3407        let a = _mm_setr_epi8(
3408            255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
3409            1, 2, 3, 4,
3410            155u8 as i8, 154u8 as i8, 153u8 as i8, 152u8 as i8,
3411            1, 2, 3, 4,
3412        );
3413        let b = _mm_setr_epi8(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2);
3414        let r = _mm_sad_epu8(a, b);
3415        let e = _mm_setr_epi64x(1020, 614);
3416        assert_eq_m128i(r, e);
3417    }
3418
3419    #[simd_test(enable = "sse2")]
3420    unsafe fn test_mm_sub_epi8() {
3421        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(6));
3422        let r = _mm_sub_epi8(a, b);
3423        assert_eq_m128i(r, _mm_set1_epi8(-1));
3424    }
3425
3426    #[simd_test(enable = "sse2")]
3427    unsafe fn test_mm_sub_epi16() {
3428        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(6));
3429        let r = _mm_sub_epi16(a, b);
3430        assert_eq_m128i(r, _mm_set1_epi16(-1));
3431    }
3432
3433    #[simd_test(enable = "sse2")]
3434    unsafe fn test_mm_sub_epi32() {
3435        let (a, b) = (_mm_set1_epi32(5), _mm_set1_epi32(6));
3436        let r = _mm_sub_epi32(a, b);
3437        assert_eq_m128i(r, _mm_set1_epi32(-1));
3438    }
3439
3440    #[simd_test(enable = "sse2")]
3441    unsafe fn test_mm_sub_epi64() {
3442        let (a, b) = (_mm_set1_epi64x(5), _mm_set1_epi64x(6));
3443        let r = _mm_sub_epi64(a, b);
3444        assert_eq_m128i(r, _mm_set1_epi64x(-1));
3445    }
3446
3447    #[simd_test(enable = "sse2")]
3448    unsafe fn test_mm_subs_epi8() {
3449        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
3450        let r = _mm_subs_epi8(a, b);
3451        assert_eq_m128i(r, _mm_set1_epi8(3));
3452    }
3453
3454    #[simd_test(enable = "sse2")]
3455    unsafe fn test_mm_subs_epi8_saturate_positive() {
3456        let a = _mm_set1_epi8(0x7F);
3457        let b = _mm_set1_epi8(-1);
3458        let r = _mm_subs_epi8(a, b);
3459        assert_eq_m128i(r, a);
3460    }
3461
3462    #[simd_test(enable = "sse2")]
3463    unsafe fn test_mm_subs_epi8_saturate_negative() {
3464        let a = _mm_set1_epi8(-0x80);
3465        let b = _mm_set1_epi8(1);
3466        let r = _mm_subs_epi8(a, b);
3467        assert_eq_m128i(r, a);
3468    }
3469
3470    #[simd_test(enable = "sse2")]
3471    unsafe fn test_mm_subs_epi16() {
3472        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
3473        let r = _mm_subs_epi16(a, b);
3474        assert_eq_m128i(r, _mm_set1_epi16(3));
3475    }
3476
3477    #[simd_test(enable = "sse2")]
3478    unsafe fn test_mm_subs_epi16_saturate_positive() {
3479        let a = _mm_set1_epi16(0x7FFF);
3480        let b = _mm_set1_epi16(-1);
3481        let r = _mm_subs_epi16(a, b);
3482        assert_eq_m128i(r, a);
3483    }
3484
3485    #[simd_test(enable = "sse2")]
3486    unsafe fn test_mm_subs_epi16_saturate_negative() {
3487        let a = _mm_set1_epi16(-0x8000);
3488        let b = _mm_set1_epi16(1);
3489        let r = _mm_subs_epi16(a, b);
3490        assert_eq_m128i(r, a);
3491    }
3492
3493    #[simd_test(enable = "sse2")]
3494    unsafe fn test_mm_subs_epu8() {
3495        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
3496        let r = _mm_subs_epu8(a, b);
3497        assert_eq_m128i(r, _mm_set1_epi8(3));
3498    }
3499
3500    #[simd_test(enable = "sse2")]
3501    unsafe fn test_mm_subs_epu8_saturate() {
3502        let a = _mm_set1_epi8(0);
3503        let b = _mm_set1_epi8(1);
3504        let r = _mm_subs_epu8(a, b);
3505        assert_eq_m128i(r, a);
3506    }
3507
3508    #[simd_test(enable = "sse2")]
3509    unsafe fn test_mm_subs_epu16() {
3510        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
3511        let r = _mm_subs_epu16(a, b);
3512        assert_eq_m128i(r, _mm_set1_epi16(3));
3513    }
3514
3515    #[simd_test(enable = "sse2")]
3516    unsafe fn test_mm_subs_epu16_saturate() {
3517        let a = _mm_set1_epi16(0);
3518        let b = _mm_set1_epi16(1);
3519        let r = _mm_subs_epu16(a, b);
3520        assert_eq_m128i(r, a);
3521    }
3522
3523    #[simd_test(enable = "sse2")]
3524    unsafe fn test_mm_slli_si128() {
3525        #[rustfmt::skip]
3526        let a = _mm_setr_epi8(
3527            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3528        );
3529        let r = _mm_slli_si128::<1>(a);
3530        let e = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3531        assert_eq_m128i(r, e);
3532
3533        #[rustfmt::skip]
3534        let a = _mm_setr_epi8(
3535            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3536        );
3537        let r = _mm_slli_si128::<15>(a);
3538        let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
3539        assert_eq_m128i(r, e);
3540
3541        #[rustfmt::skip]
3542        let a = _mm_setr_epi8(
3543            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3544        );
3545        let r = _mm_slli_si128::<16>(a);
3546        assert_eq_m128i(r, _mm_set1_epi8(0));
3547    }
3548
3549    #[simd_test(enable = "sse2")]
3550    unsafe fn test_mm_slli_epi16() {
3551        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3552        let r = _mm_slli_epi16::<4>(a);
3553        assert_eq_m128i(
3554            r,
3555            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
3556        );
3557        let r = _mm_slli_epi16::<16>(a);
3558        assert_eq_m128i(r, _mm_set1_epi16(0));
3559    }
3560
3561    #[simd_test(enable = "sse2")]
3562    unsafe fn test_mm_sll_epi16() {
3563        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3564        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 4));
3565        assert_eq_m128i(
3566            r,
3567            _mm_setr_epi16(0xCC0, -0xCC0, 0xDD0, -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0),
3568        );
3569        let r = _mm_sll_epi16(a, _mm_set_epi64x(4, 0));
3570        assert_eq_m128i(r, a);
3571        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, 16));
3572        assert_eq_m128i(r, _mm_set1_epi16(0));
3573        let r = _mm_sll_epi16(a, _mm_set_epi64x(0, i64::MAX));
3574        assert_eq_m128i(r, _mm_set1_epi16(0));
3575    }
3576
3577    #[simd_test(enable = "sse2")]
3578    unsafe fn test_mm_slli_epi32() {
3579        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3580        let r = _mm_slli_epi32::<4>(a);
3581        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
3582        let r = _mm_slli_epi32::<32>(a);
3583        assert_eq_m128i(r, _mm_set1_epi32(0));
3584    }
3585
3586    #[simd_test(enable = "sse2")]
3587    unsafe fn test_mm_sll_epi32() {
3588        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3589        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 4));
3590        assert_eq_m128i(r, _mm_setr_epi32(0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0));
3591        let r = _mm_sll_epi32(a, _mm_set_epi64x(4, 0));
3592        assert_eq_m128i(r, a);
3593        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, 32));
3594        assert_eq_m128i(r, _mm_set1_epi32(0));
3595        let r = _mm_sll_epi32(a, _mm_set_epi64x(0, i64::MAX));
3596        assert_eq_m128i(r, _mm_set1_epi32(0));
3597    }
3598
3599    #[simd_test(enable = "sse2")]
3600    unsafe fn test_mm_slli_epi64() {
3601        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3602        let r = _mm_slli_epi64::<4>(a);
3603        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
3604        let r = _mm_slli_epi64::<64>(a);
3605        assert_eq_m128i(r, _mm_set1_epi64x(0));
3606    }
3607
3608    #[simd_test(enable = "sse2")]
3609    unsafe fn test_mm_sll_epi64() {
3610        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3611        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 4));
3612        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFFF0, -0xFFFFFFFF0));
3613        let r = _mm_sll_epi64(a, _mm_set_epi64x(4, 0));
3614        assert_eq_m128i(r, a);
3615        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, 64));
3616        assert_eq_m128i(r, _mm_set1_epi64x(0));
3617        let r = _mm_sll_epi64(a, _mm_set_epi64x(0, i64::MAX));
3618        assert_eq_m128i(r, _mm_set1_epi64x(0));
3619    }
3620
3621    #[simd_test(enable = "sse2")]
3622    unsafe fn test_mm_srai_epi16() {
3623        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3624        let r = _mm_srai_epi16::<4>(a);
3625        assert_eq_m128i(
3626            r,
3627            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
3628        );
3629        let r = _mm_srai_epi16::<16>(a);
3630        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3631    }
3632
3633    #[simd_test(enable = "sse2")]
3634    unsafe fn test_mm_sra_epi16() {
3635        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3636        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 4));
3637        assert_eq_m128i(
3638            r,
3639            _mm_setr_epi16(0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, -0x10),
3640        );
3641        let r = _mm_sra_epi16(a, _mm_set_epi64x(4, 0));
3642        assert_eq_m128i(r, a);
3643        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 16));
3644        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3645        let r = _mm_sra_epi16(a, _mm_set_epi64x(0, i64::MAX));
3646        assert_eq_m128i(r, _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1));
3647    }
3648
3649    #[simd_test(enable = "sse2")]
3650    unsafe fn test_mm_srai_epi32() {
3651        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3652        let r = _mm_srai_epi32::<4>(a);
3653        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
3654        let r = _mm_srai_epi32::<32>(a);
3655        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3656    }
3657
3658    #[simd_test(enable = "sse2")]
3659    unsafe fn test_mm_sra_epi32() {
3660        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3661        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 4));
3662        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, -0xEEF, 0xFFF, -0x1000));
3663        let r = _mm_sra_epi32(a, _mm_set_epi64x(4, 0));
3664        assert_eq_m128i(r, a);
3665        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, 32));
3666        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3667        let r = _mm_sra_epi32(a, _mm_set_epi64x(0, i64::MAX));
3668        assert_eq_m128i(r, _mm_setr_epi32(0, -1, 0, -1));
3669    }
3670
3671    #[simd_test(enable = "sse2")]
3672    unsafe fn test_mm_srli_si128() {
3673        #[rustfmt::skip]
3674        let a = _mm_setr_epi8(
3675            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3676        );
3677        let r = _mm_srli_si128::<1>(a);
3678        #[rustfmt::skip]
3679        let e = _mm_setr_epi8(
3680            2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0,
3681        );
3682        assert_eq_m128i(r, e);
3683
3684        #[rustfmt::skip]
3685        let a = _mm_setr_epi8(
3686            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3687        );
3688        let r = _mm_srli_si128::<15>(a);
3689        let e = _mm_setr_epi8(16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3690        assert_eq_m128i(r, e);
3691
3692        #[rustfmt::skip]
3693        let a = _mm_setr_epi8(
3694            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
3695        );
3696        let r = _mm_srli_si128::<16>(a);
3697        assert_eq_m128i(r, _mm_set1_epi8(0));
3698    }
3699
3700    #[simd_test(enable = "sse2")]
3701    unsafe fn test_mm_srli_epi16() {
3702        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3703        let r = _mm_srli_epi16::<4>(a);
3704        assert_eq_m128i(
3705            r,
3706            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
3707        );
3708        let r = _mm_srli_epi16::<16>(a);
3709        assert_eq_m128i(r, _mm_set1_epi16(0));
3710    }
3711
3712    #[simd_test(enable = "sse2")]
3713    unsafe fn test_mm_srl_epi16() {
3714        let a = _mm_setr_epi16(0xCC, -0xCC, 0xDD, -0xDD, 0xEE, -0xEE, 0xFF, -0xFF);
3715        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 4));
3716        assert_eq_m128i(
3717            r,
3718            _mm_setr_epi16(0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, 0xF, 0xFF0),
3719        );
3720        let r = _mm_srl_epi16(a, _mm_set_epi64x(4, 0));
3721        assert_eq_m128i(r, a);
3722        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, 16));
3723        assert_eq_m128i(r, _mm_set1_epi16(0));
3724        let r = _mm_srl_epi16(a, _mm_set_epi64x(0, i64::MAX));
3725        assert_eq_m128i(r, _mm_set1_epi16(0));
3726    }
3727
3728    #[simd_test(enable = "sse2")]
3729    unsafe fn test_mm_srli_epi32() {
3730        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3731        let r = _mm_srli_epi32::<4>(a);
3732        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
3733        let r = _mm_srli_epi32::<32>(a);
3734        assert_eq_m128i(r, _mm_set1_epi32(0));
3735    }
3736
3737    #[simd_test(enable = "sse2")]
3738    unsafe fn test_mm_srl_epi32() {
3739        let a = _mm_setr_epi32(0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF);
3740        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 4));
3741        assert_eq_m128i(r, _mm_setr_epi32(0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000));
3742        let r = _mm_srl_epi32(a, _mm_set_epi64x(4, 0));
3743        assert_eq_m128i(r, a);
3744        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, 32));
3745        assert_eq_m128i(r, _mm_set1_epi32(0));
3746        let r = _mm_srl_epi32(a, _mm_set_epi64x(0, i64::MAX));
3747        assert_eq_m128i(r, _mm_set1_epi32(0));
3748    }
3749
3750    #[simd_test(enable = "sse2")]
3751    unsafe fn test_mm_srli_epi64() {
3752        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3753        let r = _mm_srli_epi64::<4>(a);
3754        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
3755        let r = _mm_srli_epi64::<64>(a);
3756        assert_eq_m128i(r, _mm_set1_epi64x(0));
3757    }
3758
3759    #[simd_test(enable = "sse2")]
3760    unsafe fn test_mm_srl_epi64() {
3761        let a = _mm_set_epi64x(0xFFFFFFFF, -0xFFFFFFFF);
3762        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 4));
3763        assert_eq_m128i(r, _mm_set_epi64x(0xFFFFFFF, 0xFFFFFFFF0000000));
3764        let r = _mm_srl_epi64(a, _mm_set_epi64x(4, 0));
3765        assert_eq_m128i(r, a);
3766        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, 64));
3767        assert_eq_m128i(r, _mm_set1_epi64x(0));
3768        let r = _mm_srl_epi64(a, _mm_set_epi64x(0, i64::MAX));
3769        assert_eq_m128i(r, _mm_set1_epi64x(0));
3770    }
3771
3772    #[simd_test(enable = "sse2")]
3773    unsafe fn test_mm_and_si128() {
3774        let a = _mm_set1_epi8(5);
3775        let b = _mm_set1_epi8(3);
3776        let r = _mm_and_si128(a, b);
3777        assert_eq_m128i(r, _mm_set1_epi8(1));
3778    }
3779
3780    #[simd_test(enable = "sse2")]
3781    unsafe fn test_mm_andnot_si128() {
3782        let a = _mm_set1_epi8(5);
3783        let b = _mm_set1_epi8(3);
3784        let r = _mm_andnot_si128(a, b);
3785        assert_eq_m128i(r, _mm_set1_epi8(2));
3786    }
3787
3788    #[simd_test(enable = "sse2")]
3789    unsafe fn test_mm_or_si128() {
3790        let a = _mm_set1_epi8(5);
3791        let b = _mm_set1_epi8(3);
3792        let r = _mm_or_si128(a, b);
3793        assert_eq_m128i(r, _mm_set1_epi8(7));
3794    }
3795
3796    #[simd_test(enable = "sse2")]
3797    unsafe fn test_mm_xor_si128() {
3798        let a = _mm_set1_epi8(5);
3799        let b = _mm_set1_epi8(3);
3800        let r = _mm_xor_si128(a, b);
3801        assert_eq_m128i(r, _mm_set1_epi8(6));
3802    }
3803
3804    #[simd_test(enable = "sse2")]
3805    unsafe fn test_mm_cmpeq_epi8() {
3806        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3807        let b = _mm_setr_epi8(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
3808        let r = _mm_cmpeq_epi8(a, b);
3809        #[rustfmt::skip]
3810        assert_eq_m128i(
3811            r,
3812            _mm_setr_epi8(
3813                0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
3814            )
3815        );
3816    }
3817
3818    #[simd_test(enable = "sse2")]
3819    unsafe fn test_mm_cmpeq_epi16() {
3820        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3821        let b = _mm_setr_epi16(7, 6, 2, 4, 3, 2, 1, 0);
3822        let r = _mm_cmpeq_epi16(a, b);
3823        assert_eq_m128i(r, _mm_setr_epi16(0, 0, !0, 0, 0, 0, 0, 0));
3824    }
3825
3826    #[simd_test(enable = "sse2")]
3827    unsafe fn test_mm_cmpeq_epi32() {
3828        let a = _mm_setr_epi32(0, 1, 2, 3);
3829        let b = _mm_setr_epi32(3, 2, 2, 0);
3830        let r = _mm_cmpeq_epi32(a, b);
3831        assert_eq_m128i(r, _mm_setr_epi32(0, 0, !0, 0));
3832    }
3833
3834    #[simd_test(enable = "sse2")]
3835    unsafe fn test_mm_cmpgt_epi8() {
3836        let a = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3837        let b = _mm_set1_epi8(0);
3838        let r = _mm_cmpgt_epi8(a, b);
3839        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3840        assert_eq_m128i(r, e);
3841    }
3842
3843    #[simd_test(enable = "sse2")]
3844    unsafe fn test_mm_cmpgt_epi16() {
3845        let a = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
3846        let b = _mm_set1_epi16(0);
3847        let r = _mm_cmpgt_epi16(a, b);
3848        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
3849        assert_eq_m128i(r, e);
3850    }
3851
3852    #[simd_test(enable = "sse2")]
3853    unsafe fn test_mm_cmpgt_epi32() {
3854        let a = _mm_set_epi32(5, 0, 0, 0);
3855        let b = _mm_set1_epi32(0);
3856        let r = _mm_cmpgt_epi32(a, b);
3857        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
3858    }
3859
3860    #[simd_test(enable = "sse2")]
3861    unsafe fn test_mm_cmplt_epi8() {
3862        let a = _mm_set1_epi8(0);
3863        let b = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3864        let r = _mm_cmplt_epi8(a, b);
3865        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3866        assert_eq_m128i(r, e);
3867    }
3868
3869    #[simd_test(enable = "sse2")]
3870    unsafe fn test_mm_cmplt_epi16() {
3871        let a = _mm_set1_epi16(0);
3872        let b = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
3873        let r = _mm_cmplt_epi16(a, b);
3874        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
3875        assert_eq_m128i(r, e);
3876    }
3877
3878    #[simd_test(enable = "sse2")]
3879    unsafe fn test_mm_cmplt_epi32() {
3880        let a = _mm_set1_epi32(0);
3881        let b = _mm_set_epi32(5, 0, 0, 0);
3882        let r = _mm_cmplt_epi32(a, b);
3883        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
3884    }
3885
3886    #[simd_test(enable = "sse2")]
3887    unsafe fn test_mm_cvtepi32_pd() {
3888        let a = _mm_set_epi32(35, 25, 15, 5);
3889        let r = _mm_cvtepi32_pd(a);
3890        assert_eq_m128d(r, _mm_setr_pd(5.0, 15.0));
3891    }
3892
3893    #[simd_test(enable = "sse2")]
3894    unsafe fn test_mm_cvtsi32_sd() {
3895        let a = _mm_set1_pd(3.5);
3896        let r = _mm_cvtsi32_sd(a, 5);
3897        assert_eq_m128d(r, _mm_setr_pd(5.0, 3.5));
3898    }
3899
3900    #[simd_test(enable = "sse2")]
3901    unsafe fn test_mm_cvtepi32_ps() {
3902        let a = _mm_setr_epi32(1, 2, 3, 4);
3903        let r = _mm_cvtepi32_ps(a);
3904        assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
3905    }
3906
3907    #[simd_test(enable = "sse2")]
3908    unsafe fn test_mm_cvtps_epi32() {
3909        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3910        let r = _mm_cvtps_epi32(a);
3911        assert_eq_m128i(r, _mm_setr_epi32(1, 2, 3, 4));
3912    }
3913
3914    #[simd_test(enable = "sse2")]
3915    unsafe fn test_mm_cvtsi32_si128() {
3916        let r = _mm_cvtsi32_si128(5);
3917        assert_eq_m128i(r, _mm_setr_epi32(5, 0, 0, 0));
3918    }
3919
3920    #[simd_test(enable = "sse2")]
3921    unsafe fn test_mm_cvtsi128_si32() {
3922        let r = _mm_cvtsi128_si32(_mm_setr_epi32(5, 0, 0, 0));
3923        assert_eq!(r, 5);
3924    }
3925
3926    #[simd_test(enable = "sse2")]
3927    unsafe fn test_mm_set_epi64x() {
3928        let r = _mm_set_epi64x(0, 1);
3929        assert_eq_m128i(r, _mm_setr_epi64x(1, 0));
3930    }
3931
3932    #[simd_test(enable = "sse2")]
3933    unsafe fn test_mm_set_epi32() {
3934        let r = _mm_set_epi32(0, 1, 2, 3);
3935        assert_eq_m128i(r, _mm_setr_epi32(3, 2, 1, 0));
3936    }
3937
3938    #[simd_test(enable = "sse2")]
3939    unsafe fn test_mm_set_epi16() {
3940        let r = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3941        assert_eq_m128i(r, _mm_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0));
3942    }
3943
3944    #[simd_test(enable = "sse2")]
3945    unsafe fn test_mm_set_epi8() {
3946        #[rustfmt::skip]
3947        let r = _mm_set_epi8(
3948            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3949        );
3950        #[rustfmt::skip]
3951        let e = _mm_setr_epi8(
3952            15, 14, 13, 12, 11, 10, 9, 8,
3953            7, 6, 5, 4, 3, 2, 1, 0,
3954        );
3955        assert_eq_m128i(r, e);
3956    }
3957
3958    #[simd_test(enable = "sse2")]
3959    unsafe fn test_mm_set1_epi64x() {
3960        let r = _mm_set1_epi64x(1);
3961        assert_eq_m128i(r, _mm_set1_epi64x(1));
3962    }
3963
3964    #[simd_test(enable = "sse2")]
3965    unsafe fn test_mm_set1_epi32() {
3966        let r = _mm_set1_epi32(1);
3967        assert_eq_m128i(r, _mm_set1_epi32(1));
3968    }
3969
3970    #[simd_test(enable = "sse2")]
3971    unsafe fn test_mm_set1_epi16() {
3972        let r = _mm_set1_epi16(1);
3973        assert_eq_m128i(r, _mm_set1_epi16(1));
3974    }
3975
3976    #[simd_test(enable = "sse2")]
3977    unsafe fn test_mm_set1_epi8() {
3978        let r = _mm_set1_epi8(1);
3979        assert_eq_m128i(r, _mm_set1_epi8(1));
3980    }
3981
3982    #[simd_test(enable = "sse2")]
3983    unsafe fn test_mm_setr_epi32() {
3984        let r = _mm_setr_epi32(0, 1, 2, 3);
3985        assert_eq_m128i(r, _mm_setr_epi32(0, 1, 2, 3));
3986    }
3987
3988    #[simd_test(enable = "sse2")]
3989    unsafe fn test_mm_setr_epi16() {
3990        let r = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
3991        assert_eq_m128i(r, _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7));
3992    }
3993
3994    #[simd_test(enable = "sse2")]
3995    unsafe fn test_mm_setr_epi8() {
3996        #[rustfmt::skip]
3997        let r = _mm_setr_epi8(
3998            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3999        );
4000        #[rustfmt::skip]
4001        let e = _mm_setr_epi8(
4002            0, 1, 2, 3, 4, 5, 6, 7,
4003            8, 9, 10, 11, 12, 13, 14, 15,
4004        );
4005        assert_eq_m128i(r, e);
4006    }
4007
4008    #[simd_test(enable = "sse2")]
4009    unsafe fn test_mm_setzero_si128() {
4010        let r = _mm_setzero_si128();
4011        assert_eq_m128i(r, _mm_set1_epi64x(0));
4012    }
4013
4014    #[simd_test(enable = "sse2")]
4015    unsafe fn test_mm_loadl_epi64() {
4016        let a = _mm_setr_epi64x(6, 5);
4017        let r = _mm_loadl_epi64(ptr::addr_of!(a));
4018        assert_eq_m128i(r, _mm_setr_epi64x(6, 0));
4019    }
4020
4021    #[simd_test(enable = "sse2")]
4022    unsafe fn test_mm_load_si128() {
4023        let a = _mm_set_epi64x(5, 6);
4024        let r = _mm_load_si128(ptr::addr_of!(a) as *const _);
4025        assert_eq_m128i(a, r);
4026    }
4027
4028    #[simd_test(enable = "sse2")]
4029    unsafe fn test_mm_loadu_si128() {
4030        let a = _mm_set_epi64x(5, 6);
4031        let r = _mm_loadu_si128(ptr::addr_of!(a) as *const _);
4032        assert_eq_m128i(a, r);
4033    }
4034
4035    #[simd_test(enable = "sse2")]
4036    // Miri cannot support this until it is clear how it fits in the Rust memory model
4037    // (non-temporal store)
4038    #[cfg_attr(miri, ignore)]
4039    unsafe fn test_mm_maskmoveu_si128() {
4040        let a = _mm_set1_epi8(9);
4041        #[rustfmt::skip]
4042        let mask = _mm_set_epi8(
4043            0, 0, 0x80u8 as i8, 0, 0, 0, 0, 0,
4044            0, 0, 0, 0, 0, 0, 0, 0,
4045        );
4046        let mut r = _mm_set1_epi8(0);
4047        _mm_maskmoveu_si128(a, mask, ptr::addr_of_mut!(r) as *mut i8);
4048        let e = _mm_set_epi8(0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
4049        assert_eq_m128i(r, e);
4050    }
4051
4052    #[simd_test(enable = "sse2")]
4053    unsafe fn test_mm_store_si128() {
4054        let a = _mm_set1_epi8(9);
4055        let mut r = _mm_set1_epi8(0);
4056        _mm_store_si128(&mut r, a);
4057        assert_eq_m128i(r, a);
4058    }
4059
4060    #[simd_test(enable = "sse2")]
4061    unsafe fn test_mm_storeu_si128() {
4062        let a = _mm_set1_epi8(9);
4063        let mut r = _mm_set1_epi8(0);
4064        _mm_storeu_si128(&mut r, a);
4065        assert_eq_m128i(r, a);
4066    }
4067
4068    #[simd_test(enable = "sse2")]
4069    unsafe fn test_mm_storel_epi64() {
4070        let a = _mm_setr_epi64x(2, 9);
4071        let mut r = _mm_set1_epi8(0);
4072        _mm_storel_epi64(&mut r, a);
4073        assert_eq_m128i(r, _mm_setr_epi64x(2, 0));
4074    }
4075
4076    #[simd_test(enable = "sse2")]
4077    // Miri cannot support this until it is clear how it fits in the Rust memory model
4078    // (non-temporal store)
4079    #[cfg_attr(miri, ignore)]
4080    unsafe fn test_mm_stream_si128() {
4081        let a = _mm_setr_epi32(1, 2, 3, 4);
4082        let mut r = _mm_undefined_si128();
4083        _mm_stream_si128(ptr::addr_of_mut!(r), a);
4084        assert_eq_m128i(r, a);
4085    }
4086
4087    #[simd_test(enable = "sse2")]
4088    // Miri cannot support this until it is clear how it fits in the Rust memory model
4089    // (non-temporal store)
4090    #[cfg_attr(miri, ignore)]
4091    unsafe fn test_mm_stream_si32() {
4092        let a: i32 = 7;
4093        let mut mem = boxed::Box::<i32>::new(-1);
4094        _mm_stream_si32(ptr::addr_of_mut!(*mem), a);
4095        assert_eq!(a, *mem);
4096    }
4097
4098    #[simd_test(enable = "sse2")]
4099    unsafe fn test_mm_move_epi64() {
4100        let a = _mm_setr_epi64x(5, 6);
4101        let r = _mm_move_epi64(a);
4102        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
4103    }
4104
4105    #[simd_test(enable = "sse2")]
4106    unsafe fn test_mm_packs_epi16() {
4107        let a = _mm_setr_epi16(0x80, -0x81, 0, 0, 0, 0, 0, 0);
4108        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -0x81, 0x80);
4109        let r = _mm_packs_epi16(a, b);
4110        #[rustfmt::skip]
4111        assert_eq_m128i(
4112            r,
4113            _mm_setr_epi8(
4114                0x7F, -0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0x80, 0x7F
4115            )
4116        );
4117    }
4118
4119    #[simd_test(enable = "sse2")]
4120    unsafe fn test_mm_packs_epi32() {
4121        let a = _mm_setr_epi32(0x8000, -0x8001, 0, 0);
4122        let b = _mm_setr_epi32(0, 0, -0x8001, 0x8000);
4123        let r = _mm_packs_epi32(a, b);
4124        assert_eq_m128i(
4125            r,
4126            _mm_setr_epi16(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF),
4127        );
4128    }
4129
4130    #[simd_test(enable = "sse2")]
4131    unsafe fn test_mm_packus_epi16() {
4132        let a = _mm_setr_epi16(0x100, -1, 0, 0, 0, 0, 0, 0);
4133        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -1, 0x100);
4134        let r = _mm_packus_epi16(a, b);
4135        assert_eq_m128i(
4136            r,
4137            _mm_setr_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, !0),
4138        );
4139    }
4140
4141    #[simd_test(enable = "sse2")]
4142    unsafe fn test_mm_extract_epi16() {
4143        let a = _mm_setr_epi16(-1, 1, 2, 3, 4, 5, 6, 7);
4144        let r1 = _mm_extract_epi16::<0>(a);
4145        let r2 = _mm_extract_epi16::<3>(a);
4146        assert_eq!(r1, 0xFFFF);
4147        assert_eq!(r2, 3);
4148    }
4149
4150    #[simd_test(enable = "sse2")]
4151    unsafe fn test_mm_insert_epi16() {
4152        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4153        let r = _mm_insert_epi16::<0>(a, 9);
4154        let e = _mm_setr_epi16(9, 1, 2, 3, 4, 5, 6, 7);
4155        assert_eq_m128i(r, e);
4156    }
4157
4158    #[simd_test(enable = "sse2")]
4159    unsafe fn test_mm_movemask_epi8() {
4160        #[rustfmt::skip]
4161        let a = _mm_setr_epi8(
4162            0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8, 0b01,
4163            0b0101, 0b1111_0000u8 as i8, 0, 0,
4164            0, 0b1011_0101u8 as i8, 0b1111_0000u8 as i8, 0b0101,
4165            0b01, 0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8,
4166        );
4167        let r = _mm_movemask_epi8(a);
4168        assert_eq!(r, 0b10100110_00100101);
4169    }
4170
4171    #[simd_test(enable = "sse2")]
4172    unsafe fn test_mm_shuffle_epi32() {
4173        let a = _mm_setr_epi32(5, 10, 15, 20);
4174        let r = _mm_shuffle_epi32::<0b00_01_01_11>(a);
        let e = _mm_setr_epi32(20, 10, 10, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_shufflehi_epi16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 10, 15, 20);
        let r = _mm_shufflehi_epi16::<0b00_01_01_11>(a);
        let e = _mm_setr_epi16(1, 2, 3, 4, 20, 10, 10, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_shufflelo_epi16() {
        let a = _mm_setr_epi16(5, 10, 15, 20, 1, 2, 3, 4);
        let r = _mm_shufflelo_epi16::<0b00_01_01_11>(a);
        let e = _mm_setr_epi16(20, 10, 10, 5, 1, 2, 3, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_unpackhi_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_unpackhi_epi16(a, b);
        let e = _mm_setr_epi16(4, 12, 5, 13, 6, 14, 7, 15);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_epi32() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(4, 5, 6, 7);
        let r = _mm_unpackhi_epi32(a, b);
        let e = _mm_setr_epi32(2, 6, 3, 7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(2, 3);
        let r = _mm_unpackhi_epi64(a, b);
        let e = _mm_setr_epi64x(1, 3);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_unpacklo_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            0, 16, 1, 17, 2, 18, 3, 19,
            4, 20, 5, 21, 6, 22, 7, 23,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_unpacklo_epi16(a, b);
        let e = _mm_setr_epi16(0, 8, 1, 9, 2, 10, 3, 11);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_epi32() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(4, 5, 6, 7);
        let r = _mm_unpacklo_epi32(a, b);
        let e = _mm_setr_epi32(0, 4, 1, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(2, 3);
        let r = _mm_unpacklo_epi64(a, b);
        let e = _mm_setr_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_add_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_add_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(6.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_add_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_add_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(6.0, 12.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_div_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_div_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(0.2, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_div_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_div_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(0.2, 0.2));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_max_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_max_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_max_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_max_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 10.0));

        // Check SSE(2)-specific semantics for -0.0 handling.
        let a = _mm_setr_pd(-0.0, 0.0);
        let b = _mm_setr_pd(0.0, 0.0);
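        // `maxpd` returns its second operand when the inputs compare equal,
        // and -0.0 == 0.0, so max(a, b) is `b` while max(b, a) is `a`. The
        // byte-level transmutes are needed because f64 equality cannot tell
        // -0.0 and 0.0 apart.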
        let r1: [u8; 16] = transmute(_mm_max_pd(a, b));
        let r2: [u8; 16] = transmute(_mm_max_pd(b, a));
        let a: [u8; 16] = transmute(a);
        let b: [u8; 16] = transmute(b);
        assert_eq!(r1, b);
        assert_eq!(r2, a);
        assert_ne!(a, b); // sanity check that -0.0 is actually present
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_min_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_min_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_min_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_min_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));

        // Check SSE(2)-specific semantics for -0.0 handling.
        let a = _mm_setr_pd(-0.0, 0.0);
        let b = _mm_setr_pd(0.0, 0.0);
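        // As with `maxpd`, `minpd` returns the second operand when the inputs
        // compare equal, so the same operand-order checks apply here.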
        let r1: [u8; 16] = transmute(_mm_min_pd(a, b));
        let r2: [u8; 16] = transmute(_mm_min_pd(b, a));
        let a: [u8; 16] = transmute(a);
        let b: [u8; 16] = transmute(b);
        assert_eq!(r1, b);
        assert_eq!(r2, a);
        assert_ne!(a, b); // sanity check that -0.0 is actually present
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_mul_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_mul_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_mul_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_mul_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 20.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sqrt_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sqrt_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0f64.sqrt(), 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sqrt_pd() {
        let r = _mm_sqrt_pd(_mm_setr_pd(1.0, 2.0));
        assert_eq_m128d(r, _mm_setr_pd(1.0f64.sqrt(), 2.0f64.sqrt()));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sub_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sub_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(-4.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sub_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sub_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(-4.0, -8.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_and_pd() {
        let a = transmute(u64x2::splat(5));
        let b = transmute(u64x2::splat(3));
        let r = _mm_and_pd(a, b);
        let e = transmute(u64x2::splat(1));
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_andnot_pd() {
        let a = transmute(u64x2::splat(5));
        let b = transmute(u64x2::splat(3));
        let r = _mm_andnot_pd(a, b);
        let e = transmute(u64x2::splat(2));
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_or_pd() {
        let a = transmute(u64x2::splat(5));
        let b = transmute(u64x2::splat(3));
        let r = _mm_or_pd(a, b);
        let e = transmute(u64x2::splat(7));
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_xor_pd() {
        let a = transmute(u64x2::splat(5));
        let b = transmute(u64x2::splat(3));
        let r = _mm_xor_pd(a, b);
        let e = transmute(u64x2::splat(6));
        assert_eq_m128d(r, e);
    }

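    // For the scalar `cmp*_sd` tests below, the low lane of the result is an
    // all-ones (!0) or all-zeros comparison mask, while the high lane is
    // copied from `a` unchanged, hence the `2.0f64.to_bits()` expectation.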
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpeq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpeq_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmplt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmplt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmple_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmple_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpgt_sd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpgt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpge_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpge_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpord_sd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpord_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpunord_sd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpunord_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpneq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpneq_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnlt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpnlt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnle_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpnle_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpngt_sd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpngt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnge_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 2.0f64.to_bits() as i64);
        let r = transmute::<_, __m128i>(_mm_cmpnge_sd(a, b));
        assert_eq_m128i(r, e);
    }

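    // The packed `cmp*_pd` variants compare both lanes independently, so each
    // 64-bit lane of the result carries its own !0 / 0 mask.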
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpeq_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpeq_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmplt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = transmute::<_, __m128i>(_mm_cmplt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmple_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, !0);
        let r = transmute::<_, __m128i>(_mm_cmple_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpgt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpgt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpge_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpge_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpord_pd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = transmute::<_, __m128i>(_mm_cmpord_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpunord_pd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpunord_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpneq_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, !0);
        let r = transmute::<_, __m128i>(_mm_cmpneq_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnlt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpnlt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnle_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpnle_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpngt_pd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = transmute::<_, __m128i>(_mm_cmpngt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnge_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = transmute::<_, __m128i>(_mm_cmpnge_pd(a, b));
        assert_eq_m128i(r, e);
    }

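    // The `comi*`/`ucomi*` tests return a plain 0/1 integer instead of a lane
    // mask. Any NaN operand makes the comparison unordered, which is why the
    // NaN cases below compare as not-equal.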
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comieq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comieq_sd(a, b) != 0);

        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comieq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comilt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comilt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comile_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comile_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comigt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comigt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comige_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comige_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comineq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comineq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomieq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomieq_sd(a, b) != 0);

        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(NAN, 3.0));
        assert!(_mm_ucomieq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomilt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomilt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomile_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomile_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomigt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomigt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomige_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomige_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomineq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomineq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_movemask_pd() {
        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, 5.0));
        assert_eq!(r, 0b01);

        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, -5.0));
        assert_eq!(r, 0b11);
    }

    #[repr(align(16))]
    struct Memory {
        data: [f64; 4],
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_load_pd() {
        let mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mem.data;
        let d = vals.as_ptr();

        let r = _mm_load_pd(d);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_load_sd() {
        let a = 1.;
        let expected = _mm_setr_pd(a, 0.);
        let r = _mm_load_sd(&a);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadh_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = 3.;
        let expected = _mm_setr_pd(_mm_cvtsd_f64(a), 3.);
        let r = _mm_loadh_pd(a, &b);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadl_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = 3.;
        let expected = _mm_setr_pd(3., get_m128d(a, 1));
        let r = _mm_loadl_pd(a, &b);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_pd() {
        #[repr(align(128))]
        struct Memory {
            pub data: [f64; 2],
        }
        let a = _mm_set1_pd(7.0);
        let mut mem = Memory { data: [-1.0; 2] };

        _mm_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
        for i in 0..2 {
            assert_eq!(mem.data[i], get_m128d(a, i));
        }
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_store_sd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        _mm_store_sd(&mut dest, a);
        assert_eq!(dest, _mm_cvtsd_f64(a));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_store_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_store_pd(d, *black_box(&a));
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 2.0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storeu_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Make sure p is **not** aligned to 16-byte boundary
        if (p as usize) & 0xf == 0 {
            ofs = 1;
            p = p.add(1);
        }

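        // `data` has four f64 slots, so the two-lane store stays in bounds
        // even when the pointer is bumped by one element.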
        _mm_storeu_pd(p, *black_box(&a));

        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 2.0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storeu_si16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let mut r = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
        _mm_storeu_si16(ptr::addr_of_mut!(r).cast(), a);
        let e = _mm_setr_epi16(1, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storeu_si32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let mut r = _mm_setr_epi32(5, 6, 7, 8);
        _mm_storeu_si32(ptr::addr_of_mut!(r).cast(), a);
        let e = _mm_setr_epi32(1, 6, 7, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storeu_si64() {
        let a = _mm_setr_epi64x(1, 2);
        let mut r = _mm_setr_epi64x(3, 4);
        _mm_storeu_si64(ptr::addr_of_mut!(r).cast(), a);
        let e = _mm_setr_epi64x(1, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_store1_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_store1_pd(d, *black_box(&a));
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 1.0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_store_pd1() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_store_pd1(d, *black_box(&a));
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 1.0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storer_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_storer_pd(d, *black_box(&a));
        assert_eq!(vals[0], 2.0);
        assert_eq!(vals[1], 1.0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storeh_pd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        _mm_storeh_pd(&mut dest, a);
        assert_eq!(dest, get_m128d(a, 1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storel_pd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        _mm_storel_pd(&mut dest, a);
        assert_eq!(dest, _mm_cvtsd_f64(a));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadr_pd() {
        let mut mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mut mem.data;
        let d = vals.as_ptr();

        let r = _mm_loadr_pd(d);
        assert_eq_m128d(r, _mm_setr_pd(2.0, 1.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadu_pd() {
        let mut mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mut mem.data;
        let mut d = vals.as_ptr();

        // Make sure d is **not** aligned to 16-byte boundary
        let mut offset = 0;
        if (d as usize) & 0xf == 0 {
            offset = 1;
            d = d.add(offset);
        }

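        // At offset 1 the load reads (2.0, 3.0); adding `offset` to each lane
        // of the aligned expectation below covers both cases.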
        let r = _mm_loadu_pd(d);
        let e = _mm_add_pd(_mm_setr_pd(1.0, 2.0), _mm_set1_pd(offset as f64));
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadu_si16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm_loadu_si16(ptr::addr_of!(a) as *const _);
        assert_eq_m128i(r, _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadu_si32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let r = _mm_loadu_si32(ptr::addr_of!(a) as *const _);
        assert_eq_m128i(r, _mm_setr_epi32(1, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadu_si64() {
        let a = _mm_setr_epi64x(5, 6);
        let r = _mm_loadu_si64(ptr::addr_of!(a) as *const _);
        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtpd_ps() {
        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, 5.0));
        assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, 0.0));

        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, -5.0));
        assert_eq_m128(r, _mm_setr_ps(-1.0, -5.0, 0.0, 0.0));

        let r = _mm_cvtpd_ps(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq_m128(r, _mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0));

        let r = _mm_cvtpd_ps(_mm_setr_pd(f32::MAX as f64, f32::MIN as f64));
        assert_eq_m128(r, _mm_setr_ps(f32::MAX, f32::MIN, 0.0, 0.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtps_pd() {
        let r = _mm_cvtps_pd(_mm_setr_ps(-1.0, 2.0, -3.0, 5.0));
        assert_eq_m128d(r, _mm_setr_pd(-1.0, 2.0));

        let r = _mm_cvtps_pd(_mm_setr_ps(
            f32::MAX,
            f32::INFINITY,
            f32::NEG_INFINITY,
            f32::MIN,
        ));
        assert_eq_m128d(r, _mm_setr_pd(f32::MAX as f64, f64::INFINITY));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtpd_epi32() {
        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, 5.0));
        assert_eq_m128i(r, _mm_setr_epi32(-1, 5, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, -5.0));
        assert_eq_m128i(r, _mm_setr_epi32(-1, -5, 0, 0));

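        // Out-of-range, infinite, and NaN inputs all produce the x86
        // "integer indefinite" value, i32::MIN (0x8000_0000).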
        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::INFINITY, f64::NEG_INFINITY));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::NAN, f64::NAN));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtsd_si32() {
        let r = _mm_cvtsd_si32(_mm_setr_pd(-2.0, 5.0));
        assert_eq!(r, -2);

        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq!(r, i32::MIN);

        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::NAN, f64::NAN));
        assert_eq!(r, i32::MIN);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtsd_ss() {
        let a = _mm_setr_ps(-1.1, -2.2, 3.3, 4.4);
        let b = _mm_setr_pd(2.0, -5.0);

        let r = _mm_cvtsd_ss(a, b);

        assert_eq_m128(r, _mm_setr_ps(2.0, -2.2, 3.3, 4.4));

        let a = _mm_setr_ps(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
        let b = _mm_setr_pd(f64::INFINITY, -5.0);

        let r = _mm_cvtsd_ss(a, b);

        assert_eq_m128(
            r,
            _mm_setr_ps(
                f32::INFINITY,
                f32::NEG_INFINITY,
                f32::MAX,
                f32::NEG_INFINITY,
            ),
        );
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtsd_f64() {
        let r = _mm_cvtsd_f64(_mm_setr_pd(-1.1, 2.2));
        assert_eq!(r, -1.1);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtss_sd() {
        let a = _mm_setr_pd(-1.1, 2.2);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let r = _mm_cvtss_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.2));

        let a = _mm_setr_pd(-1.1, f64::INFINITY);
        let b = _mm_setr_ps(f32::NEG_INFINITY, 2.0, 3.0, 4.0);

        let r = _mm_cvtss_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(f64::NEG_INFINITY, f64::INFINITY));
    }

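    // The `cvtt*` variants truncate toward zero rather than rounding with the
    // current MXCSR rounding mode.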
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvttpd_epi32() {
        let a = _mm_setr_pd(-1.1, 2.2);
        let r = _mm_cvttpd_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, 0, 0));

        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
        let r = _mm_cvttpd_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvttsd_si32() {
        let a = _mm_setr_pd(-1.1, 2.2);
        let r = _mm_cvttsd_si32(a);
        assert_eq!(r, -1);

        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
        let r = _mm_cvttsd_si32(a);
        assert_eq!(r, i32::MIN);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvttps_epi32() {
        let a = _mm_setr_ps(-1.1, 2.2, -3.3, 6.6);
        let r = _mm_cvttps_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, -3, 6));

        let a = _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
        let r = _mm_cvttps_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, i32::MIN, i32::MIN));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set_sd() {
        let r = _mm_set_sd(-1.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, 0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set1_pd() {
        let r = _mm_set1_pd(-1.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, -1.0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set_pd1() {
        let r = _mm_set_pd1(-2.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-2.0_f64, -2.0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set_pd() {
        let r = _mm_set_pd(1.0_f64, 5.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(5.0_f64, 1.0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_setr_pd() {
        let r = _mm_setr_pd(1.0_f64, -5.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(1.0_f64, -5.0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_setzero_pd() {
        let r = _mm_setzero_pd();
        assert_eq_m128d(r, _mm_setr_pd(0_f64, 0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_load1_pd() {
        let d = -5.0;
        let r = _mm_load1_pd(&d);
        assert_eq_m128d(r, _mm_setr_pd(d, d));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_load_pd1() {
        let d = -5.0;
        let r = _mm_load_pd1(&d);
        assert_eq_m128d(r, _mm_setr_pd(d, d));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(3.0, 4.0);
        let r = _mm_unpackhi_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(2.0, 4.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(3.0, 4.0);
        let r = _mm_unpacklo_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 3.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_shuffle_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(3., 4.);
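        // Bit 0 of the immediate selects which lane of `a` feeds the low
        // result lane and bit 1 which lane of `b` feeds the high lane; an
        // all-zero immediate therefore yields (a[0], b[0]).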
        let expected = _mm_setr_pd(1., 3.);
        let r = _mm_shuffle_pd::<0b00_00_00_00>(a, b);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_move_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(3., 4.);
        let expected = _mm_setr_pd(3., 2.);
        let r = _mm_move_sd(a, b);
        assert_eq_m128d(r, expected);
    }

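    // The cast intrinsics below are bitwise reinterpretations that compile to
    // no instructions, so round-tripping an all-zero pattern is enough to
    // exercise them.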
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castpd_ps() {
        let a = _mm_set1_pd(0.);
        let expected = _mm_set1_ps(0.);
        let r = _mm_castpd_ps(a);
        assert_eq_m128(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castpd_si128() {
        let a = _mm_set1_pd(0.);
        let expected = _mm_set1_epi64x(0);
        let r = _mm_castpd_si128(a);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castps_pd() {
        let a = _mm_set1_ps(0.);
        let expected = _mm_set1_pd(0.);
        let r = _mm_castps_pd(a);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castps_si128() {
        let a = _mm_set1_ps(0.);
        let expected = _mm_set1_epi32(0);
        let r = _mm_castps_si128(a);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castsi128_pd() {
        let a = _mm_set1_epi64x(0);
        let expected = _mm_set1_pd(0.);
        let r = _mm_castsi128_pd(a);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castsi128_ps() {
        let a = _mm_set1_epi32(0);
        let expected = _mm_set1_ps(0.);
        let r = _mm_castsi128_ps(a);
        assert_eq_m128(r, expected);
    }
}