// core/stdarch/crates/core_arch/src/x86/avx512fp16.rs

use crate::arch::asm;
use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::{fmaf16, simd::*};
use crate::ptr;

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_set_ph(
    e7: f16,
    e6: f16,
    e5: f16,
    e4: f16,
    e3: f16,
    e2: f16,
    e1: f16,
    e0: f16,
) -> __m128h {
    __m128h([e0, e1, e2, e3, e4, e5, e6, e7])
}
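
// Editorial illustration (not part of the upstream source): `_mm_set_ph` takes its
// arguments from the highest lane (`e7`) down to the lowest (`e0`), whereas
// `_mm_setr_ph` below takes them in memory (low-to-high) order. A minimal sketch,
// assuming a nightly toolchain with `f16` support and an AVX512-FP16 CPU; the
// function name is hypothetical:
//
//     #[target_feature(enable = "avx512fp16")]
//     unsafe fn set_order_example() -> (__m128h, __m128h) {
//         let a = _mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
//         let b = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
//         // Both vectors hold 0.0 in lane 0 and 7.0 in lane 7.
//         (a, b)
//     }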

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_set_ph(
    e15: f16,
    e14: f16,
    e13: f16,
    e12: f16,
    e11: f16,
    e10: f16,
    e9: f16,
    e8: f16,
    e7: f16,
    e6: f16,
    e5: f16,
    e4: f16,
    e3: f16,
    e2: f16,
    e1: f16,
    e0: f16,
) -> __m256h {
    __m256h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    ])
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_set_ph(
    e31: f16,
    e30: f16,
    e29: f16,
    e28: f16,
    e27: f16,
    e26: f16,
    e25: f16,
    e24: f16,
    e23: f16,
    e22: f16,
    e21: f16,
    e20: f16,
    e19: f16,
    e18: f16,
    e17: f16,
    e16: f16,
    e15: f16,
    e14: f16,
    e13: f16,
    e12: f16,
    e11: f16,
    e10: f16,
    e9: f16,
    e8: f16,
    e7: f16,
    e6: f16,
    e5: f16,
    e4: f16,
    e3: f16,
    e2: f16,
    e1: f16,
    e0: f16,
) -> __m512h {
    __m512h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19,
        e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,
    ])
}

/// Copy half-precision (16-bit) floating-point elements from a to the lower element of dst and zero
/// the upper 7 elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_set_sh(a: f16) -> __m128h {
    __m128h([a, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
}

/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_set1_ph(a: f16) -> __m128h {
    unsafe { transmute(f16x8::splat(a)) }
}

/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_set1_ph(a: f16) -> __m256h {
    unsafe { transmute(f16x16::splat(a)) }
}

/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_set1_ph(a: f16) -> __m512h {
    unsafe { transmute(f16x32::splat(a)) }
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_setr_ph(
    e0: f16,
    e1: f16,
    e2: f16,
    e3: f16,
    e4: f16,
    e5: f16,
    e6: f16,
    e7: f16,
) -> __m128h {
    __m128h([e0, e1, e2, e3, e4, e5, e6, e7])
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_setr_ph(
    e0: f16,
    e1: f16,
    e2: f16,
    e3: f16,
    e4: f16,
    e5: f16,
    e6: f16,
    e7: f16,
    e8: f16,
    e9: f16,
    e10: f16,
    e11: f16,
    e12: f16,
    e13: f16,
    e14: f16,
    e15: f16,
) -> __m256h {
    __m256h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    ])
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_setr_ph(
    e0: f16,
    e1: f16,
    e2: f16,
    e3: f16,
    e4: f16,
    e5: f16,
    e6: f16,
    e7: f16,
    e8: f16,
    e9: f16,
    e10: f16,
    e11: f16,
    e12: f16,
    e13: f16,
    e14: f16,
    e15: f16,
    e16: f16,
    e17: f16,
    e18: f16,
    e19: f16,
    e20: f16,
    e21: f16,
    e22: f16,
    e23: f16,
    e24: f16,
    e25: f16,
    e26: f16,
    e27: f16,
    e28: f16,
    e29: f16,
    e30: f16,
    e31: f16,
) -> __m512h {
    __m512h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19,
        e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,
    ])
}

/// Return vector of type __m128h with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_setzero_ph() -> __m128h {
    unsafe { transmute(f16x8::ZERO) }
}

/// Return vector of type __m256h with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_setzero_ph() -> __m256h {
    unsafe { transmute(f16x16::ZERO) }
}

/// Return vector of type __m512h with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_setzero_ph() -> __m512h {
    unsafe { transmute(f16x32::ZERO) }
}

/// Return vector of type `__m128h` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit).
/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_undefined_ph() -> __m128h {
    unsafe { transmute(f16x8::ZERO) }
}

/// Return vector of type `__m256h` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit).
/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_undefined_ph() -> __m256h {
    unsafe { transmute(f16x16::ZERO) }
}

/// Return vector of type `__m512h` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit).
/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_undefined_ph() -> __m512h {
    unsafe { transmute(f16x32::ZERO) }
}
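
// Editorial illustration (not part of the upstream source): the `_undefined_` helpers
// return some valid (if unspecified) vector, so they are handy as a "don't care" source
// operand, e.g. when only the masked-in lanes of a result matter. A minimal sketch,
// assuming AVX512-FP16 and AVX512-VL support; the function name is hypothetical:
//
//     #[target_feature(enable = "avx512fp16,avx512vl")]
//     unsafe fn dont_care_source(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
//         // Lanes whose mask bit is clear are copied from `src`; since the caller
//         // ignores them here, an undefined vector is an acceptable `src`.
//         _mm_mask_add_ph(_mm_undefined_ph(), k, a, b)
//     }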

/// Cast vector of type `__m128d` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castpd_ph(a: __m128d) -> __m128h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256d` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castpd_ph(a: __m256d) -> __m256h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512d` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castpd_ph(a: __m512d) -> __m512h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128h` to type `__m128d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castph_pd(a: __m128h) -> __m128d {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256h` to type `__m256d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph_pd(a: __m256h) -> __m256d {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512h` to type `__m512d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph_pd(a: __m512h) -> __m512d {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castps_ph(a: __m128) -> __m128h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castps_ph(a: __m256) -> __m256h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castps_ph(a: __m512) -> __m512h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128h` to type `__m128`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castph_ps(a: __m128h) -> __m128 {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256h` to type `__m256`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph_ps(a: __m256h) -> __m256 {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512h` to type `__m512`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph_ps(a: __m512h) -> __m512 {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128i` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castsi128_ph(a: __m128i) -> __m128h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256i` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castsi256_ph(a: __m256i) -> __m256h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512i` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castsi512_ph(a: __m512i) -> __m512h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128h` to type `__m128i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_si128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castph_si128(a: __m128h) -> __m128i {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256h` to type `__m256i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_si256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph_si256(a: __m256h) -> __m256i {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512h` to type `__m512i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_si512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph_si512(a: __m512h) -> __m512i {
    unsafe { transmute(a) }
}
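
// Editorial illustration (not part of the upstream source): the cast intrinsics only
// reinterpret bits between vector types; no conversion instruction is emitted. A minimal
// sketch, assuming AVX512-FP16 support; the function name is hypothetical:
//
//     #[target_feature(enable = "avx512fp16")]
//     unsafe fn reinterpret_roundtrip(a: __m128h) -> __m128h {
//         // View the 8 f16 lanes as raw 128-bit integer data and back; the bit
//         // pattern (and thus every lane value) is unchanged.
//         let bits: __m128i = _mm_castph_si128(a);
//         _mm_castsi128_ph(bits)
//     }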

/// Cast vector of type `__m256h` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph256_ph128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph256_ph128(a: __m256h) -> __m128h {
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) }
}

/// Cast vector of type `__m512h` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph512_ph128(a: __m512h) -> __m128h {
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) }
}

/// Cast vector of type `__m512h` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph512_ph256(a: __m512h) -> __m256h {
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) }
}

/// Cast vector of type `__m128h` to type `__m256h`. The upper 8 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph128_ph256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph128_ph256(a: __m128h) -> __m256h {
    unsafe {
        simd_shuffle!(
            a,
            _mm_undefined_ph(),
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]
        )
    }
}

/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph128_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph128_ph512(a: __m128h) -> __m512h {
    unsafe {
        simd_shuffle!(
            a,
            _mm_undefined_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
                8, 8, 8, 8
            ]
        )
    }
}

/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph256_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph256_ph512(a: __m256h) -> __m512h {
    unsafe {
        simd_shuffle!(
            a,
            _mm256_undefined_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16,
                16, 16, 16, 16, 16, 16, 16, 16, 16
            ]
        )
    }
}

/// Cast vector of type `__m128h` to type `__m256h`. The upper 8 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextph128_ph256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_zextph128_ph256(a: __m128h) -> __m256h {
    unsafe {
        simd_shuffle!(
            a,
            _mm_setzero_ph(),
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]
        )
    }
}

/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph256_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_zextph256_ph512(a: __m256h) -> __m512h {
    unsafe {
        simd_shuffle!(
            a,
            _mm256_setzero_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16,
                16, 16, 16, 16, 16, 16, 16, 16, 16
            ]
        )
    }
}

/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph128_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_zextph128_ph512(a: __m128h) -> __m512h {
    unsafe {
        simd_shuffle!(
            a,
            _mm_setzero_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
                8, 8, 8, 8
            ]
        )
    }
}
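
// Editorial note (not part of the upstream source): when widening a vector, prefer the
// `_zext` forms if the upper lanes must be zero; the plain `cast` forms leave them
// unspecified. A minimal sketch, assuming AVX512-FP16 support; the function name is
// hypothetical:
//
//     #[target_feature(enable = "avx512fp16")]
//     unsafe fn widen_with_known_upper(a: __m128h) -> __m256h {
//         // Upper 8 lanes are guaranteed to be 0.0 here; with
//         // `_mm256_castph128_ph256(a)` they would be unspecified.
//         _mm256_zextph128_ph256(a)
//     }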

macro_rules! cmp_asm { // FIXME: use LLVM intrinsics
    ($mask_type: ty, $reg: ident, $a: expr, $b: expr) => {{
        let dst: $mask_type;
        asm!(
            "vcmpph {k}, {a}, {b}, {imm8}",
            k = lateout(kreg) dst,
            a = in($reg) $a,
            b = in($reg) $b,
            imm8 = const IMM5,
            options(pure, nomem, nostack)
        );
        dst
    }};
    ($mask_type: ty, $mask: expr, $reg: ident, $a: expr, $b: expr) => {{
        let dst: $mask_type;
        asm!(
            "vcmpph {k} {{ {mask} }}, {a}, {b}, {imm8}",
            k = lateout(kreg) dst,
            mask = in(kreg) $mask,
            a = in($reg) $a,
            b = in($reg) $b,
            imm8 = const IMM5,
            options(pure, nomem, nostack)
        );
        dst
    }};
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cmp_ph_mask<const IMM5: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask8, xmm_reg, a, b)
    }
}
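
// Editorial illustration (not part of the upstream source): the comparison predicate is
// passed as a const generic (e.g. `_CMP_LT_OS`), and each set bit of the returned mask
// marks a lane where the predicate held. A minimal sketch, assuming AVX512-FP16 and
// AVX512-VL support; the function name is hypothetical:
//
//     #[target_feature(enable = "avx512fp16,avx512vl")]
//     unsafe fn count_lanes_less_than(a: __m128h, b: __m128h) -> u32 {
//         let k: __mmask8 = _mm_cmp_ph_mask::<_CMP_LT_OS>(a, b);
//         // One bit per lane, so popcount gives the number of lanes with a < b.
//         k.count_ones()
//     }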

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cmp_ph_mask<const IMM5: i32>(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask8, k1, xmm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_cmp_ph_mask<const IMM5: i32>(a: __m256h, b: __m256h) -> __mmask16 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask16, ymm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_cmp_ph_mask<const IMM5: i32>(
    k1: __mmask16,
    a: __m256h,
    b: __m256h,
) -> __mmask16 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask16, k1, ymm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cmp_ph_mask<const IMM5: i32>(a: __m512h, b: __m512h) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask32, zmm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cmp_ph_mask<const IMM5: i32>(
    k1: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask32, k1, zmm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_round_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        if SAE == _MM_FROUND_NO_EXC {
            let dst: __mmask32;
            asm!(
                "vcmpph {k}, {a}, {b}, {{sae}}, {imm8}",
                k = lateout(kreg) dst,
                a = in(zmm_reg) a,
                b = in(zmm_reg) b,
                imm8 = const IMM5,
                options(pure, nomem, nostack)
            );
            dst
        } else {
            cmp_asm!(__mmask32, zmm_reg, a, b)
        }
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_round_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
    k1: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        if SAE == _MM_FROUND_NO_EXC {
            let dst: __mmask32;
            asm!(
                "vcmpph {k} {{{k1}}}, {a}, {b}, {{sae}}, {imm8}",
                k = lateout(kreg) dst,
                k1 = in(kreg) k1,
                a = in(zmm_reg) a,
                b = in(zmm_reg) b,
                imm8 = const IMM5,
                options(pure, nomem, nostack)
            );
            dst
        } else {
            cmp_asm!(__mmask32, k1, zmm_reg, a, b)
        }
    }
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k. Exceptions can be suppressed by
/// passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    static_assert_sae!(SAE);
    _mm_mask_cmp_round_sh_mask::<IMM5, SAE>(0xff, a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k using zeromask k1. Exceptions can be
/// suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(
    k1: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __mmask8 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        vcmpsh(a, b, IMM5, k1, SAE)
    }
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cmp_sh_mask<const IMM5: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    _mm_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k using zeromask k1.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cmp_sh_mask<const IMM5: i32>(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    _mm_mask_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(k1, a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and return the boolean result (0 or 1).
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comi_round_sh<const IMM5: i32, const SAE: i32>(a: __m128h, b: __m128h) -> i32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        vcomish(a, b, IMM5, SAE)
    }
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comi_sh<const IMM5: i32>(a: __m128h, b: __m128h) -> i32 {
    static_assert_uimm_bits!(IMM5, 5);
    _mm_comi_round_sh::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
}
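
// Editorial illustration (not part of the upstream source): the `comi` family compares
// only the lowest lane of its operands and returns 0 or 1, so it plugs directly into
// scalar control flow. A minimal sketch, assuming AVX512-FP16 support; the function
// name is hypothetical:
//
//     #[target_feature(enable = "avx512fp16")]
//     unsafe fn lower_lane_is_equal(a: __m128h, b: __m128h) -> bool {
//         // Equivalent to `_mm_comieq_sh(a, b) != 0`, spelled via the generic predicate.
//         _mm_comi_sh::<_CMP_EQ_OS>(a, b) != 0
//     }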

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comieq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_EQ_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal,
/// and return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comige_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GE_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comigt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GT_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and
/// return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comile_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LE_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comilt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LT_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comineq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_NEQ_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and
/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomieq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_EQ_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal,
/// and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomige_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GE_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomigt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GT_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and
/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomile_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LE_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomilt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LT_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomineq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_NEQ_OQ>(a, b)
}

/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address must be aligned to 16 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_load_ph(mem_addr: *const f16) -> __m128h {
    *mem_addr.cast()
}

/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address must be aligned to 32 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_load_ph(mem_addr: *const f16) -> __m256h {
    *mem_addr.cast()
}

/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address must be aligned to 64 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_load_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_load_ph(mem_addr: *const f16) -> __m512h {
    *mem_addr.cast()
}

/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector,
/// and zero the upper elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_load_sh(mem_addr: *const f16) -> __m128h {
    _mm_set_sh(*mem_addr)
}

/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector
/// using writemask k (the element is copied from src when mask bit 0 is not set), and zero the upper elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_load_sh(src: __m128h, k: __mmask8, mem_addr: *const f16) -> __m128h {
    let mut dst = src;
    asm!(
        vpl!("vmovsh {dst}{{{k}}}"),
        dst = inout(xmm_reg) dst,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack, preserves_flags)
    );
    dst
}

/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector
/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and zero the upper elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_load_sh(k: __mmask8, mem_addr: *const f16) -> __m128h {
    let mut dst: __m128h;
    asm!(
        vpl!("vmovsh {dst}{{{k}}}{{z}}"),
        dst = out(xmm_reg) dst,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack, preserves_flags)
    );
    dst
}
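
// Editorial illustration (not part of the upstream source): the masked scalar loads read
// at most one `f16` from memory, which makes them useful for tail handling. A minimal
// sketch, assuming AVX512-FP16 support; the function name is hypothetical:
//
//     #[target_feature(enable = "avx512fp16")]
//     unsafe fn load_first_if_any(ptr: *const f16, len: usize) -> __m128h {
//         // Bit 0 of the mask decides whether the element is read; with k = 0 no
//         // element is read and the lower lane is zeroed.
//         let k: __mmask8 = if len > 0 { 1 } else { 0 };
//         _mm_maskz_load_sh(k, ptr)
//     }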

/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address does not need to be aligned to any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_loadu_ph(mem_addr: *const f16) -> __m128h {
    ptr::read_unaligned(mem_addr.cast())
}

/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address does not need to be aligned to any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_loadu_ph(mem_addr: *const f16) -> __m256h {
    ptr::read_unaligned(mem_addr.cast())
}

/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address does not need to be aligned to any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_loadu_ph(mem_addr: *const f16) -> __m512h {
    ptr::read_unaligned(mem_addr.cast())
}
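
// Editorial illustration (not part of the upstream source): `_mm*_load_ph` requires the
// pointer to be aligned to the full vector width, while `_mm*_loadu_ph` accepts any
// alignment. A minimal sketch, assuming AVX512-FP16 and AVX512-VL support; the function
// name is hypothetical:
//
//     #[target_feature(enable = "avx512fp16,avx512vl")]
//     unsafe fn load_prefer_aligned(p: *const f16) -> __m128h {
//         if p as usize % 16 == 0 {
//             _mm_load_ph(p) // 16-byte aligned: the aligned form is allowed
//         } else {
//             _mm_loadu_ph(p) // otherwise fall back to the unaligned form
//         }
//     }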

/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst
/// using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper
/// 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_move_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    unsafe {
        let mut mov: f16 = simd_extract!(src, 0);
        if (k & 1) != 0 {
            mov = simd_extract!(b, 0);
        }
        simd_insert!(a, 0, mov)
    }
}

/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst
/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_move_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    unsafe {
        let mut mov: f16 = 0.;
        if (k & 1) != 0 {
            mov = simd_extract!(b, 0);
        }
        simd_insert!(a, 0, mov)
    }
}

/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst,
/// and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_move_sh(a: __m128h, b: __m128h) -> __m128h {
    unsafe {
        let mov: f16 = simd_extract!(b, 0);
        simd_insert!(a, 0, mov)
    }
}

/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory.
/// The address must be aligned to 16 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_store_ph(mem_addr: *mut f16, a: __m128h) {
    *mem_addr.cast() = a;
}

/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory.
/// The address must be aligned to 32 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_store_ph(mem_addr: *mut f16, a: __m256h) {
    *mem_addr.cast() = a;
}

/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory.
/// The address must be aligned to 64 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_store_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_store_ph(mem_addr: *mut f16, a: __m512h) {
    *mem_addr.cast() = a;
}

/// Store the lower half-precision (16-bit) floating-point element from a into memory.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_store_sh(mem_addr: *mut f16, a: __m128h) {
    *mem_addr = simd_extract!(a, 0);
}

/// Store the lower half-precision (16-bit) floating-point element from a into memory using writemask k
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_store_sh(mem_addr: *mut f16, k: __mmask8, a: __m128h) {
    asm!(
        vps!("vmovsh", "{{{k}}}, {src}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        src = in(xmm_reg) a,
        options(nostack, preserves_flags)
    );
}

/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory.
/// The address does not need to be aligned to any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_storeu_ph(mem_addr: *mut f16, a: __m128h) {
    ptr::write_unaligned(mem_addr.cast(), a);
}

/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory.
/// The address does not need to be aligned to any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_storeu_ph(mem_addr: *mut f16, a: __m256h) {
    ptr::write_unaligned(mem_addr.cast(), a);
}

/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory.
/// The address does not need to be aligned to any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_storeu_ph(mem_addr: *mut f16, a: __m512h) {
    ptr::write_unaligned(mem_addr.cast(), a);
}
1335
1336/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1337///
1338/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ph)
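///
/// # Example
///
/// A minimal usage sketch, not part of this module (nightly-only; assumes
/// `feature(f16, stdarch_x86_avx512_f16)` and runtime support for `avx512fp16`/`avx512vl`):
///
/// ```ignore
/// #![feature(f16, stdarch_x86_avx512_f16)]
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn add_halves(a: __m128h, b: __m128h) -> [f16; 8] {
///     let r = _mm_add_ph(a, b); // lane-wise a[i] + b[i]
///     let mut out = [0.0; 8];
///     unsafe { _mm_storeu_ph(out.as_mut_ptr(), r) };
///     out
/// }
///
/// if is_x86_feature_detected!("avx512fp16") && is_x86_feature_detected!("avx512vl") {
///     let out = unsafe { add_halves(_mm_set1_ph(1.5), _mm_set1_ph(2.0)) };
///     assert_eq!(out, [3.5; 8]);
/// }
/// ```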
1339#[inline]
1340#[target_feature(enable = "avx512fp16,avx512vl")]
1341#[cfg_attr(test, assert_instr(vaddph))]
1342#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1343pub fn _mm_add_ph(a: __m128h, b: __m128h) -> __m128h {
1344    unsafe { simd_add(a, b) }
1345}
1346
1347/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1348/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1349///
1350/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_ph)
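///
/// # Example
///
/// A minimal usage sketch, not part of this module (nightly-only; assumes
/// `feature(f16, stdarch_x86_avx512_f16)` and runtime support for `avx512fp16`/`avx512vl`):
/// mask bit `i` set selects `a[i] + b[i]`, a clear bit keeps `src[i]`.
///
/// ```ignore
/// #![feature(f16, stdarch_x86_avx512_f16)]
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn masked_add(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> [f16; 8] {
///     let r = _mm_mask_add_ph(src, k, a, b);
///     let mut out = [0.0; 8];
///     unsafe { _mm_storeu_ph(out.as_mut_ptr(), r) };
///     out
/// }
///
/// if is_x86_feature_detected!("avx512fp16") && is_x86_feature_detected!("avx512vl") {
///     let out = unsafe {
///         masked_add(_mm_set1_ph(-1.0), 0b0000_1111, _mm_set1_ph(1.0), _mm_set1_ph(2.0))
///     };
///     // lanes 0..=3 hold the sum, lanes 4..=7 keep `src`
///     assert_eq!(out, [3.0, 3.0, 3.0, 3.0, -1.0, -1.0, -1.0, -1.0]);
/// }
/// ```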
1351#[inline]
1352#[target_feature(enable = "avx512fp16,avx512vl")]
1353#[cfg_attr(test, assert_instr(vaddph))]
1354#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1355pub fn _mm_mask_add_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1356    unsafe {
1357        let r = _mm_add_ph(a, b);
1358        simd_select_bitmask(k, r, src)
1359    }
1360}
1361
1362/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1363/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1364///
1365/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_ph)
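///
/// # Example
///
/// A minimal usage sketch, not part of this module (nightly-only; assumes
/// `feature(f16, stdarch_x86_avx512_f16)` and runtime support for `avx512fp16`/`avx512vl`):
/// lanes whose mask bit is clear are zeroed rather than taken from a `src` operand.
///
/// ```ignore
/// #![feature(f16, stdarch_x86_avx512_f16)]
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn maskz_add(k: __mmask8, a: __m128h, b: __m128h) -> [f16; 8] {
///     let r = _mm_maskz_add_ph(k, a, b);
///     let mut out = [0.0; 8];
///     unsafe { _mm_storeu_ph(out.as_mut_ptr(), r) };
///     out
/// }
///
/// if is_x86_feature_detected!("avx512fp16") && is_x86_feature_detected!("avx512vl") {
///     let out = unsafe { maskz_add(0b0000_0011, _mm_set1_ph(1.0), _mm_set1_ph(2.0)) };
///     assert_eq!(out, [3.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
/// }
/// ```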
1366#[inline]
1367#[target_feature(enable = "avx512fp16,avx512vl")]
1368#[cfg_attr(test, assert_instr(vaddph))]
1369#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1370pub fn _mm_maskz_add_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1371    unsafe {
1372        let r = _mm_add_ph(a, b);
1373        simd_select_bitmask(k, r, _mm_setzero_ph())
1374    }
1375}
1376
1377/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1378///
1379/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_ph)
1380#[inline]
1381#[target_feature(enable = "avx512fp16,avx512vl")]
1382#[cfg_attr(test, assert_instr(vaddph))]
1383#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1384pub fn _mm256_add_ph(a: __m256h, b: __m256h) -> __m256h {
1385    unsafe { simd_add(a, b) }
1386}
1387
1388/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1389/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1390///
1391/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_add_ph)
1392#[inline]
1393#[target_feature(enable = "avx512fp16,avx512vl")]
1394#[cfg_attr(test, assert_instr(vaddph))]
1395#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1396pub fn _mm256_mask_add_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1397    unsafe {
1398        let r = _mm256_add_ph(a, b);
1399        simd_select_bitmask(k, r, src)
1400    }
1401}
1402
1403/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1404/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1405///
1406/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_add_ph)
1407#[inline]
1408#[target_feature(enable = "avx512fp16,avx512vl")]
1409#[cfg_attr(test, assert_instr(vaddph))]
1410#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1411pub fn _mm256_maskz_add_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1412    unsafe {
1413        let r = _mm256_add_ph(a, b);
1414        simd_select_bitmask(k, r, _mm256_setzero_ph())
1415    }
1416}
1417
1418/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1419///
1420/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_ph)
1421#[inline]
1422#[target_feature(enable = "avx512fp16")]
1423#[cfg_attr(test, assert_instr(vaddph))]
1424#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1425pub fn _mm512_add_ph(a: __m512h, b: __m512h) -> __m512h {
1426    unsafe { simd_add(a, b) }
1427}
1428
1429/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1430/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1431///
1432/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_ph)
1433#[inline]
1434#[target_feature(enable = "avx512fp16")]
1435#[cfg_attr(test, assert_instr(vaddph))]
1436#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1437pub fn _mm512_mask_add_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1438    unsafe {
1439        let r = _mm512_add_ph(a, b);
1440        simd_select_bitmask(k, r, src)
1441    }
1442}
1443
1444/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1445/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1446///
1447/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_ph)
1448#[inline]
1449#[target_feature(enable = "avx512fp16")]
1450#[cfg_attr(test, assert_instr(vaddph))]
1451#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1452pub fn _mm512_maskz_add_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1453    unsafe {
1454        let r = _mm512_add_ph(a, b);
1455        simd_select_bitmask(k, r, _mm512_setzero_ph())
1456    }
1457}
1458
1459/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1460/// Rounding is done according to the rounding parameter, which can be one of:
1461///
1462/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1463/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1464/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1465/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1466/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1467///
1468/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_round_ph)
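///
/// # Example
///
/// A minimal usage sketch, not part of this module (nightly-only; assumes
/// `feature(f16, stdarch_x86_avx512_f16)` and runtime support for `avx512fp16`): the rounding mode
/// is supplied as a const generic, here round-toward-zero with exceptions suppressed.
///
/// ```ignore
/// #![feature(f16, stdarch_x86_avx512_f16)]
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16")]
/// fn add_round_to_zero(a: __m512h, b: __m512h) -> [f16; 32] {
///     let r = _mm512_add_round_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
///     let mut out = [0.0; 32];
///     unsafe { _mm512_storeu_ph(out.as_mut_ptr(), r) };
///     out
/// }
///
/// if is_x86_feature_detected!("avx512fp16") {
///     let out = unsafe { add_round_to_zero(_mm512_set1_ph(1.0), _mm512_set1_ph(2.0)) };
///     assert_eq!(out, [3.0; 32]);
/// }
/// ```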
1469#[inline]
1470#[target_feature(enable = "avx512fp16")]
1471#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
1472#[rustc_legacy_const_generics(2)]
1473#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1474pub fn _mm512_add_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
1475    unsafe {
1476        static_assert_rounding!(ROUNDING);
1477        vaddph(a, b, ROUNDING)
1478    }
1479}
1480
1481/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1482/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1483/// Rounding is done according to the rounding parameter, which can be one of:
1484///
1485/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1486/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1487/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1488/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1489/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1490///
1491/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_round_ph)
1492#[inline]
1493#[target_feature(enable = "avx512fp16")]
1494#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
1495#[rustc_legacy_const_generics(4)]
1496#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1497pub fn _mm512_mask_add_round_ph<const ROUNDING: i32>(
1498    src: __m512h,
1499    k: __mmask32,
1500    a: __m512h,
1501    b: __m512h,
1502) -> __m512h {
1503    unsafe {
1504        static_assert_rounding!(ROUNDING);
1505        let r = _mm512_add_round_ph::<ROUNDING>(a, b);
1506        simd_select_bitmask(k, r, src)
1507    }
1508}
1509
1510/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1511/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1512/// Rounding is done according to the rounding parameter, which can be one of:
1513///
1514/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1515/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1516/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1517/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1518///
1519/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_round_ph)
1520#[inline]
1521#[target_feature(enable = "avx512fp16")]
1522#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
1523#[rustc_legacy_const_generics(3)]
1524#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1525pub fn _mm512_maskz_add_round_ph<const ROUNDING: i32>(
1526    k: __mmask32,
1527    a: __m512h,
1528    b: __m512h,
1529) -> __m512h {
1530    unsafe {
1531        static_assert_rounding!(ROUNDING);
1532        let r = _mm512_add_round_ph::<ROUNDING>(a, b);
1533        simd_select_bitmask(k, r, _mm512_setzero_ph())
1534    }
1535}
1536
1537/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1538/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
1539/// Rounding is done according to the rounding parameter, which can be one of:
1540///
1541/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1542/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1543/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1544/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1545/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1546///
1547/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sh)
1548#[inline]
1549#[target_feature(enable = "avx512fp16")]
1550#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
1551#[rustc_legacy_const_generics(2)]
1552#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1553pub fn _mm_add_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
1554    static_assert_rounding!(ROUNDING);
1555    _mm_mask_add_round_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
1556}
1557
1558/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1559/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1560/// writemask k (the element is copied from src when mask bit 0 is not set).
1561/// Rounding is done according to the rounding parameter, which can be one of:
1562///
1563/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1564/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1565/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1566/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1567/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1568///
1569/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_sh)
1570#[inline]
1571#[target_feature(enable = "avx512fp16")]
1572#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
1573#[rustc_legacy_const_generics(4)]
1574#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1575pub fn _mm_mask_add_round_sh<const ROUNDING: i32>(
1576    src: __m128h,
1577    k: __mmask8,
1578    a: __m128h,
1579    b: __m128h,
1580) -> __m128h {
1581    unsafe {
1582        static_assert_rounding!(ROUNDING);
1583        vaddsh(a, b, src, k, ROUNDING)
1584    }
1585}
1586
1587/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1588/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1589/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1590/// Rounding is done according to the rounding parameter, which can be one of:
1591///
1592/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1593/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1594/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1595/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1596/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1597///
1598/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sh)
1599#[inline]
1600#[target_feature(enable = "avx512fp16")]
1601#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
1602#[rustc_legacy_const_generics(3)]
1603#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1604pub fn _mm_maskz_add_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1605    static_assert_rounding!(ROUNDING);
1606    _mm_mask_add_round_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
1607}
1608
1609/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1610/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
1611///
1612/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sh)
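///
/// # Example
///
/// A minimal usage sketch, not part of this module (nightly-only; assumes
/// `feature(f16, stdarch_x86_avx512_f16)` and runtime support for `avx512fp16`/`avx512vl`):
/// only lane 0 is added; lanes 1..=7 are copied from `a`.
///
/// ```ignore
/// #![feature(f16, stdarch_x86_avx512_f16)]
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn add_scalar(a: __m128h, b: __m128h) -> [f16; 8] {
///     let r = _mm_add_sh(a, b);
///     let mut out = [0.0; 8];
///     unsafe { _mm_storeu_ph(out.as_mut_ptr(), r) };
///     out
/// }
///
/// if is_x86_feature_detected!("avx512fp16") && is_x86_feature_detected!("avx512vl") {
///     let out = unsafe { add_scalar(_mm_set1_ph(1.0), _mm_set1_ph(2.0)) };
///     assert_eq!(out, [3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]);
/// }
/// ```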
1613#[inline]
1614#[target_feature(enable = "avx512fp16")]
1615#[cfg_attr(test, assert_instr(vaddsh))]
1616#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1617pub fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h {
1618    _mm_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
1619}
1620
1621/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1622/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1623/// writemask k (the element is copied from src when mask bit 0 is not set).
1624///
1625/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_sh)
1626#[inline]
1627#[target_feature(enable = "avx512fp16")]
1628#[cfg_attr(test, assert_instr(vaddsh))]
1629#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1630pub fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1631    _mm_mask_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
1632}
1633
1634/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1635/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1636/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1637///
1638/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sh)
1639#[inline]
1640#[target_feature(enable = "avx512fp16")]
1641#[cfg_attr(test, assert_instr(vaddsh))]
1642#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1643pub fn _mm_maskz_add_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1644    _mm_maskz_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
1645}
1646
1647/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
1648///
1649/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ph)
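///
/// # Example
///
/// A minimal usage sketch, not part of this module (nightly-only; assumes
/// `feature(f16, stdarch_x86_avx512_f16)` and runtime support for `avx512fp16`/`avx512vl`);
/// note the operand order: each lane computes `a[i] - b[i]`.
///
/// ```ignore
/// #![feature(f16, stdarch_x86_avx512_f16)]
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn sub_halves(a: __m128h, b: __m128h) -> [f16; 8] {
///     let r = _mm_sub_ph(a, b);
///     let mut out = [0.0; 8];
///     unsafe { _mm_storeu_ph(out.as_mut_ptr(), r) };
///     out
/// }
///
/// if is_x86_feature_detected!("avx512fp16") && is_x86_feature_detected!("avx512vl") {
///     let out = unsafe { sub_halves(_mm_set1_ph(5.0), _mm_set1_ph(2.0)) };
///     assert_eq!(out, [3.0; 8]);
/// }
/// ```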
1650#[inline]
1651#[target_feature(enable = "avx512fp16,avx512vl")]
1652#[cfg_attr(test, assert_instr(vsubph))]
1653#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1654pub fn _mm_sub_ph(a: __m128h, b: __m128h) -> __m128h {
1655    unsafe { simd_sub(a, b) }
1656}
1657
1658/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1659/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1660///
1661/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ph)
1662#[inline]
1663#[target_feature(enable = "avx512fp16,avx512vl")]
1664#[cfg_attr(test, assert_instr(vsubph))]
1665#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1666pub fn _mm_mask_sub_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1667    unsafe {
1668        let r = _mm_sub_ph(a, b);
1669        simd_select_bitmask(k, r, src)
1670    }
1671}
1672
1673/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1674/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1675///
1676/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ph)
1677#[inline]
1678#[target_feature(enable = "avx512fp16,avx512vl")]
1679#[cfg_attr(test, assert_instr(vsubph))]
1680#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1681pub fn _mm_maskz_sub_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1682    unsafe {
1683        let r = _mm_sub_ph(a, b);
1684        simd_select_bitmask(k, r, _mm_setzero_ph())
1685    }
1686}
1687
1688/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
1689///
1690/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_ph)
1691#[inline]
1692#[target_feature(enable = "avx512fp16,avx512vl")]
1693#[cfg_attr(test, assert_instr(vsubph))]
1694#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1695pub fn _mm256_sub_ph(a: __m256h, b: __m256h) -> __m256h {
1696    unsafe { simd_sub(a, b) }
1697}
1698
1699/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1700/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1701///
1702/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sub_ph)
1703#[inline]
1704#[target_feature(enable = "avx512fp16,avx512vl")]
1705#[cfg_attr(test, assert_instr(vsubph))]
1706#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1707pub fn _mm256_mask_sub_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1708    unsafe {
1709        let r = _mm256_sub_ph(a, b);
1710        simd_select_bitmask(k, r, src)
1711    }
1712}
1713
1714/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1715/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1716///
1717/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sub_ph)
1718#[inline]
1719#[target_feature(enable = "avx512fp16,avx512vl")]
1720#[cfg_attr(test, assert_instr(vsubph))]
1721#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1722pub fn _mm256_maskz_sub_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1723    unsafe {
1724        let r = _mm256_sub_ph(a, b);
1725        simd_select_bitmask(k, r, _mm256_setzero_ph())
1726    }
1727}
1728
1729/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
1730///
1731/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_ph)
1732#[inline]
1733#[target_feature(enable = "avx512fp16")]
1734#[cfg_attr(test, assert_instr(vsubph))]
1735#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1736pub fn _mm512_sub_ph(a: __m512h, b: __m512h) -> __m512h {
1737    unsafe { simd_sub(a, b) }
1738}
1739
1740/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1741/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1742///
1743/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_ph)
1744#[inline]
1745#[target_feature(enable = "avx512fp16")]
1746#[cfg_attr(test, assert_instr(vsubph))]
1747#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1748pub fn _mm512_mask_sub_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1749    unsafe {
1750        let r = _mm512_sub_ph(a, b);
1751        simd_select_bitmask(k, r, src)
1752    }
1753}
1754
1755/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1756/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1757///
1758/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_ph)
1759#[inline]
1760#[target_feature(enable = "avx512fp16")]
1761#[cfg_attr(test, assert_instr(vsubph))]
1762#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1763pub fn _mm512_maskz_sub_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1764    unsafe {
1765        let r = _mm512_sub_ph(a, b);
1766        simd_select_bitmask(k, r, _mm512_setzero_ph())
1767    }
1768}
1769
1770/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
1771/// Rounding is done according to the rounding parameter, which can be one of:
1772///
1773/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1774/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1775/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1776/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1777/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1778///
1779/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_ph)
1780#[inline]
1781#[target_feature(enable = "avx512fp16")]
1782#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
1783#[rustc_legacy_const_generics(2)]
1784#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1785pub fn _mm512_sub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
1786    unsafe {
1787        static_assert_rounding!(ROUNDING);
1788        vsubph(a, b, ROUNDING)
1789    }
1790}
1791
1792/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1793/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1794/// Rounding is done according to the rounding parameter, which can be one of:
1795///
1796/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1797/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1798/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1799/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1800/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1801///
1802/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_ph)
1803#[inline]
1804#[target_feature(enable = "avx512fp16")]
1805#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
1806#[rustc_legacy_const_generics(4)]
1807#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1808pub fn _mm512_mask_sub_round_ph<const ROUNDING: i32>(
1809    src: __m512h,
1810    k: __mmask32,
1811    a: __m512h,
1812    b: __m512h,
1813) -> __m512h {
1814    unsafe {
1815        static_assert_rounding!(ROUNDING);
1816        let r = _mm512_sub_round_ph::<ROUNDING>(a, b);
1817        simd_select_bitmask(k, r, src)
1818    }
1819}
1820
1821/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1822/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1823/// Rounding is done according to the rounding parameter, which can be one of:
1824///
1825/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1826/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1827/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1828/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1829/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1830///
1831/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_ph)
1832#[inline]
1833#[target_feature(enable = "avx512fp16")]
1834#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
1835#[rustc_legacy_const_generics(3)]
1836#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1837pub fn _mm512_maskz_sub_round_ph<const ROUNDING: i32>(
1838    k: __mmask32,
1839    a: __m512h,
1840    b: __m512h,
1841) -> __m512h {
1842    unsafe {
1843        static_assert_rounding!(ROUNDING);
1844        let r = _mm512_sub_round_ph::<ROUNDING>(a, b);
1845        simd_select_bitmask(k, r, _mm512_setzero_ph())
1846    }
1847}
1848
1849/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1850/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
1851/// Rounding is done according to the rounding parameter, which can be one of:
1852///
1853/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1854/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1855/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1856/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1857/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1858///
1859/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sh)
1860#[inline]
1861#[target_feature(enable = "avx512fp16")]
1862#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
1863#[rustc_legacy_const_generics(2)]
1864#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1865pub fn _mm_sub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
1866    static_assert_rounding!(ROUNDING);
1867    _mm_mask_sub_round_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
1868}
1869
1870/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1871/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1872/// writemask k (the element is copied from src when mask bit 0 is not set).
1873/// Rounding is done according to the rounding parameter, which can be one of:
1874///
1875/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1876/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1877/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1878/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1879/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1880///
1881/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sh)
1882#[inline]
1883#[target_feature(enable = "avx512fp16")]
1884#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
1885#[rustc_legacy_const_generics(4)]
1886#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1887pub fn _mm_mask_sub_round_sh<const ROUNDING: i32>(
1888    src: __m128h,
1889    k: __mmask8,
1890    a: __m128h,
1891    b: __m128h,
1892) -> __m128h {
1893    unsafe {
1894        static_assert_rounding!(ROUNDING);
1895        vsubsh(a, b, src, k, ROUNDING)
1896    }
1897}
1898
1899/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1900/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1901/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1902/// Rounding is done according to the rounding parameter, which can be one of:
1903///
1904/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1905/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1906/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1907/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1908/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1909///
1910/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sh)
1911#[inline]
1912#[target_feature(enable = "avx512fp16")]
1913#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
1914#[rustc_legacy_const_generics(3)]
1915#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1916pub fn _mm_maskz_sub_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1917    static_assert_rounding!(ROUNDING);
1918    _mm_mask_sub_round_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
1919}
1920
1921/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1922/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
1923///
1924/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sh)
1925#[inline]
1926#[target_feature(enable = "avx512fp16")]
1927#[cfg_attr(test, assert_instr(vsubsh))]
1928#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1929pub fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h {
1930    _mm_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
1931}
1932
1933/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1934/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1935/// writemask k (the element is copied from src when mask bit 0 is not set).
1936///
1937/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_sh)
1938#[inline]
1939#[target_feature(enable = "avx512fp16")]
1940#[cfg_attr(test, assert_instr(vsubsh))]
1941#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1942pub fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1943    _mm_mask_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
1944}
1945
1946/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1947/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1948/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1949///
1950/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sh)
1951#[inline]
1952#[target_feature(enable = "avx512fp16")]
1953#[cfg_attr(test, assert_instr(vsubsh))]
1954#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1955pub fn _mm_maskz_sub_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1956    _mm_maskz_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
1957}
1958
1959/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1960///
1961/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ph)
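///
/// # Example
///
/// A minimal usage sketch, not part of this module (nightly-only; assumes
/// `feature(f16, stdarch_x86_avx512_f16)` and runtime support for `avx512fp16`/`avx512vl`):
///
/// ```ignore
/// #![feature(f16, stdarch_x86_avx512_f16)]
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn mul_halves(a: __m128h, b: __m128h) -> [f16; 8] {
///     let r = _mm_mul_ph(a, b); // lane-wise a[i] * b[i]
///     let mut out = [0.0; 8];
///     unsafe { _mm_storeu_ph(out.as_mut_ptr(), r) };
///     out
/// }
///
/// if is_x86_feature_detected!("avx512fp16") && is_x86_feature_detected!("avx512vl") {
///     let out = unsafe { mul_halves(_mm_set1_ph(1.5), _mm_set1_ph(2.0)) };
///     assert_eq!(out, [3.0; 8]);
/// }
/// ```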
1962#[inline]
1963#[target_feature(enable = "avx512fp16,avx512vl")]
1964#[cfg_attr(test, assert_instr(vmulph))]
1965#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1966pub fn _mm_mul_ph(a: __m128h, b: __m128h) -> __m128h {
1967    unsafe { simd_mul(a, b) }
1968}
1969
1970/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1971/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1972///
1973/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ph)
1974#[inline]
1975#[target_feature(enable = "avx512fp16,avx512vl")]
1976#[cfg_attr(test, assert_instr(vmulph))]
1977#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1978pub fn _mm_mask_mul_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1979    unsafe {
1980        let r = _mm_mul_ph(a, b);
1981        simd_select_bitmask(k, r, src)
1982    }
1983}
1984
1985/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1986/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1987///
1988/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ph)
1989#[inline]
1990#[target_feature(enable = "avx512fp16,avx512vl")]
1991#[cfg_attr(test, assert_instr(vmulph))]
1992#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1993pub fn _mm_maskz_mul_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1994    unsafe {
1995        let r = _mm_mul_ph(a, b);
1996        simd_select_bitmask(k, r, _mm_setzero_ph())
1997    }
1998}
1999
2000/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
2001///
2002/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_ph)
2003#[inline]
2004#[target_feature(enable = "avx512fp16,avx512vl")]
2005#[cfg_attr(test, assert_instr(vmulph))]
2006#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2007pub fn _mm256_mul_ph(a: __m256h, b: __m256h) -> __m256h {
2008    unsafe { simd_mul(a, b) }
2009}
2010
2011/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2012/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2013///
2014/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_ph)
2015#[inline]
2016#[target_feature(enable = "avx512fp16,avx512vl")]
2017#[cfg_attr(test, assert_instr(vmulph))]
2018#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2019pub fn _mm256_mask_mul_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2020    unsafe {
2021        let r = _mm256_mul_ph(a, b);
2022        simd_select_bitmask(k, r, src)
2023    }
2024}
2025
2026/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2027/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2028///
2029/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_ph)
2030#[inline]
2031#[target_feature(enable = "avx512fp16,avx512vl")]
2032#[cfg_attr(test, assert_instr(vmulph))]
2033#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2034pub fn _mm256_maskz_mul_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2035    unsafe {
2036        let r = _mm256_mul_ph(a, b);
2037        simd_select_bitmask(k, r, _mm256_setzero_ph())
2038    }
2039}
2040
2041/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
2042///
2043/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_ph)
2044#[inline]
2045#[target_feature(enable = "avx512fp16")]
2046#[cfg_attr(test, assert_instr(vmulph))]
2047#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2048pub fn _mm512_mul_ph(a: __m512h, b: __m512h) -> __m512h {
2049    unsafe { simd_mul(a, b) }
2050}
2051
2052/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2053/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2054///
2055/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_ph)
2056#[inline]
2057#[target_feature(enable = "avx512fp16")]
2058#[cfg_attr(test, assert_instr(vmulph))]
2059#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2060pub fn _mm512_mask_mul_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2061    unsafe {
2062        let r = _mm512_mul_ph(a, b);
2063        simd_select_bitmask(k, r, src)
2064    }
2065}
2066
2067/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2068/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2069///
2070/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_ph)
2071#[inline]
2072#[target_feature(enable = "avx512fp16")]
2073#[cfg_attr(test, assert_instr(vmulph))]
2074#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2075pub fn _mm512_maskz_mul_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2076    unsafe {
2077        let r = _mm512_mul_ph(a, b);
2078        simd_select_bitmask(k, r, _mm512_setzero_ph())
2079    }
2080}
2081
2082/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
2083/// Rounding is done according to the rounding parameter, which can be one of:
2084///
2085/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2086/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2087/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2088/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2089/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2090///
2091/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_ph)
2092#[inline]
2093#[target_feature(enable = "avx512fp16")]
2094#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2095#[rustc_legacy_const_generics(2)]
2096#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2097pub fn _mm512_mul_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
2098    unsafe {
2099        static_assert_rounding!(ROUNDING);
2100        vmulph(a, b, ROUNDING)
2101    }
2102}
2103
2104/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2105/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2106/// Rounding is done according to the rounding parameter, which can be one of:
2107///
2108/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2109/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2110/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2111/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2112/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2113///
2114/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_ph)
2115#[inline]
2116#[target_feature(enable = "avx512fp16")]
2117#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2118#[rustc_legacy_const_generics(4)]
2119#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2120pub fn _mm512_mask_mul_round_ph<const ROUNDING: i32>(
2121    src: __m512h,
2122    k: __mmask32,
2123    a: __m512h,
2124    b: __m512h,
2125) -> __m512h {
2126    unsafe {
2127        static_assert_rounding!(ROUNDING);
2128        let r = _mm512_mul_round_ph::<ROUNDING>(a, b);
2129        simd_select_bitmask(k, r, src)
2130    }
2131}
2132
2133/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2134/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2135/// Rounding is done according to the rounding parameter, which can be one of:
2136///
2137/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2138/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2139/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2140/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2141/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2142///
2143/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_ph)
2144#[inline]
2145#[target_feature(enable = "avx512fp16")]
2146#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2147#[rustc_legacy_const_generics(3)]
2148#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2149pub fn _mm512_maskz_mul_round_ph<const ROUNDING: i32>(
2150    k: __mmask32,
2151    a: __m512h,
2152    b: __m512h,
2153) -> __m512h {
2154    unsafe {
2155        static_assert_rounding!(ROUNDING);
2156        let r = _mm512_mul_round_ph::<ROUNDING>(a, b);
2157        simd_select_bitmask(k, r, _mm512_setzero_ph())
2158    }
2159}
2160
2161/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2162/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2163/// Rounding is done according to the rounding parameter, which can be one of:
2164///
2165/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2166/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2167/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2168/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2169/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2170///
2171/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sh)
2172#[inline]
2173#[target_feature(enable = "avx512fp16")]
2174#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
2175#[rustc_legacy_const_generics(2)]
2176#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2177pub fn _mm_mul_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
2178    static_assert_rounding!(ROUNDING);
2179    _mm_mask_mul_round_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
2180}
2181
2182/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2183/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2184/// writemask k (the element is copied from src when mask bit 0 is not set).
2185/// Rounding is done according to the rounding parameter, which can be one of:
2186///
2187/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2188/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2189/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2190/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2191/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2192///
2193/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sh)
2194#[inline]
2195#[target_feature(enable = "avx512fp16")]
2196#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
2197#[rustc_legacy_const_generics(4)]
2198#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2199pub fn _mm_mask_mul_round_sh<const ROUNDING: i32>(
2200    src: __m128h,
2201    k: __mmask8,
2202    a: __m128h,
2203    b: __m128h,
2204) -> __m128h {
2205    unsafe {
2206        static_assert_rounding!(ROUNDING);
2207        vmulsh(a, b, src, k, ROUNDING)
2208    }
2209}
2210
2211/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2212/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2213/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2214/// Rounding is done according to the rounding parameter, which can be one of:
2215///
2216/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2217/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2218/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2219/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2220/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2221///
2222/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sh)
2223#[inline]
2224#[target_feature(enable = "avx512fp16")]
2225#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
2226#[rustc_legacy_const_generics(3)]
2227#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2228pub fn _mm_maskz_mul_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2229    static_assert_rounding!(ROUNDING);
2230    _mm_mask_mul_round_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
2231}
2232
2233/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2234/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2235///
2236/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sh)
2237#[inline]
2238#[target_feature(enable = "avx512fp16")]
2239#[cfg_attr(test, assert_instr(vmulsh))]
2240#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2241pub fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h {
2242    _mm_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
2243}
2244
2245/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2246/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2247/// writemask k (the element is copied from src when mask bit 0 is not set).
2248///
2249/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sh)
2250#[inline]
2251#[target_feature(enable = "avx512fp16")]
2252#[cfg_attr(test, assert_instr(vmulsh))]
2253#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2254pub fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2255    _mm_mask_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2256}
2257
2258/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2259/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2260/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2261///
2262/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sh)
2263#[inline]
2264#[target_feature(enable = "avx512fp16")]
2265#[cfg_attr(test, assert_instr(vmulsh))]
2266#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2267pub fn _mm_maskz_mul_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2268    _mm_maskz_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
2269}
2270
2271/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2272///
2273/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ph)
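///
/// # Example
///
/// A minimal usage sketch, not part of this module (nightly-only; assumes
/// `feature(f16, stdarch_x86_avx512_f16)` and runtime support for `avx512fp16`/`avx512vl`);
/// each lane computes `a[i] / b[i]`.
///
/// ```ignore
/// #![feature(f16, stdarch_x86_avx512_f16)]
/// use std::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn div_halves(a: __m128h, b: __m128h) -> [f16; 8] {
///     let r = _mm_div_ph(a, b);
///     let mut out = [0.0; 8];
///     unsafe { _mm_storeu_ph(out.as_mut_ptr(), r) };
///     out
/// }
///
/// if is_x86_feature_detected!("avx512fp16") && is_x86_feature_detected!("avx512vl") {
///     let out = unsafe { div_halves(_mm_set1_ph(6.0), _mm_set1_ph(2.0)) };
///     assert_eq!(out, [3.0; 8]);
/// }
/// ```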
2274#[inline]
2275#[target_feature(enable = "avx512fp16,avx512vl")]
2276#[cfg_attr(test, assert_instr(vdivph))]
2277#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2278pub fn _mm_div_ph(a: __m128h, b: __m128h) -> __m128h {
2279    unsafe { simd_div(a, b) }
2280}
2281
2282/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2283/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2284///
2285/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ph)
2286#[inline]
2287#[target_feature(enable = "avx512fp16,avx512vl")]
2288#[cfg_attr(test, assert_instr(vdivph))]
2289#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2290pub fn _mm_mask_div_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2291    unsafe {
2292        let r = _mm_div_ph(a, b);
2293        simd_select_bitmask(k, r, src)
2294    }
2295}
2296
2297/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2298/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2299///
2300/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ph)
2301#[inline]
2302#[target_feature(enable = "avx512fp16,avx512vl")]
2303#[cfg_attr(test, assert_instr(vdivph))]
2304#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2305pub fn _mm_maskz_div_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2306    unsafe {
2307        let r = _mm_div_ph(a, b);
2308        simd_select_bitmask(k, r, _mm_setzero_ph())
2309    }
2310}
2311
2312/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2313///
2314/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_div_ph)
2315#[inline]
2316#[target_feature(enable = "avx512fp16,avx512vl")]
2317#[cfg_attr(test, assert_instr(vdivph))]
2318#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2319pub fn _mm256_div_ph(a: __m256h, b: __m256h) -> __m256h {
2320    unsafe { simd_div(a, b) }
2321}
2322
2323/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2324/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2325///
2326/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_div_ph)
2327#[inline]
2328#[target_feature(enable = "avx512fp16,avx512vl")]
2329#[cfg_attr(test, assert_instr(vdivph))]
2330#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2331pub fn _mm256_mask_div_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2332    unsafe {
2333        let r = _mm256_div_ph(a, b);
2334        simd_select_bitmask(k, r, src)
2335    }
2336}
2337
2338/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2339/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2340///
2341/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_div_ph)
2342#[inline]
2343#[target_feature(enable = "avx512fp16,avx512vl")]
2344#[cfg_attr(test, assert_instr(vdivph))]
2345#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2346pub fn _mm256_maskz_div_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2347    unsafe {
2348        let r = _mm256_div_ph(a, b);
2349        simd_select_bitmask(k, r, _mm256_setzero_ph())
2350    }
2351}
2352
2353/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2354///
2355/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_ph)
2356#[inline]
2357#[target_feature(enable = "avx512fp16")]
2358#[cfg_attr(test, assert_instr(vdivph))]
2359#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2360pub fn _mm512_div_ph(a: __m512h, b: __m512h) -> __m512h {
2361    unsafe { simd_div(a, b) }
2362}
2363
2364/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2365/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2366///
2367/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_ph)
2368#[inline]
2369#[target_feature(enable = "avx512fp16")]
2370#[cfg_attr(test, assert_instr(vdivph))]
2371#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2372pub fn _mm512_mask_div_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2373    unsafe {
2374        let r = _mm512_div_ph(a, b);
2375        simd_select_bitmask(k, r, src)
2376    }
2377}
2378
2379/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2380/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2381///
2382/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_ph)
2383#[inline]
2384#[target_feature(enable = "avx512fp16")]
2385#[cfg_attr(test, assert_instr(vdivph))]
2386#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2387pub fn _mm512_maskz_div_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2388    unsafe {
2389        let r = _mm512_div_ph(a, b);
2390        simd_select_bitmask(k, r, _mm512_setzero_ph())
2391    }
2392}
2393
2394/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2395/// Rounding is done according to the rounding parameter, which can be one of:
2396///
2397/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2398/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2399/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2400/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2401/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2402///
2403/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_round_ph)
2404#[inline]
2405#[target_feature(enable = "avx512fp16")]
2406#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
2407#[rustc_legacy_const_generics(2)]
2408#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2409pub fn _mm512_div_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
2410    unsafe {
2411        static_assert_rounding!(ROUNDING);
2412        vdivph(a, b, ROUNDING)
2413    }
2414}
2415
2416/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2417/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2418/// Rounding is done according to the rounding parameter, which can be one of:
2419///
2420/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2421/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2422/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2423/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2424/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2425///
2426/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_round_ph)
2427#[inline]
2428#[target_feature(enable = "avx512fp16")]
2429#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
2430#[rustc_legacy_const_generics(4)]
2431#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2432pub fn _mm512_mask_div_round_ph<const ROUNDING: i32>(
2433    src: __m512h,
2434    k: __mmask32,
2435    a: __m512h,
2436    b: __m512h,
2437) -> __m512h {
2438    unsafe {
2439        static_assert_rounding!(ROUNDING);
2440        let r = _mm512_div_round_ph::<ROUNDING>(a, b);
2441        simd_select_bitmask(k, r, src)
2442    }
2443}
2444
2445/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2446/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2447/// Rounding is done according to the rounding parameter, which can be one of:
2448///
2449/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2450/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2451/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2452/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2453/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2454///
2455/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_round_ph)
2456#[inline]
2457#[target_feature(enable = "avx512fp16")]
2458#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
2459#[rustc_legacy_const_generics(3)]
2460#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2461pub fn _mm512_maskz_div_round_ph<const ROUNDING: i32>(
2462    k: __mmask32,
2463    a: __m512h,
2464    b: __m512h,
2465) -> __m512h {
2466    unsafe {
2467        static_assert_rounding!(ROUNDING);
2468        let r = _mm512_div_round_ph::<ROUNDING>(a, b);
2469        simd_select_bitmask(k, r, _mm512_setzero_ph())
2470    }
2471}
2472
2473/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
2474/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2475/// Rounding is done according to the rounding parameter, which can be one of:
2476///
2477/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2478/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2479/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2480/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2481/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2482///
2483/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_sh)
2484#[inline]
2485#[target_feature(enable = "avx512fp16")]
2486#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
2487#[rustc_legacy_const_generics(2)]
2488#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2489pub fn _mm_div_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
2490    static_assert_rounding!(ROUNDING);
2491    _mm_mask_div_round_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
2492}
2493
2494/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
2495/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2496/// writemask k (the element is copied from src when mask bit 0 is not set).
2497/// Rounding is done according to the rounding parameter, which can be one of:
2498///
2499/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2500/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2501/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2502/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2503/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2504///
2505/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_sh)
2506#[inline]
2507#[target_feature(enable = "avx512fp16")]
2508#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
2509#[rustc_legacy_const_generics(4)]
2510#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2511pub fn _mm_mask_div_round_sh<const ROUNDING: i32>(
2512    src: __m128h,
2513    k: __mmask8,
2514    a: __m128h,
2515    b: __m128h,
2516) -> __m128h {
2517    unsafe {
2518        static_assert_rounding!(ROUNDING);
2519        vdivsh(a, b, src, k, ROUNDING)
2520    }
2521}
2522
2523/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
2524/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2525/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2526/// Rounding is done according to the rounding parameter, which can be one of:
2527///
2528/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2529/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2530/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2531/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2532/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2533///
2534/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_sh)
2535#[inline]
2536#[target_feature(enable = "avx512fp16")]
2537#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
2538#[rustc_legacy_const_generics(3)]
2539#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2540pub fn _mm_maskz_div_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2541    static_assert_rounding!(ROUNDING);
2542    _mm_mask_div_round_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
2543}
2544
2545/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
2546/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2547///
2548/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sh)
2549#[inline]
2550#[target_feature(enable = "avx512fp16")]
2551#[cfg_attr(test, assert_instr(vdivsh))]
2552#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2553pub fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h {
2554    _mm_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
2555}
2556
2557/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
2558/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2559/// writemask k (the element is copied from src when mask bit 0 is not set).
2560///
2561/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_sh)
2562#[inline]
2563#[target_feature(enable = "avx512fp16")]
2564#[cfg_attr(test, assert_instr(vdivsh))]
2565#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2566pub fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2567    _mm_mask_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2568}
2569
2570/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
2571/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2572/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2573///
2574/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_sh)
2575#[inline]
2576#[target_feature(enable = "avx512fp16")]
2577#[cfg_attr(test, assert_instr(vdivsh))]
2578#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2579pub fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2580    _mm_maskz_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
2581}
2582
2583/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2584/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2585/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2586///
2587/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pch)
2588#[inline]
2589#[target_feature(enable = "avx512fp16,avx512vl")]
2590#[cfg_attr(test, assert_instr(vfmulcph))]
2591#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2592pub fn _mm_mul_pch(a: __m128h, b: __m128h) -> __m128h {
2593    _mm_mask_mul_pch(_mm_undefined_ph(), 0xff, a, b)
2594}
2595
2596/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2597/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2598/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2599///
2600/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_pch)
2601#[inline]
2602#[target_feature(enable = "avx512fp16,avx512vl")]
2603#[cfg_attr(test, assert_instr(vfmulcph))]
2604#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2605pub fn _mm_mask_mul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2606    unsafe { transmute(vfmulcph_128(transmute(a), transmute(b), transmute(src), k)) }
2607}
2608
2609/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2610/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2611/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2612///
2613/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_pch)
2614#[inline]
2615#[target_feature(enable = "avx512fp16,avx512vl")]
2616#[cfg_attr(test, assert_instr(vfmulcph))]
2617#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2618pub fn _mm_maskz_mul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2619    _mm_mask_mul_pch(_mm_setzero_ph(), k, a, b)
2620}
2621
2622/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2623/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2624/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2625///
2626/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_pch)
2627#[inline]
2628#[target_feature(enable = "avx512fp16,avx512vl")]
2629#[cfg_attr(test, assert_instr(vfmulcph))]
2630#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2631pub fn _mm256_mul_pch(a: __m256h, b: __m256h) -> __m256h {
2632    _mm256_mask_mul_pch(_mm256_undefined_ph(), 0xff, a, b)
2633}
2634
2635/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2636/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2637/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2638///
2639/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_pch)
2640#[inline]
2641#[target_feature(enable = "avx512fp16,avx512vl")]
2642#[cfg_attr(test, assert_instr(vfmulcph))]
2643#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2644pub fn _mm256_mask_mul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
2645    unsafe { transmute(vfmulcph_256(transmute(a), transmute(b), transmute(src), k)) }
2646}
2647
2648/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2649/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2650/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2651///
2652/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_pch)
2653#[inline]
2654#[target_feature(enable = "avx512fp16,avx512vl")]
2655#[cfg_attr(test, assert_instr(vfmulcph))]
2656#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2657pub fn _mm256_maskz_mul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
2658    _mm256_mask_mul_pch(_mm256_setzero_ph(), k, a, b)
2659}
2660
2661/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2662/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2663/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2664///
2665/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_pch)
2666#[inline]
2667#[target_feature(enable = "avx512fp16")]
2668#[cfg_attr(test, assert_instr(vfmulcph))]
2669#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2670pub fn _mm512_mul_pch(a: __m512h, b: __m512h) -> __m512h {
2671    _mm512_mask_mul_pch(_mm512_undefined_ph(), 0xffff, a, b)
2672}
2673
2674/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2675/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2676/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2677///
2678/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_pch)
2679#[inline]
2680#[target_feature(enable = "avx512fp16")]
2681#[cfg_attr(test, assert_instr(vfmulcph))]
2682#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2683pub fn _mm512_mask_mul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
2684    _mm512_mask_mul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2685}
2686
2687/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2688/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2689/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2690///
2691/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_pch)
2692#[inline]
2693#[target_feature(enable = "avx512fp16")]
2694#[cfg_attr(test, assert_instr(vfmulcph))]
2695#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2696pub fn _mm512_maskz_mul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
2697    _mm512_mask_mul_pch(_mm512_setzero_ph(), k, a, b)
2698}
2699
2700/// Multiply the packed complex numbers in a and b, and store the results in dst. Each complex number is
2701/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2702/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2703///
2704/// Rounding is done according to the rounding parameter, which can be one of:
2705///
2706/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2707/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2708/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2709/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2710/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2711///
2712/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_pch)
2713#[inline]
2714#[target_feature(enable = "avx512fp16")]
2715#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
2716#[rustc_legacy_const_generics(2)]
2717#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2718pub fn _mm512_mul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
2719    static_assert_rounding!(ROUNDING);
2720    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
2721}
2722
2723/// Multiply the packed complex numbers in a and b, and store the results in dst using writemask k (the element
2724/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2725/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2726///
2727/// Rounding is done according to the rounding parameter, which can be one of:
2728///
2729/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2730/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2731/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2732/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2733/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2734///
2735/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_pch)
2736#[inline]
2737#[target_feature(enable = "avx512fp16")]
2738#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
2739#[rustc_legacy_const_generics(4)]
2740#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2741pub fn _mm512_mask_mul_round_pch<const ROUNDING: i32>(
2742    src: __m512h,
2743    k: __mmask16,
2744    a: __m512h,
2745    b: __m512h,
2746) -> __m512h {
2747    unsafe {
2748        static_assert_rounding!(ROUNDING);
2749        transmute(vfmulcph_512(
2750            transmute(a),
2751            transmute(b),
2752            transmute(src),
2753            k,
2754            ROUNDING,
2755        ))
2756    }
2757}
2758
2759/// Multiply the packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2760/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2761/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2762///
2763/// Rounding is done according to the rounding parameter, which can be one of:
2764///
2765/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2766/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2767/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2768/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2769/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2770///
2771/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_pch)
2772#[inline]
2773#[target_feature(enable = "avx512fp16")]
2774#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
2775#[rustc_legacy_const_generics(3)]
2776#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2777pub fn _mm512_maskz_mul_round_pch<const ROUNDING: i32>(
2778    k: __mmask16,
2779    a: __m512h,
2780    b: __m512h,
2781) -> __m512h {
2782    static_assert_rounding!(ROUNDING);
2783    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
2784}
2785
2786/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst,
2787/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
2788/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2789/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2790///
2791/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sch)
2792#[inline]
2793#[target_feature(enable = "avx512fp16")]
2794#[cfg_attr(test, assert_instr(vfmulcsh))]
2795#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2796pub fn _mm_mul_sch(a: __m128h, b: __m128h) -> __m128h {
2797    _mm_mask_mul_sch(_mm_undefined_ph(), 0xff, a, b)
2798}
2799
2800/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2801/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed
2802/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent
2803/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2804///
2805/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sch)
2806#[inline]
2807#[target_feature(enable = "avx512fp16")]
2808#[cfg_attr(test, assert_instr(vfmulcsh))]
2809#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2810pub fn _mm_mask_mul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2811    _mm_mask_mul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2812}
2813
2814/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2815/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements
2816/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
2817/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2818///
2819/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sch)
2820#[inline]
2821#[target_feature(enable = "avx512fp16")]
2822#[cfg_attr(test, assert_instr(vfmulcsh))]
2823#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2824pub fn _mm_maskz_mul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2825    _mm_mask_mul_sch(_mm_setzero_ph(), k, a, b)
2826}
2827
2828/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst,
2829/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
2830/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2831/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2832///
2833/// Rounding is done according to the rounding parameter, which can be one of:
2834///
2835/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2836/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2837/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2838/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2839/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2840///
2841/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sch)
2842#[inline]
2843#[target_feature(enable = "avx512fp16")]
2844#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
2845#[rustc_legacy_const_generics(2)]
2846#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2847pub fn _mm_mul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
2848    static_assert_rounding!(ROUNDING);
2849    _mm_mask_mul_round_sch::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
2850}
2851
2852/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2853/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed
2854/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
2855/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2856///
2857/// Rounding is done according to the rounding parameter, which can be one of:
2858///
2859/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2860/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2861/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2862/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2863/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2864///
2865/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sch)
2866#[inline]
2867#[target_feature(enable = "avx512fp16")]
2868#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
2869#[rustc_legacy_const_generics(4)]
2870#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2871pub fn _mm_mask_mul_round_sch<const ROUNDING: i32>(
2872    src: __m128h,
2873    k: __mmask8,
2874    a: __m128h,
2875    b: __m128h,
2876) -> __m128h {
2877    unsafe {
2878        static_assert_rounding!(ROUNDING);
2879        transmute(vfmulcsh(
2880            transmute(a),
2881            transmute(b),
2882            transmute(src),
2883            k,
2884            ROUNDING,
2885        ))
2886    }
2887}
2888
2889/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2890/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements
2891/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
2892/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2893///
2894/// Rounding is done according to the rounding parameter, which can be one of:
2895///
2896/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2897/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2898/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2899/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2900/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2901///
2902/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sch)
2903#[inline]
2904#[target_feature(enable = "avx512fp16")]
2905#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
2906#[rustc_legacy_const_generics(3)]
2907#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2908pub fn _mm_maskz_mul_round_sch<const ROUNDING: i32>(
2909    k: __mmask8,
2910    a: __m128h,
2911    b: __m128h,
2912) -> __m128h {
2913    static_assert_rounding!(ROUNDING);
2914    _mm_mask_mul_round_sch::<ROUNDING>(_mm_setzero_ph(), k, a, b)
2915}
2916
2917/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2918/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2919/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2920///
2921/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_pch)
2922#[inline]
2923#[target_feature(enable = "avx512fp16,avx512vl")]
2924#[cfg_attr(test, assert_instr(vfmulcph))]
2925#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2926pub fn _mm_fmul_pch(a: __m128h, b: __m128h) -> __m128h {
2927    _mm_mul_pch(a, b)
2928}
2929
2930/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2931/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2932/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2933///
2934/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_pch)
2935#[inline]
2936#[target_feature(enable = "avx512fp16,avx512vl")]
2937#[cfg_attr(test, assert_instr(vfmulcph))]
2938#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2939pub fn _mm_mask_fmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2940    _mm_mask_mul_pch(src, k, a, b)
2941}
2942
2943/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2944/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
2945/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2946///
2947/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_pch)
2948#[inline]
2949#[target_feature(enable = "avx512fp16,avx512vl")]
2950#[cfg_attr(test, assert_instr(vfmulcph))]
2951#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2952pub fn _mm_maskz_fmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2953    _mm_maskz_mul_pch(k, a, b)
2954}
2955
2956/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2957/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2958/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2959///
2960/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmul_pch)
2961#[inline]
2962#[target_feature(enable = "avx512fp16,avx512vl")]
2963#[cfg_attr(test, assert_instr(vfmulcph))]
2964#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2965pub fn _mm256_fmul_pch(a: __m256h, b: __m256h) -> __m256h {
2966    _mm256_mul_pch(a, b)
2967}
2968
2969/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2970/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
2971/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2972///
2973/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmul_pch)
2974#[inline]
2975#[target_feature(enable = "avx512fp16,avx512vl")]
2976#[cfg_attr(test, assert_instr(vfmulcph))]
2977#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2978pub fn _mm256_mask_fmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
2979    _mm256_mask_mul_pch(src, k, a, b)
2980}
2981
2982/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2983/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
2984/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2985///
2986/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmul_pch)
2987#[inline]
2988#[target_feature(enable = "avx512fp16,avx512vl")]
2989#[cfg_attr(test, assert_instr(vfmulcph))]
2990#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2991pub fn _mm256_maskz_fmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
2992    _mm256_maskz_mul_pch(k, a, b)
2993}
2994
2995/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed
2996/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2997///
2998/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_pch)
2999#[inline]
3000#[target_feature(enable = "avx512fp16")]
3001#[cfg_attr(test, assert_instr(vfmulcph))]
3002#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3003pub fn _mm512_fmul_pch(a: __m512h, b: __m512h) -> __m512h {
3004    _mm512_mul_pch(a, b)
3005}
3006
3007/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
3008/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3009/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3010///
3011/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_pch)
3012#[inline]
3013#[target_feature(enable = "avx512fp16")]
3014#[cfg_attr(test, assert_instr(vfmulcph))]
3015#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3016pub fn _mm512_mask_fmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3017    _mm512_mask_mul_pch(src, k, a, b)
3018}
3019
3020/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
3021/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3022/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3023///
3024/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_pch)
3025#[inline]
3026#[target_feature(enable = "avx512fp16")]
3027#[cfg_attr(test, assert_instr(vfmulcph))]
3028#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3029pub fn _mm512_maskz_fmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3030    _mm512_maskz_mul_pch(k, a, b)
3031}
3032
3033/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed
3034/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3035/// Rounding is done according to the rounding parameter, which can be one of:
3036///
3037/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3038/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3039/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3040/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3041/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3042///
3043/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_round_pch)
3044#[inline]
3045#[target_feature(enable = "avx512fp16")]
3046#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
3047#[rustc_legacy_const_generics(2)]
3048#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3049pub fn _mm512_fmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
3050    static_assert_rounding!(ROUNDING);
3051    _mm512_mul_round_pch::<ROUNDING>(a, b)
3052}
3053
3054/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
3055/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3056/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3057/// Rounding is done according to the rounding parameter, which can be one of:
3058///
3059/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3060/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3061/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3062/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3063/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3064///
3065/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_round_pch)
3066#[inline]
3067#[target_feature(enable = "avx512fp16")]
3068#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
3069#[rustc_legacy_const_generics(4)]
3070#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3071pub fn _mm512_mask_fmul_round_pch<const ROUNDING: i32>(
3072    src: __m512h,
3073    k: __mmask16,
3074    a: __m512h,
3075    b: __m512h,
3076) -> __m512h {
3077    static_assert_rounding!(ROUNDING);
3078    _mm512_mask_mul_round_pch::<ROUNDING>(src, k, a, b)
3079}
3080
3081/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
3082/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3083/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3084/// Rounding is done according to the rounding parameter, which can be one of:
3085///
3086/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3087/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3088/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3089/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3090/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3091///
3092/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_round_pch)
3093#[inline]
3094#[target_feature(enable = "avx512fp16")]
3095#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
3096#[rustc_legacy_const_generics(3)]
3097#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3098pub fn _mm512_maskz_fmul_round_pch<const ROUNDING: i32>(
3099    k: __mmask16,
3100    a: __m512h,
3101    b: __m512h,
3102) -> __m512h {
3103    static_assert_rounding!(ROUNDING);
3104    _mm512_maskz_mul_round_pch::<ROUNDING>(k, a, b)
3105}
3106
3107/// Multiply the lower complex numbers in a and b, and store the results in dst. Each complex number is
3108/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
3109/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3110///
3111/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_sch)
3112#[inline]
3113#[target_feature(enable = "avx512fp16")]
3114#[cfg_attr(test, assert_instr(vfmulcsh))]
3115#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3116pub fn _mm_fmul_sch(a: __m128h, b: __m128h) -> __m128h {
3117    _mm_mul_sch(a, b)
3118}
3119
3120/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element
3121/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
3122/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3123///
3124/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_sch)
3125#[inline]
3126#[target_feature(enable = "avx512fp16")]
3127#[cfg_attr(test, assert_instr(vfmulcsh))]
3128#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3129pub fn _mm_mask_fmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3130    _mm_mask_mul_sch(src, k, a, b)
3131}
3132
3133/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element
3134/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
3135/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3136///
3137/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_sch)
3138#[inline]
3139#[target_feature(enable = "avx512fp16")]
3140#[cfg_attr(test, assert_instr(vfmulcsh))]
3141#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3142pub fn _mm_maskz_fmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3143    _mm_maskz_mul_sch(k, a, b)
3144}
3145
3146/// Multiply the lower complex numbers in a and b, and store the results in dst. Each complex number is composed
3147/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3148///
3149/// Rounding is done according to the rounding parameter, which can be one of:
3150///
3151/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3152/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3153/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3154/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3155/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3156///
3157/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_round_sch)
3158#[inline]
3159#[target_feature(enable = "avx512fp16")]
3160#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
3161#[rustc_legacy_const_generics(2)]
3162#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3163pub fn _mm_fmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
3164    static_assert_rounding!(ROUNDING);
3165    _mm_mul_round_sch::<ROUNDING>(a, b)
3166}
3167
3168/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element
3169/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
3170/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3171///
3172/// Rounding is done according to the rounding parameter, which can be one of:
3173///
3174/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3175/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3176/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3177/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3178/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3179///
3180/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_round_sch)
3181#[inline]
3182#[target_feature(enable = "avx512fp16")]
3183#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
3184#[rustc_legacy_const_generics(4)]
3185#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3186pub fn _mm_mask_fmul_round_sch<const ROUNDING: i32>(
3187    src: __m128h,
3188    k: __mmask8,
3189    a: __m128h,
3190    b: __m128h,
3191) -> __m128h {
3192    static_assert_rounding!(ROUNDING);
3193    _mm_mask_mul_round_sch::<ROUNDING>(src, k, a, b)
3194}
3195
3196/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element
3197/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
3198/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3199///
3200/// Rounding is done according to the rounding parameter, which can be one of:
3201///
3202/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3203/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3204/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3205/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3206/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3207///
3208/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_round_sch)
3209#[inline]
3210#[target_feature(enable = "avx512fp16")]
3211#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
3212#[rustc_legacy_const_generics(3)]
3213#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3214pub fn _mm_maskz_fmul_round_sch<const ROUNDING: i32>(
3215    k: __mmask8,
3216    a: __m128h,
3217    b: __m128h,
3218) -> __m128h {
3219    static_assert_rounding!(ROUNDING);
3220    _mm_maskz_mul_round_sch::<ROUNDING>(k, a, b)
3221}
3222
3223/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3224/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3225/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3226/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3227///
3228/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_pch)
3229#[inline]
3230#[target_feature(enable = "avx512fp16,avx512vl")]
3231#[cfg_attr(test, assert_instr(vfcmulcph))]
3232#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3233pub fn _mm_cmul_pch(a: __m128h, b: __m128h) -> __m128h {
3234    _mm_mask_cmul_pch(_mm_undefined_ph(), 0xff, a, b)
3235}
3236
3237/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3238/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3239/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3240/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3241///
3242/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_pch)
3243#[inline]
3244#[target_feature(enable = "avx512fp16,avx512vl")]
3245#[cfg_attr(test, assert_instr(vfcmulcph))]
3246#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3247pub fn _mm_mask_cmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3248    unsafe { transmute(vfcmulcph_128(transmute(a), transmute(b), transmute(src), k)) }
3249}
3250
3251/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3252/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3253/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3254/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3255///
3256/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_pch)
3257#[inline]
3258#[target_feature(enable = "avx512fp16,avx512vl")]
3259#[cfg_attr(test, assert_instr(vfcmulcph))]
3260#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3261pub fn _mm_maskz_cmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3262    _mm_mask_cmul_pch(_mm_setzero_ph(), k, a, b)
3263}
3264
3265/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3266/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3267/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3268/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3269///
3270/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmul_pch)
3271#[inline]
3272#[target_feature(enable = "avx512fp16,avx512vl")]
3273#[cfg_attr(test, assert_instr(vfcmulcph))]
3274#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3275pub fn _mm256_cmul_pch(a: __m256h, b: __m256h) -> __m256h {
3276    _mm256_mask_cmul_pch(_mm256_undefined_ph(), 0xff, a, b)
3277}
3278
3279/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3280/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3281/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3282/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3283///
3284/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmul_pch)
3285#[inline]
3286#[target_feature(enable = "avx512fp16,avx512vl")]
3287#[cfg_attr(test, assert_instr(vfcmulcph))]
3288#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3289pub fn _mm256_mask_cmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3290    unsafe { transmute(vfcmulcph_256(transmute(a), transmute(b), transmute(src), k)) }
3291}
3292
3293/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3294/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3295/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3296/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3297///
3298/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cmul_pch)
3299#[inline]
3300#[target_feature(enable = "avx512fp16,avx512vl")]
3301#[cfg_attr(test, assert_instr(vfcmulcph))]
3302#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3303pub fn _mm256_maskz_cmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3304    _mm256_mask_cmul_pch(_mm256_setzero_ph(), k, a, b)
3305}
3306
3307/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3308/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3309/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3310/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3311///
3312/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_pch)
3313#[inline]
3314#[target_feature(enable = "avx512fp16")]
3315#[cfg_attr(test, assert_instr(vfcmulcph))]
3316#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3317pub fn _mm512_cmul_pch(a: __m512h, b: __m512h) -> __m512h {
3318    _mm512_mask_cmul_pch(_mm512_undefined_ph(), 0xffff, a, b)
3319}
3320
3321/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3322/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3323/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3324/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3325///
3326/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_pch)
3327#[inline]
3328#[target_feature(enable = "avx512fp16")]
3329#[cfg_attr(test, assert_instr(vfcmulcph))]
3330#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3331pub fn _mm512_mask_cmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3332    _mm512_mask_cmul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
3333}
3334
3335/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3336/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3337/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3338/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3339///
3340/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_pch)
3341#[inline]
3342#[target_feature(enable = "avx512fp16")]
3343#[cfg_attr(test, assert_instr(vfcmulcph))]
3344#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3345pub fn _mm512_maskz_cmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3346    _mm512_mask_cmul_pch(_mm512_setzero_ph(), k, a, b)
3347}
3348
3349/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3350/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3351/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3352/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3353///
3354/// Rounding is done according to the rounding parameter, which can be one of:
3355///
3356/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3357/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3358/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3359/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3360/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3361///
3362/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_round_pch)
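///
/// The rounding mode is selected via the const generic parameter, e.g. (illustrative only, not a
/// doctest; assumes the `avx512fp16` target feature and nightly `f16` support):
///
/// ```ignore
/// let a = _mm512_set1_ph(1.0);
/// let b = _mm512_set1_ph(2.0);
/// let r = _mm512_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
/// ```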
3363#[inline]
3364#[target_feature(enable = "avx512fp16")]
3365#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3366#[rustc_legacy_const_generics(2)]
3367#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3368pub fn _mm512_cmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
3369    static_assert_rounding!(ROUNDING);
3370    _mm512_mask_cmul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
3371}
3372
3373/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3374/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3375/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3376/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3377///
3378/// Rounding is done according to the rounding parameter, which can be one of:
3379///
3380/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3381/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3382/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3383/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3384/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3385///
3386/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_round_pch)
3387#[inline]
3388#[target_feature(enable = "avx512fp16")]
3389#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3390#[rustc_legacy_const_generics(4)]
3391#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3392pub fn _mm512_mask_cmul_round_pch<const ROUNDING: i32>(
3393    src: __m512h,
3394    k: __mmask16,
3395    a: __m512h,
3396    b: __m512h,
3397) -> __m512h {
3398    unsafe {
3399        static_assert_rounding!(ROUNDING);
3400        transmute(vfcmulcph_512(
3401            transmute(a),
3402            transmute(b),
3403            transmute(src),
3404            k,
3405            ROUNDING,
3406        ))
3407    }
3408}
3409
3410/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3411/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3412/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3413/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3414///
3415/// Rounding is done according to the rounding parameter, which can be one of:
3416///
3417/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3418/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3419/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3420/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3421/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3422///
3423/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_round_pch)
3424#[inline]
3425#[target_feature(enable = "avx512fp16")]
3426#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3427#[rustc_legacy_const_generics(3)]
3428#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3429pub fn _mm512_maskz_cmul_round_pch<const ROUNDING: i32>(
3430    k: __mmask16,
3431    a: __m512h,
3432    b: __m512h,
3433) -> __m512h {
3434    static_assert_rounding!(ROUNDING);
3435    _mm512_mask_cmul_round_pch::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
3436}
3437
3438/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3439/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3440/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3441///
3442/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_sch)
3443#[inline]
3444#[target_feature(enable = "avx512fp16")]
3445#[cfg_attr(test, assert_instr(vfcmulcsh))]
3446#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3447pub fn _mm_cmul_sch(a: __m128h, b: __m128h) -> __m128h {
3448    _mm_mask_cmul_sch(_mm_undefined_ph(), 0xff, a, b)
3449}
3450
3451/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3452/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
3453/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3454/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3455///
3456/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_sch)
3457#[inline]
3458#[target_feature(enable = "avx512fp16")]
3459#[cfg_attr(test, assert_instr(vfcmulcsh))]
3460#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3461pub fn _mm_mask_cmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3462    _mm_mask_cmul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
3463}
3464
3465/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3466/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
3467/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3468/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3469///
3470/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_sch)
3471#[inline]
3472#[target_feature(enable = "avx512fp16")]
3473#[cfg_attr(test, assert_instr(vfcmulcsh))]
3474#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3475pub fn _mm_maskz_cmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3476    _mm_mask_cmul_sch(_mm_setzero_ph(), k, a, b)
3477}
3478
3479/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3480/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3481/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3482///
3483/// Rounding is done according to the rounding parameter, which can be one of:
3484///
3485/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3486/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3487/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3488/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3489/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3490///
3491/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_round_sch)
3492#[inline]
3493#[target_feature(enable = "avx512fp16")]
3494#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3495#[rustc_legacy_const_generics(2)]
3496#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3497pub fn _mm_cmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
3498    static_assert_rounding!(ROUNDING);
3499    _mm_mask_cmul_round_sch::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
3500}
3501
3502/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3503/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
3504/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3505/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3506///
3507/// Rounding is done according to the rounding parameter, which can be one of:
3508///
3509/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3510/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3511/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3512/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3513/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3514///
3515/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_round_sch)
3516#[inline]
3517#[target_feature(enable = "avx512fp16")]
3518#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3519#[rustc_legacy_const_generics(4)]
3520#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3521pub fn _mm_mask_cmul_round_sch<const ROUNDING: i32>(
3522    src: __m128h,
3523    k: __mmask8,
3524    a: __m128h,
3525    b: __m128h,
3526) -> __m128h {
3527    unsafe {
3528        static_assert_rounding!(ROUNDING);
3529        transmute(vfcmulcsh(
3530            transmute(a),
3531            transmute(b),
3532            transmute(src),
3533            k,
3534            ROUNDING,
3535        ))
3536    }
3537}
3538
3539/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3540/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
3541/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3542/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3543///
3544/// Rounding is done according to the rounding parameter, which can be one of:
3545///
3546/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3547/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3548/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3549/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3550/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3551///
3552/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_round_sch)
3553#[inline]
3554#[target_feature(enable = "avx512fp16")]
3555#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3556#[rustc_legacy_const_generics(3)]
3557#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3558pub fn _mm_maskz_cmul_round_sch<const ROUNDING: i32>(
3559    k: __mmask8,
3560    a: __m128h,
3561    b: __m128h,
3562) -> __m128h {
3563    static_assert_rounding!(ROUNDING);
3564    _mm_mask_cmul_round_sch::<ROUNDING>(_mm_setzero_ph(), k, a, b)
3565}
3566
3567/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3568/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3569/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3570/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3571///
3572/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_pch)
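///
/// A minimal usage sketch (illustrative only, not a doctest; assumes the `avx512fp16` and
/// `avx512vl` target features and nightly `f16` support):
///
/// ```ignore
/// // Lowest complex pair: a = 1 + 2i, b = 3 + 4i, so a * conj(b) = 11 + 2i.
/// let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0);
/// let b = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 3.0);
/// let r = _mm_fcmul_pch(a, b);
/// // Elements 0 and 1 of r are now 11.0 and 2.0.
/// ```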
3573#[inline]
3574#[target_feature(enable = "avx512fp16,avx512vl")]
3575#[cfg_attr(test, assert_instr(vfcmulcph))]
3576#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3577pub fn _mm_fcmul_pch(a: __m128h, b: __m128h) -> __m128h {
3578    _mm_cmul_pch(a, b)
3579}
3580
3581/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3582/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3583/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3584/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3585///
3586/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_pch)
3587#[inline]
3588#[target_feature(enable = "avx512fp16,avx512vl")]
3589#[cfg_attr(test, assert_instr(vfcmulcph))]
3590#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3591pub fn _mm_mask_fcmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3592    _mm_mask_cmul_pch(src, k, a, b)
3593}
3594
3595/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3596/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3597/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3598/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3599///
3600/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_pch)
3601#[inline]
3602#[target_feature(enable = "avx512fp16,avx512vl")]
3603#[cfg_attr(test, assert_instr(vfcmulcph))]
3604#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3605pub fn _mm_maskz_fcmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3606    _mm_maskz_cmul_pch(k, a, b)
3607}
3608
3609/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3610/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3611/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3612/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3613///
3614/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmul_pch)
3615#[inline]
3616#[target_feature(enable = "avx512fp16,avx512vl")]
3617#[cfg_attr(test, assert_instr(vfcmulcph))]
3618#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3619pub fn _mm256_fcmul_pch(a: __m256h, b: __m256h) -> __m256h {
3620    _mm256_cmul_pch(a, b)
3621}
3622
3623/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3624/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3625/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3626/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3627///
3628/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmul_pch)
3629#[inline]
3630#[target_feature(enable = "avx512fp16,avx512vl")]
3631#[cfg_attr(test, assert_instr(vfcmulcph))]
3632#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3633pub fn _mm256_mask_fcmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3634    _mm256_mask_cmul_pch(src, k, a, b)
3635}
3636
3637/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3638/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3639/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3640/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3641///
3642/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmul_pch)
3643#[inline]
3644#[target_feature(enable = "avx512fp16,avx512vl")]
3645#[cfg_attr(test, assert_instr(vfcmulcph))]
3646#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3647pub fn _mm256_maskz_fcmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3648    _mm256_maskz_cmul_pch(k, a, b)
3649}
3650
3651/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3652/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3653/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3654/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3655///
3656/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_pch)
3657#[inline]
3658#[target_feature(enable = "avx512fp16")]
3659#[cfg_attr(test, assert_instr(vfcmulcph))]
3660#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3661pub fn _mm512_fcmul_pch(a: __m512h, b: __m512h) -> __m512h {
3662    _mm512_cmul_pch(a, b)
3663}
3664
3665/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3666/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3667/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3668/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3669///
3670/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_pch)
3671#[inline]
3672#[target_feature(enable = "avx512fp16")]
3673#[cfg_attr(test, assert_instr(vfcmulcph))]
3674#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3675pub fn _mm512_mask_fcmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3676    _mm512_mask_cmul_pch(src, k, a, b)
3677}
3678
3679/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3680/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3681/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3682/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3683///
3684/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_pch)
3685#[inline]
3686#[target_feature(enable = "avx512fp16")]
3687#[cfg_attr(test, assert_instr(vfcmulcph))]
3688#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3689pub fn _mm512_maskz_fcmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3690    _mm512_maskz_cmul_pch(k, a, b)
3691}
3692
3693/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3694/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3695/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3696///
3697/// Rounding is done according to the rounding parameter, which can be one of:
3698///
3699/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3700/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3701/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3702/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3703/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3704///
3705/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_round_pch)
3706#[inline]
3707#[target_feature(enable = "avx512fp16")]
3708#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3709#[rustc_legacy_const_generics(2)]
3710#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3711pub fn _mm512_fcmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
3712    static_assert_rounding!(ROUNDING);
3713    _mm512_cmul_round_pch::<ROUNDING>(a, b)
3714}
3715
3716/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3717/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3718/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3719/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3720///
3721/// Rounding is done according to the rounding parameter, which can be one of:
3722///
3723/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3724/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3725/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3726/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3727/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3728///
3729/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_round_pch)
3730#[inline]
3731#[target_feature(enable = "avx512fp16")]
3732#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3733#[rustc_legacy_const_generics(4)]
3734#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3735pub fn _mm512_mask_fcmul_round_pch<const ROUNDING: i32>(
3736    src: __m512h,
3737    k: __mmask16,
3738    a: __m512h,
3739    b: __m512h,
3740) -> __m512h {
3741    static_assert_rounding!(ROUNDING);
3742    _mm512_mask_cmul_round_pch::<ROUNDING>(src, k, a, b)
3743}
3744
3745/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3746/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3747/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3748/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3749///
3750/// Rounding is done according to the rounding parameter, which can be one of:
3751///
3752/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3753/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3754/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3755/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3756/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3757///
3758/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_round_pch)
3759#[inline]
3760#[target_feature(enable = "avx512fp16")]
3761#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3762#[rustc_legacy_const_generics(3)]
3763#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3764pub fn _mm512_maskz_fcmul_round_pch<const ROUNDING: i32>(
3765    k: __mmask16,
3766    a: __m512h,
3767    b: __m512h,
3768) -> __m512h {
3769    static_assert_rounding!(ROUNDING);
3770    _mm512_maskz_cmul_round_pch::<ROUNDING>(k, a, b)
3771}
3772
3773/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3774/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3775/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3776/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3777///
3778/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_sch)
3779#[inline]
3780#[target_feature(enable = "avx512fp16")]
3781#[cfg_attr(test, assert_instr(vfcmulcsh))]
3782#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3783pub fn _mm_fcmul_sch(a: __m128h, b: __m128h) -> __m128h {
3784    _mm_cmul_sch(a, b)
3785}
3786
3787/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3788/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
3789/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3790/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3791///
3792/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_sch)
3793#[inline]
3794#[target_feature(enable = "avx512fp16")]
3795#[cfg_attr(test, assert_instr(vfcmulcsh))]
3796#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3797pub fn _mm_mask_fcmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3798    _mm_mask_cmul_sch(src, k, a, b)
3799}
3800
3801/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3802/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
3803/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3804/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3805///
3806/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_sch)
3807#[inline]
3808#[target_feature(enable = "avx512fp16")]
3809#[cfg_attr(test, assert_instr(vfcmulcsh))]
3810#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3811pub fn _mm_maskz_fcmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3812    _mm_maskz_cmul_sch(k, a, b)
3813}
3814
3815/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3816/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3817/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3818///
3819/// Rounding is done according to the rounding parameter, which can be one of:
3820///
3821/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3822/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3823/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3824/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3825/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3826///
3827/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_round_sch)
3828#[inline]
3829#[target_feature(enable = "avx512fp16")]
3830#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3831#[rustc_legacy_const_generics(2)]
3832#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3833pub fn _mm_fcmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
3834    static_assert_rounding!(ROUNDING);
3835    _mm_cmul_round_sch::<ROUNDING>(a, b)
3836}
3837
3838/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3839/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
3840/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3841/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3842///
3843/// Rounding is done according to the rounding parameter, which can be one of:
3844///
3845/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3846/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3847/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3848/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3849/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3850///
3851/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_round_sch)
3852#[inline]
3853#[target_feature(enable = "avx512fp16")]
3854#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3855#[rustc_legacy_const_generics(4)]
3856#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3857pub fn _mm_mask_fcmul_round_sch<const ROUNDING: i32>(
3858    src: __m128h,
3859    k: __mmask8,
3860    a: __m128h,
3861    b: __m128h,
3862) -> __m128h {
3863    static_assert_rounding!(ROUNDING);
3864    _mm_mask_cmul_round_sch::<ROUNDING>(src, k, a, b)
3865}
3866
3867/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3868/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
3869/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3870/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3871///
3872/// Rounding is done according to the rounding parameter, which can be one of:
3873///
3874/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3875/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3876/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3877/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3878/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3879///
3880/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_round_sch)
3881#[inline]
3882#[target_feature(enable = "avx512fp16")]
3883#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3884#[rustc_legacy_const_generics(3)]
3885#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3886pub fn _mm_maskz_fcmul_round_sch<const ROUNDING: i32>(
3887    k: __mmask8,
3888    a: __m128h,
3889    b: __m128h,
3890) -> __m128h {
3891    static_assert_rounding!(ROUNDING);
3892    _mm_maskz_cmul_round_sch::<ROUNDING>(k, a, b)
3893}
3894
3895/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
3896/// the results in dst.
3897///
3898/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_ph)
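///
/// A minimal usage sketch (illustrative only, not a doctest; assumes the `avx512fp16` and
/// `avx512vl` target features and nightly `f16` support):
///
/// ```ignore
/// let v = _mm_set1_ph(-2.5);
/// let r = _mm_abs_ph(v);
/// // Every element of r is now 2.5 (the sign bit of each element is cleared).
/// ```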
3899#[inline]
3900#[target_feature(enable = "avx512fp16,avx512vl")]
3901#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3902pub fn _mm_abs_ph(v2: __m128h) -> __m128h {
3903    unsafe { transmute(_mm_and_si128(transmute(v2), _mm_set1_epi16(i16::MAX))) }
3904}
3905
3906/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
3907/// the result in dst.
3908///
3909/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_ph)
3910#[inline]
3911#[target_feature(enable = "avx512fp16,avx512vl")]
3912#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3913pub fn _mm256_abs_ph(v2: __m256h) -> __m256h {
3914    unsafe { transmute(_mm256_and_si256(transmute(v2), _mm256_set1_epi16(i16::MAX))) }
3915}
3916
3917/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
3918/// the result in dst.
3919///
3920/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_ph)
3921#[inline]
3922#[target_feature(enable = "avx512fp16")]
3923#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3924pub fn _mm512_abs_ph(v2: __m512h) -> __m512h {
3925    unsafe { transmute(_mm512_and_si512(transmute(v2), _mm512_set1_epi16(i16::MAX))) }
3926}
3927
3928/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex
3929/// number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines
3930/// the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate
3931/// `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3932///
3933/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_conj_pch)
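///
/// A minimal usage sketch (illustrative only, not a doctest; assumes the `avx512fp16` and
/// `avx512vl` target features and nightly `f16` support):
///
/// ```ignore
/// // Lowest complex pair: conj(1 + 2i) = 1 - 2i.
/// let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0);
/// let r = _mm_conj_pch(a);
/// // Elements 0 and 1 of r are now 1.0 and -2.0 (the sign of each imaginary element is flipped).
/// ```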
3934#[inline]
3935#[target_feature(enable = "avx512fp16,avx512vl")]
3936#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3937pub fn _mm_conj_pch(a: __m128h) -> __m128h {
3938    unsafe { transmute(_mm_xor_si128(transmute(a), _mm_set1_epi32(i32::MIN))) }
3939}
3940
3941/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
3942/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
3943/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number
3944/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3945///
3946/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_conj_pch)
3947#[inline]
3948#[target_feature(enable = "avx512fp16,avx512vl")]
3949#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3950pub fn _mm_mask_conj_pch(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
3951    unsafe {
3952        let r: __m128 = transmute(_mm_conj_pch(a));
3953        transmute(simd_select_bitmask(k, r, transmute(src)))
3954    }
3955}
3956
3957/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
3958/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
3959/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3960/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3961///
3962/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_conj_pch)
3963#[inline]
3964#[target_feature(enable = "avx512fp16,avx512vl")]
3965#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3966pub fn _mm_maskz_conj_pch(k: __mmask8, a: __m128h) -> __m128h {
3967    _mm_mask_conj_pch(_mm_setzero_ph(), k, a)
3968}
3969
3970/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number
3971/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
3972/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3973///
3974/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_conj_pch)
3975#[inline]
3976#[target_feature(enable = "avx512fp16,avx512vl")]
3977#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3978pub fn _mm256_conj_pch(a: __m256h) -> __m256h {
3979    unsafe { transmute(_mm256_xor_si256(transmute(a), _mm256_set1_epi32(i32::MIN))) }
3980}
3981
3982/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
3983/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
3984/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3985/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3986///
3987/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_conj_pch)
3988#[inline]
3989#[target_feature(enable = "avx512fp16,avx512vl")]
3990#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3991pub fn _mm256_mask_conj_pch(src: __m256h, k: __mmask8, a: __m256h) -> __m256h {
3992    unsafe {
3993        let r: __m256 = transmute(_mm256_conj_pch(a));
3994        transmute(simd_select_bitmask(k, r, transmute(src)))
3995    }
3996}
3997
3998/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
3999/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
4000/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4001/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4002///
4003/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_conj_pch)
4004#[inline]
4005#[target_feature(enable = "avx512fp16,avx512vl")]
4006#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4007pub fn _mm256_maskz_conj_pch(k: __mmask8, a: __m256h) -> __m256h {
4008    _mm256_mask_conj_pch(_mm256_setzero_ph(), k, a)
4009}
4010
4011/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number
4012/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
4013/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4014///
4015/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_conj_pch)
4016#[inline]
4017#[target_feature(enable = "avx512fp16")]
4018#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4019pub fn _mm512_conj_pch(a: __m512h) -> __m512h {
4020    unsafe { transmute(_mm512_xor_si512(transmute(a), _mm512_set1_epi32(i32::MIN))) }
4021}
4022
4023/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
4024/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
4025/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4026/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4027///
4028/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_conj_pch)
4029#[inline]
4030#[target_feature(enable = "avx512fp16")]
4031#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4032pub fn _mm512_mask_conj_pch(src: __m512h, k: __mmask16, a: __m512h) -> __m512h {
4033    unsafe {
4034        let r: __m512 = transmute(_mm512_conj_pch(a));
4035        transmute(simd_select_bitmask(k, r, transmute(src)))
4036    }
4037}
4038
4039/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
4040/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
4041/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4042/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4043///
4044/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_conj_pch)
4045#[inline]
4046#[target_feature(enable = "avx512fp16")]
4047#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4048pub fn _mm512_maskz_conj_pch(k: __mmask16, a: __m512h) -> __m512h {
4049    _mm512_mask_conj_pch(_mm512_setzero_ph(), k, a)
4050}
4051
4052/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4053/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
4054/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4055///
4056/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_pch)
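///
/// For each pair of adjacent elements, this conceptually computes `a * b + c`, i.e.:
///
/// ```text
/// dst.re = a.re * b.re - a.im * b.im + c.re
/// dst.im = a.re * b.im + a.im * b.re + c.im
/// ```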
4057#[inline]
4058#[target_feature(enable = "avx512fp16,avx512vl")]
4059#[cfg_attr(test, assert_instr(vfmaddcph))]
4060#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4061pub fn _mm_fmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4062    _mm_mask3_fmadd_pch(a, b, c, 0xff)
4063}
4064
4065/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4066/// and store the results in dst using writemask k (the element is copied from a when the corresponding
4067/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4068/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4069///
4070/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_pch)
4071#[inline]
4072#[target_feature(enable = "avx512fp16,avx512vl")]
4073#[cfg_attr(test, assert_instr(vfmaddcph))]
4074#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4075pub fn _mm_mask_fmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
4076    unsafe {
4077        let r: __m128 = transmute(_mm_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would also work here, but this is what Clang does
4078        transmute(simd_select_bitmask(k, r, transmute(a)))
4079    }
4080}
4081
4082/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4083/// and store the results in dst using writemask k (the element is copied from c when the corresponding
4084/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4085/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4086///
4087/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_pch)
4088#[inline]
4089#[target_feature(enable = "avx512fp16,avx512vl")]
4090#[cfg_attr(test, assert_instr(vfmaddcph))]
4091#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4092pub fn _mm_mask3_fmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
4093    unsafe {
4094        transmute(vfmaddcph_mask3_128(
4095            transmute(a),
4096            transmute(b),
4097            transmute(c),
4098            k,
4099        ))
4100    }
4101}
4102
4103/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4104/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
4105/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4106/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4107///
4108/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_pch)
4109#[inline]
4110#[target_feature(enable = "avx512fp16,avx512vl")]
4111#[cfg_attr(test, assert_instr(vfmaddcph))]
4112#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4113pub fn _mm_maskz_fmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4114    unsafe {
4115        transmute(vfmaddcph_maskz_128(
4116            transmute(a),
4117            transmute(b),
4118            transmute(c),
4119            k,
4120        ))
4121    }
4122}
4123
4124/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4125/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
4126/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4127///
4128/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_pch)
4129#[inline]
4130#[target_feature(enable = "avx512fp16,avx512vl")]
4131#[cfg_attr(test, assert_instr(vfmaddcph))]
4132#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4133pub fn _mm256_fmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
4134    _mm256_mask3_fmadd_pch(a, b, c, 0xff)
4135}
4136
4137/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4138/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
4139/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4140/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4141///
4142/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_pch)
4143#[inline]
4144#[target_feature(enable = "avx512fp16,avx512vl")]
4145#[cfg_attr(test, assert_instr(vfmaddcph))]
4146#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4147pub fn _mm256_mask_fmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h {
4148    unsafe {
4149        let r: __m256 = transmute(_mm256_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would also work here, but this is what Clang does
4150        transmute(simd_select_bitmask(k, r, transmute(a)))
4151    }
4152}
4153
4154/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4155/// and store the results in dst using writemask k (the element is copied from c when the corresponding
4156/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4157/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4158///
4159/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_pch)
4160#[inline]
4161#[target_feature(enable = "avx512fp16,avx512vl")]
4162#[cfg_attr(test, assert_instr(vfmaddcph))]
4163#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4164pub fn _mm256_mask3_fmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h {
4165    unsafe {
4166        transmute(vfmaddcph_mask3_256(
4167            transmute(a),
4168            transmute(b),
4169            transmute(c),
4170            k,
4171        ))
4172    }
4173}
4174
4175/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4176/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
4177/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4178/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4179///
4180/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_pch)
4181#[inline]
4182#[target_feature(enable = "avx512fp16,avx512vl")]
4183#[cfg_attr(test, assert_instr(vfmaddcph))]
4184#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4185pub fn _mm256_maskz_fmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
4186    unsafe {
4187        transmute(vfmaddcph_maskz_256(
4188            transmute(a),
4189            transmute(b),
4190            transmute(c),
4191            k,
4192        ))
4193    }
4194}
4195
4196/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4197/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
4198/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4199///
4200/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_pch)
4201#[inline]
4202#[target_feature(enable = "avx512fp16")]
4203#[cfg_attr(test, assert_instr(vfmaddcph))]
4204#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4205pub fn _mm512_fmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4206    _mm512_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
4207}
4208
4209/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4210/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
4211/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4212/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4213///
4214/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_pch)
4215#[inline]
4216#[target_feature(enable = "avx512fp16")]
4217#[cfg_attr(test, assert_instr(vfmaddcph))]
4218#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4219pub fn _mm512_mask_fmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h {
4220    _mm512_mask_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
4221}
4222
4223/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4224/// and store the results in dst using writemask k (the element is copied from c when the corresponding
4225/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4226/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4227///
4228/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_pch)
4229#[inline]
4230#[target_feature(enable = "avx512fp16")]
4231#[cfg_attr(test, assert_instr(vfmaddcph))]
4232#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4233pub fn _mm512_mask3_fmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h {
4234    _mm512_mask3_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
4235}
4236
4237/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4238/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
4239/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4240/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4241///
4242/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_pch)
4243#[inline]
4244#[target_feature(enable = "avx512fp16")]
4245#[cfg_attr(test, assert_instr(vfmaddcph))]
4246#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4247pub fn _mm512_maskz_fmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4248    _mm512_maskz_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
4249}
4250
4251/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4252/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
4253/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4254///
4255/// Rounding is done according to the rounding parameter, which can be one of:
4256///
4257/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4258/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4259/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4260/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4261/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4262///
4263/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_pch)
4264#[inline]
4265#[target_feature(enable = "avx512fp16")]
4266#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
4267#[rustc_legacy_const_generics(3)]
4268#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4269pub fn _mm512_fmadd_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4270    static_assert_rounding!(ROUNDING);
4271    _mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, 0xffff)
4272}
4273
4274/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4275/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
4276/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4277/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4278///
4279/// Rounding is done according to the rounding parameter, which can be one of:
4280///
4281/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4282/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4283/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4284/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4285/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4286///
4287/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_pch)
4288#[inline]
4289#[target_feature(enable = "avx512fp16")]
4290#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
4291#[rustc_legacy_const_generics(4)]
4292#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4293pub fn _mm512_mask_fmadd_round_pch<const ROUNDING: i32>(
4294    a: __m512h,
4295    k: __mmask16,
4296    b: __m512h,
4297    c: __m512h,
4298) -> __m512h {
4299    unsafe {
4300        static_assert_rounding!(ROUNDING);
4301        let r: __m512 = transmute(_mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would also work here, but this is what Clang does
4302        transmute(simd_select_bitmask(k, r, transmute(a)))
4303    }
4304}
4305
4306/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4307/// and store the results in dst using writemask k (the element is copied from c when the corresponding
4308/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4309/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4310///
4311/// Rounding is done according to the rounding parameter, which can be one of:
4312///
4313/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4314/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4315/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4316/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4317/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4318///
4319/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_pch)
4320#[inline]
4321#[target_feature(enable = "avx512fp16")]
4322#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
4323#[rustc_legacy_const_generics(4)]
4324#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4325pub fn _mm512_mask3_fmadd_round_pch<const ROUNDING: i32>(
4326    a: __m512h,
4327    b: __m512h,
4328    c: __m512h,
4329    k: __mmask16,
4330) -> __m512h {
4331    unsafe {
4332        static_assert_rounding!(ROUNDING);
4333        transmute(vfmaddcph_mask3_512(
4334            transmute(a),
4335            transmute(b),
4336            transmute(c),
4337            k,
4338            ROUNDING,
4339        ))
4340    }
4341}
4342
4343/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4344/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
4345/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4346/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4347///
4348/// Rounding is done according to the rounding parameter, which can be one of:
4349///
4350/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4351/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4352/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4353/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4354/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4355///
4356/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_pch)
4357#[inline]
4358#[target_feature(enable = "avx512fp16")]
4359#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
4360#[rustc_legacy_const_generics(4)]
4361#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4362pub fn _mm512_maskz_fmadd_round_pch<const ROUNDING: i32>(
4363    k: __mmask16,
4364    a: __m512h,
4365    b: __m512h,
4366    c: __m512h,
4367) -> __m512h {
4368    unsafe {
4369        static_assert_rounding!(ROUNDING);
4370        transmute(vfmaddcph_maskz_512(
4371            transmute(a),
4372            transmute(b),
4373            transmute(c),
4374            k,
4375            ROUNDING,
4376        ))
4377    }
4378}
4379
4380/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4381/// store the result in the lower elements of dst, and copy the upper 6 packed elements from a to the
4382/// upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit)
4383/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4384///
4385/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sch)
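///
/// # Example
///
/// An illustrative sketch, not part of Intel's documentation: it assumes a nightly toolchain
/// with the `stdarch_x86_avx512_f16` and `f16` features and a CPU supporting `avx512fp16`.
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// // Lower complex lanes: a = 1 + 2i, b = 3 + 4i, c = 10 + 20i; the upper elements of a are 9.0.
/// let a = _mm_set_ph(9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 2.0, 1.0);
/// let b = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 3.0);
/// let c = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 20.0, 10.0);
/// let r = _mm_fmadd_sch(a, b, c);
/// // Lower lane: a * b + c = (-5 + 10i) + (10 + 20i) = 5 + 30i; the upper 6 elements come from a.
/// let e: [f16; 8] = unsafe { core::mem::transmute(r) };
/// assert_eq!(e[0], 5.0);
/// assert_eq!(e[1], 30.0);
/// assert_eq!(e[2], 9.0);
/// ```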
4386#[inline]
4387#[target_feature(enable = "avx512fp16")]
4388#[cfg_attr(test, assert_instr(vfmaddcsh))]
4389#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4390pub fn _mm_fmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4391    _mm_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
4392}
4393
4394/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4395/// store the result in the lower elements of dst using writemask k (elements are copied from a when
4396/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
4397/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
4398/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4399///
4400/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sch)
4401#[inline]
4402#[target_feature(enable = "avx512fp16")]
4403#[cfg_attr(test, assert_instr(vfmaddcsh))]
4404#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4405pub fn _mm_mask_fmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
4406    _mm_mask_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
4407}
4408
4409/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4410/// store the result in the lower elements of dst using writemask k (elements are copied from c when
4411/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
4412/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
4413/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4414///
4415/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sch)
4416#[inline]
4417#[target_feature(enable = "avx512fp16")]
4418#[cfg_attr(test, assert_instr(vfmaddcsh))]
4419#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4420pub fn _mm_mask3_fmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
4421    _mm_mask3_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
4422}
4423
4424/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4425/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask
4426/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each
4427/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
4428/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4429///
4430/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sch)
4431#[inline]
4432#[target_feature(enable = "avx512fp16")]
4433#[cfg_attr(test, assert_instr(vfmaddcsh))]
4434#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4435pub fn _mm_maskz_fmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4436    _mm_maskz_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
4437}
4438
4439/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4440/// store the result in the lower elements of dst. Each complex number is composed of two adjacent
4441/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4442///
4443/// Rounding is done according to the rounding parameter, which can be one of:
4444///
4445/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4446/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4447/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4448/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4449/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4450///
4451/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sch)
4452#[inline]
4453#[target_feature(enable = "avx512fp16")]
4454#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
4455#[rustc_legacy_const_generics(3)]
4456#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4457pub fn _mm_fmadd_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4458    unsafe {
4459        static_assert_rounding!(ROUNDING);
4460        transmute(vfmaddcsh_mask(
4461            transmute(a),
4462            transmute(b),
4463            transmute(c),
4464            0xff,
4465            ROUNDING,
4466        ))
4467    }
4468}
4469
4470/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4471/// store the result in the lower elements of dst using writemask k (elements are copied from a when
4472/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
4473/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
4474/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4475///
4476/// Rounding is done according to the rounding parameter, which can be one of:
4477///
4478/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4479/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4480/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4481/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4482/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4483///
4484/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sch)
4485#[inline]
4486#[target_feature(enable = "avx512fp16")]
4487#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
4488#[rustc_legacy_const_generics(4)]
4489#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4490pub fn _mm_mask_fmadd_round_sch<const ROUNDING: i32>(
4491    a: __m128h,
4492    k: __mmask8,
4493    b: __m128h,
4494    c: __m128h,
4495) -> __m128h {
4496    unsafe {
4497        static_assert_rounding!(ROUNDING);
4498        let a = transmute(a);
        let r = vfmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING); // using `0xff` would have been fine here, but this is what Clang does
4500        transmute(_mm_mask_move_ss(a, k, a, r))
4501    }
4502}
4503
4504/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4505/// store the result in the lower elements of dst using writemask k (elements are copied from c when
4506/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
4507/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
4508/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4509///
4510/// Rounding is done according to the rounding parameter, which can be one of:
4511///
4512/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4513/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4514/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4515/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4516/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4517///
4518/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sch)
4519#[inline]
4520#[target_feature(enable = "avx512fp16")]
4521#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
4522#[rustc_legacy_const_generics(4)]
4523#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4524pub fn _mm_mask3_fmadd_round_sch<const ROUNDING: i32>(
4525    a: __m128h,
4526    b: __m128h,
4527    c: __m128h,
4528    k: __mmask8,
4529) -> __m128h {
4530    unsafe {
4531        static_assert_rounding!(ROUNDING);
4532        let c = transmute(c);
4533        let r = vfmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING);
4534        transmute(_mm_move_ss(c, r))
4535    }
4536}
4537
4538/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4539/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask
4540/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each
4541/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
4542/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4543///
4544/// Rounding is done according to the rounding parameter, which can be one of:
4545///
4546/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4547/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4548/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4549/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4550/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4551///
4552/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sch)
4553#[inline]
4554#[target_feature(enable = "avx512fp16")]
4555#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
4556#[rustc_legacy_const_generics(4)]
4557#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4558pub fn _mm_maskz_fmadd_round_sch<const ROUNDING: i32>(
4559    k: __mmask8,
4560    a: __m128h,
4561    b: __m128h,
4562    c: __m128h,
4563) -> __m128h {
4564    unsafe {
4565        static_assert_rounding!(ROUNDING);
4566        let a = transmute(a);
4567        let r = vfmaddcsh_maskz(a, transmute(b), transmute(c), k, ROUNDING);
        transmute(_mm_move_ss(a, r)) // FIXME: If `k == 0`, then LLVM optimizes `vfmaddcsh_maskz` to output an all-zero vector, which is incorrect
4569    }
4570}
4571
4572/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4573/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
4574/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
4575/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4576///
4577/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_pch)
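///
/// # Example
///
/// An illustrative sketch, not part of Intel's documentation: it assumes a nightly toolchain
/// with the `stdarch_x86_avx512_f16` and `f16` features and a CPU supporting `avx512fp16` and `avx512vl`.
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// // Every complex lane holds a = 1 + 2i, b = 3 + 4i, c = 10 + 20i.
/// let a = _mm_set_ph(2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0);
/// let b = _mm_set_ph(4.0, 3.0, 4.0, 3.0, 4.0, 3.0, 4.0, 3.0);
/// let c = _mm_set_ph(20.0, 10.0, 20.0, 10.0, 20.0, 10.0, 20.0, 10.0);
/// let r = _mm_fcmadd_pch(a, b, c);
/// // a * conj(b) + c = (11 + 2i) + (10 + 20i) = 21 + 22i in every complex lane.
/// let e: [f16; 8] = unsafe { core::mem::transmute(r) };
/// assert_eq!(e[0], 21.0); // real part
/// assert_eq!(e[1], 22.0); // imaginary part
/// ```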
4578#[inline]
4579#[target_feature(enable = "avx512fp16,avx512vl")]
4580#[cfg_attr(test, assert_instr(vfcmaddcph))]
4581#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4582pub fn _mm_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4583    _mm_mask3_fcmadd_pch(a, b, c, 0xff)
4584}
4585
4586/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4587/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4588/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4589/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4590/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4591///
4592/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_pch)
4593#[inline]
4594#[target_feature(enable = "avx512fp16,avx512vl")]
4595#[cfg_attr(test, assert_instr(vfcmaddcph))]
4596#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4597pub fn _mm_mask_fcmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
4598    unsafe {
        let r: __m128 = transmute(_mm_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
4600        transmute(simd_select_bitmask(k, r, transmute(a)))
4601    }
4602}
4603
4604/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4605/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4606/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4607/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4608/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4609///
4610/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_pch)
4611#[inline]
4612#[target_feature(enable = "avx512fp16,avx512vl")]
4613#[cfg_attr(test, assert_instr(vfcmaddcph))]
4614#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4615pub fn _mm_mask3_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
4616    unsafe {
4617        transmute(vfcmaddcph_mask3_128(
4618            transmute(a),
4619            transmute(b),
4620            transmute(c),
4621            k,
4622        ))
4623    }
4624}
4625
4626/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4627/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
4628/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4629/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4630/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4631///
4632/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_pch)
4633#[inline]
4634#[target_feature(enable = "avx512fp16,avx512vl")]
4635#[cfg_attr(test, assert_instr(vfcmaddcph))]
4636#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4637pub fn _mm_maskz_fcmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4638    unsafe {
4639        transmute(vfcmaddcph_maskz_128(
4640            transmute(a),
4641            transmute(b),
4642            transmute(c),
4643            k,
4644        ))
4645    }
4646}
4647
4648/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4649/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
4650/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
4651/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4652///
4653/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmadd_pch)
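///
/// # Example
///
/// An illustrative sketch, not part of Intel's documentation: it assumes a nightly toolchain
/// with the `stdarch_x86_avx512_f16` and `f16` features and a CPU supporting `avx512fp16` and `avx512vl`.
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// // Every complex lane holds a = 1 + 1i, b = 2 + 2i, c = 3 + 3i.
/// let a = _mm256_set1_ph(1.0);
/// let b = _mm256_set1_ph(2.0);
/// let c = _mm256_set1_ph(3.0);
/// let r = _mm256_fcmadd_pch(a, b, c);
/// // a * conj(b) + c = (4 + 0i) + (3 + 3i) = 7 + 3i in every complex lane.
/// let e: [f16; 16] = unsafe { core::mem::transmute(r) };
/// assert_eq!(e[0], 7.0); // real part
/// assert_eq!(e[1], 3.0); // imaginary part
/// ```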
4654#[inline]
4655#[target_feature(enable = "avx512fp16,avx512vl")]
4656#[cfg_attr(test, assert_instr(vfcmaddcph))]
4657#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4658pub fn _mm256_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
4659    _mm256_mask3_fcmadd_pch(a, b, c, 0xff)
4660}
4661
4662/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4663/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4664/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4665/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4666/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4667///
4668/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmadd_pch)
4669#[inline]
4670#[target_feature(enable = "avx512fp16,avx512vl")]
4671#[cfg_attr(test, assert_instr(vfcmaddcph))]
4672#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4673pub fn _mm256_mask_fcmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h {
4674    unsafe {
        let r: __m256 = transmute(_mm256_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
4676        transmute(simd_select_bitmask(k, r, transmute(a)))
4677    }
4678}
4679
4680/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4681/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4682/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4683/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4684/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4685///
4686/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fcmadd_pch)
4687#[inline]
4688#[target_feature(enable = "avx512fp16,avx512vl")]
4689#[cfg_attr(test, assert_instr(vfcmaddcph))]
4690#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4691pub fn _mm256_mask3_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h {
4692    unsafe {
4693        transmute(vfcmaddcph_mask3_256(
4694            transmute(a),
4695            transmute(b),
4696            transmute(c),
4697            k,
4698        ))
4699    }
4700}
4701
4702/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4703/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
4704/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4705/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4706/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4707///
4708/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmadd_pch)
4709#[inline]
4710#[target_feature(enable = "avx512fp16,avx512vl")]
4711#[cfg_attr(test, assert_instr(vfcmaddcph))]
4712#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4713pub fn _mm256_maskz_fcmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
4714    unsafe {
4715        transmute(vfcmaddcph_maskz_256(
4716            transmute(a),
4717            transmute(b),
4718            transmute(c),
4719            k,
4720        ))
4721    }
4722}
4723
4724/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4725/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
4726/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
4727/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4728///
4729/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_pch)
4730#[inline]
4731#[target_feature(enable = "avx512fp16")]
4732#[cfg_attr(test, assert_instr(vfcmaddcph))]
4733#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4734pub fn _mm512_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4735    _mm512_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
4736}
4737
4738/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4739/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4740/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4741/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4742/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4743///
4744/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_pch)
4745#[inline]
4746#[target_feature(enable = "avx512fp16")]
4747#[cfg_attr(test, assert_instr(vfcmaddcph))]
4748#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4749pub fn _mm512_mask_fcmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h {
4750    _mm512_mask_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
4751}
4752
4753/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4754/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4755/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4756/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4757/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4758///
4759/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_pch)
4760#[inline]
4761#[target_feature(enable = "avx512fp16")]
4762#[cfg_attr(test, assert_instr(vfcmaddcph))]
4763#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4764pub fn _mm512_mask3_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h {
4765    _mm512_mask3_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
4766}
4767
4768/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4769/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
4770/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4771/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4772/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4773///
4774/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_pch)
4775#[inline]
4776#[target_feature(enable = "avx512fp16")]
4777#[cfg_attr(test, assert_instr(vfcmaddcph))]
4778#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4779pub fn _mm512_maskz_fcmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4780    _mm512_maskz_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
4781}
4782
4783/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4784/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
4785/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
4786/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4787///
4788/// Rounding is done according to the rounding parameter, which can be one of:
4789///
4790/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4791/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4792/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4793/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4794/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4795///
4796/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_round_pch)
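///
/// # Example
///
/// An illustrative sketch, not part of Intel's documentation: it assumes a nightly toolchain
/// with the `stdarch_x86_avx512_f16` and `f16` features and a CPU supporting `avx512fp16`.
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// // Every complex lane holds a = 1 + 1i, b = 2 + 2i, c = 3 + 3i.
/// let a = _mm512_set1_ph(1.0);
/// let b = _mm512_set1_ph(2.0);
/// let c = _mm512_set1_ph(3.0);
/// let r = _mm512_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
/// // a * conj(b) + c = (4 + 0i) + (3 + 3i) = 7 + 3i in every complex lane.
/// let lanes: [f16; 32] = unsafe { core::mem::transmute(r) };
/// assert_eq!(lanes[0], 7.0); // real part
/// assert_eq!(lanes[1], 3.0); // imaginary part
/// ```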
4797#[inline]
4798#[target_feature(enable = "avx512fp16")]
4799#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
4800#[rustc_legacy_const_generics(3)]
4801#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4802pub fn _mm512_fcmadd_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4803    static_assert_rounding!(ROUNDING);
4804    _mm512_mask3_fcmadd_round_pch::<ROUNDING>(a, b, c, 0xffff)
4805}
4806
4807/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4808/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4809/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4810/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4811/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4812///
4813/// Rounding is done according to the rounding parameter, which can be one of:
4814///
4815/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4816/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4817/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4818/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4819/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4820///
4821/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_round_pch)
4822#[inline]
4823#[target_feature(enable = "avx512fp16")]
4824#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
4825#[rustc_legacy_const_generics(4)]
4826#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4827pub fn _mm512_mask_fcmadd_round_pch<const ROUNDING: i32>(
4828    a: __m512h,
4829    k: __mmask16,
4830    b: __m512h,
4831    c: __m512h,
4832) -> __m512h {
4833    unsafe {
4834        static_assert_rounding!(ROUNDING);
        let r: __m512 = transmute(_mm512_mask3_fcmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would have been fine here, but this is what Clang does
4836        transmute(simd_select_bitmask(k, r, transmute(a)))
4837    }
4838}
4839
4840/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4841/// to the corresponding complex numbers in c using writemask k (the element is copied from c when the corresponding
4842/// mask bit is not set), and store the results in dst. Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
4844/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4845///
4846/// Rounding is done according to the rounding parameter, which can be one of:
4847///
4848/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4849/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4850/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4851/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4852/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4853///
4854/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_round_pch)
4855#[inline]
4856#[target_feature(enable = "avx512fp16")]
4857#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
4858#[rustc_legacy_const_generics(4)]
4859#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4860pub fn _mm512_mask3_fcmadd_round_pch<const ROUNDING: i32>(
4861    a: __m512h,
4862    b: __m512h,
4863    c: __m512h,
4864    k: __mmask16,
4865) -> __m512h {
4866    unsafe {
4867        static_assert_rounding!(ROUNDING);
4868        transmute(vfcmaddcph_mask3_512(
4869            transmute(a),
4870            transmute(b),
4871            transmute(c),
4872            k,
4873            ROUNDING,
4874        ))
4875    }
4876}
4877
4878/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4879/// to the corresponding complex numbers in c using zeromask k (the element is zeroed out when the corresponding
4880/// mask bit is not set), and store the results in dst. Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
4882/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4883///
4884/// Rounding is done according to the rounding parameter, which can be one of:
4885///
4886/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4887/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4888/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4889/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4890/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4891///
4892/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_round_pch)
4893#[inline]
4894#[target_feature(enable = "avx512fp16")]
4895#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
4896#[rustc_legacy_const_generics(4)]
4897#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4898pub fn _mm512_maskz_fcmadd_round_pch<const ROUNDING: i32>(
4899    k: __mmask16,
4900    a: __m512h,
4901    b: __m512h,
4902    c: __m512h,
4903) -> __m512h {
4904    unsafe {
4905        static_assert_rounding!(ROUNDING);
4906        transmute(vfcmaddcph_maskz_512(
4907            transmute(a),
4908            transmute(b),
4909            transmute(c),
4910            k,
4911            ROUNDING,
4912        ))
4913    }
4914}
4915
4916/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
4917/// accumulate to the lower complex number in c, and store the result in the lower elements of dst,
4918/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
4919/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
4920/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4921///
4922/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_sch)
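///
/// # Example
///
/// An illustrative sketch, not part of Intel's documentation: it assumes a nightly toolchain
/// with the `stdarch_x86_avx512_f16` and `f16` features and a CPU supporting `avx512fp16`.
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// // Lower complex lanes: a = 1 + 2i, b = 3 + 4i, c = 10 + 20i; the upper elements of a are 9.0.
/// let a = _mm_set_ph(9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 2.0, 1.0);
/// let b = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 3.0);
/// let c = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 20.0, 10.0);
/// let r = _mm_fcmadd_sch(a, b, c);
/// // Lower lane: a * conj(b) + c = (11 + 2i) + (10 + 20i) = 21 + 22i; the upper 6 elements come from a.
/// let e: [f16; 8] = unsafe { core::mem::transmute(r) };
/// assert_eq!(e[0], 21.0);
/// assert_eq!(e[1], 22.0);
/// assert_eq!(e[2], 9.0);
/// ```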
4923#[inline]
4924#[target_feature(enable = "avx512fp16")]
4925#[cfg_attr(test, assert_instr(vfcmaddcsh))]
4926#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4927pub fn _mm_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4928    _mm_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
4929}
4930
4931/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
4932/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
4933/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper
4934/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
4935/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4936/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4937///
4938/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_sch)
4939#[inline]
4940#[target_feature(enable = "avx512fp16")]
4941#[cfg_attr(test, assert_instr(vfcmaddcsh))]
4942#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4943pub fn _mm_mask_fcmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
4944    _mm_mask_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
4945}
4946
4947/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
4948/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
4949/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper
4950/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
4951/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4952/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4953///
4954/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_sch)
4955#[inline]
4956#[target_feature(enable = "avx512fp16")]
4957#[cfg_attr(test, assert_instr(vfcmaddcsh))]
4958#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4959pub fn _mm_mask3_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
4960    _mm_mask3_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
4961}
4962
4963/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
4964/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
4965/// zeromask k (the element is zeroed out when the corresponding mask bit is not set), and copy the upper
4966/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
4967/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4968/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4969///
4970/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_sch)
4971#[inline]
4972#[target_feature(enable = "avx512fp16")]
4973#[cfg_attr(test, assert_instr(vfcmaddcsh))]
4974#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4975pub fn _mm_maskz_fcmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4976    _mm_maskz_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
4977}
4978
4979/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
4980/// accumulate to the lower complex number in c, and store the result in the lower elements of dst,
4981/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
4982/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
4983/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4984///
4985/// Rounding is done according to the rounding parameter, which can be one of:
4986///
4987/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4988/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4989/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4990/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4991/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4992///
4993/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_round_sch)
4994#[inline]
4995#[target_feature(enable = "avx512fp16")]
4996#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
4997#[rustc_legacy_const_generics(3)]
4998#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4999pub fn _mm_fcmadd_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5000    unsafe {
5001        static_assert_rounding!(ROUNDING);
5002        transmute(vfcmaddcsh_mask(
5003            transmute(a),
5004            transmute(b),
5005            transmute(c),
5006            0xff,
5007            ROUNDING,
5008        ))
5009    }
5010}
5011
5012/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
5013/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
5014/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper
5015/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
5016/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
5017/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5018///
5019/// Rounding is done according to the rounding parameter, which can be one of:
5020///
5021/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5022/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5023/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5024/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5025/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5026///
5027/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_round_sch)
5028#[inline]
5029#[target_feature(enable = "avx512fp16")]
5030#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
5031#[rustc_legacy_const_generics(4)]
5032#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5033pub fn _mm_mask_fcmadd_round_sch<const ROUNDING: i32>(
5034    a: __m128h,
5035    k: __mmask8,
5036    b: __m128h,
5037    c: __m128h,
5038) -> __m128h {
5039    unsafe {
5040        static_assert_rounding!(ROUNDING);
5041        let a = transmute(a);
5042        let r = vfcmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING);
5043        transmute(_mm_mask_move_ss(a, k, a, r))
5044    }
5045}
5046
5047/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
5048/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
5049/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper
5050/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
5051/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
5052/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5053///
5054/// Rounding is done according to the rounding parameter, which can be one of:
5055///
5056/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5057/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5058/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5059/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5060/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5061///
5062/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_round_sch)
5063#[inline]
5064#[target_feature(enable = "avx512fp16")]
5065#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
5066#[rustc_legacy_const_generics(4)]
5067#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5068pub fn _mm_mask3_fcmadd_round_sch<const ROUNDING: i32>(
5069    a: __m128h,
5070    b: __m128h,
5071    c: __m128h,
5072    k: __mmask8,
5073) -> __m128h {
5074    unsafe {
5075        static_assert_rounding!(ROUNDING);
5076        let c = transmute(c);
5077        let r = vfcmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING);
5078        transmute(_mm_move_ss(c, r))
5079    }
5080}
5081
5082/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
5083/// accumulate to the lower complex number in c using zeromask k (the element is zeroed out when the corresponding
5084/// mask bit is not set), and store the result in the lower elements of dst, and copy the upper 6 packed elements
5085/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
5087/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5088///
5089/// Rounding is done according to the rounding parameter, which can be one of:
5090///
5091/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5092/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5093/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5094/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5095/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5096///
5097/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_round_sch)
5098#[inline]
5099#[target_feature(enable = "avx512fp16")]
5100#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
5101#[rustc_legacy_const_generics(4)]
5102#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5103pub fn _mm_maskz_fcmadd_round_sch<const ROUNDING: i32>(
5104    k: __mmask8,
5105    a: __m128h,
5106    b: __m128h,
5107    c: __m128h,
5108) -> __m128h {
5109    unsafe {
5110        static_assert_rounding!(ROUNDING);
5111        let a = transmute(a);
5112        let r = vfcmaddcsh_maskz(a, transmute(b), transmute(c), k, ROUNDING);
        transmute(_mm_move_ss(a, r)) // FIXME: If `k == 0`, then LLVM optimizes `vfcmaddcsh_maskz` to output an all-zero vector, which is incorrect
5114    }
5115}
5116
5117/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5118/// result to packed elements in c, and store the results in dst.
5119///
5120/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_ph)
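///
/// # Example
///
/// An illustrative sketch, not part of Intel's documentation: it assumes a nightly toolchain
/// with the `stdarch_x86_avx512_f16` and `f16` features and a CPU supporting `avx512fp16` and `avx512vl`.
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// let r = _mm_fmadd_ph(a, b, c);
/// // Every element: 2.0 * 3.0 + 1.0 = 7.0.
/// let e: [f16; 8] = unsafe { core::mem::transmute(r) };
/// assert_eq!(e, [7.0; 8]);
/// ```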
5121#[inline]
5122#[target_feature(enable = "avx512fp16,avx512vl")]
5123#[cfg_attr(test, assert_instr(vfmadd))]
5124#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5125pub fn _mm_fmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5126    unsafe { simd_fma(a, b, c) }
5127}
5128
5129/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5130/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5131/// from a when the corresponding mask bit is not set).
5132///
5133/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_ph)
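///
/// # Example
///
/// An illustrative sketch, not part of Intel's documentation: it assumes a nightly toolchain
/// with the `stdarch_x86_avx512_f16` and `f16` features and a CPU supporting `avx512fp16` and `avx512vl`.
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// let a = _mm_set1_ph(1.0);
/// let b = _mm_set1_ph(2.0);
/// let c = _mm_set1_ph(3.0);
/// // Only the low four mask bits are set, so elements 4..8 are copied from `a`.
/// let r = _mm_mask_fmadd_ph(a, 0b0000_1111, b, c);
/// let e: [f16; 8] = unsafe { core::mem::transmute(r) };
/// assert_eq!(e[0], 5.0); // 1.0 * 2.0 + 3.0
/// assert_eq!(e[7], 1.0); // copied from `a`
/// ```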
5134#[inline]
5135#[target_feature(enable = "avx512fp16,avx512vl")]
5136#[cfg_attr(test, assert_instr(vfmadd))]
5137#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5138pub fn _mm_mask_fmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5139    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), a) }
5140}
5141
5142/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5143/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5144/// from c when the corresponding mask bit is not set).
5145///
5146/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_ph)
5147#[inline]
5148#[target_feature(enable = "avx512fp16,avx512vl")]
5149#[cfg_attr(test, assert_instr(vfmadd))]
5150#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5151pub fn _mm_mask3_fmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5152    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), c) }
5153}
5154
5155/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5156/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5157/// out when the corresponding mask bit is not set).
5158///
5159/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_ph)
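///
/// # Example
///
/// An illustrative sketch, not part of Intel's documentation: it assumes a nightly toolchain
/// with the `stdarch_x86_avx512_f16` and `f16` features and a CPU supporting `avx512fp16` and `avx512vl`.
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// let a = _mm_set1_ph(1.0);
/// let b = _mm_set1_ph(2.0);
/// let c = _mm_set1_ph(3.0);
/// // Only the low four mask bits are set, so elements 4..8 are zeroed out.
/// let r = _mm_maskz_fmadd_ph(0b0000_1111, a, b, c);
/// let e: [f16; 8] = unsafe { core::mem::transmute(r) };
/// assert_eq!(e[0], 5.0); // 1.0 * 2.0 + 3.0
/// assert_eq!(e[7], 0.0); // zeroed
/// ```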
5160#[inline]
5161#[target_feature(enable = "avx512fp16,avx512vl")]
5162#[cfg_attr(test, assert_instr(vfmadd))]
5163#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5164pub fn _mm_maskz_fmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5165    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), _mm_setzero_ph()) }
5166}
5167
5168/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5169/// result to packed elements in c, and store the results in dst.
5170///
5171/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_ph)
5172#[inline]
5173#[target_feature(enable = "avx512fp16,avx512vl")]
5174#[cfg_attr(test, assert_instr(vfmadd))]
5175#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5176pub fn _mm256_fmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5177    unsafe { simd_fma(a, b, c) }
5178}
5179
5180/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5181/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5182/// from a when the corresponding mask bit is not set).
5183///
5184/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_ph)
5185#[inline]
5186#[target_feature(enable = "avx512fp16,avx512vl")]
5187#[cfg_attr(test, assert_instr(vfmadd))]
5188#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5189pub fn _mm256_mask_fmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
5190    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), a) }
5191}
5192
5193/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5194/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5195/// from c when the corresponding mask bit is not set).
5196///
5197/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_ph)
5198#[inline]
5199#[target_feature(enable = "avx512fp16,avx512vl")]
5200#[cfg_attr(test, assert_instr(vfmadd))]
5201#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5202pub fn _mm256_mask3_fmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
5203    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), c) }
5204}
5205
5206/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5207/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5208/// out when the corresponding mask bit is not set).
5209///
5210/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_ph)
5211#[inline]
5212#[target_feature(enable = "avx512fp16,avx512vl")]
5213#[cfg_attr(test, assert_instr(vfmadd))]
5214#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5215pub fn _mm256_maskz_fmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5216    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), _mm256_setzero_ph()) }
5217}
5218
5219/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5220/// result to packed elements in c, and store the results in dst.
5221///
5222/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_ph)
5223#[inline]
5224#[target_feature(enable = "avx512fp16")]
5225#[cfg_attr(test, assert_instr(vfmadd))]
5226#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5227pub fn _mm512_fmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5228    unsafe { simd_fma(a, b, c) }
5229}
5230
5231/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5232/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5233/// from a when the corresponding mask bit is not set).
5234///
5235/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_ph)
5236#[inline]
5237#[target_feature(enable = "avx512fp16")]
5238#[cfg_attr(test, assert_instr(vfmadd))]
5239#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5240pub fn _mm512_mask_fmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
5241    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), a) }
5242}
5243
5244/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5245/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5246/// from c when the corresponding mask bit is not set).
5247///
5248/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_ph)
5249#[inline]
5250#[target_feature(enable = "avx512fp16")]
5251#[cfg_attr(test, assert_instr(vfmadd))]
5252#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5253pub fn _mm512_mask3_fmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
5254    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), c) }
5255}
5256
5257/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5258/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5259/// out when the corresponding mask bit is not set).
5260///
5261/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_ph)
5262#[inline]
5263#[target_feature(enable = "avx512fp16")]
5264#[cfg_attr(test, assert_instr(vfmadd))]
5265#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5266pub fn _mm512_maskz_fmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5267    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), _mm512_setzero_ph()) }
5268}
5269
5270/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5271/// result to packed elements in c, and store the results in dst.
5272///
5273/// Rounding is done according to the rounding parameter, which can be one of:
5274///
5275/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5276/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5277/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5278/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5279/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5280///
5281/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_ph)
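///
/// # Example
///
/// An illustrative sketch, not part of Intel's documentation: it assumes a nightly toolchain
/// with the `stdarch_x86_avx512_f16` and `f16` features and a CPU supporting `avx512fp16`.
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// let a = _mm512_set1_ph(2.0);
/// let b = _mm512_set1_ph(3.0);
/// let c = _mm512_set1_ph(1.0);
/// let r = _mm512_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
/// // Every element: 2.0 * 3.0 + 1.0 = 7.0.
/// let e: [f16; 32] = unsafe { core::mem::transmute(r) };
/// assert_eq!(e[0], 7.0);
/// ```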
5282#[inline]
5283#[target_feature(enable = "avx512fp16")]
5284#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5285#[rustc_legacy_const_generics(3)]
5286#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5287pub fn _mm512_fmadd_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5288    unsafe {
5289        static_assert_rounding!(ROUNDING);
5290        vfmaddph_512(a, b, c, ROUNDING)
5291    }
5292}
5293
5294/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5295/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5296/// from a when the corresponding mask bit is not set).
5297///
5298/// Rounding is done according to the rounding parameter, which can be one of:
5299///
5300/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5301/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5302/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5303/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5304/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5305///
5306/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_ph)
5307#[inline]
5308#[target_feature(enable = "avx512fp16")]
5309#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5310#[rustc_legacy_const_generics(4)]
5311#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5312pub fn _mm512_mask_fmadd_round_ph<const ROUNDING: i32>(
5313    a: __m512h,
5314    k: __mmask32,
5315    b: __m512h,
5316    c: __m512h,
5317) -> __m512h {
5318    unsafe {
5319        static_assert_rounding!(ROUNDING);
5320        simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), a)
5321    }
5322}
5323
5324/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5325/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5326/// from c when the corresponding mask bit is not set).
5327///
5328/// Rounding is done according to the rounding parameter, which can be one of:
5329///
5330/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5331/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5332/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5333/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5334/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5335///
5336/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_ph)
5337#[inline]
5338#[target_feature(enable = "avx512fp16")]
5339#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5340#[rustc_legacy_const_generics(4)]
5341#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5342pub fn _mm512_mask3_fmadd_round_ph<const ROUNDING: i32>(
5343    a: __m512h,
5344    b: __m512h,
5345    c: __m512h,
5346    k: __mmask32,
5347) -> __m512h {
5348    unsafe {
5349        static_assert_rounding!(ROUNDING);
5350        simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), c)
5351    }
5352}
5353
5354/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5355/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5356/// out when the corresponding mask bit is not set).
5357///
5358/// Rounding is done according to the rounding parameter, which can be one of:
5359///
5360/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5361/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5362/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5363/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5364/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5365///
5366/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_ph)
5367#[inline]
5368#[target_feature(enable = "avx512fp16")]
5369#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5370#[rustc_legacy_const_generics(4)]
5371#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5372pub fn _mm512_maskz_fmadd_round_ph<const ROUNDING: i32>(
5373    k: __mmask32,
5374    a: __m512h,
5375    b: __m512h,
5376    c: __m512h,
5377) -> __m512h {
5378    unsafe {
5379        static_assert_rounding!(ROUNDING);
5380        simd_select_bitmask(
5381            k,
5382            _mm512_fmadd_round_ph::<ROUNDING>(a, b, c),
5383            _mm512_setzero_ph(),
5384        )
5385    }
5386}
5387
5388/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5389/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper
5390/// 7 packed elements from a to the upper elements of dst.
5391///
5392/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sh)
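///
/// # Example
///
/// Only lane 0 is computed; the remaining lanes come from a. Illustrative sketch only,
/// not compiled as a doctest; it assumes a nightly toolchain with the unstable f16
/// intrinsics and a CPU supporting AVX512-FP16.
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// // Lane 0 of a is 2.0 and its upper lanes are 1.0; b and c hold 3.0 and 4.0 in lane 0.
/// let a = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0);
/// let b = _mm_set_sh(3.0);
/// let c = _mm_set_sh(4.0);
///
/// // Lane 0 of r is 2.0 * 3.0 + 4.0 = 10.0; lanes 1..8 are copied from a (1.0).
/// let r = _mm_fmadd_sh(a, b, c);
/// ```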
5393#[inline]
5394#[target_feature(enable = "avx512fp16")]
5395#[cfg_attr(test, assert_instr(vfmadd))]
5396#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5397pub fn _mm_fmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5398    unsafe {
5399        let extracta: f16 = simd_extract!(a, 0);
5400        let extractb: f16 = simd_extract!(b, 0);
5401        let extractc: f16 = simd_extract!(c, 0);
5402        let r = fmaf16(extracta, extractb, extractc);
5403        simd_insert!(a, 0, r)
5404    }
5405}
5406
5407/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5408/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5409/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5410/// upper elements of dst.
5411///
5412/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sh)
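///
/// # Example
///
/// Only bit 0 of the mask is consulted; when it is clear, lane 0 is taken from a.
/// Illustrative sketch only, not compiled as a doctest; it assumes a nightly toolchain
/// with the unstable f16 intrinsics and a CPU supporting AVX512-FP16.
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// let a = _mm_set_sh(2.0);
/// let b = _mm_set_sh(3.0);
/// let c = _mm_set_sh(4.0);
///
/// // Mask bit 0 set: lane 0 is 2.0 * 3.0 + 4.0 = 10.0.
/// let r0 = _mm_mask_fmadd_sh(a, 0b1, b, c);
/// // Mask bit 0 clear: lane 0 is copied from a, i.e. 2.0.
/// let r1 = _mm_mask_fmadd_sh(a, 0b0, b, c);
/// ```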
5413#[inline]
5414#[target_feature(enable = "avx512fp16")]
5415#[cfg_attr(test, assert_instr(vfmadd))]
5416#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5417pub fn _mm_mask_fmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5418    unsafe {
5419        let mut fmadd: f16 = simd_extract!(a, 0);
5420        if k & 1 != 0 {
5421            let extractb: f16 = simd_extract!(b, 0);
5422            let extractc: f16 = simd_extract!(c, 0);
5423            fmadd = fmaf16(fmadd, extractb, extractc);
5424        }
5425        simd_insert!(a, 0, fmadd)
5426    }
5427}
5428
5429/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5430/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5431/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
5432/// upper elements of dst.
5433///
5434/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sh)
5435#[inline]
5436#[target_feature(enable = "avx512fp16")]
5437#[cfg_attr(test, assert_instr(vfmadd))]
5438#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5439pub fn _mm_mask3_fmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5440    unsafe {
5441        let mut fmadd: f16 = simd_extract!(c, 0);
5442        if k & 1 != 0 {
5443            let extracta: f16 = simd_extract!(a, 0);
5444            let extractb: f16 = simd_extract!(b, 0);
5445            fmadd = fmaf16(extracta, extractb, fmadd);
5446        }
5447        simd_insert!(c, 0, fmadd)
5448    }
5449}
5450
5451/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5452/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element
5453/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5454/// upper elements of dst.
5455///
5456/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sh)
5457#[inline]
5458#[target_feature(enable = "avx512fp16")]
5459#[cfg_attr(test, assert_instr(vfmadd))]
5460#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5461pub fn _mm_maskz_fmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5462    unsafe {
5463        let mut fmadd: f16 = 0.0;
5464        if k & 1 != 0 {
5465            let extracta: f16 = simd_extract!(a, 0);
5466            let extractb: f16 = simd_extract!(b, 0);
5467            let extractc: f16 = simd_extract!(c, 0);
5468            fmadd = fmaf16(extracta, extractb, extractc);
5469        }
5470        simd_insert!(a, 0, fmadd)
5471    }
5472}
5473
5474/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5475/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper
5476/// 7 packed elements from a to the upper elements of dst.
5477///
5478/// Rounding is done according to the rounding parameter, which can be one of:
5479///
5480/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5481/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5482/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5483/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5484/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5485///
5486/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sh)
5487#[inline]
5488#[target_feature(enable = "avx512fp16")]
5489#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5490#[rustc_legacy_const_generics(3)]
5491#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5492pub fn _mm_fmadd_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5493    unsafe {
5494        static_assert_rounding!(ROUNDING);
5495        let extracta: f16 = simd_extract!(a, 0);
5496        let extractb: f16 = simd_extract!(b, 0);
5497        let extractc: f16 = simd_extract!(c, 0);
5498        let r = vfmaddsh(extracta, extractb, extractc, ROUNDING);
5499        simd_insert!(a, 0, r)
5500    }
5501}
5502
5503/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5504/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5505/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5506/// upper elements of dst.
5507///
5508/// Rounding is done according to the rounding parameter, which can be one of:
5509///
5510/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5511/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5512/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5513/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5514/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5515///
5516/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sh)
5517#[inline]
5518#[target_feature(enable = "avx512fp16")]
5519#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5520#[rustc_legacy_const_generics(4)]
5521#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5522pub fn _mm_mask_fmadd_round_sh<const ROUNDING: i32>(
5523    a: __m128h,
5524    k: __mmask8,
5525    b: __m128h,
5526    c: __m128h,
5527) -> __m128h {
5528    unsafe {
5529        static_assert_rounding!(ROUNDING);
5530        let mut fmadd: f16 = simd_extract!(a, 0);
5531        if k & 1 != 0 {
5532            let extractb: f16 = simd_extract!(b, 0);
5533            let extractc: f16 = simd_extract!(c, 0);
5534            fmadd = vfmaddsh(fmadd, extractb, extractc, ROUNDING);
5535        }
5536        simd_insert!(a, 0, fmadd)
5537    }
5538}
5539
5540/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5541/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5542/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
5543/// upper elements of dst.
5544///
5545/// Rounding is done according to the rounding parameter, which can be one of:
5546///
5547/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5548/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5549/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5550/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5551/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5552///
5553/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sh)
5554#[inline]
5555#[target_feature(enable = "avx512fp16")]
5556#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5557#[rustc_legacy_const_generics(4)]
5558#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5559pub fn _mm_mask3_fmadd_round_sh<const ROUNDING: i32>(
5560    a: __m128h,
5561    b: __m128h,
5562    c: __m128h,
5563    k: __mmask8,
5564) -> __m128h {
5565    unsafe {
5566        static_assert_rounding!(ROUNDING);
5567        let mut fmadd: f16 = simd_extract!(c, 0);
5568        if k & 1 != 0 {
5569            let extracta: f16 = simd_extract!(a, 0);
5570            let extractb: f16 = simd_extract!(b, 0);
5571            fmadd = vfmaddsh(extracta, extractb, fmadd, ROUNDING);
5572        }
5573        simd_insert!(c, 0, fmadd)
5574    }
5575}
5576
5577/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5578/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element
5579/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5580/// upper elements of dst.
5581///
5582/// Rounding is done according to the rounding parameter, which can be one of:
5583///
5584/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5585/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5586/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5587/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5588/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5589///
5590/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sh)
5591#[inline]
5592#[target_feature(enable = "avx512fp16")]
5593#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5594#[rustc_legacy_const_generics(4)]
5595#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5596pub fn _mm_maskz_fmadd_round_sh<const ROUNDING: i32>(
5597    k: __mmask8,
5598    a: __m128h,
5599    b: __m128h,
5600    c: __m128h,
5601) -> __m128h {
5602    unsafe {
5603        static_assert_rounding!(ROUNDING);
5604        let mut fmadd: f16 = 0.0;
5605        if k & 1 != 0 {
5606            let extracta: f16 = simd_extract!(a, 0);
5607            let extractb: f16 = simd_extract!(b, 0);
5608            let extractc: f16 = simd_extract!(c, 0);
5609            fmadd = vfmaddsh(extracta, extractb, extractc, ROUNDING);
5610        }
5611        simd_insert!(a, 0, fmadd)
5612    }
5613}
5614
5615/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5616/// in c from the intermediate result, and store the results in dst.
5618///
5619/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_ph)
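///
/// # Example
///
/// fmsub negates the addend, so each lane computes a * b - c. Illustrative sketch only,
/// not compiled as a doctest; it assumes a nightly toolchain with the unstable f16
/// intrinsics and a CPU supporting AVX512-FP16 and AVX512VL.
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// let a = _mm_set1_ph(1.5);
/// let b = _mm_set1_ph(4.0);
/// let c = _mm_set1_ph(1.0);
///
/// // Every lane of r holds 1.5 * 4.0 - 1.0 = 5.0.
/// let r = _mm_fmsub_ph(a, b, c);
/// ```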
5620#[inline]
5621#[target_feature(enable = "avx512fp16,avx512vl")]
5622#[cfg_attr(test, assert_instr(vfmsub))]
5623#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5624pub fn _mm_fmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5625    unsafe { simd_fma(a, b, simd_neg(c)) }
5626}
5627
5628/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5629/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5630/// from a when the corresponding mask bit is not set).
5631///
5632/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_ph)
5633#[inline]
5634#[target_feature(enable = "avx512fp16,avx512vl")]
5635#[cfg_attr(test, assert_instr(vfmsub))]
5636#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5637pub fn _mm_mask_fmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5638    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), a) }
5639}
5640
5641/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5642/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5643/// from c when the corresponding mask bit is not set).
5644///
5645/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_ph)
5646#[inline]
5647#[target_feature(enable = "avx512fp16,avx512vl")]
5648#[cfg_attr(test, assert_instr(vfmsub))]
5649#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5650pub fn _mm_mask3_fmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5651    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), c) }
5652}
5653
5654/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5655/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
5656/// out when the corresponding mask bit is not set).
5657///
5658/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_ph)
5659#[inline]
5660#[target_feature(enable = "avx512fp16,avx512vl")]
5661#[cfg_attr(test, assert_instr(vfmsub))]
5662#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5663pub fn _mm_maskz_fmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5664    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), _mm_setzero_ph()) }
5665}
5666
5667/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5668/// in c from the intermediate result, and store the results in dst.
5669///
5670/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsub_ph)
5671#[inline]
5672#[target_feature(enable = "avx512fp16,avx512vl")]
5673#[cfg_attr(test, assert_instr(vfmsub))]
5674#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5675pub fn _mm256_fmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5676    unsafe { simd_fma(a, b, simd_neg(c)) }
5677}
5678
5679/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5680/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5681/// from a when the corresponding mask bit is not set).
5682///
5683/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsub_ph)
5684#[inline]
5685#[target_feature(enable = "avx512fp16,avx512vl")]
5686#[cfg_attr(test, assert_instr(vfmsub))]
5687#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5688pub fn _mm256_mask_fmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
5689    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), a) }
5690}
5691
5692/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5693/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5694/// from c when the corresponding mask bit is not set).
5695///
5696/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsub_ph)
5697#[inline]
5698#[target_feature(enable = "avx512fp16,avx512vl")]
5699#[cfg_attr(test, assert_instr(vfmsub))]
5700#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5701pub fn _mm256_mask3_fmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
5702    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), c) }
5703}
5704
5705/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5706/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
5707/// out when the corresponding mask bit is not set).
5708///
5709/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsub_ph)
5710#[inline]
5711#[target_feature(enable = "avx512fp16,avx512vl")]
5712#[cfg_attr(test, assert_instr(vfmsub))]
5713#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5714pub fn _mm256_maskz_fmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5715    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), _mm256_setzero_ph()) }
5716}
5717
5718/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5719/// in c from the intermediate result, and store the results in dst.
5720///
5721/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_ph)
5722#[inline]
5723#[target_feature(enable = "avx512fp16")]
5724#[cfg_attr(test, assert_instr(vfmsub))]
5725#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5726pub fn _mm512_fmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5727    unsafe { simd_fma(a, b, simd_neg(c)) }
5728}
5729
5730/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5731/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5732/// from a when the corresponding mask bit is not set).
5733///
5734/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_ph)
5735#[inline]
5736#[target_feature(enable = "avx512fp16")]
5737#[cfg_attr(test, assert_instr(vfmsub))]
5738#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5739pub fn _mm512_mask_fmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
5740    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), a) }
5741}
5742
5743/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5744/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5745/// from c when the corresponding mask bit is not set).
5746///
5747/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_ph)
5748#[inline]
5749#[target_feature(enable = "avx512fp16")]
5750#[cfg_attr(test, assert_instr(vfmsub))]
5751#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5752pub fn _mm512_mask3_fmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
5753    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), c) }
5754}
5755
5756/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5757/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
5758/// out when the corresponding mask bit is not set).
5759///
5760/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_ph)
5761#[inline]
5762#[target_feature(enable = "avx512fp16")]
5763#[cfg_attr(test, assert_instr(vfmsub))]
5764#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5765pub fn _mm512_maskz_fmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5766    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), _mm512_setzero_ph()) }
5767}
5768
5769/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5770/// in c from the intermediate result, and store the results in dst.
5771///
5772/// Rounding is done according to the rounding parameter, which can be one of:
5773///
5774/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5775/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5776/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5777/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5778/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5779///
5780/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_round_ph)
5781#[inline]
5782#[target_feature(enable = "avx512fp16")]
5783#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5784#[rustc_legacy_const_generics(3)]
5785#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5786pub fn _mm512_fmsub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5787    unsafe {
5788        static_assert_rounding!(ROUNDING);
5789        vfmaddph_512(a, b, simd_neg(c), ROUNDING)
5790    }
5791}
5792
5793/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5794/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5795/// from a when the corresponding mask bit is not set).
5796///
5797/// Rounding is done according to the rounding parameter, which can be one of:
5798///
5799/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5800/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5801/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5802/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5803/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5804///
5805/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_round_ph)
5806#[inline]
5807#[target_feature(enable = "avx512fp16")]
5808#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5809#[rustc_legacy_const_generics(4)]
5810#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5811pub fn _mm512_mask_fmsub_round_ph<const ROUNDING: i32>(
5812    a: __m512h,
5813    k: __mmask32,
5814    b: __m512h,
5815    c: __m512h,
5816) -> __m512h {
5817    unsafe {
5818        static_assert_rounding!(ROUNDING);
5819        simd_select_bitmask(k, _mm512_fmsub_round_ph::<ROUNDING>(a, b, c), a)
5820    }
5821}
5822
5823/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5824/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5825/// from c when the corresponding mask bit is not set).
5826///
5827/// Rounding is done according to the rounding parameter, which can be one of:
5828///
5829/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5830/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5831/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5832/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5833/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5834///
5835/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_round_ph)
5836#[inline]
5837#[target_feature(enable = "avx512fp16")]
5838#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5839#[rustc_legacy_const_generics(4)]
5840#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5841pub fn _mm512_mask3_fmsub_round_ph<const ROUNDING: i32>(
5842    a: __m512h,
5843    b: __m512h,
5844    c: __m512h,
5845    k: __mmask32,
5846) -> __m512h {
5847    unsafe {
5848        static_assert_rounding!(ROUNDING);
5849        simd_select_bitmask(k, _mm512_fmsub_round_ph::<ROUNDING>(a, b, c), c)
5850    }
5851}
5852
5853/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5854/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
5855/// out when the corresponding mask bit is not set).
5856///
5857/// Rounding is done according to the rounding parameter, which can be one of:
5858///
5859/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5860/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5861/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5862/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5863/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5864///
5865/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_round_ph)
5866#[inline]
5867#[target_feature(enable = "avx512fp16")]
5868#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5869#[rustc_legacy_const_generics(4)]
5870#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5871pub fn _mm512_maskz_fmsub_round_ph<const ROUNDING: i32>(
5872    k: __mmask32,
5873    a: __m512h,
5874    b: __m512h,
5875    c: __m512h,
5876) -> __m512h {
5877    unsafe {
5878        static_assert_rounding!(ROUNDING);
5879        simd_select_bitmask(
5880            k,
5881            _mm512_fmsub_round_ph::<ROUNDING>(a, b, c),
5882            _mm512_setzero_ph(),
5883        )
5884    }
5885}
5886
5887/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
5888/// element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper
5889/// 7 packed elements from a to the upper elements of dst.
5890///
5891/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_sh)
5892#[inline]
5893#[target_feature(enable = "avx512fp16")]
5894#[cfg_attr(test, assert_instr(vfmsub))]
5895#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5896pub fn _mm_fmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5897    unsafe {
5898        let extracta: f16 = simd_extract!(a, 0);
5899        let extractb: f16 = simd_extract!(b, 0);
5900        let extractc: f16 = simd_extract!(c, 0);
5901        let r = fmaf16(extracta, extractb, -extractc);
5902        simd_insert!(a, 0, r)
5903    }
5904}
5905
5906/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
5907/// element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
5908/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5909/// upper elements of dst.
5910///
5911/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_sh)
5912#[inline]
5913#[target_feature(enable = "avx512fp16")]
5914#[cfg_attr(test, assert_instr(vfmsub))]
5915#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5916pub fn _mm_mask_fmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5917    unsafe {
5918        let mut fmsub: f16 = simd_extract!(a, 0);
5919        if k & 1 != 0 {
5920            let extractb: f16 = simd_extract!(b, 0);
5921            let extractc: f16 = simd_extract!(c, 0);
5922            fmsub = fmaf16(fmsub, extractb, -extractc);
5923        }
5924        simd_insert!(a, 0, fmsub)
5925    }
5926}
5927
5928/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
5929/// element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
5930/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
5931/// upper elements of dst.
5932///
5933/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_sh)
5934#[inline]
5935#[target_feature(enable = "avx512fp16")]
5936#[cfg_attr(test, assert_instr(vfmsub))]
5937#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5938pub fn _mm_mask3_fmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5939    unsafe {
5940        let mut fmsub: f16 = simd_extract!(c, 0);
5941        if k & 1 != 0 {
5942            let extracta: f16 = simd_extract!(a, 0);
5943            let extractb: f16 = simd_extract!(b, 0);
5944            fmsub = fmaf16(extracta, extractb, -fmsub);
5945        }
5946        simd_insert!(c, 0, fmsub)
5947    }
5948}
5949
5950/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
5951/// element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element
5952/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5953/// upper elements of dst.
5954///
5955/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_sh)
5956#[inline]
5957#[target_feature(enable = "avx512fp16")]
5958#[cfg_attr(test, assert_instr(vfmsub))]
5959#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5960pub fn _mm_maskz_fmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5961    unsafe {
5962        let mut fmsub: f16 = 0.0;
5963        if k & 1 != 0 {
5964            let extracta: f16 = simd_extract!(a, 0);
5965            let extractb: f16 = simd_extract!(b, 0);
5966            let extractc: f16 = simd_extract!(c, 0);
5967            fmsub = fmaf16(extracta, extractb, -extractc);
5968        }
5969        simd_insert!(a, 0, fmsub)
5970    }
5971}
5972
5973/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
5974/// element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper
5975/// 7 packed elements from a to the upper elements of dst.
5976///
5977/// Rounding is done according to the rounding parameter, which can be one of:
5978///
5979/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5980/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5981/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5982/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5983/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5984///
5985/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_round_sh)
5986#[inline]
5987#[target_feature(enable = "avx512fp16")]
5988#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5989#[rustc_legacy_const_generics(3)]
5990#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5991pub fn _mm_fmsub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5992    unsafe {
5993        static_assert_rounding!(ROUNDING);
5994        let extracta: f16 = simd_extract!(a, 0);
5995        let extractb: f16 = simd_extract!(b, 0);
5996        let extractc: f16 = simd_extract!(c, 0);
5997        let r = vfmaddsh(extracta, extractb, -extractc, ROUNDING);
5998        simd_insert!(a, 0, r)
5999    }
6000}
6001
6002/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
6003/// element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
6004/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
6005/// upper elements of dst.
6006///
6007/// Rounding is done according to the rounding parameter, which can be one of:
6008///
6009/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6010/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6011/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6012/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6013/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6014///
6015/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_round_sh)
6016#[inline]
6017#[target_feature(enable = "avx512fp16")]
6018#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
6019#[rustc_legacy_const_generics(4)]
6020#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6021pub fn _mm_mask_fmsub_round_sh<const ROUNDING: i32>(
6022    a: __m128h,
6023    k: __mmask8,
6024    b: __m128h,
6025    c: __m128h,
6026) -> __m128h {
6027    unsafe {
6028        static_assert_rounding!(ROUNDING);
6029        let mut fmsub: f16 = simd_extract!(a, 0);
6030        if k & 1 != 0 {
6031            let extractb: f16 = simd_extract!(b, 0);
6032            let extractc: f16 = simd_extract!(c, 0);
6033            fmsub = vfmaddsh(fmsub, extractb, -extractc, ROUNDING);
6034        }
6035        simd_insert!(a, 0, fmsub)
6036    }
6037}
6038
6039/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
6040/// element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
6041/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
6042/// upper elements of dst.
6043///
6044/// Rounding is done according to the rounding parameter, which can be one of:
6045///
6046/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6047/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6048/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6049/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6050/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6051///
6052/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_round_sh)
6053#[inline]
6054#[target_feature(enable = "avx512fp16")]
6055#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
6056#[rustc_legacy_const_generics(4)]
6057#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6058pub fn _mm_mask3_fmsub_round_sh<const ROUNDING: i32>(
6059    a: __m128h,
6060    b: __m128h,
6061    c: __m128h,
6062    k: __mmask8,
6063) -> __m128h {
6064    unsafe {
6065        static_assert_rounding!(ROUNDING);
6066        let mut fmsub: f16 = simd_extract!(c, 0);
6067        if k & 1 != 0 {
6068            let extracta: f16 = simd_extract!(a, 0);
6069            let extractb: f16 = simd_extract!(b, 0);
6070            fmsub = vfmaddsh(extracta, extractb, -fmsub, ROUNDING);
6071        }
6072        simd_insert!(c, 0, fmsub)
6073    }
6074}
6075
6076/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
6077/// element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element
6078/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
6079/// upper elements of dst.
6080///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
6081/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_round_sh)
6082#[inline]
6083#[target_feature(enable = "avx512fp16")]
6084#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
6085#[rustc_legacy_const_generics(4)]
6086#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6087pub fn _mm_maskz_fmsub_round_sh<const ROUNDING: i32>(
6088    k: __mmask8,
6089    a: __m128h,
6090    b: __m128h,
6091    c: __m128h,
6092) -> __m128h {
6093    unsafe {
6094        static_assert_rounding!(ROUNDING);
6095        let mut fmsub: f16 = 0.0;
6096        if k & 1 != 0 {
6097            let extracta: f16 = simd_extract!(a, 0);
6098            let extractb: f16 = simd_extract!(b, 0);
6099            let extractc: f16 = simd_extract!(c, 0);
6100            fmsub = vfmaddsh(extracta, extractb, -extractc, ROUNDING);
6101        }
6102        simd_insert!(a, 0, fmsub)
6103    }
6104}
6105
6106/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6107/// result from packed elements in c, and store the results in dst.
6108///
6109/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_ph)
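///
/// # Example
///
/// fnmadd negates the product, so each lane computes -(a * b) + c, i.e. c - a * b.
/// Illustrative sketch only, not compiled as a doctest; it assumes a nightly toolchain
/// with the unstable f16 intrinsics and a CPU supporting AVX512-FP16 and AVX512VL.
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// let a = _mm_set1_ph(1.5);
/// let b = _mm_set1_ph(4.0);
/// let c = _mm_set1_ph(10.0);
///
/// // Every lane of r holds 10.0 - 1.5 * 4.0 = 4.0.
/// let r = _mm_fnmadd_ph(a, b, c);
/// ```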
6110#[inline]
6111#[target_feature(enable = "avx512fp16,avx512vl")]
6112#[cfg_attr(test, assert_instr(vfnmadd))]
6113#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6114pub fn _mm_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6115    unsafe { simd_fma(simd_neg(a), b, c) }
6116}
6117
6118/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6119/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6120/// from a when the corresponding mask bit is not set).
6121///
6122/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_ph)
6123#[inline]
6124#[target_feature(enable = "avx512fp16,avx512vl")]
6125#[cfg_attr(test, assert_instr(vfnmadd))]
6126#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6127pub fn _mm_mask_fnmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6128    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), a) }
6129}
6130
6131/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6132/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6133/// from c when the corresponding mask bit is not set).
6134///
6135/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_ph)
6136#[inline]
6137#[target_feature(enable = "avx512fp16,avx512vl")]
6138#[cfg_attr(test, assert_instr(vfnmadd))]
6139#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6140pub fn _mm_mask3_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6141    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), c) }
6142}
6143
6144/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6145/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6146/// out when the corresponding mask bit is not set).
6147///
6148/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_ph)
6149#[inline]
6150#[target_feature(enable = "avx512fp16,avx512vl")]
6151#[cfg_attr(test, assert_instr(vfnmadd))]
6152#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6153pub fn _mm_maskz_fnmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6154    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), _mm_setzero_ph()) }
6155}
6156
6157/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6158/// result from packed elements in c, and store the results in dst.
6159///
6160/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmadd_ph)
6161#[inline]
6162#[target_feature(enable = "avx512fp16,avx512vl")]
6163#[cfg_attr(test, assert_instr(vfnmadd))]
6164#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6165pub fn _mm256_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6166    unsafe { simd_fma(simd_neg(a), b, c) }
6167}
6168
6169/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6170/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6171/// from a when the corresponding mask bit is not set).
6172///
6173/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmadd_ph)
6174#[inline]
6175#[target_feature(enable = "avx512fp16,avx512vl")]
6176#[cfg_attr(test, assert_instr(vfnmadd))]
6177#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6178pub fn _mm256_mask_fnmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
6179    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), a) }
6180}
6181
6182/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6183/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6184/// from c when the corresponding mask bit is not set).
6185///
6186/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmadd_ph)
6187#[inline]
6188#[target_feature(enable = "avx512fp16,avx512vl")]
6189#[cfg_attr(test, assert_instr(vfnmadd))]
6190#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6191pub fn _mm256_mask3_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
6192    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), c) }
6193}
6194
6195/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6196/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6197/// out when the corresponding mask bit is not set).
6198///
6199/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmadd_ph)
6200#[inline]
6201#[target_feature(enable = "avx512fp16,avx512vl")]
6202#[cfg_attr(test, assert_instr(vfnmadd))]
6203#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6204pub fn _mm256_maskz_fnmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6205    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), _mm256_setzero_ph()) }
6206}
6207
6208/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6209/// result from packed elements in c, and store the results in dst.
6210///
6211/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_ph)
6212#[inline]
6213#[target_feature(enable = "avx512fp16")]
6214#[cfg_attr(test, assert_instr(vfnmadd))]
6215#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6216pub fn _mm512_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6217    unsafe { simd_fma(simd_neg(a), b, c) }
6218}
6219
6220/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6221/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6222/// from a when the corresponding mask bit is not set).
6223///
6224/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_ph)
6225#[inline]
6226#[target_feature(enable = "avx512fp16")]
6227#[cfg_attr(test, assert_instr(vfnmadd))]
6228#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6229pub fn _mm512_mask_fnmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
6230    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), a) }
6231}
6232
6233/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6234/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6235/// from c when the corresponding mask bit is not set).
6236///
6237/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_ph)
6238#[inline]
6239#[target_feature(enable = "avx512fp16")]
6240#[cfg_attr(test, assert_instr(vfnmadd))]
6241#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6242pub fn _mm512_mask3_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
6243    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), c) }
6244}
6245
6246/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6247/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6248/// out when the corresponding mask bit is not set).
6249///
6250/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_ph)
6251#[inline]
6252#[target_feature(enable = "avx512fp16")]
6253#[cfg_attr(test, assert_instr(vfnmadd))]
6254#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6255pub fn _mm512_maskz_fnmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6256    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), _mm512_setzero_ph()) }
6257}
6258
6259/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6260/// result from packed elements in c, and store the results in dst.
6261///
6262/// Rounding is done according to the rounding parameter, which can be one of:
6263///
6264/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6265/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6266/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6267/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6268/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6269///
6270/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_round_ph)
6271#[inline]
6272#[target_feature(enable = "avx512fp16")]
6273#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6274#[rustc_legacy_const_generics(3)]
6275#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6276pub fn _mm512_fnmadd_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6277    unsafe {
6278        static_assert_rounding!(ROUNDING);
6279        vfmaddph_512(simd_neg(a), b, c, ROUNDING)
6280    }
6281}
6282
6283/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6284/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6285/// from a when the corresponding mask bit is not set).
6286///
6287/// Rounding is done according to the rounding parameter, which can be one of:
6288///
6289/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6290/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6291/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6292/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6293/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6294///
6295/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_round_ph)
6296#[inline]
6297#[target_feature(enable = "avx512fp16")]
6298#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6299#[rustc_legacy_const_generics(4)]
6300#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6301pub fn _mm512_mask_fnmadd_round_ph<const ROUNDING: i32>(
6302    a: __m512h,
6303    k: __mmask32,
6304    b: __m512h,
6305    c: __m512h,
6306) -> __m512h {
6307    unsafe {
6308        static_assert_rounding!(ROUNDING);
6309        simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), a)
6310    }
6311}
6312
6313/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6314/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6315/// from c when the corresponding mask bit is not set).
6316///
6317/// Rounding is done according to the rounding parameter, which can be one of:
6318///
6319/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6320/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6321/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6322/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6323/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6324///
6325/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_round_ph)
6326#[inline]
6327#[target_feature(enable = "avx512fp16")]
6328#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6329#[rustc_legacy_const_generics(4)]
6330#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6331pub fn _mm512_mask3_fnmadd_round_ph<const ROUNDING: i32>(
6332    a: __m512h,
6333    b: __m512h,
6334    c: __m512h,
6335    k: __mmask32,
6336) -> __m512h {
6337    unsafe {
6338        static_assert_rounding!(ROUNDING);
6339        simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), c)
6340    }
6341}
6342
6343/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6344/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6345/// out when the corresponding mask bit is not set).
6346///
6347/// Rounding is done according to the rounding parameter, which can be one of:
6348///
6349/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6350/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6351/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6352/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6353/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6354///
6355/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_round_ph)
6356#[inline]
6357#[target_feature(enable = "avx512fp16")]
6358#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6359#[rustc_legacy_const_generics(4)]
6360#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6361pub fn _mm512_maskz_fnmadd_round_ph<const ROUNDING: i32>(
6362    k: __mmask32,
6363    a: __m512h,
6364    b: __m512h,
6365    c: __m512h,
6366) -> __m512h {
6367    unsafe {
6368        static_assert_rounding!(ROUNDING);
6369        simd_select_bitmask(
6370            k,
6371            _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c),
6372            _mm512_setzero_ph(),
6373        )
6374    }
6375}
6376
6377/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6378/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
6379/// elements from a to the upper elements of dst.
6380///
6381/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_sh)
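///
/// A minimal usage sketch (assumes a caller compiled with `avx512fp16` enabled and the unstable
/// `f16` type):
///
/// ```ignore
/// let a = _mm_set_sh(2.0);
/// let b = _mm_set_sh(3.0);
/// let c = _mm_set_sh(10.0);
/// // Lowest lane: -(2.0 * 3.0) + 10.0 == 4.0; the upper 7 lanes are copied from `a`.
/// let r = _mm_fnmadd_sh(a, b, c);
/// ```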
6382#[inline]
6383#[target_feature(enable = "avx512fp16")]
6384#[cfg_attr(test, assert_instr(vfnmadd))]
6385#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6386pub fn _mm_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6387    unsafe {
6388        let extracta: f16 = simd_extract!(a, 0);
6389        let extractb: f16 = simd_extract!(b, 0);
6390        let extractc: f16 = simd_extract!(c, 0);
6391        let r = fmaf16(-extracta, extractb, extractc);
6392        simd_insert!(a, 0, r)
6393    }
6394}
6395
6396/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6397/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6398/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6399/// elements of dst.
6400///
6401/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_sh)
6402#[inline]
6403#[target_feature(enable = "avx512fp16")]
6404#[cfg_attr(test, assert_instr(vfnmadd))]
6405#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6406pub fn _mm_mask_fnmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6407    unsafe {
6408        let mut fnmadd: f16 = simd_extract!(a, 0);
6409        if k & 1 != 0 {
6410            let extractb: f16 = simd_extract!(b, 0);
6411            let extractc: f16 = simd_extract!(c, 0);
6412            fnmadd = fmaf16(-fnmadd, extractb, extractc);
6413        }
6414        simd_insert!(a, 0, fnmadd)
6415    }
6416}
6417
6418/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6419/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6420/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
6421/// elements of dst.
6422///
6423/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_sh)
6424#[inline]
6425#[target_feature(enable = "avx512fp16")]
6426#[cfg_attr(test, assert_instr(vfnmadd))]
6427#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6428pub fn _mm_mask3_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6429    unsafe {
6430        let mut fnmadd: f16 = simd_extract!(c, 0);
6431        if k & 1 != 0 {
6432            let extracta: f16 = simd_extract!(a, 0);
6433            let extractb: f16 = simd_extract!(b, 0);
6434            fnmadd = fmaf16(-extracta, extractb, fnmadd);
6435        }
6436        simd_insert!(c, 0, fnmadd)
6437    }
6438}
6439
6440/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6441/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
6442/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6443/// elements of dst.
6444///
6445/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_sh)
6446#[inline]
6447#[target_feature(enable = "avx512fp16")]
6448#[cfg_attr(test, assert_instr(vfnmadd))]
6449#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6450pub fn _mm_maskz_fnmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6451    unsafe {
6452        let mut fnmadd: f16 = 0.0;
6453        if k & 1 != 0 {
6454            let extracta: f16 = simd_extract!(a, 0);
6455            let extractb: f16 = simd_extract!(b, 0);
6456            let extractc: f16 = simd_extract!(c, 0);
6457            fnmadd = fmaf16(-extracta, extractb, extractc);
6458        }
6459        simd_insert!(a, 0, fnmadd)
6460    }
6461}
6462
6463/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6464/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
6465/// elements from a to the upper elements of dst.
6466///
6467/// Rounding is done according to the rounding parameter, which can be one of:
6468///
6469/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6470/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6471/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6472/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6473/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6474///
6475/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_round_sh)
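///
/// A minimal usage sketch (assumes a caller compiled with `avx512fp16` enabled and the unstable
/// `f16` type); the rounding mode is supplied as a const generic:
///
/// ```ignore
/// let a = _mm_set_sh(2.0);
/// let b = _mm_set_sh(3.0);
/// let c = _mm_set_sh(10.0);
/// // Same arithmetic as `_mm_fnmadd_sh`, with an explicit rounding mode.
/// let r = _mm_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
/// ```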
6476#[inline]
6477#[target_feature(enable = "avx512fp16")]
6478#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6479#[rustc_legacy_const_generics(3)]
6480#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6481pub fn _mm_fnmadd_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6482    unsafe {
6483        static_assert_rounding!(ROUNDING);
6484        let extracta: f16 = simd_extract!(a, 0);
6485        let extractb: f16 = simd_extract!(b, 0);
6486        let extractc: f16 = simd_extract!(c, 0);
6487        let r = vfmaddsh(-extracta, extractb, extractc, ROUNDING);
6488        simd_insert!(a, 0, r)
6489    }
6490}
6491
6492/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6493/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6494/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6495/// elements of dst.
6496///
6497/// Rounding is done according to the rounding parameter, which can be one of:
6498///
6499/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6500/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6501/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6502/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6503/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6504///
6505/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_round_sh)
6506#[inline]
6507#[target_feature(enable = "avx512fp16")]
6508#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6509#[rustc_legacy_const_generics(4)]
6510#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6511pub fn _mm_mask_fnmadd_round_sh<const ROUNDING: i32>(
6512    a: __m128h,
6513    k: __mmask8,
6514    b: __m128h,
6515    c: __m128h,
6516) -> __m128h {
6517    unsafe {
6518        static_assert_rounding!(ROUNDING);
6519        let mut fnmadd: f16 = simd_extract!(a, 0);
6520        if k & 1 != 0 {
6521            let extractb: f16 = simd_extract!(b, 0);
6522            let extractc: f16 = simd_extract!(c, 0);
6523            fnmadd = vfmaddsh(-fnmadd, extractb, extractc, ROUNDING);
6524        }
6525        simd_insert!(a, 0, fnmadd)
6526    }
6527}
6528
6529/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6530/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6531/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
6532/// elements of dst.
6533///
6534/// Rounding is done according to the rounding parameter, which can be one of:
6535///
6536/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6537/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6538/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6539/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6540/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6541///
6542/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_round_sh)
6543#[inline]
6544#[target_feature(enable = "avx512fp16")]
6545#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6546#[rustc_legacy_const_generics(4)]
6547#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6548pub fn _mm_mask3_fnmadd_round_sh<const ROUNDING: i32>(
6549    a: __m128h,
6550    b: __m128h,
6551    c: __m128h,
6552    k: __mmask8,
6553) -> __m128h {
6554    unsafe {
6555        static_assert_rounding!(ROUNDING);
6556        let mut fnmadd: f16 = simd_extract!(c, 0);
6557        if k & 1 != 0 {
6558            let extracta: f16 = simd_extract!(a, 0);
6559            let extractb: f16 = simd_extract!(b, 0);
6560            fnmadd = vfmaddsh(-extracta, extractb, fnmadd, ROUNDING);
6561        }
6562        simd_insert!(c, 0, fnmadd)
6563    }
6564}
6565
6566/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6567/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
6568/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6569/// elements of dst.
6570///
6571/// Rounding is done according to the rounding parameter, which can be one of:
6572///
6573/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6574/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6575/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6576/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6577/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6578///
6579/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_round_sh)
6580#[inline]
6581#[target_feature(enable = "avx512fp16")]
6582#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6583#[rustc_legacy_const_generics(4)]
6584#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6585pub fn _mm_maskz_fnmadd_round_sh<const ROUNDING: i32>(
6586    k: __mmask8,
6587    a: __m128h,
6588    b: __m128h,
6589    c: __m128h,
6590) -> __m128h {
6591    unsafe {
6592        static_assert_rounding!(ROUNDING);
6593        let mut fnmadd: f16 = 0.0;
6594        if k & 1 != 0 {
6595            let extracta: f16 = simd_extract!(a, 0);
6596            let extractb: f16 = simd_extract!(b, 0);
6597            let extractc: f16 = simd_extract!(c, 0);
6598            fnmadd = vfmaddsh(-extracta, extractb, extractc, ROUNDING);
6599        }
6600        simd_insert!(a, 0, fnmadd)
6601    }
6602}
6603
6604/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6605/// in c from the negated intermediate result, and store the results in dst.
6606///
6607/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_ph)
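///
/// A minimal usage sketch (assumes a caller compiled with `avx512fp16` and `avx512vl` enabled and
/// the unstable `f16` type):
///
/// ```ignore
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// // Every lane: -(2.0 * 3.0) - 1.0 == -7.0.
/// let r = _mm_fnmsub_ph(a, b, c);
/// ```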
6608#[inline]
6609#[target_feature(enable = "avx512fp16,avx512vl")]
6610#[cfg_attr(test, assert_instr(vfnmsub))]
6611#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6612pub fn _mm_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6613    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
6614}
6615
6616/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6617/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6618/// copied from a when the corresponding mask bit is not set).
6619///
6620/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_ph)
6621#[inline]
6622#[target_feature(enable = "avx512fp16,avx512vl")]
6623#[cfg_attr(test, assert_instr(vfnmsub))]
6624#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6625pub fn _mm_mask_fnmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6626    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), a) }
6627}
6628
6629/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6630/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6631/// copied from c when the corresponding mask bit is not set).
6632///
6633/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_ph)
6634#[inline]
6635#[target_feature(enable = "avx512fp16,avx512vl")]
6636#[cfg_attr(test, assert_instr(vfnmsub))]
6637#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6638pub fn _mm_mask3_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6639    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), c) }
6640}
6641
6642/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6643/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6644/// zeroed out when the corresponding mask bit is not set).
6645///
6646/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_ph)
6647#[inline]
6648#[target_feature(enable = "avx512fp16,avx512vl")]
6649#[cfg_attr(test, assert_instr(vfnmsub))]
6650#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6651pub fn _mm_maskz_fnmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6652    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), _mm_setzero_ph()) }
6653}
6654
6655/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6656/// in c from the negated intermediate result, and store the results in dst.
6657///
6658/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmsub_ph)
6659#[inline]
6660#[target_feature(enable = "avx512fp16,avx512vl")]
6661#[cfg_attr(test, assert_instr(vfnmsub))]
6662#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6663pub fn _mm256_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6664    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
6665}
6666
6667/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6668/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6669/// copied from a when the corresponding mask bit is not set).
6670///
6671/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmsub_ph)
6672#[inline]
6673#[target_feature(enable = "avx512fp16,avx512vl")]
6674#[cfg_attr(test, assert_instr(vfnmsub))]
6675#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6676pub fn _mm256_mask_fnmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
6677    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), a) }
6678}
6679
6680/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6681/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6682/// copied from c when the corresponding mask bit is not set).
6683///
6684/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmsub_ph)
6685#[inline]
6686#[target_feature(enable = "avx512fp16,avx512vl")]
6687#[cfg_attr(test, assert_instr(vfnmsub))]
6688#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6689pub fn _mm256_mask3_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
6690    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), c) }
6691}
6692
6693/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6694/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6695/// zeroed out when the corresponding mask bit is not set).
6696///
6697/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmsub_ph)
6698#[inline]
6699#[target_feature(enable = "avx512fp16,avx512vl")]
6700#[cfg_attr(test, assert_instr(vfnmsub))]
6701#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6702pub fn _mm256_maskz_fnmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6703    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), _mm256_setzero_ph()) }
6704}
6705
6706/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6707/// in c from the negated intermediate result, and store the results in dst.
6708///
6709/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_ph)
6710#[inline]
6711#[target_feature(enable = "avx512fp16")]
6712#[cfg_attr(test, assert_instr(vfnmsub))]
6713#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6714pub fn _mm512_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6715    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
6716}
6717
6718/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6719/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6720/// copied from a when the corresponding mask bit is not set).
6721///
6722/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_ph)
6723#[inline]
6724#[target_feature(enable = "avx512fp16")]
6725#[cfg_attr(test, assert_instr(vfnmsub))]
6726#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6727pub fn _mm512_mask_fnmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
6728    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), a) }
6729}
6730
6731/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6732/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6733/// copied from c when the corresponding mask bit is not set).
6734///
6735/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_ph)
6736#[inline]
6737#[target_feature(enable = "avx512fp16")]
6738#[cfg_attr(test, assert_instr(vfnmsub))]
6739#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6740pub fn _mm512_mask3_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
6741    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), c) }
6742}
6743
6744/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6745/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6746/// zeroed out when the corresponding mask bit is not set).
6747///
6748/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_ph)
6749#[inline]
6750#[target_feature(enable = "avx512fp16")]
6751#[cfg_attr(test, assert_instr(vfnmsub))]
6752#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6753pub fn _mm512_maskz_fnmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6754    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), _mm512_setzero_ph()) }
6755}
6756
6757/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6758/// in c from the negated intermediate result, and store the results in dst.
6759///
6760/// Rounding is done according to the rounding parameter, which can be one of:
6761///
6762/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6763/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6764/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6765/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6766/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6767///
6768/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_round_ph)
6769#[inline]
6770#[target_feature(enable = "avx512fp16")]
6771#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6772#[rustc_legacy_const_generics(3)]
6773#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6774pub fn _mm512_fnmsub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6775    unsafe {
6776        static_assert_rounding!(ROUNDING);
6777        vfmaddph_512(simd_neg(a), b, simd_neg(c), ROUNDING)
6778    }
6779}
6780
6781/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6782/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6783/// copied from a when the corresponding mask bit is not set).
6784///
6785/// Rounding is done according to the rounding parameter, which can be one of:
6786///
6787/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6788/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6789/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6790/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6791/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6792///
6793/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_round_ph)
6794#[inline]
6795#[target_feature(enable = "avx512fp16")]
6796#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6797#[rustc_legacy_const_generics(4)]
6798#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6799pub fn _mm512_mask_fnmsub_round_ph<const ROUNDING: i32>(
6800    a: __m512h,
6801    k: __mmask32,
6802    b: __m512h,
6803    c: __m512h,
6804) -> __m512h {
6805    unsafe {
6806        static_assert_rounding!(ROUNDING);
6807        simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), a)
6808    }
6809}
6810
6811/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6812/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6813/// copied from c when the corresponding mask bit is not set).
6814///
6815/// Rounding is done according to the rounding parameter, which can be one of:
6816///
6817/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6818/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6819/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6820/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6821/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6822///
6823/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_round_ph)
6824#[inline]
6825#[target_feature(enable = "avx512fp16")]
6826#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6827#[rustc_legacy_const_generics(4)]
6828#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6829pub fn _mm512_mask3_fnmsub_round_ph<const ROUNDING: i32>(
6830    a: __m512h,
6831    b: __m512h,
6832    c: __m512h,
6833    k: __mmask32,
6834) -> __m512h {
6835    unsafe {
6836        static_assert_rounding!(ROUNDING);
6837        simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), c)
6838    }
6839}
6840
6841/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6842/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6843/// zeroed out when the corresponding mask bit is not set).
6844///
6845/// Rounding is done according to the rounding parameter, which can be one of:
6846///
6847/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6848/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6849/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6850/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6851/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6852///
6853/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_round_ph)
6854#[inline]
6855#[target_feature(enable = "avx512fp16")]
6856#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6857#[rustc_legacy_const_generics(4)]
6858#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6859pub fn _mm512_maskz_fnmsub_round_ph<const ROUNDING: i32>(
6860    k: __mmask32,
6861    a: __m512h,
6862    b: __m512h,
6863    c: __m512h,
6864) -> __m512h {
6865    unsafe {
6866        static_assert_rounding!(ROUNDING);
6867        simd_select_bitmask(
6868            k,
6869            _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c),
6870            _mm512_setzero_ph(),
6871        )
6872    }
6873}
6874
6875/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
6876/// element in c from the negated intermediate result. Store the result in the lower element of dst, and copy the
6877/// upper 7 packed elements from a to the upper elements of dst.
6878///
6879/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_sh)
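///
/// A minimal usage sketch (assumes a caller compiled with `avx512fp16` enabled and the unstable
/// `f16` type):
///
/// ```ignore
/// let a = _mm_set_sh(2.0);
/// let b = _mm_set_sh(3.0);
/// let c = _mm_set_sh(10.0);
/// // Lowest lane: -(2.0 * 3.0) - 10.0 == -16.0; the upper 7 lanes are copied from `a`.
/// let r = _mm_fnmsub_sh(a, b, c);
/// ```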
6880#[inline]
6881#[target_feature(enable = "avx512fp16")]
6882#[cfg_attr(test, assert_instr(vfnmsub))]
6883#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6884pub fn _mm_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6885    unsafe {
6886        let extracta: f16 = simd_extract!(a, 0);
6887        let extractb: f16 = simd_extract!(b, 0);
6888        let extractc: f16 = simd_extract!(c, 0);
6889        let r = fmaf16(-extracta, extractb, -extractc);
6890        simd_insert!(a, 0, r)
6891    }
6892}
6893
6894/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
6895/// element in c from the negated intermediate result. Store the result in the lower element of dst using
6896/// writemask k (the element is copied from a when the mask bit 0 is not set), and copy the upper 7 packed
6897/// elements from a to the upper elements of dst.
6898///
6899/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_sh)
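///
/// A minimal usage sketch (assumes a caller compiled with `avx512fp16` enabled and the unstable
/// `f16` type):
///
/// ```ignore
/// let a = _mm_set_sh(5.0);
/// let b = _mm_set_sh(3.0);
/// let c = _mm_set_sh(1.0);
/// // Mask bit 0 set: the lowest lane is -(5.0 * 3.0) - 1.0 == -16.0.
/// let computed = _mm_mask_fnmsub_sh(a, 0b1, b, c);
/// // Mask bit 0 clear: the lowest lane is copied from `a`, i.e. 5.0.
/// let merged = _mm_mask_fnmsub_sh(a, 0b0, b, c);
/// ```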
6900#[inline]
6901#[target_feature(enable = "avx512fp16")]
6902#[cfg_attr(test, assert_instr(vfnmsub))]
6903#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6904pub fn _mm_mask_fnmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6905    unsafe {
6906        let mut fnmsub: f16 = simd_extract!(a, 0);
6907        if k & 1 != 0 {
6908            let extractb: f16 = simd_extract!(b, 0);
6909            let extractc: f16 = simd_extract!(c, 0);
6910            fnmsub = fmaf16(-fnmsub, extractb, -extractc);
6911        }
6912        simd_insert!(a, 0, fnmsub)
6913    }
6914}
6915
6916/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
6917/// element in c from the negated intermediate result. Store the result in the lower element of dst using
6918/// writemask k (the element is copied from c when the mask bit 0 is not set), and copy the upper 7 packed
6919/// elements from c to the upper elements of dst.
6920///
6921/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_sh)
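///
/// A minimal usage sketch (assumes a caller compiled with `avx512fp16` enabled and the unstable
/// `f16` type):
///
/// ```ignore
/// let a = _mm_set_sh(2.0);
/// let b = _mm_set_sh(3.0);
/// let c = _mm_set_sh(10.0);
/// // Mask bit 0 set: the lowest lane is -(2.0 * 3.0) - 10.0 == -16.0; upper lanes come from `c`.
/// let computed = _mm_mask3_fnmsub_sh(a, b, c, 0b1);
/// // Mask bit 0 clear: the lowest lane is copied from `c`, i.e. 10.0.
/// let merged = _mm_mask3_fnmsub_sh(a, b, c, 0b0);
/// ```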
6922#[inline]
6923#[target_feature(enable = "avx512fp16")]
6924#[cfg_attr(test, assert_instr(vfnmsub))]
6925#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6926pub fn _mm_mask3_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6927    unsafe {
6928        let mut fnmsub: f16 = simd_extract!(c, 0);
6929        if k & 1 != 0 {
6930            let extracta: f16 = simd_extract!(a, 0);
6931            let extractb: f16 = simd_extract!(b, 0);
6932            fnmsub = fmaf16(-extracta, extractb, -fnmsub);
6933        }
6934        simd_insert!(c, 0, fnmsub)
6935    }
6936}
6937
6938/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
6939/// element in c from the negated intermediate result. Store the result in the lower element of dst using
6940/// zeromask k (the element is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed
6941/// elements from a to the upper elements of dst.
6942///
6943/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_sh)
6944#[inline]
6945#[target_feature(enable = "avx512fp16")]
6946#[cfg_attr(test, assert_instr(vfnmsub))]
6947#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6948pub fn _mm_maskz_fnmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6949    unsafe {
6950        let mut fnmsub: f16 = 0.0;
6951        if k & 1 != 0 {
6952            let extracta: f16 = simd_extract!(a, 0);
6953            let extractb: f16 = simd_extract!(b, 0);
6954            let extractc: f16 = simd_extract!(c, 0);
6955            fnmsub = fmaf16(-extracta, extractb, -extractc);
6956        }
6957        simd_insert!(a, 0, fnmsub)
6958    }
6959}
6960
6961/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
6962/// element in c from the negated intermediate result. Store the result in the lower element of dst, and copy the
6963/// upper 7 packed elements from a to the upper elements of dst.
6964///
6965/// Rounding is done according to the rounding parameter, which can be one of:
6966///
6967/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6968/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6969/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6970/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6971/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6972///
6973/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_round_sh)
6974#[inline]
6975#[target_feature(enable = "avx512fp16")]
6976#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6977#[rustc_legacy_const_generics(3)]
6978#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6979pub fn _mm_fnmsub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6980    unsafe {
6981        static_assert_rounding!(ROUNDING);
6982        let extracta: f16 = simd_extract!(a, 0);
6983        let extractb: f16 = simd_extract!(b, 0);
6984        let extractc: f16 = simd_extract!(c, 0);
6985        let r = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
6986        simd_insert!(a, 0, r)
6987    }
6988}
6989
6990/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
6991/// element in c from the negated intermediate result. Store the result in the lower element of dst using
6992/// writemask k (the element is copied from a when the mask bit 0 is not set), and copy the upper 7 packed
6993/// elements from a to the upper elements of dst.
6994///
6995/// Rounding is done according to the rounding parameter, which can be one of:
6996///
6997/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6998/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6999/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7000/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7001/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7002///
7003/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_round_sh)
7004#[inline]
7005#[target_feature(enable = "avx512fp16")]
7006#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7007#[rustc_legacy_const_generics(4)]
7008#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7009pub fn _mm_mask_fnmsub_round_sh<const ROUNDING: i32>(
7010    a: __m128h,
7011    k: __mmask8,
7012    b: __m128h,
7013    c: __m128h,
7014) -> __m128h {
7015    unsafe {
7016        static_assert_rounding!(ROUNDING);
7017        let mut fnmsub: f16 = simd_extract!(a, 0);
7018        if k & 1 != 0 {
7019            let extractb: f16 = simd_extract!(b, 0);
7020            let extractc: f16 = simd_extract!(c, 0);
7021            fnmsub = vfmaddsh(-fnmsub, extractb, -extractc, ROUNDING);
7022        }
7023        simd_insert!(a, 0, fnmsub)
7024    }
7025}
7026
7027/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
7028/// element in c from the negated intermediate result. Store the result in the lower element of dst using
7029/// writemask k (the element is copied from c when the mask bit 0 is not set), and copy the upper 7 packed
7030/// elements from c to the upper elements of dst.
7031///
7032/// Rounding is done according to the rounding parameter, which can be one of:
7033///
7034/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7035/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7036/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7037/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7038/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7039///
7040/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_round_sh)
7041#[inline]
7042#[target_feature(enable = "avx512fp16")]
7043#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7044#[rustc_legacy_const_generics(4)]
7045#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7046pub fn _mm_mask3_fnmsub_round_sh<const ROUNDING: i32>(
7047    a: __m128h,
7048    b: __m128h,
7049    c: __m128h,
7050    k: __mmask8,
7051) -> __m128h {
7052    unsafe {
7053        static_assert_rounding!(ROUNDING);
7054        let mut fnmsub: f16 = simd_extract!(c, 0);
7055        if k & 1 != 0 {
7056            let extracta: f16 = simd_extract!(a, 0);
7057            let extractb: f16 = simd_extract!(b, 0);
7058            fnmsub = vfmaddsh(-extracta, extractb, -fnmsub, ROUNDING);
7059        }
7060        simd_insert!(c, 0, fnmsub)
7061    }
7062}
7063
7064/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
7065/// element in c from the negated intermediate result. Store the result in the lower element of dst using
7066/// zeromask k (the element is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed
7067/// elements from a to the upper elements of dst.
7068///
7069/// Rounding is done according to the rounding parameter, which can be one of:
7070///
7071/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7072/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7073/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7074/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7075/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7076///
7077/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_round_sh)
7078#[inline]
7079#[target_feature(enable = "avx512fp16")]
7080#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7081#[rustc_legacy_const_generics(4)]
7082#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7083pub fn _mm_maskz_fnmsub_round_sh<const ROUNDING: i32>(
7084    k: __mmask8,
7085    a: __m128h,
7086    b: __m128h,
7087    c: __m128h,
7088) -> __m128h {
7089    unsafe {
7090        static_assert_rounding!(ROUNDING);
7091        let mut fnmsub: f16 = 0.0;
7092        if k & 1 != 0 {
7093            let extracta: f16 = simd_extract!(a, 0);
7094            let extractb: f16 = simd_extract!(b, 0);
7095            let extractc: f16 = simd_extract!(c, 0);
7096            fnmsub = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
7097        }
7098        simd_insert!(a, 0, fnmsub)
7099    }
7100}
7101
7102/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7103/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7104///
7105/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmaddsub_ph)
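///
/// A minimal usage sketch (assumes a caller compiled with `avx512fp16` and `avx512vl` enabled and
/// the unstable `f16` type):
///
/// ```ignore
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// // Even-indexed lanes: 2.0 * 3.0 - 1.0 == 5.0; odd-indexed lanes: 2.0 * 3.0 + 1.0 == 7.0.
/// let r = _mm_fmaddsub_ph(a, b, c);
/// ```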
7106#[inline]
7107#[target_feature(enable = "avx512fp16,avx512vl")]
7108#[cfg_attr(test, assert_instr(vfmaddsub))]
7109#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7110pub fn _mm_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7111    unsafe { vfmaddsubph_128(a, b, c) }
7112}
7113
7114/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7115/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7116/// (the element is copied from a when the corresponding mask bit is not set).
7117///
7118/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmaddsub_ph)
7119#[inline]
7120#[target_feature(enable = "avx512fp16,avx512vl")]
7121#[cfg_attr(test, assert_instr(vfmaddsub))]
7122#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7123pub fn _mm_mask_fmaddsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
7124    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), a) }
7125}
7126
7127/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7128/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7129/// (the element is copied from c when the corresponding mask bit is not set).
7130///
7131/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmaddsub_ph)
7132#[inline]
7133#[target_feature(enable = "avx512fp16,avx512vl")]
7134#[cfg_attr(test, assert_instr(vfmaddsub))]
7135#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7136pub fn _mm_mask3_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
7137    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), c) }
7138}
7139
7140/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7141/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7142/// (the element is zeroed out when the corresponding mask bit is not set).
7143///
7144/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmaddsub_ph)
7145#[inline]
7146#[target_feature(enable = "avx512fp16,avx512vl")]
7147#[cfg_attr(test, assert_instr(vfmaddsub))]
7148#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7149pub fn _mm_maskz_fmaddsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7150    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), _mm_setzero_ph()) }
7151}
7152
7153/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7154/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7155///
7156/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmaddsub_ph)
7157#[inline]
7158#[target_feature(enable = "avx512fp16,avx512vl")]
7159#[cfg_attr(test, assert_instr(vfmaddsub))]
7160#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7161pub fn _mm256_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7162    unsafe { vfmaddsubph_256(a, b, c) }
7163}
7164
7165/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7166/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7167/// (the element is copied from a when the corresponding mask bit is not set).
7168///
7169/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmaddsub_ph)
7170#[inline]
7171#[target_feature(enable = "avx512fp16,avx512vl")]
7172#[cfg_attr(test, assert_instr(vfmaddsub))]
7173#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7174pub fn _mm256_mask_fmaddsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
7175    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), a) }
7176}
7177
7178/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7179/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7180/// (the element is copied from c when the corresponding mask bit is not set).
7181///
7182/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmaddsub_ph)
7183#[inline]
7184#[target_feature(enable = "avx512fp16,avx512vl")]
7185#[cfg_attr(test, assert_instr(vfmaddsub))]
7186#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7187pub fn _mm256_mask3_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
7188    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), c) }
7189}
7190
7191/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7192/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7193/// (the element is zeroed out when the corresponding mask bit is not set).
7194///
7195/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmaddsub_ph)
7196#[inline]
7197#[target_feature(enable = "avx512fp16,avx512vl")]
7198#[cfg_attr(test, assert_instr(vfmaddsub))]
7199#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7200pub fn _mm256_maskz_fmaddsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7201    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), _mm256_setzero_ph()) }
7202}
7203
7204/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7205/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7206///
7207/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_ph)
7208#[inline]
7209#[target_feature(enable = "avx512fp16")]
7210#[cfg_attr(test, assert_instr(vfmaddsub))]
7211#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7212pub fn _mm512_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7213    _mm512_fmaddsub_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
7214}
7215
7216/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7217/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7218/// (the element is copied from a when the corresponding mask bit is not set).
7219///
7220/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_ph)
7221#[inline]
7222#[target_feature(enable = "avx512fp16")]
7223#[cfg_attr(test, assert_instr(vfmaddsub))]
7224#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7225pub fn _mm512_mask_fmaddsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
7226    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), a) }
7227}
7228
7229/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7230/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7231/// (the element is copied from c when the corresponding mask bit is not set).
7232///
7233/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_ph)
7234#[inline]
7235#[target_feature(enable = "avx512fp16")]
7236#[cfg_attr(test, assert_instr(vfmaddsub))]
7237#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7238pub fn _mm512_mask3_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
7239    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), c) }
7240}
7241
7242/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7243/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7244/// (the element is zeroed out when the corresponding mask bit is not set).
7245///
7246/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_ph)
7247#[inline]
7248#[target_feature(enable = "avx512fp16")]
7249#[cfg_attr(test, assert_instr(vfmaddsub))]
7250#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7251pub fn _mm512_maskz_fmaddsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7252    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), _mm512_setzero_ph()) }
7253}
7254
7255/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7256/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7257///
7258/// Rounding is done according to the rounding parameter, which can be one of:
7259///
7260/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7261/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7262/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7263/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7264/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7265///
7266/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_round_ph)
7267#[inline]
7268#[target_feature(enable = "avx512fp16")]
7269#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
7270#[rustc_legacy_const_generics(3)]
7271#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7272pub fn _mm512_fmaddsub_round_ph<const ROUNDING: i32>(
7273    a: __m512h,
7274    b: __m512h,
7275    c: __m512h,
7276) -> __m512h {
7277    unsafe {
7278        static_assert_rounding!(ROUNDING);
7279        vfmaddsubph_512(a, b, c, ROUNDING)
7280    }
7281}
7282
7283/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7284/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7285/// (the element is copied from a when the corresponding mask bit is not set).
7286///
7287/// Rounding is done according to the rounding parameter, which can be one of:
7288///
7289/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7290/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7291/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7292/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7293/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7294///
7295/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_round_ph)
7296#[inline]
7297#[target_feature(enable = "avx512fp16")]
7298#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
7299#[rustc_legacy_const_generics(4)]
7300#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7301pub fn _mm512_mask_fmaddsub_round_ph<const ROUNDING: i32>(
7302    a: __m512h,
7303    k: __mmask32,
7304    b: __m512h,
7305    c: __m512h,
7306) -> __m512h {
7307    unsafe {
7308        static_assert_rounding!(ROUNDING);
7309        simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), a)
7310    }
7311}
7312
7313/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7314/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7315/// (the element is copied from c when the corresponding mask bit is not set).
7316///
7317/// Rounding is done according to the rounding parameter, which can be one of:
7318///
7319/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7320/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7321/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7322/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7323/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7324///
7325/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_round_ph)
7326#[inline]
7327#[target_feature(enable = "avx512fp16")]
7328#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
7329#[rustc_legacy_const_generics(4)]
7330#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7331pub fn _mm512_mask3_fmaddsub_round_ph<const ROUNDING: i32>(
7332    a: __m512h,
7333    b: __m512h,
7334    c: __m512h,
7335    k: __mmask32,
7336) -> __m512h {
7337    unsafe {
7338        static_assert_rounding!(ROUNDING);
7339        simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), c)
7340    }
7341}
7342
7343/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7344/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7345/// (the element is zeroed out when the corresponding mask bit is not set).
7346///
7347/// Rounding is done according to the rounding parameter, which can be one of:
7348///
7349/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7350/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7351/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7352/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7353/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7354///
7355/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_round_ph)
7356#[inline]
7357#[target_feature(enable = "avx512fp16")]
7358#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
7359#[rustc_legacy_const_generics(4)]
7360#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7361pub fn _mm512_maskz_fmaddsub_round_ph<const ROUNDING: i32>(
7362    k: __mmask32,
7363    a: __m512h,
7364    b: __m512h,
7365    c: __m512h,
7366) -> __m512h {
7367    unsafe {
7368        static_assert_rounding!(ROUNDING);
7369        simd_select_bitmask(
7370            k,
7371            _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c),
7372            _mm512_setzero_ph(),
7373        )
7374    }
7375}
7376
7377/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7378/// and add packed elements in c to/from the intermediate result, and store the results in dst.
7379///
7380/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsubadd_ph)
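///
/// A minimal usage sketch (assumes a caller compiled with `avx512fp16` and `avx512vl` enabled and
/// the unstable `f16` type):
///
/// ```ignore
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// // Even-indexed lanes: 2.0 * 3.0 + 1.0 == 7.0; odd-indexed lanes: 2.0 * 3.0 - 1.0 == 5.0.
/// let r = _mm_fmsubadd_ph(a, b, c);
/// ```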
7381#[inline]
7382#[target_feature(enable = "avx512fp16,avx512vl")]
7383#[cfg_attr(test, assert_instr(vfmsubadd))]
7384#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7385pub fn _mm_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7386    unsafe { vfmaddsubph_128(a, b, simd_neg(c)) }
7387}
7388
7389/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7390/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7391/// (the element is copied from a when the corresponding mask bit is not set).
7392///
7393/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsubadd_ph)
7394#[inline]
7395#[target_feature(enable = "avx512fp16,avx512vl")]
7396#[cfg_attr(test, assert_instr(vfmsubadd))]
7397#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7398pub fn _mm_mask_fmsubadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
7399    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), a) }
7400}
7401
7402/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7403/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7404/// (the element is copied from c when the corresponding mask bit is not set).
7405///
7406/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsubadd_ph)
7407#[inline]
7408#[target_feature(enable = "avx512fp16,avx512vl")]
7409#[cfg_attr(test, assert_instr(vfmsubadd))]
7410#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7411pub fn _mm_mask3_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
7412    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), c) }
7413}
7414
7415/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7416/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7417/// (the element is zeroed out when the corresponding mask bit is not set).
7418///
7419/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsubadd_ph)
7420#[inline]
7421#[target_feature(enable = "avx512fp16,avx512vl")]
7422#[cfg_attr(test, assert_instr(vfmsubadd))]
7423#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7424pub fn _mm_maskz_fmsubadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7425    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), _mm_setzero_ph()) }
7426}
7427
7428/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7429/// and add packed elements in c to/from the intermediate result, and store the results in dst.
7430///
7431/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsubadd_ph)
7432#[inline]
7433#[target_feature(enable = "avx512fp16,avx512vl")]
7434#[cfg_attr(test, assert_instr(vfmsubadd))]
7435#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7436pub fn _mm256_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7437    unsafe { vfmaddsubph_256(a, b, simd_neg(c)) }
7438}
7439
7440/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7441/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7442/// (the element is copied from a when the corresponding mask bit is not set).
7443///
7444/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsubadd_ph)
7445#[inline]
7446#[target_feature(enable = "avx512fp16,avx512vl")]
7447#[cfg_attr(test, assert_instr(vfmsubadd))]
7448#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7449pub fn _mm256_mask_fmsubadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
7450    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), a) }
7451}
7452
7453/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7454/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7455/// (the element is copied from c when the corresponding mask bit is not set).
7456///
7457/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsubadd_ph)
7458#[inline]
7459#[target_feature(enable = "avx512fp16,avx512vl")]
7460#[cfg_attr(test, assert_instr(vfmsubadd))]
7461#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7462pub fn _mm256_mask3_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
7463    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), c) }
7464}
7465
7466/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7467/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7468/// (the element is zeroed out when the corresponding mask bit is not set).
7469///
7470/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsubadd_ph)
7471#[inline]
7472#[target_feature(enable = "avx512fp16,avx512vl")]
7473#[cfg_attr(test, assert_instr(vfmsubadd))]
7474#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7475pub fn _mm256_maskz_fmsubadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7476    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), _mm256_setzero_ph()) }
7477}
7478
7479/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7480/// and add packed elements in c to/from the intermediate result, and store the results in dst.
7481///
7482/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_ph)
7483#[inline]
7484#[target_feature(enable = "avx512fp16")]
7485#[cfg_attr(test, assert_instr(vfmsubadd))]
7486#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7487pub fn _mm512_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7488    _mm512_fmsubadd_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
7489}
7490
7491/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7492/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7493/// (the element is copied from a when the corresponding mask bit is not set).
7494///
7495/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_ph)
7496#[inline]
7497#[target_feature(enable = "avx512fp16")]
7498#[cfg_attr(test, assert_instr(vfmsubadd))]
7499#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7500pub fn _mm512_mask_fmsubadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
7501    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), a) }
7502}
7503
7504/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7505/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7506/// (the element is copied from c when the corresponding mask bit is not set).
7507///
7508/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_ph)
7509#[inline]
7510#[target_feature(enable = "avx512fp16")]
7511#[cfg_attr(test, assert_instr(vfmsubadd))]
7512#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7513pub fn _mm512_mask3_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
7514    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), c) }
7515}
7516
7517/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7518/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7519/// (the element is zeroed out when the corresponding mask bit is not set).
7520///
7521/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_ph)
7522#[inline]
7523#[target_feature(enable = "avx512fp16")]
7524#[cfg_attr(test, assert_instr(vfmsubadd))]
7525#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7526pub fn _mm512_maskz_fmsubadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7527    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), _mm512_setzero_ph()) }
7528}
7529
7530/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7531/// and add packed elements in c to/from the intermediate result, and store the results in dst.
7532///
7533/// Rounding is done according to the rounding parameter, which can be one of:
7534///
7535/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7536/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7537/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7538/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7539/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7540///
7541/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_round_ph)
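///
/// An illustrative sketch of selecting an explicit rounding mode (ignored doctest; the inputs
/// are built with `_mm512_set1_ph`, defined elsewhere in this module):
///
/// ```ignore
/// let a = _mm512_set1_ph(1.0);
/// let b = _mm512_set1_ph(2.0);
/// let c = _mm512_set1_ph(0.5);
/// // Round to nearest and suppress exceptions for this one operation.
/// let r = _mm512_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
/// ```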
7542#[inline]
7543#[target_feature(enable = "avx512fp16")]
7544#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7545#[rustc_legacy_const_generics(3)]
7546#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7547pub fn _mm512_fmsubadd_round_ph<const ROUNDING: i32>(
7548    a: __m512h,
7549    b: __m512h,
7550    c: __m512h,
7551) -> __m512h {
7552    unsafe {
7553        static_assert_rounding!(ROUNDING);
7554        vfmaddsubph_512(a, b, simd_neg(c), ROUNDING)
7555    }
7556}
7557
7558/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7559/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7560/// (the element is copied from a when the corresponding mask bit is not set).
7561///
7562/// Rounding is done according to the rounding parameter, which can be one of:
7563///
7564/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7565/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7566/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7567/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7568/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7569///
7570/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_round_ph)
7571#[inline]
7572#[target_feature(enable = "avx512fp16")]
7573#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7574#[rustc_legacy_const_generics(4)]
7575#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7576pub fn _mm512_mask_fmsubadd_round_ph<const ROUNDING: i32>(
7577    a: __m512h,
7578    k: __mmask32,
7579    b: __m512h,
7580    c: __m512h,
7581) -> __m512h {
7582    unsafe {
7583        static_assert_rounding!(ROUNDING);
7584        simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), a)
7585    }
7586}
7587
7588/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7589/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7590/// (the element is copied from c when the corresponding mask bit is not set).
7591///
7592/// Rounding is done according to the rounding parameter, which can be one of:
7593///
7594/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7595/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7596/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7597/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7598/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7599///
7600/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_round_ph)
7601#[inline]
7602#[target_feature(enable = "avx512fp16")]
7603#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7604#[rustc_legacy_const_generics(4)]
7605#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7606pub fn _mm512_mask3_fmsubadd_round_ph<const ROUNDING: i32>(
7607    a: __m512h,
7608    b: __m512h,
7609    c: __m512h,
7610    k: __mmask32,
7611) -> __m512h {
7612    unsafe {
7613        static_assert_rounding!(ROUNDING);
7614        simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), c)
7615    }
7616}
7617
7618/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7619/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7620/// (the element is zeroed out when the corresponding mask bit is not set).
7621///
7622/// Rounding is done according to the rounding parameter, which can be one of:
7623///
7624/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7625/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7626/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7627/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7628/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7629///
7630/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_round_ph)
7631#[inline]
7632#[target_feature(enable = "avx512fp16")]
7633#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7634#[rustc_legacy_const_generics(4)]
7635#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7636pub fn _mm512_maskz_fmsubadd_round_ph<const ROUNDING: i32>(
7637    k: __mmask32,
7638    a: __m512h,
7639    b: __m512h,
7640    c: __m512h,
7641) -> __m512h {
7642    unsafe {
7643        static_assert_rounding!(ROUNDING);
7644        simd_select_bitmask(
7645            k,
7646            _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c),
7647            _mm512_setzero_ph(),
7648        )
7649    }
7650}
7651
7652/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`.
7653/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7654///
7655/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ph)
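///
/// A minimal illustrative sketch (ignored doctest):
///
/// ```ignore
/// let a = _mm_set1_ph(4.0);
/// let r = _mm_rcp_ph(a);
/// // every lane of r is approximately 0.25, within a relative error of 1.5*2^-12
/// ```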
7656#[inline]
7657#[target_feature(enable = "avx512fp16,avx512vl")]
7658#[cfg_attr(test, assert_instr(vrcpph))]
7659#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7660pub fn _mm_rcp_ph(a: __m128h) -> __m128h {
7661    _mm_mask_rcp_ph(_mm_undefined_ph(), 0xff, a)
7662}
7663
7664/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`
7665/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
7666/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7667///
7668/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_ph)
7669#[inline]
7670#[target_feature(enable = "avx512fp16,avx512vl")]
7671#[cfg_attr(test, assert_instr(vrcpph))]
7672#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7673pub fn _mm_mask_rcp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
7674    unsafe { vrcpph_128(a, src, k) }
7675}
7676
7677/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`
7678/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
7679/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7680///
7681/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_ph)
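///
/// A short illustrative sketch of the zeromask behaviour (ignored doctest):
///
/// ```ignore
/// let a = _mm_set1_ph(4.0);
/// // Only lane 0 is computed; every other lane is zeroed.
/// let r = _mm_maskz_rcp_ph(0b0000_0001, a);
/// // r ~= [0.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
/// ```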
7682#[inline]
7683#[target_feature(enable = "avx512fp16,avx512vl")]
7684#[cfg_attr(test, assert_instr(vrcpph))]
7685#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7686pub fn _mm_maskz_rcp_ph(k: __mmask8, a: __m128h) -> __m128h {
7687    _mm_mask_rcp_ph(_mm_setzero_ph(), k, a)
7688}
7689
7690/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`.
7691/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7692///
7693/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rcp_ph)
7694#[inline]
7695#[target_feature(enable = "avx512fp16,avx512vl")]
7696#[cfg_attr(test, assert_instr(vrcpph))]
7697#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7698pub fn _mm256_rcp_ph(a: __m256h) -> __m256h {
7699    _mm256_mask_rcp_ph(_mm256_undefined_ph(), 0xffff, a)
7700}
7701
7702/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`
7703/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
7704/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7705///
7706/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rcp_ph)
7707#[inline]
7708#[target_feature(enable = "avx512fp16,avx512vl")]
7709#[cfg_attr(test, assert_instr(vrcpph))]
7710#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7711pub fn _mm256_mask_rcp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
7712    unsafe { vrcpph_256(a, src, k) }
7713}
7714
7715/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`
7716/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
7717/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7718///
7719/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rcp_ph)
7720#[inline]
7721#[target_feature(enable = "avx512fp16,avx512vl")]
7722#[cfg_attr(test, assert_instr(vrcpph))]
7723#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7724pub fn _mm256_maskz_rcp_ph(k: __mmask16, a: __m256h) -> __m256h {
7725    _mm256_mask_rcp_ph(_mm256_setzero_ph(), k, a)
7726}
7727
7728/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`.
7729/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7730///
7731/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp_ph)
7732#[inline]
7733#[target_feature(enable = "avx512fp16")]
7734#[cfg_attr(test, assert_instr(vrcpph))]
7735#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7736pub fn _mm512_rcp_ph(a: __m512h) -> __m512h {
7737    _mm512_mask_rcp_ph(_mm512_undefined_ph(), 0xffffffff, a)
7738}
7739
7740/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`
7741/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
7742/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7743///
7744/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp_ph)
7745#[inline]
7746#[target_feature(enable = "avx512fp16")]
7747#[cfg_attr(test, assert_instr(vrcpph))]
7748#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7749pub fn _mm512_mask_rcp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
7750    unsafe { vrcpph_512(a, src, k) }
7751}
7752
7753/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in `a`, and store the results in `dst`
7754/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
7755/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7756///
7757/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rcp_ph)
7758#[inline]
7759#[target_feature(enable = "avx512fp16")]
7760#[cfg_attr(test, assert_instr(vrcpph))]
7761#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7762pub fn _mm512_maskz_rcp_ph(k: __mmask32, a: __m512h) -> __m512h {
7763    _mm512_mask_rcp_ph(_mm512_setzero_ph(), k, a)
7764}
7765
7766/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
7767/// store the result in the lower element of dst, and copy the upper 7 packed elements from a to the
7768/// upper elements of dst.
7769/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7770///
7771/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_sh)
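///
/// A minimal illustrative sketch (ignored doctest):
///
/// ```ignore
/// let a = _mm_set1_ph(1.0);
/// let b = _mm_set1_ph(4.0);
/// let r = _mm_rcp_sh(a, b);
/// // lane 0 of r is approximately 0.25; lanes 1..=7 are copied from a (all 1.0)
/// ```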
7772#[inline]
7773#[target_feature(enable = "avx512fp16")]
7774#[cfg_attr(test, assert_instr(vrcpsh))]
7775#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7776pub fn _mm_rcp_sh(a: __m128h, b: __m128h) -> __m128h {
7777    _mm_mask_rcp_sh(_mm_undefined_ph(), 0xff, a, b)
7778}
7779
7780/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
7781/// store the result in the lower element of dst using writemask k (the element is copied from src when
7782/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
7783/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7784///
7785/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_sh)
7786#[inline]
7787#[target_feature(enable = "avx512fp16")]
7788#[cfg_attr(test, assert_instr(vrcpsh))]
7789#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7790pub fn _mm_mask_rcp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
7791    unsafe { vrcpsh(a, b, src, k) }
7792}
7793
7794/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
7795/// store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
7796/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
7797/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7798///
7799/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_sh)
7800#[inline]
7801#[target_feature(enable = "avx512fp16")]
7802#[cfg_attr(test, assert_instr(vrcpsh))]
7803#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7804pub fn _mm_maskz_rcp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
7805    _mm_mask_rcp_sh(_mm_setzero_ph(), k, a, b)
7806}
7807
7808/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7809/// elements in a, and store the results in dst.
7810/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7811///
7812/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ph)
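///
/// A minimal illustrative sketch (ignored doctest):
///
/// ```ignore
/// let a = _mm_set1_ph(4.0);
/// let r = _mm_rsqrt_ph(a);
/// // every lane of r is approximately 0.5, i.e. 1/sqrt(4.0)
/// ```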
7813#[inline]
7814#[target_feature(enable = "avx512fp16,avx512vl")]
7815#[cfg_attr(test, assert_instr(vrsqrtph))]
7816#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7817pub fn _mm_rsqrt_ph(a: __m128h) -> __m128h {
7818    _mm_mask_rsqrt_ph(_mm_undefined_ph(), 0xff, a)
7819}
7820
7821/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7822/// elements in a, and store the results in dst using writemask k (elements are copied from src when
7823/// the corresponding mask bit is not set).
7824/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7825///
7826/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_ph)
7827#[inline]
7828#[target_feature(enable = "avx512fp16,avx512vl")]
7829#[cfg_attr(test, assert_instr(vrsqrtph))]
7830#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7831pub fn _mm_mask_rsqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
7832    unsafe { vrsqrtph_128(a, src, k) }
7833}
7834
7835/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7836/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
7837/// corresponding mask bit is not set).
7838/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7839///
7840/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_ph)
7841#[inline]
7842#[target_feature(enable = "avx512fp16,avx512vl")]
7843#[cfg_attr(test, assert_instr(vrsqrtph))]
7844#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7845pub fn _mm_maskz_rsqrt_ph(k: __mmask8, a: __m128h) -> __m128h {
7846    _mm_mask_rsqrt_ph(_mm_setzero_ph(), k, a)
7847}
7848
7849/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7850/// elements in a, and store the results in dst.
7851/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7852///
7853/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rsqrt_ph)
7854#[inline]
7855#[target_feature(enable = "avx512fp16,avx512vl")]
7856#[cfg_attr(test, assert_instr(vrsqrtph))]
7857#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7858pub fn _mm256_rsqrt_ph(a: __m256h) -> __m256h {
7859    _mm256_mask_rsqrt_ph(_mm256_undefined_ph(), 0xffff, a)
7860}
7861
7862/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7863/// elements in a, and store the results in dst using writemask k (elements are copied from src when
7864/// the corresponding mask bit is not set).
7865/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7866///
7867/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rsqrt_ph)
7868#[inline]
7869#[target_feature(enable = "avx512fp16,avx512vl")]
7870#[cfg_attr(test, assert_instr(vrsqrtph))]
7871#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7872pub fn _mm256_mask_rsqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
7873    unsafe { vrsqrtph_256(a, src, k) }
7874}
7875
7876/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7877/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
7878/// corresponding mask bit is not set).
7879/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7880///
7881/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rsqrt_ph)
7882#[inline]
7883#[target_feature(enable = "avx512fp16,avx512vl")]
7884#[cfg_attr(test, assert_instr(vrsqrtph))]
7885#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7886pub fn _mm256_maskz_rsqrt_ph(k: __mmask16, a: __m256h) -> __m256h {
7887    _mm256_mask_rsqrt_ph(_mm256_setzero_ph(), k, a)
7888}
7889
7890/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7891/// elements in a, and store the results in dst.
7892/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7893///
7894/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt_ph)
7895#[inline]
7896#[target_feature(enable = "avx512fp16")]
7897#[cfg_attr(test, assert_instr(vrsqrtph))]
7898#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7899pub fn _mm512_rsqrt_ph(a: __m512h) -> __m512h {
7900    _mm512_mask_rsqrt_ph(_mm512_undefined_ph(), 0xffffffff, a)
7901}
7902
7903/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7904/// elements in a, and store the results in dst using writemask k (elements are copied from src when
7905/// the corresponding mask bit is not set).
7906/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7907///
7908/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt_ph)
7909#[inline]
7910#[target_feature(enable = "avx512fp16")]
7911#[cfg_attr(test, assert_instr(vrsqrtph))]
7912#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7913pub fn _mm512_mask_rsqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
7914    unsafe { vrsqrtph_512(a, src, k) }
7915}
7916
7917/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7918/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
7919/// corresponding mask bit is not set).
7920/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7921///
7922/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt_ph)
7923#[inline]
7924#[target_feature(enable = "avx512fp16")]
7925#[cfg_attr(test, assert_instr(vrsqrtph))]
7926#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7927pub fn _mm512_maskz_rsqrt_ph(k: __mmask32, a: __m512h) -> __m512h {
7928    _mm512_mask_rsqrt_ph(_mm512_setzero_ph(), k, a)
7929}
7930
7931/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
7932/// element in b, store the result in the lower element of dst, and copy the upper 7 packed elements from a
7933/// to the upper elements of dst.
7934/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7935///
7936/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_sh)
7937#[inline]
7938#[target_feature(enable = "avx512fp16")]
7939#[cfg_attr(test, assert_instr(vrsqrtsh))]
7940#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7941pub fn _mm_rsqrt_sh(a: __m128h, b: __m128h) -> __m128h {
7942    _mm_mask_rsqrt_sh(_mm_undefined_ph(), 0xff, a, b)
7943}
7944
7945/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
7946/// element in b, store the result in the lower element of dst using writemask k (the element is copied from src
7947/// when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
7948/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7949///
7950/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_sh)
7951#[inline]
7952#[target_feature(enable = "avx512fp16")]
7953#[cfg_attr(test, assert_instr(vrsqrtsh))]
7954#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7955pub fn _mm_mask_rsqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
7956    unsafe { vrsqrtsh(a, b, src, k) }
7957}
7958
7959/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
7960/// element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when
7961/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
7962/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7963///
7964/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_sh)
7965#[inline]
7966#[target_feature(enable = "avx512fp16")]
7967#[cfg_attr(test, assert_instr(vrsqrtsh))]
7968#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7969pub fn _mm_maskz_rsqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
7970    _mm_mask_rsqrt_sh(_mm_setzero_ph(), k, a, b)
7971}
7972
7973/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
7974/// results in dst.
7975///
7976/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ph)
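///
/// A minimal illustrative sketch (ignored doctest):
///
/// ```ignore
/// let a = _mm_set1_ph(9.0);
/// let r = _mm_sqrt_ph(a);
/// // every lane of r is 3.0; unlike _mm_rsqrt_ph this is a full square root, not an approximation
/// ```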
7977#[inline]
7978#[target_feature(enable = "avx512fp16,avx512vl")]
7979#[cfg_attr(test, assert_instr(vsqrtph))]
7980#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7981pub fn _mm_sqrt_ph(a: __m128h) -> __m128h {
7982    unsafe { simd_fsqrt(a) }
7983}
7984
7985/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
7986/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7987///
7988/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ph)
7989#[inline]
7990#[target_feature(enable = "avx512fp16,avx512vl")]
7991#[cfg_attr(test, assert_instr(vsqrtph))]
7992#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7993pub fn _mm_mask_sqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
7994    unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), src) }
7995}
7996
7997/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
7998/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7999///
8000/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ph)
8001#[inline]
8002#[target_feature(enable = "avx512fp16,avx512vl")]
8003#[cfg_attr(test, assert_instr(vsqrtph))]
8004#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8005pub fn _mm_maskz_sqrt_ph(k: __mmask8, a: __m128h) -> __m128h {
8006    unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), _mm_setzero_ph()) }
8007}
8008
8009/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8010/// results in dst.
8011///
8012/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sqrt_ph)
8013#[inline]
8014#[target_feature(enable = "avx512fp16,avx512vl")]
8015#[cfg_attr(test, assert_instr(vsqrtph))]
8016#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8017pub fn _mm256_sqrt_ph(a: __m256h) -> __m256h {
8018    unsafe { simd_fsqrt(a) }
8019}
8020
8021/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8022/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8023///
8024/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sqrt_ph)
8025#[inline]
8026#[target_feature(enable = "avx512fp16,avx512vl")]
8027#[cfg_attr(test, assert_instr(vsqrtph))]
8028#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8029pub fn _mm256_mask_sqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
8030    unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), src) }
8031}
8032
8033/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8034/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8035///
8036/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sqrt_ph)
8037#[inline]
8038#[target_feature(enable = "avx512fp16,avx512vl")]
8039#[cfg_attr(test, assert_instr(vsqrtph))]
8040#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8041pub fn _mm256_maskz_sqrt_ph(k: __mmask16, a: __m256h) -> __m256h {
8042    unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), _mm256_setzero_ph()) }
8043}
8044
8045/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8046/// results in dst.
8047///
8048/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_ph)
8049#[inline]
8050#[target_feature(enable = "avx512fp16")]
8051#[cfg_attr(test, assert_instr(vsqrtph))]
8052#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8053pub fn _mm512_sqrt_ph(a: __m512h) -> __m512h {
8054    unsafe { simd_fsqrt(a) }
8055}
8056
8057/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8058/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8059///
8060/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_ph)
8061#[inline]
8062#[target_feature(enable = "avx512fp16")]
8063#[cfg_attr(test, assert_instr(vsqrtph))]
8064#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8065pub fn _mm512_mask_sqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
8066    unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), src) }
8067}
8068
8069/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8070/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8071///
8072/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_ph)
8073#[inline]
8074#[target_feature(enable = "avx512fp16")]
8075#[cfg_attr(test, assert_instr(vsqrtph))]
8076#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8077pub fn _mm512_maskz_sqrt_ph(k: __mmask32, a: __m512h) -> __m512h {
8078    unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), _mm512_setzero_ph()) }
8079}
8080
8081/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8082/// results in dst.
8083/// Rounding is done according to the rounding parameter, which can be one of:
8084///
8085/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8086/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8087/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8088/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8089/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8090///
8091/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_round_ph)
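///
/// An illustrative sketch of supplying the rounding mode via the const parameter (ignored doctest):
///
/// ```ignore
/// let a = _mm512_set1_ph(2.0);
/// // Truncate (round toward zero) and suppress exceptions for this operation.
/// let r = _mm512_sqrt_round_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
/// ```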
8092#[inline]
8093#[target_feature(enable = "avx512fp16")]
8094#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
8095#[rustc_legacy_const_generics(1)]
8096#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8097pub fn _mm512_sqrt_round_ph<const ROUNDING: i32>(a: __m512h) -> __m512h {
8098    unsafe {
8099        static_assert_rounding!(ROUNDING);
8100        vsqrtph_512(a, ROUNDING)
8101    }
8102}
8103
8104/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8105/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8106/// Rounding is done according to the rounding parameter, which can be one of:
8107///
8108/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8109/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8110/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8111/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8112/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8113///
8114/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_round_ph)
8115#[inline]
8116#[target_feature(enable = "avx512fp16")]
8117#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
8118#[rustc_legacy_const_generics(3)]
8119#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8120pub fn _mm512_mask_sqrt_round_ph<const ROUNDING: i32>(
8121    src: __m512h,
8122    k: __mmask32,
8123    a: __m512h,
8124) -> __m512h {
8125    unsafe {
8126        static_assert_rounding!(ROUNDING);
8127        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), src)
8128    }
8129}
8130
8131/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8132/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8133/// Rounding is done according to the rounding parameter, which can be one of:
8134///
8135/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8136/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8137/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8138/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8139/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8140///
8141/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_round_ph)
8142#[inline]
8143#[target_feature(enable = "avx512fp16")]
8144#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
8145#[rustc_legacy_const_generics(2)]
8146#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8147pub fn _mm512_maskz_sqrt_round_ph<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512h {
8148    unsafe {
8149        static_assert_rounding!(ROUNDING);
8150        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), _mm512_setzero_ph())
8151    }
8152}
8153
8154/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8155/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
8156/// elements of dst.
8157///
8158/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sh)
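///
/// A minimal illustrative sketch (ignored doctest):
///
/// ```ignore
/// let a = _mm_set1_ph(1.0);
/// let b = _mm_set1_ph(9.0);
/// let r = _mm_sqrt_sh(a, b);
/// // lane 0 of r is 3.0; lanes 1..=7 are copied from a (all 1.0)
/// ```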
8159#[inline]
8160#[target_feature(enable = "avx512fp16")]
8161#[cfg_attr(test, assert_instr(vsqrtsh))]
8162#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8163pub fn _mm_sqrt_sh(a: __m128h, b: __m128h) -> __m128h {
8164    _mm_mask_sqrt_sh(_mm_undefined_ph(), 0xff, a, b)
8165}
8166
8167/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8168/// the result in the lower element of dst using writemask k (the element is copied from src when mask
8169/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8170///
8171/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_sh)
8172#[inline]
8173#[target_feature(enable = "avx512fp16")]
8174#[cfg_attr(test, assert_instr(vsqrtsh))]
8175#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8176pub fn _mm_mask_sqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8177    _mm_mask_sqrt_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
8178}
8179
8180/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8181/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
8182/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8183///
8184/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sh)
8185#[inline]
8186#[target_feature(enable = "avx512fp16")]
8187#[cfg_attr(test, assert_instr(vsqrtsh))]
8188#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8189pub fn _mm_maskz_sqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8190    _mm_mask_sqrt_sh(_mm_setzero_ph(), k, a, b)
8191}
8192
8193/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8194/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
8195/// elements of dst.
8196/// Rounding is done according to the rounding parameter, which can be one of:
8197///
8198/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8199/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8200/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8201/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8202/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8203///
8204/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sh)
8205#[inline]
8206#[target_feature(enable = "avx512fp16")]
8207#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
8208#[rustc_legacy_const_generics(2)]
8209#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8210pub fn _mm_sqrt_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
8211    static_assert_rounding!(ROUNDING);
8212    _mm_mask_sqrt_round_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
8213}
8214
8215/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8216/// the result in the lower element of dst using writemask k (the element is copied from src when mask
8217/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8218/// Rounding is done according to the rounding parameter, which can be one of:
8219///
8220/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8221/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8222/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8223/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8224/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8225///
8226/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_sh)
8227#[inline]
8228#[target_feature(enable = "avx512fp16")]
8229#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
8230#[rustc_legacy_const_generics(4)]
8231#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8232pub fn _mm_mask_sqrt_round_sh<const ROUNDING: i32>(
8233    src: __m128h,
8234    k: __mmask8,
8235    a: __m128h,
8236    b: __m128h,
8237) -> __m128h {
8238    unsafe {
8239        static_assert_rounding!(ROUNDING);
8240        vsqrtsh(a, b, src, k, ROUNDING)
8241    }
8242}
8243
8244/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8245/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
8246/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8247/// Rounding is done according to the rounding parameter, which can be one of:
8248///
8249/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8250/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8251/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8252/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8253/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8254///
8255/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_sh)
8256#[inline]
8257#[target_feature(enable = "avx512fp16")]
8258#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
8259#[rustc_legacy_const_generics(3)]
8260#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8261pub fn _mm_maskz_sqrt_round_sh<const ROUNDING: i32>(
8262    k: __mmask8,
8263    a: __m128h,
8264    b: __m128h,
8265) -> __m128h {
8266    static_assert_rounding!(ROUNDING);
8267    _mm_mask_sqrt_round_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
8268}
8269
8270/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8271/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
8272/// value when inputs are NaN or signed-zero values.
8273///
8274/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ph)
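///
/// A minimal illustrative sketch (ignored doctest):
///
/// ```ignore
/// let a = _mm_set1_ph(1.0);
/// let b = _mm_set1_ph(2.0);
/// let r = _mm_max_ph(a, b);
/// // every lane of r is 2.0; NaN and signed-zero inputs follow the x86 convention
/// // described above rather than the IEEE 754 maximum operation
/// ```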
8275#[inline]
8276#[target_feature(enable = "avx512fp16,avx512vl")]
8277#[cfg_attr(test, assert_instr(vmaxph))]
8278#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8279pub fn _mm_max_ph(a: __m128h, b: __m128h) -> __m128h {
8280    unsafe { vmaxph_128(a, b) }
8281}
8282
8283/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8284/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8285/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8286/// NaN or signed-zero values.
8287///
8288/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_ph)
8289#[inline]
8290#[target_feature(enable = "avx512fp16,avx512vl")]
8291#[cfg_attr(test, assert_instr(vmaxph))]
8292#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8293pub fn _mm_mask_max_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8294    unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), src) }
8295}
8296
8297/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8298/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8299/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8300/// NaN or signed-zero values.
8301///
8302/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_ph)
8303#[inline]
8304#[target_feature(enable = "avx512fp16,avx512vl")]
8305#[cfg_attr(test, assert_instr(vmaxph))]
8306#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8307pub fn _mm_maskz_max_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8308    unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), _mm_setzero_ph()) }
8309}
8310
8311/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8312/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
8313/// value when inputs are NaN or signed-zero values.
8314///
8315/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_ph)
8316#[inline]
8317#[target_feature(enable = "avx512fp16,avx512vl")]
8318#[cfg_attr(test, assert_instr(vmaxph))]
8319#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8320pub fn _mm256_max_ph(a: __m256h, b: __m256h) -> __m256h {
8321    unsafe { vmaxph_256(a, b) }
8322}
8323
8324/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8325/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8326/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8327/// NaN or signed-zero values.
8328///
8329/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_ph)
8330#[inline]
8331#[target_feature(enable = "avx512fp16,avx512vl")]
8332#[cfg_attr(test, assert_instr(vmaxph))]
8333#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8334pub fn _mm256_mask_max_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8335    unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), src) }
8336}
8337
8338/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8339/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8340/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8341/// NaN or signed-zero values.
8342///
8343/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_ph)
8344#[inline]
8345#[target_feature(enable = "avx512fp16,avx512vl")]
8346#[cfg_attr(test, assert_instr(vmaxph))]
8347#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8348pub fn _mm256_maskz_max_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8349    unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), _mm256_setzero_ph()) }
8350}
8351
8352/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8353/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
8354/// value when inputs are NaN or signed-zero values.
8355///
8356/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_ph)
8357#[inline]
8358#[target_feature(enable = "avx512fp16")]
8359#[cfg_attr(test, assert_instr(vmaxph))]
8360#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8361pub fn _mm512_max_ph(a: __m512h, b: __m512h) -> __m512h {
8362    _mm512_max_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b)
8363}
8364
8365/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8366/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8367/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8368/// NaN or signed-zero values.
8369///
8370/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_ph)
8371#[inline]
8372#[target_feature(enable = "avx512fp16")]
8373#[cfg_attr(test, assert_instr(vmaxph))]
8374#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8375pub fn _mm512_mask_max_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8376    unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), src) }
8377}
8378
8379/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8380/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8381/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8382/// NaN or signed-zero values.
8383///
8384/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_ph)
8385#[inline]
8386#[target_feature(enable = "avx512fp16")]
8387#[cfg_attr(test, assert_instr(vmaxph))]
8388#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8389pub fn _mm512_maskz_max_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8390    unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), _mm512_setzero_ph()) }
8391}
8392
8393/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8394/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8395/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8396/// NaN or signed-zero values.
8397///
8398/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_round_ph)
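///
/// An illustrative sketch of suppressing exceptions via the sae parameter (ignored doctest):
///
/// ```ignore
/// let a = _mm512_set1_ph(1.0);
/// let b = _mm512_set1_ph(2.0);
/// let r = _mm512_max_round_ph::<_MM_FROUND_NO_EXC>(a, b);
/// // every lane of r is 2.0
/// ```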
8399#[inline]
8400#[target_feature(enable = "avx512fp16")]
8401#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
8402#[rustc_legacy_const_generics(2)]
8403#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8404pub fn _mm512_max_round_ph<const SAE: i32>(a: __m512h, b: __m512h) -> __m512h {
8405    unsafe {
8406        static_assert_sae!(SAE);
8407        vmaxph_512(a, b, SAE)
8408    }
8409}
8410
8411/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8412/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8413/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8414/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8415///
8416/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_round_ph)
8417#[inline]
8418#[target_feature(enable = "avx512fp16")]
8419#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
8420#[rustc_legacy_const_generics(4)]
8421#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8422pub fn _mm512_mask_max_round_ph<const SAE: i32>(
8423    src: __m512h,
8424    k: __mmask32,
8425    a: __m512h,
8426    b: __m512h,
8427) -> __m512h {
8428    unsafe {
8429        static_assert_sae!(SAE);
8430        simd_select_bitmask(k, _mm512_max_round_ph::<SAE>(a, b), src)
8431    }
8432}
8433
8434/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8435/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8436/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8437/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8438///
8439/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_round_ph)
8440#[inline]
8441#[target_feature(enable = "avx512fp16")]
8442#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
8443#[rustc_legacy_const_generics(3)]
8444#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8445pub fn _mm512_maskz_max_round_ph<const SAE: i32>(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8446    unsafe {
8447        static_assert_sae!(SAE);
8448        simd_select_bitmask(k, _mm512_max_round_ph::<SAE>(a, b), _mm512_setzero_ph())
8449    }
8450}
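
// Usage sketch (editor's addition): the *_round_* variants take the SAE value as a const
// generic rather than a runtime argument; _MM_FROUND_NO_EXC suppresses exceptions, while
// _MM_FROUND_CUR_DIRECTION gives the default behaviour. Values are illustrative only.
//
//     let a = _mm512_set1_ph(-0.5);
//     let b = _mm512_set1_ph(0.25);
//     let all = _mm512_max_round_ph::<_MM_FROUND_NO_EXC>(a, b);
//     let low = _mm512_maskz_max_round_ph::<_MM_FROUND_NO_EXC>(0x0000_00ff, a, b);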
8451
8452/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum
8453/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
8454/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value
8455/// when inputs are NaN or signed-zero values.
8456///
8457/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sh)
8458#[inline]
8459#[target_feature(enable = "avx512fp16,avx512vl")]
8460#[cfg_attr(test, assert_instr(vmaxsh))]
8461#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8462pub fn _mm_max_sh(a: __m128h, b: __m128h) -> __m128h {
8463    _mm_mask_max_sh(_mm_undefined_ph(), 0xff, a, b)
8464}
8465
8466/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum
8467/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0
8468/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. Does not follow
8469/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8470///
8471/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_sh)
8472#[inline]
8473#[target_feature(enable = "avx512fp16,avx512vl")]
8474#[cfg_attr(test, assert_instr(vmaxsh))]
8475#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8476pub fn _mm_mask_max_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8477    _mm_mask_max_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
8478}
8479
8480/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8481/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
8482/// copy the upper 7 packed elements from a to the upper elements of dst. Does not follow the IEEE Standard
8483/// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8484///
8485/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_sh)
8486#[inline]
8487#[target_feature(enable = "avx512fp16,avx512vl")]
8488#[cfg_attr(test, assert_instr(vmaxsh))]
8489#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8490pub fn _mm_maskz_max_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8491    _mm_mask_max_sh(_mm_setzero_ph(), k, a, b)
8492}
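
// Usage sketch (editor's addition; input values are arbitrary): only lane 0 is compared,
// while lanes 1..7 of the result always come from `a`.
//
//     let a = _mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.5);
//     let b = _mm_set_sh(2.0);
//     // Lane 0 becomes max(0.5, 2.0) = 2.0; lanes 1..7 are 1.0 through 7.0, copied from a.
//     let r = _mm_max_sh(a, b);
//     // With a zeromask of 0, lane 0 is zeroed instead.
//     let z = _mm_maskz_max_sh(0, a, b);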
8493
8494/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8495/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
8496/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8497/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8498///
8499/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sh)
8500#[inline]
8501#[target_feature(enable = "avx512fp16,avx512vl")]
8502#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
8503#[rustc_legacy_const_generics(2)]
8504#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8505pub fn _mm_max_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
8506    static_assert_sae!(SAE);
8507    _mm_mask_max_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
8508}
8509
8510/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8511/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
8512/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
8513/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
8514/// (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8515///
8516/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_sh)
8517#[inline]
8518#[target_feature(enable = "avx512fp16,avx512vl")]
8519#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
8520#[rustc_legacy_const_generics(4)]
8521#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8522pub fn _mm_mask_max_round_sh<const SAE: i32>(
8523    src: __m128h,
8524    k: __mmask8,
8525    a: __m128h,
8526    b: __m128h,
8527) -> __m128h {
8528    unsafe {
8529        static_assert_sae!(SAE);
8530        vmaxsh(a, b, src, k, SAE)
8531    }
8532}
8533
8534/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8535/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
8536/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
8537/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
8538/// (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8539///
8540/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_sh)
8541#[inline]
8542#[target_feature(enable = "avx512fp16,avx512vl")]
8543#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
8544#[rustc_legacy_const_generics(3)]
8545#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8546pub fn _mm_maskz_max_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8547    static_assert_sae!(SAE);
8548    _mm_mask_max_round_sh::<SAE>(_mm_setzero_ph(), k, a, b)
8549}
8550
8551/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8552/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
8553/// when inputs are NaN or signed-zero values.
8554///
8555/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ph)
8556#[inline]
8557#[target_feature(enable = "avx512fp16,avx512vl")]
8558#[cfg_attr(test, assert_instr(vminph))]
8559#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8560pub fn _mm_min_ph(a: __m128h, b: __m128h) -> __m128h {
8561    unsafe { vminph_128(a, b) }
8562}
8563
8564/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8565/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8566/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8567/// NaN or signed-zero values.
8568///
8569/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ph)
8570#[inline]
8571#[target_feature(enable = "avx512fp16,avx512vl")]
8572#[cfg_attr(test, assert_instr(vminph))]
8573#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8574pub fn _mm_mask_min_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8575    unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), src) }
8576}
8577
8578/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8579/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8580/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8581/// NaN or signed-zero values.
8582///
8583/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ph)
8584#[inline]
8585#[target_feature(enable = "avx512fp16,avx512vl")]
8586#[cfg_attr(test, assert_instr(vminph))]
8587#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8588pub fn _mm_maskz_min_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8589    unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), _mm_setzero_ph()) }
8590}
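
// Usage sketch (editor's addition; mask and values are arbitrary):
//
//     let a = _mm_set1_ph(3.0);
//     let b = _mm_set1_ph(-1.0);
//     // Lanes 0..3 hold min(3.0, -1.0) = -1.0; lanes 4..7 are zeroed by the mask.
//     let z = _mm_maskz_min_ph(0b0000_1111, a, b);
//     // The merging form keeps `src` (here `a`) in the unselected lanes instead:
//     let m = _mm_mask_min_ph(a, 0b0000_1111, a, b);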
8591
8592/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8593/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
8594/// when inputs are NaN or signed-zero values.
8595///
8596/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_ph)
8597#[inline]
8598#[target_feature(enable = "avx512fp16,avx512vl")]
8599#[cfg_attr(test, assert_instr(vminph))]
8600#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8601pub fn _mm256_min_ph(a: __m256h, b: __m256h) -> __m256h {
8602    unsafe { vminph_256(a, b) }
8603}
8604
8605/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8606/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8607/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8608/// NaN or signed-zero values.
8609///
8610/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_ph)
8611#[inline]
8612#[target_feature(enable = "avx512fp16,avx512vl")]
8613#[cfg_attr(test, assert_instr(vminph))]
8614#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8615pub fn _mm256_mask_min_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8616    unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), src) }
8617}
8618
8619/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8620/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8621/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8622/// NaN or signed-zero values.
8623///
8624/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_ph)
8625#[inline]
8626#[target_feature(enable = "avx512fp16,avx512vl")]
8627#[cfg_attr(test, assert_instr(vminph))]
8628#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8629pub fn _mm256_maskz_min_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8630    unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), _mm256_setzero_ph()) }
8631}
8632
8633/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8634/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
8635/// when inputs are NaN or signed-zero values.
8636///
8637/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_ph)
8638#[inline]
8639#[target_feature(enable = "avx512fp16")]
8640#[cfg_attr(test, assert_instr(vminph))]
8641#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8642pub fn _mm512_min_ph(a: __m512h, b: __m512h) -> __m512h {
8643    _mm512_min_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b)
8644}
8645
8646/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8647/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8648/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8649/// NaN or signed-zero values.
8650///
8651/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_ph)
8652#[inline]
8653#[target_feature(enable = "avx512fp16")]
8654#[cfg_attr(test, assert_instr(vminph))]
8655#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8656pub fn _mm512_mask_min_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8657    unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), src) }
8658}
8659
8660/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8661/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8662/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8663/// NaN or signed-zero values.
8664///
8665/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_ph)
8666#[inline]
8667#[target_feature(enable = "avx512fp16")]
8668#[cfg_attr(test, assert_instr(vminph))]
8669#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8670pub fn _mm512_maskz_min_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8671    unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), _mm512_setzero_ph()) }
8672}
8673
8674/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8675/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not
8676/// follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8677///
8678/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_round_ph)
8679#[inline]
8680#[target_feature(enable = "avx512fp16")]
8681#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
8682#[rustc_legacy_const_generics(2)]
8683#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8684pub fn _mm512_min_round_ph<const SAE: i32>(a: __m512h, b: __m512h) -> __m512h {
8685    unsafe {
8686        static_assert_sae!(SAE);
8687        vminph_512(a, b, SAE)
8688    }
8689}
8690
8691/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8692/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8693/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8694/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8695///
8696/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_round_ph)
8697#[inline]
8698#[target_feature(enable = "avx512fp16")]
8699#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
8700#[rustc_legacy_const_generics(4)]
8701#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8702pub fn _mm512_mask_min_round_ph<const SAE: i32>(
8703    src: __m512h,
8704    k: __mmask32,
8705    a: __m512h,
8706    b: __m512h,
8707) -> __m512h {
8708    unsafe {
8709        static_assert_sae!(SAE);
8710        simd_select_bitmask(k, _mm512_min_round_ph::<SAE>(a, b), src)
8711    }
8712}
8713
8714/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8715/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8716/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8717/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8718///
8719/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_round_ph)
8720#[inline]
8721#[target_feature(enable = "avx512fp16")]
8722#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
8723#[rustc_legacy_const_generics(3)]
8724#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8725pub fn _mm512_maskz_min_round_ph<const SAE: i32>(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8726    unsafe {
8727        static_assert_sae!(SAE);
8728        simd_select_bitmask(k, _mm512_min_round_ph::<SAE>(a, b), _mm512_setzero_ph())
8729    }
8730}
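
// Usage sketch (editor's addition): the same SAE pattern as the max_round family above,
// with illustrative values.
//
//     let a = _mm512_set1_ph(4.0);
//     let b = _mm512_set1_ph(2.5);
//     let all = _mm512_min_round_ph::<_MM_FROUND_NO_EXC>(a, b);              // every lane 2.5
//     let one = _mm512_mask_min_round_ph::<_MM_FROUND_NO_EXC>(a, 0x1, a, b); // lane 0 only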
8731
8732/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum
8733/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
8734/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
8735/// inputs are NaN or signed-zero values.
8736///
8737/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sh)
8738#[inline]
8739#[target_feature(enable = "avx512fp16,avx512vl")]
8740#[cfg_attr(test, assert_instr(vminsh))]
8741#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8742pub fn _mm_min_sh(a: __m128h, b: __m128h) -> __m128h {
8743    _mm_mask_min_sh(_mm_undefined_ph(), 0xff, a, b)
8744}
8745
8746/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum
8747/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0
8748/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. Does not follow
8749/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8750///
8751/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_sh)
8752#[inline]
8753#[target_feature(enable = "avx512fp16,avx512vl")]
8754#[cfg_attr(test, assert_instr(vminsh))]
8755#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8756pub fn _mm_mask_min_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8757    _mm_mask_min_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
8758}
8759
8760/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
8761/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
8762/// copy the upper 7 packed elements from a to the upper elements of dst. Does not follow the IEEE Standard
8763/// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8764///
8765/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_sh)
8766#[inline]
8767#[target_feature(enable = "avx512fp16,avx512vl")]
8768#[cfg_attr(test, assert_instr(vminsh))]
8769#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8770pub fn _mm_maskz_min_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8771    _mm_mask_min_sh(_mm_setzero_ph(), k, a, b)
8772}
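
// Usage sketch (editor's addition; values are arbitrary): the writemask form merges from
// `src` when mask bit 0 is clear, and the upper lanes always come from `a`.
//
//     let src = _mm_set_sh(9.0);
//     let a = _mm_set_sh(1.0);
//     let b = _mm_set_sh(-2.0);
//     let taken = _mm_mask_min_sh(src, 0x1, a, b); // lane 0 = min(1.0, -2.0) = -2.0
//     let kept = _mm_mask_min_sh(src, 0x0, a, b);  // lane 0 = 9.0, copied from src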
8773
8774/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
8775/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
8776/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8777/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8778///
8779/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sh)
8780#[inline]
8781#[target_feature(enable = "avx512fp16,avx512vl")]
8782#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
8783#[rustc_legacy_const_generics(2)]
8784#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8785pub fn _mm_min_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
8786    static_assert_sae!(SAE);
8787    _mm_mask_min_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
8788}
8789
8790/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
8791/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
8792/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
8793/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
8794/// (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8795///
8796/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_sh)
8797#[inline]
8798#[target_feature(enable = "avx512fp16,avx512vl")]
8799#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
8800#[rustc_legacy_const_generics(4)]
8801#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8802pub fn _mm_mask_min_round_sh<const SAE: i32>(
8803    src: __m128h,
8804    k: __mmask8,
8805    a: __m128h,
8806    b: __m128h,
8807) -> __m128h {
8808    unsafe {
8809        static_assert_sae!(SAE);
8810        vminsh(a, b, src, k, SAE)
8811    }
8812}
8813
8814/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
8815/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
8816/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
8817/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
8818/// (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8819///
8820/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_sh)
8821#[inline]
8822#[target_feature(enable = "avx512fp16,avx512vl")]
8823#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
8824#[rustc_legacy_const_generics(3)]
8825#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8826pub fn _mm_maskz_min_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8827    static_assert_sae!(SAE);
8828    _mm_mask_min_round_sh::<SAE>(_mm_setzero_ph(), k, a, b)
8829}
8830
8831/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8832/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
8833/// This intrinsic essentially calculates `floor(log2(x))` for each element.
8834///
8835/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_ph)
8836#[inline]
8837#[target_feature(enable = "avx512fp16,avx512vl")]
8838#[cfg_attr(test, assert_instr(vgetexpph))]
8839#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8840pub fn _mm_getexp_ph(a: __m128h) -> __m128h {
8841    _mm_mask_getexp_ph(_mm_undefined_ph(), 0xff, a)
8842}
8843
8844/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8845/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
8846/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
8847/// `floor(log2(x))` for each element.
8848///
8849/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_ph)
8850#[inline]
8851#[target_feature(enable = "avx512fp16,avx512vl")]
8852#[cfg_attr(test, assert_instr(vgetexpph))]
8853#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8854pub fn _mm_mask_getexp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
8855    unsafe { vgetexpph_128(a, src, k) }
8856}
8857
8858/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8859/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
8860/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
8861/// `floor(log2(x))` for each element.
8862///
8863/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_ph)
8864#[inline]
8865#[target_feature(enable = "avx512fp16,avx512vl")]
8866#[cfg_attr(test, assert_instr(vgetexpph))]
8867#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8868pub fn _mm_maskz_getexp_ph(k: __mmask8, a: __m128h) -> __m128h {
8869    _mm_mask_getexp_ph(_mm_setzero_ph(), k, a)
8870}
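
// Usage sketch (editor's addition; values are arbitrary): getexp extracts the unbiased
// exponent of each lane as a half-precision value, i.e. floor(log2(|x|)).
//
//     let a = _mm_set_ph(16.0, 16.0, 16.0, 16.0, 0.5, 3.0, 8.0, 1.0);
//     // Lane 0: 0.0, lane 1: 3.0, lane 2: 1.0, lane 3: -1.0, lanes 4..7: 4.0.
//     let e = _mm_getexp_ph(a);
//     // Compute only the low four lanes and zero the rest:
//     let z = _mm_maskz_getexp_ph(0b0000_1111, a);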
8871
8872/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8873/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
8874/// This intrinsic essentially calculates `floor(log2(x))` for each element.
8875///
8876/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getexp_ph)
8877#[inline]
8878#[target_feature(enable = "avx512fp16,avx512vl")]
8879#[cfg_attr(test, assert_instr(vgetexpph))]
8880#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8881pub fn _mm256_getexp_ph(a: __m256h) -> __m256h {
8882    _mm256_mask_getexp_ph(_mm256_undefined_ph(), 0xffff, a)
8883}
8884
8885/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8886/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
8887/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
8888/// `floor(log2(x))` for each element.
8889///
8890/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getexp_ph)
8891#[inline]
8892#[target_feature(enable = "avx512fp16,avx512vl")]
8893#[cfg_attr(test, assert_instr(vgetexpph))]
8894#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8895pub fn _mm256_mask_getexp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
8896    unsafe { vgetexpph_256(a, src, k) }
8897}
8898
8899/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8900/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
8901/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
8902/// `floor(log2(x))` for each element.
8903///
8904/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getexp_ph)
8905#[inline]
8906#[target_feature(enable = "avx512fp16,avx512vl")]
8907#[cfg_attr(test, assert_instr(vgetexpph))]
8908#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8909pub fn _mm256_maskz_getexp_ph(k: __mmask16, a: __m256h) -> __m256h {
8910    _mm256_mask_getexp_ph(_mm256_setzero_ph(), k, a)
8911}
8912
8913/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8914/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
8915/// This intrinsic essentially calculates `floor(log2(x))` for each element.
8916///
8917/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_ph)
8918#[inline]
8919#[target_feature(enable = "avx512fp16")]
8920#[cfg_attr(test, assert_instr(vgetexpph))]
8921#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8922pub fn _mm512_getexp_ph(a: __m512h) -> __m512h {
8923    _mm512_mask_getexp_ph(_mm512_undefined_ph(), 0xffffffff, a)
8924}
8925
8926/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8927/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
8928/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
8929/// `floor(log2(x))` for each element.
8930///
8931/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_ph)
8932#[inline]
8933#[target_feature(enable = "avx512fp16")]
8934#[cfg_attr(test, assert_instr(vgetexpph))]
8935#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8936pub fn _mm512_mask_getexp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
8937    _mm512_mask_getexp_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
8938}
8939
8940/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8941/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
8942/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
8943/// `floor(log2(x))` for each element.
8944///
8945/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_ph)
8946#[inline]
8947#[target_feature(enable = "avx512fp16")]
8948#[cfg_attr(test, assert_instr(vgetexpph))]
8949#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8950pub fn _mm512_maskz_getexp_ph(k: __mmask32, a: __m512h) -> __m512h {
8951    _mm512_mask_getexp_ph(_mm512_setzero_ph(), k, a)
8952}
8953
8954/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8955/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
8956/// This intrinsic essentially calculates `floor(log2(x))` for each element. Exceptions can be suppressed
8957/// by passing _MM_FROUND_NO_EXC in the sae parameter.
8958///
8959/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_round_ph)
8960#[inline]
8961#[target_feature(enable = "avx512fp16")]
8962#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
8963#[rustc_legacy_const_generics(1)]
8964#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8965pub fn _mm512_getexp_round_ph<const SAE: i32>(a: __m512h) -> __m512h {
8966    static_assert_sae!(SAE);
8967    _mm512_mask_getexp_round_ph::<SAE>(_mm512_undefined_ph(), 0xffffffff, a)
8968}
8969
8970/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8971/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
8972/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
8973/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8974///
8975/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_round_ph)
8976#[inline]
8977#[target_feature(enable = "avx512fp16")]
8978#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
8979#[rustc_legacy_const_generics(3)]
8980#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8981pub fn _mm512_mask_getexp_round_ph<const SAE: i32>(
8982    src: __m512h,
8983    k: __mmask32,
8984    a: __m512h,
8985) -> __m512h {
8986    unsafe {
8987        static_assert_sae!(SAE);
8988        vgetexpph_512(a, src, k, SAE)
8989    }
8990}
8991
8992/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8993/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
8994/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
8995/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8996///
8997/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_round_ph)
8998#[inline]
8999#[target_feature(enable = "avx512fp16")]
9000#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
9001#[rustc_legacy_const_generics(2)]
9002#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9003pub fn _mm512_maskz_getexp_round_ph<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512h {
9004    static_assert_sae!(SAE);
9005    _mm512_mask_getexp_round_ph::<SAE>(_mm512_setzero_ph(), k, a)
9006}
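
// Usage sketch (editor's addition): the 512-bit getexp also has SAE-aware forms; the values
// and mask are illustrative.
//
//     let a = _mm512_set1_ph(6.0);
//     let e = _mm512_getexp_round_ph::<_MM_FROUND_NO_EXC>(a);               // every lane 2.0
//     let m = _mm512_maskz_getexp_round_ph::<_MM_FROUND_NO_EXC>(0xffff, a); // low half only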
9007
9008/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9009/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9010/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially
9011/// calculates `floor(log2(x))` for the lower element.
9012///
9013/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_sh)
9014#[inline]
9015#[target_feature(enable = "avx512fp16")]
9016#[cfg_attr(test, assert_instr(vgetexpsh))]
9017#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9018pub fn _mm_getexp_sh(a: __m128h, b: __m128h) -> __m128h {
9019    _mm_mask_getexp_sh(_mm_undefined_ph(), 0xff, a, b)
9020}
9021
9022/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9023/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9024/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7
9025/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))`
9026/// for the lower element.
9027///
9028/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_sh)
9029#[inline]
9030#[target_feature(enable = "avx512fp16")]
9031#[cfg_attr(test, assert_instr(vgetexpsh))]
9032#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9033pub fn _mm_mask_getexp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
9034    _mm_mask_getexp_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
9035}
9036
9037/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9038/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9039/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
9040/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the
9041/// lower element.
9042///
9043/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_sh)
9044#[inline]
9045#[target_feature(enable = "avx512fp16")]
9046#[cfg_attr(test, assert_instr(vgetexpsh))]
9047#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9048pub fn _mm_maskz_getexp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
9049    _mm_mask_getexp_sh(_mm_setzero_ph(), k, a, b)
9050}
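
// Usage sketch (editor's addition; values are arbitrary): the scalar form reads the
// exponent of lane 0 of `b` and passes the upper lanes of `a` through.
//
//     let a = _mm_set1_ph(1.5);
//     let b = _mm_set_sh(32.0);
//     let r = _mm_getexp_sh(a, b); // lane 0 = 5.0, lanes 1..7 = 1.5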
9051
9052/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9053/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9054/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially
9055/// calculates `floor(log2(x))` for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
9056/// in the sae parameter.
9057///
9058/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_sh)
9059#[inline]
9060#[target_feature(enable = "avx512fp16")]
9061#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
9062#[rustc_legacy_const_generics(2)]
9063#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9064pub fn _mm_getexp_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
9065    static_assert_sae!(SAE);
9066    _mm_mask_getexp_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
9067}
9068
9069/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9070/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9071/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7
9072/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))`
9073/// for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9074///
9075/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_sh)
9076#[inline]
9077#[target_feature(enable = "avx512fp16")]
9078#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
9079#[rustc_legacy_const_generics(4)]
9080#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9081pub fn _mm_mask_getexp_round_sh<const SAE: i32>(
9082    src: __m128h,
9083    k: __mmask8,
9084    a: __m128h,
9085    b: __m128h,
9086) -> __m128h {
9087    unsafe {
9088        static_assert_sae!(SAE);
9089        vgetexpsh(a, b, src, k, SAE)
9090    }
9091}
9092
9093/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9094/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9095/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
9096/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the
9097/// lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9098///
9099/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_sh)
9100#[inline]
9101#[target_feature(enable = "avx512fp16")]
9102#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
9103#[rustc_legacy_const_generics(3)]
9104#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9105pub fn _mm_maskz_getexp_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
9106    static_assert_sae!(SAE);
9107    _mm_mask_getexp_round_sh::<SAE>(_mm_setzero_ph(), k, a, b)
9108}
9109
9110/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9111/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9112/// on the interval range defined by norm and the sign depends on sign and the source sign.
9113///
9114/// The mantissa is normalized to the interval specified by norm, which can take the following values:
9115///
9116///     _MM_MANT_NORM_1_2     // interval [1, 2)
9117///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9118///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9119///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9120///
9121/// The sign is determined by sign, which can take the following values:
9122///
9123///     _MM_MANT_SIGN_src     // sign = sign(src)
9124///     _MM_MANT_SIGN_zero    // sign = 0
9125///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9126///
9127/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ph)
9128#[inline]
9129#[target_feature(enable = "avx512fp16,avx512vl")]
9130#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9131#[rustc_legacy_const_generics(1, 2)]
9132#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9133pub fn _mm_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9134    a: __m128h,
9135) -> __m128h {
9136    static_assert_uimm_bits!(NORM, 4);
9137    static_assert_uimm_bits!(SIGN, 2);
9138    _mm_mask_getmant_ph::<NORM, SIGN>(_mm_undefined_ph(), 0xff, a)
9139}
9140
9141/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9142/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9143/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9144/// by norm and the sign depends on sign and the source sign.
9145///
9146/// The mantissa is normalized to the interval specified by norm, which can take the following values:
9147///
9148///     _MM_MANT_NORM_1_2     // interval [1, 2)
9149///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9150///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9151///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9152///
9153/// The sign is determined by sign, which can take the following values:
9154///
9155///     _MM_MANT_SIGN_src     // sign = sign(src)
9156///     _MM_MANT_SIGN_zero    // sign = 0
9157///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9158///
9159/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ph)
9160#[inline]
9161#[target_feature(enable = "avx512fp16,avx512vl")]
9162#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9163#[rustc_legacy_const_generics(3, 4)]
9164#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9165pub fn _mm_mask_getmant_ph<
9166    const NORM: _MM_MANTISSA_NORM_ENUM,
9167    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9168>(
9169    src: __m128h,
9170    k: __mmask8,
9171    a: __m128h,
9172) -> __m128h {
9173    unsafe {
9174        static_assert_uimm_bits!(NORM, 4);
9175        static_assert_uimm_bits!(SIGN, 2);
9176        vgetmantph_128(a, (SIGN << 2) | NORM, src, k)
9177    }
9178}
9179
9180/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9181/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9182/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9183/// by norm and the sign depends on sign and the source sign.
9184///
9185/// The mantissa is normalized to the interval specified by norm, which can take the following values:
9186///
9187///     _MM_MANT_NORM_1_2     // interval [1, 2)
9188///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9189///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9190///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9191///
9192/// The sign is determined by sign, which can take the following values:
9193///
9194///     _MM_MANT_SIGN_src     // sign = sign(src)
9195///     _MM_MANT_SIGN_zero    // sign = 0
9196///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9197///
9198/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ph)
9199#[inline]
9200#[target_feature(enable = "avx512fp16,avx512vl")]
9201#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9202#[rustc_legacy_const_generics(2, 3)]
9203#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9204pub fn _mm_maskz_getmant_ph<
9205    const NORM: _MM_MANTISSA_NORM_ENUM,
9206    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9207>(
9208    k: __mmask8,
9209    a: __m128h,
9210) -> __m128h {
9211    static_assert_uimm_bits!(NORM, 4);
9212    static_assert_uimm_bits!(SIGN, 2);
9213    _mm_mask_getmant_ph::<NORM, SIGN>(_mm_setzero_ph(), k, a)
9214}
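
// Usage sketch (editor's addition; constants and values are illustrative): getmant
// rescales each lane into the chosen interval, discarding the original exponent.
//
//     let a = _mm_set1_ph(12.0);
//     // 12.0 = 1.5 * 2^3, so normalizing to [1, 2) with the source sign yields 1.5.
//     let m = _mm_getmant_ph::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_src>(a);
//     // Zeroing form, keeping only lane 0:
//     let z = _mm_maskz_getmant_ph::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_src>(0x1, a);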
9215
9216/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9217/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9218/// on the interval range defined by norm and the sign depends on sign and the source sign.
9219///
9220/// The mantissa is normalized to the interval specified by norm, which can take the following values:
9221///
9222///     _MM_MANT_NORM_1_2     // interval [1, 2)
9223///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9224///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9225///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9226///
9227/// The sign is determined by sign, which can take the following values:
9228///
9229///     _MM_MANT_SIGN_src     // sign = sign(src)
9230///     _MM_MANT_SIGN_zero    // sign = 0
9231///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9232///
9233/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getmant_ph)
9234#[inline]
9235#[target_feature(enable = "avx512fp16,avx512vl")]
9236#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9237#[rustc_legacy_const_generics(1, 2)]
9238#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9239pub fn _mm256_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9240    a: __m256h,
9241) -> __m256h {
9242    static_assert_uimm_bits!(NORM, 4);
9243    static_assert_uimm_bits!(SIGN, 2);
9244    _mm256_mask_getmant_ph::<NORM, SIGN>(_mm256_undefined_ph(), 0xffff, a)
9245}
9246
9247/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9248/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9249/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9250/// by norm and the sign depends on sign and the source sign.
9251///
9252/// The mantissa is normalized to the interval specified by norm, which can take the following values:
9253///
9254///     _MM_MANT_NORM_1_2     // interval [1, 2)
9255///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9256///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9257///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9258///
9259/// The sign is determined by sign, which can take the following values:
9260///
9261///     _MM_MANT_SIGN_src     // sign = sign(src)
9262///     _MM_MANT_SIGN_zero    // sign = 0
9263///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9264///
9265/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getmant_ph)
9266#[inline]
9267#[target_feature(enable = "avx512fp16,avx512vl")]
9268#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9269#[rustc_legacy_const_generics(3, 4)]
9270#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9271pub fn _mm256_mask_getmant_ph<
9272    const NORM: _MM_MANTISSA_NORM_ENUM,
9273    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9274>(
9275    src: __m256h,
9276    k: __mmask16,
9277    a: __m256h,
9278) -> __m256h {
9279    unsafe {
9280        static_assert_uimm_bits!(NORM, 4);
9281        static_assert_uimm_bits!(SIGN, 2);
9282        vgetmantph_256(a, (SIGN << 2) | NORM, src, k)
9283    }
9284}
9285
9286/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9287/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9288/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9289/// by norm and the sign depends on sign and the source sign.
9290///
9291/// The mantissa is normalized to the interval specified by norm, which can take the following values:
9292///
9293///     _MM_MANT_NORM_1_2     // interval [1, 2)
9294///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9295///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9296///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9297///
9298/// The sign is determined by sign, which can take the following values:
9299///
9300///     _MM_MANT_SIGN_src     // sign = sign(src)
9301///     _MM_MANT_SIGN_zero    // sign = 0
9302///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9303///
9304/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getmant_ph)
9305#[inline]
9306#[target_feature(enable = "avx512fp16,avx512vl")]
9307#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9308#[rustc_legacy_const_generics(2, 3)]
9309#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9310pub fn _mm256_maskz_getmant_ph<
9311    const NORM: _MM_MANTISSA_NORM_ENUM,
9312    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9313>(
9314    k: __mmask16,
9315    a: __m256h,
9316) -> __m256h {
9317    static_assert_uimm_bits!(NORM, 4);
9318    static_assert_uimm_bits!(SIGN, 2);
9319    _mm256_mask_getmant_ph::<NORM, SIGN>(_mm256_setzero_ph(), k, a)
9320}
9321
9322/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9323/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9324/// on the interval range defined by norm and the sign depends on sign and the source sign.
9325///
9326/// The mantissa is normalized to the interval specified by norm, which can take the following values:
9327///
9328///     _MM_MANT_NORM_1_2     // interval [1, 2)
9329///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9330///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9331///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9332///
9333/// The sign is determined by sign, which can take the following values:
9334///
9335///     _MM_MANT_SIGN_src     // sign = sign(src)
9336///     _MM_MANT_SIGN_zero    // sign = 0
9337///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9338///
9339/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_ph)
9340#[inline]
9341#[target_feature(enable = "avx512fp16")]
9342#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9343#[rustc_legacy_const_generics(1, 2)]
9344#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9345pub fn _mm512_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9346    a: __m512h,
9347) -> __m512h {
9348    static_assert_uimm_bits!(NORM, 4);
9349    static_assert_uimm_bits!(SIGN, 2);
9350    _mm512_mask_getmant_ph::<NORM, SIGN>(_mm512_undefined_ph(), 0xffffffff, a)
9351}
9352
9353/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9354/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9355/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9356/// by norm and the sign depends on sign and the source sign.
9357///
9358/// The mantissa is normalized to the interval specified by norm, which can take the following values:
9359///
9360///     _MM_MANT_NORM_1_2     // interval [1, 2)
9361///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9362///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9363///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9364///
9365/// The sign is determined by sign, which can take the following values:
9366///
9367///     _MM_MANT_SIGN_src     // sign = sign(src)
9368///     _MM_MANT_SIGN_zero    // sign = 0
9369///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9370///
9371/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_ph)
9372#[inline]
9373#[target_feature(enable = "avx512fp16")]
9374#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9375#[rustc_legacy_const_generics(3, 4)]
9376#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9377pub fn _mm512_mask_getmant_ph<
9378    const NORM: _MM_MANTISSA_NORM_ENUM,
9379    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9380>(
9381    src: __m512h,
9382    k: __mmask32,
9383    a: __m512h,
9384) -> __m512h {
9385    static_assert_uimm_bits!(NORM, 4);
9386    static_assert_uimm_bits!(SIGN, 2);
9387    _mm512_mask_getmant_round_ph::<NORM, SIGN, _MM_FROUND_CUR_DIRECTION>(src, k, a)
9388}
9389
9390/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9391/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9392/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9393/// by norm and the sign depends on sign and the source sign.
9394///
9395/// The mantissa is normalized to the interval specified by norm, which can take the following values:
9396///
9397///     _MM_MANT_NORM_1_2     // interval [1, 2)
9398///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9399///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9400///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9401///
9402/// The sign is determined by sign, which can take the following values:
9403///
9404///     _MM_MANT_SIGN_src     // sign = sign(src)
9405///     _MM_MANT_SIGN_zero    // sign = 0
9406///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9407///
9408/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_ph)
9409#[inline]
9410#[target_feature(enable = "avx512fp16")]
9411#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9412#[rustc_legacy_const_generics(2, 3)]
9413#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9414pub fn _mm512_maskz_getmant_ph<
9415    const NORM: _MM_MANTISSA_NORM_ENUM,
9416    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9417>(
9418    k: __mmask32,
9419    a: __m512h,
9420) -> __m512h {
9421    static_assert_uimm_bits!(NORM, 4);
9422    static_assert_uimm_bits!(SIGN, 2);
9423    _mm512_mask_getmant_ph::<NORM, SIGN>(_mm512_setzero_ph(), k, a)
9424}
9425
9426/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9427/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9428/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can
9429/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9430///
9431/// The mantissa is normalized to the interval specified by norm, which can take the following values:
9432///
9433///     _MM_MANT_NORM_1_2     // interval [1, 2)
9434///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9435///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9436///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9437///
9438/// The sign is determined by sign, which can take the following values:
9439///
9440///     _MM_MANT_SIGN_src     // sign = sign(src)
9441///     _MM_MANT_SIGN_zero    // sign = 0
9442///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9443///
9446/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_round_ph)
9447#[inline]
9448#[target_feature(enable = "avx512fp16")]
9449#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
9450#[rustc_legacy_const_generics(1, 2, 3)]
9451#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9452pub fn _mm512_getmant_round_ph<
9453    const NORM: _MM_MANTISSA_NORM_ENUM,
9454    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9455    const SAE: i32,
9456>(
9457    a: __m512h,
9458) -> __m512h {
9459    static_assert_uimm_bits!(NORM, 4);
9460    static_assert_uimm_bits!(SIGN, 2);
9461    static_assert_sae!(SAE);
9462    _mm512_mask_getmant_round_ph::<NORM, SIGN, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
9463}
9464
9465/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9466/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9467/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9468/// by norm and the sign depends on sign and the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
9469/// in the sae parameter.
9470///
9471/// The mantissa is normalized to the interval specified by norm, which can take the following values:
9472///
9473///     _MM_MANT_NORM_1_2     // interval [1, 2)
9474///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9475///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9476///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9477///
9478/// The sign is determined by sc, which can take the following values:
9479///
9480///     _MM_MANT_SIGN_src     // sign = sign(src)
9481///     _MM_MANT_SIGN_zero    // sign = 0
9482///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9483///
9486/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_round_ph)
9487#[inline]
9488#[target_feature(enable = "avx512fp16")]
9489#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
9490#[rustc_legacy_const_generics(3, 4, 5)]
9491#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9492pub fn _mm512_mask_getmant_round_ph<
9493    const NORM: _MM_MANTISSA_NORM_ENUM,
9494    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9495    const SAE: i32,
9496>(
9497    src: __m512h,
9498    k: __mmask32,
9499    a: __m512h,
9500) -> __m512h {
9501    unsafe {
9502        static_assert_uimm_bits!(NORM, 4);
9503        static_assert_uimm_bits!(SIGN, 2);
9504        static_assert_sae!(SAE);
9505        vgetmantph_512(a, (SIGN << 2) | NORM, src, k, SAE)
9506    }
9507}
9508
9509/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9510/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9511/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9512/// by norm and the sign depends on sign and the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
9513/// in the sae parameter
9514///
9515/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9516///
9517///     _MM_MANT_NORM_1_2     // interval [1, 2)
9518///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9519///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9520///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9521///
9522/// The sign is determined by sc, which can take the following values:
9523///
9524///     _MM_MANT_SIGN_src     // sign = sign(src)
9525///     _MM_MANT_SIGN_zero    // sign = 0
9526///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9527///
9530/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_ph)
9531#[inline]
9532#[target_feature(enable = "avx512fp16")]
9533#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
9534#[rustc_legacy_const_generics(2, 3, 4)]
9535#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9536pub fn _mm512_maskz_getmant_round_ph<
9537    const NORM: _MM_MANTISSA_NORM_ENUM,
9538    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9539    const SAE: i32,
9540>(
9541    k: __mmask32,
9542    a: __m512h,
9543) -> __m512h {
9544    static_assert_uimm_bits!(NORM, 4);
9545    static_assert_uimm_bits!(SIGN, 2);
9546    static_assert_sae!(SAE);
9547    _mm512_mask_getmant_round_ph::<NORM, SIGN, SAE>(_mm512_setzero_ph(), k, a)
9548}
9549
9550/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9551/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
9552/// elements of dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9553/// on the interval range defined by norm and the sign depends on sign and the source sign.
9554///
9555/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9556///
9557///     _MM_MANT_NORM_1_2     // interval [1, 2)
9558///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9559///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9560///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9561///
9562/// The sign is determined by sc, which can take the following values:
9563///
9564///     _MM_MANT_SIGN_src     // sign = sign(src)
9565///     _MM_MANT_SIGN_zero    // sign = 0
9566///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9567///
9568/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_sh)
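///
/// A minimal usage sketch, not taken from Intel's documentation (`getmant_sh_example` is a
/// made-up helper); it needs a nightly toolchain with the `f16` and `stdarch_x86_avx512_f16`
/// features and an AVX512-FP16 CPU, so it is marked `ignore`:
///
/// ```ignore
/// #![feature(f16, stdarch_x86_avx512_f16)]
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16")]
/// fn getmant_sh_example() -> __m128h {
///     let a = _mm_set1_ph(1.0);
///     let b = _mm_set_sh(12.0);
///     // Lower lane: 12.0 normalized into [1, 2) with the source sign is 1.5
///     // (NORM = 0, SIGN = 0); the upper 7 lanes are copied from `a`.
///     _mm_getmant_sh::<0, 0>(a, b)
/// }
/// ```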
9569#[inline]
9570#[target_feature(enable = "avx512fp16")]
9571#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
9572#[rustc_legacy_const_generics(2, 3)]
9573#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9574pub fn _mm_getmant_sh<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9575    a: __m128h,
9576    b: __m128h,
9577) -> __m128h {
9578    static_assert_uimm_bits!(NORM, 4);
9579    static_assert_uimm_bits!(SIGN, 2);
9580    _mm_mask_getmant_sh::<NORM, SIGN>(_mm_undefined_ph(), 0xff, a, b)
9581}
9582
9583/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9584/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
9585/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9586/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9587/// the source sign.
9588///
9589/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9590///
9591///     _MM_MANT_NORM_1_2     // interval [1, 2)
9592///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9593///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9594///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9595///
9596/// The sign is determined by sc, which can take the following values:
9597///
9598///     _MM_MANT_SIGN_src     // sign = sign(src)
9599///     _MM_MANT_SIGN_zero    // sign = 0
9600///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9601///
9602/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_sh)
9603#[inline]
9604#[target_feature(enable = "avx512fp16")]
9605#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
9606#[rustc_legacy_const_generics(4, 5)]
9607#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9608pub fn _mm_mask_getmant_sh<
9609    const NORM: _MM_MANTISSA_NORM_ENUM,
9610    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9611>(
9612    src: __m128h,
9613    k: __mmask8,
9614    a: __m128h,
9615    b: __m128h,
9616) -> __m128h {
9617    static_assert_uimm_bits!(NORM, 4);
9618    static_assert_uimm_bits!(SIGN, 2);
9619    _mm_mask_getmant_round_sh::<NORM, SIGN, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
9620}
9621
9622/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9623/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
9624/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9625/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9626/// the source sign.
9627///
9628/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9629///
9630///     _MM_MANT_NORM_1_2     // interval [1, 2)
9631///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9632///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9633///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9634///
9635/// The sign is determined by sc, which can take the following values:
9636///
9637///     _MM_MANT_SIGN_src     // sign = sign(src)
9638///     _MM_MANT_SIGN_zero    // sign = 0
9639///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9640///
9641/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_sh)
9642#[inline]
9643#[target_feature(enable = "avx512fp16")]
9644#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
9645#[rustc_legacy_const_generics(3, 4)]
9646#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9647pub fn _mm_maskz_getmant_sh<
9648    const NORM: _MM_MANTISSA_NORM_ENUM,
9649    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9650>(
9651    k: __mmask8,
9652    a: __m128h,
9653    b: __m128h,
9654) -> __m128h {
9655    static_assert_uimm_bits!(NORM, 4);
9656    static_assert_uimm_bits!(SIGN, 2);
9657    _mm_mask_getmant_sh::<NORM, SIGN>(_mm_setzero_ph(), k, a, b)
9658}
9659
9660/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9661/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
9662/// elements of dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9663/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can
9664/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9665///
9666/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9667///
9668///     _MM_MANT_NORM_1_2     // interval [1, 2)
9669///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9670///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9671///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9672///
9673/// The sign is determined by sc, which can take the following values:
9674///
9675///     _MM_MANT_SIGN_src     // sign = sign(src)
9676///     _MM_MANT_SIGN_zero    // sign = 0
9677///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9678///
9681/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_sh)
9682#[inline]
9683#[target_feature(enable = "avx512fp16")]
9684#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
9685#[rustc_legacy_const_generics(2, 3, 4)]
9686#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9687pub fn _mm_getmant_round_sh<
9688    const NORM: _MM_MANTISSA_NORM_ENUM,
9689    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9690    const SAE: i32,
9691>(
9692    a: __m128h,
9693    b: __m128h,
9694) -> __m128h {
9695    static_assert_uimm_bits!(NORM, 4);
9696    static_assert_uimm_bits!(SIGN, 2);
9697    static_assert_sae!(SAE);
9698    _mm_mask_getmant_round_sh::<NORM, SIGN, SAE>(_mm_undefined_ph(), 0xff, a, b)
9699}
9700
9701/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9702/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
9703/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9704/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9705/// the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9706///
9707/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9708///
9709///     _MM_MANT_NORM_1_2     // interval [1, 2)
9710///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9711///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9712///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9713///
9714/// The sign is determined by sc, which can take the following values:
9715///
9716///     _MM_MANT_SIGN_src     // sign = sign(src)
9717///     _MM_MANT_SIGN_zero    // sign = 0
9718///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9719///
9722/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_sh)
9723#[inline]
9724#[target_feature(enable = "avx512fp16")]
9725#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
9726#[rustc_legacy_const_generics(4, 5, 6)]
9727#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9728pub fn _mm_mask_getmant_round_sh<
9729    const NORM: _MM_MANTISSA_NORM_ENUM,
9730    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9731    const SAE: i32,
9732>(
9733    src: __m128h,
9734    k: __mmask8,
9735    a: __m128h,
9736    b: __m128h,
9737) -> __m128h {
9738    unsafe {
9739        static_assert_uimm_bits!(NORM, 4);
9740        static_assert_uimm_bits!(SIGN, 2);
9741        static_assert_sae!(SAE);
9742        vgetmantsh(a, b, (SIGN << 2) | NORM, src, k, SAE)
9743    }
9744}
9745
9746/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9747/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
9748/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9749/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9750/// the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9751///
9752/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9753///
9754///     _MM_MANT_NORM_1_2     // interval [1, 2)
9755///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9756///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9757///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9758///
9759/// The sign is determined by sc, which can take the following values:
9760///
9761///     _MM_MANT_SIGN_src     // sign = sign(src)
9762///     _MM_MANT_SIGN_zero    // sign = 0
9763///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9764///
9767/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_sh)
9768#[inline]
9769#[target_feature(enable = "avx512fp16")]
9770#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
9771#[rustc_legacy_const_generics(3, 4, 5)]
9772#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9773pub fn _mm_maskz_getmant_round_sh<
9774    const NORM: _MM_MANTISSA_NORM_ENUM,
9775    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9776    const SAE: i32,
9777>(
9778    k: __mmask8,
9779    a: __m128h,
9780    b: __m128h,
9781) -> __m128h {
9782    static_assert_uimm_bits!(NORM, 4);
9783    static_assert_uimm_bits!(SIGN, 2);
9784    static_assert_sae!(SAE);
9785    _mm_mask_getmant_round_sh::<NORM, SIGN, SAE>(_mm_setzero_ph(), k, a, b)
9786}
9787
9788/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9789/// specified by imm8, and store the results in dst.
9790///
9791/// Rounding is done according to the imm8 parameter, which can be one of:
9792///
9793/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9794/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9795/// * [`_MM_FROUND_TO_POS_INF`] : round up
9796/// * [`_MM_FROUND_TO_ZERO`] : truncate
9797/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9798///
9799/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ph)
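///
/// A minimal usage sketch, not taken from Intel's documentation (`roundscale_example` is a
/// made-up helper). Per the VRNDSCALEPH encoding, the upper four bits of IMM8 give the number
/// of fraction bits to keep and the low bits give the rounding mode listed above. The sketch
/// needs a nightly toolchain with the `f16` and `stdarch_x86_avx512_f16` features and an
/// AVX512-FP16 CPU, so it is marked `ignore`:
///
/// ```ignore
/// #![feature(f16, stdarch_x86_avx512_f16)]
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn roundscale_example() -> __m128h {
///     // Keep 0 fraction bits and round to nearest: every lane holding 1.25 becomes 1.0.
///     _mm_roundscale_ph::<_MM_FROUND_TO_NEAREST_INT>(_mm_set1_ph(1.25))
/// }
/// ```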
9800#[inline]
9801#[target_feature(enable = "avx512fp16,avx512vl")]
9802#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9803#[rustc_legacy_const_generics(1)]
9804#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9805pub fn _mm_roundscale_ph<const IMM8: i32>(a: __m128h) -> __m128h {
9806    static_assert_uimm_bits!(IMM8, 8);
9807    _mm_mask_roundscale_ph::<IMM8>(_mm_undefined_ph(), 0xff, a)
9808}
9809
9810/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9811/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
9812/// the corresponding mask bit is not set).
9813///
9814/// Rounding is done according to the imm8 parameter, which can be one of:
9815///
9816/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9817/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9818/// * [`_MM_FROUND_TO_POS_INF`] : round up
9819/// * [`_MM_FROUND_TO_ZERO`] : truncate
9820/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9821///
9822/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ph)
9823#[inline]
9824#[target_feature(enable = "avx512fp16,avx512vl")]
9825#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9826#[rustc_legacy_const_generics(3)]
9827#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9828pub fn _mm_mask_roundscale_ph<const IMM8: i32>(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
9829    unsafe {
9830        static_assert_uimm_bits!(IMM8, 8);
9831        vrndscaleph_128(a, IMM8, src, k)
9832    }
9833}
9834
9835/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9836/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
9837/// mask bit is not set).
9838///
9839/// Rounding is done according to the imm8 parameter, which can be one of:
9840///
9841/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9842/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9843/// * [`_MM_FROUND_TO_POS_INF`] : round up
9844/// * [`_MM_FROUND_TO_ZERO`] : truncate
9845/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9846///
9847/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ph)
9848#[inline]
9849#[target_feature(enable = "avx512fp16,avx512vl")]
9850#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9851#[rustc_legacy_const_generics(2)]
9852#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9853pub fn _mm_maskz_roundscale_ph<const IMM8: i32>(k: __mmask8, a: __m128h) -> __m128h {
9854    static_assert_uimm_bits!(IMM8, 8);
9855    _mm_mask_roundscale_ph::<IMM8>(_mm_setzero_ph(), k, a)
9856}
9857
9858/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9859/// specified by imm8, and store the results in dst.
9860///
9861/// Rounding is done according to the imm8 parameter, which can be one of:
9862///
9863/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9864/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9865/// * [`_MM_FROUND_TO_POS_INF`] : round up
9866/// * [`_MM_FROUND_TO_ZERO`] : truncate
9867/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9868///
9869/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_roundscale_ph)
9870#[inline]
9871#[target_feature(enable = "avx512fp16,avx512vl")]
9872#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9873#[rustc_legacy_const_generics(1)]
9874#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9875pub fn _mm256_roundscale_ph<const IMM8: i32>(a: __m256h) -> __m256h {
9876    static_assert_uimm_bits!(IMM8, 8);
9877    _mm256_mask_roundscale_ph::<IMM8>(_mm256_undefined_ph(), 0xffff, a)
9878}
9879
9880/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9881/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
9882/// the corresponding mask bit is not set).
9883///
9884/// Rounding is done according to the imm8 parameter, which can be one of:
9885///
9886/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9887/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9888/// * [`_MM_FROUND_TO_POS_INF`] : round up
9889/// * [`_MM_FROUND_TO_ZERO`] : truncate
9890/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9891///
9892/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_roundscale_ph)
9893#[inline]
9894#[target_feature(enable = "avx512fp16,avx512vl")]
9895#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9896#[rustc_legacy_const_generics(3)]
9897#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9898pub fn _mm256_mask_roundscale_ph<const IMM8: i32>(
9899    src: __m256h,
9900    k: __mmask16,
9901    a: __m256h,
9902) -> __m256h {
9903    unsafe {
9904        static_assert_uimm_bits!(IMM8, 8);
9905        vrndscaleph_256(a, IMM8, src, k)
9906    }
9907}
9908
9909/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9910/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
9911/// mask bit is not set).
9912///
9913/// Rounding is done according to the imm8 parameter, which can be one of:
9914///
9915/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9916/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9917/// * [`_MM_FROUND_TO_POS_INF`] : round up
9918/// * [`_MM_FROUND_TO_ZERO`] : truncate
9919/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9920///
9921/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_roundscale_ph)
9922#[inline]
9923#[target_feature(enable = "avx512fp16,avx512vl")]
9924#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9925#[rustc_legacy_const_generics(2)]
9926#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9927pub fn _mm256_maskz_roundscale_ph<const IMM8: i32>(k: __mmask16, a: __m256h) -> __m256h {
9928    static_assert_uimm_bits!(IMM8, 8);
9929    _mm256_mask_roundscale_ph::<IMM8>(_mm256_setzero_ph(), k, a)
9930}
9931
9932/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9933/// specified by imm8, and store the results in dst.
9934///
9935/// Rounding is done according to the imm8 parameter, which can be one of:
9936///
9937/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9938/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9939/// * [`_MM_FROUND_TO_POS_INF`] : round up
9940/// * [`_MM_FROUND_TO_ZERO`] : truncate
9941/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9942///
9943/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ph)
9944#[inline]
9945#[target_feature(enable = "avx512fp16")]
9946#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9947#[rustc_legacy_const_generics(1)]
9948#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9949pub fn _mm512_roundscale_ph<const IMM8: i32>(a: __m512h) -> __m512h {
9950    static_assert_uimm_bits!(IMM8, 8);
9951    _mm512_mask_roundscale_ph::<IMM8>(_mm512_undefined_ph(), 0xffffffff, a)
9952}
9953
9954/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9955/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
9956/// the corresponding mask bit is not set).
9957///
9958/// Rounding is done according to the imm8 parameter, which can be one of:
9959///
9960/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9961/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9962/// * [`_MM_FROUND_TO_POS_INF`] : round up
9963/// * [`_MM_FROUND_TO_ZERO`] : truncate
9964/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9965///
9966/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ph)
9967#[inline]
9968#[target_feature(enable = "avx512fp16")]
9969#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9970#[rustc_legacy_const_generics(3)]
9971#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9972pub fn _mm512_mask_roundscale_ph<const IMM8: i32>(
9973    src: __m512h,
9974    k: __mmask32,
9975    a: __m512h,
9976) -> __m512h {
9977    static_assert_uimm_bits!(IMM8, 8);
9978    _mm512_mask_roundscale_round_ph::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a)
9979}
9980
9981/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9982/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
9983/// mask bit is not set).
9984///
9985/// Rounding is done according to the imm8 parameter, which can be one of:
9986///
9987/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9988/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9989/// * [`_MM_FROUND_TO_POS_INF`] : round up
9990/// * [`_MM_FROUND_TO_ZERO`] : truncate
9991/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9992///
9993/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ph)
9994#[inline]
9995#[target_feature(enable = "avx512fp16")]
9996#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9997#[rustc_legacy_const_generics(2)]
9998#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9999pub fn _mm512_maskz_roundscale_ph<const IMM8: i32>(k: __mmask32, a: __m512h) -> __m512h {
10000    static_assert_uimm_bits!(IMM8, 8);
10001    _mm512_mask_roundscale_ph::<IMM8>(_mm512_setzero_ph(), k, a)
10002}
10003
10004/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10005/// specified by imm8, and store the results in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
10006/// in the sae parameter
10007///
10008/// Rounding is done according to the imm8 parameter, which can be one of:
10009///
10010/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10011/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10012/// * [`_MM_FROUND_TO_POS_INF`] : round up
10013/// * [`_MM_FROUND_TO_ZERO`] : truncate
10014/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10015///
10016/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ph)
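///
/// A minimal usage sketch, not taken from Intel's documentation (`roundscale_round_example` is
/// a made-up helper); it needs a nightly toolchain with the `f16` and `stdarch_x86_avx512_f16`
/// features and an AVX512-FP16 CPU, so it is marked `ignore`:
///
/// ```ignore
/// #![feature(f16, stdarch_x86_avx512_f16)]
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16")]
/// fn roundscale_round_example(a: __m512h) -> __m512h {
///     // Round every lane to the nearest integer value while suppressing exceptions.
///     _mm512_roundscale_round_ph::<_MM_FROUND_TO_NEAREST_INT, _MM_FROUND_NO_EXC>(a)
/// }
/// ```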
10017#[inline]
10018#[target_feature(enable = "avx512fp16")]
10019#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
10020#[rustc_legacy_const_generics(1, 2)]
10021#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10022pub fn _mm512_roundscale_round_ph<const IMM8: i32, const SAE: i32>(a: __m512h) -> __m512h {
10023    static_assert_uimm_bits!(IMM8, 8);
10024    static_assert_sae!(SAE);
10025    _mm512_mask_roundscale_round_ph::<IMM8, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
10026}
10027
10028/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10029/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
10030/// the corresponding mask bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
10031/// in the sae parameter
10032///
10033/// Rounding is done according to the imm8 parameter, which can be one of:
10034///
10035/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10036/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10037/// * [`_MM_FROUND_TO_POS_INF`] : round up
10038/// * [`_MM_FROUND_TO_ZERO`] : truncate
10039/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10040///
10041/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ph)
10042#[inline]
10043#[target_feature(enable = "avx512fp16")]
10044#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
10045#[rustc_legacy_const_generics(3, 4)]
10046#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10047pub fn _mm512_mask_roundscale_round_ph<const IMM8: i32, const SAE: i32>(
10048    src: __m512h,
10049    k: __mmask32,
10050    a: __m512h,
10051) -> __m512h {
10052    unsafe {
10053        static_assert_uimm_bits!(IMM8, 8);
10054        static_assert_sae!(SAE);
10055        vrndscaleph_512(a, IMM8, src, k, SAE)
10056    }
10057}
10058
10059/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10060/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
10061/// mask bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10062///
10063/// Rounding is done according to the imm8 parameter, which can be one of:
10064///
10065/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10066/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10067/// * [`_MM_FROUND_TO_POS_INF`] : round up
10068/// * [`_MM_FROUND_TO_ZERO`] : truncate
10069/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10070///
10071/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ph)
10072#[inline]
10073#[target_feature(enable = "avx512fp16")]
10074#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
10075#[rustc_legacy_const_generics(2, 3)]
10076#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10077pub fn _mm512_maskz_roundscale_round_ph<const IMM8: i32, const SAE: i32>(
10078    k: __mmask32,
10079    a: __m512h,
10080) -> __m512h {
10081    static_assert_uimm_bits!(IMM8, 8);
10082    static_assert_sae!(SAE);
10083    _mm512_mask_roundscale_round_ph::<IMM8, SAE>(_mm512_setzero_ph(), k, a)
10084}
10085
10086/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10087/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements
10088/// from a to the upper elements of dst.
10089///
10090/// Rounding is done according to the imm8 parameter, which can be one of:
10091///
10092/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10093/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10094/// * [`_MM_FROUND_TO_POS_INF`] : round up
10095/// * [`_MM_FROUND_TO_ZERO`] : truncate
10096/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10097///
10098/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_sh)
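///
/// A minimal usage sketch, not taken from Intel's documentation (`roundscale_sh_example` is a
/// made-up helper); it needs a nightly toolchain with the `f16` and `stdarch_x86_avx512_f16`
/// features and an AVX512-FP16 CPU, so it is marked `ignore`:
///
/// ```ignore
/// #![feature(f16, stdarch_x86_avx512_f16)]
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16")]
/// fn roundscale_sh_example() -> __m128h {
///     let a = _mm_set1_ph(2.0);
///     let b = _mm_set_sh(1.25);
///     // Lower lane: 1.25 rounded to the nearest integer is 1.0; the upper 7 lanes
///     // are copied from `a`.
///     _mm_roundscale_sh::<_MM_FROUND_TO_NEAREST_INT>(a, b)
/// }
/// ```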
10099#[inline]
10100#[target_feature(enable = "avx512fp16")]
10101#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
10102#[rustc_legacy_const_generics(2)]
10103#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10104pub fn _mm_roundscale_sh<const IMM8: i32>(a: __m128h, b: __m128h) -> __m128h {
10105    static_assert_uimm_bits!(IMM8, 8);
10106    _mm_mask_roundscale_sh::<IMM8>(_mm_undefined_ph(), 0xff, a, b)
10107}
10108
10109/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10110/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied
10111/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10112///
10113/// Rounding is done according to the imm8 parameter, which can be one of:
10114///
10115/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10116/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10117/// * [`_MM_FROUND_TO_POS_INF`] : round up
10118/// * [`_MM_FROUND_TO_ZERO`] : truncate
10119/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10120///
10121/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_sh)
10122#[inline]
10123#[target_feature(enable = "avx512fp16")]
10124#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
10125#[rustc_legacy_const_generics(4)]
10126#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10127pub fn _mm_mask_roundscale_sh<const IMM8: i32>(
10128    src: __m128h,
10129    k: __mmask8,
10130    a: __m128h,
10131    b: __m128h,
10132) -> __m128h {
10133    static_assert_uimm_bits!(IMM8, 8);
10134    _mm_mask_roundscale_round_sh::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
10135}
10136
10137/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10138/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed
10139/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10140///
10141/// Rounding is done according to the imm8 parameter, which can be one of:
10142///
10143/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10144/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10145/// * [`_MM_FROUND_TO_POS_INF`] : round up
10146/// * [`_MM_FROUND_TO_ZERO`] : truncate
10147/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10148///
10149/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_sh)
10150#[inline]
10151#[target_feature(enable = "avx512fp16")]
10152#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
10153#[rustc_legacy_const_generics(3)]
10154#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10155pub fn _mm_maskz_roundscale_sh<const IMM8: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10156    static_assert_uimm_bits!(IMM8, 8);
10157    _mm_mask_roundscale_sh::<IMM8>(_mm_setzero_ph(), k, a, b)
10158}
10159
10160/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10161/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements
10162/// from a to the upper elements of dst.
10163///
10164/// Rounding is done according to the imm8 parameter, which can be one of:
10165///
10166/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10167/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10168/// * [`_MM_FROUND_TO_POS_INF`] : round up
10169/// * [`_MM_FROUND_TO_ZERO`] : truncate
10170/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10171///
10172/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10173///
10174/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sh)
10175#[inline]
10176#[target_feature(enable = "avx512fp16")]
10177#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
10178#[rustc_legacy_const_generics(2, 3)]
10179#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10180pub fn _mm_roundscale_round_sh<const IMM8: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
10181    static_assert_uimm_bits!(IMM8, 8);
10182    static_assert_sae!(SAE);
10183    _mm_mask_roundscale_round_sh::<IMM8, SAE>(_mm_undefined_ph(), 0xff, a, b)
10184}
10185
10186/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10187/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied
10188/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10189///
10190/// Rounding is done according to the imm8 parameter, which can be one of:
10191///
10192/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10193/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10194/// * [`_MM_FROUND_TO_POS_INF`] : round up
10195/// * [`_MM_FROUND_TO_ZERO`] : truncate
10196/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10197///
10198/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10199///
10200/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sh)
10201#[inline]
10202#[target_feature(enable = "avx512fp16")]
10203#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
10204#[rustc_legacy_const_generics(4, 5)]
10205#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10206pub fn _mm_mask_roundscale_round_sh<const IMM8: i32, const SAE: i32>(
10207    src: __m128h,
10208    k: __mmask8,
10209    a: __m128h,
10210    b: __m128h,
10211) -> __m128h {
10212    unsafe {
10213        static_assert_uimm_bits!(IMM8, 8);
10214        static_assert_sae!(SAE);
10215        vrndscalesh(a, b, src, k, IMM8, SAE)
10216    }
10217}
10218
10219/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10220/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed
10221/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10222///
10223/// Rounding is done according to the imm8 parameter, which can be one of:
10224///
10225/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10226/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10227/// * [`_MM_FROUND_TO_POS_INF`] : round up
10228/// * [`_MM_FROUND_TO_ZERO`] : truncate
10229/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10230///
10231/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10232///
10233/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sh)
10234#[inline]
10235#[target_feature(enable = "avx512fp16")]
10236#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
10237#[rustc_legacy_const_generics(3, 4)]
10238#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10239pub fn _mm_maskz_roundscale_round_sh<const IMM8: i32, const SAE: i32>(
10240    k: __mmask8,
10241    a: __m128h,
10242    b: __m128h,
10243) -> __m128h {
10244    static_assert_uimm_bits!(IMM8, 8);
10245    static_assert_sae!(SAE);
10246    _mm_mask_roundscale_round_sh::<IMM8, SAE>(_mm_setzero_ph(), k, a, b)
10247}
10248
10249/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10250/// the results in dst.
10251///
10252/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ph)
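///
/// A minimal usage sketch, not taken from Intel's documentation (`scalef_example` is a made-up
/// helper); per Intel's description of VSCALEFPH, each lane computes `a * 2^floor(b)`. It needs
/// a nightly toolchain with the `f16` and `stdarch_x86_avx512_f16` features and an AVX512-FP16
/// CPU, so it is marked `ignore`:
///
/// ```ignore
/// #![feature(f16, stdarch_x86_avx512_f16)]
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn scalef_example() -> __m128h {
///     // Every lane holds 3.0 * 2^4 = 48.0.
///     _mm_scalef_ph(_mm_set1_ph(3.0), _mm_set1_ph(4.0))
/// }
/// ```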
10253#[inline]
10254#[target_feature(enable = "avx512fp16,avx512vl")]
10255#[cfg_attr(test, assert_instr(vscalefph))]
10256#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10257pub fn _mm_scalef_ph(a: __m128h, b: __m128h) -> __m128h {
10258    _mm_mask_scalef_ph(_mm_undefined_ph(), 0xff, a, b)
10259}
10260
10261/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10262/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10263///
10264/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ph)
10265#[inline]
10266#[target_feature(enable = "avx512fp16,avx512vl")]
10267#[cfg_attr(test, assert_instr(vscalefph))]
10268#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10269pub fn _mm_mask_scalef_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10270    unsafe { vscalefph_128(a, b, src, k) }
10271}
10272
10273/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10274/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10275///
10276/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ph)
10277#[inline]
10278#[target_feature(enable = "avx512fp16,avx512vl")]
10279#[cfg_attr(test, assert_instr(vscalefph))]
10280#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10281pub fn _mm_maskz_scalef_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10282    _mm_mask_scalef_ph(_mm_setzero_ph(), k, a, b)
10283}
10284
10285/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10286/// the results in dst.
10287///
10288/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_scalef_ph)
10289#[inline]
10290#[target_feature(enable = "avx512fp16,avx512vl")]
10291#[cfg_attr(test, assert_instr(vscalefph))]
10292#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10293pub fn _mm256_scalef_ph(a: __m256h, b: __m256h) -> __m256h {
10294    _mm256_mask_scalef_ph(_mm256_undefined_ph(), 0xffff, a, b)
10295}
10296
10297/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10298/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10299///
10300/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_scalef_ph)
10301#[inline]
10302#[target_feature(enable = "avx512fp16,avx512vl")]
10303#[cfg_attr(test, assert_instr(vscalefph))]
10304#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10305pub fn _mm256_mask_scalef_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
10306    unsafe { vscalefph_256(a, b, src, k) }
10307}
10308
10309/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10310/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10311///
10312/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_scalef_ph)
10313#[inline]
10314#[target_feature(enable = "avx512fp16,avx512vl")]
10315#[cfg_attr(test, assert_instr(vscalefph))]
10316#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10317pub fn _mm256_maskz_scalef_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
10318    _mm256_mask_scalef_ph(_mm256_setzero_ph(), k, a, b)
10319}
10320
10321/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10322/// the results in dst.
10323///
10324/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_ph)
10325#[inline]
10326#[target_feature(enable = "avx512fp16")]
10327#[cfg_attr(test, assert_instr(vscalefph))]
10328#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10329pub fn _mm512_scalef_ph(a: __m512h, b: __m512h) -> __m512h {
10330    _mm512_mask_scalef_ph(_mm512_undefined_ph(), 0xffffffff, a, b)
10331}
10332
10333/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10334/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10335///
10336/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ph)
10337#[inline]
10338#[target_feature(enable = "avx512fp16")]
10339#[cfg_attr(test, assert_instr(vscalefph))]
10340#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10341pub fn _mm512_mask_scalef_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
10342    _mm512_mask_scalef_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
10343}
10344
10345/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10346/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10347///
10348/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ph)
10349#[inline]
10350#[target_feature(enable = "avx512fp16")]
10351#[cfg_attr(test, assert_instr(vscalefph))]
10352#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10353pub fn _mm512_maskz_scalef_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
10354    _mm512_mask_scalef_ph(_mm512_setzero_ph(), k, a, b)
10355}
10356
10357/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10358/// the results in dst.
10359///
10360/// Rounding is done according to the rounding parameter, which can be one of:
10361///
10362/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10363/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10364/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10365/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10366/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10367///
10368/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_ph)
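///
/// A minimal usage sketch, not taken from Intel's documentation (`scalef_round_example` is a
/// made-up helper); it needs a nightly toolchain with the `f16` and `stdarch_x86_avx512_f16`
/// features and an AVX512-FP16 CPU, so it is marked `ignore`:
///
/// ```ignore
/// #![feature(f16, stdarch_x86_avx512_f16)]
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16")]
/// fn scalef_round_example(a: __m512h, b: __m512h) -> __m512h {
///     // Scale each lane of `a` by 2^floor(b) with round-to-nearest and exceptions suppressed.
///     _mm512_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b)
/// }
/// ```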
10369#[inline]
10370#[target_feature(enable = "avx512fp16")]
10371#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
10372#[rustc_legacy_const_generics(2)]
10373#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10374pub fn _mm512_scalef_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
10375    static_assert_rounding!(ROUNDING);
10376    _mm512_mask_scalef_round_ph::<ROUNDING>(_mm512_undefined_ph(), 0xffffffff, a, b)
10377}
10378
10379/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10380/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10381///
10382/// Rounding is done according to the rounding parameter, which can be one of:
10383///
10384/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10385/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10386/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10387/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10388/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10389///
10390/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ph)
10391#[inline]
10392#[target_feature(enable = "avx512fp16")]
10393#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
10394#[rustc_legacy_const_generics(4)]
10395#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10396pub fn _mm512_mask_scalef_round_ph<const ROUNDING: i32>(
10397    src: __m512h,
10398    k: __mmask32,
10399    a: __m512h,
10400    b: __m512h,
10401) -> __m512h {
10402    unsafe {
10403        static_assert_rounding!(ROUNDING);
10404        vscalefph_512(a, b, src, k, ROUNDING)
10405    }
10406}
10407
10408/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10409/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10410///
10411/// Rounding is done according to the rounding parameter, which can be one of:
10412///
10413/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10414/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10415/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10416/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10417/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10418///
10419/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ph)
10420#[inline]
10421#[target_feature(enable = "avx512fp16")]
10422#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
10423#[rustc_legacy_const_generics(3)]
10424#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10425pub fn _mm512_maskz_scalef_round_ph<const ROUNDING: i32>(
10426    k: __mmask32,
10427    a: __m512h,
10428    b: __m512h,
10429) -> __m512h {
10430    static_assert_rounding!(ROUNDING);
10431    _mm512_mask_scalef_round_ph::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
10432}
10433
10434/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
10435/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
10436/// elements of dst.
10437///
10438/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sh)
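///
/// A minimal usage sketch, not taken from Intel's documentation (`scalef_sh_example` is a
/// made-up helper); it needs a nightly toolchain with the `f16` and `stdarch_x86_avx512_f16`
/// features and an AVX512-FP16 CPU, so it is marked `ignore`:
///
/// ```ignore
/// #![feature(f16, stdarch_x86_avx512_f16)]
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16")]
/// fn scalef_sh_example() -> __m128h {
///     let a = _mm_set1_ph(3.0);
///     let b = _mm_set_sh(4.0);
///     // Lower lane: 3.0 * 2^4 = 48.0; the upper 7 lanes are copied from `a`.
///     _mm_scalef_sh(a, b)
/// }
/// ```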
10439#[inline]
10440#[target_feature(enable = "avx512fp16")]
10441#[cfg_attr(test, assert_instr(vscalefsh))]
10442#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10443pub fn _mm_scalef_sh(a: __m128h, b: __m128h) -> __m128h {
10444    _mm_mask_scalef_sh(_mm_undefined_ph(), 0xff, a, b)
10445}
10446
10447/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
10448/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
10449/// and copy the upper 7 packed elements from a to the upper elements of dst.
10450///
10451/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sh)
10452#[inline]
10453#[target_feature(enable = "avx512fp16")]
10454#[cfg_attr(test, assert_instr(vscalefsh))]
10455#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10456pub fn _mm_mask_scalef_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10457    _mm_mask_scalef_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
10458}
10459
10460/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
10461/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
10462/// and copy the upper 7 packed elements from a to the upper elements of dst.
10463///
10464/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sh)
10465#[inline]
10466#[target_feature(enable = "avx512fp16")]
10467#[cfg_attr(test, assert_instr(vscalefsh))]
10468#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10469pub fn _mm_maskz_scalef_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10470    _mm_mask_scalef_sh(_mm_setzero_ph(), k, a, b)
10471}
10472
10473/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
10474/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
10475/// elements of dst.
10476///
10477/// Rounding is done according to the rounding parameter, which can be one of:
10478///
10479/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10480/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10481/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10482/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10483/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10484///
10485/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sh)
10486#[inline]
10487#[target_feature(enable = "avx512fp16")]
10488#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
10489#[rustc_legacy_const_generics(2)]
10490#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10491pub fn _mm_scalef_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
10492    static_assert_rounding!(ROUNDING);
10493    _mm_mask_scalef_round_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
10494}
10495
10496/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
10497/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
10498/// and copy the upper 7 packed elements from a to the upper elements of dst.
10499///
10500/// Rounding is done according to the rounding parameter, which can be one of:
10501///
10502/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10503/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10504/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10505/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10506/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10507///
10508/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_sh)
10509#[inline]
10510#[target_feature(enable = "avx512fp16")]
10511#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
10512#[rustc_legacy_const_generics(4)]
10513#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10514pub fn _mm_mask_scalef_round_sh<const ROUNDING: i32>(
10515    src: __m128h,
10516    k: __mmask8,
10517    a: __m128h,
10518    b: __m128h,
10519) -> __m128h {
10520    unsafe {
10521        static_assert_rounding!(ROUNDING);
10522        vscalefsh(a, b, src, k, ROUNDING)
10523    }
10524}
10525
10526/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
10527/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
10528/// and copy the upper 7 packed elements from a to the upper elements of dst.
10529///
10530/// Rounding is done according to the rounding parameter, which can be one of:
10531///
10532/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10533/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10534/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10535/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10536/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10537///
10538/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_sh)
10539#[inline]
10540#[target_feature(enable = "avx512fp16")]
10541#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
10542#[rustc_legacy_const_generics(3)]
10543#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10544pub fn _mm_maskz_scalef_round_sh<const ROUNDING: i32>(
10545    k: __mmask8,
10546    a: __m128h,
10547    b: __m128h,
10548) -> __m128h {
10549    static_assert_rounding!(ROUNDING);
10550    _mm_mask_scalef_round_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
10551}
10552
10553/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10554/// number of bits specified by imm8, and store the results in dst.
10555///
10556/// Rounding is done according to the imm8 parameter, which can be one of:
10557///
10558/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10559/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10560/// * [`_MM_FROUND_TO_POS_INF`] : round up
10561/// * [`_MM_FROUND_TO_ZERO`] : truncate
10562/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10563///
10564/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_ph)
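///
/// A minimal usage sketch, not taken from Intel's documentation (`reduce_example` is a made-up
/// helper); it needs a nightly toolchain with the `f16` and `stdarch_x86_avx512_f16` features
/// and an AVX512-FP16 CPU, so it is marked `ignore`:
///
/// ```ignore
/// #![feature(f16, stdarch_x86_avx512_f16)]
/// use core::arch::x86_64::*;
///
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// fn reduce_example() -> __m128h {
///     // With IMM8 = _MM_FROUND_TO_NEAREST_INT (keep 0 fraction bits, round to nearest)
///     // each lane becomes the remainder after rounding: 1.25 - 1.0 = 0.25.
///     _mm_reduce_ph::<_MM_FROUND_TO_NEAREST_INT>(_mm_set1_ph(1.25))
/// }
/// ```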
10565#[inline]
10566#[target_feature(enable = "avx512fp16,avx512vl")]
10567#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10568#[rustc_legacy_const_generics(1)]
10569#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10570pub fn _mm_reduce_ph<const IMM8: i32>(a: __m128h) -> __m128h {
10571    static_assert_uimm_bits!(IMM8, 8);
10572    _mm_mask_reduce_ph::<IMM8>(_mm_undefined_ph(), 0xff, a)
10573}
10574
10575/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10576/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10577/// from src when the corresponding mask bit is not set).
10578///
10579/// Rounding is done according to the imm8 parameter, which can be one of:
10580///
10581/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10582/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10583/// * [`_MM_FROUND_TO_POS_INF`] : round up
10584/// * [`_MM_FROUND_TO_ZERO`] : truncate
10585/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10586///
10587/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_ph)
10588#[inline]
10589#[target_feature(enable = "avx512fp16,avx512vl")]
10590#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10591#[rustc_legacy_const_generics(3)]
10592#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10593pub fn _mm_mask_reduce_ph<const IMM8: i32>(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
10594    unsafe {
10595        static_assert_uimm_bits!(IMM8, 8);
10596        vreduceph_128(a, IMM8, src, k)
10597    }
10598}
10599
10600/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10601/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10602/// out when the corresponding mask bit is not set).
10603///
10604/// Rounding is done according to the imm8 parameter, which can be one of:
10605///
10606/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10607/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10608/// * [`_MM_FROUND_TO_POS_INF`] : round up
10609/// * [`_MM_FROUND_TO_ZERO`] : truncate
10610/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10611///
10612/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_ph)
10613#[inline]
10614#[target_feature(enable = "avx512fp16,avx512vl")]
10615#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10616#[rustc_legacy_const_generics(2)]
10617#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10618pub fn _mm_maskz_reduce_ph<const IMM8: i32>(k: __mmask8, a: __m128h) -> __m128h {
10619    static_assert_uimm_bits!(IMM8, 8);
10620    _mm_mask_reduce_ph::<IMM8>(_mm_setzero_ph(), k, a)
10621}
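
// Illustrative note (not part of the original source): the "reduced argument" of `a` is
//
//     reduce(a) = a - round(a * 2^M) * 2^(-M)
//
// where `M = IMM8[7:4]` is the number of fraction bits retained by the rounding step
// and the low bits of IMM8 select the rounding mode listed in the docs above. A rough
// scalar model for the round-to-nearest case (assuming `std` float methods):
//
//     fn reduce_model(a: f32, m: i32) -> f32 {
//         let scale = (m as f32).exp2(); // 2^M
//         a - (a * scale).round() / scale
//     }
//
// e.g. `reduce_model(1.75, 1)` computes `1.75 - round(3.5) / 2.0 = -0.25`. Note that
// `f32::round` breaks ties away from zero, whereas the hardware's nearest mode is
// ties-to-even, so the two can differ on exact halfway cases.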
10622
10623/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10624/// number of bits specified by imm8, and store the results in dst.
10625///
10626/// Rounding is done according to the imm8 parameter, which can be one of:
10627///
10628/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10629/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10630/// * [`_MM_FROUND_TO_POS_INF`] : round up
10631/// * [`_MM_FROUND_TO_ZERO`] : truncate
10632/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10633///
10634/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_ph)
10635#[inline]
10636#[target_feature(enable = "avx512fp16,avx512vl")]
10637#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10638#[rustc_legacy_const_generics(1)]
10639#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10640pub fn _mm256_reduce_ph<const IMM8: i32>(a: __m256h) -> __m256h {
10641    static_assert_uimm_bits!(IMM8, 8);
10642    _mm256_mask_reduce_ph::<IMM8>(_mm256_undefined_ph(), 0xffff, a)
10643}
10644
10645/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10646/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10647/// from src when the corresponding mask bit is not set).
10648///
10649/// Rounding is done according to the imm8 parameter, which can be one of:
10650///
10651/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10652/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10653/// * [`_MM_FROUND_TO_POS_INF`] : round up
10654/// * [`_MM_FROUND_TO_ZERO`] : truncate
10655/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10656///
10657/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_ph)
10658#[inline]
10659#[target_feature(enable = "avx512fp16,avx512vl")]
10660#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10661#[rustc_legacy_const_generics(3)]
10662#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10663pub fn _mm256_mask_reduce_ph<const IMM8: i32>(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
10664    unsafe {
10665        static_assert_uimm_bits!(IMM8, 8);
10666        vreduceph_256(a, IMM8, src, k)
10667    }
10668}
10669
10670/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10671/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10672/// out when the corresponding mask bit is not set).
10673///
10674/// Rounding is done according to the imm8 parameter, which can be one of:
10675///
10676/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10677/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10678/// * [`_MM_FROUND_TO_POS_INF`] : round up
10679/// * [`_MM_FROUND_TO_ZERO`] : truncate
10680/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10681///
10682/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_reduce_ph)
10683#[inline]
10684#[target_feature(enable = "avx512fp16,avx512vl")]
10685#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10686#[rustc_legacy_const_generics(2)]
10687#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10688pub fn _mm256_maskz_reduce_ph<const IMM8: i32>(k: __mmask16, a: __m256h) -> __m256h {
10689    static_assert_uimm_bits!(IMM8, 8);
10690    _mm256_mask_reduce_ph::<IMM8>(_mm256_setzero_ph(), k, a)
10691}
10692
10693/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10694/// number of bits specified by imm8, and store the results in dst.
10695///
10696/// Rounding is done according to the imm8 parameter, which can be one of:
10697///
10698/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10699/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10700/// * [`_MM_FROUND_TO_POS_INF`] : round up
10701/// * [`_MM_FROUND_TO_ZERO`] : truncate
10702/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10703///
10704/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_ph)
10705#[inline]
10706#[target_feature(enable = "avx512fp16")]
10707#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10708#[rustc_legacy_const_generics(1)]
10709#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10710pub fn _mm512_reduce_ph<const IMM8: i32>(a: __m512h) -> __m512h {
10711    static_assert_uimm_bits!(IMM8, 8);
10712    _mm512_mask_reduce_ph::<IMM8>(_mm512_undefined_ph(), 0xffffffff, a)
10713}
10714
10715/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10716/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10717/// from src when the corresponding mask bit is not set).
10718///
10719/// Rounding is done according to the imm8 parameter, which can be one of:
10720///
10721/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10722/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10723/// * [`_MM_FROUND_TO_POS_INF`] : round up
10724/// * [`_MM_FROUND_TO_ZERO`] : truncate
10725/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10726///
10727/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_ph)
10728#[inline]
10729#[target_feature(enable = "avx512fp16")]
10730#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10731#[rustc_legacy_const_generics(3)]
10732#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10733pub fn _mm512_mask_reduce_ph<const IMM8: i32>(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
10734    static_assert_uimm_bits!(IMM8, 8);
10735    _mm512_mask_reduce_round_ph::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a)
10736}
10737
10738/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10739/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10740/// out when the corresponding mask bit is not set).
10741///
10742/// Rounding is done according to the imm8 parameter, which can be one of:
10743///
10744/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10745/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10746/// * [`_MM_FROUND_TO_POS_INF`] : round up
10747/// * [`_MM_FROUND_TO_ZERO`] : truncate
10748/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10749///
10750/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_ph)
10751#[inline]
10752#[target_feature(enable = "avx512fp16")]
10753#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10754#[rustc_legacy_const_generics(2)]
10755#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10756pub fn _mm512_maskz_reduce_ph<const IMM8: i32>(k: __mmask32, a: __m512h) -> __m512h {
10757    static_assert_uimm_bits!(IMM8, 8);
10758    _mm512_mask_reduce_ph::<IMM8>(_mm512_setzero_ph(), k, a)
10759}
10760
10761/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10762/// number of bits specified by imm8, and store the results in dst.
10763///
10764/// Rounding is done according to the imm8 parameter, which can be one of:
10765///
10766/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10767/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10768/// * [`_MM_FROUND_TO_POS_INF`] : round up
10769/// * [`_MM_FROUND_TO_ZERO`] : truncate
10770/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10771///
10772/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10773///
10774/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_round_ph)
10775#[inline]
10776#[target_feature(enable = "avx512fp16")]
10777#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
10778#[rustc_legacy_const_generics(1, 2)]
10779#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10780pub fn _mm512_reduce_round_ph<const IMM8: i32, const SAE: i32>(a: __m512h) -> __m512h {
10781    static_assert_uimm_bits!(IMM8, 8);
10782    static_assert_sae!(SAE);
10783    _mm512_mask_reduce_round_ph::<IMM8, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
10784}
10785
10786/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10787/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10788/// from src when the corresponding mask bit is not set).
10789///
10790/// Rounding is done according to the imm8 parameter, which can be one of:
10791///
10792/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10793/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10794/// * [`_MM_FROUND_TO_POS_INF`] : round up
10795/// * [`_MM_FROUND_TO_ZERO`] : truncate
10796/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10797///
10798/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10799///
10800/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_round_ph)
10801#[inline]
10802#[target_feature(enable = "avx512fp16")]
10803#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
10804#[rustc_legacy_const_generics(3, 4)]
10805#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10806pub fn _mm512_mask_reduce_round_ph<const IMM8: i32, const SAE: i32>(
10807    src: __m512h,
10808    k: __mmask32,
10809    a: __m512h,
10810) -> __m512h {
10811    unsafe {
10812        static_assert_uimm_bits!(IMM8, 8);
10813        static_assert_sae!(SAE);
10814        vreduceph_512(a, IMM8, src, k, SAE)
10815    }
10816}
10817
10818/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10819/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10820/// out when the corresponding mask bit is not set).
10821///
10822/// Rounding is done according to the imm8 parameter, which can be one of:
10823///
10824/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10825/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10826/// * [`_MM_FROUND_TO_POS_INF`] : round up
10827/// * [`_MM_FROUND_TO_ZERO`] : truncate
10828/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10829///
10830/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10831///
10832/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_round_ph)
10833#[inline]
10834#[target_feature(enable = "avx512fp16")]
10835#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
10836#[rustc_legacy_const_generics(2, 3)]
10837#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10838pub fn _mm512_maskz_reduce_round_ph<const IMM8: i32, const SAE: i32>(
10839    k: __mmask32,
10840    a: __m512h,
10841) -> __m512h {
10842    static_assert_uimm_bits!(IMM8, 8);
10843    static_assert_sae!(SAE);
10844    _mm512_mask_reduce_round_ph::<IMM8, SAE>(_mm512_setzero_ph(), k, a)
10845}
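
// Illustrative note (not part of the original source): in the `*_reduce_round_*` family
// the rounding mode is encoded in IMM8 itself (see the list above); the second const
// parameter is only an SAE control, i.e. `_MM_FROUND_CUR_DIRECTION` (report exceptions
// as usual) or `_MM_FROUND_NO_EXC` (suppress them). A hypothetical call that rounds to
// nearest while suppressing exceptions would therefore be:
//
//     // let r = _mm512_reduce_round_ph::<_MM_FROUND_TO_NEAREST_INT, _MM_FROUND_NO_EXC>(a);
//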
10846
10847/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10848/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the
10849/// upper 7 packed elements from a to the upper elements of dst.
10850///
10851/// Rounding is done according to the imm8 parameter, which can be one of:
10852///
10853/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10854/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10855/// * [`_MM_FROUND_TO_POS_INF`] : round up
10856/// * [`_MM_FROUND_TO_ZERO`] : truncate
10857/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10858///
10859/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_sh)
10860#[inline]
10861#[target_feature(enable = "avx512fp16")]
10862#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
10863#[rustc_legacy_const_generics(2)]
10864#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10865pub fn _mm_reduce_sh<const IMM8: i32>(a: __m128h, b: __m128h) -> __m128h {
10866    static_assert_uimm_bits!(IMM8, 8);
10867    _mm_mask_reduce_sh::<IMM8>(_mm_undefined_ph(), 0xff, a, b)
10868}
10869
10870/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10871/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k
10872/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from
10873/// a to the upper elements of dst.
10874///
10875/// Rounding is done according to the imm8 parameter, which can be one of:
10876///
10877/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10878/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10879/// * [`_MM_FROUND_TO_POS_INF`] : round up
10880/// * [`_MM_FROUND_TO_ZERO`] : truncate
10881/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10882///
10883/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_sh)
10884#[inline]
10885#[target_feature(enable = "avx512fp16")]
10886#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
10887#[rustc_legacy_const_generics(4)]
10888#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10889pub fn _mm_mask_reduce_sh<const IMM8: i32>(
10890    src: __m128h,
10891    k: __mmask8,
10892    a: __m128h,
10893    b: __m128h,
10894) -> __m128h {
10895    static_assert_uimm_bits!(IMM8, 8);
10896    _mm_mask_reduce_round_sh::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
10897}
10898
10899/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10900/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k
10901/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a
10902/// to the upper elements of dst.
10903///
10904/// Rounding is done according to the imm8 parameter, which can be one of:
10905///
10906/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10907/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10908/// * [`_MM_FROUND_TO_POS_INF`] : round up
10909/// * [`_MM_FROUND_TO_ZERO`] : truncate
10910/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10911///
10912/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_sh)
10913#[inline]
10914#[target_feature(enable = "avx512fp16")]
10915#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
10916#[rustc_legacy_const_generics(3)]
10917#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10918pub fn _mm_maskz_reduce_sh<const IMM8: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10919    static_assert_uimm_bits!(IMM8, 8);
10920    _mm_mask_reduce_sh::<IMM8>(_mm_setzero_ph(), k, a, b)
10921}
10922
10923/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10924/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the upper
10925/// 7 packed elements from a to the upper elements of dst.
10926///
10927/// Rounding is done according to the imm8 parameter, which can be one of:
10928///
10929/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10930/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10931/// * [`_MM_FROUND_TO_POS_INF`] : round up
10932/// * [`_MM_FROUND_TO_ZERO`] : truncate
10933/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10934///
10935/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10936///
10937/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_round_sh)
10938#[inline]
10939#[target_feature(enable = "avx512fp16")]
10940#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
10941#[rustc_legacy_const_generics(2, 3)]
10942#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10943pub fn _mm_reduce_round_sh<const IMM8: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
10944    static_assert_uimm_bits!(IMM8, 8);
10945    static_assert_sae!(SAE);
10946    _mm_mask_reduce_round_sh::<IMM8, SAE>(_mm_undefined_ph(), 0xff, a, b)
10947}
10948
10949/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10950/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k
10951/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a
10952/// to the upper elements of dst.
10953///
10954/// Rounding is done according to the imm8 parameter, which can be one of:
10955///
10956/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10957/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10958/// * [`_MM_FROUND_TO_POS_INF`] : round up
10959/// * [`_MM_FROUND_TO_ZERO`] : truncate
10960/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10961///
10962/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10963///
10964/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_round_sh)
10965#[inline]
10966#[target_feature(enable = "avx512fp16")]
10967#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
10968#[rustc_legacy_const_generics(4, 5)]
10969#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10970pub fn _mm_mask_reduce_round_sh<const IMM8: i32, const SAE: i32>(
10971    src: __m128h,
10972    k: __mmask8,
10973    a: __m128h,
10974    b: __m128h,
10975) -> __m128h {
10976    unsafe {
10977        static_assert_uimm_bits!(IMM8, 8);
10978        static_assert_sae!(SAE);
10979        vreducesh(a, b, src, k, IMM8, SAE)
10980    }
10981}
10982
10983/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10984/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k
10985/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a
10986/// to the upper elements of dst.
10987///
10988/// Rounding is done according to the imm8 parameter, which can be one of:
10989///
10990/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10991/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10992/// * [`_MM_FROUND_TO_POS_INF`] : round up
10993/// * [`_MM_FROUND_TO_ZERO`] : truncate
10994/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10995///
10996/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10997///
10998/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_round_sh)
10999#[inline]
11000#[target_feature(enable = "avx512fp16")]
11001#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
11002#[rustc_legacy_const_generics(3, 4)]
11003#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11004pub fn _mm_maskz_reduce_round_sh<const IMM8: i32, const SAE: i32>(
11005    k: __mmask8,
11006    a: __m128h,
11007    b: __m128h,
11008) -> __m128h {
11009    static_assert_uimm_bits!(IMM8, 8);
11010    static_assert_sae!(SAE);
11011    _mm_mask_reduce_round_sh::<IMM8, SAE>(_mm_setzero_ph(), k, a, b)
11012}
11013
11014/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
11015/// sum of all elements in a.
11016///
11017/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_add_ph)
11018#[inline]
11019#[target_feature(enable = "avx512fp16,avx512vl")]
11020#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11021pub fn _mm_reduce_add_ph(a: __m128h) -> f16 {
11022    unsafe {
11023        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
11024        let a = _mm_add_ph(a, b);
11025        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
11026        let a = _mm_add_ph(a, b);
11027        simd_extract::<_, f16>(a, 0) + simd_extract::<_, f16>(a, 1)
11028    }
11029}
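
// Illustrative note (not part of the original source): the shuffles above implement a
// three-step tree reduction: fold the high half onto the low half, fold pairs, then add
// the last two lanes as scalars. A scalar model of the same association order:
//
//     fn reduce_add_model(a: [f32; 8]) -> f32 {
//         let f4: [f32; 4] = core::array::from_fn(|i| a[i] + a[i + 4]);
//         let f2: [f32; 2] = core::array::from_fn(|i| f4[i] + f4[i + 2]);
//         f2[0] + f2[1]
//     }
//
// Because the association order is fixed, the result may differ in the last bit from a
// strict left-to-right sum of the lanes.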
11030
11031/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
11032/// sum of all elements in a.
11033///
11034/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_add_ph)
11035#[inline]
11036#[target_feature(enable = "avx512fp16,avx512vl")]
11037#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11038pub fn _mm256_reduce_add_ph(a: __m256h) -> f16 {
11039    unsafe {
11040        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
11041        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
11042        _mm_reduce_add_ph(_mm_add_ph(p, q))
11043    }
11044}
11045
11046/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
11047/// sum of all elements in a.
11048///
11049/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_ph)
11050#[inline]
11051#[target_feature(enable = "avx512fp16")]
11052#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11053pub fn _mm512_reduce_add_ph(a: __m512h) -> f16 {
11054    unsafe {
11055        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
11056        let q = simd_shuffle!(
11057            a,
11058            a,
11059            [
11060                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
11061            ]
11062        );
11063        _mm256_reduce_add_ph(_mm256_add_ph(p, q))
11064    }
11065}
11066
11067/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
11068/// the product of all elements in a.
11069///
11070/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_mul_ph)
11071#[inline]
11072#[target_feature(enable = "avx512fp16,avx512vl")]
11073#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11074pub fn _mm_reduce_mul_ph(a: __m128h) -> f16 {
11075    unsafe {
11076        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
11077        let a = _mm_mul_ph(a, b);
11078        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
11079        let a = _mm_mul_ph(a, b);
11080        simd_extract::<_, f16>(a, 0) * simd_extract::<_, f16>(a, 1)
11081    }
11082}
11083
11084/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
11085/// the product of all elements in a.
11086///
11087/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_mul_ph)
11088#[inline]
11089#[target_feature(enable = "avx512fp16,avx512vl")]
11090#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11091pub fn _mm256_reduce_mul_ph(a: __m256h) -> f16 {
11092    unsafe {
11093        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
11094        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
11095        _mm_reduce_mul_ph(_mm_mul_ph(p, q))
11096    }
11097}
11098
11099/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
11100/// the product of all elements in a.
11101///
11102/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_ph)
11103#[inline]
11104#[target_feature(enable = "avx512fp16")]
11105#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11106pub fn _mm512_reduce_mul_ph(a: __m512h) -> f16 {
11107    unsafe {
11108        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
11109        let q = simd_shuffle!(
11110            a,
11111            a,
11112            [
11113                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
11114            ]
11115        );
11116        _mm256_reduce_mul_ph(_mm256_mul_ph(p, q))
11117    }
11118}
11119
11120/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
11121/// minimum of all elements in a.
11122///
11123/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_min_ph)
11124#[inline]
11125#[target_feature(enable = "avx512fp16,avx512vl")]
11126#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11127pub fn _mm_reduce_min_ph(a: __m128h) -> f16 {
11128    unsafe {
11129        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
11130        let a = _mm_min_ph(a, b);
11131        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
11132        let a = _mm_min_ph(a, b);
11133        let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]);
11134        simd_extract!(_mm_min_sh(a, b), 0)
11135    }
11136}
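
// Illustrative note (not part of the original source): the min/max reductions reuse the
// same halve-and-fold tree as the add/mul versions; only the last fold goes through
// `_mm_min_sh`/`_mm_max_sh` so it is a single scalar instruction. As with the other x86
// min/max operations, when one input of a fold is NaN the result is taken from the second
// operand rather than following IEEE minimumNumber/maximumNumber, so NaN lanes do not
// necessarily propagate to the final result.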
11137
11138/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
11139/// minimum of all elements in a.
11140///
11141/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_min_ph)
11142#[inline]
11143#[target_feature(enable = "avx512fp16,avx512vl")]
11144#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11145pub fn _mm256_reduce_min_ph(a: __m256h) -> f16 {
11146    unsafe {
11147        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
11148        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
11149        _mm_reduce_min_ph(_mm_min_ph(p, q))
11150    }
11151}
11152
11153/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
11154/// minimum of all elements in a.
11155///
11156/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_ph)
11157#[inline]
11158#[target_feature(enable = "avx512fp16")]
11159#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11160pub fn _mm512_reduce_min_ph(a: __m512h) -> f16 {
11161    unsafe {
11162        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
11163        let q = simd_shuffle!(
11164            a,
11165            a,
11166            [
11167                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
11168            ]
11169        );
11170        _mm256_reduce_min_ph(_mm256_min_ph(p, q))
11171    }
11172}
11173
11174/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
11175/// maximum of all elements in a.
11176///
11177/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_max_ph)
11178#[inline]
11179#[target_feature(enable = "avx512fp16,avx512vl")]
11180#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11181pub fn _mm_reduce_max_ph(a: __m128h) -> f16 {
11182    unsafe {
11183        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
11184        let a = _mm_max_ph(a, b);
11185        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
11186        let a = _mm_max_ph(a, b);
11187        let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]);
11188        simd_extract!(_mm_max_sh(a, b), 0)
11189    }
11190}
11191
11192/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
11193/// maximum of all elements in a.
11194///
11195/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_max_ph)
11196#[inline]
11197#[target_feature(enable = "avx512fp16,avx512vl")]
11198#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11199pub fn _mm256_reduce_max_ph(a: __m256h) -> f16 {
11200    unsafe {
11201        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
11202        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
11203        _mm_reduce_max_ph(_mm_max_ph(p, q))
11204    }
11205}
11206
11207/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
11208/// maximum of all elements in a.
11209///
11210/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_ph)
11211#[inline]
11212#[target_feature(enable = "avx512fp16")]
11213#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11214pub fn _mm512_reduce_max_ph(a: __m512h) -> f16 {
11215    unsafe {
11216        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
11217        let q = simd_shuffle!(
11218            a,
11219            a,
11220            [
11221                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
11222            ]
11223        );
11224        _mm256_reduce_max_ph(_mm256_max_ph(p, q))
11225    }
11226}
11227
11228macro_rules! fpclass_asm { // FIXME: use LLVM intrinsics
11229    ($mask_type: ty, $reg: ident, $a: expr) => {{
11230        let dst: $mask_type;
11231        asm!(
11232            "vfpclassph {k}, {src}, {imm8}",
11233            k = lateout(kreg) dst,
11234            src = in($reg) $a,
11235            imm8 = const IMM8,
11236            options(pure, nomem, nostack)
11237        );
11238        dst
11239    }};
11240    ($mask_type: ty, $mask: expr, $reg: ident, $a: expr) => {{
11241        let dst: $mask_type;
11242        asm!(
11243            "vfpclassph {k} {{ {mask} }}, {src}, {imm8}",
11244            k = lateout(kreg) dst,
11245            mask = in(kreg) $mask,
11246            src = in($reg) $a,
11247            imm8 = const IMM8,
11248            options(pure, nomem, nostack)
11249        );
11250        dst
11251    }};
11252}
11253
11254/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11255/// by imm8, and store the results in mask vector k.
11256/// imm can be a combination of:
11257///
11258///     0x01 // QNaN
11259///     0x02 // Positive Zero
11260///     0x04 // Negative Zero
11261///     0x08 // Positive Infinity
11262///     0x10 // Negative Infinity
11263///     0x20 // Denormal
11264///     0x40 // Negative
11265///     0x80 // SNaN
11266///
11267/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_ph_mask)
11268#[inline]
11269#[target_feature(enable = "avx512fp16,avx512vl")]
11270#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11271#[rustc_legacy_const_generics(1)]
11272#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11273pub fn _mm_fpclass_ph_mask<const IMM8: i32>(a: __m128h) -> __mmask8 {
11274    unsafe {
11275        static_assert_uimm_bits!(IMM8, 8);
11276        fpclass_asm!(__mmask8, xmm_reg, a)
11277    }
11278}
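
// Illustrative note (not part of the original source): the IMM8 categories listed above
// are bit flags and may be OR-ed together; a lane's mask bit is set when the value falls
// into any of the selected categories. A hypothetical "is NaN" test combining the QNaN
// and SNaN bits:
//
//     // const ANY_NAN: i32 = 0x01 | 0x80; // QNaN | SNaN
//     // let k = _mm_fpclass_ph_mask::<ANY_NAN>(a); // bit i set => a[i] is a NaN
//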
11279
11280/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11281/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the
11282/// corresponding mask bit is not set).
11283/// imm can be a combination of:
11284///
11285///     0x01 // QNaN
11286///     0x02 // Positive Zero
11287///     0x04 // Negative Zero
11288///     0x08 // Positive Infinity
11289///     0x10 // Negative Infinity
11290///     0x20 // Denormal
11291///     0x40 // Negative
11292///     0x80 // SNaN
11293///
11294/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_ph_mask)
11295#[inline]
11296#[target_feature(enable = "avx512fp16,avx512vl")]
11297#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11298#[rustc_legacy_const_generics(2)]
11299#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11300pub fn _mm_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask8, a: __m128h) -> __mmask8 {
11301    unsafe {
11302        static_assert_uimm_bits!(IMM8, 8);
11303        fpclass_asm!(__mmask8, k1, xmm_reg, a)
11304    }
11305}
11306
11307/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11308/// by imm8, and store the results in mask vector k.
11309/// imm can be a combination of:
11310///
11311///     0x01 // QNaN
11312///     0x02 // Positive Zero
11313///     0x04 // Negative Zero
11314///     0x08 // Positive Infinity
11315///     0x10 // Negative Infinity
11316///     0x20 // Denormal
11317///     0x40 // Negative
11318///     0x80 // SNaN
11319///
11320/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fpclass_ph_mask)
11321#[inline]
11322#[target_feature(enable = "avx512fp16,avx512vl")]
11323#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11324#[rustc_legacy_const_generics(1)]
11325#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11326pub fn _mm256_fpclass_ph_mask<const IMM8: i32>(a: __m256h) -> __mmask16 {
11327    unsafe {
11328        static_assert_uimm_bits!(IMM8, 8);
11329        fpclass_asm!(__mmask16, ymm_reg, a)
11330    }
11331}
11332
11333/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11334/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the
11335/// corresponding mask bit is not set).
11336/// imm can be a combination of:
11337///
11338///     0x01 // QNaN
11339///     0x02 // Positive Zero
11340///     0x04 // Negative Zero
11341///     0x08 // Positive Infinity
11342///     0x10 // Negative Infinity
11343///     0x20 // Denormal
11344///     0x40 // Negative
11345///     0x80 // SNaN
11346///
11347/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fpclass_ph_mask)
11348#[inline]
11349#[target_feature(enable = "avx512fp16,avx512vl")]
11350#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11351#[rustc_legacy_const_generics(2)]
11352#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11353pub fn _mm256_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask16, a: __m256h) -> __mmask16 {
11354    unsafe {
11355        static_assert_uimm_bits!(IMM8, 8);
11356        fpclass_asm!(__mmask16, k1, ymm_reg, a)
11357    }
11358}
11359
11360/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11361/// by imm8, and store the results in mask vector k.
11362/// imm can be a combination of:
11363///
11364///     0x01 // QNaN
11365///     0x02 // Positive Zero
11366///     0x04 // Negative Zero
11367///     0x08 // Positive Infinity
11368///     0x10 // Negative Infinity
11369///     0x20 // Denormal
11370///     0x40 // Negative
11371///     0x80 // SNaN
11372///
11373/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fpclass_ph_mask)
11374#[inline]
11375#[target_feature(enable = "avx512fp16")]
11376#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11377#[rustc_legacy_const_generics(1)]
11378#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11379pub fn _mm512_fpclass_ph_mask<const IMM8: i32>(a: __m512h) -> __mmask32 {
11380    unsafe {
11381        static_assert_uimm_bits!(IMM8, 8);
11382        fpclass_asm!(__mmask32, zmm_reg, a)
11383    }
11384}
11385
11386/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11387/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the
11388/// corresponding mask bit is not set).
11389/// imm can be a combination of:
11390///
11391///     0x01 // QNaN
11392///     0x02 // Positive Zero
11393///     0x04 // Negative Zero
11394///     0x08 // Positive Infinity
11395///     0x10 // Negative Infinity
11396///     0x20 // Denormal
11397///     0x40 // Negative
11398///     0x80 // SNaN
11399///
11400/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fpclass_ph_mask)
11401#[inline]
11402#[target_feature(enable = "avx512fp16")]
11403#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11404#[rustc_legacy_const_generics(2)]
11405#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11406pub fn _mm512_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask32, a: __m512h) -> __mmask32 {
11407    unsafe {
11408        static_assert_uimm_bits!(IMM8, 8);
11409        fpclass_asm!(__mmask32, k1, zmm_reg, a)
11410    }
11411}
11412
11413/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified
11414/// by imm8, and store the result in mask vector k.
11415/// imm can be a combination of:
11416///
11417///     0x01 // QNaN
11418///     0x02 // Positive Zero
11419///     0x04 // Negative Zero
11420///     0x08 // Positive Infinity
11421///     0x10 // Negative Infinity
11422///     0x20 // Denormal
11423///     0x40 // Negative
11424///     0x80 // SNaN
11425///
11426/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_sh_mask)
11427#[inline]
11428#[target_feature(enable = "avx512fp16")]
11429#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))]
11430#[rustc_legacy_const_generics(1)]
11431#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11432pub fn _mm_fpclass_sh_mask<const IMM8: i32>(a: __m128h) -> __mmask8 {
11433    _mm_mask_fpclass_sh_mask::<IMM8>(0xff, a)
11434}
11435
11436/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified
11437/// by imm8, and store the result in mask vector k using zeromask k (elements are zeroed out when the
11438/// corresponding mask bit is not set).
11439/// imm can be a combination of:
11440///
11441///     0x01 // QNaN
11442///     0x02 // Positive Zero
11443///     0x04 // Negative Zero
11444///     0x08 // Positive Infinity
11445///     0x10 // Negative Infinity
11446///     0x20 // Denormal
11447///     0x40 // Negative
11448///     0x80 // SNaN
11449///
11450/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_sh_mask)
11451#[inline]
11452#[target_feature(enable = "avx512fp16")]
11453#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))]
11454#[rustc_legacy_const_generics(2)]
11455#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11456pub fn _mm_mask_fpclass_sh_mask<const IMM8: i32>(k1: __mmask8, a: __m128h) -> __mmask8 {
11457    unsafe {
11458        static_assert_uimm_bits!(IMM8, 8);
11459        vfpclasssh(a, IMM8, k1)
11460    }
11461}
11462
11463/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
11464/// and store the results in dst.
11465///
11466/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_ph)
11467#[inline]
11468#[target_feature(enable = "avx512fp16,avx512vl")]
11469#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11470pub fn _mm_mask_blend_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
11471    unsafe { simd_select_bitmask(k, b, a) }
11472}
11473
11474/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
11475/// and store the results in dst.
11476///
11477/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_ph)
11478#[inline]
11479#[target_feature(enable = "avx512fp16,avx512vl")]
11480#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11481pub fn _mm256_mask_blend_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
11482    unsafe { simd_select_bitmask(k, b, a) }
11483}
11484
11485/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
11486/// and store the results in dst.
11487///
11488/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ph)
11489#[inline]
11490#[target_feature(enable = "avx512fp16")]
11491#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11492pub fn _mm512_mask_blend_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
11493    unsafe { simd_select_bitmask(k, b, a) }
11494}
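
// Illustrative note (not part of the original source): `simd_select_bitmask(k, b, a)`
// takes lane i from `b` when bit i of `k` is set and from `a` otherwise, matching the
// documented blend behaviour. A hypothetical 128-bit usage:
//
//     // let a = _mm_set1_ph(1.0);
//     // let b = _mm_set1_ph(2.0);
//     // // lanes 0 and 2 come from b, the rest from a
//     // let r = _mm_mask_blend_ph(0b0000_0101, a, b);
//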
11495
11496/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
11497/// and index in idx, and store the results in dst.
11498///
11499/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_ph)
11500#[inline]
11501#[target_feature(enable = "avx512fp16,avx512vl")]
11502#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11503pub fn _mm_permutex2var_ph(a: __m128h, idx: __m128i, b: __m128h) -> __m128h {
11504    _mm_castsi128_ph(_mm_permutex2var_epi16(
11505        _mm_castph_si128(a),
11506        idx,
11507        _mm_castph_si128(b),
11508    ))
11509}
11510
11511/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
11512/// and index in idx, and store the results in dst.
11513///
11514/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_ph)
11515#[inline]
11516#[target_feature(enable = "avx512fp16,avx512vl")]
11517#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11518pub fn _mm256_permutex2var_ph(a: __m256h, idx: __m256i, b: __m256h) -> __m256h {
11519    _mm256_castsi256_ph(_mm256_permutex2var_epi16(
11520        _mm256_castph_si256(a),
11521        idx,
11522        _mm256_castph_si256(b),
11523    ))
11524}
11525
11526/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
11527/// and index in idx, and store the results in dst.
11528///
11529/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_ph)
11530#[inline]
11531#[target_feature(enable = "avx512fp16")]
11532#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11533pub fn _mm512_permutex2var_ph(a: __m512h, idx: __m512i, b: __m512h) -> __m512h {
11534    _mm512_castsi512_ph(_mm512_permutex2var_epi16(
11535        _mm512_castph_si512(a),
11536        idx,
11537        _mm512_castph_si512(b),
11538    ))
11539}
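
// Illustrative note (not part of the original source): these wrappers reinterpret the
// half-precision lanes as 16-bit integers and defer to the `permutex2var_epi16` family.
// Each 16-bit lane of `idx` selects one element from the concatenation of `b` and `a`:
// the low index bits pick the element and the next bit picks the source (0 = a, 1 = b).
// A hypothetical identity permute for the 128-bit case:
//
//     // let idx = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
//     // let r = _mm_permutex2var_ph(a, idx, b); // r == a
//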
11540
11541/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
11542/// and store the results in dst.
11543///
11544/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutexvar_ph)
11545#[inline]
11546#[target_feature(enable = "avx512fp16,avx512vl")]
11547#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11548pub fn _mm_permutexvar_ph(idx: __m128i, a: __m128h) -> __m128h {
11549    _mm_castsi128_ph(_mm_permutexvar_epi16(idx, _mm_castph_si128(a)))
11550}
11551
11552/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
11553/// and store the results in dst.
11554///
11555/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_ph)
11556#[inline]
11557#[target_feature(enable = "avx512fp16,avx512vl")]
11558#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11559pub fn _mm256_permutexvar_ph(idx: __m256i, a: __m256h) -> __m256h {
11560    _mm256_castsi256_ph(_mm256_permutexvar_epi16(idx, _mm256_castph_si256(a)))
11561}
11562
11563/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
11564/// and store the results in dst.
11565///
11566/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_ph)
11567#[inline]
11568#[target_feature(enable = "avx512fp16")]
11569#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11570pub fn _mm512_permutexvar_ph(idx: __m512i, a: __m512h) -> __m512h {
11571    _mm512_castsi512_ph(_mm512_permutexvar_epi16(idx, _mm512_castph_si512(a)))
11572}
11573
11574/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11575/// and store the results in dst.
11576///
11577/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_ph)
11578#[inline]
11579#[target_feature(enable = "avx512fp16,avx512vl")]
11580#[cfg_attr(test, assert_instr(vcvtw2ph))]
11581#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11582pub fn _mm_cvtepi16_ph(a: __m128i) -> __m128h {
11583    unsafe { vcvtw2ph_128(a.as_i16x8(), _MM_FROUND_CUR_DIRECTION) }
11584}
11585
11586/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11587/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11588/// mask bit is not set).
11589///
11590/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi16_ph)
11591#[inline]
11592#[target_feature(enable = "avx512fp16,avx512vl")]
11593#[cfg_attr(test, assert_instr(vcvtw2ph))]
11594#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11595pub fn _mm_mask_cvtepi16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
11596    unsafe { simd_select_bitmask(k, _mm_cvtepi16_ph(a), src) }
11597}
11598
11599/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11600/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11601///
11602/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi16_ph)
11603#[inline]
11604#[target_feature(enable = "avx512fp16,avx512vl")]
11605#[cfg_attr(test, assert_instr(vcvtw2ph))]
11606#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11607pub fn _mm_maskz_cvtepi16_ph(k: __mmask8, a: __m128i) -> __m128h {
11608    _mm_mask_cvtepi16_ph(_mm_setzero_ph(), k, a)
11609}
11610
11611/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11612/// and store the results in dst.
11613///
11614/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_ph)
11615#[inline]
11616#[target_feature(enable = "avx512fp16,avx512vl")]
11617#[cfg_attr(test, assert_instr(vcvtw2ph))]
11618#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11619pub fn _mm256_cvtepi16_ph(a: __m256i) -> __m256h {
11620    unsafe { vcvtw2ph_256(a.as_i16x16(), _MM_FROUND_CUR_DIRECTION) }
11621}
11622
11623/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11624/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11625/// mask bit is not set).
11626///
11627/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi16_ph)
11628#[inline]
11629#[target_feature(enable = "avx512fp16,avx512vl")]
11630#[cfg_attr(test, assert_instr(vcvtw2ph))]
11631#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11632pub fn _mm256_mask_cvtepi16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h {
11633    unsafe { simd_select_bitmask(k, _mm256_cvtepi16_ph(a), src) }
11634}
11635
11636/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11637/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11638///
11639/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi16_ph)
11640#[inline]
11641#[target_feature(enable = "avx512fp16,avx512vl")]
11642#[cfg_attr(test, assert_instr(vcvtw2ph))]
11643#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11644pub fn _mm256_maskz_cvtepi16_ph(k: __mmask16, a: __m256i) -> __m256h {
11645    _mm256_mask_cvtepi16_ph(_mm256_setzero_ph(), k, a)
11646}
11647
11648/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11649/// and store the results in dst.
11650///
11651/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi16_ph)
11652#[inline]
11653#[target_feature(enable = "avx512fp16")]
11654#[cfg_attr(test, assert_instr(vcvtw2ph))]
11655#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11656pub fn _mm512_cvtepi16_ph(a: __m512i) -> __m512h {
11657    unsafe { vcvtw2ph_512(a.as_i16x32(), _MM_FROUND_CUR_DIRECTION) }
11658}
11659
11660/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11661/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11662/// mask bit is not set).
11663///
11664/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi16_ph)
11665#[inline]
11666#[target_feature(enable = "avx512fp16")]
11667#[cfg_attr(test, assert_instr(vcvtw2ph))]
11668#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11669pub fn _mm512_mask_cvtepi16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h {
11670    unsafe { simd_select_bitmask(k, _mm512_cvtepi16_ph(a), src) }
11671}
11672
11673/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11674/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11675///
11676/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi16_ph)
11677#[inline]
11678#[target_feature(enable = "avx512fp16")]
11679#[cfg_attr(test, assert_instr(vcvtw2ph))]
11680#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11681pub fn _mm512_maskz_cvtepi16_ph(k: __mmask32, a: __m512i) -> __m512h {
11682    _mm512_mask_cvtepi16_ph(_mm512_setzero_ph(), k, a)
11683}
11684
11685/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11686/// and store the results in dst.
11687///
11688/// Rounding is done according to the rounding parameter, which can be one of:
11689///
11690/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11691/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11692/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11693/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11694/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11695///
11696/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi16_ph)
11697#[inline]
11698#[target_feature(enable = "avx512fp16")]
11699#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
11700#[rustc_legacy_const_generics(1)]
11701#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11702pub fn _mm512_cvt_roundepi16_ph<const ROUNDING: i32>(a: __m512i) -> __m512h {
11703    unsafe {
11704        static_assert_rounding!(ROUNDING);
11705        vcvtw2ph_512(a.as_i16x32(), ROUNDING)
11706    }
11707}
11708
11709/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11710/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11711/// mask bit is not set).
11712///
11713/// Rounding is done according to the rounding parameter, which can be one of:
11714///
11715/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11716/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11717/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11718/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11719/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11720///
11721/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi16_ph)
11722#[inline]
11723#[target_feature(enable = "avx512fp16")]
11724#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
11725#[rustc_legacy_const_generics(3)]
11726#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11727pub fn _mm512_mask_cvt_roundepi16_ph<const ROUNDING: i32>(
11728    src: __m512h,
11729    k: __mmask32,
11730    a: __m512i,
11731) -> __m512h {
11732    unsafe {
11733        static_assert_rounding!(ROUNDING);
11734        simd_select_bitmask(k, _mm512_cvt_roundepi16_ph::<ROUNDING>(a), src)
11735    }
11736}
11737
11738/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11739/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11740///
11741/// Rounding is done according to the rounding parameter, which can be one of:
11742///
11743/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11744/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11745/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11746/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11747/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11748///
11749/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi16_ph)
11750#[inline]
11751#[target_feature(enable = "avx512fp16")]
11752#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
11753#[rustc_legacy_const_generics(2)]
11754#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11755pub fn _mm512_maskz_cvt_roundepi16_ph<const ROUNDING: i32>(k: __mmask32, a: __m512i) -> __m512h {
11756    static_assert_rounding!(ROUNDING);
11757    _mm512_mask_cvt_roundepi16_ph::<ROUNDING>(_mm512_setzero_ph(), k, a)
11758}
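
// Illustrative sketch (editor-added, not part of the upstream source): one way a caller
// compiled with `avx512fp16` might use the rounding-controlled conversion above. The helper
// name and signature are assumptions for the example only; the rounding constant matches the
// mode exercised by the `assert_instr` tests (`ROUNDING = 8`).
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_cvt_roundepi16_ph(a: __m512i) -> __m512h {
    // Convert 32 packed i16 lanes to f16, rounding to nearest and suppressing exceptions.
    _mm512_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a)
}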
11759
11760/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11761/// and store the results in dst.
11762///
11763/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_ph)
11764#[inline]
11765#[target_feature(enable = "avx512fp16,avx512vl")]
11766#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11767#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11768pub fn _mm_cvtepu16_ph(a: __m128i) -> __m128h {
11769    unsafe { vcvtuw2ph_128(a.as_u16x8(), _MM_FROUND_CUR_DIRECTION) }
11770}
11771
11772/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11773/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11774/// mask bit is not set).
11775///
11776/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu16_ph)
11777#[inline]
11778#[target_feature(enable = "avx512fp16,avx512vl")]
11779#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11780#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11781pub fn _mm_mask_cvtepu16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
11782    unsafe { simd_select_bitmask(k, _mm_cvtepu16_ph(a), src) }
11783}
11784
11785/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11786/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11787///
11788/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu16_ph)
11789#[inline]
11790#[target_feature(enable = "avx512fp16,avx512vl")]
11791#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11792#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11793pub fn _mm_maskz_cvtepu16_ph(k: __mmask8, a: __m128i) -> __m128h {
11794    _mm_mask_cvtepu16_ph(_mm_setzero_ph(), k, a)
11795}
11796
11797/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11798/// and store the results in dst.
11799///
11800/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_ph)
11801#[inline]
11802#[target_feature(enable = "avx512fp16,avx512vl")]
11803#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11804#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11805pub fn _mm256_cvtepu16_ph(a: __m256i) -> __m256h {
11806    unsafe { vcvtuw2ph_256(a.as_u16x16(), _MM_FROUND_CUR_DIRECTION) }
11807}
11808
11809/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11810/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11811/// mask bit is not set).
11812///
11813/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu16_ph)
11814#[inline]
11815#[target_feature(enable = "avx512fp16,avx512vl")]
11816#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11817#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11818pub fn _mm256_mask_cvtepu16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h {
11819    unsafe { simd_select_bitmask(k, _mm256_cvtepu16_ph(a), src) }
11820}
11821
11822/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11823/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11824///
11825/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu16_ph)
11826#[inline]
11827#[target_feature(enable = "avx512fp16,avx512vl")]
11828#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11829#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11830pub fn _mm256_maskz_cvtepu16_ph(k: __mmask16, a: __m256i) -> __m256h {
11831    _mm256_mask_cvtepu16_ph(_mm256_setzero_ph(), k, a)
11832}
11833
11834/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11835/// and store the results in dst.
11836///
11837/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu16_ph)
11838#[inline]
11839#[target_feature(enable = "avx512fp16")]
11840#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11841#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11842pub fn _mm512_cvtepu16_ph(a: __m512i) -> __m512h {
11843    unsafe { vcvtuw2ph_512(a.as_u16x32(), _MM_FROUND_CUR_DIRECTION) }
11844}
11845
11846/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11847/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11848/// mask bit is not set).
11849///
11850/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu16_ph)
11851#[inline]
11852#[target_feature(enable = "avx512fp16")]
11853#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11854#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11855pub fn _mm512_mask_cvtepu16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h {
11856    unsafe { simd_select_bitmask(k, _mm512_cvtepu16_ph(a), src) }
11857}
11858
11859/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11860/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11861///
11862/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu16_ph)
11863#[inline]
11864#[target_feature(enable = "avx512fp16")]
11865#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11866#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11867pub fn _mm512_maskz_cvtepu16_ph(k: __mmask32, a: __m512i) -> __m512h {
11868    _mm512_mask_cvtepu16_ph(_mm512_setzero_ph(), k, a)
11869}
11870
11871/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11872/// and store the results in dst.
11873///
11874/// Rounding is done according to the rounding parameter, which can be one of:
11875///
11876/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11877/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11878/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11879/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11880/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11881///
11882/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu16_ph)
11883#[inline]
11884#[target_feature(enable = "avx512fp16")]
11885#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
11886#[rustc_legacy_const_generics(1)]
11887#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11888pub fn _mm512_cvt_roundepu16_ph<const ROUNDING: i32>(a: __m512i) -> __m512h {
11889    unsafe {
11890        static_assert_rounding!(ROUNDING);
11891        vcvtuw2ph_512(a.as_u16x32(), ROUNDING)
11892    }
11893}
11894
11895/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11896/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11897/// mask bit is not set).
11898///
11899/// Rounding is done according to the rounding parameter, which can be one of:
11900///
11901/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11902/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11903/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11904/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11905/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11906///
11907/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu16_ph)
11908#[inline]
11909#[target_feature(enable = "avx512fp16")]
11910#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
11911#[rustc_legacy_const_generics(3)]
11912#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11913pub fn _mm512_mask_cvt_roundepu16_ph<const ROUNDING: i32>(
11914    src: __m512h,
11915    k: __mmask32,
11916    a: __m512i,
11917) -> __m512h {
11918    unsafe {
11919        static_assert_rounding!(ROUNDING);
11920        simd_select_bitmask(k, _mm512_cvt_roundepu16_ph::<ROUNDING>(a), src)
11921    }
11922}
11923
11924/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11925/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11926///
11927/// Rounding is done according to the rounding parameter, which can be one of:
11928///
11929/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11930/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11931/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11932/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11933/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11934///
11935/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu16_ph)
11936#[inline]
11937#[target_feature(enable = "avx512fp16")]
11938#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
11939#[rustc_legacy_const_generics(2)]
11940#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11941pub fn _mm512_maskz_cvt_roundepu16_ph<const ROUNDING: i32>(k: __mmask32, a: __m512i) -> __m512h {
11942    static_assert_rounding!(ROUNDING);
11943    _mm512_mask_cvt_roundepu16_ph::<ROUNDING>(_mm512_setzero_ph(), k, a)
11944}
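
// Illustrative sketch (editor-added, not part of the upstream source): a write-masked merge
// with the 256-bit unsigned conversion defined above. The helper name is an assumption for
// the example only.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_mask_cvtepu16_ph(src: __m256h, a: __m256i) -> __m256h {
    // Convert only the low 8 of the 16 u16 lanes; the high 8 lanes are copied from `src`.
    _mm256_mask_cvtepu16_ph(src, 0x00ff, a)
}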
11945
11946/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
11947/// and store the results in dst. The upper 64 bits of dst are zeroed out.
11948///
11949/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_ph)
11950#[inline]
11951#[target_feature(enable = "avx512fp16,avx512vl")]
11952#[cfg_attr(test, assert_instr(vcvtdq2ph))]
11953#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11954pub fn _mm_cvtepi32_ph(a: __m128i) -> __m128h {
11955    _mm_mask_cvtepi32_ph(_mm_setzero_ph(), 0xff, a)
11956}
11957
11958/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
11959/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11960/// mask bit is not set). The upper 64 bits of dst are zeroed out.
11961///
11962/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_ph)
11963#[inline]
11964#[target_feature(enable = "avx512fp16,avx512vl")]
11965#[cfg_attr(test, assert_instr(vcvtdq2ph))]
11966#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11967pub fn _mm_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
11968    unsafe { vcvtdq2ph_128(a.as_i32x4(), src, k) }
11969}
11970
11971/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
11972/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11973/// The upper 64 bits of dst are zeroed out.
11974///
11975/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_ph)
11976#[inline]
11977#[target_feature(enable = "avx512fp16,avx512vl")]
11978#[cfg_attr(test, assert_instr(vcvtdq2ph))]
11979#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11980pub fn _mm_maskz_cvtepi32_ph(k: __mmask8, a: __m128i) -> __m128h {
11981    _mm_mask_cvtepi32_ph(_mm_setzero_ph(), k, a)
11982}
11983
11984/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
11985/// and store the results in dst.
11986///
11987/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_ph)
11988#[inline]
11989#[target_feature(enable = "avx512fp16,avx512vl")]
11990#[cfg_attr(test, assert_instr(vcvtdq2ph))]
11991#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11992pub fn _mm256_cvtepi32_ph(a: __m256i) -> __m128h {
11993    unsafe { vcvtdq2ph_256(a.as_i32x8(), _MM_FROUND_CUR_DIRECTION) }
11994}
11995
11996/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
11997/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11998/// mask bit is not set).
11999///
12000/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_ph)
12001#[inline]
12002#[target_feature(enable = "avx512fp16,avx512vl")]
12003#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12004#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12005pub fn _mm256_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
12006    unsafe { simd_select_bitmask(k, _mm256_cvtepi32_ph(a), src) }
12007}
12008
12009/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12010/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12011///
12012/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_ph)
12013#[inline]
12014#[target_feature(enable = "avx512fp16,avx512vl")]
12015#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12016#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12017pub fn _mm256_maskz_cvtepi32_ph(k: __mmask8, a: __m256i) -> __m128h {
12018    _mm256_mask_cvtepi32_ph(_mm_setzero_ph(), k, a)
12019}
12020
12021/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12022/// and store the results in dst.
12023///
12024/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_ph)
12025#[inline]
12026#[target_feature(enable = "avx512fp16")]
12027#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12028#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12029pub fn _mm512_cvtepi32_ph(a: __m512i) -> __m256h {
12030    unsafe { vcvtdq2ph_512(a.as_i32x16(), _MM_FROUND_CUR_DIRECTION) }
12031}
12032
12033/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12034/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12035/// mask bit is not set).
12036///
12037/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_ph)
12038#[inline]
12039#[target_feature(enable = "avx512fp16")]
12040#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12041#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12042pub fn _mm512_mask_cvtepi32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h {
12043    unsafe { simd_select_bitmask(k, _mm512_cvtepi32_ph(a), src) }
12044}
12045
12046/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12047/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12048///
12049/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi32_ph)
12050#[inline]
12051#[target_feature(enable = "avx512fp16")]
12052#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12053#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12054pub fn _mm512_maskz_cvtepi32_ph(k: __mmask16, a: __m512i) -> __m256h {
12055    _mm512_mask_cvtepi32_ph(_mm256_setzero_ph(), k, a)
12056}
12057
12058/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12059/// and store the results in dst.
12060///
12061/// Rounding is done according to the rounding parameter, which can be one of:
12062///
12063/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12064/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12065/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12066/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12067/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12068///
12069/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi32_ph)
12070#[inline]
12071#[target_feature(enable = "avx512fp16")]
12072#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
12073#[rustc_legacy_const_generics(1)]
12074#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12075pub fn _mm512_cvt_roundepi32_ph<const ROUNDING: i32>(a: __m512i) -> __m256h {
12076    unsafe {
12077        static_assert_rounding!(ROUNDING);
12078        vcvtdq2ph_512(a.as_i32x16(), ROUNDING)
12079    }
12080}
12081
12082/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12083/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12084/// mask bit is not set).
12085///
12086/// Rounding is done according to the rounding parameter, which can be one of:
12087///
12088/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12089/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12090/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12091/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12092/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12093///
12094/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi32_ph)
12095#[inline]
12096#[target_feature(enable = "avx512fp16")]
12097#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
12098#[rustc_legacy_const_generics(3)]
12099#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12100pub fn _mm512_mask_cvt_roundepi32_ph<const ROUNDING: i32>(
12101    src: __m256h,
12102    k: __mmask16,
12103    a: __m512i,
12104) -> __m256h {
12105    unsafe {
12106        static_assert_rounding!(ROUNDING);
12107        simd_select_bitmask(k, _mm512_cvt_roundepi32_ph::<ROUNDING>(a), src)
12108    }
12109}
12110
12111/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12112/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12113///
12114/// Rounding is done according to the rounding parameter, which can be one of:
12115///
12116/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12117/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12118/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12119/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12120/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12121///
12122/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi32_ph)
12123#[inline]
12124#[target_feature(enable = "avx512fp16")]
12125#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
12126#[rustc_legacy_const_generics(2)]
12127#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12128pub fn _mm512_maskz_cvt_roundepi32_ph<const ROUNDING: i32>(k: __mmask16, a: __m512i) -> __m256h {
12129    static_assert_rounding!(ROUNDING);
12130    _mm512_mask_cvt_roundepi32_ph::<ROUNDING>(_mm256_setzero_ph(), k, a)
12131}
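
// Illustrative sketch (editor-added, not part of the upstream source): the 512-bit signed
// 32-bit conversion narrows sixteen i32 lanes into a 256-bit vector of f16, and the
// zero-masking variant clears any lane whose mask bit is 0. The helper name is an assumption
// for the example only.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_maskz_cvtepi32_ph(a: __m512i) -> __m256h {
    // Keep the even-indexed lanes; odd-indexed lanes become 0.0.
    _mm512_maskz_cvtepi32_ph(0b0101_0101_0101_0101, a)
}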
12132
12133/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the
12134/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
12135/// of dst.
12136///
12137/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvti32_sh)
12138#[inline]
12139#[target_feature(enable = "avx512fp16")]
12140#[cfg_attr(test, assert_instr(vcvtsi2sh))]
12141#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12142pub fn _mm_cvti32_sh(a: __m128h, b: i32) -> __m128h {
12143    unsafe { vcvtsi2sh(a, b, _MM_FROUND_CUR_DIRECTION) }
12144}
12145
12146/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the
12147/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
12148/// of dst.
12149///
12150/// Rounding is done according to the rounding parameter, which can be one of:
12151///
12152/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12153/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12154/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12155/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12156/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12157///
12158/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi32_sh)
12159#[inline]
12160#[target_feature(enable = "avx512fp16")]
12161#[cfg_attr(test, assert_instr(vcvtsi2sh, ROUNDING = 8))]
12162#[rustc_legacy_const_generics(2)]
12163#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12164pub fn _mm_cvt_roundi32_sh<const ROUNDING: i32>(a: __m128h, b: i32) -> __m128h {
12165    unsafe {
12166        static_assert_rounding!(ROUNDING);
12167        vcvtsi2sh(a, b, ROUNDING)
12168    }
12169}
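
// Illustrative sketch (editor-added, not part of the upstream source): inserting a converted
// scalar into the low lane while preserving the other seven f16 lanes. The helper name is an
// assumption for the example only.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_cvti32_sh(a: __m128h, b: i32) -> __m128h {
    // Lane 0 becomes `b` converted to f16 (rounded per MXCSR); lanes 1..=7 are copied from `a`.
    _mm_cvti32_sh(a, b)
}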
12170
12171/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12172/// and store the results in dst. The upper 64 bits of dst are zeroed out.
12173///
12174/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_ph)
12175#[inline]
12176#[target_feature(enable = "avx512fp16,avx512vl")]
12177#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12178#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12179pub fn _mm_cvtepu32_ph(a: __m128i) -> __m128h {
12180    _mm_mask_cvtepu32_ph(_mm_setzero_ph(), 0xff, a)
12181}
12182
12183/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12184/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12185/// mask bit is not set). The upper 64 bits of dst are zeroed out.
12186///
12187/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu32_ph)
12188#[inline]
12189#[target_feature(enable = "avx512fp16,avx512vl")]
12190#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12191#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12192pub fn _mm_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
12193    unsafe { vcvtudq2ph_128(a.as_u32x4(), src, k) }
12194}
12195
12196/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12197/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12198/// The upper 64 bits of dst are zeroed out.
12199///
12200/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu32_ph)
12201#[inline]
12202#[target_feature(enable = "avx512fp16,avx512vl")]
12203#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12204#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12205pub fn _mm_maskz_cvtepu32_ph(k: __mmask8, a: __m128i) -> __m128h {
12206    _mm_mask_cvtepu32_ph(_mm_setzero_ph(), k, a)
12207}
12208
12209/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12210/// and store the results in dst.
12211///
12212/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_ph)
12213#[inline]
12214#[target_feature(enable = "avx512fp16,avx512vl")]
12215#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12216#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12217pub fn _mm256_cvtepu32_ph(a: __m256i) -> __m128h {
12218    unsafe { vcvtudq2ph_256(a.as_u32x8(), _MM_FROUND_CUR_DIRECTION) }
12219}
12220
12221/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12222/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12223/// mask bit is not set).
12224///
12225/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu32_ph)
12226#[inline]
12227#[target_feature(enable = "avx512fp16,avx512vl")]
12228#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12229#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12230pub fn _mm256_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
12231    unsafe { simd_select_bitmask(k, _mm256_cvtepu32_ph(a), src) }
12232}
12233
12234/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12235/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12236///
12237/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu32_ph)
12238#[inline]
12239#[target_feature(enable = "avx512fp16,avx512vl")]
12240#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12241#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12242pub fn _mm256_maskz_cvtepu32_ph(k: __mmask8, a: __m256i) -> __m128h {
12243    _mm256_mask_cvtepu32_ph(_mm_setzero_ph(), k, a)
12244}
12245
12246/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12247/// and store the results in dst.
12248///
12249/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu32_ph)
12250#[inline]
12251#[target_feature(enable = "avx512fp16")]
12252#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12253#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12254pub fn _mm512_cvtepu32_ph(a: __m512i) -> __m256h {
12255    unsafe { vcvtudq2ph_512(a.as_u32x16(), _MM_FROUND_CUR_DIRECTION) }
12256}
12257
12258/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12259/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12260/// mask bit is not set).
12261///
12262/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu32_ph)
12263#[inline]
12264#[target_feature(enable = "avx512fp16")]
12265#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12266#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12267pub fn _mm512_mask_cvtepu32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h {
12268    unsafe { simd_select_bitmask(k, _mm512_cvtepu32_ph(a), src) }
12269}
12270
12271/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12272/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12273///
12274/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu32_ph)
12275#[inline]
12276#[target_feature(enable = "avx512fp16")]
12277#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12278#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12279pub fn _mm512_maskz_cvtepu32_ph(k: __mmask16, a: __m512i) -> __m256h {
12280    _mm512_mask_cvtepu32_ph(_mm256_setzero_ph(), k, a)
12281}
12282
12283/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12284/// and store the results in dst.
12285///
12286/// Rounding is done according to the rounding parameter, which can be one of:
12287///
12288/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12289/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12290/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12291/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12292/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12293///
12294/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu32_ph)
12295#[inline]
12296#[target_feature(enable = "avx512fp16")]
12297#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
12298#[rustc_legacy_const_generics(1)]
12299#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12300pub fn _mm512_cvt_roundepu32_ph<const ROUNDING: i32>(a: __m512i) -> __m256h {
12301    unsafe {
12302        static_assert_rounding!(ROUNDING);
12303        vcvtudq2ph_512(a.as_u32x16(), ROUNDING)
12304    }
12305}
12306
12307/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12308/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12309/// mask bit is not set).
12310///
12311/// Rounding is done according to the rounding parameter, which can be one of:
12312///
12313/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12314/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12315/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12316/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12317/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12318///
12319/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu32_ph)
12320#[inline]
12321#[target_feature(enable = "avx512fp16")]
12322#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
12323#[rustc_legacy_const_generics(3)]
12324#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12325pub fn _mm512_mask_cvt_roundepu32_ph<const ROUNDING: i32>(
12326    src: __m256h,
12327    k: __mmask16,
12328    a: __m512i,
12329) -> __m256h {
12330    unsafe {
12331        static_assert_rounding!(ROUNDING);
12332        simd_select_bitmask(k, _mm512_cvt_roundepu32_ph::<ROUNDING>(a), src)
12333    }
12334}
12335
12336/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12337/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12338///
12339/// Rounding is done according to the rounding parameter, which can be one of:
12340///
12341/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12342/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12343/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12344/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12345/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12346///
12347/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu32_ph)
12348#[inline]
12349#[target_feature(enable = "avx512fp16")]
12350#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
12351#[rustc_legacy_const_generics(2)]
12352#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12353pub fn _mm512_maskz_cvt_roundepu32_ph<const ROUNDING: i32>(k: __mmask16, a: __m512i) -> __m256h {
12354    static_assert_rounding!(ROUNDING);
12355    _mm512_mask_cvt_roundepu32_ph::<ROUNDING>(_mm256_setzero_ph(), k, a)
12356}
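
// Illustrative sketch (editor-added, not part of the upstream source): the unsigned 32-bit
// conversion with a statically selected truncating rounding mode instead of MXCSR. The
// helper name is an assumption for the example only.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_cvt_roundepu32_ph(a: __m512i) -> __m256h {
    // Round toward zero and suppress exceptions.
    _mm512_cvt_roundepu32_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a)
}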
12357
12358/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the
12359/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
12360/// of dst.
12361///
12362/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu32_sh)
12363#[inline]
12364#[target_feature(enable = "avx512fp16")]
12365#[cfg_attr(test, assert_instr(vcvtusi2sh))]
12366#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12367pub fn _mm_cvtu32_sh(a: __m128h, b: u32) -> __m128h {
12368    unsafe { vcvtusi2sh(a, b, _MM_FROUND_CUR_DIRECTION) }
12369}
12370
12371/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the
12372/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
12373/// of dst.
12374///
12375/// Rounding is done according to the rounding parameter, which can be one of:
12376///
12377/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12378/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12379/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12380/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12381/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12382///
12383/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu32_sh)
12384#[inline]
12385#[target_feature(enable = "avx512fp16")]
12386#[cfg_attr(test, assert_instr(vcvtusi2sh, ROUNDING = 8))]
12387#[rustc_legacy_const_generics(2)]
12388#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12389pub fn _mm_cvt_roundu32_sh<const ROUNDING: i32>(a: __m128h, b: u32) -> __m128h {
12390    unsafe {
12391        static_assert_rounding!(ROUNDING);
12392        vcvtusi2sh(a, b, ROUNDING)
12393    }
12394}
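
// Illustrative sketch (editor-added, not part of the upstream source): the scalar unsigned
// conversion with round-down selected at compile time. The helper name is an assumption for
// the example only.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_cvt_roundu32_sh(a: __m128h, b: u32) -> __m128h {
    // Lane 0 becomes `b` rounded toward negative infinity; lanes 1..=7 come from `a`.
    _mm_cvt_roundu32_sh::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a, b)
}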
12395
12396/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12397/// and store the results in dst. The upper 96 bits of dst are zeroed out.
12398///
12399/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi64_ph)
12400#[inline]
12401#[target_feature(enable = "avx512fp16,avx512vl")]
12402#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12403#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12404pub fn _mm_cvtepi64_ph(a: __m128i) -> __m128h {
12405    _mm_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a)
12406}
12407
12408/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12409/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12410/// mask bit is not set). The upper 96 bits of dst are zeroed out.
12411///
12412/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_ph)
12413#[inline]
12414#[target_feature(enable = "avx512fp16,avx512vl")]
12415#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12416#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12417pub fn _mm_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
12418    unsafe { vcvtqq2ph_128(a.as_i64x2(), src, k) }
12419}
12420
12421/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12422/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12423/// The upper 96 bits of dst are zeroed out.
12424///
12425/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi64_ph)
12426#[inline]
12427#[target_feature(enable = "avx512fp16,avx512vl")]
12428#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12429#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12430pub fn _mm_maskz_cvtepi64_ph(k: __mmask8, a: __m128i) -> __m128h {
12431    _mm_mask_cvtepi64_ph(_mm_setzero_ph(), k, a)
12432}
12433
12434/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12435/// and store the results in dst. The upper 64 bits of dst are zeroed out.
12436///
12437/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi64_ph)
12438#[inline]
12439#[target_feature(enable = "avx512fp16,avx512vl")]
12440#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12441#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12442pub fn _mm256_cvtepi64_ph(a: __m256i) -> __m128h {
12443    _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a)
12444}
12445
12446/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12447/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12448/// mask bit is not set). The upper 64 bits of dst are zeroed out.
12449///
12450/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_ph)
12451#[inline]
12452#[target_feature(enable = "avx512fp16,avx512vl")]
12453#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12454#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12455pub fn _mm256_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
12456    unsafe { vcvtqq2ph_256(a.as_i64x4(), src, k) }
12457}
12458
12459/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12460/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12461/// The upper 64 bits of dst are zeroed out.
12462///
12463/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi64_ph)
12464#[inline]
12465#[target_feature(enable = "avx512fp16,avx512vl")]
12466#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12467#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12468pub fn _mm256_maskz_cvtepi64_ph(k: __mmask8, a: __m256i) -> __m128h {
12469    _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), k, a)
12470}
12471
12472/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12473/// and store the results in dst.
12474///
12475/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi64_ph)
12476#[inline]
12477#[target_feature(enable = "avx512fp16")]
12478#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12479#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12480pub fn _mm512_cvtepi64_ph(a: __m512i) -> __m128h {
12481    unsafe { vcvtqq2ph_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION) }
12482}
12483
12484/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12485/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12486/// mask bit is not set).
12487///
12488/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_ph)
12489#[inline]
12490#[target_feature(enable = "avx512fp16")]
12491#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12492#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12493pub fn _mm512_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h {
12494    unsafe { simd_select_bitmask(k, _mm512_cvtepi64_ph(a), src) }
12495}
12496
12497/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12498/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12499///
12500/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi64_ph)
12501#[inline]
12502#[target_feature(enable = "avx512fp16")]
12503#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12504#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12505pub fn _mm512_maskz_cvtepi64_ph(k: __mmask8, a: __m512i) -> __m128h {
12506    _mm512_mask_cvtepi64_ph(_mm_setzero_ph(), k, a)
12507}
12508
12509/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12510/// and store the results in dst.
12511///
12512/// Rounding is done according to the rounding parameter, which can be one of:
12513///
12514/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12515/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12516/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12517/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12518/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12519///
12520/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi64_ph)
12521#[inline]
12522#[target_feature(enable = "avx512fp16")]
12523#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
12524#[rustc_legacy_const_generics(1)]
12525#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12526pub fn _mm512_cvt_roundepi64_ph<const ROUNDING: i32>(a: __m512i) -> __m128h {
12527    unsafe {
12528        static_assert_rounding!(ROUNDING);
12529        vcvtqq2ph_512(a.as_i64x8(), ROUNDING)
12530    }
12531}
12532
12533/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12534/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12535/// mask bit is not set).
12536///
12537/// Rounding is done according to the rounding parameter, which can be one of:
12538///
12539/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12540/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12541/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12542/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12543/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12544///
12545/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi64_ph)
12546#[inline]
12547#[target_feature(enable = "avx512fp16")]
12548#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
12549#[rustc_legacy_const_generics(3)]
12550#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12551pub fn _mm512_mask_cvt_roundepi64_ph<const ROUNDING: i32>(
12552    src: __m128h,
12553    k: __mmask8,
12554    a: __m512i,
12555) -> __m128h {
12556    unsafe {
12557        static_assert_rounding!(ROUNDING);
12558        simd_select_bitmask(k, _mm512_cvt_roundepi64_ph::<ROUNDING>(a), src)
12559    }
12560}
12561
12562/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12563/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12564///
12565/// Rounding is done according to the rounding parameter, which can be one of:
12566///
12567/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12568/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12569/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12570/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12571/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12572///
12573/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi64_ph)
12574#[inline]
12575#[target_feature(enable = "avx512fp16")]
12576#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
12577#[rustc_legacy_const_generics(2)]
12578#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12579pub fn _mm512_maskz_cvt_roundepi64_ph<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m128h {
12580    static_assert_rounding!(ROUNDING);
12581    _mm512_mask_cvt_roundepi64_ph::<ROUNDING>(_mm_setzero_ph(), k, a)
12582}
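
// Illustrative sketch (editor-added, not part of the upstream source): eight i64 lanes narrow
// into the eight f16 lanes of a 128-bit vector, so the whole __m128h is filled. The helper
// name is an assumption for the example only.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_cvtepi64_ph(a: __m512i) -> __m128h {
    // Uses the current MXCSR rounding mode.
    _mm512_cvtepi64_ph(a)
}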
12583
12584/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12585/// and store the results in dst. The upper 96 bits of dst are zeroed out.
12586///
12587/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu64_ph)
12588#[inline]
12589#[target_feature(enable = "avx512fp16,avx512vl")]
12590#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12591#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12592pub fn _mm_cvtepu64_ph(a: __m128i) -> __m128h {
12593    _mm_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a)
12594}
12595
12596/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12597/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12598/// mask bit is not set). The upper 96 bits of dst are zeroed out.
12599///
12600/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu64_ph)
12601#[inline]
12602#[target_feature(enable = "avx512fp16,avx512vl")]
12603#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12604#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12605pub fn _mm_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
12606    unsafe { vcvtuqq2ph_128(a.as_u64x2(), src, k) }
12607}
12608
12609/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12610/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12611/// The upper 96 bits of dst are zeroed out.
12612///
12613/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu64_ph)
12614#[inline]
12615#[target_feature(enable = "avx512fp16,avx512vl")]
12616#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12617#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12618pub fn _mm_maskz_cvtepu64_ph(k: __mmask8, a: __m128i) -> __m128h {
12619    _mm_mask_cvtepu64_ph(_mm_setzero_ph(), k, a)
12620}
12621
12622/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12623/// and store the results in dst. The upper 64 bits of dst are zeroed out.
12624///
12625/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu64_ph)
12626#[inline]
12627#[target_feature(enable = "avx512fp16,avx512vl")]
12628#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12629#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12630pub fn _mm256_cvtepu64_ph(a: __m256i) -> __m128h {
12631    _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a)
12632}
12633
12634/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12635/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12636/// mask bit is not set). The upper 64 bits of dst are zeroed out.
12637///
12638/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu64_ph)
12639#[inline]
12640#[target_feature(enable = "avx512fp16,avx512vl")]
12641#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12642#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12643pub fn _mm256_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
12644    unsafe { vcvtuqq2ph_256(a.as_u64x4(), src, k) }
12645}
12646
12647/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12648/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12649/// The upper 64 bits of dst are zeroed out.
12650///
12651/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu64_ph)
12652#[inline]
12653#[target_feature(enable = "avx512fp16,avx512vl")]
12654#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12655#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12656pub fn _mm256_maskz_cvtepu64_ph(k: __mmask8, a: __m256i) -> __m128h {
12657    _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), k, a)
12658}
12659
12660/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12661/// and store the results in dst.
12662///
12663/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu64_ph)
12664#[inline]
12665#[target_feature(enable = "avx512fp16")]
12666#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12667#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12668pub fn _mm512_cvtepu64_ph(a: __m512i) -> __m128h {
12669    unsafe { vcvtuqq2ph_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION) }
12670}
12671
12672/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12673/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12674/// mask bit is not set).
12675///
12676/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu64_ph)
12677#[inline]
12678#[target_feature(enable = "avx512fp16")]
12679#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12680#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12681pub fn _mm512_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h {
12682    unsafe { simd_select_bitmask(k, _mm512_cvtepu64_ph(a), src) }
12683}
12684
12685/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12686/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12687///
12688/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu64_ph)
12689#[inline]
12690#[target_feature(enable = "avx512fp16")]
12691#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12692#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12693pub fn _mm512_maskz_cvtepu64_ph(k: __mmask8, a: __m512i) -> __m128h {
12694    _mm512_mask_cvtepu64_ph(_mm_setzero_ph(), k, a)
12695}
12696
12697/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12698/// and store the results in dst.
12699///
12700/// Rounding is done according to the rounding parameter, which can be one of:
12701///
12702/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12703/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12704/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12705/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12706/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12707///
12708/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu64_ph)
12709#[inline]
12710#[target_feature(enable = "avx512fp16")]
12711#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
12712#[rustc_legacy_const_generics(1)]
12713#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12714pub fn _mm512_cvt_roundepu64_ph<const ROUNDING: i32>(a: __m512i) -> __m128h {
12715    unsafe {
12716        static_assert_rounding!(ROUNDING);
12717        vcvtuqq2ph_512(a.as_u64x8(), ROUNDING)
12718    }
12719}
12720
12721/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12722/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12723/// mask bit is not set).
12724///
12725/// Rounding is done according to the rounding parameter, which can be one of:
12726///
12727/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12728/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12729/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12730/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12731/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12732///
12733/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu64_ph)
12734#[inline]
12735#[target_feature(enable = "avx512fp16")]
12736#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
12737#[rustc_legacy_const_generics(3)]
12738#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12739pub fn _mm512_mask_cvt_roundepu64_ph<const ROUNDING: i32>(
12740    src: __m128h,
12741    k: __mmask8,
12742    a: __m512i,
12743) -> __m128h {
12744    unsafe {
12745        static_assert_rounding!(ROUNDING);
12746        simd_select_bitmask(k, _mm512_cvt_roundepu64_ph::<ROUNDING>(a), src)
12747    }
12748}
12749
12750/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12751/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12752///
12753/// Rounding is done according to the rounding parameter, which can be one of:
12754///
12755/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12756/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12757/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12758/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12759/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12760///
12761/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu64_ph)
12762#[inline]
12763#[target_feature(enable = "avx512fp16")]
12764#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
12765#[rustc_legacy_const_generics(2)]
12766#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12767pub fn _mm512_maskz_cvt_roundepu64_ph<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m128h {
12768    static_assert_rounding!(ROUNDING);
12769    _mm512_mask_cvt_roundepu64_ph::<ROUNDING>(_mm_setzero_ph(), k, a)
12770}
12771
12772/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12773/// floating-point elements, and store the results in dst.
12774///
12775/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxps_ph)
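///
/// A minimal illustrative sketch (not a doctest), assuming the `avx512fp16` and `avx512vl`
/// target features are enabled; `_mm_cvtsh_h` (from this module) is assumed here only to
/// read back the lowest lane:
///
/// ```ignore
/// // Four `f32` lanes: 1.0, 2.0, 3.0, 4.0 (lowest lane first).
/// let a = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
/// // Lanes 0..4 of `h` hold 1.0..4.0 as `f16`; the upper 64 bits are zeroed.
/// let h = _mm_cvtxps_ph(a);
/// assert_eq!(_mm_cvtsh_h(h), 1.0);
/// ```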
12776#[inline]
12777#[target_feature(enable = "avx512fp16,avx512vl")]
12778#[cfg_attr(test, assert_instr(vcvtps2phx))]
12779#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12780pub fn _mm_cvtxps_ph(a: __m128) -> __m128h {
12781    _mm_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a)
12782}
12783
12784/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12785/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
12786/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
12787///
12788/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxps_ph)
12789#[inline]
12790#[target_feature(enable = "avx512fp16,avx512vl")]
12791#[cfg_attr(test, assert_instr(vcvtps2phx))]
12792#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12793pub fn _mm_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m128) -> __m128h {
12794    unsafe { vcvtps2phx_128(a, src, k) }
12795}
12796
12797/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12798/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
12799/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
12800///
12801/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxps_ph)
12802#[inline]
12803#[target_feature(enable = "avx512fp16,avx512vl")]
12804#[cfg_attr(test, assert_instr(vcvtps2phx))]
12805#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12806pub fn _mm_maskz_cvtxps_ph(k: __mmask8, a: __m128) -> __m128h {
12807    _mm_mask_cvtxps_ph(_mm_setzero_ph(), k, a)
12808}
12809
12810/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12811/// floating-point elements, and store the results in dst.
12812///
12813/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxps_ph)
12814#[inline]
12815#[target_feature(enable = "avx512fp16,avx512vl")]
12816#[cfg_attr(test, assert_instr(vcvtps2phx))]
12817#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12818pub fn _mm256_cvtxps_ph(a: __m256) -> __m128h {
12819    _mm256_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a)
12820}
12821
12822/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12823/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
12824/// when the corresponding mask bit is not set).
12825///
12826/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxps_ph)
12827#[inline]
12828#[target_feature(enable = "avx512fp16,avx512vl")]
12829#[cfg_attr(test, assert_instr(vcvtps2phx))]
12830#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12831pub fn _mm256_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m256) -> __m128h {
12832    unsafe { vcvtps2phx_256(a, src, k) }
12833}
12834
12835/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12836/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
12837/// corresponding mask bit is not set).
12838///
12839/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxps_ph)
12840#[inline]
12841#[target_feature(enable = "avx512fp16,avx512vl")]
12842#[cfg_attr(test, assert_instr(vcvtps2phx))]
12843#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12844pub fn _mm256_maskz_cvtxps_ph(k: __mmask8, a: __m256) -> __m128h {
12845    _mm256_mask_cvtxps_ph(_mm_setzero_ph(), k, a)
12846}
12847
12848/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12849/// floating-point elements, and store the results in dst.
12850///
12851/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxps_ph)
12852#[inline]
12853#[target_feature(enable = "avx512fp16")]
12854#[cfg_attr(test, assert_instr(vcvtps2phx))]
12855#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12856pub fn _mm512_cvtxps_ph(a: __m512) -> __m256h {
12857    _mm512_mask_cvtxps_ph(_mm256_setzero_ph(), 0xffff, a)
12858}
12859
12860/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12861/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
12862/// when the corresponding mask bit is not set).
12863///
12864/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxps_ph)
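///
/// A minimal illustrative sketch (not a doctest) of the writemask behaviour, assuming the
/// `avx512fp16` target feature is enabled:
///
/// ```ignore
/// // Fallback values copied through wherever the mask bit is clear.
/// let src = _mm256_set1_ph(-1.0);
/// let a = _mm512_set1_ps(2.5);
/// // Even lanes are converted to 2.5; odd lanes keep -1.0 from `src`.
/// let h = _mm512_mask_cvtxps_ph(src, 0b0101_0101_0101_0101, a);
/// ```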
12865#[inline]
12866#[target_feature(enable = "avx512fp16")]
12867#[cfg_attr(test, assert_instr(vcvtps2phx))]
12868#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12869pub fn _mm512_mask_cvtxps_ph(src: __m256h, k: __mmask16, a: __m512) -> __m256h {
12870    unsafe { vcvtps2phx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
12871}
12872
12873/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12874/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
12875/// corresponding mask bit is not set).
12876///
12877/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxps_ph)
12878#[inline]
12879#[target_feature(enable = "avx512fp16")]
12880#[cfg_attr(test, assert_instr(vcvtps2phx))]
12881#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12882pub fn _mm512_maskz_cvtxps_ph(k: __mmask16, a: __m512) -> __m256h {
12883    _mm512_mask_cvtxps_ph(_mm256_setzero_ph(), k, a)
12884}
12885
12886/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12887/// floating-point elements, and store the results in dst.
12888///
12889/// Rounding is done according to the rounding parameter, which can be one of:
12890///
12891/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12892/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12893/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12894/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12895/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12896///
12897/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundps_ph)
12898#[inline]
12899#[target_feature(enable = "avx512fp16")]
12900#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
12901#[rustc_legacy_const_generics(1)]
12902#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12903pub fn _mm512_cvtx_roundps_ph<const ROUNDING: i32>(a: __m512) -> __m256h {
12904    static_assert_rounding!(ROUNDING);
12905    _mm512_mask_cvtx_roundps_ph::<ROUNDING>(_mm256_setzero_ph(), 0xffff, a)
12906}
12907
12908/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12909/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
12910/// when the corresponding mask bit is not set).
12911///
12912/// Rounding is done according to the rounding parameter, which can be one of:
12913///
12914/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12915/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12916/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12917/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12918/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12919///
12920/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundps_ph)
12921#[inline]
12922#[target_feature(enable = "avx512fp16")]
12923#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
12924#[rustc_legacy_const_generics(3)]
12925#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12926pub fn _mm512_mask_cvtx_roundps_ph<const ROUNDING: i32>(
12927    src: __m256h,
12928    k: __mmask16,
12929    a: __m512,
12930) -> __m256h {
12931    unsafe {
12932        static_assert_rounding!(ROUNDING);
12933        vcvtps2phx_512(a, src, k, ROUNDING)
12934    }
12935}
12936
12937/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12938/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
12939/// corresponding mask bit is not set).
12940///
12941/// Rounding is done according to the rounding parameter, which can be one of:
12942///
12943/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12944/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12945/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12946/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12947/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12948///
12949/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundps_ph)
12950#[inline]
12951#[target_feature(enable = "avx512fp16")]
12952#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
12953#[rustc_legacy_const_generics(2)]
12954#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12955pub fn _mm512_maskz_cvtx_roundps_ph<const ROUNDING: i32>(k: __mmask16, a: __m512) -> __m256h {
12956    static_assert_rounding!(ROUNDING);
12957    _mm512_mask_cvtx_roundps_ph::<ROUNDING>(_mm256_setzero_ph(), k, a)
12958}
12959
12960/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
12961/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
12962/// elements from a to the upper elements of dst.
12963///
12964/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sh)
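///
/// A minimal illustrative sketch (not a doctest), assuming the `avx512fp16` target feature
/// is enabled:
///
/// ```ignore
/// // `a` supplies the upper seven `f16` lanes of the result.
/// let a = _mm_set1_ph(7.0);
/// // Only the lowest `f32` lane of `b` is converted.
/// let b = _mm_set_ss(1.25);
/// // Lane 0 of `r` is 1.25 as `f16`; lanes 1..8 are 7.0, copied from `a`.
/// let r = _mm_cvtss_sh(a, b);
/// ```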
12965#[inline]
12966#[target_feature(enable = "avx512fp16")]
12967#[cfg_attr(test, assert_instr(vcvtss2sh))]
12968#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12969pub fn _mm_cvtss_sh(a: __m128h, b: __m128) -> __m128h {
12970    _mm_mask_cvtss_sh(_mm_undefined_ph(), 0xff, a, b)
12971}
12972
12973/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
12974/// floating-point element, store the result in the lower element of dst using writemask k (the element
12975/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
12976/// upper elements of dst.
12977///
12978/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtss_sh)
12979#[inline]
12980#[target_feature(enable = "avx512fp16")]
12981#[cfg_attr(test, assert_instr(vcvtss2sh))]
12982#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12983pub fn _mm_mask_cvtss_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128) -> __m128h {
12984    unsafe { vcvtss2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
12985}
12986
12987/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
12988/// floating-point element, store the result in the lower element of dst using zeromask k (the element
12989/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
12990/// elements of dst.
12991///
12992/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtss_sh)
12993#[inline]
12994#[target_feature(enable = "avx512fp16")]
12995#[cfg_attr(test, assert_instr(vcvtss2sh))]
12996#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12997pub fn _mm_maskz_cvtss_sh(k: __mmask8, a: __m128h, b: __m128) -> __m128h {
12998    _mm_mask_cvtss_sh(_mm_setzero_ph(), k, a, b)
12999}
13000
13001/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
13002/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
13003/// elements from a to the upper elements of dst.
13004///
13005/// Rounding is done according to the rounding parameter, which can be one of:
13006///
13007/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13008/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13009/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13010/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13011/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13012///
13013/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_sh)
13014#[inline]
13015#[target_feature(enable = "avx512fp16")]
13016#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
13017#[rustc_legacy_const_generics(2)]
13018#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13019pub fn _mm_cvt_roundss_sh<const ROUNDING: i32>(a: __m128h, b: __m128) -> __m128h {
13020    static_assert_rounding!(ROUNDING);
13021    _mm_mask_cvt_roundss_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
13022}
13023
13024/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
13025/// floating-point element, store the result in the lower element of dst using writemask k (the element
13026/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
13027/// upper elements of dst.
13028///
13029/// Rounding is done according to the rounding parameter, which can be one of:
13030///
13031/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13032/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13033/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13034/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13035/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13036///
13037/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundss_sh)
13038#[inline]
13039#[target_feature(enable = "avx512fp16")]
13040#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
13041#[rustc_legacy_const_generics(4)]
13042#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13043pub fn _mm_mask_cvt_roundss_sh<const ROUNDING: i32>(
13044    src: __m128h,
13045    k: __mmask8,
13046    a: __m128h,
13047    b: __m128,
13048) -> __m128h {
13049    unsafe {
13050        static_assert_rounding!(ROUNDING);
13051        vcvtss2sh(a, b, src, k, ROUNDING)
13052    }
13053}
13054
13055/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
13056/// floating-point element, store the result in the lower element of dst using zeromask k (the element
13057/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
13058/// elements of dst.
13059///
13060/// Rounding is done according to the rounding parameter, which can be one of:
13061///
13062/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13063/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13064/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13065/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13066/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13067///
13068/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundss_sh)
13069#[inline]
13070#[target_feature(enable = "avx512fp16")]
13071#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
13072#[rustc_legacy_const_generics(3)]
13073#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13074pub fn _mm_maskz_cvt_roundss_sh<const ROUNDING: i32>(
13075    k: __mmask8,
13076    a: __m128h,
13077    b: __m128,
13078) -> __m128h {
13079    static_assert_rounding!(ROUNDING);
13080    _mm_mask_cvt_roundss_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
13081}
13082
13083/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13084/// floating-point elements, and store the results in dst. The upper 96 bits of dst are zeroed out.
13085///
13086/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ph)
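///
/// A minimal illustrative sketch (not a doctest), assuming the `avx512fp16` and `avx512vl`
/// target features are enabled:
///
/// ```ignore
/// // Two `f64` lanes: 1.0 in the low lane, 2.0 in the high lane.
/// let a = _mm_set_pd(2.0, 1.0);
/// // Lanes 0 and 1 of `h` hold 1.0 and 2.0 as `f16`; the upper 96 bits are zeroed.
/// let h = _mm_cvtpd_ph(a);
/// ```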
13087#[inline]
13088#[target_feature(enable = "avx512fp16,avx512vl")]
13089#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13090#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13091pub fn _mm_cvtpd_ph(a: __m128d) -> __m128h {
13092    _mm_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a)
13093}
13094
13095/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13096/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
13097/// when the corresponding mask bit is not set). The upper 96 bits of dst are zeroed out.
13098///
13099/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpd_ph)
13100#[inline]
13101#[target_feature(enable = "avx512fp16,avx512vl")]
13102#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13103#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13104pub fn _mm_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m128d) -> __m128h {
13105    unsafe { vcvtpd2ph_128(a, src, k) }
13106}
13107
13108/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13109/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13110/// corresponding mask bit is not set). The upper 96 bits of dst are zeroed out.
13111///
13112/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpd_ph)
13113#[inline]
13114#[target_feature(enable = "avx512fp16,avx512vl")]
13115#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13116#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13117pub fn _mm_maskz_cvtpd_ph(k: __mmask8, a: __m128d) -> __m128h {
13118    _mm_mask_cvtpd_ph(_mm_setzero_ph(), k, a)
13119}
13120
13121/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13122/// floating-point elements, and store the results in dst. The upper 64 bits of dst are zeroed out.
13123///
13124/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_ph)
13125#[inline]
13126#[target_feature(enable = "avx512fp16,avx512vl")]
13127#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13128#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13129pub fn _mm256_cvtpd_ph(a: __m256d) -> __m128h {
13130    _mm256_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a)
13131}
13132
13133/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13134/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
13135/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
13136///
13137/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpd_ph)
13138#[inline]
13139#[target_feature(enable = "avx512fp16,avx512vl")]
13140#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13141#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13142pub fn _mm256_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m256d) -> __m128h {
13143    unsafe { vcvtpd2ph_256(a, src, k) }
13144}
13145
13146/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13147/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13148/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
13149///
13150/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpd_ph)
13151#[inline]
13152#[target_feature(enable = "avx512fp16,avx512vl")]
13153#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13154#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13155pub fn _mm256_maskz_cvtpd_ph(k: __mmask8, a: __m256d) -> __m128h {
13156    _mm256_mask_cvtpd_ph(_mm_setzero_ph(), k, a)
13157}
13158
13159/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13160/// floating-point elements, and store the results in dst.
13161///
13162/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_ph)
13163#[inline]
13164#[target_feature(enable = "avx512fp16")]
13165#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13166#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13167pub fn _mm512_cvtpd_ph(a: __m512d) -> __m128h {
13168    _mm512_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a)
13169}
13170
13171/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13172/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
13173/// when the corresponding mask bit is not set).
13174///
13175/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_ph)
13176#[inline]
13177#[target_feature(enable = "avx512fp16")]
13178#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13179#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13180pub fn _mm512_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m512d) -> __m128h {
13181    unsafe { vcvtpd2ph_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
13182}
13183
13184/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13185/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13186/// corresponding mask bit is not set).
13187///
13188/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_ph)
13189#[inline]
13190#[target_feature(enable = "avx512fp16")]
13191#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13192#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13193pub fn _mm512_maskz_cvtpd_ph(k: __mmask8, a: __m512d) -> __m128h {
13194    _mm512_mask_cvtpd_ph(_mm_setzero_ph(), k, a)
13195}
13196
13197/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13198/// floating-point elements, and store the results in dst.
13199///
13200/// Rounding is done according to the rounding parameter, which can be one of:
13201///
13202/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13203/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13204/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13205/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13206/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13207///
13208/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_ph)
13209#[inline]
13210#[target_feature(enable = "avx512fp16")]
13211#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
13212#[rustc_legacy_const_generics(1)]
13213#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13214pub fn _mm512_cvt_roundpd_ph<const ROUNDING: i32>(a: __m512d) -> __m128h {
13215    static_assert_rounding!(ROUNDING);
13216    _mm512_mask_cvt_roundpd_ph::<ROUNDING>(_mm_setzero_ph(), 0xff, a)
13217}
13218
13219/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13220/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
13221/// when the corresponding mask bit is not set).
13222///
13223/// Rounding is done according to the rounding parameter, which can be one of:
13224///
13225/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13226/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13227/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13228/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13229/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13230///
13231/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_ph)
13232#[inline]
13233#[target_feature(enable = "avx512fp16")]
13234#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
13235#[rustc_legacy_const_generics(3)]
13236#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13237pub fn _mm512_mask_cvt_roundpd_ph<const ROUNDING: i32>(
13238    src: __m128h,
13239    k: __mmask8,
13240    a: __m512d,
13241) -> __m128h {
13242    unsafe {
13243        static_assert_rounding!(ROUNDING);
13244        vcvtpd2ph_512(a, src, k, ROUNDING)
13245    }
13246}
13247
13248/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13249/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13250/// corresponding mask bit is not set).
13251///
13252/// Rounding is done according to the rounding parameter, which can be one of:
13253///
13254/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13255/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13256/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13257/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13258/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13259///
13260/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_ph)
13261#[inline]
13262#[target_feature(enable = "avx512fp16")]
13263#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
13264#[rustc_legacy_const_generics(2)]
13265#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13266pub fn _mm512_maskz_cvt_roundpd_ph<const ROUNDING: i32>(k: __mmask8, a: __m512d) -> __m128h {
13267    static_assert_rounding!(ROUNDING);
13268    _mm512_mask_cvt_roundpd_ph::<ROUNDING>(_mm_setzero_ph(), k, a)
13269}
13270
13271/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13272/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
13273/// elements from a to the upper elements of dst.
13274///
13275/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_sh)
13276#[inline]
13277#[target_feature(enable = "avx512fp16")]
13278#[cfg_attr(test, assert_instr(vcvtsd2sh))]
13279#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13280pub fn _mm_cvtsd_sh(a: __m128h, b: __m128d) -> __m128h {
13281    _mm_mask_cvtsd_sh(_mm_undefined_ph(), 0xff, a, b)
13282}
13283
13284/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13285/// floating-point element, store the result in the lower element of dst using writemask k (the element
13286/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
13287/// upper elements of dst.
13288///
13289/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsd_sh)
13290#[inline]
13291#[target_feature(enable = "avx512fp16")]
13292#[cfg_attr(test, assert_instr(vcvtsd2sh))]
13293#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13294pub fn _mm_mask_cvtsd_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128d) -> __m128h {
13295    unsafe { vcvtsd2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
13296}
13297
13298/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13299/// floating-point element, store the result in the lower element of dst using zeromask k (the element
13300/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
13301/// elements of dst.
13302///
13303/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsd_sh)
13304#[inline]
13305#[target_feature(enable = "avx512fp16")]
13306#[cfg_attr(test, assert_instr(vcvtsd2sh))]
13307#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13308pub fn _mm_maskz_cvtsd_sh(k: __mmask8, a: __m128h, b: __m128d) -> __m128h {
13309    _mm_mask_cvtsd_sh(_mm_setzero_ph(), k, a, b)
13310}
13311
13312/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13313/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
13314/// elements from a to the upper elements of dst.
13315///
13316/// Rounding is done according to the rounding parameter, which can be one of:
13317///
13318/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13319/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13320/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13321/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13322/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13323///
13324/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_sh)
13325#[inline]
13326#[target_feature(enable = "avx512fp16")]
13327#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
13328#[rustc_legacy_const_generics(2)]
13329#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13330pub fn _mm_cvt_roundsd_sh<const ROUNDING: i32>(a: __m128h, b: __m128d) -> __m128h {
13331    static_assert_rounding!(ROUNDING);
13332    _mm_mask_cvt_roundsd_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
13333}
13334
13335/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13336/// floating-point element, store the result in the lower element of dst using writemask k (the element
13337/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
13338/// upper elements of dst.
13339///
13340/// Rounding is done according to the rounding parameter, which can be one of:
13341///
13342/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13343/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13344/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13345/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13346/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13347///
13348/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsd_sh)
13349#[inline]
13350#[target_feature(enable = "avx512fp16")]
13351#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
13352#[rustc_legacy_const_generics(4)]
13353#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13354pub fn _mm_mask_cvt_roundsd_sh<const ROUNDING: i32>(
13355    src: __m128h,
13356    k: __mmask8,
13357    a: __m128h,
13358    b: __m128d,
13359) -> __m128h {
13360    unsafe {
13361        static_assert_rounding!(ROUNDING);
13362        vcvtsd2sh(a, b, src, k, ROUNDING)
13363    }
13364}
13365
13366/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13367/// floating-point element, store the result in the lower element of dst using zeromask k (the element
13368/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
13369/// elements of dst.
13370///
13371/// Rounding is done according to the rounding parameter, which can be one of:
13372///
13373/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13374/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13375/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13376/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13377/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13378///
13379/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsd_sh)
13380#[inline]
13381#[target_feature(enable = "avx512fp16")]
13382#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
13383#[rustc_legacy_const_generics(3)]
13384#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13385pub fn _mm_maskz_cvt_roundsd_sh<const ROUNDING: i32>(
13386    k: __mmask8,
13387    a: __m128h,
13388    b: __m128d,
13389) -> __m128h {
13390    static_assert_rounding!(ROUNDING);
13391    _mm_mask_cvt_roundsd_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
13392}
13393
13394/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13395/// store the results in dst.
13396///
13397/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi16)
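///
/// A minimal illustrative sketch (not a doctest), assuming the `avx512fp16` and `avx512vl`
/// target features are enabled; `_mm_extract_epi16` (SSE2) is used only to read back a lane:
///
/// ```ignore
/// let a = _mm_set1_ph(3.0);
/// // Eight 16-bit integer lanes, each holding 3; rounding follows `MXCSR.RC`.
/// let v = _mm_cvtph_epi16(a);
/// assert_eq!(_mm_extract_epi16::<0>(v), 3);
/// ```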
13398#[inline]
13399#[target_feature(enable = "avx512fp16,avx512vl")]
13400#[cfg_attr(test, assert_instr(vcvtph2w))]
13401#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13402pub fn _mm_cvtph_epi16(a: __m128h) -> __m128i {
13403    _mm_mask_cvtph_epi16(_mm_undefined_si128(), 0xff, a)
13404}
13405
13406/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13407/// store the results in dst using writemask k (elements are copied from src when the corresponding
13408/// mask bit is not set).
13409///
13410/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi16)
13411#[inline]
13412#[target_feature(enable = "avx512fp16,avx512vl")]
13413#[cfg_attr(test, assert_instr(vcvtph2w))]
13414#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13415pub fn _mm_mask_cvtph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
13416    unsafe { transmute(vcvtph2w_128(a, src.as_i16x8(), k)) }
13417}
13418
13419/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13420/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13421///
13422/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi16)
13423#[inline]
13424#[target_feature(enable = "avx512fp16,avx512vl")]
13425#[cfg_attr(test, assert_instr(vcvtph2w))]
13426#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13427pub fn _mm_maskz_cvtph_epi16(k: __mmask8, a: __m128h) -> __m128i {
13428    _mm_mask_cvtph_epi16(_mm_setzero_si128(), k, a)
13429}
13430
13431/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13432/// store the results in dst.
13433///
13434/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi16)
13435#[inline]
13436#[target_feature(enable = "avx512fp16,avx512vl")]
13437#[cfg_attr(test, assert_instr(vcvtph2w))]
13438#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13439pub fn _mm256_cvtph_epi16(a: __m256h) -> __m256i {
13440    _mm256_mask_cvtph_epi16(_mm256_undefined_si256(), 0xffff, a)
13441}
13442
13443/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13444/// store the results in dst using writemask k (elements are copied from src when the corresponding
13445/// mask bit is not set).
13446///
13447/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi16)
13448#[inline]
13449#[target_feature(enable = "avx512fp16,avx512vl")]
13450#[cfg_attr(test, assert_instr(vcvtph2w))]
13451#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13452pub fn _mm256_mask_cvtph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
13453    unsafe { transmute(vcvtph2w_256(a, src.as_i16x16(), k)) }
13454}
13455
13456/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13457/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13458///
13459/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi16)
13460#[inline]
13461#[target_feature(enable = "avx512fp16,avx512vl")]
13462#[cfg_attr(test, assert_instr(vcvtph2w))]
13463#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13464pub fn _mm256_maskz_cvtph_epi16(k: __mmask16, a: __m256h) -> __m256i {
13465    _mm256_mask_cvtph_epi16(_mm256_setzero_si256(), k, a)
13466}
13467
13468/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13469/// store the results in dst.
13470///
13471/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi16)
13472#[inline]
13473#[target_feature(enable = "avx512fp16")]
13474#[cfg_attr(test, assert_instr(vcvtph2w))]
13475#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13476pub fn _mm512_cvtph_epi16(a: __m512h) -> __m512i {
13477    _mm512_mask_cvtph_epi16(_mm512_undefined_epi32(), 0xffffffff, a)
13478}
13479
13480/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13481/// store the results in dst using writemask k (elements are copied from src when the corresponding
13482/// mask bit is not set).
13483///
13484/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi16)
13485#[inline]
13486#[target_feature(enable = "avx512fp16")]
13487#[cfg_attr(test, assert_instr(vcvtph2w))]
13488#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13489pub fn _mm512_mask_cvtph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
13490    unsafe {
13491        transmute(vcvtph2w_512(
13492            a,
13493            src.as_i16x32(),
13494            k,
13495            _MM_FROUND_CUR_DIRECTION,
13496        ))
13497    }
13498}
13499
13500/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13501/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13502///
13503/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi16)
13504#[inline]
13505#[target_feature(enable = "avx512fp16")]
13506#[cfg_attr(test, assert_instr(vcvtph2w))]
13507#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13508pub fn _mm512_maskz_cvtph_epi16(k: __mmask32, a: __m512h) -> __m512i {
13509    _mm512_mask_cvtph_epi16(_mm512_setzero_si512(), k, a)
13510}
13511
13512/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13513/// store the results in dst.
13514///
13515/// Rounding is done according to the rounding parameter, which can be one of:
13516///
13517/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13518/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13519/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13520/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13521/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13522///
13523/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi16)
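///
/// A minimal illustrative sketch (not a doctest) of how the rounding parameter changes the
/// result, assuming the `avx512fp16` target feature is enabled:
///
/// ```ignore
/// let a = _mm512_set1_ph(1.5);
/// // Round toward +infinity: every lane becomes 2.
/// let up = _mm512_cvt_roundph_epi16::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a);
/// // Round toward -infinity: every lane becomes 1.
/// let down = _mm512_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a);
/// ```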
13524#[inline]
13525#[target_feature(enable = "avx512fp16")]
13526#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
13527#[rustc_legacy_const_generics(1)]
13528#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13529pub fn _mm512_cvt_roundph_epi16<const ROUNDING: i32>(a: __m512h) -> __m512i {
13530    static_assert_rounding!(ROUNDING);
13531    _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_undefined_epi32(), 0xffffffff, a)
13532}
13533
13534/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13535/// store the results in dst using writemask k (elements are copied from src when the corresponding
13536/// mask bit is not set).
13537///
13538/// Rounding is done according to the rounding parameter, which can be one of:
13539///
13540/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13541/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13542/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13543/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13544/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13545///
13546/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi16)
13547#[inline]
13548#[target_feature(enable = "avx512fp16")]
13549#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
13550#[rustc_legacy_const_generics(3)]
13551#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13552pub fn _mm512_mask_cvt_roundph_epi16<const ROUNDING: i32>(
13553    src: __m512i,
13554    k: __mmask32,
13555    a: __m512h,
13556) -> __m512i {
13557    unsafe {
13558        static_assert_rounding!(ROUNDING);
13559        transmute(vcvtph2w_512(a, src.as_i16x32(), k, ROUNDING))
13560    }
13561}
13562
13563/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13564/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13565///
13566/// Rounding is done according to the rounding parameter, which can be one of:
13567///
13568/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13569/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13570/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13571/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13572/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13573///
13574/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi16)
13575#[inline]
13576#[target_feature(enable = "avx512fp16")]
13577#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
13578#[rustc_legacy_const_generics(2)]
13579#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13580pub fn _mm512_maskz_cvt_roundph_epi16<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512i {
13581    static_assert_rounding!(ROUNDING);
13582    _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_setzero_si512(), k, a)
13583}
13584
13585/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13586/// and store the results in dst.
13587///
13588/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu16)
13589#[inline]
13590#[target_feature(enable = "avx512fp16,avx512vl")]
13591#[cfg_attr(test, assert_instr(vcvtph2uw))]
13592#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13593pub fn _mm_cvtph_epu16(a: __m128h) -> __m128i {
13594    _mm_mask_cvtph_epu16(_mm_undefined_si128(), 0xff, a)
13595}
13596
13597/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13598/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13599/// mask bit is not set).
13600///
13601/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu16)
13602#[inline]
13603#[target_feature(enable = "avx512fp16,avx512vl")]
13604#[cfg_attr(test, assert_instr(vcvtph2uw))]
13605#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13606pub fn _mm_mask_cvtph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
13607    unsafe { transmute(vcvtph2uw_128(a, src.as_u16x8(), k)) }
13608}
13609
13610/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13611/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13612///
13613/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu16)
13614#[inline]
13615#[target_feature(enable = "avx512fp16,avx512vl")]
13616#[cfg_attr(test, assert_instr(vcvtph2uw))]
13617#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13618pub fn _mm_maskz_cvtph_epu16(k: __mmask8, a: __m128h) -> __m128i {
13619    _mm_mask_cvtph_epu16(_mm_setzero_si128(), k, a)
13620}
13621
13622/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13623/// and store the results in dst.
13624///
13625/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu16)
13626#[inline]
13627#[target_feature(enable = "avx512fp16,avx512vl")]
13628#[cfg_attr(test, assert_instr(vcvtph2uw))]
13629#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13630pub fn _mm256_cvtph_epu16(a: __m256h) -> __m256i {
13631    _mm256_mask_cvtph_epu16(_mm256_undefined_si256(), 0xffff, a)
13632}
13633
13634/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13635/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13636/// mask bit is not set).
13637///
13638/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu16)
13639#[inline]
13640#[target_feature(enable = "avx512fp16,avx512vl")]
13641#[cfg_attr(test, assert_instr(vcvtph2uw))]
13642#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13643pub fn _mm256_mask_cvtph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
13644    unsafe { transmute(vcvtph2uw_256(a, src.as_u16x16(), k)) }
13645}
13646
13647/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13648/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13649///
13650/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu16)
13651#[inline]
13652#[target_feature(enable = "avx512fp16,avx512vl")]
13653#[cfg_attr(test, assert_instr(vcvtph2uw))]
13654#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13655pub fn _mm256_maskz_cvtph_epu16(k: __mmask16, a: __m256h) -> __m256i {
13656    _mm256_mask_cvtph_epu16(_mm256_setzero_si256(), k, a)
13657}
13658
13659/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13660/// and store the results in dst.
13661///
13662/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu16)
13663#[inline]
13664#[target_feature(enable = "avx512fp16")]
13665#[cfg_attr(test, assert_instr(vcvtph2uw))]
13666#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13667pub fn _mm512_cvtph_epu16(a: __m512h) -> __m512i {
13668    _mm512_mask_cvtph_epu16(_mm512_undefined_epi32(), 0xffffffff, a)
13669}
13670
13671/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13672/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13673/// mask bit is not set).
13674///
13675/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu16)
13676#[inline]
13677#[target_feature(enable = "avx512fp16")]
13678#[cfg_attr(test, assert_instr(vcvtph2uw))]
13679#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13680pub fn _mm512_mask_cvtph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
13681    unsafe {
13682        transmute(vcvtph2uw_512(
13683            a,
13684            src.as_u16x32(),
13685            k,
13686            _MM_FROUND_CUR_DIRECTION,
13687        ))
13688    }
13689}
13690
13691/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13692/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13693///
13694/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu16)
13695#[inline]
13696#[target_feature(enable = "avx512fp16")]
13697#[cfg_attr(test, assert_instr(vcvtph2uw))]
13698#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13699pub fn _mm512_maskz_cvtph_epu16(k: __mmask32, a: __m512h) -> __m512i {
13700    _mm512_mask_cvtph_epu16(_mm512_setzero_si512(), k, a)
13701}
13702
13703/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13704/// and store the results in dst.
13705///
13706/// Rounding is done according to the rounding parameter, which can be one of:
13707///
13708/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13709/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13710/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13711/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13712/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13713///
13714/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu16)
13715#[inline]
13716#[target_feature(enable = "avx512fp16")]
13717#[cfg_attr(test, assert_instr(vcvtph2uw, ROUNDING = 8))]
13718#[rustc_legacy_const_generics(1)]
13719#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13720pub fn _mm512_cvt_roundph_epu16<const ROUNDING: i32>(a: __m512h) -> __m512i {
13721    static_assert_rounding!(ROUNDING);
13722    _mm512_mask_cvt_roundph_epu16::<ROUNDING>(_mm512_undefined_epi32(), 0xffffffff, a)
13723}
13724
13725/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13726/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13727/// mask bit is not set).
13728///
13729/// Rounding is done according to the rounding parameter, which can be one of:
13730///
13731/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13732/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13733/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13734/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13735/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13736///
13737/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu16)
13738#[inline]
13739#[target_feature(enable = "avx512fp16")]
13740#[cfg_attr(test, assert_instr(vcvtph2uw, ROUNDING = 8))]
13741#[rustc_legacy_const_generics(3)]
13742#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13743pub fn _mm512_mask_cvt_roundph_epu16<const ROUNDING: i32>(
13744    src: __m512i,
13745    k: __mmask32,
13746    a: __m512h,
13747) -> __m512i {
13748    unsafe {
13749        static_assert_rounding!(ROUNDING);
13750        transmute(vcvtph2uw_512(a, src.as_u16x32(), k, ROUNDING))
13751    }
13752}
13753
13754/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13755/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13756///
13757/// Rounding is done according to the rounding parameter, which can be one of:
13758///
13759/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13760/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13761/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13762/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13763/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13764///
13765/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu16)
13766#[inline]
13767#[target_feature(enable = "avx512fp16")]
13768#[cfg_attr(test, assert_instr(vcvtph2uw, ROUNDING = 8))]
13769#[rustc_legacy_const_generics(2)]
13770#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13771pub fn _mm512_maskz_cvt_roundph_epu16<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512i {
13772    static_assert_rounding!(ROUNDING);
13773    _mm512_mask_cvt_roundph_epu16::<ROUNDING>(_mm512_setzero_si512(), k, a)
13774}
13775
13776/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13777/// truncation, and store the results in dst.
13778///
13779/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi16)
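///
/// A minimal sketch of the truncating conversion (hypothetical values; assumes the nightly
/// `stdarch_x86_avx512_f16` feature and `avx512fp16,avx512vl` hardware):
///
/// ```ignore
/// let a = _mm_set1_ph(2.75);
/// // Truncation drops the fraction, so every signed 16-bit lane becomes 2.
/// let r = _mm_cvttph_epi16(a);
/// ```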
13780#[inline]
13781#[target_feature(enable = "avx512fp16,avx512vl")]
13782#[cfg_attr(test, assert_instr(vcvttph2w))]
13783#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13784pub fn _mm_cvttph_epi16(a: __m128h) -> __m128i {
13785    _mm_mask_cvttph_epi16(_mm_undefined_si128(), 0xff, a)
13786}
13787
13788/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13789/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13790/// mask bit is not set).
13791///
13792/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi16)
13793#[inline]
13794#[target_feature(enable = "avx512fp16,avx512vl")]
13795#[cfg_attr(test, assert_instr(vcvttph2w))]
13796#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13797pub fn _mm_mask_cvttph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
13798    unsafe { transmute(vcvttph2w_128(a, src.as_i16x8(), k)) }
13799}
13800
13801/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13802/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13803/// mask bit is not set).
13804///
13805/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi16)
13806#[inline]
13807#[target_feature(enable = "avx512fp16,avx512vl")]
13808#[cfg_attr(test, assert_instr(vcvttph2w))]
13809#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13810pub fn _mm_maskz_cvttph_epi16(k: __mmask8, a: __m128h) -> __m128i {
13811    _mm_mask_cvttph_epi16(_mm_setzero_si128(), k, a)
13812}
13813
13814/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13815/// truncation, and store the results in dst.
13816///
13817/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi16)
13818#[inline]
13819#[target_feature(enable = "avx512fp16,avx512vl")]
13820#[cfg_attr(test, assert_instr(vcvttph2w))]
13821#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13822pub fn _mm256_cvttph_epi16(a: __m256h) -> __m256i {
13823    _mm256_mask_cvttph_epi16(_mm256_undefined_si256(), 0xffff, a)
13824}
13825
13826/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13827/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13828/// mask bit is not set).
13829///
13830/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi16)
13831#[inline]
13832#[target_feature(enable = "avx512fp16,avx512vl")]
13833#[cfg_attr(test, assert_instr(vcvttph2w))]
13834#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13835pub fn _mm256_mask_cvttph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
13836    unsafe { transmute(vcvttph2w_256(a, src.as_i16x16(), k)) }
13837}
13838
13839/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13840/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13841/// mask bit is not set).
13842///
13843/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi16)
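///
/// A small zeromask sketch (hypothetical values; same nightly-feature assumptions as above):
///
/// ```ignore
/// let a = _mm256_set1_ph(5.0);
/// // Only the low eight lanes are converted to 5; the masked-off upper lanes are zeroed.
/// let r = _mm256_maskz_cvttph_epi16(0x00ff, a);
/// ```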
13844#[inline]
13845#[target_feature(enable = "avx512fp16,avx512vl")]
13846#[cfg_attr(test, assert_instr(vcvttph2w))]
13847#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13848pub fn _mm256_maskz_cvttph_epi16(k: __mmask16, a: __m256h) -> __m256i {
13849    _mm256_mask_cvttph_epi16(_mm256_setzero_si256(), k, a)
13850}
13851
13852/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13853/// truncation, and store the results in dst.
13854///
13855/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi16)
13856#[inline]
13857#[target_feature(enable = "avx512fp16")]
13858#[cfg_attr(test, assert_instr(vcvttph2w))]
13859#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13860pub fn _mm512_cvttph_epi16(a: __m512h) -> __m512i {
13861    _mm512_mask_cvttph_epi16(_mm512_undefined_epi32(), 0xffffffff, a)
13862}
13863
13864/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13865/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13866/// mask bit is not set).
13867///
13868/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi16)
13869#[inline]
13870#[target_feature(enable = "avx512fp16")]
13871#[cfg_attr(test, assert_instr(vcvttph2w))]
13872#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13873pub fn _mm512_mask_cvttph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
13874    unsafe {
13875        transmute(vcvttph2w_512(
13876            a,
13877            src.as_i16x32(),
13878            k,
13879            _MM_FROUND_CUR_DIRECTION,
13880        ))
13881    }
13882}
13883
13884/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13885/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13886/// mask bit is not set).
13887///
13888/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi16)
13889#[inline]
13890#[target_feature(enable = "avx512fp16")]
13891#[cfg_attr(test, assert_instr(vcvttph2w))]
13892#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13893pub fn _mm512_maskz_cvttph_epi16(k: __mmask32, a: __m512h) -> __m512i {
13894    _mm512_mask_cvttph_epi16(_mm512_setzero_si512(), k, a)
13895}
13896
13897/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13898/// truncation, and store the results in dst.
13899///
13900/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
13901///
13902/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi16)
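///
/// A minimal sketch showing exception suppression (hypothetical values; assumes the nightly
/// `stdarch_x86_avx512_f16` feature and AVX512-FP16 hardware):
///
/// ```ignore
/// let a = _mm512_set1_ph(-1.5);
/// // Truncate toward zero without raising floating-point exceptions: every lane becomes -1.
/// let r = _mm512_cvtt_roundph_epi16::<{ _MM_FROUND_NO_EXC }>(a);
/// ```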
13903#[inline]
13904#[target_feature(enable = "avx512fp16")]
13905#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
13906#[rustc_legacy_const_generics(1)]
13907#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13908pub fn _mm512_cvtt_roundph_epi16<const SAE: i32>(a: __m512h) -> __m512i {
13909    static_assert_sae!(SAE);
13910    _mm512_mask_cvtt_roundph_epi16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
13911}
13912
13913/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13914/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13915/// mask bit is not set).
13916///
13917/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
13918///
13919/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi16)
13920#[inline]
13921#[target_feature(enable = "avx512fp16")]
13922#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
13923#[rustc_legacy_const_generics(3)]
13924#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13925pub fn _mm512_mask_cvtt_roundph_epi16<const SAE: i32>(
13926    src: __m512i,
13927    k: __mmask32,
13928    a: __m512h,
13929) -> __m512i {
13930    unsafe {
13931        static_assert_sae!(SAE);
13932        transmute(vcvttph2w_512(a, src.as_i16x32(), k, SAE))
13933    }
13934}
13935
13936/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13937/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13938/// mask bit is not set).
13939///
13940/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
13941///
13942/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi16)
13943#[inline]
13944#[target_feature(enable = "avx512fp16")]
13945#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
13946#[rustc_legacy_const_generics(2)]
13947#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13948pub fn _mm512_maskz_cvtt_roundph_epi16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
13949    static_assert_sae!(SAE);
13950    _mm512_mask_cvtt_roundph_epi16::<SAE>(_mm512_setzero_si512(), k, a)
13951}
13952
13953/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
13954/// truncation, and store the results in dst.
13955///
13956/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu16)
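///
/// A minimal sketch (hypothetical values; same nightly-feature assumptions as the signed variant):
///
/// ```ignore
/// let a = _mm_set1_ph(7.75);
/// // Truncation toward zero gives 7 in every unsigned 16-bit lane.
/// let r = _mm_cvttph_epu16(a);
/// ```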
13957#[inline]
13958#[target_feature(enable = "avx512fp16,avx512vl")]
13959#[cfg_attr(test, assert_instr(vcvttph2uw))]
13960#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13961pub fn _mm_cvttph_epu16(a: __m128h) -> __m128i {
13962    _mm_mask_cvttph_epu16(_mm_undefined_si128(), 0xff, a)
13963}
13964
13965/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
13966/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13967/// mask bit is not set).
13968///
13969/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu16)
13970#[inline]
13971#[target_feature(enable = "avx512fp16,avx512vl")]
13972#[cfg_attr(test, assert_instr(vcvttph2uw))]
13973#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13974pub fn _mm_mask_cvttph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
13975    unsafe { transmute(vcvttph2uw_128(a, src.as_u16x8(), k)) }
13976}
13977
13978/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
13979/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13980/// mask bit is not set).
13981///
13982/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu16)
13983#[inline]
13984#[target_feature(enable = "avx512fp16,avx512vl")]
13985#[cfg_attr(test, assert_instr(vcvttph2uw))]
13986#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13987pub fn _mm_maskz_cvttph_epu16(k: __mmask8, a: __m128h) -> __m128i {
13988    _mm_mask_cvttph_epu16(_mm_setzero_si128(), k, a)
13989}
13990
13991/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
13992/// truncation, and store the results in dst.
13993///
13994/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu16)
13995#[inline]
13996#[target_feature(enable = "avx512fp16,avx512vl")]
13997#[cfg_attr(test, assert_instr(vcvttph2uw))]
13998#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13999pub fn _mm256_cvttph_epu16(a: __m256h) -> __m256i {
14000    _mm256_mask_cvttph_epu16(_mm256_undefined_si256(), 0xffff, a)
14001}
14002
14003/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14004/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14005/// mask bit is not set).
14006///
14007/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu16)
14008#[inline]
14009#[target_feature(enable = "avx512fp16,avx512vl")]
14010#[cfg_attr(test, assert_instr(vcvttph2uw))]
14011#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14012pub fn _mm256_mask_cvttph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
14013    unsafe { transmute(vcvttph2uw_256(a, src.as_u16x16(), k)) }
14014}
14015
14016/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14017/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14018/// mask bit is not set).
14019///
14020/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu16)
14021#[inline]
14022#[target_feature(enable = "avx512fp16,avx512vl")]
14023#[cfg_attr(test, assert_instr(vcvttph2uw))]
14024#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14025pub fn _mm256_maskz_cvttph_epu16(k: __mmask16, a: __m256h) -> __m256i {
14026    _mm256_mask_cvttph_epu16(_mm256_setzero_si256(), k, a)
14027}
14028
14029/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14030/// truncation, and store the results in dst.
14031///
14032/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu16)
14033#[inline]
14034#[target_feature(enable = "avx512fp16")]
14035#[cfg_attr(test, assert_instr(vcvttph2uw))]
14036#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14037pub fn _mm512_cvttph_epu16(a: __m512h) -> __m512i {
14038    _mm512_mask_cvttph_epu16(_mm512_undefined_epi32(), 0xffffffff, a)
14039}
14040
14041/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14042/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14043/// mask bit is not set).
14044///
14045/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu16)
14046#[inline]
14047#[target_feature(enable = "avx512fp16")]
14048#[cfg_attr(test, assert_instr(vcvttph2uw))]
14049#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14050pub fn _mm512_mask_cvttph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
14051    unsafe {
14052        transmute(vcvttph2uw_512(
14053            a,
14054            src.as_u16x32(),
14055            k,
14056            _MM_FROUND_CUR_DIRECTION,
14057        ))
14058    }
14059}
14060
14061/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14062/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14063/// mask bit is not set).
14064///
14065/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu16)
14066#[inline]
14067#[target_feature(enable = "avx512fp16")]
14068#[cfg_attr(test, assert_instr(vcvttph2uw))]
14069#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14070pub fn _mm512_maskz_cvttph_epu16(k: __mmask32, a: __m512h) -> __m512i {
14071    _mm512_mask_cvttph_epu16(_mm512_setzero_si512(), k, a)
14072}
14073
14074/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14075/// truncation, and store the results in dst.
14076///
14077/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14078///
14079/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu16)
14080#[inline]
14081#[target_feature(enable = "avx512fp16")]
14082#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
14083#[rustc_legacy_const_generics(1)]
14084#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14085pub fn _mm512_cvtt_roundph_epu16<const SAE: i32>(a: __m512h) -> __m512i {
14086    static_assert_sae!(SAE);
14087    _mm512_mask_cvtt_roundph_epu16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
14088}
14089
14090/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14091/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14092/// mask bit is not set).
14093///
14094/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14095///
14096/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu16)
14097#[inline]
14098#[target_feature(enable = "avx512fp16")]
14099#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
14100#[rustc_legacy_const_generics(3)]
14101#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14102pub fn _mm512_mask_cvtt_roundph_epu16<const SAE: i32>(
14103    src: __m512i,
14104    k: __mmask32,
14105    a: __m512h,
14106) -> __m512i {
14107    unsafe {
14108        static_assert_sae!(SAE);
14109        transmute(vcvttph2uw_512(a, src.as_u16x32(), k, SAE))
14110    }
14111}
14112
14113/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14114/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14115/// mask bit is not set).
14116///
14117/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14118///
14119/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu16)
14120#[inline]
14121#[target_feature(enable = "avx512fp16")]
14122#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
14123#[rustc_legacy_const_generics(2)]
14124#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14125pub fn _mm512_maskz_cvtt_roundph_epu16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
14126    static_assert_sae!(SAE);
14127    _mm512_mask_cvtt_roundph_epu16::<SAE>(_mm512_setzero_si512(), k, a)
14128}
14129
14130/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14131/// results in dst.
14132///
14133/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi32)
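///
/// A small sketch (hypothetical values; assumes the nightly `stdarch_x86_avx512_f16` feature and
/// `avx512fp16,avx512vl` hardware). Only the lower four half-precision elements of `a` are
/// converted, since the destination holds four 32-bit lanes:
///
/// ```ignore
/// let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 4.0, 3.0, 2.0, 1.0);
/// // dst = [1, 2, 3, 4] as signed 32-bit integers, taken from lanes 0..4 of `a`.
/// let r = _mm_cvtph_epi32(a);
/// ```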
14134#[inline]
14135#[target_feature(enable = "avx512fp16,avx512vl")]
14136#[cfg_attr(test, assert_instr(vcvtph2dq))]
14137#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14138pub fn _mm_cvtph_epi32(a: __m128h) -> __m128i {
14139    _mm_mask_cvtph_epi32(_mm_undefined_si128(), 0xff, a)
14140}
14141
14142/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14143/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14144///
14145/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi32)
14146#[inline]
14147#[target_feature(enable = "avx512fp16,avx512vl")]
14148#[cfg_attr(test, assert_instr(vcvtph2dq))]
14149#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14150pub fn _mm_mask_cvtph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14151    unsafe { transmute(vcvtph2dq_128(a, src.as_i32x4(), k)) }
14152}
14153
14154/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14155/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14156///
14157/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi32)
14158#[inline]
14159#[target_feature(enable = "avx512fp16,avx512vl")]
14160#[cfg_attr(test, assert_instr(vcvtph2dq))]
14161#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14162pub fn _mm_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m128i {
14163    _mm_mask_cvtph_epi32(_mm_setzero_si128(), k, a)
14164}
14165
14166/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14167/// results in dst.
14168///
14169/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi32)
14170#[inline]
14171#[target_feature(enable = "avx512fp16,avx512vl")]
14172#[cfg_attr(test, assert_instr(vcvtph2dq))]
14173#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14174pub fn _mm256_cvtph_epi32(a: __m128h) -> __m256i {
14175    _mm256_mask_cvtph_epi32(_mm256_undefined_si256(), 0xff, a)
14176}
14177
14178/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14179/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14180///
14181/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi32)
14182#[inline]
14183#[target_feature(enable = "avx512fp16,avx512vl")]
14184#[cfg_attr(test, assert_instr(vcvtph2dq))]
14185#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14186pub fn _mm256_mask_cvtph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14187    unsafe { transmute(vcvtph2dq_256(a, src.as_i32x8(), k)) }
14188}
14189
14190/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14191/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14192///
14193/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi32)
14194#[inline]
14195#[target_feature(enable = "avx512fp16,avx512vl")]
14196#[cfg_attr(test, assert_instr(vcvtph2dq))]
14197#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14198pub fn _mm256_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m256i {
14199    _mm256_mask_cvtph_epi32(_mm256_setzero_si256(), k, a)
14200}
14201
14202/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14203/// results in dst.
14204///
14205/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi32)
14206#[inline]
14207#[target_feature(enable = "avx512fp16")]
14208#[cfg_attr(test, assert_instr(vcvtph2dq))]
14209#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14210pub fn _mm512_cvtph_epi32(a: __m256h) -> __m512i {
14211    _mm512_mask_cvtph_epi32(_mm512_undefined_epi32(), 0xffff, a)
14212}
14213
14214/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14215/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14216///
14217/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi32)
14218#[inline]
14219#[target_feature(enable = "avx512fp16")]
14220#[cfg_attr(test, assert_instr(vcvtph2dq))]
14221#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14222pub fn _mm512_mask_cvtph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14223    unsafe {
14224        transmute(vcvtph2dq_512(
14225            a,
14226            src.as_i32x16(),
14227            k,
14228            _MM_FROUND_CUR_DIRECTION,
14229        ))
14230    }
14231}
14232
14233/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14234/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14235///
14236/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi32)
14237#[inline]
14238#[target_feature(enable = "avx512fp16")]
14239#[cfg_attr(test, assert_instr(vcvtph2dq))]
14240#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14241pub fn _mm512_maskz_cvtph_epi32(k: __mmask16, a: __m256h) -> __m512i {
14242    _mm512_mask_cvtph_epi32(_mm512_setzero_si512(), k, a)
14243}
14244
14245/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14246/// results in dst.
14247///
14248/// Rounding is done according to the rounding parameter, which can be one of:
14249///
14250/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14251/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14252/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14253/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14254/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14255///
14256/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi32)
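///
/// A minimal sketch of an explicit rounding mode (hypothetical values; same nightly-feature
/// assumptions as above):
///
/// ```ignore
/// let a = _mm256_set1_ph(2.5);
/// // Rounding toward negative infinity sends 2.5 to 2 in every signed 32-bit lane.
/// let r = _mm512_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a);
/// ```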
14257#[inline]
14258#[target_feature(enable = "avx512fp16")]
14259#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
14260#[rustc_legacy_const_generics(1)]
14261#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14262pub fn _mm512_cvt_roundph_epi32<const ROUNDING: i32>(a: __m256h) -> __m512i {
14263    static_assert_rounding!(ROUNDING);
14264    _mm512_mask_cvt_roundph_epi32::<ROUNDING>(_mm512_undefined_epi32(), 0xffff, a)
14265}
14266
14267/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14268/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14269///
14270/// Rounding is done according to the rounding parameter, which can be one of:
14271///
14272/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14273/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14274/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14275/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14276/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14277///
14278/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi32)
14279#[inline]
14280#[target_feature(enable = "avx512fp16")]
14281#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
14282#[rustc_legacy_const_generics(3)]
14283#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14284pub fn _mm512_mask_cvt_roundph_epi32<const ROUNDING: i32>(
14285    src: __m512i,
14286    k: __mmask16,
14287    a: __m256h,
14288) -> __m512i {
14289    unsafe {
14290        static_assert_rounding!(ROUNDING);
14291        transmute(vcvtph2dq_512(a, src.as_i32x16(), k, ROUNDING))
14292    }
14293}
14294
14295/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14296/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14297///
14298/// Rounding is done according to the rounding parameter, which can be one of:
14299///
14300/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14301/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14302/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14303/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14304/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14305///
14306/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi32)
14307#[inline]
14308#[target_feature(enable = "avx512fp16")]
14309#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
14310#[rustc_legacy_const_generics(2)]
14311#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14312pub fn _mm512_maskz_cvt_roundph_epi32<const ROUNDING: i32>(k: __mmask16, a: __m256h) -> __m512i {
14313    static_assert_rounding!(ROUNDING);
14314    _mm512_mask_cvt_roundph_epi32::<ROUNDING>(_mm512_setzero_si512(), k, a)
14315}
14316
14317/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store
14318/// the result in dst.
14319///
14320/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_i32)
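///
/// A minimal scalar sketch (hypothetical values; assumes the nightly `stdarch_x86_avx512_f16`
/// feature, AVX512-FP16 hardware, and the default `MXCSR` rounding mode):
///
/// ```ignore
/// let a = _mm_set_sh(6.5);
/// // Round-to-nearest-even (the default) turns 6.5 into 6.
/// let r: i32 = _mm_cvtsh_i32(a);
/// ```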
14321#[inline]
14322#[target_feature(enable = "avx512fp16")]
14323#[cfg_attr(test, assert_instr(vcvtsh2si))]
14324#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14325pub fn _mm_cvtsh_i32(a: __m128h) -> i32 {
14326    unsafe { vcvtsh2si32(a, _MM_FROUND_CUR_DIRECTION) }
14327}
14328
14329/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store
14330/// the result in dst.
14331///
14332/// Rounding is done according to the rounding parameter, which can be one of:
14333///
14334/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14335/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14336/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14337/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14338/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14339///
14340/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_i32)
14341#[inline]
14342#[target_feature(enable = "avx512fp16")]
14343#[cfg_attr(test, assert_instr(vcvtsh2si, ROUNDING = 8))]
14344#[rustc_legacy_const_generics(1)]
14345#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14346pub fn _mm_cvt_roundsh_i32<const ROUNDING: i32>(a: __m128h) -> i32 {
14347    unsafe {
14348        static_assert_rounding!(ROUNDING);
14349        vcvtsh2si32(a, ROUNDING)
14350    }
14351}
14352
14353/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14354/// the results in dst.
14355///
14356/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu32)
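///
/// A small sketch (hypothetical values; same nightly-feature assumptions as above). As with the
/// signed variant, only the lower four half-precision elements of `a` participate:
///
/// ```ignore
/// let a = _mm_set1_ph(9.5);
/// // Under the default round-to-nearest-even mode, 9.5 becomes 10 in each unsigned 32-bit lane.
/// let r = _mm_cvtph_epu32(a);
/// ```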
14357#[inline]
14358#[target_feature(enable = "avx512fp16,avx512vl")]
14359#[cfg_attr(test, assert_instr(vcvtph2udq))]
14360#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14361pub fn _mm_cvtph_epu32(a: __m128h) -> __m128i {
14362    _mm_mask_cvtph_epu32(_mm_undefined_si128(), 0xff, a)
14363}
14364
14365/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14366/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14367///
14368/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu32)
14369#[inline]
14370#[target_feature(enable = "avx512fp16,avx512vl")]
14371#[cfg_attr(test, assert_instr(vcvtph2udq))]
14372#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14373pub fn _mm_mask_cvtph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14374    unsafe { transmute(vcvtph2udq_128(a, src.as_u32x4(), k)) }
14375}
14376
14377/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14378/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14379///
14380/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu32)
14381#[inline]
14382#[target_feature(enable = "avx512fp16,avx512vl")]
14383#[cfg_attr(test, assert_instr(vcvtph2udq))]
14384#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14385pub fn _mm_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m128i {
14386    _mm_mask_cvtph_epu32(_mm_setzero_si128(), k, a)
14387}
14388
14389/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14390/// the results in dst.
14391///
14392/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu32)
14393#[inline]
14394#[target_feature(enable = "avx512fp16,avx512vl")]
14395#[cfg_attr(test, assert_instr(vcvtph2udq))]
14396#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14397pub fn _mm256_cvtph_epu32(a: __m128h) -> __m256i {
14398    _mm256_mask_cvtph_epu32(_mm256_undefined_si256(), 0xff, a)
14399}
14400
14401/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14402/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14403///
14404/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu32)
14405#[inline]
14406#[target_feature(enable = "avx512fp16,avx512vl")]
14407#[cfg_attr(test, assert_instr(vcvtph2udq))]
14408#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14409pub fn _mm256_mask_cvtph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14410    unsafe { transmute(vcvtph2udq_256(a, src.as_u32x8(), k)) }
14411}
14412
14413/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14414/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14415///
14416/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu32)
14417#[inline]
14418#[target_feature(enable = "avx512fp16,avx512vl")]
14419#[cfg_attr(test, assert_instr(vcvtph2udq))]
14420#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14421pub fn _mm256_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m256i {
14422    _mm256_mask_cvtph_epu32(_mm256_setzero_si256(), k, a)
14423}
14424
14425/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14426/// the results in dst.
14427///
14428/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu32)
14429#[inline]
14430#[target_feature(enable = "avx512fp16")]
14431#[cfg_attr(test, assert_instr(vcvtph2udq))]
14432#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14433pub fn _mm512_cvtph_epu32(a: __m256h) -> __m512i {
14434    _mm512_mask_cvtph_epu32(_mm512_undefined_epi32(), 0xffff, a)
14435}
14436
14437/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14438/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14439///
14440/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu32)
14441#[inline]
14442#[target_feature(enable = "avx512fp16")]
14443#[cfg_attr(test, assert_instr(vcvtph2udq))]
14444#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14445pub fn _mm512_mask_cvtph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14446    unsafe {
14447        transmute(vcvtph2udq_512(
14448            a,
14449            src.as_u32x16(),
14450            k,
14451            _MM_FROUND_CUR_DIRECTION,
14452        ))
14453    }
14454}
14455
14456/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14457/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14458///
14459/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu32)
14460#[inline]
14461#[target_feature(enable = "avx512fp16")]
14462#[cfg_attr(test, assert_instr(vcvtph2udq))]
14463#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14464pub fn _mm512_maskz_cvtph_epu32(k: __mmask16, a: __m256h) -> __m512i {
14465    _mm512_mask_cvtph_epu32(_mm512_setzero_si512(), k, a)
14466}
14467
14468/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14469/// the results in dst.
14470///
14471/// Rounding is done according to the rounding parameter, which can be one of:
14472///
14473/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14474/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14475/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14476/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14477/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14478///
14479/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu32)
14480#[inline]
14481#[target_feature(enable = "avx512fp16")]
14482#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
14483#[rustc_legacy_const_generics(1)]
14484#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14485pub fn _mm512_cvt_roundph_epu32<const ROUNDING: i32>(a: __m256h) -> __m512i {
14486    static_assert_rounding!(ROUNDING);
14487    _mm512_mask_cvt_roundph_epu32::<ROUNDING>(_mm512_undefined_epi32(), 0xffff, a)
14488}
14489
14490/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14491/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14492///
14493/// Rounding is done according to the rounding parameter, which can be one of:
14494///
14495/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14496/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14497/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14498/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14499/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14500///
14501/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu32)
14502#[inline]
14503#[target_feature(enable = "avx512fp16")]
14504#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
14505#[rustc_legacy_const_generics(3)]
14506#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14507pub fn _mm512_mask_cvt_roundph_epu32<const ROUNDING: i32>(
14508    src: __m512i,
14509    k: __mmask16,
14510    a: __m256h,
14511) -> __m512i {
14512    unsafe {
14513        static_assert_rounding!(ROUNDING);
14514        transmute(vcvtph2udq_512(a, src.as_u32x16(), k, ROUNDING))
14515    }
14516}
14517
14518/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14519/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14520///
14521/// Rounding is done according to the rounding parameter, which can be one of:
14522///
14523/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14524/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14525/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14526/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14527/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14528///
14529/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu32)
14530#[inline]
14531#[target_feature(enable = "avx512fp16")]
14532#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
14533#[rustc_legacy_const_generics(2)]
14534#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14535pub fn _mm512_maskz_cvt_roundph_epu32<const ROUNDING: i32>(k: __mmask16, a: __m256h) -> __m512i {
14536    static_assert_rounding!(ROUNDING);
14537    _mm512_mask_cvt_roundph_epu32::<ROUNDING>(_mm512_setzero_si512(), k, a)
14538}
14539
14540/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store
14541/// the result in dst.
14542///
14543/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_u32)
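///
/// A minimal scalar sketch (hypothetical values; assumes the nightly `stdarch_x86_avx512_f16`
/// feature and the default `MXCSR` rounding mode):
///
/// ```ignore
/// let a = _mm_set_sh(3.5);
/// // Round-to-nearest-even turns 3.5 into 4.
/// let r: u32 = _mm_cvtsh_u32(a);
/// ```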
14544#[inline]
14545#[target_feature(enable = "avx512fp16")]
14546#[cfg_attr(test, assert_instr(vcvtsh2usi))]
14547#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14548pub fn _mm_cvtsh_u32(a: __m128h) -> u32 {
14549    unsafe { vcvtsh2usi32(a, _MM_FROUND_CUR_DIRECTION) }
14550}
14551
14552/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store
14553/// the result in dst.
14554///
14555/// Rounding is done according to the rounding parameter, which can be one of:
14556///
14557/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14558/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14559/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14560/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14561/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14562///
14563/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_u32)
14564#[inline]
14565#[target_feature(enable = "avx512fp16")]
14566#[cfg_attr(test, assert_instr(vcvtsh2usi, ROUNDING = 8))]
14567#[rustc_legacy_const_generics(1)]
14568#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14569pub fn _mm_cvt_roundsh_u32<const ROUNDING: i32>(a: __m128h) -> u32 {
14570    unsafe {
14571        static_assert_rounding!(ROUNDING);
14572        vcvtsh2usi32(a, ROUNDING)
14573    }
14574}
14575
14576/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14577/// store the results in dst.
14578///
14579/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi32)
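///
/// A minimal sketch (hypothetical values; assumes the nightly `stdarch_x86_avx512_f16` feature
/// and `avx512fp16,avx512vl` hardware):
///
/// ```ignore
/// let a = _mm_set1_ph(-2.75);
/// // Truncation toward zero gives -2 in each of the four signed 32-bit lanes.
/// let r = _mm_cvttph_epi32(a);
/// ```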
14580#[inline]
14581#[target_feature(enable = "avx512fp16,avx512vl")]
14582#[cfg_attr(test, assert_instr(vcvttph2dq))]
14583#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14584pub fn _mm_cvttph_epi32(a: __m128h) -> __m128i {
14585    _mm_mask_cvttph_epi32(_mm_undefined_si128(), 0xff, a)
14586}
14587
14588/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14589/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14590///
14591/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi32)
14592#[inline]
14593#[target_feature(enable = "avx512fp16,avx512vl")]
14594#[cfg_attr(test, assert_instr(vcvttph2dq))]
14595#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14596pub fn _mm_mask_cvttph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14597    unsafe { transmute(vcvttph2dq_128(a, src.as_i32x4(), k)) }
14598}
14599
14600/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14601/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14602///
14603/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi32)
14604#[inline]
14605#[target_feature(enable = "avx512fp16,avx512vl")]
14606#[cfg_attr(test, assert_instr(vcvttph2dq))]
14607#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14608pub fn _mm_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m128i {
14609    _mm_mask_cvttph_epi32(_mm_setzero_si128(), k, a)
14610}
14611
14612/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14613/// store the results in dst.
14614///
14615/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi32)
14616#[inline]
14617#[target_feature(enable = "avx512fp16,avx512vl")]
14618#[cfg_attr(test, assert_instr(vcvttph2dq))]
14619#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14620pub fn _mm256_cvttph_epi32(a: __m128h) -> __m256i {
14621    _mm256_mask_cvttph_epi32(_mm256_undefined_si256(), 0xff, a)
14622}
14623
14624/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14625/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14626///
14627/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi32)
14628#[inline]
14629#[target_feature(enable = "avx512fp16,avx512vl")]
14630#[cfg_attr(test, assert_instr(vcvttph2dq))]
14631#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14632pub fn _mm256_mask_cvttph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14633    unsafe { transmute(vcvttph2dq_256(a, src.as_i32x8(), k)) }
14634}
14635
14636/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14637/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14638///
14639/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi32)
14640#[inline]
14641#[target_feature(enable = "avx512fp16,avx512vl")]
14642#[cfg_attr(test, assert_instr(vcvttph2dq))]
14643#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14644pub fn _mm256_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m256i {
14645    _mm256_mask_cvttph_epi32(_mm256_setzero_si256(), k, a)
14646}
14647
14648/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14649/// store the results in dst.
14650///
14651/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi32)
14652#[inline]
14653#[target_feature(enable = "avx512fp16")]
14654#[cfg_attr(test, assert_instr(vcvttph2dq))]
14655#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14656pub fn _mm512_cvttph_epi32(a: __m256h) -> __m512i {
14657    _mm512_mask_cvttph_epi32(_mm512_undefined_epi32(), 0xffff, a)
14658}
14659
14660/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14661/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14662///
14663/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi32)
14664#[inline]
14665#[target_feature(enable = "avx512fp16")]
14666#[cfg_attr(test, assert_instr(vcvttph2dq))]
14667#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14668pub fn _mm512_mask_cvttph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14669    unsafe {
14670        transmute(vcvttph2dq_512(
14671            a,
14672            src.as_i32x16(),
14673            k,
14674            _MM_FROUND_CUR_DIRECTION,
14675        ))
14676    }
14677}
14678
14679/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14680/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14681///
14682/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi32)
14683#[inline]
14684#[target_feature(enable = "avx512fp16")]
14685#[cfg_attr(test, assert_instr(vcvttph2dq))]
14686#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14687pub fn _mm512_maskz_cvttph_epi32(k: __mmask16, a: __m256h) -> __m512i {
14688    _mm512_mask_cvttph_epi32(_mm512_setzero_si512(), k, a)
14689}
14690
14691/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14692/// store the results in dst.
14693///
14694/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14695///
14696/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi32)
14697#[inline]
14698#[target_feature(enable = "avx512fp16")]
14699#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
14700#[rustc_legacy_const_generics(1)]
14701#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14702pub fn _mm512_cvtt_roundph_epi32<const SAE: i32>(a: __m256h) -> __m512i {
14703    static_assert_sae!(SAE);
14704    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_undefined_epi32(), 0xffff, a)
14705}
14706
14707/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14708/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14709///
14710/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14711///
14712/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi32)
14713#[inline]
14714#[target_feature(enable = "avx512fp16")]
14715#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
14716#[rustc_legacy_const_generics(3)]
14717#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14718pub fn _mm512_mask_cvtt_roundph_epi32<const SAE: i32>(
14719    src: __m512i,
14720    k: __mmask16,
14721    a: __m256h,
14722) -> __m512i {
14723    unsafe {
14724        static_assert_sae!(SAE);
14725        transmute(vcvttph2dq_512(a, src.as_i32x16(), k, SAE))
14726    }
14727}
14728
14729/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14730/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14731///
14732/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14733///
14734/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi32)
14735#[inline]
14736#[target_feature(enable = "avx512fp16")]
14737#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
14738#[rustc_legacy_const_generics(2)]
14739#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14740pub fn _mm512_maskz_cvtt_roundph_epi32<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512i {
14741    static_assert_sae!(SAE);
14742    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_setzero_si512(), k, a)
14743}
14744
14745/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
14746/// the result in dst.
14747///
14748/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_i32)
14749#[inline]
14750#[target_feature(enable = "avx512fp16")]
14751#[cfg_attr(test, assert_instr(vcvttsh2si))]
14752#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14753pub fn _mm_cvttsh_i32(a: __m128h) -> i32 {
14754    unsafe { vcvttsh2si32(a, _MM_FROUND_CUR_DIRECTION) }
14755}
14756
14757/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
14758/// the result in dst.
14759///
14760/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14761///
14762/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_i32)
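///
/// A minimal scalar sketch (hypothetical values; same nightly-feature assumptions as above):
///
/// ```ignore
/// let a = _mm_set_sh(5.75);
/// // Truncate the lower element toward zero while suppressing exceptions: r == 5.
/// let r = _mm_cvtt_roundsh_i32::<{ _MM_FROUND_NO_EXC }>(a);
/// ```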
14763#[inline]
14764#[target_feature(enable = "avx512fp16")]
14765#[cfg_attr(test, assert_instr(vcvttsh2si, SAE = 8))]
14766#[rustc_legacy_const_generics(1)]
14767#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14768pub fn _mm_cvtt_roundsh_i32<const SAE: i32>(a: __m128h) -> i32 {
14769    unsafe {
14770        static_assert_sae!(SAE);
14771        vcvttsh2si32(a, SAE)
14772    }
14773}
14774
14775/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14776/// store the results in dst.
14777///
14778/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu32)
14779#[inline]
14780#[target_feature(enable = "avx512fp16,avx512vl")]
14781#[cfg_attr(test, assert_instr(vcvttph2udq))]
14782#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14783pub fn _mm_cvttph_epu32(a: __m128h) -> __m128i {
14784    _mm_mask_cvttph_epu32(_mm_undefined_si128(), 0xff, a)
14785}
14786
14787/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14788/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14789///
14790/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu32)
14791#[inline]
14792#[target_feature(enable = "avx512fp16,avx512vl")]
14793#[cfg_attr(test, assert_instr(vcvttph2udq))]
14794#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14795pub fn _mm_mask_cvttph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14796    unsafe { transmute(vcvttph2udq_128(a, src.as_u32x4(), k)) }
14797}
14798
14799/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14800/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14801///
14802/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu32)
14803#[inline]
14804#[target_feature(enable = "avx512fp16,avx512vl")]
14805#[cfg_attr(test, assert_instr(vcvttph2udq))]
14806#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14807pub fn _mm_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m128i {
14808    _mm_mask_cvttph_epu32(_mm_setzero_si128(), k, a)
14809}
14810
14811/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14812/// store the results in dst.
14813///
14814/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu32)
14815#[inline]
14816#[target_feature(enable = "avx512fp16,avx512vl")]
14817#[cfg_attr(test, assert_instr(vcvttph2udq))]
14818#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14819pub fn _mm256_cvttph_epu32(a: __m128h) -> __m256i {
14820    _mm256_mask_cvttph_epu32(_mm256_undefined_si256(), 0xff, a)
14821}
14822
14823/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14824/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14825///
14826/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu32)
14827#[inline]
14828#[target_feature(enable = "avx512fp16,avx512vl")]
14829#[cfg_attr(test, assert_instr(vcvttph2udq))]
14830#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14831pub fn _mm256_mask_cvttph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14832    unsafe { transmute(vcvttph2udq_256(a, src.as_u32x8(), k)) }
14833}
14834
14835/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14836/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14837///
14838/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu32)
14839#[inline]
14840#[target_feature(enable = "avx512fp16,avx512vl")]
14841#[cfg_attr(test, assert_instr(vcvttph2udq))]
14842#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14843pub fn _mm256_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m256i {
14844    _mm256_mask_cvttph_epu32(_mm256_setzero_si256(), k, a)
14845}
14846
14847/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14848/// store the results in dst.
14849///
14850/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu32)
14851#[inline]
14852#[target_feature(enable = "avx512fp16")]
14853#[cfg_attr(test, assert_instr(vcvttph2udq))]
14854#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14855pub fn _mm512_cvttph_epu32(a: __m256h) -> __m512i {
14856    _mm512_mask_cvttph_epu32(_mm512_undefined_epi32(), 0xffff, a)
14857}
14858
14859/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14860/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14861///
14862/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu32)
14863#[inline]
14864#[target_feature(enable = "avx512fp16")]
14865#[cfg_attr(test, assert_instr(vcvttph2udq))]
14866#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14867pub fn _mm512_mask_cvttph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14868    unsafe {
14869        transmute(vcvttph2udq_512(
14870            a,
14871            src.as_u32x16(),
14872            k,
14873            _MM_FROUND_CUR_DIRECTION,
14874        ))
14875    }
14876}
14877
14878/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14879/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14880///
14881/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu32)
14882#[inline]
14883#[target_feature(enable = "avx512fp16")]
14884#[cfg_attr(test, assert_instr(vcvttph2udq))]
14885#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14886pub fn _mm512_maskz_cvttph_epu32(k: __mmask16, a: __m256h) -> __m512i {
14887    _mm512_mask_cvttph_epu32(_mm512_setzero_si512(), k, a)
14888}
14889
14890/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14891/// store the results in dst.
14892///
14893/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14894///
14895/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu32)
14896#[inline]
14897#[target_feature(enable = "avx512fp16")]
14898#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
14899#[rustc_legacy_const_generics(1)]
14900#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14901pub fn _mm512_cvtt_roundph_epu32<const SAE: i32>(a: __m256h) -> __m512i {
14902    static_assert_sae!(SAE);
14903    _mm512_mask_cvtt_roundph_epu32::<SAE>(_mm512_undefined_epi32(), 0xffff, a)
14904}
14905
14906/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14907/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14908///
14909/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14910///
14911/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu32)
14912#[inline]
14913#[target_feature(enable = "avx512fp16")]
14914#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
14915#[rustc_legacy_const_generics(3)]
14916#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14917pub fn _mm512_mask_cvtt_roundph_epu32<const SAE: i32>(
14918    src: __m512i,
14919    k: __mmask16,
14920    a: __m256h,
14921) -> __m512i {
14922    unsafe {
14923        static_assert_sae!(SAE);
14924        transmute(vcvttph2udq_512(a, src.as_u32x16(), k, SAE))
14925    }
14926}
14927
14928/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14929/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14930///
14931/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14932///
14933/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu32)
14934#[inline]
14935#[target_feature(enable = "avx512fp16")]
14936#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
14937#[rustc_legacy_const_generics(2)]
14938#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14939pub fn _mm512_maskz_cvtt_roundph_epu32<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512i {
14940    static_assert_sae!(SAE);
14941    _mm512_mask_cvtt_roundph_epu32::<SAE>(_mm512_setzero_si512(), k, a)
14942}
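
// Illustrative sketch only (not part of the module's API): the 512-bit truncating
// conversion with exception suppression, keeping only the lanes selected by the
// mask. The helper name and values are hypothetical; `avx512fp16` is assumed.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_maskz_cvtt_roundph_epu32() -> __m512i {
    // Sixteen f16 lanes, all set to 9.75; truncation yields 9 in the selected lanes.
    let a = _mm256_set1_ph(9.75);
    // The upper eight lanes are zeroed by the mask; exceptions are suppressed.
    _mm512_maskz_cvtt_roundph_epu32::<{ _MM_FROUND_NO_EXC }>(0x00ff, a)
}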
14943
14944/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store
14945/// the result in dst.
14946///
14947/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_u32)
14948#[inline]
14949#[target_feature(enable = "avx512fp16")]
14950#[cfg_attr(test, assert_instr(vcvttsh2usi))]
14951#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14952pub fn _mm_cvttsh_u32(a: __m128h) -> u32 {
14953    unsafe { vcvttsh2usi32(a, _MM_FROUND_CUR_DIRECTION) }
14954}
14955
14956/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store
14957/// the result in dst.
14958///
14959/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14960///
14961/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_u32)
14962#[inline]
14963#[target_feature(enable = "avx512fp16")]
14964#[cfg_attr(test, assert_instr(vcvttsh2usi, SAE = 8))]
14965#[rustc_legacy_const_generics(1)]
14966#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14967pub fn _mm_cvtt_roundsh_u32<const SAE: i32>(a: __m128h) -> u32 {
14968    unsafe {
14969        static_assert_sae!(SAE);
14970        vcvttsh2usi32(a, SAE)
14971    }
14972}
14973
14974/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
14975/// store the results in dst.
14976///
14977/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi64)
14978#[inline]
14979#[target_feature(enable = "avx512fp16,avx512vl")]
14980#[cfg_attr(test, assert_instr(vcvtph2qq))]
14981#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14982pub fn _mm_cvtph_epi64(a: __m128h) -> __m128i {
14983    _mm_mask_cvtph_epi64(_mm_undefined_si128(), 0xff, a)
14984}
14985
14986/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
14987/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14988///
14989/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi64)
14990#[inline]
14991#[target_feature(enable = "avx512fp16,avx512vl")]
14992#[cfg_attr(test, assert_instr(vcvtph2qq))]
14993#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14994pub fn _mm_mask_cvtph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14995    unsafe { transmute(vcvtph2qq_128(a, src.as_i64x2(), k)) }
14996}
14997
14998/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
14999/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15000///
15001/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi64)
15002#[inline]
15003#[target_feature(enable = "avx512fp16,avx512vl")]
15004#[cfg_attr(test, assert_instr(vcvtph2qq))]
15005#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15006pub fn _mm_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m128i {
15007    _mm_mask_cvtph_epi64(_mm_setzero_si128(), k, a)
15008}
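
// Illustrative sketch only (not part of the module's API): the two lowest f16
// lanes widen to signed 64-bit lanes, with the write mask deciding which result
// lanes come from the conversion and which are kept from `src`. The helper name
// and values are hypothetical; `avx512fp16` and `avx512vl` are assumed.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_mask_cvtph_epi64() -> __m128i {
    let src = _mm_set1_epi64x(-42);
    let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -2.0, 1.0);
    // Only lane 0 is converted (to 1); lane 1 keeps -42 from `src`.
    _mm_mask_cvtph_epi64(src, 0b01, a)
}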
15009
15010/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15011/// store the results in dst.
15012///
15013/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi64)
15014#[inline]
15015#[target_feature(enable = "avx512fp16,avx512vl")]
15016#[cfg_attr(test, assert_instr(vcvtph2qq))]
15017#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15018pub fn _mm256_cvtph_epi64(a: __m128h) -> __m256i {
15019    _mm256_mask_cvtph_epi64(_mm256_undefined_si256(), 0xff, a)
15020}
15021
15022/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15023/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15024///
15025/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi64)
15026#[inline]
15027#[target_feature(enable = "avx512fp16,avx512vl")]
15028#[cfg_attr(test, assert_instr(vcvtph2qq))]
15029#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15030pub fn _mm256_mask_cvtph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15031    unsafe { transmute(vcvtph2qq_256(a, src.as_i64x4(), k)) }
15032}
15033
15034/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15035/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15036///
15037/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi64)
15038#[inline]
15039#[target_feature(enable = "avx512fp16,avx512vl")]
15040#[cfg_attr(test, assert_instr(vcvtph2qq))]
15041#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15042pub fn _mm256_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m256i {
15043    _mm256_mask_cvtph_epi64(_mm256_setzero_si256(), k, a)
15044}
15045
15046/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15047/// store the results in dst.
15048///
15049/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi64)
15050#[inline]
15051#[target_feature(enable = "avx512fp16")]
15052#[cfg_attr(test, assert_instr(vcvtph2qq))]
15053#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15054pub fn _mm512_cvtph_epi64(a: __m128h) -> __m512i {
15055    _mm512_mask_cvtph_epi64(_mm512_undefined_epi32(), 0xff, a)
15056}
15057
15058/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15059/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15060///
15061/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi64)
15062#[inline]
15063#[target_feature(enable = "avx512fp16")]
15064#[cfg_attr(test, assert_instr(vcvtph2qq))]
15065#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15066pub fn _mm512_mask_cvtph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15067    unsafe {
15068        transmute(vcvtph2qq_512(
15069            a,
15070            src.as_i64x8(),
15071            k,
15072            _MM_FROUND_CUR_DIRECTION,
15073        ))
15074    }
15075}
15076
15077/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15078/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15079///
15080/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi64)
15081#[inline]
15082#[target_feature(enable = "avx512fp16")]
15083#[cfg_attr(test, assert_instr(vcvtph2qq))]
15084#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15085pub fn _mm512_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m512i {
15086    _mm512_mask_cvtph_epi64(_mm512_setzero_si512(), k, a)
15087}
15088
15089/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15090/// store the results in dst.
15091///
15092/// Rounding is done according to the rounding parameter, which can be one of:
15093///
15094/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15095/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15096/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15097/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15098/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15099///
15100/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi64)
15101#[inline]
15102#[target_feature(enable = "avx512fp16")]
15103#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
15104#[rustc_legacy_const_generics(1)]
15105#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15106pub fn _mm512_cvt_roundph_epi64<const ROUNDING: i32>(a: __m128h) -> __m512i {
15107    static_assert_rounding!(ROUNDING);
15108    _mm512_mask_cvt_roundph_epi64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
15109}
15110
15111/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15112/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15113///
15114/// Rounding is done according to the rounding parameter, which can be one of:
15115///
15116/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15117/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15118/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15119/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15120/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15121///
15122/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi64)
15123#[inline]
15124#[target_feature(enable = "avx512fp16")]
15125#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
15126#[rustc_legacy_const_generics(3)]
15127#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15128pub fn _mm512_mask_cvt_roundph_epi64<const ROUNDING: i32>(
15129    src: __m512i,
15130    k: __mmask8,
15131    a: __m128h,
15132) -> __m512i {
15133    unsafe {
15134        static_assert_rounding!(ROUNDING);
15135        transmute(vcvtph2qq_512(a, src.as_i64x8(), k, ROUNDING))
15136    }
15137}
15138
15139/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15140/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15141///
15142/// Rounding is done according to the rounding parameter, which can be one of:
15143///
15144/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15145/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15146/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15147/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15148/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15149///
15150/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi64)
15151#[inline]
15152#[target_feature(enable = "avx512fp16")]
15153#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
15154#[rustc_legacy_const_generics(2)]
15155#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15156pub fn _mm512_maskz_cvt_roundph_epi64<const ROUNDING: i32>(k: __mmask8, a: __m128h) -> __m512i {
15157    static_assert_rounding!(ROUNDING);
15158    _mm512_mask_cvt_roundph_epi64::<ROUNDING>(_mm512_setzero_si512(), k, a)
15159}
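
// Illustrative sketch only (not part of the module's API): a rounding
// (non-truncating) conversion to 64-bit integers where the rounding mode is
// chosen per call instead of being read from `MXCSR.RC`. The helper name and
// value are hypothetical; `avx512fp16` is assumed.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_cvt_roundph_epi64() -> __m512i {
    // Eight f16 lanes of 2.5: round-to-nearest-even produces 2 in every lane.
    let a = _mm_set1_ph(2.5);
    _mm512_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a)
}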
15160
15161/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15162/// store the results in dst.
15163///
15164/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu64)
15165#[inline]
15166#[target_feature(enable = "avx512fp16,avx512vl")]
15167#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15168#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15169pub fn _mm_cvtph_epu64(a: __m128h) -> __m128i {
15170    _mm_mask_cvtph_epu64(_mm_undefined_si128(), 0xff, a)
15171}
15172
15173/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15174/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15175///
15176/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu64)
15177#[inline]
15178#[target_feature(enable = "avx512fp16,avx512vl")]
15179#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15180#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15181pub fn _mm_mask_cvtph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
15182    unsafe { transmute(vcvtph2uqq_128(a, src.as_u64x2(), k)) }
15183}
15184
15185/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15186/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15187///
15188/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu64)
15189#[inline]
15190#[target_feature(enable = "avx512fp16,avx512vl")]
15191#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15192#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15193pub fn _mm_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m128i {
15194    _mm_mask_cvtph_epu64(_mm_setzero_si128(), k, a)
15195}
15196
15197/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15198/// store the results in dst.
15199///
15200/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu64)
15201#[inline]
15202#[target_feature(enable = "avx512fp16,avx512vl")]
15203#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15204#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15205pub fn _mm256_cvtph_epu64(a: __m128h) -> __m256i {
15206    _mm256_mask_cvtph_epu64(_mm256_undefined_si256(), 0xff, a)
15207}
15208
15209/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15210/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15211///
15212/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu64)
15213#[inline]
15214#[target_feature(enable = "avx512fp16,avx512vl")]
15215#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15216#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15217pub fn _mm256_mask_cvtph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15218    unsafe { transmute(vcvtph2uqq_256(a, src.as_u64x4(), k)) }
15219}
15220
15221/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15222/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15223///
15224/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu64)
15225#[inline]
15226#[target_feature(enable = "avx512fp16,avx512vl")]
15227#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15228#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15229pub fn _mm256_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m256i {
15230    _mm256_mask_cvtph_epu64(_mm256_setzero_si256(), k, a)
15231}
15232
15233/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15234/// store the results in dst.
15235///
15236/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu64)
15237#[inline]
15238#[target_feature(enable = "avx512fp16")]
15239#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15240#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15241pub fn _mm512_cvtph_epu64(a: __m128h) -> __m512i {
15242    _mm512_mask_cvtph_epu64(_mm512_undefined_epi32(), 0xff, a)
15243}
15244
15245/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15246/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15247///
15248/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu64)
15249#[inline]
15250#[target_feature(enable = "avx512fp16")]
15251#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15252#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15253pub fn _mm512_mask_cvtph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15254    unsafe {
15255        transmute(vcvtph2uqq_512(
15256            a,
15257            src.as_u64x8(),
15258            k,
15259            _MM_FROUND_CUR_DIRECTION,
15260        ))
15261    }
15262}
15263
15264/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15265/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15266///
15267/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu64)
15268#[inline]
15269#[target_feature(enable = "avx512fp16")]
15270#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15271#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15272pub fn _mm512_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m512i {
15273    _mm512_mask_cvtph_epu64(_mm512_setzero_si512(), k, a)
15274}
15275
15276/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15277/// store the results in dst.
15278///
15279/// Rounding is done according to the rounding parameter, which can be one of:
15280///
15281/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15282/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15283/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15284/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15285/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15286///
15287/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu64)
15288#[inline]
15289#[target_feature(enable = "avx512fp16")]
15290#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
15291#[rustc_legacy_const_generics(1)]
15292#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15293pub fn _mm512_cvt_roundph_epu64<const ROUNDING: i32>(a: __m128h) -> __m512i {
15294    static_assert_rounding!(ROUNDING);
15295    _mm512_mask_cvt_roundph_epu64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
15296}
15297
15298/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15299/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15300///
15301/// Rounding is done according to the rounding parameter, which can be one of:
15302///
15303/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15304/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15305/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15306/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15307/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15308///
15309/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu64)
15310#[inline]
15311#[target_feature(enable = "avx512fp16")]
15312#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
15313#[rustc_legacy_const_generics(3)]
15314#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15315pub fn _mm512_mask_cvt_roundph_epu64<const ROUNDING: i32>(
15316    src: __m512i,
15317    k: __mmask8,
15318    a: __m128h,
15319) -> __m512i {
15320    unsafe {
15321        static_assert_rounding!(ROUNDING);
15322        transmute(vcvtph2uqq_512(a, src.as_u64x8(), k, ROUNDING))
15323    }
15324}
15325
15326/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15327/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15328///
15329/// Rounding is done according to the rounding parameter, which can be one of:
15330///
15331/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15332/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15333/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15334/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15335/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15336///
15337/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu64)
15338#[inline]
15339#[target_feature(enable = "avx512fp16")]
15340#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
15341#[rustc_legacy_const_generics(2)]
15342#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15343pub fn _mm512_maskz_cvt_roundph_epu64<const ROUNDING: i32>(k: __mmask8, a: __m128h) -> __m512i {
15344    static_assert_rounding!(ROUNDING);
15345    _mm512_mask_cvt_roundph_epu64::<ROUNDING>(_mm512_setzero_si512(), k, a)
15346}
15347
15348/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15349/// store the results in dst.
15350///
15351/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi64)
15352#[inline]
15353#[target_feature(enable = "avx512fp16,avx512vl")]
15354#[cfg_attr(test, assert_instr(vcvttph2qq))]
15355#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15356pub fn _mm_cvttph_epi64(a: __m128h) -> __m128i {
15357    _mm_mask_cvttph_epi64(_mm_undefined_si128(), 0xff, a)
15358}
15359
15360/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15361/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15362///
15363/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi64)
15364#[inline]
15365#[target_feature(enable = "avx512fp16,avx512vl")]
15366#[cfg_attr(test, assert_instr(vcvttph2qq))]
15367#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15368pub fn _mm_mask_cvttph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
15369    unsafe { transmute(vcvttph2qq_128(a, src.as_i64x2(), k)) }
15370}
15371
15372/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15373/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15374///
15375/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi64)
15376#[inline]
15377#[target_feature(enable = "avx512fp16,avx512vl")]
15378#[cfg_attr(test, assert_instr(vcvttph2qq))]
15379#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15380pub fn _mm_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m128i {
15381    _mm_mask_cvttph_epi64(_mm_setzero_si128(), k, a)
15382}
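
// Illustrative sketch only (not part of the module's API): truncation always drops
// the fractional part toward zero, independent of MXCSR, so 1.9 and -1.9 become
// 1 and -1. The helper name and values are hypothetical; `avx512fp16` and
// `avx512vl` are assumed.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_cvttph_epi64() -> __m128i {
    let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.9, 1.9);
    // Both 64-bit result lanes are produced from the two lowest f16 lanes.
    _mm_cvttph_epi64(a)
}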
15383
15384/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15385/// store the results in dst.
15386///
15387/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi64)
15388#[inline]
15389#[target_feature(enable = "avx512fp16,avx512vl")]
15390#[cfg_attr(test, assert_instr(vcvttph2qq))]
15391#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15392pub fn _mm256_cvttph_epi64(a: __m128h) -> __m256i {
15393    _mm256_mask_cvttph_epi64(_mm256_undefined_si256(), 0xff, a)
15394}
15395
15396/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15397/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15398///
15399/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi64)
15400#[inline]
15401#[target_feature(enable = "avx512fp16,avx512vl")]
15402#[cfg_attr(test, assert_instr(vcvttph2qq))]
15403#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15404pub fn _mm256_mask_cvttph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15405    unsafe { transmute(vcvttph2qq_256(a, src.as_i64x4(), k)) }
15406}
15407
15408/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15409/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15410///
15411/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi64)
15412#[inline]
15413#[target_feature(enable = "avx512fp16,avx512vl")]
15414#[cfg_attr(test, assert_instr(vcvttph2qq))]
15415#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15416pub fn _mm256_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m256i {
15417    _mm256_mask_cvttph_epi64(_mm256_setzero_si256(), k, a)
15418}
15419
15420/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15421/// store the results in dst.
15422///
15423/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi64)
15424#[inline]
15425#[target_feature(enable = "avx512fp16")]
15426#[cfg_attr(test, assert_instr(vcvttph2qq))]
15427#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15428pub fn _mm512_cvttph_epi64(a: __m128h) -> __m512i {
15429    _mm512_mask_cvttph_epi64(_mm512_undefined_epi32(), 0xff, a)
15430}
15431
15432/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15433/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15434///
15435/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi64)
15436#[inline]
15437#[target_feature(enable = "avx512fp16")]
15438#[cfg_attr(test, assert_instr(vcvttph2qq))]
15439#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15440pub fn _mm512_mask_cvttph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15441    unsafe {
15442        transmute(vcvttph2qq_512(
15443            a,
15444            src.as_i64x8(),
15445            k,
15446            _MM_FROUND_CUR_DIRECTION,
15447        ))
15448    }
15449}
15450
15451/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15452/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15453///
15454/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi64)
15455#[inline]
15456#[target_feature(enable = "avx512fp16")]
15457#[cfg_attr(test, assert_instr(vcvttph2qq))]
15458#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15459pub fn _mm512_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m512i {
15460    _mm512_mask_cvttph_epi64(_mm512_setzero_si512(), k, a)
15461}
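
// Illustrative sketch only (not part of the module's API): all eight f16 lanes of
// the 128-bit source widen into the eight 64-bit lanes of the 512-bit result, with
// unselected lanes zeroed. The helper name and value are hypothetical; `avx512fp16`
// is assumed.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_maskz_cvttph_epi64_512() -> __m512i {
    let a = _mm_set1_ph(3.5);
    // The lower four lanes hold 3 (truncated); the upper four are zeroed by the mask.
    _mm512_maskz_cvttph_epi64(0x0f, a)
}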
15462
15463/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15464/// store the results in dst.
15465///
15466/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15467///
15468/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi64)
15469#[inline]
15470#[target_feature(enable = "avx512fp16")]
15471#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
15472#[rustc_legacy_const_generics(1)]
15473#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15474pub fn _mm512_cvtt_roundph_epi64<const SAE: i32>(a: __m128h) -> __m512i {
15475    static_assert_sae!(SAE);
15476    _mm512_mask_cvtt_roundph_epi64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
15477}
15478
15479/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15480/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15481///
15482/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15483///
15484/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi64)
15485#[inline]
15486#[target_feature(enable = "avx512fp16")]
15487#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
15488#[rustc_legacy_const_generics(3)]
15489#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15490pub fn _mm512_mask_cvtt_roundph_epi64<const SAE: i32>(
15491    src: __m512i,
15492    k: __mmask8,
15493    a: __m128h,
15494) -> __m512i {
15495    unsafe {
15496        static_assert_sae!(SAE);
15497        transmute(vcvttph2qq_512(a, src.as_i64x8(), k, SAE))
15498    }
15499}
15500
15501/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15502/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15503///
15504/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15505///
15506/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi64)
15507#[inline]
15508#[target_feature(enable = "avx512fp16")]
15509#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
15510#[rustc_legacy_const_generics(2)]
15511#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15512pub fn _mm512_maskz_cvtt_roundph_epi64<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512i {
15513    static_assert_sae!(SAE);
15514    _mm512_mask_cvtt_roundph_epi64::<SAE>(_mm512_setzero_si512(), k, a)
15515}
15516
15517/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15518/// store the results in dst.
15519///
15520/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu64)
15521#[inline]
15522#[target_feature(enable = "avx512fp16,avx512vl")]
15523#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15524#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15525pub fn _mm_cvttph_epu64(a: __m128h) -> __m128i {
15526    _mm_mask_cvttph_epu64(_mm_undefined_si128(), 0xff, a)
15527}
15528
15529/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15530/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15531///
15532/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu64)
15533#[inline]
15534#[target_feature(enable = "avx512fp16,avx512vl")]
15535#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15536#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15537pub fn _mm_mask_cvttph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
15538    unsafe { transmute(vcvttph2uqq_128(a, src.as_u64x2(), k)) }
15539}
15540
15541/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15542/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15543///
15544/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu64)
15545#[inline]
15546#[target_feature(enable = "avx512fp16,avx512vl")]
15547#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15548#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15549pub fn _mm_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m128i {
15550    _mm_mask_cvttph_epu64(_mm_setzero_si128(), k, a)
15551}
15552
15553/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15554/// store the results in dst.
15555///
15556/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu64)
15557#[inline]
15558#[target_feature(enable = "avx512fp16,avx512vl")]
15559#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15560#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15561pub fn _mm256_cvttph_epu64(a: __m128h) -> __m256i {
15562    _mm256_mask_cvttph_epu64(_mm256_undefined_si256(), 0xff, a)
15563}
15564
15565/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15566/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15567///
15568/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu64)
15569#[inline]
15570#[target_feature(enable = "avx512fp16,avx512vl")]
15571#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15572#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15573pub fn _mm256_mask_cvttph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15574    unsafe { transmute(vcvttph2uqq_256(a, src.as_u64x4(), k)) }
15575}
15576
15577/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15578/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15579///
15580/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu64)
15581#[inline]
15582#[target_feature(enable = "avx512fp16,avx512vl")]
15583#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15584#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15585pub fn _mm256_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m256i {
15586    _mm256_mask_cvttph_epu64(_mm256_setzero_si256(), k, a)
15587}
15588
15589/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15590/// store the results in dst.
15591///
15592/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu64)
15593#[inline]
15594#[target_feature(enable = "avx512fp16")]
15595#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15596#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15597pub fn _mm512_cvttph_epu64(a: __m128h) -> __m512i {
15598    _mm512_mask_cvttph_epu64(_mm512_undefined_epi32(), 0xff, a)
15599}
15600
15601/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15602/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15603///
15604/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu64)
15605#[inline]
15606#[target_feature(enable = "avx512fp16")]
15607#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15608#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15609pub fn _mm512_mask_cvttph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15610    unsafe {
15611        transmute(vcvttph2uqq_512(
15612            a,
15613            src.as_u64x8(),
15614            k,
15615            _MM_FROUND_CUR_DIRECTION,
15616        ))
15617    }
15618}
15619
15620/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15621/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15622///
15623/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu64)
15624#[inline]
15625#[target_feature(enable = "avx512fp16")]
15626#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15627#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15628pub fn _mm512_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m512i {
15629    _mm512_mask_cvttph_epu64(_mm512_setzero_si512(), k, a)
15630}
15631
15632/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15633/// store the results in dst.
15634///
15635/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15636///
15637/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu64)
15638#[inline]
15639#[target_feature(enable = "avx512fp16")]
15640#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
15641#[rustc_legacy_const_generics(1)]
15642#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15643pub fn _mm512_cvtt_roundph_epu64<const SAE: i32>(a: __m128h) -> __m512i {
15644    static_assert_sae!(SAE);
15645    _mm512_mask_cvtt_roundph_epu64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
15646}
15647
15648/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15649/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15650///
15651/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15652///
15653/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu64)
15654#[inline]
15655#[target_feature(enable = "avx512fp16")]
15656#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
15657#[rustc_legacy_const_generics(3)]
15658#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15659pub fn _mm512_mask_cvtt_roundph_epu64<const SAE: i32>(
15660    src: __m512i,
15661    k: __mmask8,
15662    a: __m128h,
15663) -> __m512i {
15664    unsafe {
15665        static_assert_sae!(SAE);
15666        transmute(vcvttph2uqq_512(a, src.as_u64x8(), k, SAE))
15667    }
15668}
15669
15670/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15671/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15672///
15673/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15674///
15675/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu64)
15676#[inline]
15677#[target_feature(enable = "avx512fp16")]
15678#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
15679#[rustc_legacy_const_generics(2)]
15680#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15681pub fn _mm512_maskz_cvtt_roundph_epu64<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512i {
15682    static_assert_sae!(SAE);
15683    _mm512_mask_cvtt_roundph_epu64::<SAE>(_mm512_setzero_si512(), k, a)
15684}
15685
15686/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15687/// floating-point elements, and store the results in dst.
15688///
15689/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxph_ps)
15690#[inline]
15691#[target_feature(enable = "avx512fp16,avx512vl")]
15692#[cfg_attr(test, assert_instr(vcvtph2psx))]
15693#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15694pub fn _mm_cvtxph_ps(a: __m128h) -> __m128 {
15695    _mm_mask_cvtxph_ps(_mm_setzero_ps(), 0xff, a)
15696}
15697
15698/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15699/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15700/// dst when the corresponding mask bit is not set).
15701///
15702/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxph_ps)
15703#[inline]
15704#[target_feature(enable = "avx512fp16,avx512vl")]
15705#[cfg_attr(test, assert_instr(vcvtph2psx))]
15706#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15707pub fn _mm_mask_cvtxph_ps(src: __m128, k: __mmask8, a: __m128h) -> __m128 {
15708    unsafe { vcvtph2psx_128(a, src, k) }
15709}
15710
15711/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15712/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15713/// corresponding mask bit is not set).
15714///
15715/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxph_ps)
15716#[inline]
15717#[target_feature(enable = "avx512fp16,avx512vl")]
15718#[cfg_attr(test, assert_instr(vcvtph2psx))]
15719#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15720pub fn _mm_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m128 {
15721    _mm_mask_cvtxph_ps(_mm_setzero_ps(), k, a)
15722}
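
// Illustrative sketch only (not part of the module's API): widening the lowest
// four f16 lanes to f32 and zeroing the lanes not selected by the mask. The helper
// name and values are hypothetical; `avx512fp16` and `avx512vl` are assumed.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_maskz_cvtxph_ps() -> __m128 {
    let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 4.0, 3.0, 2.0, 1.0);
    // Lanes 0 and 1 become 1.0f32 and 2.0f32; lanes 2 and 3 are zeroed.
    _mm_maskz_cvtxph_ps(0b0011, a)
}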
15723
15724/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15725/// floating-point elements, and store the results in dst.
15726///
15727/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxph_ps)
15728#[inline]
15729#[target_feature(enable = "avx512fp16,avx512vl")]
15730#[cfg_attr(test, assert_instr(vcvtph2psx))]
15731#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15732pub fn _mm256_cvtxph_ps(a: __m128h) -> __m256 {
15733    _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), 0xff, a)
15734}
15735
15736/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15737/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15738/// dst when the corresponding mask bit is not set).
15739///
15740/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxph_ps)
15741#[inline]
15742#[target_feature(enable = "avx512fp16,avx512vl")]
15743#[cfg_attr(test, assert_instr(vcvtph2psx))]
15744#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15745pub fn _mm256_mask_cvtxph_ps(src: __m256, k: __mmask8, a: __m128h) -> __m256 {
15746    unsafe { vcvtph2psx_256(a, src, k) }
15747}
15748
15749/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15750/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15751/// corresponding mask bit is not set).
15752///
15753/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxph_ps)
15754#[inline]
15755#[target_feature(enable = "avx512fp16,avx512vl")]
15756#[cfg_attr(test, assert_instr(vcvtph2psx))]
15757#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15758pub fn _mm256_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m256 {
15759    _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), k, a)
15760}
15761
15762/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15763/// floating-point elements, and store the results in dst.
15764///
15765/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxph_ps)
15766#[inline]
15767#[target_feature(enable = "avx512fp16")]
15768#[cfg_attr(test, assert_instr(vcvtph2psx))]
15769#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15770pub fn _mm512_cvtxph_ps(a: __m256h) -> __m512 {
15771    _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), 0xffff, a)
15772}
15773
15774/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15775/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15776/// dst when the corresponding mask bit is not set).
15777///
15778/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxph_ps)
15779#[inline]
15780#[target_feature(enable = "avx512fp16")]
15781#[cfg_attr(test, assert_instr(vcvtph2psx))]
15782#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15783pub fn _mm512_mask_cvtxph_ps(src: __m512, k: __mmask16, a: __m256h) -> __m512 {
15784    unsafe { vcvtph2psx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
15785}
15786
15787/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15788/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15789/// corresponding mask bit is not set).
15790///
15791/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxph_ps)
15792#[inline]
15793#[target_feature(enable = "avx512fp16")]
15794#[cfg_attr(test, assert_instr(vcvtph2psx))]
15795#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15796pub fn _mm512_maskz_cvtxph_ps(k: __mmask16, a: __m256h) -> __m512 {
15797    _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), k, a)
15798}
15799
15800/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15801/// floating-point elements, and store the results in dst.
15802///
15803/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15804///
15805/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundph_ps)
15806#[inline]
15807#[target_feature(enable = "avx512fp16")]
15808#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
15809#[rustc_legacy_const_generics(1)]
15810#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15811pub fn _mm512_cvtx_roundph_ps<const SAE: i32>(a: __m256h) -> __m512 {
15812    static_assert_sae!(SAE);
15813    _mm512_mask_cvtx_roundph_ps::<SAE>(_mm512_setzero_ps(), 0xffff, a)
15814}
15815
15816/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15817/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15818/// dst when the corresponding mask bit is not set).
15819///
15820/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15821///
15822/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundph_ps)
15823#[inline]
15824#[target_feature(enable = "avx512fp16")]
15825#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
15826#[rustc_legacy_const_generics(3)]
15827#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15828pub fn _mm512_mask_cvtx_roundph_ps<const SAE: i32>(
15829    src: __m512,
15830    k: __mmask16,
15831    a: __m256h,
15832) -> __m512 {
15833    unsafe {
15834        static_assert_sae!(SAE);
15835        vcvtph2psx_512(a, src, k, SAE)
15836    }
15837}
15838
15839/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15840/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15841/// corresponding mask bit is not set).
15842///
15843/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15844///
15845/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundph_ps)
15846#[inline]
15847#[target_feature(enable = "avx512fp16")]
15848#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
15849#[rustc_legacy_const_generics(2)]
15850#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15851pub fn _mm512_maskz_cvtx_roundph_ps<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512 {
15852    static_assert_sae!(SAE);
15853    _mm512_mask_cvtx_roundph_ps::<SAE>(_mm512_setzero_ps(), k, a)
15854}
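
// Illustrative sketch only (not part of the module's API): the full-width
// f16 -> f32 widening with exception suppression requested through the SAE const
// parameter. The helper name and value are hypothetical; `avx512fp16` is assumed.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_cvtx_roundph_ps() -> __m512 {
    let a = _mm256_set1_ph(1.5);
    // Every f16 lane widens exactly to 1.5f32; exceptions are suppressed.
    _mm512_cvtx_roundph_ps::<{ _MM_FROUND_NO_EXC }>(a)
}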
15855
15856/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15857/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed
15858/// elements from a to the upper elements of dst.
15859///
15860/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_ss)
15861#[inline]
15862#[target_feature(enable = "avx512fp16")]
15863#[cfg_attr(test, assert_instr(vcvtsh2ss))]
15864#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15865pub fn _mm_cvtsh_ss(a: __m128, b: __m128h) -> __m128 {
15866    _mm_mask_cvtsh_ss(a, 0xff, a, b)
15867}
15868
15869/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15870/// floating-point element, store the result in the lower element of dst using writemask k (the element is
15871/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the
15872/// upper elements of dst.
15873///
15874/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_ss)
15875#[inline]
15876#[target_feature(enable = "avx512fp16")]
15877#[cfg_attr(test, assert_instr(vcvtsh2ss))]
15878#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15879pub fn _mm_mask_cvtsh_ss(src: __m128, k: __mmask8, a: __m128, b: __m128h) -> __m128 {
15880    unsafe { vcvtsh2ss(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
15881}
15882
15883/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15884/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
15885/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements
15886/// of dst.
15887///
15888/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_ss)
15889#[inline]
15890#[target_feature(enable = "avx512fp16")]
15891#[cfg_attr(test, assert_instr(vcvtsh2ss))]
15892#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15893pub fn _mm_maskz_cvtsh_ss(k: __mmask8, a: __m128, b: __m128h) -> __m128 {
15894    _mm_mask_cvtsh_ss(_mm_setzero_ps(), k, a, b)
15895}
15896
15897/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15898/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements
15899/// from a to the upper elements of dst.
15900///
15901/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15902///
15903/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_ss)
15904#[inline]
15905#[target_feature(enable = "avx512fp16")]
15906#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
15907#[rustc_legacy_const_generics(2)]
15908#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15909pub fn _mm_cvt_roundsh_ss<const SAE: i32>(a: __m128, b: __m128h) -> __m128 {
15910    static_assert_sae!(SAE);
15911    _mm_mask_cvt_roundsh_ss::<SAE>(_mm_undefined_ps(), 0xff, a, b)
15912}
15913
15914/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15915/// floating-point element, store the result in the lower element of dst using writemask k (the element is
15916/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the
15917/// upper elements of dst.
15918///
15919/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15920///
15921/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_ss)
15922#[inline]
15923#[target_feature(enable = "avx512fp16")]
15924#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
15925#[rustc_legacy_const_generics(4)]
15926#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15927pub fn _mm_mask_cvt_roundsh_ss<const SAE: i32>(
15928    src: __m128,
15929    k: __mmask8,
15930    a: __m128,
15931    b: __m128h,
15932) -> __m128 {
15933    unsafe {
15934        static_assert_sae!(SAE);
15935        vcvtsh2ss(a, b, src, k, SAE)
15936    }
15937}
15938
15939/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15940/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
15941/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements
15942/// of dst.
15943///
15944/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15945///
15946/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_ss)
15947#[inline]
15948#[target_feature(enable = "avx512fp16")]
15949#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
15950#[rustc_legacy_const_generics(3)]
15951#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15952pub fn _mm_maskz_cvt_roundsh_ss<const SAE: i32>(k: __mmask8, a: __m128, b: __m128h) -> __m128 {
15953    static_assert_sae!(SAE);
15954    _mm_mask_cvt_roundsh_ss::<SAE>(_mm_setzero_ps(), k, a, b)
15955}
15956
15957/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15958/// floating-point elements, and store the results in dst.
15959///
15960/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_pd)
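///
/// Illustrative sketch (not compiled as a doctest; assumes `avx512fp16,avx512vl`
/// support and nightly `f16`). Only the two lowest f16 lanes are widened:
///
/// ```ignore
/// let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
/// let r = _mm_cvtph_pd(a);
/// assert_eq!(_mm_cvtsd_f64(r), 1.0);
/// ```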
15961#[inline]
15962#[target_feature(enable = "avx512fp16,avx512vl")]
15963#[cfg_attr(test, assert_instr(vcvtph2pd))]
15964#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15965pub fn _mm_cvtph_pd(a: __m128h) -> __m128d {
15966    _mm_mask_cvtph_pd(_mm_setzero_pd(), 0xff, a)
15967}
15968
15969/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15970/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15971/// dst when the corresponding mask bit is not set).
15972///
15973/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_pd)
15974#[inline]
15975#[target_feature(enable = "avx512fp16,avx512vl")]
15976#[cfg_attr(test, assert_instr(vcvtph2pd))]
15977#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15978pub fn _mm_mask_cvtph_pd(src: __m128d, k: __mmask8, a: __m128h) -> __m128d {
15979    unsafe { vcvtph2pd_128(a, src, k) }
15980}
15981
15982/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15983/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15984/// corresponding mask bit is not set).
15985///
15986/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_pd)
15987#[inline]
15988#[target_feature(enable = "avx512fp16,avx512vl")]
15989#[cfg_attr(test, assert_instr(vcvtph2pd))]
15990#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15991pub fn _mm_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m128d {
15992    _mm_mask_cvtph_pd(_mm_setzero_pd(), k, a)
15993}
15994
15995/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15996/// floating-point elements, and store the results in dst.
15997///
15998/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_pd)
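///
/// Illustrative sketch (not compiled as a doctest; assumes `avx512fp16,avx512vl`
/// support and nightly `f16`). The four lowest f16 lanes are widened to f64:
///
/// ```ignore
/// let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
/// let r = _mm256_cvtph_pd(a);
/// assert_eq!(_mm256_cvtsd_f64(r), 1.0);
/// ```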
15999#[inline]
16000#[target_feature(enable = "avx512fp16,avx512vl")]
16001#[cfg_attr(test, assert_instr(vcvtph2pd))]
16002#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16003pub fn _mm256_cvtph_pd(a: __m128h) -> __m256d {
16004    _mm256_mask_cvtph_pd(_mm256_setzero_pd(), 0xff, a)
16005}
16006
16007/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16008/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
16009/// dst when the corresponding mask bit is not set).
16010///
16011/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_pd)
16012#[inline]
16013#[target_feature(enable = "avx512fp16,avx512vl")]
16014#[cfg_attr(test, assert_instr(vcvtph2pd))]
16015#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16016pub fn _mm256_mask_cvtph_pd(src: __m256d, k: __mmask8, a: __m128h) -> __m256d {
16017    unsafe { vcvtph2pd_256(a, src, k) }
16018}
16019
16020/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16021/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
16022/// corresponding mask bit is not set).
16023///
16024/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_pd)
16025#[inline]
16026#[target_feature(enable = "avx512fp16,avx512vl")]
16027#[cfg_attr(test, assert_instr(vcvtph2pd))]
16028#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16029pub fn _mm256_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m256d {
16030    _mm256_mask_cvtph_pd(_mm256_setzero_pd(), k, a)
16031}
16032
16033/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16034/// floating-point elements, and store the results in dst.
16035///
16036/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_pd)
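///
/// Illustrative sketch (not compiled as a doctest; assumes `avx512fp16` support and
/// nightly `f16`). All eight f16 lanes of `a` are widened to the eight f64 lanes of dst:
///
/// ```ignore
/// let a = _mm_set1_ph(0.5);
/// let r = _mm512_cvtph_pd(a);
/// assert_eq!(_mm512_cvtsd_f64(r), 0.5);
/// ```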
16037#[inline]
16038#[target_feature(enable = "avx512fp16")]
16039#[cfg_attr(test, assert_instr(vcvtph2pd))]
16040#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16041pub fn _mm512_cvtph_pd(a: __m128h) -> __m512d {
16042    _mm512_mask_cvtph_pd(_mm512_setzero_pd(), 0xff, a)
16043}
16044
16045/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16046/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
16047/// dst when the corresponding mask bit is not set).
16048///
16049/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_pd)
16050#[inline]
16051#[target_feature(enable = "avx512fp16")]
16052#[cfg_attr(test, assert_instr(vcvtph2pd))]
16053#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16054pub fn _mm512_mask_cvtph_pd(src: __m512d, k: __mmask8, a: __m128h) -> __m512d {
16055    unsafe { vcvtph2pd_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
16056}
16057
16058/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16059/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
16060/// corresponding mask bit is not set).
16061///
16062/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_pd)
16063#[inline]
16064#[target_feature(enable = "avx512fp16")]
16065#[cfg_attr(test, assert_instr(vcvtph2pd))]
16066#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16067pub fn _mm512_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m512d {
16068    _mm512_mask_cvtph_pd(_mm512_setzero_pd(), k, a)
16069}
16070
16071/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16072/// floating-point elements, and store the results in dst.
16073///
16074/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16075///
16076/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_pd)
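///
/// Example sketch (not run as a doctest; assumes `avx512fp16` support and nightly `f16`):
///
/// ```ignore
/// let a = _mm_set1_ph(2.0);
/// let r = _mm512_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(a);
/// assert_eq!(_mm512_cvtsd_f64(r), 2.0);
/// ```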
16077#[inline]
16078#[target_feature(enable = "avx512fp16")]
16079#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
16080#[rustc_legacy_const_generics(1)]
16081#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16082pub fn _mm512_cvt_roundph_pd<const SAE: i32>(a: __m128h) -> __m512d {
16083    static_assert_sae!(SAE);
16084    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), 0xff, a)
16085}
16086
16087/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16088/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
16089/// dst when the corresponding mask bit is not set).
16090///
16091/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16092///
16093/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_pd)
16094#[inline]
16095#[target_feature(enable = "avx512fp16")]
16096#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
16097#[rustc_legacy_const_generics(3)]
16098#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16099pub fn _mm512_mask_cvt_roundph_pd<const SAE: i32>(
16100    src: __m512d,
16101    k: __mmask8,
16102    a: __m128h,
16103) -> __m512d {
16104    unsafe {
16105        static_assert_sae!(SAE);
16106        vcvtph2pd_512(a, src, k, SAE)
16107    }
16108}
16109
16110/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16111/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
16112/// corresponding mask bit is not set).
16113///
16114/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16115///
16116/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_pd)
16117#[inline]
16118#[target_feature(enable = "avx512fp16")]
16119#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
16120#[rustc_legacy_const_generics(2)]
16121#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16122pub fn _mm512_maskz_cvt_roundph_pd<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512d {
16123    static_assert_sae!(SAE);
16124    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), k, a)
16125}
16126
16127/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16128/// floating-point element, store the result in the lower element of dst, and copy the upper element
16129/// from a to the upper element of dst.
16130///
16131/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_sd)
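///
/// Illustrative sketch (not compiled as a doctest; assumes `avx512fp16` support and
/// nightly `f16`):
///
/// ```ignore
/// let a = _mm_setr_pd(10.0, 20.0);
/// let b = _mm_set_sh(1.25);
/// let r = _mm_cvtsh_sd(a, b);
/// assert_eq!(_mm_cvtsd_f64(r), 1.25); // lane 1 still holds 20.0 from `a`
/// ```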
16132#[inline]
16133#[target_feature(enable = "avx512fp16")]
16134#[cfg_attr(test, assert_instr(vcvtsh2sd))]
16135#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16136pub fn _mm_cvtsh_sd(a: __m128d, b: __m128h) -> __m128d {
16137    _mm_mask_cvtsh_sd(a, 0xff, a, b)
16138}
16139
16140/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16141/// floating-point element, store the result in the lower element of dst using writemask k (the element is
16142/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
16143/// of dst.
16144///
16145/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_sd)
16146#[inline]
16147#[target_feature(enable = "avx512fp16")]
16148#[cfg_attr(test, assert_instr(vcvtsh2sd))]
16149#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16150pub fn _mm_mask_cvtsh_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
16151    unsafe { vcvtsh2sd(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
16152}
16153
16154/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16155/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
16156/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
16157///
16158/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_sd)
16159#[inline]
16160#[target_feature(enable = "avx512fp16")]
16161#[cfg_attr(test, assert_instr(vcvtsh2sd))]
16162#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16163pub fn _mm_maskz_cvtsh_sd(k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
16164    _mm_mask_cvtsh_sd(_mm_setzero_pd(), k, a, b)
16165}
16166
16167/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16168/// floating-point element, store the result in the lower element of dst, and copy the upper element from a
16169/// to the upper element of dst.
16170///
16171/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16172///
16173/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_sd)
16174#[inline]
16175#[target_feature(enable = "avx512fp16")]
16176#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
16177#[rustc_legacy_const_generics(2)]
16178#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16179pub fn _mm_cvt_roundsh_sd<const SAE: i32>(a: __m128d, b: __m128h) -> __m128d {
16180    static_assert_sae!(SAE);
16181    _mm_mask_cvt_roundsh_sd::<SAE>(a, 0xff, a, b)
16182}
16183
16184/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16185/// floating-point element, store the result in the lower element of dst using writemask k (the element is
16186/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
16187/// of dst.
16188///
16189/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16190///
16191/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_sd)
16192#[inline]
16193#[target_feature(enable = "avx512fp16")]
16194#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
16195#[rustc_legacy_const_generics(4)]
16196#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16197pub fn _mm_mask_cvt_roundsh_sd<const SAE: i32>(
16198    src: __m128d,
16199    k: __mmask8,
16200    a: __m128d,
16201    b: __m128h,
16202) -> __m128d {
16203    unsafe {
16204        static_assert_sae!(SAE);
16205        vcvtsh2sd(a, b, src, k, SAE)
16206    }
16207}
16208
16209/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16210/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
16211/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
16212///
16213/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16214///
16215/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_sd)
16216#[inline]
16217#[target_feature(enable = "avx512fp16")]
16218#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
16219#[rustc_legacy_const_generics(3)]
16220#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16221pub fn _mm_maskz_cvt_roundsh_sd<const SAE: i32>(k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
16222    static_assert_sae!(SAE);
16223    _mm_mask_cvt_roundsh_sd::<SAE>(_mm_setzero_pd(), k, a, b)
16224}
16225
16226/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
16227///
16228/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_h)
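///
/// Illustrative sketch (not compiled as a doctest; assumes `avx512fp16` support and
/// nightly `f16`):
///
/// ```ignore
/// let a = _mm_set_sh(7.5);
/// assert_eq!(_mm_cvtsh_h(a), 7.5);
/// ```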
16229#[inline]
16230#[target_feature(enable = "avx512fp16")]
16231#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16232pub fn _mm_cvtsh_h(a: __m128h) -> f16 {
16233    unsafe { simd_extract!(a, 0) }
16234}
16235
16236/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
16237///
16238/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsh_h)
16239#[inline]
16240#[target_feature(enable = "avx512fp16")]
16241#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16242pub fn _mm256_cvtsh_h(a: __m256h) -> f16 {
16243    unsafe { simd_extract!(a, 0) }
16244}
16245
16246/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
16247///
16248/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsh_h)
16249#[inline]
16250#[target_feature(enable = "avx512fp16")]
16251#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16252pub fn _mm512_cvtsh_h(a: __m512h) -> f16 {
16253    unsafe { simd_extract!(a, 0) }
16254}
16255
16256/// Copy the lower 16-bit integer in a to dst.
16257///
16258/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si16)
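///
/// Illustrative sketch (not compiled as a doctest; assumes `avx512fp16` support):
///
/// ```ignore
/// let a = _mm_setr_epi16(42, 1, 2, 3, 4, 5, 6, 7);
/// assert_eq!(_mm_cvtsi128_si16(a), 42);
/// ```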
16259#[inline]
16260#[target_feature(enable = "avx512fp16")]
16261#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16262pub fn _mm_cvtsi128_si16(a: __m128i) -> i16 {
16263    unsafe { simd_extract!(a.as_i16x8(), 0) }
16264}
16265
16266/// Copy 16-bit integer a to the lower element of dst, and zero the upper elements of dst.
16267///
16268/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi16_si128)
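///
/// Illustrative sketch (not compiled as a doctest; assumes `avx512fp16` support):
///
/// ```ignore
/// let a = _mm_cvtsi16_si128(-7);
/// assert_eq!(_mm_cvtsi128_si16(a), -7);
/// assert_eq!(_mm_extract_epi16::<1>(a), 0); // upper lanes are zeroed
/// ```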
16269#[inline]
16270#[target_feature(enable = "avx512fp16")]
16271#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16272pub fn _mm_cvtsi16_si128(a: i16) -> __m128i {
16273    unsafe { transmute(simd_insert!(i16x8::ZERO, 0, a)) }
16274}
16275
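// Raw LLVM intrinsic bindings that back the safe wrappers above. These are not part of
// the public API; each `link_name` selects the LLVM intrinsic implementing the
// corresponding instruction form (masked, rounding, or SAE variants take extra arguments).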
16276#[allow(improper_ctypes)]
16277unsafe extern "C" {
16278    #[link_name = "llvm.x86.avx512fp16.mask.cmp.sh"]
16279    fn vcmpsh(a: __m128h, b: __m128h, imm8: i32, mask: __mmask8, sae: i32) -> __mmask8;
16280    #[link_name = "llvm.x86.avx512fp16.vcomi.sh"]
16281    fn vcomish(a: __m128h, b: __m128h, imm8: i32, sae: i32) -> i32;
16282
16283    #[link_name = "llvm.x86.avx512fp16.add.ph.512"]
16284    fn vaddph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16285    #[link_name = "llvm.x86.avx512fp16.sub.ph.512"]
16286    fn vsubph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16287    #[link_name = "llvm.x86.avx512fp16.mul.ph.512"]
16288    fn vmulph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16289    #[link_name = "llvm.x86.avx512fp16.div.ph.512"]
16290    fn vdivph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16291
16292    #[link_name = "llvm.x86.avx512fp16.mask.add.sh.round"]
16293    fn vaddsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16294    #[link_name = "llvm.x86.avx512fp16.mask.sub.sh.round"]
16295    fn vsubsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16296    #[link_name = "llvm.x86.avx512fp16.mask.mul.sh.round"]
16297    fn vmulsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16298    #[link_name = "llvm.x86.avx512fp16.mask.div.sh.round"]
16299    fn vdivsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16300
16301    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.128"]
16302    fn vfmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128;
16303    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.256"]
16304    fn vfmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256;
16305    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.512"]
16306    fn vfmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512;
16307    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.csh"]
16308    fn vfmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128;
16309
16310    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.128"]
16311    fn vfcmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128;
16312    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.256"]
16313    fn vfcmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256;
16314    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.512"]
16315    fn vfcmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512;
16316    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.csh"]
16317    fn vfcmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128;
16318
16319    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.128"]
16320    fn vfmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16321    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.128"]
16322    fn vfmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16323    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.256"]
16324    fn vfmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16325    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.256"]
16326    fn vfmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16327    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.512"]
16328    fn vfmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512;
16329    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.512"]
16330    fn vfmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512;
16331    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.csh"]
16332    fn vfmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16333    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.csh"]
16334    fn vfmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16335
16336    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.128"]
16337    fn vfcmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16338    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.128"]
16339    fn vfcmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16340    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.256"]
16341    fn vfcmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16342    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.256"]
16343    fn vfcmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16344    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.512"]
16345    fn vfcmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32)
16346    -> __m512;
16347    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.512"]
16348    fn vfcmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32)
16349    -> __m512;
16350    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.csh"]
16351    fn vfcmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16352    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.csh"]
16353    fn vfcmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16354
16355    #[link_name = "llvm.x86.avx512fp16.vfmadd.ph.512"]
16356    fn vfmaddph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h;
16357    #[link_name = "llvm.x86.avx512fp16.vfmadd.f16"]
16358    fn vfmaddsh(a: f16, b: f16, c: f16, rounding: i32) -> f16;
16359
16360    #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.128"]
16361    fn vfmaddsubph_128(a: __m128h, b: __m128h, c: __m128h) -> __m128h;
16362    #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.256"]
16363    fn vfmaddsubph_256(a: __m256h, b: __m256h, c: __m256h) -> __m256h;
16364    #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.512"]
16365    fn vfmaddsubph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h;
16366
16367    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.128"]
16368    fn vrcpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16369    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.256"]
16370    fn vrcpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16371    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.512"]
16372    fn vrcpph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h;
16373    #[link_name = "llvm.x86.avx512fp16.mask.rcp.sh"]
16374    fn vrcpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16375
16376    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.128"]
16377    fn vrsqrtph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16378    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.256"]
16379    fn vrsqrtph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16380    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.512"]
16381    fn vrsqrtph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h;
16382    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.sh"]
16383    fn vrsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16384
16385    #[link_name = "llvm.x86.avx512fp16.sqrt.ph.512"]
16386    fn vsqrtph_512(a: __m512h, rounding: i32) -> __m512h;
16387    #[link_name = "llvm.x86.avx512fp16.mask.sqrt.sh"]
16388    fn vsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16389
16390    #[link_name = "llvm.x86.avx512fp16.max.ph.128"]
16391    fn vmaxph_128(a: __m128h, b: __m128h) -> __m128h;
16392    #[link_name = "llvm.x86.avx512fp16.max.ph.256"]
16393    fn vmaxph_256(a: __m256h, b: __m256h) -> __m256h;
16394    #[link_name = "llvm.x86.avx512fp16.max.ph.512"]
16395    fn vmaxph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h;
16396    #[link_name = "llvm.x86.avx512fp16.mask.max.sh.round"]
16397    fn vmaxsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
16398
16399    #[link_name = "llvm.x86.avx512fp16.min.ph.128"]
16400    fn vminph_128(a: __m128h, b: __m128h) -> __m128h;
16401    #[link_name = "llvm.x86.avx512fp16.min.ph.256"]
16402    fn vminph_256(a: __m256h, b: __m256h) -> __m256h;
16403    #[link_name = "llvm.x86.avx512fp16.min.ph.512"]
16404    fn vminph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h;
16405    #[link_name = "llvm.x86.avx512fp16.mask.min.sh.round"]
16406    fn vminsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
16407
16408    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.128"]
16409    fn vgetexpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16410    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.256"]
16411    fn vgetexpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16412    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.512"]
16413    fn vgetexpph_512(a: __m512h, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16414    #[link_name = "llvm.x86.avx512fp16.mask.getexp.sh"]
16415    fn vgetexpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
16416
16417    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.128"]
16418    fn vgetmantph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
16419    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.256"]
16420    fn vgetmantph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
16421    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.512"]
16422    fn vgetmantph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16423    #[link_name = "llvm.x86.avx512fp16.mask.getmant.sh"]
16424    fn vgetmantsh(
16425        a: __m128h,
16426        b: __m128h,
16427        imm8: i32,
16428        src: __m128h,
16429        k: __mmask8,
16430        sae: i32,
16431    ) -> __m128h;
16432
16433    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.128"]
16434    fn vrndscaleph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
16435    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.256"]
16436    fn vrndscaleph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
16437    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.512"]
16438    fn vrndscaleph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16439    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.sh"]
16440    fn vrndscalesh(
16441        a: __m128h,
16442        b: __m128h,
16443        src: __m128h,
16444        k: __mmask8,
16445        imm8: i32,
16446        sae: i32,
16447    ) -> __m128h;
16448
16449    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.128"]
16450    fn vscalefph_128(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16451    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.256"]
16452    fn vscalefph_256(a: __m256h, b: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16453    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.512"]
16454    fn vscalefph_512(a: __m512h, b: __m512h, src: __m512h, k: __mmask32, rounding: i32) -> __m512h;
16455    #[link_name = "llvm.x86.avx512fp16.mask.scalef.sh"]
16456    fn vscalefsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16457
16458    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.128"]
16459    fn vreduceph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
16460    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.256"]
16461    fn vreduceph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
16462    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.512"]
16463    fn vreduceph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16464    #[link_name = "llvm.x86.avx512fp16.mask.reduce.sh"]
16465    fn vreducesh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, imm8: i32, sae: i32)
16466    -> __m128h;
16467
16468    #[link_name = "llvm.x86.avx512fp16.mask.fpclass.sh"]
16469    fn vfpclasssh(a: __m128h, imm8: i32, k: __mmask8) -> __mmask8;
16470
16471    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i16"]
16472    fn vcvtw2ph_128(a: i16x8, rounding: i32) -> __m128h;
16473    #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i16"]
16474    fn vcvtw2ph_256(a: i16x16, rounding: i32) -> __m256h;
16475    #[link_name = "llvm.x86.avx512.sitofp.round.v32f16.v32i16"]
16476    fn vcvtw2ph_512(a: i16x32, rounding: i32) -> __m512h;
16477    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8u16"]
16478    fn vcvtuw2ph_128(a: u16x8, rounding: i32) -> __m128h;
16479    #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16u16"]
16480    fn vcvtuw2ph_256(a: u16x16, rounding: i32) -> __m256h;
16481    #[link_name = "llvm.x86.avx512.uitofp.round.v32f16.v32u16"]
16482    fn vcvtuw2ph_512(a: u16x32, rounding: i32) -> __m512h;
16483
16484    #[link_name = "llvm.x86.avx512fp16.mask.vcvtdq2ph.128"]
16485    fn vcvtdq2ph_128(a: i32x4, src: __m128h, k: __mmask8) -> __m128h;
16486    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i32"]
16487    fn vcvtdq2ph_256(a: i32x8, rounding: i32) -> __m128h;
16488    #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i32"]
16489    fn vcvtdq2ph_512(a: i32x16, rounding: i32) -> __m256h;
16490    #[link_name = "llvm.x86.avx512fp16.vcvtsi2sh"]
16491    fn vcvtsi2sh(a: __m128h, b: i32, rounding: i32) -> __m128h;
16492    #[link_name = "llvm.x86.avx512fp16.mask.vcvtudq2ph.128"]
16493    fn vcvtudq2ph_128(a: u32x4, src: __m128h, k: __mmask8) -> __m128h;
16494    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8u32"]
16495    fn vcvtudq2ph_256(a: u32x8, rounding: i32) -> __m128h;
16496    #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16u32"]
16497    fn vcvtudq2ph_512(a: u32x16, rounding: i32) -> __m256h;
16498    #[link_name = "llvm.x86.avx512fp16.vcvtusi2sh"]
16499    fn vcvtusi2sh(a: __m128h, b: u32, rounding: i32) -> __m128h;
16500
16501    #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.128"]
16502    fn vcvtqq2ph_128(a: i64x2, src: __m128h, k: __mmask8) -> __m128h;
16503    #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.256"]
16504    fn vcvtqq2ph_256(a: i64x4, src: __m128h, k: __mmask8) -> __m128h;
16505    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i64"]
16506    fn vcvtqq2ph_512(a: i64x8, rounding: i32) -> __m128h;
16507    #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.128"]
16508    fn vcvtuqq2ph_128(a: u64x2, src: __m128h, k: __mmask8) -> __m128h;
16509    #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.256"]
16510    fn vcvtuqq2ph_256(a: u64x4, src: __m128h, k: __mmask8) -> __m128h;
16511    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8u64"]
16512    fn vcvtuqq2ph_512(a: u64x8, rounding: i32) -> __m128h;
16513
16514    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.128"]
16515    fn vcvtps2phx_128(a: __m128, src: __m128h, k: __mmask8) -> __m128h;
16516    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.256"]
16517    fn vcvtps2phx_256(a: __m256, src: __m128h, k: __mmask8) -> __m128h;
16518    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.512"]
16519    fn vcvtps2phx_512(a: __m512, src: __m256h, k: __mmask16, rounding: i32) -> __m256h;
16520    #[link_name = "llvm.x86.avx512fp16.mask.vcvtss2sh.round"]
16521    fn vcvtss2sh(a: __m128h, b: __m128, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16522
16523    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.128"]
16524    fn vcvtpd2ph_128(a: __m128d, src: __m128h, k: __mmask8) -> __m128h;
16525    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.256"]
16526    fn vcvtpd2ph_256(a: __m256d, src: __m128h, k: __mmask8) -> __m128h;
16527    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.512"]
16528    fn vcvtpd2ph_512(a: __m512d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16529    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsd2sh.round"]
16530    fn vcvtsd2sh(a: __m128h, b: __m128d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16531
16532    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.128"]
16533    fn vcvtph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8;
16534    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.256"]
16535    fn vcvtph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16;
16536    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.512"]
16537    fn vcvtph2w_512(a: __m512h, src: i16x32, k: __mmask32, rounding: i32) -> i16x32;
16538    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.128"]
16539    fn vcvtph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8;
16540    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.256"]
16541    fn vcvtph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16;
16542    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.512"]
16543    fn vcvtph2uw_512(a: __m512h, src: u16x32, k: __mmask32, rounding: i32) -> u16x32;
16544
16545    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.128"]
16546    fn vcvttph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8;
16547    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.256"]
16548    fn vcvttph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16;
16549    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.512"]
16550    fn vcvttph2w_512(a: __m512h, src: i16x32, k: __mmask32, sae: i32) -> i16x32;
16551    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.128"]
16552    fn vcvttph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8;
16553    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.256"]
16554    fn vcvttph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16;
16555    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.512"]
16556    fn vcvttph2uw_512(a: __m512h, src: u16x32, k: __mmask32, sae: i32) -> u16x32;
16557
16558    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.128"]
16559    fn vcvtph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4;
16560    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.256"]
16561    fn vcvtph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8;
16562    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.512"]
16563    fn vcvtph2dq_512(a: __m256h, src: i32x16, k: __mmask16, rounding: i32) -> i32x16;
16564    #[link_name = "llvm.x86.avx512fp16.vcvtsh2si32"]
16565    fn vcvtsh2si32(a: __m128h, rounding: i32) -> i32;
16566    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.128"]
16567    fn vcvtph2udq_128(a: __m128h, src: u32x4, k: __mmask8) -> u32x4;
16568    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.256"]
16569    fn vcvtph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8;
16570    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.512"]
16571    fn vcvtph2udq_512(a: __m256h, src: u32x16, k: __mmask16, rounding: i32) -> u32x16;
16572    #[link_name = "llvm.x86.avx512fp16.vcvtsh2usi32"]
16573    fn vcvtsh2usi32(a: __m128h, sae: i32) -> u32;
16574
16575    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.128"]
16576    fn vcvttph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4;
16577    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.256"]
16578    fn vcvttph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8;
16579    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.512"]
16580    fn vcvttph2dq_512(a: __m256h, src: i32x16, k: __mmask16, sae: i32) -> i32x16;
16581    #[link_name = "llvm.x86.avx512fp16.vcvttsh2si32"]
16582    fn vcvttsh2si32(a: __m128h, sae: i32) -> i32;
16583    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.128"]
16584    fn vcvttph2udq_128(a: __m128h, src: u32x4, k: __mmask8) -> u32x4;
16585    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.256"]
16586    fn vcvttph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8;
16587    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.512"]
16588    fn vcvttph2udq_512(a: __m256h, src: u32x16, k: __mmask16, sae: i32) -> u32x16;
16589    #[link_name = "llvm.x86.avx512fp16.vcvttsh2usi32"]
16590    fn vcvttsh2usi32(a: __m128h, sae: i32) -> u32;
16591
16592    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.128"]
16593    fn vcvtph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2;
16594    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.256"]
16595    fn vcvtph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4;
16596    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.512"]
16597    fn vcvtph2qq_512(a: __m128h, src: i64x8, k: __mmask8, rounding: i32) -> i64x8;
16598    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.128"]
16599    fn vcvtph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2;
16600    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.256"]
16601    fn vcvtph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4;
16602    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.512"]
16603    fn vcvtph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, rounding: i32) -> u64x8;
16604
16605    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.128"]
16606    fn vcvttph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2;
16607    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.256"]
16608    fn vcvttph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4;
16609    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.512"]
16610    fn vcvttph2qq_512(a: __m128h, src: i64x8, k: __mmask8, sae: i32) -> i64x8;
16611    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.128"]
16612    fn vcvttph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2;
16613    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.256"]
16614    fn vcvttph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4;
16615    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.512"]
16616    fn vcvttph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, sae: i32) -> u64x8;
16617
16618    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.128"]
16619    fn vcvtph2psx_128(a: __m128h, src: __m128, k: __mmask8) -> __m128;
16620    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.256"]
16621    fn vcvtph2psx_256(a: __m128h, src: __m256, k: __mmask8) -> __m256;
16622    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.512"]
16623    fn vcvtph2psx_512(a: __m256h, src: __m512, k: __mmask16, sae: i32) -> __m512;
16624    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2ss.round"]
16625    fn vcvtsh2ss(a: __m128, b: __m128h, src: __m128, k: __mmask8, sae: i32) -> __m128;
16626
16627    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.128"]
16628    fn vcvtph2pd_128(a: __m128h, src: __m128d, k: __mmask8) -> __m128d;
16629    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.256"]
16630    fn vcvtph2pd_256(a: __m128h, src: __m256d, k: __mmask8) -> __m256d;
16631    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.512"]
16632    fn vcvtph2pd_512(a: __m128h, src: __m512d, k: __mmask8, sae: i32) -> __m512d;
16633    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2sd.round"]
16634    fn vcvtsh2sd(a: __m128d, b: __m128h, src: __m128d, k: __mmask8, sae: i32) -> __m128d;
16635
16636}
16637
16638#[cfg(test)]
16639mod tests {
16640    use crate::core_arch::x86::*;
16641    use crate::mem::transmute;
16642    use crate::ptr::{addr_of, addr_of_mut};
16643    use stdarch_test::simd_test;
16644
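    // Test-local helpers (not public intrinsics): splat a single complex value as
    // interleaved (re, im) half-precision pairs for the packed-complex tests.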
16645    #[target_feature(enable = "avx512fp16")]
16646    unsafe fn _mm_set1_pch(re: f16, im: f16) -> __m128h {
16647        _mm_setr_ph(re, im, re, im, re, im, re, im)
16648    }
16649
16650    #[target_feature(enable = "avx512fp16")]
16651    unsafe fn _mm256_set1_pch(re: f16, im: f16) -> __m256h {
16652        _mm256_setr_ph(
16653            re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im,
16654        )
16655    }
16656
16657    #[target_feature(enable = "avx512fp16")]
16658    unsafe fn _mm512_set1_pch(re: f16, im: f16) -> __m512h {
16659        _mm512_setr_ph(
16660            re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im,
16661            re, im, re, im, re, im, re, im, re, im,
16662        )
16663    }
16664
16665    #[simd_test(enable = "avx512fp16")]
16666    unsafe fn test_mm_set_ph() {
16667        let r = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
16668        let e = _mm_setr_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
16669        assert_eq_m128h(r, e);
16670    }
16671
16672    #[simd_test(enable = "avx512fp16")]
16673    unsafe fn test_mm256_set_ph() {
16674        let r = _mm256_set_ph(
16675            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16676        );
16677        let e = _mm256_setr_ph(
16678            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
16679        );
16680        assert_eq_m256h(r, e);
16681    }
16682
16683    #[simd_test(enable = "avx512fp16")]
16684    unsafe fn test_mm512_set_ph() {
16685        let r = _mm512_set_ph(
16686            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16687            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
16688            31.0, 32.0,
16689        );
16690        let e = _mm512_setr_ph(
16691            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
16692            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
16693            3.0, 2.0, 1.0,
16694        );
16695        assert_eq_m512h(r, e);
16696    }
16697
16698    #[simd_test(enable = "avx512fp16")]
16699    unsafe fn test_mm_set_sh() {
16700        let r = _mm_set_sh(1.0);
16701        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0);
16702        assert_eq_m128h(r, e);
16703    }
16704
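    // Illustrative round-trip sketch (a hypothetical addition; equivalent coverage may
    // already exist elsewhere in this module): the scalar written by `_mm_set_sh`
    // should be read back unchanged by `_mm_cvtsh_h`.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_set_sh_cvtsh_h_roundtrip() {
        let r = _mm_set_sh(42.0);
        assert_eq!(_mm_cvtsh_h(r), 42.0);
    }
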
16705    #[simd_test(enable = "avx512fp16")]
16706    unsafe fn test_mm_set1_ph() {
16707        let r = _mm_set1_ph(1.0);
16708        let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
16709        assert_eq_m128h(r, e);
16710    }
16711
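    // Illustrative round-trip sketch (a hypothetical addition; equivalent coverage may
    // already exist elsewhere in this module): a 16-bit integer moved into lane 0 is
    // read back unchanged, and all other lanes are zero.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsi16_si128_roundtrip() {
        let a = _mm_cvtsi16_si128(-42);
        assert_eq!(_mm_cvtsi128_si16(a), -42);
        assert_eq_m128i(a, _mm_setr_epi16(-42, 0, 0, 0, 0, 0, 0, 0));
    }
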
16712    #[simd_test(enable = "avx512fp16")]
16713    unsafe fn test_mm256_set1_ph() {
16714        let r = _mm256_set1_ph(1.0);
16715        let e = _mm256_set_ph(
16716            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
16717        );
16718        assert_eq_m256h(r, e);
16719    }
16720
16721    #[simd_test(enable = "avx512fp16")]
16722    unsafe fn test_mm512_set1_ph() {
16723        let r = _mm512_set1_ph(1.0);
16724        let e = _mm512_set_ph(
16725            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
16726            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
16727        );
16728        assert_eq_m512h(r, e);
16729    }
16730
16731    #[simd_test(enable = "avx512fp16")]
16732    unsafe fn test_mm_setr_ph() {
16733        let r = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
16734        let e = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
16735        assert_eq_m128h(r, e);
16736    }
16737
16738    #[simd_test(enable = "avx512fp16")]
16739    unsafe fn test_mm256_setr_ph() {
16740        let r = _mm256_setr_ph(
16741            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16742        );
16743        let e = _mm256_set_ph(
16744            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
16745        );
16746        assert_eq_m256h(r, e);
16747    }
16748
16749    #[simd_test(enable = "avx512fp16")]
16750    unsafe fn test_mm512_setr_ph() {
16751        let r = _mm512_setr_ph(
16752            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16753            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
16754            31.0, 32.0,
16755        );
16756        let e = _mm512_set_ph(
16757            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
16758            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
16759            3.0, 2.0, 1.0,
16760        );
16761        assert_eq_m512h(r, e);
16762    }
16763
16764    #[simd_test(enable = "avx512fp16,avx512vl")]
16765    unsafe fn test_mm_setzero_ph() {
16766        let r = _mm_setzero_ph();
16767        let e = _mm_set1_ph(0.0);
16768        assert_eq_m128h(r, e);
16769    }
16770
16771    #[simd_test(enable = "avx512fp16,avx512vl")]
16772    unsafe fn test_mm256_setzero_ph() {
16773        let r = _mm256_setzero_ph();
16774        let e = _mm256_set1_ph(0.0);
16775        assert_eq_m256h(r, e);
16776    }
16777
16778    #[simd_test(enable = "avx512fp16")]
16779    unsafe fn test_mm512_setzero_ph() {
16780        let r = _mm512_setzero_ph();
16781        let e = _mm512_set1_ph(0.0);
16782        assert_eq_m512h(r, e);
16783    }
16784
16785    #[simd_test(enable = "avx512fp16")]
16786    unsafe fn test_mm_castsi128_ph() {
16787        let a = _mm_set1_epi16(0x3c00);
16788        let r = _mm_castsi128_ph(a);
16789        let e = _mm_set1_ph(1.0);
16790        assert_eq_m128h(r, e);
16791    }
16792
16793    #[simd_test(enable = "avx512fp16")]
16794    unsafe fn test_mm256_castsi256_ph() {
16795        let a = _mm256_set1_epi16(0x3c00);
16796        let r = _mm256_castsi256_ph(a);
16797        let e = _mm256_set1_ph(1.0);
16798        assert_eq_m256h(r, e);
16799    }
16800
16801    #[simd_test(enable = "avx512fp16")]
16802    unsafe fn test_mm512_castsi512_ph() {
16803        let a = _mm512_set1_epi16(0x3c00);
16804        let r = _mm512_castsi512_ph(a);
16805        let e = _mm512_set1_ph(1.0);
16806        assert_eq_m512h(r, e);
16807    }
16808
16809    #[simd_test(enable = "avx512fp16")]
16810    unsafe fn test_mm_castph_si128() {
16811        let a = _mm_set1_ph(1.0);
16812        let r = _mm_castph_si128(a);
16813        let e = _mm_set1_epi16(0x3c00);
16814        assert_eq_m128i(r, e);
16815    }
16816
16817    #[simd_test(enable = "avx512fp16")]
16818    unsafe fn test_mm256_castph_si256() {
16819        let a = _mm256_set1_ph(1.0);
16820        let r = _mm256_castph_si256(a);
16821        let e = _mm256_set1_epi16(0x3c00);
16822        assert_eq_m256i(r, e);
16823    }
16824
16825    #[simd_test(enable = "avx512fp16")]
16826    unsafe fn test_mm512_castph_si512() {
16827        let a = _mm512_set1_ph(1.0);
16828        let r = _mm512_castph_si512(a);
16829        let e = _mm512_set1_epi16(0x3c00);
16830        assert_eq_m512i(r, e);
16831    }
16832
16833    #[simd_test(enable = "avx512fp16")]
16834    unsafe fn test_mm_castps_ph() {
16835        let a = _mm_castsi128_ps(_mm_set1_epi16(0x3c00));
16836        let r = _mm_castps_ph(a);
16837        let e = _mm_set1_ph(1.0);
16838        assert_eq_m128h(r, e);
16839    }
16840
16841    #[simd_test(enable = "avx512fp16")]
16842    unsafe fn test_mm256_castps_ph() {
16843        let a = _mm256_castsi256_ps(_mm256_set1_epi16(0x3c00));
16844        let r = _mm256_castps_ph(a);
16845        let e = _mm256_set1_ph(1.0);
16846        assert_eq_m256h(r, e);
16847    }
16848
16849    #[simd_test(enable = "avx512fp16")]
16850    unsafe fn test_mm512_castps_ph() {
16851        let a = _mm512_castsi512_ps(_mm512_set1_epi16(0x3c00));
16852        let r = _mm512_castps_ph(a);
16853        let e = _mm512_set1_ph(1.0);
16854        assert_eq_m512h(r, e);
16855    }
16856
16857    #[simd_test(enable = "avx512fp16")]
16858    unsafe fn test_mm_castph_ps() {
16859        let a = _mm_castsi128_ph(_mm_set1_epi32(0x3f800000));
16860        let r = _mm_castph_ps(a);
16861        let e = _mm_set1_ps(1.0);
16862        assert_eq_m128(r, e);
16863    }
16864
16865    #[simd_test(enable = "avx512fp16")]
16866    unsafe fn test_mm256_castph_ps() {
16867        let a = _mm256_castsi256_ph(_mm256_set1_epi32(0x3f800000));
16868        let r = _mm256_castph_ps(a);
16869        let e = _mm256_set1_ps(1.0);
16870        assert_eq_m256(r, e);
16871    }
16872
16873    #[simd_test(enable = "avx512fp16")]
16874    unsafe fn test_mm512_castph_ps() {
16875        let a = _mm512_castsi512_ph(_mm512_set1_epi32(0x3f800000));
16876        let r = _mm512_castph_ps(a);
16877        let e = _mm512_set1_ps(1.0);
16878        assert_eq_m512(r, e);
16879    }
16880
16881    #[simd_test(enable = "avx512fp16")]
16882    unsafe fn test_mm_castpd_ph() {
16883        let a = _mm_castsi128_pd(_mm_set1_epi16(0x3c00));
16884        let r = _mm_castpd_ph(a);
16885        let e = _mm_set1_ph(1.0);
16886        assert_eq_m128h(r, e);
16887    }
16888
16889    #[simd_test(enable = "avx512fp16")]
16890    unsafe fn test_mm256_castpd_ph() {
16891        let a = _mm256_castsi256_pd(_mm256_set1_epi16(0x3c00));
16892        let r = _mm256_castpd_ph(a);
16893        let e = _mm256_set1_ph(1.0);
16894        assert_eq_m256h(r, e);
16895    }
16896
16897    #[simd_test(enable = "avx512fp16")]
16898    unsafe fn test_mm512_castpd_ph() {
16899        let a = _mm512_castsi512_pd(_mm512_set1_epi16(0x3c00));
16900        let r = _mm512_castpd_ph(a);
16901        let e = _mm512_set1_ph(1.0);
16902        assert_eq_m512h(r, e);
16903    }
16904
16905    #[simd_test(enable = "avx512fp16")]
16906    unsafe fn test_mm_castph_pd() {
16907        let a = _mm_castsi128_ph(_mm_set1_epi64x(0x3ff0000000000000));
16908        let r = _mm_castph_pd(a);
16909        let e = _mm_set1_pd(1.0);
16910        assert_eq_m128d(r, e);
16911    }
16912
16913    #[simd_test(enable = "avx512fp16")]
16914    unsafe fn test_mm256_castph_pd() {
16915        let a = _mm256_castsi256_ph(_mm256_set1_epi64x(0x3ff0000000000000));
16916        let r = _mm256_castph_pd(a);
16917        let e = _mm256_set1_pd(1.0);
16918        assert_eq_m256d(r, e);
16919    }
16920
16921    #[simd_test(enable = "avx512fp16")]
16922    unsafe fn test_mm512_castph_pd() {
16923        let a = _mm512_castsi512_ph(_mm512_set1_epi64(0x3ff0000000000000));
16924        let r = _mm512_castph_pd(a);
16925        let e = _mm512_set1_pd(1.0);
16926        assert_eq_m512d(r, e);
16927    }
16928
16929    #[simd_test(enable = "avx512fp16")]
16930    unsafe fn test_mm256_castph256_ph128() {
16931        let a = _mm256_setr_ph(
16932            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
16933        );
16934        let r = _mm256_castph256_ph128(a);
16935        let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16936        assert_eq_m128h(r, e);
16937    }
16938
16939    #[simd_test(enable = "avx512fp16")]
16940    unsafe fn test_mm512_castph512_ph128() {
16941        let a = _mm512_setr_ph(
16942            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
16943            20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
16944        );
16945        let r = _mm512_castph512_ph128(a);
16946        let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16947        assert_eq_m128h(r, e);
16948    }
16949
16950    #[simd_test(enable = "avx512fp16")]
16951    unsafe fn test_mm512_castph512_ph256() {
16952        let a = _mm512_setr_ph(
16953            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
16954            20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
16955        );
16956        let r = _mm512_castph512_ph256(a);
16957        let e = _mm256_setr_ph(
16958            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
16959        );
16960        assert_eq_m256h(r, e);
16961    }
16962
16963    #[simd_test(enable = "avx512fp16")]
16964    unsafe fn test_mm256_castph128_ph256() {
16965        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16966        let r = _mm256_castph128_ph256(a);
16967        assert_eq_m128h(_mm256_castph256_ph128(r), a);
16968    }
16969
16970    #[simd_test(enable = "avx512fp16")]
16971    unsafe fn test_mm512_castph128_ph512() {
16972        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16973        let r = _mm512_castph128_ph512(a);
16974        assert_eq_m128h(_mm512_castph512_ph128(r), a);
16975    }
16976
16977    #[simd_test(enable = "avx512fp16")]
16978    unsafe fn test_mm512_castph256_ph512() {
16979        let a = _mm256_setr_ph(
16980            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
16981        );
16982        let r = _mm512_castph256_ph512(a);
16983        assert_eq_m256h(_mm512_castph512_ph256(r), a);
16984    }
16985
16986    #[simd_test(enable = "avx512fp16")]
16987    unsafe fn test_mm256_zextph128_ph256() {
16988        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16989        let r = _mm256_zextph128_ph256(a);
16990        let e = _mm256_setr_ph(
16991            1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
16992        );
16993        assert_eq_m256h(r, e);
16994    }
16995
16996    #[simd_test(enable = "avx512fp16")]
16997    unsafe fn test_mm512_zextph128_ph512() {
16998        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16999        let r = _mm512_zextph128_ph512(a);
17000        let e = _mm512_setr_ph(
17001            1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
17002            0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
17003        );
17004        assert_eq_m512h(r, e);
17005    }
17006
17007    #[simd_test(enable = "avx512fp16")]
17008    unsafe fn test_mm512_zextph256_ph512() {
17009        let a = _mm256_setr_ph(
17010            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
17011        );
17012        let r = _mm512_zextph256_ph512(a);
17013        let e = _mm512_setr_ph(
17014            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 0., 0., 0., 0.,
17015            0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
17016        );
17017        assert_eq_m512h(r, e);
17018    }
17019
17020    #[simd_test(enable = "avx512fp16,avx512vl")]
17021    unsafe fn test_mm_cmp_ph_mask() {
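        // Only the first four argument pairs match; those occupy lanes 4..=7, so the
        // _CMP_EQ_OQ result should be 0b11110000.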
17022        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17023        let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0);
17024        let r = _mm_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
17025        assert_eq!(r, 0b11110000);
17026    }
17027
17028    #[simd_test(enable = "avx512fp16,avx512vl")]
17029    unsafe fn test_mm_mask_cmp_ph_mask() {
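        // The write-mask is ANDed with the comparison result: 0b01010101 & 0b11110000 == 0b01010000.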
17030        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17031        let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0);
17032        let r = _mm_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101, a, b);
17033        assert_eq!(r, 0b01010000);
17034    }
17035
17036    #[simd_test(enable = "avx512fp16,avx512vl")]
17037    unsafe fn test_mm256_cmp_ph_mask() {
17038        let a = _mm256_set_ph(
17039            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17040        );
17041        let b = _mm256_set_ph(
17042            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17043            -16.0,
17044        );
17045        let r = _mm256_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
17046        assert_eq!(r, 0b1111000011110000);
17047    }
17048
17049    #[simd_test(enable = "avx512fp16,avx512vl")]
17050    unsafe fn test_mm256_mask_cmp_ph_mask() {
17051        let a = _mm256_set_ph(
17052            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17053        );
17054        let b = _mm256_set_ph(
17055            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17056            -16.0,
17057        );
17058        let r = _mm256_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b0101010101010101, a, b);
17059        assert_eq!(r, 0b0101000001010000);
17060    }
17061
17062    #[simd_test(enable = "avx512fp16")]
17063    unsafe fn test_mm512_cmp_ph_mask() {
17064        let a = _mm512_set_ph(
17065            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17066            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17067            31.0, 32.0,
17068        );
17069        let b = _mm512_set_ph(
17070            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17071            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
17072            -29.0, -30.0, -31.0, -32.0,
17073        );
17074        let r = _mm512_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
17075        assert_eq!(r, 0b11110000111100001111000011110000);
17076    }
17077
17078    #[simd_test(enable = "avx512fp16")]
17079    unsafe fn test_mm512_mask_cmp_ph_mask() {
17080        let a = _mm512_set_ph(
17081            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17082            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17083            31.0, 32.0,
17084        );
17085        let b = _mm512_set_ph(
17086            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17087            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
17088            -29.0, -30.0, -31.0, -32.0,
17089        );
17090        let r = _mm512_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101010101010101010101010101, a, b);
17091        assert_eq!(r, 0b01010000010100000101000001010000);
17092    }
17093
17094    #[simd_test(enable = "avx512fp16")]
17095    unsafe fn test_mm512_cmp_round_ph_mask() {
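        // Same comparison as above, through the SAE variant; _MM_FROUND_NO_EXC suppresses
        // exceptions and should not change the resulting mask.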
17096        let a = _mm512_set_ph(
17097            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17098            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17099            31.0, 32.0,
17100        );
17101        let b = _mm512_set_ph(
17102            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17103            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
17104            -29.0, -30.0, -31.0, -32.0,
17105        );
17106        let r = _mm512_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
17107        assert_eq!(r, 0b11110000111100001111000011110000);
17108    }
17109
17110    #[simd_test(enable = "avx512fp16")]
17111    unsafe fn test_mm512_mask_cmp_round_ph_mask() {
17112        let a = _mm512_set_ph(
17113            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17114            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17115            31.0, 32.0,
17116        );
17117        let b = _mm512_set_ph(
17118            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17119            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
17120            -29.0, -30.0, -31.0, -32.0,
17121        );
17122        let r = _mm512_mask_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(
17123            0b01010101010101010101010101010101,
17124            a,
17125            b,
17126        );
17127        assert_eq!(r, 0b01010000010100000101000001010000);
17128    }
17129
17130    #[simd_test(enable = "avx512fp16")]
17131    unsafe fn test_mm_cmp_round_sh_mask() {
17132        let a = _mm_set_sh(1.0);
17133        let b = _mm_set_sh(1.0);
17134        let r = _mm_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
17135        assert_eq!(r, 1);
17136    }
17137
17138    #[simd_test(enable = "avx512fp16")]
17139    unsafe fn test_mm_mask_cmp_round_sh_mask() {
17140        let a = _mm_set_sh(1.0);
17141        let b = _mm_set_sh(1.0);
17142        let r = _mm_mask_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(0, a, b);
17143        assert_eq!(r, 0);
17144    }
17145
17146    #[simd_test(enable = "avx512fp16")]
17147    unsafe fn test_mm_cmp_sh_mask() {
17148        let a = _mm_set_sh(1.0);
17149        let b = _mm_set_sh(1.0);
17150        let r = _mm_cmp_sh_mask::<_CMP_EQ_OQ>(a, b);
17151        assert_eq!(r, 1);
17152    }
17153
17154    #[simd_test(enable = "avx512fp16")]
17155    unsafe fn test_mm_mask_cmp_sh_mask() {
17156        let a = _mm_set_sh(1.0);
17157        let b = _mm_set_sh(1.0);
17158        let r = _mm_mask_cmp_sh_mask::<_CMP_EQ_OQ>(0, a, b);
17159        assert_eq!(r, 0);
17160    }
17161
17162    #[simd_test(enable = "avx512fp16")]
17163    unsafe fn test_mm_comi_round_sh() {
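        // The comi intrinsics return 1 when the predicate holds for the lowest lanes and 0 otherwise.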
17164        let a = _mm_set_sh(1.0);
17165        let b = _mm_set_sh(1.0);
17166        let r = _mm_comi_round_sh::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
17167        assert_eq!(r, 1);
17168    }
17169
17170    #[simd_test(enable = "avx512fp16")]
17171    unsafe fn test_mm_comi_sh() {
17172        let a = _mm_set_sh(1.0);
17173        let b = _mm_set_sh(1.0);
17174        let r = _mm_comi_sh::<_CMP_EQ_OQ>(a, b);
17175        assert_eq!(r, 1);
17176    }
17177
17178    #[simd_test(enable = "avx512fp16")]
17179    unsafe fn test_mm_comieq_sh() {
17180        let a = _mm_set_sh(1.0);
17181        let b = _mm_set_sh(1.0);
17182        let r = _mm_comieq_sh(a, b);
17183        assert_eq!(r, 1);
17184    }
17185
17186    #[simd_test(enable = "avx512fp16")]
17187    unsafe fn test_mm_comige_sh() {
17188        let a = _mm_set_sh(2.0);
17189        let b = _mm_set_sh(1.0);
17190        let r = _mm_comige_sh(a, b);
17191        assert_eq!(r, 1);
17192    }
17193
17194    #[simd_test(enable = "avx512fp16")]
17195    unsafe fn test_mm_comigt_sh() {
17196        let a = _mm_set_sh(2.0);
17197        let b = _mm_set_sh(1.0);
17198        let r = _mm_comigt_sh(a, b);
17199        assert_eq!(r, 1);
17200    }
17201
17202    #[simd_test(enable = "avx512fp16")]
17203    unsafe fn test_mm_comile_sh() {
17204        let a = _mm_set_sh(1.0);
17205        let b = _mm_set_sh(2.0);
17206        let r = _mm_comile_sh(a, b);
17207        assert_eq!(r, 1);
17208    }
17209
17210    #[simd_test(enable = "avx512fp16")]
17211    unsafe fn test_mm_comilt_sh() {
17212        let a = _mm_set_sh(1.0);
17213        let b = _mm_set_sh(2.0);
17214        let r = _mm_comilt_sh(a, b);
17215        assert_eq!(r, 1);
17216    }
17217
17218    #[simd_test(enable = "avx512fp16")]
17219    unsafe fn test_mm_comineq_sh() {
17220        let a = _mm_set_sh(1.0);
17221        let b = _mm_set_sh(2.0);
17222        let r = _mm_comineq_sh(a, b);
17223        assert_eq!(r, 1);
17224    }
17225
17226    #[simd_test(enable = "avx512fp16")]
17227    unsafe fn test_mm_ucomieq_sh() {
17228        let a = _mm_set_sh(1.0);
17229        let b = _mm_set_sh(1.0);
17230        let r = _mm_ucomieq_sh(a, b);
17231        assert_eq!(r, 1);
17232    }
17233
17234    #[simd_test(enable = "avx512fp16")]
17235    unsafe fn test_mm_ucomige_sh() {
17236        let a = _mm_set_sh(2.0);
17237        let b = _mm_set_sh(1.0);
17238        let r = _mm_ucomige_sh(a, b);
17239        assert_eq!(r, 1);
17240    }
17241
17242    #[simd_test(enable = "avx512fp16")]
17243    unsafe fn test_mm_ucomigt_sh() {
17244        let a = _mm_set_sh(2.0);
17245        let b = _mm_set_sh(1.0);
17246        let r = _mm_ucomigt_sh(a, b);
17247        assert_eq!(r, 1);
17248    }
17249
17250    #[simd_test(enable = "avx512fp16")]
17251    unsafe fn test_mm_ucomile_sh() {
17252        let a = _mm_set_sh(1.0);
17253        let b = _mm_set_sh(2.0);
17254        let r = _mm_ucomile_sh(a, b);
17255        assert_eq!(r, 1);
17256    }
17257
17258    #[simd_test(enable = "avx512fp16")]
17259    unsafe fn test_mm_ucomilt_sh() {
17260        let a = _mm_set_sh(1.0);
17261        let b = _mm_set_sh(2.0);
17262        let r = _mm_ucomilt_sh(a, b);
17263        assert_eq!(r, 1);
17264    }
17265
17266    #[simd_test(enable = "avx512fp16")]
17267    unsafe fn test_mm_ucomineq_sh() {
17268        let a = _mm_set_sh(1.0);
17269        let b = _mm_set_sh(2.0);
17270        let r = _mm_ucomineq_sh(a, b);
17271        assert_eq!(r, 1);
17272    }
17273
17274    #[simd_test(enable = "avx512fp16,avx512vl")]
17275    unsafe fn test_mm_load_ph() {
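        // _mm_load_ph is the aligned load; addr_of!(a) is suitably aligned here because
        // __m128h itself is 16-byte aligned.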
17276        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17277        let b = _mm_load_ph(addr_of!(a).cast());
17278        assert_eq_m128h(a, b);
17279    }
17280
17281    #[simd_test(enable = "avx512fp16,avx512vl")]
17282    unsafe fn test_mm256_load_ph() {
17283        let a = _mm256_set_ph(
17284            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17285        );
17286        let b = _mm256_load_ph(addr_of!(a).cast());
17287        assert_eq_m256h(a, b);
17288    }
17289
17290    #[simd_test(enable = "avx512fp16")]
17291    unsafe fn test_mm512_load_ph() {
17292        let a = _mm512_set_ph(
17293            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17294            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17295            31.0, 32.0,
17296        );
17297        let b = _mm512_load_ph(addr_of!(a).cast());
17298        assert_eq_m512h(a, b);
17299    }
17300
17301    #[simd_test(enable = "avx512fp16")]
17302    unsafe fn test_mm_load_sh() {
17303        let a = _mm_set_sh(1.0);
17304        let b = _mm_load_sh(addr_of!(a).cast());
17305        assert_eq_m128h(a, b);
17306    }
17307
17308    #[simd_test(enable = "avx512fp16")]
17309    unsafe fn test_mm_mask_load_sh() {
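        // The lowest lane is loaded from memory when the mask bit is set and taken from `src`
        // when it is clear.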
17310        let a = _mm_set_sh(1.0);
17311        let src = _mm_set_sh(2.);
17312        let b = _mm_mask_load_sh(src, 1, addr_of!(a).cast());
17313        assert_eq_m128h(a, b);
17314        let b = _mm_mask_load_sh(src, 0, addr_of!(a).cast());
17315        assert_eq_m128h(src, b);
17316    }
17317
17318    #[simd_test(enable = "avx512fp16")]
17319    unsafe fn test_mm_maskz_load_sh() {
17320        let a = _mm_set_sh(1.0);
17321        let b = _mm_maskz_load_sh(1, addr_of!(a).cast());
17322        assert_eq_m128h(a, b);
17323        let b = _mm_maskz_load_sh(0, addr_of!(a).cast());
17324        assert_eq_m128h(_mm_setzero_ph(), b);
17325    }
17326
17327    #[simd_test(enable = "avx512fp16,avx512vl")]
17328    unsafe fn test_mm_loadu_ph() {
17329        let array = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
17330        let r = _mm_loadu_ph(array.as_ptr());
17331        let e = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17332        assert_eq_m128h(r, e);
17333    }
17334
17335    #[simd_test(enable = "avx512fp16,avx512vl")]
17336    unsafe fn test_mm256_loadu_ph() {
17337        let array = [
17338            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17339        ];
17340        let r = _mm256_loadu_ph(array.as_ptr());
17341        let e = _mm256_setr_ph(
17342            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17343        );
17344        assert_eq_m256h(r, e);
17345    }
17346
17347    #[simd_test(enable = "avx512fp16")]
17348    unsafe fn test_mm512_loadu_ph() {
17349        let array = [
17350            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17351            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17352            31.0, 32.0,
17353        ];
17354        let r = _mm512_loadu_ph(array.as_ptr());
17355        let e = _mm512_setr_ph(
17356            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17357            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17358            31.0, 32.0,
17359        );
17360        assert_eq_m512h(r, e);
17361    }
17362
17363    #[simd_test(enable = "avx512fp16")]
17364    unsafe fn test_mm_move_sh() {
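        // The lowest lane comes from `b` (9.0); the remaining lanes are copied from `a`.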
17365        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17366        let b = _mm_set_sh(9.0);
17367        let r = _mm_move_sh(a, b);
17368        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 9.0);
17369        assert_eq_m128h(r, e);
17370    }
17371
17372    #[simd_test(enable = "avx512fp16")]
17373    unsafe fn test_mm_mask_move_sh() {
17374        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17375        let b = _mm_set_sh(9.0);
17376        let src = _mm_set_sh(10.0);
17377        let r = _mm_mask_move_sh(src, 0, a, b);
17378        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 10.0);
17379        assert_eq_m128h(r, e);
17380    }
17381
17382    #[simd_test(enable = "avx512fp16")]
17383    unsafe fn test_mm_maskz_move_sh() {
17384        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17385        let b = _mm_set_sh(9.0);
17386        let r = _mm_maskz_move_sh(0, a, b);
17387        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0);
17388        assert_eq_m128h(r, e);
17389    }
17390
17391    #[simd_test(enable = "avx512fp16,avx512vl")]
17392    unsafe fn test_mm_store_ph() {
17393        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17394        let mut b = _mm_setzero_ph();
17395        _mm_store_ph(addr_of_mut!(b).cast(), a);
17396        assert_eq_m128h(a, b);
17397    }
17398
17399    #[simd_test(enable = "avx512fp16,avx512vl")]
17400    unsafe fn test_mm256_store_ph() {
17401        let a = _mm256_set_ph(
17402            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17403        );
17404        let mut b = _mm256_setzero_ph();
17405        _mm256_store_ph(addr_of_mut!(b).cast(), a);
17406        assert_eq_m256h(a, b);
17407    }
17408
17409    #[simd_test(enable = "avx512fp16")]
17410    unsafe fn test_mm512_store_ph() {
17411        let a = _mm512_set_ph(
17412            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17413            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17414            31.0, 32.0,
17415        );
17416        let mut b = _mm512_setzero_ph();
17417        _mm512_store_ph(addr_of_mut!(b).cast(), a);
17418        assert_eq_m512h(a, b);
17419    }
17420
17421    #[simd_test(enable = "avx512fp16")]
17422    unsafe fn test_mm_store_sh() {
17423        let a = _mm_set_sh(1.0);
17424        let mut b = _mm_setzero_ph();
17425        _mm_store_sh(addr_of_mut!(b).cast(), a);
17426        assert_eq_m128h(a, b);
17427    }
17428
17429    #[simd_test(enable = "avx512fp16")]
17430    unsafe fn test_mm_mask_store_sh() {
17431        let a = _mm_set_sh(1.0);
17432        let mut b = _mm_setzero_ph();
17433        _mm_mask_store_sh(addr_of_mut!(b).cast(), 0, a);
17434        assert_eq_m128h(_mm_setzero_ph(), b);
17435        _mm_mask_store_sh(addr_of_mut!(b).cast(), 1, a);
17436        assert_eq_m128h(a, b);
17437    }
17438
17439    #[simd_test(enable = "avx512fp16,avx512vl")]
17440    unsafe fn test_mm_storeu_ph() {
17441        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17442        let mut array = [0.0; 8];
17443        _mm_storeu_ph(array.as_mut_ptr(), a);
17444        assert_eq_m128h(a, _mm_loadu_ph(array.as_ptr()));
17445    }
17446
17447    #[simd_test(enable = "avx512fp16,avx512vl")]
17448    unsafe fn test_mm256_storeu_ph() {
17449        let a = _mm256_set_ph(
17450            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17451        );
17452        let mut array = [0.0; 16];
17453        _mm256_storeu_ph(array.as_mut_ptr(), a);
17454        assert_eq_m256h(a, _mm256_loadu_ph(array.as_ptr()));
17455    }
17456
17457    #[simd_test(enable = "avx512fp16")]
17458    unsafe fn test_mm512_storeu_ph() {
17459        let a = _mm512_set_ph(
17460            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17461            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17462            31.0, 32.0,
17463        );
17464        let mut array = [0.0; 32];
17465        _mm512_storeu_ph(array.as_mut_ptr(), a);
17466        assert_eq_m512h(a, _mm512_loadu_ph(array.as_ptr()));
17467    }
17468
17469    #[simd_test(enable = "avx512fp16,avx512vl")]
17470    unsafe fn test_mm_add_ph() {
17471        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17472        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17473        let r = _mm_add_ph(a, b);
17474        let e = _mm_set1_ph(9.0);
17475        assert_eq_m128h(r, e);
17476    }
17477
17478    #[simd_test(enable = "avx512fp16,avx512vl")]
17479    unsafe fn test_mm_mask_add_ph() {
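        // Lanes with a set mask bit receive a + b (9.0 everywhere); the rest are copied from `src`.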
17480        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17481        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17482        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
17483        let r = _mm_mask_add_ph(src, 0b01010101, a, b);
17484        let e = _mm_set_ph(10., 9., 12., 9., 14., 9., 16., 9.);
17485        assert_eq_m128h(r, e);
17486    }
17487
17488    #[simd_test(enable = "avx512fp16,avx512vl")]
17489    unsafe fn test_mm_maskz_add_ph() {
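        // As above, but lanes with a clear mask bit are zeroed instead of copied from `src`.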
17490        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17491        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17492        let r = _mm_maskz_add_ph(0b01010101, a, b);
17493        let e = _mm_set_ph(0., 9., 0., 9., 0., 9., 0., 9.);
17494        assert_eq_m128h(r, e);
17495    }
17496
17497    #[simd_test(enable = "avx512fp16,avx512vl")]
17498    unsafe fn test_mm256_add_ph() {
17499        let a = _mm256_set_ph(
17500            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17501        );
17502        let b = _mm256_set_ph(
17503            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17504        );
17505        let r = _mm256_add_ph(a, b);
17506        let e = _mm256_set1_ph(17.0);
17507        assert_eq_m256h(r, e);
17508    }
17509
17510    #[simd_test(enable = "avx512fp16,avx512vl")]
17511    unsafe fn test_mm256_mask_add_ph() {
17512        let a = _mm256_set_ph(
17513            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17514        );
17515        let b = _mm256_set_ph(
17516            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17517        );
17518        let src = _mm256_set_ph(
17519            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
17520        );
17521        let r = _mm256_mask_add_ph(src, 0b0101010101010101, a, b);
17522        let e = _mm256_set_ph(
17523            18., 17., 20., 17., 22., 17., 24., 17., 26., 17., 28., 17., 30., 17., 32., 17.,
17524        );
17525        assert_eq_m256h(r, e);
17526    }
17527
17528    #[simd_test(enable = "avx512fp16,avx512vl")]
17529    unsafe fn test_mm256_maskz_add_ph() {
17530        let a = _mm256_set_ph(
17531            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17532        );
17533        let b = _mm256_set_ph(
17534            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17535        );
17536        let r = _mm256_maskz_add_ph(0b0101010101010101, a, b);
17537        let e = _mm256_set_ph(
17538            0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17.,
17539        );
17540        assert_eq_m256h(r, e);
17541    }
17542
17543    #[simd_test(enable = "avx512fp16")]
17544    unsafe fn test_mm512_add_ph() {
17545        let a = _mm512_set_ph(
17546            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17547            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17548            31.0, 32.0,
17549        );
17550        let b = _mm512_set_ph(
17551            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17552            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17553            3.0, 2.0, 1.0,
17554        );
17555        let r = _mm512_add_ph(a, b);
17556        let e = _mm512_set1_ph(33.0);
17557        assert_eq_m512h(r, e);
17558    }
17559
17560    #[simd_test(enable = "avx512fp16")]
17561    unsafe fn test_mm512_mask_add_ph() {
17562        let a = _mm512_set_ph(
17563            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17564            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17565            31.0, 32.0,
17566        );
17567        let b = _mm512_set_ph(
17568            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17569            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17570            3.0, 2.0, 1.0,
17571        );
17572        let src = _mm512_set_ph(
17573            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
17574            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
17575        );
17576        let r = _mm512_mask_add_ph(src, 0b01010101010101010101010101010101, a, b);
17577        let e = _mm512_set_ph(
17578            34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50.,
17579            33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33.,
17580        );
17581        assert_eq_m512h(r, e);
17582    }
17583
17584    #[simd_test(enable = "avx512fp16")]
17585    unsafe fn test_mm512_maskz_add_ph() {
17586        let a = _mm512_set_ph(
17587            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17588            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17589            31.0, 32.0,
17590        );
17591        let b = _mm512_set_ph(
17592            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17593            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17594            3.0, 2.0, 1.0,
17595        );
17596        let r = _mm512_maskz_add_ph(0b01010101010101010101010101010101, a, b);
17597        let e = _mm512_set_ph(
17598            0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0.,
17599            33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33.,
17600        );
17601        assert_eq_m512h(r, e);
17602    }
17603
17604    #[simd_test(enable = "avx512fp16")]
17605    unsafe fn test_mm512_add_round_ph() {
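        // _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC selects round-to-nearest-even with
        // exceptions suppressed; the sums are exactly representable, so rounding is a no-op here.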
17606        let a = _mm512_set_ph(
17607            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17608            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17609            31.0, 32.0,
17610        );
17611        let b = _mm512_set_ph(
17612            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17613            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17614            3.0, 2.0, 1.0,
17615        );
17616        let r = _mm512_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
17617        let e = _mm512_set1_ph(33.0);
17618        assert_eq_m512h(r, e);
17619    }
17620
17621    #[simd_test(enable = "avx512fp16")]
17622    unsafe fn test_mm512_mask_add_round_ph() {
17623        let a = _mm512_set_ph(
17624            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17625            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17626            31.0, 32.0,
17627        );
17628        let b = _mm512_set_ph(
17629            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17630            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17631            3.0, 2.0, 1.0,
17632        );
17633        let src = _mm512_set_ph(
17634            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
17635            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
17636        );
17637        let r = _mm512_mask_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17638            src,
17639            0b01010101010101010101010101010101,
17640            a,
17641            b,
17642        );
17643        let e = _mm512_set_ph(
17644            34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50.,
17645            33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33.,
17646        );
17647        assert_eq_m512h(r, e);
17648    }
17649
17650    #[simd_test(enable = "avx512fp16")]
17651    unsafe fn test_mm512_maskz_add_round_ph() {
17652        let a = _mm512_set_ph(
17653            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17654            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17655            31.0, 32.0,
17656        );
17657        let b = _mm512_set_ph(
17658            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17659            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17660            3.0, 2.0, 1.0,
17661        );
17662        let r = _mm512_maskz_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17663            0b01010101010101010101010101010101,
17664            a,
17665            b,
17666        );
17667        let e = _mm512_set_ph(
17668            0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0.,
17669            33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33.,
17670        );
17671        assert_eq_m512h(r, e);
17672    }
17673
17674    #[simd_test(enable = "avx512fp16")]
17675    unsafe fn test_mm_add_round_sh() {
17676        let a = _mm_set_sh(1.0);
17677        let b = _mm_set_sh(2.0);
17678        let r = _mm_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
17679        let e = _mm_set_sh(3.0);
17680        assert_eq_m128h(r, e);
17681    }
17682
17683    #[simd_test(enable = "avx512fp16")]
17684    unsafe fn test_mm_mask_add_round_sh() {
17685        let a = _mm_set_sh(1.0);
17686        let b = _mm_set_sh(2.0);
17687        let src = _mm_set_sh(4.0);
17688        let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17689            src, 0, a, b,
17690        );
17691        let e = _mm_set_sh(4.0);
17692        assert_eq_m128h(r, e);
17693        let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17694            src, 1, a, b,
17695        );
17696        let e = _mm_set_sh(3.0);
17697        assert_eq_m128h(r, e);
17698    }
17699
17700    #[simd_test(enable = "avx512fp16")]
17701    unsafe fn test_mm_maskz_add_round_sh() {
17702        let a = _mm_set_sh(1.0);
17703        let b = _mm_set_sh(2.0);
17704        let r =
17705            _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
17706        let e = _mm_set_sh(0.0);
17707        assert_eq_m128h(r, e);
17708        let r =
17709            _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
17710        let e = _mm_set_sh(3.0);
17711        assert_eq_m128h(r, e);
17712    }
17713
17714    #[simd_test(enable = "avx512fp16")]
17715    unsafe fn test_mm_add_sh() {
17716        let a = _mm_set_sh(1.0);
17717        let b = _mm_set_sh(2.0);
17718        let r = _mm_add_sh(a, b);
17719        let e = _mm_set_sh(3.0);
17720        assert_eq_m128h(r, e);
17721    }
17722
17723    #[simd_test(enable = "avx512fp16")]
17724    unsafe fn test_mm_mask_add_sh() {
17725        let a = _mm_set_sh(1.0);
17726        let b = _mm_set_sh(2.0);
17727        let src = _mm_set_sh(4.0);
17728        let r = _mm_mask_add_sh(src, 0, a, b);
17729        let e = _mm_set_sh(4.0);
17730        assert_eq_m128h(r, e);
17731        let r = _mm_mask_add_sh(src, 1, a, b);
17732        let e = _mm_set_sh(3.0);
17733        assert_eq_m128h(r, e);
17734    }
17735
17736    #[simd_test(enable = "avx512fp16")]
17737    unsafe fn test_mm_maskz_add_sh() {
17738        let a = _mm_set_sh(1.0);
17739        let b = _mm_set_sh(2.0);
17740        let r = _mm_maskz_add_sh(0, a, b);
17741        let e = _mm_set_sh(0.0);
17742        assert_eq_m128h(r, e);
17743        let r = _mm_maskz_add_sh(1, a, b);
17744        let e = _mm_set_sh(3.0);
17745        assert_eq_m128h(r, e);
17746    }
17747
17748    #[simd_test(enable = "avx512fp16,avx512vl")]
17749    unsafe fn test_mm_sub_ph() {
17750        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17751        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17752        let r = _mm_sub_ph(a, b);
17753        let e = _mm_set_ph(-7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0);
17754        assert_eq_m128h(r, e);
17755    }
17756
17757    #[simd_test(enable = "avx512fp16,avx512vl")]
17758    unsafe fn test_mm_mask_sub_ph() {
17759        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17760        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17761        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
17762        let r = _mm_mask_sub_ph(src, 0b01010101, a, b);
17763        let e = _mm_set_ph(10., -5., 12., -1., 14., 3., 16., 7.);
17764        assert_eq_m128h(r, e);
17765    }
17766
17767    #[simd_test(enable = "avx512fp16,avx512vl")]
17768    unsafe fn test_mm_maskz_sub_ph() {
17769        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17770        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17771        let r = _mm_maskz_sub_ph(0b01010101, a, b);
17772        let e = _mm_set_ph(0., -5., 0., -1., 0., 3., 0., 7.);
17773        assert_eq_m128h(r, e);
17774    }
17775
17776    #[simd_test(enable = "avx512fp16,avx512vl")]
17777    unsafe fn test_mm256_sub_ph() {
17778        let a = _mm256_set_ph(
17779            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17780        );
17781        let b = _mm256_set_ph(
17782            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17783        );
17784        let r = _mm256_sub_ph(a, b);
17785        let e = _mm256_set_ph(
17786            -15.0, -13.0, -11.0, -9.0, -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0,
17787            15.0,
17788        );
17789        assert_eq_m256h(r, e);
17790    }
17791
17792    #[simd_test(enable = "avx512fp16,avx512vl")]
17793    unsafe fn test_mm256_mask_sub_ph() {
17794        let a = _mm256_set_ph(
17795            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17796        );
17797        let b = _mm256_set_ph(
17798            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17799        );
17800        let src = _mm256_set_ph(
17801            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
17802        );
17803        let r = _mm256_mask_sub_ph(src, 0b0101010101010101, a, b);
17804        let e = _mm256_set_ph(
17805            18., -13., 20., -9., 22., -5., 24., -1., 26., 3., 28., 7., 30., 11., 32., 15.,
17806        );
17807        assert_eq_m256h(r, e);
17808    }
17809
17810    #[simd_test(enable = "avx512fp16,avx512vl")]
17811    unsafe fn test_mm256_maskz_sub_ph() {
17812        let a = _mm256_set_ph(
17813            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17814        );
17815        let b = _mm256_set_ph(
17816            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17817        );
17818        let r = _mm256_maskz_sub_ph(0b0101010101010101, a, b);
17819        let e = _mm256_set_ph(
17820            0., -13., 0., -9., 0., -5., 0., -1., 0., 3., 0., 7., 0., 11., 0., 15.,
17821        );
17822        assert_eq_m256h(r, e);
17823    }
17824
17825    #[simd_test(enable = "avx512fp16")]
17826    unsafe fn test_mm512_sub_ph() {
17827        let a = _mm512_set_ph(
17828            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17829            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17830            31.0, 32.0,
17831        );
17832        let b = _mm512_set_ph(
17833            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17834            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17835            3.0, 2.0, 1.0,
17836        );
17837        let r = _mm512_sub_ph(a, b);
17838        let e = _mm512_set_ph(
17839            -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0,
17840            -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0,
17841            23.0, 25.0, 27.0, 29.0, 31.0,
17842        );
17843        assert_eq_m512h(r, e);
17844    }
17845
17846    #[simd_test(enable = "avx512fp16")]
17847    unsafe fn test_mm512_mask_sub_ph() {
17848        let a = _mm512_set_ph(
17849            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17850            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17851            31.0, 32.0,
17852        );
17853        let b = _mm512_set_ph(
17854            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17855            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17856            3.0, 2.0, 1.0,
17857        );
17858        let src = _mm512_set_ph(
17859            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
17860            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
17861        );
17862        let r = _mm512_mask_sub_ph(src, 0b01010101010101010101010101010101, a, b);
17863        let e = _mm512_set_ph(
17864            34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1.,
17865            50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31.,
17866        );
17867        assert_eq_m512h(r, e);
17868    }
17869
17870    #[simd_test(enable = "avx512fp16")]
17871    unsafe fn test_mm512_maskz_sub_ph() {
17872        let a = _mm512_set_ph(
17873            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17874            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17875            31.0, 32.0,
17876        );
17877        let b = _mm512_set_ph(
17878            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17879            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17880            3.0, 2.0, 1.0,
17881        );
17882        let r = _mm512_maskz_sub_ph(0b01010101010101010101010101010101, a, b);
17883        let e = _mm512_set_ph(
17884            0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3.,
17885            0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31.,
17886        );
17887        assert_eq_m512h(r, e);
17888    }
17889
17890    #[simd_test(enable = "avx512fp16")]
17891    unsafe fn test_mm512_sub_round_ph() {
17892        let a = _mm512_set_ph(
17893            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17894            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17895            31.0, 32.0,
17896        );
17897        let b = _mm512_set_ph(
17898            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17899            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17900            3.0, 2.0, 1.0,
17901        );
17902        let r = _mm512_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
17903        let e = _mm512_set_ph(
17904            -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0,
17905            -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0,
17906            23.0, 25.0, 27.0, 29.0, 31.0,
17907        );
17908        assert_eq_m512h(r, e);
17909    }
17910
17911    #[simd_test(enable = "avx512fp16")]
17912    unsafe fn test_mm512_mask_sub_round_ph() {
17913        let a = _mm512_set_ph(
17914            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17915            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17916            31.0, 32.0,
17917        );
17918        let b = _mm512_set_ph(
17919            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17920            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17921            3.0, 2.0, 1.0,
17922        );
17923        let src = _mm512_set_ph(
17924            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
17925            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
17926        );
17927        let r = _mm512_mask_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17928            src,
17929            0b01010101010101010101010101010101,
17930            a,
17931            b,
17932        );
17933        let e = _mm512_set_ph(
17934            34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1.,
17935            50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31.,
17936        );
17937        assert_eq_m512h(r, e);
17938    }
17939
17940    #[simd_test(enable = "avx512fp16")]
17941    unsafe fn test_mm512_maskz_sub_round_ph() {
17942        let a = _mm512_set_ph(
17943            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17944            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17945            31.0, 32.0,
17946        );
17947        let b = _mm512_set_ph(
17948            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17949            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17950            3.0, 2.0, 1.0,
17951        );
17952        let r = _mm512_maskz_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17953            0b01010101010101010101010101010101,
17954            a,
17955            b,
17956        );
17957        let e = _mm512_set_ph(
17958            0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3.,
17959            0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31.,
17960        );
17961        assert_eq_m512h(r, e);
17962    }
17963
17964    #[simd_test(enable = "avx512fp16")]
17965    unsafe fn test_mm_sub_round_sh() {
17966        let a = _mm_set_sh(1.0);
17967        let b = _mm_set_sh(2.0);
17968        let r = _mm_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
17969        let e = _mm_set_sh(-1.0);
17970        assert_eq_m128h(r, e);
17971    }
17972
17973    #[simd_test(enable = "avx512fp16")]
17974    unsafe fn test_mm_mask_sub_round_sh() {
17975        let a = _mm_set_sh(1.0);
17976        let b = _mm_set_sh(2.0);
17977        let src = _mm_set_sh(4.0);
17978        let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17979            src, 0, a, b,
17980        );
17981        let e = _mm_set_sh(4.0);
17982        assert_eq_m128h(r, e);
17983        let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17984            src, 1, a, b,
17985        );
17986        let e = _mm_set_sh(-1.0);
17987        assert_eq_m128h(r, e);
17988    }
17989
17990    #[simd_test(enable = "avx512fp16")]
17991    unsafe fn test_mm_maskz_sub_round_sh() {
17992        let a = _mm_set_sh(1.0);
17993        let b = _mm_set_sh(2.0);
17994        let r =
17995            _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
17996        let e = _mm_set_sh(0.0);
17997        assert_eq_m128h(r, e);
17998        let r =
17999            _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
18000        let e = _mm_set_sh(-1.0);
18001        assert_eq_m128h(r, e);
18002    }
18003
18004    #[simd_test(enable = "avx512fp16")]
18005    unsafe fn test_mm_sub_sh() {
18006        let a = _mm_set_sh(1.0);
18007        let b = _mm_set_sh(2.0);
18008        let r = _mm_sub_sh(a, b);
18009        let e = _mm_set_sh(-1.0);
18010        assert_eq_m128h(r, e);
18011    }
18012
18013    #[simd_test(enable = "avx512fp16")]
18014    unsafe fn test_mm_mask_sub_sh() {
18015        let a = _mm_set_sh(1.0);
18016        let b = _mm_set_sh(2.0);
18017        let src = _mm_set_sh(4.0);
18018        let r = _mm_mask_sub_sh(src, 0, a, b);
18019        let e = _mm_set_sh(4.0);
18020        assert_eq_m128h(r, e);
18021        let r = _mm_mask_sub_sh(src, 1, a, b);
18022        let e = _mm_set_sh(-1.0);
18023        assert_eq_m128h(r, e);
18024    }
18025
18026    #[simd_test(enable = "avx512fp16")]
18027    unsafe fn test_mm_maskz_sub_sh() {
18028        let a = _mm_set_sh(1.0);
18029        let b = _mm_set_sh(2.0);
18030        let r = _mm_maskz_sub_sh(0, a, b);
18031        let e = _mm_set_sh(0.0);
18032        assert_eq_m128h(r, e);
18033        let r = _mm_maskz_sub_sh(1, a, b);
18034        let e = _mm_set_sh(-1.0);
18035        assert_eq_m128h(r, e);
18036    }
18037
18038    #[simd_test(enable = "avx512fp16,avx512vl")]
18039    unsafe fn test_mm_mul_ph() {
18040        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18041        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18042        let r = _mm_mul_ph(a, b);
18043        let e = _mm_set_ph(8.0, 14.0, 18.0, 20.0, 20.0, 18.0, 14.0, 8.0);
18044        assert_eq_m128h(r, e);
18045    }
18046
18047    #[simd_test(enable = "avx512fp16,avx512vl")]
18048    unsafe fn test_mm_mask_mul_ph() {
18049        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18050        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18051        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
18052        let r = _mm_mask_mul_ph(src, 0b01010101, a, b);
18053        let e = _mm_set_ph(10., 14., 12., 20., 14., 18., 16., 8.);
18054        assert_eq_m128h(r, e);
18055    }
18056
18057    #[simd_test(enable = "avx512fp16,avx512vl")]
18058    unsafe fn test_mm_maskz_mul_ph() {
18059        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18060        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18061        let r = _mm_maskz_mul_ph(0b01010101, a, b);
18062        let e = _mm_set_ph(0., 14., 0., 20., 0., 18., 0., 8.);
18063        assert_eq_m128h(r, e);
18064    }
18065
18066    #[simd_test(enable = "avx512fp16,avx512vl")]
18067    unsafe fn test_mm256_mul_ph() {
18068        let a = _mm256_set_ph(
18069            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18070        );
18071        let b = _mm256_set_ph(
18072            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
18073        );
18074        let r = _mm256_mul_ph(a, b);
18075        let e = _mm256_set_ph(
18076            16.0, 30.0, 42.0, 52.0, 60.0, 66.0, 70.0, 72.0, 72.0, 70.0, 66.0, 60.0, 52.0, 42.0,
18077            30.0, 16.0,
18078        );
18079        assert_eq_m256h(r, e);
18080    }
18081
18082    #[simd_test(enable = "avx512fp16,avx512vl")]
18083    unsafe fn test_mm256_mask_mul_ph() {
18084        let a = _mm256_set_ph(
18085            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18086        );
18087        let b = _mm256_set_ph(
18088            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
18089        );
18090        let src = _mm256_set_ph(
18091            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
18092        );
18093        let r = _mm256_mask_mul_ph(src, 0b0101010101010101, a, b);
18094        let e = _mm256_set_ph(
18095            18., 30., 20., 52., 22., 66., 24., 72., 26., 70., 28., 60., 30., 42., 32., 16.,
18096        );
18097        assert_eq_m256h(r, e);
18098    }
18099
18100    #[simd_test(enable = "avx512fp16,avx512vl")]
18101    unsafe fn test_mm256_maskz_mul_ph() {
18102        let a = _mm256_set_ph(
18103            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18104        );
18105        let b = _mm256_set_ph(
18106            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
18107        );
18108        let r = _mm256_maskz_mul_ph(0b0101010101010101, a, b);
18109        let e = _mm256_set_ph(
18110            0., 30., 0., 52., 0., 66., 0., 72., 0., 70., 0., 60., 0., 42., 0., 16.,
18111        );
18112        assert_eq_m256h(r, e);
18113    }
18114
18115    #[simd_test(enable = "avx512fp16")]
18116    unsafe fn test_mm512_mul_ph() {
18117        let a = _mm512_set_ph(
18118            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18119            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18120            31.0, 32.0,
18121        );
18122        let b = _mm512_set_ph(
18123            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18124            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18125            3.0, 2.0, 1.0,
18126        );
18127        let r = _mm512_mul_ph(a, b);
18128        let e = _mm512_set_ph(
18129            32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0,
18130            266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0,
18131            182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0,
18132        );
18133        assert_eq_m512h(r, e);
18134    }
18135
18136    #[simd_test(enable = "avx512fp16")]
18137    unsafe fn test_mm512_mask_mul_ph() {
18138        let a = _mm512_set_ph(
18139            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18140            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18141            31.0, 32.0,
18142        );
18143        let b = _mm512_set_ph(
18144            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18145            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18146            3.0, 2.0, 1.0,
18147        );
18148        let src = _mm512_set_ph(
18149            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
18150            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
18151        );
18152        let r = _mm512_mask_mul_ph(src, 0b01010101010101010101010101010101, a, b);
18153        let e = _mm512_set_ph(
18154            34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272.,
18155            50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32.,
18156        );
18157        assert_eq_m512h(r, e);
18158    }
18159
18160    #[simd_test(enable = "avx512fp16")]
18161    unsafe fn test_mm512_maskz_mul_ph() {
18162        let a = _mm512_set_ph(
18163            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18164            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18165            31.0, 32.0,
18166        );
18167        let b = _mm512_set_ph(
18168            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18169            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18170            3.0, 2.0, 1.0,
18171        );
18172        let r = _mm512_maskz_mul_ph(0b01010101010101010101010101010101, a, b);
18173        let e = _mm512_set_ph(
18174            0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0.,
18175            270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32.,
18176        );
18177        assert_eq_m512h(r, e);
18178    }
18179
18180    #[simd_test(enable = "avx512fp16")]
18181    unsafe fn test_mm512_mul_round_ph() {
18182        let a = _mm512_set_ph(
18183            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18184            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18185            31.0, 32.0,
18186        );
18187        let b = _mm512_set_ph(
18188            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18189            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18190            3.0, 2.0, 1.0,
18191        );
18192        let r = _mm512_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18193        let e = _mm512_set_ph(
18194            32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0,
18195            266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0,
18196            182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0,
18197        );
18198        assert_eq_m512h(r, e);
18199    }
18200
18201    #[simd_test(enable = "avx512fp16")]
18202    unsafe fn test_mm512_mask_mul_round_ph() {
18203        let a = _mm512_set_ph(
18204            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18205            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18206            31.0, 32.0,
18207        );
18208        let b = _mm512_set_ph(
18209            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18210            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18211            3.0, 2.0, 1.0,
18212        );
18213        let src = _mm512_set_ph(
18214            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
18215            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
18216        );
18217        let r = _mm512_mask_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18218            src,
18219            0b01010101010101010101010101010101,
18220            a,
18221            b,
18222        );
18223        let e = _mm512_set_ph(
18224            34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272.,
18225            50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32.,
18226        );
18227        assert_eq_m512h(r, e);
18228    }
18229
18230    #[simd_test(enable = "avx512fp16")]
18231    unsafe fn test_mm512_maskz_mul_round_ph() {
18232        let a = _mm512_set_ph(
18233            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18234            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18235            31.0, 32.0,
18236        );
18237        let b = _mm512_set_ph(
18238            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18239            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18240            3.0, 2.0, 1.0,
18241        );
18242        let r = _mm512_maskz_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18243            0b01010101010101010101010101010101,
18244            a,
18245            b,
18246        );
18247        let e = _mm512_set_ph(
18248            0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0.,
18249            270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32.,
18250        );
18251        assert_eq_m512h(r, e);
18252    }
18253
18254    #[simd_test(enable = "avx512fp16")]
18255    unsafe fn test_mm_mul_round_sh() {
18256        let a = _mm_set_sh(1.0);
18257        let b = _mm_set_sh(2.0);
18258        let r = _mm_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18259        let e = _mm_set_sh(2.0);
18260        assert_eq_m128h(r, e);
18261    }
18262
18263    #[simd_test(enable = "avx512fp16")]
18264    unsafe fn test_mm_mask_mul_round_sh() {
18265        let a = _mm_set_sh(1.0);
18266        let b = _mm_set_sh(2.0);
18267        let src = _mm_set_sh(4.0);
18268        let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18269            src, 0, a, b,
18270        );
18271        let e = _mm_set_sh(4.0);
18272        assert_eq_m128h(r, e);
18273        let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18274            src, 1, a, b,
18275        );
18276        let e = _mm_set_sh(2.0);
18277        assert_eq_m128h(r, e);
18278    }
18279
18280    #[simd_test(enable = "avx512fp16")]
18281    unsafe fn test_mm_maskz_mul_round_sh() {
18282        let a = _mm_set_sh(1.0);
18283        let b = _mm_set_sh(2.0);
18284        let r =
18285            _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18286        let e = _mm_set_sh(0.0);
18287        assert_eq_m128h(r, e);
18288        let r =
18289            _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
18290        let e = _mm_set_sh(2.0);
18291        assert_eq_m128h(r, e);
18292    }
18293
18294    #[simd_test(enable = "avx512fp16")]
18295    unsafe fn test_mm_mul_sh() {
18296        let a = _mm_set_sh(1.0);
18297        let b = _mm_set_sh(2.0);
18298        let r = _mm_mul_sh(a, b);
18299        let e = _mm_set_sh(2.0);
18300        assert_eq_m128h(r, e);
18301    }
18302
18303    #[simd_test(enable = "avx512fp16")]
18304    unsafe fn test_mm_mask_mul_sh() {
18305        let a = _mm_set_sh(1.0);
18306        let b = _mm_set_sh(2.0);
18307        let src = _mm_set_sh(4.0);
18308        let r = _mm_mask_mul_sh(src, 0, a, b);
18309        let e = _mm_set_sh(4.0);
18310        assert_eq_m128h(r, e);
18311        let r = _mm_mask_mul_sh(src, 1, a, b);
18312        let e = _mm_set_sh(2.0);
18313        assert_eq_m128h(r, e);
18314    }
18315
18316    #[simd_test(enable = "avx512fp16")]
18317    unsafe fn test_mm_maskz_mul_sh() {
18318        let a = _mm_set_sh(1.0);
18319        let b = _mm_set_sh(2.0);
18320        let r = _mm_maskz_mul_sh(0, a, b);
18321        let e = _mm_set_sh(0.0);
18322        assert_eq_m128h(r, e);
18323        let r = _mm_maskz_mul_sh(1, a, b);
18324        let e = _mm_set_sh(2.0);
18325        assert_eq_m128h(r, e);
18326    }
18327
18328    #[simd_test(enable = "avx512fp16,avx512vl")]
18329    unsafe fn test_mm_div_ph() {
18330        let a = _mm_set1_ph(1.0);
18331        let b = _mm_set1_ph(2.0);
18332        let r = _mm_div_ph(a, b);
18333        let e = _mm_set1_ph(0.5);
18334        assert_eq_m128h(r, e);
18335    }
18336
18337    #[simd_test(enable = "avx512fp16,avx512vl")]
18338    unsafe fn test_mm_mask_div_ph() {
18339        let a = _mm_set1_ph(1.0);
18340        let b = _mm_set1_ph(2.0);
18341        let src = _mm_set_ph(4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0);
18342        let r = _mm_mask_div_ph(src, 0b01010101, a, b);
18343        let e = _mm_set_ph(4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5);
18344        assert_eq_m128h(r, e);
18345    }
18346
18347    #[simd_test(enable = "avx512fp16,avx512vl")]
18348    unsafe fn test_mm_maskz_div_ph() {
18349        let a = _mm_set1_ph(1.0);
18350        let b = _mm_set1_ph(2.0);
18351        let r = _mm_maskz_div_ph(0b01010101, a, b);
18352        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
18353        assert_eq_m128h(r, e);
18354    }
18355
18356    #[simd_test(enable = "avx512fp16,avx512vl")]
18357    unsafe fn test_mm256_div_ph() {
18358        let a = _mm256_set1_ph(1.0);
18359        let b = _mm256_set1_ph(2.0);
18360        let r = _mm256_div_ph(a, b);
18361        let e = _mm256_set1_ph(0.5);
18362        assert_eq_m256h(r, e);
18363    }
18364
18365    #[simd_test(enable = "avx512fp16,avx512vl")]
18366    unsafe fn test_mm256_mask_div_ph() {
18367        let a = _mm256_set1_ph(1.0);
18368        let b = _mm256_set1_ph(2.0);
18369        let src = _mm256_set_ph(
18370            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
18371            19.0,
18372        );
18373        let r = _mm256_mask_div_ph(src, 0b0101010101010101, a, b);
18374        let e = _mm256_set_ph(
18375            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
18376        );
18377        assert_eq_m256h(r, e);
18378    }
18379
18380    #[simd_test(enable = "avx512fp16,avx512vl")]
18381    unsafe fn test_mm256_maskz_div_ph() {
18382        let a = _mm256_set1_ph(1.0);
18383        let b = _mm256_set1_ph(2.0);
18384        let r = _mm256_maskz_div_ph(0b0101010101010101, a, b);
18385        let e = _mm256_set_ph(
18386            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
18387        );
18388        assert_eq_m256h(r, e);
18389    }
18390
18391    #[simd_test(enable = "avx512fp16")]
18392    unsafe fn test_mm512_div_ph() {
18393        let a = _mm512_set1_ph(1.0);
18394        let b = _mm512_set1_ph(2.0);
18395        let r = _mm512_div_ph(a, b);
18396        let e = _mm512_set1_ph(0.5);
18397        assert_eq_m512h(r, e);
18398    }
18399
18400    #[simd_test(enable = "avx512fp16")]
18401    unsafe fn test_mm512_mask_div_ph() {
18402        let a = _mm512_set1_ph(1.0);
18403        let b = _mm512_set1_ph(2.0);
18404        let src = _mm512_set_ph(
18405            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
18406            19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0,
18407            33.0, 34.0, 35.0,
18408        );
18409        let r = _mm512_mask_div_ph(src, 0b01010101010101010101010101010101, a, b);
18410        let e = _mm512_set_ph(
18411            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
18412            20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5,
18413        );
18414        assert_eq_m512h(r, e);
18415    }
18416
18417    #[simd_test(enable = "avx512fp16")]
18418    unsafe fn test_mm512_maskz_div_ph() {
18419        let a = _mm512_set1_ph(1.0);
18420        let b = _mm512_set1_ph(2.0);
18421        let r = _mm512_maskz_div_ph(0b01010101010101010101010101010101, a, b);
18422        let e = _mm512_set_ph(
18423            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
18424            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
18425        );
18426        assert_eq_m512h(r, e);
18427    }
18428
18429    #[simd_test(enable = "avx512fp16")]
18430    unsafe fn test_mm512_div_round_ph() {
18431        let a = _mm512_set1_ph(1.0);
18432        let b = _mm512_set1_ph(2.0);
18433        let r = _mm512_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18434        let e = _mm512_set1_ph(0.5);
18435        assert_eq_m512h(r, e);
18436    }
18437
18438    #[simd_test(enable = "avx512fp16")]
18439    unsafe fn test_mm512_mask_div_round_ph() {
18440        let a = _mm512_set1_ph(1.0);
18441        let b = _mm512_set1_ph(2.0);
18442        let src = _mm512_set_ph(
18443            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
18444            19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0,
18445            33.0, 34.0, 35.0,
18446        );
18447        let r = _mm512_mask_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18448            src,
18449            0b01010101010101010101010101010101,
18450            a,
18451            b,
18452        );
18453        let e = _mm512_set_ph(
18454            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
18455            20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5,
18456        );
18457        assert_eq_m512h(r, e);
18458    }
18459
18460    #[simd_test(enable = "avx512fp16")]
18461    unsafe fn test_mm512_maskz_div_round_ph() {
18462        let a = _mm512_set1_ph(1.0);
18463        let b = _mm512_set1_ph(2.0);
18464        let r = _mm512_maskz_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18465            0b01010101010101010101010101010101,
18466            a,
18467            b,
18468        );
18469        let e = _mm512_set_ph(
18470            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
18471            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
18472        );
18473        assert_eq_m512h(r, e);
18474    }
18475
18476    #[simd_test(enable = "avx512fp16")]
18477    unsafe fn test_mm_div_round_sh() {
18478        let a = _mm_set_sh(1.0);
18479        let b = _mm_set_sh(2.0);
18480        let r = _mm_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18481        let e = _mm_set_sh(0.5);
18482        assert_eq_m128h(r, e);
18483    }
18484
18485    #[simd_test(enable = "avx512fp16")]
18486    unsafe fn test_mm_mask_div_round_sh() {
18487        let a = _mm_set_sh(1.0);
18488        let b = _mm_set_sh(2.0);
18489        let src = _mm_set_sh(4.0);
18490        let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18491            src, 0, a, b,
18492        );
18493        let e = _mm_set_sh(4.0);
18494        assert_eq_m128h(r, e);
18495        let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18496            src, 1, a, b,
18497        );
18498        let e = _mm_set_sh(0.5);
18499        assert_eq_m128h(r, e);
18500    }
18501
18502    #[simd_test(enable = "avx512fp16")]
18503    unsafe fn test_mm_maskz_div_round_sh() {
18504        let a = _mm_set_sh(1.0);
18505        let b = _mm_set_sh(2.0);
18506        let r =
18507            _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18508        let e = _mm_set_sh(0.0);
18509        assert_eq_m128h(r, e);
18510        let r =
18511            _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
18512        let e = _mm_set_sh(0.5);
18513        assert_eq_m128h(r, e);
18514    }
18515
18516    #[simd_test(enable = "avx512fp16")]
18517    unsafe fn test_mm_div_sh() {
18518        let a = _mm_set_sh(1.0);
18519        let b = _mm_set_sh(2.0);
18520        let r = _mm_div_sh(a, b);
18521        let e = _mm_set_sh(0.5);
18522        assert_eq_m128h(r, e);
18523    }
18524
18525    #[simd_test(enable = "avx512fp16")]
18526    unsafe fn test_mm_mask_div_sh() {
18527        let a = _mm_set_sh(1.0);
18528        let b = _mm_set_sh(2.0);
18529        let src = _mm_set_sh(4.0);
18530        let r = _mm_mask_div_sh(src, 0, a, b);
18531        let e = _mm_set_sh(4.0);
18532        assert_eq_m128h(r, e);
18533        let r = _mm_mask_div_sh(src, 1, a, b);
18534        let e = _mm_set_sh(0.5);
18535        assert_eq_m128h(r, e);
18536    }
18537
18538    #[simd_test(enable = "avx512fp16")]
18539    unsafe fn test_mm_maskz_div_sh() {
18540        let a = _mm_set_sh(1.0);
18541        let b = _mm_set_sh(2.0);
18542        let r = _mm_maskz_div_sh(0, a, b);
18543        let e = _mm_set_sh(0.0);
18544        assert_eq_m128h(r, e);
18545        let r = _mm_maskz_div_sh(1, a, b);
18546        let e = _mm_set_sh(0.5);
18547        assert_eq_m128h(r, e);
18548    }
18549
18550    #[simd_test(enable = "avx512fp16,avx512vl")]
18551    unsafe fn test_mm_mul_pch() {
18552        let a = _mm_set1_pch(0.0, 1.0);
18553        let b = _mm_set1_pch(0.0, 1.0);
18554        let r = _mm_mul_pch(a, b);
18555        let e = _mm_set1_pch(-1.0, 0.0);
18556        assert_eq_m128h(r, e);
18557    }
18558
18559    #[simd_test(enable = "avx512fp16,avx512vl")]
18560    unsafe fn test_mm_mask_mul_pch() {
18561        let a = _mm_set1_pch(0.0, 1.0);
18562        let b = _mm_set1_pch(0.0, 1.0);
18563        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
18564        let r = _mm_mask_mul_pch(src, 0b0101, a, b);
18565        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
18566        assert_eq_m128h(r, e);
18567    }
18568
18569    #[simd_test(enable = "avx512fp16,avx512vl")]
18570    unsafe fn test_mm_maskz_mul_pch() {
18571        let a = _mm_set1_pch(0.0, 1.0);
18572        let b = _mm_set1_pch(0.0, 1.0);
18573        let r = _mm_maskz_mul_pch(0b0101, a, b);
18574        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
18575        assert_eq_m128h(r, e);
18576    }
18577
18578    #[simd_test(enable = "avx512fp16,avx512vl")]
18579    unsafe fn test_mm256_mul_pch() {
18580        let a = _mm256_set1_pch(0.0, 1.0);
18581        let b = _mm256_set1_pch(0.0, 1.0);
18582        let r = _mm256_mul_pch(a, b);
18583        let e = _mm256_set1_pch(-1.0, 0.0);
18584        assert_eq_m256h(r, e);
18585    }
18586
18587    #[simd_test(enable = "avx512fp16,avx512vl")]
18588    unsafe fn test_mm256_mask_mul_pch() {
18589        let a = _mm256_set1_pch(0.0, 1.0);
18590        let b = _mm256_set1_pch(0.0, 1.0);
18591        let src = _mm256_setr_ph(
18592            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18593        );
18594        let r = _mm256_mask_mul_pch(src, 0b01010101, a, b);
18595        let e = _mm256_setr_ph(
18596            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18597        );
18598        assert_eq_m256h(r, e);
18599    }
18600
18601    #[simd_test(enable = "avx512fp16,avx512vl")]
18602    unsafe fn test_mm256_maskz_mul_pch() {
18603        let a = _mm256_set1_pch(0.0, 1.0);
18604        let b = _mm256_set1_pch(0.0, 1.0);
18605        let r = _mm256_maskz_mul_pch(0b01010101, a, b);
18606        let e = _mm256_setr_ph(
18607            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18608        );
18609        assert_eq_m256h(r, e);
18610    }
18611
18612    #[simd_test(enable = "avx512fp16")]
18613    unsafe fn test_mm512_mul_pch() {
18614        let a = _mm512_set1_pch(0.0, 1.0);
18615        let b = _mm512_set1_pch(0.0, 1.0);
18616        let r = _mm512_mul_pch(a, b);
18617        let e = _mm512_set1_pch(-1.0, 0.0);
18618        assert_eq_m512h(r, e);
18619    }
18620
18621    #[simd_test(enable = "avx512fp16")]
18622    unsafe fn test_mm512_mask_mul_pch() {
18623        let a = _mm512_set1_pch(0.0, 1.0);
18624        let b = _mm512_set1_pch(0.0, 1.0);
18625        let src = _mm512_setr_ph(
18626            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18627            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
18628            32.0, 33.0,
18629        );
18630        let r = _mm512_mask_mul_pch(src, 0b0101010101010101, a, b);
18631        let e = _mm512_setr_ph(
18632            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18633            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
18634            33.0,
18635        );
18636        assert_eq_m512h(r, e);
18637    }
18638
18639    #[simd_test(enable = "avx512fp16")]
18640    unsafe fn test_mm512_maskz_mul_pch() {
18641        let a = _mm512_set1_pch(0.0, 1.0);
18642        let b = _mm512_set1_pch(0.0, 1.0);
18643        let r = _mm512_maskz_mul_pch(0b0101010101010101, a, b);
18644        let e = _mm512_setr_ph(
18645            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18646            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18647        );
18648        assert_eq_m512h(r, e);
18649    }
18650
18651    #[simd_test(enable = "avx512fp16")]
18652    unsafe fn test_mm512_mul_round_pch() {
18653        let a = _mm512_set1_pch(0.0, 1.0);
18654        let b = _mm512_set1_pch(0.0, 1.0);
18655        let r = _mm512_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18656        let e = _mm512_set1_pch(-1.0, 0.0);
18657        assert_eq_m512h(r, e);
18658    }
18659
18660    #[simd_test(enable = "avx512fp16")]
18661    unsafe fn test_mm512_mask_mul_round_pch() {
18662        let a = _mm512_set1_pch(0.0, 1.0);
18663        let b = _mm512_set1_pch(0.0, 1.0);
18664        let src = _mm512_setr_ph(
18665            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18666            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
18667            32.0, 33.0,
18668        );
18669        let r = _mm512_mask_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18670            src,
18671            0b0101010101010101,
18672            a,
18673            b,
18674        );
18675        let e = _mm512_setr_ph(
18676            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18677            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
18678            33.0,
18679        );
18680        assert_eq_m512h(r, e);
18681    }
18682
18683    #[simd_test(enable = "avx512fp16")]
18684    unsafe fn test_mm512_maskz_mul_round_pch() {
18685        let a = _mm512_set1_pch(0.0, 1.0);
18686        let b = _mm512_set1_pch(0.0, 1.0);
18687        let r = _mm512_maskz_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18688            0b0101010101010101,
18689            a,
18690            b,
18691        );
18692        let e = _mm512_setr_ph(
18693            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18694            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18695        );
18696        assert_eq_m512h(r, e);
18697    }
18698
18699    #[simd_test(enable = "avx512fp16")]
18700    unsafe fn test_mm_mul_round_sch() {
18701        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18702        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18703        let r = _mm_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18704        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18705        assert_eq_m128h(r, e);
18706    }
18707
18708    #[simd_test(enable = "avx512fp16")]
18709    unsafe fn test_mm_mask_mul_round_sch() {
18710        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18711        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18712        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
18713        let r = _mm_mask_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18714            src, 0, a, b,
18715        );
18716        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18717        assert_eq_m128h(r, e);
18718    }
18719
18720    #[simd_test(enable = "avx512fp16")]
18721    unsafe fn test_mm_maskz_mul_round_sch() {
18722        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18723        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18724        let r =
18725            _mm_maskz_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18726        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18727        assert_eq_m128h(r, e);
18728    }
18729
18730    #[simd_test(enable = "avx512fp16")]
18731    unsafe fn test_mm_mul_sch() {
18732        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18733        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18734        let r = _mm_mul_sch(a, b);
18735        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18736        assert_eq_m128h(r, e);
18737    }
18738
18739    #[simd_test(enable = "avx512fp16")]
18740    unsafe fn test_mm_mask_mul_sch() {
18741        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18742        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18743        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
18744        let r = _mm_mask_mul_sch(src, 0, a, b);
18745        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18746        assert_eq_m128h(r, e);
18747    }
18748
18749    #[simd_test(enable = "avx512fp16")]
18750    unsafe fn test_mm_maskz_mul_sch() {
18751        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18752        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18753        let r = _mm_maskz_mul_sch(0, a, b);
18754        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18755        assert_eq_m128h(r, e);
18756    }
18757
18758    #[simd_test(enable = "avx512fp16,avx512vl")]
18759    unsafe fn test_mm_fmul_pch() {
18760        let a = _mm_set1_pch(0.0, 1.0);
18761        let b = _mm_set1_pch(0.0, 1.0);
18762        let r = _mm_fmul_pch(a, b);
18763        let e = _mm_set1_pch(-1.0, 0.0);
18764        assert_eq_m128h(r, e);
18765    }
18766
18767    #[simd_test(enable = "avx512fp16,avx512vl")]
18768    unsafe fn test_mm_mask_fmul_pch() {
18769        let a = _mm_set1_pch(0.0, 1.0);
18770        let b = _mm_set1_pch(0.0, 1.0);
18771        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
18772        let r = _mm_mask_fmul_pch(src, 0b0101, a, b);
18773        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
18774        assert_eq_m128h(r, e);
18775    }
18776
18777    #[simd_test(enable = "avx512fp16,avx512vl")]
18778    unsafe fn test_mm_maskz_fmul_pch() {
18779        let a = _mm_set1_pch(0.0, 1.0);
18780        let b = _mm_set1_pch(0.0, 1.0);
18781        let r = _mm_maskz_fmul_pch(0b0101, a, b);
18782        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
18783        assert_eq_m128h(r, e);
18784    }
18785
18786    #[simd_test(enable = "avx512fp16,avx512vl")]
18787    unsafe fn test_mm256_fmul_pch() {
18788        let a = _mm256_set1_pch(0.0, 1.0);
18789        let b = _mm256_set1_pch(0.0, 1.0);
18790        let r = _mm256_fmul_pch(a, b);
18791        let e = _mm256_set1_pch(-1.0, 0.0);
18792        assert_eq_m256h(r, e);
18793    }
18794
18795    #[simd_test(enable = "avx512fp16,avx512vl")]
18796    unsafe fn test_mm256_mask_fmul_pch() {
18797        let a = _mm256_set1_pch(0.0, 1.0);
18798        let b = _mm256_set1_pch(0.0, 1.0);
18799        let src = _mm256_setr_ph(
18800            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18801        );
18802        let r = _mm256_mask_fmul_pch(src, 0b01010101, a, b);
18803        let e = _mm256_setr_ph(
18804            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18805        );
18806        assert_eq_m256h(r, e);
18807    }
18808
18809    #[simd_test(enable = "avx512fp16,avx512vl")]
18810    unsafe fn test_mm256_maskz_fmul_pch() {
18811        let a = _mm256_set1_pch(0.0, 1.0);
18812        let b = _mm256_set1_pch(0.0, 1.0);
18813        let r = _mm256_maskz_fmul_pch(0b01010101, a, b);
18814        let e = _mm256_setr_ph(
18815            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18816        );
18817        assert_eq_m256h(r, e);
18818    }
18819
18820    #[simd_test(enable = "avx512fp16")]
18821    unsafe fn test_mm512_fmul_pch() {
18822        let a = _mm512_set1_pch(0.0, 1.0);
18823        let b = _mm512_set1_pch(0.0, 1.0);
18824        let r = _mm512_fmul_pch(a, b);
18825        let e = _mm512_set1_pch(-1.0, 0.0);
18826        assert_eq_m512h(r, e);
18827    }
18828
18829    #[simd_test(enable = "avx512fp16")]
18830    unsafe fn test_mm512_mask_fmul_pch() {
18831        let a = _mm512_set1_pch(0.0, 1.0);
18832        let b = _mm512_set1_pch(0.0, 1.0);
18833        let src = _mm512_setr_ph(
18834            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18835            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
18836            32.0, 33.0,
18837        );
18838        let r = _mm512_mask_fmul_pch(src, 0b0101010101010101, a, b);
18839        let e = _mm512_setr_ph(
18840            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18841            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
18842            33.0,
18843        );
18844        assert_eq_m512h(r, e);
18845    }
18846
18847    #[simd_test(enable = "avx512fp16")]
18848    unsafe fn test_mm512_maskz_fmul_pch() {
18849        let a = _mm512_set1_pch(0.0, 1.0);
18850        let b = _mm512_set1_pch(0.0, 1.0);
18851        let r = _mm512_maskz_fmul_pch(0b0101010101010101, a, b);
18852        let e = _mm512_setr_ph(
18853            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18854            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18855        );
18856        assert_eq_m512h(r, e);
18857    }
18858
18859    #[simd_test(enable = "avx512fp16")]
18860    unsafe fn test_mm512_fmul_round_pch() {
18861        let a = _mm512_set1_pch(0.0, 1.0);
18862        let b = _mm512_set1_pch(0.0, 1.0);
18863        let r = _mm512_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18864        let e = _mm512_set1_pch(-1.0, 0.0);
18865        assert_eq_m512h(r, e);
18866    }
18867
18868    #[simd_test(enable = "avx512fp16")]
18869    unsafe fn test_mm512_mask_fmul_round_pch() {
18870        let a = _mm512_set1_pch(0.0, 1.0);
18871        let b = _mm512_set1_pch(0.0, 1.0);
18872        let src = _mm512_setr_ph(
18873            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18874            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
18875            32.0, 33.0,
18876        );
18877        let r = _mm512_mask_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18878            src,
18879            0b0101010101010101,
18880            a,
18881            b,
18882        );
18883        let e = _mm512_setr_ph(
18884            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18885            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
18886            33.0,
18887        );
18888        assert_eq_m512h(r, e);
18889    }
18890
18891    #[simd_test(enable = "avx512fp16")]
18892    unsafe fn test_mm512_maskz_fmul_round_pch() {
18893        let a = _mm512_set1_pch(0.0, 1.0);
18894        let b = _mm512_set1_pch(0.0, 1.0);
18895        let r = _mm512_maskz_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18896            0b0101010101010101,
18897            a,
18898            b,
18899        );
18900        let e = _mm512_setr_ph(
18901            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18902            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18903        );
18904        assert_eq_m512h(r, e);
18905    }
18906
18907    #[simd_test(enable = "avx512fp16")]
18908    unsafe fn test_mm_fmul_round_sch() {
18909        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18910        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18911        let r = _mm_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18912        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18913        assert_eq_m128h(r, e);
18914    }
18915
18916    #[simd_test(enable = "avx512fp16")]
18917    unsafe fn test_mm_mask_fmul_round_sch() {
18918        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18919        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18920        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
18921        let r = _mm_mask_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18922            src, 0, a, b,
18923        );
18924        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18925        assert_eq_m128h(r, e);
18926    }
18927
18928    #[simd_test(enable = "avx512fp16")]
18929    unsafe fn test_mm_maskz_fmul_round_sch() {
18930        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18931        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18932        let r =
18933            _mm_maskz_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18934        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18935        assert_eq_m128h(r, e);
18936    }
18937
18938    #[simd_test(enable = "avx512fp16")]
18939    unsafe fn test_mm_fmul_sch() {
18940        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18941        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18942        let r = _mm_fmul_sch(a, b);
18943        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18944        assert_eq_m128h(r, e);
18945    }
18946
18947    #[simd_test(enable = "avx512fp16")]
18948    unsafe fn test_mm_mask_fmul_sch() {
18949        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18950        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18951        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
18952        let r = _mm_mask_fmul_sch(src, 0, a, b);
18953        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18954        assert_eq_m128h(r, e);
18955    }
18956
18957    #[simd_test(enable = "avx512fp16")]
18958    unsafe fn test_mm_maskz_fmul_sch() {
18959        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18960        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18961        let r = _mm_maskz_fmul_sch(0, a, b);
18962        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18963        assert_eq_m128h(r, e);
18964    }
18965
18966    #[simd_test(enable = "avx512fp16,avx512vl")]
18967    unsafe fn test_mm_cmul_pch() {
18968        let a = _mm_set1_pch(0.0, 1.0);
18969        let b = _mm_set1_pch(0.0, -1.0);
18970        let r = _mm_cmul_pch(a, b);
18971        let e = _mm_set1_pch(-1.0, 0.0);
18972        assert_eq_m128h(r, e);
18973    }
18974
18975    #[simd_test(enable = "avx512fp16,avx512vl")]
18976    unsafe fn test_mm_mask_cmul_pch() {
18977        let a = _mm_set1_pch(0.0, 1.0);
18978        let b = _mm_set1_pch(0.0, -1.0);
18979        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
18980        let r = _mm_mask_cmul_pch(src, 0b0101, a, b);
18981        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
18982        assert_eq_m128h(r, e);
18983    }
18984
18985    #[simd_test(enable = "avx512fp16,avx512vl")]
18986    unsafe fn test_mm_maskz_cmul_pch() {
18987        let a = _mm_set1_pch(0.0, 1.0);
18988        let b = _mm_set1_pch(0.0, -1.0);
18989        let r = _mm_maskz_cmul_pch(0b0101, a, b);
18990        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
18991        assert_eq_m128h(r, e);
18992    }
18993
18994    #[simd_test(enable = "avx512fp16,avx512vl")]
18995    unsafe fn test_mm256_cmul_pch() {
18996        let a = _mm256_set1_pch(0.0, 1.0);
18997        let b = _mm256_set1_pch(0.0, -1.0);
18998        let r = _mm256_cmul_pch(a, b);
18999        let e = _mm256_set1_pch(-1.0, 0.0);
19000        assert_eq_m256h(r, e);
19001    }
19002
19003    #[simd_test(enable = "avx512fp16,avx512vl")]
19004    unsafe fn test_mm256_mask_cmul_pch() {
19005        let a = _mm256_set1_pch(0.0, 1.0);
19006        let b = _mm256_set1_pch(0.0, -1.0);
19007        let src = _mm256_setr_ph(
19008            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19009        );
19010        let r = _mm256_mask_cmul_pch(src, 0b01010101, a, b);
19011        let e = _mm256_setr_ph(
19012            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19013        );
19014        assert_eq_m256h(r, e);
19015    }
19016
19017    #[simd_test(enable = "avx512fp16,avx512vl")]
19018    unsafe fn test_mm256_maskz_cmul_pch() {
19019        let a = _mm256_set1_pch(0.0, 1.0);
19020        let b = _mm256_set1_pch(0.0, -1.0);
19021        let r = _mm256_maskz_cmul_pch(0b01010101, a, b);
19022        let e = _mm256_setr_ph(
19023            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19024        );
19025        assert_eq_m256h(r, e);
19026    }
19027
19028    #[simd_test(enable = "avx512fp16")]
19029    unsafe fn test_mm512_cmul_pch() {
19030        let a = _mm512_set1_pch(0.0, 1.0);
19031        let b = _mm512_set1_pch(0.0, -1.0);
19032        let r = _mm512_cmul_pch(a, b);
19033        let e = _mm512_set1_pch(-1.0, 0.0);
19034        assert_eq_m512h(r, e);
19035    }
19036
19037    #[simd_test(enable = "avx512fp16")]
19038    unsafe fn test_mm512_mask_cmul_pch() {
19039        let a = _mm512_set1_pch(0.0, 1.0);
19040        let b = _mm512_set1_pch(0.0, -1.0);
19041        let src = _mm512_setr_ph(
19042            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19043            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19044            32.0, 33.0,
19045        );
19046        let r = _mm512_mask_cmul_pch(src, 0b0101010101010101, a, b);
19047        let e = _mm512_setr_ph(
19048            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19049            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19050            33.0,
19051        );
19052        assert_eq_m512h(r, e);
19053    }
19054
19055    #[simd_test(enable = "avx512fp16")]
19056    unsafe fn test_mm512_maskz_cmul_pch() {
19057        let a = _mm512_set1_pch(0.0, 1.0);
19058        let b = _mm512_set1_pch(0.0, -1.0);
19059        let r = _mm512_maskz_cmul_pch(0b0101010101010101, a, b);
19060        let e = _mm512_setr_ph(
19061            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19062            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19063        );
19064        assert_eq_m512h(r, e);
19065    }
19066
19067    #[simd_test(enable = "avx512fp16")]
19068    unsafe fn test_mm512_cmul_round_pch() {
19069        let a = _mm512_set1_pch(0.0, 1.0);
19070        let b = _mm512_set1_pch(0.0, -1.0);
19071        let r = _mm512_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19072        let e = _mm512_set1_pch(-1.0, 0.0);
19073        assert_eq_m512h(r, e);
19074    }
19075
19076    #[simd_test(enable = "avx512fp16")]
19077    unsafe fn test_mm512_mask_cmul_round_pch() {
19078        let a = _mm512_set1_pch(0.0, 1.0);
19079        let b = _mm512_set1_pch(0.0, -1.0);
19080        let src = _mm512_setr_ph(
19081            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19082            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19083            32.0, 33.0,
19084        );
19085        let r = _mm512_mask_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19086            src,
19087            0b0101010101010101,
19088            a,
19089            b,
19090        );
19091        let e = _mm512_setr_ph(
19092            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19093            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19094            33.0,
19095        );
19096        assert_eq_m512h(r, e);
19097    }
19098
19099    #[simd_test(enable = "avx512fp16")]
19100    unsafe fn test_mm512_maskz_cmul_round_pch() {
19101        let a = _mm512_set1_pch(0.0, 1.0);
19102        let b = _mm512_set1_pch(0.0, -1.0);
19103        let r = _mm512_maskz_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19104            0b0101010101010101,
19105            a,
19106            b,
19107        );
19108        let e = _mm512_setr_ph(
19109            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19110            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19111        );
19112        assert_eq_m512h(r, e);
19113    }
19114
19115    #[simd_test(enable = "avx512fp16")]
19116    unsafe fn test_mm_cmul_sch() {
19117        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19118        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19119        let r = _mm_cmul_sch(a, b);
19120        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19121        assert_eq_m128h(r, e);
19122    }
19123
19124    #[simd_test(enable = "avx512fp16")]
19125    unsafe fn test_mm_mask_cmul_sch() {
19126        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19127        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19128        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19129        let r = _mm_mask_cmul_sch(src, 0, a, b);
19130        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19131        assert_eq_m128h(r, e);
19132    }
19133
19134    #[simd_test(enable = "avx512fp16")]
19135    unsafe fn test_mm_maskz_cmul_sch() {
19136        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19137        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19138        let r = _mm_maskz_cmul_sch(0, a, b);
19139        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19140        assert_eq_m128h(r, e);
19141    }
19142
19143    #[simd_test(enable = "avx512fp16")]
19144    unsafe fn test_mm_cmul_round_sch() {
19145        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19146        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19147        let r = _mm_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19148        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19149        assert_eq_m128h(r, e);
19150    }
19151
19152    #[simd_test(enable = "avx512fp16")]
19153    unsafe fn test_mm_mask_cmul_round_sch() {
19154        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19155        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19156        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19157        let r = _mm_mask_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19158            src, 0, a, b,
19159        );
19160        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19161        assert_eq_m128h(r, e);
19162    }
19163
19164    #[simd_test(enable = "avx512fp16")]
19165    unsafe fn test_mm_maskz_cmul_round_sch() {
19166        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19167        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19168        let r =
19169            _mm_maskz_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
19170        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19171        assert_eq_m128h(r, e);
19172    }
19173
19174    #[simd_test(enable = "avx512fp16,avx512vl")]
19175    unsafe fn test_mm_fcmul_pch() {
19176        let a = _mm_set1_pch(0.0, 1.0);
19177        let b = _mm_set1_pch(0.0, -1.0);
19178        let r = _mm_fcmul_pch(a, b);
19179        let e = _mm_set1_pch(-1.0, 0.0);
19180        assert_eq_m128h(r, e);
19181    }
19182
19183    #[simd_test(enable = "avx512fp16,avx512vl")]
19184    unsafe fn test_mm_mask_fcmul_pch() {
19185        let a = _mm_set1_pch(0.0, 1.0);
19186        let b = _mm_set1_pch(0.0, -1.0);
19187        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
19188        let r = _mm_mask_fcmul_pch(src, 0b0101, a, b);
19189        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
19190        assert_eq_m128h(r, e);
19191    }
19192
19193    #[simd_test(enable = "avx512fp16,avx512vl")]
19194    unsafe fn test_mm_maskz_fcmul_pch() {
19195        let a = _mm_set1_pch(0.0, 1.0);
19196        let b = _mm_set1_pch(0.0, -1.0);
19197        let r = _mm_maskz_fcmul_pch(0b0101, a, b);
19198        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
19199        assert_eq_m128h(r, e);
19200    }
19201
19202    #[simd_test(enable = "avx512fp16,avx512vl")]
19203    unsafe fn test_mm256_fcmul_pch() {
19204        let a = _mm256_set1_pch(0.0, 1.0);
19205        let b = _mm256_set1_pch(0.0, -1.0);
19206        let r = _mm256_fcmul_pch(a, b);
19207        let e = _mm256_set1_pch(-1.0, 0.0);
19208        assert_eq_m256h(r, e);
19209    }
19210
19211    #[simd_test(enable = "avx512fp16,avx512vl")]
19212    unsafe fn test_mm256_mask_fcmul_pch() {
19213        let a = _mm256_set1_pch(0.0, 1.0);
19214        let b = _mm256_set1_pch(0.0, -1.0);
19215        let src = _mm256_setr_ph(
19216            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19217        );
19218        let r = _mm256_mask_fcmul_pch(src, 0b01010101, a, b);
19219        let e = _mm256_setr_ph(
19220            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19221        );
19222        assert_eq_m256h(r, e);
19223    }
19224
19225    #[simd_test(enable = "avx512fp16,avx512vl")]
19226    unsafe fn test_mm256_maskz_fcmul_pch() {
19227        let a = _mm256_set1_pch(0.0, 1.0);
19228        let b = _mm256_set1_pch(0.0, -1.0);
19229        let r = _mm256_maskz_fcmul_pch(0b01010101, a, b);
19230        let e = _mm256_setr_ph(
19231            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19232        );
19233        assert_eq_m256h(r, e);
19234    }
19235
19236    #[simd_test(enable = "avx512fp16")]
19237    unsafe fn test_mm512_fcmul_pch() {
19238        let a = _mm512_set1_pch(0.0, 1.0);
19239        let b = _mm512_set1_pch(0.0, -1.0);
19240        let r = _mm512_fcmul_pch(a, b);
19241        let e = _mm512_set1_pch(-1.0, 0.0);
19242        assert_eq_m512h(r, e);
19243    }
19244
19245    #[simd_test(enable = "avx512fp16")]
19246    unsafe fn test_mm512_mask_fcmul_pch() {
19247        let a = _mm512_set1_pch(0.0, 1.0);
19248        let b = _mm512_set1_pch(0.0, -1.0);
19249        let src = _mm512_setr_ph(
19250            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19251            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19252            32.0, 33.0,
19253        );
19254        let r = _mm512_mask_fcmul_pch(src, 0b0101010101010101, a, b);
19255        let e = _mm512_setr_ph(
19256            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19257            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19258            33.0,
19259        );
19260        assert_eq_m512h(r, e);
19261    }
19262
19263    #[simd_test(enable = "avx512fp16")]
19264    unsafe fn test_mm512_maskz_fcmul_pch() {
19265        let a = _mm512_set1_pch(0.0, 1.0);
19266        let b = _mm512_set1_pch(0.0, -1.0);
19267        let r = _mm512_maskz_fcmul_pch(0b0101010101010101, a, b);
19268        let e = _mm512_setr_ph(
19269            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19270            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19271        );
19272        assert_eq_m512h(r, e);
19273    }
19274
19275    #[simd_test(enable = "avx512fp16")]
19276    unsafe fn test_mm512_fcmul_round_pch() {
19277        let a = _mm512_set1_pch(0.0, 1.0);
19278        let b = _mm512_set1_pch(0.0, -1.0);
19279        let r = _mm512_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19280        let e = _mm512_set1_pch(-1.0, 0.0);
19281        assert_eq_m512h(r, e);
19282    }
19283
19284    #[simd_test(enable = "avx512fp16")]
19285    unsafe fn test_mm512_mask_fcmul_round_pch() {
19286        let a = _mm512_set1_pch(0.0, 1.0);
19287        let b = _mm512_set1_pch(0.0, -1.0);
19288        let src = _mm512_setr_ph(
19289            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19290            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19291            32.0, 33.0,
19292        );
19293        let r = _mm512_mask_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19294            src,
19295            0b0101010101010101,
19296            a,
19297            b,
19298        );
19299        let e = _mm512_setr_ph(
19300            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19301            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19302            33.0,
19303        );
19304        assert_eq_m512h(r, e);
19305    }
19306
19307    #[simd_test(enable = "avx512fp16")]
19308    unsafe fn test_mm512_maskz_fcmul_round_pch() {
19309        let a = _mm512_set1_pch(0.0, 1.0);
19310        let b = _mm512_set1_pch(0.0, -1.0);
19311        let r = _mm512_maskz_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19312            0b0101010101010101,
19313            a,
19314            b,
19315        );
19316        let e = _mm512_setr_ph(
19317            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19318            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19319        );
19320        assert_eq_m512h(r, e);
19321    }
19322
19323    #[simd_test(enable = "avx512fp16")]
19324    unsafe fn test_mm_fcmul_sch() {
19325        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19326        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19327        let r = _mm_fcmul_sch(a, b);
19328        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19329        assert_eq_m128h(r, e);
19330    }
19331
19332    #[simd_test(enable = "avx512fp16")]
19333    unsafe fn test_mm_mask_fcmul_sch() {
19334        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19335        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19336        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19337        let r = _mm_mask_fcmul_sch(src, 0, a, b);
19338        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19339        assert_eq_m128h(r, e);
19340    }
19341
19342    #[simd_test(enable = "avx512fp16")]
19343    unsafe fn test_mm_maskz_fcmul_sch() {
19344        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19345        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19346        let r = _mm_maskz_fcmul_sch(0, a, b);
19347        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19348        assert_eq_m128h(r, e);
19349    }
19350
19351    #[simd_test(enable = "avx512fp16")]
19352    unsafe fn test_mm_fcmul_round_sch() {
19353        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19354        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19355        let r = _mm_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19356        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19357        assert_eq_m128h(r, e);
19358    }
19359
19360    #[simd_test(enable = "avx512fp16")]
19361    unsafe fn test_mm_mask_fcmul_round_sch() {
19362        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19363        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19364        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19365        let r = _mm_mask_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19366            src, 0, a, b,
19367        );
19368        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19369        assert_eq_m128h(r, e);
19370    }
19371
19372    #[simd_test(enable = "avx512fp16")]
19373    unsafe fn test_mm_maskz_fcmul_round_sch() {
19374        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19375        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19376        let r =
19377            _mm_maskz_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
19378        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19379        assert_eq_m128h(r, e);
19380    }
19381
19382    #[simd_test(enable = "avx512fp16,avx512vl")]
19383    unsafe fn test_mm_abs_ph() {
19384        let a = _mm_set_ph(-1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0);
19385        let r = _mm_abs_ph(a);
19386        let e = _mm_set_ph(1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0);
19387        assert_eq_m128h(r, e);
19388    }
19389
19390    #[simd_test(enable = "avx512fp16,avx512vl")]
19391    unsafe fn test_mm256_abs_ph() {
19392        let a = _mm256_set_ph(
19393            -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0,
19394            -14.0,
19395        );
19396        let r = _mm256_abs_ph(a);
19397        let e = _mm256_set_ph(
19398            1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0,
19399        );
19400        assert_eq_m256h(r, e);
19401    }
19402
19403    #[simd_test(enable = "avx512fp16")]
19404    unsafe fn test_mm512_abs_ph() {
19405        let a = _mm512_set_ph(
19406            -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0,
19407            -14.0, 15.0, -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0,
19408            27.0, -28.0, 29.0, -30.0,
19409        );
19410        let r = _mm512_abs_ph(a);
19411        let e = _mm512_set_ph(
19412            1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0,
19413            15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0,
19414            29.0, 30.0,
19415        );
19416        assert_eq_m512h(r, e);
19417    }
19418
19419    #[simd_test(enable = "avx512fp16,avx512vl")]
19420    unsafe fn test_mm_conj_pch() {
19421        let a = _mm_set1_pch(0.0, 1.0);
19422        let r = _mm_conj_pch(a);
19423        let e = _mm_set1_pch(0.0, -1.0);
19424        assert_eq_m128h(r, e);
19425    }
19426
19427    #[simd_test(enable = "avx512fp16,avx512vl")]
19428    unsafe fn test_mm_mask_conj_pch() {
19429        let a = _mm_set1_pch(0.0, 1.0);
19430        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
19431        let r = _mm_mask_conj_pch(src, 0b0101, a);
19432        let e = _mm_setr_ph(0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0);
19433        assert_eq_m128h(r, e);
19434    }
19435
19436    #[simd_test(enable = "avx512fp16,avx512vl")]
19437    unsafe fn test_mm_maskz_conj_pch() {
19438        let a = _mm_set1_pch(0.0, 1.0);
19439        let r = _mm_maskz_conj_pch(0b0101, a);
19440        let e = _mm_setr_ph(0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0);
19441        assert_eq_m128h(r, e);
19442    }
19443
19444    #[simd_test(enable = "avx512fp16,avx512vl")]
19445    unsafe fn test_mm256_conj_pch() {
19446        let a = _mm256_set1_pch(0.0, 1.0);
19447        let r = _mm256_conj_pch(a);
19448        let e = _mm256_set1_pch(0.0, -1.0);
19449        assert_eq_m256h(r, e);
19450    }
19451
19452    #[simd_test(enable = "avx512fp16,avx512vl")]
19453    unsafe fn test_mm256_mask_conj_pch() {
19454        let a = _mm256_set1_pch(0.0, 1.0);
19455        let src = _mm256_setr_ph(
19456            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19457        );
19458        let r = _mm256_mask_conj_pch(src, 0b01010101, a);
19459        let e = _mm256_setr_ph(
19460            0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0,
19461        );
19462        assert_eq_m256h(r, e);
19463    }
19464
19465    #[simd_test(enable = "avx512fp16,avx512vl")]
19466    unsafe fn test_mm256_maskz_conj_pch() {
19467        let a = _mm256_set1_pch(0.0, 1.0);
19468        let r = _mm256_maskz_conj_pch(0b01010101, a);
19469        let e = _mm256_setr_ph(
19470            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
19471        );
19472        assert_eq_m256h(r, e);
19473    }
19474
19475    #[simd_test(enable = "avx512fp16")]
19476    unsafe fn test_mm512_conj_pch() {
19477        let a = _mm512_set1_pch(0.0, 1.0);
19478        let r = _mm512_conj_pch(a);
19479        let e = _mm512_set1_pch(0.0, -1.0);
19480        assert_eq_m512h(r, e);
19481    }
19482
19483    #[simd_test(enable = "avx512fp16")]
19484    unsafe fn test_mm512_mask_conj_pch() {
19485        let a = _mm512_set1_pch(0.0, 1.0);
19486        let src = _mm512_setr_ph(
19487            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19488            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19489            32.0, 33.0,
19490        );
19491        let r = _mm512_mask_conj_pch(src, 0b0101010101010101, a);
19492        let e = _mm512_setr_ph(
19493            0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0,
19494            0.0, -1.0, 20.0, 21.0, 0.0, -1.0, 24.0, 25.0, 0.0, -1.0, 28.0, 29.0, 0.0, -1.0, 32.0,
19495            33.0,
19496        );
19497        assert_eq_m512h(r, e);
19498    }
19499
19500    #[simd_test(enable = "avx512fp16")]
19501    unsafe fn test_mm512_maskz_conj_pch() {
19502        let a = _mm512_set1_pch(0.0, 1.0);
19503        let r = _mm512_maskz_conj_pch(0b0101010101010101, a);
19504        let e = _mm512_setr_ph(
19505            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
19506            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
19507        );
19508        assert_eq_m512h(r, e);
19509    }
19510
19511    #[simd_test(enable = "avx512fp16,avx512vl")]
19512    unsafe fn test_mm_fmadd_pch() {
19513        let a = _mm_set1_pch(0.0, 1.0);
19514        let b = _mm_set1_pch(0.0, 2.0);
19515        let c = _mm_set1_pch(0.0, 3.0);
19516        let r = _mm_fmadd_pch(a, b, c);
19517        let e = _mm_set1_pch(-2.0, 3.0);
19518        assert_eq_m128h(r, e);
19519    }
19520
19521    #[simd_test(enable = "avx512fp16,avx512vl")]
19522    unsafe fn test_mm_mask_fmadd_pch() {
19523        let a = _mm_set1_pch(0.0, 1.0);
19524        let b = _mm_set1_pch(0.0, 2.0);
19525        let c = _mm_set1_pch(0.0, 3.0);
19526        let r = _mm_mask_fmadd_pch(a, 0b0101, b, c);
19527        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0);
19528        assert_eq_m128h(r, e);
19529    }
19530
19531    #[simd_test(enable = "avx512fp16,avx512vl")]
19532    unsafe fn test_mm_mask3_fmadd_pch() {
19533        let a = _mm_set1_pch(0.0, 1.0);
19534        let b = _mm_set1_pch(0.0, 2.0);
19535        let c = _mm_set1_pch(0.0, 3.0);
19536        let r = _mm_mask3_fmadd_pch(a, b, c, 0b0101);
19537        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0);
19538        assert_eq_m128h(r, e);
19539    }
19540
19541    #[simd_test(enable = "avx512fp16,avx512vl")]
19542    unsafe fn test_mm_maskz_fmadd_pch() {
19543        let a = _mm_set1_pch(0.0, 1.0);
19544        let b = _mm_set1_pch(0.0, 2.0);
19545        let c = _mm_set1_pch(0.0, 3.0);
19546        let r = _mm_maskz_fmadd_pch(0b0101, a, b, c);
19547        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0);
19548        assert_eq_m128h(r, e);
19549    }
19550
19551    #[simd_test(enable = "avx512fp16,avx512vl")]
19552    unsafe fn test_mm256_fmadd_pch() {
19553        let a = _mm256_set1_pch(0.0, 1.0);
19554        let b = _mm256_set1_pch(0.0, 2.0);
19555        let c = _mm256_set1_pch(0.0, 3.0);
19556        let r = _mm256_fmadd_pch(a, b, c);
19557        let e = _mm256_set1_pch(-2.0, 3.0);
19558        assert_eq_m256h(r, e);
19559    }
19560
19561    #[simd_test(enable = "avx512fp16,avx512vl")]
19562    unsafe fn test_mm256_mask_fmadd_pch() {
19563        let a = _mm256_set1_pch(0.0, 1.0);
19564        let b = _mm256_set1_pch(0.0, 2.0);
19565        let c = _mm256_set1_pch(0.0, 3.0);
19566        let r = _mm256_mask_fmadd_pch(a, 0b01010101, b, c);
19567        let e = _mm256_setr_ph(
19568            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19569        );
19570        assert_eq_m256h(r, e);
19571    }
19572
19573    #[simd_test(enable = "avx512fp16,avx512vl")]
19574    unsafe fn test_mm256_mask3_fmadd_pch() {
19575        let a = _mm256_set1_pch(0.0, 1.0);
19576        let b = _mm256_set1_pch(0.0, 2.0);
19577        let c = _mm256_set1_pch(0.0, 3.0);
19578        let r = _mm256_mask3_fmadd_pch(a, b, c, 0b01010101);
19579        let e = _mm256_setr_ph(
19580            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19581        );
19582        assert_eq_m256h(r, e);
19583    }
19584
19585    #[simd_test(enable = "avx512fp16,avx512vl")]
19586    unsafe fn test_mm256_maskz_fmadd_pch() {
19587        let a = _mm256_set1_pch(0.0, 1.0);
19588        let b = _mm256_set1_pch(0.0, 2.0);
19589        let c = _mm256_set1_pch(0.0, 3.0);
19590        let r = _mm256_maskz_fmadd_pch(0b01010101, a, b, c);
19591        let e = _mm256_setr_ph(
19592            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19593        );
19594        assert_eq_m256h(r, e);
19595    }
19596
19597    #[simd_test(enable = "avx512fp16")]
19598    unsafe fn test_mm512_fmadd_pch() {
19599        let a = _mm512_set1_pch(0.0, 1.0);
19600        let b = _mm512_set1_pch(0.0, 2.0);
19601        let c = _mm512_set1_pch(0.0, 3.0);
19602        let r = _mm512_fmadd_pch(a, b, c);
19603        let e = _mm512_set1_pch(-2.0, 3.0);
19604        assert_eq_m512h(r, e);
19605    }
19606
19607    #[simd_test(enable = "avx512fp16")]
19608    unsafe fn test_mm512_mask_fmadd_pch() {
19609        let a = _mm512_set1_pch(0.0, 1.0);
19610        let b = _mm512_set1_pch(0.0, 2.0);
19611        let c = _mm512_set1_pch(0.0, 3.0);
19612        let r = _mm512_mask_fmadd_pch(a, 0b0101010101010101, b, c);
19613        let e = _mm512_setr_ph(
19614            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19615            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19616        );
19617        assert_eq_m512h(r, e);
19618    }
19619
19620    #[simd_test(enable = "avx512fp16")]
19621    unsafe fn test_mm512_mask3_fmadd_pch() {
19622        let a = _mm512_set1_pch(0.0, 1.0);
19623        let b = _mm512_set1_pch(0.0, 2.0);
19624        let c = _mm512_set1_pch(0.0, 3.0);
19625        let r = _mm512_mask3_fmadd_pch(a, b, c, 0b0101010101010101);
19626        let e = _mm512_setr_ph(
19627            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19628            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19629        );
19630        assert_eq_m512h(r, e);
19631    }
19632
19633    #[simd_test(enable = "avx512fp16")]
19634    unsafe fn test_mm512_maskz_fmadd_pch() {
19635        let a = _mm512_set1_pch(0.0, 1.0);
19636        let b = _mm512_set1_pch(0.0, 2.0);
19637        let c = _mm512_set1_pch(0.0, 3.0);
19638        let r = _mm512_maskz_fmadd_pch(0b0101010101010101, a, b, c);
19639        let e = _mm512_setr_ph(
19640            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19641            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19642        );
19643        assert_eq_m512h(r, e);
19644    }
19645
19646    #[simd_test(enable = "avx512fp16")]
19647    unsafe fn test_mm512_fmadd_round_pch() {
19648        let a = _mm512_set1_pch(0.0, 1.0);
19649        let b = _mm512_set1_pch(0.0, 2.0);
19650        let c = _mm512_set1_pch(0.0, 3.0);
19651        let r =
19652            _mm512_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
19653        let e = _mm512_set1_pch(-2.0, 3.0);
19654        assert_eq_m512h(r, e);
19655    }
19656
19657    #[simd_test(enable = "avx512fp16")]
19658    unsafe fn test_mm512_mask_fmadd_round_pch() {
19659        let a = _mm512_set1_pch(0.0, 1.0);
19660        let b = _mm512_set1_pch(0.0, 2.0);
19661        let c = _mm512_set1_pch(0.0, 3.0);
19662        let r = _mm512_mask_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19663            a,
19664            0b0101010101010101,
19665            b,
19666            c,
19667        );
19668        let e = _mm512_setr_ph(
19669            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19670            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19671        );
19672        assert_eq_m512h(r, e);
19673    }
19674
19675    #[simd_test(enable = "avx512fp16")]
19676    unsafe fn test_mm512_mask3_fmadd_round_pch() {
19677        let a = _mm512_set1_pch(0.0, 1.0);
19678        let b = _mm512_set1_pch(0.0, 2.0);
19679        let c = _mm512_set1_pch(0.0, 3.0);
19680        let r = _mm512_mask3_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19681            a,
19682            b,
19683            c,
19684            0b0101010101010101,
19685        );
19686        let e = _mm512_setr_ph(
19687            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19688            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19689        );
19690        assert_eq_m512h(r, e);
19691    }
19692
19693    #[simd_test(enable = "avx512fp16")]
19694    unsafe fn test_mm512_maskz_fmadd_round_pch() {
19695        let a = _mm512_set1_pch(0.0, 1.0);
19696        let b = _mm512_set1_pch(0.0, 2.0);
19697        let c = _mm512_set1_pch(0.0, 3.0);
19698        let r = _mm512_maskz_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19699            0b0101010101010101,
19700            a,
19701            b,
19702            c,
19703        );
19704        let e = _mm512_setr_ph(
19705            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19706            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19707        );
19708        assert_eq_m512h(r, e);
19709    }
19710
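    // Scalar complex fused multiply-add: only the lowest (re, im) pair is computed
    // (1i * 2i + 3i = -2.0 + 3.0i); elements 2..7 are copied from a, or from c in the
    // mask3 form.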
19711    #[simd_test(enable = "avx512fp16")]
19712    unsafe fn test_mm_fmadd_sch() {
19713        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19714        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19715        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19716        let r = _mm_fmadd_sch(a, b, c);
19717        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19718        assert_eq_m128h(r, e);
19719    }
19720
19721    #[simd_test(enable = "avx512fp16")]
19722    unsafe fn test_mm_mask_fmadd_sch() {
19723        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19724        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19725        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19726        let r = _mm_mask_fmadd_sch(a, 0, b, c);
19727        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19728        assert_eq_m128h(r, e);
19729        let r = _mm_mask_fmadd_sch(a, 1, b, c);
19730        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19731        assert_eq_m128h(r, e);
19732    }
19733
19734    #[simd_test(enable = "avx512fp16")]
19735    unsafe fn test_mm_mask3_fmadd_sch() {
19736        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19737        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19738        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19739        let r = _mm_mask3_fmadd_sch(a, b, c, 0);
19740        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19741        assert_eq_m128h(r, e);
19742        let r = _mm_mask3_fmadd_sch(a, b, c, 1);
19743        let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19744        assert_eq_m128h(r, e);
19745    }
19746
19747    #[simd_test(enable = "avx512fp16")]
19748    unsafe fn test_mm_maskz_fmadd_sch() {
19749        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19750        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19751        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19752        let r = _mm_maskz_fmadd_sch(0, a, b, c);
19753        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19754        assert_eq_m128h(r, e);
19755        let r = _mm_maskz_fmadd_sch(1, a, b, c);
19756        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19757        assert_eq_m128h(r, e);
19758    }
19759
19760    #[simd_test(enable = "avx512fp16")]
19761    unsafe fn test_mm_fmadd_round_sch() {
19762        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19763        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19764        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19765        let r = _mm_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
19766        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19767        assert_eq_m128h(r, e);
19768    }
19769
19770    #[simd_test(enable = "avx512fp16")]
19771    unsafe fn test_mm_mask_fmadd_round_sch() {
19772        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19773        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19774        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19775        let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19776            a, 0, b, c,
19777        );
19778        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19779        assert_eq_m128h(r, e);
19780        let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19781            a, 1, b, c,
19782        );
19783        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19784        assert_eq_m128h(r, e);
19785    }
19786
19787    #[simd_test(enable = "avx512fp16")]
19788    unsafe fn test_mm_mask3_fmadd_round_sch() {
19789        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19790        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19791        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19792        let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19793            a, b, c, 0,
19794        );
19795        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19796        assert_eq_m128h(r, e);
19797        let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19798            a, b, c, 1,
19799        );
19800        let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19801        assert_eq_m128h(r, e);
19802    }
19803
19804    #[simd_test(enable = "avx512fp16")]
19805    unsafe fn test_mm_maskz_fmadd_round_sch() {
19806        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19807        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19808        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19809        let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19810            0, a, b, c,
19811        );
19812        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19813        assert_eq_m128h(r, e);
19814        let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19815            1, a, b, c,
19816        );
19817        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19818        assert_eq_m128h(r, e);
19819    }
19820
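    // Conjugate complex fused multiply-add: one multiplicand is conjugated before the
    // multiply, so with conj(0.0 + 2.0i) = 0.0 - 2.0i each lane yields
    // (1i)(-2i) + 3i = 2.0 + 3.0i. Mask fallbacks mirror the fmadd_pch tests above.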
19821    #[simd_test(enable = "avx512fp16,avx512vl")]
19822    unsafe fn test_mm_fcmadd_pch() {
19823        let a = _mm_set1_pch(0.0, 1.0);
19824        let b = _mm_set1_pch(0.0, 2.0);
19825        let c = _mm_set1_pch(0.0, 3.0);
19826        let r = _mm_fcmadd_pch(a, b, c);
19827        let e = _mm_set1_pch(2.0, 3.0);
19828        assert_eq_m128h(r, e);
19829    }
19830
19831    #[simd_test(enable = "avx512fp16,avx512vl")]
19832    unsafe fn test_mm_mask_fcmadd_pch() {
19833        let a = _mm_set1_pch(0.0, 1.0);
19834        let b = _mm_set1_pch(0.0, 2.0);
19835        let c = _mm_set1_pch(0.0, 3.0);
19836        let r = _mm_mask_fcmadd_pch(a, 0b0101, b, c);
19837        let e = _mm_setr_ph(2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0);
19838        assert_eq_m128h(r, e);
19839    }
19840
19841    #[simd_test(enable = "avx512fp16,avx512vl")]
19842    unsafe fn test_mm_mask3_fcmadd_pch() {
19843        let a = _mm_set1_pch(0.0, 1.0);
19844        let b = _mm_set1_pch(0.0, 2.0);
19845        let c = _mm_set1_pch(0.0, 3.0);
19846        let r = _mm_mask3_fcmadd_pch(a, b, c, 0b0101);
19847        let e = _mm_setr_ph(2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0);
19848        assert_eq_m128h(r, e);
19849    }
19850
19851    #[simd_test(enable = "avx512fp16,avx512vl")]
19852    unsafe fn test_mm_maskz_fcmadd_pch() {
19853        let a = _mm_set1_pch(0.0, 1.0);
19854        let b = _mm_set1_pch(0.0, 2.0);
19855        let c = _mm_set1_pch(0.0, 3.0);
19856        let r = _mm_maskz_fcmadd_pch(0b0101, a, b, c);
19857        let e = _mm_setr_ph(2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0);
19858        assert_eq_m128h(r, e);
19859    }
19860
19861    #[simd_test(enable = "avx512fp16,avx512vl")]
19862    unsafe fn test_mm256_fcmadd_pch() {
19863        let a = _mm256_set1_pch(0.0, 1.0);
19864        let b = _mm256_set1_pch(0.0, 2.0);
19865        let c = _mm256_set1_pch(0.0, 3.0);
19866        let r = _mm256_fcmadd_pch(a, b, c);
19867        let e = _mm256_set1_pch(2.0, 3.0);
19868        assert_eq_m256h(r, e);
19869    }
19870
19871    #[simd_test(enable = "avx512fp16,avx512vl")]
19872    unsafe fn test_mm256_mask_fcmadd_pch() {
19873        let a = _mm256_set1_pch(0.0, 1.0);
19874        let b = _mm256_set1_pch(0.0, 2.0);
19875        let c = _mm256_set1_pch(0.0, 3.0);
19876        let r = _mm256_mask_fcmadd_pch(a, 0b01010101, b, c);
19877        let e = _mm256_setr_ph(
19878            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
19879        );
19880        assert_eq_m256h(r, e);
19881    }
19882
19883    #[simd_test(enable = "avx512fp16,avx512vl")]
19884    unsafe fn test_mm256_mask3_fcmadd_pch() {
19885        let a = _mm256_set1_pch(0.0, 1.0);
19886        let b = _mm256_set1_pch(0.0, 2.0);
19887        let c = _mm256_set1_pch(0.0, 3.0);
19888        let r = _mm256_mask3_fcmadd_pch(a, b, c, 0b01010101);
19889        let e = _mm256_setr_ph(
19890            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
19891        );
19892        assert_eq_m256h(r, e);
19893    }
19894
19895    #[simd_test(enable = "avx512fp16,avx512vl")]
19896    unsafe fn test_mm256_maskz_fcmadd_pch() {
19897        let a = _mm256_set1_pch(0.0, 1.0);
19898        let b = _mm256_set1_pch(0.0, 2.0);
19899        let c = _mm256_set1_pch(0.0, 3.0);
19900        let r = _mm256_maskz_fcmadd_pch(0b01010101, a, b, c);
19901        let e = _mm256_setr_ph(
19902            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
19903        );
19904        assert_eq_m256h(r, e);
19905    }
19906
19907    #[simd_test(enable = "avx512fp16")]
19908    unsafe fn test_mm512_fcmadd_pch() {
19909        let a = _mm512_set1_pch(0.0, 1.0);
19910        let b = _mm512_set1_pch(0.0, 2.0);
19911        let c = _mm512_set1_pch(0.0, 3.0);
19912        let r = _mm512_fcmadd_pch(a, b, c);
19913        let e = _mm512_set1_pch(2.0, 3.0);
19914        assert_eq_m512h(r, e);
19915    }
19916
19917    #[simd_test(enable = "avx512fp16")]
19918    unsafe fn test_mm512_mask_fcmadd_pch() {
19919        let a = _mm512_set1_pch(0.0, 1.0);
19920        let b = _mm512_set1_pch(0.0, 2.0);
19921        let c = _mm512_set1_pch(0.0, 3.0);
19922        let r = _mm512_mask_fcmadd_pch(a, 0b0101010101010101, b, c);
19923        let e = _mm512_setr_ph(
19924            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0,
19925            3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
19926        );
19927        assert_eq_m512h(r, e);
19928    }
19929
19930    #[simd_test(enable = "avx512fp16")]
19931    unsafe fn test_mm512_mask3_fcmadd_pch() {
19932        let a = _mm512_set1_pch(0.0, 1.0);
19933        let b = _mm512_set1_pch(0.0, 2.0);
19934        let c = _mm512_set1_pch(0.0, 3.0);
19935        let r = _mm512_mask3_fcmadd_pch(a, b, c, 0b0101010101010101);
19936        let e = _mm512_setr_ph(
19937            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0,
19938            3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
19939        );
19940        assert_eq_m512h(r, e);
19941    }
19942
19943    #[simd_test(enable = "avx512fp16")]
19944    unsafe fn test_mm512_maskz_fcmadd_pch() {
19945        let a = _mm512_set1_pch(0.0, 1.0);
19946        let b = _mm512_set1_pch(0.0, 2.0);
19947        let c = _mm512_set1_pch(0.0, 3.0);
19948        let r = _mm512_maskz_fcmadd_pch(0b0101010101010101, a, b, c);
19949        let e = _mm512_setr_ph(
19950            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0,
19951            3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
19952        );
19953        assert_eq_m512h(r, e);
19954    }
19955
19956    #[simd_test(enable = "avx512fp16")]
19957    unsafe fn test_mm512_fcmadd_round_pch() {
19958        let a = _mm512_set1_pch(0.0, 1.0);
19959        let b = _mm512_set1_pch(0.0, 2.0);
19960        let c = _mm512_set1_pch(0.0, 3.0);
19961        let r =
19962            _mm512_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
19963        let e = _mm512_set1_pch(2.0, 3.0);
19964        assert_eq_m512h(r, e);
19965    }
19966
19967    #[simd_test(enable = "avx512fp16")]
19968    unsafe fn test_mm512_mask_fcmadd_round_pch() {
19969        let a = _mm512_set1_pch(0.0, 1.0);
19970        let b = _mm512_set1_pch(0.0, 2.0);
19971        let c = _mm512_set1_pch(0.0, 3.0);
19972        let r = _mm512_mask_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19973            a,
19974            0b0101010101010101,
19975            b,
19976            c,
19977        );
19978        let e = _mm512_setr_ph(
19979            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0,
19980            3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
19981        );
19982        assert_eq_m512h(r, e);
19983    }
19984
19985    #[simd_test(enable = "avx512fp16")]
19986    unsafe fn test_mm512_mask3_fcmadd_round_pch() {
19987        let a = _mm512_set1_pch(0.0, 1.0);
19988        let b = _mm512_set1_pch(0.0, 2.0);
19989        let c = _mm512_set1_pch(0.0, 3.0);
19990        let r = _mm512_mask3_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19991            a,
19992            b,
19993            c,
19994            0b0101010101010101,
19995        );
19996        let e = _mm512_setr_ph(
19997            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0,
19998            3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
19999        );
20000        assert_eq_m512h(r, e);
20001    }
20002
20003    #[simd_test(enable = "avx512fp16")]
20004    unsafe fn test_mm512_maskz_fcmadd_round_pch() {
20005        let a = _mm512_set1_pch(0.0, 1.0);
20006        let b = _mm512_set1_pch(0.0, 2.0);
20007        let c = _mm512_set1_pch(0.0, 3.0);
20008        let r = _mm512_maskz_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20009            0b0101010101010101,
20010            a,
20011            b,
20012            c,
20013        );
20014        let e = _mm512_setr_ph(
20015            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0,
20016            3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
20017        );
20018        assert_eq_m512h(r, e);
20019    }
20020
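    // Scalar variant of the conjugate complex FMA: the lowest pair becomes 2.0 + 3.0i, and
    // the remaining elements are passed through from a (or from c in the mask3 form).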
20021    #[simd_test(enable = "avx512fp16")]
20022    unsafe fn test_mm_fcmadd_sch() {
20023        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20024        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20025        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20026        let r = _mm_fcmadd_sch(a, b, c);
20027        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20028        assert_eq_m128h(r, e);
20029    }
20030
20031    #[simd_test(enable = "avx512fp16")]
20032    unsafe fn test_mm_mask_fcmadd_sch() {
20033        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20034        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20035        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20036        let r = _mm_mask_fcmadd_sch(a, 0, b, c);
20037        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20038        assert_eq_m128h(r, e);
20039        let r = _mm_mask_fcmadd_sch(a, 1, b, c);
20040        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20041        assert_eq_m128h(r, e);
20042    }
20043
20044    #[simd_test(enable = "avx512fp16")]
20045    unsafe fn test_mm_mask3_fcmadd_sch() {
20046        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20047        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20048        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20049        let r = _mm_mask3_fcmadd_sch(a, b, c, 0);
20050        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20051        assert_eq_m128h(r, e);
20052        let r = _mm_mask3_fcmadd_sch(a, b, c, 1);
20053        let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20054        assert_eq_m128h(r, e);
20055    }
20056
20057    #[simd_test(enable = "avx512fp16")]
20058    unsafe fn test_mm_maskz_fcmadd_sch() {
20059        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20060        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20061        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20062        let r = _mm_maskz_fcmadd_sch(0, a, b, c);
20063        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20064        assert_eq_m128h(r, e);
20065        let r = _mm_maskz_fcmadd_sch(1, a, b, c);
20066        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20067        assert_eq_m128h(r, e);
20068    }
20069
20070    #[simd_test(enable = "avx512fp16")]
20071    unsafe fn test_mm_fcmadd_round_sch() {
20072        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20073        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20074        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20075        let r = _mm_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20076        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20077        assert_eq_m128h(r, e);
20078    }
20079
20080    #[simd_test(enable = "avx512fp16")]
20081    unsafe fn test_mm_mask_fcmadd_round_sch() {
20082        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20083        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20084        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20085        let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20086            a, 0, b, c,
20087        );
20088        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20089        assert_eq_m128h(r, e);
20090        let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20091            a, 1, b, c,
20092        );
20093        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20094        assert_eq_m128h(r, e);
20095    }
20096
20097    #[simd_test(enable = "avx512fp16")]
20098    unsafe fn test_mm_mask3_fcmadd_round_sch() {
20099        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20100        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20101        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20102        let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20103            a, b, c, 0,
20104        );
20105        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20106        assert_eq_m128h(r, e);
20107        let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20108            a, b, c, 1,
20109        );
20110        let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20111        assert_eq_m128h(r, e);
20112    }
20113
20114    #[simd_test(enable = "avx512fp16")]
20115    unsafe fn test_mm_maskz_fcmadd_round_sch() {
20116        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20117        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20118        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20119        let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20120            0, a, b, c,
20121        );
20122        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20123        assert_eq_m128h(r, e);
20124        let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20125            1, a, b, c,
20126        );
20127        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20128        assert_eq_m128h(r, e);
20129    }
20130
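    // Real (per-element) fused multiply-add: 1.0 * 2.0 + 3.0 = 5.0. Unselected elements keep
    // a (mask), keep c (mask3) or are zeroed (maskz).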
20131    #[simd_test(enable = "avx512fp16,avx512vl")]
20132    unsafe fn test_mm_fmadd_ph() {
20133        let a = _mm_set1_ph(1.0);
20134        let b = _mm_set1_ph(2.0);
20135        let c = _mm_set1_ph(3.0);
20136        let r = _mm_fmadd_ph(a, b, c);
20137        let e = _mm_set1_ph(5.0);
20138        assert_eq_m128h(r, e);
20139    }
20140
20141    #[simd_test(enable = "avx512fp16,avx512vl")]
20142    unsafe fn test_mm_mask_fmadd_ph() {
20143        let a = _mm_set1_ph(1.0);
20144        let b = _mm_set1_ph(2.0);
20145        let c = _mm_set1_ph(3.0);
20146        let r = _mm_mask_fmadd_ph(a, 0b01010101, b, c);
20147        let e = _mm_set_ph(1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0);
20148        assert_eq_m128h(r, e);
20149    }
20150
20151    #[simd_test(enable = "avx512fp16,avx512vl")]
20152    unsafe fn test_mm_mask3_fmadd_ph() {
20153        let a = _mm_set1_ph(1.0);
20154        let b = _mm_set1_ph(2.0);
20155        let c = _mm_set1_ph(3.0);
20156        let r = _mm_mask3_fmadd_ph(a, b, c, 0b01010101);
20157        let e = _mm_set_ph(3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0);
20158        assert_eq_m128h(r, e);
20159    }
20160
20161    #[simd_test(enable = "avx512fp16,avx512vl")]
20162    unsafe fn test_mm_maskz_fmadd_ph() {
20163        let a = _mm_set1_ph(1.0);
20164        let b = _mm_set1_ph(2.0);
20165        let c = _mm_set1_ph(3.0);
20166        let r = _mm_maskz_fmadd_ph(0b01010101, a, b, c);
20167        let e = _mm_set_ph(0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0);
20168        assert_eq_m128h(r, e);
20169    }
20170
20171    #[simd_test(enable = "avx512fp16,avx512vl")]
20172    unsafe fn test_mm256_fmadd_ph() {
20173        let a = _mm256_set1_ph(1.0);
20174        let b = _mm256_set1_ph(2.0);
20175        let c = _mm256_set1_ph(3.0);
20176        let r = _mm256_fmadd_ph(a, b, c);
20177        let e = _mm256_set1_ph(5.0);
20178        assert_eq_m256h(r, e);
20179    }
20180
20181    #[simd_test(enable = "avx512fp16,avx512vl")]
20182    unsafe fn test_mm256_mask_fmadd_ph() {
20183        let a = _mm256_set1_ph(1.0);
20184        let b = _mm256_set1_ph(2.0);
20185        let c = _mm256_set1_ph(3.0);
20186        let r = _mm256_mask_fmadd_ph(a, 0b0101010101010101, b, c);
20187        let e = _mm256_set_ph(
20188            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
20189        );
20190        assert_eq_m256h(r, e);
20191    }
20192
20193    #[simd_test(enable = "avx512fp16,avx512vl")]
20194    unsafe fn test_mm256_mask3_fmadd_ph() {
20195        let a = _mm256_set1_ph(1.0);
20196        let b = _mm256_set1_ph(2.0);
20197        let c = _mm256_set1_ph(3.0);
20198        let r = _mm256_mask3_fmadd_ph(a, b, c, 0b0101010101010101);
20199        let e = _mm256_set_ph(
20200            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
20201        );
20202        assert_eq_m256h(r, e);
20203    }
20204
20205    #[simd_test(enable = "avx512fp16,avx512vl")]
20206    unsafe fn test_mm256_maskz_fmadd_ph() {
20207        let a = _mm256_set1_ph(1.0);
20208        let b = _mm256_set1_ph(2.0);
20209        let c = _mm256_set1_ph(3.0);
20210        let r = _mm256_maskz_fmadd_ph(0b0101010101010101, a, b, c);
20211        let e = _mm256_set_ph(
20212            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
20213        );
20214        assert_eq_m256h(r, e);
20215    }
20216
20217    #[simd_test(enable = "avx512fp16")]
20218    unsafe fn test_mm512_fmadd_ph() {
20219        let a = _mm512_set1_ph(1.0);
20220        let b = _mm512_set1_ph(2.0);
20221        let c = _mm512_set1_ph(3.0);
20222        let r = _mm512_fmadd_ph(a, b, c);
20223        let e = _mm512_set1_ph(5.0);
20224        assert_eq_m512h(r, e);
20225    }
20226
20227    #[simd_test(enable = "avx512fp16")]
20228    unsafe fn test_mm512_mask_fmadd_ph() {
20229        let a = _mm512_set1_ph(1.0);
20230        let b = _mm512_set1_ph(2.0);
20231        let c = _mm512_set1_ph(3.0);
20232        let r = _mm512_mask_fmadd_ph(a, 0b01010101010101010101010101010101, b, c);
20233        let e = _mm512_set_ph(
20234            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0,
20235            5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
20236        );
20237        assert_eq_m512h(r, e);
20238    }
20239
20240    #[simd_test(enable = "avx512fp16")]
20241    unsafe fn test_mm512_mask3_fmadd_ph() {
20242        let a = _mm512_set1_ph(1.0);
20243        let b = _mm512_set1_ph(2.0);
20244        let c = _mm512_set1_ph(3.0);
20245        let r = _mm512_mask3_fmadd_ph(a, b, c, 0b01010101010101010101010101010101);
20246        let e = _mm512_set_ph(
20247            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0,
20248            5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
20249        );
20250        assert_eq_m512h(r, e);
20251    }
20252
20253    #[simd_test(enable = "avx512fp16")]
20254    unsafe fn test_mm512_maskz_fmadd_ph() {
20255        let a = _mm512_set1_ph(1.0);
20256        let b = _mm512_set1_ph(2.0);
20257        let c = _mm512_set1_ph(3.0);
20258        let r = _mm512_maskz_fmadd_ph(0b01010101010101010101010101010101, a, b, c);
20259        let e = _mm512_set_ph(
20260            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0,
20261            5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
20262        );
20263        assert_eq_m512h(r, e);
20264    }
20265
20266    #[simd_test(enable = "avx512fp16")]
20267    unsafe fn test_mm512_fmadd_round_ph() {
20268        let a = _mm512_set1_ph(1.0);
20269        let b = _mm512_set1_ph(2.0);
20270        let c = _mm512_set1_ph(3.0);
20271        let r = _mm512_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20272        let e = _mm512_set1_ph(5.0);
20273        assert_eq_m512h(r, e);
20274    }
20275
20276    #[simd_test(enable = "avx512fp16")]
20277    unsafe fn test_mm512_mask_fmadd_round_ph() {
20278        let a = _mm512_set1_ph(1.0);
20279        let b = _mm512_set1_ph(2.0);
20280        let c = _mm512_set1_ph(3.0);
20281        let r = _mm512_mask_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20282            a,
20283            0b01010101010101010101010101010101,
20284            b,
20285            c,
20286        );
20287        let e = _mm512_set_ph(
20288            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0,
20289            5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
20290        );
20291        assert_eq_m512h(r, e);
20292    }
20293
20294    #[simd_test(enable = "avx512fp16")]
20295    unsafe fn test_mm512_mask3_fmadd_round_ph() {
20296        let a = _mm512_set1_ph(1.0);
20297        let b = _mm512_set1_ph(2.0);
20298        let c = _mm512_set1_ph(3.0);
20299        let r = _mm512_mask3_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20300            a,
20301            b,
20302            c,
20303            0b01010101010101010101010101010101,
20304        );
20305        let e = _mm512_set_ph(
20306            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0,
20307            5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
20308        );
20309        assert_eq_m512h(r, e);
20310    }
20311
20312    #[simd_test(enable = "avx512fp16")]
20313    unsafe fn test_mm512_maskz_fmadd_round_ph() {
20314        let a = _mm512_set1_ph(1.0);
20315        let b = _mm512_set1_ph(2.0);
20316        let c = _mm512_set1_ph(3.0);
20317        let r = _mm512_maskz_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20318            0b01010101010101010101010101010101,
20319            a,
20320            b,
20321            c,
20322        );
20323        let e = _mm512_set_ph(
20324            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0,
20325            5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
20326        );
20327        assert_eq_m512h(r, e);
20328    }
20329
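    // Scalar real fused multiply-add: element 0 becomes 1.0 * 2.0 + 3.0 = 5.0, while the
    // upper elements are copied from a (or from c in the mask3 form).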
20330    #[simd_test(enable = "avx512fp16")]
20331    unsafe fn test_mm_fmadd_sh() {
20332        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20333        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20334        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20335        let r = _mm_fmadd_sh(a, b, c);
20336        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20337        assert_eq_m128h(r, e);
20338    }
20339
20340    #[simd_test(enable = "avx512fp16")]
20341    unsafe fn test_mm_mask_fmadd_sh() {
20342        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20343        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20344        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20345        let r = _mm_mask_fmadd_sh(a, 0, b, c);
20346        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20347        assert_eq_m128h(r, e);
20348        let r = _mm_mask_fmadd_sh(a, 1, b, c);
20349        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20350        assert_eq_m128h(r, e);
20351    }
20352
20353    #[simd_test(enable = "avx512fp16")]
20354    unsafe fn test_mm_mask3_fmadd_sh() {
20355        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20356        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20357        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20358        let r = _mm_mask3_fmadd_sh(a, b, c, 0);
20359        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20360        assert_eq_m128h(r, e);
20361        let r = _mm_mask3_fmadd_sh(a, b, c, 1);
20362        let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.);
20363        assert_eq_m128h(r, e);
20364    }
20365
20366    #[simd_test(enable = "avx512fp16")]
20367    unsafe fn test_mm_maskz_fmadd_sh() {
20368        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20369        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20370        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20371        let r = _mm_maskz_fmadd_sh(0, a, b, c);
20372        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20373        assert_eq_m128h(r, e);
20374        let r = _mm_maskz_fmadd_sh(1, a, b, c);
20375        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20376        assert_eq_m128h(r, e);
20377    }
20378
20379    #[simd_test(enable = "avx512fp16")]
20380    unsafe fn test_mm_fmadd_round_sh() {
20381        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20382        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20383        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20384        let r = _mm_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20385        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20386        assert_eq_m128h(r, e);
20387    }
20388
20389    #[simd_test(enable = "avx512fp16")]
20390    unsafe fn test_mm_mask_fmadd_round_sh() {
20391        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20392        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20393        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20394        let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20395            a, 0, b, c,
20396        );
20397        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20398        assert_eq_m128h(r, e);
20399        let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20400            a, 1, b, c,
20401        );
20402        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20403        assert_eq_m128h(r, e);
20404    }
20405
20406    #[simd_test(enable = "avx512fp16")]
20407    unsafe fn test_mm_mask3_fmadd_round_sh() {
20408        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20409        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20410        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20411        let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20412            a, b, c, 0,
20413        );
20414        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20415        assert_eq_m128h(r, e);
20416        let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20417            a, b, c, 1,
20418        );
20419        let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.);
20420        assert_eq_m128h(r, e);
20421    }
20422
20423    #[simd_test(enable = "avx512fp16")]
20424    unsafe fn test_mm_maskz_fmadd_round_sh() {
20425        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20426        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20427        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20428        let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20429            0, a, b, c,
20430        );
20431        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20432        assert_eq_m128h(r, e);
20433        let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20434            1, a, b, c,
20435        );
20436        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20437        assert_eq_m128h(r, e);
20438    }
20439
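    // Fused multiply-subtract: 1.0 * 2.0 - 3.0 = -1.0 per element; the masked behaviour is
    // the same as for the fmadd_ph tests above.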
20440    #[simd_test(enable = "avx512fp16,avx512vl")]
20441    unsafe fn test_mm_fmsub_ph() {
20442        let a = _mm_set1_ph(1.0);
20443        let b = _mm_set1_ph(2.0);
20444        let c = _mm_set1_ph(3.0);
20445        let r = _mm_fmsub_ph(a, b, c);
20446        let e = _mm_set1_ph(-1.0);
20447        assert_eq_m128h(r, e);
20448    }
20449
20450    #[simd_test(enable = "avx512fp16,avx512vl")]
20451    unsafe fn test_mm_mask_fmsub_ph() {
20452        let a = _mm_set1_ph(1.0);
20453        let b = _mm_set1_ph(2.0);
20454        let c = _mm_set1_ph(3.0);
20455        let r = _mm_mask_fmsub_ph(a, 0b01010101, b, c);
20456        let e = _mm_set_ph(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0);
20457        assert_eq_m128h(r, e);
20458    }
20459
20460    #[simd_test(enable = "avx512fp16,avx512vl")]
20461    unsafe fn test_mm_mask3_fmsub_ph() {
20462        let a = _mm_set1_ph(1.0);
20463        let b = _mm_set1_ph(2.0);
20464        let c = _mm_set1_ph(3.0);
20465        let r = _mm_mask3_fmsub_ph(a, b, c, 0b01010101);
20466        let e = _mm_set_ph(3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0);
20467        assert_eq_m128h(r, e);
20468    }
20469
20470    #[simd_test(enable = "avx512fp16,avx512vl")]
20471    unsafe fn test_mm_maskz_fmsub_ph() {
20472        let a = _mm_set1_ph(1.0);
20473        let b = _mm_set1_ph(2.0);
20474        let c = _mm_set1_ph(3.0);
20475        let r = _mm_maskz_fmsub_ph(0b01010101, a, b, c);
20476        let e = _mm_set_ph(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0);
20477        assert_eq_m128h(r, e);
20478    }
20479
20480    #[simd_test(enable = "avx512fp16,avx512vl")]
20481    unsafe fn test_mm256_fmsub_ph() {
20482        let a = _mm256_set1_ph(1.0);
20483        let b = _mm256_set1_ph(2.0);
20484        let c = _mm256_set1_ph(3.0);
20485        let r = _mm256_fmsub_ph(a, b, c);
20486        let e = _mm256_set1_ph(-1.0);
20487        assert_eq_m256h(r, e);
20488    }
20489
20490    #[simd_test(enable = "avx512fp16,avx512vl")]
20491    unsafe fn test_mm256_mask_fmsub_ph() {
20492        let a = _mm256_set1_ph(1.0);
20493        let b = _mm256_set1_ph(2.0);
20494        let c = _mm256_set1_ph(3.0);
20495        let r = _mm256_mask_fmsub_ph(a, 0b0101010101010101, b, c);
20496        let e = _mm256_set_ph(
20497            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20498        );
20499        assert_eq_m256h(r, e);
20500    }
20501
20502    #[simd_test(enable = "avx512fp16,avx512vl")]
20503    unsafe fn test_mm256_mask3_fmsub_ph() {
20504        let a = _mm256_set1_ph(1.0);
20505        let b = _mm256_set1_ph(2.0);
20506        let c = _mm256_set1_ph(3.0);
20507        let r = _mm256_mask3_fmsub_ph(a, b, c, 0b0101010101010101);
20508        let e = _mm256_set_ph(
20509            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20510        );
20511        assert_eq_m256h(r, e);
20512    }
20513
20514    #[simd_test(enable = "avx512fp16,avx512vl")]
20515    unsafe fn test_mm256_maskz_fmsub_ph() {
20516        let a = _mm256_set1_ph(1.0);
20517        let b = _mm256_set1_ph(2.0);
20518        let c = _mm256_set1_ph(3.0);
20519        let r = _mm256_maskz_fmsub_ph(0b0101010101010101, a, b, c);
20520        let e = _mm256_set_ph(
20521            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20522        );
20523        assert_eq_m256h(r, e);
20524    }
20525
20526    #[simd_test(enable = "avx512fp16")]
20527    unsafe fn test_mm512_fmsub_ph() {
20528        let a = _mm512_set1_ph(1.0);
20529        let b = _mm512_set1_ph(2.0);
20530        let c = _mm512_set1_ph(3.0);
20531        let r = _mm512_fmsub_ph(a, b, c);
20532        let e = _mm512_set1_ph(-1.0);
20533        assert_eq_m512h(r, e);
20534    }
20535
20536    #[simd_test(enable = "avx512fp16")]
20537    unsafe fn test_mm512_mask_fmsub_ph() {
20538        let a = _mm512_set1_ph(1.0);
20539        let b = _mm512_set1_ph(2.0);
20540        let c = _mm512_set1_ph(3.0);
20541        let r = _mm512_mask_fmsub_ph(a, 0b01010101010101010101010101010101, b, c);
20542        let e = _mm512_set_ph(
20543            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20544            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20545        );
20546        assert_eq_m512h(r, e);
20547    }
20548
20549    #[simd_test(enable = "avx512fp16")]
20550    unsafe fn test_mm512_mask3_fmsub_ph() {
20551        let a = _mm512_set1_ph(1.0);
20552        let b = _mm512_set1_ph(2.0);
20553        let c = _mm512_set1_ph(3.0);
20554        let r = _mm512_mask3_fmsub_ph(a, b, c, 0b01010101010101010101010101010101);
20555        let e = _mm512_set_ph(
20556            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20557            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20558        );
20559        assert_eq_m512h(r, e);
20560    }
20561
20562    #[simd_test(enable = "avx512fp16")]
20563    unsafe fn test_mm512_maskz_fmsub_ph() {
20564        let a = _mm512_set1_ph(1.0);
20565        let b = _mm512_set1_ph(2.0);
20566        let c = _mm512_set1_ph(3.0);
20567        let r = _mm512_maskz_fmsub_ph(0b01010101010101010101010101010101, a, b, c);
20568        let e = _mm512_set_ph(
20569            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20570            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20571        );
20572        assert_eq_m512h(r, e);
20573    }
20574
20575    #[simd_test(enable = "avx512fp16")]
20576    unsafe fn test_mm512_fmsub_round_ph() {
20577        let a = _mm512_set1_ph(1.0);
20578        let b = _mm512_set1_ph(2.0);
20579        let c = _mm512_set1_ph(3.0);
20580        let r = _mm512_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20581        let e = _mm512_set1_ph(-1.0);
20582        assert_eq_m512h(r, e);
20583    }
20584
20585    #[simd_test(enable = "avx512fp16")]
20586    unsafe fn test_mm512_mask_fmsub_round_ph() {
20587        let a = _mm512_set1_ph(1.0);
20588        let b = _mm512_set1_ph(2.0);
20589        let c = _mm512_set1_ph(3.0);
20590        let r = _mm512_mask_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20591            a,
20592            0b01010101010101010101010101010101,
20593            b,
20594            c,
20595        );
20596        let e = _mm512_set_ph(
20597            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20598            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20599        );
20600        assert_eq_m512h(r, e);
20601    }
20602
20603    #[simd_test(enable = "avx512fp16")]
20604    unsafe fn test_mm512_mask3_fmsub_round_ph() {
20605        let a = _mm512_set1_ph(1.0);
20606        let b = _mm512_set1_ph(2.0);
20607        let c = _mm512_set1_ph(3.0);
20608        let r = _mm512_mask3_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20609            a,
20610            b,
20611            c,
20612            0b01010101010101010101010101010101,
20613        );
20614        let e = _mm512_set_ph(
20615            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20616            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20617        );
20618        assert_eq_m512h(r, e);
20619    }
20620
20621    #[simd_test(enable = "avx512fp16")]
20622    unsafe fn test_mm512_maskz_fmsub_round_ph() {
20623        let a = _mm512_set1_ph(1.0);
20624        let b = _mm512_set1_ph(2.0);
20625        let c = _mm512_set1_ph(3.0);
20626        let r = _mm512_maskz_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20627            0b01010101010101010101010101010101,
20628            a,
20629            b,
20630            c,
20631        );
20632        let e = _mm512_set_ph(
20633            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20634            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20635        );
20636        assert_eq_m512h(r, e);
20637    }
20638
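    // Scalar fused multiply-subtract: element 0 becomes -1.0; upper elements pass through
    // from a (or from c in the mask3 form).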
20639    #[simd_test(enable = "avx512fp16")]
20640    unsafe fn test_mm_fmsub_sh() {
20641        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20642        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20643        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20644        let r = _mm_fmsub_sh(a, b, c);
20645        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20646        assert_eq_m128h(r, e);
20647    }
20648
20649    #[simd_test(enable = "avx512fp16")]
20650    unsafe fn test_mm_mask_fmsub_sh() {
20651        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20652        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20653        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20654        let r = _mm_mask_fmsub_sh(a, 0, b, c);
20655        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20656        assert_eq_m128h(r, e);
20657        let r = _mm_mask_fmsub_sh(a, 1, b, c);
20658        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20659        assert_eq_m128h(r, e);
20660    }
20661
20662    #[simd_test(enable = "avx512fp16")]
20663    unsafe fn test_mm_mask3_fmsub_sh() {
20664        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20665        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20666        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20667        let r = _mm_mask3_fmsub_sh(a, b, c, 0);
20668        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20669        assert_eq_m128h(r, e);
20670        let r = _mm_mask3_fmsub_sh(a, b, c, 1);
20671        let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.);
20672        assert_eq_m128h(r, e);
20673    }
20674
20675    #[simd_test(enable = "avx512fp16")]
20676    unsafe fn test_mm_maskz_fmsub_sh() {
20677        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20678        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20679        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20680        let r = _mm_maskz_fmsub_sh(0, a, b, c);
20681        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20682        assert_eq_m128h(r, e);
20683        let r = _mm_maskz_fmsub_sh(1, a, b, c);
20684        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20685        assert_eq_m128h(r, e);
20686    }
20687
20688    #[simd_test(enable = "avx512fp16")]
20689    unsafe fn test_mm_fmsub_round_sh() {
20690        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20691        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20692        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20693        let r = _mm_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20694        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20695        assert_eq_m128h(r, e);
20696    }
20697
20698    #[simd_test(enable = "avx512fp16")]
20699    unsafe fn test_mm_mask_fmsub_round_sh() {
20700        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20701        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20702        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20703        let r = _mm_mask_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20704            a, 0, b, c,
20705        );
20706        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20707        assert_eq_m128h(r, e);
20708        let r = _mm_mask_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20709            a, 1, b, c,
20710        );
20711        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20712        assert_eq_m128h(r, e);
20713    }
20714
20715    #[simd_test(enable = "avx512fp16")]
20716    unsafe fn test_mm_mask3_fmsub_round_sh() {
20717        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20718        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20719        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20720        let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20721            a, b, c, 0,
20722        );
20723        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20724        assert_eq_m128h(r, e);
20725        let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20726            a, b, c, 1,
20727        );
20728        let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.);
20729        assert_eq_m128h(r, e);
20730    }
20731
20732    #[simd_test(enable = "avx512fp16")]
20733    unsafe fn test_mm_maskz_fmsub_round_sh() {
20734        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20735        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20736        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20737        let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20738            0, a, b, c,
20739        );
20740        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20741        assert_eq_m128h(r, e);
20742        let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20743            1, a, b, c,
20744        );
20745        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20746        assert_eq_m128h(r, e);
20747    }
20748
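    // Fused negated multiply-add: -(1.0 * 2.0) + 3.0 = 1.0 per element, so computed and
    // passed-through lanes happen to coincide in the mask form below.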
20749    #[simd_test(enable = "avx512fp16,avx512vl")]
20750    unsafe fn test_mm_fnmadd_ph() {
20751        let a = _mm_set1_ph(1.0);
20752        let b = _mm_set1_ph(2.0);
20753        let c = _mm_set1_ph(3.0);
20754        let r = _mm_fnmadd_ph(a, b, c);
20755        let e = _mm_set1_ph(1.0);
20756        assert_eq_m128h(r, e);
20757    }
20758
20759    #[simd_test(enable = "avx512fp16,avx512vl")]
20760    unsafe fn test_mm_mask_fnmadd_ph() {
20761        let a = _mm_set1_ph(1.0);
20762        let b = _mm_set1_ph(2.0);
20763        let c = _mm_set1_ph(3.0);
20764        let r = _mm_mask_fnmadd_ph(a, 0b01010101, b, c);
20765        let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
20766        assert_eq_m128h(r, e);
20767    }
20768
20769    #[simd_test(enable = "avx512fp16,avx512vl")]
20770    unsafe fn test_mm_mask3_fnmadd_ph() {
20771        let a = _mm_set1_ph(1.0);
20772        let b = _mm_set1_ph(2.0);
20773        let c = _mm_set1_ph(3.0);
20774        let r = _mm_mask3_fnmadd_ph(a, b, c, 0b01010101);
20775        let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0);
20776        assert_eq_m128h(r, e);
20777    }
20778
20779    #[simd_test(enable = "avx512fp16,avx512vl")]
20780    unsafe fn test_mm_maskz_fnmadd_ph() {
20781        let a = _mm_set1_ph(1.0);
20782        let b = _mm_set1_ph(2.0);
20783        let c = _mm_set1_ph(3.0);
20784        let r = _mm_maskz_fnmadd_ph(0b01010101, a, b, c);
20785        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
20786        assert_eq_m128h(r, e);
20787    }
20788
20789    #[simd_test(enable = "avx512fp16,avx512vl")]
20790    unsafe fn test_mm256_fnmadd_ph() {
20791        let a = _mm256_set1_ph(1.0);
20792        let b = _mm256_set1_ph(2.0);
20793        let c = _mm256_set1_ph(3.0);
20794        let r = _mm256_fnmadd_ph(a, b, c);
20795        let e = _mm256_set1_ph(1.0);
20796        assert_eq_m256h(r, e);
20797    }
20798
20799    #[simd_test(enable = "avx512fp16,avx512vl")]
20800    unsafe fn test_mm256_mask_fnmadd_ph() {
20801        let a = _mm256_set1_ph(1.0);
20802        let b = _mm256_set1_ph(2.0);
20803        let c = _mm256_set1_ph(3.0);
20804        let r = _mm256_mask_fnmadd_ph(a, 0b0101010101010101, b, c);
20805        let e = _mm256_set_ph(
20806            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20807        );
20808        assert_eq_m256h(r, e);
20809    }
20810
20811    #[simd_test(enable = "avx512fp16,avx512vl")]
20812    unsafe fn test_mm256_mask3_fnmadd_ph() {
20813        let a = _mm256_set1_ph(1.0);
20814        let b = _mm256_set1_ph(2.0);
20815        let c = _mm256_set1_ph(3.0);
20816        let r = _mm256_mask3_fnmadd_ph(a, b, c, 0b0101010101010101);
20817        let e = _mm256_set_ph(
20818            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
20819        );
20820        assert_eq_m256h(r, e);
20821    }
20822
20823    #[simd_test(enable = "avx512fp16,avx512vl")]
20824    unsafe fn test_mm256_maskz_fnmadd_ph() {
20825        let a = _mm256_set1_ph(1.0);
20826        let b = _mm256_set1_ph(2.0);
20827        let c = _mm256_set1_ph(3.0);
20828        let r = _mm256_maskz_fnmadd_ph(0b0101010101010101, a, b, c);
20829        let e = _mm256_set_ph(
20830            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
20831        );
20832        assert_eq_m256h(r, e);
20833    }
20834
20835    #[simd_test(enable = "avx512fp16")]
20836    unsafe fn test_mm512_fnmadd_ph() {
20837        let a = _mm512_set1_ph(1.0);
20838        let b = _mm512_set1_ph(2.0);
20839        let c = _mm512_set1_ph(3.0);
20840        let r = _mm512_fnmadd_ph(a, b, c);
20841        let e = _mm512_set1_ph(1.0);
20842        assert_eq_m512h(r, e);
20843    }
20844
20845    #[simd_test(enable = "avx512fp16")]
20846    unsafe fn test_mm512_mask_fnmadd_ph() {
20847        let a = _mm512_set1_ph(1.0);
20848        let b = _mm512_set1_ph(2.0);
20849        let c = _mm512_set1_ph(3.0);
20850        let r = _mm512_mask_fnmadd_ph(a, 0b01010101010101010101010101010101, b, c);
20851        let e = _mm512_set_ph(
20852            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20853            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20854        );
20855        assert_eq_m512h(r, e);
20856    }
20857
20858    #[simd_test(enable = "avx512fp16")]
20859    unsafe fn test_mm512_mask3_fnmadd_ph() {
20860        let a = _mm512_set1_ph(1.0);
20861        let b = _mm512_set1_ph(2.0);
20862        let c = _mm512_set1_ph(3.0);
20863        let r = _mm512_mask3_fnmadd_ph(a, b, c, 0b01010101010101010101010101010101);
20864        let e = _mm512_set_ph(
20865            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
20866            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
20867        );
20868        assert_eq_m512h(r, e);
20869    }
20870
20871    #[simd_test(enable = "avx512fp16")]
20872    unsafe fn test_mm512_maskz_fnmadd_ph() {
20873        let a = _mm512_set1_ph(1.0);
20874        let b = _mm512_set1_ph(2.0);
20875        let c = _mm512_set1_ph(3.0);
20876        let r = _mm512_maskz_fnmadd_ph(0b01010101010101010101010101010101, a, b, c);
20877        let e = _mm512_set_ph(
20878            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
20879            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
20880        );
20881        assert_eq_m512h(r, e);
20882    }
20883
20884    #[simd_test(enable = "avx512fp16")]
20885    unsafe fn test_mm512_fnmadd_round_ph() {
20886        let a = _mm512_set1_ph(1.0);
20887        let b = _mm512_set1_ph(2.0);
20888        let c = _mm512_set1_ph(3.0);
20889        let r =
20890            _mm512_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20891        let e = _mm512_set1_ph(1.0);
20892        assert_eq_m512h(r, e);
20893    }
20894
20895    #[simd_test(enable = "avx512fp16")]
20896    unsafe fn test_mm512_mask_fnmadd_round_ph() {
20897        let a = _mm512_set1_ph(1.0);
20898        let b = _mm512_set1_ph(2.0);
20899        let c = _mm512_set1_ph(3.0);
20900        let r = _mm512_mask_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20901            a,
20902            0b01010101010101010101010101010101,
20903            b,
20904            c,
20905        );
20906        let e = _mm512_set_ph(
20907            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20908            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20909        );
20910        assert_eq_m512h(r, e);
20911    }
20912
20913    #[simd_test(enable = "avx512fp16")]
20914    unsafe fn test_mm512_mask3_fnmadd_round_ph() {
20915        let a = _mm512_set1_ph(1.0);
20916        let b = _mm512_set1_ph(2.0);
20917        let c = _mm512_set1_ph(3.0);
20918        let r = _mm512_mask3_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20919            a,
20920            b,
20921            c,
20922            0b01010101010101010101010101010101,
20923        );
20924        let e = _mm512_set_ph(
20925            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
20926            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
20927        );
20928        assert_eq_m512h(r, e);
20929    }
20930
20931    #[simd_test(enable = "avx512fp16")]
20932    unsafe fn test_mm512_maskz_fnmadd_round_ph() {
20933        let a = _mm512_set1_ph(1.0);
20934        let b = _mm512_set1_ph(2.0);
20935        let c = _mm512_set1_ph(3.0);
20936        let r = _mm512_maskz_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20937            0b01010101010101010101010101010101,
20938            a,
20939            b,
20940            c,
20941        );
20942        let e = _mm512_set_ph(
20943            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
20944            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
20945        );
20946        assert_eq_m512h(r, e);
20947    }
20948
20949    #[simd_test(enable = "avx512fp16")]
20950    unsafe fn test_mm_fnmadd_sh() {
20951        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20952        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20953        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20954        let r = _mm_fnmadd_sh(a, b, c);
20955        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20956        assert_eq_m128h(r, e);
20957    }
20958
20959    #[simd_test(enable = "avx512fp16")]
20960    unsafe fn test_mm_mask_fnmadd_sh() {
20961        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20962        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20963        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20964        let r = _mm_mask_fnmadd_sh(a, 0, b, c);
20965        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20966        assert_eq_m128h(r, e);
20967        let r = _mm_mask_fnmadd_sh(a, 1, b, c);
20968        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20969        assert_eq_m128h(r, e);
20970    }
20971
20972    #[simd_test(enable = "avx512fp16")]
20973    unsafe fn test_mm_mask3_fnmadd_sh() {
20974        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20975        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20976        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20977        let r = _mm_mask3_fnmadd_sh(a, b, c, 0);
20978        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20979        assert_eq_m128h(r, e);
20980        let r = _mm_mask3_fnmadd_sh(a, b, c, 1);
20981        let e = _mm_setr_ph(1.0, 30., 31., 32., 33., 34., 35., 36.);
20982        assert_eq_m128h(r, e);
20983    }
20984
20985    #[simd_test(enable = "avx512fp16")]
20986    unsafe fn test_mm_maskz_fnmadd_sh() {
20987        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20988        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20989        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20990        let r = _mm_maskz_fnmadd_sh(0, a, b, c);
20991        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20992        assert_eq_m128h(r, e);
20993        let r = _mm_maskz_fnmadd_sh(1, a, b, c);
20994        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20995        assert_eq_m128h(r, e);
20996    }
20997
20998    #[simd_test(enable = "avx512fp16")]
20999    unsafe fn test_mm_fnmadd_round_sh() {
21000        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21001        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21002        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21003        let r = _mm_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21004        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21005        assert_eq_m128h(r, e);
21006    }
21007
21008    #[simd_test(enable = "avx512fp16")]
21009    unsafe fn test_mm_mask_fnmadd_round_sh() {
21010        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21011        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21012        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21013        let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21014            a, 0, b, c,
21015        );
21016        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21017        assert_eq_m128h(r, e);
21018        let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21019            a, 1, b, c,
21020        );
21021        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21022        assert_eq_m128h(r, e);
21023    }
21024
21025    #[simd_test(enable = "avx512fp16")]
21026    unsafe fn test_mm_mask3_fnmadd_round_sh() {
21027        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21028        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21029        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21030        let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21031            a, b, c, 0,
21032        );
21033        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21034        assert_eq_m128h(r, e);
21035        let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21036            a, b, c, 1,
21037        );
21038        let e = _mm_setr_ph(1.0, 30., 31., 32., 33., 34., 35., 36.);
21039        assert_eq_m128h(r, e);
21040    }
21041
21042    #[simd_test(enable = "avx512fp16")]
21043    unsafe fn test_mm_maskz_fnmadd_round_sh() {
21044        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21045        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21046        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21047        let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21048            0, a, b, c,
21049        );
21050        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
21051        assert_eq_m128h(r, e);
21052        let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21053            1, a, b, c,
21054        );
21055        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21056        assert_eq_m128h(r, e);
21057    }
21058
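    // The fnmsub tests below check -(a * b) - c: with a = 1.0, b = 2.0 and c = 3.0 every
    // selected lane evaluates to -(1.0 * 2.0) - 3.0 = -5.0.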
21059    #[simd_test(enable = "avx512fp16,avx512vl")]
21060    unsafe fn test_mm_fnmsub_ph() {
21061        let a = _mm_set1_ph(1.0);
21062        let b = _mm_set1_ph(2.0);
21063        let c = _mm_set1_ph(3.0);
21064        let r = _mm_fnmsub_ph(a, b, c);
21065        let e = _mm_set1_ph(-5.0);
21066        assert_eq_m128h(r, e);
21067    }
21068
21069    #[simd_test(enable = "avx512fp16,avx512vl")]
21070    unsafe fn test_mm_mask_fnmsub_ph() {
21071        let a = _mm_set1_ph(1.0);
21072        let b = _mm_set1_ph(2.0);
21073        let c = _mm_set1_ph(3.0);
21074        let r = _mm_mask_fnmsub_ph(a, 0b01010101, b, c);
21075        let e = _mm_set_ph(1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0);
21076        assert_eq_m128h(r, e);
21077    }
21078
21079    #[simd_test(enable = "avx512fp16,avx512vl")]
21080    unsafe fn test_mm_mask3_fnmsub_ph() {
21081        let a = _mm_set1_ph(1.0);
21082        let b = _mm_set1_ph(2.0);
21083        let c = _mm_set1_ph(3.0);
21084        let r = _mm_mask3_fnmsub_ph(a, b, c, 0b01010101);
21085        let e = _mm_set_ph(3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0);
21086        assert_eq_m128h(r, e);
21087    }
21088
21089    #[simd_test(enable = "avx512fp16,avx512vl")]
21090    unsafe fn test_mm_maskz_fnmsub_ph() {
21091        let a = _mm_set1_ph(1.0);
21092        let b = _mm_set1_ph(2.0);
21093        let c = _mm_set1_ph(3.0);
21094        let r = _mm_maskz_fnmsub_ph(0b01010101, a, b, c);
21095        let e = _mm_set_ph(0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0);
21096        assert_eq_m128h(r, e);
21097    }
21098
21099    #[simd_test(enable = "avx512fp16,avx512vl")]
21100    unsafe fn test_mm256_fnmsub_ph() {
21101        let a = _mm256_set1_ph(1.0);
21102        let b = _mm256_set1_ph(2.0);
21103        let c = _mm256_set1_ph(3.0);
21104        let r = _mm256_fnmsub_ph(a, b, c);
21105        let e = _mm256_set1_ph(-5.0);
21106        assert_eq_m256h(r, e);
21107    }
21108
21109    #[simd_test(enable = "avx512fp16,avx512vl")]
21110    unsafe fn test_mm256_mask_fnmsub_ph() {
21111        let a = _mm256_set1_ph(1.0);
21112        let b = _mm256_set1_ph(2.0);
21113        let c = _mm256_set1_ph(3.0);
21114        let r = _mm256_mask_fnmsub_ph(a, 0b0101010101010101, b, c);
21115        let e = _mm256_set_ph(
21116            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21117        );
21118        assert_eq_m256h(r, e);
21119    }
21120
21121    #[simd_test(enable = "avx512fp16,avx512vl")]
21122    unsafe fn test_mm256_mask3_fnmsub_ph() {
21123        let a = _mm256_set1_ph(1.0);
21124        let b = _mm256_set1_ph(2.0);
21125        let c = _mm256_set1_ph(3.0);
21126        let r = _mm256_mask3_fnmsub_ph(a, b, c, 0b0101010101010101);
21127        let e = _mm256_set_ph(
21128            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21129        );
21130        assert_eq_m256h(r, e);
21131    }
21132
21133    #[simd_test(enable = "avx512fp16,avx512vl")]
21134    unsafe fn test_mm256_maskz_fnmsub_ph() {
21135        let a = _mm256_set1_ph(1.0);
21136        let b = _mm256_set1_ph(2.0);
21137        let c = _mm256_set1_ph(3.0);
21138        let r = _mm256_maskz_fnmsub_ph(0b0101010101010101, a, b, c);
21139        let e = _mm256_set_ph(
21140            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21141        );
21142        assert_eq_m256h(r, e);
21143    }
21144
21145    #[simd_test(enable = "avx512fp16")]
21146    unsafe fn test_mm512_fnmsub_ph() {
21147        let a = _mm512_set1_ph(1.0);
21148        let b = _mm512_set1_ph(2.0);
21149        let c = _mm512_set1_ph(3.0);
21150        let r = _mm512_fnmsub_ph(a, b, c);
21151        let e = _mm512_set1_ph(-5.0);
21152        assert_eq_m512h(r, e);
21153    }
21154
21155    #[simd_test(enable = "avx512fp16")]
21156    unsafe fn test_mm512_mask_fnmsub_ph() {
21157        let a = _mm512_set1_ph(1.0);
21158        let b = _mm512_set1_ph(2.0);
21159        let c = _mm512_set1_ph(3.0);
21160        let r = _mm512_mask_fnmsub_ph(a, 0b01010101010101010101010101010101, b, c);
21161        let e = _mm512_set_ph(
21162            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21163            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21164        );
21165        assert_eq_m512h(r, e);
21166    }
21167
21168    #[simd_test(enable = "avx512fp16")]
21169    unsafe fn test_mm512_mask3_fnmsub_ph() {
21170        let a = _mm512_set1_ph(1.0);
21171        let b = _mm512_set1_ph(2.0);
21172        let c = _mm512_set1_ph(3.0);
21173        let r = _mm512_mask3_fnmsub_ph(a, b, c, 0b01010101010101010101010101010101);
21174        let e = _mm512_set_ph(
21175            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21176            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21177        );
21178        assert_eq_m512h(r, e);
21179    }
21180
21181    #[simd_test(enable = "avx512fp16")]
21182    unsafe fn test_mm512_maskz_fnmsub_ph() {
21183        let a = _mm512_set1_ph(1.0);
21184        let b = _mm512_set1_ph(2.0);
21185        let c = _mm512_set1_ph(3.0);
21186        let r = _mm512_maskz_fnmsub_ph(0b01010101010101010101010101010101, a, b, c);
21187        let e = _mm512_set_ph(
21188            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21189            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21190        );
21191        assert_eq_m512h(r, e);
21192    }
21193
21194    #[simd_test(enable = "avx512fp16")]
21195    unsafe fn test_mm512_fnmsub_round_ph() {
21196        let a = _mm512_set1_ph(1.0);
21197        let b = _mm512_set1_ph(2.0);
21198        let c = _mm512_set1_ph(3.0);
21199        let r =
21200            _mm512_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21201        let e = _mm512_set1_ph(-5.0);
21202        assert_eq_m512h(r, e);
21203    }
21204
21205    #[simd_test(enable = "avx512fp16")]
21206    unsafe fn test_mm512_mask_fnmsub_round_ph() {
21207        let a = _mm512_set1_ph(1.0);
21208        let b = _mm512_set1_ph(2.0);
21209        let c = _mm512_set1_ph(3.0);
21210        let r = _mm512_mask_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21211            a,
21212            0b01010101010101010101010101010101,
21213            b,
21214            c,
21215        );
21216        let e = _mm512_set_ph(
21217            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21218            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21219        );
21220        assert_eq_m512h(r, e);
21221    }
21222
21223    #[simd_test(enable = "avx512fp16")]
21224    unsafe fn test_mm512_mask3_fnmsub_round_ph() {
21225        let a = _mm512_set1_ph(1.0);
21226        let b = _mm512_set1_ph(2.0);
21227        let c = _mm512_set1_ph(3.0);
21228        let r = _mm512_mask3_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21229            a,
21230            b,
21231            c,
21232            0b01010101010101010101010101010101,
21233        );
21234        let e = _mm512_set_ph(
21235            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21236            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21237        );
21238        assert_eq_m512h(r, e);
21239    }
21240
21241    #[simd_test(enable = "avx512fp16")]
21242    unsafe fn test_mm512_maskz_fnmsub_round_ph() {
21243        let a = _mm512_set1_ph(1.0);
21244        let b = _mm512_set1_ph(2.0);
21245        let c = _mm512_set1_ph(3.0);
21246        let r = _mm512_maskz_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21247            0b01010101010101010101010101010101,
21248            a,
21249            b,
21250            c,
21251        );
21252        let e = _mm512_set_ph(
21253            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21254            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21255        );
21256        assert_eq_m512h(r, e);
21257    }
21258
21259    #[simd_test(enable = "avx512fp16")]
21260    unsafe fn test_mm_fnmsub_sh() {
21261        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21262        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21263        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21264        let r = _mm_fnmsub_sh(a, b, c);
21265        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21266        assert_eq_m128h(r, e);
21267    }
21268
21269    #[simd_test(enable = "avx512fp16")]
21270    unsafe fn test_mm_mask_fnmsub_sh() {
21271        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21272        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21273        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21274        let r = _mm_mask_fnmsub_sh(a, 0, b, c);
21275        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21276        assert_eq_m128h(r, e);
21277        let r = _mm_mask_fnmsub_sh(a, 1, b, c);
21278        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21279        assert_eq_m128h(r, e);
21280    }
21281
21282    #[simd_test(enable = "avx512fp16")]
21283    unsafe fn test_mm_mask3_fnmsub_sh() {
21284        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21285        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21286        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21287        let r = _mm_mask3_fnmsub_sh(a, b, c, 0);
21288        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21289        assert_eq_m128h(r, e);
21290        let r = _mm_mask3_fnmsub_sh(a, b, c, 1);
21291        let e = _mm_setr_ph(-5.0, 30., 31., 32., 33., 34., 35., 36.);
21292        assert_eq_m128h(r, e);
21293    }
21294
21295    #[simd_test(enable = "avx512fp16")]
21296    unsafe fn test_mm_maskz_fnmsub_sh() {
21297        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21298        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21299        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21300        let r = _mm_maskz_fnmsub_sh(0, a, b, c);
21301        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
21302        assert_eq_m128h(r, e);
21303        let r = _mm_maskz_fnmsub_sh(1, a, b, c);
21304        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21305        assert_eq_m128h(r, e);
21306    }
21307
21308    #[simd_test(enable = "avx512fp16")]
21309    unsafe fn test_mm_fnmsub_round_sh() {
21310        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21311        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21312        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21313        let r = _mm_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21314        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21315        assert_eq_m128h(r, e);
21316    }
21317
21318    #[simd_test(enable = "avx512fp16")]
21319    unsafe fn test_mm_mask_fnmsub_round_sh() {
21320        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21321        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21322        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21323        let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21324            a, 0, b, c,
21325        );
21326        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21327        assert_eq_m128h(r, e);
21328        let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21329            a, 1, b, c,
21330        );
21331        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21332        assert_eq_m128h(r, e);
21333    }
21334
21335    #[simd_test(enable = "avx512fp16")]
21336    unsafe fn test_mm_mask3_fnmsub_round_sh() {
21337        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21338        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21339        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21340        let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21341            a, b, c, 0,
21342        );
21343        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21344        assert_eq_m128h(r, e);
21345        let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21346            a, b, c, 1,
21347        );
21348        let e = _mm_setr_ph(-5.0, 30., 31., 32., 33., 34., 35., 36.);
21349        assert_eq_m128h(r, e);
21350    }
21351
21352    #[simd_test(enable = "avx512fp16")]
21353    unsafe fn test_mm_maskz_fnmsub_round_sh() {
21354        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21355        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21356        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21357        let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21358            0, a, b, c,
21359        );
21360        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
21361        assert_eq_m128h(r, e);
21362        let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21363            1, a, b, c,
21364        );
21365        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21366        assert_eq_m128h(r, e);
21367    }
21368
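    // The fmaddsub tests below check the alternating form: even-indexed lanes compute
    // (a * b) - c = -1.0 and odd-indexed lanes (a * b) + c = 5.0 for the inputs used here.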
21369    #[simd_test(enable = "avx512fp16,avx512vl")]
21370    unsafe fn test_mm_fmaddsub_ph() {
21371        let a = _mm_set1_ph(1.0);
21372        let b = _mm_set1_ph(2.0);
21373        let c = _mm_set1_ph(3.0);
21374        let r = _mm_fmaddsub_ph(a, b, c);
21375        let e = _mm_set_ph(5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0);
21376        assert_eq_m128h(r, e);
21377    }
21378
21379    #[simd_test(enable = "avx512fp16,avx512vl")]
21380    unsafe fn test_mm_mask_fmaddsub_ph() {
21381        let a = _mm_set1_ph(1.0);
21382        let b = _mm_set1_ph(2.0);
21383        let c = _mm_set1_ph(3.0);
21384        let r = _mm_mask_fmaddsub_ph(a, 0b00110011, b, c);
21385        let e = _mm_set_ph(1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0);
21386        assert_eq_m128h(r, e);
21387    }
21388
21389    #[simd_test(enable = "avx512fp16,avx512vl")]
21390    unsafe fn test_mm_mask3_fmaddsub_ph() {
21391        let a = _mm_set1_ph(1.0);
21392        let b = _mm_set1_ph(2.0);
21393        let c = _mm_set1_ph(3.0);
21394        let r = _mm_mask3_fmaddsub_ph(a, b, c, 0b00110011);
21395        let e = _mm_set_ph(3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0);
21396        assert_eq_m128h(r, e);
21397    }
21398
21399    #[simd_test(enable = "avx512fp16,avx512vl")]
21400    unsafe fn test_mm_maskz_fmaddsub_ph() {
21401        let a = _mm_set1_ph(1.0);
21402        let b = _mm_set1_ph(2.0);
21403        let c = _mm_set1_ph(3.0);
21404        let r = _mm_maskz_fmaddsub_ph(0b00110011, a, b, c);
21405        let e = _mm_set_ph(0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0);
21406        assert_eq_m128h(r, e);
21407    }
21408
21409    #[simd_test(enable = "avx512fp16,avx512vl")]
21410    unsafe fn test_mm256_fmaddsub_ph() {
21411        let a = _mm256_set1_ph(1.0);
21412        let b = _mm256_set1_ph(2.0);
21413        let c = _mm256_set1_ph(3.0);
21414        let r = _mm256_fmaddsub_ph(a, b, c);
21415        let e = _mm256_set_ph(
21416            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21417        );
21418        assert_eq_m256h(r, e);
21419    }
21420
21421    #[simd_test(enable = "avx512fp16,avx512vl")]
21422    unsafe fn test_mm256_mask_fmaddsub_ph() {
21423        let a = _mm256_set1_ph(1.0);
21424        let b = _mm256_set1_ph(2.0);
21425        let c = _mm256_set1_ph(3.0);
21426        let r = _mm256_mask_fmaddsub_ph(a, 0b0011001100110011, b, c);
21427        let e = _mm256_set_ph(
21428            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21429        );
21430        assert_eq_m256h(r, e);
21431    }
21432
21433    #[simd_test(enable = "avx512fp16,avx512vl")]
21434    unsafe fn test_mm256_mask3_fmaddsub_ph() {
21435        let a = _mm256_set1_ph(1.0);
21436        let b = _mm256_set1_ph(2.0);
21437        let c = _mm256_set1_ph(3.0);
21438        let r = _mm256_mask3_fmaddsub_ph(a, b, c, 0b0011001100110011);
21439        let e = _mm256_set_ph(
21440            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21441        );
21442        assert_eq_m256h(r, e);
21443    }
21444
21445    #[simd_test(enable = "avx512fp16,avx512vl")]
21446    unsafe fn test_mm256_maskz_fmaddsub_ph() {
21447        let a = _mm256_set1_ph(1.0);
21448        let b = _mm256_set1_ph(2.0);
21449        let c = _mm256_set1_ph(3.0);
21450        let r = _mm256_maskz_fmaddsub_ph(0b0011001100110011, a, b, c);
21451        let e = _mm256_set_ph(
21452            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21453        );
21454        assert_eq_m256h(r, e);
21455    }
21456
21457    #[simd_test(enable = "avx512fp16")]
21458    unsafe fn test_mm512_fmaddsub_ph() {
21459        let a = _mm512_set1_ph(1.0);
21460        let b = _mm512_set1_ph(2.0);
21461        let c = _mm512_set1_ph(3.0);
21462        let r = _mm512_fmaddsub_ph(a, b, c);
21463        let e = _mm512_set_ph(
21464            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21465            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21466        );
21467        assert_eq_m512h(r, e);
21468    }
21469
21470    #[simd_test(enable = "avx512fp16")]
21471    unsafe fn test_mm512_mask_fmaddsub_ph() {
21472        let a = _mm512_set1_ph(1.0);
21473        let b = _mm512_set1_ph(2.0);
21474        let c = _mm512_set1_ph(3.0);
21475        let r = _mm512_mask_fmaddsub_ph(a, 0b00110011001100110011001100110011, b, c);
21476        let e = _mm512_set_ph(
21477            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21478            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21479        );
21480        assert_eq_m512h(r, e);
21481    }
21482
21483    #[simd_test(enable = "avx512fp16")]
21484    unsafe fn test_mm512_mask3_fmaddsub_ph() {
21485        let a = _mm512_set1_ph(1.0);
21486        let b = _mm512_set1_ph(2.0);
21487        let c = _mm512_set1_ph(3.0);
21488        let r = _mm512_mask3_fmaddsub_ph(a, b, c, 0b00110011001100110011001100110011);
21489        let e = _mm512_set_ph(
21490            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21491            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21492        );
21493        assert_eq_m512h(r, e);
21494    }
21495
21496    #[simd_test(enable = "avx512fp16")]
21497    unsafe fn test_mm512_maskz_fmaddsub_ph() {
21498        let a = _mm512_set1_ph(1.0);
21499        let b = _mm512_set1_ph(2.0);
21500        let c = _mm512_set1_ph(3.0);
21501        let r = _mm512_maskz_fmaddsub_ph(0b00110011001100110011001100110011, a, b, c);
21502        let e = _mm512_set_ph(
21503            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21504            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21505        );
21506        assert_eq_m512h(r, e);
21507    }
21508
21509    #[simd_test(enable = "avx512fp16")]
21510    unsafe fn test_mm512_fmaddsub_round_ph() {
21511        let a = _mm512_set1_ph(1.0);
21512        let b = _mm512_set1_ph(2.0);
21513        let c = _mm512_set1_ph(3.0);
21514        let r =
21515            _mm512_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21516        let e = _mm512_set_ph(
21517            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21518            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21519        );
21520        assert_eq_m512h(r, e);
21521    }
21522
21523    #[simd_test(enable = "avx512fp16")]
21524    unsafe fn test_mm512_mask_fmaddsub_round_ph() {
21525        let a = _mm512_set1_ph(1.0);
21526        let b = _mm512_set1_ph(2.0);
21527        let c = _mm512_set1_ph(3.0);
21528        let r = _mm512_mask_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21529            a,
21530            0b00110011001100110011001100110011,
21531            b,
21532            c,
21533        );
21534        let e = _mm512_set_ph(
21535            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21536            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21537        );
21538        assert_eq_m512h(r, e);
21539    }
21540
21541    #[simd_test(enable = "avx512fp16")]
21542    unsafe fn test_mm512_mask3_fmaddsub_round_ph() {
21543        let a = _mm512_set1_ph(1.0);
21544        let b = _mm512_set1_ph(2.0);
21545        let c = _mm512_set1_ph(3.0);
21546        let r = _mm512_mask3_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21547            a,
21548            b,
21549            c,
21550            0b00110011001100110011001100110011,
21551        );
21552        let e = _mm512_set_ph(
21553            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21554            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21555        );
21556        assert_eq_m512h(r, e);
21557    }
21558
21559    #[simd_test(enable = "avx512fp16")]
21560    unsafe fn test_mm512_maskz_fmaddsub_round_ph() {
21561        let a = _mm512_set1_ph(1.0);
21562        let b = _mm512_set1_ph(2.0);
21563        let c = _mm512_set1_ph(3.0);
21564        let r = _mm512_maskz_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21565            0b00110011001100110011001100110011,
21566            a,
21567            b,
21568            c,
21569        );
21570        let e = _mm512_set_ph(
21571            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21572            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21573        );
21574        assert_eq_m512h(r, e);
21575    }
21576
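    // The fmsubadd tests below check the opposite alternation: even-indexed lanes compute
    // (a * b) + c = 5.0 and odd-indexed lanes (a * b) - c = -1.0.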
21577    #[simd_test(enable = "avx512fp16,avx512vl")]
21578    unsafe fn test_mm_fmsubadd_ph() {
21579        let a = _mm_set1_ph(1.0);
21580        let b = _mm_set1_ph(2.0);
21581        let c = _mm_set1_ph(3.0);
21582        let r = _mm_fmsubadd_ph(a, b, c);
21583        let e = _mm_set_ph(-1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0);
21584        assert_eq_m128h(r, e);
21585    }
21586
21587    #[simd_test(enable = "avx512fp16,avx512vl")]
21588    unsafe fn test_mm_mask_fmsubadd_ph() {
21589        let a = _mm_set1_ph(1.0);
21590        let b = _mm_set1_ph(2.0);
21591        let c = _mm_set1_ph(3.0);
21592        let r = _mm_mask_fmsubadd_ph(a, 0b00110011, b, c);
21593        let e = _mm_set_ph(1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0);
21594        assert_eq_m128h(r, e);
21595    }
21596
21597    #[simd_test(enable = "avx512fp16,avx512vl")]
21598    unsafe fn test_mm_mask3_fmsubadd_ph() {
21599        let a = _mm_set1_ph(1.0);
21600        let b = _mm_set1_ph(2.0);
21601        let c = _mm_set1_ph(3.0);
21602        let r = _mm_mask3_fmsubadd_ph(a, b, c, 0b00110011);
21603        let e = _mm_set_ph(3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0);
21604        assert_eq_m128h(r, e);
21605    }
21606
21607    #[simd_test(enable = "avx512fp16,avx512vl")]
21608    unsafe fn test_mm_maskz_fmsubadd_ph() {
21609        let a = _mm_set1_ph(1.0);
21610        let b = _mm_set1_ph(2.0);
21611        let c = _mm_set1_ph(3.0);
21612        let r = _mm_maskz_fmsubadd_ph(0b00110011, a, b, c);
21613        let e = _mm_set_ph(0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0);
21614        assert_eq_m128h(r, e);
21615    }
21616
21617    #[simd_test(enable = "avx512fp16,avx512vl")]
21618    unsafe fn test_mm256_fmsubadd_ph() {
21619        let a = _mm256_set1_ph(1.0);
21620        let b = _mm256_set1_ph(2.0);
21621        let c = _mm256_set1_ph(3.0);
21622        let r = _mm256_fmsubadd_ph(a, b, c);
21623        let e = _mm256_set_ph(
21624            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21625        );
21626        assert_eq_m256h(r, e);
21627    }
21628
21629    #[simd_test(enable = "avx512fp16,avx512vl")]
21630    unsafe fn test_mm256_mask_fmsubadd_ph() {
21631        let a = _mm256_set1_ph(1.0);
21632        let b = _mm256_set1_ph(2.0);
21633        let c = _mm256_set1_ph(3.0);
21634        let r = _mm256_mask_fmsubadd_ph(a, 0b0011001100110011, b, c);
21635        let e = _mm256_set_ph(
21636            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21637        );
21638        assert_eq_m256h(r, e);
21639    }
21640
21641    #[simd_test(enable = "avx512fp16,avx512vl")]
21642    unsafe fn test_mm256_mask3_fmsubadd_ph() {
21643        let a = _mm256_set1_ph(1.0);
21644        let b = _mm256_set1_ph(2.0);
21645        let c = _mm256_set1_ph(3.0);
21646        let r = _mm256_mask3_fmsubadd_ph(a, b, c, 0b0011001100110011);
21647        let e = _mm256_set_ph(
21648            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21649        );
21650        assert_eq_m256h(r, e);
21651    }
21652
21653    #[simd_test(enable = "avx512fp16,avx512vl")]
21654    unsafe fn test_mm256_maskz_fmsubadd_ph() {
21655        let a = _mm256_set1_ph(1.0);
21656        let b = _mm256_set1_ph(2.0);
21657        let c = _mm256_set1_ph(3.0);
21658        let r = _mm256_maskz_fmsubadd_ph(0b0011001100110011, a, b, c);
21659        let e = _mm256_set_ph(
21660            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21661        );
21662        assert_eq_m256h(r, e);
21663    }
21664
21665    #[simd_test(enable = "avx512fp16")]
21666    unsafe fn test_mm512_fmsubadd_ph() {
21667        let a = _mm512_set1_ph(1.0);
21668        let b = _mm512_set1_ph(2.0);
21669        let c = _mm512_set1_ph(3.0);
21670        let r = _mm512_fmsubadd_ph(a, b, c);
21671        let e = _mm512_set_ph(
21672            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21673            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21674        );
21675        assert_eq_m512h(r, e);
21676    }
21677
21678    #[simd_test(enable = "avx512fp16")]
21679    unsafe fn test_mm512_mask_fmsubadd_ph() {
21680        let a = _mm512_set1_ph(1.0);
21681        let b = _mm512_set1_ph(2.0);
21682        let c = _mm512_set1_ph(3.0);
21683        let r = _mm512_mask_fmsubadd_ph(a, 0b00110011001100110011001100110011, b, c);
21684        let e = _mm512_set_ph(
21685            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21686            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21687        );
21688        assert_eq_m512h(r, e);
21689    }
21690
21691    #[simd_test(enable = "avx512fp16")]
21692    unsafe fn test_mm512_mask3_fmsubadd_ph() {
21693        let a = _mm512_set1_ph(1.0);
21694        let b = _mm512_set1_ph(2.0);
21695        let c = _mm512_set1_ph(3.0);
21696        let r = _mm512_mask3_fmsubadd_ph(a, b, c, 0b00110011001100110011001100110011);
21697        let e = _mm512_set_ph(
21698            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21699            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21700        );
21701        assert_eq_m512h(r, e);
21702    }
21703
21704    #[simd_test(enable = "avx512fp16")]
21705    unsafe fn test_mm512_maskz_fmsubadd_ph() {
21706        let a = _mm512_set1_ph(1.0);
21707        let b = _mm512_set1_ph(2.0);
21708        let c = _mm512_set1_ph(3.0);
21709        let r = _mm512_maskz_fmsubadd_ph(0b00110011001100110011001100110011, a, b, c);
21710        let e = _mm512_set_ph(
21711            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21712            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21713        );
21714        assert_eq_m512h(r, e);
21715    }
21716
21717    #[simd_test(enable = "avx512fp16")]
21718    unsafe fn test_mm512_fmsubadd_round_ph() {
21719        let a = _mm512_set1_ph(1.0);
21720        let b = _mm512_set1_ph(2.0);
21721        let c = _mm512_set1_ph(3.0);
21722        let r =
21723            _mm512_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21724        let e = _mm512_set_ph(
21725            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21726            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21727        );
21728        assert_eq_m512h(r, e);
21729    }
21730
21731    #[simd_test(enable = "avx512fp16")]
21732    unsafe fn test_mm512_mask_fmsubadd_round_ph() {
21733        let a = _mm512_set1_ph(1.0);
21734        let b = _mm512_set1_ph(2.0);
21735        let c = _mm512_set1_ph(3.0);
21736        let r = _mm512_mask_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21737            a,
21738            0b00110011001100110011001100110011,
21739            b,
21740            c,
21741        );
21742        let e = _mm512_set_ph(
21743            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21744            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21745        );
21746        assert_eq_m512h(r, e);
21747    }
21748
21749    #[simd_test(enable = "avx512fp16")]
21750    unsafe fn test_mm512_mask3_fmsubadd_round_ph() {
21751        let a = _mm512_set1_ph(1.0);
21752        let b = _mm512_set1_ph(2.0);
21753        let c = _mm512_set1_ph(3.0);
21754        let r = _mm512_mask3_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21755            a,
21756            b,
21757            c,
21758            0b00110011001100110011001100110011,
21759        );
21760        let e = _mm512_set_ph(
21761            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21762            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21763        );
21764        assert_eq_m512h(r, e);
21765    }
21766
21767    #[simd_test(enable = "avx512fp16")]
21768    unsafe fn test_mm512_maskz_fmsubadd_round_ph() {
21769        let a = _mm512_set1_ph(1.0);
21770        let b = _mm512_set1_ph(2.0);
21771        let c = _mm512_set1_ph(3.0);
21772        let r = _mm512_maskz_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21773            0b00110011001100110011001100110011,
21774            a,
21775            b,
21776            c,
21777        );
21778        let e = _mm512_set_ph(
21779            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21780            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21781        );
21782        assert_eq_m512h(r, e);
21783    }
21784
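    // The rcp tests below check the approximate reciprocal; for the input 2.0 the expected
    // result is exactly 0.5. The _sh variants take the reciprocal of the low lane of b and
    // copy the remaining lanes from a.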
21785    #[simd_test(enable = "avx512fp16,avx512vl")]
21786    unsafe fn test_mm_rcp_ph() {
21787        let a = _mm_set1_ph(2.0);
21788        let r = _mm_rcp_ph(a);
21789        let e = _mm_set1_ph(0.5);
21790        assert_eq_m128h(r, e);
21791    }
21792
21793    #[simd_test(enable = "avx512fp16,avx512vl")]
21794    unsafe fn test_mm_mask_rcp_ph() {
21795        let a = _mm_set1_ph(2.0);
21796        let src = _mm_set1_ph(1.0);
21797        let r = _mm_mask_rcp_ph(src, 0b01010101, a);
21798        let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5);
21799        assert_eq_m128h(r, e);
21800    }
21801
21802    #[simd_test(enable = "avx512fp16,avx512vl")]
21803    unsafe fn test_mm_maskz_rcp_ph() {
21804        let a = _mm_set1_ph(2.0);
21805        let r = _mm_maskz_rcp_ph(0b01010101, a);
21806        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
21807        assert_eq_m128h(r, e);
21808    }
21809
21810    #[simd_test(enable = "avx512fp16,avx512vl")]
21811    unsafe fn test_mm256_rcp_ph() {
21812        let a = _mm256_set1_ph(2.0);
21813        let r = _mm256_rcp_ph(a);
21814        let e = _mm256_set1_ph(0.5);
21815        assert_eq_m256h(r, e);
21816    }
21817
21818    #[simd_test(enable = "avx512fp16,avx512vl")]
21819    unsafe fn test_mm256_mask_rcp_ph() {
21820        let a = _mm256_set1_ph(2.0);
21821        let src = _mm256_set1_ph(1.0);
21822        let r = _mm256_mask_rcp_ph(src, 0b0101010101010101, a);
21823        let e = _mm256_set_ph(
21824            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
21825        );
21826        assert_eq_m256h(r, e);
21827    }
21828
21829    #[simd_test(enable = "avx512fp16,avx512vl")]
21830    unsafe fn test_mm256_maskz_rcp_ph() {
21831        let a = _mm256_set1_ph(2.0);
21832        let r = _mm256_maskz_rcp_ph(0b0101010101010101, a);
21833        let e = _mm256_set_ph(
21834            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
21835        );
21836        assert_eq_m256h(r, e);
21837    }
21838
21839    #[simd_test(enable = "avx512fp16")]
21840    unsafe fn test_mm512_rcp_ph() {
21841        let a = _mm512_set1_ph(2.0);
21842        let r = _mm512_rcp_ph(a);
21843        let e = _mm512_set1_ph(0.5);
21844        assert_eq_m512h(r, e);
21845    }
21846
21847    #[simd_test(enable = "avx512fp16")]
21848    unsafe fn test_mm512_mask_rcp_ph() {
21849        let a = _mm512_set1_ph(2.0);
21850        let src = _mm512_set1_ph(1.0);
21851        let r = _mm512_mask_rcp_ph(src, 0b01010101010101010101010101010101, a);
21852        let e = _mm512_set_ph(
21853            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0,
21854            0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
21855        );
21856        assert_eq_m512h(r, e);
21857    }
21858
21859    #[simd_test(enable = "avx512fp16")]
21860    unsafe fn test_mm512_maskz_rcp_ph() {
21861        let a = _mm512_set1_ph(2.0);
21862        let r = _mm512_maskz_rcp_ph(0b01010101010101010101010101010101, a);
21863        let e = _mm512_set_ph(
21864            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
21865            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
21866        );
21867        assert_eq_m512h(r, e);
21868    }
21869
21870    #[simd_test(enable = "avx512fp16")]
21871    unsafe fn test_mm_rcp_sh() {
21872        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21873        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
21874        let r = _mm_rcp_sh(a, b);
21875        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21876        assert_eq_m128h(r, e);
21877    }
21878
21879    #[simd_test(enable = "avx512fp16")]
21880    unsafe fn test_mm_mask_rcp_sh() {
21881        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21882        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
21883        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
21884        let r = _mm_mask_rcp_sh(src, 0, a, b);
21885        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21886        assert_eq_m128h(r, e);
21887        let r = _mm_mask_rcp_sh(src, 1, a, b);
21888        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21889        assert_eq_m128h(r, e);
21890    }
21891
21892    #[simd_test(enable = "avx512fp16")]
21893    unsafe fn test_mm_maskz_rcp_sh() {
21894        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21895        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
21896        let r = _mm_maskz_rcp_sh(0, a, b);
21897        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21898        assert_eq_m128h(r, e);
21899        let r = _mm_maskz_rcp_sh(1, a, b);
21900        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21901        assert_eq_m128h(r, e);
21902    }
21903
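    // The rsqrt tests below check the approximate reciprocal square root; for the input 4.0
    // the expected result is exactly 0.5.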
21904    #[simd_test(enable = "avx512fp16,avx512vl")]
21905    unsafe fn test_mm_rsqrt_ph() {
21906        let a = _mm_set1_ph(4.0);
21907        let r = _mm_rsqrt_ph(a);
21908        let e = _mm_set1_ph(0.5);
21909        assert_eq_m128h(r, e);
21910    }
21911
21912    #[simd_test(enable = "avx512fp16,avx512vl")]
21913    unsafe fn test_mm_mask_rsqrt_ph() {
21914        let a = _mm_set1_ph(4.0);
21915        let src = _mm_set1_ph(1.0);
21916        let r = _mm_mask_rsqrt_ph(src, 0b01010101, a);
21917        let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5);
21918        assert_eq_m128h(r, e);
21919    }
21920
21921    #[simd_test(enable = "avx512fp16,avx512vl")]
21922    unsafe fn test_mm_maskz_rsqrt_ph() {
21923        let a = _mm_set1_ph(4.0);
21924        let r = _mm_maskz_rsqrt_ph(0b01010101, a);
21925        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
21926        assert_eq_m128h(r, e);
21927    }
21928
21929    #[simd_test(enable = "avx512fp16,avx512vl")]
21930    unsafe fn test_mm256_rsqrt_ph() {
21931        let a = _mm256_set1_ph(4.0);
21932        let r = _mm256_rsqrt_ph(a);
21933        let e = _mm256_set1_ph(0.5);
21934        assert_eq_m256h(r, e);
21935    }
21936
21937    #[simd_test(enable = "avx512fp16,avx512vl")]
21938    unsafe fn test_mm256_mask_rsqrt_ph() {
21939        let a = _mm256_set1_ph(4.0);
21940        let src = _mm256_set1_ph(1.0);
21941        let r = _mm256_mask_rsqrt_ph(src, 0b0101010101010101, a);
21942        let e = _mm256_set_ph(
21943            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
21944        );
21945        assert_eq_m256h(r, e);
21946    }
21947
21948    #[simd_test(enable = "avx512fp16,avx512vl")]
21949    unsafe fn test_mm256_maskz_rsqrt_ph() {
21950        let a = _mm256_set1_ph(4.0);
21951        let r = _mm256_maskz_rsqrt_ph(0b0101010101010101, a);
21952        let e = _mm256_set_ph(
21953            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
21954        );
21955        assert_eq_m256h(r, e);
21956    }
21957
21958    #[simd_test(enable = "avx512fp16")]
21959    unsafe fn test_mm512_rsqrt_ph() {
21960        let a = _mm512_set1_ph(4.0);
21961        let r = _mm512_rsqrt_ph(a);
21962        let e = _mm512_set1_ph(0.5);
21963        assert_eq_m512h(r, e);
21964    }
21965
21966    #[simd_test(enable = "avx512fp16")]
21967    unsafe fn test_mm512_mask_rsqrt_ph() {
21968        let a = _mm512_set1_ph(4.0);
21969        let src = _mm512_set1_ph(1.0);
21970        let r = _mm512_mask_rsqrt_ph(src, 0b01010101010101010101010101010101, a);
21971        let e = _mm512_set_ph(
21972            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0,
21973            0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
21974        );
21975        assert_eq_m512h(r, e);
21976    }
21977
21978    #[simd_test(enable = "avx512fp16")]
21979    unsafe fn test_mm512_maskz_rsqrt_ph() {
21980        let a = _mm512_set1_ph(4.0);
21981        let r = _mm512_maskz_rsqrt_ph(0b01010101010101010101010101010101, a);
21982        let e = _mm512_set_ph(
21983            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
21984            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
21985        );
21986        assert_eq_m512h(r, e);
21987    }
21988
21989    #[simd_test(enable = "avx512fp16")]
21990    unsafe fn test_mm_rsqrt_sh() {
21991        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21992        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
21993        let r = _mm_rsqrt_sh(a, b);
21994        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21995        assert_eq_m128h(r, e);
21996    }
21997
21998    #[simd_test(enable = "avx512fp16")]
21999    unsafe fn test_mm_mask_rsqrt_sh() {
22000        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22001        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22002        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22003        let r = _mm_mask_rsqrt_sh(src, 0, a, b);
22004        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22005        assert_eq_m128h(r, e);
22006        let r = _mm_mask_rsqrt_sh(src, 1, a, b);
22007        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22008        assert_eq_m128h(r, e);
22009    }
22010
22011    #[simd_test(enable = "avx512fp16")]
22012    unsafe fn test_mm_maskz_rsqrt_sh() {
22013        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22014        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22015        let r = _mm_maskz_rsqrt_sh(0, a, b);
22016        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22017        assert_eq_m128h(r, e);
22018        let r = _mm_maskz_rsqrt_sh(1, a, b);
22019        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22020        assert_eq_m128h(r, e);
22021    }
22022
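    // The sqrt tests below check the square root, which is exact for the input used here:
    // sqrt(4.0) = 2.0.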
22023    #[simd_test(enable = "avx512fp16,avx512vl")]
22024    unsafe fn test_mm_sqrt_ph() {
22025        let a = _mm_set1_ph(4.0);
22026        let r = _mm_sqrt_ph(a);
22027        let e = _mm_set1_ph(2.0);
22028        assert_eq_m128h(r, e);
22029    }
22030
22031    #[simd_test(enable = "avx512fp16,avx512vl")]
22032    unsafe fn test_mm_mask_sqrt_ph() {
22033        let a = _mm_set1_ph(4.0);
22034        let src = _mm_set1_ph(1.0);
22035        let r = _mm_mask_sqrt_ph(src, 0b01010101, a);
22036        let e = _mm_set_ph(1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0);
22037        assert_eq_m128h(r, e);
22038    }
22039
22040    #[simd_test(enable = "avx512fp16,avx512vl")]
22041    unsafe fn test_mm_maskz_sqrt_ph() {
22042        let a = _mm_set1_ph(4.0);
22043        let r = _mm_maskz_sqrt_ph(0b01010101, a);
22044        let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0);
22045        assert_eq_m128h(r, e);
22046    }
22047
22048    #[simd_test(enable = "avx512fp16,avx512vl")]
22049    unsafe fn test_mm256_sqrt_ph() {
22050        let a = _mm256_set1_ph(4.0);
22051        let r = _mm256_sqrt_ph(a);
22052        let e = _mm256_set1_ph(2.0);
22053        assert_eq_m256h(r, e);
22054    }
22055
22056    #[simd_test(enable = "avx512fp16,avx512vl")]
22057    unsafe fn test_mm256_mask_sqrt_ph() {
22058        let a = _mm256_set1_ph(4.0);
22059        let src = _mm256_set1_ph(1.0);
22060        let r = _mm256_mask_sqrt_ph(src, 0b0101010101010101, a);
22061        let e = _mm256_set_ph(
22062            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
22063        );
22064        assert_eq_m256h(r, e);
22065    }
22066
22067    #[simd_test(enable = "avx512fp16,avx512vl")]
22068    unsafe fn test_mm256_maskz_sqrt_ph() {
22069        let a = _mm256_set1_ph(4.0);
22070        let r = _mm256_maskz_sqrt_ph(0b0101010101010101, a);
22071        let e = _mm256_set_ph(
22072            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22073        );
22074        assert_eq_m256h(r, e);
22075    }
22076
22077    #[simd_test(enable = "avx512fp16")]
22078    unsafe fn test_mm512_sqrt_ph() {
22079        let a = _mm512_set1_ph(4.0);
22080        let r = _mm512_sqrt_ph(a);
22081        let e = _mm512_set1_ph(2.0);
22082        assert_eq_m512h(r, e);
22083    }
22084
22085    #[simd_test(enable = "avx512fp16")]
22086    unsafe fn test_mm512_mask_sqrt_ph() {
22087        let a = _mm512_set1_ph(4.0);
22088        let src = _mm512_set1_ph(1.0);
22089        let r = _mm512_mask_sqrt_ph(src, 0b01010101010101010101010101010101, a);
22090        let e = _mm512_set_ph(
22091            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
22092            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
22093        );
22094        assert_eq_m512h(r, e);
22095    }
22096
22097    #[simd_test(enable = "avx512fp16")]
22098    unsafe fn test_mm512_maskz_sqrt_ph() {
22099        let a = _mm512_set1_ph(4.0);
22100        let r = _mm512_maskz_sqrt_ph(0b01010101010101010101010101010101, a);
22101        let e = _mm512_set_ph(
22102            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22103            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22104        );
22105        assert_eq_m512h(r, e);
22106    }
22107
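    // In the *_round_* tests, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC selects
    // round-to-nearest-even and suppresses floating-point exceptions (SAE); it does not
    // change these exactly-representable results, it only exercises the explicit-rounding
    // form of each intrinsic.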
22108    #[simd_test(enable = "avx512fp16")]
22109    unsafe fn test_mm512_sqrt_round_ph() {
22110        let a = _mm512_set1_ph(4.0);
22111        let r = _mm512_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
22112        let e = _mm512_set1_ph(2.0);
22113        assert_eq_m512h(r, e);
22114    }
22115
22116    #[simd_test(enable = "avx512fp16")]
22117    unsafe fn test_mm512_mask_sqrt_round_ph() {
22118        let a = _mm512_set1_ph(4.0);
22119        let src = _mm512_set1_ph(1.0);
22120        let r = _mm512_mask_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22121            src,
22122            0b01010101010101010101010101010101,
22123            a,
22124        );
22125        let e = _mm512_set_ph(
22126            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
22127            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
22128        );
22129        assert_eq_m512h(r, e);
22130    }
22131
22132    #[simd_test(enable = "avx512fp16")]
22133    unsafe fn test_mm512_maskz_sqrt_round_ph() {
22134        let a = _mm512_set1_ph(4.0);
22135        let r = _mm512_maskz_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22136            0b01010101010101010101010101010101,
22137            a,
22138        );
22139        let e = _mm512_set_ph(
22140            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22141            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22142        );
22143        assert_eq_m512h(r, e);
22144    }
22145
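    // The scalar *_sh intrinsics operate on the low f16 lane of `b` and copy lanes 1..=7
    // from `a`; in the masked forms only bit 0 of the mask is consulted.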
22146    #[simd_test(enable = "avx512fp16")]
22147    unsafe fn test_mm_sqrt_sh() {
22148        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22149        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22150        let r = _mm_sqrt_sh(a, b);
22151        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22152        assert_eq_m128h(r, e);
22153    }
22154
22155    #[simd_test(enable = "avx512fp16")]
22156    unsafe fn test_mm_mask_sqrt_sh() {
22157        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22158        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22159        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22160        let r = _mm_mask_sqrt_sh(src, 0, a, b);
22161        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22162        assert_eq_m128h(r, e);
22163        let r = _mm_mask_sqrt_sh(src, 1, a, b);
22164        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22165        assert_eq_m128h(r, e);
22166    }
22167
22168    #[simd_test(enable = "avx512fp16")]
22169    unsafe fn test_mm_maskz_sqrt_sh() {
22170        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22171        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22172        let r = _mm_maskz_sqrt_sh(0, a, b);
22173        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22174        assert_eq_m128h(r, e);
22175        let r = _mm_maskz_sqrt_sh(1, a, b);
22176        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22177        assert_eq_m128h(r, e);
22178    }
22179
22180    #[simd_test(enable = "avx512fp16")]
22181    unsafe fn test_mm_sqrt_round_sh() {
22182        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22183        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22184        let r = _mm_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22185        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22186        assert_eq_m128h(r, e);
22187    }
22188
22189    #[simd_test(enable = "avx512fp16")]
22190    unsafe fn test_mm_mask_sqrt_round_sh() {
22191        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22192        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22193        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22194        let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22195            src, 0, a, b,
22196        );
22197        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22198        assert_eq_m128h(r, e);
22199        let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22200            src, 1, a, b,
22201        );
22202        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22203        assert_eq_m128h(r, e);
22204    }
22205
22206    #[simd_test(enable = "avx512fp16")]
22207    unsafe fn test_mm_maskz_sqrt_round_sh() {
22208        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22209        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22210        let r =
22211            _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
22212        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22213        assert_eq_m128h(r, e);
22214        let r =
22215            _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
22216        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22217        assert_eq_m128h(r, e);
22218    }
22219
22220    #[simd_test(enable = "avx512fp16,avx512vl")]
22221    unsafe fn test_mm_max_ph() {
22222        let a = _mm_set1_ph(2.0);
22223        let b = _mm_set1_ph(1.0);
22224        let r = _mm_max_ph(a, b);
22225        let e = _mm_set1_ph(2.0);
22226        assert_eq_m128h(r, e);
22227    }
22228
22229    #[simd_test(enable = "avx512fp16,avx512vl")]
22230    unsafe fn test_mm_mask_max_ph() {
22231        let a = _mm_set1_ph(2.0);
22232        let b = _mm_set1_ph(1.0);
22233        let src = _mm_set1_ph(3.0);
22234        let r = _mm_mask_max_ph(src, 0b01010101, a, b);
22235        let e = _mm_set_ph(3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0);
22236        assert_eq_m128h(r, e);
22237    }
22238
22239    #[simd_test(enable = "avx512fp16,avx512vl")]
22240    unsafe fn test_mm_maskz_max_ph() {
22241        let a = _mm_set1_ph(2.0);
22242        let b = _mm_set1_ph(1.0);
22243        let r = _mm_maskz_max_ph(0b01010101, a, b);
22244        let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0);
22245        assert_eq_m128h(r, e);
22246    }
22247
22248    #[simd_test(enable = "avx512fp16,avx512vl")]
22249    unsafe fn test_mm256_max_ph() {
22250        let a = _mm256_set1_ph(2.0);
22251        let b = _mm256_set1_ph(1.0);
22252        let r = _mm256_max_ph(a, b);
22253        let e = _mm256_set1_ph(2.0);
22254        assert_eq_m256h(r, e);
22255    }
22256
22257    #[simd_test(enable = "avx512fp16,avx512vl")]
22258    unsafe fn test_mm256_mask_max_ph() {
22259        let a = _mm256_set1_ph(2.0);
22260        let b = _mm256_set1_ph(1.0);
22261        let src = _mm256_set1_ph(3.0);
22262        let r = _mm256_mask_max_ph(src, 0b0101010101010101, a, b);
22263        let e = _mm256_set_ph(
22264            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
22265        );
22266        assert_eq_m256h(r, e);
22267    }
22268
22269    #[simd_test(enable = "avx512fp16,avx512vl")]
22270    unsafe fn test_mm256_maskz_max_ph() {
22271        let a = _mm256_set1_ph(2.0);
22272        let b = _mm256_set1_ph(1.0);
22273        let r = _mm256_maskz_max_ph(0b0101010101010101, a, b);
22274        let e = _mm256_set_ph(
22275            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22276        );
22277        assert_eq_m256h(r, e);
22278    }
22279
22280    #[simd_test(enable = "avx512fp16")]
22281    unsafe fn test_mm512_max_ph() {
22282        let a = _mm512_set1_ph(2.0);
22283        let b = _mm512_set1_ph(1.0);
22284        let r = _mm512_max_ph(a, b);
22285        let e = _mm512_set1_ph(2.0);
22286        assert_eq_m512h(r, e);
22287    }
22288
22289    #[simd_test(enable = "avx512fp16")]
22290    unsafe fn test_mm512_mask_max_ph() {
22291        let a = _mm512_set1_ph(2.0);
22292        let b = _mm512_set1_ph(1.0);
22293        let src = _mm512_set1_ph(3.0);
22294        let r = _mm512_mask_max_ph(src, 0b01010101010101010101010101010101, a, b);
22295        let e = _mm512_set_ph(
22296            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0,
22297            2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
22298        );
22299        assert_eq_m512h(r, e);
22300    }
22301
22302    #[simd_test(enable = "avx512fp16")]
22303    unsafe fn test_mm512_maskz_max_ph() {
22304        let a = _mm512_set1_ph(2.0);
22305        let b = _mm512_set1_ph(1.0);
22306        let r = _mm512_maskz_max_ph(0b01010101010101010101010101010101, a, b);
22307        let e = _mm512_set_ph(
22308            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22309            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22310        );
22311        assert_eq_m512h(r, e);
22312    }
22313
22314    #[simd_test(enable = "avx512fp16")]
22315    unsafe fn test_mm512_max_round_ph() {
22316        let a = _mm512_set1_ph(2.0);
22317        let b = _mm512_set1_ph(1.0);
22318        let r = _mm512_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22319        let e = _mm512_set1_ph(2.0);
22320        assert_eq_m512h(r, e);
22321    }
22322
22323    #[simd_test(enable = "avx512fp16")]
22324    unsafe fn test_mm512_mask_max_round_ph() {
22325        let a = _mm512_set1_ph(2.0);
22326        let b = _mm512_set1_ph(1.0);
22327        let src = _mm512_set1_ph(3.0);
22328        let r = _mm512_mask_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22329            src,
22330            0b01010101010101010101010101010101,
22331            a,
22332            b,
22333        );
22334        let e = _mm512_set_ph(
22335            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0,
22336            2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
22337        );
22338        assert_eq_m512h(r, e);
22339    }
22340
22341    #[simd_test(enable = "avx512fp16")]
22342    unsafe fn test_mm512_maskz_max_round_ph() {
22343        let a = _mm512_set1_ph(2.0);
22344        let b = _mm512_set1_ph(1.0);
22345        let r = _mm512_maskz_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22346            0b01010101010101010101010101010101,
22347            a,
22348            b,
22349        );
22350        let e = _mm512_set_ph(
22351            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22352            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22353        );
22354        assert_eq_m512h(r, e);
22355    }
22356
22357    #[simd_test(enable = "avx512fp16")]
22358    unsafe fn test_mm_max_sh() {
22359        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22360        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22361        let r = _mm_max_sh(a, b);
22362        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22363        assert_eq_m128h(r, e);
22364    }
22365
22366    #[simd_test(enable = "avx512fp16")]
22367    unsafe fn test_mm_mask_max_sh() {
22368        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22369        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22370        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22371        let r = _mm_mask_max_sh(src, 0, a, b);
22372        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22373        assert_eq_m128h(r, e);
22374        let r = _mm_mask_max_sh(src, 1, a, b);
22375        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22376        assert_eq_m128h(r, e);
22377    }
22378
22379    #[simd_test(enable = "avx512fp16")]
22380    unsafe fn test_mm_maskz_max_sh() {
22381        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22382        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22383        let r = _mm_maskz_max_sh(0, a, b);
22384        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22385        assert_eq_m128h(r, e);
22386        let r = _mm_maskz_max_sh(1, a, b);
22387        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22388        assert_eq_m128h(r, e);
22389    }
22390
22391    #[simd_test(enable = "avx512fp16")]
22392    unsafe fn test_mm_max_round_sh() {
22393        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22394        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22395        let r = _mm_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22396        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22397        assert_eq_m128h(r, e);
22398    }
22399
22400    #[simd_test(enable = "avx512fp16")]
22401    unsafe fn test_mm_mask_max_round_sh() {
22402        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22403        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22404        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22405        let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22406            src, 0, a, b,
22407        );
22408        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22409        assert_eq_m128h(r, e);
22410        let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22411            src, 1, a, b,
22412        );
22413        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22414        assert_eq_m128h(r, e);
22415    }
22416
22417    #[simd_test(enable = "avx512fp16")]
22418    unsafe fn test_mm_maskz_max_round_sh() {
22419        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22420        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22421        let r =
22422            _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
22423        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22424        assert_eq_m128h(r, e);
22425        let r =
22426            _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
22427        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22428        assert_eq_m128h(r, e);
22429    }
22430
22431    #[simd_test(enable = "avx512fp16,avx512vl")]
22432    unsafe fn test_mm_min_ph() {
22433        let a = _mm_set1_ph(2.0);
22434        let b = _mm_set1_ph(1.0);
22435        let r = _mm_min_ph(a, b);
22436        let e = _mm_set1_ph(1.0);
22437        assert_eq_m128h(r, e);
22438    }
22439
22440    #[simd_test(enable = "avx512fp16,avx512vl")]
22441    unsafe fn test_mm_mask_min_ph() {
22442        let a = _mm_set1_ph(2.0);
22443        let b = _mm_set1_ph(1.0);
22444        let src = _mm_set1_ph(3.0);
22445        let r = _mm_mask_min_ph(src, 0b01010101, a, b);
22446        let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0);
22447        assert_eq_m128h(r, e);
22448    }
22449
22450    #[simd_test(enable = "avx512fp16,avx512vl")]
22451    unsafe fn test_mm_maskz_min_ph() {
22452        let a = _mm_set1_ph(2.0);
22453        let b = _mm_set1_ph(1.0);
22454        let r = _mm_maskz_min_ph(0b01010101, a, b);
22455        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
22456        assert_eq_m128h(r, e);
22457    }
22458
22459    #[simd_test(enable = "avx512fp16,avx512vl")]
22460    unsafe fn test_mm256_min_ph() {
22461        let a = _mm256_set1_ph(2.0);
22462        let b = _mm256_set1_ph(1.0);
22463        let r = _mm256_min_ph(a, b);
22464        let e = _mm256_set1_ph(1.0);
22465        assert_eq_m256h(r, e);
22466    }
22467
22468    #[simd_test(enable = "avx512fp16,avx512vl")]
22469    unsafe fn test_mm256_mask_min_ph() {
22470        let a = _mm256_set1_ph(2.0);
22471        let b = _mm256_set1_ph(1.0);
22472        let src = _mm256_set1_ph(3.0);
22473        let r = _mm256_mask_min_ph(src, 0b0101010101010101, a, b);
22474        let e = _mm256_set_ph(
22475            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
22476        );
22477        assert_eq_m256h(r, e);
22478    }
22479
22480    #[simd_test(enable = "avx512fp16,avx512vl")]
22481    unsafe fn test_mm256_maskz_min_ph() {
22482        let a = _mm256_set1_ph(2.0);
22483        let b = _mm256_set1_ph(1.0);
22484        let r = _mm256_maskz_min_ph(0b0101010101010101, a, b);
22485        let e = _mm256_set_ph(
22486            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22487        );
22488        assert_eq_m256h(r, e);
22489    }
22490
22491    #[simd_test(enable = "avx512fp16")]
22492    unsafe fn test_mm512_min_ph() {
22493        let a = _mm512_set1_ph(2.0);
22494        let b = _mm512_set1_ph(1.0);
22495        let r = _mm512_min_ph(a, b);
22496        let e = _mm512_set1_ph(1.0);
22497        assert_eq_m512h(r, e);
22498    }
22499
22500    #[simd_test(enable = "avx512fp16")]
22501    unsafe fn test_mm512_mask_min_ph() {
22502        let a = _mm512_set1_ph(2.0);
22503        let b = _mm512_set1_ph(1.0);
22504        let src = _mm512_set1_ph(3.0);
22505        let r = _mm512_mask_min_ph(src, 0b01010101010101010101010101010101, a, b);
22506        let e = _mm512_set_ph(
22507            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
22508            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
22509        );
22510        assert_eq_m512h(r, e);
22511    }
22512
22513    #[simd_test(enable = "avx512fp16")]
22514    unsafe fn test_mm512_maskz_min_ph() {
22515        let a = _mm512_set1_ph(2.0);
22516        let b = _mm512_set1_ph(1.0);
22517        let r = _mm512_maskz_min_ph(0b01010101010101010101010101010101, a, b);
22518        let e = _mm512_set_ph(
22519            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22520            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22521        );
22522        assert_eq_m512h(r, e);
22523    }
22524
22525    #[simd_test(enable = "avx512fp16")]
22526    unsafe fn test_mm512_min_round_ph() {
22527        let a = _mm512_set1_ph(2.0);
22528        let b = _mm512_set1_ph(1.0);
22529        let r = _mm512_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22530        let e = _mm512_set1_ph(1.0);
22531        assert_eq_m512h(r, e);
22532    }
22533
22534    #[simd_test(enable = "avx512fp16")]
22535    unsafe fn test_mm512_mask_min_round_ph() {
22536        let a = _mm512_set1_ph(2.0);
22537        let b = _mm512_set1_ph(1.0);
22538        let src = _mm512_set1_ph(3.0);
22539        let r = _mm512_mask_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22540            src,
22541            0b01010101010101010101010101010101,
22542            a,
22543            b,
22544        );
22545        let e = _mm512_set_ph(
22546            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
22547            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
22548        );
22549        assert_eq_m512h(r, e);
22550    }
22551
22552    #[simd_test(enable = "avx512fp16")]
22553    unsafe fn test_mm512_maskz_min_round_ph() {
22554        let a = _mm512_set1_ph(2.0);
22555        let b = _mm512_set1_ph(1.0);
22556        let r = _mm512_maskz_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22557            0b01010101010101010101010101010101,
22558            a,
22559            b,
22560        );
22561        let e = _mm512_set_ph(
22562            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22563            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22564        );
22565        assert_eq_m512h(r, e);
22566    }
22567
22568    #[simd_test(enable = "avx512fp16")]
22569    unsafe fn test_mm_min_sh() {
22570        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22571        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22572        let r = _mm_min_sh(a, b);
22573        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22574        assert_eq_m128h(r, e);
22575    }
22576
22577    #[simd_test(enable = "avx512fp16")]
22578    unsafe fn test_mm_mask_min_sh() {
22579        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22580        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22581        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22582        let r = _mm_mask_min_sh(src, 0, a, b);
22583        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22584        assert_eq_m128h(r, e);
22585        let r = _mm_mask_min_sh(src, 1, a, b);
22586        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22587        assert_eq_m128h(r, e);
22588    }
22589
22590    #[simd_test(enable = "avx512fp16")]
22591    unsafe fn test_mm_maskz_min_sh() {
22592        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22593        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22594        let r = _mm_maskz_min_sh(0, a, b);
22595        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22596        assert_eq_m128h(r, e);
22597        let r = _mm_maskz_min_sh(1, a, b);
22598        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22599        assert_eq_m128h(r, e);
22600    }
22601
22602    #[simd_test(enable = "avx512fp16")]
22603    unsafe fn test_mm_min_round_sh() {
22604        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22605        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22606        let r = _mm_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22607        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22608        assert_eq_m128h(r, e);
22609    }
22610
22611    #[simd_test(enable = "avx512fp16")]
22612    unsafe fn test_mm_mask_min_round_sh() {
22613        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22614        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22615        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22616        let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22617            src, 0, a, b,
22618        );
22619        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22620        assert_eq_m128h(r, e);
22621        let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22622            src, 1, a, b,
22623        );
22624        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22625        assert_eq_m128h(r, e);
22626    }
22627
22628    #[simd_test(enable = "avx512fp16")]
22629    unsafe fn test_mm_maskz_min_round_sh() {
22630        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22631        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22632        let r =
22633            _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
22634        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22635        assert_eq_m128h(r, e);
22636        let r =
22637            _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
22638        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22639        assert_eq_m128h(r, e);
22640    }
22641
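    // getexp returns the unbiased exponent of each element as an f16 value, i.e.
    // floor(log2(|x|)): 3.0 == 1.5 * 2^1, so the expected result is 1.0.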
22642    #[simd_test(enable = "avx512fp16,avx512vl")]
22643    unsafe fn test_mm_getexp_ph() {
22644        let a = _mm_set1_ph(3.0);
22645        let r = _mm_getexp_ph(a);
22646        let e = _mm_set1_ph(1.0);
22647        assert_eq_m128h(r, e);
22648    }
22649
22650    #[simd_test(enable = "avx512fp16,avx512vl")]
22651    unsafe fn test_mm_mask_getexp_ph() {
22652        let a = _mm_set1_ph(3.0);
22653        let src = _mm_set1_ph(4.0);
22654        let r = _mm_mask_getexp_ph(src, 0b01010101, a);
22655        let e = _mm_set_ph(4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0);
22656        assert_eq_m128h(r, e);
22657    }
22658
22659    #[simd_test(enable = "avx512fp16,avx512vl")]
22660    unsafe fn test_mm_maskz_getexp_ph() {
22661        let a = _mm_set1_ph(3.0);
22662        let r = _mm_maskz_getexp_ph(0b01010101, a);
22663        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
22664        assert_eq_m128h(r, e);
22665    }
22666
22667    #[simd_test(enable = "avx512fp16,avx512vl")]
22668    unsafe fn test_mm256_getexp_ph() {
22669        let a = _mm256_set1_ph(3.0);
22670        let r = _mm256_getexp_ph(a);
22671        let e = _mm256_set1_ph(1.0);
22672        assert_eq_m256h(r, e);
22673    }
22674
22675    #[simd_test(enable = "avx512fp16,avx512vl")]
22676    unsafe fn test_mm256_mask_getexp_ph() {
22677        let a = _mm256_set1_ph(3.0);
22678        let src = _mm256_set1_ph(4.0);
22679        let r = _mm256_mask_getexp_ph(src, 0b0101010101010101, a);
22680        let e = _mm256_set_ph(
22681            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
22682        );
22683        assert_eq_m256h(r, e);
22684    }
22685
22686    #[simd_test(enable = "avx512fp16,avx512vl")]
22687    unsafe fn test_mm256_maskz_getexp_ph() {
22688        let a = _mm256_set1_ph(3.0);
22689        let r = _mm256_maskz_getexp_ph(0b0101010101010101, a);
22690        let e = _mm256_set_ph(
22691            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22692        );
22693        assert_eq_m256h(r, e);
22694    }
22695
22696    #[simd_test(enable = "avx512fp16")]
22697    unsafe fn test_mm512_getexp_ph() {
22698        let a = _mm512_set1_ph(3.0);
22699        let r = _mm512_getexp_ph(a);
22700        let e = _mm512_set1_ph(1.0);
22701        assert_eq_m512h(r, e);
22702    }
22703
22704    #[simd_test(enable = "avx512fp16")]
22705    unsafe fn test_mm512_mask_getexp_ph() {
22706        let a = _mm512_set1_ph(3.0);
22707        let src = _mm512_set1_ph(4.0);
22708        let r = _mm512_mask_getexp_ph(src, 0b01010101010101010101010101010101, a);
22709        let e = _mm512_set_ph(
22710            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0,
22711            1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
22712        );
22713        assert_eq_m512h(r, e);
22714    }
22715
22716    #[simd_test(enable = "avx512fp16")]
22717    unsafe fn test_mm512_maskz_getexp_ph() {
22718        let a = _mm512_set1_ph(3.0);
22719        let r = _mm512_maskz_getexp_ph(0b01010101010101010101010101010101, a);
22720        let e = _mm512_set_ph(
22721            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22722            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22723        );
22724        assert_eq_m512h(r, e);
22725    }
22726
22727    #[simd_test(enable = "avx512fp16")]
22728    unsafe fn test_mm512_getexp_round_ph() {
22729        let a = _mm512_set1_ph(3.0);
22730        let r = _mm512_getexp_round_ph::<_MM_FROUND_NO_EXC>(a);
22731        let e = _mm512_set1_ph(1.0);
22732        assert_eq_m512h(r, e);
22733    }
22734
22735    #[simd_test(enable = "avx512fp16")]
22736    unsafe fn test_mm512_mask_getexp_round_ph() {
22737        let a = _mm512_set1_ph(3.0);
22738        let src = _mm512_set1_ph(4.0);
22739        let r = _mm512_mask_getexp_round_ph::<_MM_FROUND_NO_EXC>(
22740            src,
22741            0b01010101010101010101010101010101,
22742            a,
22743        );
22744        let e = _mm512_set_ph(
22745            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0,
22746            1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
22747        );
22748        assert_eq_m512h(r, e);
22749    }
22750
22751    #[simd_test(enable = "avx512fp16")]
22752    unsafe fn test_mm512_maskz_getexp_round_ph() {
22753        let a = _mm512_set1_ph(3.0);
22754        let r = _mm512_maskz_getexp_round_ph::<_MM_FROUND_NO_EXC>(
22755            0b01010101010101010101010101010101,
22756            a,
22757        );
22758        let e = _mm512_set_ph(
22759            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22760            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22761        );
22762        assert_eq_m512h(r, e);
22763    }
22764
22765    #[simd_test(enable = "avx512fp16")]
22766    unsafe fn test_mm_getexp_sh() {
22767        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22768        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22769        let r = _mm_getexp_sh(a, b);
22770        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22771        assert_eq_m128h(r, e);
22772    }
22773
22774    #[simd_test(enable = "avx512fp16")]
22775    unsafe fn test_mm_mask_getexp_sh() {
22776        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22777        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22778        let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.);
22779        let r = _mm_mask_getexp_sh(src, 0, a, b);
22780        let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22781        assert_eq_m128h(r, e);
22782        let r = _mm_mask_getexp_sh(src, 1, a, b);
22783        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22784        assert_eq_m128h(r, e);
22785    }
22786
22787    #[simd_test(enable = "avx512fp16")]
22788    unsafe fn test_mm_maskz_getexp_sh() {
22789        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22790        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22791        let r = _mm_maskz_getexp_sh(0, a, b);
22792        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
22793        assert_eq_m128h(r, e);
22794        let r = _mm_maskz_getexp_sh(1, a, b);
22795        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22796        assert_eq_m128h(r, e);
22797    }
22798
22799    #[simd_test(enable = "avx512fp16")]
22800    unsafe fn test_mm_getexp_round_sh() {
22801        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22802        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22803        let r = _mm_getexp_round_sh::<_MM_FROUND_NO_EXC>(a, b);
22804        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22805        assert_eq_m128h(r, e);
22806    }
22807
22808    #[simd_test(enable = "avx512fp16")]
22809    unsafe fn test_mm_mask_getexp_round_sh() {
22810        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22811        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22812        let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.);
22813        let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 0, a, b);
22814        let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22815        assert_eq_m128h(r, e);
22816        let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 1, a, b);
22817        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22818        assert_eq_m128h(r, e);
22819    }
22820
22821    #[simd_test(enable = "avx512fp16")]
22822    unsafe fn test_mm_maskz_getexp_round_sh() {
22823        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22824        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22825        let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(0, a, b);
22826        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
22827        assert_eq_m128h(r, e);
22828        let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(1, a, b);
22829        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22830        assert_eq_m128h(r, e);
22831    }
22832
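    // With _MM_MANT_NORM_P75_1P5 the mantissa is normalized into [0.75, 1.5), so
    // 10.0 == 1.25 * 2^3 yields 1.25. _MM_MANT_SIGN_NAN only matters for negative
    // inputs, which these tests do not use.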
22833    #[simd_test(enable = "avx512fp16,avx512vl")]
22834    unsafe fn test_mm_getmant_ph() {
22835        let a = _mm_set1_ph(10.0);
22836        let r = _mm_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
22837        let e = _mm_set1_ph(1.25);
22838        assert_eq_m128h(r, e);
22839    }
22840
22841    #[simd_test(enable = "avx512fp16,avx512vl")]
22842    unsafe fn test_mm_mask_getmant_ph() {
22843        let a = _mm_set1_ph(10.0);
22844        let src = _mm_set1_ph(20.0);
22845        let r = _mm_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0b01010101, a);
22846        let e = _mm_set_ph(20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25);
22847        assert_eq_m128h(r, e);
22848    }
22849
22850    #[simd_test(enable = "avx512fp16,avx512vl")]
22851    unsafe fn test_mm_maskz_getmant_ph() {
22852        let a = _mm_set1_ph(10.0);
22853        let r = _mm_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0b01010101, a);
22854        let e = _mm_set_ph(0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25);
22855        assert_eq_m128h(r, e);
22856    }
22857
22858    #[simd_test(enable = "avx512fp16,avx512vl")]
22859    unsafe fn test_mm256_getmant_ph() {
22860        let a = _mm256_set1_ph(10.0);
22861        let r = _mm256_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
22862        let e = _mm256_set1_ph(1.25);
22863        assert_eq_m256h(r, e);
22864    }
22865
22866    #[simd_test(enable = "avx512fp16,avx512vl")]
22867    unsafe fn test_mm256_mask_getmant_ph() {
22868        let a = _mm256_set1_ph(10.0);
22869        let src = _mm256_set1_ph(20.0);
22870        let r = _mm256_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
22871            src,
22872            0b0101010101010101,
22873            a,
22874        );
22875        let e = _mm256_set_ph(
22876            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
22877            20.0, 1.25,
22878        );
22879        assert_eq_m256h(r, e);
22880    }
22881
22882    #[simd_test(enable = "avx512fp16,avx512vl")]
22883    unsafe fn test_mm256_maskz_getmant_ph() {
22884        let a = _mm256_set1_ph(10.0);
22885        let r = _mm256_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
22886            0b0101010101010101,
22887            a,
22888        );
22889        let e = _mm256_set_ph(
22890            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
22891        );
22892        assert_eq_m256h(r, e);
22893    }
22894
22895    #[simd_test(enable = "avx512fp16")]
22896    unsafe fn test_mm512_getmant_ph() {
22897        let a = _mm512_set1_ph(10.0);
22898        let r = _mm512_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
22899        let e = _mm512_set1_ph(1.25);
22900        assert_eq_m512h(r, e);
22901    }
22902
22903    #[simd_test(enable = "avx512fp16")]
22904    unsafe fn test_mm512_mask_getmant_ph() {
22905        let a = _mm512_set1_ph(10.0);
22906        let src = _mm512_set1_ph(20.0);
22907        let r = _mm512_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
22908            src,
22909            0b01010101010101010101010101010101,
22910            a,
22911        );
22912        let e = _mm512_set_ph(
22913            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
22914            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
22915            20.0, 1.25, 20.0, 1.25,
22916        );
22917        assert_eq_m512h(r, e);
22918    }
22919
22920    #[simd_test(enable = "avx512fp16")]
22921    unsafe fn test_mm512_maskz_getmant_ph() {
22922        let a = _mm512_set1_ph(10.0);
22923        let r = _mm512_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
22924            0b01010101010101010101010101010101,
22925            a,
22926        );
22927        let e = _mm512_set_ph(
22928            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
22929            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
22930        );
22931        assert_eq_m512h(r, e);
22932    }
22933
22934    #[simd_test(enable = "avx512fp16")]
22935    unsafe fn test_mm512_getmant_round_ph() {
22936        let a = _mm512_set1_ph(10.0);
22937        let r =
22938            _mm512_getmant_round_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>(
22939                a,
22940            );
22941        let e = _mm512_set1_ph(1.25);
22942        assert_eq_m512h(r, e);
22943    }
22944
22945    #[simd_test(enable = "avx512fp16")]
22946    unsafe fn test_mm512_mask_getmant_round_ph() {
22947        let a = _mm512_set1_ph(10.0);
22948        let src = _mm512_set1_ph(20.0);
22949        let r = _mm512_mask_getmant_round_ph::<
22950            _MM_MANT_NORM_P75_1P5,
22951            _MM_MANT_SIGN_NAN,
22952            _MM_FROUND_NO_EXC,
22953        >(src, 0b01010101010101010101010101010101, a);
22954        let e = _mm512_set_ph(
22955            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
22956            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
22957            20.0, 1.25, 20.0, 1.25,
22958        );
22959        assert_eq_m512h(r, e);
22960    }
22961
22962    #[simd_test(enable = "avx512fp16")]
22963    unsafe fn test_mm512_maskz_getmant_round_ph() {
22964        let a = _mm512_set1_ph(10.0);
22965        let r = _mm512_maskz_getmant_round_ph::<
22966            _MM_MANT_NORM_P75_1P5,
22967            _MM_MANT_SIGN_NAN,
22968            _MM_FROUND_NO_EXC,
22969        >(0b01010101010101010101010101010101, a);
22970        let e = _mm512_set_ph(
22971            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
22972            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
22973        );
22974        assert_eq_m512h(r, e);
22975    }
22976
22977    #[simd_test(enable = "avx512fp16")]
22978    unsafe fn test_mm_getmant_sh() {
22979        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
22980        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
22981        let r = _mm_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a, b);
22982        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
22983        assert_eq_m128h(r, e);
22984    }
22985
22986    #[simd_test(enable = "avx512fp16")]
22987    unsafe fn test_mm_mask_getmant_sh() {
22988        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
22989        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
22990        let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.);
22991        let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0, a, b);
22992        let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.);
22993        assert_eq_m128h(r, e);
22994        let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 1, a, b);
22995        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
22996        assert_eq_m128h(r, e);
22997    }
22998
22999    #[simd_test(enable = "avx512fp16")]
23000    unsafe fn test_mm_maskz_getmant_sh() {
23001        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
23002        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
23003        let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0, a, b);
23004        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23005        assert_eq_m128h(r, e);
23006        let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(1, a, b);
23007        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23008        assert_eq_m128h(r, e);
23009    }
23010
23011    #[simd_test(enable = "avx512fp16")]
23012    unsafe fn test_mm_getmant_round_sh() {
23013        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
23014        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
23015        let r = _mm_getmant_round_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>(
23016            a, b,
23017        );
23018        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23019        assert_eq_m128h(r, e);
23020    }
23021
23022    #[simd_test(enable = "avx512fp16")]
23023    unsafe fn test_mm_mask_getmant_round_sh() {
23024        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
23025        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
23026        let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.);
23027        let r = _mm_mask_getmant_round_sh::<
23028            _MM_MANT_NORM_P75_1P5,
23029            _MM_MANT_SIGN_NAN,
23030            _MM_FROUND_NO_EXC,
23031        >(src, 0, a, b);
23032        let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.);
23033        assert_eq_m128h(r, e);
23034        let r = _mm_mask_getmant_round_sh::<
23035            _MM_MANT_NORM_P75_1P5,
23036            _MM_MANT_SIGN_NAN,
23037            _MM_FROUND_NO_EXC,
23038        >(src, 1, a, b);
23039        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23040        assert_eq_m128h(r, e);
23041    }
23042
23043    #[simd_test(enable = "avx512fp16")]
23044    unsafe fn test_mm_maskz_getmant_round_sh() {
23045        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
23046        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
23047        let r = _mm_maskz_getmant_round_sh::<
23048            _MM_MANT_NORM_P75_1P5,
23049            _MM_MANT_SIGN_NAN,
23050            _MM_FROUND_NO_EXC,
23051        >(0, a, b);
23052        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23053        assert_eq_m128h(r, e);
23054        let r = _mm_maskz_getmant_round_sh::<
23055            _MM_MANT_NORM_P75_1P5,
23056            _MM_MANT_SIGN_NAN,
23057            _MM_FROUND_NO_EXC,
23058        >(1, a, b);
23059        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23060        assert_eq_m128h(r, e);
23061    }
23062
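    // roundscale with IMM8 == 0 preserves zero fraction bits, i.e. it rounds each element
    // to the nearest integer, so 1.1 rounds to 1.0.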
23063    #[simd_test(enable = "avx512fp16,avx512vl")]
23064    unsafe fn test_mm_roundscale_ph() {
23065        let a = _mm_set1_ph(1.1);
23066        let r = _mm_roundscale_ph::<0>(a);
23067        let e = _mm_set1_ph(1.0);
23068        assert_eq_m128h(r, e);
23069    }
23070
23071    #[simd_test(enable = "avx512fp16,avx512vl")]
23072    unsafe fn test_mm_mask_roundscale_ph() {
23073        let a = _mm_set1_ph(1.1);
23074        let src = _mm_set1_ph(2.0);
23075        let r = _mm_mask_roundscale_ph::<0>(src, 0b01010101, a);
23076        let e = _mm_set_ph(2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0);
23077        assert_eq_m128h(r, e);
23078    }
23079
23080    #[simd_test(enable = "avx512fp16,avx512vl")]
23081    unsafe fn test_mm_maskz_roundscale_ph() {
23082        let a = _mm_set1_ph(1.1);
23083        let r = _mm_maskz_roundscale_ph::<0>(0b01010101, a);
23084        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
23085        assert_eq_m128h(r, e);
23086    }
23087
23088    #[simd_test(enable = "avx512fp16,avx512vl")]
23089    unsafe fn test_mm256_roundscale_ph() {
23090        let a = _mm256_set1_ph(1.1);
23091        let r = _mm256_roundscale_ph::<0>(a);
23092        let e = _mm256_set1_ph(1.0);
23093        assert_eq_m256h(r, e);
23094    }
23095
23096    #[simd_test(enable = "avx512fp16,avx512vl")]
23097    unsafe fn test_mm256_mask_roundscale_ph() {
23098        let a = _mm256_set1_ph(1.1);
23099        let src = _mm256_set1_ph(2.0);
23100        let r = _mm256_mask_roundscale_ph::<0>(src, 0b0101010101010101, a);
23101        let e = _mm256_set_ph(
23102            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
23103        );
23104        assert_eq_m256h(r, e);
23105    }
23106
23107    #[simd_test(enable = "avx512fp16,avx512vl")]
23108    unsafe fn test_mm256_maskz_roundscale_ph() {
23109        let a = _mm256_set1_ph(1.1);
23110        let r = _mm256_maskz_roundscale_ph::<0>(0b0101010101010101, a);
23111        let e = _mm256_set_ph(
23112            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
23113        );
23114        assert_eq_m256h(r, e);
23115    }
23116
23117    #[simd_test(enable = "avx512fp16")]
23118    unsafe fn test_mm512_roundscale_ph() {
23119        let a = _mm512_set1_ph(1.1);
23120        let r = _mm512_roundscale_ph::<0>(a);
23121        let e = _mm512_set1_ph(1.0);
23122        assert_eq_m512h(r, e);
23123    }
23124
23125    #[simd_test(enable = "avx512fp16")]
23126    unsafe fn test_mm512_mask_roundscale_ph() {
23127        let a = _mm512_set1_ph(1.1);
23128        let src = _mm512_set1_ph(2.0);
23129        let r = _mm512_mask_roundscale_ph::<0>(src, 0b01010101010101010101010101010101, a);
23130        let e = _mm512_set_ph(
23131            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
23132            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
23133        );
23134        assert_eq_m512h(r, e);
23135    }
23136
23137    #[simd_test(enable = "avx512fp16")]
23138    unsafe fn test_mm512_maskz_roundscale_ph() {
23139        let a = _mm512_set1_ph(1.1);
23140        let r = _mm512_maskz_roundscale_ph::<0>(0b01010101010101010101010101010101, a);
23141        let e = _mm512_set_ph(
23142            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
23143            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
23144        );
23145        assert_eq_m512h(r, e);
23146    }
23147
23148    #[simd_test(enable = "avx512fp16")]
23149    unsafe fn test_mm512_roundscale_round_ph() {
23150        let a = _mm512_set1_ph(1.1);
23151        let r = _mm512_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(a);
23152        let e = _mm512_set1_ph(1.0);
23153        assert_eq_m512h(r, e);
23154    }
23155
23156    #[simd_test(enable = "avx512fp16")]
23157    unsafe fn test_mm512_mask_roundscale_round_ph() {
23158        let a = _mm512_set1_ph(1.1);
23159        let src = _mm512_set1_ph(2.0);
23160        let r = _mm512_mask_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(
23161            src,
23162            0b01010101010101010101010101010101,
23163            a,
23164        );
23165        let e = _mm512_set_ph(
23166            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
23167            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
23168        );
23169        assert_eq_m512h(r, e);
23170    }
23171
23172    #[simd_test(enable = "avx512fp16")]
23173    unsafe fn test_mm512_maskz_roundscale_round_ph() {
23174        let a = _mm512_set1_ph(1.1);
23175        let r = _mm512_maskz_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(
23176            0b01010101010101010101010101010101,
23177            a,
23178        );
23179        let e = _mm512_set_ph(
23180            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
23181            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
23182        );
23183        assert_eq_m512h(r, e);
23184    }
23185
23186    #[simd_test(enable = "avx512fp16")]
23187    unsafe fn test_mm_roundscale_sh() {
23188        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23189        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23190        let r = _mm_roundscale_sh::<0>(a, b);
23191        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23192        assert_eq_m128h(r, e);
23193    }
23194
23195    #[simd_test(enable = "avx512fp16")]
23196    unsafe fn test_mm_mask_roundscale_sh() {
23197        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23198        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23199        let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
23200        let r = _mm_mask_roundscale_sh::<0>(src, 0, a, b);
23201        let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23202        assert_eq_m128h(r, e);
23203        let r = _mm_mask_roundscale_sh::<0>(src, 1, a, b);
23204        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23205        assert_eq_m128h(r, e);
23206    }
23207
23208    #[simd_test(enable = "avx512fp16")]
23209    unsafe fn test_mm_maskz_roundscale_sh() {
23210        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23211        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23212        let r = _mm_maskz_roundscale_sh::<0>(0, a, b);
23213        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23214        assert_eq_m128h(r, e);
23215        let r = _mm_maskz_roundscale_sh::<0>(1, a, b);
23216        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23217        assert_eq_m128h(r, e);
23218    }
23219
23220    #[simd_test(enable = "avx512fp16")]
23221    unsafe fn test_mm_roundscale_round_sh() {
23222        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23223        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23224        let r = _mm_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(a, b);
23225        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23226        assert_eq_m128h(r, e);
23227    }
23228
23229    #[simd_test(enable = "avx512fp16")]
23230    unsafe fn test_mm_mask_roundscale_round_sh() {
23231        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23232        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23233        let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
23234        let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 0, a, b);
23235        let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23236        assert_eq_m128h(r, e);
23237        let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 1, a, b);
23238        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23239        assert_eq_m128h(r, e);
23240    }
23241
23242    #[simd_test(enable = "avx512fp16")]
23243    unsafe fn test_mm_maskz_roundscale_round_sh() {
23244        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23245        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23246        let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(0, a, b);
23247        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23248        assert_eq_m128h(r, e);
23249        let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(1, a, b);
23250        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23251        assert_eq_m128h(r, e);
23252    }
23253
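    // scalef computes a * 2^floor(b) per element, so 1.0 * 2^floor(3.0) == 8.0.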
23254    #[simd_test(enable = "avx512fp16,avx512vl")]
23255    unsafe fn test_mm_scalef_ph() {
23256        let a = _mm_set1_ph(1.);
23257        let b = _mm_set1_ph(3.);
23258        let r = _mm_scalef_ph(a, b);
23259        let e = _mm_set1_ph(8.0);
23260        assert_eq_m128h(r, e);
23261    }
23262
23263    #[simd_test(enable = "avx512fp16,avx512vl")]
23264    unsafe fn test_mm_mask_scalef_ph() {
23265        let a = _mm_set1_ph(1.);
23266        let b = _mm_set1_ph(3.);
23267        let src = _mm_set1_ph(2.);
23268        let r = _mm_mask_scalef_ph(src, 0b01010101, a, b);
23269        let e = _mm_set_ph(2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0);
23270        assert_eq_m128h(r, e);
23271    }
23272
23273    #[simd_test(enable = "avx512fp16,avx512vl")]
23274    unsafe fn test_mm_maskz_scalef_ph() {
23275        let a = _mm_set1_ph(1.);
23276        let b = _mm_set1_ph(3.);
23277        let r = _mm_maskz_scalef_ph(0b01010101, a, b);
23278        let e = _mm_set_ph(0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0);
23279        assert_eq_m128h(r, e);
23280    }
23281
23282    #[simd_test(enable = "avx512fp16,avx512vl")]
23283    unsafe fn test_mm256_scalef_ph() {
23284        let a = _mm256_set1_ph(1.);
23285        let b = _mm256_set1_ph(3.);
23286        let r = _mm256_scalef_ph(a, b);
23287        let e = _mm256_set1_ph(8.0);
23288        assert_eq_m256h(r, e);
23289    }
23290
23291    #[simd_test(enable = "avx512fp16,avx512vl")]
23292    unsafe fn test_mm256_mask_scalef_ph() {
23293        let a = _mm256_set1_ph(1.);
23294        let b = _mm256_set1_ph(3.);
23295        let src = _mm256_set1_ph(2.);
23296        let r = _mm256_mask_scalef_ph(src, 0b0101010101010101, a, b);
23297        let e = _mm256_set_ph(
23298            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
23299        );
23300        assert_eq_m256h(r, e);
23301    }
23302
23303    #[simd_test(enable = "avx512fp16,avx512vl")]
23304    unsafe fn test_mm256_maskz_scalef_ph() {
23305        let a = _mm256_set1_ph(1.);
23306        let b = _mm256_set1_ph(3.);
23307        let r = _mm256_maskz_scalef_ph(0b0101010101010101, a, b);
23308        let e = _mm256_set_ph(
23309            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
23310        );
23311        assert_eq_m256h(r, e);
23312    }
23313
23314    #[simd_test(enable = "avx512fp16")]
23315    unsafe fn test_mm512_scalef_ph() {
23316        let a = _mm512_set1_ph(1.);
23317        let b = _mm512_set1_ph(3.);
23318        let r = _mm512_scalef_ph(a, b);
23319        let e = _mm512_set1_ph(8.0);
23320        assert_eq_m512h(r, e);
23321    }
23322
23323    #[simd_test(enable = "avx512fp16")]
23324    unsafe fn test_mm512_mask_scalef_ph() {
23325        let a = _mm512_set1_ph(1.);
23326        let b = _mm512_set1_ph(3.);
23327        let src = _mm512_set1_ph(2.);
23328        let r = _mm512_mask_scalef_ph(src, 0b01010101010101010101010101010101, a, b);
23329        let e = _mm512_set_ph(
23330            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0,
23331            8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
23332        );
23333        assert_eq_m512h(r, e);
23334    }
23335
23336    #[simd_test(enable = "avx512fp16")]
23337    unsafe fn test_mm512_maskz_scalef_ph() {
23338        let a = _mm512_set1_ph(1.);
23339        let b = _mm512_set1_ph(3.);
23340        let r = _mm512_maskz_scalef_ph(0b01010101010101010101010101010101, a, b);
23341        let e = _mm512_set_ph(
23342            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0,
23343            8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
23344        );
23345        assert_eq_m512h(r, e);
23346    }
23347
23348    #[simd_test(enable = "avx512fp16")]
23349    unsafe fn test_mm512_scalef_round_ph() {
23350        let a = _mm512_set1_ph(1.);
23351        let b = _mm512_set1_ph(3.);
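        // The _round variants take a rounding/SAE constant; _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC
        // selects round-to-nearest-even and suppresses floating-point exceptions.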
23352        let r = _mm512_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
23353        let e = _mm512_set1_ph(8.0);
23354        assert_eq_m512h(r, e);
23355    }
23356
23357    #[simd_test(enable = "avx512fp16")]
23358    unsafe fn test_mm512_mask_scalef_round_ph() {
23359        let a = _mm512_set1_ph(1.);
23360        let b = _mm512_set1_ph(3.);
23361        let src = _mm512_set1_ph(2.);
23362        let r = _mm512_mask_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
23363            src,
23364            0b01010101010101010101010101010101,
23365            a,
23366            b,
23367        );
23368        let e = _mm512_set_ph(
23369            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0,
23370            8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
23371        );
23372        assert_eq_m512h(r, e);
23373    }
23374
23375    #[simd_test(enable = "avx512fp16")]
23376    unsafe fn test_mm512_maskz_scalef_round_ph() {
23377        let a = _mm512_set1_ph(1.);
23378        let b = _mm512_set1_ph(3.);
23379        let r = _mm512_maskz_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
23380            0b01010101010101010101010101010101,
23381            a,
23382            b,
23383        );
23384        let e = _mm512_set_ph(
23385            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0,
23386            8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
23387        );
23388        assert_eq_m512h(r, e);
23389    }
23390
23391    #[simd_test(enable = "avx512fp16")]
23392    unsafe fn test_mm_scalef_sh() {
23393        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23394        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
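        // The scalar _sh form operates on the lowest lane only (1.0 * 2^3 = 8.0);
        // the upper seven lanes are copied from a.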
23395        let r = _mm_scalef_sh(a, b);
23396        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23397        assert_eq_m128h(r, e);
23398    }
23399
23400    #[simd_test(enable = "avx512fp16")]
23401    unsafe fn test_mm_mask_scalef_sh() {
23402        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23403        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23404        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
23405        let r = _mm_mask_scalef_sh(src, 0, a, b);
23406        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23407        assert_eq_m128h(r, e);
23408        let r = _mm_mask_scalef_sh(src, 1, a, b);
23409        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23410        assert_eq_m128h(r, e);
23411    }
23412
23413    #[simd_test(enable = "avx512fp16")]
23414    unsafe fn test_mm_maskz_scalef_sh() {
23415        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23416        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23417        let r = _mm_maskz_scalef_sh(0, a, b);
23418        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23419        assert_eq_m128h(r, e);
23420        let r = _mm_maskz_scalef_sh(1, a, b);
23421        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23422        assert_eq_m128h(r, e);
23423    }
23424
23425    #[simd_test(enable = "avx512fp16")]
23426    unsafe fn test_mm_scalef_round_sh() {
23427        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23428        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23429        let r = _mm_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
23430        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23431        assert_eq_m128h(r, e);
23432    }
23433
23434    #[simd_test(enable = "avx512fp16")]
23435    unsafe fn test_mm_mask_scalef_round_sh() {
23436        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23437        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23438        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
23439        let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
23440            src, 0, a, b,
23441        );
23442        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23443        assert_eq_m128h(r, e);
23444        let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
23445            src, 1, a, b,
23446        );
23447        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23448        assert_eq_m128h(r, e);
23449    }
23450
23451    #[simd_test(enable = "avx512fp16")]
23452    unsafe fn test_mm_maskz_scalef_round_sh() {
23453        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23454        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23455        let r =
23456            _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
23457        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23458        assert_eq_m128h(r, e);
23459        let r =
23460            _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
23461        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23462        assert_eq_m128h(r, e);
23463    }
23464
23465    #[simd_test(enable = "avx512fp16,avx512vl")]
23466    unsafe fn test_mm_reduce_ph() {
23467        let a = _mm_set1_ph(1.25);
23468        let r = _mm_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
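        // imm8[7:4] = 1 selects a scale of 2^1 and _MM_FROUND_TO_ZERO truncates,
        // so each lane becomes 1.25 - trunc(1.25 * 2) / 2 = 0.25.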
23469        let e = _mm_set1_ph(0.25);
23470        assert_eq_m128h(r, e);
23471    }
23472
23473    #[simd_test(enable = "avx512fp16,avx512vl")]
23474    unsafe fn test_mm_mask_reduce_ph() {
23475        let a = _mm_set1_ph(1.25);
23476        let src = _mm_set1_ph(2.0);
23477        let r = _mm_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01010101, a);
23478        let e = _mm_set_ph(2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25);
23479        assert_eq_m128h(r, e);
23480    }
23481
23482    #[simd_test(enable = "avx512fp16,avx512vl")]
23483    unsafe fn test_mm_maskz_reduce_ph() {
23484        let a = _mm_set1_ph(1.25);
23485        let r = _mm_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01010101, a);
23486        let e = _mm_set_ph(0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25);
23487        assert_eq_m128h(r, e);
23488    }
23489
23490    #[simd_test(enable = "avx512fp16,avx512vl")]
23491    unsafe fn test_mm256_reduce_ph() {
23492        let a = _mm256_set1_ph(1.25);
23493        let r = _mm256_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
23494        let e = _mm256_set1_ph(0.25);
23495        assert_eq_m256h(r, e);
23496    }
23497
23498    #[simd_test(enable = "avx512fp16,avx512vl")]
23499    unsafe fn test_mm256_mask_reduce_ph() {
23500        let a = _mm256_set1_ph(1.25);
23501        let src = _mm256_set1_ph(2.0);
23502        let r = _mm256_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0101010101010101, a);
23503        let e = _mm256_set_ph(
23504            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23505        );
23506        assert_eq_m256h(r, e);
23507    }
23508
23509    #[simd_test(enable = "avx512fp16,avx512vl")]
23510    unsafe fn test_mm256_maskz_reduce_ph() {
23511        let a = _mm256_set1_ph(1.25);
23512        let r = _mm256_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0101010101010101, a);
23513        let e = _mm256_set_ph(
23514            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23515        );
23516        assert_eq_m256h(r, e);
23517    }
23518
23519    #[simd_test(enable = "avx512fp16")]
23520    unsafe fn test_mm512_reduce_ph() {
23521        let a = _mm512_set1_ph(1.25);
23522        let r = _mm512_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
23523        let e = _mm512_set1_ph(0.25);
23524        assert_eq_m512h(r, e);
23525    }
23526
23527    #[simd_test(enable = "avx512fp16")]
23528    unsafe fn test_mm512_mask_reduce_ph() {
23529        let a = _mm512_set1_ph(1.25);
23530        let src = _mm512_set1_ph(2.0);
23531        let r = _mm512_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(
23532            src,
23533            0b01010101010101010101010101010101,
23534            a,
23535        );
23536        let e = _mm512_set_ph(
23537            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23538            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23539        );
23540        assert_eq_m512h(r, e);
23541    }
23542
23543    #[simd_test(enable = "avx512fp16")]
23544    unsafe fn test_mm512_maskz_reduce_ph() {
23545        let a = _mm512_set1_ph(1.25);
23546        let r = _mm512_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(
23547            0b01010101010101010101010101010101,
23548            a,
23549        );
23550        let e = _mm512_set_ph(
23551            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23552            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23553        );
23554        assert_eq_m512h(r, e);
23555    }
23556
23557    #[simd_test(enable = "avx512fp16")]
23558    unsafe fn test_mm512_reduce_round_ph() {
23559        let a = _mm512_set1_ph(1.25);
23560        let r = _mm512_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a);
23561        let e = _mm512_set1_ph(0.25);
23562        assert_eq_m512h(r, e);
23563    }
23564
23565    #[simd_test(enable = "avx512fp16")]
23566    unsafe fn test_mm512_mask_reduce_round_ph() {
23567        let a = _mm512_set1_ph(1.25);
23568        let src = _mm512_set1_ph(2.0);
23569        let r = _mm512_mask_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23570            src,
23571            0b01010101010101010101010101010101,
23572            a,
23573        );
23574        let e = _mm512_set_ph(
23575            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23576            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23577        );
23578        assert_eq_m512h(r, e);
23579    }
23580
23581    #[simd_test(enable = "avx512fp16")]
23582    unsafe fn test_mm512_maskz_reduce_round_ph() {
23583        let a = _mm512_set1_ph(1.25);
23584        let r = _mm512_maskz_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23585            0b01010101010101010101010101010101,
23586            a,
23587        );
23588        let e = _mm512_set_ph(
23589            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23590            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23591        );
23592        assert_eq_m512h(r, e);
23593    }
23594
23595    #[simd_test(enable = "avx512fp16")]
23596    unsafe fn test_mm_reduce_sh() {
23597        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23598        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23599        let r = _mm_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(a, b);
23600        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23601        assert_eq_m128h(r, e);
23602    }
23603
23604    #[simd_test(enable = "avx512fp16")]
23605    unsafe fn test_mm_mask_reduce_sh() {
23606        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23607        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23608        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
23609        let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0, a, b);
23610        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23611        assert_eq_m128h(r, e);
23612        let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 1, a, b);
23613        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23614        assert_eq_m128h(r, e);
23615    }
23616
23617    #[simd_test(enable = "avx512fp16")]
23618    unsafe fn test_mm_maskz_reduce_sh() {
23619        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23620        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23621        let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(0, a, b);
23622        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23623        assert_eq_m128h(r, e);
23624        let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(1, a, b);
23625        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23626        assert_eq_m128h(r, e);
23627    }
23628
23629    #[simd_test(enable = "avx512fp16")]
23630    unsafe fn test_mm_reduce_round_sh() {
23631        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23632        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23633        let r = _mm_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a, b);
23634        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23635        assert_eq_m128h(r, e);
23636    }
23637
23638    #[simd_test(enable = "avx512fp16")]
23639    unsafe fn test_mm_mask_reduce_round_sh() {
23640        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23641        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23642        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
23643        let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23644            src, 0, a, b,
23645        );
23646        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23647        assert_eq_m128h(r, e);
23648        let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23649            src, 1, a, b,
23650        );
23651        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23652        assert_eq_m128h(r, e);
23653    }
23654
23655    #[simd_test(enable = "avx512fp16")]
23656    unsafe fn test_mm_maskz_reduce_round_sh() {
23657        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23658        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23659        let r =
23660            _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(0, a, b);
23661        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23662        assert_eq_m128h(r, e);
23663        let r =
23664            _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(1, a, b);
23665        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23666        assert_eq_m128h(r, e);
23667    }
23668
23669    #[simd_test(enable = "avx512fp16,avx512vl")]
23670    unsafe fn test_mm_reduce_add_ph() {
23671        let a = _mm_set1_ph(2.0);
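        // _mm_reduce_add_ph returns the horizontal sum of all 8 lanes: 8 * 2.0 = 16.0.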
23672        let r = _mm_reduce_add_ph(a);
23673        assert_eq!(r, 16.0);
23674    }
23675
23676    #[simd_test(enable = "avx512fp16,avx512vl")]
23677    unsafe fn test_mm256_reduce_add_ph() {
23678        let a = _mm256_set1_ph(2.0);
23679        let r = _mm256_reduce_add_ph(a);
23680        assert_eq!(r, 32.0);
23681    }
23682
23683    #[simd_test(enable = "avx512fp16")]
23684    unsafe fn test_mm512_reduce_add_ph() {
23685        let a = _mm512_set1_ph(2.0);
23686        let r = _mm512_reduce_add_ph(a);
23687        assert_eq!(r, 64.0);
23688    }
23689
23690    #[simd_test(enable = "avx512fp16,avx512vl")]
23691    unsafe fn test_mm_reduce_mul_ph() {
23692        let a = _mm_set1_ph(2.0);
23693        let r = _mm_reduce_mul_ph(a);
23694        assert_eq!(r, 256.0);
23695    }
23696
23697    #[simd_test(enable = "avx512fp16,avx512vl")]
23698    unsafe fn test_mm256_reduce_mul_ph() {
23699        let a = _mm256_set1_ph(2.0);
23700        let r = _mm256_reduce_mul_ph(a);
23701        assert_eq!(r, 65536.0);
23702    }
23703
23704    #[simd_test(enable = "avx512fp16")]
23705    unsafe fn test_mm512_reduce_mul_ph() {
23706        let a = _mm512_set1_ph(2.0);
23707        let r = _mm512_reduce_mul_ph(a);
23708        assert_eq!(r, 16777216.0);
23709    }
23710
23711    #[simd_test(enable = "avx512fp16,avx512vl")]
23712    unsafe fn test_mm_reduce_max_ph() {
23713        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
23714        let r = _mm_reduce_max_ph(a);
23715        assert_eq!(r, 8.0);
23716    }
23717
23718    #[simd_test(enable = "avx512fp16,avx512vl")]
23719    unsafe fn test_mm256_reduce_max_ph() {
23720        let a = _mm256_set_ph(
23721            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23722        );
23723        let r = _mm256_reduce_max_ph(a);
23724        assert_eq!(r, 16.0);
23725    }
23726
23727    #[simd_test(enable = "avx512fp16")]
23728    unsafe fn test_mm512_reduce_max_ph() {
23729        let a = _mm512_set_ph(
23730            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23731            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
23732            31.0, 32.0,
23733        );
23734        let r = _mm512_reduce_max_ph(a);
23735        assert_eq!(r, 32.0);
23736    }
23737
23738    #[simd_test(enable = "avx512fp16,avx512vl")]
23739    unsafe fn test_mm_reduce_min_ph() {
23740        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
23741        let r = _mm_reduce_min_ph(a);
23742        assert_eq!(r, 1.0);
23743    }
23744
23745    #[simd_test(enable = "avx512fp16,avx512vl")]
23746    unsafe fn test_mm256_reduce_min_ph() {
23747        let a = _mm256_set_ph(
23748            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23749        );
23750        let r = _mm256_reduce_min_ph(a);
23751        assert_eq!(r, 1.0);
23752    }
23753
23754    #[simd_test(enable = "avx512fp16")]
23755    unsafe fn test_mm512_reduce_min_ph() {
23756        let a = _mm512_set_ph(
23757            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23758            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
23759            31.0, 32.0,
23760        );
23761        let r = _mm512_reduce_min_ph(a);
23762        assert_eq!(r, 1.0);
23763    }
23764
23765    #[simd_test(enable = "avx512fp16,avx512vl")]
23766    unsafe fn test_mm_fpclass_ph_mask() {
23767        let a = _mm_set_ph(
23768            1.,
23769            f16::INFINITY,
23770            f16::NEG_INFINITY,
23771            0.0,
23772            -0.0,
23773            -2.0,
23774            f16::NAN,
23775            5.9e-8, // Denormal
23776        );
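        // Category mask 0x18 = bit 3 (positive infinity) | bit 4 (negative infinity).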
23777        let r = _mm_fpclass_ph_mask::<0x18>(a); // infinities
23778        assert_eq!(r, 0b01100000);
23779    }
23780
23781    #[simd_test(enable = "avx512fp16,avx512vl")]
23782    unsafe fn test_mm_mask_fpclass_ph_mask() {
23783        let a = _mm_set_ph(
23784            1.,
23785            f16::INFINITY,
23786            f16::NEG_INFINITY,
23787            0.0,
23788            -0.0,
23789            -2.0,
23790            f16::NAN,
23791            5.9e-8, // Denormal
23792        );
23793        let r = _mm_mask_fpclass_ph_mask::<0x18>(0b01010101, a);
23794        assert_eq!(r, 0b01000000);
23795    }
23796
23797    #[simd_test(enable = "avx512fp16,avx512vl")]
23798    unsafe fn test_mm256_fpclass_ph_mask() {
23799        let a = _mm256_set_ph(
23800            1.,
23801            f16::INFINITY,
23802            f16::NEG_INFINITY,
23803            0.0,
23804            -0.0,
23805            -2.0,
23806            f16::NAN,
23807            5.9e-8, // Denormal
23808            1.,
23809            f16::INFINITY,
23810            f16::NEG_INFINITY,
23811            0.0,
23812            -0.0,
23813            -2.0,
23814            f16::NAN,
23815            5.9e-8, // Denormal
23816        );
23817        let r = _mm256_fpclass_ph_mask::<0x18>(a); // infinities
23818        assert_eq!(r, 0b0110000001100000);
23819    }
23820
23821    #[simd_test(enable = "avx512fp16,avx512vl")]
23822    unsafe fn test_mm256_mask_fpclass_ph_mask() {
23823        let a = _mm256_set_ph(
23824            1.,
23825            f16::INFINITY,
23826            f16::NEG_INFINITY,
23827            0.0,
23828            -0.0,
23829            -2.0,
23830            f16::NAN,
23831            5.9e-8, // Denormal
23832            1.,
23833            f16::INFINITY,
23834            f16::NEG_INFINITY,
23835            0.0,
23836            -0.0,
23837            -2.0,
23838            f16::NAN,
23839            5.9e-8, // Denormal
23840        );
23841        let r = _mm256_mask_fpclass_ph_mask::<0x18>(0b0101010101010101, a);
23842        assert_eq!(r, 0b0100000001000000);
23843    }
23844
23845    #[simd_test(enable = "avx512fp16")]
23846    unsafe fn test_mm512_fpclass_ph_mask() {
23847        let a = _mm512_set_ph(
23848            1.,
23849            f16::INFINITY,
23850            f16::NEG_INFINITY,
23851            0.0,
23852            -0.0,
23853            -2.0,
23854            f16::NAN,
23855            5.9e-8, // Denormal
23856            1.,
23857            f16::INFINITY,
23858            f16::NEG_INFINITY,
23859            0.0,
23860            -0.0,
23861            -2.0,
23862            f16::NAN,
23863            5.9e-8, // Denormal
23864            1.,
23865            f16::INFINITY,
23866            f16::NEG_INFINITY,
23867            0.0,
23868            -0.0,
23869            -2.0,
23870            f16::NAN,
23871            5.9e-8, // Denormal
23872            1.,
23873            f16::INFINITY,
23874            f16::NEG_INFINITY,
23875            0.0,
23876            -0.0,
23877            -2.0,
23878            f16::NAN,
23879            5.9e-8, // Denormal
23880        );
23881        let r = _mm512_fpclass_ph_mask::<0x18>(a); // infinities
23882        assert_eq!(r, 0b01100000011000000110000001100000);
23883    }
23884
23885    #[simd_test(enable = "avx512fp16")]
23886    unsafe fn test_mm512_mask_fpclass_ph_mask() {
23887        let a = _mm512_set_ph(
23888            1.,
23889            f16::INFINITY,
23890            f16::NEG_INFINITY,
23891            0.0,
23892            -0.0,
23893            -2.0,
23894            f16::NAN,
23895            5.9e-8, // Denormal
23896            1.,
23897            f16::INFINITY,
23898            f16::NEG_INFINITY,
23899            0.0,
23900            -0.0,
23901            -2.0,
23902            f16::NAN,
23903            5.9e-8, // Denormal
23904            1.,
23905            f16::INFINITY,
23906            f16::NEG_INFINITY,
23907            0.0,
23908            -0.0,
23909            -2.0,
23910            f16::NAN,
23911            5.9e-8, // Denormal
23912            1.,
23913            f16::INFINITY,
23914            f16::NEG_INFINITY,
23915            0.0,
23916            -0.0,
23917            -2.0,
23918            f16::NAN,
23919            5.9e-8, // Denormal
23920        );
23921        let r = _mm512_mask_fpclass_ph_mask::<0x18>(0b01010101010101010101010101010101, a);
23922        assert_eq!(r, 0b01000000010000000100000001000000);
23923    }
23924
23925    #[simd_test(enable = "avx512fp16")]
23926    unsafe fn test_mm_fpclass_sh_mask() {
23927        let a = _mm_set_sh(f16::INFINITY);
23928        let r = _mm_fpclass_sh_mask::<0x18>(a);
23929        assert_eq!(r, 1);
23930    }
23931
23932    #[simd_test(enable = "avx512fp16")]
23933    unsafe fn test_mm_mask_fpclass_sh_mask() {
23934        let a = _mm_set_sh(f16::INFINITY);
23935        let r = _mm_mask_fpclass_sh_mask::<0x18>(0, a);
23936        assert_eq!(r, 0);
23937        let r = _mm_mask_fpclass_sh_mask::<0x18>(1, a);
23938        assert_eq!(r, 1);
23939    }
23940
23941    #[simd_test(enable = "avx512fp16,avx512vl")]
23942    unsafe fn test_mm_mask_blend_ph() {
23943        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
23944        let b = _mm_set_ph(-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0);
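        // blend takes each lane from b where the corresponding mask bit is set, and from a otherwise.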
23945        let r = _mm_mask_blend_ph(0b01010101, a, b);
23946        let e = _mm_set_ph(1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0);
23947        assert_eq_m128h(r, e);
23948    }
23949
23950    #[simd_test(enable = "avx512fp16,avx512vl")]
23951    unsafe fn test_mm256_mask_blend_ph() {
23952        let a = _mm256_set_ph(
23953            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23954        );
23955        let b = _mm256_set_ph(
23956            -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0,
23957            -14.0, -15.0, -16.0,
23958        );
23959        let r = _mm256_mask_blend_ph(0b0101010101010101, a, b);
23960        let e = _mm256_set_ph(
23961            1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0,
23962            -16.0,
23963        );
23964        assert_eq_m256h(r, e);
23965    }
23966
23967    #[simd_test(enable = "avx512fp16")]
23968    unsafe fn test_mm512_mask_blend_ph() {
23969        let a = _mm512_set_ph(
23970            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23971            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
23972            31.0, 32.0,
23973        );
23974        let b = _mm512_set_ph(
23975            -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0,
23976            -14.0, -15.0, -16.0, -17.0, -18.0, -19.0, -20.0, -21.0, -22.0, -23.0, -24.0, -25.0,
23977            -26.0, -27.0, -28.0, -29.0, -30.0, -31.0, -32.0,
23978        );
23979        let r = _mm512_mask_blend_ph(0b01010101010101010101010101010101, a, b);
23980        let e = _mm512_set_ph(
23981            1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0,
23982            -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0, 27.0, -28.0,
23983            29.0, -30.0, 31.0, -32.0,
23984        );
23985        assert_eq_m512h(r, e);
23986    }
23987
23988    #[simd_test(enable = "avx512fp16,avx512vl")]
23989    unsafe fn test_mm_permutex2var_ph() {
23990        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
23991        let b = _mm_setr_ph(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
23992        let idx = _mm_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14);
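        // idx indexes the 16-lane concatenation of a (indices 0-7) and b (indices 8-15),
        // so this gathers every other element across both vectors.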
23993        let r = _mm_permutex2var_ph(a, idx, b);
23994        let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0);
23995        assert_eq_m128h(r, e);
23996    }
23997
23998    #[simd_test(enable = "avx512fp16,avx512vl")]
23999    unsafe fn test_mm256_permutex2var_ph() {
24000        let a = _mm256_setr_ph(
24001            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24002        );
24003        let b = _mm256_setr_ph(
24004            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24005            31.0, 32.0,
24006        );
24007        let idx = _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
24008        let r = _mm256_permutex2var_ph(a, idx, b);
24009        let e = _mm256_setr_ph(
24010            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
24011            31.0,
24012        );
24013        assert_eq_m256h(r, e);
24014    }
24015
24016    #[simd_test(enable = "avx512fp16")]
24017    unsafe fn test_mm512_permutex2var_ph() {
24018        let a = _mm512_setr_ph(
24019            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24020            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24021            31.0, 32.0,
24022        );
24023        let b = _mm512_setr_ph(
24024            33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0,
24025            47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0,
24026            61.0, 62.0, 63.0, 64.0,
24027        );
24028        let idx = _mm512_set_epi16(
24029            62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20,
24030            18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
24031        );
24032        let r = _mm512_permutex2var_ph(a, idx, b);
24033        let e = _mm512_setr_ph(
24034            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
24035            31.0, 33.0, 35.0, 37.0, 39.0, 41.0, 43.0, 45.0, 47.0, 49.0, 51.0, 53.0, 55.0, 57.0,
24036            59.0, 61.0, 63.0,
24037        );
24038        assert_eq_m512h(r, e);
24039    }
24040
24041    #[simd_test(enable = "avx512fp16,avx512vl")]
24042    unsafe fn test_mm_permutexvar_ph() {
24043        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24044        let idx = _mm_set_epi16(0, 2, 4, 6, 1, 3, 5, 7);
24045        let r = _mm_permutexvar_ph(idx, a);
24046        let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 2.0, 4.0, 6.0, 8.0);
24047        assert_eq_m128h(r, e);
24048    }
24049
24050    #[simd_test(enable = "avx512fp16,avx512vl")]
24051    unsafe fn test_mm256_permutexvar_ph() {
24052        let a = _mm256_set_ph(
24053            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24054        );
24055        let idx = _mm256_set_epi16(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
24056        let r = _mm256_permutexvar_ph(idx, a);
24057        let e = _mm256_setr_ph(
24058            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0,
24059        );
24060        assert_eq_m256h(r, e);
24061    }
24062
24063    #[simd_test(enable = "avx512fp16")]
24064    unsafe fn test_mm512_permutexvar_ph() {
24065        let a = _mm512_set_ph(
24066            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24067            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24068            31.0, 32.0,
24069        );
24070        let idx = _mm512_set_epi16(
24071            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 1, 3, 5, 7, 9, 11, 13, 15,
24072            17, 19, 21, 23, 25, 27, 29, 31,
24073        );
24074        let r = _mm512_permutexvar_ph(idx, a);
24075        let e = _mm512_setr_ph(
24076            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
24077            31.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0, 26.0, 28.0,
24078            30.0, 32.0,
24079        );
24080        assert_eq_m512h(r, e);
24081    }
24082
24083    #[simd_test(enable = "avx512fp16,avx512vl")]
24084    unsafe fn test_mm_cvtepi16_ph() {
24085        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24086        let r = _mm_cvtepi16_ph(a);
24087        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24088        assert_eq_m128h(r, e);
24089    }
24090
24091    #[simd_test(enable = "avx512fp16,avx512vl")]
24092    unsafe fn test_mm_mask_cvtepi16_ph() {
24093        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24094        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24095        let r = _mm_mask_cvtepi16_ph(src, 0b01010101, a);
24096        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24097        assert_eq_m128h(r, e);
24098    }
24099
24100    #[simd_test(enable = "avx512fp16,avx512vl")]
24101    unsafe fn test_mm_maskz_cvtepi16_ph() {
24102        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24103        let r = _mm_maskz_cvtepi16_ph(0b01010101, a);
24104        let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.);
24105        assert_eq_m128h(r, e);
24106    }
24107
24108    #[simd_test(enable = "avx512fp16,avx512vl")]
24109    unsafe fn test_mm256_cvtepi16_ph() {
24110        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24111        let r = _mm256_cvtepi16_ph(a);
24112        let e = _mm256_set_ph(
24113            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24114        );
24115        assert_eq_m256h(r, e);
24116    }
24117
24118    #[simd_test(enable = "avx512fp16,avx512vl")]
24119    unsafe fn test_mm256_mask_cvtepi16_ph() {
24120        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24121        let src = _mm256_set_ph(
24122            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24123        );
24124        let r = _mm256_mask_cvtepi16_ph(src, 0b0101010101010101, a);
24125        let e = _mm256_set_ph(
24126            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24127        );
24128        assert_eq_m256h(r, e);
24129    }
24130
24131    #[simd_test(enable = "avx512fp16,avx512vl")]
24132    unsafe fn test_mm256_maskz_cvtepi16_ph() {
24133        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24134        let r = _mm256_maskz_cvtepi16_ph(0b0101010101010101, a);
24135        let e = _mm256_set_ph(
24136            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16.,
24137        );
24138        assert_eq_m256h(r, e);
24139    }
24140
24141    #[simd_test(enable = "avx512fp16")]
24142    unsafe fn test_mm512_cvtepi16_ph() {
24143        let a = _mm512_set_epi16(
24144            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24145            25, 26, 27, 28, 29, 30, 31, 32,
24146        );
24147        let r = _mm512_cvtepi16_ph(a);
24148        let e = _mm512_set_ph(
24149            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24150            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24151            31.0, 32.0,
24152        );
24153        assert_eq_m512h(r, e);
24154    }
24155
24156    #[simd_test(enable = "avx512fp16")]
24157    unsafe fn test_mm512_mask_cvtepi16_ph() {
24158        let a = _mm512_set_epi16(
24159            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24160            25, 26, 27, 28, 29, 30, 31, 32,
24161        );
24162        let src = _mm512_set_ph(
24163            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24164            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24165        );
24166        let r = _mm512_mask_cvtepi16_ph(src, 0b01010101010101010101010101010101, a);
24167        let e = _mm512_set_ph(
24168            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24169            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24170        );
24171        assert_eq_m512h(r, e);
24172    }
24173
24174    #[simd_test(enable = "avx512fp16")]
24175    unsafe fn test_mm512_maskz_cvtepi16_ph() {
24176        let a = _mm512_set_epi16(
24177            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24178            25, 26, 27, 28, 29, 30, 31, 32,
24179        );
24180        let r = _mm512_maskz_cvtepi16_ph(0b01010101010101010101010101010101, a);
24181        let e = _mm512_set_ph(
24182            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24183            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24184        );
24185        assert_eq_m512h(r, e);
24186    }
24187
24188    #[simd_test(enable = "avx512fp16")]
24189    unsafe fn test_mm512_cvt_roundepi16_ph() {
24190        let a = _mm512_set_epi16(
24191            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24192            25, 26, 27, 28, 29, 30, 31, 32,
24193        );
24194        let r = _mm512_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24195        let e = _mm512_set_ph(
24196            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24197            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24198            31.0, 32.0,
24199        );
24200        assert_eq_m512h(r, e);
24201    }
24202
24203    #[simd_test(enable = "avx512fp16")]
24204    unsafe fn test_mm512_mask_cvt_roundepi16_ph() {
24205        let a = _mm512_set_epi16(
24206            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24207            25, 26, 27, 28, 29, 30, 31, 32,
24208        );
24209        let src = _mm512_set_ph(
24210            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24211            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24212        );
24213        let r = _mm512_mask_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24214            src,
24215            0b01010101010101010101010101010101,
24216            a,
24217        );
24218        let e = _mm512_set_ph(
24219            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24220            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24221        );
24222        assert_eq_m512h(r, e);
24223    }
24224
24225    #[simd_test(enable = "avx512fp16")]
24226    unsafe fn test_mm512_maskz_cvt_roundepi16_ph() {
24227        let a = _mm512_set_epi16(
24228            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24229            25, 26, 27, 28, 29, 30, 31, 32,
24230        );
24231        let r = _mm512_maskz_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24232            0b01010101010101010101010101010101,
24233            a,
24234        );
24235        let e = _mm512_set_ph(
24236            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24237            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24238        );
24239        assert_eq_m512h(r, e);
24240    }
24241
24242    #[simd_test(enable = "avx512fp16,avx512vl")]
24243    unsafe fn test_mm_cvtepu16_ph() {
24244        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24245        let r = _mm_cvtepu16_ph(a);
24246        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24247        assert_eq_m128h(r, e);
24248    }
24249
24250    #[simd_test(enable = "avx512fp16,avx512vl")]
24251    unsafe fn test_mm_mask_cvtepu16_ph() {
24252        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24253        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24254        let r = _mm_mask_cvtepu16_ph(src, 0b01010101, a);
24255        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24256        assert_eq_m128h(r, e);
24257    }
24258
24259    #[simd_test(enable = "avx512fp16,avx512vl")]
24260    unsafe fn test_mm_maskz_cvtepu16_ph() {
24261        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24262        let r = _mm_maskz_cvtepu16_ph(0b01010101, a);
24263        let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.);
24264        assert_eq_m128h(r, e);
24265    }
24266
24267    #[simd_test(enable = "avx512fp16,avx512vl")]
24268    unsafe fn test_mm256_cvtepu16_ph() {
24269        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24270        let r = _mm256_cvtepu16_ph(a);
24271        let e = _mm256_set_ph(
24272            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24273        );
24274        assert_eq_m256h(r, e);
24275    }
24276
24277    #[simd_test(enable = "avx512fp16,avx512vl")]
24278    unsafe fn test_mm256_mask_cvtepu16_ph() {
24279        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24280        let src = _mm256_set_ph(
24281            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24282        );
24283        let r = _mm256_mask_cvtepu16_ph(src, 0b0101010101010101, a);
24284        let e = _mm256_set_ph(
24285            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24286        );
24287        assert_eq_m256h(r, e);
24288    }
24289
24290    #[simd_test(enable = "avx512fp16,avx512vl")]
24291    unsafe fn test_mm256_maskz_cvtepu16_ph() {
24292        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24293        let r = _mm256_maskz_cvtepu16_ph(0b0101010101010101, a);
24294        let e = _mm256_set_ph(
24295            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16.,
24296        );
24297        assert_eq_m256h(r, e);
24298    }
24299
24300    #[simd_test(enable = "avx512fp16")]
24301    unsafe fn test_mm512_cvtepu16_ph() {
24302        let a = _mm512_set_epi16(
24303            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24304            25, 26, 27, 28, 29, 30, 31, 32,
24305        );
24306        let r = _mm512_cvtepu16_ph(a);
24307        let e = _mm512_set_ph(
24308            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24309            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24310            31.0, 32.0,
24311        );
24312        assert_eq_m512h(r, e);
24313    }
24314
24315    #[simd_test(enable = "avx512fp16")]
24316    unsafe fn test_mm512_mask_cvtepu16_ph() {
24317        let a = _mm512_set_epi16(
24318            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24319            25, 26, 27, 28, 29, 30, 31, 32,
24320        );
24321        let src = _mm512_set_ph(
24322            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24323            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24324        );
24325        let r = _mm512_mask_cvtepu16_ph(src, 0b01010101010101010101010101010101, a);
24326        let e = _mm512_set_ph(
24327            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24328            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24329        );
24330        assert_eq_m512h(r, e);
24331    }
24332
24333    #[simd_test(enable = "avx512fp16")]
24334    unsafe fn test_mm512_maskz_cvtepu16_ph() {
24335        let a = _mm512_set_epi16(
24336            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24337            25, 26, 27, 28, 29, 30, 31, 32,
24338        );
24339        let r = _mm512_maskz_cvtepu16_ph(0b01010101010101010101010101010101, a);
24340        let e = _mm512_set_ph(
24341            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24342            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24343        );
24344        assert_eq_m512h(r, e);
24345    }
24346
24347    #[simd_test(enable = "avx512fp16")]
24348    unsafe fn test_mm512_cvt_roundepu16_ph() {
24349        let a = _mm512_set_epi16(
24350            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24351            25, 26, 27, 28, 29, 30, 31, 32,
24352        );
24353        let r = _mm512_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24354        let e = _mm512_set_ph(
24355            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24356            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24357            31.0, 32.0,
24358        );
24359        assert_eq_m512h(r, e);
24360    }
24361
24362    #[simd_test(enable = "avx512fp16")]
24363    unsafe fn test_mm512_mask_cvt_roundepu16_ph() {
24364        let a = _mm512_set_epi16(
24365            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24366            25, 26, 27, 28, 29, 30, 31, 32,
24367        );
24368        let src = _mm512_set_ph(
24369            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24370            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24371        );
24372        let r = _mm512_mask_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24373            src,
24374            0b01010101010101010101010101010101,
24375            a,
24376        );
24377        let e = _mm512_set_ph(
24378            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24379            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24380        );
24381        assert_eq_m512h(r, e);
24382    }
24383
24384    #[simd_test(enable = "avx512fp16")]
24385    unsafe fn test_mm512_maskz_cvt_roundepu16_ph() {
24386        let a = _mm512_set_epi16(
24387            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24388            25, 26, 27, 28, 29, 30, 31, 32,
24389        );
24390        let r = _mm512_maskz_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24391            0b01010101010101010101010101010101,
24392            a,
24393        );
24394        let e = _mm512_set_ph(
24395            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24396            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24397        );
24398        assert_eq_m512h(r, e);
24399    }
24400
24401    #[simd_test(enable = "avx512fp16,avx512vl")]
24402    unsafe fn test_mm_cvtepi32_ph() {
24403        let a = _mm_set_epi32(1, 2, 3, 4);
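        // Converts the four 32-bit integers to f16 in the low four lanes;
        // the upper four lanes of the 128-bit result are zeroed.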
24404        let r = _mm_cvtepi32_ph(a);
24405        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24406        assert_eq_m128h(r, e);
24407    }
24408
24409    #[simd_test(enable = "avx512fp16,avx512vl")]
24410    unsafe fn test_mm_mask_cvtepi32_ph() {
24411        let a = _mm_set_epi32(1, 2, 3, 4);
24412        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24413        let r = _mm_mask_cvtepi32_ph(src, 0b0101, a);
24414        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.);
24415        assert_eq_m128h(r, e);
24416    }
24417
24418    #[simd_test(enable = "avx512fp16,avx512vl")]
24419    unsafe fn test_mm_maskz_cvtepi32_ph() {
24420        let a = _mm_set_epi32(1, 2, 3, 4);
24421        let r = _mm_maskz_cvtepi32_ph(0b0101, a);
24422        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.);
24423        assert_eq_m128h(r, e);
24424    }
24425
24426    #[simd_test(enable = "avx512fp16,avx512vl")]
24427    unsafe fn test_mm256_cvtepi32_ph() {
24428        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24429        let r = _mm256_cvtepi32_ph(a);
24430        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24431        assert_eq_m128h(r, e);
24432    }
24433
24434    #[simd_test(enable = "avx512fp16,avx512vl")]
24435    unsafe fn test_mm256_mask_cvtepi32_ph() {
24436        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24437        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24438        let r = _mm256_mask_cvtepi32_ph(src, 0b01010101, a);
24439        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24440        assert_eq_m128h(r, e);
24441    }
24442
24443    #[simd_test(enable = "avx512fp16,avx512vl")]
24444    unsafe fn test_mm256_maskz_cvtepi32_ph() {
24445        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24446        let r = _mm256_maskz_cvtepi32_ph(0b01010101, a);
24447        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
24448        assert_eq_m128h(r, e);
24449    }
24450
24451    #[simd_test(enable = "avx512fp16")]
24452    unsafe fn test_mm512_cvtepi32_ph() {
24453        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24454        let r = _mm512_cvtepi32_ph(a);
24455        let e = _mm256_set_ph(
24456            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24457        );
24458        assert_eq_m256h(r, e);
24459    }
24460
24461    #[simd_test(enable = "avx512fp16")]
24462    unsafe fn test_mm512_mask_cvtepi32_ph() {
24463        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24464        let src = _mm256_set_ph(
24465            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24466        );
24467        let r = _mm512_mask_cvtepi32_ph(src, 0b0101010101010101, a);
24468        let e = _mm256_set_ph(
24469            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24470        );
24471        assert_eq_m256h(r, e);
24472    }
24473
24474    #[simd_test(enable = "avx512fp16")]
24475    unsafe fn test_mm512_maskz_cvtepi32_ph() {
24476        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24477        let r = _mm512_maskz_cvtepi32_ph(0b0101010101010101, a);
24478        let e = _mm256_set_ph(
24479            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24480        );
24481        assert_eq_m256h(r, e);
24482    }
24483
24484    #[simd_test(enable = "avx512fp16")]
24485    unsafe fn test_mm512_cvt_roundepi32_ph() {
24486        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24487        let r = _mm512_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24488        let e = _mm256_set_ph(
24489            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24490        );
24491        assert_eq_m256h(r, e);
24492    }
24493
24494    #[simd_test(enable = "avx512fp16")]
24495    unsafe fn test_mm512_mask_cvt_roundepi32_ph() {
24496        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24497        let src = _mm256_set_ph(
24498            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24499        );
24500        let r = _mm512_mask_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24501            src,
24502            0b0101010101010101,
24503            a,
24504        );
24505        let e = _mm256_set_ph(
24506            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24507        );
24508        assert_eq_m256h(r, e);
24509    }
24510
24511    #[simd_test(enable = "avx512fp16")]
24512    unsafe fn test_mm512_maskz_cvt_roundepi32_ph() {
24513        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24514        let r = _mm512_maskz_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24515            0b0101010101010101,
24516            a,
24517        );
24518        let e = _mm256_set_ph(
24519            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24520        );
24521        assert_eq_m256h(r, e);
24522    }
24523
24524    #[simd_test(enable = "avx512fp16")]
24525    unsafe fn test_mm_cvti32_sh() {
24526        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
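        // Converts the i32 argument to f16 and writes it to the lowest lane,
        // copying the upper seven lanes from a.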
24527        let r = _mm_cvti32_sh(a, 10);
24528        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24529        assert_eq_m128h(r, e);
24530    }
24531
24532    #[simd_test(enable = "avx512fp16")]
24533    unsafe fn test_mm_cvt_roundi32_sh() {
24534        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24535        let r = _mm_cvt_roundi32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10);
24536        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24537        assert_eq_m128h(r, e);
24538    }
24539
24540    #[simd_test(enable = "avx512fp16,avx512vl")]
24541    unsafe fn test_mm_cvtepu32_ph() {
24542        let a = _mm_set_epi32(1, 2, 3, 4);
24543        let r = _mm_cvtepu32_ph(a);
24544        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24545        assert_eq_m128h(r, e);
24546    }
24547
24548    #[simd_test(enable = "avx512fp16,avx512vl")]
24549    unsafe fn test_mm_mask_cvtepu32_ph() {
24550        let a = _mm_set_epi32(1, 2, 3, 4);
24551        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24552        let r = _mm_mask_cvtepu32_ph(src, 0b0101, a);
24553        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.);
24554        assert_eq_m128h(r, e);
24555    }
24556
24557    #[simd_test(enable = "avx512fp16,avx512vl")]
24558    unsafe fn test_mm_maskz_cvtepu32_ph() {
24559        let a = _mm_set_epi32(1, 2, 3, 4);
24560        let r = _mm_maskz_cvtepu32_ph(0b0101, a);
24561        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.);
24562        assert_eq_m128h(r, e);
24563    }
24564
24565    #[simd_test(enable = "avx512fp16,avx512vl")]
24566    unsafe fn test_mm256_cvtepu32_ph() {
24567        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24568        let r = _mm256_cvtepu32_ph(a);
24569        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24570        assert_eq_m128h(r, e);
24571    }
24572
24573    #[simd_test(enable = "avx512fp16,avx512vl")]
24574    unsafe fn test_mm256_mask_cvtepu32_ph() {
24575        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24576        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24577        let r = _mm256_mask_cvtepu32_ph(src, 0b01010101, a);
24578        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24579        assert_eq_m128h(r, e);
24580    }
24581
24582    #[simd_test(enable = "avx512fp16,avx512vl")]
24583    unsafe fn test_mm256_maskz_cvtepu32_ph() {
24584        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24585        let r = _mm256_maskz_cvtepu32_ph(0b01010101, a);
24586        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
24587        assert_eq_m128h(r, e);
24588    }
24589
24590    #[simd_test(enable = "avx512fp16")]
24591    unsafe fn test_mm512_cvtepu32_ph() {
24592        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24593        let r = _mm512_cvtepu32_ph(a);
24594        let e = _mm256_set_ph(
24595            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24596        );
24597        assert_eq_m256h(r, e);
24598    }
24599
24600    #[simd_test(enable = "avx512fp16")]
24601    unsafe fn test_mm512_mask_cvtepu32_ph() {
24602        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24603        let src = _mm256_set_ph(
24604            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24605        );
24606        let r = _mm512_mask_cvtepu32_ph(src, 0b0101010101010101, a);
24607        let e = _mm256_set_ph(
24608            10., 2.0, 12., 4.0, 14., 6.0, 16., 8.0, 18., 10.0, 20., 12.0, 22., 14.0, 24., 16.0,
24609        );
24610        assert_eq_m256h(r, e);
24611    }
24612
24613    #[simd_test(enable = "avx512fp16")]
24614    unsafe fn test_mm512_maskz_cvtepu32_ph() {
24615        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24616        let r = _mm512_maskz_cvtepu32_ph(0b0101010101010101, a);
24617        let e = _mm256_set_ph(
24618            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24619        );
24620        assert_eq_m256h(r, e);
24621    }
24622
24623    #[simd_test(enable = "avx512fp16")]
24624    unsafe fn test_mm512_cvt_roundepu32_ph() {
24625        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24626        let r = _mm512_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24627        let e = _mm256_set_ph(
24628            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24629        );
24630        assert_eq_m256h(r, e);
24631    }
24632
24633    #[simd_test(enable = "avx512fp16")]
24634    unsafe fn test_mm512_mask_cvt_roundepu32_ph() {
24635        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24636        let src = _mm256_set_ph(
24637            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24638        );
24639        let r = _mm512_mask_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24640            src,
24641            0b0101010101010101,
24642            a,
24643        );
24644        let e = _mm256_set_ph(
24645            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
24646            16.0,
24647        );
24648        assert_eq_m256h(r, e);
24649    }
24650
24651    #[simd_test(enable = "avx512fp16")]
24652    unsafe fn test_mm512_maskz_cvt_roundepu32_ph() {
24653        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24654        let r = _mm512_maskz_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24655            0b0101010101010101,
24656            a,
24657        );
24658        let e = _mm256_set_ph(
24659            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24660        );
24661        assert_eq_m256h(r, e);
24662    }
24663
24664    #[simd_test(enable = "avx512fp16")]
24665    unsafe fn test_mm_cvtu32_sh() {
24666        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24667        let r = _mm_cvtu32_sh(a, 10);
24668        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24669        assert_eq_m128h(r, e);
24670    }
24671
24672    #[simd_test(enable = "avx512fp16")]
24673    unsafe fn test_mm_cvt_roundu32_sh() {
24674        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24675        let r = _mm_cvt_roundu32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10);
24676        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24677        assert_eq_m128h(r, e);
24678    }
24679
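    // The 64-bit integer -> f16 conversions below narrow 2 (__m128i), 4 (__m256i) or
    // 8 (__m512i) elements into an __m128h; result lanes beyond the source element
    // count are zeroed even in the mask/maskz forms, which is why the expected vectors
    // lead with zeros rather than values taken from `src`.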
24680    #[simd_test(enable = "avx512fp16,avx512vl")]
24681    unsafe fn test_mm_cvtepi64_ph() {
24682        let a = _mm_set_epi64x(1, 2);
24683        let r = _mm_cvtepi64_ph(a);
24684        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
24685        assert_eq_m128h(r, e);
24686    }
24687
24688    #[simd_test(enable = "avx512fp16,avx512vl")]
24689    unsafe fn test_mm_mask_cvtepi64_ph() {
24690        let a = _mm_set_epi64x(1, 2);
24691        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24692        let r = _mm_mask_cvtepi64_ph(src, 0b01, a);
24693        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
24694        assert_eq_m128h(r, e);
24695    }
24696
24697    #[simd_test(enable = "avx512fp16,avx512vl")]
24698    unsafe fn test_mm_maskz_cvtepi64_ph() {
24699        let a = _mm_set_epi64x(1, 2);
24700        let r = _mm_maskz_cvtepi64_ph(0b01, a);
24701        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.);
24702        assert_eq_m128h(r, e);
24703    }
24704
24705    #[simd_test(enable = "avx512fp16,avx512vl")]
24706    unsafe fn test_mm256_cvtepi64_ph() {
24707        let a = _mm256_set_epi64x(1, 2, 3, 4);
24708        let r = _mm256_cvtepi64_ph(a);
24709        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24710        assert_eq_m128h(r, e);
24711    }
24712
24713    #[simd_test(enable = "avx512fp16,avx512vl")]
24714    unsafe fn test_mm256_mask_cvtepi64_ph() {
24715        let a = _mm256_set_epi64x(1, 2, 3, 4);
24716        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24717        let r = _mm256_mask_cvtepi64_ph(src, 0b0101, a);
24718        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
24719        assert_eq_m128h(r, e);
24720    }
24721
24722    #[simd_test(enable = "avx512fp16,avx512vl")]
24723    unsafe fn test_mm256_maskz_cvtepi64_ph() {
24724        let a = _mm256_set_epi64x(1, 2, 3, 4);
24725        let r = _mm256_maskz_cvtepi64_ph(0b0101, a);
24726        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
24727        assert_eq_m128h(r, e);
24728    }
24729
24730    #[simd_test(enable = "avx512fp16")]
24731    unsafe fn test_mm512_cvtepi64_ph() {
24732        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24733        let r = _mm512_cvtepi64_ph(a);
24734        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24735        assert_eq_m128h(r, e);
24736    }
24737
24738    #[simd_test(enable = "avx512fp16")]
24739    unsafe fn test_mm512_mask_cvtepi64_ph() {
24740        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24741        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24742        let r = _mm512_mask_cvtepi64_ph(src, 0b01010101, a);
24743        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24744        assert_eq_m128h(r, e);
24745    }
24746
24747    #[simd_test(enable = "avx512fp16")]
24748    unsafe fn test_mm512_maskz_cvtepi64_ph() {
24749        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24750        let r = _mm512_maskz_cvtepi64_ph(0b01010101, a);
24751        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
24752        assert_eq_m128h(r, e);
24753    }
24754
24755    #[simd_test(enable = "avx512fp16")]
24756    unsafe fn test_mm512_cvt_roundepi64_ph() {
24757        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24758        let r = _mm512_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24759        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24760        assert_eq_m128h(r, e);
24761    }
24762
24763    #[simd_test(enable = "avx512fp16")]
24764    unsafe fn test_mm512_mask_cvt_roundepi64_ph() {
24765        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24766        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24767        let r = _mm512_mask_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24768            src, 0b01010101, a,
24769        );
24770        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24771        assert_eq_m128h(r, e);
24772    }
24773
24774    #[simd_test(enable = "avx512fp16")]
24775    unsafe fn test_mm512_maskz_cvt_roundepi64_ph() {
24776        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24777        let r = _mm512_maskz_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24778            0b01010101, a,
24779        );
24780        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
24781        assert_eq_m128h(r, e);
24782    }
24783
24784    #[simd_test(enable = "avx512fp16,avx512vl")]
24785    unsafe fn test_mm_cvtepu64_ph() {
24786        let a = _mm_set_epi64x(1, 2);
24787        let r = _mm_cvtepu64_ph(a);
24788        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
24789        assert_eq_m128h(r, e);
24790    }
24791
24792    #[simd_test(enable = "avx512fp16,avx512vl")]
24793    unsafe fn test_mm_mask_cvtepu64_ph() {
24794        let a = _mm_set_epi64x(1, 2);
24795        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24796        let r = _mm_mask_cvtepu64_ph(src, 0b01, a);
24797        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
24798        assert_eq_m128h(r, e);
24799    }
24800
24801    #[simd_test(enable = "avx512fp16,avx512vl")]
24802    unsafe fn test_mm_maskz_cvtepu64_ph() {
24803        let a = _mm_set_epi64x(1, 2);
24804        let r = _mm_maskz_cvtepu64_ph(0b01, a);
24805        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0);
24806        assert_eq_m128h(r, e);
24807    }
24808
24809    #[simd_test(enable = "avx512fp16,avx512vl")]
24810    unsafe fn test_mm256_cvtepu64_ph() {
24811        let a = _mm256_set_epi64x(1, 2, 3, 4);
24812        let r = _mm256_cvtepu64_ph(a);
24813        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24814        assert_eq_m128h(r, e);
24815    }
24816
24817    #[simd_test(enable = "avx512fp16,avx512vl")]
24818    unsafe fn test_mm256_mask_cvtepu64_ph() {
24819        let a = _mm256_set_epi64x(1, 2, 3, 4);
24820        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24821        let r = _mm256_mask_cvtepu64_ph(src, 0b0101, a);
24822        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
24823        assert_eq_m128h(r, e);
24824    }
24825
24826    #[simd_test(enable = "avx512fp16,avx512vl")]
24827    unsafe fn test_mm256_maskz_cvtepu64_ph() {
24828        let a = _mm256_set_epi64x(1, 2, 3, 4);
24829        let r = _mm256_maskz_cvtepu64_ph(0b0101, a);
24830        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
24831        assert_eq_m128h(r, e);
24832    }
24833
24834    #[simd_test(enable = "avx512fp16")]
24835    unsafe fn test_mm512_cvtepu64_ph() {
24836        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24837        let r = _mm512_cvtepu64_ph(a);
24838        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24839        assert_eq_m128h(r, e);
24840    }
24841
24842    #[simd_test(enable = "avx512fp16")]
24843    unsafe fn test_mm512_mask_cvtepu64_ph() {
24844        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24845        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24846        let r = _mm512_mask_cvtepu64_ph(src, 0b01010101, a);
24847        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24848        assert_eq_m128h(r, e);
24849    }
24850
24851    #[simd_test(enable = "avx512fp16")]
24852    unsafe fn test_mm512_maskz_cvtepu64_ph() {
24853        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24854        let r = _mm512_maskz_cvtepu64_ph(0b01010101, a);
24855        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
24856        assert_eq_m128h(r, e);
24857    }
24858
24859    #[simd_test(enable = "avx512fp16")]
24860    unsafe fn test_mm512_cvt_roundepu64_ph() {
24861        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24862        let r = _mm512_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24863        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24864        assert_eq_m128h(r, e);
24865    }
24866
24867    #[simd_test(enable = "avx512fp16")]
24868    unsafe fn test_mm512_mask_cvt_roundepu64_ph() {
24869        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24870        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24871        let r = _mm512_mask_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24872            src, 0b01010101, a,
24873        );
24874        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24875        assert_eq_m128h(r, e);
24876    }
24877
24878    #[simd_test(enable = "avx512fp16")]
24879    unsafe fn test_mm512_maskz_cvt_roundepu64_ph() {
24880        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24881        let r = _mm512_maskz_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24882            0b01010101, a,
24883        );
24884        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
24885        assert_eq_m128h(r, e);
24886    }
24887
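    // The "x" in _mm_cvtxps_ph marks the AVX512-FP16 f32 -> f16 conversion that returns
    // an __m128h, as opposed to the legacy F16C _mm_cvtps_ph, which takes an imm8
    // rounding control and returns a packed integer vector.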
24888    #[simd_test(enable = "avx512fp16,avx512vl")]
24889    unsafe fn test_mm_cvtxps_ph() {
24890        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
24891        let r = _mm_cvtxps_ph(a);
24892        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24893        assert_eq_m128h(r, e);
24894    }
24895
24896    #[simd_test(enable = "avx512fp16,avx512vl")]
24897    unsafe fn test_mm_mask_cvtxps_ph() {
24898        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
24899        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24900        let r = _mm_mask_cvtxps_ph(src, 0b0101, a);
24901        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16., 4.0);
24902        assert_eq_m128h(r, e);
24903    }
24904
24905    #[simd_test(enable = "avx512fp16,avx512vl")]
24906    unsafe fn test_mm_maskz_cvtxps_ph() {
24907        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
24908        let r = _mm_maskz_cvtxps_ph(0b0101, a);
24909        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
24910        assert_eq_m128h(r, e);
24911    }
24912
24913    #[simd_test(enable = "avx512fp16,avx512vl")]
24914    unsafe fn test_mm256_cvtxps_ph() {
24915        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24916        let r = _mm256_cvtxps_ph(a);
24917        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24918        assert_eq_m128h(r, e);
24919    }
24920
24921    #[simd_test(enable = "avx512fp16,avx512vl")]
24922    unsafe fn test_mm256_mask_cvtxps_ph() {
24923        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24924        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24925        let r = _mm256_mask_cvtxps_ph(src, 0b01010101, a);
24926        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24927        assert_eq_m128h(r, e);
24928    }
24929
24930    #[simd_test(enable = "avx512fp16,avx512vl")]
24931    unsafe fn test_mm256_maskz_cvtxps_ph() {
24932        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24933        let r = _mm256_maskz_cvtxps_ph(0b01010101, a);
24934        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
24935        assert_eq_m128h(r, e);
24936    }
24937
24938    #[simd_test(enable = "avx512fp16")]
24939    unsafe fn test_mm512_cvtxps_ph() {
24940        let a = _mm512_set_ps(
24941            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24942        );
24943        let r = _mm512_cvtxps_ph(a);
24944        let e = _mm256_set_ph(
24945            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24946        );
24947        assert_eq_m256h(r, e);
24948    }
24949
24950    #[simd_test(enable = "avx512fp16")]
24951    unsafe fn test_mm512_mask_cvtxps_ph() {
24952        let a = _mm512_set_ps(
24953            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24954        );
24955        let src = _mm256_set_ph(
24956            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24957        );
24958        let r = _mm512_mask_cvtxps_ph(src, 0b0101010101010101, a);
24959        let e = _mm256_set_ph(
24960            10., 2.0, 12., 4.0, 14., 6.0, 16., 8.0, 18., 10.0, 20., 12.0, 22., 14.0, 24., 16.0,
24961        );
24962        assert_eq_m256h(r, e);
24963    }
24964
24965    #[simd_test(enable = "avx512fp16")]
24966    unsafe fn test_mm512_maskz_cvtxps_ph() {
24967        let a = _mm512_set_ps(
24968            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24969        );
24970        let r = _mm512_maskz_cvtxps_ph(0b0101010101010101, a);
24971        let e = _mm256_set_ph(
24972            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24973        );
24974        assert_eq_m256h(r, e);
24975    }
24976
24977    #[simd_test(enable = "avx512fp16")]
24978    unsafe fn test_mm512_cvtx_roundps_ph() {
24979        let a = _mm512_set_ps(
24980            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24981        );
24982        let r = _mm512_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24983        let e = _mm256_set_ph(
24984            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24985        );
24986        assert_eq_m256h(r, e);
24987    }
24988
24989    #[simd_test(enable = "avx512fp16")]
24990    unsafe fn test_mm512_mask_cvtx_roundps_ph() {
24991        let a = _mm512_set_ps(
24992            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24993        );
24994        let src = _mm256_set_ph(
24995            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24996        );
24997        let r = _mm512_mask_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24998            src,
24999            0b0101010101010101,
25000            a,
25001        );
25002        let e = _mm256_set_ph(
25003            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
25004            16.0,
25005        );
25006        assert_eq_m256h(r, e);
25007    }
25008
25009    #[simd_test(enable = "avx512fp16")]
25010    unsafe fn test_mm512_maskz_cvtx_roundps_ph() {
25011        let a = _mm512_set_ps(
25012            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25013        );
25014        let r = _mm512_maskz_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25015            0b0101010101010101,
25016            a,
25017        );
25018        let e = _mm256_set_ph(
25019            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
25020        );
25021        assert_eq_m256h(r, e);
25022    }
25023
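    // Scalar convert: the lowest f32 of `b` is converted to f16 and written to lane 0 of
    // a copy of `a`; lanes 1..7 are passed through from `a` (or taken from `src`/zeroed
    // under a mask in the mask/maskz forms).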
25024    #[simd_test(enable = "avx512fp16")]
25025    unsafe fn test_mm_cvtss_sh() {
25026        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25027        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25028        let r = _mm_cvtss_sh(a, b);
25029        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25030        assert_eq_m128h(r, e);
25031    }
25032
25033    #[simd_test(enable = "avx512fp16")]
25034    unsafe fn test_mm_mask_cvtss_sh() {
25035        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25036        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25037        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25038        let r = _mm_mask_cvtss_sh(src, 0, a, b);
25039        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25040        assert_eq_m128h(r, e);
25041        let r = _mm_mask_cvtss_sh(src, 1, a, b);
25042        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25043        assert_eq_m128h(r, e);
25044    }
25045
25046    #[simd_test(enable = "avx512fp16")]
25047    unsafe fn test_mm_maskz_cvtss_sh() {
25048        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25049        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25050        let r = _mm_maskz_cvtss_sh(0, a, b);
25051        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25052        assert_eq_m128h(r, e);
25053        let r = _mm_maskz_cvtss_sh(1, a, b);
25054        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25055        assert_eq_m128h(r, e);
25056    }
25057
25058    #[simd_test(enable = "avx512fp16")]
25059    unsafe fn test_mm_cvt_roundss_sh() {
25060        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25061        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25062        let r = _mm_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
25063        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25064        assert_eq_m128h(r, e);
25065    }
25066
25067    #[simd_test(enable = "avx512fp16")]
25068    unsafe fn test_mm_mask_cvt_roundss_sh() {
25069        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25070        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25071        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25072        let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25073            src, 0, a, b,
25074        );
25075        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25076        assert_eq_m128h(r, e);
25077        let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25078            src, 1, a, b,
25079        );
25080        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25081        assert_eq_m128h(r, e);
25082    }
25083
25084    #[simd_test(enable = "avx512fp16")]
25085    unsafe fn test_mm_maskz_cvt_roundss_sh() {
25086        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25087        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25088        let r =
25089            _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
25090        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25091        assert_eq_m128h(r, e);
25092        let r =
25093            _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
25094        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25095        assert_eq_m128h(r, e);
25096    }
25097
25098    #[simd_test(enable = "avx512fp16,avx512vl")]
25099    unsafe fn test_mm_cvtpd_ph() {
25100        let a = _mm_set_pd(1.0, 2.0);
25101        let r = _mm_cvtpd_ph(a);
25102        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
25103        assert_eq_m128h(r, e);
25104    }
25105
25106    #[simd_test(enable = "avx512fp16,avx512vl")]
25107    unsafe fn test_mm_mask_cvtpd_ph() {
25108        let a = _mm_set_pd(1.0, 2.0);
25109        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25110        let r = _mm_mask_cvtpd_ph(src, 0b01, a);
25111        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
25112        assert_eq_m128h(r, e);
25113    }
25114
25115    #[simd_test(enable = "avx512fp16,avx512vl")]
25116    unsafe fn test_mm_maskz_cvtpd_ph() {
25117        let a = _mm_set_pd(1.0, 2.0);
25118        let r = _mm_maskz_cvtpd_ph(0b01, a);
25119        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0);
25120        assert_eq_m128h(r, e);
25121    }
25122
25123    #[simd_test(enable = "avx512fp16,avx512vl")]
25124    unsafe fn test_mm256_cvtpd_ph() {
25125        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
25126        let r = _mm256_cvtpd_ph(a);
25127        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25128        assert_eq_m128h(r, e);
25129    }
25130
25131    #[simd_test(enable = "avx512fp16,avx512vl")]
25132    unsafe fn test_mm256_mask_cvtpd_ph() {
25133        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
25134        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25135        let r = _mm256_mask_cvtpd_ph(src, 0b0101, a);
25136        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
25137        assert_eq_m128h(r, e);
25138    }
25139
25140    #[simd_test(enable = "avx512fp16,avx512vl")]
25141    unsafe fn test_mm256_maskz_cvtpd_ph() {
25142        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
25143        let r = _mm256_maskz_cvtpd_ph(0b0101, a);
25144        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
25145        assert_eq_m128h(r, e);
25146    }
25147
25148    #[simd_test(enable = "avx512fp16")]
25149    unsafe fn test_mm512_cvtpd_ph() {
25150        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25151        let r = _mm512_cvtpd_ph(a);
25152        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25153        assert_eq_m128h(r, e);
25154    }
25155
25156    #[simd_test(enable = "avx512fp16")]
25157    unsafe fn test_mm512_mask_cvtpd_ph() {
25158        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25159        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25160        let r = _mm512_mask_cvtpd_ph(src, 0b01010101, a);
25161        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
25162        assert_eq_m128h(r, e);
25163    }
25164
25165    #[simd_test(enable = "avx512fp16")]
25166    unsafe fn test_mm512_maskz_cvtpd_ph() {
25167        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25168        let r = _mm512_maskz_cvtpd_ph(0b01010101, a);
25169        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
25170        assert_eq_m128h(r, e);
25171    }
25172
25173    #[simd_test(enable = "avx512fp16")]
25174    unsafe fn test_mm512_cvt_roundpd_ph() {
25175        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25176        let r = _mm512_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25177        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25178        assert_eq_m128h(r, e);
25179    }
25180
25181    #[simd_test(enable = "avx512fp16")]
25182    unsafe fn test_mm512_mask_cvt_roundpd_ph() {
25183        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25184        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25185        let r = _mm512_mask_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25186            src, 0b01010101, a,
25187        );
25188        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
25189        assert_eq_m128h(r, e);
25190    }
25191
25192    #[simd_test(enable = "avx512fp16")]
25193    unsafe fn test_mm512_maskz_cvt_roundpd_ph() {
25194        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25195        let r = _mm512_maskz_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25196            0b01010101, a,
25197        );
25198        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
25199        assert_eq_m128h(r, e);
25200    }
25201
25202    #[simd_test(enable = "avx512fp16")]
25203    unsafe fn test_mm_cvtsd_sh() {
25204        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25205        let b = _mm_setr_pd(1.0, 2.0);
25206        let r = _mm_cvtsd_sh(a, b);
25207        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25208        assert_eq_m128h(r, e);
25209    }
25210
25211    #[simd_test(enable = "avx512fp16")]
25212    unsafe fn test_mm_mask_cvtsd_sh() {
25213        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25214        let b = _mm_setr_pd(1.0, 2.0);
25215        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25216        let r = _mm_mask_cvtsd_sh(src, 0, a, b);
25217        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25218        assert_eq_m128h(r, e);
25219        let r = _mm_mask_cvtsd_sh(src, 1, a, b);
25220        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25221        assert_eq_m128h(r, e);
25222    }
25223
25224    #[simd_test(enable = "avx512fp16")]
25225    unsafe fn test_mm_maskz_cvtsd_sh() {
25226        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25227        let b = _mm_setr_pd(1.0, 2.0);
25228        let r = _mm_maskz_cvtsd_sh(0, a, b);
25229        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25230        assert_eq_m128h(r, e);
25231        let r = _mm_maskz_cvtsd_sh(1, a, b);
25232        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25233        assert_eq_m128h(r, e);
25234    }
25235
25236    #[simd_test(enable = "avx512fp16")]
25237    unsafe fn test_mm_cvt_roundsd_sh() {
25238        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25239        let b = _mm_setr_pd(1.0, 2.0);
25240        let r = _mm_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
25241        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25242        assert_eq_m128h(r, e);
25243    }
25244
25245    #[simd_test(enable = "avx512fp16")]
25246    unsafe fn test_mm_mask_cvt_roundsd_sh() {
25247        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25248        let b = _mm_setr_pd(1.0, 2.0);
25249        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25250        let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25251            src, 0, a, b,
25252        );
25253        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25254        assert_eq_m128h(r, e);
25255        let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25256            src, 1, a, b,
25257        );
25258        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25259        assert_eq_m128h(r, e);
25260    }
25261
25262    #[simd_test(enable = "avx512fp16")]
25263    unsafe fn test_mm_maskz_cvt_roundsd_sh() {
25264        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25265        let b = _mm_setr_pd(1.0, 2.0);
25266        let r =
25267            _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
25268        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25269        assert_eq_m128h(r, e);
25270        let r =
25271            _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
25272        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25273        assert_eq_m128h(r, e);
25274    }
25275
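    // _mm_cvtph_epi16 and friends use the current MXCSR rounding mode (round-to-nearest
    // by default), while the _mm_cvttph_* tests further below exercise the truncating
    // forms; with the integral inputs used here both families yield the same integers.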
25276    #[simd_test(enable = "avx512fp16,avx512vl")]
25277    unsafe fn test_mm_cvtph_epi16() {
25278        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25279        let r = _mm_cvtph_epi16(a);
25280        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25281        assert_eq_m128i(r, e);
25282    }
25283
25284    #[simd_test(enable = "avx512fp16,avx512vl")]
25285    unsafe fn test_mm_mask_cvtph_epi16() {
25286        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25287        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25288        let r = _mm_mask_cvtph_epi16(src, 0b01010101, a);
25289        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25290        assert_eq_m128i(r, e);
25291    }
25292
25293    #[simd_test(enable = "avx512fp16,avx512vl")]
25294    unsafe fn test_mm_maskz_cvtph_epi16() {
25295        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25296        let r = _mm_maskz_cvtph_epi16(0b01010101, a);
25297        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25298        assert_eq_m128i(r, e);
25299    }
25300
25301    #[simd_test(enable = "avx512fp16,avx512vl")]
25302    unsafe fn test_mm256_cvtph_epi16() {
25303        let a = _mm256_set_ph(
25304            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25305        );
25306        let r = _mm256_cvtph_epi16(a);
25307        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25308        assert_eq_m256i(r, e);
25309    }
25310
25311    #[simd_test(enable = "avx512fp16,avx512vl")]
25312    unsafe fn test_mm256_mask_cvtph_epi16() {
25313        let a = _mm256_set_ph(
25314            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25315        );
25316        let src = _mm256_set_epi16(
25317            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25318        );
25319        let r = _mm256_mask_cvtph_epi16(src, 0b0101010101010101, a);
25320        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25321        assert_eq_m256i(r, e);
25322    }
25323
25324    #[simd_test(enable = "avx512fp16,avx512vl")]
25325    unsafe fn test_mm256_maskz_cvtph_epi16() {
25326        let a = _mm256_set_ph(
25327            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25328        );
25329        let r = _mm256_maskz_cvtph_epi16(0b0101010101010101, a);
25330        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25331        assert_eq_m256i(r, e);
25332    }
25333
25334    #[simd_test(enable = "avx512fp16")]
25335    unsafe fn test_mm512_cvtph_epi16() {
25336        let a = _mm512_set_ph(
25337            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25338            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25339            31.0, 32.0,
25340        );
25341        let r = _mm512_cvtph_epi16(a);
25342        let e = _mm512_set_epi16(
25343            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25344            25, 26, 27, 28, 29, 30, 31, 32,
25345        );
25346        assert_eq_m512i(r, e);
25347    }
25348
25349    #[simd_test(enable = "avx512fp16")]
25350    unsafe fn test_mm512_mask_cvtph_epi16() {
25351        let a = _mm512_set_ph(
25352            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25353            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25354            31.0, 32.0,
25355        );
25356        let src = _mm512_set_epi16(
25357            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25358            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25359        );
25360        let r = _mm512_mask_cvtph_epi16(src, 0b01010101010101010101010101010101, a);
25361        let e = _mm512_set_epi16(
25362            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25363            24, 34, 26, 36, 28, 38, 30, 40, 32,
25364        );
25365        assert_eq_m512i(r, e);
25366    }
25367
25368    #[simd_test(enable = "avx512fp16")]
25369    unsafe fn test_mm512_maskz_cvtph_epi16() {
25370        let a = _mm512_set_ph(
25371            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25372            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25373            31.0, 32.0,
25374        );
25375        let r = _mm512_maskz_cvtph_epi16(0b01010101010101010101010101010101, a);
25376        let e = _mm512_set_epi16(
25377            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25378            0, 28, 0, 30, 0, 32,
25379        );
25380        assert_eq_m512i(r, e);
25381    }
25382
25383    #[simd_test(enable = "avx512fp16")]
25384    unsafe fn test_mm512_cvt_roundph_epi16() {
25385        let a = _mm512_set_ph(
25386            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25387            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25388            31.0, 32.0,
25389        );
25390        let r = _mm512_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25391        let e = _mm512_set_epi16(
25392            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25393            25, 26, 27, 28, 29, 30, 31, 32,
25394        );
25395        assert_eq_m512i(r, e);
25396    }
25397
25398    #[simd_test(enable = "avx512fp16")]
25399    unsafe fn test_mm512_mask_cvt_roundph_epi16() {
25400        let a = _mm512_set_ph(
25401            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25402            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25403            31.0, 32.0,
25404        );
25405        let src = _mm512_set_epi16(
25406            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25407            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25408        );
25409        let r = _mm512_mask_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25410            src,
25411            0b01010101010101010101010101010101,
25412            a,
25413        );
25414        let e = _mm512_set_epi16(
25415            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25416            24, 34, 26, 36, 28, 38, 30, 40, 32,
25417        );
25418        assert_eq_m512i(r, e);
25419    }
25420
25421    #[simd_test(enable = "avx512fp16")]
25422    unsafe fn test_mm512_maskz_cvt_roundph_epi16() {
25423        let a = _mm512_set_ph(
25424            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25425            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25426            31.0, 32.0,
25427        );
25428        let r = _mm512_maskz_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25429            0b01010101010101010101010101010101,
25430            a,
25431        );
25432        let e = _mm512_set_epi16(
25433            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25434            0, 28, 0, 30, 0, 32,
25435        );
25436        assert_eq_m512i(r, e);
25437    }
25438
25439    #[simd_test(enable = "avx512fp16,avx512vl")]
25440    unsafe fn test_mm_cvtph_epu16() {
25441        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25442        let r = _mm_cvtph_epu16(a);
25443        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25444        assert_eq_m128i(r, e);
25445    }
25446
25447    #[simd_test(enable = "avx512fp16,avx512vl")]
25448    unsafe fn test_mm_mask_cvtph_epu16() {
25449        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25450        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25451        let r = _mm_mask_cvtph_epu16(src, 0b01010101, a);
25452        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25453        assert_eq_m128i(r, e);
25454    }
25455
25456    #[simd_test(enable = "avx512fp16,avx512vl")]
25457    unsafe fn test_mm_maskz_cvtph_epu16() {
25458        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25459        let r = _mm_maskz_cvtph_epu16(0b01010101, a);
25460        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25461        assert_eq_m128i(r, e);
25462    }
25463
25464    #[simd_test(enable = "avx512fp16,avx512vl")]
25465    unsafe fn test_mm256_cvtph_epu16() {
25466        let a = _mm256_set_ph(
25467            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25468        );
25469        let r = _mm256_cvtph_epu16(a);
25470        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25471        assert_eq_m256i(r, e);
25472    }
25473
25474    #[simd_test(enable = "avx512fp16,avx512vl")]
25475    unsafe fn test_mm256_mask_cvtph_epu16() {
25476        let a = _mm256_set_ph(
25477            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25478        );
25479        let src = _mm256_set_epi16(
25480            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25481        );
25482        let r = _mm256_mask_cvtph_epu16(src, 0b0101010101010101, a);
25483        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25484        assert_eq_m256i(r, e);
25485    }
25486
25487    #[simd_test(enable = "avx512fp16,avx512vl")]
25488    unsafe fn test_mm256_maskz_cvtph_epu16() {
25489        let a = _mm256_set_ph(
25490            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25491        );
25492        let r = _mm256_maskz_cvtph_epu16(0b0101010101010101, a);
25493        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25494        assert_eq_m256i(r, e);
25495    }
25496
25497    #[simd_test(enable = "avx512fp16")]
25498    unsafe fn test_mm512_cvtph_epu16() {
25499        let a = _mm512_set_ph(
25500            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25501            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25502            31.0, 32.0,
25503        );
25504        let r = _mm512_cvtph_epu16(a);
25505        let e = _mm512_set_epi16(
25506            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25507            25, 26, 27, 28, 29, 30, 31, 32,
25508        );
25509        assert_eq_m512i(r, e);
25510    }
25511
25512    #[simd_test(enable = "avx512fp16")]
25513    unsafe fn test_mm512_mask_cvtph_epu16() {
25514        let a = _mm512_set_ph(
25515            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25516            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25517            31.0, 32.0,
25518        );
25519        let src = _mm512_set_epi16(
25520            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25521            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25522        );
25523        let r = _mm512_mask_cvtph_epu16(src, 0b01010101010101010101010101010101, a);
25524        let e = _mm512_set_epi16(
25525            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25526            24, 34, 26, 36, 28, 38, 30, 40, 32,
25527        );
25528        assert_eq_m512i(r, e);
25529    }
25530
25531    #[simd_test(enable = "avx512fp16")]
25532    unsafe fn test_mm512_maskz_cvtph_epu16() {
25533        let a = _mm512_set_ph(
25534            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25535            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25536            31.0, 32.0,
25537        );
25538        let r = _mm512_maskz_cvtph_epu16(0b01010101010101010101010101010101, a);
25539        let e = _mm512_set_epi16(
25540            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25541            0, 28, 0, 30, 0, 32,
25542        );
25543        assert_eq_m512i(r, e);
25544    }
25545
25546    #[simd_test(enable = "avx512fp16")]
25547    unsafe fn test_mm512_cvt_roundph_epu16() {
25548        let a = _mm512_set_ph(
25549            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25550            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25551            31.0, 32.0,
25552        );
25553        let r = _mm512_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25554        let e = _mm512_set_epi16(
25555            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25556            25, 26, 27, 28, 29, 30, 31, 32,
25557        );
25558        assert_eq_m512i(r, e);
25559    }
25560
25561    #[simd_test(enable = "avx512fp16")]
25562    unsafe fn test_mm512_mask_cvt_roundph_epu16() {
25563        let a = _mm512_set_ph(
25564            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25565            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25566            31.0, 32.0,
25567        );
25568        let src = _mm512_set_epi16(
25569            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25570            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25571        );
25572        let r = _mm512_mask_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25573            src,
25574            0b01010101010101010101010101010101,
25575            a,
25576        );
25577        let e = _mm512_set_epi16(
25578            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25579            24, 34, 26, 36, 28, 38, 30, 40, 32,
25580        );
25581        assert_eq_m512i(r, e);
25582    }
25583
25584    #[simd_test(enable = "avx512fp16")]
25585    unsafe fn test_mm512_maskz_cvt_roundph_epu16() {
25586        let a = _mm512_set_ph(
25587            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25588            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25589            31.0, 32.0,
25590        );
25591        let r = _mm512_maskz_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25592            0b01010101010101010101010101010101,
25593            a,
25594        );
25595        let e = _mm512_set_epi16(
25596            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25597            0, 28, 0, 30, 0, 32,
25598        );
25599        assert_eq_m512i(r, e);
25600    }
25601
25602    #[simd_test(enable = "avx512fp16,avx512vl")]
25603    unsafe fn test_mm_cvttph_epi16() {
25604        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25605        let r = _mm_cvttph_epi16(a);
25606        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25607        assert_eq_m128i(r, e);
25608    }
25609
25610    #[simd_test(enable = "avx512fp16,avx512vl")]
25611    unsafe fn test_mm_mask_cvttph_epi16() {
25612        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25613        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25614        let r = _mm_mask_cvttph_epi16(src, 0b01010101, a);
25615        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25616        assert_eq_m128i(r, e);
25617    }
25618
25619    #[simd_test(enable = "avx512fp16,avx512vl")]
25620    unsafe fn test_mm_maskz_cvttph_epi16() {
25621        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25622        let r = _mm_maskz_cvttph_epi16(0b01010101, a);
25623        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25624        assert_eq_m128i(r, e);
25625    }
25626
25627    #[simd_test(enable = "avx512fp16,avx512vl")]
25628    unsafe fn test_mm256_cvttph_epi16() {
25629        let a = _mm256_set_ph(
25630            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25631        );
25632        let r = _mm256_cvttph_epi16(a);
25633        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25634        assert_eq_m256i(r, e);
25635    }
25636
25637    #[simd_test(enable = "avx512fp16,avx512vl")]
25638    unsafe fn test_mm256_mask_cvttph_epi16() {
25639        let a = _mm256_set_ph(
25640            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25641        );
25642        let src = _mm256_set_epi16(
25643            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25644        );
25645        let r = _mm256_mask_cvttph_epi16(src, 0b0101010101010101, a);
25646        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25647        assert_eq_m256i(r, e);
25648    }
25649
25650    #[simd_test(enable = "avx512fp16,avx512vl")]
25651    unsafe fn test_mm256_maskz_cvttph_epi16() {
25652        let a = _mm256_set_ph(
25653            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25654        );
25655        let r = _mm256_maskz_cvttph_epi16(0b0101010101010101, a);
25656        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25657        assert_eq_m256i(r, e);
25658    }
25659
25660    #[simd_test(enable = "avx512fp16")]
25661    unsafe fn test_mm512_cvttph_epi16() {
25662        let a = _mm512_set_ph(
25663            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25664            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25665            31.0, 32.0,
25666        );
25667        let r = _mm512_cvttph_epi16(a);
25668        let e = _mm512_set_epi16(
25669            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25670            25, 26, 27, 28, 29, 30, 31, 32,
25671        );
25672        assert_eq_m512i(r, e);
25673    }
25674
25675    #[simd_test(enable = "avx512fp16")]
25676    unsafe fn test_mm512_mask_cvttph_epi16() {
25677        let a = _mm512_set_ph(
25678            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25679            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25680            31.0, 32.0,
25681        );
25682        let src = _mm512_set_epi16(
25683            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25684            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25685        );
25686        let r = _mm512_mask_cvttph_epi16(src, 0b01010101010101010101010101010101, a);
25687        let e = _mm512_set_epi16(
25688            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25689            24, 34, 26, 36, 28, 38, 30, 40, 32,
25690        );
25691        assert_eq_m512i(r, e);
25692    }
25693
25694    #[simd_test(enable = "avx512fp16")]
25695    unsafe fn test_mm512_maskz_cvttph_epi16() {
25696        let a = _mm512_set_ph(
25697            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25698            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25699            31.0, 32.0,
25700        );
25701        let r = _mm512_maskz_cvttph_epi16(0b01010101010101010101010101010101, a);
25702        let e = _mm512_set_epi16(
25703            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25704            0, 28, 0, 30, 0, 32,
25705        );
25706        assert_eq_m512i(r, e);
25707    }
25708
25709    #[simd_test(enable = "avx512fp16")]
25710    unsafe fn test_mm512_cvtt_roundph_epi16() {
25711        let a = _mm512_set_ph(
25712            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25713            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25714            31.0, 32.0,
25715        );
25716        let r = _mm512_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(a);
25717        let e = _mm512_set_epi16(
25718            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25719            25, 26, 27, 28, 29, 30, 31, 32,
25720        );
25721        assert_eq_m512i(r, e);
25722    }
25723
25724    #[simd_test(enable = "avx512fp16")]
25725    unsafe fn test_mm512_mask_cvtt_roundph_epi16() {
25726        let a = _mm512_set_ph(
25727            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25728            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25729            31.0, 32.0,
25730        );
25731        let src = _mm512_set_epi16(
25732            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25733            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25734        );
25735        let r = _mm512_mask_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
25736            src,
25737            0b01010101010101010101010101010101,
25738            a,
25739        );
25740        let e = _mm512_set_epi16(
25741            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25742            24, 34, 26, 36, 28, 38, 30, 40, 32,
25743        );
25744        assert_eq_m512i(r, e);
25745    }
25746
25747    #[simd_test(enable = "avx512fp16")]
25748    unsafe fn test_mm512_maskz_cvtt_roundph_epi16() {
25749        let a = _mm512_set_ph(
25750            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25751            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25752            31.0, 32.0,
25753        );
25754        let r = _mm512_maskz_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
25755            0b01010101010101010101010101010101,
25756            a,
25757        );
25758        let e = _mm512_set_epi16(
25759            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25760            0, 28, 0, 30, 0, 32,
25761        );
25762        assert_eq_m512i(r, e);
25763    }
25764
25765    #[simd_test(enable = "avx512fp16,avx512vl")]
25766    unsafe fn test_mm_cvttph_epu16() {
25767        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25768        let r = _mm_cvttph_epu16(a);
25769        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25770        assert_eq_m128i(r, e);
25771    }
25772
25773    #[simd_test(enable = "avx512fp16,avx512vl")]
25774    unsafe fn test_mm_mask_cvttph_epu16() {
25775        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25776        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25777        let r = _mm_mask_cvttph_epu16(src, 0b01010101, a);
25778        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25779        assert_eq_m128i(r, e);
25780    }
25781
25782    #[simd_test(enable = "avx512fp16,avx512vl")]
25783    unsafe fn test_mm_maskz_cvttph_epu16() {
25784        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25785        let r = _mm_maskz_cvttph_epu16(0b01010101, a);
25786        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25787        assert_eq_m128i(r, e);
25788    }
25789
25790    #[simd_test(enable = "avx512fp16,avx512vl")]
25791    unsafe fn test_mm256_cvttph_epu16() {
25792        let a = _mm256_set_ph(
25793            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25794        );
25795        let r = _mm256_cvttph_epu16(a);
25796        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25797        assert_eq_m256i(r, e);
25798    }
25799
25800    #[simd_test(enable = "avx512fp16,avx512vl")]
25801    unsafe fn test_mm256_mask_cvttph_epu16() {
25802        let a = _mm256_set_ph(
25803            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25804        );
25805        let src = _mm256_set_epi16(
25806            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25807        );
25808        let r = _mm256_mask_cvttph_epu16(src, 0b0101010101010101, a);
25809        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25810        assert_eq_m256i(r, e);
25811    }
25812
25813    #[simd_test(enable = "avx512fp16,avx512vl")]
25814    unsafe fn test_mm256_maskz_cvttph_epu16() {
25815        let a = _mm256_set_ph(
25816            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25817        );
25818        let r = _mm256_maskz_cvttph_epu16(0b0101010101010101, a);
25819        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25820        assert_eq_m256i(r, e);
25821    }
25822
25823    #[simd_test(enable = "avx512fp16")]
25824    unsafe fn test_mm512_cvttph_epu16() {
25825        let a = _mm512_set_ph(
25826            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25827            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25828            31.0, 32.0,
25829        );
25830        let r = _mm512_cvttph_epu16(a);
25831        let e = _mm512_set_epi16(
25832            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25833            25, 26, 27, 28, 29, 30, 31, 32,
25834        );
25835        assert_eq_m512i(r, e);
25836    }
25837
25838    #[simd_test(enable = "avx512fp16")]
25839    unsafe fn test_mm512_mask_cvttph_epu16() {
25840        let a = _mm512_set_ph(
25841            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25842            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25843            31.0, 32.0,
25844        );
25845        let src = _mm512_set_epi16(
25846            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25847            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25848        );
25849        let r = _mm512_mask_cvttph_epu16(src, 0b01010101010101010101010101010101, a);
25850        let e = _mm512_set_epi16(
25851            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25852            24, 34, 26, 36, 28, 38, 30, 40, 32,
25853        );
25854        assert_eq_m512i(r, e);
25855    }
25856
25857    #[simd_test(enable = "avx512fp16")]
25858    unsafe fn test_mm512_maskz_cvttph_epu16() {
25859        let a = _mm512_set_ph(
25860            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25861            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25862            31.0, 32.0,
25863        );
25864        let r = _mm512_maskz_cvttph_epu16(0b01010101010101010101010101010101, a);
25865        let e = _mm512_set_epi16(
25866            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25867            0, 28, 0, 30, 0, 32,
25868        );
25869        assert_eq_m512i(r, e);
25870    }
25871
25872    #[simd_test(enable = "avx512fp16")]
25873    unsafe fn test_mm512_cvtt_roundph_epu16() {
25874        let a = _mm512_set_ph(
25875            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25876            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25877            31.0, 32.0,
25878        );
25879        let r = _mm512_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(a);
25880        let e = _mm512_set_epi16(
25881            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25882            25, 26, 27, 28, 29, 30, 31, 32,
25883        );
25884        assert_eq_m512i(r, e);
25885    }
25886
25887    #[simd_test(enable = "avx512fp16")]
25888    unsafe fn test_mm512_mask_cvtt_roundph_epu16() {
25889        let a = _mm512_set_ph(
25890            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25891            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25892            31.0, 32.0,
25893        );
25894        let src = _mm512_set_epi16(
25895            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25896            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25897        );
25898        let r = _mm512_mask_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(
25899            src,
25900            0b01010101010101010101010101010101,
25901            a,
25902        );
25903        let e = _mm512_set_epi16(
25904            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25905            24, 34, 26, 36, 28, 38, 30, 40, 32,
25906        );
25907        assert_eq_m512i(r, e);
25908    }
25909
25910    #[simd_test(enable = "avx512fp16")]
25911    unsafe fn test_mm512_maskz_cvtt_roundph_epu16() {
25912        let a = _mm512_set_ph(
25913            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25914            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25915            31.0, 32.0,
25916        );
25917        let r = _mm512_maskz_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(
25918            0b01010101010101010101010101010101,
25919            a,
25920        );
25921        let e = _mm512_set_epi16(
25922            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25923            0, 28, 0, 30, 0, 32,
25924        );
25925        assert_eq_m512i(r, e);
25926    }
25927
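    // Conversions from packed f16 elements to signed 32-bit integers:
    // _mm*_cvtph_epi32 plus the mask/maskz, rounding, and scalar (_mm_cvtsh_i32) variants.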
25928    #[simd_test(enable = "avx512fp16,avx512vl")]
25929    unsafe fn test_mm_cvtph_epi32() {
25930        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25931        let r = _mm_cvtph_epi32(a);
25932        let e = _mm_set_epi32(1, 2, 3, 4);
25933        assert_eq_m128i(r, e);
25934    }
25935
25936    #[simd_test(enable = "avx512fp16,avx512vl")]
25937    unsafe fn test_mm_mask_cvtph_epi32() {
25938        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25939        let src = _mm_set_epi32(10, 11, 12, 13);
25940        let r = _mm_mask_cvtph_epi32(src, 0b0101, a);
25941        let e = _mm_set_epi32(10, 2, 12, 4);
25942        assert_eq_m128i(r, e);
25943    }
25944
25945    #[simd_test(enable = "avx512fp16,avx512vl")]
25946    unsafe fn test_mm_maskz_cvtph_epi32() {
25947        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25948        let r = _mm_maskz_cvtph_epi32(0b0101, a);
25949        let e = _mm_set_epi32(0, 2, 0, 4);
25950        assert_eq_m128i(r, e);
25951    }
25952
25953    #[simd_test(enable = "avx512fp16,avx512vl")]
25954    unsafe fn test_mm256_cvtph_epi32() {
25955        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25956        let r = _mm256_cvtph_epi32(a);
25957        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
25958        assert_eq_m256i(r, e);
25959    }
25960
25961    #[simd_test(enable = "avx512fp16,avx512vl")]
25962    unsafe fn test_mm256_mask_cvtph_epi32() {
25963        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25964        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
25965        let r = _mm256_mask_cvtph_epi32(src, 0b01010101, a);
25966        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
25967        assert_eq_m256i(r, e);
25968    }
25969
25970    #[simd_test(enable = "avx512fp16,avx512vl")]
25971    unsafe fn test_mm256_maskz_cvtph_epi32() {
25972        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25973        let r = _mm256_maskz_cvtph_epi32(0b01010101, a);
25974        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
25975        assert_eq_m256i(r, e);
25976    }
25977
25978    #[simd_test(enable = "avx512fp16")]
25979    unsafe fn test_mm512_cvtph_epi32() {
25980        let a = _mm256_set_ph(
25981            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25982        );
25983        let r = _mm512_cvtph_epi32(a);
25984        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25985        assert_eq_m512i(r, e);
25986    }
25987
25988    #[simd_test(enable = "avx512fp16")]
25989    unsafe fn test_mm512_mask_cvtph_epi32() {
25990        let a = _mm256_set_ph(
25991            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25992        );
25993        let src = _mm512_set_epi32(
25994            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25995        );
25996        let r = _mm512_mask_cvtph_epi32(src, 0b0101010101010101, a);
25997        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25998        assert_eq_m512i(r, e);
25999    }
26000
26001    #[simd_test(enable = "avx512fp16")]
26002    unsafe fn test_mm512_maskz_cvtph_epi32() {
26003        let a = _mm256_set_ph(
26004            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26005        );
26006        let r = _mm512_maskz_cvtph_epi32(0b0101010101010101, a);
26007        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26008        assert_eq_m512i(r, e);
26009    }
26010
26011    #[simd_test(enable = "avx512fp16")]
26012    unsafe fn test_mm512_cvt_roundph_epi32() {
26013        let a = _mm256_set_ph(
26014            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26015        );
26016        let r = _mm512_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26017        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26018        assert_eq_m512i(r, e);
26019    }
26020
26021    #[simd_test(enable = "avx512fp16")]
26022    unsafe fn test_mm512_mask_cvt_roundph_epi32() {
26023        let a = _mm256_set_ph(
26024            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26025        );
26026        let src = _mm512_set_epi32(
26027            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26028        );
26029        let r = _mm512_mask_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26030            src,
26031            0b0101010101010101,
26032            a,
26033        );
26034        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26035        assert_eq_m512i(r, e);
26036    }
26037
26038    #[simd_test(enable = "avx512fp16")]
26039    unsafe fn test_mm512_maskz_cvt_roundph_epi32() {
26040        let a = _mm256_set_ph(
26041            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26042        );
26043        let r = _mm512_maskz_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26044            0b0101010101010101,
26045            a,
26046        );
26047        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26048        assert_eq_m512i(r, e);
26049    }
26050
26051    #[simd_test(enable = "avx512fp16")]
26052    unsafe fn test_mm_cvtsh_i32() {
26053        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26054        let r = _mm_cvtsh_i32(a);
26055        assert_eq!(r, 1);
26056    }
26057
26058    #[simd_test(enable = "avx512fp16")]
26059    unsafe fn test_mm_cvt_roundsh_i32() {
26060        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26061        let r = _mm_cvt_roundsh_i32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26062        assert_eq!(r, 1);
26063    }
26064
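    // Conversions from packed f16 elements to unsigned 32-bit integers:
    // _mm*_cvtph_epu32 plus the mask/maskz, rounding, and scalar (_mm_cvtsh_u32) variants.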
26065    #[simd_test(enable = "avx512fp16,avx512vl")]
26066    unsafe fn test_mm_cvtph_epu32() {
26067        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26068        let r = _mm_cvtph_epu32(a);
26069        let e = _mm_set_epi32(1, 2, 3, 4);
26070        assert_eq_m128i(r, e);
26071    }
26072
26073    #[simd_test(enable = "avx512fp16,avx512vl")]
26074    unsafe fn test_mm_mask_cvtph_epu32() {
26075        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26076        let src = _mm_set_epi32(10, 11, 12, 13);
26077        let r = _mm_mask_cvtph_epu32(src, 0b0101, a);
26078        let e = _mm_set_epi32(10, 2, 12, 4);
26079        assert_eq_m128i(r, e);
26080    }
26081
26082    #[simd_test(enable = "avx512fp16,avx512vl")]
26083    unsafe fn test_mm_maskz_cvtph_epu32() {
26084        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26085        let r = _mm_maskz_cvtph_epu32(0b0101, a);
26086        let e = _mm_set_epi32(0, 2, 0, 4);
26087        assert_eq_m128i(r, e);
26088    }
26089
26090    #[simd_test(enable = "avx512fp16,avx512vl")]
26091    unsafe fn test_mm256_cvtph_epu32() {
26092        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26093        let r = _mm256_cvtph_epu32(a);
26094        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
26095        assert_eq_m256i(r, e);
26096    }
26097
26098    #[simd_test(enable = "avx512fp16,avx512vl")]
26099    unsafe fn test_mm256_mask_cvtph_epu32() {
26100        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26101        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
26102        let r = _mm256_mask_cvtph_epu32(src, 0b01010101, a);
26103        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
26104        assert_eq_m256i(r, e);
26105    }
26106
26107    #[simd_test(enable = "avx512fp16,avx512vl")]
26108    unsafe fn test_mm256_maskz_cvtph_epu32() {
26109        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26110        let r = _mm256_maskz_cvtph_epu32(0b01010101, a);
26111        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
26112        assert_eq_m256i(r, e);
26113    }
26114
26115    #[simd_test(enable = "avx512fp16")]
26116    unsafe fn test_mm512_cvtph_epu32() {
26117        let a = _mm256_set_ph(
26118            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26119        );
26120        let r = _mm512_cvtph_epu32(a);
26121        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26122        assert_eq_m512i(r, e);
26123    }
26124
26125    #[simd_test(enable = "avx512fp16")]
26126    unsafe fn test_mm512_mask_cvtph_epu32() {
26127        let a = _mm256_set_ph(
26128            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26129        );
26130        let src = _mm512_set_epi32(
26131            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26132        );
26133        let r = _mm512_mask_cvtph_epu32(src, 0b0101010101010101, a);
26134        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26135        assert_eq_m512i(r, e);
26136    }
26137
26138    #[simd_test(enable = "avx512fp16")]
26139    unsafe fn test_mm512_maskz_cvtph_epu32() {
26140        let a = _mm256_set_ph(
26141            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26142        );
26143        let r = _mm512_maskz_cvtph_epu32(0b0101010101010101, a);
26144        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26145        assert_eq_m512i(r, e);
26146    }
26147
26148    #[simd_test(enable = "avx512fp16")]
26149    unsafe fn test_mm512_cvt_roundph_epu32() {
26150        let a = _mm256_set_ph(
26151            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26152        );
26153        let r = _mm512_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26154        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26155        assert_eq_m512i(r, e);
26156    }
26157
26158    #[simd_test(enable = "avx512fp16")]
26159    unsafe fn test_mm512_mask_cvt_roundph_epu32() {
26160        let a = _mm256_set_ph(
26161            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26162        );
26163        let src = _mm512_set_epi32(
26164            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26165        );
26166        let r = _mm512_mask_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26167            src,
26168            0b0101010101010101,
26169            a,
26170        );
26171        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26172        assert_eq_m512i(r, e);
26173    }
26174
26175    #[simd_test(enable = "avx512fp16")]
26176    unsafe fn test_mm512_maskz_cvt_roundph_epu32() {
26177        let a = _mm256_set_ph(
26178            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26179        );
26180        let r = _mm512_maskz_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26181            0b0101010101010101,
26182            a,
26183        );
26184        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26185        assert_eq_m512i(r, e);
26186    }
26187
26188    #[simd_test(enable = "avx512fp16")]
26189    unsafe fn test_mm_cvtsh_u32() {
26190        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26191        let r = _mm_cvtsh_u32(a);
26192        assert_eq!(r, 1);
26193    }
26194
26195    #[simd_test(enable = "avx512fp16")]
26196    unsafe fn test_mm_cvt_roundsh_u32() {
26197        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26198        let r = _mm_cvt_roundsh_u32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26199        assert_eq!(r, 1);
26200    }
26201
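    // Truncating conversions from packed f16 elements to signed 32-bit integers:
    // _mm*_cvttph_epi32 plus the mask/maskz, _MM_FROUND_NO_EXC, and scalar (_mm_cvttsh_i32) variants.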
26202    #[simd_test(enable = "avx512fp16,avx512vl")]
26203    unsafe fn test_mm_cvttph_epi32() {
26204        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26205        let r = _mm_cvttph_epi32(a);
26206        let e = _mm_set_epi32(1, 2, 3, 4);
26207        assert_eq_m128i(r, e);
26208    }
26209
26210    #[simd_test(enable = "avx512fp16,avx512vl")]
26211    unsafe fn test_mm_mask_cvttph_epi32() {
26212        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26213        let src = _mm_set_epi32(10, 11, 12, 13);
26214        let r = _mm_mask_cvttph_epi32(src, 0b0101, a);
26215        let e = _mm_set_epi32(10, 2, 12, 4);
26216        assert_eq_m128i(r, e);
26217    }
26218
26219    #[simd_test(enable = "avx512fp16,avx512vl")]
26220    unsafe fn test_mm_maskz_cvttph_epi32() {
26221        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26222        let r = _mm_maskz_cvttph_epi32(0b0101, a);
26223        let e = _mm_set_epi32(0, 2, 0, 4);
26224        assert_eq_m128i(r, e);
26225    }
26226
26227    #[simd_test(enable = "avx512fp16,avx512vl")]
26228    unsafe fn test_mm256_cvttph_epi32() {
26229        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26230        let r = _mm256_cvttph_epi32(a);
26231        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
26232        assert_eq_m256i(r, e);
26233    }
26234
26235    #[simd_test(enable = "avx512fp16,avx512vl")]
26236    unsafe fn test_mm256_mask_cvttph_epi32() {
26237        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26238        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
26239        let r = _mm256_mask_cvttph_epi32(src, 0b01010101, a);
26240        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
26241        assert_eq_m256i(r, e);
26242    }
26243
26244    #[simd_test(enable = "avx512fp16,avx512vl")]
26245    unsafe fn test_mm256_maskz_cvttph_epi32() {
26246        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26247        let r = _mm256_maskz_cvttph_epi32(0b01010101, a);
26248        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
26249        assert_eq_m256i(r, e);
26250    }
26251
26252    #[simd_test(enable = "avx512fp16")]
26253    unsafe fn test_mm512_cvttph_epi32() {
26254        let a = _mm256_set_ph(
26255            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26256        );
26257        let r = _mm512_cvttph_epi32(a);
26258        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26259        assert_eq_m512i(r, e);
26260    }
26261
26262    #[simd_test(enable = "avx512fp16")]
26263    unsafe fn test_mm512_mask_cvttph_epi32() {
26264        let a = _mm256_set_ph(
26265            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26266        );
26267        let src = _mm512_set_epi32(
26268            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26269        );
26270        let r = _mm512_mask_cvttph_epi32(src, 0b0101010101010101, a);
26271        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26272        assert_eq_m512i(r, e);
26273    }
26274
26275    #[simd_test(enable = "avx512fp16")]
26276    unsafe fn test_mm512_maskz_cvttph_epi32() {
26277        let a = _mm256_set_ph(
26278            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26279        );
26280        let r = _mm512_maskz_cvttph_epi32(0b0101010101010101, a);
26281        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26282        assert_eq_m512i(r, e);
26283    }
26284
26285    #[simd_test(enable = "avx512fp16")]
26286    unsafe fn test_mm512_cvtt_roundph_epi32() {
26287        let a = _mm256_set_ph(
26288            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26289        );
26290        let r = _mm512_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(a);
26291        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26292        assert_eq_m512i(r, e);
26293    }
26294
26295    #[simd_test(enable = "avx512fp16")]
26296    unsafe fn test_mm512_mask_cvtt_roundph_epi32() {
26297        let a = _mm256_set_ph(
26298            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26299        );
26300        let src = _mm512_set_epi32(
26301            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26302        );
26303        let r = _mm512_mask_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
26304        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26305        assert_eq_m512i(r, e);
26306    }
26307
26308    #[simd_test(enable = "avx512fp16")]
26309    unsafe fn test_mm512_maskz_cvtt_roundph_epi32() {
26310        let a = _mm256_set_ph(
26311            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26312        );
26313        let r = _mm512_maskz_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
26314        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26315        assert_eq_m512i(r, e);
26316    }
26317
26318    #[simd_test(enable = "avx512fp16")]
26319    unsafe fn test_mm_cvttsh_i32() {
26320        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26321        let r = _mm_cvttsh_i32(a);
26322        assert_eq!(r, 1);
26323    }
26324
26325    #[simd_test(enable = "avx512fp16")]
26326    unsafe fn test_mm_cvtt_roundsh_i32() {
26327        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26328        let r = _mm_cvtt_roundsh_i32::<_MM_FROUND_NO_EXC>(a);
26329        assert_eq!(r, 1);
26330    }
26331
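    // Truncating conversions from packed f16 elements to unsigned 32-bit integers:
    // _mm*_cvttph_epu32 plus the mask/maskz, _MM_FROUND_NO_EXC, and scalar (_mm_cvttsh_u32) variants.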
26332    #[simd_test(enable = "avx512fp16,avx512vl")]
26333    unsafe fn test_mm_cvttph_epu32() {
26334        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26335        let r = _mm_cvttph_epu32(a);
26336        let e = _mm_set_epi32(1, 2, 3, 4);
26337        assert_eq_m128i(r, e);
26338    }
26339
26340    #[simd_test(enable = "avx512fp16,avx512vl")]
26341    unsafe fn test_mm_mask_cvttph_epu32() {
26342        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26343        let src = _mm_set_epi32(10, 11, 12, 13);
26344        let r = _mm_mask_cvttph_epu32(src, 0b0101, a);
26345        let e = _mm_set_epi32(10, 2, 12, 4);
26346        assert_eq_m128i(r, e);
26347    }
26348
26349    #[simd_test(enable = "avx512fp16,avx512vl")]
26350    unsafe fn test_mm_maskz_cvttph_epu32() {
26351        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26352        let r = _mm_maskz_cvttph_epu32(0b0101, a);
26353        let e = _mm_set_epi32(0, 2, 0, 4);
26354        assert_eq_m128i(r, e);
26355    }
26356
26357    #[simd_test(enable = "avx512fp16,avx512vl")]
26358    unsafe fn test_mm256_cvttph_epu32() {
26359        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26360        let r = _mm256_cvttph_epu32(a);
26361        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
26362        assert_eq_m256i(r, e);
26363    }
26364
26365    #[simd_test(enable = "avx512fp16,avx512vl")]
26366    unsafe fn test_mm256_mask_cvttph_epu32() {
26367        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26368        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
26369        let r = _mm256_mask_cvttph_epu32(src, 0b01010101, a);
26370        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
26371        assert_eq_m256i(r, e);
26372    }
26373
26374    #[simd_test(enable = "avx512fp16,avx512vl")]
26375    unsafe fn test_mm256_maskz_cvttph_epu32() {
26376        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26377        let r = _mm256_maskz_cvttph_epu32(0b01010101, a);
26378        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
26379        assert_eq_m256i(r, e);
26380    }
26381
26382    #[simd_test(enable = "avx512fp16")]
26383    unsafe fn test_mm512_cvttph_epu32() {
26384        let a = _mm256_set_ph(
26385            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26386        );
26387        let r = _mm512_cvttph_epu32(a);
26388        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26389        assert_eq_m512i(r, e);
26390    }
26391
26392    #[simd_test(enable = "avx512fp16")]
26393    unsafe fn test_mm512_mask_cvttph_epu32() {
26394        let a = _mm256_set_ph(
26395            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26396        );
26397        let src = _mm512_set_epi32(
26398            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26399        );
26400        let r = _mm512_mask_cvttph_epu32(src, 0b0101010101010101, a);
26401        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26402        assert_eq_m512i(r, e);
26403    }
26404
26405    #[simd_test(enable = "avx512fp16")]
26406    unsafe fn test_mm512_maskz_cvttph_epu32() {
26407        let a = _mm256_set_ph(
26408            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26409        );
26410        let r = _mm512_maskz_cvttph_epu32(0b0101010101010101, a);
26411        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26412        assert_eq_m512i(r, e);
26413    }
26414
26415    #[simd_test(enable = "avx512fp16")]
26416    unsafe fn test_mm512_cvtt_roundph_epu32() {
26417        let a = _mm256_set_ph(
26418            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26419        );
26420        let r = _mm512_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(a);
26421        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26422        assert_eq_m512i(r, e);
26423    }
26424
26425    #[simd_test(enable = "avx512fp16")]
26426    unsafe fn test_mm512_mask_cvtt_roundph_epu32() {
26427        let a = _mm256_set_ph(
26428            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26429        );
26430        let src = _mm512_set_epi32(
26431            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26432        );
26433        let r = _mm512_mask_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
26434        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26435        assert_eq_m512i(r, e);
26436    }
26437
26438    #[simd_test(enable = "avx512fp16")]
26439    unsafe fn test_mm512_maskz_cvtt_roundph_epu32() {
26440        let a = _mm256_set_ph(
26441            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26442        );
26443        let r = _mm512_maskz_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
26444        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26445        assert_eq_m512i(r, e);
26446    }
26447
26448    #[simd_test(enable = "avx512fp16")]
26449    unsafe fn test_mm_cvttsh_u32() {
26450        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26451        let r = _mm_cvttsh_u32(a);
26452        assert_eq!(r, 1);
26453    }
26454
26455    #[simd_test(enable = "avx512fp16")]
26456    unsafe fn test_mm_cvtt_roundsh_u32() {
26457        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26458        let r = _mm_cvtt_roundsh_u32::<_MM_FROUND_NO_EXC>(a);
26459        assert_eq!(r, 1);
26460    }
26461
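    // Conversions from packed f16 elements to signed 64-bit integers
    // (_mm*_cvtph_epi64 plus the mask/maskz and rounding variants).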
26462    #[simd_test(enable = "avx512fp16,avx512vl")]
26463    unsafe fn test_mm_cvtph_epi64() {
26464        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26465        let r = _mm_cvtph_epi64(a);
26466        let e = _mm_set_epi64x(1, 2);
26467        assert_eq_m128i(r, e);
26468    }
26469
26470    #[simd_test(enable = "avx512fp16,avx512vl")]
26471    unsafe fn test_mm_mask_cvtph_epi64() {
26472        let src = _mm_set_epi64x(3, 4);
26473        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26474        let r = _mm_mask_cvtph_epi64(src, 0b01, a);
26475        let e = _mm_set_epi64x(3, 2);
26476        assert_eq_m128i(r, e);
26477    }
26478
26479    #[simd_test(enable = "avx512fp16,avx512vl")]
26480    unsafe fn test_mm_maskz_cvtph_epi64() {
26481        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26482        let r = _mm_maskz_cvtph_epi64(0b01, a);
26483        let e = _mm_set_epi64x(0, 2);
26484        assert_eq_m128i(r, e);
26485    }
26486
26487    #[simd_test(enable = "avx512fp16,avx512vl")]
26488    unsafe fn test_mm256_cvtph_epi64() {
26489        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26490        let r = _mm256_cvtph_epi64(a);
26491        let e = _mm256_set_epi64x(1, 2, 3, 4);
26492        assert_eq_m256i(r, e);
26493    }
26494
26495    #[simd_test(enable = "avx512fp16,avx512vl")]
26496    unsafe fn test_mm256_mask_cvtph_epi64() {
26497        let src = _mm256_set_epi64x(5, 6, 7, 8);
26498        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26499        let r = _mm256_mask_cvtph_epi64(src, 0b0101, a);
26500        let e = _mm256_set_epi64x(5, 2, 7, 4);
26501        assert_eq_m256i(r, e);
26502    }
26503
26504    #[simd_test(enable = "avx512fp16,avx512vl")]
26505    unsafe fn test_mm256_maskz_cvtph_epi64() {
26506        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26507        let r = _mm256_maskz_cvtph_epi64(0b0101, a);
26508        let e = _mm256_set_epi64x(0, 2, 0, 4);
26509        assert_eq_m256i(r, e);
26510    }
26511
26512    #[simd_test(enable = "avx512fp16")]
26513    unsafe fn test_mm512_cvtph_epi64() {
26514        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26515        let r = _mm512_cvtph_epi64(a);
26516        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26517        assert_eq_m512i(r, e);
26518    }
26519
26520    #[simd_test(enable = "avx512fp16")]
26521    unsafe fn test_mm512_mask_cvtph_epi64() {
26522        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26523        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26524        let r = _mm512_mask_cvtph_epi64(src, 0b01010101, a);
26525        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26526        assert_eq_m512i(r, e);
26527    }
26528
26529    #[simd_test(enable = "avx512fp16")]
26530    unsafe fn test_mm512_maskz_cvtph_epi64() {
26531        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26532        let r = _mm512_maskz_cvtph_epi64(0b01010101, a);
26533        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26534        assert_eq_m512i(r, e);
26535    }
26536
26537    #[simd_test(enable = "avx512fp16")]
26538    unsafe fn test_mm512_cvt_roundph_epi64() {
26539        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26540        let r = _mm512_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26541        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26542        assert_eq_m512i(r, e);
26543    }
26544
26545    #[simd_test(enable = "avx512fp16")]
26546    unsafe fn test_mm512_mask_cvt_roundph_epi64() {
26547        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26548        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26549        let r = _mm512_mask_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26550            src, 0b01010101, a,
26551        );
26552        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26553        assert_eq_m512i(r, e);
26554    }
26555
26556    #[simd_test(enable = "avx512fp16")]
26557    unsafe fn test_mm512_maskz_cvt_roundph_epi64() {
26558        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26559        let r = _mm512_maskz_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26560            0b01010101, a,
26561        );
26562        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26563        assert_eq_m512i(r, e);
26564    }
26565
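    // Conversions from packed f16 elements to unsigned 64-bit integers
    // (_mm*_cvtph_epu64 plus the mask/maskz and rounding variants).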
26566    #[simd_test(enable = "avx512fp16,avx512vl")]
26567    unsafe fn test_mm_cvtph_epu64() {
26568        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26569        let r = _mm_cvtph_epu64(a);
26570        let e = _mm_set_epi64x(1, 2);
26571        assert_eq_m128i(r, e);
26572    }
26573
26574    #[simd_test(enable = "avx512fp16,avx512vl")]
26575    unsafe fn test_mm_mask_cvtph_epu64() {
26576        let src = _mm_set_epi64x(3, 4);
26577        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26578        let r = _mm_mask_cvtph_epu64(src, 0b01, a);
26579        let e = _mm_set_epi64x(3, 2);
26580        assert_eq_m128i(r, e);
26581    }
26582
26583    #[simd_test(enable = "avx512fp16,avx512vl")]
26584    unsafe fn test_mm_maskz_cvtph_epu64() {
26585        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26586        let r = _mm_maskz_cvtph_epu64(0b01, a);
26587        let e = _mm_set_epi64x(0, 2);
26588        assert_eq_m128i(r, e);
26589    }
26590
26591    #[simd_test(enable = "avx512fp16,avx512vl")]
26592    unsafe fn test_mm256_cvtph_epu64() {
26593        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26594        let r = _mm256_cvtph_epu64(a);
26595        let e = _mm256_set_epi64x(1, 2, 3, 4);
26596        assert_eq_m256i(r, e);
26597    }
26598
26599    #[simd_test(enable = "avx512fp16,avx512vl")]
26600    unsafe fn test_mm256_mask_cvtph_epu64() {
26601        let src = _mm256_set_epi64x(5, 6, 7, 8);
26602        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26603        let r = _mm256_mask_cvtph_epu64(src, 0b0101, a);
26604        let e = _mm256_set_epi64x(5, 2, 7, 4);
26605        assert_eq_m256i(r, e);
26606    }
26607
26608    #[simd_test(enable = "avx512fp16,avx512vl")]
26609    unsafe fn test_mm256_maskz_cvtph_epu64() {
26610        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26611        let r = _mm256_maskz_cvtph_epu64(0b0101, a);
26612        let e = _mm256_set_epi64x(0, 2, 0, 4);
26613        assert_eq_m256i(r, e);
26614    }
26615
26616    #[simd_test(enable = "avx512fp16")]
26617    unsafe fn test_mm512_cvtph_epu64() {
26618        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26619        let r = _mm512_cvtph_epu64(a);
26620        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26621        assert_eq_m512i(r, e);
26622    }
26623
26624    #[simd_test(enable = "avx512fp16")]
26625    unsafe fn test_mm512_mask_cvtph_epu64() {
26626        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26627        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26628        let r = _mm512_mask_cvtph_epu64(src, 0b01010101, a);
26629        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26630        assert_eq_m512i(r, e);
26631    }
26632
26633    #[simd_test(enable = "avx512fp16")]
26634    unsafe fn test_mm512_maskz_cvtph_epu64() {
26635        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26636        let r = _mm512_maskz_cvtph_epu64(0b01010101, a);
26637        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26638        assert_eq_m512i(r, e);
26639    }
26640
26641    #[simd_test(enable = "avx512fp16")]
26642    unsafe fn test_mm512_cvt_roundph_epu64() {
26643        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26644        let r = _mm512_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26645        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26646        assert_eq_m512i(r, e);
26647    }
26648
26649    #[simd_test(enable = "avx512fp16")]
26650    unsafe fn test_mm512_mask_cvt_roundph_epu64() {
26651        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26652        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26653        let r = _mm512_mask_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26654            src, 0b01010101, a,
26655        );
26656        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26657        assert_eq_m512i(r, e);
26658    }
26659
26660    #[simd_test(enable = "avx512fp16")]
26661    unsafe fn test_mm512_maskz_cvt_roundph_epu64() {
26662        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26663        let r = _mm512_maskz_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26664            0b01010101, a,
26665        );
26666        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26667        assert_eq_m512i(r, e);
26668    }
26669
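    // Truncating conversions from packed f16 elements to signed 64-bit integers
    // (_mm*_cvttph_epi64 plus the mask/maskz and _MM_FROUND_NO_EXC variants).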
26670    #[simd_test(enable = "avx512fp16,avx512vl")]
26671    unsafe fn test_mm_cvttph_epi64() {
26672        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26673        let r = _mm_cvttph_epi64(a);
26674        let e = _mm_set_epi64x(1, 2);
26675        assert_eq_m128i(r, e);
26676    }
26677
26678    #[simd_test(enable = "avx512fp16,avx512vl")]
26679    unsafe fn test_mm_mask_cvttph_epi64() {
26680        let src = _mm_set_epi64x(3, 4);
26681        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26682        let r = _mm_mask_cvttph_epi64(src, 0b01, a);
26683        let e = _mm_set_epi64x(3, 2);
26684        assert_eq_m128i(r, e);
26685    }
26686
26687    #[simd_test(enable = "avx512fp16,avx512vl")]
26688    unsafe fn test_mm_maskz_cvttph_epi64() {
26689        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26690        let r = _mm_maskz_cvttph_epi64(0b01, a);
26691        let e = _mm_set_epi64x(0, 2);
26692        assert_eq_m128i(r, e);
26693    }
26694
26695    #[simd_test(enable = "avx512fp16,avx512vl")]
26696    unsafe fn test_mm256_cvttph_epi64() {
26697        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26698        let r = _mm256_cvttph_epi64(a);
26699        let e = _mm256_set_epi64x(1, 2, 3, 4);
26700        assert_eq_m256i(r, e);
26701    }
26702
26703    #[simd_test(enable = "avx512fp16,avx512vl")]
26704    unsafe fn test_mm256_mask_cvttph_epi64() {
26705        let src = _mm256_set_epi64x(5, 6, 7, 8);
26706        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26707        let r = _mm256_mask_cvttph_epi64(src, 0b0101, a);
26708        let e = _mm256_set_epi64x(5, 2, 7, 4);
26709        assert_eq_m256i(r, e);
26710    }
26711
26712    #[simd_test(enable = "avx512fp16,avx512vl")]
26713    unsafe fn test_mm256_maskz_cvttph_epi64() {
26714        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26715        let r = _mm256_maskz_cvttph_epi64(0b0101, a);
26716        let e = _mm256_set_epi64x(0, 2, 0, 4);
26717        assert_eq_m256i(r, e);
26718    }
26719
26720    #[simd_test(enable = "avx512fp16")]
26721    unsafe fn test_mm512_cvttph_epi64() {
26722        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26723        let r = _mm512_cvttph_epi64(a);
26724        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26725        assert_eq_m512i(r, e);
26726    }
26727
26728    #[simd_test(enable = "avx512fp16")]
26729    unsafe fn test_mm512_mask_cvttph_epi64() {
26730        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26731        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26732        let r = _mm512_mask_cvttph_epi64(src, 0b01010101, a);
26733        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26734        assert_eq_m512i(r, e);
26735    }
26736
26737    #[simd_test(enable = "avx512fp16")]
26738    unsafe fn test_mm512_maskz_cvttph_epi64() {
26739        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26740        let r = _mm512_maskz_cvttph_epi64(0b01010101, a);
26741        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26742        assert_eq_m512i(r, e);
26743    }
26744
26745    #[simd_test(enable = "avx512fp16")]
26746    unsafe fn test_mm512_cvtt_roundph_epi64() {
26747        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26748        let r = _mm512_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(a);
26749        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26750        assert_eq_m512i(r, e);
26751    }
26752
26753    #[simd_test(enable = "avx512fp16")]
26754    unsafe fn test_mm512_mask_cvtt_roundph_epi64() {
26755        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26756        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26757        let r = _mm512_mask_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
26758        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26759        assert_eq_m512i(r, e);
26760    }
26761
26762    #[simd_test(enable = "avx512fp16")]
26763    unsafe fn test_mm512_maskz_cvtt_roundph_epi64() {
26764        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26765        let r = _mm512_maskz_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(0b01010101, a);
26766        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26767        assert_eq_m512i(r, e);
26768    }
26769
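    // Truncating conversions from packed f16 elements to unsigned 64-bit integers
    // (_mm*_cvttph_epu64 plus the mask/maskz and _MM_FROUND_NO_EXC variants).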
26770    #[simd_test(enable = "avx512fp16,avx512vl")]
26771    unsafe fn test_mm_cvttph_epu64() {
26772        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26773        let r = _mm_cvttph_epu64(a);
26774        let e = _mm_set_epi64x(1, 2);
26775        assert_eq_m128i(r, e);
26776    }
26777
26778    #[simd_test(enable = "avx512fp16,avx512vl")]
26779    unsafe fn test_mm_mask_cvttph_epu64() {
26780        let src = _mm_set_epi64x(3, 4);
26781        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26782        let r = _mm_mask_cvttph_epu64(src, 0b01, a);
26783        let e = _mm_set_epi64x(3, 2);
26784        assert_eq_m128i(r, e);
26785    }
26786
26787    #[simd_test(enable = "avx512fp16,avx512vl")]
26788    unsafe fn test_mm_maskz_cvttph_epu64() {
26789        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26790        let r = _mm_maskz_cvttph_epu64(0b01, a);
26791        let e = _mm_set_epi64x(0, 2);
26792        assert_eq_m128i(r, e);
26793    }
26794
26795    #[simd_test(enable = "avx512fp16,avx512vl")]
26796    unsafe fn test_mm256_cvttph_epu64() {
26797        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26798        let r = _mm256_cvttph_epu64(a);
26799        let e = _mm256_set_epi64x(1, 2, 3, 4);
26800        assert_eq_m256i(r, e);
26801    }
26802
26803    #[simd_test(enable = "avx512fp16,avx512vl")]
26804    unsafe fn test_mm256_mask_cvttph_epu64() {
26805        let src = _mm256_set_epi64x(5, 6, 7, 8);
26806        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26807        let r = _mm256_mask_cvttph_epu64(src, 0b0101, a);
26808        let e = _mm256_set_epi64x(5, 2, 7, 4);
26809        assert_eq_m256i(r, e);
26810    }
26811
26812    #[simd_test(enable = "avx512fp16,avx512vl")]
26813    unsafe fn test_mm256_maskz_cvttph_epu64() {
26814        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26815        let r = _mm256_maskz_cvttph_epu64(0b0101, a);
26816        let e = _mm256_set_epi64x(0, 2, 0, 4);
26817        assert_eq_m256i(r, e);
26818    }
26819
26820    #[simd_test(enable = "avx512fp16")]
26821    unsafe fn test_mm512_cvttph_epu64() {
26822        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26823        let r = _mm512_cvttph_epu64(a);
26824        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26825        assert_eq_m512i(r, e);
26826    }
26827
26828    #[simd_test(enable = "avx512fp16")]
26829    unsafe fn test_mm512_mask_cvttph_epu64() {
26830        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26831        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26832        let r = _mm512_mask_cvttph_epu64(src, 0b01010101, a);
26833        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26834        assert_eq_m512i(r, e);
26835    }
26836
26837    #[simd_test(enable = "avx512fp16")]
26838    unsafe fn test_mm512_maskz_cvttph_epu64() {
26839        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26840        let r = _mm512_maskz_cvttph_epu64(0b01010101, a);
26841        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26842        assert_eq_m512i(r, e);
26843    }
26844
26845    #[simd_test(enable = "avx512fp16")]
26846    unsafe fn test_mm512_cvtt_roundph_epu64() {
26847        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26848        let r = _mm512_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(a);
26849        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26850        assert_eq_m512i(r, e);
26851    }
26852
26853    #[simd_test(enable = "avx512fp16")]
26854    unsafe fn test_mm512_mask_cvtt_roundph_epu64() {
26855        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26856        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26857        let r = _mm512_mask_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
26858        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26859        assert_eq_m512i(r, e);
26860    }
26861
26862    #[simd_test(enable = "avx512fp16")]
26863    unsafe fn test_mm512_maskz_cvtt_roundph_epu64() {
26864        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26865        let r = _mm512_maskz_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(0b01010101, a);
26866        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26867        assert_eq_m512i(r, e);
26868    }
26869
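    // Conversions from packed f16 elements to single-precision (f32) elements
    // (_mm*_cvtxph_ps plus the mask/maskz and _MM_FROUND_NO_EXC variants).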
26870    #[simd_test(enable = "avx512fp16,avx512vl")]
26871    unsafe fn test_mm_cvtxph_ps() {
26872        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26873        let r = _mm_cvtxph_ps(a);
26874        let e = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
26875        assert_eq_m128(r, e);
26876    }
26877
26878    #[simd_test(enable = "avx512fp16,avx512vl")]
26879    unsafe fn test_mm_mask_cvtxph_ps() {
26880        let src = _mm_set_ps(10.0, 11.0, 12.0, 13.0);
26881        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26882        let r = _mm_mask_cvtxph_ps(src, 0b0101, a);
26883        let e = _mm_set_ps(10.0, 2.0, 12.0, 4.0);
26884        assert_eq_m128(r, e);
26885    }
26886
26887    #[simd_test(enable = "avx512fp16,avx512vl")]
26888    unsafe fn test_mm_maskz_cvtxph_ps() {
26889        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26890        let r = _mm_maskz_cvtxph_ps(0b0101, a);
26891        let e = _mm_set_ps(0.0, 2.0, 0.0, 4.0);
26892        assert_eq_m128(r, e);
26893    }
26894
26895    #[simd_test(enable = "avx512fp16,avx512vl")]
26896    unsafe fn test_mm256_cvtxph_ps() {
26897        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26898        let r = _mm256_cvtxph_ps(a);
26899        let e = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26900        assert_eq_m256(r, e);
26901    }
26902
26903    #[simd_test(enable = "avx512fp16,avx512vl")]
26904    unsafe fn test_mm256_mask_cvtxph_ps() {
26905        let src = _mm256_set_ps(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
26906        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26907        let r = _mm256_mask_cvtxph_ps(src, 0b01010101, a);
26908        let e = _mm256_set_ps(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
26909        assert_eq_m256(r, e);
26910    }
26911
26912    #[simd_test(enable = "avx512fp16,avx512vl")]
26913    unsafe fn test_mm256_maskz_cvtxph_ps() {
26914        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26915        let r = _mm256_maskz_cvtxph_ps(0b01010101, a);
26916        let e = _mm256_set_ps(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
26917        assert_eq_m256(r, e);
26918    }
26919
26920    #[simd_test(enable = "avx512fp16")]
26921    unsafe fn test_mm512_cvtxph_ps() {
26922        let a = _mm256_set_ph(
26923            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26924        );
26925        let r = _mm512_cvtxph_ps(a);
26926        let e = _mm512_set_ps(
26927            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26928        );
26929        assert_eq_m512(r, e);
26930    }
26931
26932    #[simd_test(enable = "avx512fp16")]
26933    unsafe fn test_mm512_mask_cvtxph_ps() {
26934        let src = _mm512_set_ps(
26935            10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0,
26936            24.0, 25.0,
26937        );
26938        let a = _mm256_set_ph(
26939            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26940        );
26941        let r = _mm512_mask_cvtxph_ps(src, 0b0101010101010101, a);
26942        let e = _mm512_set_ps(
26943            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
26944            16.0,
26945        );
26946        assert_eq_m512(r, e);
26947    }
26948
26949    #[simd_test(enable = "avx512fp16")]
26950    unsafe fn test_mm512_maskz_cvtxph_ps() {
26951        let a = _mm256_set_ph(
26952            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26953        );
26954        let r = _mm512_maskz_cvtxph_ps(0b0101010101010101, a);
26955        let e = _mm512_set_ps(
26956            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
26957        );
26958        assert_eq_m512(r, e);
26959    }
26960
26961    #[simd_test(enable = "avx512fp16")]
26962    unsafe fn test_mm512_cvtx_roundph_ps() {
26963        let a = _mm256_set_ph(
26964            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26965        );
26966        let r = _mm512_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(a);
26967        let e = _mm512_set_ps(
26968            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26969        );
26970        assert_eq_m512(r, e);
26971    }
26972
26973    #[simd_test(enable = "avx512fp16")]
26974    unsafe fn test_mm512_mask_cvtx_roundph_ps() {
26975        let src = _mm512_set_ps(
26976            10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0,
26977            24.0, 25.0,
26978        );
26979        let a = _mm256_set_ph(
26980            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26981        );
26982        let r = _mm512_mask_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
26983        let e = _mm512_set_ps(
26984            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
26985            16.0,
26986        );
26987        assert_eq_m512(r, e);
26988    }
26989
26990    #[simd_test(enable = "avx512fp16")]
26991    unsafe fn test_mm512_maskz_cvtx_roundph_ps() {
26992        let a = _mm256_set_ph(
26993            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26994        );
26995        let r = _mm512_maskz_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
26996        let e = _mm512_set_ps(
26997            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
26998        );
26999        assert_eq_m512(r, e);
27000    }
27001
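    // Scalar conversion of the lowest f16 element of b to f32, with the remaining
    // lanes copied from a (_mm_cvtsh_ss and its mask/maskz and rounding variants).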
27002    #[simd_test(enable = "avx512fp16")]
27003    unsafe fn test_mm_cvtsh_ss() {
27004        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27005        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27006        let r = _mm_cvtsh_ss(a, b);
27007        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27008        assert_eq_m128(r, e);
27009    }
27010
27011    #[simd_test(enable = "avx512fp16")]
27012    unsafe fn test_mm_mask_cvtsh_ss() {
27013        let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0);
27014        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27015        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27016        let r = _mm_mask_cvtsh_ss(src, 0, a, b);
27017        let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0);
27018        assert_eq_m128(r, e);
27019        let r = _mm_mask_cvtsh_ss(src, 1, a, b);
27020        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27021        assert_eq_m128(r, e);
27022    }
27023
27024    #[simd_test(enable = "avx512fp16")]
27025    unsafe fn test_mm_maskz_cvtsh_ss() {
27026        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27027        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27028        let r = _mm_maskz_cvtsh_ss(0, a, b);
27029        let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0);
27030        assert_eq_m128(r, e);
27031        let r = _mm_maskz_cvtsh_ss(1, a, b);
27032        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27033        assert_eq_m128(r, e);
27034    }
27035
27036    #[simd_test(enable = "avx512fp16")]
27037    unsafe fn test_mm_cvt_roundsh_ss() {
27038        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27039        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27040        let r = _mm_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(a, b);
27041        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27042        assert_eq_m128(r, e);
27043    }
27044
27045    #[simd_test(enable = "avx512fp16")]
27046    unsafe fn test_mm_mask_cvt_roundsh_ss() {
27047        let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0);
27048        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27049        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27050        let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 0, a, b);
27051        let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0);
27052        assert_eq_m128(r, e);
27053        let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 1, a, b);
27054        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27055        assert_eq_m128(r, e);
27056    }
27057
27058    #[simd_test(enable = "avx512fp16")]
27059    unsafe fn test_mm_maskz_cvt_roundsh_ss() {
27060        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27061        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27062        let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(0, a, b);
27063        let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0);
27064        assert_eq_m128(r, e);
27065        let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(1, a, b);
27066        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27067        assert_eq_m128(r, e);
27068    }
27069
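    // Conversions from packed f16 elements to double-precision (f64) elements
    // (_mm*_cvtph_pd plus the mask/maskz and _MM_FROUND_NO_EXC variants).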
27070    #[simd_test(enable = "avx512fp16,avx512vl")]
27071    unsafe fn test_mm_cvtph_pd() {
27072        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
27073        let r = _mm_cvtph_pd(a);
27074        let e = _mm_set_pd(1.0, 2.0);
27075        assert_eq_m128d(r, e);
27076    }
27077
27078    #[simd_test(enable = "avx512fp16,avx512vl")]
27079    unsafe fn test_mm_mask_cvtph_pd() {
27080        let src = _mm_set_pd(10.0, 11.0);
27081        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
27082        let r = _mm_mask_cvtph_pd(src, 0b01, a);
27083        let e = _mm_set_pd(10.0, 2.0);
27084        assert_eq_m128d(r, e);
27085    }
27086
27087    #[simd_test(enable = "avx512fp16,avx512vl")]
27088    unsafe fn test_mm_maskz_cvtph_pd() {
27089        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
27090        let r = _mm_maskz_cvtph_pd(0b01, a);
27091        let e = _mm_set_pd(0.0, 2.0);
27092        assert_eq_m128d(r, e);
27093    }
27094
27095    #[simd_test(enable = "avx512fp16,avx512vl")]
27096    unsafe fn test_mm256_cvtph_pd() {
27097        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
27098        let r = _mm256_cvtph_pd(a);
27099        let e = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
27100        assert_eq_m256d(r, e);
27101    }
27102
27103    #[simd_test(enable = "avx512fp16,avx512vl")]
27104    unsafe fn test_mm256_mask_cvtph_pd() {
27105        let src = _mm256_set_pd(10.0, 11.0, 12.0, 13.0);
27106        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
27107        let r = _mm256_mask_cvtph_pd(src, 0b0101, a);
27108        let e = _mm256_set_pd(10.0, 2.0, 12.0, 4.0);
27109        assert_eq_m256d(r, e);
27110    }
27111
27112    #[simd_test(enable = "avx512fp16,avx512vl")]
27113    unsafe fn test_mm256_maskz_cvtph_pd() {
27114        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
27115        let r = _mm256_maskz_cvtph_pd(0b0101, a);
27116        let e = _mm256_set_pd(0.0, 2.0, 0.0, 4.0);
27117        assert_eq_m256d(r, e);
27118    }
27119
27120    #[simd_test(enable = "avx512fp16")]
27121    unsafe fn test_mm512_cvtph_pd() {
27122        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27123        let r = _mm512_cvtph_pd(a);
27124        let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27125        assert_eq_m512d(r, e);
27126    }
27127
27128    #[simd_test(enable = "avx512fp16")]
27129    unsafe fn test_mm512_mask_cvtph_pd() {
27130        let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
27131        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27132        let r = _mm512_mask_cvtph_pd(src, 0b01010101, a);
27133        let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
27134        assert_eq_m512d(r, e);
27135    }
27136
27137    #[simd_test(enable = "avx512fp16")]
27138    unsafe fn test_mm512_maskz_cvtph_pd() {
27139        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27140        let r = _mm512_maskz_cvtph_pd(0b01010101, a);
27141        let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
27142        assert_eq_m512d(r, e);
27143    }
27144
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvt_roundph_pd() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundph_pd() {
        let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
        let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvt_roundph_pd() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(0b01010101, a);
        let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
        assert_eq_m512d(r, e);
    }

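    // The scalar `cvtsh_sd` forms convert only the lowest f16 lane of `b` into the low
    // f64 lane of the result and copy the upper f64 lane from `a`; when the mask bit is
    // clear, the low lane comes from `src` (mask) or is zeroed (maskz).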
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsh_sd() {
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_cvtsh_sd(a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_cvtsh_sd() {
        let src = _mm_setr_pd(3.0, 11.0);
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_mask_cvtsh_sd(src, 0, a, b);
        let e = _mm_setr_pd(3.0, 20.0);
        assert_eq_m128d(r, e);
        let r = _mm_mask_cvtsh_sd(src, 1, a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_cvtsh_sd() {
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_maskz_cvtsh_sd(0, a, b);
        let e = _mm_setr_pd(0.0, 20.0);
        assert_eq_m128d(r, e);
        let r = _mm_maskz_cvtsh_sd(1, a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvt_roundsh_sd() {
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_cvt_roundsh_sd() {
        let src = _mm_setr_pd(3.0, 11.0);
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 0, a, b);
        let e = _mm_setr_pd(3.0, 20.0);
        assert_eq_m128d(r, e);
        let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 1, a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_cvt_roundsh_sd() {
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(0, a, b);
        let e = _mm_setr_pd(0.0, 20.0);
        assert_eq_m128d(r, e);
        let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(1, a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

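    // `_mm_cvtsh_h` and the wider variants return the lowest f16 lane of the vector as a
    // scalar.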
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsh_h() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvtsh_h(a);
        assert_eq!(r, 1.0);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm256_cvtsh_h() {
        let a = _mm256_setr_ph(
            1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_cvtsh_h(a);
        assert_eq!(r, 1.0);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtsh_h() {
        let a = _mm512_setr_ph(
            1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_cvtsh_h(a);
        assert_eq!(r, 1.0);
    }

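    // The si128/si16 helpers move a 16-bit integer between the low lane of a vector and
    // a scalar; `_mm_cvtsi16_si128` zeroes the upper lanes.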
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsi128_si16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm_cvtsi128_si16(a);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsi16_si128() {
        let a = 1;
        let r = _mm_cvtsi16_si128(a);
        let e = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }
}