// ppv_lite86/x86_64/sse2.rs

1use crate::soft::{x2, x4};
2use crate::types::*;
3use crate::vec128_storage;
4use crate::x86_64::Avx2Machine;
5use crate::x86_64::SseMachine as Machine86;
6use crate::x86_64::{NoS3, NoS4, YesS3, YesS4};
7use core::arch::x86_64::*;
8use core::marker::PhantomData;
9use core::ops::{
10    Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not,
11};
12
/// Implements a by-value binary-operator trait (e.g. `BitAnd`, `Add`) for a
/// SIMD wrapper type by delegating to the named SSE2 intrinsic.
macro_rules! impl_binop {
    ($ty:ident, $tr:ident, $method:ident, $intrinsic:ident) => {
        impl<S3, S4, NI> $tr for $ty<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn $method(self, rhs: Self) -> Self::Output {
                let raw = unsafe { $intrinsic(self.x, rhs.x) };
                Self::new(raw)
            }
        }
    };
}
24
/// Implements an `*Assign` operator trait in terms of the corresponding
/// by-value operator method: `a op= b` becomes `a = a op b`.
macro_rules! impl_binop_assign {
    ($ty:ident, $tr:ident, $assign_method:ident, $method:ident) => {
        impl<S3, S4, NI> $tr for $ty<S3, S4, NI>
        where
            $ty<S3, S4, NI>: Copy,
        {
            #[inline(always)]
            fn $assign_method(&mut self, rhs: Self) {
                *self = self.$method(rhs);
            }
        }
    };
}
38
/// Generates an SSE2-backed vector type `$vec` (element type `$word`)
/// wrapping a single `__m128i`, together with its storage conversions,
/// byte-level I/O, `Default`, `Not`, bitwise operators and `AndNot`.
/// `S3`/`S4`/`NI` are zero-sized markers selecting the SSSE3 / SSE4.1 /
/// AES-NI code paths in the trait impls elsewhere in this file.
macro_rules! def_vec {
    ($vec:ident, $word:ident) => {
        #[allow(non_camel_case_types)]
        #[derive(Copy, Clone)]
        pub struct $vec<S3, S4, NI> {
            // The raw 128-bit register value.
            x: __m128i,
            // Feature-level markers; zero-sized, no runtime representation.
            s3: PhantomData<S3>,
            s4: PhantomData<S4>,
            ni: PhantomData<NI>,
        }

        impl<S3, S4, NI> Store<vec128_storage> for $vec<S3, S4, NI> {
            #[inline(always)]
            unsafe fn unpack(x: vec128_storage) -> Self {
                // Read the SSE2 view of the storage union.
                Self::new(x.sse2)
            }
        }
        impl<S3, S4, NI> From<$vec<S3, S4, NI>> for vec128_storage {
            #[inline(always)]
            fn from(x: $vec<S3, S4, NI>) -> Self {
                vec128_storage { sse2: x.x }
            }
        }
        impl<S3, S4, NI> $vec<S3, S4, NI> {
            // Internal constructor wrapping a raw register value.
            #[inline(always)]
            fn new(x: __m128i) -> Self {
                $vec {
                    x,
                    s3: PhantomData,
                    s4: PhantomData,
                    ni: PhantomData,
                }
            }
        }

        impl<S3, S4, NI> StoreBytes for $vec<S3, S4, NI>
        where
            Self: BSwap,
        {
            #[inline(always)]
            unsafe fn unsafe_read_le(input: &[u8]) -> Self {
                // Exactly one 128-bit vector's worth of bytes is required.
                assert_eq!(input.len(), 16);
                // Unaligned load; x86 lanes are little-endian.
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _))
            }
            #[inline(always)]
            unsafe fn unsafe_read_be(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                // Load as LE, then byte-swap each word to interpret as BE.
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _)).bswap()
            }
            #[inline(always)]
            fn write_le(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                // Unaligned little-endian store.
                unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, self.x) }
            }
            #[inline(always)]
            fn write_be(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                // Byte-swap each word first so the stored bytes come out BE.
                let x = self.bswap().x;
                unsafe {
                    _mm_storeu_si128(out.as_mut_ptr() as *mut _, x);
                }
            }
        }

        impl<S3, S4, NI> Default for $vec<S3, S4, NI> {
            #[inline(always)]
            fn default() -> Self {
                // All lanes zero.
                Self::new(unsafe { _mm_setzero_si128() })
            }
        }

        impl<S3, S4, NI> Not for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn not(self) -> Self::Output {
                // !x == x ^ all-ones.
                unsafe {
                    let ff = _mm_set1_epi64x(-1i64);
                    self ^ Self::new(ff)
                }
            }
        }

        impl<S3: Copy, S4: Copy, NI: Copy> BitOps0 for $vec<S3, S4, NI> {}
        impl_binop!($vec, BitAnd, bitand, _mm_and_si128);
        impl_binop!($vec, BitOr, bitor, _mm_or_si128);
        impl_binop!($vec, BitXor, bitxor, _mm_xor_si128);
        impl_binop_assign!($vec, BitAndAssign, bitand_assign, bitand);
        impl_binop_assign!($vec, BitOrAssign, bitor_assign, bitor);
        impl_binop_assign!($vec, BitXorAssign, bitxor_assign, bitxor);
        impl<S3: Copy, S4: Copy, NI: Copy> AndNot for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn andnot(self, rhs: Self) -> Self {
                // Note operand order: computes (!self) & rhs.
                Self::new(unsafe { _mm_andnot_si128(self.x, rhs.x) })
            }
        }
    };
}
137
/// Marker impl: `$vec` provides the 32-bit-word bit-operation bundle,
/// provided it can rotate its 32-bit words.
macro_rules! impl_bitops32 {
    ($vec:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps32 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord32
        {
        }
    };
}
146
/// Marker impl: `$vec` provides the 64-bit bundle (which also pulls in the
/// 32-bit bundle via `impl_bitops32!`).
macro_rules! impl_bitops64 {
    ($vec:ident) => {
        impl_bitops32!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps64 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord64 + RotateEachWord32
        {
        }
    };
}
156
/// Marker impl: `$vec` provides the 128-bit bundle (which also pulls in the
/// 64- and 32-bit bundles via `impl_bitops64!`).
macro_rules! impl_bitops128 {
    ($vec:ident) => {
        impl_bitops64!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps128 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord128
        {
        }
    };
}
166
/// Rotate each 32-bit lane right by a multiple of 8 bits using a single
/// SSSE3 byte shuffle (`_mm_shuffle_epi8`); `$k0`/`$k1` are the high/low
/// halves of the shuffle-control mask.
macro_rules! rotr_32_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
/// Rotate each 32-bit lane right by `$i` bits (0 < $i < 32) using the
/// generic SSE2 shift/or form: (x >> $i) | (x << (32 - $i)).
macro_rules! rotr_32 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi32(self.x, $i as i32),
                    _mm_slli_epi32(self.x, 32 - $i as i32),
                )
            })
        }
    };
}
// 32-bit lane rotations with SSSE3 available: rotations by whole bytes
// (8/16/24) use a single pshufb; the rest use the shift/or form.
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32_s3!(
        rotate_each_word_right8,
        0x0c0f_0e0d_080b_0a09,
        0x0407_0605_0003_0201
    );
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    rotr_32_s3!(
        rotate_each_word_right16,
        0x0d0c_0f0e_0908_0b0a,
        0x0504_0706_0100_0302
    );
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32_s3!(
        rotate_each_word_right24,
        0x0e0d_0c0f_0a09_080b,
        0x0605_0407_0201_0003
    );
    rotr_32!(rotate_each_word_right25, 25);
}
// 32-bit lane rotations with SSE2 only: rotate-by-16 is a swap of the two
// 16-bit halves of each lane, done with pshuflw/pshufhw in swap16_s2; all
// other amounts use the shift/or form.
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32!(rotate_each_word_right8, 8);
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        Self::new(swap16_s2(self.x))
    }
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32!(rotate_each_word_right24, 24);
    rotr_32!(rotate_each_word_right25, 25);
}
223
/// Rotate each 64-bit lane right by a multiple of 8 bits using a single
/// SSSE3 byte shuffle; `$k0`/`$k1` are the high/low halves of the
/// shuffle-control mask.
macro_rules! rotr_64_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
/// Rotate each 64-bit lane right by `$i` bits (0 < $i < 64) using the
/// generic SSE2 shift/or form: (x >> $i) | (x << (64 - $i)).
macro_rules! rotr_64 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi64(self.x, $i as i32),
                    _mm_slli_epi64(self.x, 64 - $i as i32),
                )
            })
        }
    };
}
// 64-bit lane rotations with SSSE3: byte-granular amounts (8/16/24) are a
// single pshufb over the whole 64-bit lane; the rest use shift/or.
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64_s3!(
        rotate_each_word_right8,
        0x080f_0e0d_0c0b_0a09,
        0x0007_0605_0403_0201
    );
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    rotr_64_s3!(
        rotate_each_word_right16,
        0x0908_0f0e_0d0c_0b0a,
        0x0100_0706_0504_0302
    );
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64_s3!(
        rotate_each_word_right24,
        0x0a09_080f_0e0d_0c0b,
        0x0201_0007_0605_0403
    );
    rotr_64!(rotate_each_word_right25, 25);
}
267impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<NoS3, S4, NI> {
268    rotr_64!(rotate_each_word_right7, 7);
269    rotr_64!(rotate_each_word_right8, 8);
270    rotr_64!(rotate_each_word_right11, 11);
271    rotr_64!(rotate_each_word_right12, 12);
272    #[inline(always)]
273    fn rotate_each_word_right16(self) -> Self {
274        Self::new(swap16_s2(self.x))
275    }
276    rotr_64!(rotate_each_word_right20, 20);
277    rotr_64!(rotate_each_word_right24, 24);
278    rotr_64!(rotate_each_word_right25, 25);
279}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn rotate_each_word_right32(self) -> Self {
        // pshufd with selector [1,0,3,2] swaps the two 32-bit halves of
        // each 64-bit lane, which is exactly a 64-bit rotate by 32.
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b10110001) })
    }
}
286
/// Rotate the whole 128-bit value right by `$i` bits, for 0 < $i < 64
/// (all call sites below use 7..=32).
///
/// BUGFIX: the previous version used `_mm_srli_si128`/`_mm_slli_si128`,
/// which shift by BYTES, not bits — so it produced `x >> (8 * $i)` and the
/// left half was always zero (shift counts > 15 bytes clear the register).
/// There is no 128-bit bit-shift instruction in SSE2, so compose one from
/// 64-bit lane shifts plus an 8-byte cross-lane move. Writing x = (lo, hi):
///   x >>  $i        = (lo>>$i | hi<<(64-$i),  hi>>$i)
///   x << (128-$i)   = (0,                     lo<<(64-$i))
/// and the rotate is the OR of the two.
macro_rules! rotr_128 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                // Per-lane logical right shift: (lo>>$i, hi>>$i).
                let lanes_shr = _mm_srli_epi64(self.x, $i as i32);
                // Bits of hi that cross down into the low lane.
                let carry = _mm_slli_epi64(_mm_srli_si128(self.x, 8), 64 - $i as i32);
                let shr = _mm_or_si128(lanes_shr, carry);
                // x << (128 - $i): low-lane bits wrap to the top.
                let shl = _mm_slli_si128(_mm_slli_epi64(self.x, 64 - $i as i32), 8);
                _mm_or_si128(shr, shl)
            })
        }
    };
}
// Rotations of the full 128-bit value by amounts < 32.
// TODO: completely unoptimized
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord32 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right7, 7);
    rotr_128!(rotate_each_word_right8, 8);
    rotr_128!(rotate_each_word_right11, 11);
    rotr_128!(rotate_each_word_right12, 12);
    rotr_128!(rotate_each_word_right16, 16);
    rotr_128!(rotate_each_word_right20, 20);
    rotr_128!(rotate_each_word_right24, 24);
    rotr_128!(rotate_each_word_right25, 25);
}
// TODO: completely unoptimized
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right32, 32);
}
// Marker: no methods beyond the 32/64-bit rotation sets above.
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord128 for u128x1_sse2<S3, S4, NI> {}
316
// Instantiate the three SSE2 vector wrappers: 4 x u32, 2 x u64, 1 x u128.
def_vec!(u32x4_sse2, u32);
def_vec!(u64x2_sse2, u64);
def_vec!(u128x1_sse2, u128);
320
// Lane array conversion with SSE4.1: the 64-bit extract/insert
// instructions move two u32 lanes at a time.
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            // x = lanes 0..1 packed in a u64, y = lanes 2..3.
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_extract_epi64(self.x, 1) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            // Pack lane pairs into u64s, then write the low and high halves.
            let mut x = _mm_cvtsi64_si128((xs[0] as u64 | ((xs[1] as u64) << 32)) as i64);
            x = _mm_insert_epi64(x, (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64, 1);
            Self::new(x)
        }
    }
}
// Lane array conversion with SSE2 only: the high half is reached by
// shuffling it down instead of using SSE4.1 extract/insert.
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            // Selector [2,3,2,3] moves lanes 2..3 into the low 64 bits.
            let y = _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            // Pack lane pairs into u64s, place the second pair in the high
            // half via an 8-byte shift, and OR the halves together.
            let x = (xs[0] as u64 | ((xs[1] as u64) << 32)) as i64;
            let y = (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64;
            let x = _mm_cvtsi64_si128(x);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(y), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
// u64 lane conversion with SSE4.1 extract/insert for the high lane.
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_extract_epi64(self.x, 1) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128(xs[0] as i64);
            x = _mm_insert_epi64(x, xs[1] as i64, 1);
            Self::new(x)
        }
    }
}
// u64 lane conversion with SSE2 only: reach the high lane via 8-byte
// register shifts instead of SSE4.1 extract/insert.
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                // Shift the high lane down by 8 bytes before reading it.
                _mm_cvtsi128_si64(_mm_srli_si128(self.x, 8)) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let x = _mm_cvtsi64_si128(xs[0] as i64);
            // Place the second lane in the high half, then combine.
            let y = _mm_slli_si128(_mm_cvtsi64_si128(xs[1] as i64), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
// Scalar lane access for the 128-bit type is not implemented; both
// directions panic if called.
impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u128; 1] {
        unimplemented!()
    }
    #[inline(always)]
    fn from_lanes(xs: [u128; 1]) -> Self {
        unimplemented!("{:?}", xs)
    }
}
407
408impl<S3, S4, NI> MultiLane<[u64; 4]> for u64x4_sse2<S3, S4, NI>
409where
410    u64x2_sse2<S3, S4, NI>: MultiLane<[u64; 2]> + Copy,
411{
412    #[inline(always)]
413    fn to_lanes(self) -> [u64; 4] {
414        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
415        [a[0], a[1], b[0], b[1]]
416    }
417    #[inline(always)]
418    fn from_lanes(xs: [u64; 4]) -> Self {
419        let (a, b) = (
420            u64x2_sse2::from_lanes([xs[0], xs[1]]),
421            u64x2_sse2::from_lanes([xs[2], xs[3]]),
422        );
423        x2::new([a, b])
424    }
425}
426
/// Generates a bit-reinterpreting `From` conversion between two vector
/// wrappers that share the same underlying `__m128i` representation.
macro_rules! impl_into {
    ($src:ident, $dst:ident) => {
        impl<S3, S4, NI> From<$src<S3, S4, NI>> for $dst<S3, S4, NI> {
            #[inline(always)]
            fn from(value: $src<S3, S4, NI>) -> Self {
                $dst::new(value.x)
            }
        }
    };
}
437
// Bit-reinterpreting conversions out of the 128-bit type, plus the
// bit-operation marker bundles for all three widths.
impl_into!(u128x1_sse2, u32x4_sse2);
impl_into!(u128x1_sse2, u64x2_sse2);

impl_bitops32!(u32x4_sse2);
impl_bitops64!(u64x2_sse2);
impl_bitops128!(u128x1_sse2);
444
// Arithmetic markers and lane-wise wrapping addition
// (`_mm_add_epi32` / `_mm_add_epi64`).
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u32x4_sse2<S3, S4, NI> where
    u32x4_sse2<S3, S4, NI>: BSwap
{
}
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u64x2_sse2<S3, S4, NI> where
    u64x2_sse2<S3, S4, NI>: BSwap
{
}
impl_binop!(u32x4_sse2, Add, add, _mm_add_epi32);
impl_binop!(u64x2_sse2, Add, add, _mm_add_epi64);
impl_binop_assign!(u32x4_sse2, AddAssign, add_assign, add);
impl_binop_assign!(u64x2_sse2, AddAssign, add_assign, add);
457
// Marker impls wiring the single-register types into the generic SSE
// `Machine` API; the bounds just assert the capabilities each trait needs.
impl<S3: Copy, S4: Copy, NI: Copy> u32x4<Machine86<S3, S4, NI>> for u32x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2<Machine86<S3, S4, NI>> for u64x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x1<Machine86<S3, S4, NI>> for u128x1_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
479
// The AVX2 machine reuses the SSE register types (with SSSE3 and SSE4.1
// fixed to "yes") for its single-register lanes.
impl<NI: Copy> u32x4<Avx2Machine<NI>> for u32x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u64x2<Avx2Machine<NI>> for u64x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u128x1<Avx2Machine<NI>> for u128x1_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<YesS3, YesS4, NI>: Machine,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u32x4>,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u64x2>,
{
}
501
502impl<S3, S4, NI> UnsafeFrom<[u32; 4]> for u32x4_sse2<S3, S4, NI> {
503    #[inline(always)]
504    unsafe fn unsafe_from(xs: [u32; 4]) -> Self {
505        Self::new(_mm_set_epi32(
506            xs[3] as i32,
507            xs[2] as i32,
508            xs[1] as i32,
509            xs[0] as i32,
510        ))
511    }
512}
513
// Indexed lane access with SSE4.1: insertion is a single pinsrd; the
// match is needed because the lane index must be a constant.
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, YesS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi32(self.x, v as i32, 0),
                1 => _mm_insert_epi32(self.x, v as i32, 1),
                2 => _mm_insert_epi32(self.x, v as i32, 2),
                3 => _mm_insert_epi32(self.x, v as i32, 3),
                _ => unreachable!(),
            }
        })
    }
}
// Indexed lane access with SSE2 only: without pinsrd, insertion rotates
// the target lane into position 0 (via pshufd), shifts it out, ORs in the
// new value, and rotates the lanes back.
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, NoS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => {
                    // Clear lane 0 (andnot with a low-32-bit mask), then OR
                    // in the new value.
                    let x = _mm_andnot_si128(_mm_cvtsi32_si128(-1), self.x);
                    _mm_or_si128(x, _mm_cvtsi32_si128(v as i32))
                }
                1 => {
                    // [x0,x2,x3,_] -> shift -> [0,x0,x2,x3] -> OR v ->
                    // [v,x0,x2,x3] -> reorder -> [x0,v,x2,x3].
                    let mut x = _mm_shuffle_epi32(self.x, 0b0111_1000);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1110_0001)
                }
                2 => {
                    // Same trick with lane 2 rotated out instead.
                    let mut x = _mm_shuffle_epi32(self.x, 0b1011_0100);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1100_1001)
                }
                3 => {
                    // Lane 3 falls off directly when shifting left 4 bytes.
                    let mut x = _mm_slli_si128(self.x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b0011_1001)
                }
                _ => unreachable!(),
            }
        })
    }
}
573
// Word shuffles within each 128-bit lane: for a single-register vector
// these coincide with the plain `Words4` shuffles below.
impl<S3, S4, NI> LaneWords4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle_lane_words2301(self) -> Self {
        self.shuffle2301()
    }
    #[inline(always)]
    fn shuffle_lane_words1230(self) -> Self {
        self.shuffle1230()
    }
    #[inline(always)]
    fn shuffle_lane_words3012(self) -> Self {
        self.shuffle3012()
    }
}
588
// 32-bit word permutations via pshufd; the selector fields (read low lane
// first) are noted per shuffle.
impl<S3, S4, NI> Words4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        // Result lanes = source lanes [2,3,0,1].
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        // Result lanes = source lanes [3,0,1,2].
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1001_0011) })
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        // Result lanes = source lanes [1,2,3,0].
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0011_1001) })
    }
}
603
// 64-bit word permutations across the register pair, using SSSE3
// palignr to stitch adjacent halves. Lanes are (x0,x1) in self.0[0] and
// (x2,x3) in self.0[1].
impl<S4, NI> Words4 for u64x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        // Swap the two registers: [x2,x3,x0,x1].
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        // palignr(a, b, 8) yields (b_hi, a_lo); result is [x1,x2,x3,x0].
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
            ])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        // Mirror of shuffle3012; result is [x3,x0,x1,x2].
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
            ])
        }
    }
}
// Same permutations with SSE2 only: emulate palignr with 8-byte register
// shifts and ORs. Lanes are (x0,x1) in self.0[0] and (x2,x3) in self.0[1].
impl<S4, NI> Words4 for u64x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        // Swap the two registers: [x2,x3,x0,x1].
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        unsafe {
            // a=[x1,0], b=[0,x0], c=[x3,0], d=[0,x2]
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            // da=[x1,x2], bc=[x3,x0] -> result [x1,x2,x3,x0].
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(da), u64x2_sse2::new(bc)])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        unsafe {
            // Same halves, opposite order -> result [x3,x0,x1,x2].
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(bc), u64x2_sse2::new(da)])
        }
    }
}
658
659impl<S3, S4, NI> UnsafeFrom<[u64; 2]> for u64x2_sse2<S3, S4, NI> {
660    #[inline(always)]
661    unsafe fn unsafe_from(xs: [u64; 2]) -> Self {
662        Self::new(_mm_set_epi64x(xs[1] as i64, xs[0] as i64))
663    }
664}
665
// Indexed u64 lane access with SSE4.1 (pextrq/pinsrq); the match is
// needed because the lane index must be a constant.
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_extract_epi64(self.x, 1) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi64(self.x, x as i64, 0),
                1 => _mm_insert_epi64(self.x, x as i64, 1),
                _ => unreachable!(),
            }
        })
    }
}
// Indexed u64 lane access with SSE2 only: extract reaches the high lane
// via pshufd; insert clears the target lane with a mask and ORs in the
// new value.
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                // Selector [2,3,2,3] moves the high lane into the low half.
                1 => _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                // Clear the low lane (andnot with a low-64-bit mask), OR in x.
                0 => _mm_or_si128(
                    _mm_andnot_si128(_mm_cvtsi64_si128(-1), self.x),
                    _mm_cvtsi64_si128(x as i64),
                ),
                // movq keeps only the low lane; OR x into the (zeroed) high.
                1 => _mm_or_si128(
                    _mm_move_epi64(self.x),
                    _mm_slli_si128(_mm_cvtsi64_si128(x as i64), 8),
                ),
                _ => unreachable!(),
            }
        })
    }
}
716
// Byte-swap each 32-bit lane with a single pshufb (SSSE3).
impl<S4, NI> BSwap for u32x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            // Mask reverses the four bytes within each 32-bit lane.
            let k = _mm_set_epi64x(0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
/// Byte-swap each 32-bit lane using SSE2 only: widen bytes to 16-bit
/// words (interleave with zero), reverse each group of four words with
/// pshuflw/pshufhw, then pack back down to bytes. The packus saturation
/// is a no-op because every word holds a zero-extended byte (0..=255).
#[inline(always)]
fn bswap32_s2(x: __m128i) -> __m128i {
    unsafe {
        // Low 8 bytes, widened and reversed per 4-word group.
        let mut y = _mm_unpacklo_epi8(x, _mm_setzero_si128());
        y = _mm_shufflehi_epi16(y, 0b0001_1011);
        y = _mm_shufflelo_epi16(y, 0b0001_1011);
        // High 8 bytes, same treatment.
        let mut z = _mm_unpackhi_epi8(x, _mm_setzero_si128());
        z = _mm_shufflehi_epi16(z, 0b0001_1011);
        z = _mm_shufflelo_epi16(z, 0b0001_1011);
        _mm_packus_epi16(y, z)
    }
}
// SSE2-only 32-bit byte-swap: delegate to the shared helper above.
impl<S4, NI> BSwap for u32x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(bswap32_s2(self.x))
    }
}
744
// Byte-swap each 64-bit lane.
impl<S4, NI> BSwap for u64x2_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            // pshufb mask reversing the eight bytes of each 64-bit lane.
            let k = _mm_set_epi64x(0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u64x2_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        // Swap the 32-bit halves of each lane (pshufd [1,0,3,2]), then
        // byte-swap each 32-bit word: together a full 64-bit bswap.
        Self::new(unsafe { bswap32_s2(_mm_shuffle_epi32(self.x, 0b1011_0001)) })
    }
}
760
// Byte-swap the full 128-bit value.
impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            // NOTE(review): this mask is the identity byte order (0..=15),
            // not a 16-byte reversal — pshufb here returns self unchanged.
            // Verify intended semantics against the BSwap trait users.
            let k = _mm_set_epi64x(0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
// The SSE2-only path is not implemented; panics if called.
impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        unimplemented!()
    }
}
776
/// Swap adjacent `$i`-bit groups throughout the register:
/// ((x & $k) >> $i) | ((x << $i) & $k), where `$k` selects the high half
/// of every 2*$i-bit pair. The 16-bit shifts are safe because $i <= 4,
/// so masked bits never cross a byte boundary.
macro_rules! swapi {
    ($x:expr, $i:expr, $k:expr) => {
        unsafe {
            const K: u8 = $k;
            let k = _mm_set1_epi8(K as i8);
            u128x1_sse2::new(_mm_or_si128(
                _mm_srli_epi16(_mm_and_si128($x.x, k), $i),
                _mm_and_si128(_mm_slli_epi16($x.x, $i), k),
            ))
        }
    };
}
/// Swap adjacent 16-bit words in each 32-bit lane (pshuflw+pshufhw with
/// selector [1,0,3,2]); equivalently, rotate each 32-bit lane by 16 bits.
#[inline(always)]
fn swap16_s2(x: __m128i) -> __m128i {
    unsafe { _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0b1011_0001), 0b1011_0001) }
}
// Bit/byte/word swaps of adjacent 2^n-bit groups, SSSE3 version: the
// byte-and-larger granularities use a single pshufb or pshufd.
impl<S4, NI> Swap64 for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        // Swap adjacent bits (mask 0xaa = odd bit positions).
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        // Swap adjacent bit-pairs (mask 0xcc).
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        // Swap nibbles within each byte (mask 0xf0).
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        u128x1_sse2::new(unsafe {
            // pshufb mask swapping adjacent bytes.
            let k = _mm_set_epi64x(0x0e0f_0c0d_0a0b_0809, 0x0607_0405_0203_0001);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        u128x1_sse2::new(unsafe {
            // pshufb mask swapping adjacent 16-bit words.
            let k = _mm_set_epi64x(0x0d0c_0f0e_0908_0b0a, 0x0504_0706_0100_0302);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        // pshufd [1,0,3,2]: swap adjacent 32-bit words.
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        // pshufd [2,3,0,1]: swap the 64-bit halves.
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
// Same swaps with SSE2 only: byte swap uses 16-bit shifts, word swap uses
// pshuflw/pshufhw; sub-byte granularities are identical to the SSSE3 impl.
impl<S4, NI> Swap64 for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        // Rotate each 16-bit word by 8 = swap its two bytes.
        u128x1_sse2::new(unsafe {
            _mm_or_si128(_mm_slli_epi16(self.x, 8), _mm_srli_epi16(self.x, 8))
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        u128x1_sse2::new(swap16_s2(self.x))
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        // pshufd [1,0,3,2]: swap adjacent 32-bit words.
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        // pshufd [2,3,0,1]: swap the 64-bit halves.
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
861
// Zero-sized tag types distinguishing otherwise-identical x2 wrappers
// (see the type aliases below, where G1 marks u64x4 vs. G0's u64x2x2).
#[derive(Copy, Clone)]
pub struct G0;
#[derive(Copy, Clone)]
pub struct G1;
866
// Wide vectors built from pairs (x2) and quadruples (x4) of SSE registers.
// u64x4 shares the x2 layout with u64x2x2 but is tagged G1 so the two get
// distinct trait impls.
#[allow(non_camel_case_types)]
pub type u32x4x2_sse2<S3, S4, NI> = x2<u32x4_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x2x2_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x4_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G1>;
#[allow(non_camel_case_types)]
pub type u128x2_sse2<S3, S4, NI> = x2<u128x1_sse2<S3, S4, NI>, G0>;

#[allow(non_camel_case_types)]
pub type u32x4x4_sse2<S3, S4, NI> = x4<u32x4_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>;
882
impl<S3, S4, NI> Vector<[u32; 16]> for u32x4x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn to_scalars(self) -> [u32; 16] {
        // Reinterpret the four __m128i registers as 16 u32s; lane order is
        // the x86 in-register (little-endian) order.
        unsafe { core::mem::transmute(self) }
    }
}
889
// Marker impls wiring the multi-register types into the SSE `Machine`
// API; bounds assert the capabilities each wide trait requires.
impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u32x4>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x2<Machine86<S3, S4, NI>> for u64x2x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x4<Machine86<S3, S4, NI>> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x4_sse2<S3, S4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x2<Machine86<S3, S4, NI>> for u128x2_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x4>,
{
}
924
// The same multi-register wiring for the AVX2 machine, which reuses these
// SSE types with SSSE3/SSE4.1 fixed to "yes".
impl<NI: Copy> u32x4x2<Avx2Machine<NI>> for u32x4x2_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u32x4>,
{
}
impl<NI: Copy> u64x2x2<Avx2Machine<NI>> for u64x2x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u64x4<Avx2Machine<NI>> for u64x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x4_sse2<YesS3, YesS4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<NI: Copy> u128x2<Avx2Machine<NI>> for u128x2_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x4>,
{
}
959
960impl<S3, S4, NI> Vec4<u64> for u64x4_sse2<S3, S4, NI>
961where
962    u64x2_sse2<S3, S4, NI>: Copy + Vec2<u64>,
963{
964    #[inline(always)]
965    fn extract(self, i: u32) -> u64 {
966        match i {
967            0 => self.0[0].extract(0),
968            1 => self.0[0].extract(1),
969            2 => self.0[1].extract(0),
970            3 => self.0[1].extract(1),
971            _ => panic!(),
972        }
973    }
974    #[inline(always)]
975    fn insert(mut self, w: u64, i: u32) -> Self {
976        match i {
977            0 => self.0[0] = self.0[0].insert(w, 0),
978            1 => self.0[0] = self.0[0].insert(w, 1),
979            2 => self.0[1] = self.0[1].insert(w, 0),
980            3 => self.0[1] = self.0[1].insert(w, 1),
981            _ => panic!(),
982        };
983        self
984    }
985}
986
// Marker impls for the four-wide compound vectors on the generic SSE2
// machine. As with the two-wide versions, the traits are empty; the bounds
// enumerate the operations each compound type must support.
impl<S3: Copy, S4: Copy, NI: Copy> u32x4x4<Machine86<S3, S4, NI>> for u32x4x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u32x4x4_sse2<S3, S4, NI>: Vec4Ext<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u32x4x4_sse2<S3, S4, NI>: Vector<[u32; 16]>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x4<Machine86<S3, S4, NI>> for u128x4_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x4>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x4>,
{
}
1015
// Four-wide marker impls for the AVX2 machine. Note there is no
// `u32x4x4` impl here: on AVX2 that type is the native `u32x4x4_avx2`
// defined in the `avx2` module below, not the SSE2 composition.
impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u128x4<Avx2Machine<NI>> for u128x4_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x4>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x4>,
{
}
1034
// Generates lane-wise `From` conversions between the soft (x2/x4) wrappers
// of two 128-bit vector types. Each lane is converted individually through
// the underlying `$to::from($from)` impl, which is a bitwise
// reinterpretation for these storage-compatible vectors.
macro_rules! impl_into_x {
    ($from:ident, $to:ident) => {
        // x2 -> x2: convert both lanes. The group tags may differ between
        // source and destination (Gf -> Gt).
        impl<S3: Copy, S4: Copy, NI: Copy, Gf, Gt> From<x2<$from<S3, S4, NI>, Gf>>
            for x2<$to<S3, S4, NI>, Gt>
        {
            #[inline(always)]
            fn from(x: x2<$from<S3, S4, NI>, Gf>) -> Self {
                x2::new([$to::from(x.0[0]), $to::from(x.0[1])])
            }
        }
        // x4 -> x4: convert all four lanes.
        impl<S3: Copy, S4: Copy, NI: Copy> From<x4<$from<S3, S4, NI>>> for x4<$to<S3, S4, NI>> {
            #[inline(always)]
            fn from(x: x4<$from<S3, S4, NI>>) -> Self {
                x4::new([
                    $to::from(x.0[0]),
                    $to::from(x.0[1]),
                    $to::from(x.0[2]),
                    $to::from(x.0[3]),
                ])
            }
        }
    };
}
impl_into_x!(u128x1_sse2, u64x2_sse2);
impl_into_x!(u128x1_sse2, u32x4_sse2);
1060
1061///// Debugging
1062
1063use core::fmt::{Debug, Formatter, Result};
1064
1065impl<W: PartialEq, G> PartialEq for x2<W, G> {
1066    #[inline(always)]
1067    fn eq(&self, rhs: &Self) -> bool {
1068        self.0[0] == rhs.0[0] && self.0[1] == rhs.0[1]
1069    }
1070}
1071
/// Full 128-bit equality check using SSE4.1's 64-bit lane compare.
///
/// `_mm_cmpeq_epi64` yields an all-ones/all-zeros mask per 64-bit lane.
/// The shuffle (selector 0b1100_0110 = lanes 2,1,0,3) moves one 32-bit
/// word from each 64-bit compare result into the low 64 bits, so a single
/// 64-bit extract equals -1 iff both lanes compared equal.
#[allow(unused)]
#[inline(always)]
unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool {
    let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110);
    _mm_cvtsi128_si64(q) == -1
}
1078
/// Full 128-bit equality check using only SSE2 instructions.
///
/// `_mm_cmpeq_epi32` yields an all-ones/all-zeros mask per 32-bit lane;
/// ANDing the low and high 64-bit halves of that mask is -1 exactly when
/// all four lanes compared equal.
#[inline(always)]
unsafe fn eq128_s2(x: __m128i, y: __m128i) -> bool {
    let q = _mm_cmpeq_epi32(x, y);
    let p = _mm_cvtsi128_si64(_mm_srli_si128(q, 8));
    let q = _mm_cvtsi128_si64(q);
    (p & q) == -1
}
1086
impl<S3, S4, NI> PartialEq for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        // The SSE2-only comparison is valid for every (S3, S4) configuration.
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
/// Hex-formats the four 32-bit lanes (mainly for test diagnostics).
impl<S3, S4, NI> Debug for u32x4_sse2<S3, S4, NI>
where
    Self: Copy + MultiLane<[u32; 4]>,
{
    #[cold]
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        fmt.write_fmt(format_args!("{:08x?}", &self.to_lanes()))
    }
}
1102
impl<S3, S4, NI> PartialEq for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        // A 32-bit lane-wise compare also decides 64-bit lane equality,
        // since it covers every bit of the register.
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
/// Hex-formats the two 64-bit lanes (mainly for test diagnostics).
impl<S3, S4, NI> Debug for u64x2_sse2<S3, S4, NI>
where
    Self: Copy + MultiLane<[u64; 2]>,
{
    #[cold]
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        fmt.write_fmt(format_args!("{:016x?}", &self.to_lanes()))
    }
}
1118
1119impl<S3, S4, NI> Debug for u64x4_sse2<S3, S4, NI>
1120where
1121    u64x2_sse2<S3, S4, NI>: Copy + MultiLane<[u64; 2]>,
1122{
1123    #[cold]
1124    fn fmt(&self, fmt: &mut Formatter) -> Result {
1125        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
1126        fmt.write_fmt(format_args!("{:016x?}", &[a[0], a[1], b[0], b[1]]))
1127    }
1128}
1129
#[cfg(test)]
#[cfg(target_arch = "x86_64")]
mod test {
    use super::*;
    use crate::x86_64::{SSE2, SSE41, SSSE3};
    use crate::Machine;

    // NOTE(review): several assertions below previously read
    // `assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) })`, which
    // compares a value against a transmute of *itself* (the target type is
    // inferred from the left operand, so it was the identity) — a tautology
    // that never actually cross-checked the SSSE3 path against SSE2. They
    // now compare `x_s2` against `transmute(x_s3)`, matching the correct
    // pattern already used in `test_bswap32_s2_vs_s3`.

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_bswap32_s2_vs_s3() {
        let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100];
        let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.bswap()
        };

        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.bswap()
        };

        // Both implementations must agree with each other and with the
        // precomputed byte-swapped words.
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
        assert_eq!(x_s2, s2.vec(ys));
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_bswap64_s2_vs_s3() {
        let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100];
        let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
            x_s2.bswap()
        };

        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
            x_s3.bswap()
        };

        assert_eq!(x_s2, s2.vec(ys));
        // BUGFIX: was `assert_eq!(x_s3, transmute(x_s3))` — a self-compare.
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_shuffle32_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.shuffle2301()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.shuffle2301()
        };
        assert_eq!(x_s2, s2.vec(ys));
        // BUGFIX: the s2-vs-s3 cross-checks below were self-compares.
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.shuffle3012()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.shuffle3012()
        };
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        // shuffle1230 is the inverse of shuffle3012, so this round-trips.
        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_shuffle64_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
            x_s2.shuffle2301()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
            x_s3.shuffle2301()
        };
        assert_eq!(x_s2, s2.vec(ys));
        // BUGFIX: the s2-vs-s3 cross-checks below were self-compares.
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
            x_s2.shuffle3012()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
            x_s3.shuffle3012()
        };
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        // shuffle1230 is the inverse of shuffle3012, so this round-trips.
        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_lanes_u32x4() {
        let xs = [0x1, 0x2, 0x3, 0x4];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        // from_lanes/to_lanes must round-trip and agree with Machine::vec
        // on every feature level.
        {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }

        {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }

        {
            let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_lanes_u64x2() {
        let xs = [0x1, 0x2];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        {
            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }

        {
            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }

        {
            let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }

    #[test]
    fn test_vec4_u32x4_s2() {
        let xs = [1, 2, 3, 4];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.extract(2), 3);
        assert_eq!(x_s2.extract(3), 4);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 2), s2.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf]));
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_vec4_u32x4_s4() {
        let xs = [1, 2, 3, 4];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.extract(2), 3);
        assert_eq!(x_s4.extract(3), 4);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 2), s4.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s4.insert(0xf, 3), s4.vec([1, 2, 3, 0xf]));
    }

    #[test]
    fn test_vec2_u64x2_s2() {
        let xs = [0x1, 0x2];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf]));
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_vec4_u64x2_s4() {
        let xs = [0x1, 0x2];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf]));
    }
}
1375
1376pub mod avx2 {
1377    #![allow(non_camel_case_types)]
1378    use crate::soft::{x2, x4};
1379    use crate::types::*;
1380    use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2, G0};
1381    use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4};
1382    use core::arch::x86_64::*;
1383    use core::marker::PhantomData;
1384    use core::ops::*;
1385
1386    #[derive(Copy, Clone)]
1387    pub struct u32x4x2_avx2<NI> {
1388        x: __m256i,
1389        ni: PhantomData<NI>,
1390    }
1391
1392    impl<NI> u32x4x2_avx2<NI> {
1393        #[inline(always)]
1394        fn new(x: __m256i) -> Self {
1395            Self { x, ni: PhantomData }
1396        }
1397    }
1398
1399    impl<NI> u32x4x2<Avx2Machine<NI>> for u32x4x2_avx2<NI> where NI: Copy {}
1400    impl<NI> Store<vec256_storage> for u32x4x2_avx2<NI> {
1401        #[inline(always)]
1402        unsafe fn unpack(p: vec256_storage) -> Self {
1403            Self::new(p.avx)
1404        }
1405    }
1406    impl<NI> StoreBytes for u32x4x2_avx2<NI> {
1407        #[inline(always)]
1408        unsafe fn unsafe_read_le(input: &[u8]) -> Self {
1409            assert_eq!(input.len(), 32);
1410            Self::new(_mm256_loadu_si256(input.as_ptr() as *const _))
1411        }
1412        #[inline(always)]
1413        unsafe fn unsafe_read_be(input: &[u8]) -> Self {
1414            Self::unsafe_read_le(input).bswap()
1415        }
1416        #[inline(always)]
1417        fn write_le(self, out: &mut [u8]) {
1418            unsafe {
1419                assert_eq!(out.len(), 32);
1420                _mm256_storeu_si256(out.as_mut_ptr() as *mut _, self.x)
1421            }
1422        }
1423        #[inline(always)]
1424        fn write_be(self, out: &mut [u8]) {
1425            self.bswap().write_le(out)
1426        }
1427    }
1428    impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 2]> for u32x4x2_avx2<NI> {
1429        #[inline(always)]
1430        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 2] {
1431            unsafe {
1432                [
1433                    u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)),
1434                    u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)),
1435                ]
1436            }
1437        }
1438        #[inline(always)]
1439        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 2]) -> Self {
1440            Self::new(unsafe { _mm256_setr_m128i(x[0].x, x[1].x) })
1441        }
1442    }
1443    impl<NI> Vec2<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x2_avx2<NI> {
1444        #[inline(always)]
1445        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
1446            unsafe {
1447                match i {
1448                    0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)),
1449                    1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)),
1450                    _ => panic!(),
1451                }
1452            }
1453        }
1454        #[inline(always)]
1455        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
1456            Self::new(unsafe {
1457                match i {
1458                    0 => _mm256_inserti128_si256(self.x, w.x, 0),
1459                    1 => _mm256_inserti128_si256(self.x, w.x, 1),
1460                    _ => panic!(),
1461                }
1462            })
1463        }
1464    }
1465    impl<NI> BitOps32 for u32x4x2_avx2<NI> where NI: Copy {}
1466    impl<NI> ArithOps for u32x4x2_avx2<NI> where NI: Copy {}
1467    macro_rules! shuf_lane_bytes {
1468        ($name:ident, $k0:expr, $k1:expr) => {
1469            #[inline(always)]
1470            fn $name(self) -> Self {
1471                Self::new(unsafe {
1472                    _mm256_shuffle_epi8(self.x, _mm256_set_epi64x($k0, $k1, $k0, $k1))
1473                })
1474            }
1475        };
1476    }
1477    macro_rules! rotr_32 {
1478        ($name:ident, $i:expr) => {
1479            #[inline(always)]
1480            fn $name(self) -> Self {
1481                Self::new(unsafe {
1482                    _mm256_or_si256(
1483                        _mm256_srli_epi32(self.x, $i as i32),
1484                        _mm256_slli_epi32(self.x, 32 - $i as i32),
1485                    )
1486                })
1487            }
1488        };
1489    }
1490    impl<NI: Copy> RotateEachWord32 for u32x4x2_avx2<NI> {
1491        rotr_32!(rotate_each_word_right7, 7);
1492        shuf_lane_bytes!(
1493            rotate_each_word_right8,
1494            0x0c0f_0e0d_080b_0a09,
1495            0x0407_0605_0003_0201
1496        );
1497        rotr_32!(rotate_each_word_right11, 11);
1498        rotr_32!(rotate_each_word_right12, 12);
1499        shuf_lane_bytes!(
1500            rotate_each_word_right16,
1501            0x0d0c_0f0e_0908_0b0a,
1502            0x0504_0706_0100_0302
1503        );
1504        rotr_32!(rotate_each_word_right20, 20);
1505        shuf_lane_bytes!(
1506            rotate_each_word_right24,
1507            0x0e0d_0c0f_0a09_080b,
1508            0x0605_0407_0201_0003
1509        );
1510        rotr_32!(rotate_each_word_right25, 25);
1511    }
1512    impl<NI> BitOps0 for u32x4x2_avx2<NI> where NI: Copy {}
1513    impl<NI> From<u32x4x2_avx2<NI>> for vec256_storage {
1514        #[inline(always)]
1515        fn from(x: u32x4x2_avx2<NI>) -> Self {
1516            Self { avx: x.x }
1517        }
1518    }
1519
1520    macro_rules! impl_assign {
1521        ($vec:ident, $Assign:ident, $assign_fn:ident, $bin_fn:ident) => {
1522            impl<NI> $Assign for $vec<NI>
1523            where
1524                NI: Copy,
1525            {
1526                #[inline(always)]
1527                fn $assign_fn(&mut self, rhs: Self) {
1528                    *self = self.$bin_fn(rhs);
1529                }
1530            }
1531        };
1532    }
1533    impl_assign!(u32x4x2_avx2, BitXorAssign, bitxor_assign, bitxor);
1534    impl_assign!(u32x4x2_avx2, BitOrAssign, bitor_assign, bitor);
1535    impl_assign!(u32x4x2_avx2, BitAndAssign, bitand_assign, bitand);
1536    impl_assign!(u32x4x2_avx2, AddAssign, add_assign, add);
1537
1538    macro_rules! impl_bitop {
1539        ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => {
1540            impl<NI> $Op for $vec<NI> {
1541                type Output = Self;
1542                #[inline(always)]
1543                fn $op_fn(self, rhs: Self) -> Self::Output {
1544                    Self::new(unsafe { $impl_fn(self.x, rhs.x) })
1545                }
1546            }
1547        };
1548    }
1549    impl_bitop!(u32x4x2_avx2, BitXor, bitxor, _mm256_xor_si256);
1550    impl_bitop!(u32x4x2_avx2, BitOr, bitor, _mm256_or_si256);
1551    impl_bitop!(u32x4x2_avx2, BitAnd, bitand, _mm256_and_si256);
1552    impl_bitop!(u32x4x2_avx2, AndNot, andnot, _mm256_andnot_si256);
1553    impl_bitop!(u32x4x2_avx2, Add, add, _mm256_add_epi32);
1554
1555    impl<NI> Not for u32x4x2_avx2<NI> {
1556        type Output = Self;
1557        #[inline(always)]
1558        fn not(self) -> Self::Output {
1559            unsafe {
1560                let f = _mm256_set1_epi8(-0x7f);
1561                Self::new(f) ^ self
1562            }
1563        }
1564    }
1565
1566    impl<NI> BSwap for u32x4x2_avx2<NI> {
1567        shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
1568    }
1569
1570    impl<NI> From<x2<u128x1_sse2<YesS3, YesS4, NI>, G0>> for u32x4x2_avx2<NI>
1571    where
1572        NI: Copy,
1573    {
1574        #[inline(always)]
1575        fn from(x: x2<u128x1_sse2<YesS3, YesS4, NI>, G0>) -> Self {
1576            Self::new(unsafe { _mm256_setr_m128i(x.0[0].x, x.0[1].x) })
1577        }
1578    }
1579
1580    impl<NI> LaneWords4 for u32x4x2_avx2<NI> {
1581        #[inline(always)]
1582        fn shuffle_lane_words1230(self) -> Self {
1583            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b1001_0011) })
1584        }
1585        #[inline(always)]
1586        fn shuffle_lane_words2301(self) -> Self {
1587            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0100_1110) })
1588        }
1589        #[inline(always)]
1590        fn shuffle_lane_words3012(self) -> Self {
1591            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0011_1001) })
1592        }
1593    }
1594
1595    ///////////////////////////////////////////////////////////////////////////////////////////
1596
1597    pub type u32x4x4_avx2<NI> = x2<u32x4x2_avx2<NI>, G0>;
1598    impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> {}
1599
1600    impl<NI: Copy> Store<vec512_storage> for u32x4x4_avx2<NI> {
1601        #[inline(always)]
1602        unsafe fn unpack(p: vec512_storage) -> Self {
1603            Self::new([
1604                u32x4x2_avx2::unpack(p.avx[0]),
1605                u32x4x2_avx2::unpack(p.avx[1]),
1606            ])
1607        }
1608    }
1609    impl<NI: Copy> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
1610        #[inline(always)]
1611        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
1612            let [a, b] = self.0[0].to_lanes();
1613            let [c, d] = self.0[1].to_lanes();
1614            [a, b, c, d]
1615        }
1616        #[inline(always)]
1617        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
1618            let ab = u32x4x2_avx2::from_lanes([x[0], x[1]]);
1619            let cd = u32x4x2_avx2::from_lanes([x[2], x[3]]);
1620            Self::new([ab, cd])
1621        }
1622    }
1623    impl<NI: Copy> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
1624        #[inline(always)]
1625        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
1626            match i {
1627                0 => self.0[0].extract(0),
1628                1 => self.0[0].extract(1),
1629                2 => self.0[1].extract(0),
1630                3 => self.0[1].extract(1),
1631                _ => panic!(),
1632            }
1633        }
1634        #[inline(always)]
1635        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
1636            Self::new(match i {
1637                0 | 1 => [self.0[0].insert(w, i), self.0[1]],
1638                2 | 3 => [self.0[0], self.0[1].insert(w, i - 2)],
1639                _ => panic!(),
1640            })
1641        }
1642    }
1643    impl<NI: Copy> Vec4Ext<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
1644        #[inline(always)]
1645        fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) {
1646            /*
1647             * a00:a01 a10:a11
1648             * b00:b01 b10:b11
1649             * c00:c01 c10:c11
1650             * d00:d01 d10:d11
1651             *       =>
1652             * a00:b00 c00:d00
1653             * a01:b01 c01:d01
1654             * a10:b10 c10:d10
1655             * a11:b11 c11:d11
1656             */
1657            unsafe {
1658                let ab00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x20));
1659                let ab01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x31));
1660                let ab10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x20));
1661                let ab11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x31));
1662                let cd00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x20));
1663                let cd01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x31));
1664                let cd10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x20));
1665                let cd11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x31));
1666                (
1667                    Self::new([ab00, cd00]),
1668                    Self::new([ab01, cd01]),
1669                    Self::new([ab10, cd10]),
1670                    Self::new([ab11, cd11]),
1671                )
1672            }
1673        }
1674    }
1675    impl<NI: Copy> Vector<[u32; 16]> for u32x4x4_avx2<NI> {
1676        #[inline(always)]
1677        fn to_scalars(self) -> [u32; 16] {
1678            unsafe { core::mem::transmute(self) }
1679        }
1680    }
1681    impl<NI: Copy> From<u32x4x4_avx2<NI>> for vec512_storage {
1682        #[inline(always)]
1683        fn from(x: u32x4x4_avx2<NI>) -> Self {
1684            Self {
1685                avx: [
1686                    vec256_storage { avx: x.0[0].x },
1687                    vec256_storage { avx: x.0[1].x },
1688                ],
1689            }
1690        }
1691    }
1692    impl<NI: Copy> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI> {
1693        #[inline(always)]
1694        fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self {
1695            Self::new(unsafe {
1696                [
1697                    u32x4x2_avx2::new(_mm256_setr_m128i(x.0[0].x, x.0[1].x)),
1698                    u32x4x2_avx2::new(_mm256_setr_m128i(x.0[2].x, x.0[3].x)),
1699                ]
1700            })
1701        }
1702    }
1703}