#ifndef INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
#include <immintrin.h>
static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
{
    __m256 yl, yh, tmp1, tmp2;
    yl = _mm256_moveldup_ps(y);          // duplicate the real parts of y
    yh = _mm256_movehdup_ps(y);          // duplicate the imaginary parts of y
    tmp1 = _mm256_mul_ps(x, yl);         // ar*cr, ai*cr, ...
    x = _mm256_shuffle_ps(x, x, 0xB1);   // swap re/im within each complex value
    tmp2 = _mm256_mul_ps(x, yh);         // ai*ci, ar*ci, ...
    return _mm256_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, ...
}
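/* Usage sketch (illustrative, not part of this header): multiply two buffers
 * of interleaved complex floats with _mm256_complexmul_ps. The function and
 * buffer names are assumptions for this example; VOLK kernels handle the
 * tail of non-multiple-of-4 lengths in a similar scalar loop. */
static inline void
example_complexmul_buffers(float* out, const float* a, const float* b, unsigned int num_complex)
{
    unsigned int i = 0;
    // 8 floats per register == 4 interleaved complex values per iteration
    for (; i + 4 <= num_complex; i += 4) {
        __m256 x = _mm256_loadu_ps(a + 2 * i);
        __m256 y = _mm256_loadu_ps(b + 2 * i);
        _mm256_storeu_ps(out + 2 * i, _mm256_complexmul_ps(x, y));
    }
    for (; i < num_complex; i++) { // scalar tail
        const float ar = a[2 * i], ai = a[2 * i + 1];
        const float br = b[2 * i], bi = b[2 * i + 1];
        out[2 * i] = ar * br - ai * bi;
        out[2 * i + 1] = ar * bi + ai * br;
    }
}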
static inline __m256 _mm256_conjugate_ps(__m256 x)
{
    const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
    return _mm256_xor_ps(x, conjugator); // flip the sign bit of each imaginary part
}
static inline __m256 _mm256_normalize_ps(__m256 val)
{
    __m256 tmp1 = _mm256_mul_ps(val, val); // re*re, im*im per element
    tmp1 = _mm256_hadd_ps(tmp1, tmp1);     // re*re + im*im per complex value
    tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0)); // pair each squared magnitude with its value
    tmp1 = _mm256_sqrt_ps(tmp1);
    return _mm256_div_ps(val, tmp1); // divide each complex value by its magnitude
}
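/* Usage sketch (the buffer name is an assumption for this example): normalize
 * four interleaved complex floats in place, i.e. scale each to unit magnitude. */
static inline void example_normalize4(float* cplx8 /* 4 complex = 8 floats */)
{
    _mm256_storeu_ps(cplx8, _mm256_normalize_ps(_mm256_loadu_ps(cplx8)));
}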
static inline __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
{
    __m256 complex1, complex2;
    cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // I*I, Q*Q
    cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2);
    complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); // low 128-bit lanes
    complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); // high 128-bit lanes
    return _mm256_hadd_ps(complex1, complex2); // add the I*I and Q*Q values
}
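/* Usage sketch (names assumed for this example): squared magnitudes of eight
 * interleaved complex floats. The two loads carry complex values 0-3 and 4-7,
 * and the result packs |z0|^2 ... |z7|^2 in order. */
static inline __m256 example_magnitudesquared8(const float* cplx16 /* 8 complex */)
{
    return _mm256_magnitudesquared_ps(_mm256_loadu_ps(cplx16),
                                      _mm256_loadu_ps(cplx16 + 8));
}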
static inline __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0,
                                                const __m256 symbols1,
                                                const __m256 points0,
                                                const __m256 points1,
                                                const __m256 scalar)
{
    // Calculate |y - x|^2 * scalar, where 'symbols' are y and 'points' are x.
    const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
    const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
    const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1);
    return _mm256_mul_ps(norms, scalar);
}
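/* Usage sketch (names assumed for this example): scaled squared Euclidean
 * distance between eight complex symbols and eight constellation points,
 * e.g. for soft-decision metrics where 'scale' might be the linear SNR. */
static inline __m256
example_scaled_dist8(const float* symbols, const float* points, float scale)
{
    return _mm256_scaled_norm_dist_ps(_mm256_loadu_ps(symbols),
                                      _mm256_loadu_ps(symbols + 8),
                                      _mm256_loadu_ps(points),
                                      _mm256_loadu_ps(points + 8),
                                      _mm256_set1_ps(scale));
}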
static inline __m256 _mm256_polar_sign_mask_avx(__m128i fbits)
{
    __m256 sign_mask_dummy = _mm256_setzero_ps();
    const __m128i zeros = _mm_set1_epi8(0x00);
    const __m128i sign_extract = _mm_set1_epi8(0x80);
    const __m128i shuffle_mask0 = _mm_setr_epi8(
        0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01,
        0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, 0x03);
    const __m128i shuffle_mask1 = _mm_setr_epi8(
        0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05,
        0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07);

    fbits = _mm_cmpgt_epi8(fbits, zeros);       // 0xff wherever a flag byte is positive
    fbits = _mm_and_si128(fbits, sign_extract); // keep only the sign bit 0x80
    __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0); // bytes 0-3 into the sign byte of floats 0-3
    __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1); // bytes 4-7 into the sign byte of floats 4-7

    __m256 sign_mask =
        _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0);
    return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1);
}
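/* Scalar reference for _mm256_polar_sign_mask_avx (an illustrative sketch):
 * byte k of fbits, when positive, sets the sign bit of float k in the mask.
 * The signed compare above implies flag bytes are expected to be 0 or 1. */
static inline float example_apply_fbit_sign(float llr, signed char fbit)
{
    return (fbit > 0) ? -llr : llr; // what XOR-ing an LLR with the mask achieves
}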
static inline void
_mm256_polar_deinterleave(__m256* llr0, __m256* llr1, __m256 src0, __m256 src1)
{
    __m256 part0 = _mm256_permute2f128_ps(src0, src1, 0x20);
    __m256 part1 = _mm256_permute2f128_ps(src0, src1, 0x31);
    *llr0 = _mm256_shuffle_ps(part0, part1, 0x88); // even-indexed floats, in order
    *llr1 = _mm256_shuffle_ps(part0, part1, 0xdd); // odd-indexed floats, in order
}
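/* Scalar reference for _mm256_polar_deinterleave (a sketch for clarity):
 * the sources hold interleaved pairs; llr0 receives the even-indexed floats
 * and llr1 the odd-indexed ones, each in order. */
static inline void
example_deinterleave_scalar(float* llr0, float* llr1, const float* src, unsigned int n_pairs)
{
    for (unsigned int i = 0; i < n_pairs; i++) {
        llr0[i] = src[2 * i];
        llr1[i] = src[2 * i + 1];
    }
}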
static inline __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
{
    const __m256 sign_mask = _mm256_set1_ps(-0.0f);
    const __m256 abs_mask =
        _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff)));
    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);
    // min-sum rule: sign(a) * sign(b) * min(|a|, |b|)
    __m256 sign = _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask));
    __m256 dst = _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask));
    return _mm256_or_ps(dst, sign);
}
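/* Scalar reference for _mm256_polar_minsum_llrs (a sketch for clarity,
 * ignoring negative-zero inputs): combine two LLRs with the min-sum rule. */
static inline float example_minsum_scalar(float a, float b)
{
    const float aa = a < 0.0f ? -a : a; // |a|
    const float ab = b < 0.0f ? -b : b; // |b|
    const float mag = aa < ab ? aa : ab;
    return ((a < 0.0f) != (b < 0.0f)) ? -mag : mag;
}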
static inline __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
{
    // Flip signs according to the frozen-bit mask, then add the LLR pairs.
    __m256 sign_mask = _mm256_polar_sign_mask_avx(fbits);
    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);
    llr0 = _mm256_xor_ps(llr0, sign_mask);
    __m256 dst = _mm256_add_ps(llr0, llr1);
    return dst;
}

#endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */