#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
#define INCLUDED_volk_32fc_x2_square_dist_32f_a_H

#include <volk/volk_complex.h>
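/*
 * volk_32fc_x2_square_dist_32f
 *
 * Computes the squared Euclidean distance from a single complex point,
 * src0[0], to each point in a complex vector:
 *
 *   target[i] = (re0 - re_i)^2 + (im0 - im_i)^2
 *
 * A typical use is nearest-constellation-point search in a demodulator.
 * A minimal usage sketch (the un-suffixed dispatcher is generated by VOLK;
 * the buffer names here are illustrative):
 *
 *   float dists[64];
 *   volk_32fc_x2_square_dist_32f(dists, &symbol, constellation, 64);
 */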
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_32fc_x2_square_dist_32f_a_avx2(float* target, lv_32fc_t* src0,
                                                       lv_32fc_t* points,
                                                       unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm0, xmm9, xmm10;
    __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    lv_32fc_t diff;
    float sq_dist;
    int bound = num_bytes >> 6;
    int leftovers0 = (num_bytes >> 5) & 1;
    int leftovers1 = (num_bytes >> 4) & 1;
    int leftovers2 = (num_bytes >> 3) & 1;
    int i = 0;
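    /* Each complex point occupies 8 bytes, so the main loop consumes 8 points
       (64 bytes) per iteration; the leftover counters then peel off a
       remaining group of 4, 2, and finally 1 point(s). */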
    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    xmm1 = _mm256_setzero_ps();
    xmm2 = _mm256_load_ps((float*)&points[0]);
    xmm0 = _mm_load_ps((float*)src0);
    xmm0 = _mm_permute_ps(xmm0, 0b01000100);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
    xmm3 = _mm256_load_ps((float*)&points[4]);
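    /* xmm1 now holds src0 broadcast across all four complex lanes. Each
       iteration below squares the difference against 8 points; hadd_ps sums
       the squared real/imag pairs, and the permute restores in-order results,
       since hadd interleaves its outputs across the two 128-bit halves. */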
    for (; i < bound; ++i) {
        xmm4 = _mm256_sub_ps(xmm1, xmm2);
        xmm5 = _mm256_sub_ps(xmm1, xmm3);

        points += 8;
        xmm6 = _mm256_mul_ps(xmm4, xmm4);
        xmm7 = _mm256_mul_ps(xmm5, xmm5);

        xmm2 = _mm256_load_ps((float*)&points[0]);

        xmm4 = _mm256_hadd_ps(xmm6, xmm7);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm3 = _mm256_load_ps((float*)&points[4]);

        _mm256_store_ps(target, xmm4);
        target += 8;
    }
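    /* At most one group of 4 points remains; process it with a single
       256-bit vector. */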
    for (i = 0; i < leftovers0; ++i) {
        xmm2 = _mm256_load_ps((float*)&points[0]);

        xmm4 = _mm256_sub_ps(xmm1, xmm2);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm9 = _mm256_extractf128_ps(xmm4, 1);
        _mm_store_ps(target, xmm9);
        target += 4;
    }
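    /* At most one pair of points remains; process it in the 128-bit
       registers. */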
    for (i = 0; i < leftovers1; ++i) {
        xmm9 = _mm_load_ps((float*)&points[0]);

        xmm10 = _mm_sub_ps(xmm0, xmm9);

        points += 2;

        xmm9 = _mm_mul_ps(xmm10, xmm10);

        xmm10 = _mm_hadd_ps(xmm9, xmm9);

        _mm_storeh_pi((__m64*)target, xmm10);
        target += 2;
    }
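    /* A single trailing point, if any, is finished in scalar code. */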
    for (i = 0; i < leftovers2; ++i) {
        diff = src0[0] - points[0];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist;
    }
}

#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE3
#include <xmmintrin.h>
#include <pmmintrin.h>

static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0,
                                                       lv_32fc_t* points,
                                                       unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;

    __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    lv_32fc_t diff;
    float sq_dist;
    int bound = num_bytes >> 5;
    int i = 0;
    xmm1 = _mm_setzero_ps();
    xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
    xmm2 = _mm_load_ps((float*)&points[0]);
    xmm1 = _mm_movelh_ps(xmm1, xmm1);
    xmm3 = _mm_load_ps((float*)&points[2]);
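    /* _mm_loadl_pi plus _mm_movelh_ps replicate src0 into both complex lanes
       of xmm1, so each 128-bit operation handles two points at once. */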
    for (; i < bound - 1; ++i) {
        xmm4 = _mm_sub_ps(xmm1, xmm2);
        xmm5 = _mm_sub_ps(xmm1, xmm3);
        points += 4;
        xmm6 = _mm_mul_ps(xmm4, xmm4);
        xmm7 = _mm_mul_ps(xmm5, xmm5);

        xmm2 = _mm_load_ps((float*)&points[0]);

        xmm4 = _mm_hadd_ps(xmm6, xmm7);

        xmm3 = _mm_load_ps((float*)&points[2]);

        _mm_store_ps(target, xmm4);
        target += 4;
    }
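    /* The loop ran bound - 1 times because xmm2/xmm3 were preloaded; the
       final preloaded group of 4 points is finished here. Note this tail
       runs unconditionally, so the SSE3 path effectively assumes
       num_points >= 4. */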
    xmm4 = _mm_sub_ps(xmm1, xmm2);
    xmm5 = _mm_sub_ps(xmm1, xmm3);

    points += 4;
    xmm6 = _mm_mul_ps(xmm4, xmm4);
    xmm7 = _mm_mul_ps(xmm5, xmm5);

    xmm4 = _mm_hadd_ps(xmm6, xmm7);

    _mm_store_ps(target, xmm4);
    target += 4;
    if (num_bytes >> 4 & 1) {
        xmm2 = _mm_load_ps((float*)&points[0]);

        xmm4 = _mm_sub_ps(xmm1, xmm2);

        points += 2;

        xmm6 = _mm_mul_ps(xmm4, xmm4);

        xmm4 = _mm_hadd_ps(xmm6, xmm6);

        _mm_storeh_pi((__m64*)target, xmm4);
        target += 2;
    }
    if (num_bytes >> 3 & 1) {
        diff = src0[0] - points[0];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist;
    }
}

#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32fc_x2_square_dist_32f_neon(float* target, lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;
    unsigned int number;
    float32x4x2_t a_vec, b_vec;
    float32x4x2_t diff_vec;
    float32x4_t tmp, tmp1, dist_sq;
    a_vec.val[0] = vdupq_n_f32(lv_creal(src0[0]));
    a_vec.val[1] = vdupq_n_f32(lv_cimag(src0[0]));
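    /* vld2q_f32 de-interleaves four complex points into separate real and
       imaginary vectors, so each iteration needs only two subtracts, two
       multiplies and one add to produce four squared distances. */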
    for (number = 0; number < quarter_points; ++number) {
        b_vec = vld2q_f32((float*)points);
        diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]);
        diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]);
        tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]);
        tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]);

        dist_sq = vaddq_f32(tmp, tmp1);
        vst1q_f32(target, dist_sq);
        points += 4;
        target += 4;
    }
    for (number = quarter_points * 4; number < num_points; ++number) {
        lv_32fc_t diff = src0[0] - *points++;
        *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
    }
}

#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0,
                                                        lv_32fc_t* points,
                                                        unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;

    lv_32fc_t diff;
    float sq_dist;
    unsigned int i = 0;
    for (; i < num_bytes >> 3; ++i) {
        diff = src0[0] - points[i];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[i] = sq_dist;
    }
}

#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32fc_x2_square_dist_32f_a_H */
#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_u_H
#define INCLUDED_volk_32fc_x2_square_dist_32f_u_H
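/* The unaligned (_u) kernels mirror the aligned ones above but use
   unaligned loads and stores, so the buffers need not be 32-byte aligned. */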
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_32fc_x2_square_dist_32f_u_avx2(float* target, lv_32fc_t* src0,
                                                       lv_32fc_t* points,
                                                       unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;

    __m128 xmm0, xmm9;
    __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    lv_32fc_t diff;
    float sq_dist;
    int bound = num_bytes >> 6;
    int leftovers1 = (num_bytes >> 3) & 0b11;
    int i = 0;
    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    xmm1 = _mm256_setzero_ps();
    xmm0 = _mm_loadu_ps((float*)src0);
    xmm0 = _mm_permute_ps(xmm0, 0b01000100);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
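    /* Unlike the aligned variant, the point loads sit inside the loop and
       use _mm256_loadu_ps, since the input may not be 32-byte aligned. */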
    for (; i < bound; ++i) {
        xmm2 = _mm256_loadu_ps((float*)&points[0]);
        xmm3 = _mm256_loadu_ps((float*)&points[4]);
        xmm4 = _mm256_sub_ps(xmm1, xmm2);
        xmm5 = _mm256_sub_ps(xmm1, xmm3);
        points += 8;
        xmm6 = _mm256_mul_ps(xmm4, xmm4);
        xmm7 = _mm256_mul_ps(xmm5, xmm5);

        xmm4 = _mm256_hadd_ps(xmm6, xmm7);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        _mm256_storeu_ps(target, xmm4);
        target += 8;
    }
    if (num_bytes >> 5 & 1) {
        xmm2 = _mm256_loadu_ps((float*)&points[0]);

        xmm4 = _mm256_sub_ps(xmm1, xmm2);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm9 = _mm256_extractf128_ps(xmm4, 1);
        _mm_storeu_ps(target, xmm9);
        target += 4;
    }
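    /* leftovers1 counts the final 0-3 points, finished one at a time in
       scalar code. */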
    for (i = 0; i < leftovers1; ++i) {
        diff = src0[0] - points[0];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist;

        points += 1;
        target += 1;
    }
}

#endif /* LV_HAVE_AVX2 */

#endif /* INCLUDED_volk_32fc_x2_square_dist_32f_u_H */