/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

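/*!
 * \page volk_32fc_x2_conjugate_dot_prod_32fc
 *
 * \b Overview
 *
 * Computes the conjugate dot product of two complex vectors:
 * result = sum over i of input[i] * conj(taps[i]).
 *
 * Dispatcher prototype (a sketch following the usual VOLK naming convention,
 * where the generated dispatcher simply drops the architecture suffix):
 * \code
 * void volk_32fc_x2_conjugate_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
 * \endcode
 */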
#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H


#include <volk/volk_complex.h>

#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result,
    const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {

  const unsigned int num_bytes = num_points*8; // each lv_32fc_t is two 32-bit floats

  float* res = (float*) result;
  float* in = (float*) input;
  float* tp = (float*) taps;
  unsigned int n_2_ccomplex_blocks = num_bytes >> 4; // blocks of two complex samples

  // Two independent accumulators; unrolling by two shortens the dependency chain.
  float sum0[2] = {0,0};
  float sum1[2] = {0,0};
  unsigned int i = 0;

  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
    // input[k] * conj(taps[k]): real = ar*br + ai*bi, imag = ai*br - ar*bi
    sum0[0] += in[0] * tp[0] + in[1] * tp[1];
    sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
    sum1[0] += in[2] * tp[2] + in[3] * tp[3];
    sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];

    in += 4;
    tp += 4;
  }

  res[0] = sum0[0] + sum1[0];
  res[1] = sum0[1] + sum1[1];

  // If num_points is odd, fold in the final element.
  if (num_bytes >> 3 & 1) {
    *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
  }
}

#endif /*LV_HAVE_GENERIC*/
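
/*!
 * A minimal usage sketch of the generic protokernel above; the expected value
 * is worked out by hand from result = sum over i of input[i] * conj(taps[i]).
 * \code
 * #include <volk/volk_complex.h>
 *
 * lv_32fc_t input[3] = { lv_cmake(1.f, 2.f), lv_cmake(3.f, 4.f), lv_cmake(5.f, 6.f) };
 * lv_32fc_t taps[3]  = { lv_cmake(1.f, 0.f), lv_cmake(0.f, 1.f), lv_cmake(1.f, 1.f) };
 * lv_32fc_t result;
 * volk_32fc_x2_conjugate_dot_prod_32fc_generic(&result, input, taps, 3);
 * // (1+2j)*1 + (3+4j)*(-j) + (5+6j)*(1-j) = (1+2j) + (4-3j) + (11+1j) = 16 + 0j
 * \endcode
 */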

#ifdef LV_HAVE_AVX

#include <immintrin.h>

static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_avx(lv_32fc_t* result,
    const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
{
  // Partial sums for indices i, i+1, i+2 and i+3.
  __m256 sum_a_mult_b_real = _mm256_setzero_ps();
  __m256 sum_a_mult_b_imag = _mm256_setzero_ps();

  for (long unsigned i = 0; i < (num_points & ~3u); i += 4) {
    /* Four complex elements at a time are processed.
     * (ar + j⋅ai)⋅conj(br + j⋅bi) =
     *   ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
     */

    /* Load input and taps, split and duplicate real and imaginary parts of taps.
     * a:      | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
     * b:      | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
     * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
     * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
     */
    __m256 a = _mm256_loadu_ps((const float*) &input[i]);
    __m256 b = _mm256_loadu_ps((const float*) &taps[i]);
    __m256 b_real = _mm256_moveldup_ps(b);
    __m256 b_imag = _mm256_movehdup_ps(b);

    // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
    sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real));
    // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
    sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag));
  }

  // Swap position of −ar⋅bi and ai⋅bi.
  sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
  // | ai⋅br − ar⋅bi | ar⋅br + ai⋅bi |, sum contains four such partial sums.
  __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
  /* Sum the four partial sums: add the high half of sum to the low one, i.e.
   * s1 + s3 and s0 + s2 …
   */
  sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01));
  // … and now (s0 + s2) + (s1 + s3)
  sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2)));
  // Store result.
  __m128 lower = _mm256_extractf128_ps(sum, 0);
  _mm_storel_pi((__m64*) result, lower);

  // Handle the last elements if num_points mod 4 is bigger than 0.
  for (long unsigned i = num_points & ~3u; i < num_points; ++i) {
    *result += lv_cmake(
        lv_creal(input[i]) * lv_creal(taps[i]) + lv_cimag(input[i]) * lv_cimag(taps[i]),
        lv_cimag(input[i]) * lv_creal(taps[i]) - lv_creal(input[i]) * lv_cimag(taps[i]));
  }
}

#endif /* LV_HAVE_AVX */
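
/*!
 * The same identity in plain scalar C, handy as a reference when validating
 * the SIMD branches; conj_dot_scalar is a hypothetical helper, not part of VOLK.
 * \code
 * static inline lv_32fc_t conj_dot_scalar(const lv_32fc_t* input,
 *     const lv_32fc_t* taps, unsigned int num_points)
 * {
 *   lv_32fc_t acc = lv_cmake(0.f, 0.f);
 *   for (unsigned int i = 0; i < num_points; ++i) {
 *     // (ar + j*ai)*conj(br + j*bi) = ar*br + ai*bi + j*(ai*br - ar*bi)
 *     acc += lv_cmake(
 *         lv_creal(input[i]) * lv_creal(taps[i]) + lv_cimag(input[i]) * lv_cimag(taps[i]),
 *         lv_cimag(input[i]) * lv_creal(taps[i]) - lv_creal(input[i]) * lv_cimag(taps[i]));
 *   }
 *   return acc;
 * }
 * \endcode
 */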

#ifdef LV_HAVE_SSE3

#include <xmmintrin.h>
#include <pmmintrin.h>

static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result,
    const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
{
  // Partial sums for indices i and i+1.
  __m128 sum_a_mult_b_real = _mm_setzero_ps();
  __m128 sum_a_mult_b_imag = _mm_setzero_ps();

  for (long unsigned i = 0; i < (num_points & ~1u); i += 2) {
    /* Two complex elements at a time are processed.
     * (ar + j⋅ai)⋅conj(br + j⋅bi) =
     *   ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
     */

    /* Load input and taps, split and duplicate real and imaginary parts of taps.
     * a:      | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
     * b:      | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
     * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
     * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
     */
    __m128 a = _mm_loadu_ps((const float*) &input[i]);
    __m128 b = _mm_loadu_ps((const float*) &taps[i]);
    __m128 b_real = _mm_moveldup_ps(b);
    __m128 b_imag = _mm_movehdup_ps(b);

    // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
    sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real));
    // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
    sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag));
  }

  // Swap position of −ar⋅bi and ai⋅bi.
  sum_a_mult_b_imag = _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag,
                                     _MM_SHUFFLE(2, 3, 0, 1));
  // | ai⋅br − ar⋅bi | ar⋅br + ai⋅bi |, sum contains two such partial sums.
  __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
  // Sum the two partial sums.
  sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2)));
  // Store result.
  _mm_storel_pi((__m64*) result, sum);

  // Handle the last element if num_points mod 2 is 1.
  if (num_points & 1u) {
    *result += lv_cmake(
        lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) +
            lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]),
        lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) -
            lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1]));
  }
}

#endif /*LV_HAVE_SSE3*/
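
/*!
 * _mm_addsub_ps subtracts in even lanes and adds in odd lanes, which is what
 * produces the −ar⋅bi / +ai⋅bi pattern above. A standalone sketch:
 * \code
 * #include <pmmintrin.h>
 * #include <stdio.h>
 *
 * int main(void)
 * {
 *   __m128 acc = _mm_setzero_ps();
 *   __m128 x = _mm_setr_ps(1.f, 2.f, 3.f, 4.f);
 *   acc = _mm_addsub_ps(acc, x); // lanes become -1, +2, -3, +4
 *   float out[4];
 *   _mm_storeu_ps(out, acc);
 *   printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
 *   return 0;
 * }
 * \endcode
 */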

#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result,
    const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {

  unsigned int quarter_points = num_points / 4;
  unsigned int number;

  // Note the swapped roles: a_ptr walks the taps and b_ptr walks the input,
  // so the loop accumulates taps * conj(input); the final conjugate below
  // turns that into the desired input * conj(taps).
  lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
  lv_32fc_t* b_ptr = (lv_32fc_t*) input;
  // for 2-lane vectors, 1st lane holds the real part,
  // 2nd lane holds the imaginary part
  float32x4x2_t a_val, b_val, accumulator;
  float32x4x2_t tmp_imag;
  accumulator.val[0] = vdupq_n_f32(0);
  accumulator.val[1] = vdupq_n_f32(0);

  for(number = 0; number < quarter_points; ++number) {
    a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
    b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
    __VOLK_PREFETCH(a_ptr+8);
    __VOLK_PREFETCH(b_ptr+8);

    // do the first multiply
    tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
    tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);

    // use multiply accumulate/subtract to get result
    tmp_imag.val[1] = vmlsq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
    tmp_imag.val[0] = vmlaq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);

    accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]);
    accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]);

    // increment pointers
    a_ptr += 4;
    b_ptr += 4;
  }
  lv_32fc_t accum_result[4];
  vst2q_f32((float*)accum_result, accumulator);
  *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

  // tail case
  for(number = quarter_points*4; number < num_points; ++number) {
    *result += (*a_ptr++) * lv_conj(*b_ptr++);
  }
  *result = lv_conj(*result);
}
#endif /*LV_HAVE_NEON*/
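
/*!
 * vld2q_f32/vst2q_f32 de-interleave and re-interleave the real and imaginary
 * parts, so a conjugate is just a negation of one lane. A minimal sketch
 * (conj4_neon is a hypothetical helper, not part of VOLK):
 * \code
 * #include <arm_neon.h>
 *
 * // in/out: four interleaved complex floats (re, im pairs)
 * static inline void conj4_neon(const float* in, float* out)
 * {
 *   float32x4x2_t v = vld2q_f32(in); // val[0] = real parts, val[1] = imaginary parts
 *   v.val[1] = vnegq_f32(v.val[1]);  // conjugate: negate the imaginary lane
 *   vst2q_f32(out, v);               // re-interleave on store
 * }
 * \endcode
 */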

#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/

#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H

#include <volk/volk_common.h>
#include <volk/volk_complex.h>
#include <stdio.h>

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_avx(lv_32fc_t* result,
    const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
{
  // Partial sums for indices i, i+1, i+2 and i+3.
  __m256 sum_a_mult_b_real = _mm256_setzero_ps();
  __m256 sum_a_mult_b_imag = _mm256_setzero_ps();

  for (long unsigned i = 0; i < (num_points & ~3u); i += 4) {
    /* Four complex elements at a time are processed.
     * (ar + j⋅ai)⋅conj(br + j⋅bi) =
     *   ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
     */

    /* Load input and taps, split and duplicate real and imaginary parts of taps.
     * a:      | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
     * b:      | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
     * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
     * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
     */
    __m256 a = _mm256_load_ps((const float*) &input[i]);
    __m256 b = _mm256_load_ps((const float*) &taps[i]);
    __m256 b_real = _mm256_moveldup_ps(b);
    __m256 b_imag = _mm256_movehdup_ps(b);

    // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
    sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real));
    // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
    sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag));
  }

  // Swap position of −ar⋅bi and ai⋅bi.
  sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
  // | ai⋅br − ar⋅bi | ar⋅br + ai⋅bi |, sum contains four such partial sums.
  __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
  /* Sum the four partial sums: add the high half of sum to the low one, i.e.
   * s1 + s3 and s0 + s2 …
   */
  sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01));
  // … and now (s0 + s2) + (s1 + s3)
  sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2)));
  // Store result.
  __m128 lower = _mm256_extractf128_ps(sum, 0);
  _mm_storel_pi((__m64*) result, lower);

  // Handle the last elements if num_points mod 4 is bigger than 0.
  for (long unsigned i = num_points & ~3u; i < num_points; ++i) {
    *result += lv_cmake(
        lv_creal(input[i]) * lv_creal(taps[i]) + lv_cimag(input[i]) * lv_cimag(taps[i]),
        lv_cimag(input[i]) * lv_creal(taps[i]) - lv_creal(input[i]) * lv_cimag(taps[i]));
  }
}
#endif /* LV_HAVE_AVX */
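
/*!
 * The _a_ protokernels assume aligned buffers (32 bytes for the AVX loads).
 * A sketch of obtaining suitably aligned memory, assuming the volk_malloc/
 * volk_free allocator from volk/volk_malloc.h and volk_get_alignment():
 * \code
 * #include <volk/volk.h>
 * #include <volk/volk_malloc.h>
 *
 * unsigned int n = 1024;
 * lv_32fc_t* input = (lv_32fc_t*) volk_malloc(n * sizeof(lv_32fc_t), volk_get_alignment());
 * lv_32fc_t* taps  = (lv_32fc_t*) volk_malloc(n * sizeof(lv_32fc_t), volk_get_alignment());
 * lv_32fc_t result;
 * // ... fill input and taps ...
 * volk_32fc_x2_conjugate_dot_prod_32fc_a_avx(&result, input, taps, n);
 * volk_free(taps);
 * volk_free(input);
 * \endcode
 */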

#ifdef LV_HAVE_SSE3

#include <xmmintrin.h>
#include <pmmintrin.h>

static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse3(lv_32fc_t* result,
    const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
{
  // Partial sums for indices i and i+1.
  __m128 sum_a_mult_b_real = _mm_setzero_ps();
  __m128 sum_a_mult_b_imag = _mm_setzero_ps();

  for (long unsigned i = 0; i < (num_points & ~1u); i += 2) {
    /* Two complex elements at a time are processed.
     * (ar + j⋅ai)⋅conj(br + j⋅bi) =
     *   ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
     */

    /* Load input and taps, split and duplicate real and imaginary parts of taps.
     * a:      | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
     * b:      | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
     * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
     * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
     */
    __m128 a = _mm_load_ps((const float*) &input[i]);
    __m128 b = _mm_load_ps((const float*) &taps[i]);
    __m128 b_real = _mm_moveldup_ps(b);
    __m128 b_imag = _mm_movehdup_ps(b);

    // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
    sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real));
    // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
    sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag));
  }

  // Swap position of −ar⋅bi and ai⋅bi.
  sum_a_mult_b_imag = _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag,
                                     _MM_SHUFFLE(2, 3, 0, 1));
  // | ai⋅br − ar⋅bi | ar⋅br + ai⋅bi |, sum contains two such partial sums.
  __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
  // Sum the two partial sums.
  sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2)));
  // Store result.
  _mm_storel_pi((__m64*) result, sum);

  // Handle the last element if num_points mod 2 is 1.
  if (num_points & 1u) {
    *result += lv_cmake(
        lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) +
            lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]),
        lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) -
            lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1]));
  }
}

#endif /*LV_HAVE_SSE3*/


#ifdef LV_HAVE_GENERIC


static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result,
    const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {

  const unsigned int num_bytes = num_points*8; // each lv_32fc_t is two 32-bit floats

  float* res = (float*) result;
  float* in = (float*) input;
  float* tp = (float*) taps;
  unsigned int n_2_ccomplex_blocks = num_bytes >> 4; // blocks of two complex samples

  // Two independent accumulators; unrolling by two shortens the dependency chain.
  float sum0[2] = {0,0};
  float sum1[2] = {0,0};
  unsigned int i = 0;

  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
    // input[k] * conj(taps[k]): real = ar*br + ai*bi, imag = ai*br - ar*bi
    sum0[0] += in[0] * tp[0] + in[1] * tp[1];
    sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
    sum1[0] += in[2] * tp[2] + in[3] * tp[3];
    sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];

    in += 4;
    tp += 4;
  }

  res[0] = sum0[0] + sum1[0];
  res[1] = sum0[1] + sum1[1];

  // If num_points is odd, fold in the final element.
  if (num_bytes >> 3 & 1) {
    *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
  }
}

#endif /*LV_HAVE_GENERIC*/


#if LV_HAVE_SSE && LV_HAVE_64

static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result,
    const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {

  const unsigned int num_bytes = num_points*8;

  __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};

  __VOLK_ASM __VOLK_VOLATILE
    (
     "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
     "#                       const float *taps, unsigned num_bytes)\n\t"
     "# float sum0 = 0;\n\t"
     "# float sum1 = 0;\n\t"
     "# float sum2 = 0;\n\t"
     "# float sum3 = 0;\n\t"
     "# do {\n\t"
     "#   sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
     "#   sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
     "#   sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
     "#   sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
     "#   input += 4;\n\t"
     "#   taps += 4;  \n\t"
     "# } while (--n_2_ccomplex_blocks != 0);\n\t"
     "# result[0] = sum0 + sum2;\n\t"
     "# result[1] = sum1 + sum3;\n\t"
     "# TODO: prefetch and better scheduling\n\t"
     "# note: taps are conjugated up front by flipping their imaginary sign bits (xmm8)\n\t"
     "  xor %%r9, %%r9\n\t"
     "  xor %%r10, %%r10\n\t"
     "  movq %[conjugator], %%r9\n\t"
     "  movq %%rcx, %%rax\n\t"
     "  movaps 0(%%r9), %%xmm8\n\t"
     "  movq %%rcx, %%r8\n\t"
     "  movq %[rsi], %%r9\n\t"
     "  movq %[rdx], %%r10\n\t"
     "  xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
     "  movaps 0(%%r9), %%xmm0\n\t"
     "  xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
     "  movups 0(%%r10), %%xmm2\n\t"
     "  shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
     "  shr $4, %%r8\n\t"
     "  xorps %%xmm8, %%xmm2\n\t"
     "  jmp .%=L1_test\n\t"
     "  # 4 taps / loop\n\t"
     "  # something like ?? cycles / loop\n\t"
     ".%=Loop1: \n\t"
     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
     "#  movaps (%%r9), %%xmmA\n\t"
     "#  movaps (%%r10), %%xmmB\n\t"
     "#  movaps %%xmmA, %%xmmZ\n\t"
     "#  shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
     "#  mulps %%xmmB, %%xmmA\n\t"
     "#  mulps %%xmmZ, %%xmmB\n\t"
     "#  # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
     "#  xorps %%xmmPN, %%xmmA\n\t"
     "#  movaps %%xmmA, %%xmmZ\n\t"
     "#  unpcklps %%xmmB, %%xmmA\n\t"
     "#  unpckhps %%xmmB, %%xmmZ\n\t"
     "#  movaps %%xmmZ, %%xmmY\n\t"
     "#  shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
     "#  shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
     "#  addps %%xmmZ, %%xmmA\n\t"
     "#  addps %%xmmA, %%xmmC\n\t"
     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
     "  movaps 16(%%r9), %%xmm1\n\t"
     "  movaps %%xmm0, %%xmm4\n\t"
     "  mulps %%xmm2, %%xmm0\n\t"
     "  shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
     "  movaps 16(%%r10), %%xmm3\n\t"
     "  movaps %%xmm1, %%xmm5\n\t"
     "  xorps %%xmm8, %%xmm3\n\t"
     "  addps %%xmm0, %%xmm6\n\t"
     "  mulps %%xmm3, %%xmm1\n\t"
     "  shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
     "  addps %%xmm1, %%xmm6\n\t"
     "  mulps %%xmm4, %%xmm2\n\t"
     "  movaps 32(%%r9), %%xmm0\n\t"
     "  addps %%xmm2, %%xmm7\n\t"
     "  mulps %%xmm5, %%xmm3\n\t"
     "  add $32, %%r9\n\t"
     "  movaps 32(%%r10), %%xmm2\n\t"
     "  addps %%xmm3, %%xmm7\n\t"
     "  add $32, %%r10\n\t"
     "  xorps %%xmm8, %%xmm2\n\t"
     ".%=L1_test:\n\t"
     "  dec %%rax\n\t"
     "  jge .%=Loop1\n\t"
     "  # We've handled the bulk of multiplies up to here.\n\t"
     "  # Let's see if original n_2_ccomplex_blocks was odd.\n\t"
     "  # If so, we've got 2 more taps to do.\n\t"
     "  and $1, %%r8\n\t"
     "  je .%=Leven\n\t"
     "  # The count was odd, do 2 more taps.\n\t"
     "  # Note that we've already got mm0/mm2 preloaded\n\t"
     "  # from the main loop.\n\t"
     "  movaps %%xmm0, %%xmm4\n\t"
     "  mulps %%xmm2, %%xmm0\n\t"
     "  shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
     "  addps %%xmm0, %%xmm6\n\t"
     "  mulps %%xmm4, %%xmm2\n\t"
     "  addps %%xmm2, %%xmm7\n\t"
     ".%=Leven:\n\t"
     "  # neg inversor\n\t"
     "  xorps %%xmm1, %%xmm1\n\t"
     "  mov $0x80000000, %%r9\n\t"
     "  movd %%r9, %%xmm1\n\t"
     "  shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
     "  # pfpnacc\n\t"
     "  xorps %%xmm1, %%xmm6\n\t"
     "  movaps %%xmm6, %%xmm2\n\t"
     "  unpcklps %%xmm7, %%xmm6\n\t"
     "  unpckhps %%xmm7, %%xmm2\n\t"
     "  movaps %%xmm2, %%xmm3\n\t"
     "  shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
     "  shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
     "  addps %%xmm2, %%xmm6\n\t"
     "  # xmm6 = r1 i2 r3 i4\n\t"
     "  movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
     "  addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
     "  movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
     :
     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator)
     :"rax", "r8", "r9", "r10"
     );

  // Handle the one leftover complex element if num_points was odd.
  int getem = num_bytes % 16;

  for(; getem > 0; getem -= 8) {
    *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
  }
}
#endif /*LV_HAVE_SSE && LV_HAVE_64*/
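
/*!
 * The conjugator constant above flips the sign bit of every imaginary lane.
 * The same trick expressed with intrinsics instead of inline asm
 * (conj_pair_sse is a hypothetical helper, not part of VOLK):
 * \code
 * #include <xmmintrin.h>
 *
 * // Conjugate two packed complex floats by XORing the sign bits of the
 * // imaginary lanes, mirroring the `xorps %%xmm8, ...` in the asm above.
 * static inline __m128 conj_pair_sse(__m128 x)
 * {
 *   const __m128 conjugator = _mm_setr_ps(0.0f, -0.0f, 0.0f, -0.0f);
 *   return _mm_xor_ps(x, conjugator);
 * }
 * \endcode
 */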

#if LV_HAVE_SSE && LV_HAVE_32
static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result,
    const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {

  const unsigned int num_bytes = num_points*8;

  __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};

  int bound = num_bytes >> 4;
  int leftovers = num_bytes % 16;

  __VOLK_ASM __VOLK_VOLATILE
    (
     "  #pushl %%ebp\n\t"
     "  #movl %%esp, %%ebp\n\t"
     "  #movl 12(%%ebp), %%eax # input\n\t"
     "  #movl 16(%%ebp), %%edx # taps\n\t"
     "  #movl 20(%%ebp), %%ecx # n_bytes\n\t"
     "  movaps 0(%[conjugator]), %%xmm1\n\t"
     "  xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
     "  movaps 0(%[eax]), %%xmm0\n\t"
     "  xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
     "  movaps 0(%[edx]), %%xmm2\n\t"
     "  movl %[ecx], (%[out])\n\t"
     "  shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t"

     "  xorps %%xmm1, %%xmm2\n\t"
     "  jmp .%=L1_test\n\t"
     "  # 4 taps / loop\n\t"
     "  # something like ?? cycles / loop\n\t"
     ".%=Loop1: \n\t"
     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
     "#  movaps (%[eax]), %%xmmA\n\t"
     "#  movaps (%[edx]), %%xmmB\n\t"
     "#  movaps %%xmmA, %%xmmZ\n\t"
     "#  shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
     "#  mulps %%xmmB, %%xmmA\n\t"
     "#  mulps %%xmmZ, %%xmmB\n\t"
     "#  # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
     "#  xorps %%xmmPN, %%xmmA\n\t"
     "#  movaps %%xmmA, %%xmmZ\n\t"
     "#  unpcklps %%xmmB, %%xmmA\n\t"
     "#  unpckhps %%xmmB, %%xmmZ\n\t"
     "#  movaps %%xmmZ, %%xmmY\n\t"
     "#  shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
     "#  shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
     "#  addps %%xmmZ, %%xmmA\n\t"
     "#  addps %%xmmA, %%xmmC\n\t"
     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
     "  movaps 16(%[edx]), %%xmm3\n\t"
     "  movaps %%xmm0, %%xmm4\n\t"
     "  xorps %%xmm1, %%xmm3\n\t"
     "  mulps %%xmm2, %%xmm0\n\t"
     "  movaps 16(%[eax]), %%xmm1\n\t"
     "  shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
     "  movaps %%xmm1, %%xmm5\n\t"
     "  addps %%xmm0, %%xmm6\n\t"
     "  mulps %%xmm3, %%xmm1\n\t"
     "  shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
     "  addps %%xmm1, %%xmm6\n\t"
     "  movaps 0(%[conjugator]), %%xmm1\n\t"
     "  mulps %%xmm4, %%xmm2\n\t"
     "  movaps 32(%[eax]), %%xmm0\n\t"
     "  addps %%xmm2, %%xmm7\n\t"
     "  mulps %%xmm5, %%xmm3\n\t"
     "  addl $32, %[eax]\n\t"
     "  movaps 32(%[edx]), %%xmm2\n\t"
     "  addps %%xmm3, %%xmm7\n\t"
     "  xorps %%xmm1, %%xmm2\n\t"
     "  addl $32, %[edx]\n\t"
     ".%=L1_test:\n\t"
     "  decl %[ecx]\n\t"
     "  jge .%=Loop1\n\t"
     "  # We've handled the bulk of multiplies up to here.\n\t"
     "  # Let's see if original n_2_ccomplex_blocks was odd.\n\t"
     "  # If so, we've got 2 more taps to do.\n\t"
     "  movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t"
     "  shrl $4, %[ecx]\n\t"
     "  andl $1, %[ecx]\n\t"
     "  je .%=Leven\n\t"
     "  # The count was odd, do 2 more taps.\n\t"
     "  # Note that we've already got mm0/mm2 preloaded\n\t"
     "  # from the main loop.\n\t"
     "  movaps %%xmm0, %%xmm4\n\t"
     "  mulps %%xmm2, %%xmm0\n\t"
     "  shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
     "  addps %%xmm0, %%xmm6\n\t"
     "  mulps %%xmm4, %%xmm2\n\t"
     "  addps %%xmm2, %%xmm7\n\t"
     ".%=Leven:\n\t"
     "  # neg inversor\n\t"
     "  #movl 8(%%ebp), %[eax] \n\t"
     "  xorps %%xmm1, %%xmm1\n\t"
     "  movl $0x80000000, (%[out])\n\t"
     "  movss (%[out]), %%xmm1\n\t"
     "  shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
     "  # pfpnacc\n\t"
     "  xorps %%xmm1, %%xmm6\n\t"
     "  movaps %%xmm6, %%xmm2\n\t"
     "  unpcklps %%xmm7, %%xmm6\n\t"
     "  unpckhps %%xmm7, %%xmm2\n\t"
     "  movaps %%xmm2, %%xmm3\n\t"
     "  shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
     "  shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
     "  addps %%xmm2, %%xmm6\n\t"
     "  # xmm6 = r1 i2 r3 i4\n\t"
     "  #movl 8(%%ebp), %[eax] # @result\n\t"
     "  movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
     "  addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
     "  movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t"
     "  #popl %%ebp\n\t"
     :
     : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator)
     );

  // Handle the one leftover complex element if num_points was odd.
  for(; leftovers > 0; leftovers -= 8) {
    *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
  }
}
#endif /*LV_HAVE_SSE && LV_HAVE_32*/


#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H*/