Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2019 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
84 #ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
85 #define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
86 
87 #include <inttypes.h>
88 #include <stdio.h>
89 #include <volk/volk_complex.h>
90 #include <float.h>
91 
92 
93 #ifdef LV_HAVE_GENERIC
94 
95 static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points){
96  const lv_32fc_t* aPtr = aVector;
97  const lv_32fc_t* bPtr = bVector;
98  lv_32fc_t* cPtr = cVector;
99  unsigned int number = num_points;
100 
101  // unwrap loop
102  while (number >= 8) {
103  *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
104  *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
105  *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
106  *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
107  *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
108  *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
109  *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
110  *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
111  number -= 8;
112  }
113 
114  // clean up any remaining
115  while (number-- > 0) {
116  *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
117  }
118 }
119 #endif /* LV_HAVE_GENERIC */
120 
121 
122 #ifdef LV_HAVE_AVX
123 #include <immintrin.h>
125 
126 static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) {
127  unsigned int number = 0;
128  unsigned int i = 0;
129  const unsigned int quarterPoints = num_points / 4;
130  unsigned int isodd = num_points & 3;
131 
132  __m256 x, y, s, z;
133  lv_32fc_t v_scalar[4] = {scalar, scalar, scalar, scalar};
134 
135  const lv_32fc_t* a = aVector;
136  const lv_32fc_t* b = bVector;
137  lv_32fc_t* c = cVector;
138 
139  // Set up constant scalar vector
140  s = _mm256_loadu_ps((float*)v_scalar);
141 
142  for(;number < quarterPoints; number++) {
143  x = _mm256_loadu_ps((float*)b);
144  y = _mm256_loadu_ps((float*)a);
146  z = _mm256_add_ps(y, z);
147  _mm256_storeu_ps((float*)c,z);
148 
149  a += 4;
150  b += 4;
151  c += 4;
152  }
153 
154  for(i = num_points-isodd; i < num_points; i++) {
155  *c++ = (*a++) + lv_conj(*b++) * scalar;
156  }
157 }
158 #endif /* LV_HAVE_AVX */
159 
160 
161 #ifdef LV_HAVE_SSE3
162 #include <pmmintrin.h>
164 
165 static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) {
166  unsigned int number = 0;
167  const unsigned int halfPoints = num_points / 2;
168 
169  __m128 x, y, s, z;
170  lv_32fc_t v_scalar[2] = {scalar, scalar};
171 
172  const lv_32fc_t* a = aVector;
173  const lv_32fc_t* b = bVector;
174  lv_32fc_t* c = cVector;
175 
176  // Set up constant scalar vector
177  s = _mm_loadu_ps((float*)v_scalar);
178 
179  for(;number < halfPoints; number++){
180  x = _mm_loadu_ps((float*)b);
181  y = _mm_loadu_ps((float*)a);
182  z = _mm_complexconjugatemul_ps(s, x);
183  z = _mm_add_ps(y, z);
184  _mm_storeu_ps((float*)c,z);
185 
186  a += 2;
187  b += 2;
188  c += 2;
189  }
190 
191  if((num_points % 2) != 0) {
192  *c = *a + lv_conj(*b) * scalar;
193  }
194 }
195 #endif /* LV_HAVE_SSE */
196 
197 
198 #ifdef LV_HAVE_AVX
199 #include <immintrin.h>
201 
202 static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) {
203  unsigned int number = 0;
204  unsigned int i = 0;
205  const unsigned int quarterPoints = num_points / 4;
206  unsigned int isodd = num_points & 3;
207 
208  __m256 x, y, s, z;
209  lv_32fc_t v_scalar[4] = {scalar, scalar, scalar, scalar};
210 
211  const lv_32fc_t* a = aVector;
212  const lv_32fc_t* b = bVector;
213  lv_32fc_t* c = cVector;
214 
215  // Set up constant scalar vector
216  s = _mm256_load_ps((float*)v_scalar);
217 
218  for(;number < quarterPoints; number++) {
219  x = _mm256_load_ps((float*)b);
220  y = _mm256_load_ps((float*)a);
222  z = _mm256_add_ps(y, z);
223  _mm256_store_ps((float*)c,z);
224 
225  a += 4;
226  b += 4;
227  c += 4;
228  }
229 
230  for(i = num_points-isodd; i < num_points; i++) {
231  *c++ = (*a++) + lv_conj(*b++) * scalar;
232  }
233 }
234 #endif /* LV_HAVE_AVX */
235 
236 
237 #ifdef LV_HAVE_SSE3
238 #include <pmmintrin.h>
240 
241 static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points) {
242  unsigned int number = 0;
243  const unsigned int halfPoints = num_points / 2;
244 
245  __m128 x, y, s, z;
246  lv_32fc_t v_scalar[2] = {scalar, scalar};
247 
248  const lv_32fc_t* a = aVector;
249  const lv_32fc_t* b = bVector;
250  lv_32fc_t* c = cVector;
251 
252  // Set up constant scalar vector
253  s = _mm_load_ps((float*)v_scalar);
254 
255  for(;number < halfPoints; number++){
256  x = _mm_load_ps((float*)b);
257  y = _mm_load_ps((float*)a);
258  z = _mm_complexconjugatemul_ps(s, x);
259  z = _mm_add_ps(y, z);
260  _mm_store_ps((float*)c,z);
261 
262  a += 2;
263  b += 2;
264  c += 2;
265  }
266 
267  if((num_points % 2) != 0) {
268  *c = *a + lv_conj(*b) * scalar;
269  }
270 }
271 #endif /* LV_HAVE_SSE */
272 
273 
274 #ifdef LV_HAVE_NEON
275 #include <arm_neon.h>
276 
277 static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int num_points){
278  const lv_32fc_t* bPtr = bVector;
279  const lv_32fc_t* aPtr = aVector;
280  lv_32fc_t* cPtr = cVector;
281  unsigned int number = num_points;
282  unsigned int quarter_points = num_points / 4;
283 
284  float32x4x2_t a_val, b_val, c_val, scalar_val;
285  float32x4x2_t tmp_val;
286 
287  scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
288  scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);
289 
290  for(number = 0; number < quarter_points; ++number) {
291  a_val = vld2q_f32((float*)aPtr);
292  b_val = vld2q_f32((float*)bPtr);
293  b_val.val[1] = vnegq_f32(b_val.val[1]);
294  __VOLK_PREFETCH(aPtr + 8);
295  __VOLK_PREFETCH(bPtr + 8);
296 
297  tmp_val.val[1] = vmulq_f32(b_val.val[1], scalar_val.val[0]);
298  tmp_val.val[0] = vmulq_f32(b_val.val[0], scalar_val.val[0]);
299 
300  tmp_val.val[1] = vmlaq_f32(tmp_val.val[1], b_val.val[0], scalar_val.val[1]);
301  tmp_val.val[0] = vmlsq_f32(tmp_val.val[0], b_val.val[1], scalar_val.val[1]);
302 
303  c_val.val[1] = vaddq_f32(a_val.val[1], tmp_val.val[1]);
304  c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]);
305 
306  vst2q_f32((float*)cPtr, c_val);
307 
308  aPtr += 4;
309  bPtr += 4;
310  cPtr += 4;
311  }
312 
313  for(number = quarter_points*4; number < num_points; number++){
314  *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
315  }
316 }
317 #endif /* LV_HAVE_NEON */
318 
319 #endif /* INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H */
_mm256_complexconjugatemul_ps
static __m256 _mm256_complexconjugatemul_ps(__m256 x, __m256 y)
Definition: volk_avx_intrinsics.h:51
volk_sse3_intrinsics.h
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:202
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:165
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:53
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:95
lv_conj
#define lv_conj(x)
Definition: volk_complex.h:87
i
for i
Definition: volk_config_fixed.tmpl.h:25
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:126
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:61
volk_complex.h
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:241
volk_avx_intrinsics.h
_mm_complexconjugatemul_ps
static __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:45
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:277