Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_16ic_magnitude_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
54 #ifndef INCLUDED_volk_16ic_magnitude_16i_a_H
55 #define INCLUDED_volk_16ic_magnitude_16i_a_H
56 
57 #include <volk/volk_common.h>
58 #include <inttypes.h>
59 #include <stdio.h>
60 #include <math.h>
61 #include <limits.h>
62 
63 #ifdef LV_HAVE_AVX2
64 #include <immintrin.h>
65 
66 static inline void
67 volk_16ic_magnitude_16i_a_avx2(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
68 {
69  unsigned int number = 0;
70  const unsigned int eighthPoints = num_points / 8;
71 
72  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
73  int16_t* magnitudeVectorPtr = magnitudeVector;
74 
75  __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
76  __m256 invScalar = _mm256_set1_ps(1.0f/SHRT_MAX);
77  __m256i int1, int2;
78  __m128i short1, short2;
79  __m256 cplxValue1, cplxValue2, result;
80  __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0);
81 
82  for(;number < eighthPoints; number++){
83 
84  int1 = _mm256_load_si256((__m256i*)complexVectorPtr);
85  complexVectorPtr += 16;
86  short1 = _mm256_extracti128_si256(int1,0);
87  short2 = _mm256_extracti128_si256(int1,1);
88 
89  int1 = _mm256_cvtepi16_epi32(short1);
90  int2 = _mm256_cvtepi16_epi32(short2);
91  cplxValue1 = _mm256_cvtepi32_ps(int1);
92  cplxValue2 = _mm256_cvtepi32_ps(int2);
93 
94  cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
95  cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
96 
97  cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
98  cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
99 
100  result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
101 
102  result = _mm256_sqrt_ps(result); // Square root the values
103 
104  result = _mm256_mul_ps(result, vScalar); // Scale the results
105 
106  int1 = _mm256_cvtps_epi32(result);
107  int1 = _mm256_packs_epi32(int1, int1);
108  int1 = _mm256_permutevar8x32_epi32(int1, idx); //permute to compensate for shuffling in hadd and packs
109  short1 = _mm256_extracti128_si256(int1, 0);
110  _mm_store_si128((__m128i*)magnitudeVectorPtr,short1);
111  magnitudeVectorPtr += 8;
112  }
113 
114  number = eighthPoints * 8;
115  magnitudeVectorPtr = &magnitudeVector[number];
116  complexVectorPtr = (const int16_t*)&complexVector[number];
117  for(; number < num_points; number++){
118  const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
119  const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
120  const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
121  *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
122  }
123 }
124 #endif /* LV_HAVE_AVX2 */
125 
126 #ifdef LV_HAVE_SSE3
127 #include <pmmintrin.h>
128 
129 static inline void
130 volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
131 {
132  unsigned int number = 0;
133  const unsigned int quarterPoints = num_points / 4;
134 
135  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
136  int16_t* magnitudeVectorPtr = magnitudeVector;
137 
138  __m128 vScalar = _mm_set_ps1(SHRT_MAX);
139  __m128 invScalar = _mm_set_ps1(1.0f/SHRT_MAX);
140 
141  __m128 cplxValue1, cplxValue2, result;
142 
143  __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
144  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
145 
146  for(;number < quarterPoints; number++){
147 
148  inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
149  inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
150  inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
151  inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
152 
153  inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
154  inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
155  inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
156  inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
157 
158  cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
159  cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
160 
161  complexVectorPtr += 8;
162 
163  cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
164  cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
165 
166  cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
167  cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
168 
169  result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
170 
171  result = _mm_sqrt_ps(result); // Square root the values
172 
173  result = _mm_mul_ps(result, vScalar); // Scale the results
174 
175  _mm_store_ps(outputFloatBuffer, result);
176  *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
177  *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
178  *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
179  *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
180  }
181 
182  number = quarterPoints * 4;
183  magnitudeVectorPtr = &magnitudeVector[number];
184  complexVectorPtr = (const int16_t*)&complexVector[number];
185  for(; number < num_points; number++){
186  const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
187  const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
188  const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
189  *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
190  }
191 }
192 #endif /* LV_HAVE_SSE3 */
193 
194 #ifdef LV_HAVE_SSE
195 #include <xmmintrin.h>
196 
197 static inline void
198 volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
199 {
200  unsigned int number = 0;
201  const unsigned int quarterPoints = num_points / 4;
202 
203  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
204  int16_t* magnitudeVectorPtr = magnitudeVector;
205 
206  __m128 vScalar = _mm_set_ps1(SHRT_MAX);
207  __m128 invScalar = _mm_set_ps1(1.0f/SHRT_MAX);
208 
209  __m128 cplxValue1, cplxValue2, iValue, qValue, result;
210 
211  __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4];
212  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
213 
214  for(;number < quarterPoints; number++){
215 
216  inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
217  inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
218  inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
219  inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
220 
221  cplxValue1 = _mm_load_ps(inputFloatBuffer);
222  complexVectorPtr += 4;
223 
224  inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
225  inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
226  inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
227  inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
228 
229  cplxValue2 = _mm_load_ps(inputFloatBuffer);
230  complexVectorPtr += 4;
231 
232  cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
233  cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
234 
235  // Arrange in i1i2i3i4 format
236  iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
237  // Arrange in q1q2q3q4 format
238  qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
239 
240  iValue = _mm_mul_ps(iValue, iValue); // Square the I values
241  qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
242 
243  result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
244 
245  result = _mm_sqrt_ps(result); // Square root the values
246 
247  result = _mm_mul_ps(result, vScalar); // Scale the results
248 
249  _mm_store_ps(outputFloatBuffer, result);
250  *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
251  *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
252  *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
253  *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
254  }
255 
256  number = quarterPoints * 4;
257  magnitudeVectorPtr = &magnitudeVector[number];
258  complexVectorPtr = (const int16_t*)&complexVector[number];
259  for(; number < num_points; number++){
260  const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
261  const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
262  const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
263  *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
264  }
265 }
266 #endif /* LV_HAVE_SSE */
267 
268 #ifdef LV_HAVE_GENERIC
269 
270 static inline void
271 volk_16ic_magnitude_16i_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
272 {
273  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
274  int16_t* magnitudeVectorPtr = magnitudeVector;
275  unsigned int number = 0;
276  const float scalar = SHRT_MAX;
277  for(number = 0; number < num_points; number++){
278  float real = ((float)(*complexVectorPtr++)) / scalar;
279  float imag = ((float)(*complexVectorPtr++)) / scalar;
280  *magnitudeVectorPtr++ = (int16_t)rintf(sqrtf((real*real) + (imag*imag)) * scalar);
281  }
282 }
283 #endif /* LV_HAVE_GENERIC */
284 
285 #ifdef LV_HAVE_ORC_DISABLED
286 extern void
287 volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points);
288 
289 static inline void
290 volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
291 {
292  volk_16ic_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, SHRT_MAX, num_points);
293 }
294 #endif /* LV_HAVE_ORC */
295 
296 
297 #endif /* INCLUDED_volk_16ic_magnitude_16i_a_H */
298 
299 
300 #ifndef INCLUDED_volk_16ic_magnitude_16i_u_H
301 #define INCLUDED_volk_16ic_magnitude_16i_u_H
302 
303 #include <volk/volk_common.h>
304 #include <inttypes.h>
305 #include <stdio.h>
306 #include <math.h>
307 
308 #ifdef LV_HAVE_AVX2
309 #include <immintrin.h>
310 
311 static inline void
312 volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
313 {
314  unsigned int number = 0;
315  const unsigned int eighthPoints = num_points / 8;
316 
317  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
318  int16_t* magnitudeVectorPtr = magnitudeVector;
319 
320  __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
321  __m256 invScalar = _mm256_set1_ps(1.0f/SHRT_MAX);
322  __m256i int1, int2;
323  __m128i short1, short2;
324  __m256 cplxValue1, cplxValue2, result;
325  __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0);
326 
327  for(;number < eighthPoints; number++){
328 
329  int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
330  complexVectorPtr += 16;
331  short1 = _mm256_extracti128_si256(int1,0);
332  short2 = _mm256_extracti128_si256(int1,1);
333 
334  int1 = _mm256_cvtepi16_epi32(short1);
335  int2 = _mm256_cvtepi16_epi32(short2);
336  cplxValue1 = _mm256_cvtepi32_ps(int1);
337  cplxValue2 = _mm256_cvtepi32_ps(int2);
338 
339  cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
340  cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
341 
342  cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
343  cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
344 
345  result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
346 
347  result = _mm256_sqrt_ps(result); // Square root the values
348 
349  result = _mm256_mul_ps(result, vScalar); // Scale the results
350 
351  int1 = _mm256_cvtps_epi32(result);
352  int1 = _mm256_packs_epi32(int1, int1);
353  int1 = _mm256_permutevar8x32_epi32(int1, idx); //permute to compensate for shuffling in hadd and packs
354  short1 = _mm256_extracti128_si256(int1, 0);
355  _mm_storeu_si128((__m128i*)magnitudeVectorPtr,short1);
356  magnitudeVectorPtr += 8;
357  }
358 
359  number = eighthPoints * 8;
360  magnitudeVectorPtr = &magnitudeVector[number];
361  complexVectorPtr = (const int16_t*)&complexVector[number];
362  for(; number < num_points; number++){
363  const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
364  const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
365  const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
366  *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
367  }
368 }
369 #endif /* LV_HAVE_AVX2 */
370 
371 #ifdef LV_HAVE_NEONV7
372 #include <arm_neon.h>
374 
375 static inline void
376 volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
377 {
378  unsigned int number = 0;
379  unsigned int quarter_points = num_points / 4;
380 
381  const float scalar = SHRT_MAX;
382  const float inv_scalar = 1.0f / scalar;
383 
384  int16_t* magnitudeVectorPtr = magnitudeVector;
385  const lv_16sc_t* complexVectorPtr = complexVector;
386 
387  float32x4_t mag_vec;
388  float32x4x2_t c_vec;
389 
390  for(number = 0; number < quarter_points; number++) {
391  const int16x4x2_t c16_vec = vld2_s16((int16_t*)complexVectorPtr);
392  __VOLK_PREFETCH(complexVectorPtr+4);
393  c_vec.val[0] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[0]));
394  c_vec.val[1] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[1]));
395  // Scale to close to 0-1
396  c_vec.val[0] = vmulq_n_f32(c_vec.val[0], inv_scalar);
397  c_vec.val[1] = vmulq_n_f32(c_vec.val[1], inv_scalar);
398  // vsqrtq_f32 is armv8
399  const float32x4_t mag_vec_squared = _vmagnitudesquaredq_f32(c_vec);
400  mag_vec = vmulq_f32(mag_vec_squared, _vinvsqrtq_f32(mag_vec_squared));
401  // Reconstruct
402  mag_vec = vmulq_n_f32(mag_vec, scalar);
403  // Add 0.5 for correct rounding because vcvtq_s32_f32 truncates.
404  // This works because the magnitude is always positive.
405  mag_vec = vaddq_f32(mag_vec, vdupq_n_f32(0.5));
406  const int16x4_t mag16_vec = vmovn_s32(vcvtq_s32_f32(mag_vec));
407  vst1_s16(magnitudeVectorPtr, mag16_vec);
408  // Advance pointers
409  magnitudeVectorPtr+=4;
410  complexVectorPtr+=4;
411  }
412 
413  // Deal with the rest
414  for(number = quarter_points * 4; number < num_points; number++) {
415  const float real = lv_creal(*complexVectorPtr) * inv_scalar;
416  const float imag = lv_cimag(*complexVectorPtr) * inv_scalar;
417  *magnitudeVectorPtr = (int16_t)rintf(sqrtf((real*real) + (imag*imag)) * scalar);
418  complexVectorPtr++;
419  magnitudeVectorPtr++;
420  }
421 }
422 #endif /* LV_HAVE_NEONV7 */
423 
424 #endif /* INCLUDED_volk_16ic_magnitude_16i_u_H */
lv_cimag
#define lv_cimag(x)
Definition: volk_complex.h:85
_vmagnitudesquaredq_f32
static float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
Definition: volk_neon_intrinsics.h:88
__VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:47
volk_16ic_magnitude_16i_a_sse3
static void volk_16ic_magnitude_16i_a_sse3(int16_t *magnitudeVector, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_magnitude_16i.h:130
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:53
lv_16sc_t
short complex lv_16sc_t
Definition: volk_complex.h:58
volk_16ic_magnitude_16i_generic
static void volk_16ic_magnitude_16i_generic(int16_t *magnitudeVector, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_magnitude_16i.h:271
volk_common.h
_vinvsqrtq_f32
static float32x4_t _vinvsqrtq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:98
volk_16ic_magnitude_16i_a_sse
static void volk_16ic_magnitude_16i_a_sse(int16_t *magnitudeVector, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_magnitude_16i.h:198
volk_neon_intrinsics.h
rintf
static float rintf(float x)
Definition: config.h:31
lv_creal
#define lv_creal(x)
Definition: volk_complex.h:83