71 #ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
72 #define INCLUDED_volk_32f_x2_pow_32f_a_H
79 #define POW_POLY_DEGREE 3
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/* Horner-scheme polynomial evaluators built on fused multiply-add:
 * POLYn_AVX2_FMA(x, c0..cn) computes c0 + x*(c1 + x*(... + x*cn)). */
#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2_FMA(x, c0, c1) _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_AVX2_FMA(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

/*!
 * Computes c[i] = pow(a[i], b[i]) for 32-byte-aligned buffers via
 * pow(a, b) = exp(b * ln(a)).  ln() is derived from log2():
 * the IEEE-754 exponent field gives the integer part and a minimax
 * polynomial on the mantissa gives the fraction; exp() uses the
 * Cephes-style degree-5 polynomial with power-of-two argument
 * reduction.
 *
 * \param cVector    output buffer (32-byte aligned)
 * \param bVector    exponents (32-byte aligned)
 * \param aVector    bases (32-byte aligned).  NOTE(review): the bit-level
 *                   log2 assumes positive finite bases; zero/negative/NaN
 *                   inputs are not handled — confirm intended domain.
 * \param num_points number of elements to process
 */
static inline void
volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, const float* bVector,
                               const float* aVector, unsigned int num_points)
{
  float* cPtr = cVector;
  const float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
  __m256 tmp, fx, mask, pow2n, z, y;
  __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
  __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
  __m256i bias, exp, emm0, pi32_0x7f;

  one = _mm256_set1_ps(1.0);
  exp_hi = _mm256_set1_ps(88.3762626647949);    /* expf overflow clamp */
  exp_lo = _mm256_set1_ps(-88.3762626647949);   /* expf underflow clamp */
  ln2 = _mm256_set1_ps(0.6931471805);
  log2EF = _mm256_set1_ps(1.44269504088896341); /* log2(e) */
  half = _mm256_set1_ps(0.5);
  exp_C1 = _mm256_set1_ps(0.693359375);         /* ln(2) split, high part */
  exp_C2 = _mm256_set1_ps(-2.12194440e-4);      /* ln(2) split, low part */
  pi32_0x7f = _mm256_set1_epi32(0x7f);

  /* Cephes expf() polynomial coefficients. */
  exp_p0 = _mm256_set1_ps(1.9875691500e-4);
  exp_p1 = _mm256_set1_ps(1.3981999507e-3);
  exp_p2 = _mm256_set1_ps(8.3334519073e-3);
  exp_p3 = _mm256_set1_ps(4.1665795894e-2);
  exp_p4 = _mm256_set1_ps(1.6666665459e-1);
  exp_p5 = _mm256_set1_ps(5.0000001201e-1);

  for(;number < eighthPoints; number++){
    /* ---- logarithm = ln(a) ---- */
    aVal = _mm256_load_ps(aPtr);
    bias = _mm256_set1_epi32(127);
    leadingOne = _mm256_set1_ps(1.0f);
    /* unbiased IEEE-754 exponent = integer part of log2(a) */
    exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
    logarithm = _mm256_cvtepi32_ps(exp);

    /* mantissa in [1,2): implicit leading one ORed onto the fraction bits */
    frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
    mantissa = POLY5_AVX2_FMA(frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
    mantissa = POLY4_AVX2_FMA(frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
    mantissa = POLY3_AVX2_FMA(frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
    mantissa = POLY2_AVX2_FMA(frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

    /* log2(a) = exponent + poly(frac) * (frac - 1); scale by ln(2) for ln(a) */
    logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
    logarithm = _mm256_mul_ps(logarithm, ln2);

    /* ---- bVal = b * ln(a) ---- */
    bVal = _mm256_load_ps(bPtr);
    bVal = _mm256_mul_ps(bVal, logarithm);

    /* ---- cVal = exp(bVal), Cephes-style ---- */
    bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

    /* fx = round-to-nearest(bVal * log2(e)) via truncate(x + 0.5) ... */
    fx = _mm256_fmadd_ps(bVal, log2EF, half);

    emm0 = _mm256_cvttps_epi32(fx);
    tmp = _mm256_cvtepi32_ps(emm0);

    /* ... corrected where truncation overshot (tmp > fx): subtract 1 */
    mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
    fx = _mm256_sub_ps(tmp, mask);

    /* argument reduction: bVal -= fx * ln(2), using the split constant */
    tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
    bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
    z = _mm256_mul_ps(bVal, bVal);

    /* degree-5 polynomial on the reduced argument */
    y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
    y = _mm256_fmadd_ps(y, bVal, exp_p2);
    y = _mm256_fmadd_ps(y, bVal, exp_p3);
    y = _mm256_fmadd_ps(y, bVal, exp_p4);
    y = _mm256_fmadd_ps(y, bVal, exp_p5);
    y = _mm256_fmadd_ps(y, z, bVal);
    y = _mm256_add_ps(y, one);

    /* rescale by 2^fx: build the float exponent field directly */
    emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

    pow2n = _mm256_castsi256_ps(emm0);
    cVal = _mm256_mul_ps(y, pow2n);

    _mm256_store_ps(cPtr, cVal);

    aPtr += 8;
    bPtr += 8;
    cPtr += 8;
  }

  number = eighthPoints * 8;
  for(;number < num_points; number++){
    /* powf, not pow: stay in single precision like the SSE4.1 paths */
    *cPtr++ = powf(*aPtr++, *bPtr++);
  }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/* Horner-scheme polynomial evaluators (mul/add, no FMA):
 * POLYn_AVX2(x, c0..cn) computes c0 + x*(c1 + x*(... + x*cn)). */
#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

/*!
 * Computes c[i] = pow(a[i], b[i]) for 32-byte-aligned buffers via
 * pow(a, b) = exp(b * ln(a)).  Same algorithm as the FMA variant but
 * expressed with separate multiply/add intrinsics.
 *
 * \param cVector    output buffer (32-byte aligned)
 * \param bVector    exponents (32-byte aligned)
 * \param aVector    bases (32-byte aligned).  NOTE(review): assumes
 *                   positive finite bases — confirm intended domain.
 * \param num_points number of elements to process
 */
static inline void
volk_32f_x2_pow_32f_a_avx2(float* cVector, const float* bVector,
                           const float* aVector, unsigned int num_points)
{
  float* cPtr = cVector;
  const float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
  __m256 tmp, fx, mask, pow2n, z, y;
  __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
  __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
  __m256i bias, exp, emm0, pi32_0x7f;

  one = _mm256_set1_ps(1.0);
  exp_hi = _mm256_set1_ps(88.3762626647949);    /* expf overflow clamp */
  exp_lo = _mm256_set1_ps(-88.3762626647949);   /* expf underflow clamp */
  ln2 = _mm256_set1_ps(0.6931471805);
  log2EF = _mm256_set1_ps(1.44269504088896341); /* log2(e) */
  half = _mm256_set1_ps(0.5);
  exp_C1 = _mm256_set1_ps(0.693359375);         /* ln(2) split, high part */
  exp_C2 = _mm256_set1_ps(-2.12194440e-4);      /* ln(2) split, low part */
  pi32_0x7f = _mm256_set1_epi32(0x7f);

  /* Cephes expf() polynomial coefficients. */
  exp_p0 = _mm256_set1_ps(1.9875691500e-4);
  exp_p1 = _mm256_set1_ps(1.3981999507e-3);
  exp_p2 = _mm256_set1_ps(8.3334519073e-3);
  exp_p3 = _mm256_set1_ps(4.1665795894e-2);
  exp_p4 = _mm256_set1_ps(1.6666665459e-1);
  exp_p5 = _mm256_set1_ps(5.0000001201e-1);

  for(;number < eighthPoints; number++){
    /* ---- logarithm = ln(a) ---- */
    aVal = _mm256_load_ps(aPtr);
    bias = _mm256_set1_epi32(127);
    leadingOne = _mm256_set1_ps(1.0f);
    /* unbiased IEEE-754 exponent = integer part of log2(a) */
    exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
    logarithm = _mm256_cvtepi32_ps(exp);

    /* mantissa in [1,2): implicit leading one ORed onto the fraction bits */
    frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
    mantissa = POLY5_AVX2(frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
    mantissa = POLY4_AVX2(frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
    mantissa = POLY3_AVX2(frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
    mantissa = POLY2_AVX2(frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

    /* log2(a) = exponent + poly(frac) * (frac - 1); scale by ln(2) for ln(a) */
    logarithm = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
    logarithm = _mm256_mul_ps(logarithm, ln2);

    /* ---- bVal = b * ln(a) ---- */
    bVal = _mm256_load_ps(bPtr);
    bVal = _mm256_mul_ps(bVal, logarithm);

    /* ---- cVal = exp(bVal), Cephes-style ---- */
    bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

    /* fx = round-to-nearest(bVal * log2(e)) via truncate(x + 0.5) ... */
    fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);

    emm0 = _mm256_cvttps_epi32(fx);
    tmp = _mm256_cvtepi32_ps(emm0);

    /* ... corrected where truncation overshot (tmp > fx): subtract 1 */
    mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
    fx = _mm256_sub_ps(tmp, mask);

    /* argument reduction: bVal -= fx * ln(2), using the split constant */
    tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
    bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
    z = _mm256_mul_ps(bVal, bVal);

    /* degree-5 polynomial on the reduced argument */
    y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
    y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
    y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
    y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
    y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
    y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
    y = _mm256_add_ps(y, one);

    /* rescale by 2^fx: build the float exponent field directly */
    emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

    pow2n = _mm256_castsi256_ps(emm0);
    cVal = _mm256_mul_ps(y, pow2n);

    _mm256_store_ps(cPtr, cVal);

    aPtr += 8;
    bPtr += 8;
    cPtr += 8;
  }

  number = eighthPoints * 8;
  for(;number < num_points; number++){
    /* powf, not pow: stay in single precision like the SSE4.1 paths */
    *cPtr++ = powf(*aPtr++, *bPtr++);
  }
}

#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/* Horner-scheme polynomial evaluators:
 * POLYn(x, c0..cn) computes c0 + x*(c1 + x*(... + x*cn)). */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/*!
 * Computes c[i] = pow(a[i], b[i]) for 16-byte-aligned buffers via
 * pow(a, b) = exp(b * ln(a)); log2 from IEEE-754 bit manipulation plus a
 * mantissa polynomial, exp via the Cephes-style degree-5 polynomial.
 *
 * \param cVector    output buffer (16-byte aligned)
 * \param bVector    exponents (16-byte aligned)
 * \param aVector    bases (16-byte aligned).  NOTE(review): assumes
 *                   positive finite bases — confirm intended domain.
 * \param num_points number of elements to process
 */
static inline void
volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* bVector,
                             const float* aVector, unsigned int num_points)
{
  float* cPtr = cVector;
  const float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
  __m128 tmp, fx, mask, pow2n, z, y;
  __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
  __m128i bias, exp, emm0, pi32_0x7f;

  one = _mm_set1_ps(1.0);
  exp_hi = _mm_set1_ps(88.3762626647949);    /* expf overflow clamp */
  exp_lo = _mm_set1_ps(-88.3762626647949);   /* expf underflow clamp */
  ln2 = _mm_set1_ps(0.6931471805);
  log2EF = _mm_set1_ps(1.44269504088896341); /* log2(e) */
  half = _mm_set1_ps(0.5);
  exp_C1 = _mm_set1_ps(0.693359375);         /* ln(2) split, high part */
  exp_C2 = _mm_set1_ps(-2.12194440e-4);      /* ln(2) split, low part */
  pi32_0x7f = _mm_set1_epi32(0x7f);

  /* Cephes expf() polynomial coefficients. */
  exp_p0 = _mm_set1_ps(1.9875691500e-4);
  exp_p1 = _mm_set1_ps(1.3981999507e-3);
  exp_p2 = _mm_set1_ps(8.3334519073e-3);
  exp_p3 = _mm_set1_ps(4.1665795894e-2);
  exp_p4 = _mm_set1_ps(1.6666665459e-1);
  exp_p5 = _mm_set1_ps(5.0000001201e-1);

  for(;number < quarterPoints; number++){
    /* ---- logarithm = ln(a) ---- */
    aVal = _mm_load_ps(aPtr);
    bias = _mm_set1_epi32(127);
    leadingOne = _mm_set1_ps(1.0f);
    /* unbiased IEEE-754 exponent = integer part of log2(a) */
    exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
    logarithm = _mm_cvtepi32_ps(exp);

    /* mantissa in [1,2): implicit leading one ORed onto the fraction bits */
    frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
    mantissa = POLY5(frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
    mantissa = POLY4(frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
    mantissa = POLY3(frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
    mantissa = POLY2(frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

    /* log2(a) = exponent + poly(frac) * (frac - 1); scale by ln(2) for ln(a) */
    logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
    logarithm = _mm_mul_ps(logarithm, ln2);

    /* ---- bVal = b * ln(a) ---- */
    bVal = _mm_load_ps(bPtr);
    bVal = _mm_mul_ps(bVal, logarithm);

    /* ---- cVal = exp(bVal), Cephes-style ---- */
    bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

    /* fx = round-to-nearest(bVal * log2(e)) via truncate(x + 0.5) ... */
    fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

    emm0 = _mm_cvttps_epi32(fx);
    tmp = _mm_cvtepi32_ps(emm0);

    /* ... corrected where truncation overshot (tmp > fx): subtract 1 */
    mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
    fx = _mm_sub_ps(tmp, mask);

    /* argument reduction: bVal -= fx * ln(2), using the split constant */
    tmp = _mm_mul_ps(fx, exp_C1);
    z = _mm_mul_ps(fx, exp_C2);
    bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
    z = _mm_mul_ps(bVal, bVal);

    /* degree-5 polynomial on the reduced argument (interleaved mul/add) */
    y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
    y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
    y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
    y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
    y = _mm_add_ps(y, one);

    /* rescale by 2^fx: build the float exponent field directly */
    emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

    pow2n = _mm_castsi128_ps(emm0);
    cVal = _mm_mul_ps(y, pow2n);

    _mm_store_ps(cPtr, cVal);

    aPtr += 4;
    bPtr += 4;
    cPtr += 4;
  }

  number = quarterPoints * 4;
  for(;number < num_points; number++){
    *cPtr++ = powf(*aPtr++, *bPtr++);
  }
}

#endif /* LV_HAVE_SSE4_1 */
432 #ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
433 #define INCLUDED_volk_32f_x2_pow_32f_u_H
437 #include <inttypes.h>
440 #define POW_POLY_DEGREE 3
442 #ifdef LV_HAVE_GENERIC
446 const float* aVector,
unsigned int num_points)
448 float* cPtr = cVector;
449 const float* bPtr = bVector;
450 const float* aPtr = aVector;
451 unsigned int number = 0;
453 for(number = 0; number < num_points; number++){
454 *cPtr++ = powf(*aPtr++, *bPtr++);
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/* Horner-scheme polynomial evaluators (identical redefinition of the
 * aligned section's macros, which the preprocessor permits):
 * POLYn(x, c0..cn) computes c0 + x*(c1 + x*(... + x*cn)). */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/*!
 * Computes c[i] = pow(a[i], b[i]) for unaligned buffers via
 * pow(a, b) = exp(b * ln(a)); same algorithm as the aligned SSE4.1
 * variant but with unaligned loads/stores.
 *
 * \param cVector    output buffer (no alignment requirement)
 * \param bVector    exponents (no alignment requirement)
 * \param aVector    bases (no alignment requirement).  NOTE(review):
 *                   assumes positive finite bases — confirm domain.
 * \param num_points number of elements to process
 */
static inline void
volk_32f_x2_pow_32f_u_sse4_1(float* cVector, const float* bVector,
                             const float* aVector, unsigned int num_points)
{
  float* cPtr = cVector;
  const float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
  __m128 tmp, fx, mask, pow2n, z, y;
  __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
  __m128i bias, exp, emm0, pi32_0x7f;

  one = _mm_set1_ps(1.0);
  exp_hi = _mm_set1_ps(88.3762626647949);    /* expf overflow clamp */
  exp_lo = _mm_set1_ps(-88.3762626647949);   /* expf underflow clamp */
  ln2 = _mm_set1_ps(0.6931471805);
  log2EF = _mm_set1_ps(1.44269504088896341); /* log2(e) */
  half = _mm_set1_ps(0.5);
  exp_C1 = _mm_set1_ps(0.693359375);         /* ln(2) split, high part */
  exp_C2 = _mm_set1_ps(-2.12194440e-4);      /* ln(2) split, low part */
  pi32_0x7f = _mm_set1_epi32(0x7f);

  /* Cephes expf() polynomial coefficients. */
  exp_p0 = _mm_set1_ps(1.9875691500e-4);
  exp_p1 = _mm_set1_ps(1.3981999507e-3);
  exp_p2 = _mm_set1_ps(8.3334519073e-3);
  exp_p3 = _mm_set1_ps(4.1665795894e-2);
  exp_p4 = _mm_set1_ps(1.6666665459e-1);
  exp_p5 = _mm_set1_ps(5.0000001201e-1);

  for(;number < quarterPoints; number++){
    /* ---- logarithm = ln(a) ---- */
    aVal = _mm_loadu_ps(aPtr);
    bias = _mm_set1_epi32(127);
    leadingOne = _mm_set1_ps(1.0f);
    /* unbiased IEEE-754 exponent = integer part of log2(a) */
    exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
    logarithm = _mm_cvtepi32_ps(exp);

    /* mantissa in [1,2): implicit leading one ORed onto the fraction bits */
    frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
    mantissa = POLY5(frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
    mantissa = POLY4(frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
    mantissa = POLY3(frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
    mantissa = POLY2(frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

    /* log2(a) = exponent + poly(frac) * (frac - 1); scale by ln(2) for ln(a) */
    logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
    logarithm = _mm_mul_ps(logarithm, ln2);

    /* ---- bVal = b * ln(a) ---- */
    bVal = _mm_loadu_ps(bPtr);
    bVal = _mm_mul_ps(bVal, logarithm);

    /* ---- cVal = exp(bVal), Cephes-style ---- */
    bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

    /* fx = round-to-nearest(bVal * log2(e)) via truncate(x + 0.5) ... */
    fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

    emm0 = _mm_cvttps_epi32(fx);
    tmp = _mm_cvtepi32_ps(emm0);

    /* ... corrected where truncation overshot (tmp > fx): subtract 1 */
    mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
    fx = _mm_sub_ps(tmp, mask);

    /* argument reduction: bVal -= fx * ln(2), using the split constant */
    tmp = _mm_mul_ps(fx, exp_C1);
    z = _mm_mul_ps(fx, exp_C2);
    bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
    z = _mm_mul_ps(bVal, bVal);

    /* degree-5 polynomial on the reduced argument (interleaved mul/add) */
    y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
    y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
    y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
    y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
    y = _mm_add_ps(y, one);

    /* rescale by 2^fx: build the float exponent field directly */
    emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

    pow2n = _mm_castsi128_ps(emm0);
    cVal = _mm_mul_ps(y, pow2n);

    _mm_storeu_ps(cPtr, cVal);

    aPtr += 4;
    bPtr += 4;
    cPtr += 4;
  }

  number = quarterPoints * 4;
  for(;number < num_points; number++){
    *cPtr++ = powf(*aPtr++, *bPtr++);
  }
}

#endif /* LV_HAVE_SSE4_1 */
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/* Horner-scheme polynomial evaluators built on fused multiply-add
 * (identical redefinition of the aligned section's macros):
 * POLYn_AVX2_FMA(x, c0..cn) computes c0 + x*(c1 + x*(... + x*cn)). */
#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2_FMA(x, c0, c1) _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_AVX2_FMA(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

/*!
 * Computes c[i] = pow(a[i], b[i]) for unaligned buffers via
 * pow(a, b) = exp(b * ln(a)); same algorithm as the aligned AVX2+FMA
 * variant but with unaligned loads/stores.
 *
 * \param cVector    output buffer (no alignment requirement)
 * \param bVector    exponents (no alignment requirement)
 * \param aVector    bases (no alignment requirement).  NOTE(review):
 *                   assumes positive finite bases — confirm domain.
 * \param num_points number of elements to process
 */
static inline void
volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, const float* bVector,
                               const float* aVector, unsigned int num_points)
{
  float* cPtr = cVector;
  const float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
  __m256 tmp, fx, mask, pow2n, z, y;
  __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
  __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
  __m256i bias, exp, emm0, pi32_0x7f;

  one = _mm256_set1_ps(1.0);
  exp_hi = _mm256_set1_ps(88.3762626647949);    /* expf overflow clamp */
  exp_lo = _mm256_set1_ps(-88.3762626647949);   /* expf underflow clamp */
  ln2 = _mm256_set1_ps(0.6931471805);
  log2EF = _mm256_set1_ps(1.44269504088896341); /* log2(e) */
  half = _mm256_set1_ps(0.5);
  exp_C1 = _mm256_set1_ps(0.693359375);         /* ln(2) split, high part */
  exp_C2 = _mm256_set1_ps(-2.12194440e-4);      /* ln(2) split, low part */
  pi32_0x7f = _mm256_set1_epi32(0x7f);

  /* Cephes expf() polynomial coefficients. */
  exp_p0 = _mm256_set1_ps(1.9875691500e-4);
  exp_p1 = _mm256_set1_ps(1.3981999507e-3);
  exp_p2 = _mm256_set1_ps(8.3334519073e-3);
  exp_p3 = _mm256_set1_ps(4.1665795894e-2);
  exp_p4 = _mm256_set1_ps(1.6666665459e-1);
  exp_p5 = _mm256_set1_ps(5.0000001201e-1);

  for(;number < eighthPoints; number++){
    /* ---- logarithm = ln(a) ---- */
    aVal = _mm256_loadu_ps(aPtr);
    bias = _mm256_set1_epi32(127);
    leadingOne = _mm256_set1_ps(1.0f);
    /* unbiased IEEE-754 exponent = integer part of log2(a) */
    exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
    logarithm = _mm256_cvtepi32_ps(exp);

    /* mantissa in [1,2): implicit leading one ORed onto the fraction bits */
    frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
    mantissa = POLY5_AVX2_FMA(frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
    mantissa = POLY4_AVX2_FMA(frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
    mantissa = POLY3_AVX2_FMA(frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
    mantissa = POLY2_AVX2_FMA(frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

    /* log2(a) = exponent + poly(frac) * (frac - 1); scale by ln(2) for ln(a) */
    logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
    logarithm = _mm256_mul_ps(logarithm, ln2);

    /* ---- bVal = b * ln(a) ---- */
    bVal = _mm256_loadu_ps(bPtr);
    bVal = _mm256_mul_ps(bVal, logarithm);

    /* ---- cVal = exp(bVal), Cephes-style ---- */
    bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

    /* fx = round-to-nearest(bVal * log2(e)) via truncate(x + 0.5) ... */
    fx = _mm256_fmadd_ps(bVal, log2EF, half);

    emm0 = _mm256_cvttps_epi32(fx);
    tmp = _mm256_cvtepi32_ps(emm0);

    /* ... corrected where truncation overshot (tmp > fx): subtract 1 */
    mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
    fx = _mm256_sub_ps(tmp, mask);

    /* argument reduction: bVal -= fx * ln(2), using the split constant */
    tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
    bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
    z = _mm256_mul_ps(bVal, bVal);

    /* degree-5 polynomial on the reduced argument */
    y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
    y = _mm256_fmadd_ps(y, bVal, exp_p2);
    y = _mm256_fmadd_ps(y, bVal, exp_p3);
    y = _mm256_fmadd_ps(y, bVal, exp_p4);
    y = _mm256_fmadd_ps(y, bVal, exp_p5);
    y = _mm256_fmadd_ps(y, z, bVal);
    y = _mm256_add_ps(y, one);

    /* rescale by 2^fx: build the float exponent field directly */
    emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

    pow2n = _mm256_castsi256_ps(emm0);
    cVal = _mm256_mul_ps(y, pow2n);

    _mm256_storeu_ps(cPtr, cVal);

    aPtr += 8;
    bPtr += 8;
    cPtr += 8;
  }

  number = eighthPoints * 8;
  for(;number < num_points; number++){
    /* powf, not pow: stay in single precision like the SSE4.1 paths */
    *cPtr++ = powf(*aPtr++, *bPtr++);
  }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
/* Unaligned AVX2 (no-FMA) variant: c[i] = pow(a[i], b[i]) = exp(b*ln(a)).
   NOTE(review): this section appears truncated by extraction — the loop and
   function closing braces and the section #endif fall past the visible end,
   and stray original line numbers are embedded in the text.  Comments only
   are added here; all other tokens left byte-identical. */
694 #include <immintrin.h>
/* Horner-scheme polynomial evaluators: POLYn_AVX2(x, c0..cn) = c0 + x*(c1 + ... + x*cn). */
696 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
697 #define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
698 #define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
699 #define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
700 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
701 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
/* Entry point: unaligned input/output pointers, processes 8 floats per iteration. */
704 volk_32f_x2_pow_32f_u_avx2(
float* cVector,
const float* bVector,
705 const float* aVector,
unsigned int num_points)
707 float* cPtr = cVector;
708 const float* bPtr = bVector;
709 const float* aPtr = aVector;
711 unsigned int number = 0;
712 const unsigned int eighthPoints = num_points / 8;
714 __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
715 __m256 tmp, fx, mask, pow2n, z, y;
716 __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
717 __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
718 __m256i bias, exp, emm0, pi32_0x7f;
720 one = _mm256_set1_ps(1.0);
/* expf over/underflow clamps */
721 exp_hi = _mm256_set1_ps(88.3762626647949);
722 exp_lo = _mm256_set1_ps(-88.3762626647949);
723 ln2 = _mm256_set1_ps(0.6931471805);
/* log2(e) */
724 log2EF = _mm256_set1_ps(1.44269504088896341);
725 half = _mm256_set1_ps(0.5);
/* ln(2) split into high/low parts for accurate argument reduction */
726 exp_C1 = _mm256_set1_ps(0.693359375);
727 exp_C2 = _mm256_set1_ps(-2.12194440e-4);
728 pi32_0x7f = _mm256_set1_epi32(0x7f);
/* Cephes expf() polynomial coefficients */
730 exp_p0 = _mm256_set1_ps(1.9875691500e-4);
731 exp_p1 = _mm256_set1_ps(1.3981999507e-3);
732 exp_p2 = _mm256_set1_ps(8.3334519073e-3);
733 exp_p3 = _mm256_set1_ps(4.1665795894e-2);
734 exp_p4 = _mm256_set1_ps(1.6666665459e-1);
735 exp_p5 = _mm256_set1_ps(5.0000001201e-1);
737 for(;number < eighthPoints; number++){
/* ---- logarithm = ln(a): unbiased IEEE-754 exponent gives the integer
   part of log2(a); a mantissa polynomial supplies the fraction ---- */
739 aVal = _mm256_loadu_ps(aPtr);
740 bias = _mm256_set1_epi32(127);
741 leadingOne = _mm256_set1_ps(1.0f);
742 exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
743 logarithm = _mm256_cvtepi32_ps(exp);
/* mantissa in [1,2): implicit leading one ORed onto the fraction bits */
745 frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
747 #if POW_POLY_DEGREE == 6
748 mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
749 #elif POW_POLY_DEGREE == 5
750 mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
751 #elif POW_POLY_DEGREE == 4
752 mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
753 #elif POW_POLY_DEGREE == 3
754 mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
/* log2(a) = exponent + poly(frac)*(frac - 1); scaled by ln(2) to get ln(a) */
759 logarithm = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
760 logarithm = _mm256_mul_ps(logarithm, ln2);
/* ---- bVal = b * ln(a) ---- */
763 bVal = _mm256_loadu_ps(bPtr);
764 bVal = _mm256_mul_ps(bVal, logarithm);
/* ---- cVal = exp(bVal), Cephes-style ---- */
767 bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
/* fx = round-to-nearest(bVal * log2(e)) via truncate(x + 0.5) ... */
769 fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
771 emm0 = _mm256_cvttps_epi32(fx);
772 tmp = _mm256_cvtepi32_ps(emm0);
/* ... corrected where truncation overshot (tmp > fx): subtract 1 */
774 mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
775 fx = _mm256_sub_ps(tmp, mask);
/* argument reduction: bVal -= fx * ln(2), using the split constant */
777 tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
778 bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
779 z = _mm256_mul_ps(bVal, bVal);
/* degree-5 polynomial on the reduced argument */
781 y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
782 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
783 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
784 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
785 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
786 y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
787 y = _mm256_add_ps(y, one);
/* rescale by 2^fx: build the float exponent field directly */
789 emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
791 pow2n = _mm256_castsi256_ps(emm0);
792 cVal = _mm256_mul_ps(y, pow2n);
794 _mm256_storeu_ps(cPtr, cVal);
/* scalar tail for the remaining num_points % 8 elements */
801 number = eighthPoints * 8;
802 for(;number < num_points; number++){
/* NOTE(review): uses double-precision pow() here while the SSE4.1 paths
   use powf(); consider powf for consistency and to avoid the
   float->double->float round-trip. */
803 *cPtr++ = pow(*aPtr++, *bPtr++);