Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_neon_intrinsics.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*
24  * Copyright (c) 2016-2019 ARM Limited.
25  *
26  * SPDX-License-Identifier: MIT
27  *
28  * Permission is hereby granted, free of charge, to any person obtaining a copy
29  * of this software and associated documentation files (the "Software"), to
30  * deal in the Software without restriction, including without limitation the
31  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
32  * sell copies of the Software, and to permit persons to whom the Software is
33  * furnished to do so, subject to the following conditions:
34  *
35  * The above copyright notice and this permission notice shall be included in all
36  * copies or substantial portions of the Software.
37  *
38  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
39  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
40  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
41  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
42  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
43  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
44  * SOFTWARE.
45  *
46  * _vtaylor_polyq_f32
47  * _vlogq_f32
48  *
49  */
50 
51 /* Copyright (C) 2011 Julien Pommier
52 
53  This software is provided 'as-is', without any express or implied
54  warranty. In no event will the authors be held liable for any damages
55  arising from the use of this software.
56 
57  Permission is granted to anyone to use this software for any purpose,
58  including commercial applications, and to alter it and redistribute it
59  freely, subject to the following restrictions:
60 
61  1. The origin of this software must not be misrepresented; you must not
62  claim that you wrote the original software. If you use this software
63  in a product, an acknowledgment in the product documentation would be
64  appreciated but is not required.
65  2. Altered source versions must be plainly marked as such, and must not be
66  misrepresented as being the original software.
67  3. This notice may not be removed or altered from any source distribution.
68 
69  (this is the zlib license)
70 
71  _vsincosq_f32
72 
73 */
74 
75 /*
76  * This file is intended to hold NEON intrinsics of intrinsics.
77  * They should be used in VOLK kernels to avoid copy-pasta.
78  */
79 
80 #ifndef INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
81 #define INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
82 #ifdef LV_HAVE_NEON
83 #include <arm_neon.h>
84 
85 
/* Magnitude squared for float32x4x2_t.
 * Element-wise |z|^2 = re^2 + im^2 over 4 complex values
 * (val[0] holds the real parts, val[1] the imaginary parts). */
static inline float32x4_t
_vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
{
    const float32x4_t re_sq = vmulq_f32(cmplxValue.val[0], cmplxValue.val[0]);
    const float32x4_t im_sq = vmulq_f32(cmplxValue.val[1], cmplxValue.val[1]);
    return vaddq_f32(re_sq, im_sq);
}
96 
/* Inverse square root for float32x4_t.
 * Hardware reciprocal-sqrt estimate refined by two Newton-Raphson
 * steps (vrsqrtsq_f32 supplies the correction factor). */
static inline float32x4_t _vinvsqrtq_f32(float32x4_t x)
{
    float32x4_t est = vrsqrteq_f32(x);
    for (int step = 0; step < 2; ++step) {
        est = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, est), est), est);
    }
    return est;
}
106 
/* Reciprocal (1/x) for float32x4_t.
 * Hardware reciprocal estimate refined by two Newton-Raphson steps
 * (vrecpsq_f32 supplies the correction factor). */
static inline float32x4_t _vinvq_f32(float32x4_t x)
{
    float32x4_t est = vrecpeq_f32(x);
    for (int step = 0; step < 2; ++step) {
        est = vmulq_f32(vrecpsq_f32(x, est), est);
    }
    return est;
}
116 
/* Complex multiplication for float32x4x2_t.
 * (ar + j*ai) * (br + j*bi) = (ar*br - ai*bi) + j*(ar*bi + ai*br),
 * computed element-wise for 4 complex values. val[0] holds the real
 * parts and val[1] the imaginary parts, in both inputs and the result. */
static inline float32x4x2_t
_vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val)
{
    float32x4x2_t c_val;

    // Real part: real*real minus imag*imag
    c_val.val[0] = vsubq_f32(vmulq_f32(a_val.val[0], b_val.val[0]),
                             vmulq_f32(a_val.val[1], b_val.val[1]));
    // Imaginary part: sum of the cross terms
    c_val.val[1] = vaddq_f32(vmulq_f32(a_val.val[0], b_val.val[1]),
                             vmulq_f32(a_val.val[1], b_val.val[0]));

    return c_val;
}
140 
/* From ARM Compute Library, MIT license.
 *
 * Degree-7 polynomial evaluation in an Estrin-like scheme
 * (vmlaq_f32(a, b, c) computes a + b*c).  The result expands to:
 *   coeffs[0] + coeffs[4]*x   + coeffs[2]*x^2 + coeffs[6]*x^3
 * + coeffs[1]*x^4 + coeffs[5]*x^5 + coeffs[3]*x^6 + coeffs[7]*x^7
 * Note the interleaved coefficient ordering — callers (e.g. the
 * log_tab in this file) must lay their coefficients out to match. */
static inline float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t coeffs[8])
{
    float32x4_t cA = vmlaq_f32(coeffs[0], coeffs[4], x); // coeffs[0] + coeffs[4]*x
    float32x4_t cB = vmlaq_f32(coeffs[2], coeffs[6], x); // coeffs[2] + coeffs[6]*x
    float32x4_t cC = vmlaq_f32(coeffs[1], coeffs[5], x); // coeffs[1] + coeffs[5]*x
    float32x4_t cD = vmlaq_f32(coeffs[3], coeffs[7], x); // coeffs[3] + coeffs[7]*x
    float32x4_t x2 = vmulq_f32(x, x);
    float32x4_t x4 = vmulq_f32(x2, x2);
    // (cA + cB*x^2) + (cC + cD*x^2)*x^4
    float32x4_t res = vmlaq_f32(vmlaq_f32(cA, cB, x2), vmlaq_f32(cC, cD, x2), x4);
    return res;
}
153 
/* Natural logarithm, element-wise for 4 packed floats.
 * From ARM Compute Library, MIT license.
 *
 * Splits x = m2 * 2^m via IEEE-754 bit manipulation (m = unbiased
 * exponent, m2 = mantissa part), approximates ln(m2) with a degree-7
 * polynomial, then reconstructs ln(x) = ln(m2) + m*ln(2).
 *
 * NOTE(review): no special-casing of x <= 0, NaN, infinities, or
 * denormals is visible here; results for such inputs are whatever the
 * bit arithmetic below happens to produce. */
static inline float32x4_t _vlogq_f32(float32x4_t x)
{
    /* Polynomial coefficients for ln() in the interleaved order
     * expected by _vtaylor_polyq_f32. */
    const float32x4_t log_tab[8] = {
        vdupq_n_f32(-2.29561495781f),
        vdupq_n_f32(-2.47071170807f),
        vdupq_n_f32(-5.68692588806f),
        vdupq_n_f32(-0.165253549814f),
        vdupq_n_f32(5.17591238022f),
        vdupq_n_f32(0.844007015228f),
        vdupq_n_f32(4.58445882797f),
        vdupq_n_f32(0.0141278216615f),
    };

    const int32x4_t CONST_127 = vdupq_n_s32(127); // IEEE-754 single-precision exponent bias
    const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)

    // Extract the unbiased exponent: shift the exponent field down to
    // bit 0 and subtract the bias.
    int32x4_t m = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
    // Subtract m << 23 from the raw bits, forcing the exponent field
    // back to the bias and leaving the mantissa part (in [1, 2) for
    // normal positive inputs).
    float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));

    // Polynomial approximation of ln(val)
    float32x4_t poly = _vtaylor_polyq_f32(val, log_tab);

    // Reconstruct: ln(x) = ln(val) + m * ln(2)
    poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);

    return poly;
}
184 
/* Evaluation of 4 sines & cosines at once.
 * Optimized from here (zlib license)
 * http://gruntthepeon.free.fr/ssemath/
 *
 * Cephes-style argument reduction: fold x into an octant of the circle
 * (multiples of pi/4), then evaluate one polynomial for sin and one for
 * cos and select/sign the results per octant.  Returns val[0] = sin(x),
 * val[1] = cos(x). */
static inline float32x4x2_t _vsincosq_f32(float32x4_t x) {
    // -DP1 - DP2 - DP3 together form -pi/4 in extended precision, so the
    // reduction x - y*(pi/4) is performed in three exact-ish steps.
    const float32x4_t c_minus_cephes_DP1 = vdupq_n_f32(-0.78515625);
    const float32x4_t c_minus_cephes_DP2 = vdupq_n_f32(-2.4187564849853515625e-4);
    const float32x4_t c_minus_cephes_DP3 = vdupq_n_f32(-3.77489497744594108e-8);
    // Minimax polynomial coefficients for sin (odd part) and cos (even part)
    const float32x4_t c_sincof_p0 = vdupq_n_f32(-1.9515295891e-4);
    const float32x4_t c_sincof_p1 = vdupq_n_f32(8.3321608736e-3);
    const float32x4_t c_sincof_p2 = vdupq_n_f32(-1.6666654611e-1);
    const float32x4_t c_coscof_p0 = vdupq_n_f32(2.443315711809948e-005);
    const float32x4_t c_coscof_p1 = vdupq_n_f32(-1.388731625493765e-003);
    const float32x4_t c_coscof_p2 = vdupq_n_f32(4.166664568298827e-002);
    const float32x4_t c_cephes_FOPI = vdupq_n_f32(1.27323954473516); // 4 / M_PI

    const float32x4_t CONST_1 = vdupq_n_f32(1.f);
    const float32x4_t CONST_1_2 = vdupq_n_f32(0.5f);
    const float32x4_t CONST_0 = vdupq_n_f32(0.f);
    const uint32x4_t CONST_2 = vdupq_n_u32(2);
    const uint32x4_t CONST_4 = vdupq_n_u32(4);

    uint32x4_t emm2; // integer octant index

    uint32x4_t sign_mask_sin, sign_mask_cos;
    // sin(-x) = -sin(x): remember the sign and work with |x|.
    sign_mask_sin = vcltq_f32(x, CONST_0);
    x = vabsq_f32(x);
    // scale by 4/pi so the integer part counts octants
    float32x4_t y = vmulq_f32(x, c_cephes_FOPI);

    // store the integer part of y in mm0
    // (x is non-negative here, so the unsigned conversion is safe)
    emm2 = vcvtq_u32_f32(y);
    /* j=(j+1) & (~1) (see the cephes sources) — round the octant index
     * up to the next even value */
    emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
    emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
    y = vcvtq_f32_u32(emm2);

    /* get the polynom selection mask
    there is one polynom for 0 <= x <= Pi/4
    and another one for Pi/4<x<=Pi/2
    Both branches will be computed. */
    const uint32x4_t poly_mask = vtstq_u32(emm2, CONST_2);

    // The magic pass: "Extended precision modular arithmetic"
    // x = x - y * (pi/4), applied in three steps for precision
    x = vmlaq_f32(x, y, c_minus_cephes_DP1);
    x = vmlaq_f32(x, y, c_minus_cephes_DP2);
    x = vmlaq_f32(x, y, c_minus_cephes_DP3);

    // Per-octant sign flips: bit 2 of the octant index flips sin; the
    // shifted index determines the cos sign.
    sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, CONST_4));
    sign_mask_cos = vtstq_u32(vsubq_u32(emm2, CONST_2), CONST_4);

    /* Evaluate the first polynom (0 <= x <= Pi/4) in y1,
    and the second polynom (Pi/4 <= x <= 0) in y2 */
    float32x4_t y1, y2;
    float32x4_t z = vmulq_f32(x,x); // both polynomials are in z = x^2

    // y1: cosine polynomial
    //     ((p0*z + p1)*z + p2)*z^2 - z/2 + 1
    y1 = vmlaq_f32(c_coscof_p1, z, c_coscof_p0);
    y1 = vmlaq_f32(c_coscof_p2, z, y1);
    y1 = vmulq_f32(y1, z);
    y1 = vmulq_f32(y1, z);
    y1 = vmlsq_f32(y1, z, CONST_1_2);
    y1 = vaddq_f32(y1, CONST_1);

    // y2: sine polynomial
    //     ((p0*z + p1)*z + p2)*z*x + x
    y2 = vmlaq_f32(c_sincof_p1, z, c_sincof_p0);
    y2 = vmlaq_f32(c_sincof_p2, z, y2);
    y2 = vmulq_f32(y2, z);
    y2 = vmlaq_f32(x, x, y2);

    /* select the correct result from the two polynoms */
    const float32x4_t ys = vbslq_f32(poly_mask, y1, y2);
    const float32x4_t yc = vbslq_f32(poly_mask, y2, y1);

    // Apply the per-lane signs and pack: val[0] = sin, val[1] = cos.
    float32x4x2_t sincos;
    sincos.val[0] = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
    sincos.val[1] = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));

    return sincos;
}
262 
/* Sine for 4 packed floats; evaluates sin and cos together and keeps
 * only the sine half. */
static inline float32x4_t _vsinq_f32(float32x4_t x)
{
    return _vsincosq_f32(x).val[0];
}
267 
/* Cosine for 4 packed floats; evaluates sin and cos together and keeps
 * only the cosine half. */
static inline float32x4_t _vcosq_f32(float32x4_t x)
{
    return _vsincosq_f32(x).val[1];
}
272 
/* Tangent for 4 packed floats, computed as sin(x) * (1 / cos(x))
 * with the Newton-refined reciprocal from _vinvq_f32. */
static inline float32x4_t _vtanq_f32(float32x4_t x)
{
    const float32x4_t inv_cos = _vinvq_f32(_vcosq_f32(x));
    return vmulq_f32(_vsinq_f32(x), inv_cos);
}
277 
278 
279 #endif /*LV_HAVE_NEON*/
280 #endif /* INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ */
_vsinq_f32
static float32x4_t _vsinq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:263
_vmagnitudesquaredq_f32
static float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
Definition: volk_neon_intrinsics.h:88
_vsincosq_f32
static float32x4x2_t _vsincosq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:188
volk_arch_defs.val
val
Definition: volk_arch_defs.py:66
_vtaylor_polyq_f32
static float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t coeffs[8])
Definition: volk_neon_intrinsics.h:142
_vcosq_f32
static float32x4_t _vcosq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:268
_vlogq_f32
static float32x4_t _vlogq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:156
_vinvsqrtq_f32
static float32x4_t _vinvsqrtq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:98
_vtanq_f32
static float32x4_t _vtanq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:273
_vmultiply_complexq_f32
static float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val)
Definition: volk_neon_intrinsics.h:119
_vinvq_f32
static float32x4_t _vinvq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:108