Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
Loading...
Searching...
No Matches
test_suite_math.c
1/*
2 * Copyright 2012-15 ARM Limited and Contributors.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of ARM Limited nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28/*
29 * NE10 Library : test_suite_math.c
30 */
31
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

#include "NE10_math.h"
#include "seatest.h"
38
39//function table
40ne10_func_2args_t ftbl_2args[MAX_FUNC_COUNT];
41ne10_func_3args_t ftbl_3args[MAX_FUNC_COUNT];
42ne10_func_4args_t ftbl_4args[MAX_FUNC_COUNT];
43ne10_func_5args_t ftbl_5args[MAX_FUNC_COUNT];
44ne10_func_3args_cst_t ftbl_3args_cst[MAX_FUNC_COUNT];
45ne10_func_4args_cst_t ftbl_4args_cst[MAX_FUNC_COUNT];
46ne10_func_5args_cst_t ftbl_5args_cst[MAX_FUNC_COUNT];
47
/*
 * Input and output buffers for the smoke / regression tests.
 *
 * Every buffer is tracked by two pointers: "guarded_*" is the pointer the
 * tests hand to free(), while "the*" is the pointer handed to the routines
 * under test. Both are filled in by the NE10_SRC_ALLOC_LIMIT / NE10_DST_ALLOC
 * macros inside each test case.
 */
#if defined (SMOKE_TEST)||(REGRESSION_TEST)
/* accumulator input */
static ne10_float32_t *guarded_acc = NULL;
static ne10_float32_t *theacc = NULL;

/* first and second source inputs */
static ne10_float32_t *guarded_src1 = NULL;
static ne10_float32_t *thesrc1 = NULL;
static ne10_float32_t *guarded_src2 = NULL;
static ne10_float32_t *thesrc2 = NULL;

/* constant input */
static ne10_float32_t *guarded_cst = NULL;
static ne10_float32_t *thecst = NULL;

/* outputs of the C and NEON implementations */
static ne10_float32_t *guarded_dst_c = NULL;
static ne10_float32_t *thedst_c = NULL;
static ne10_float32_t *guarded_dst_neon = NULL;
static ne10_float32_t *thedst_neon = NULL;
#endif
64
#ifdef PERFORMANCE_TEST
/*
 * Buffers for the performance tests. As with the smoke / regression buffers,
 * "perftest_guarded_*" is the pointer passed to free() and "perftest_the*" is
 * the pointer passed to the routines being timed.
 */

/* accumulator input */
static ne10_float32_t *perftest_guarded_acc = NULL;
static ne10_float32_t *perftest_theacc = NULL;

/* first and second source inputs */
static ne10_float32_t *perftest_guarded_src1 = NULL;
static ne10_float32_t *perftest_thesrc1 = NULL;
static ne10_float32_t *perftest_guarded_src2 = NULL;
static ne10_float32_t *perftest_thesrc2 = NULL;

/* constant input */
static ne10_float32_t *perftest_guarded_cst = NULL;
static ne10_float32_t *perftest_thecst = NULL;

/* outputs of the C and NEON implementations */
static ne10_float32_t *perftest_guarded_dst_c = NULL;
static ne10_float32_t *perftest_thedst_c = NULL;
static ne10_float32_t *perftest_guarded_dst_neon = NULL;
static ne10_float32_t *perftest_thedst_neon = NULL;

/* number of floats allocated for each performance buffer */
static ne10_uint32_t perftest_length = 0;

/* timings filled in by GET_TIME and the figures derived from them */
static ne10_int64_t time_c = 0;
static ne10_int64_t time_neon = 0;
static ne10_float32_t time_speedup = 0.0f;
static ne10_float32_t time_savings = 0.0f;
#endif
86
87void test_abs_case0()
88{
89#define MAX_VEC_COMPONENTS 4
90 ne10_int32_t loop;
91 ne10_int32_t func_loop;
92
93 /* init function table */
94 memset (ftbl_3args, 0, sizeof (ftbl_3args));
95 ftbl_3args[ 0] = (ne10_func_3args_t) ne10_abs_float_c;
96 ftbl_3args[ 1] = (ne10_func_3args_t) ne10_abs_float_neon;
97 ftbl_3args[ 2] = (ne10_func_3args_t) ne10_abs_vec2f_c;
98 ftbl_3args[ 3] = (ne10_func_3args_t) ne10_abs_vec2f_neon;
99 ftbl_3args[ 4] = (ne10_func_3args_t) ne10_abs_vec3f_c;
100 ftbl_3args[ 5] = (ne10_func_3args_t) ne10_abs_vec3f_neon;
101 ftbl_3args[ 6] = (ne10_func_3args_t) ne10_abs_vec4f_c;
102 ftbl_3args[ 7] = (ne10_func_3args_t) ne10_abs_vec4f_neon;
103
104 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
105
106#if defined (SMOKE_TEST)||(REGRESSION_TEST)
107 ne10_int32_t vec_size;
108 ne10_int32_t pos;
109 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
110
111 /* init src memory */
112 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
113
114 /* init dst memory */
115 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
116 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
117
118 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
119 {
120 for (loop = 0; loop < TEST_ITERATION; loop++)
121 {
122 vec_size = func_loop + 1;
123
124 GUARD_ARRAY (thedst_c, loop * vec_size);
125 GUARD_ARRAY (thedst_neon, loop * vec_size);
126
127 ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
128 ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
129
130 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
131 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
132
133 for (pos = 0; pos < loop; pos++)
134 {
135#ifdef DEBUG_TRACE
136 ne10_int32_t i;
137 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
138 for (i = 0; i < vec_size; i++)
139 {
140 fprintf (stdout, "thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
141 }
142#endif
143 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
144 }
145 }
146 }
147 free (guarded_src1);
148 free (guarded_dst_c);
149 free (guarded_dst_neon);
150#endif
151
152#ifdef PERFORMANCE_TEST
153 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
154 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
155 /* init src memory */
156 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
157
158 /* init dst memory */
159 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
160 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
161
162 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
163 {
164 GET_TIME (time_c,
165 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
166 );
167 GET_TIME (time_neon,
168 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
169 );
170 time_speedup = (ne10_float32_t) time_c / time_neon;
171 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
172 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
173 }
174
175 free (perftest_guarded_src1);
176 free (perftest_guarded_dst_c);
177 free (perftest_guarded_dst_neon);
178#endif
179
180 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
181#undef MAX_VEC_COMPONENTS
182}
183
184void test_addc_case0()
185{
186#define MAX_VEC_COMPONENTS 4
187 ne10_int32_t loop;
188 ne10_int32_t func_loop;
189
190 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
191
192 /* init function table */
193 memset (ftbl_4args, 0, sizeof (ftbl_4args));
194 memset (ftbl_4args_cst, 0, sizeof (ftbl_4args_cst));
195 ftbl_4args_cst[ 0] = (ne10_func_4args_cst_t) ne10_addc_float_c;
196 ftbl_4args_cst[ 1] = (ne10_func_4args_cst_t) ne10_addc_float_neon;
197 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_addc_vec2f_c;
198 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_addc_vec2f_neon;
199 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_addc_vec3f_c;
200 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_addc_vec3f_neon;
201 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_addc_vec4f_c;
202 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_addc_vec4f_neon;
203
204#if defined (SMOKE_TEST)||(REGRESSION_TEST)
205 ne10_int32_t vec_size;
206 ne10_int32_t pos;
207 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
208
209 /* init src memory */
210 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
211 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
212
213 /* init dst memory */
214 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
215 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
216
217 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
218 {
219 for (loop = 0; loop < TEST_ITERATION; loop++)
220 {
221 vec_size = func_loop + 1;
222
223 GUARD_ARRAY (thedst_c, loop * vec_size);
224 GUARD_ARRAY (thedst_neon, loop * vec_size);
225
226 if (func_loop == 0)
227 {
228 ftbl_4args_cst[2 * func_loop] (thedst_c, thesrc1, thecst[0], loop);
229 ftbl_4args_cst[2 * func_loop + 1] (thedst_neon, thesrc1, thecst[0], loop);
230 }
231 else
232 {
233 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thecst, loop);
234 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thecst, loop);
235 }
236
237
238 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
239 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
240
241 for (pos = 0; pos < loop; pos++)
242 {
243#ifdef DEBUG_TRACE
244 ne10_int32_t i;
245 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
246 for (i = 0; i < vec_size; i++)
247 {
248 fprintf (stdout, "thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
249 fprintf (stdout, "thecst->%d: %e [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
250 }
251#endif
252 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
253 }
254 }
255 }
256 free (guarded_src1);
257 free (guarded_cst);
258 free (guarded_dst_c);
259 free (guarded_dst_neon);
260#endif
261
262#ifdef PERFORMANCE_TEST
263 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
264 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
265 /* init src memory */
266 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
267 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
268
269 /* init dst memory */
270 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
271 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
272
273 for (func_loop = 0; func_loop < 1; func_loop++)
274 {
275 GET_TIME (time_c,
276 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst[0], loop);
277 );
278 GET_TIME (time_neon,
279 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst[0], loop);
280 );
281 time_speedup = (ne10_float32_t) time_c / time_neon;
282 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
283 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
284 }
285 for (; func_loop < MAX_VEC_COMPONENTS; func_loop++)
286 {
287 GET_TIME (time_c,
288 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst, loop);
289 );
290 GET_TIME (time_neon,
291 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst, loop);
292 );
293 time_speedup = (ne10_float32_t) time_c / time_neon;
294 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
295 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
296 }
297
298 free (perftest_guarded_src1);
299 free (perftest_guarded_cst);
300 free (perftest_guarded_dst_c);
301 free (perftest_guarded_dst_neon);
302#endif
303
304 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
305#undef MAX_VEC_COMPONENTS
306}
307
308void test_add_case0()
309{
310#define MAX_VEC_COMPONENTS 4
311 ne10_int32_t loop;
312 ne10_int32_t func_loop;
313
314 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
315
316 /* init function table */
317 memset (ftbl_4args, 0, sizeof (ftbl_4args));
318 ftbl_4args[ 0] = (ne10_func_4args_t) ne10_add_float_c;
319 ftbl_4args[ 1] = (ne10_func_4args_t) ne10_add_float_neon;
320 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_add_vec2f_c;
321 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_add_vec2f_neon;
322 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_add_vec3f_c;
323 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_add_vec3f_neon;
324 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_add_vec4f_c;
325 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_add_vec4f_neon;
326
327#if defined (SMOKE_TEST)||(REGRESSION_TEST)
328 ne10_int32_t vec_size;
329 ne10_int32_t pos;
330 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
331
332 /* init src memory */
333 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
334 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
335
336 /* init dst memory */
337 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
338 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
339
340 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
341 {
342 for (loop = 0; loop < TEST_ITERATION; loop++)
343 {
344 vec_size = func_loop + 1;
345
346 GUARD_ARRAY (thedst_c, loop * vec_size);
347 GUARD_ARRAY (thedst_neon, loop * vec_size);
348
349 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
350 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
351
352 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
353 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
354
355 for (pos = 0; pos < loop; pos++)
356 {
357#ifdef DEBUG_TRACE
358 ne10_int32_t i;
359 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
360 for (i = 0; i < vec_size; i++)
361 {
362 fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
363 fprintf (stdout, "thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
364 }
365#endif
366 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
367 }
368 }
369 }
370 free (guarded_src1);
371 free (guarded_src2);
372 free (guarded_dst_c);
373 free (guarded_dst_neon);
374#endif
375
376#ifdef PERFORMANCE_TEST
377 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
378 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
379 /* init src memory */
380 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
381 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
382
383 /* init dst memory */
384 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
385 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
386
387 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
388 {
389 GET_TIME (time_c,
390 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
391 );
392 GET_TIME (time_neon,
393 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
394 );
395 time_speedup = (ne10_float32_t) time_c / time_neon;
396 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
397 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
398 }
399
400 free (perftest_guarded_src1);
401 free (perftest_guarded_src2);
402 free (perftest_guarded_dst_c);
403 free (perftest_guarded_dst_neon);
404#endif
405
406 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
407#undef MAX_VEC_COMPONENTS
408}
409
410void test_cross_case0()
411{
412#define MAX_VEC_COMPONENTS 3
413 ne10_int32_t loop;
414 ne10_int32_t func_loop;
415
416 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
417
418 /* init function table */
419 memset (ftbl_4args, 0, sizeof (ftbl_4args));
420 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_cross_vec3f_c;
421 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_cross_vec3f_neon;
422
423#if defined (SMOKE_TEST)||(REGRESSION_TEST)
424 ne10_int32_t vec_size;
425 ne10_int32_t pos;
426 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
427
428 /* init src memory */
429 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
430 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
431
432 /* init dst memory */
433 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
434 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
435
436 for (func_loop = 2; func_loop < MAX_VEC_COMPONENTS; func_loop++)
437 {
438 for (loop = 0; loop < TEST_ITERATION; loop++)
439 {
440 vec_size = func_loop + 1;
441
442 GUARD_ARRAY (thedst_c, loop * vec_size);
443 GUARD_ARRAY (thedst_neon, loop * vec_size);
444
445 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
446 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
447
448 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
449 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
450
451 for (pos = 0; pos < loop; pos++)
452 {
453#ifdef DEBUG_TRACE
454 ne10_int32_t i;
455 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
456 for (i = 0; i < vec_size; i++)
457 {
458 fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
459 fprintf (stdout, "thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
460 }
461#endif
462 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
463 }
464 }
465 }
466 free (guarded_src1);
467 free (guarded_src2);
468 free (guarded_dst_c);
469 free (guarded_dst_neon);
470#endif
471
472#ifdef PERFORMANCE_TEST
473 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
474 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
475 /* init src memory */
476 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
477 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
478
479 /* init dst memory */
480 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
481 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
482
483 for (func_loop = 2; func_loop < MAX_VEC_COMPONENTS; func_loop++)
484 {
485 GET_TIME (time_c,
486 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
487 );
488 GET_TIME (time_neon,
489 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
490 );
491 time_speedup = (ne10_float32_t) time_c / time_neon;
492 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
493 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
494 }
495
496 free (perftest_guarded_src1);
497 free (perftest_guarded_src2);
498 free (perftest_guarded_dst_c);
499 free (perftest_guarded_dst_neon);
500#endif
501
502 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
503#undef MAX_VEC_COMPONENTS
504}
505
506void test_divc_case0()
507{
508#define MAX_VEC_COMPONENTS 4
509 ne10_int32_t loop;
510 ne10_int32_t func_loop;
511
512 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
513
514 /* init function table */
515 memset (ftbl_4args, 0, sizeof (ftbl_4args));
516 memset (ftbl_4args_cst, 0, sizeof (ftbl_4args_cst));
517 ftbl_4args_cst[ 0] = (ne10_func_4args_cst_t) ne10_divc_float_c;
518 ftbl_4args_cst[ 1] = (ne10_func_4args_cst_t) ne10_divc_float_neon;
519 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_divc_vec2f_c;
520 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_divc_vec2f_neon;
521 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_divc_vec3f_c;
522 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_divc_vec3f_neon;
523 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_divc_vec4f_c;
524 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_divc_vec4f_neon;
525
526#if defined (SMOKE_TEST)||(REGRESSION_TEST)
527 ne10_int32_t vec_size;
528 ne10_int32_t pos;
529 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
530
531 /* init src memory */
532 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
533 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
534
535 /* init dst memory */
536 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
537 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
538
539 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
540 {
541 for (loop = 0; loop < TEST_ITERATION; loop++)
542 {
543 vec_size = func_loop + 1;
544
545 GUARD_ARRAY (thedst_c, loop * vec_size);
546 GUARD_ARRAY (thedst_neon, loop * vec_size);
547
548 if (func_loop == 0)
549 {
550 ftbl_4args_cst[2 * func_loop] (thedst_c, thesrc1, thecst[0], loop);
551 ftbl_4args_cst[2 * func_loop + 1] (thedst_neon, thesrc1, thecst[0], loop);
552 }
553 else
554 {
555 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thecst, loop);
556 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thecst, loop);
557 }
558
559 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
560 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
561
562 for (pos = 0; pos < loop; pos++)
563 {
564#ifdef DEBUG_TRACE
565 ne10_int32_t i;
566 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
567 for (i = 0; i < vec_size; i++)
568 {
569 fprintf (stdout, "thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
570 fprintf (stdout, "thecst->%d: %e [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
571 }
572#endif
573 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
574 }
575 }
576 }
577 free (guarded_src1);
578 free (guarded_cst);
579 free (guarded_dst_c);
580 free (guarded_dst_neon);
581#endif
582
583#ifdef PERFORMANCE_TEST
584 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
585 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
586 /* init src memory */
587 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
588 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
589
590 /* init dst memory */
591 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
592 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
593
594 for (func_loop = 0; func_loop < 1; func_loop++)
595 {
596 GET_TIME (time_c,
597 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst[0], loop);
598 );
599 GET_TIME (time_neon,
600 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst[0], loop);
601 );
602 time_speedup = (ne10_float32_t) time_c / time_neon;
603 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
604 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
605 }
606 for (; func_loop < MAX_VEC_COMPONENTS; func_loop++)
607 {
608 GET_TIME (time_c,
609 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst, loop);
610 );
611 GET_TIME (time_neon,
612 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst, loop);
613 );
614 time_speedup = (ne10_float32_t) time_c / time_neon;
615 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
616 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
617 }
618
619 free (perftest_guarded_src1);
620 free (perftest_guarded_cst);
621 free (perftest_guarded_dst_c);
622 free (perftest_guarded_dst_neon);
623#endif
624
625 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
626#undef MAX_VEC_COMPONENTS
627}
628
629void test_div_case0()
630{
631#define MAX_VEC_COMPONENTS 4
632 ne10_int32_t loop;
633 ne10_int32_t func_loop;
634
635 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
636
637 /* init function table */
638 memset (ftbl_4args, 0, sizeof (ftbl_4args));
639 ftbl_4args[ 0] = (ne10_func_4args_t) ne10_div_float_c;
640 ftbl_4args[ 1] = (ne10_func_4args_t) ne10_div_float_neon;
641 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_vdiv_vec2f_c;
642 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_vdiv_vec2f_neon;
643 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_vdiv_vec3f_c;
644 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_vdiv_vec3f_neon;
645 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_vdiv_vec4f_c;
646 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_vdiv_vec4f_neon;
647
648#if defined (SMOKE_TEST)||(REGRESSION_TEST)
649 ne10_int32_t vec_size;
650 ne10_int32_t pos;
651 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
652
653 /* init src memory */
654 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
655 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
656
657 /* init dst memory */
658 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
659 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
660
661 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
662 {
663 for (loop = 0; loop < TEST_ITERATION; loop++)
664 {
665 vec_size = func_loop + 1;
666
667 GUARD_ARRAY (thedst_c, loop * vec_size);
668 GUARD_ARRAY (thedst_neon, loop * vec_size);
669
670 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
671 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
672
673 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
674 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
675
676 for (pos = 0; pos < loop; pos++)
677 {
678#ifdef DEBUG_TRACE
679 ne10_int32_t i;
680 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
681 for (i = 0; i < vec_size; i++)
682 {
683 fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
684 fprintf (stdout, "thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
685 }
686#endif
687 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_LARGE, vec_size);
688 }
689 }
690 }
691 free (guarded_src1);
692 free (guarded_src2);
693 free (guarded_dst_c);
694 free (guarded_dst_neon);
695#endif
696
697#ifdef PERFORMANCE_TEST
698 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
699 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
700 /* init src memory */
701 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
702 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
703
704 /* init dst memory */
705 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
706 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
707
708 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
709 {
710 GET_TIME (time_c,
711 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
712 );
713 GET_TIME (time_neon,
714 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
715 );
716 time_speedup = (ne10_float32_t) time_c / time_neon;
717 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
718 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
719 }
720
721 free (perftest_guarded_src1);
722 free (perftest_guarded_src2);
723 free (perftest_guarded_dst_c);
724 free (perftest_guarded_dst_neon);
725#endif
726
727 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
728#undef MAX_VEC_COMPONENTS
729}
730
731void test_dot_case0()
732{
733#define MAX_VEC_COMPONENTS 4
734 ne10_int32_t loop;
735 ne10_int32_t func_loop;
736
737 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
738
739 /* init function table */
740 memset (ftbl_4args, 0, sizeof (ftbl_4args));
741 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_dot_vec2f_c;
742 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_dot_vec2f_neon;
743 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_dot_vec3f_c;
744 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_dot_vec3f_neon;
745 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_dot_vec4f_c;
746 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_dot_vec4f_neon;
747
748#if defined (SMOKE_TEST)||(REGRESSION_TEST)
749 ne10_int32_t pos;
750 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
751
752 /* init src memory */
753 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
754 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
755
756 /* init dst memory */
757 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
758 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
759
760 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
761 {
762 for (loop = 0; loop < TEST_ITERATION; loop++)
763 {
764#ifdef DEBUG_TRACE
765 ne10_int32_t vec_size = func_loop + 1;
766#endif
767
768 GUARD_ARRAY (thedst_c, loop);
769 GUARD_ARRAY (thedst_neon, loop);
770
771 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
772 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
773
774 CHECK_ARRAY_GUARD (thedst_c, loop);
775 CHECK_ARRAY_GUARD (thedst_neon, loop);
776
777 for (pos = 0; pos < loop; pos++)
778 {
779#ifdef DEBUG_TRACE
780 ne10_int32_t i;
781 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
782 for (i = 0; i < vec_size; i++)
783 {
784 fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
785 fprintf (stdout, "thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
786 }
787#endif
788 assert_float_vec_equal (&thedst_c[pos], &thedst_neon[pos], ERROR_MARGIN_SMALL, 1);
789 }
790 }
791 }
792 free (guarded_src1);
793 free (guarded_src2);
794 free (guarded_dst_c);
795 free (guarded_dst_neon);
796#endif
797
798#ifdef PERFORMANCE_TEST
799 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
800 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
801 /* init src memory */
802 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
803 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
804
805 /* init dst memory */
806 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
807 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
808
809 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
810 {
811 GET_TIME (time_c,
812 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
813 );
814 GET_TIME (time_neon,
815 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
816 );
817 time_speedup = (ne10_float32_t) time_c / time_neon;
818 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
819 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
820 }
821
822 free (perftest_guarded_src1);
823 free (perftest_guarded_src2);
824 free (perftest_guarded_dst_c);
825 free (perftest_guarded_dst_neon);
826#endif
827
828 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
829#undef MAX_VEC_COMPONENTS
830}
831
832void test_len_case0()
833{
834#define MAX_VEC_COMPONENTS 4
835 ne10_int32_t loop;
836 ne10_int32_t func_loop;
837
838 /* init function table */
839 memset (ftbl_3args, 0, sizeof (ftbl_3args));
840 ftbl_3args[ 2] = (ne10_func_3args_t) ne10_len_vec2f_c;
841 ftbl_3args[ 3] = (ne10_func_3args_t) ne10_len_vec2f_neon;
842 ftbl_3args[ 4] = (ne10_func_3args_t) ne10_len_vec3f_c;
843 ftbl_3args[ 5] = (ne10_func_3args_t) ne10_len_vec3f_neon;
844 ftbl_3args[ 6] = (ne10_func_3args_t) ne10_len_vec4f_c;
845 ftbl_3args[ 7] = (ne10_func_3args_t) ne10_len_vec4f_neon;
846
847 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
848
849#if defined (SMOKE_TEST)||(REGRESSION_TEST)
850 ne10_int32_t vec_size;
851 ne10_int32_t pos;
852 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
853
854 /* init src memory */
855 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
856
857 /* init dst memory */
858 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
859 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
860
861 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
862 {
863 for (loop = 0; loop < TEST_ITERATION; loop++)
864 {
865 vec_size = func_loop + 1;
866
867 GUARD_ARRAY (thedst_c, loop);
868 GUARD_ARRAY (thedst_neon, loop);
869
870 ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
871 ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
872
873 CHECK_ARRAY_GUARD (thedst_c, loop);
874 CHECK_ARRAY_GUARD (thedst_neon, loop);
875
876 for (pos = 0; pos < loop; pos++)
877 {
878#ifdef DEBUG_TRACE
879 ne10_int32_t i;
880 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
881 for (i = 0; i < vec_size; i++)
882 {
883 fprintf (stdout, "thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
884 }
885#endif
886 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_LARGE, 1);
887 }
888 }
889 }
890 free (guarded_src1);
891 free (guarded_dst_c);
892 free (guarded_dst_neon);
893#endif
894
895#ifdef PERFORMANCE_TEST
896 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
897 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
898 /* init src memory */
899 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
900
901 /* init dst memory */
902 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
903 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
904
905 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
906 {
907 GET_TIME (time_c,
908 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
909 );
910 GET_TIME (time_neon,
911 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
912 );
913 time_speedup = (ne10_float32_t) time_c / time_neon;
914 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
915 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
916 }
917
918 free (perftest_guarded_src1);
919 free (perftest_guarded_dst_c);
920 free (perftest_guarded_dst_neon);
921#endif
922
923 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
924#undef MAX_VEC_COMPONENTS
925}
926
927void test_mlac_case0()
928{
929#define MAX_VEC_COMPONENTS 4
930 ne10_int32_t loop;
931 ne10_int32_t func_loop;
932
933 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
934
935 /* init function table */
936 memset (ftbl_5args, 0, sizeof (ftbl_5args));
937 memset (ftbl_5args_cst, 0, sizeof (ftbl_5args_cst));
938 ftbl_5args_cst[ 0] = (ne10_func_5args_cst_t) ne10_mlac_float_c;
939 ftbl_5args_cst[ 1] = (ne10_func_5args_cst_t) ne10_mlac_float_neon;
940 ftbl_5args[ 2] = (ne10_func_5args_t) ne10_mlac_vec2f_c;
941 ftbl_5args[ 3] = (ne10_func_5args_t) ne10_mlac_vec2f_neon;
942 ftbl_5args[ 4] = (ne10_func_5args_t) ne10_mlac_vec3f_c;
943 ftbl_5args[ 5] = (ne10_func_5args_t) ne10_mlac_vec3f_neon;
944 ftbl_5args[ 6] = (ne10_func_5args_t) ne10_mlac_vec4f_c;
945 ftbl_5args[ 7] = (ne10_func_5args_t) ne10_mlac_vec4f_neon;
946
947#if defined (SMOKE_TEST)||(REGRESSION_TEST)
948 ne10_int32_t vec_size;
949 ne10_int32_t pos;
950 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
951
952 /* init src memory */
953 NE10_SRC_ALLOC_LIMIT (theacc, guarded_acc, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
954 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
955 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
956
957 /* init dst memory */
958 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
959 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
960
961 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
962 {
963 for (loop = 0; loop < TEST_ITERATION; loop++)
964 {
965 vec_size = func_loop + 1;
966
967 GUARD_ARRAY (thedst_c, loop * vec_size);
968 GUARD_ARRAY (thedst_neon, loop * vec_size);
969
970 if (func_loop == 0)
971 {
972 ftbl_5args_cst[2 * func_loop] (thedst_c, theacc, thesrc1, thecst[0], loop);
973 ftbl_5args_cst[2 * func_loop + 1] (thedst_neon, theacc, thesrc1, thecst[0], loop);
974 }
975 else
976 {
977 ftbl_5args[2 * func_loop] (thedst_c, theacc, thesrc1, thecst, loop);
978 ftbl_5args[2 * func_loop + 1] (thedst_neon, theacc, thesrc1, thecst, loop);
979 }
980
981 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
982 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
983
984 for (pos = 0; pos < loop; pos++)
985 {
986#ifdef DEBUG_TRACE
987 ne10_int32_t i;
988 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
989 for (i = 0; i < vec_size; i++)
990 {
991 fprintf (stdout, "theacc->%d: %f [0x%04X] \n", i, theacc[pos * vec_size + i], * (ne10_uint32_t*) &theacc[pos * vec_size + i]);
992 fprintf (stdout, "thesrc->%d: %f [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
993 fprintf (stdout, "thecst->%d: %f [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
994 }
995#endif
996 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
997 }
998 }
999 }
1000 free (guarded_acc);
1001 free (guarded_src1);
1002 free (guarded_cst);
1003 free (guarded_dst_c);
1004 free (guarded_dst_neon);
1005#endif
1006
1007#ifdef PERFORMANCE_TEST
1008 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
1009 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1010 /* init src memory */
1011 NE10_SRC_ALLOC_LIMIT (perftest_theacc, perftest_guarded_acc, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1012 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1013 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
1014
1015 /* init dst memory */
1016 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1017 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
1018
1019 for (func_loop = 0; func_loop < 1; func_loop++)
1020 {
1021 GET_TIME (time_c,
1022 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_5args_cst[2 * func_loop] (perftest_thedst_c, perftest_theacc, perftest_thesrc1, perftest_thecst[0], loop);
1023 );
1024 GET_TIME (time_neon,
1025 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_5args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_theacc, perftest_thesrc1, perftest_thecst[0], loop);
1026 );
1027 time_speedup = (ne10_float32_t) time_c / time_neon;
1028 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1029 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1030 }
1031 for (; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1032 {
1033 GET_TIME (time_c,
1034 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_5args[2 * func_loop] (perftest_thedst_c, perftest_theacc, perftest_thesrc1, perftest_thecst, loop);
1035 );
1036 GET_TIME (time_neon,
1037 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_5args[2 * func_loop + 1] (perftest_thedst_neon, perftest_theacc, perftest_thesrc1, perftest_thecst, loop);
1038 );
1039 time_speedup = (ne10_float32_t) time_c / time_neon;
1040 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1041 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1042 }
1043
1044 free (perftest_guarded_acc);
1045 free (perftest_guarded_src1);
1046 free (perftest_guarded_cst);
1047 free (perftest_guarded_dst_c);
1048 free (perftest_guarded_dst_neon);
1049#endif
1050
1051 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
1052#undef MAX_VEC_COMPONENTS
1053}
1054
1055void test_mla_case0()
1056{
1057#define MAX_VEC_COMPONENTS 4
1058 ne10_int32_t loop;
1059 ne10_int32_t func_loop;
1060
1061 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
1062
1063 /* init function table */
1064 memset (ftbl_5args, 0, sizeof (ftbl_5args));
1065 ftbl_5args[ 0] = (ne10_func_5args_t) ne10_mla_float_c;
1066 ftbl_5args[ 1] = (ne10_func_5args_t) ne10_mla_float_neon;
1067 ftbl_5args[ 2] = (ne10_func_5args_t) ne10_vmla_vec2f_c;
1068 ftbl_5args[ 3] = (ne10_func_5args_t) ne10_vmla_vec2f_neon;
1069 ftbl_5args[ 4] = (ne10_func_5args_t) ne10_vmla_vec3f_c;
1070 ftbl_5args[ 5] = (ne10_func_5args_t) ne10_vmla_vec3f_neon;
1071 ftbl_5args[ 6] = (ne10_func_5args_t) ne10_vmla_vec4f_c;
1072 ftbl_5args[ 7] = (ne10_func_5args_t) ne10_vmla_vec4f_neon;
1073
1074#if defined (SMOKE_TEST)||(REGRESSION_TEST)
1075 ne10_int32_t vec_size;
1076 ne10_int32_t pos;
1077 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
1078
1079 /* init src memory */
1080 NE10_SRC_ALLOC_LIMIT (theacc, guarded_acc, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1081 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1082 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1083
1084 /* init dst memory */
1085 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1086 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1087
1088 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1089 {
1090 for (loop = 0; loop < TEST_ITERATION; loop++)
1091 {
1092 vec_size = func_loop + 1;
1093
1094 GUARD_ARRAY (thedst_c, loop * vec_size);
1095 GUARD_ARRAY (thedst_neon, loop * vec_size);
1096
1097 ftbl_5args[2 * func_loop] (thedst_c, theacc, thesrc1, thesrc2, loop);
1098 ftbl_5args[2 * func_loop + 1] (thedst_neon, theacc, thesrc1, thesrc2, loop);
1099
1100 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
1101 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
1102
1103 for (pos = 0; pos < loop; pos++)
1104 {
1105#ifdef DEBUG_TRACE
1106 ne10_int32_t i;
1107 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1108 for (i = 0; i < vec_size; i++)
1109 {
1110 fprintf (stdout, "theacc->%d: %e [0x%04X] \n", i, theacc[pos * vec_size + i], * (ne10_uint32_t*) &theacc[pos * vec_size + i]);
1111 fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1112 fprintf (stdout, "thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
1113 }
1114#endif
1115 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1116 }
1117 }
1118 }
1119 free (guarded_acc);
1120 free (guarded_src1);
1121 free (guarded_src2);
1122 free (guarded_dst_c);
1123 free (guarded_dst_neon);
1124#endif
1125
1126#ifdef PERFORMANCE_TEST
1127 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
1128 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1129 /* init src memory */
1130 NE10_SRC_ALLOC_LIMIT (perftest_theacc, perftest_guarded_acc, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1131 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1132 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1133
1134 /* init dst memory */
1135 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1136 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
1137
1138 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1139 {
1140 GET_TIME (time_c,
1141 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_5args[2 * func_loop] (perftest_thedst_c, perftest_theacc, perftest_thesrc1, perftest_thesrc2, loop);
1142 );
1143 GET_TIME (time_neon,
1144 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_5args[2 * func_loop + 1] (perftest_thedst_neon, perftest_theacc, perftest_thesrc1, perftest_thesrc2, loop);
1145 );
1146 time_speedup = (ne10_float32_t) time_c / time_neon;
1147 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1148 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1149 }
1150
1151 free (perftest_guarded_acc);
1152 free (perftest_guarded_src1);
1153 free (perftest_guarded_src2);
1154 free (perftest_guarded_dst_c);
1155 free (perftest_guarded_dst_neon);
1156#endif
1157
1158 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
1159#undef MAX_VEC_COMPONENTS
1160}
1161
1162void test_mulc_case0()
1163{
1164#define MAX_VEC_COMPONENTS 4
1165 ne10_int32_t loop;
1166 ne10_int32_t func_loop;
1167
1168 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
1169
1170 /* init function table */
1171 memset (ftbl_4args, 0, sizeof (ftbl_4args));
1172 memset (ftbl_4args_cst, 0, sizeof (ftbl_4args_cst));
1173 ftbl_4args_cst[ 0] = (ne10_func_4args_cst_t) ne10_mulc_float_c;
1174 ftbl_4args_cst[ 1] = (ne10_func_4args_cst_t) ne10_mulc_float_neon;
1175 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_mulc_vec2f_c;
1176 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_mulc_vec2f_neon;
1177 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_mulc_vec3f_c;
1178 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_mulc_vec3f_neon;
1179 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_mulc_vec4f_c;
1180 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_mulc_vec4f_neon;
1181
1182#if defined (SMOKE_TEST)||(REGRESSION_TEST)
1183 ne10_int32_t vec_size;
1184 ne10_int32_t pos;
1185 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
1186
1187 /* init src memory */
1188 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1189 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
1190
1191 /* init dst memory */
1192 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1193 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1194
1195 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1196 {
1197 for (loop = 0; loop < TEST_ITERATION; loop++)
1198 {
1199 vec_size = func_loop + 1;
1200
1201 GUARD_ARRAY (thedst_c, loop * vec_size);
1202 GUARD_ARRAY (thedst_neon, loop * vec_size);
1203
1204 if (func_loop == 0)
1205 {
1206 ftbl_4args_cst[2 * func_loop] (thedst_c, thesrc1, thecst[0], loop);
1207 ftbl_4args_cst[2 * func_loop + 1] (thedst_neon, thesrc1, thecst[0], loop);
1208 }
1209 else
1210 {
1211 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thecst, loop);
1212 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thecst, loop);
1213 }
1214
1215 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
1216 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
1217
1218 for (pos = 0; pos < loop; pos++)
1219 {
1220#ifdef DEBUG_TRACE
1221 ne10_int32_t i;
1222 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1223 for (i = 0; i < vec_size; i++)
1224 {
1225 fprintf (stdout, "thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1226 fprintf (stdout, "thecst->%d: %e [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
1227 }
1228#endif
1229 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1230 }
1231 }
1232 }
1233 free (guarded_src1);
1234 free (guarded_cst);
1235 free (guarded_dst_c);
1236 free (guarded_dst_neon);
1237#endif
1238
1239#ifdef PERFORMANCE_TEST
1240 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
1241 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1242 /* init src memory */
1243 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1244 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
1245
1246 /* init dst memory */
1247 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1248 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
1249
1250 for (func_loop = 0; func_loop < 1; func_loop++)
1251 {
1252 GET_TIME (time_c,
1253 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst[0], loop);
1254 );
1255 GET_TIME (time_neon,
1256 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst[0], loop);
1257 );
1258 time_speedup = (ne10_float32_t) time_c / time_neon;
1259 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1260 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1261 }
1262 for (; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1263 {
1264 GET_TIME (time_c,
1265 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst, loop);
1266 );
1267 GET_TIME (time_neon,
1268 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst, loop);
1269 );
1270 time_speedup = (ne10_float32_t) time_c / time_neon;
1271 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1272 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1273 }
1274
1275 free (perftest_guarded_src1);
1276 free (perftest_guarded_cst);
1277 free (perftest_guarded_dst_c);
1278 free (perftest_guarded_dst_neon);
1279#endif
1280
1281 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
1282#undef MAX_VEC_COMPONENTS
1283}
1284
1285void test_mul_case0()
1286{
1287#define MAX_VEC_COMPONENTS 4
1288 ne10_int32_t loop;
1289 ne10_int32_t func_loop;
1290
1291 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
1292
1293 /* init function table */
1294 memset (ftbl_4args, 0, sizeof (ftbl_4args));
1295 ftbl_4args[ 0] = (ne10_func_4args_t) ne10_mul_float_c;
1296 ftbl_4args[ 1] = (ne10_func_4args_t) ne10_mul_float_neon;
1297 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_vmul_vec2f_c;
1298 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_vmul_vec2f_neon;
1299 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_vmul_vec3f_c;
1300 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_vmul_vec3f_neon;
1301 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_vmul_vec4f_c;
1302 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_vmul_vec4f_neon;
1303
1304#if defined (SMOKE_TEST)||(REGRESSION_TEST)
1305 ne10_int32_t vec_size;
1306 ne10_int32_t pos;
1307 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
1308
1309 /* init src memory */
1310 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1311 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1312
1313 /* init dst memory */
1314 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1315 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1316
1317 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1318 {
1319 for (loop = 0; loop < TEST_ITERATION; loop++)
1320 {
1321 vec_size = func_loop + 1;
1322
1323 GUARD_ARRAY (thedst_c, loop * vec_size);
1324 GUARD_ARRAY (thedst_neon, loop * vec_size);
1325
1326 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
1327 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
1328
1329 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
1330 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
1331
1332 for (pos = 0; pos < loop; pos++)
1333 {
1334#ifdef DEBUG_TRACE
1335 ne10_int32_t i;
1336 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1337 for (i = 0; i < vec_size; i++)
1338 {
1339 fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1340 fprintf (stdout, "thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
1341 }
1342#endif
1343 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1344 }
1345 }
1346 }
1347 free (guarded_src1);
1348 free (guarded_src2);
1349 free (guarded_dst_c);
1350 free (guarded_dst_neon);
1351#endif
1352
1353#ifdef PERFORMANCE_TEST
1354 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
1355 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1356 /* init src memory */
1357 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1358 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1359
1360 /* init dst memory */
1361 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1362 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
1363
1364 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1365 {
1366 GET_TIME (time_c,
1367 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
1368 );
1369 GET_TIME (time_neon,
1370 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
1371 );
1372 time_speedup = (ne10_float32_t) time_c / time_neon;
1373 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1374 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1375 }
1376
1377 free (perftest_guarded_src1);
1378 free (perftest_guarded_src2);
1379 free (perftest_guarded_dst_c);
1380 free (perftest_guarded_dst_neon);
1381#endif
1382
1383 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
1384#undef MAX_VEC_COMPONENTS
1385}
1386
1387void test_normalize_case0()
1388{
1389#define MAX_VEC_COMPONENTS 4
1390 ne10_int32_t loop;
1391 ne10_int32_t func_loop;
1392
1393 /* init function table */
1394 memset (ftbl_3args, 0, sizeof (ftbl_3args));
1395 ftbl_3args[ 2] = (ne10_func_3args_t) ne10_normalize_vec2f_c;
1396 ftbl_3args[ 3] = (ne10_func_3args_t) ne10_normalize_vec2f_neon;
1397 ftbl_3args[ 4] = (ne10_func_3args_t) ne10_normalize_vec3f_c;
1398 ftbl_3args[ 5] = (ne10_func_3args_t) ne10_normalize_vec3f_neon;
1399 ftbl_3args[ 6] = (ne10_func_3args_t) ne10_normalize_vec4f_c;
1400 ftbl_3args[ 7] = (ne10_func_3args_t) ne10_normalize_vec4f_neon;
1401
1402 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
1403
1404#if defined (SMOKE_TEST)||(REGRESSION_TEST)
1405 ne10_int32_t vec_size;
1406 ne10_int32_t pos;
1407 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
1408
1409 /* init src memory */
1410 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1411
1412 /* init dst memory */
1413 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1414 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1415
1416 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1417 {
1418 for (loop = 0; loop < TEST_ITERATION; loop++)
1419 {
1420 vec_size = func_loop + 1;
1421
1422 GUARD_ARRAY (thedst_c, loop * vec_size);
1423 GUARD_ARRAY (thedst_neon, loop * vec_size);
1424
1425 ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
1426 ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
1427
1428 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
1429 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
1430
1431 for (pos = 0; pos < loop; pos++)
1432 {
1433#ifdef DEBUG_TRACE
1434 ne10_int32_t i;
1435 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1436 for (i = 0; i < vec_size; i++)
1437 {
1438 fprintf (stdout, "thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1439 }
1440#endif
1441 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_LARGE, vec_size);
1442 }
1443 }
1444 }
1445 free (guarded_src1);
1446 free (guarded_dst_c);
1447 free (guarded_dst_neon);
1448#endif
1449
1450#ifdef PERFORMANCE_TEST
1451 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
1452 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1453 /* init src memory */
1454 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1455
1456 /* init dst memory */
1457 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1458 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
1459
1460 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1461 {
1462 GET_TIME (time_c,
1463 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
1464 );
1465 GET_TIME (time_neon,
1466 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
1467 );
1468 time_speedup = (ne10_float32_t) time_c / time_neon;
1469 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1470 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1471 }
1472
1473 free (perftest_guarded_src1);
1474 free (perftest_guarded_dst_c);
1475 free (perftest_guarded_dst_neon);
1476#endif
1477
1478 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
1479#undef MAX_VEC_COMPONENTS
1480}
1481
1482void test_rsbc_case0()
1483{
1484#define MAX_VEC_COMPONENTS 4
1485 ne10_int32_t loop;
1486 ne10_int32_t func_loop;
1487
1488 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
1489
1490 /* init function table */
1491 memset (ftbl_4args, 0, sizeof (ftbl_4args));
1492 memset (ftbl_4args_cst, 0, sizeof (ftbl_4args_cst));
1493 ftbl_4args_cst[ 0] = (ne10_func_4args_cst_t) ne10_rsbc_float_c;
1494 ftbl_4args_cst[ 1] = (ne10_func_4args_cst_t) ne10_rsbc_float_neon;
1495 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_rsbc_vec2f_c;
1496 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_rsbc_vec2f_neon;
1497 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_rsbc_vec3f_c;
1498 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_rsbc_vec3f_neon;
1499 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_rsbc_vec4f_c;
1500 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_rsbc_vec4f_neon;
1501
1502#if defined (SMOKE_TEST)||(REGRESSION_TEST)
1503 ne10_int32_t vec_size;
1504 ne10_int32_t pos;
1505 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
1506
1507 /* init src memory */
1508 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1509 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
1510
1511 /* init dst memory */
1512 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1513 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1514
1515 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1516 {
1517 for (loop = 0; loop < TEST_ITERATION; loop++)
1518 {
1519 vec_size = func_loop + 1;
1520
1521 GUARD_ARRAY (thedst_c, loop * vec_size);
1522 GUARD_ARRAY (thedst_neon, loop * vec_size);
1523
1524 if (func_loop == 0)
1525 {
1526 ftbl_4args_cst[2 * func_loop] (thedst_c, thesrc1, thecst[0], loop);
1527 ftbl_4args_cst[2 * func_loop + 1] (thedst_neon, thesrc1, thecst[0], loop);
1528 }
1529 else
1530 {
1531 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thecst, loop);
1532 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thecst, loop);
1533 }
1534
1535 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
1536 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
1537
1538 for (pos = 0; pos < loop; pos++)
1539 {
1540#ifdef DEBUG_TRACE
1541 ne10_int32_t i;
1542 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1543 for (i = 0; i < vec_size; i++)
1544 {
1545 fprintf (stdout, "thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1546 fprintf (stdout, "thecst->%d: %e [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
1547 }
1548#endif
1549 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1550 }
1551 }
1552 }
1553 free (guarded_src1);
1554 free (guarded_cst);
1555 free (guarded_dst_c);
1556 free (guarded_dst_neon);
1557#endif
1558
1559#ifdef PERFORMANCE_TEST
1560 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
1561 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1562 /* init src memory */
1563 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1564 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
1565
1566 /* init dst memory */
1567 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1568 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
1569
1570 for (func_loop = 0; func_loop < 1; func_loop++)
1571 {
1572 GET_TIME (time_c,
1573 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst[0], loop);
1574 );
1575 GET_TIME (time_neon,
1576 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst[0], loop);
1577 );
1578 time_speedup = (ne10_float32_t) time_c / time_neon;
1579 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1580 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1581 }
1582 for (; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1583 {
1584 GET_TIME (time_c,
1585 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst, loop);
1586 );
1587 GET_TIME (time_neon,
1588 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst, loop);
1589 );
1590 time_speedup = (ne10_float32_t) time_c / time_neon;
1591 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1592 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1593 }
1594
1595 free (perftest_guarded_src1);
1596 free (perftest_guarded_cst);
1597 free (perftest_guarded_dst_c);
1598 free (perftest_guarded_dst_neon);
1599#endif
1600
1601 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
1602#undef MAX_VEC_COMPONENTS
1603}
1604
1605void test_setc_case0()
1606{
1607#define MAX_VEC_COMPONENTS 4
1608 ne10_int32_t loop;
1609 ne10_int32_t func_loop;
1610
1611 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
1612
1613 /* init function table */
1614 memset (ftbl_3args, 0, sizeof (ftbl_3args));
1615 memset (ftbl_3args_cst, 0, sizeof (ftbl_3args_cst));
1616 ftbl_3args_cst[ 0] = (ne10_func_3args_cst_t) ne10_setc_float_c;
1617 ftbl_3args_cst[ 1] = (ne10_func_3args_cst_t) ne10_setc_float_neon;
1618 ftbl_3args[ 2] = (ne10_func_3args_t) ne10_setc_vec2f_c;
1619 ftbl_3args[ 3] = (ne10_func_3args_t) ne10_setc_vec2f_neon;
1620 ftbl_3args[ 4] = (ne10_func_3args_t) ne10_setc_vec3f_c;
1621 ftbl_3args[ 5] = (ne10_func_3args_t) ne10_setc_vec3f_neon;
1622 ftbl_3args[ 6] = (ne10_func_3args_t) ne10_setc_vec4f_c;
1623 ftbl_3args[ 7] = (ne10_func_3args_t) ne10_setc_vec4f_neon;
1624
1625#if defined (SMOKE_TEST)||(REGRESSION_TEST)
1626 ne10_int32_t vec_size;
1627 ne10_int32_t pos;
1628 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
1629
1630 /* init src memory */
1631 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
1632
1633 /* init dst memory */
1634 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1635 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1636
1637 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1638 {
1639 for (loop = 0; loop < TEST_ITERATION; loop++)
1640 {
1641 vec_size = func_loop + 1;
1642
1643 GUARD_ARRAY (thedst_c, loop * vec_size);
1644 GUARD_ARRAY (thedst_neon, loop * vec_size);
1645
1646 if (func_loop == 0)
1647 {
1648 ftbl_3args_cst[2 * func_loop] (thedst_c, thecst[0], loop);
1649 ftbl_3args_cst[2 * func_loop + 1] (thedst_neon, thecst[0], loop);
1650 }
1651 else
1652 {
1653 ftbl_3args[2 * func_loop] (thedst_c, thecst, loop);
1654 ftbl_3args[2 * func_loop + 1] (thedst_neon, thecst, loop);
1655 }
1656
1657 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
1658 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
1659
1660 for (pos = 0; pos < loop; pos++)
1661 {
1662#ifdef DEBUG_TRACE
1663 ne10_int32_t i;
1664 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1665 for (i = 0; i < vec_size; i++)
1666 {
1667 fprintf (stdout, "thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1668 fprintf (stdout, "thecst->%d: %e [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
1669 }
1670#endif
1671 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1672 }
1673 }
1674 }
1675 free (guarded_cst);
1676 free (guarded_dst_c);
1677 free (guarded_dst_neon);
1678#endif
1679
1680#ifdef PERFORMANCE_TEST
1681 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
1682 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1683 /* init src memory */
1684 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
1685
1686 /* init dst memory */
1687 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1688 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
1689
1690 for (func_loop = 0; func_loop < 1; func_loop++)
1691 {
1692 GET_TIME (time_c,
1693 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args_cst[2 * func_loop] (perftest_thedst_c, perftest_thecst[0], loop);
1694 );
1695 GET_TIME (time_neon,
1696 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thecst[0], loop);
1697 );
1698 time_speedup = (ne10_float32_t) time_c / time_neon;
1699 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1700 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1701 }
1702 for (; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1703 {
1704 GET_TIME (time_c,
1705 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thecst, loop);
1706 );
1707 GET_TIME (time_neon,
1708 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thecst, loop);
1709 );
1710 time_speedup = (ne10_float32_t) time_c / time_neon;
1711 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1712 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1713 }
1714
1715 free (perftest_guarded_cst);
1716 free (perftest_guarded_dst_c);
1717 free (perftest_guarded_dst_neon);
1718#endif
1719
1720 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
1721#undef MAX_VEC_COMPONENTS
1722}
1723
1724void test_subc_case0()
1725{
1726#define MAX_VEC_COMPONENTS 4
1727 ne10_int32_t loop;
1728 ne10_int32_t func_loop;
1729
1730 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
1731
1732 /* init function table */
1733 memset (ftbl_4args, 0, sizeof (ftbl_4args));
1734 memset (ftbl_4args_cst, 0, sizeof (ftbl_4args_cst));
1735 ftbl_4args_cst[ 0] = (ne10_func_4args_cst_t) ne10_subc_float_c;
1736 ftbl_4args_cst[ 1] = (ne10_func_4args_cst_t) ne10_subc_float_neon;
1737 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_subc_vec2f_c;
1738 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_subc_vec2f_neon;
1739 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_subc_vec3f_c;
1740 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_subc_vec3f_neon;
1741 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_subc_vec4f_c;
1742 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_subc_vec4f_neon;
1743
1744#if defined (SMOKE_TEST)||(REGRESSION_TEST)
1745 ne10_int32_t vec_size;
1746 ne10_int32_t pos;
1747 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
1748
1749 /* init src memory */
1750 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1751 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
1752
1753 /* init dst memory */
1754 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1755 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1756
1757 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1758 {
1759 for (loop = 0; loop < TEST_ITERATION; loop++)
1760 {
1761 vec_size = func_loop + 1;
1762
1763 GUARD_ARRAY (thedst_c, loop * vec_size);
1764 GUARD_ARRAY (thedst_neon, loop * vec_size);
1765
1766 if (func_loop == 0)
1767 {
1768 ftbl_4args_cst[2 * func_loop] (thedst_c, thesrc1, thecst[0], loop);
1769 ftbl_4args_cst[2 * func_loop + 1] (thedst_neon, thesrc1, thecst[0], loop);
1770 }
1771 else
1772 {
1773 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thecst, loop);
1774 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thecst, loop);
1775 }
1776
1777 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
1778 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
1779
1780 for (pos = 0; pos < loop; pos++)
1781 {
1782#ifdef DEBUG_TRACE
1783 ne10_int32_t i;
1784 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1785 for (i = 0; i < vec_size; i++)
1786 {
1787 fprintf (stdout, "thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1788 fprintf (stdout, "thecst->%d: %e [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
1789 }
1790#endif
1791 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1792 }
1793 }
1794 }
1795 free (guarded_src1);
1796 free (guarded_cst);
1797 free (guarded_dst_c);
1798 free (guarded_dst_neon);
1799#endif
1800
1801#ifdef PERFORMANCE_TEST
1802 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
1803 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1804 /* init src memory */
1805 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1806 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
1807
1808 /* init dst memory */
1809 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1810 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
1811
1812 for (func_loop = 0; func_loop < 1; func_loop++)
1813 {
1814 GET_TIME (time_c,
1815 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst[0], loop);
1816 );
1817 GET_TIME (time_neon,
1818 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst[0], loop);
1819 );
1820 time_speedup = (ne10_float32_t) time_c / time_neon;
1821 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1822 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1823 }
1824 for (; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1825 {
1826 GET_TIME (time_c,
1827 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst, loop);
1828 );
1829 GET_TIME (time_neon,
1830 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst, loop);
1831 );
1832 time_speedup = (ne10_float32_t) time_c / time_neon;
1833 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1834 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1835 }
1836
1837 free (perftest_guarded_src1);
1838 free (perftest_guarded_cst);
1839 free (perftest_guarded_dst_c);
1840 free (perftest_guarded_dst_neon);
1841#endif
1842
1843 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
1844#undef MAX_VEC_COMPONENTS
1845}
1846
1847void test_sub_case0()
1848{
1849#define MAX_VEC_COMPONENTS 4
1850 ne10_int32_t loop;
1851 ne10_int32_t func_loop;
1852
1853 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
1854
1855 /* init function table */
1856 memset (ftbl_4args, 0, sizeof (ftbl_4args));
1857 ftbl_4args[ 0] = (ne10_func_4args_t) ne10_sub_float_c;
1858 ftbl_4args[ 1] = (ne10_func_4args_t) ne10_sub_float_neon;
1859 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_sub_vec2f_c;
1860 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_sub_vec2f_neon;
1861 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_sub_vec3f_c;
1862 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_sub_vec3f_neon;
1863 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_sub_vec4f_c;
1864 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_sub_vec4f_neon;
1865
1866#if defined (SMOKE_TEST)||(REGRESSION_TEST)
1867 ne10_int32_t vec_size;
1868 ne10_int32_t pos;
1869 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
1870
1871 /* init src memory */
1872 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1873 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1874
1875 /* init dst memory */
1876 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1877 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1878
1879 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1880 {
1881 for (loop = 0; loop < TEST_ITERATION; loop++)
1882 {
1883 vec_size = func_loop + 1;
1884
1885 GUARD_ARRAY (thedst_c, loop * vec_size);
1886 GUARD_ARRAY (thedst_neon, loop * vec_size);
1887
1888 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
1889 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
1890
1891 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
1892 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
1893
1894 for (pos = 0; pos < loop; pos++)
1895 {
1896#ifdef DEBUG_TRACE
1897 ne10_int32_t i;
1898 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1899 for (i = 0; i < vec_size; i++)
1900 {
1901 fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1902 fprintf (stdout, "thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
1903 }
1904#endif
1905 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1906 }
1907 }
1908 }
1909 free (guarded_src1);
1910 free (guarded_src2);
1911 free (guarded_dst_c);
1912 free (guarded_dst_neon);
1913#endif
1914
1915#ifdef PERFORMANCE_TEST
1916 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
1917 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1918 /* init src memory */
1919 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1920 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1921
1922 /* init dst memory */
1923 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1924 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
1925
1926 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1927 {
1928 GET_TIME (time_c,
1929 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
1930 );
1931 GET_TIME (time_neon,
1932 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
1933 );
1934 time_speedup = (ne10_float32_t) time_c / time_neon;
1935 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1936 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1937 }
1938
1939 free (perftest_guarded_src1);
1940 free (perftest_guarded_src2);
1941 free (perftest_guarded_dst_c);
1942 free (perftest_guarded_dst_neon);
1943#endif
1944
1945 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
1946#undef MAX_VEC_COMPONENTS
1947}
1948
1949void test_addmat_case0()
1950{
1951#define MAX_VEC_COMPONENTS 4
1952 ne10_int32_t loop;
1953 ne10_int32_t func_loop;
1954
1955 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
1956
1957 /* init function table */
1958 memset (ftbl_4args, 0, sizeof (ftbl_4args));
1959 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_addmat_2x2f_c;
1960 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_addmat_2x2f_neon;
1961 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_addmat_3x3f_c;
1962 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_addmat_3x3f_neon;
1963 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_addmat_4x4f_c;
1964 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_addmat_4x4f_neon;
1965
1966#if defined (SMOKE_TEST)||(REGRESSION_TEST)
1967 ne10_int32_t vec_size;
1968 ne10_int32_t pos;
1969 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
1970
1971 /* init src memory */
1972 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1973 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
1974
1975 /* init dst memory */
1976 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1977 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1978
1979 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1980 {
1981 for (loop = 0; loop < TEST_ITERATION; loop++)
1982 {
1983 vec_size = (func_loop + 1) * (func_loop + 1);
1984
1985 GUARD_ARRAY (thedst_c, loop * vec_size);
1986 GUARD_ARRAY (thedst_neon, loop * vec_size);
1987
1988 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
1989 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
1990
1991 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
1992 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
1993
1994 for (pos = 0; pos < loop; pos++)
1995 {
1996#ifdef DEBUG_TRACE
1997 ne10_int32_t i;
1998 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1999 for (i = 0; i < vec_size; i++)
2000 {
2001 fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
2002 fprintf (stdout, "thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
2003 }
2004#endif
2005 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
2006 }
2007 }
2008 }
2009 free (guarded_src1);
2010 free (guarded_src2);
2011 free (guarded_dst_c);
2012 free (guarded_dst_neon);
2013#endif
2014
2015#ifdef PERFORMANCE_TEST
2016 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
2017 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2018 /* init src memory */
2019 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2020 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2021
2022 /* init dst memory */
2023 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2024 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2025
2026 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2027 {
2028 GET_TIME (time_c,
2029 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
2030 );
2031 GET_TIME (time_neon,
2032 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
2033 );
2034 time_speedup = (ne10_float32_t) time_c / time_neon;
2035 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2036 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2037 }
2038
2039 free (perftest_guarded_src1);
2040 free (perftest_guarded_src2);
2041 free (perftest_guarded_dst_c);
2042 free (perftest_guarded_dst_neon);
2043#endif
2044
2045 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
2046#undef MAX_VEC_COMPONENTS
2047}
2048
2049void test_detmat_case0()
2050{
2051#define MAX_VEC_COMPONENTS 4
2052 ne10_int32_t loop;
2053 ne10_int32_t func_loop;
2054
2055 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
2056
2057 /* init function table */
2058 memset (ftbl_3args, 0, sizeof (ftbl_3args));
2059 ftbl_3args[ 2] = (ne10_func_3args_t) ne10_detmat_2x2f_c;
2060 ftbl_3args[ 3] = (ne10_func_3args_t) ne10_detmat_2x2f_neon;
2061 ftbl_3args[ 4] = (ne10_func_3args_t) ne10_detmat_3x3f_c;
2062 ftbl_3args[ 5] = (ne10_func_3args_t) ne10_detmat_3x3f_neon;
2063 ftbl_3args[ 6] = (ne10_func_3args_t) ne10_detmat_4x4f_c;
2064 ftbl_3args[ 7] = (ne10_func_3args_t) ne10_detmat_4x4f_neon;
2065
2066#if defined (SMOKE_TEST)||(REGRESSION_TEST)
2067 ne10_int32_t vec_size;
2068 ne10_int32_t pos;
2069 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2070
2071 /* init src memory */
2072 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2073
2074 /* init dst memory */
2075 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2076 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
2077
2078 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2079 {
2080 for (loop = 0; loop < TEST_ITERATION; loop++)
2081 {
2082 vec_size = (func_loop + 1) * (func_loop + 1);
2083
2084 GUARD_ARRAY (thedst_c, loop);
2085 GUARD_ARRAY (thedst_neon, loop);
2086
2087 ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
2088 ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
2089
2090 CHECK_ARRAY_GUARD (thedst_c, loop);
2091 CHECK_ARRAY_GUARD (thedst_neon, loop);
2092
2093 for (pos = 0; pos < loop; pos++)
2094 {
2095#ifdef DEBUG_TRACE
2096 ne10_int32_t i;
2097 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2098 for (i = 0; i < vec_size; i++)
2099 {
2100 fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
2101 }
2102#endif
2103 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, 1);
2104 }
2105 }
2106 }
2107 free (guarded_src1);
2108 free (guarded_dst_c);
2109 free (guarded_dst_neon);
2110#endif
2111
2112#ifdef PERFORMANCE_TEST
2113 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
2114 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2115 /* init src memory */
2116 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2117
2118 /* init dst memory */
2119 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2120 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2121
2122 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2123 {
2124 GET_TIME (time_c,
2125 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
2126 );
2127 GET_TIME (time_neon,
2128 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
2129 );
2130 time_speedup = (ne10_float32_t) time_c / time_neon;
2131 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2132 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2133 }
2134
2135 free (perftest_guarded_src1);
2136 free (perftest_guarded_dst_c);
2137 free (perftest_guarded_dst_neon);
2138#endif
2139
2140 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
2141#undef MAX_VEC_COMPONENTS
2142}
2143
2144void test_identitymat_case0()
2145{
2146#define MAX_VEC_COMPONENTS 4
2147 ne10_int32_t loop;
2148 ne10_int32_t func_loop;
2149
2150 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
2151
2152 /* init function table */
2153 memset (ftbl_2args, 0, sizeof (ftbl_2args));
2154 ftbl_2args[ 2] = (ne10_func_2args_t) ne10_identitymat_2x2f_c;
2155 ftbl_2args[ 3] = (ne10_func_2args_t) ne10_identitymat_2x2f_neon;
2156 ftbl_2args[ 4] = (ne10_func_2args_t) ne10_identitymat_3x3f_c;
2157 ftbl_2args[ 5] = (ne10_func_2args_t) ne10_identitymat_3x3f_neon;
2158 ftbl_2args[ 6] = (ne10_func_2args_t) ne10_identitymat_4x4f_c;
2159 ftbl_2args[ 7] = (ne10_func_2args_t) ne10_identitymat_4x4f_neon;
2160
2161#if defined (SMOKE_TEST)||(REGRESSION_TEST)
2162 ne10_int32_t vec_size;
2163 ne10_int32_t pos;
2164 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2165
2166 /* init dst memory */
2167 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2168 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
2169
2170 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2171 {
2172 for (loop = 0; loop < TEST_ITERATION; loop++)
2173 {
2174 vec_size = (func_loop + 1) * (func_loop + 1);
2175
2176 GUARD_ARRAY (thedst_c, loop * vec_size);
2177 GUARD_ARRAY (thedst_neon, loop * vec_size);
2178
2179 ftbl_2args[2 * func_loop] (thedst_c, loop);
2180 ftbl_2args[2 * func_loop + 1] (thedst_neon, loop);
2181
2182 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
2183 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
2184
2185 for (pos = 0; pos < loop; pos++)
2186 {
2187#ifdef DEBUG_TRACE
2188 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2189#endif
2190 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
2191 }
2192 }
2193 }
2194 free (guarded_dst_c);
2195 free (guarded_dst_neon);
2196#endif
2197
2198#ifdef PERFORMANCE_TEST
2199 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
2200 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2201 /* init dst memory */
2202 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2203 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2204
2205 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2206 {
2207 GET_TIME (time_c,
2208 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_2args[2 * func_loop] (perftest_thedst_c, loop);
2209 );
2210 GET_TIME (time_neon,
2211 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_2args[2 * func_loop + 1] (perftest_thedst_neon, loop);
2212 );
2213 time_speedup = (ne10_float32_t) time_c / time_neon;
2214 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2215 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2216 }
2217
2218 free (perftest_guarded_dst_c);
2219 free (perftest_guarded_dst_neon);
2220#endif
2221
2222 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
2223#undef MAX_VEC_COMPONENTS
2224}
2225
2226void test_invmat_case0()
2227{
2228#define MAX_VEC_COMPONENTS 4
2229 ne10_int32_t loop;
2230 ne10_int32_t func_loop;
2231
2232 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
2233
2234 /* init function table */
2235 memset (ftbl_3args, 0, sizeof (ftbl_3args));
2236 ftbl_3args[ 2] = (ne10_func_3args_t) ne10_invmat_2x2f_c;
2237 ftbl_3args[ 3] = (ne10_func_3args_t) ne10_invmat_2x2f_neon;
2238 ftbl_3args[ 4] = (ne10_func_3args_t) ne10_invmat_3x3f_c;
2239 ftbl_3args[ 5] = (ne10_func_3args_t) ne10_invmat_3x3f_neon;
2240 ftbl_3args[ 6] = (ne10_func_3args_t) ne10_invmat_4x4f_c;
2241 ftbl_3args[ 7] = (ne10_func_3args_t) ne10_invmat_4x4f_neon;
2242
2243#if defined (SMOKE_TEST)||(REGRESSION_TEST)
2244 ne10_int32_t vec_size;
2245 ne10_int32_t pos;
2246 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2247
2248 /* init src memory */
2249 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2250
2251 /* init dst memory */
2252 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2253 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
2254
2255 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2256 {
2257 for (loop = 0; loop < TEST_ITERATION; loop++)
2258 {
2259 vec_size = (func_loop + 1) * (func_loop + 1);
2260
2261 GUARD_ARRAY (thedst_c, loop * vec_size);
2262 GUARD_ARRAY (thedst_neon, loop * vec_size);
2263
2264 ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
2265 ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
2266
2267 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
2268 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
2269
2270 for (pos = 0; pos < loop; pos++)
2271 {
2272#ifdef DEBUG_TRACE
2273 ne10_int32_t i;
2274 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2275 for (i = 0; i < vec_size; i++)
2276 {
2277 fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
2278 }
2279#endif
2280 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_LARGE, vec_size);
2281 }
2282 }
2283 }
2284 free (guarded_src1);
2285 free (guarded_dst_c);
2286 free (guarded_dst_neon);
2287#endif
2288
2289#ifdef PERFORMANCE_TEST
2290 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
2291 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2292 /* init src memory */
2293 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2294
2295 /* init dst memory */
2296 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2297 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2298
2299 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2300 {
2301 GET_TIME (time_c,
2302 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
2303 );
2304 GET_TIME (time_neon,
2305 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
2306 );
2307 time_speedup = (ne10_float32_t) time_c / time_neon;
2308 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2309 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2310 }
2311
2312 free (perftest_guarded_src1);
2313 free (perftest_guarded_dst_c);
2314 free (perftest_guarded_dst_neon);
2315#endif
2316
2317 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
2318#undef MAX_VEC_COMPONENTS
2319}
2320
2321void test_mulmat_case0()
2322{
2323#define MAX_VEC_COMPONENTS 4
2324 ne10_int32_t loop;
2325 ne10_int32_t func_loop;
2326
2327 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
2328
2329 /* init function table */
2330 memset (ftbl_4args, 0, sizeof (ftbl_4args));
2331 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_mulmat_2x2f_c;
2332 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_mulmat_2x2f_neon;
2333 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_mulmat_3x3f_c;
2334 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_mulmat_3x3f_neon;
2335 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_mulmat_4x4f_c;
2336 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_mulmat_4x4f_neon;
2337
2338#if defined (SMOKE_TEST)||(REGRESSION_TEST)
2339 ne10_int32_t vec_size;
2340 ne10_int32_t pos;
2341 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2342
2343 /* init src memory */
2344 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2345 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2346
2347 /* init dst memory */
2348 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2349 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
2350
2351 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2352 {
2353 for (loop = 0; loop < TEST_ITERATION; loop++)
2354 {
2355 vec_size = (func_loop + 1) * (func_loop + 1);
2356
2357 GUARD_ARRAY (thedst_c, loop * vec_size);
2358 GUARD_ARRAY (thedst_neon, loop * vec_size);
2359
2360 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
2361 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
2362
2363 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
2364 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
2365
2366 for (pos = 0; pos < loop; pos++)
2367 {
2368#ifdef DEBUG_TRACE
2369 ne10_int32_t i;
2370 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2371 for (i = 0; i < vec_size; i++)
2372 {
2373 fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
2374 fprintf (stdout, "thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
2375 }
2376#endif
2377 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
2378 }
2379 }
2380 }
2381 free (guarded_src1);
2382 free (guarded_src2);
2383 free (guarded_dst_c);
2384 free (guarded_dst_neon);
2385#endif
2386
2387#ifdef PERFORMANCE_TEST
2388 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
2389 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2390 /* init src memory */
2391 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2392 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2393
2394 /* init dst memory */
2395 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2396 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2397
2398 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2399 {
2400 GET_TIME (time_c,
2401 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
2402 );
2403 GET_TIME (time_neon,
2404 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
2405 );
2406 time_speedup = (ne10_float32_t) time_c / time_neon;
2407 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2408 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2409 }
2410
2411 free (perftest_guarded_src1);
2412 free (perftest_guarded_src2);
2413 free (perftest_guarded_dst_c);
2414 free (perftest_guarded_dst_neon);
2415#endif
2416
2417 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
2418#undef MAX_VEC_COMPONENTS
2419}
2420
2421void test_submat_case0()
2422{
2423#define MAX_VEC_COMPONENTS 4
2424 ne10_int32_t loop;
2425 ne10_int32_t func_loop;
2426
2427 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
2428
2429 /* init function table */
2430 memset (ftbl_4args, 0, sizeof (ftbl_4args));
2431 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_submat_2x2f_c;
2432 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_submat_2x2f_neon;
2433 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_submat_3x3f_c;
2434 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_submat_3x3f_neon;
2435 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_submat_4x4f_c;
2436 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_submat_4x4f_neon;
2437
2438#if defined (SMOKE_TEST)||(REGRESSION_TEST)
2439 ne10_int32_t vec_size;
2440 ne10_int32_t pos;
2441 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2442
2443 /* init src memory */
2444 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2445 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2446
2447 /* init dst memory */
2448 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2449 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
2450
2451 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2452 {
2453 for (loop = 0; loop < TEST_ITERATION; loop++)
2454 {
2455 vec_size = (func_loop + 1) * (func_loop + 1);
2456
2457 GUARD_ARRAY (thedst_c, loop * vec_size);
2458 GUARD_ARRAY (thedst_neon, loop * vec_size);
2459
2460 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
2461 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
2462
2463 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
2464 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
2465
2466 for (pos = 0; pos < loop; pos++)
2467 {
2468#ifdef DEBUG_TRACE
2469 ne10_int32_t i;
2470 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2471 for (i = 0; i < vec_size; i++)
2472 {
2473 fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
2474 fprintf (stdout, "thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
2475 }
2476#endif
2477 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
2478 }
2479 }
2480 }
2481 free (guarded_src1);
2482 free (guarded_src2);
2483 free (guarded_dst_c);
2484 free (guarded_dst_neon);
2485#endif
2486
2487#ifdef PERFORMANCE_TEST
2488 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
2489 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2490 /* init src memory */
2491 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2492 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2493
2494 /* init dst memory */
2495 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2496 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2497
2498 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2499 {
2500 GET_TIME (time_c,
2501 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
2502 );
2503 GET_TIME (time_neon,
2504 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
2505 );
2506 time_speedup = (ne10_float32_t) time_c / time_neon;
2507 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2508 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2509 }
2510
2511 free (perftest_guarded_src1);
2512 free (perftest_guarded_src2);
2513 free (perftest_guarded_dst_c);
2514 free (perftest_guarded_dst_neon);
2515#endif
2516
2517 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
2518#undef MAX_VEC_COMPONENTS
2519}
2520
2521void test_transmat_case0()
2522{
2523#define MAX_VEC_COMPONENTS 4
2524 ne10_int32_t loop;
2525 ne10_int32_t func_loop;
2526
2527 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
2528
2529 /* init function table */
2530 memset (ftbl_3args, 0, sizeof (ftbl_3args));
2531 ftbl_3args[ 2] = (ne10_func_3args_t) ne10_transmat_2x2f_c;
2532 ftbl_3args[ 3] = (ne10_func_3args_t) ne10_transmat_2x2f_neon;
2533 ftbl_3args[ 4] = (ne10_func_3args_t) ne10_transmat_3x3f_c;
2534 ftbl_3args[ 5] = (ne10_func_3args_t) ne10_transmat_3x3f_neon;
2535 ftbl_3args[ 6] = (ne10_func_3args_t) ne10_transmat_4x4f_c;
2536 ftbl_3args[ 7] = (ne10_func_3args_t) ne10_transmat_4x4f_neon;
2537
2538#if defined (SMOKE_TEST)||(REGRESSION_TEST)
2539 ne10_int32_t vec_size;
2540 ne10_int32_t pos;
2541 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2542
2543 /* init src memory */
2544 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2545
2546 /* init dst memory */
2547 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2548 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
2549
2550 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2551 {
2552 for (loop = 0; loop < TEST_ITERATION; loop++)
2553 {
2554 vec_size = (func_loop + 1) * (func_loop + 1);
2555
2556 GUARD_ARRAY (thedst_c, loop * vec_size);
2557 GUARD_ARRAY (thedst_neon, loop * vec_size);
2558
2559 ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
2560 ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
2561
2562 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
2563 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
2564
2565 for (pos = 0; pos < loop; pos++)
2566 {
2567#ifdef DEBUG_TRACE
2568 ne10_int32_t i;
2569 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2570 for (i = 0; i < vec_size; i++)
2571 {
2572 fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
2573 }
2574#endif
2575 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
2576 }
2577 }
2578 }
2579 free (guarded_src1);
2580 free (guarded_dst_c);
2581 free (guarded_dst_neon);
2582#endif
2583
2584#ifdef PERFORMANCE_TEST
2585 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
2586 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2587 /* init src memory */
2588 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2589
2590 /* init dst memory */
2591 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2592 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2593
2594 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2595 {
2596 GET_TIME (time_c,
2597 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
2598 );
2599 GET_TIME (time_neon,
2600 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
2601 );
2602 time_speedup = (ne10_float32_t) time_c / time_neon;
2603 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2604 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2605 }
2606
2607 free (perftest_guarded_src1);
2608 free (perftest_guarded_dst_c);
2609 free (perftest_guarded_dst_neon);
2610#endif
2611
2612 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
2613#undef MAX_VEC_COMPONENTS
2614}
2615
2616void test_mulcmatvec_case0()
2617{
2618#define MAX_VEC_COMPONENTS 4
2619 ne10_int32_t loop;
2620 ne10_int32_t func_loop;
2621
2622 fprintf (stdout, "----------%30s start\n", __FUNCTION__);
2623
2624 /* init function table */
2625 memset (ftbl_4args, 0, sizeof (ftbl_4args));
2626 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_mulcmatvec_cm2x2f_v2f_c;
2627 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_mulcmatvec_cm2x2f_v2f_neon;
2628 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_mulcmatvec_cm3x3f_v3f_c;
2629 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_mulcmatvec_cm3x3f_v3f_neon;
2630 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_mulcmatvec_cm4x4f_v4f_c;
2631 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_mulcmatvec_cm4x4f_v4f_neon;
2632
2633#if defined (SMOKE_TEST)||(REGRESSION_TEST)
2634 ne10_int32_t vec_size;
2635 ne10_int32_t pos;
2636 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
2637
2638 /* init src memory */
2639 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2640 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
2641
2642 /* init dst memory */
2643 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2644 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
2645
2646 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2647 {
2648 for (loop = 0; loop < TEST_ITERATION; loop++)
2649 {
2650 vec_size = func_loop + 1;
2651
2652 GUARD_ARRAY (thedst_c, loop * vec_size);
2653 GUARD_ARRAY (thedst_neon, loop * vec_size);
2654
2655 ftbl_4args[2 * func_loop] (thedst_c, thecst, thesrc1, loop);
2656 ftbl_4args[2 * func_loop + 1] (thedst_neon, thecst, thesrc1, loop);
2657
2658 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
2659 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
2660
2661 for (pos = 0; pos < loop; pos++)
2662 {
2663#ifdef DEBUG_TRACE
2664 ne10_int32_t i;
2665 fprintf (stdout, "func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2666 for (i = 0; i < vec_size * vec_size; i++)
2667 {
2668 fprintf (stdout, "thecst->%d: %e [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
2669 }
2670 for (i = 0; i < vec_size; i++)
2671 {
2672 fprintf (stdout, "thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
2673 }
2674#endif
2675 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
2676 }
2677 }
2678 }
2679 free (guarded_src1);
2680 free (guarded_cst);
2681 free (guarded_dst_c);
2682 free (guarded_dst_neon);
2683#endif
2684
2685#ifdef PERFORMANCE_TEST
2686 fprintf (stdout, "%25s%20s%20s%20s%20s\n", "N-component Vector", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
2687 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
2688 /* init src memory */
2689 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length); // 16 extra bytes at the begining and 16 extra bytes at the end
2690 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS); // 16 extra bytes at the begining and 16 extra bytes at the end
2691
2692 /* init dst memory */
2693 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2694 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2695
2696 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2697 {
2698 GET_TIME (time_c,
2699 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thecst, perftest_thesrc1, loop);
2700 );
2701 GET_TIME (time_neon,
2702 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thecst, perftest_thesrc1, loop);
2703 );
2704 time_speedup = (ne10_float32_t) time_c / time_neon;
2705 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2706 ne10_log (__FUNCTION__, "%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2707 }
2708
2709 free (perftest_guarded_src1);
2710 free (perftest_guarded_cst);
2711 free (perftest_guarded_dst_c);
2712 free (perftest_guarded_dst_neon);
2713#endif
2714
2715 fprintf (stdout, "----------%30s end\n", __FUNCTION__);
2716#undef MAX_VEC_COMPONENTS
2717}
2718
/* Test entry point for the abs functions; runs test_abs_case0. */
void test_abs (void)
{
    test_abs_case0();
}
2723
/* Test entry point for the addc functions; runs test_addc_case0. */
void test_addc (void)
{
    test_addc_case0();
}
2728
/* Test entry point for the add functions; runs test_add_case0. */
void test_add (void)
{
    test_add_case0();
}
2733
/* Test entry point for the cross functions; runs test_cross_case0. */
void test_cross (void)
{
    test_cross_case0();
}
2738
/* Test entry point for the divc functions; runs test_divc_case0. */
void test_divc (void)
{
    test_divc_case0();
}
2743
/* Test entry point for the div functions; runs test_div_case0. */
void test_div (void)
{
    test_div_case0();
}
2748
/* Test entry point for the dot functions; runs test_dot_case0. */
void test_dot (void)
{
    test_dot_case0();
}
2753
/* Test entry point for the len functions; runs test_len_case0. */
void test_len (void)
{
    test_len_case0();
}
2758
/* Test entry point for the mlac functions; runs test_mlac_case0. */
void test_mlac (void)
{
    test_mlac_case0();
}
2763
/* Test entry point for the mla functions; runs test_mla_case0. */
void test_mla (void)
{
    test_mla_case0();
}
2768
/* Test entry point for the mulc functions; runs test_mulc_case0. */
void test_mulc (void)
{
    test_mulc_case0();
}
2773
/* Test entry point for the mul functions; runs test_mul_case0. */
void test_mul (void)
{
    test_mul_case0();
}
/* Test entry point for the normalize functions; runs test_normalize_case0. */
void test_normalize (void)
{
    test_normalize_case0();
}
2782
/* Test entry point for the rsbc functions; runs test_rsbc_case0. */
void test_rsbc (void)
{
    test_rsbc_case0();
}
2787
/* Test entry point for the setc functions; runs test_setc_case0. */
void test_setc (void)
{
    test_setc_case0();
}
2792
/* Test entry point for the subc functions; runs test_subc_case0. */
void test_subc (void)
{
    test_subc_case0();
}
2797
/* Test entry point for the sub functions; runs test_sub_case0. */
void test_sub (void)
{
    test_sub_case0();
}
2802
/* Test entry point for the addmat functions; runs test_addmat_case0. */
void test_addmat (void)
{
    test_addmat_case0();
}
2807
/* Test entry point for the detmat functions; runs test_detmat_case0. */
void test_detmat (void)
{
    test_detmat_case0();
}
2812
/* Test entry point for the identitymat functions; runs test_identitymat_case0. */
void test_identitymat (void)
{
    test_identitymat_case0();
}
2817
/* Test entry point for the invmat functions; runs test_invmat_case0. */
void test_invmat (void)
{
    test_invmat_case0();
}
2822
/* Test entry point for the mulmat functions; runs test_mulmat_case0. */
void test_mulmat (void)
{
    test_mulmat_case0();
}
2827
/* Test entry point for the mulcmatvec functions; runs test_mulcmatvec_case0. */
void test_mulcmatvec (void)
{
    test_mulcmatvec_case0();
}
2832
/* Test entry point for the submat functions; runs test_submat_case0. */
void test_submat (void)
{
    test_submat_case0();
}
2837
/* Test entry point for the transmat functions; runs test_transmat_case0. */
void test_transmat (void)
{
    test_transmat_case0();
}
2842
2843static void my_test_setup (void)
2844{
2845 //printf("------%-30s start\r\n", __FUNCTION__);
2846 ne10_log_buffer_ptr = ne10_log_buffer;
2847}
2848
/*
 * Teardown hook registered with fixture_teardown() in test_fixture_math.
 * Intentionally does nothing.
 */
void my_test_teardown (void)
{
}
2853
2854void test_fixture_math (void)
2855{
2856 test_fixture_start(); // starts a fixture
2857
2858 fixture_setup (my_test_setup);
2859 fixture_teardown (my_test_teardown);
2860
2861 run_test (test_abs); // run tests
2862 run_test (test_addc);
2863 run_test (test_add);
2864 run_test (test_cross);
2865 run_test (test_divc);
2866 run_test (test_div);
2867 run_test (test_dot);
2868 run_test (test_len);
2869 run_test (test_mlac);
2870 run_test (test_mla);
2871 run_test (test_mulc);
2872 run_test (test_mul);
2873 run_test (test_normalize);
2874 run_test (test_rsbc);
2875 run_test (test_setc);
2876 run_test (test_subc);
2877 run_test (test_sub);
2878 run_test (test_addmat);
2879 run_test (test_detmat);
2880 run_test (test_identitymat);
2881 run_test (test_invmat);
2882 run_test (test_mulmat);
2883 run_test (test_mulcmatvec);
2884 run_test (test_submat);
2885 run_test (test_transmat);
2886
2887 test_fixture_end(); // ends a fixture
2888}