Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
Loading...
Searching...
No Matches
common
macros.h
1
/*
 * Copyright 2011-15 ARM Limited and Contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of the <organization> nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL ARM Limited and Contributors. BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
27
28
/*
 * NE10 Library : common/macros.h
 */
31
32
#include "factor.h"
33
34
// Macros used in actual implementations
35
37
38
/*
 * NE10_XC_OPERATION_X_C(loopCode)
 *
 * Expands to a braced block that invokes NE10_TEMPLATE_XC_OPERATION_X_C with
 * two arguments: the NE10_CHECKPOINTER_DstSrcCst_OPERATION check (followed by
 * ';') and the caller-supplied loopCode.
 *
 * Neither helper macro is defined in this header; presumably both come from
 * factor.h -- check there for the identifiers loopCode may reference.
 */
#define NE10_XC_OPERATION_X_C(loopCode) { \
    NE10_TEMPLATE_XC_OPERATION_X_C( \
        NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
        loopCode); \
}
43
44
/*
 * NE10_XC_OPERATION_FLOAT_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that:
 *   1. declares n_cst, a float32x4_t whose four lanes are all initialised
 *      from `cst` (which must be in scope where the macro is expanded);
 *   2. invokes NE10_DstSrcCst_OPERATION_FLOAT_NEON with the DstSrcCst check,
 *      loopCode1 wrapped in the MAINLOOP helper and loopCode2 wrapped in the
 *      SECONDLOOP helper.
 *
 * n_cst is visible to both loop bodies. The helper macros are not defined in
 * this header (presumably factor.h provides them).
 */
#define NE10_XC_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
    float32x4_t n_cst = { cst, cst, cst, cst }; \
    NE10_DstSrcCst_OPERATION_FLOAT_NEON( \
        NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
        NE10_DstSrcCst_MAINLOOP_FLOAT_NEON(loopCode1); , \
        NE10_DstSrcCst_SECONDLOOP_FLOAT_NEON(loopCode2); \
    ); \
}
52
53
/*
 * NE10_XC_OPERATION_VEC2F_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that invokes NE10_DstSrcCst_OPERATION_VEC2F_NEON
 * with three arguments: the DstSrcCst check, loopCode1 wrapped in the
 * VEC2F MAINLOOP helper, and loopCode2 wrapped in the VEC2F SECONDLOOP helper.
 * No locals are declared by this macro itself.
 *
 * The helper macros are not defined in this header (presumably factor.h).
 */
#define NE10_XC_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { \
    NE10_DstSrcCst_OPERATION_VEC2F_NEON( \
        NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
        NE10_DstSrcCst_MAINLOOP_VEC2F_NEON(loopCode1); , \
        NE10_DstSrcCst_SECONDLOOP_VEC2F_NEON(loopCode2); \
    ); \
}
60
61
/*
 * NE10_XC_OPERATION_VEC3F_NEON(loopCode1, loopCode2)
 *
 * This macro uses interleaving to boost the performance (original author's
 * note; the interleaving itself is not visible in this header).
 *
 * Expands to a braced block that invokes NE10_DstSrcCst_OPERATION_VEC3F_NEON
 * with the DstSrcCst check, loopCode1 wrapped in the VEC3F MAINLOOP helper and
 * loopCode2 wrapped in the VEC3F SECONDLOOP helper. The helper macros are not
 * defined in this header (presumably factor.h).
 */
#define NE10_XC_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { \
    NE10_DstSrcCst_OPERATION_VEC3F_NEON( \
        NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
        NE10_DstSrcCst_MAINLOOP_VEC3F_NEON(loopCode1); , \
        NE10_DstSrcCst_SECONDLOOP_VEC3F_NEON(loopCode2); \
    ); \
}
69
70
/*
 * NE10_XC_OPERATION_VEC4F_NEON(loopCode)
 *
 * Expands to a braced block that invokes NE10_DstSrcCst_OPERATION_VEC4F_NEON
 * with two arguments: the DstSrcCst check and loopCode wrapped in the VEC4F
 * MAINLOOP helper. Unlike the FLOAT/VEC2F/VEC3F variants, no second loop body
 * is passed.
 *
 * The helper macros are not defined in this header (presumably factor.h).
 */
#define NE10_XC_OPERATION_VEC4F_NEON(loopCode) { \
    NE10_DstSrcCst_OPERATION_VEC4F_NEON( \
        NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
        NE10_DstSrcCst_MAINLOOP_VEC4F_NEON(loopCode); \
    ); \
}
76
78
79
/*
 * NE10_MLAC_OPERATION_X_C(loopCode)
 *
 * Expands to a braced block that invokes NE10_TEMPLATE_XC_OPERATION_X_C with
 * the NE10_CHECKPOINTER_DstAccSrcCst_OPERATION check (followed by ';') and
 * the caller-supplied loopCode.
 *
 * Neither helper macro is defined in this header (presumably factor.h).
 */
#define NE10_MLAC_OPERATION_X_C(loopCode) { \
    NE10_TEMPLATE_XC_OPERATION_X_C( \
        NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
        loopCode); \
}
84
85
/*
 * NE10_MLAC_OPERATION_FLOAT_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that:
 *   1. declares n_acc, an uninitialised float32x4_t for the loop bodies or
 *      helper macros to fill;
 *   2. declares n_cst, a float32x4_t whose four lanes are all initialised
 *      from `cst` (which must be in scope where the macro is expanded);
 *   3. invokes NE10_DstAccSrcCst_OPERATION_FLOAT_NEON with the DstAccSrcCst
 *      check, loopCode1 wrapped in the MAINLOOP helper and loopCode2 wrapped
 *      in the SECONDLOOP helper.
 *
 * The helper macros are not defined in this header (presumably factor.h).
 */
#define NE10_MLAC_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
    float32x4_t n_acc; \
    float32x4_t n_cst = { cst, cst, cst, cst }; \
    NE10_DstAccSrcCst_OPERATION_FLOAT_NEON( \
        NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
        NE10_DstAccSrcCst_MAINLOOP_FLOAT_NEON(loopCode1); , \
        NE10_DstAccSrcCst_SECONDLOOP_FLOAT_NEON(loopCode2); \
    ); \
}
94
95
/*
 * NE10_MLAC_OPERATION_VEC2F_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that declares n_acc (an uninitialised
 * float32x4_t) and then invokes NE10_DstAccSrcCst_OPERATION_VEC2F_NEON with
 * the DstAccSrcCst check, loopCode1 wrapped in the VEC2F MAINLOOP helper and
 * loopCode2 wrapped in the VEC2F SECONDLOOP helper.
 *
 * Unlike the FLOAT variant, no n_cst is declared here. The helper macros are
 * not defined in this header (presumably factor.h).
 */
#define NE10_MLAC_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { \
    float32x4_t n_acc; \
    NE10_DstAccSrcCst_OPERATION_VEC2F_NEON( \
        NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
        NE10_DstAccSrcCst_MAINLOOP_VEC2F_NEON(loopCode1); , \
        NE10_DstAccSrcCst_SECONDLOOP_VEC2F_NEON(loopCode2); \
    ); \
}
103
104
/*
 * NE10_MLAC_OPERATION_VEC3F_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that declares three uninitialised float32x4_t
 * locals (n_acc1, n_acc2, n_acc3) and then invokes
 * NE10_DstAccSrcCst_OPERATION_VEC3F_NEON with the DstAccSrcCst check,
 * loopCode1 wrapped in the VEC3F MAINLOOP helper and loopCode2 wrapped in the
 * VEC3F SECONDLOOP helper.
 *
 * The helper macros are not defined in this header (presumably factor.h).
 */
#define NE10_MLAC_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { \
    float32x4_t n_acc1, n_acc2, n_acc3; \
    NE10_DstAccSrcCst_OPERATION_VEC3F_NEON( \
        NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
        NE10_DstAccSrcCst_MAINLOOP_VEC3F_NEON(loopCode1); , \
        NE10_DstAccSrcCst_SECONDLOOP_VEC3F_NEON(loopCode2); \
    ); \
}
112
113
/*
 * NE10_MLAC_OPERATION_VEC4F_NEON(loopCode)
 *
 * Expands to a braced block that declares n_acc (an uninitialised
 * float32x4_t) and then invokes NE10_DstAccSrcCst_OPERATION_VEC4F_NEON with
 * two arguments: the DstAccSrcCst check and loopCode wrapped in the VEC4F
 * MAINLOOP helper. No second loop body is passed.
 *
 * The helper macros are not defined in this header (presumably factor.h).
 */
#define NE10_MLAC_OPERATION_VEC4F_NEON(loopCode) { \
    float32x4_t n_acc; \
    NE10_DstAccSrcCst_OPERATION_VEC4F_NEON( \
        NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
        NE10_DstAccSrcCst_MAINLOOP_VEC4F_NEON(loopCode); \
    ); \
}
120
122
123
/*
 * NE10_SETC_OPERATION_X_C(loopCode)
 *
 * Expands to a braced block that invokes NE10_TEMPLATE_XC_OPERATION_X_C with
 * the NE10_CHECKPOINTER_DstCst_OPERATION check (followed by ';') and the
 * caller-supplied loopCode.
 *
 * Neither helper macro is defined in this header (presumably factor.h).
 */
#define NE10_SETC_OPERATION_X_C(loopCode) { \
    NE10_TEMPLATE_XC_OPERATION_X_C( \
        NE10_CHECKPOINTER_DstCst_OPERATION; , \
        loopCode); \
}
128
129
/*
 * NE10_SETC_OPERATION_FLOAT_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that declares n_cst, a float32x4_t whose four
 * lanes are all initialised from `cst` (which must be in scope where the
 * macro is expanded), and then invokes NE10_DstCst_OPERATION_FLOAT_NEON with
 * the DstCst check, loopCode1 wrapped in the MAINLOOP helper and loopCode2
 * wrapped in the SECONDLOOP helper.
 *
 * The helper macros are not defined in this header (presumably factor.h).
 */
#define NE10_SETC_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
    float32x4_t n_cst = { cst, cst, cst, cst }; \
    NE10_DstCst_OPERATION_FLOAT_NEON( \
        NE10_CHECKPOINTER_DstCst_OPERATION; , \
        NE10_DstCst_MAINLOOP_FLOAT_NEON(loopCode1); , \
        NE10_DstCst_SECONDLOOP_FLOAT_NEON(loopCode2); \
    ); \
}
137
138
/*
 * NE10_SETC_OPERATION_VEC2F_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that invokes NE10_DstCst_OPERATION_VEC2F_NEON
 * with the DstCst check, loopCode1 wrapped in the VEC2F MAINLOOP helper and
 * loopCode2 wrapped in the VEC2F SECONDLOOP helper. No locals are declared by
 * this macro itself.
 *
 * The helper macros are not defined in this header (presumably factor.h).
 */
#define NE10_SETC_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { \
    NE10_DstCst_OPERATION_VEC2F_NEON( \
        NE10_CHECKPOINTER_DstCst_OPERATION; , \
        NE10_DstCst_MAINLOOP_VEC2F_NEON(loopCode1); , \
        NE10_DstCst_SECONDLOOP_VEC2F_NEON(loopCode2); \
    ); \
}
145
146
/*
 * NE10_SETC_OPERATION_VEC3F_NEON(loopCode1, loopCode2)
 *
 * This macro uses interleaving to boost the performance (original author's
 * note; the interleaving itself is not visible in this header).
 *
 * Expands to a braced block that invokes NE10_DstCst_OPERATION_VEC3F_NEON
 * with the DstCst check, loopCode1 wrapped in the VEC3F MAINLOOP helper and
 * loopCode2 wrapped in the VEC3F SECONDLOOP helper. The helper macros are not
 * defined in this header (presumably factor.h).
 */
#define NE10_SETC_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { \
    NE10_DstCst_OPERATION_VEC3F_NEON( \
        NE10_CHECKPOINTER_DstCst_OPERATION; , \
        NE10_DstCst_MAINLOOP_VEC3F_NEON(loopCode1); , \
        NE10_DstCst_SECONDLOOP_VEC3F_NEON(loopCode2); \
    ); \
}
154
155
/*
 * NE10_SETC_OPERATION_VEC4F_NEON(loopCode)
 *
 * Expands to a braced block that invokes NE10_DstCst_OPERATION_VEC4F_NEON
 * with two arguments: the DstCst check and loopCode wrapped in the VEC4F
 * MAINLOOP helper. No second loop body is passed.
 *
 * The helper macros are not defined in this header (presumably factor.h).
 */
#define NE10_SETC_OPERATION_VEC4F_NEON(loopCode) { \
    NE10_DstCst_OPERATION_VEC4F_NEON( \
        NE10_CHECKPOINTER_DstCst_OPERATION; , \
        NE10_DstCst_MAINLOOP_VEC4F_NEON(loopCode); \
    ); \
}
161
163
164
/*
 * NE10_X_OPERATION_FLOAT_C(loopCode)
 *
 * Expands to a braced block that invokes NE10_TEMPLATE_XC_OPERATION_X_C with
 * the NE10_CHECKPOINTER_DstSrc1Src2_OPERATION check (followed by ';') and the
 * caller-supplied loopCode.
 *
 * Neither helper macro is defined in this header (presumably factor.h).
 */
#define NE10_X_OPERATION_FLOAT_C(loopCode) { \
    NE10_TEMPLATE_XC_OPERATION_X_C( \
        NE10_CHECKPOINTER_DstSrc1Src2_OPERATION; , \
        loopCode); \
}
169
170
/*
 * NE10_X_OPERATION_FLOAT_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that declares n_src2 (an uninitialised
 * float32x4_t) and then invokes NE10_DstSrc1Src2_OPERATION_FLOAT_NEON with
 * the DstSrc1Src2 check, loopCode1 wrapped in the MAINLOOP helper and
 * loopCode2 wrapped in the SECONDLOOP helper.
 *
 * The helper macros are not defined in this header (presumably factor.h).
 */
#define NE10_X_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
    float32x4_t n_src2; \
    NE10_DstSrc1Src2_OPERATION_FLOAT_NEON( \
        NE10_CHECKPOINTER_DstSrc1Src2_OPERATION; , \
        NE10_DstSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode1); , \
        NE10_DstSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode2); \
    ); \
}
178
179
/*
 * Alias: NE10_DOT_OPERATION_X_C expands to NE10_X_OPERATION_FLOAT_C, so it is
 * invoked with the same single loop-body argument and performs the same
 * DstSrc1Src2 check before running it.
 */
#define NE10_DOT_OPERATION_X_C NE10_X_OPERATION_FLOAT_C
180
182
183
/*
 * NE10_ABS_OPERATION_X_C(loopCode)
 *
 * Expands to a braced block that invokes NE10_TEMPLATE_XC_OPERATION_X_C with
 * the NE10_CHECKPOINTER_DstSrc_OPERATION check and the caller-supplied
 * loopCode. Several other *_OPERATION_X_C names in this header are aliases of
 * this macro.
 *
 * NOTE(review): unlike the XC/MLAC/SETC/X wrappers, the check argument is
 * passed without a trailing ';'. Whether that matters depends on how the
 * checker macro ends (not visible here, presumably factor.h) -- confirm.
 */
#define NE10_ABS_OPERATION_X_C(loopCode) { \
    NE10_TEMPLATE_XC_OPERATION_X_C( \
        NE10_CHECKPOINTER_DstSrc_OPERATION, \
        loopCode); \
}
188
189
/*
 * Alias: NE10_ABS_OPERATION_FLOAT_C expands to NE10_ABS_OPERATION_X_C and is
 * invoked with the same single loop-body argument.
 */
#define NE10_ABS_OPERATION_FLOAT_C NE10_ABS_OPERATION_X_C
190
191
/*
 * NE10_ABS_OPERATION_FLOAT_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that:
 *   1. declares a local `cst` of type arm_float_t set to 0.0f (the caller does
 *      not supply one for this operation);
 *   2. declares n_cst, a float32x4_t whose four lanes are all initialised
 *      from that local `cst`, i.e. all zero;
 *   3. invokes NE10_DstSrc_OPERATION_FLOAT_NEON with the DstSrc check,
 *      loopCode1 wrapped in the MAINLOOP helper and loopCode2 wrapped in the
 *      SECONDLOOP helper.
 *
 * Every line of the body must end in a backslash; the inline comment on the
 * `cst` line therefore sits before the continuation, not after it.
 * The helper macros are not defined in this header (presumably factor.h).
 */
#define NE10_ABS_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
    arm_float_t cst = 0.0f; /* this is used to compare the values against. */ \
    float32x4_t n_cst = { cst, cst, cst, cst }; \
    NE10_DstSrc_OPERATION_FLOAT_NEON( \
        NE10_CHECKPOINTER_DstSrc_OPERATION; , \
        NE10_DstSrc_MAINLOOP_FLOAT_NEON(loopCode1); , \
        NE10_DstSrc_SECONDLOOP_FLOAT_NEON(loopCode2); \
    ); \
}
200
201
/*
 * Alias: NE10_LEN_OPERATION_X_C expands to NE10_ABS_OPERATION_X_C and is
 * invoked with the same single loop-body argument.
 *
 * The identical definition used to appear twice in a row; a token-identical
 * redefinition is legal C but redundant, so only one copy is kept.
 */
#define NE10_LEN_OPERATION_X_C NE10_ABS_OPERATION_X_C
204
205
/*
 * Alias: NE10_CMATVEC_OPERATION_X_C expands to NE10_ABS_OPERATION_X_C and is
 * invoked with the same single loop-body argument.
 */
#define NE10_CMATVEC_OPERATION_X_C NE10_ABS_OPERATION_X_C
206
207
/*
 * NE10_LEN_OPERATION_VEC2F_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that invokes NE10_DstSrc_OPERATION_VEC2F_NEON
 * with three arguments: a check macro, loopCode1 wrapped in the DstSrc VEC2F
 * MAINLOOP helper, and loopCode2 wrapped in the DstSrc VEC2F SECONDLOOP
 * helper. None of the arguments carries a trailing ';'.
 *
 * NOTE(review): the check passed is NE10_CHECKPOINTER_DstSrcCst_OPERATION
 * although every other helper here is from the DstSrc family, and the ';'
 * used after each argument by the XC/MLAC/SETC wrappers is absent. Both are
 * kept exactly as found -- confirm against factor.h that they are intended.
 */
#define NE10_LEN_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { \
    NE10_DstSrc_OPERATION_VEC2F_NEON( \
        NE10_CHECKPOINTER_DstSrcCst_OPERATION, \
        NE10_DstSrc_MAINLOOP_VEC2F_NEON(loopCode1), \
        NE10_DstSrc_SECONDLOOP_VEC2F_NEON(loopCode2) \
    ); \
}
214
215
/*
 * NE10_LEN_OPERATION_VEC3F_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that invokes NE10_DstSrc_OPERATION_VEC3F_NEON
 * with three arguments: a check macro, loopCode1 wrapped in the DstSrc VEC3F
 * MAINLOOP helper, and loopCode2 wrapped in the DstSrc VEC3F SECONDLOOP
 * helper. None of the arguments carries a trailing ';'.
 *
 * NOTE(review): the check passed is NE10_CHECKPOINTER_DstSrcCst_OPERATION
 * although every other helper here is from the DstSrc family, and the ';'
 * used after each argument by the XC/MLAC/SETC wrappers is absent. Both are
 * kept exactly as found -- confirm against factor.h that they are intended.
 */
#define NE10_LEN_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { \
    NE10_DstSrc_OPERATION_VEC3F_NEON( \
        NE10_CHECKPOINTER_DstSrcCst_OPERATION, \
        NE10_DstSrc_MAINLOOP_VEC3F_NEON(loopCode1), \
        NE10_DstSrc_SECONDLOOP_VEC3F_NEON(loopCode2) \
    ); \
}
222
223
/*
 * NE10_LEN_OPERATION_VEC4F_NEON(loopCode)
 *
 * Expands to a braced block that invokes NE10_DstSrc_OPERATION_VEC4F_NEON
 * with two arguments: a check macro and loopCode wrapped in the DstSrc VEC4F
 * MAINLOOP helper. No second loop body is passed and neither argument carries
 * a trailing ';'.
 *
 * NOTE(review): the check passed is NE10_CHECKPOINTER_DstSrcCst_OPERATION
 * although the other helpers here are from the DstSrc family. Kept exactly as
 * found -- confirm against factor.h that this is intended.
 */
#define NE10_LEN_OPERATION_VEC4F_NEON(loopCode) { \
    NE10_DstSrc_OPERATION_VEC4F_NEON( \
        NE10_CHECKPOINTER_DstSrcCst_OPERATION, \
        NE10_DstSrc_MAINLOOP_VEC4F_NEON(loopCode) \
    ); \
}
229
230
/*
 * Alias: NE10_DETMAT_OPERATION_X_C expands to NE10_ABS_OPERATION_X_C and is
 * invoked with the same single loop-body argument.
 */
#define NE10_DETMAT_OPERATION_X_C NE10_ABS_OPERATION_X_C
231
233
234
/*
 * NE10_MLA_OPERATION_FLOAT_NEON(loopCode1, loopCode2)
 *
 * Expands to a braced block that declares two uninitialised float32x4_t
 * locals (n_acc and n_src2) and then invokes
 * NE10_DstAccSrc1Src2_OPERATION_FLOAT_NEON with the DstAccSrc1Src2 check,
 * loopCode1 wrapped in the MAINLOOP helper and loopCode2 wrapped in the
 * SECONDLOOP helper.
 *
 * The helper macros are not defined in this header (presumably factor.h).
 */
#define NE10_MLA_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
    float32x4_t n_acc; \
    float32x4_t n_src2; \
    NE10_DstAccSrc1Src2_OPERATION_FLOAT_NEON( \
        NE10_CHECKPOINTER_DstAccSrc1Src2_OPERATION; , \
        NE10_DstAccSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode1); , \
        NE10_DstAccSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode2); \
    ); \
}
Generated on Wed Mar 27 2024 05:52:03 for Project Ne10 by Doxygen 1.9.8