Vector Optimized Library of Kernels 2.2
Architecture-tuned implementations of math kernels
volk_32f_atan_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
70 #include <stdio.h>
71 #include <math.h>
72 #include <inttypes.h>
73 
74 /* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/
75 #define TERMS 2
76 
77 #ifndef INCLUDED_volk_32f_atan_32f_a_H
78 #define INCLUDED_volk_32f_atan_32f_a_H
79 
80 #if LV_HAVE_AVX2 && LV_HAVE_FMA
81 #include <immintrin.h>
82 
83 static inline void
84 volk_32f_atan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
85 {
86  float* bPtr = bVector;
87  const float* aPtr = aVector;
88 
89  unsigned int number = 0;
90  unsigned int eighthPoints = num_points / 8;
91  int i, j;
92 
93  __m256 aVal, pio2, x, y, z, arctangent;
94  __m256 fzeroes, fones, ftwos, ffours, condition;
95 
96  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
97  fzeroes = _mm256_setzero_ps();
98  fones = _mm256_set1_ps(1.0);
99  ftwos = _mm256_set1_ps(2.0);
100  ffours = _mm256_set1_ps(4.0);
101 
102  for(;number < eighthPoints; number++){
103  aVal = _mm256_load_ps(aPtr);
104  z = aVal;
105  condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
106  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
107  condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
108  x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
109 
110  for(i = 0; i < 2; i++){
111  x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones)));
112  }
113  x = _mm256_div_ps(fones, x);
114  y = fzeroes;
115  for(j = TERMS - 1; j >=0 ; j--){
116  y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
117  }
118 
119  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
120  condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
121 
122  y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition));
123  arctangent = y;
124  condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
125  arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
126 
127  _mm256_store_ps(bPtr, arctangent);
128  aPtr += 8;
129  bPtr += 8;
130  }
131 
132  number = eighthPoints * 8;
133  for(;number < num_points; number++){
134  *bPtr++ = atan(*aPtr++);
135  }
136 }
137 
138 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
139 
140 
141 #ifdef LV_HAVE_AVX
142 #include <immintrin.h>
143 
144 static inline void
145 volk_32f_atan_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
146 {
147  float* bPtr = bVector;
148  const float* aPtr = aVector;
149 
150  unsigned int number = 0;
151  unsigned int eighthPoints = num_points / 8;
152  int i, j;
153 
154  __m256 aVal, pio2, x, y, z, arctangent;
155  __m256 fzeroes, fones, ftwos, ffours, condition;
156 
157  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
158  fzeroes = _mm256_setzero_ps();
159  fones = _mm256_set1_ps(1.0);
160  ftwos = _mm256_set1_ps(2.0);
161  ffours = _mm256_set1_ps(4.0);
162 
163  for(;number < eighthPoints; number++){
164  aVal = _mm256_load_ps(aPtr);
165  z = aVal;
166  condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
167  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
168  condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
169  x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
170 
171  for(i = 0; i < 2; i++){
172  x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
173  }
174  x = _mm256_div_ps(fones, x);
175  y = fzeroes;
176  for(j = TERMS - 1; j >=0 ; j--){
177  y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
178  }
179 
180  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
181  condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
182 
183  y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
184  arctangent = y;
185  condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
186  arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
187 
188  _mm256_store_ps(bPtr, arctangent);
189  aPtr += 8;
190  bPtr += 8;
191  }
192 
193  number = eighthPoints * 8;
194  for(;number < num_points; number++){
195  *bPtr++ = atan(*aPtr++);
196  }
197 }
198 
199 #endif /* LV_HAVE_AVX for aligned */
200 
201 #ifdef LV_HAVE_SSE4_1
202 #include <smmintrin.h>
203 
204 static inline void
205 volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
206 {
207  float* bPtr = bVector;
208  const float* aPtr = aVector;
209 
210  unsigned int number = 0;
211  unsigned int quarterPoints = num_points / 4;
212  int i, j;
213 
214  __m128 aVal, pio2, x, y, z, arctangent;
215  __m128 fzeroes, fones, ftwos, ffours, condition;
216 
217  pio2 = _mm_set1_ps(3.14159265358979323846/2);
218  fzeroes = _mm_setzero_ps();
219  fones = _mm_set1_ps(1.0);
220  ftwos = _mm_set1_ps(2.0);
221  ffours = _mm_set1_ps(4.0);
222 
223  for(;number < quarterPoints; number++){
224  aVal = _mm_load_ps(aPtr);
225  z = aVal;
226  condition = _mm_cmplt_ps(z, fzeroes);
227  z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
228  condition = _mm_cmplt_ps(z, fones);
229  x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
230 
231  for(i = 0; i < 2; i++){
232  x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
233  }
234  x = _mm_div_ps(fones, x);
235  y = fzeroes;
236  for(j = TERMS - 1; j >=0 ; j--){
237  y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
238  }
239 
240  y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
241  condition = _mm_cmpgt_ps(z, fones);
242 
243  y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
244  arctangent = y;
245  condition = _mm_cmplt_ps(aVal, fzeroes);
246  arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
247 
248  _mm_store_ps(bPtr, arctangent);
249  aPtr += 4;
250  bPtr += 4;
251  }
252 
253  number = quarterPoints * 4;
254  for(;number < num_points; number++){
255  *bPtr++ = atanf(*aPtr++);
256  }
257 }
258 
259 #endif /* LV_HAVE_SSE4_1 for aligned */
260 
261 #endif /* INCLUDED_volk_32f_atan_32f_a_H */
262 
263 #ifndef INCLUDED_volk_32f_atan_32f_u_H
264 #define INCLUDED_volk_32f_atan_32f_u_H
265 
266 #if LV_HAVE_AVX2 && LV_HAVE_FMA
267 #include <immintrin.h>
268 
269 static inline void
270 volk_32f_atan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
271 {
272  float* bPtr = bVector;
273  const float* aPtr = aVector;
274 
275  unsigned int number = 0;
276  unsigned int eighthPoints = num_points / 8;
277  int i, j;
278 
279  __m256 aVal, pio2, x, y, z, arctangent;
280  __m256 fzeroes, fones, ftwos, ffours, condition;
281 
282  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
283  fzeroes = _mm256_setzero_ps();
284  fones = _mm256_set1_ps(1.0);
285  ftwos = _mm256_set1_ps(2.0);
286  ffours = _mm256_set1_ps(4.0);
287 
288  for(;number < eighthPoints; number++){
289  aVal = _mm256_loadu_ps(aPtr);
290  z = aVal;
291  condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
292  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
293  condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
294  x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
295 
296  for(i = 0; i < 2; i++){
297  x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones)));
298  }
299  x = _mm256_div_ps(fones, x);
300  y = fzeroes;
301  for(j = TERMS - 1; j >=0 ; j--){
302  y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
303  }
304 
305  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
306  condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
307 
308  y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition));
309  arctangent = y;
310  condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
311  arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
312 
313  _mm256_storeu_ps(bPtr, arctangent);
314  aPtr += 8;
315  bPtr += 8;
316  }
317 
318  number = eighthPoints * 8;
319  for(;number < num_points; number++){
320  *bPtr++ = atan(*aPtr++);
321  }
322 }
323 
324 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
325 
326 
327 #ifdef LV_HAVE_AVX
328 #include <immintrin.h>
329 
330 static inline void
331 volk_32f_atan_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
332 {
333  float* bPtr = bVector;
334  const float* aPtr = aVector;
335 
336  unsigned int number = 0;
337  unsigned int eighthPoints = num_points / 8;
338  int i, j;
339 
340  __m256 aVal, pio2, x, y, z, arctangent;
341  __m256 fzeroes, fones, ftwos, ffours, condition;
342 
343  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
344  fzeroes = _mm256_setzero_ps();
345  fones = _mm256_set1_ps(1.0);
346  ftwos = _mm256_set1_ps(2.0);
347  ffours = _mm256_set1_ps(4.0);
348 
349  for(;number < eighthPoints; number++){
350  aVal = _mm256_loadu_ps(aPtr);
351  z = aVal;
352  condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
353  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
354  condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
355  x = _mm256_add_ps(z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
356 
357  for(i = 0; i < 2; i++){
358  x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
359  }
360  x = _mm256_div_ps(fones, x);
361  y = fzeroes;
362  for(j = TERMS - 1; j >=0 ; j--){
363  y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
364  }
365 
366  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
367  condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
368 
369  y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
370  arctangent = y;
371  condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
372  arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
373 
374  _mm256_storeu_ps(bPtr, arctangent);
375  aPtr += 8;
376  bPtr += 8;
377  }
378 
379  number = eighthPoints * 8;
380  for(;number < num_points; number++){
381  *bPtr++ = atan(*aPtr++);
382  }
383 }
384 
385 #endif /* LV_HAVE_AVX for unaligned */
386 
387 #ifdef LV_HAVE_SSE4_1
388 #include <smmintrin.h>
389 
390 static inline void
391 volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
392 {
393  float* bPtr = bVector;
394  const float* aPtr = aVector;
395 
396  unsigned int number = 0;
397  unsigned int quarterPoints = num_points / 4;
398  int i, j;
399 
400  __m128 aVal, pio2, x, y, z, arctangent;
401  __m128 fzeroes, fones, ftwos, ffours, condition;
402 
403  pio2 = _mm_set1_ps(3.14159265358979323846/2);
404  fzeroes = _mm_setzero_ps();
405  fones = _mm_set1_ps(1.0);
406  ftwos = _mm_set1_ps(2.0);
407  ffours = _mm_set1_ps(4.0);
408 
409  for(;number < quarterPoints; number++){
410  aVal = _mm_loadu_ps(aPtr);
411  z = aVal;
412  condition = _mm_cmplt_ps(z, fzeroes);
413  z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
414  condition = _mm_cmplt_ps(z, fones);
415  x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
416 
417  for(i = 0; i < 2; i++)
418  x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
419  x = _mm_div_ps(fones, x);
420  y = fzeroes;
421  for(j = TERMS - 1; j >= 0; j--)
422  y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
423 
424  y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
425  condition = _mm_cmpgt_ps(z, fones);
426 
427  y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
428  arctangent = y;
429  condition = _mm_cmplt_ps(aVal, fzeroes);
430  arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
431 
432  _mm_storeu_ps(bPtr, arctangent);
433  aPtr += 4;
434  bPtr += 4;
435  }
436 
437  number = quarterPoints * 4;
438  for(;number < num_points; number++){
439  *bPtr++ = atanf(*aPtr++);
440  }
441 }
442 
443 #endif /* LV_HAVE_SSE4_1 for unaligned */
444 
445 #ifdef LV_HAVE_GENERIC
446 
447 static inline void
448 volk_32f_atan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
449 {
450  float* bPtr = bVector;
451  const float* aPtr = aVector;
452  unsigned int number = 0;
453 
454  for(number = 0; number < num_points; number++){
455  *bPtr++ = atanf(*aPtr++);
456  }
457 }
458 #endif /* LV_HAVE_GENERIC */
459 
460 #endif /* INCLUDED_volk_32f_atan_32f_u_H */
volk_32f_atan_32f_generic
static void volk_32f_atan_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_atan_32f.h:448
i
for i
Definition: volk_config_fixed.tmpl.h:25
volk_32f_atan_32f_u_avx
static void volk_32f_atan_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_atan_32f.h:331
volk_32f_atan_32f_a_avx
static void volk_32f_atan_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_atan_32f.h:145
TERMS
#define TERMS
Definition: volk_32f_atan_32f.h:75