volk_32f_x3_sum_of_poly_32f.h

/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING.  If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_common.h>  // for __VOLK_ATTR_ALIGNED, used by the AVX kernels below
#include <volk/volk_complex.h>

#ifndef MAX
#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#endif

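/*
 * Each kernel below evaluates, for every input sample x = max(src0[i], *cutoff),
 * the polynomial c[0]*x + c[1]*x^2 + c[2]*x^3 + c[3]*x^4 with coefficients taken
 * from center_point_array, sums the result over all num_points samples, adds
 * num_points * center_point_array[4], and writes the scalar sum to *target.
 */
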
#ifdef LV_HAVE_SSE3
#include <xmmintrin.h>
#include <pmmintrin.h>

static inline void
volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array,
                                   float* cutoff, unsigned int num_points)
{
  float result = 0.0f;
  float fst = 0.0f;
  float sq = 0.0f;
  float thrd = 0.0f;
  float frth = 0.0f;

  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;

  xmm9 = _mm_setzero_ps();
  xmm1 = _mm_setzero_ps();
  xmm0 = _mm_load1_ps(&center_point_array[0]);
  xmm6 = _mm_load1_ps(&center_point_array[1]);
  xmm7 = _mm_load1_ps(&center_point_array[2]);
  xmm8 = _mm_load1_ps(&center_point_array[3]);
  xmm10 = _mm_load1_ps(cutoff);

  int bound = num_points / 8;
  int leftovers = num_points - 8 * bound;
  int i = 0;
  for(; i < bound; ++i) {
    // 1st
    xmm2 = _mm_load_ps(src0);
    xmm2 = _mm_max_ps(xmm10, xmm2);
    xmm3 = _mm_mul_ps(xmm2, xmm2);
    xmm4 = _mm_mul_ps(xmm2, xmm3);
    xmm5 = _mm_mul_ps(xmm3, xmm3);

    xmm2 = _mm_mul_ps(xmm2, xmm0);
    xmm3 = _mm_mul_ps(xmm3, xmm6);
    xmm4 = _mm_mul_ps(xmm4, xmm7);
    xmm5 = _mm_mul_ps(xmm5, xmm8);

    xmm2 = _mm_add_ps(xmm2, xmm3);
    xmm3 = _mm_add_ps(xmm4, xmm5);

    src0 += 4;

    xmm9 = _mm_add_ps(xmm2, xmm9);
    xmm9 = _mm_add_ps(xmm3, xmm9);

    // 2nd
    xmm2 = _mm_load_ps(src0);
    xmm2 = _mm_max_ps(xmm10, xmm2);
    xmm3 = _mm_mul_ps(xmm2, xmm2);
    xmm4 = _mm_mul_ps(xmm2, xmm3);
    xmm5 = _mm_mul_ps(xmm3, xmm3);

    xmm2 = _mm_mul_ps(xmm2, xmm0);
    xmm3 = _mm_mul_ps(xmm3, xmm6);
    xmm4 = _mm_mul_ps(xmm4, xmm7);
    xmm5 = _mm_mul_ps(xmm5, xmm8);

    xmm2 = _mm_add_ps(xmm2, xmm3);
    xmm3 = _mm_add_ps(xmm4, xmm5);

    src0 += 4;

    xmm1 = _mm_add_ps(xmm2, xmm1);
    xmm1 = _mm_add_ps(xmm3, xmm1);
  }
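  // horizontal reduction: fold the two partial-sum vectors down to a single scalar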
  xmm2 = _mm_hadd_ps(xmm9, xmm1);
  xmm3 = _mm_hadd_ps(xmm2, xmm2);
  xmm4 = _mm_hadd_ps(xmm3, xmm3);
  _mm_store_ss(&result, xmm4);

  for(i = 0; i < leftovers; ++i) {
    fst = *src0++;
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = sq * sq;
    result += (center_point_array[0] * fst +
               center_point_array[1] * sq +
               center_point_array[2] * thrd +
               center_point_array[3] * frth);
  }

  result += (float)(num_points) * center_point_array[4];
  *target = result;
}


#endif /*LV_HAVE_SSE3*/

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

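/*
 * AVX2/FMA variant: same structure as the plain AVX kernel below, but the
 * multiplications by cpa[0] and cpa[2] are fused into the following adds
 * with _mm256_fmadd_ps, saving two multiplies per iteration.
 */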
static inline void
volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target, float* src0, float* center_point_array,
                                       float* cutoff, unsigned int num_points)
{
  const unsigned int eighth_points = num_points / 8;
  float fst = 0.0;
  float sq = 0.0;
  float thrd = 0.0;
  float frth = 0.0;

  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
  __m256 target_vec;
  __m256 x_to_1, x_to_2, x_to_3, x_to_4;

  cpa0 = _mm256_set1_ps(center_point_array[0]);
  cpa1 = _mm256_set1_ps(center_point_array[1]);
  cpa2 = _mm256_set1_ps(center_point_array[2]);
  cpa3 = _mm256_set1_ps(center_point_array[3]);
  cutoff_vec = _mm256_set1_ps(*cutoff);
  target_vec = _mm256_setzero_ps();

  unsigned int i;

  for(i = 0; i < eighth_points; ++i) {
    x_to_1 = _mm256_load_ps(src0);
    x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
    x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
    x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
    // x^1 * x^3 is slightly faster than x^2 * x^2
    x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

    x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
    x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

    x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
    x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
    // this is slightly faster than result += (x_to_1 + x_to_3)
    target_vec = _mm256_add_ps(x_to_1, target_vec);
    target_vec = _mm256_add_ps(x_to_3, target_vec);

    src0 += 8;
  }

  // the hadd for vector reduction has very very slight impact @ 50k iters
  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
  target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
  _mm256_store_ps(temp_results, target_vec);
  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

  for(i = eighth_points * 8; i < num_points; ++i) {
    fst = *src0++;
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = sq * sq;
    *target += (center_point_array[0] * fst +
                center_point_array[1] * sq +
                center_point_array[2] * thrd +
                center_point_array[3] * frth);
  }
  *target += (float)(num_points) * center_point_array[4];
}
#endif // LV_HAVE_AVX && LV_HAVE_FMA

#ifdef LV_HAVE_AVX
#include <immintrin.h>

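/*
 * AVX variant without FMA: the same algorithm, written with explicit
 * multiply/add pairs.
 */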
static inline void
volk_32f_x3_sum_of_poly_32f_a_avx(float* target, float* src0, float* center_point_array,
                                  float* cutoff, unsigned int num_points)
{
  const unsigned int eighth_points = num_points / 8;
  float fst = 0.0;
  float sq = 0.0;
  float thrd = 0.0;
  float frth = 0.0;

  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
  __m256 target_vec;
  __m256 x_to_1, x_to_2, x_to_3, x_to_4;

  cpa0 = _mm256_set1_ps(center_point_array[0]);
  cpa1 = _mm256_set1_ps(center_point_array[1]);
  cpa2 = _mm256_set1_ps(center_point_array[2]);
  cpa3 = _mm256_set1_ps(center_point_array[3]);
  cutoff_vec = _mm256_set1_ps(*cutoff);
  target_vec = _mm256_setzero_ps();

  unsigned int i;

  for(i = 0; i < eighth_points; ++i) {
    x_to_1 = _mm256_load_ps(src0);
    x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
    x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
    x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
    // x^1 * x^3 is slightly faster than x^2 * x^2
    x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

    x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
    x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
    x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
    x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

    x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
    x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
    // this is slightly faster than result += (x_to_1 + x_to_3)
    target_vec = _mm256_add_ps(x_to_1, target_vec);
    target_vec = _mm256_add_ps(x_to_3, target_vec);

    src0 += 8;
  }

  // the hadd for vector reduction has very very slight impact @ 50k iters
  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
  target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
  _mm256_store_ps(temp_results, target_vec);
  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

  for(i = eighth_points * 8; i < num_points; ++i) {
    fst = *src0++;
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = sq * sq;
    *target += (center_point_array[0] * fst +
                center_point_array[1] * sq +
                center_point_array[2] * thrd +
                center_point_array[3] * frth);
  }
  *target += (float)(num_points) * center_point_array[4];
}
#endif // LV_HAVE_AVX


#ifdef LV_HAVE_GENERIC

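/*
 * Generic (non-SIMD) reference implementation. It keeps eight independent
 * accumulators, mirroring the eight-wide SIMD kernels, before a final
 * pairwise reduction.
 */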
static inline void
volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_point_array,
                                    float* cutoff, unsigned int num_points)
{
  const unsigned int eighth_points = num_points / 8;

  float result[8] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
  float fst = 0.0f;
  float sq = 0.0f;
  float thrd = 0.0f;
  float frth = 0.0f;

  unsigned int i = 0;
  unsigned int k = 0;
  for(i = 0; i < eighth_points; ++i) {
    for(k = 0; k < 8; ++k) {
      fst = *src0++;
      fst = MAX(fst, *cutoff);
      sq = fst * fst;
      thrd = fst * sq;
      frth = fst * thrd;
      result[k] += center_point_array[0] * fst + center_point_array[1] * sq;
      result[k] += center_point_array[2] * thrd + center_point_array[3] * frth;
    }
  }
  for(k = 0; k < 8; k += 2)
    result[k] = result[k] + result[k+1];

  *target = result[0] + result[2] + result[4] + result[6];

  for(i = eighth_points * 8; i < num_points; ++i) {
    fst = *src0++;
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = fst * thrd;
    *target += (center_point_array[0] * fst +
                center_point_array[1] * sq +
                center_point_array[2] * thrd +
                center_point_array[3] * frth);
  }
  *target += (float)(num_points) * center_point_array[4];
}

#endif /*LV_HAVE_GENERIC*/

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

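/*
 * NEON variant: processes one sample per iteration, packing x, x^2, x^3 and x^4
 * into a single quad vector so that the four coefficient multiplies and the
 * accumulation collapse into one vmlaq_f32 per sample.
 */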
static inline void
volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target, float* __restrict src0,
                                   float* __restrict center_point_array,
                                   float* __restrict cutoff, unsigned int num_points)
{
  unsigned int i;
  float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f};

  float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
  float32x2_t cutoff_vector;
  float32x2x2_t x_low, x_high;
  float32x4_t x_qvector, c_qvector, cpa_qvector;
  float accumulator;
  float res_accumulators[4];

  c_qvector = vld1q_f32(zero);
  // load the cutoff into a vector
  cutoff_vector = vdup_n_f32(*cutoff);
  // ... center point array
  cpa_qvector = vld1q_f32(center_point_array);

  for(i = 0; i < num_points; ++i) {
    // load x (src0)
    x_to_1 = vdup_n_f32(*src0++);

    // Get a vector of max(src0, cutoff)
    x_to_1 = vmax_f32(x_to_1, cutoff_vector); // x^1
    x_to_2 = vmul_f32(x_to_1, x_to_1);        // x^2
    x_to_3 = vmul_f32(x_to_2, x_to_1);        // x^3
    x_to_4 = vmul_f32(x_to_3, x_to_1);        // x^4
    // zip up doubles to interleave
    x_low = vzip_f32(x_to_1, x_to_2);  // [x^2 | x^1 || x^2 | x^1]
    x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3]
    // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
    x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
    // now we finally have [x^4 | x^3 | x^2 | x] !

    c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
  }
  // there should be better vector reduction techniques
  vst1q_f32(res_accumulators, c_qvector);
  accumulator = res_accumulators[0] + res_accumulators[1] +
                res_accumulators[2] + res_accumulators[3];

  *target = accumulator + (float)num_points * center_point_array[4];
}

#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_NEON

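/*
 * Vertical NEON variant: processes four samples per iteration with four
 * separate accumulators, one per power of x.
 */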
static inline void
volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target, float* __restrict src0,
                                     float* __restrict center_point_array,
                                     float* __restrict cutoff, unsigned int num_points)
{
  unsigned int i;
  float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f};

  float accumulator;

  float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
  accumulator1_vec = vld1q_f32(zero);
  accumulator2_vec = vld1q_f32(zero);
  accumulator3_vec = vld1q_f32(zero);
  accumulator4_vec = vld1q_f32(zero);
  float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
  float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;

  // load the cutoff into a vector
  cutoff_vector = vdupq_n_f32(*cutoff);
  // ... center point array
  cpa_0 = vdupq_n_f32(center_point_array[0]);
  cpa_1 = vdupq_n_f32(center_point_array[1]);
  cpa_2 = vdupq_n_f32(center_point_array[2]);
  cpa_3 = vdupq_n_f32(center_point_array[3]);

  // nathan is not sure why this is slower *and* wrong compared to neonvertfma
  for(i = 0; i < num_points / 4; ++i) {
    // load x
    x_to_1 = vld1q_f32(src0);

    // Get a vector of max(src0, cutoff)
    x_to_1 = vmaxq_f32(x_to_1, cutoff_vector); // x^1
    x_to_2 = vmulq_f32(x_to_1, x_to_1);        // x^2
    x_to_3 = vmulq_f32(x_to_2, x_to_1);        // x^3
    x_to_4 = vmulq_f32(x_to_3, x_to_1);        // x^4
    x_to_1 = vmulq_f32(x_to_1, cpa_0);
    x_to_2 = vmulq_f32(x_to_2, cpa_1);
    x_to_3 = vmulq_f32(x_to_3, cpa_2);
    x_to_4 = vmulq_f32(x_to_4, cpa_3);
    accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
    accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
    accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
    accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);

    src0 += 4;
  }
  accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
  accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
  accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);

  __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
  vst1q_f32(res_accumulators, accumulator1_vec);
  accumulator = res_accumulators[0] + res_accumulators[1] +
                res_accumulators[2] + res_accumulators[3];

  float fst = 0.0;
  float sq = 0.0;
  float thrd = 0.0;
  float frth = 0.0;

  // handle the leftover points; src0 already points at the first unprocessed
  // element, so consume it directly instead of re-indexing from the start
  for(i = (num_points / 4) * 4; i < num_points; ++i) {
    fst = *src0++;
    fst = MAX(fst, *cutoff);

    sq = fst * fst;
    thrd = fst * sq;
    frth = sq * sq;

    accumulator += (center_point_array[0] * fst +
                    center_point_array[1] * sq +
                    center_point_array[2] * thrd +
                    center_point_array[3] * frth);
  }

  *target = accumulator + (float)num_points * center_point_array[4];
}

#endif /* LV_HAVE_NEON */

#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/

#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
#define INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_common.h>  // for __VOLK_ATTR_ALIGNED
#include <volk/volk_complex.h>

#ifndef MAX
#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#endif

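/*
 * The _u_ (unaligned) kernels below match the aligned kernels above, except
 * that they read src0 with unaligned loads (_mm256_loadu_ps), so the input
 * need not be 32-byte aligned.
 */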
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target, float* src0, float* center_point_array,
                                      float* cutoff, unsigned int num_points)
{
  const unsigned int eighth_points = num_points / 8;
  float fst = 0.0;
  float sq = 0.0;
  float thrd = 0.0;
  float frth = 0.0;

  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
  __m256 target_vec;
  __m256 x_to_1, x_to_2, x_to_3, x_to_4;

  cpa0 = _mm256_set1_ps(center_point_array[0]);
  cpa1 = _mm256_set1_ps(center_point_array[1]);
  cpa2 = _mm256_set1_ps(center_point_array[2]);
  cpa3 = _mm256_set1_ps(center_point_array[3]);
  cutoff_vec = _mm256_set1_ps(*cutoff);
  target_vec = _mm256_setzero_ps();

  unsigned int i;

  for(i = 0; i < eighth_points; ++i) {
    x_to_1 = _mm256_loadu_ps(src0);
    x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
    x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
    x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
    // x^1 * x^3 is slightly faster than x^2 * x^2
    x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

    x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
    x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

    x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
    x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
    // this is slightly faster than result += (x_to_1 + x_to_3)
    target_vec = _mm256_add_ps(x_to_1, target_vec);
    target_vec = _mm256_add_ps(x_to_3, target_vec);

    src0 += 8;
  }

  // the hadd for vector reduction has very very slight impact @ 50k iters
  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
  target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
  _mm256_storeu_ps(temp_results, target_vec);
  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

  for(i = eighth_points * 8; i < num_points; ++i) {
    fst = *src0++;
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = sq * sq;
    *target += (center_point_array[0] * fst +
                center_point_array[1] * sq +
                center_point_array[2] * thrd +
                center_point_array[3] * frth);
  }

  *target += (float)(num_points) * center_point_array[4];
}
#endif // LV_HAVE_AVX && LV_HAVE_FMA

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float* src0, float* center_point_array,
                                  float* cutoff, unsigned int num_points)
{
  const unsigned int eighth_points = num_points / 8;
  float fst = 0.0;
  float sq = 0.0;
  float thrd = 0.0;
  float frth = 0.0;

  __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
  __m256 target_vec;
  __m256 x_to_1, x_to_2, x_to_3, x_to_4;

  cpa0 = _mm256_set1_ps(center_point_array[0]);
  cpa1 = _mm256_set1_ps(center_point_array[1]);
  cpa2 = _mm256_set1_ps(center_point_array[2]);
  cpa3 = _mm256_set1_ps(center_point_array[3]);
  cutoff_vec = _mm256_set1_ps(*cutoff);
  target_vec = _mm256_setzero_ps();

  unsigned int i;

  for(i = 0; i < eighth_points; ++i) {
    x_to_1 = _mm256_loadu_ps(src0);
    x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
    x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
    x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
    // x^1 * x^3 is slightly faster than x^2 * x^2
    x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

    x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
    x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
    x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
    x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

    x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
    x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
    // this is slightly faster than result += (x_to_1 + x_to_3)
    target_vec = _mm256_add_ps(x_to_1, target_vec);
    target_vec = _mm256_add_ps(x_to_3, target_vec);

    src0 += 8;
  }

  // the hadd for vector reduction has very very slight impact @ 50k iters
  __VOLK_ATTR_ALIGNED(32) float temp_results[8];
  target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
  _mm256_storeu_ps(temp_results, target_vec);
  *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

  for(i = eighth_points * 8; i < num_points; ++i) {
    fst = *src0++;
    fst = MAX(fst, *cutoff);
    sq = fst * fst;
    thrd = fst * sq;
    frth = sq * sq;

    *target += (center_point_array[0] * fst +
                center_point_array[1] * sq +
                center_point_array[2] * thrd +
                center_point_array[3] * frth);
  }

  *target += (float)(num_points) * center_point_array[4];
}
#endif // LV_HAVE_AVX

#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H*/
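
/*
 * Usage sketch (illustrative only, not part of the original header): calling
 * the generic kernel directly with hypothetical data. center_point_array must
 * hold five floats: the coefficients for x^1..x^4 plus the constant term that
 * is added once per point.
 *
 *   float samples[6] = {0.1f, 0.4f, 0.2f, 0.9f, 0.3f, 0.7f};
 *   float coeffs[5] = {1.0f, 0.5f, 0.25f, 0.125f, 0.0f};
 *   float cutoff = 0.0f;   // samples below this value are clamped up to it
 *   float result = 0.0f;
 *   volk_32f_x3_sum_of_poly_32f_generic(&result, samples, coeffs, &cutoff, 6);
 */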