#ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
#define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
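/*
 * This kernel rotates a complex-float vector by a continuously advancing
 * phasor: out[n] = in[n] * phase, with phase multiplied by phase_inc after
 * every sample.  phase_inc is expected to have unit magnitude; the kernels
 * periodically renormalize the running phase, not the increment.
 *
 * Illustrative usage via the generated dispatcher (a sketch, not part of this
 * header; in_buf, out_buf, num_samples and the normalized frequency f are
 * placeholders):
 *
 *   lv_32fc_t phase = lv_cmake(1.f, 0.f);
 *   const lv_32fc_t phase_inc = lv_cmake(cosf(2.f * (float)M_PI * f),
 *                                        sinf(2.f * (float)M_PI * f));
 *   volk_32fc_s32fc_x2_rotator_32fc(out_buf, in_buf, phase_inc, &phase, num_samples);
 */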
#define ROTATOR_RELOAD 512
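/*
 * ROTATOR_RELOAD sets how many iterations the SIMD kernels run between
 * renormalizations of the accumulated phase; without the periodic rescale to
 * unit magnitude, rounding error in the repeated complex multiplies would let
 * the phasor's magnitude drift.
 */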
#ifdef LV_HAVE_GENERIC
    *outVector++ = *inVector++ * (*phase);
    (*phase) *= phase_inc;

    *outVector++ = *inVector++ * (*phase);
    (*phase) *= phase_inc;
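/*
 * The two identical statement pairs above are the per-sample body of the
 * generic kernel: each output sample is the input rotated by the running
 * phase, and the phase is then advanced by phase_inc.  In the surrounding
 * (elided) loops the body appears once inside the ROTATOR_RELOAD-blocked loop
 * and once more for the samples left over after the last full block.
 */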
#include <arm_neon.h>
    const lv_32fc_t* inputVectorPtr = inVector;
    lv_32fc_t phasePtr[4] = {(*phase), (*phase), (*phase), (*phase)};
    float32x4x2_t input_vec;
    float32x4x2_t output_vec;

    unsigned int i = 0, j = 0;
    const unsigned int quarter_points = num_points / 4;
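    /* The loop below seeds phasePtr with four consecutive phase values
       (phase, phase*inc, phase*inc^2, phase*inc^3) so one NEON vector can
       rotate four samples at once; incr (initialized in an elided line) ends
       up as phase_inc^4, the step applied to the whole phase vector each
       iteration. */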
    for(i = 0; i < 4; ++i) {
    const lv_32fc_t incrPtr[4] = {incr, incr, incr, incr};
    const float32x4x2_t incr_vec = vld2q_f32((float*) incrPtr);
    float32x4x2_t phase_vec = vld2q_f32((float*) phasePtr);

    input_vec = vld2q_f32((float*) inputVectorPtr);
    /* ... */
    vst2q_f32((float*)outputVectorPtr, output_vec);
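    /* After each ROTATOR_RELOAD block the phase vector is rescaled to unit
       magnitude; inv_mag (computed in elided lines) holds the reciprocal of
       the current phasor magnitudes. */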
    phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
    phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
    input_vec = vld2q_f32((float*) inputVectorPtr);
    /* ... */
    vst2q_f32((float*)outputVectorPtr, output_vec);

    phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
    phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);

    vst2q_f32((float*)phasePtr, phase_vec);
    for(i = 0; i < num_points % 4; i++) {
        *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0];
        phasePtr[0] *= (phase_inc);
    }

    (*phase) = phasePtr[0];
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector,
                                                            const lv_32fc_t phase_inc, lv_32fc_t* phase,
                                                            unsigned int num_points){
    lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};

    unsigned int i, j = 0;

    for(i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);

    const unsigned int halfPoints = num_points / 2;
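    /* Main loop body (the enclosing ROTATOR_RELOAD loop structure is elided
       in this excerpt).  This is the usual SSE3-style complex multiply:
       moveldup/movehdup duplicate the real and imaginary parts of the phase
       (and of the per-vector increment), the 0xB1 shuffle swaps
       real/imaginary within each complex value, and addsub combines the
       partial products.  z receives the two rotated samples, and the same
       pattern advances phase_Val by inc_Val. */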
    aVal = _mm_load_ps((float*)aPtr);

    yl = _mm_moveldup_ps(phase_Val);
    yh = _mm_movehdup_ps(phase_Val);
    ylp = _mm_moveldup_ps(inc_Val);
    yhp = _mm_movehdup_ps(inc_Val);

    tmp1 = _mm_mul_ps(aVal, yl);
    tmp1p = _mm_mul_ps(phase_Val, ylp);

    aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
    phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
    tmp2 = _mm_mul_ps(aVal, yh);
    tmp2p = _mm_mul_ps(phase_Val, yhp);

    z = _mm_addsub_ps(tmp1, tmp2);
    phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

    _mm_store_ps((float*)cPtr, z);
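    /* Renormalize the two packed phases to unit magnitude: squaring and hadd
       yield |phase|^2 for each complex value, the 0xD8 shuffle spreads those
       magnitudes back across the real and imaginary lanes, and the sqrt/div
       rescales phase_Val.  This runs once per ROTATOR_RELOAD block to stop
       rounding error from growing or shrinking the phasor. */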
    tmp1 = _mm_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm_hadd_ps(tmp1, tmp1);
    tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm_sqrt_ps(tmp1);
    phase_Val = _mm_div_ps(phase_Val, tmp2);

    aVal = _mm_load_ps((float*)aPtr);

    yl = _mm_moveldup_ps(phase_Val);
    yh = _mm_movehdup_ps(phase_Val);
    ylp = _mm_moveldup_ps(inc_Val);
    yhp = _mm_movehdup_ps(inc_Val);

    tmp1 = _mm_mul_ps(aVal, yl);
    tmp1p = _mm_mul_ps(phase_Val, ylp);

    aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
    phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
    tmp2 = _mm_mul_ps(aVal, yh);
    tmp2p = _mm_mul_ps(phase_Val, yhp);

    z = _mm_addsub_ps(tmp1, tmp2);
    phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

    _mm_store_ps((float*)cPtr, z);

    tmp1 = _mm_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm_hadd_ps(tmp1, tmp1);
    tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm_sqrt_ps(tmp1);
    phase_Val = _mm_div_ps(phase_Val, tmp2);
    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    if (num_points & 1) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector,
                                                            const lv_32fc_t phase_inc, lv_32fc_t* phase,
                                                            unsigned int num_points){
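    /* Identical to the aligned SSE4.1 kernel above except that the input and
       output buffers are accessed with unaligned loads and stores
       (_mm_loadu_ps / _mm_storeu_ps). */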
    lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};

    unsigned int i, j = 0;

    for(i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;

    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);

    const unsigned int halfPoints = num_points / 2;
    aVal = _mm_loadu_ps((float*)aPtr);

    yl = _mm_moveldup_ps(phase_Val);
    yh = _mm_movehdup_ps(phase_Val);
    ylp = _mm_moveldup_ps(inc_Val);
    yhp = _mm_movehdup_ps(inc_Val);

    tmp1 = _mm_mul_ps(aVal, yl);
    tmp1p = _mm_mul_ps(phase_Val, ylp);

    aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
    phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
    tmp2 = _mm_mul_ps(aVal, yh);
    tmp2p = _mm_mul_ps(phase_Val, yhp);

    z = _mm_addsub_ps(tmp1, tmp2);
    phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

    _mm_storeu_ps((float*)cPtr, z);

    tmp1 = _mm_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm_hadd_ps(tmp1, tmp1);
    tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm_sqrt_ps(tmp1);
    phase_Val = _mm_div_ps(phase_Val, tmp2);

    aVal = _mm_loadu_ps((float*)aPtr);

    yl = _mm_moveldup_ps(phase_Val);
    yh = _mm_movehdup_ps(phase_Val);
    ylp = _mm_moveldup_ps(inc_Val);
    yhp = _mm_movehdup_ps(inc_Val);

    tmp1 = _mm_mul_ps(aVal, yl);
    tmp1p = _mm_mul_ps(phase_Val, ylp);

    aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
    phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
    tmp2 = _mm_mul_ps(aVal, yh);
    tmp2p = _mm_mul_ps(phase_Val, yhp);

    z = _mm_addsub_ps(tmp1, tmp2);
    phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

    _mm_storeu_ps((float*)cPtr, z);

    tmp1 = _mm_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm_hadd_ps(tmp1, tmp1);
    tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm_sqrt_ps(tmp1);
    phase_Val = _mm_div_ps(phase_Val, tmp2);
    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    if (num_points & 1) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
#include <immintrin.h>
    lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};

    unsigned int i, j = 0;

    for(i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;

    __m256 aVal, phase_Val, z;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);

    const unsigned int fourthPoints = num_points / 4;
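    /* Each __m256 register holds four complex-float samples.  The lines that
       perform the actual rotation and phase update between each load and
       store below are elided from this excerpt. */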
    aVal = _mm256_load_ps((float*)aPtr);
    /* ... */
    _mm256_store_ps((float*)cPtr, z);

    aVal = _mm256_load_ps((float*)aPtr);
    /* ... */
    _mm256_store_ps((float*)cPtr, z);

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    (*phase) = phase_Ptr[0];
#include <immintrin.h>
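/* Unaligned AVX variant: mirrors the aligned AVX kernel above but uses
   _mm256_loadu_ps / _mm256_storeu_ps on the input and output buffers. */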
    lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};

    unsigned int i, j = 0;

    for(i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;

    __m256 aVal, phase_Val, z;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);

    const unsigned int fourthPoints = num_points / 4;
    aVal = _mm256_loadu_ps((float*)aPtr);
    /* ... */
    _mm256_storeu_ps((float*)cPtr, z);

    aVal = _mm256_loadu_ps((float*)aPtr);
    /* ... */
    _mm256_storeu_ps((float*)cPtr, z);

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    (*phase) = phase_Ptr[0];
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>
static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector,
                                                             const lv_32fc_t phase_inc, lv_32fc_t* phase,
                                                             unsigned int num_points){
    unsigned int i, j = 0;

    for(i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_load_ps((float*)phase_Ptr);

    const unsigned int fourthPoints = num_points / 4;
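    /* Same complex-multiply dance as the SSE4.1 kernels, but the final
       multiply-and-addsub is fused into _mm256_fmaddsub_ps.  tmp1 and tmp1p
       are copies of aVal and phase_Val taken in lines elided from this
       excerpt, before the 0xB1 shuffle overwrites them. */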
    aVal = _mm256_load_ps((float*)aPtr);

    yl = _mm256_moveldup_ps(phase_Val);
    yh = _mm256_movehdup_ps(phase_Val);
    ylp = _mm256_moveldup_ps(inc_Val);
    yhp = _mm256_movehdup_ps(inc_Val);

    aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
    phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
    tmp2 = _mm256_mul_ps(aVal, yh);
    tmp2p = _mm256_mul_ps(phase_Val, yhp);

    z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
    phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

    _mm256_store_ps((float*)cPtr, z);
    tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm256_hadd_ps(tmp1, tmp1);
    tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm256_sqrt_ps(tmp1);
    phase_Val = _mm256_div_ps(phase_Val, tmp2);

    aVal = _mm256_load_ps((float*)aPtr);

    yl = _mm256_moveldup_ps(phase_Val);
    yh = _mm256_movehdup_ps(phase_Val);
    ylp = _mm256_moveldup_ps(inc_Val);
    yhp = _mm256_movehdup_ps(inc_Val);

    aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
    phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
    tmp2 = _mm256_mul_ps(aVal, yh);
    tmp2p = _mm256_mul_ps(phase_Val, yhp);

    z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
    phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

    _mm256_store_ps((float*)cPtr, z);

    tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm256_hadd_ps(tmp1, tmp1);
    tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm256_sqrt_ps(tmp1);
    phase_Val = _mm256_div_ps(phase_Val, tmp2);

    _mm256_store_ps((float*)phase_Ptr, phase_Val);
    for(i = 0; i < num_points%4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>
static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector,
                                                             const lv_32fc_t phase_inc, lv_32fc_t* phase,
                                                             unsigned int num_points){
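    /* Unaligned counterpart of the AVX+FMA kernel above; it differs only in
       using unaligned loads and stores. */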
    lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};

    unsigned int i, j = 0;

    for(i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);

    const unsigned int fourthPoints = num_points / 4;
    aVal = _mm256_loadu_ps((float*)aPtr);

    yl = _mm256_moveldup_ps(phase_Val);
    yh = _mm256_movehdup_ps(phase_Val);
    ylp = _mm256_moveldup_ps(inc_Val);
    yhp = _mm256_movehdup_ps(inc_Val);

    aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
    phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
    tmp2 = _mm256_mul_ps(aVal, yh);
    tmp2p = _mm256_mul_ps(phase_Val, yhp);

    z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
    phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

    _mm256_storeu_ps((float*)cPtr, z);

    tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm256_hadd_ps(tmp1, tmp1);
    tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm256_sqrt_ps(tmp1);
    phase_Val = _mm256_div_ps(phase_Val, tmp2);

    aVal = _mm256_loadu_ps((float*)aPtr);

    yl = _mm256_moveldup_ps(phase_Val);
    yh = _mm256_movehdup_ps(phase_Val);
    ylp = _mm256_moveldup_ps(inc_Val);
    yhp = _mm256_movehdup_ps(inc_Val);

    aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
    phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
    tmp2 = _mm256_mul_ps(aVal, yh);
    tmp2p = _mm256_mul_ps(phase_Val, yhp);

    z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
    phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

    _mm256_storeu_ps((float*)cPtr, z);

    tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm256_hadd_ps(tmp1, tmp1);
    tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm256_sqrt_ps(tmp1);
    phase_Val = _mm256_div_ps(phase_Val, tmp2);

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    for(i = 0; i < num_points%4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];