#ifndef INCLUDED_volk_32fc_index_max_32u_a_H
#define INCLUDED_volk_32fc_index_max_32u_a_H
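/*
 * Finds the index of the complex sample with the largest magnitude and
 * writes it to target[0]. All comparisons below are done on |z|^2, so
 * no square root is ever taken.
 *
 * Minimal usage sketch (assumes the usual VOLK dispatcher is available
 * and that `samples` points to num_points lv_32fc_t values):
 *
 *   uint32_t index = 0;
 *   volk_32fc_index_max_32u(&index, samples, num_points);
 *   // samples[index] now holds the strongest sample
 */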
#include <inttypes.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0,
                               uint32_t num_points)
{
  const uint32_t num_bytes = num_points*8;
  union bit256 holderf;
  union bit256 holderi;
  float sq_dist = 0.0;

  union bit256 xmm5, xmm4;
  __m256 xmm1, xmm2, xmm3;
  __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
  xmm5.int_vec = xmmfive = _mm256_setzero_si256();
  xmm4.int_vec = xmmfour = _mm256_setzero_si256();
  holderf.int_vec = holder0 = _mm256_setzero_si256();
  holderi.int_vec = holder1 = _mm256_setzero_si256();
  int bound = num_bytes >> 6;
  int i = 0;
  xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); /* candidate indices for the current 8 samples */
  xmm9 = _mm256_setzero_si256();                   /* indices of the running per-lane maxima */
  xmm10 = _mm256_set1_epi32(8);                    /* index increment per iteration */
  xmm3 = _mm256_setzero_ps();                      /* running per-lane maximum of |z|^2 */
  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
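  /*
   * Each iteration squares the real and imaginary parts of 8 complex
   * samples, horizontally adds them into 8 magnitudes (the permute
   * restores sample order after _mm256_hadd_ps interleaves the two
   * 128-bit halves), updates the per-lane running maximum, and uses the
   * EQ/LT compare masks to keep either the new candidate index (xmm8)
   * or the previously stored index (xmm9) in each lane.
   */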
  for(; i < bound; ++i) {
    xmm1 = _mm256_load_ps((float*)src0);
    xmm2 = _mm256_load_ps((float*)&src0[4]);

    src0 += 8;

    xmm1 = _mm256_mul_ps(xmm1, xmm1);
    xmm2 = _mm256_mul_ps(xmm2, xmm2);

    xmm1 = _mm256_hadd_ps(xmm1, xmm2);
    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }
  /* Handle a remaining group of 4 complex samples, if any. */
  xmm10 = _mm256_set1_epi32(4);
  if (num_bytes >> 5 & 1) {
    xmm1 = _mm256_load_ps((float*)src0);

    xmm1 = _mm256_mul_ps(xmm1, xmm1);

    src0 += 4;

    xmm1 = _mm256_hadd_ps(xmm1, xmm1);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }
  /* Handle a remaining pair of complex samples, if any. */
  idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
  xmm10 = _mm256_set1_epi32(2);
  if (num_bytes >> 4 & 1) {
    xmm2 = _mm256_load_ps((float*)src0);

    xmm2 = _mm256_mul_ps(xmm2, xmm2);

    xmm1 = _mm256_hadd_ps(xmm2, xmm2);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }
  _mm256_store_ps((float*)&(holderf.f), xmm3);
  _mm256_store_si256(&(holderi.int_vec), xmm9);
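  /*
   * Reduce the 8 per-lane maxima to a single winner: sweep the stored
   * magnitudes and keep the index belonging to the largest one.
   */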
  target[0] = holderi.i[0];
  sq_dist = holderf.f[0];
  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
  target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
  sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
  target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
  sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
  target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
  sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
  target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
  sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_SSE3
#include <xmmintrin.h>
#include <pmmintrin.h>
static inline void
volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0,
                               uint32_t num_points)
{
  const uint32_t num_bytes = num_points*8;
  union bit128 holderf;
  union bit128 holderi;
  float sq_dist = 0.0;

  union bit128 xmm5, xmm4;
  __m128 xmm1, xmm2, xmm3;
  __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
  xmm5.int_vec = xmmfive = _mm_setzero_si128();
  xmm4.int_vec = xmmfour = _mm_setzero_si128();
  holderf.int_vec = holder0 = _mm_setzero_si128();
  holderi.int_vec = holder1 = _mm_setzero_si128();
  int bound = num_bytes >> 5;
  int i = 0;
  xmm8 = _mm_set_epi32(3, 2, 1, 0);  /* candidate indices for the current 4 samples */
  xmm9 = _mm_setzero_si128();        /* indices of the running per-lane maxima */
  xmm10 = _mm_set_epi32(4, 4, 4, 4); /* index increment per iteration */
  xmm3 = _mm_setzero_ps();           /* running per-lane maximum of |z|^2 */
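  /*
   * Same scheme as the AVX2 path, four complex samples at a time:
   * square the components of two loads, _mm_hadd_ps them into four
   * magnitudes, update the running maxima, and blend candidate/stored
   * indices through the EQ/LT compare masks.
   */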
  for(; i < bound; ++i) {
    xmm1 = _mm_load_ps((float*)src0);
    xmm2 = _mm_load_ps((float*)&src0[2]);

    src0 += 4;

    xmm1 = _mm_mul_ps(xmm1, xmm1);
    xmm2 = _mm_mul_ps(xmm2, xmm2);

    xmm1 = _mm_hadd_ps(xmm1, xmm2);

    xmm3 = _mm_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

    xmm9 = _mm_add_epi32(xmm11, xmm12);

    xmm8 = _mm_add_epi32(xmm8, xmm10);
  }
  /* Handle a remaining pair of complex samples, if any. */
  if (num_bytes >> 4 & 1) {
    xmm2 = _mm_load_ps((float*)src0);

    src0 += 2;

    xmm2 = _mm_mul_ps(xmm2, xmm2);

    xmm1 = _mm_hadd_ps(xmm2, xmm2);

    xmm3 = _mm_max_ps(xmm1, xmm3);

    xmm10 = _mm_set_epi32(2, 2, 2, 2);

    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

    xmm9 = _mm_add_epi32(xmm11, xmm12);

    xmm8 = _mm_add_epi32(xmm8, xmm10);
  }
  /* Handle a final single complex sample, if any. */
  if (num_bytes >> 3 & 1) {
    sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);

    xmm2 = _mm_load1_ps(&sq_dist);

    xmm1 = xmm3;

    xmm3 = _mm_max_ss(xmm3, xmm2);

    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

    xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

    xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
    xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

    xmm9 = _mm_add_epi32(xmm11, xmm12);
  }
  _mm_store_ps((float*)&(holderf.f), xmm3);
  _mm_store_si128(&(holderi.int_vec), xmm9);
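  /* Reduce the 4 per-lane maxima to the single winning index. */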
  target[0] = holderi.i[0];
  sq_dist = holderf.f[0];
  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
}

#endif /*LV_HAVE_SSE3*/

#ifdef LV_HAVE_GENERIC
static inline void
volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0,
                                uint32_t num_points)
{
  const uint32_t num_bytes = num_points*8;

  uint32_t i = 0;
  uint32_t index = 0;
  float sq_dist = 0.0;
  float max = 0.0;
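  /*
   * Reference implementation: compare squared magnitudes with a strict
   * '>', so the first occurrence of the maximum wins on ties.
   */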
  for(; i < num_bytes >> 3; ++i) {
    sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);

    index = sq_dist > max ? i : index;
    max = sq_dist > max ? sq_dist : max;
  }
  target[0] = index;
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_32fc_index_max_32u_a_H*/
#ifndef INCLUDED_volk_32fc_index_max_32u_u_H
#define INCLUDED_volk_32fc_index_max_32u_u_H

#include <inttypes.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void
volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0,
                               uint32_t num_points)
{
  const uint32_t num_bytes = num_points*8;
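  /*
   * Unaligned variant: identical to volk_32fc_index_max_32u_a_avx2
   * above except that it uses unaligned loads/stores (loadu/storeu),
   * so src0 does not have to be 32-byte aligned.
   */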
  union bit256 holderf;
  union bit256 holderi;
  float sq_dist = 0.0;

  union bit256 xmm5, xmm4;
  __m256 xmm1, xmm2, xmm3;
  __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
  xmm5.int_vec = xmmfive = _mm256_setzero_si256();
  xmm4.int_vec = xmmfour = _mm256_setzero_si256();
  holderf.int_vec = holder0 = _mm256_setzero_si256();
  holderi.int_vec = holder1 = _mm256_setzero_si256();
  int bound = num_bytes >> 6;
  int i = 0;
  xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  xmm9 = _mm256_setzero_si256();
  xmm10 = _mm256_set1_epi32(8);
  xmm3 = _mm256_setzero_ps();
  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
  for(; i < bound; ++i) {
    xmm1 = _mm256_loadu_ps((float*)src0);
    xmm2 = _mm256_loadu_ps((float*)&src0[4]);

    src0 += 8;

    xmm1 = _mm256_mul_ps(xmm1, xmm1);
    xmm2 = _mm256_mul_ps(xmm2, xmm2);

    xmm1 = _mm256_hadd_ps(xmm1, xmm2);
    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }
  /* Handle a remaining group of 4 complex samples, if any. */
  xmm10 = _mm256_set1_epi32(4);
  if (num_bytes >> 5 & 1) {
    xmm1 = _mm256_loadu_ps((float*)src0);

    xmm1 = _mm256_mul_ps(xmm1, xmm1);

    src0 += 4;

    xmm1 = _mm256_hadd_ps(xmm1, xmm1);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }
  /* Handle a remaining pair of complex samples, if any. */
  idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
  xmm10 = _mm256_set1_epi32(2);
  if (num_bytes >> 4 & 1) {
    xmm2 = _mm256_loadu_ps((float*)src0);

    xmm2 = _mm256_mul_ps(xmm2, xmm2);

    xmm1 = _mm256_hadd_ps(xmm2, xmm2);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }
  _mm256_storeu_ps((float*)&(holderf.f), xmm3);
  _mm256_storeu_si256(&(holderi.int_vec), xmm9);
  target[0] = holderi.i[0];
  sq_dist = holderf.f[0];
  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
  target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
  sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
  target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
  sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
  target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
  sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
  target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
  sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void
volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0,
                             uint32_t num_points)
{
  unsigned int number = 0;
  const uint32_t quarter_points = num_points / 4;
  lv_32fc_t* src0Ptr = src0;
  uint32_t indices[4] = {0, 1, 2, 3};
  const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
  uint32x4_t vec_indices = vld1q_u32(indices);
  uint32x4_t vec_max_indices = vec_indices;
  uint32_t index = 0;
  float max = *src0Ptr;

  float32x4_t vec_max = vdupq_n_f32(*src0Ptr);
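  /*
   * Track a running maximum and its index in each of the four NEON
   * lanes, then reduce the four lane results after the loop. The
   * remaining 0-3 samples are handled by the scalar tail loop below.
   */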
  for(; number < quarter_points; number++)
  {
    /* Load 4 complex samples deinterleaved into re/im vectors and form |z|^2 per lane. */
    const float32x4x2_t vec_cplx = vld2q_f32((float*)src0Ptr);
    const float32x4_t vec_mag2 =
        vmlaq_f32(vmulq_f32(vec_cplx.val[0], vec_cplx.val[0]), vec_cplx.val[1], vec_cplx.val[1]);
    src0Ptr += 4;

    /* Keep the larger magnitude and its index in each lane. */
    const uint32x4_t gt_mask = vcgtq_f32(vec_mag2, vec_max);
    vec_max = vbslq_f32(gt_mask, vec_mag2, vec_max);
    vec_max_indices = vbslq_u32(gt_mask, vec_indices, vec_max_indices);
    vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
  }
  uint32_t tmp_max_indices[4];
  float tmp_max[4];
  vst1q_u32(tmp_max_indices, vec_max_indices);
  vst1q_f32(tmp_max, vec_max);
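  /* Reduce the four lane maxima/indices to a single scalar winner. */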
  for (int i = 0; i < 4; i++) {
    if (tmp_max[i] > max) {
      max = tmp_max[i];
      index = tmp_max_indices[i];
    }
  }
  /* Handle the remaining 0-3 samples with scalar code. */
  for(number = quarter_points * 4; number < num_points; number++)
  {
    const float re = lv_creal(*src0Ptr);
    const float im = lv_cimag(*src0Ptr);
    if ((re*re+im*im) > max) {