84 #ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
85 #define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
93 #ifdef LV_HAVE_GENERIC
/*
 * Generic (portable scalar) path: computes c[i] = a[i] + conj(b[i]) * scalar
 * one complex sample at a time (see the repeated statement below).
 * NOTE(review): this extract is incomplete — the enclosing function
 * signature, the aPtr/bPtr/cPtr pointer declarations, the `number -= 8;`
 * update inside the unrolled loop, and the closing braces are not visible
 * here; presumably lost in extraction. Confirm against upstream VOLK.
 */
99 unsigned int number = num_points;
/* Main loop, manually unrolled by 8 samples per iteration. */
102 while (number >= 8) {
103 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
104 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
105 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
106 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
107 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
108 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
109 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
110 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
/* Tail loop: handles the remaining (num_points % 8) samples. */
115 while (number-- > 0) {
116 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;
123 #include <immintrin.h>
/*
 * AVX path, unaligned loads/stores (_mm256_loadu_ps/_mm256_storeu_ps):
 * processes 4 complex float samples (8 floats) per iteration.
 * NOTE(review): incomplete extract — the function signature, the a/b/c
 * pointer declarations and their per-iteration increments, the
 * declarations of x/y/z/s, and the shuffle/multiply sequence that
 * computes z = conj(b) * scalar before the visible add are all missing
 * here; presumably lost in extraction. Confirm against upstream VOLK.
 */
127 unsigned int number = 0;
129 const unsigned int quarterPoints = num_points / 4;
/* Number of leftover samples (num_points mod 4) for the scalar tail. */
130 unsigned int isodd = num_points & 3;
/* Broadcast the complex scalar into all 4 lanes of a 256-bit register. */
133 lv_32fc_t v_scalar[4] = {scalar, scalar, scalar, scalar};
140 s = _mm256_loadu_ps((
float*)v_scalar);
142 for(;number < quarterPoints; number++) {
143 x = _mm256_loadu_ps((
float*)b);
144 y = _mm256_loadu_ps((
float*)a);
/* z holds conj(b) * scalar here (computed by lines missing from this
   extract); add the a vector to finish c = a + conj(b) * scalar. */
146 z = _mm256_add_ps(y, z);
147 _mm256_storeu_ps((
float*)c,z);
/* Scalar tail loop for the last `isodd` samples. */
154 for(
i = num_points-isodd;
i < num_points;
i++) {
155 *c++ = (*a++) +
lv_conj(*b++) * scalar;
162 #include <pmmintrin.h>
/*
 * SSE3 path, unaligned loads/stores (_mm_loadu_ps/_mm_storeu_ps):
 * processes 2 complex float samples (4 floats) per iteration.
 * NOTE(review): incomplete extract — the function signature, the a/b/c
 * pointer declarations and increments, the x/y/z/s declarations, and the
 * SSE3 complex-conjugate-multiply sequence that produces z before the
 * visible add are missing; presumably lost in extraction.
 */
166 unsigned int number = 0;
167 const unsigned int halfPoints = num_points / 2;
/* Broadcast the complex scalar into both lanes of a 128-bit register. */
170 lv_32fc_t v_scalar[2] = {scalar, scalar};
177 s = _mm_loadu_ps((
float*)v_scalar);
179 for(;number < halfPoints; number++){
180 x = _mm_loadu_ps((
float*)b);
181 y = _mm_loadu_ps((
float*)a);
/* z holds conj(b) * scalar (computed by lines missing from this
   extract); add a to finish c = a + conj(b) * scalar. */
183 z = _mm_add_ps(y, z);
184 _mm_storeu_ps((
float*)c,z);
/* Odd sample count: handle the final sample with scalar code. */
191 if((num_points % 2) != 0) {
192 *c = *a +
lv_conj(*b) * scalar;
199 #include <immintrin.h>
/*
 * AVX path, 32-byte ALIGNED loads/stores (_mm256_load_ps/_mm256_store_ps):
 * identical math to the unaligned AVX path, 4 complex samples per
 * iteration; callers must supply aligned buffers.
 * NOTE(review): incomplete extract — function signature, a/b/c pointer
 * declarations/increments, x/y/z/s declarations, and the conjugate
 * multiply computing z before the visible add are missing; presumably
 * lost in extraction. Confirm against upstream VOLK.
 */
203 unsigned int number = 0;
205 const unsigned int quarterPoints = num_points / 4;
/* Number of leftover samples (num_points mod 4) for the scalar tail. */
206 unsigned int isodd = num_points & 3;
/* Broadcast the complex scalar into all 4 lanes of a 256-bit register. */
209 lv_32fc_t v_scalar[4] = {scalar, scalar, scalar, scalar};
216 s = _mm256_load_ps((
float*)v_scalar);
218 for(;number < quarterPoints; number++) {
219 x = _mm256_load_ps((
float*)b);
220 y = _mm256_load_ps((
float*)a);
/* z holds conj(b) * scalar here; add a to finish the computation. */
222 z = _mm256_add_ps(y, z);
223 _mm256_store_ps((
float*)c,z);
/* Scalar tail loop for the last `isodd` samples. */
230 for(
i = num_points-isodd;
i < num_points;
i++) {
231 *c++ = (*a++) +
lv_conj(*b++) * scalar;
238 #include <pmmintrin.h>
/*
 * SSE3 path, 16-byte ALIGNED loads/stores (_mm_load_ps/_mm_store_ps):
 * identical math to the unaligned SSE3 path, 2 complex samples per
 * iteration; callers must supply aligned buffers.
 * NOTE(review): incomplete extract — function signature, a/b/c pointer
 * declarations/increments, x/y/z/s declarations, and the conjugate
 * multiply computing z before the visible add are missing; presumably
 * lost in extraction.
 */
242 unsigned int number = 0;
243 const unsigned int halfPoints = num_points / 2;
/* Broadcast the complex scalar into both lanes of a 128-bit register. */
246 lv_32fc_t v_scalar[2] = {scalar, scalar};
253 s = _mm_load_ps((
float*)v_scalar);
255 for(;number < halfPoints; number++){
256 x = _mm_load_ps((
float*)b);
257 y = _mm_load_ps((
float*)a);
/* z holds conj(b) * scalar; add a to finish c = a + conj(b) * scalar. */
259 z = _mm_add_ps(y, z);
260 _mm_store_ps((
float*)c,z);
/* Odd sample count: handle the final sample with scalar code. */
267 if((num_points % 2) != 0) {
268 *c = *a +
lv_conj(*b) * scalar;
275 #include <arm_neon.h>
/*
 * NEON path: processes 4 complex float samples per iteration using
 * deinterleaved (planar) real/imag registers via vld2q/vst2q.
 * NOTE(review): incomplete extract — the function signature, the
 * aPtr/bPtr/cPtr declarations, their per-iteration increments, and the
 * closing braces are not visible here; presumably lost in extraction.
 */
281 unsigned int number = num_points;
282 unsigned int quarter_points = num_points / 4;
/* .val[0] = real plane, .val[1] = imaginary plane. */
284 float32x4x2_t a_val, b_val, c_val, scalar_val;
285 float32x4x2_t tmp_val;
/* Broadcast scalar: val[0] <- real(scalar), val[1] <- imag(scalar)
   (the two consecutive floats of the complex scalar). */
287 scalar_val.val[0] = vld1q_dup_f32((
const float*)&scalar);
288 scalar_val.val[1] = vld1q_dup_f32(((
const float*)&scalar) + 1);
290 for(number = 0; number < quarter_points; ++number) {
/* vld2q deinterleaves 4 complex samples into real/imag planes. */
291 a_val = vld2q_f32((
float*)aPtr);
292 b_val = vld2q_f32((
float*)bPtr);
/* Negate imag(b): b_val now holds conj(b). */
293 b_val.val[1] = vnegq_f32(b_val.val[1]);
/* Complex multiply conj(b) * scalar:
   imag = imag(b')*real(s) + real(b')*imag(s)
   real = real(b')*real(s) - imag(b')*imag(s) */
297 tmp_val.val[1] = vmulq_f32(b_val.val[1], scalar_val.val[0]);
298 tmp_val.val[0] = vmulq_f32(b_val.val[0], scalar_val.val[0]);
300 tmp_val.val[1] = vmlaq_f32(tmp_val.val[1], b_val.val[0], scalar_val.val[1]);
301 tmp_val.val[0] = vmlsq_f32(tmp_val.val[0], b_val.val[1], scalar_val.val[1]);
/* c = a + conj(b) * scalar, per plane. */
303 c_val.val[1] = vaddq_f32(a_val.val[1], tmp_val.val[1]);
304 c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]);
/* vst2q re-interleaves the planes back into complex samples. */
306 vst2q_f32((
float*)cPtr, c_val);
/* Scalar tail loop for the remaining (num_points % 4) samples. */
313 for(number = quarter_points*4; number < num_points; number++){
314 *cPtr++ = (*aPtr++) +
lv_conj(*bPtr++) * scalar;