volk_32fc_x2_square_dist_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

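/*
 * Overview: every protokernel below computes, for each complex point in
 * `points`, the squared Euclidean distance to the single complex value
 * addressed by `src0`:
 *
 *   target[i] = |src0[0] - points[i]|^2
 *             = (re(src0[0]) - re(points[i]))^2 + (im(src0[0]) - im(points[i]))^2
 *
 * A minimal usage sketch, assuming the generated dispatcher
 * volk_32fc_x2_square_dist_32f() and VOLK's volk_malloc()/volk_free()
 * helpers; the buffer sizes and the decoder scenario are illustrative,
 * not taken from this header:
 *
 * \code
 *   #include <volk/volk.h>
 *
 *   unsigned int N = 64;
 *   size_t alignment = volk_get_alignment();
 *   lv_32fc_t* points = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
 *   lv_32fc_t* symbol = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t), alignment);
 *   float* dists = (float*)volk_malloc(N * sizeof(float), alignment);
 *
 *   // fill `points` with the candidate constellation points and `symbol[0]`
 *   // with the received sample, then:
 *   volk_32fc_x2_square_dist_32f(dists, symbol, points, N);
 *
 *   // dists[i] now holds |symbol[0] - points[i]|^2; the index of the
 *   // minimum is the nearest constellation point.
 *
 *   volk_free(points);
 *   volk_free(symbol);
 *   volk_free(dists);
 * \endcode
 */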
#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
#define INCLUDED_volk_32fc_x2_square_dist_32f_a_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

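/*
 * AVX2 protokernel (aligned loads/stores). The reference value in src0 is
 * replicated across a 256-bit register, eight points are processed per
 * iteration (subtract, square, horizontally add the real/imaginary pairs,
 * then permute the partial sums back into point order), and the tail loops
 * handle a remaining group of 4, 2 and finally 1 point.
 */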
static inline void
volk_32fc_x2_square_dist_32f_a_avx2(float* target, lv_32fc_t* src0, lv_32fc_t* points,
                                    unsigned int num_points)
{
  const unsigned int num_bytes = num_points*8;
  __m128 xmm0, xmm9, xmm10;
  __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

  lv_32fc_t diff;
  float sq_dist;
  int bound = num_bytes >> 6;
  int leftovers0 = (num_bytes >> 5) & 1;
  int leftovers1 = (num_bytes >> 4) & 1;
  int leftovers2 = (num_bytes >> 3) & 1;
  int i = 0;

  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
  xmm1 = _mm256_setzero_ps();
  xmm2 = _mm256_load_ps((float*)&points[0]);
  xmm0 = _mm_load_ps((float*)src0);
  xmm0 = _mm_permute_ps(xmm0, 0b01000100);
  xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
  xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
  xmm3 = _mm256_load_ps((float*)&points[4]);

  for(; i < bound; ++i) {
    xmm4 = _mm256_sub_ps(xmm1, xmm2);
    xmm5 = _mm256_sub_ps(xmm1, xmm3);
    points += 8;
    xmm6 = _mm256_mul_ps(xmm4, xmm4);
    xmm7 = _mm256_mul_ps(xmm5, xmm5);

    xmm2 = _mm256_load_ps((float*)&points[0]);

    xmm4 = _mm256_hadd_ps(xmm6, xmm7);
    xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

    xmm3 = _mm256_load_ps((float*)&points[4]);

    _mm256_store_ps(target, xmm4);

    target += 8;
  }

  for(i = 0; i < leftovers0; ++i) {
    xmm2 = _mm256_load_ps((float*)&points[0]);

    xmm4 = _mm256_sub_ps(xmm1, xmm2);

    points += 4;

    xmm6 = _mm256_mul_ps(xmm4, xmm4);

    xmm4 = _mm256_hadd_ps(xmm6, xmm6);
    xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

    xmm9 = _mm256_extractf128_ps(xmm4, 1);
    _mm_store_ps(target, xmm9);

    target += 4;
  }

  for(i = 0; i < leftovers1; ++i) {
    xmm9 = _mm_load_ps((float*)&points[0]);

    xmm10 = _mm_sub_ps(xmm0, xmm9);

    points += 2;

    xmm9 = _mm_mul_ps(xmm10, xmm10);

    xmm10 = _mm_hadd_ps(xmm9, xmm9);

    _mm_storeh_pi((__m64*)target, xmm10);

    target += 2;
  }

  for(i = 0; i < leftovers2; ++i) {
    diff = src0[0] - points[0];

    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

    target[0] = sq_dist;
  }
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_SSE3
#include <xmmintrin.h>
#include <pmmintrin.h>

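/*
 * SSE3 protokernel (aligned loads/stores). src0 is duplicated into both
 * halves of a 128-bit register and four points are processed per iteration,
 * with _mm_hadd_ps producing the four squared distances; the last 4-point
 * group is peeled out of the loop, and the trailing branches handle a
 * remaining pair and a final single point.
 */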
static inline void
volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points,
                                    unsigned int num_points)
{
  const unsigned int num_bytes = num_points*8;

  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

  lv_32fc_t diff;
  float sq_dist;
  int bound = num_bytes >> 5;
  int i = 0;

  xmm1 = _mm_setzero_ps();
  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
  xmm2 = _mm_load_ps((float*)&points[0]);
  xmm1 = _mm_movelh_ps(xmm1, xmm1);
  xmm3 = _mm_load_ps((float*)&points[2]);

  for(; i < bound - 1; ++i) {
    xmm4 = _mm_sub_ps(xmm1, xmm2);
    xmm5 = _mm_sub_ps(xmm1, xmm3);
    points += 4;
    xmm6 = _mm_mul_ps(xmm4, xmm4);
    xmm7 = _mm_mul_ps(xmm5, xmm5);

    xmm2 = _mm_load_ps((float*)&points[0]);

    xmm4 = _mm_hadd_ps(xmm6, xmm7);

    xmm3 = _mm_load_ps((float*)&points[2]);

    _mm_store_ps(target, xmm4);

    target += 4;
  }

  xmm4 = _mm_sub_ps(xmm1, xmm2);
  xmm5 = _mm_sub_ps(xmm1, xmm3);

  points += 4;
  xmm6 = _mm_mul_ps(xmm4, xmm4);
  xmm7 = _mm_mul_ps(xmm5, xmm5);

  xmm4 = _mm_hadd_ps(xmm6, xmm7);

  _mm_store_ps(target, xmm4);

  target += 4;

  if (num_bytes >> 4 & 1) {
    xmm2 = _mm_load_ps((float*)&points[0]);

    xmm4 = _mm_sub_ps(xmm1, xmm2);

    points += 2;

    xmm6 = _mm_mul_ps(xmm4, xmm4);

    xmm4 = _mm_hadd_ps(xmm6, xmm6);

    _mm_storeh_pi((__m64*)target, xmm4);

    target += 2;
  }

  if (num_bytes >> 3 & 1) {
    diff = src0[0] - points[0];

    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

    target[0] = sq_dist;
  }
}

#endif /*LV_HAVE_SSE3*/


#ifdef LV_HAVE_NEON
#include <arm_neon.h>
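/*
 * NEON protokernel. vld2q_f32 de-interleaves four complex points into
 * separate real and imaginary vectors, the broadcast components of src0 are
 * subtracted and squared, and the scalar loop cleans up the remaining
 * (num_points % 4) points.
 */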
static inline void
volk_32fc_x2_square_dist_32f_neon(float* target, lv_32fc_t* src0, lv_32fc_t* points,
                                  unsigned int num_points)
{
  const unsigned int quarter_points = num_points / 4;
  unsigned int number;

  float32x4x2_t a_vec, b_vec;
  float32x4x2_t diff_vec;
  float32x4_t tmp, tmp1, dist_sq;
  a_vec.val[0] = vdupq_n_f32( lv_creal(src0[0]) );
  a_vec.val[1] = vdupq_n_f32( lv_cimag(src0[0]) );
  for(number=0; number < quarter_points; ++number) {
    b_vec = vld2q_f32((float*)points);
    diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]);
    diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]);
    tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]);
    tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]);

    dist_sq = vaddq_f32(tmp, tmp1);
    vst1q_f32(target, dist_sq);
    points += 4;
    target += 4;
  }
  for(number=quarter_points*4; number < num_points; ++number) {
    lv_32fc_t diff = src0[0] - *points++;
    *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
  }
}
#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_GENERIC
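/*
 * Generic reference implementation: one scalar squared distance per point.
 */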
static inline void
volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points,
                                     unsigned int num_points)
{
  const unsigned int num_bytes = num_points*8;

  lv_32fc_t diff;
  float sq_dist;
  unsigned int i = 0;

  for(; i < num_bytes >> 3; ++i) {
    diff = src0[0] - points[i];

    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

    target[i] = sq_dist;
  }
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/

#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_u_H
#define INCLUDED_volk_32fc_x2_square_dist_32f_u_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

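/*
 * AVX2 protokernel for unaligned buffers: same scheme as the aligned AVX2
 * version, but with unaligned loads/stores, a single 4-point tail branch and
 * a scalar loop for the last (num_points % 4) points.
 */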
static inline void
volk_32fc_x2_square_dist_32f_u_avx2(float* target, lv_32fc_t* src0, lv_32fc_t* points,
                                    unsigned int num_points)
{
  const unsigned int num_bytes = num_points*8;
  __m128 xmm0, xmm9;
  __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

  lv_32fc_t diff;
  float sq_dist;
  int bound = num_bytes >> 6;
  int leftovers1 = (num_bytes >> 3) & 0b11;
  int i = 0;

  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
  xmm1 = _mm256_setzero_ps();
  xmm0 = _mm_loadu_ps((float*)src0);
  xmm0 = _mm_permute_ps(xmm0, 0b01000100);
  xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
  xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);

  for(; i < bound; ++i) {
    xmm2 = _mm256_loadu_ps((float*)&points[0]);
    xmm3 = _mm256_loadu_ps((float*)&points[4]);
    xmm4 = _mm256_sub_ps(xmm1, xmm2);
    xmm5 = _mm256_sub_ps(xmm1, xmm3);
    points += 8;
    xmm6 = _mm256_mul_ps(xmm4, xmm4);
    xmm7 = _mm256_mul_ps(xmm5, xmm5);

    xmm4 = _mm256_hadd_ps(xmm6, xmm7);
    xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

    _mm256_storeu_ps(target, xmm4);

    target += 8;
  }

  if (num_bytes >> 5 & 1) {
    xmm2 = _mm256_loadu_ps((float*)&points[0]);

    xmm4 = _mm256_sub_ps(xmm1, xmm2);

    points += 4;

    xmm6 = _mm256_mul_ps(xmm4, xmm4);

    xmm4 = _mm256_hadd_ps(xmm6, xmm6);
    xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

    xmm9 = _mm256_extractf128_ps(xmm4, 1);
    _mm_storeu_ps(target, xmm9);

    target += 4;
  }

  for(i = 0; i < leftovers1; ++i) {
    diff = src0[0] - points[0];
    points += 1;

    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

    target[0] = sq_dist;
    target += 1;
  }
}

#endif /*LV_HAVE_AVX2*/

#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_u_H*/