Vector Optimized Library of Kernels 2.5.2
Architecture-tuned implementations of math kernels
volk_16i_32fc_dot_prod_32fc.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: GPL-3.0-or-later
8 */
9
45#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
46#define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
47
48#include <stdio.h>
49#include <volk/volk_common.h>
50
51
#ifdef LV_HAVE_GENERIC

/*!
 * \brief Dot product of real 16-bit samples with complex float taps:
 *        *result = sum_i taps[i] * (float)input[i].
 *
 * \param result     Pointer receiving the complex dot product.
 * \param input      Real-valued 16-bit input samples (num_points entries).
 * \param taps       Complex float taps (num_points entries).
 * \param num_points Number of input/taps pairs to process.
 *
 * NOTE(review): the signature line was lost in extraction; the name below
 * follows the VOLK <kernel>_generic convention — confirm against upstream.
 */
static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result,
                                                       const short* input,
                                                       const lv_32fc_t* taps,
                                                       unsigned int num_points)
{
    /* Four independent accumulators break the loop-carried add dependency
     * so consecutive iterations can overlap in the pipeline. */
    static const int N_UNROLL = 4;

    lv_32fc_t acc0 = 0;
    lv_32fc_t acc1 = 0;
    lv_32fc_t acc2 = 0;
    lv_32fc_t acc3 = 0;

    unsigned i = 0;
    unsigned n = (num_points / N_UNROLL) * N_UNROLL;

    for (i = 0; i < n; i += N_UNROLL) {
        acc0 += taps[i + 0] * (float)input[i + 0];
        acc1 += taps[i + 1] * (float)input[i + 1];
        acc2 += taps[i + 2] * (float)input[i + 2];
        acc3 += taps[i + 3] * (float)input[i + 3];
    }

    /* Fold in the 0..3 leftover points. */
    for (; i < num_points; i++) {
        acc0 += taps[i] * (float)input[i];
    }

    *result = acc0 + acc1 + acc2 + acc3;
}

#endif /*LV_HAVE_GENERIC*/
85
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief NEON version: processes 4 input/taps pairs per iteration, keeping
 *        the real and imaginary accumulators in separate SIMD registers
 *        (deinterleaved by vld2q_f32 / reinterleaved by vst2q_f32).
 *
 * \param result     Pointer receiving the complex dot product.
 * \param input      Real-valued 16-bit input samples (num_points entries).
 * \param taps       Complex float taps (num_points entries).
 * \param num_points Number of input/taps pairs to process.
 *
 * NOTE(review): the signature line was lost in extraction; the name below
 * follows the VOLK <kernel>_neon convention — confirm against upstream.
 */
static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result,
                                                    const short* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points)
{
    unsigned ii;
    unsigned quarter_points = num_points / 4;
    lv_32fc_t* tapsPtr = (lv_32fc_t*)taps;
    short* inputPtr = (short*)input;
    lv_32fc_t accumulator_vec[4];

    float32x4x2_t tapsVal, accumulator_val;
    int16x4_t input16;
    int32x4_t input32;
    float32x4_t input_float, prod_re, prod_im;

    accumulator_val.val[0] = vdupq_n_f32(0.0); /* real lanes */
    accumulator_val.val[1] = vdupq_n_f32(0.0); /* imaginary lanes */

    for (ii = 0; ii < quarter_points; ++ii) {
        /* vld2q_f32 deinterleaves 4 complex taps into re/im registers. */
        tapsVal = vld2q_f32((float*)tapsPtr);
        input16 = vld1_s16(inputPtr);
        // widen 16-bit int to 32-bit int
        input32 = vmovl_s16(input16);
        // convert 32-bit int to float (no scaling applied)
        input_float = vcvtq_f32_s32(input32);

        prod_re = vmulq_f32(input_float, tapsVal.val[0]);
        prod_im = vmulq_f32(input_float, tapsVal.val[1]);

        accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
        accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);

        tapsPtr += 4;
        inputPtr += 4;
    }
    /* Re-interleave the four partial complex sums and reduce them. */
    vst2q_f32((float*)accumulator_vec, accumulator_val);
    accumulator_vec[0] += accumulator_vec[1];
    accumulator_vec[2] += accumulator_vec[3];
    accumulator_vec[0] += accumulator_vec[2];

    /* Scalar tail for the 0..3 leftover points. */
    for (ii = quarter_points * 4; ii < num_points; ++ii) {
        accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
    }

    *result = accumulator_vec[0];
}

#endif /*LV_HAVE_NEON*/
138
#if LV_HAVE_SSE && LV_HAVE_MMX

/*!
 * \brief Unaligned SSE/MMX version: processes 8 input points (= 16 tap
 *        floats) per iteration. No alignment requirement on input or taps.
 *
 * \param result     Pointer receiving the complex dot product.
 * \param input      Real-valued 16-bit input samples (num_points entries).
 * \param taps       Complex float taps (num_points entries).
 * \param num_points Number of input/taps pairs to process.
 */
static inline void volk_16i_32fc_dot_prod_32fc_u_sse(lv_32fc_t* result,
                                                     const short* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    /* 8 16-bit input points are consumed per loop iteration. */
    const unsigned int eighthPoints = num_points / 8;

    /* res packs {real, imag} contiguously so it can be copied out as one
     * lv_32fc_t at the end. */
    float res[2];
    float *realpt = &res[0], *imagpt = &res[1];
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m64 m0, m1;
    __m128 f0, f1, f2, f3;
    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < eighthPoints; number++) {

        /* Gather 8 shorts into two MMX registers and widen to float. */
        m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
        m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
        f0 = _mm_cvtpi16_ps(m0);
        f1 = _mm_cvtpi16_ps(m0); /* deliberate duplicate of f0 */
        f2 = _mm_cvtpi16_ps(m1);
        f3 = _mm_cvtpi16_ps(m1); /* deliberate duplicate of f2 */

        /* Unpacking a register with its copy duplicates each sample,
         * e.g. a0Val = {s0, s0, s1, s1}, so one input sample lines up with
         * both the real and imaginary part of its complex tap. */
        a0Val = _mm_unpacklo_ps(f0, f1);
        a1Val = _mm_unpackhi_ps(f0, f1);
        a2Val = _mm_unpacklo_ps(f2, f3);
        a3Val = _mm_unpackhi_ps(f2, f3);

        b0Val = _mm_loadu_ps(bPtr);
        b1Val = _mm_loadu_ps(bPtr + 4);
        b2Val = _mm_loadu_ps(bPtr + 8);
        b3Val = _mm_loadu_ps(bPtr + 12);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        /* Four independent accumulators hide the add latency. */
        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

        aPtr += 8;
        bPtr += 16;
    }

    _mm_empty(); // clear the mmx technology state

    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];

    _mm_store_ps(dotProductVector,
                 dotProdVal0); // Store the results back into the dot product vector

    /* Lanes alternate real/imag; reduce them into res. */
    *realpt = dotProductVector[0];
    *imagpt = dotProductVector[1];
    *realpt += dotProductVector[2];
    *imagpt += dotProductVector[3];

    /* Scalar tail for the 0..7 leftover points. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *realpt += ((*aPtr) * (*bPtr++));
        *imagpt += ((*aPtr++) * (*bPtr++));
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
225
226
#if LV_HAVE_AVX2 && LV_HAVE_FMA

/*!
 * \brief Unaligned AVX2+FMA version: processes 16 input points (= 32 tap
 *        floats) per iteration using fused multiply-add accumulation.
 *        No alignment requirement on input or taps.
 *
 * \param result     Pointer receiving the complex dot product.
 * \param input      Real-valued 16-bit input samples (num_points entries).
 * \param taps       Complex float taps (num_points entries).
 * \param num_points Number of input/taps pairs to process.
 */
static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
                                                          const short* input,
                                                          const lv_32fc_t* taps,
                                                          unsigned int num_points)
{
    unsigned int number = 0;
    /* 16 16-bit input points are consumed per loop iteration. */
    const unsigned int sixteenthPoints = num_points / 16;

    /* res packs {real, imag} so it can be copied out as one lv_32fc_t. */
    float res[2];
    float *realpt = &res[0], *imagpt = &res[1];
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        m0 = _mm_loadu_si128((__m128i const*)aPtr);
        m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));

        /* Widen 16-bit ints to 32-bit, then convert to float. */
        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        /* Unpacking a register with itself duplicates each sample
         * (within 128-bit lanes): h0 = {s0,s0,s1,s1 | s4,s4,s5,s5}. */
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        /* Recombine 128-bit halves so duplicated samples are back in
         * order and line up with the interleaved re/im taps. */
        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_loadu_ps(bPtr);
        b1Val = _mm256_loadu_ps(bPtr + 8);
        b2Val = _mm256_loadu_ps(bPtr + 16);
        b3Val = _mm256_loadu_ps(bPtr + 24);

        /* Four independent FMA accumulators hide the FMA latency. */
        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);

        aPtr += 16;
        bPtr += 32;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector,
                    dotProdVal0); // Store the results back into the dot product vector

    /* Lanes alternate real/imag; reduce them into res. */
    *realpt = dotProductVector[0];
    *imagpt = dotProductVector[1];
    *realpt += dotProductVector[2];
    *imagpt += dotProductVector[3];
    *realpt += dotProductVector[4];
    *imagpt += dotProductVector[5];
    *realpt += dotProductVector[6];
    *imagpt += dotProductVector[7];

    /* Scalar tail for the 0..15 leftover points. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *realpt += ((*aPtr) * (*bPtr++));
        *imagpt += ((*aPtr++) * (*bPtr++));
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
316
317
#ifdef LV_HAVE_AVX2

/*!
 * \brief Unaligned AVX2 version: processes 16 input points (= 32 tap
 *        floats) per iteration with separate multiply and add steps.
 *        No alignment requirement on input or taps.
 *
 * \param result     Pointer receiving the complex dot product.
 * \param input      Real-valued 16-bit input samples (num_points entries).
 * \param taps       Complex float taps (num_points entries).
 * \param num_points Number of input/taps pairs to process.
 */
static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(lv_32fc_t* result,
                                                      const short* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    /* 16 16-bit input points are consumed per loop iteration. */
    const unsigned int sixteenthPoints = num_points / 16;

    /* res packs {real, imag} so it can be copied out as one lv_32fc_t. */
    float res[2];
    float *realpt = &res[0], *imagpt = &res[1];
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;
    __m256 c0Val, c1Val, c2Val, c3Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        m0 = _mm_loadu_si128((__m128i const*)aPtr);
        m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));

        /* Widen 16-bit ints to 32-bit, then convert to float. */
        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        /* Unpacking a register with itself duplicates each sample
         * (within 128-bit lanes): h0 = {s0,s0,s1,s1 | s4,s4,s5,s5}. */
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        /* Recombine 128-bit halves so duplicated samples are back in
         * order and line up with the interleaved re/im taps. */
        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_loadu_ps(bPtr);
        b1Val = _mm256_loadu_ps(bPtr + 8);
        b2Val = _mm256_loadu_ps(bPtr + 16);
        b3Val = _mm256_loadu_ps(bPtr + 24);

        c0Val = _mm256_mul_ps(a0Val, b0Val);
        c1Val = _mm256_mul_ps(a1Val, b1Val);
        c2Val = _mm256_mul_ps(a2Val, b2Val);
        c3Val = _mm256_mul_ps(a3Val, b3Val);

        /* Four independent accumulators hide the add latency. */
        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);

        aPtr += 16;
        bPtr += 32;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector,
                    dotProdVal0); // Store the results back into the dot product vector

    /* Lanes alternate real/imag; reduce them into res. */
    *realpt = dotProductVector[0];
    *imagpt = dotProductVector[1];
    *realpt += dotProductVector[2];
    *imagpt += dotProductVector[3];
    *realpt += dotProductVector[4];
    *imagpt += dotProductVector[5];
    *realpt += dotProductVector[6];
    *imagpt += dotProductVector[7];

    /* Scalar tail for the 0..15 leftover points. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *realpt += ((*aPtr) * (*bPtr++));
        *imagpt += ((*aPtr++) * (*bPtr++));
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /*LV_HAVE_AVX2*/
413
414
#if LV_HAVE_SSE && LV_HAVE_MMX


/*!
 * \brief Aligned SSE/MMX version: processes 8 input points (= 16 tap
 *        floats) per iteration. taps must be 16-byte aligned
 *        (_mm_load_ps is used).
 *
 * \param result     Pointer receiving the complex dot product.
 * \param input      Real-valued 16-bit input samples (num_points entries).
 * \param taps       Complex float taps (num_points entries, 16-byte aligned).
 * \param num_points Number of input/taps pairs to process.
 */
static inline void volk_16i_32fc_dot_prod_32fc_a_sse(lv_32fc_t* result,
                                                     const short* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    /* 8 16-bit input points are consumed per loop iteration. */
    const unsigned int eighthPoints = num_points / 8;

    /* res packs {real, imag} so it can be copied out as one lv_32fc_t. */
    float res[2];
    float *realpt = &res[0], *imagpt = &res[1];
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m64 m0, m1;
    __m128 f0, f1, f2, f3;
    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < eighthPoints; number++) {

        /* Gather 8 shorts into two MMX registers and widen to float. */
        m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
        m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
        f0 = _mm_cvtpi16_ps(m0);
        f1 = _mm_cvtpi16_ps(m0); /* deliberate duplicate of f0 */
        f2 = _mm_cvtpi16_ps(m1);
        f3 = _mm_cvtpi16_ps(m1); /* deliberate duplicate of f2 */

        /* Unpacking a register with its copy duplicates each sample,
         * e.g. a0Val = {s0, s0, s1, s1}, so one input sample lines up with
         * both the real and imaginary part of its complex tap. */
        a0Val = _mm_unpacklo_ps(f0, f1);
        a1Val = _mm_unpackhi_ps(f0, f1);
        a2Val = _mm_unpacklo_ps(f2, f3);
        a3Val = _mm_unpackhi_ps(f2, f3);

        b0Val = _mm_load_ps(bPtr);
        b1Val = _mm_load_ps(bPtr + 4);
        b2Val = _mm_load_ps(bPtr + 8);
        b3Val = _mm_load_ps(bPtr + 12);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        /* Four independent accumulators hide the add latency. */
        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

        aPtr += 8;
        bPtr += 16;
    }

    _mm_empty(); // clear the mmx technology state

    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];

    _mm_store_ps(dotProductVector,
                 dotProdVal0); // Store the results back into the dot product vector

    /* Lanes alternate real/imag; reduce them into res. */
    *realpt = dotProductVector[0];
    *imagpt = dotProductVector[1];
    *realpt += dotProductVector[2];
    *imagpt += dotProductVector[3];

    /* Scalar tail for the 0..7 leftover points. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *realpt += ((*aPtr) * (*bPtr++));
        *imagpt += ((*aPtr++) * (*bPtr++));
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
502
#ifdef LV_HAVE_AVX2

/*!
 * \brief Aligned AVX2 version: processes 16 input points (= 32 tap floats)
 *        per iteration. input must be 16-byte aligned (_mm_load_si128) and
 *        taps must be 32-byte aligned (_mm256_load_ps).
 *
 * \param result     Pointer receiving the complex dot product.
 * \param input      Real-valued 16-bit input samples (num_points entries, aligned).
 * \param taps       Complex float taps (num_points entries, aligned).
 * \param num_points Number of input/taps pairs to process.
 */
static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(lv_32fc_t* result,
                                                      const short* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    /* 16 16-bit input points are consumed per loop iteration. */
    const unsigned int sixteenthPoints = num_points / 16;

    /* res packs {real, imag} so it can be copied out as one lv_32fc_t. */
    float res[2];
    float *realpt = &res[0], *imagpt = &res[1];
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;
    __m256 c0Val, c1Val, c2Val, c3Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        m0 = _mm_load_si128((__m128i const*)aPtr);
        m1 = _mm_load_si128((__m128i const*)(aPtr + 8));

        /* Widen 16-bit ints to 32-bit, then convert to float. */
        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        /* Unpacking a register with itself duplicates each sample
         * (within 128-bit lanes): h0 = {s0,s0,s1,s1 | s4,s4,s5,s5}. */
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        /* Recombine 128-bit halves so duplicated samples are back in
         * order and line up with the interleaved re/im taps. */
        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_load_ps(bPtr);
        b1Val = _mm256_load_ps(bPtr + 8);
        b2Val = _mm256_load_ps(bPtr + 16);
        b3Val = _mm256_load_ps(bPtr + 24);

        c0Val = _mm256_mul_ps(a0Val, b0Val);
        c1Val = _mm256_mul_ps(a1Val, b1Val);
        c2Val = _mm256_mul_ps(a2Val, b2Val);
        c3Val = _mm256_mul_ps(a3Val, b3Val);

        /* Four independent accumulators hide the add latency. */
        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);

        aPtr += 16;
        bPtr += 32;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector,
                    dotProdVal0); // Store the results back into the dot product vector

    /* Lanes alternate real/imag; reduce them into res. */
    *realpt = dotProductVector[0];
    *imagpt = dotProductVector[1];
    *realpt += dotProductVector[2];
    *imagpt += dotProductVector[3];
    *realpt += dotProductVector[4];
    *imagpt += dotProductVector[5];
    *realpt += dotProductVector[6];
    *imagpt += dotProductVector[7];

    /* Scalar tail for the 0..15 leftover points. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *realpt += ((*aPtr) * (*bPtr++));
        *imagpt += ((*aPtr++) * (*bPtr++));
    }

    *result = *(lv_32fc_t*)(&res[0]);
}


#endif /*LV_HAVE_AVX2*/
599
#if LV_HAVE_AVX2 && LV_HAVE_FMA

/*!
 * \brief Aligned AVX2+FMA version: processes 16 input points (= 32 tap
 *        floats) per iteration using fused multiply-add accumulation.
 *        input must be 16-byte aligned (_mm_load_si128) and taps must be
 *        32-byte aligned (_mm256_load_ps).
 *
 * \param result     Pointer receiving the complex dot product.
 * \param input      Real-valued 16-bit input samples (num_points entries, aligned).
 * \param taps       Complex float taps (num_points entries, aligned).
 * \param num_points Number of input/taps pairs to process.
 */
static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
                                                          const short* input,
                                                          const lv_32fc_t* taps,
                                                          unsigned int num_points)
{
    unsigned int number = 0;
    /* 16 16-bit input points are consumed per loop iteration. */
    const unsigned int sixteenthPoints = num_points / 16;

    /* res packs {real, imag} so it can be copied out as one lv_32fc_t. */
    float res[2];
    float *realpt = &res[0], *imagpt = &res[1];
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        m0 = _mm_load_si128((__m128i const*)aPtr);
        m1 = _mm_load_si128((__m128i const*)(aPtr + 8));

        /* Widen 16-bit ints to 32-bit, then convert to float. */
        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        /* Unpacking a register with itself duplicates each sample
         * (within 128-bit lanes): h0 = {s0,s0,s1,s1 | s4,s4,s5,s5}. */
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        /* Recombine 128-bit halves so duplicated samples are back in
         * order and line up with the interleaved re/im taps. */
        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_load_ps(bPtr);
        b1Val = _mm256_load_ps(bPtr + 8);
        b2Val = _mm256_load_ps(bPtr + 16);
        b3Val = _mm256_load_ps(bPtr + 24);

        /* Four independent FMA accumulators hide the FMA latency. */
        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);

        aPtr += 16;
        bPtr += 32;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector,
                    dotProdVal0); // Store the results back into the dot product vector

    /* Lanes alternate real/imag; reduce them into res. */
    *realpt = dotProductVector[0];
    *imagpt = dotProductVector[1];
    *realpt += dotProductVector[2];
    *imagpt += dotProductVector[3];
    *realpt += dotProductVector[4];
    *imagpt += dotProductVector[5];
    *realpt += dotProductVector[6];
    *imagpt += dotProductVector[7];

    /* Scalar tail for the 0..15 leftover points. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *realpt += ((*aPtr) * (*bPtr++));
        *imagpt += ((*aPtr++) * (*bPtr++));
    }

    *result = *(lv_32fc_t*)(&res[0]);
}


#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
690
691
692#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/