Vector Optimized Library of Kernels 2.5.2
Architecture-tuned implementations of math kernels
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h
/* -*- c++ -*- */
/*
 * Copyright 2019 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: GPL-3.0-or-later
 */

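/*
 * Overview
 *
 * Each kernel in this header computes, element by element,
 *
 *   c[i] = a[i] + conj(b[i]) * scalar
 *
 * for vectors of 32-bit floating-point complex samples (lv_32fc_t), as the
 * generic implementation below spells out.
 *
 * The snippet that follows is an illustrative usage sketch only: it assumes
 * the usual VOLK dispatcher of the same name from <volk/volk.h> together
 * with volk_get_alignment(), volk_malloc()/volk_free() and lv_cmake(); the
 * length and the values are made up.
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int N = 1000;
 *   size_t alignment = volk_get_alignment();
 *   lv_32fc_t* a = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *   lv_32fc_t* b = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *   lv_32fc_t* c = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *   lv_32fc_t scalar = lv_cmake(0.5f, -1.0f);
 *
 *   // ... fill a and b with data ...
 *
 *   volk_32fc_x2_s32fc_multiply_conjugate_add_32fc(c, a, b, scalar, N);
 *
 *   volk_free(a);
 *   volk_free(b);
 *   volk_free(c);
 */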
#ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
#define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H

#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>


#ifdef LV_HAVE_GENERIC

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t* bVector,
                                                       const lv_32fc_t scalar,
                                                       unsigned int num_points)
{
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;
    lv_32fc_t* cPtr = cVector;
    unsigned int number = num_points;

    // unwrap loop
    while (number >= 8) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        number -= 8;
    }

    // clean up any remaining
    while (number-- > 0) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
    }
}
#endif /* LV_HAVE_GENERIC */


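/*
 * For reference, the arithmetic all of the SIMD kernels below implement,
 * written out in real/imaginary parts with b = br + j*bi and
 * scalar = sr + j*si (this follows directly from the generic kernel above):
 *
 *   conj(b) * scalar = (br*sr + bi*si) + j*(br*si - bi*sr)
 *   c[i]             = a[i] + conj(b[i]) * scalar
 *
 * The x86 kernels evaluate this on interleaved complex data through the
 * _mm256_complexconjugatemul_ps() / _mm_complexconjugatemul_ps() helpers,
 * while the NEON kernel expands it explicitly on deinterleaved
 * real/imaginary planes.
 */
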
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const lv_32fc_t* bVector,
                                                     const lv_32fc_t scalar,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3;

    __m256 x, y, s, z;
    lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm256_loadu_ps((float*)v_scalar);

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)b);          // four complex samples of b
        y = _mm256_loadu_ps((float*)a);          // four complex samples of a
        z = _mm256_complexconjugatemul_ps(s, x); // scalar * conj(b)
        z = _mm256_add_ps(y, z);                 // a + scalar * conj(b)
        _mm256_storeu_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) + lv_conj(*b++) * scalar;
    }
}
#endif /* LV_HAVE_AVX */


#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      const lv_32fc_t scalar,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, s, z;
    lv_32fc_t v_scalar[2] = { scalar, scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm_loadu_ps((float*)v_scalar);

    for (; number < halfPoints; number++) {
        x = _mm_loadu_ps((float*)b);          // two complex samples of b
        y = _mm_loadu_ps((float*)a);          // two complex samples of a
        z = _mm_complexconjugatemul_ps(s, x); // scalar * conj(b)
        z = _mm_add_ps(y, z);                 // a + scalar * conj(b)
        _mm_storeu_ps((float*)c, z);

        a += 2;
        b += 2;
        c += 2;
    }

    if ((num_points % 2) != 0) {
        *c = *a + lv_conj(*b) * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */


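/*
 * The two kernels that follow are the aligned counterparts of the AVX and
 * SSE3 kernels above: the arithmetic is identical, only the unaligned
 * loads/stores are replaced by aligned ones, so the input and output
 * buffers must be suitably aligned (e.g. allocated with
 * volk_malloc(size, volk_get_alignment()), as in the sketch at the top of
 * this file).
 */
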
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const lv_32fc_t* bVector,
                                                     const lv_32fc_t scalar,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3;

    __m256 x, y, s, z;
    lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm256_loadu_ps((float*)v_scalar);

    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((float*)b);           // four complex samples of b
        y = _mm256_load_ps((float*)a);           // four complex samples of a
        z = _mm256_complexconjugatemul_ps(s, x); // scalar * conj(b)
        z = _mm256_add_ps(y, z);                 // a + scalar * conj(b)
        _mm256_store_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) + lv_conj(*b++) * scalar;
    }
}
#endif /* LV_HAVE_AVX */


#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      const lv_32fc_t scalar,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, s, z;
    lv_32fc_t v_scalar[2] = { scalar, scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm_loadu_ps((float*)v_scalar);

    for (; number < halfPoints; number++) {
        x = _mm_load_ps((float*)b);           // two complex samples of b
        y = _mm_load_ps((float*)a);           // two complex samples of a
        z = _mm_complexconjugatemul_ps(s, x); // scalar * conj(b)
        z = _mm_add_ps(y, z);                 // a + scalar * conj(b)
        _mm_store_ps((float*)c, z);

        a += 2;
        b += 2;
        c += 2;
    }

    if ((num_points % 2) != 0) {
        *c = *a + lv_conj(*b) * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */


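/*
 * The NEON kernel below takes a different route from the x86 kernels:
 * vld2q_f32() deinterleaves four complex samples into separate real and
 * imaginary planes, the imaginary plane of b is negated to form conj(b),
 * and the real/imaginary expansion noted earlier in this file is applied
 * lane by lane with multiply and multiply-accumulate instructions.
 */
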
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const lv_32fc_t* bVector,
                                                    const lv_32fc_t scalar,
                                                    unsigned int num_points)
{
    const lv_32fc_t* bPtr = bVector;
    const lv_32fc_t* aPtr = aVector;
    lv_32fc_t* cPtr = cVector;
    unsigned int number = num_points;
    unsigned int quarter_points = num_points / 4;

    float32x4x2_t a_val, b_val, c_val, scalar_val;
    float32x4x2_t tmp_val;

    // Broadcast the real and imaginary parts of the scalar
    scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
    scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);

    for (number = 0; number < quarter_points; ++number) {
        // Deinterleave four complex samples into real/imaginary planes
        a_val = vld2q_f32((float*)aPtr);
        b_val = vld2q_f32((float*)bPtr);
        b_val.val[1] = vnegq_f32(b_val.val[1]); // conjugate b
        __VOLK_PREFETCH(aPtr + 8);
        __VOLK_PREFETCH(bPtr + 8);

        // conj(b) * scalar, imaginary and real planes
        tmp_val.val[1] = vmulq_f32(b_val.val[1], scalar_val.val[0]);
        tmp_val.val[0] = vmulq_f32(b_val.val[0], scalar_val.val[0]);

        tmp_val.val[1] = vmlaq_f32(tmp_val.val[1], b_val.val[0], scalar_val.val[1]);
        tmp_val.val[0] = vmlsq_f32(tmp_val.val[0], b_val.val[1], scalar_val.val[1]);

        // Add a and re-interleave the result into c
        c_val.val[1] = vaddq_f32(a_val.val[1], tmp_val.val[1]);
        c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]);

        vst2q_f32((float*)cPtr, c_val);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
    }
}
#endif /* LV_HAVE_NEON */

#endif /* INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H */