volk_8u_x2_encodeframepolar_8u.h
/* -*- c++ -*- */
/*
 * Copyright 2015 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: GPL-3.0-or-later
 */

/*
 * for documentation see 'volk_8u_x3_encodepolar_8u_x2.h'
 */

#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#include <string.h>

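/*
 * Usage sketch (illustrative only; the authoritative documentation and example
 * live in 'volk_8u_x3_encodepolar_8u_x2.h'). Both buffers hold 'frame_size'
 * bytes, 'frame_size' must be a power of two, 'temp' carries the input bits
 * (typically one bit per byte) and is clobbered, and 'frame' receives the
 * encoded frame. The buffer setup below is an assumption for demonstration:
 *
 *   unsigned int frame_size = 256;
 *   unsigned char* frame = (unsigned char*)volk_malloc(frame_size, volk_get_alignment());
 *   unsigned char* temp = (unsigned char*)volk_malloc(frame_size, volk_get_alignment());
 *   // ... fill 'temp' with the bits to encode ...
 *   volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
 *   // 'frame' now holds the encoded polar frame; 'temp' has been overwritten.
 *   volk_free(frame);
 *   volk_free(temp);
 */
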
static inline unsigned int log2_of_power_of_2(unsigned int val)
{
    // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog
    static const unsigned int b[] = {
        0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000
    };

    unsigned int res = (val & b[0]) != 0;
    res |= ((val & b[4]) != 0) << 4;
    res |= ((val & b[3]) != 0) << 3;
    res |= ((val & b[2]) != 0) << 2;
    res |= ((val & b[1]) != 0) << 1;
    return res;
}

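/* e.g. log2_of_power_of_2(1) == 0 and log2_of_power_of_2(8) == 3; the result
 * is only meaningful when 'val' is a non-zero power of two. */

/*
 * One polar-encoding butterfly stage. For every branch, output byte 'i' of the
 * first half-frame is temp[2*i] ^ temp[2*i + 1] and byte 'i' of the second
 * half-frame is temp[2*i + 1]. For example, with num_branches == 1 and
 * frame_half == 2, temp = {u0, u1, u2, u3} produces
 * frame = {u0 ^ u1, u2 ^ u3, u1, u3}.
 */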
static inline void encodepolar_single_stage(unsigned char* frame_ptr,
                                            const unsigned char* temp_ptr,
                                            const unsigned int num_branches,
                                            const unsigned int frame_half)
{
    unsigned int branch, bit;
    for (branch = 0; branch < num_branches; ++branch) {
        for (bit = 0; bit < frame_half; ++bit) {
            *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
            *(frame_ptr + frame_half) = *(temp_ptr + 1);
            ++frame_ptr;
            temp_ptr += 2;
        }
        frame_ptr += frame_half;
    }
}

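/*
 * Generic full-frame encoder: runs log2(frame_size) butterfly stages. After
 * every stage the intermediate result is copied back into 'temp', the number
 * of branches doubles and the half-frame size halves, so the last stage
 * operates on pairs of adjacent bytes.
 */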
static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    unsigned int stage = log2_of_power_of_2(frame_size);
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;

    while (stage) {
        // encode stage
        encodepolar_single_stage(frame, temp, num_branches, frame_half);
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        // update all the parameters.
        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
        --stage;
    }
}
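
/*
 * Minimal cross-check sketch (an assumed, hypothetical test harness; VOLK's
 * own QA framework drives these protokernels differently). It runs the generic
 * and one SIMD protokernel on identical input and compares the outputs:
 *
 *   unsigned char in[256], out_generic[256], out_simd[256], tmp[256];
 *   // ... fill 'in' with the bits to encode ...
 *   memcpy(tmp, in, 256);
 *   volk_8u_x2_encodeframepolar_8u_generic(out_generic, tmp, 256);
 * #ifdef LV_HAVE_SSSE3
 *   memcpy(tmp, in, 256);
 *   volk_8u_x2_encodeframepolar_8u_u_ssse3(out_simd, tmp, 256);
 *   assert(memcmp(out_generic, out_simd, 256) == 0);
 * #endif
 */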

#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);

    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch;
    unsigned int bit;

    // prepare constants
    const __m128i mask_stage1 = _mm_set_epi8(
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

    // get some SIMD registers to play with.
    __m128i r_frame0, r_temp0, shifted;

    {
        __m128i r_frame1, r_temp1;
        const __m128i shuffle_separate =
            _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);

        while (stage > 4) {
            frame_ptr = frame;
            temp_ptr = temp;

            // for stage = 5 a branch has 32 elements, so higher stages are even bigger.
            for (branch = 0; branch < num_branches; ++branch) {
                for (bit = 0; bit < frame_half; bit += 16) {
                    r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
                    temp_ptr += 16;
                    r_temp1 = _mm_loadu_si128((__m128i*)temp_ptr);
                    temp_ptr += 16;

                    shifted = _mm_srli_si128(r_temp0, 1);
                    shifted = _mm_and_si128(shifted, mask_stage1);
                    r_temp0 = _mm_xor_si128(shifted, r_temp0);
                    r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);

                    shifted = _mm_srli_si128(r_temp1, 1);
                    shifted = _mm_and_si128(shifted, mask_stage1);
                    r_temp1 = _mm_xor_si128(shifted, r_temp1);
                    r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);

                    r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
                    _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);

                    r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
                    _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
                    frame_ptr += 16;
                }

                frame_ptr += frame_half;
            }
            memcpy(temp, frame, sizeof(unsigned char) * frame_size);

            num_branches = num_branches << 1;
            frame_half = frame_half >> 1;
            stage--;
        }
    }

    // This last part requires frames of at least 16 elements.
    // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!

    // reset pointers to correct positions.
    frame_ptr = frame;
    temp_ptr = temp;

    // prefetch first chunk
    __VOLK_PREFETCH(temp_ptr);

    const __m128i shuffle_stage4 =
        _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    const __m128i mask_stage4 = _mm_set_epi8(
        0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m128i mask_stage3 = _mm_set_epi8(
        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m128i mask_stage2 = _mm_set_epi8(
        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);

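    /*
     * The remaining four stages are completed entirely in-register: one
     * shuffle performs the bit-reversal reordering, then each stage is a
     * byte-shift, mask and XOR within the 128-bit register.
     */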
    for (branch = 0; branch < num_branches; ++branch) {
        r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);

        // prefetch next chunk
        temp_ptr += 16;
        __VOLK_PREFETCH(temp_ptr);

        // shuffle once for bit-reversal.
        r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm_srli_si128(r_temp0, 8);
        shifted = _mm_and_si128(shifted, mask_stage4);
        r_frame0 = _mm_xor_si128(shifted, r_temp0);

        shifted = _mm_srli_si128(r_frame0, 4);
        shifted = _mm_and_si128(shifted, mask_stage3);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        shifted = _mm_srli_si128(r_frame0, 2);
        shifted = _mm_and_si128(shifted, mask_stage2);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        shifted = _mm_srli_si128(r_frame0, 1);
        shifted = _mm_and_si128(shifted, mask_stage1);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        // store result of chunk.
        _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
        frame_ptr += 16;
    }
}

#endif /* LV_HAVE_SSSE3 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame,
                                                         unsigned char* temp,
                                                         unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);

    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch;
    unsigned int bit;

    // prepare constants
    const __m256i mask_stage1 = _mm256_set_epi8(
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

    const __m128i mask_stage0 = _mm_set_epi8(
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
    // get some SIMD registers to play with.
    __m256i r_frame0, r_temp0, shifted;
    __m128i r_temp2, r_frame2, shifted2;
    {
        __m256i r_frame1, r_temp1;
        __m128i r_frame3, r_temp3;
        const __m256i shuffle_separate =
            _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                             0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
        const __m128i shuffle_separate128 =
            _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);

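        /*
         * Process two 32-byte chunks per iteration while a branch still holds
         * at least 32 bytes; if only 16 bytes remain in the branch, fall back
         * to the 128-bit SSE path for that branch. _mm256_srli_si256 shifts
         * within each 128-bit lane, so a cross-lane permute (0xd8) is needed
         * to put the unpacked halves back in order before storing.
         */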
        while (stage > 4) {
            frame_ptr = frame;
            temp_ptr = temp;

            // for stage = 5 a branch has 32 elements, so higher stages are even bigger.
            for (branch = 0; branch < num_branches; ++branch) {
                for (bit = 0; bit < frame_half; bit += 32) {
                    if ((frame_half - bit) < 32) // only 16 bytes left in this branch, not 32
                    {
                        r_temp2 = _mm_loadu_si128((__m128i*)temp_ptr);
                        temp_ptr += 16;
                        r_temp3 = _mm_loadu_si128((__m128i*)temp_ptr);
                        temp_ptr += 16;

                        shifted2 = _mm_srli_si128(r_temp2, 1);
                        shifted2 = _mm_and_si128(shifted2, mask_stage0);
                        r_temp2 = _mm_xor_si128(shifted2, r_temp2);
                        r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);

                        shifted2 = _mm_srli_si128(r_temp3, 1);
                        shifted2 = _mm_and_si128(shifted2, mask_stage0);
                        r_temp3 = _mm_xor_si128(shifted2, r_temp3);
                        r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);

                        r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
                        _mm_storeu_si128((__m128i*)frame_ptr, r_frame2);

                        r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
                        _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
                        frame_ptr += 16;
                        break;
                    }
                    r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
                    temp_ptr += 32;
                    r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr);
                    temp_ptr += 32;

                    shifted = _mm256_srli_si256(r_temp0, 1); // operates on 128-bit lanes
                    shifted = _mm256_and_si256(shifted, mask_stage1);
                    r_temp0 = _mm256_xor_si256(shifted, r_temp0);
                    r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

                    shifted = _mm256_srli_si256(r_temp1, 1);
                    shifted = _mm256_and_si256(shifted, mask_stage1);
                    r_temp1 = _mm256_xor_si256(shifted, r_temp1);
                    r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

                    r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
                    r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
                    r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
                    r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);

                    _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);

                    _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
                    frame_ptr += 32;
                }

                frame_ptr += frame_half;
            }
            memcpy(temp, frame, sizeof(unsigned char) * frame_size);

            num_branches = num_branches << 1;
            frame_half = frame_half >> 1;
            stage--;
        }
    }

    // This last part requires frames of at least 32 elements.
    // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!

    // reset pointers to correct positions.
    frame_ptr = frame;
    temp_ptr = temp;

    // prefetch first chunk
    __VOLK_PREFETCH(temp_ptr);

    const __m256i shuffle_stage4 =
        _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
                         0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    const __m256i mask_stage4 = _mm256_set_epi8(
        0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m256i mask_stage3 = _mm256_set_epi8(
        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m256i mask_stage2 = _mm256_set_epi8(
        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);

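    /*
     * Each 256-bit load covers two 16-byte branches, hence only
     * num_branches / 2 iterations. As above, one shuffle performs the
     * bit-reversal and the last four stages are shift/mask/XOR in-register.
     */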
    for (branch = 0; branch < num_branches / 2; ++branch) {
        r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);

        // prefetch next chunk
        temp_ptr += 32;
        __VOLK_PREFETCH(temp_ptr);

        // shuffle once for bit-reversal.
        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm256_srli_si256(r_temp0, 8); // 128-bit lanes
        shifted = _mm256_and_si256(shifted, mask_stage4);
        r_frame0 = _mm256_xor_si256(shifted, r_temp0);

        shifted = _mm256_srli_si256(r_frame0, 4);
        shifted = _mm256_and_si256(shifted, mask_stage3);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 2);
        shifted = _mm256_and_si256(shifted, mask_stage2);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        // store result of chunk.
        _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
        frame_ptr += 32;
    }
}
#endif /* LV_HAVE_AVX2 */

#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ */

#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_

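/*
 * The _a_ (aligned) protokernels below mirror the unaligned _u_ variants above;
 * the only difference is that they use aligned loads and stores
 * (_mm_load_si128/_mm_store_si128 and _mm256_load_si256/_mm256_store_si256),
 * so both buffers must satisfy the platform alignment, e.g. as returned by
 * volk_get_alignment().
 */
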
#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);

    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch;
    unsigned int bit;

    // prepare constants
    const __m128i mask_stage1 = _mm_set_epi8(
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

    // get some SIMD registers to play with.
    __m128i r_frame0, r_temp0, shifted;

    {
        __m128i r_frame1, r_temp1;
        const __m128i shuffle_separate =
            _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);

        while (stage > 4) {
            frame_ptr = frame;
            temp_ptr = temp;

            // for stage = 5 a branch has 32 elements, so higher stages are even bigger.
            for (branch = 0; branch < num_branches; ++branch) {
                for (bit = 0; bit < frame_half; bit += 16) {
                    r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
                    temp_ptr += 16;
                    r_temp1 = _mm_load_si128((__m128i*)temp_ptr);
                    temp_ptr += 16;

                    shifted = _mm_srli_si128(r_temp0, 1);
                    shifted = _mm_and_si128(shifted, mask_stage1);
                    r_temp0 = _mm_xor_si128(shifted, r_temp0);
                    r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);

                    shifted = _mm_srli_si128(r_temp1, 1);
                    shifted = _mm_and_si128(shifted, mask_stage1);
                    r_temp1 = _mm_xor_si128(shifted, r_temp1);
                    r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);

                    r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
                    _mm_store_si128((__m128i*)frame_ptr, r_frame0);

                    r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
                    _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
                    frame_ptr += 16;
                }

                frame_ptr += frame_half;
            }
            memcpy(temp, frame, sizeof(unsigned char) * frame_size);

            num_branches = num_branches << 1;
            frame_half = frame_half >> 1;
            stage--;
        }
    }

    // This last part requires frames of at least 16 elements.
    // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!

    // reset pointers to correct positions.
    frame_ptr = frame;
    temp_ptr = temp;

    // prefetch first chunk
    __VOLK_PREFETCH(temp_ptr);

    const __m128i shuffle_stage4 =
        _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    const __m128i mask_stage4 = _mm_set_epi8(
        0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m128i mask_stage3 = _mm_set_epi8(
        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m128i mask_stage2 = _mm_set_epi8(
        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);

    for (branch = 0; branch < num_branches; ++branch) {
        r_temp0 = _mm_load_si128((__m128i*)temp_ptr);

        // prefetch next chunk
        temp_ptr += 16;
        __VOLK_PREFETCH(temp_ptr);

        // shuffle once for bit-reversal.
        r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm_srli_si128(r_temp0, 8);
        shifted = _mm_and_si128(shifted, mask_stage4);
        r_frame0 = _mm_xor_si128(shifted, r_temp0);

        shifted = _mm_srli_si128(r_frame0, 4);
        shifted = _mm_and_si128(shifted, mask_stage3);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        shifted = _mm_srli_si128(r_frame0, 2);
        shifted = _mm_and_si128(shifted, mask_stage2);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        shifted = _mm_srli_si128(r_frame0, 1);
        shifted = _mm_and_si128(shifted, mask_stage1);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        // store result of chunk.
        _mm_store_si128((__m128i*)frame_ptr, r_frame0);
        frame_ptr += 16;
    }
}
#endif /* LV_HAVE_SSSE3 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame,
                                                         unsigned char* temp,
                                                         unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);

    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch;
    unsigned int bit;

    // prepare constants
    const __m256i mask_stage1 = _mm256_set_epi8(
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

    const __m128i mask_stage0 = _mm_set_epi8(
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
    // get some SIMD registers to play with.
    __m256i r_frame0, r_temp0, shifted;
    __m128i r_temp2, r_frame2, shifted2;
    {
        __m256i r_frame1, r_temp1;
        __m128i r_frame3, r_temp3;
        const __m256i shuffle_separate =
            _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                             0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
        const __m128i shuffle_separate128 =
            _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);

        while (stage > 4) {
            frame_ptr = frame;
            temp_ptr = temp;

            // for stage = 5 a branch has 32 elements, so higher stages are even bigger.
            for (branch = 0; branch < num_branches; ++branch) {
                for (bit = 0; bit < frame_half; bit += 32) {
                    if ((frame_half - bit) < 32) // only 16 bytes left in this branch, not 32
                    {
                        r_temp2 = _mm_load_si128((__m128i*)temp_ptr);
                        temp_ptr += 16;
                        r_temp3 = _mm_load_si128((__m128i*)temp_ptr);
                        temp_ptr += 16;

                        shifted2 = _mm_srli_si128(r_temp2, 1);
                        shifted2 = _mm_and_si128(shifted2, mask_stage0);
                        r_temp2 = _mm_xor_si128(shifted2, r_temp2);
                        r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);

                        shifted2 = _mm_srli_si128(r_temp3, 1);
                        shifted2 = _mm_and_si128(shifted2, mask_stage0);
                        r_temp3 = _mm_xor_si128(shifted2, r_temp3);
                        r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);

                        r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
                        _mm_store_si128((__m128i*)frame_ptr, r_frame2);

                        r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
                        _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
                        frame_ptr += 16;
                        break;
                    }
                    r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
                    temp_ptr += 32;
                    r_temp1 = _mm256_load_si256((__m256i*)temp_ptr);
                    temp_ptr += 32;

                    shifted = _mm256_srli_si256(r_temp0, 1); // operates on 128-bit lanes
                    shifted = _mm256_and_si256(shifted, mask_stage1);
                    r_temp0 = _mm256_xor_si256(shifted, r_temp0);
                    r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

                    shifted = _mm256_srli_si256(r_temp1, 1);
                    shifted = _mm256_and_si256(shifted, mask_stage1);
                    r_temp1 = _mm256_xor_si256(shifted, r_temp1);
                    r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

                    r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
                    r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
                    r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
                    r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);

                    _mm256_store_si256((__m256i*)frame_ptr, r_frame0);

                    _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
                    frame_ptr += 32;
                }

                frame_ptr += frame_half;
            }
            memcpy(temp, frame, sizeof(unsigned char) * frame_size);

            num_branches = num_branches << 1;
            frame_half = frame_half >> 1;
            stage--;
        }
    }

    // This last part requires frames of at least 32 elements.
    // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!

    // reset pointers to correct positions.
    frame_ptr = frame;
    temp_ptr = temp;

    // prefetch first chunk.
    __VOLK_PREFETCH(temp_ptr);

    const __m256i shuffle_stage4 =
        _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
                         0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    const __m256i mask_stage4 = _mm256_set_epi8(
        0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m256i mask_stage3 = _mm256_set_epi8(
        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m256i mask_stage2 = _mm256_set_epi8(
        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);

    for (branch = 0; branch < num_branches / 2; ++branch) {
        r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);

        // prefetch next chunk
        temp_ptr += 32;
        __VOLK_PREFETCH(temp_ptr);

        // shuffle once for bit-reversal.
        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm256_srli_si256(r_temp0, 8); // 128-bit lanes
        shifted = _mm256_and_si256(shifted, mask_stage4);
        r_frame0 = _mm256_xor_si256(shifted, r_temp0);

        shifted = _mm256_srli_si256(r_frame0, 4);
        shifted = _mm256_and_si256(shifted, mask_stage3);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 2);
        shifted = _mm256_and_si256(shifted, mask_stage2);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        // store result of chunk.
        _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
        frame_ptr += 32;
    }
}
#endif /* LV_HAVE_AVX2 */


#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */