#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
    static const unsigned int b[] = {
        0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000
    };

    unsigned int res = (val & b[0]) != 0;
    res |= ((val & b[4]) != 0) << 4;
    res |= ((val & b[3]) != 0) << 3;
    res |= ((val & b[2]) != 0) << 2;
    res |= ((val & b[1]) != 0) << 1;
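    /* val is assumed to be a power of two, so exactly one bit is set; each mask
     * in b[] contributes one bit of that bit's position, and res ends up as
     * log2(val), which the kernels below use as the number of butterfly
     * stages. */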
                                            const unsigned char* temp_ptr,
                                            const unsigned int num_branches,
                                            const unsigned int frame_half)
    unsigned int branch, bit;
    for (branch = 0; branch < num_branches; ++branch) {
        for (bit = 0; bit < frame_half; ++bit) {
            *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
            *(frame_ptr + frame_half) = *(temp_ptr + 1);
            ++frame_ptr;
            temp_ptr += 2;
        }
        frame_ptr += frame_half;
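        /* Each iteration of the outer loop encodes one branch: adjacent input
         * bytes (u, v) read from temp_ptr are written as u ^ v into the first
         * half of the branch and as v into the second half, i.e. the 2x2 polar
         * kernel applied byte-wise; frame_ptr then skips the second half it has
         * already filled. */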
                                                           unsigned int frame_size)
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
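/* Usage sketch (illustrative, not part of the upstream header): callers
 * normally reach this kernel through the public dispatcher declared in
 * <volk/volk.h>. frame_size is assumed to be a power of two; temp holds the
 * unpacked input bytes and is clobbered as scratch, while frame receives the
 * encoded result. Allocation via volk_malloc() is only one option.
 *
 *   unsigned int frame_size = 1024;  // 2^n
 *   unsigned char* frame = (unsigned char*)volk_malloc(frame_size, volk_get_alignment());
 *   unsigned char* temp = (unsigned char*)volk_malloc(frame_size, volk_get_alignment());
 *   // ... fill temp with the input bits, one per byte ...
 *   volk_8u_x2_encodeframepolar_8u(frame, temp, frame_size);
 *   // frame now holds the encoded bits
 *   volk_free(temp);
 *   volk_free(frame);
 */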
                                                           unsigned int frame_size)
    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;

    __m128i r_frame0, r_temp0, shifted;

    const __m128i shuffle_separate =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
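    /* shuffle_separate gathers the even-indexed bytes into the low 8 bytes of
     * the register and the odd-indexed bytes into the high 8 bytes, the
     * vectorized counterpart of the temp_ptr / temp_ptr + 1 split done in the
     * scalar stage above. */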
        for (branch = 0; branch < num_branches; ++branch) {
            for (bit = 0; bit < frame_half; bit += 16) {
            frame_ptr += frame_half;
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
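        /* Per-stage bookkeeping, identical to the generic path: the freshly
         * encoded frame becomes the next stage's input, the number of
         * independent branches doubles and the per-branch half-length
         * halves. */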
        _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
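    /* This permutation plays the same role as shuffle_stage4 in the AVX2
     * variants below: it regroups the 16 bytes of a branch so the remaining
     * short stages can be finished with in-register shifts, masks and XORs in
     * the loop that follows, rather than with further passes over memory. */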
    for (branch = 0; branch < num_branches; ++branch) {
#include <immintrin.h>
static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;

    const __m256i mask_stage1 = _mm256_set_epi8(0x0,

    __m256i r_frame0, r_temp0, shifted;
    __m128i r_temp2, r_frame2, shifted2;

    __m256i r_frame1, r_temp1;

    const __m256i shuffle_separate = _mm256_setr_epi8(0,

    const __m128i shuffle_separate128 =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
        for (branch = 0; branch < num_branches; ++branch) {
            for (bit = 0; bit < frame_half; bit += 32) {
                if ((frame_half - bit) < 32) {
                r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
                temp_ptr += 32;
                r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr);
                temp_ptr += 32;
                shifted = _mm256_srli_si256(r_temp0, 1);
                shifted = _mm256_and_si256(shifted, mask_stage1);
                r_temp0 = _mm256_xor_si256(shifted, r_temp0);
                r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

                shifted = _mm256_srli_si256(r_temp1, 1);
                shifted = _mm256_and_si256(shifted, mask_stage1);
                r_temp1 = _mm256_xor_si256(shifted, r_temp1);
                r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

                r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
                r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
                r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
                r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
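                /* _mm256_unpacklo/hi_epi64 interleave within each 128-bit lane,
                 * so _mm256_permute4x64_epi64 with control 0xd8 (order 0, 2, 1,
                 * 3) is needed to pull the XOR half and the copy half into
                 * contiguous 256-bit results before they are stored. */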
                _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);

                _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
                frame_ptr += 32;
            }
            frame_ptr += frame_half;
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
    const __m256i shuffle_stage4 = _mm256_setr_epi8(0,

    const __m256i mask_stage4 = _mm256_set_epi8(0x0,

    const __m256i mask_stage3 = _mm256_set_epi8(0x0,

    const __m256i mask_stage2 = _mm256_set_epi8(0x0,
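    /* The remaining stages operate on branches of at most 16 bytes, so the loop
     * below packs two branches per 256-bit register (hence the num_branches / 2
     * bound): within each 128-bit lane, byte shifts of 8, 4, 2 and 1 masked by
     * mask_stage4..mask_stage1 apply the last four butterflies, followed by a
     * single store per register. */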
    for (branch = 0; branch < num_branches / 2; ++branch) {
        r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
        temp_ptr += 32;

        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
        shifted = _mm256_srli_si256(r_temp0, 8);
        shifted = _mm256_and_si256(shifted, mask_stage4);
        r_frame0 = _mm256_xor_si256(shifted, r_temp0);

        shifted = _mm256_srli_si256(r_frame0, 4);
        shifted = _mm256_and_si256(shifted, mask_stage3);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 2);
        shifted = _mm256_and_si256(shifted, mask_stage2);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
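/* The guard above opens the aligned half of the kernel: the _a_ variants below
 * mirror the unaligned _u_ implementations but use aligned loads and stores
 * (_mm256_load_si256 / _mm256_store_si256 and their 128-bit counterparts), so
 * the frame and temp buffers are expected to satisfy the SIMD alignment VOLK
 * reports via volk_get_alignment(). */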
#include <tmmintrin.h>
                                                           unsigned int frame_size)
    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;

    __m128i r_frame0, r_temp0, shifted;

    const __m128i shuffle_separate =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
        for (branch = 0; branch < num_branches; ++branch) {
            for (bit = 0; bit < frame_half; bit += 16) {
            frame_ptr += frame_half;
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
        _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);

    for (branch = 0; branch < num_branches; ++branch) {
#include <immintrin.h>
static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;

    const __m256i mask_stage1 = _mm256_set_epi8(0x0,

    __m256i r_frame0, r_temp0, shifted;
    __m128i r_temp2, r_frame2, shifted2;

    __m256i r_frame1, r_temp1;

    const __m256i shuffle_separate = _mm256_setr_epi8(0,

    const __m128i shuffle_separate128 =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
        for (branch = 0; branch < num_branches; ++branch) {
            for (bit = 0; bit < frame_half; bit += 32) {
                if ((frame_half - bit) < 32) {
                r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
                temp_ptr += 32;
                r_temp1 = _mm256_load_si256((__m256i*)temp_ptr);
                temp_ptr += 32;
                shifted = _mm256_srli_si256(r_temp0, 1);
                shifted = _mm256_and_si256(shifted, mask_stage1);
                r_temp0 = _mm256_xor_si256(shifted, r_temp0);
                r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

                shifted = _mm256_srli_si256(r_temp1, 1);
                shifted = _mm256_and_si256(shifted, mask_stage1);
                r_temp1 = _mm256_xor_si256(shifted, r_temp1);
                r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

                r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
                r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
                r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
                r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
                _mm256_store_si256((__m256i*)frame_ptr, r_frame0);

                _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
                frame_ptr += 32;
            }
            frame_ptr += frame_half;
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
    const __m256i shuffle_stage4 = _mm256_setr_epi8(0,

    const __m256i mask_stage4 = _mm256_set_epi8(0x0,

    const __m256i mask_stage3 = _mm256_set_epi8(0x0,

    const __m256i mask_stage2 = _mm256_set_epi8(0x0,
    for (branch = 0; branch < num_branches / 2; ++branch) {
        r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
        temp_ptr += 32;

        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
        shifted = _mm256_srli_si256(r_temp0, 8);
        shifted = _mm256_and_si256(shifted, mask_stage4);
        r_frame0 = _mm256_xor_si256(shifted, r_temp0);

        shifted = _mm256_srli_si256(r_frame0, 4);
        shifted = _mm256_and_si256(shifted, mask_stage3);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 2);
        shifted = _mm256_and_si256(shifted, mask_stage2);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        _mm256_store_si256((__m256i*)frame_ptr, r_frame0);