11#ifndef INC_BLKSORT_H_
22#define INC_BLKSORT_H_
3+ /* *
4+ @file blksort.h
5+
6+ USAGE:
7+ Put '#define BLKSORT_IMPLEMENTATION' before including this file to create the implementation.
8+ */
9+
310#include < cstdint>
411#if defined(_MSC_VER)
512# define BLKSORT_RESTRICT __restrict
3542
3643#ifndef BLKSORT_ALIGN
3744# if defined(_MSC_VER)
38- #define BLKSORT_ALIGN (x ) __declspec(align(x))
45+ # define BLKSORT_ALIGN (x ) __declspec(align(x))
3946# elif defined(__GNUC__) || defined(__clang__)
40- #define BLKSORT_ALIGN (x ) __attribute__((aligned(x)))
47+ # define BLKSORT_ALIGN (x ) __attribute__((aligned(x)))
4148# else
4249# error
4350# endif
@@ -109,13 +116,13 @@ class BlkSort
109116
110117#include < algorithm>
111118#ifdef __AVX__
112- #define BLKSORT_AVX (1 )
113- #include < immintrin.h>
119+ # define BLKSORT_AVX (1 )
120+ # include < immintrin.h>
114121#endif
115122
116123#ifdef __ARM_NEON
117- #define BLKSORT_NEON (1 )
118- #include < arm_neon.h>
124+ # define BLKSORT_NEON (1 )
125+ # include < arm_neon.h>
119126#endif
120127
121128#if BLOCKSORT_PERF
@@ -359,7 +366,8 @@ void sort(uint32_t size, Item* data, uint32_t depth)
359366void counting_sort (uint32_t size, uint16_t * dst, const uint8_t * key, const uint16_t * value)
360367{
361368 assert (0 == (size & 15 ));
362- BLKSORT_ALIGN (16 ) uint16_t count[259 ];
369+ BLKSORT_ALIGN (16 )
370+ uint16_t count[259 ];
363371 ::memset (count, 0 , 256 * sizeof (uint16_t ));
364372 for (uint32_t i = 0 ; i < size; i += 4 ) {
365373 count[key[i + 0 ]] += 1 ;
@@ -542,14 +550,13 @@ void BlkSort::decode_internal(uint8_t* BLKSORT_RESTRICT dst, uint8_t* BLKSORT_RE
542550 start = std::chrono::high_resolution_clock::now ();
543551#endif
544552 uint16_t * id = (uint16_t *)buffer_;
553+ // clang-format off
554+ BLKSORT_ALIGN (Align) static const uint16_t ID0 [8 ] = {0 ,1 ,2 ,3 ,4 ,5 ,6 ,7 };
555+ BLKSORT_ALIGN (Align) static const uint16_t ID1 [8 ] = {8 ,9 ,10 ,11 ,12 ,13 ,14 ,15 };
556+ BLKSORT_ALIGN (Align) static const uint16_t ID2 [8 ] = {16 ,17 ,18 ,19 ,20 ,21 ,22 ,23 };
557+ BLKSORT_ALIGN (Align) static const uint16_t ID3 [8 ] = {24 ,25 ,26 ,27 ,28 ,29 ,30 ,31 };
558+ // clang-format on
545559#if defined(BLKSORT_AVX)
546- // clang-format off
547- BLKSORT_ALIGN (16 ) static const uint16_t ID0 [8 ] = {0 ,1 ,2 ,3 ,4 ,5 ,6 ,7 };
548- BLKSORT_ALIGN (16 ) static const uint16_t ID1 [8 ] = {8 ,9 ,10 ,11 ,12 ,13 ,14 ,15 };
549- BLKSORT_ALIGN (16 ) static const uint16_t ID2 [8 ] = {16 ,17 ,18 ,19 ,20 ,21 ,22 ,23 };
550- BLKSORT_ALIGN (16 ) static const uint16_t ID3 [8 ] = {24 ,25 ,26 ,27 ,28 ,29 ,30 ,31 };
551- // clang-format on
552-
553560# if 0
554561 if (size_ <= 32 ) {
555562 __m128i c0 = _mm_load_si128 ((const __m128i*)ID0 );
@@ -563,30 +570,24 @@ void BlkSort::decode_internal(uint8_t* BLKSORT_RESTRICT dst, uint8_t* BLKSORT_RE
563570 }
564571
565572 } else
566- #else
567- __m128i c0 = _mm_load_si128 ((const __m128i*)ID0 );
568- __m128i c1 = _mm_load_si128 ((const __m128i*)ID1 );
569- __m128i c2 = _mm_load_si128 ((const __m128i*)ID2 );
570- __m128i c3 = _mm_load_si128 ((const __m128i*)ID3 );
571- __m128i add = _mm_set1_epi16 (32 );
572- for (uint32_t i = 0 ; i < size_; i += 32 ) {
573- _mm_store_si128 ((__m128i*)&id[i], c0);
574- c0 = _mm_adds_epi16 (c0, add);
575- _mm_store_si128 ((__m128i*)&id[i + 8 ], c1);
576- c1 = _mm_adds_epi16 (c1, add);
577- _mm_store_si128 ((__m128i*)&id[i + 16 ], c2);
578- c2 = _mm_adds_epi16 (c2, add);
579- _mm_store_si128 ((__m128i*)&id[i + 24 ], c3);
580- c3 = _mm_adds_epi16 (c3, add);
581- }
573+ # else
574+ __m128i c0 = _mm_load_si128 ((const __m128i*)ID0 );
575+ __m128i c1 = _mm_load_si128 ((const __m128i*)ID1 );
576+ __m128i c2 = _mm_load_si128 ((const __m128i*)ID2 );
577+ __m128i c3 = _mm_load_si128 ((const __m128i*)ID3 );
578+ __m128i add = _mm_set1_epi16 (32 );
579+ for (uint32_t i = 0 ; i < size_; i += 32 ) {
580+ _mm_store_si128 ((__m128i*)&id[i], c0);
581+ c0 = _mm_adds_epi16 (c0, add);
582+ _mm_store_si128 ((__m128i*)&id[i + 8 ], c1);
583+ c1 = _mm_adds_epi16 (c1, add);
584+ _mm_store_si128 ((__m128i*)&id[i + 16 ], c2);
585+ c2 = _mm_adds_epi16 (c2, add);
586+ _mm_store_si128 ((__m128i*)&id[i + 24 ], c3);
587+ c3 = _mm_adds_epi16 (c3, add);
588+ }
582589# endif
583590#elif defined(BLKSORT_NEON)
584- // clang-format off
585- BLKSORT_ALIGN (16 ) static const uint16_t ID0 [8 ] = {0 ,1 ,2 ,3 ,4 ,5 ,6 ,7 };
586- BLKSORT_ALIGN (16 ) static const uint16_t ID1 [8 ] = {8 ,9 ,10 ,11 ,12 ,13 ,14 ,15 };
587- BLKSORT_ALIGN (16 ) static const uint16_t ID2 [8 ] = {16 ,17 ,18 ,19 ,20 ,21 ,22 ,23 };
588- BLKSORT_ALIGN (16 ) static const uint16_t ID3 [8 ] = {24 ,25 ,26 ,27 ,28 ,29 ,30 ,31 };
589- // clang-format on
590591 uint16x8_t c0 = vld1q_u16 (ID0 );
591592 uint16x8_t c1 = vld1q_u16 (ID1 );
592593 uint16x8_t c2 = vld1q_u16 (ID2 );
@@ -604,11 +605,11 @@ void BlkSort::decode_internal(uint8_t* BLKSORT_RESTRICT dst, uint8_t* BLKSORT_RE
604605 }
605606
606607#else
607- for (uint32_t i = 0 ; i < size_; i+= 4 ) {
608- id[i+ 0 ] = i+ 0 ;
609- id[i+ 1 ] = i+ 1 ;
610- id[i+ 2 ] = i+ 2 ;
611- id[i+ 3 ] = i+ 3 ;
608+ for (uint32_t i = 0 ; i < size_; i += 4 ) {
609+ id[i + 0 ] = i + 0 ;
610+ id[i + 1 ] = i + 1 ;
611+ id[i + 2 ] = i + 2 ;
612+ id[i + 3 ] = i + 3 ;
612613 }
613614#endif
614615
@@ -660,10 +661,10 @@ void BlkSort::decode_internal(uint8_t* BLKSORT_RESTRICT dst, uint8_t* BLKSORT_RE
660661
661662void BlkSort::mtf_init (uint8_t * BLKSORT_RESTRICT id)
662663{
663- static BLKSORT_ALIGN (16 ) const uint8_t ID0 [16 ] = {0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 };
664- static BLKSORT_ALIGN (16 ) const uint8_t ID1 [16 ] = {16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 , 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 };
665- static BLKSORT_ALIGN (16 ) const uint8_t ID2 [16 ] = {32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 , 40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 };
666- static BLKSORT_ALIGN (16 ) const uint8_t ID3 [16 ] = {48 , 49 , 50 , 51 , 52 , 53 , 54 , 55 , 56 , 57 , 58 , 59 , 60 , 61 , 62 , 63 };
664+ static BLKSORT_ALIGN (Align ) const uint8_t ID0 [16 ] = {0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 };
665+ static BLKSORT_ALIGN (Align ) const uint8_t ID1 [16 ] = {16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 , 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 };
666+ static BLKSORT_ALIGN (Align ) const uint8_t ID2 [16 ] = {32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 , 40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 };
667+ static BLKSORT_ALIGN (Align ) const uint8_t ID3 [16 ] = {48 , 49 , 50 , 51 , 52 , 53 , 54 , 55 , 56 , 57 , 58 , 59 , 60 , 61 , 62 , 63 };
667668#ifdef BLKSORT_AVX
668669 __m128i c0 = _mm_load_si128 ((const __m128i*)ID0 );
669670 __m128i c1 = _mm_load_si128 ((const __m128i*)ID1 );
@@ -727,7 +728,7 @@ void BlkSort::mtf_encode(uint32_t size, uint8_t* BLKSORT_RESTRICT data)
727728 std::chrono::high_resolution_clock::time_point start, end;
728729 start = std::chrono::high_resolution_clock::now ();
729730#endif
730- BLKSORT_ALIGN (16 ) uint8_t table[256 ];
731+ BLKSORT_ALIGN (Align ) uint8_t table[256 ];
731732 mtf_init (table);
732733#if BLOCKSORT_PERF
733734 end = std::chrono::high_resolution_clock::now ();
@@ -761,7 +762,7 @@ void BlkSort::mtf_decode(uint32_t size, uint8_t* BLKSORT_RESTRICT data)
761762 std::chrono::high_resolution_clock::time_point start, end;
762763 start = std::chrono::high_resolution_clock::now ();
763764#endif
764- BLKSORT_ALIGN (16 ) uint8_t table[256 ];
765+ BLKSORT_ALIGN (Align ) uint8_t table[256 ];
765766 mtf_init (table);
766767#if BLOCKSORT_PERF
767768 end = std::chrono::high_resolution_clock::now ();
0 commit comments