Skip to content

Commit 170fef1

Browse files
Add SSE41 path for streamvbyte_compressedbytes. (#57)
* Move compressedbytes to _encode.c. In preparation of SIMD implementations. * Split compressedbytes scalar path * Make data_bytes scalar path branchless * Move x64 control byte calculation into a helper This code path will be shared with compressedbytes calculation later. * Add SSE41 path for streamvbyte_compressedbytes
1 parent f759071 commit 170fef1

File tree

3 files changed

+86
-39
lines changed

3 files changed

+86
-39
lines changed

include/streamvbyte.h

Lines changed: 2 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -43,43 +43,15 @@ static inline size_t streamvbyte_max_compressedbytes(const uint32_t length) {
4343
// Our decoding functions may read (but not use) STREAMVBYTE_PADDING extra bytes beyond
4444
// the compressed data: the user needs to ensure that this region is allocated, and it
4545
// is not included by streamvbyte_compressedbytes.
46-
// Returns the exact number of compressed bytes for the `length` integers in
// `in`: the control bytes plus 1 to 4 data bytes per value. The
// STREAMVBYTE_PADDING region is not included.
static inline size_t streamvbyte_compressedbytes(const uint32_t* in, uint32_t length) {
  // number of control bytes: one per group of four values, rounded up.
  // Written without `length + 3`, which can wrap around in 32-bit unsigned
  // arithmetic for lengths close to UINT32_MAX and under-count.
  size_t cb = (size_t)(length >> 2) + ((length & 3) != 0);
  // number of data bytes:
  size_t db = 0;
  for (uint32_t c = 0; c < length; c++) {
    uint32_t val = in[c];

    if (val < (1 << 8)) {
      db += 1;
    } else if (val < (1 << 16)) {
      db += 2;
    } else if (val < (1 << 24)) {
      db += 3;
    } else {
      db += 4;
    }
  }
  return cb + db;
}
46+
size_t streamvbyte_compressedbytes(const uint32_t* in, uint32_t length);
6147

6248
// return the exact number of compressed bytes given length input integers
6349
// runtime in O(n) wrt. in; use streamvbyte_max_compressedbytes if you
6450
// care about speed more than potentially over-allocating memory
6551
// Our decoding functions may read (but not use) STREAMVBYTE_PADDING extra bytes beyond
6652
// the compressed data: the user needs to ensure that this region is allocated, and it
6753
// is not included by streamvbyte_compressedbytes.
68-
// Returns the exact number of compressed bytes for the `length` integers in
// `in` when each value takes 0, 1, 2 or 4 data bytes: the control bytes plus
// the data bytes. The STREAMVBYTE_PADDING region is not included.
static inline size_t streamvbyte_compressedbytes_0124(const uint32_t* in, uint32_t length) {
  // number of control bytes: one per group of four values, rounded up.
  // Written without `length + 3`, which can wrap around in 32-bit unsigned
  // arithmetic for lengths close to UINT32_MAX and under-count.
  size_t cb = (size_t)(length >> 2) + ((length & 3) != 0);
  // number of data bytes:
  size_t db = 0;
  for (uint32_t c = 0; c < length; c++) {
    uint32_t val = in[c];

    if (val == 0) {
      db += 0;  // zero is stored with no data bytes at all
    } else if (val < (1 << 8)) {
      db += 1;
    } else if (val < (1 << 16)) {
      db += 2;
    } else {
      db += 4;  // there is no 3-byte form in this variant
    }
  }
  return cb + db;
}
54+
size_t streamvbyte_compressedbytes_0124(const uint32_t* in, uint32_t length);
8355

8456
// Read "length" 32-bit integers in varint format from in, storing the result in out.
8557
// Returns the number of bytes read. We may read up to STREAMVBYTE_PADDING extra bytes

src/streamvbyte_encode.c

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,46 @@ static uint8_t *svb_encode_scalar(const uint32_t *in,
6363
#include "streamvbyte_arm_encode.c"
6464
#endif
6565

66+
// Sums the data bytes (1 to 4 per value) needed for the first `length`
// integers of `in`. Control bytes are not counted here.
static size_t svb_data_bytes_scalar(const uint32_t* in, uint32_t length) {
  size_t total = 0;
  uint32_t idx = 0;
  while (idx < length) {
    const uint32_t v = in[idx++];
    // Branchless: start from one byte and add one more for every
    // byte boundary the value reaches.
    const uint32_t width = 1u + (v >= 0x100u) + (v >= 0x10000u) + (v >= 0x1000000u);
    total += width;
  }
  return total;
}
76+
77+
// Sums the data bytes needed for the first `length` integers of `in` when
// each value takes 0, 1, 2 or 4 bytes. Control bytes are not counted here.
static size_t svb_data_bytes_0124_scalar(const uint32_t* in, uint32_t length) {
  size_t total = 0;
  uint32_t idx = 0;
  while (idx < length) {
    const uint32_t v = in[idx++];
    // Branchless: 0 for zero, 1 below 2^8, 2 below 2^16, otherwise 4
    // (the last comparison counts double, so there is no 3-byte case).
    const uint32_t width = (v != 0u) + (v >= 0x100u) + 2u * (v >= 0x10000u);
    total += width;
  }
  return total;
}
87+
88+
// Returns the exact number of compressed bytes for the `length` integers in
// `in`: the control bytes plus the data bytes. On STREAMVBYTE_X64 builds the
// data bytes are counted by svb_data_bytes_SSE41 when streamvbyte_sse41()
// returns non-zero; otherwise the scalar helper is used.
size_t streamvbyte_compressedbytes(const uint32_t* in, uint32_t length) {
  // number of control bytes: one per group of four values, rounded up.
  // Written without `length + 3`, which can wrap around in 32-bit unsigned
  // arithmetic for lengths close to UINT32_MAX and under-count.
  size_t cb = (size_t)(length >> 2) + ((length & 3) != 0);

#ifdef STREAMVBYTE_X64
  if (streamvbyte_sse41()) {
    return cb + svb_data_bytes_SSE41(in, length);
  }
#endif
  return cb + svb_data_bytes_scalar(in, length);
}
99+
100+
// Returns the exact number of compressed bytes for the `length` integers in
// `in` in the 0/1/2/4-byte variant: the control bytes plus the data bytes
// counted by svb_data_bytes_0124_scalar.
size_t streamvbyte_compressedbytes_0124(const uint32_t* in, uint32_t length) {
  // number of control bytes: one per group of four values, rounded up.
  // Written without `length + 3`, which can wrap around in 32-bit unsigned
  // arithmetic for lengths close to UINT32_MAX and under-count.
  size_t cb = (size_t)(length >> 2) + ((length & 3) != 0);

  return cb + svb_data_bytes_0124_scalar(in, length);
}
66106

67107

68108
// Encode an array of a given length read from in to bout in streamvbyte format.

src/streamvbyte_x64_encode.c

Lines changed: 44 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,55 @@
11
#include "streamvbyte_isadetection.h"
22
#ifdef STREAMVBYTE_X64
33
// contributed by aqrit
4+
5+
static size_t svb_data_bytes_scalar(const uint32_t* in, uint32_t length);
6+
7+
STREAMVBYTE_TARGET_SSE41
8+
// Computes the control bits for eight 32-bit values held in `lo` (first
// four) and `hi` (last four). The result is a 16-bit mask taken from the
// byte sign bits of the final vector, i.e. two bits per input value.
static inline size_t svb_control_SSE41 (__m128i lo, __m128i hi) {
  const __m128i ones = _mm_set1_epi8(0x01);
  const __m128i bias = _mm_set1_epi16(0x7F00);

  // Clamp every byte to 0 or 1, so each byte only says "non-zero or not".
  const __m128i lo_flags = _mm_min_epu8(ones, lo);
  const __m128i hi_flags = _mm_min_epu8(ones, hi);

  // Narrow each 16-bit half to one byte with unsigned saturation: all eight
  // values now occupy a single vector, two bytes per value.
  __m128i packed = _mm_packus_epi16(lo_flags, hi_flags);
  packed = _mm_min_epi16(packed, ones);   // convert 0x01FF to 0x0101
  packed = _mm_adds_epu16(packed, bias);  // convert: 0x0101 to 0x8001, 0xFF01 to 0xFFFF

  // Collect the top bit of each of the 16 bytes.
  return (size_t)_mm_movemask_epi8(packed);
}
23+
STREAMVBYTE_UNTARGET_REGION
24+
25+
STREAMVBYTE_TARGET_SSE41
26+
size_t svb_data_bytes_SSE41 (const uint32_t* in, uint32_t count) {
27+
size_t dataLen = 0;
28+
29+
for (const uint32_t* end = &in[(count & ~7)]; in != end; in += 8)
30+
{
31+
__m128i r0, r1;
32+
size_t keys;
33+
34+
r0 = _mm_loadu_si128((__m128i *) &in[0]);
35+
r1 = _mm_loadu_si128((__m128i *) &in[4]);
36+
37+
keys = svb_control_SSE41(r0, r1);
38+
dataLen += len_lut[keys & 0xFF];
39+
dataLen += len_lut[keys >> 8];
40+
}
41+
42+
dataLen += svb_data_bytes_scalar(in, count & 7);
43+
return dataLen;
44+
}
45+
STREAMVBYTE_UNTARGET_REGION
46+
447
STREAMVBYTE_TARGET_SSE41
548
size_t streamvbyte_encode_SSE41 (const uint32_t* in, uint32_t count, uint8_t* out) {
649
uint32_t keyLen = (count >> 2) + (((count & 3) + 3) >> 2); // 2-bits per each rounded up to byte boundry
750
uint8_t *restrict keyPtr = &out[0];
851
uint8_t *restrict dataPtr = &out[keyLen]; // variable length data after keys
952

10-
const __m128i mask_01 = _mm_set1_epi8(0x01);
11-
const __m128i mask_7F00 = _mm_set1_epi16(0x7F00);
12-
1353
for (const uint32_t* end = &in[(count & ~7)]; in != end; in += 8)
1454
{
1555
__m128i r0, r1, r2, r3;
@@ -18,12 +58,7 @@ size_t streamvbyte_encode_SSE41 (const uint32_t* in, uint32_t count, uint8_t* ou
1858
r0 = _mm_loadu_si128((__m128i*)&in[0]);
1959
r1 = _mm_loadu_si128((__m128i*)&in[4]);
2060

21-
r2 = _mm_min_epu8(mask_01, r0);
22-
r3 = _mm_min_epu8(mask_01, r1);
23-
r2 = _mm_packus_epi16(r2, r3);
24-
r2 = _mm_min_epi16(r2, mask_01); // convert 0x01FF to 0x0101
25-
r2 = _mm_adds_epu16(r2, mask_7F00); // convert: 0x0101 to 0x8001, 0xFF01 to 0xFFFF
26-
keys = (size_t)_mm_movemask_epi8(r2);
61+
keys = svb_control_SSE41(r0, r1);
2762

2863
r2 = _mm_loadu_si128((__m128i*)&shuf_lut[(keys << 4) & 0x03F0]);
2964
r3 = _mm_loadu_si128((__m128i*)&shuf_lut[(keys >> 4) & 0x03F0]);

0 commit comments

Comments
 (0)