Skip to content

Commit caa9fdc

Browse files
tests: add deduplication tests for buzhash64(e)
These tests detect regressions in deduplication, in particular when the encrypted buzhash64 mode is used.
1 parent fbfec8e commit caa9fdc

File tree

1 file changed

+40
-0
lines changed

1 file changed

+40
-0
lines changed

src/borg/testsuite/chunkers/buzhash64_test.py

Lines changed: 40 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -140,3 +140,43 @@ def rnd_key():
140140
parts = cf_expand(chunker.chunkify(bio))
141141
reconstructed = b"".join(parts)
142142
assert reconstructed == data
143+
144+
145+
@pytest.mark.parametrize("do_encrypt", (False, True))
def test_buzhash64_dedup_shifted(do_encrypt):
    """Check that buzhash64 chunking deduplicates identical and shifted inputs.

    The same chunker instance cuts random data twice: first two identical
    inputs (the chunk lists must match exactly), then the original input
    versus a copy with a few bytes inserted at the front (nearly all chunks
    must still be shared). Runs with and without ``do_encrypt``.
    """
    # chunk size target 16kiB, clip at 1kiB and 64kiB
    min_exp, max_exp, mask = 10, 16, 14
    key = b"0123456789ABCDEF" * 2
    # NOTE(review): 4095 is presumably the hash window size -- confirm against
    # the ChunkerBuzHash64 signature.
    chunker = ChunkerBuzHash64(key, min_exp, max_exp, mask, 4095, do_encrypt=do_encrypt)
    rdata = os.urandom(4000000)

    def chunkit(data):
        """Chunk *data*; return (list of sha256 digests, total bytes seen)."""
        with BytesIO(data) as stream:
            # Hash each chunk while iterating, so nothing is held across
            # iterations of the chunker.
            summary = [(sha256(piece.data).digest(), len(piece.data)) for piece in chunker.chunkify(stream)]
        digests = [digest for digest, _ in summary]
        total = sum(length for _, length in summary)
        return digests, total

    # --- case 1: two identical files must be chunked exactly the same way ---
    original = duplicate = rdata
    digests_a, total_a = chunkit(original)
    digests_b, total_b = chunkit(duplicate)
    assert total_a == len(original)
    assert total_b == len(duplicate)
    assert digests_a == digests_b

    # --- case 2: two almost identical files (a short prefix was inserted) ---
    original = rdata
    shifted = b"inserted" + rdata
    digests_a, total_a = chunkit(original)
    digests_b, total_b = chunkit(shifted)
    assert total_a == len(original)
    assert total_b == len(shifted)
    # there must be many chunks overall ...
    assert len(digests_a) > 100
    assert len(digests_b) > 100
    # ... but only a few of them are unique to either file, the rest are duplicates
    only_in_a = set(digests_a).difference(digests_b)
    only_in_b = set(digests_b).difference(digests_a)
    assert len(only_in_a) <= 2
    assert len(only_in_b) <= 2

0 commit comments

Comments
 (0)