@@ -140,3 +140,43 @@ def rnd_key():
140140 parts = cf_expand (chunker .chunkify (bio ))
141141 reconstructed = b"" .join (parts )
142142 assert reconstructed == data
143+
144+
@pytest.mark.parametrize("do_encrypt", (False, True))
def test_buzhash64_dedup_shifted(do_encrypt):
    """Content-defined chunking must be shift-resistant: prepending a few
    bytes to a file may change only a handful of chunks, so identical and
    almost-identical files deduplicate well."""
    # target chunk size 2^14 = 16 kiB, clipped to [2^10, 2^16] = [1 kiB, 64 kiB]
    exp_min, exp_max, hash_mask = 10, 16, 14
    seed_key = b"0123456789ABCDEF" * 2
    chunker = ChunkerBuzHash64(seed_key, exp_min, exp_max, hash_mask, 4095, do_encrypt=do_encrypt)
    base = os.urandom(4000000)

    def digest_chunks(payload):
        # chunk the payload and return (per-chunk sha256 digests, total byte count)
        digests, total = [], 0
        with BytesIO(payload) as stream:
            for piece in chunker.chunkify(stream):
                digests.append(sha256(piece.data).digest())
                total += len(piece.data)
        return digests, total

    # phase 1: two identical files -> chunking must be exactly the same
    digests_a, size_a = digest_chunks(base)
    digests_b, size_b = digest_chunks(base)
    assert size_a == len(base)
    assert size_b == len(base)
    assert digests_a == digests_b

    # phase 2: same file with a short prefix inserted -> almost the same chunking
    shifted = b"inserted" + base
    digests_a, size_a = digest_chunks(base)
    digests_b, size_b = digest_chunks(shifted)
    assert size_a == len(base)
    assert size_b == len(shifted)
    # enough chunks overall for the dedup ratio to be meaningful
    assert len(digests_a) > 100
    assert len(digests_b) > 100
    # only a few chunks per file are unique; the rest are shared duplicates
    assert len(set(digests_a) - set(digests_b)) <= 2
    assert len(set(digests_b) - set(digests_a)) <= 2
0 commit comments