Skip to content

Commit 647a1f6

Browse files
committed
feat(validation): Enhance WARC validation for gzip files with multi-member support
1 parent 7318a0f commit 647a1f6

File tree

1 file changed

+44
-18
lines changed

1 file changed

+44
-18
lines changed

src/main/java/org/commoncrawl/whirlwind/ValidateWARC.java

Lines changed: 44 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,14 @@
2323
import java.io.InputStream;
2424
import java.nio.file.Files;
2525
import java.nio.file.Path;
26-
import java.util.concurrent.atomic.AtomicInteger;
2726

2827
public class ValidateWARC {
28+
29+
private static final int READ_BUFFER_SIZE = 64 * 1024;
30+
31+
private static class StopReadingException extends IOException {
32+
}
33+
2934
public static void main(String[] args) throws Exception {
3035
if (args.length != 1) {
3136
System.err.println("Usage: java ValidateWARC <file.gz>");
@@ -38,41 +43,62 @@ public static void main(String[] args) throws Exception {
3843
}
3944

4045
int n = getWarcCompressionInformation(requested);
41-
if (n <= 1) {
42-
System.out.println("Single-member gzip (likely whole-file gzip). members=" + n);
46+
if (n == 0) {
47+
System.out.println("No gzip members found (empty or not a gzip file). members=0");
48+
} else if (n == 1) {
49+
System.out.println("Single-member gzip (likely whole-file gzip). members=1");
4350
} else {
4451
System.out.println("Concatenated multi-member gzip (record-compressed). members=" + n);
4552
}
4653

4754
}
4855

49-
public static int getWarcCompressionInformation(Path inputWarc) throws IOException {
50-
final AtomicInteger memberCount = new AtomicInteger(0);
56+
/**
57+
* Counts gzip members in the file. If stopAfter > 0, stops early once that many
58+
* members have been found (avoids decompressing the entire file). Pass
59+
* stopAfter = 0 to count all members.
60+
*/
61+
private static int countGzipMembers(Path inputWarc, int stopAfter) throws IOException {
62+
int[] memberCount = { 0 };
5163

5264
try (InputStream fis = Files.newInputStream(inputWarc);
5365
BufferedInputStream bis = new BufferedInputStream(fis);
5466
GzipCompressorInputStream gz = GzipCompressorInputStream.builder().setDecompressConcatenated(true)
55-
.setOnMemberEnd(x -> memberCount.incrementAndGet()).setInputStream(bis).get()) {
67+
.setOnMemberEnd(x -> {
68+
memberCount[0]++;
69+
if (stopAfter > 0 && memberCount[0] >= stopAfter) {
70+
throw new StopReadingException();
71+
}
72+
}).setInputStream(bis).get()) {
5673

57-
byte[] buf = new byte[64 * 1024];
74+
byte[] buf = new byte[READ_BUFFER_SIZE];
5875
while (gz.read(buf) != -1) {
59-
// Read the entire stream to trigger member processing
60-
// We might not need to read the whole stream, just enough to get an idea
76+
// drain the stream to trigger member callbacks
6177
}
62-
} catch (IOException e) {
63-
throw new IllegalArgumentException("The file is either not a gzip file or is corrupted.", e);
78+
} catch (StopReadingException e) {
79+
// early exit — threshold reached
6480
}
6581

66-
return memberCount.get();
82+
return memberCount[0];
6783
}
6884

69-
public static void validateRandomAccessWarcOrFail(Path inputWarc) throws IOException {
70-
int n = getWarcCompressionInformation(inputWarc);
85+
public static int getWarcCompressionInformation(Path inputWarc) throws IOException {
86+
return countGzipMembers(inputWarc, 0);
87+
}
7188

72-
if (n <= 1) {
73-
throw new IOException(
74-
"Non-chunked gzip file detected, gzip block continues\n" + " beyond single record. " + n);
75-
}
89+
/**
90+
* Fast check: returns true if the gzip file contains more than one member.
91+
* Stops reading once the end of the second member is reached, avoiding full
92+
* decompression of large multi-member files (a single-member file is still read in full).
93+
*/
94+
public static boolean isMultiMemberGzip(Path inputWarc) throws IOException {
95+
return countGzipMembers(inputWarc, 2) >= 2;
96+
}
7697

98+
public static void validateRandomAccessWarcOrFail(Path inputWarc) throws IOException {
99+
if (!isMultiMemberGzip(inputWarc)) {
100+
throw new IOException("Non-chunked gzip file detected, gzip block continues\n"
101+
+ " beyond single record. File must be record-compressed (multi-member gzip).");
102+
}
77103
}
78104
}

0 commit comments

Comments
 (0)