2323import java .io .InputStream ;
2424import java .nio .file .Files ;
2525import java .nio .file .Path ;
26- import java .util .concurrent .atomic .AtomicInteger ;
2726
2827public class ValidateWARC {
28+
29+ private static final int READ_BUFFER_SIZE = 64 * 1024 ;
30+
31+ private static class StopReadingException extends IOException {
32+ }
33+
2934 public static void main (String [] args ) throws Exception {
3035 if (args .length != 1 ) {
3136 System .err .println ("Usage: java ValidateWARC <file.gz>" );
@@ -38,41 +43,62 @@ public static void main(String[] args) throws Exception {
3843 }
3944
4045 int n = getWarcCompressionInformation (requested );
41- if (n <= 1 ) {
42- System .out .println ("Single-member gzip (likely whole-file gzip). members=" + n );
46+ if (n == 0 ) {
47+ System .out .println ("No gzip members found (empty or not a gzip file). members=0" );
48+ } else if (n == 1 ) {
49+ System .out .println ("Single-member gzip (likely whole-file gzip). members=1" );
4350 } else {
4451 System .out .println ("Concatenated multi-member gzip (record-compressed). members=" + n );
4552 }
4653
4754 }
4855
49- public static int getWarcCompressionInformation (Path inputWarc ) throws IOException {
50- final AtomicInteger memberCount = new AtomicInteger (0 );
56+ /**
57+ * Counts gzip members in the file. If stopAfter > 0, stops early once that many
58+ * members have been found (avoids decompressing the entire file). Pass
59+ * stopAfter = 0 to count all members.
60+ */
61+ private static int countGzipMembers (Path inputWarc , int stopAfter ) throws IOException {
62+ int [] memberCount = { 0 };
5163
5264 try (InputStream fis = Files .newInputStream (inputWarc );
5365 BufferedInputStream bis = new BufferedInputStream (fis );
5466 GzipCompressorInputStream gz = GzipCompressorInputStream .builder ().setDecompressConcatenated (true )
55- .setOnMemberEnd (x -> memberCount .incrementAndGet ()).setInputStream (bis ).get ()) {
67+ .setOnMemberEnd (x -> {
68+ memberCount [0 ]++;
69+ if (stopAfter > 0 && memberCount [0 ] >= stopAfter ) {
70+ throw new StopReadingException ();
71+ }
72+ }).setInputStream (bis ).get ()) {
5673
57- byte [] buf = new byte [64 * 1024 ];
74+ byte [] buf = new byte [READ_BUFFER_SIZE ];
5875 while (gz .read (buf ) != -1 ) {
59- // Read the entire stream to trigger member processing
60- // We might not need to read the whole stream, just enough to get an idea
76+ // drain the stream to trigger member callbacks
6177 }
62- } catch (IOException e ) {
63- throw new IllegalArgumentException ( "The file is either not a gzip file or is corrupted." , e );
78+ } catch (StopReadingException e ) {
79+ // early exit — threshold reached
6480 }
6581
66- return memberCount . get () ;
82+ return memberCount [ 0 ] ;
6783 }
6884
69- public static void validateRandomAccessWarcOrFail (Path inputWarc ) throws IOException {
70- int n = getWarcCompressionInformation (inputWarc );
85+ public static int getWarcCompressionInformation (Path inputWarc ) throws IOException {
86+ return countGzipMembers (inputWarc , 0 );
87+ }
7188
72- if (n <= 1 ) {
73- throw new IOException (
74- "Non-chunked gzip file detected, gzip block continues\n " + " beyond single record. " + n );
75- }
89+ /**
90+ * Fast check: returns true if the gzip file contains more than one member.
91+ * Stops reading as soon as the second member is detected, avoiding full
92+ * decompression of large files.
93+ */
94+ public static boolean isMultiMemberGzip (Path inputWarc ) throws IOException {
95+ return countGzipMembers (inputWarc , 2 ) >= 2 ;
96+ }
7697
98+ public static void validateRandomAccessWarcOrFail (Path inputWarc ) throws IOException {
99+ if (!isMultiMemberGzip (inputWarc )) {
100+ throw new IOException ("Non-chunked gzip file detected, gzip block continues\n "
101+ + " beyond single record. File must be record-compressed (multi-member gzip)." );
102+ }
77103 }
78104}
0 commit comments