File tree Expand file tree Collapse file tree 1 file changed +5
-5
lines changed
Expand file tree Collapse file tree 1 file changed +5
-5
lines changed Original file line number Diff line number Diff line change @@ -19,16 +19,16 @@ cdx_toolkit: jwarc.jar
1919 curl 'https://index.commoncrawl.org/CC-MAIN-2024-22-index?url=an.wikipedia.org/wiki/Escopete&output=json&from=20240518015810&to=20240518015810'
2020 @echo
2121 @echo cleanup previous work
22- rm -f TEST-000000.extracted.warc.gz
22+ rm -f data/TEST-000000.extracted.warc.gz
2323 @echo retrieve the content from the commoncrawl data server
24- curl --request GET --url 'https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-22/segments/1715971057216.39/warc/CC-MAIN-20240517233122-20240518023122-00000.warc.gz' --header 'Range: bytes=80610731-80628153' > TEST-000000.extracted.warc.gz
24+ curl --request GET --url 'https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-22/segments/1715971057216.39/warc/CC-MAIN-20240517233122-20240518023122-00000.warc.gz' --header 'Range: bytes=80610731-80628153' > data/TEST-000000.extracted.warc.gz
2525 @echo
2626 @echo index this new warc
27- java -jar jwarc.jar cdxj TEST-000000.extracted.warc.gz > TEST-000000.extracted.warc.cdxj
28- cat TEST-000000.extracted.warc.cdxj
27+ java -jar jwarc.jar cdxj data/TEST-000000.extracted.warc.gz > data/TEST-000000.extracted.warc.cdxj
28+ cat data/TEST-000000.extracted.warc.cdxj
2929 @echo
3030 @echo iterate this new warc
31- java -jar jwarc.jar ls TEST-000000.extracted.warc.gz
31+ java -jar jwarc.jar ls data/TEST-000000.extracted.warc.gz
3232 @echo
3333
3434download_collinfo :
You can’t perform that action at this time.
0 commit comments