Skip to content

Commit cb80eea

Browse files
committed
fix: place TEST-0000... under data
1 parent e46fdd1 commit cb80eea

File tree

1 file changed

+5
-5
lines changed

1 file changed

+5
-5
lines changed

Makefile

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,16 +19,16 @@ cdx_toolkit: jwarc.jar
 	curl 'https://index.commoncrawl.org/CC-MAIN-2024-22-index?url=an.wikipedia.org/wiki/Escopete&output=json&from=20240518015810&to=20240518015810'
 	@echo
 	@echo cleanup previous work
-	rm -f TEST-000000.extracted.warc.gz
+	rm -f data/TEST-000000.extracted.warc.gz
 	@echo retrieve the content from the commoncrawl data server
-	curl --request GET --url 'https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-22/segments/1715971057216.39/warc/CC-MAIN-20240517233122-20240518023122-00000.warc.gz' --header 'Range: bytes=80610731-80628153' > TEST-000000.extracted.warc.gz
+	curl --request GET --url 'https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-22/segments/1715971057216.39/warc/CC-MAIN-20240517233122-20240518023122-00000.warc.gz' --header 'Range: bytes=80610731-80628153' > data/TEST-000000.extracted.warc.gz
 	@echo
 	@echo index this new warc
-	java -jar jwarc.jar cdxj TEST-000000.extracted.warc.gz > TEST-000000.extracted.warc.cdxj
-	cat TEST-000000.extracted.warc.cdxj
+	java -jar jwarc.jar cdxj data/TEST-000000.extracted.warc.gz > data/TEST-000000.extracted.warc.cdxj
+	cat data/TEST-000000.extracted.warc.cdxj
 	@echo
 	@echo iterate this new warc
-	java -jar jwarc.jar ls TEST-000000.extracted.warc.gz
+	java -jar jwarc.jar ls data/TEST-000000.extracted.warc.gz
 	@echo
 
 download_collinfo:

0 commit comments

Comments
 (0)