|
1 | 1 | build: |
2 | 2 | mvn clean package |
3 | 3 |
|
4 | | -cdxj: build ensure_jwarc |
| 4 | +cdxj: build jwarc.jar |
5 | 5 | @echo "creating *.cdxj index files from the local warcs" |
6 | | - java -jar jwarc.jar cdxj data/whirlwind.warc.gz > whirlwind.warc.cdxj |
7 | | - mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wet.gz --records conversion" > whirlwind.warc.wet.cdxj |
8 | | - mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wat.gz --records metadata" > whirlwind.warc.wat.cdxj |
| 6 | + java -jar jwarc.jar cdxj data/whirlwind.warc.gz > data/whirlwind.warc.cdxj |
| 7 | + mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wet.gz --records conversion" > data/whirlwind.warc.wet.cdxj |
| 8 | + mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wat.gz --records metadata" > data/whirlwind.warc.wat.cdxj |
9 | 9 |
|
10 | | -extract: |
| 10 | +extract: jwarc.jar |
11 | 11 | @echo "creating extraction.* from local warcs, the offset numbers are from the cdxj index" |
12 | | - java -jar jwarc.jar extract --payload data/whirlwind.warc.gz 1023 > extraction.html |
13 | | - java -jar jwarc.jar extract --payload data/whirlwind.warc.wet.gz 466 > extraction.txt |
14 | | - java -jar jwarc.jar extract --payload data/whirlwind.warc.wat.gz 443 > extraction.json |
15 | | - @echo "hint: python -m json.tool extraction.json" |
| 12 | + java -jar jwarc.jar extract --payload data/whirlwind.warc.gz 1023 > data/extraction.html |
| 13 | + java -jar jwarc.jar extract --payload data/whirlwind.warc.wet.gz 466 > data/extraction.txt |
| 14 | + java -jar jwarc.jar extract --payload data/whirlwind.warc.wat.gz 443 > data/extraction.json |
| 15 | + @echo "hint: python -m json.tool data/extraction.json" |
| 16 | + |
| 17 | +cdx_toolkit: jwarc.jar |
| 18 | + @echo demonstrate that we have this entry in the index |
| 19 | + curl 'https://index.commoncrawl.org/CC-MAIN-2024-22-index?url=an.wikipedia.org/wiki/Escopete&output=json&from=20240518015810&to=20240518015810' |
| 20 | + @echo |
| 21 | + @echo cleanup previous work |
| 22 | + rm -f TEST-000000.extracted.warc.gz |
| 23 | + @echo retrieve the content from the commoncrawl data server |
| 24 | + curl --request GET --url 'https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-22/segments/1715971057216.39/warc/CC-MAIN-20240517233122-20240518023122-00000.warc.gz' --header 'Range: bytes=80610731-80628153' > TEST-000000.extracted.warc.gz |
| 25 | + @echo |
| 26 | + @echo index this new warc |
| 27 | + java -jar jwarc.jar cdxj TEST-000000.extracted.warc.gz > TEST-000000.extracted.warc.cdxj |
| 28 | + cat TEST-000000.extracted.warc.cdxj |
| 29 | + @echo |
| 30 | + @echo iterate this new warc |
| 31 | + java -jar jwarc.jar ls TEST-000000.extracted.warc.gz |
| 32 | + @echo |
16 | 33 |
|
17 | | -# cdx_toolkit: |
18 | | -# @echo demonstrate that we have this entry in the index |
19 | | -# cdxt --crawl CC-MAIN-2024-22 --from 20240518015810 --to 20240518015810 iter an.wikipedia.org/wiki/Escopete |
20 | | -# @echo |
21 | | -# @echo cleanup previous work |
22 | | -# rm -f TEST-000000.extracted.warc.gz |
23 | | -# @echo retrieve the content from the commoncrawl s3 bucket |
24 | | -# cdxt --crawl CC-MAIN-2024-22 --from 20240518015810 --to 20240518015810 warc an.wikipedia.org/wiki/Escopete |
25 | | -# @echo |
26 | | -# @echo index this new warc |
27 | | -# cdxj-indexer TEST-000000.extracted.warc.gz > TEST-000000.extracted.warc.cdxj |
28 | | -# cat TEST-000000.extracted.warc.cdxj |
29 | | -# @echo |
30 | | -# @echo iterate this new warc |
31 | | -# python ./warcio-iterator.py TEST-000000.extracted.warc.gz |
32 | | -# @echo |
33 | | -# |
34 | 34 | download_collinfo: |
35 | 35 | @echo "downloading collinfo.json so we can find out the crawl name" |
36 | | - curl -O https://index.commoncrawl.org/collinfo.json |
| 36 | + curl -o data/collinfo.json https://index.commoncrawl.org/collinfo.json |
37 | 37 |
|
38 | 38 | CC-MAIN-2024-22.warc.paths.gz: |
39 | 39 | @echo "downloading the list from s3, requires s3 auth even though it is free" |
40 | 40 | @echo "note that this file should be in the repo" |
41 | | - aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk '{print $$4}' | gzip -9 > CC-MAIN-2024-22.warc.paths.gz |
| 41 | + aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk '{print $$4}' | gzip -9 > data/CC-MAIN-2024-22.warc.paths.gz |
42 | 42 |
|
43 | 43 | duck_ccf_local_files: build |
44 | | - @echo "warning! only works on Common Crawl Foundadtion's development machine" |
| 44 | + @echo "warning! only works on Common Crawl Foundation's development machine" |
45 | 45 | mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.Duck -Dexec.args="ccf_local_files" |
46 | 46 |
|
47 | 47 | duck_cloudfront: build |
48 | 48 | @echo "warning! this might take 1-10 minutes" |
49 | | - mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.Duck -Dexec.args"cloudfront" |
50 | | - |
| 49 | + mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.Duck -Dexec.args="cloudfront" |
51 | 50 |
|
52 | | -ensure_jwarc: |
53 | | - @echo "Ensuring JWarc JAR is present" |
54 | | - @if [ ! -f jwarc.jar ] ; then \ |
55 | | - echo "jwarc.jar not found, downloading..." ; \ |
56 | | - curl -fL -o jwarc.jar https://github.com/iipc/jwarc/releases/download/v0.33.0/jwarc-0.33.0.jar ; \ |
57 | | - else \ |
58 | | - echo "jwarc.jar found." ; \ |
59 | | - fi |
60 | | - |
61 | | -get_jwarc: |
| 51 | +jwarc.jar: |
62 | 52 | @echo "downloading JWarc JAR" |
63 | 53 | curl -fL -o jwarc.jar https://github.com/iipc/jwarc/releases/download/v0.33.0/jwarc-0.33.0.jar |
64 | 54 |
|
65 | | -wreck_the_warc: build ensure_jwarc |
| 55 | +wreck_the_warc: build jwarc.jar |
66 | 56 | @echo |
67 | 57 | @echo we will break and then fix this warc |
68 | 58 | cp data/whirlwind.warc.gz data/testing.warc.gz |
69 | 59 | rm -f data/testing.warc |
70 | 60 | gzip -d data/testing.warc.gz # windows gunzip no work-a |
71 | 61 | @echo |
72 | | - @echo iterate over this uncompressed warc: works |
73 | | - mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc" |
74 | | - @echo |
75 | 62 | @echo compress it the wrong way |
76 | 63 | gzip data/testing.warc |
77 | 64 | @echo |
78 | | - @echo iterating over this compressed warc fails |
79 | | - mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc.gz" || /usr/bin/true |
| 65 | + @echo showing the records in the compressed warc - note the offsets of request and response |
| 66 | + java -jar jwarc.jar ls data/testing.warc.gz |
| 67 | + @echo |
| 68 | + @echo access the request record - failing |
| 69 | + java -jar jwarc.jar extract data/testing.warc.gz 3734 || /usr/bin/true |
| 70 | + @echo |
| 71 | + @echo access the response record - failing |
| 72 | + java -jar jwarc.jar extract data/testing.warc.gz 3734 || /usr/bin/true |
80 | 73 | @echo |
81 | 74 | @echo "now let's do it the right way" |
82 | 75 | gzip -d data/testing.warc.gz |
83 | 76 | mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.RecompressWARC -Dexec.args="data/testing.warc data/testing.warc.gz" |
84 | 77 | @echo |
85 | | - @echo and now iterating works |
86 | | - mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc.gz" |
| 78 | + @echo showing the records in the compressed warc - note the skewed offsets of request and response |
| 79 | + java -jar jwarc.jar ls data/testing.warc.gz |
| 80 | + @echo |
| 81 | + @echo access the request record - works |
| 82 | + java -jar jwarc.jar extract data/testing.warc.gz 518 | head |
| 83 | + @echo |
| 84 | + @echo access the response record - works |
| 85 | + java -jar jwarc.jar extract data/testing.warc.gz 1027 | head -n 20 |
87 | 86 | @echo |
0 commit comments