Skip to content

Commit 10dd303

Browse files
committed
Merge branch 'main' into luca/feature/part4
# Conflicts: # Makefile # README.md
2 parents 893788e + 34c6c87 commit 10dd303

File tree

4 files changed

+455
-186
lines changed

4 files changed

+455
-186
lines changed

.editorconfig

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,10 @@ root = true
66
end_of_line = lf
77
insert_final_newline = true
88

9-
# LF: not sure about this
10-
# [*.java]
11-
# charset = utf-8
12-
# indent_style = space
13-
# indent_size = 4
9+
[*.java]
10+
charset = utf-8
11+
indent_style = space
12+
indent_size = 4
1413

1514
[Makefile]
1615
indent_style = tab

Makefile

Lines changed: 48 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,87 +1,86 @@
11
build:
22
mvn clean package
33

4-
cdxj: build ensure_jwarc
4+
cdxj: build jwarc.jar
55
@echo "creating *.cdxj index files from the local warcs"
6-
java -jar jwarc.jar cdxj data/whirlwind.warc.gz > whirlwind.warc.cdxj
7-
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wet.gz --records conversion" > whirlwind.warc.wet.cdxj
8-
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wat.gz --records metadata" > whirlwind.warc.wat.cdxj
6+
java -jar jwarc.jar cdxj data/whirlwind.warc.gz > data/whirlwind.warc.cdxj
7+
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wet.gz --records conversion" > data/whirlwind.warc.wet.cdxj
8+
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.CdxjIndexer -Dexec.args="data/whirlwind.warc.wat.gz --records metadata" > data/whirlwind.warc.wat.cdxj
99

10-
extract:
10+
extract: jwarc.jar
1111
@echo "creating extraction.* from local warcs, the offset numbers are from the cdxj index"
12-
java -jar jwarc.jar extract --payload data/whirlwind.warc.gz 1023 > extraction.html
13-
java -jar jwarc.jar extract --payload data/whirlwind.warc.wet.gz 466 > extraction.txt
14-
java -jar jwarc.jar extract --payload data/whirlwind.warc.wat.gz 443 > extraction.json
15-
@echo "hint: python -m json.tool extraction.json"
12+
java -jar jwarc.jar extract --payload data/whirlwind.warc.gz 1023 > data/extraction.html
13+
java -jar jwarc.jar extract --payload data/whirlwind.warc.wet.gz 466 > data/extraction.txt
14+
java -jar jwarc.jar extract --payload data/whirlwind.warc.wat.gz 443 > data/extraction.json
15+
@echo "hint: python -m json.tool data/extraction.json"
16+
17+
cdx_toolkit: jwarc.jar
18+
@echo demonstrate that we have this entry in the index
19+
curl 'https://index.commoncrawl.org/CC-MAIN-2024-22-index?url=an.wikipedia.org/wiki/Escopete&output=json&from=20240518015810&to=20240518015810'
20+
@echo
21+
@echo cleanup previous work
22+
rm -f TEST-000000.extracted.warc.gz
23+
@echo retrieve the content from the commoncrawl data server
24+
curl --request GET --url 'https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-22/segments/1715971057216.39/warc/CC-MAIN-20240517233122-20240518023122-00000.warc.gz' --header 'Range: bytes=80610731-80628153' > TEST-000000.extracted.warc.gz
25+
@echo
26+
@echo index this new warc
27+
java -jar jwarc.jar cdxj TEST-000000.extracted.warc.gz > TEST-000000.extracted.warc.cdxj
28+
cat TEST-000000.extracted.warc.cdxj
29+
@echo
30+
@echo iterate this new warc
31+
java -jar jwarc.jar ls TEST-000000.extracted.warc.gz
32+
@echo
1633

17-
# cdx_toolkit:
18-
# @echo demonstrate that we have this entry in the index
19-
# cdxt --crawl CC-MAIN-2024-22 --from 20240518015810 --to 20240518015810 iter an.wikipedia.org/wiki/Escopete
20-
# @echo
21-
# @echo cleanup previous work
22-
# rm -f TEST-000000.extracted.warc.gz
23-
# @echo retrieve the content from the commoncrawl s3 bucket
24-
# cdxt --crawl CC-MAIN-2024-22 --from 20240518015810 --to 20240518015810 warc an.wikipedia.org/wiki/Escopete
25-
# @echo
26-
# @echo index this new warc
27-
# cdxj-indexer TEST-000000.extracted.warc.gz > TEST-000000.extracted.warc.cdxj
28-
# cat TEST-000000.extracted.warc.cdxj
29-
# @echo
30-
# @echo iterate this new warc
31-
# python ./warcio-iterator.py TEST-000000.extracted.warc.gz
32-
# @echo
33-
#
3434
download_collinfo:
3535
@echo "downloading collinfo.json so we can find out the crawl name"
36-
curl -O https://index.commoncrawl.org/collinfo.json
36+
curl -o data/collinfo.json https://index.commoncrawl.org/collinfo.json
3737

3838
CC-MAIN-2024-22.warc.paths.gz:
3939
@echo "downloading the list from s3, requires s3 auth even though it is free"
4040
@echo "note that this file should be in the repo"
41-
aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk '{print $$4}' | gzip -9 > CC-MAIN-2024-22.warc.paths.gz
41+
aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk '{print $$4}' | gzip -9 > data/CC-MAIN-2024-22.warc.paths.gz
4242

4343
duck_ccf_local_files: build
44-
@echo "warning! only works on Common Crawl Foundadtion's development machine"
44+
@echo "warning! only works on Common Crawl Foundation's development machine"
4545
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.Duck -Dexec.args="ccf_local_files"
4646

4747
duck_cloudfront: build
4848
@echo "warning! this might take 1-10 minutes"
49-
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.Duck -Dexec.args"cloudfront"
50-
49+
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.Duck -Dexec.args="cloudfront"
5150

52-
ensure_jwarc:
53-
@echo "Ensuring JWarc JAR is present"
54-
@if [ ! -f jwarc.jar ] ; then \
55-
echo "jwarc.jar not found, downloading..." ; \
56-
curl -fL -o jwarc.jar https://github.com/iipc/jwarc/releases/download/v0.33.0/jwarc-0.33.0.jar ; \
57-
else \
58-
echo "jwarc.jar found." ; \
59-
fi
60-
61-
get_jwarc:
51+
jwarc.jar:
6252
@echo "downloading JWarc JAR"
6353
curl -fL -o jwarc.jar https://github.com/iipc/jwarc/releases/download/v0.33.0/jwarc-0.33.0.jar
6454

65-
wreck_the_warc: build ensure_jwarc
55+
wreck_the_warc: build jwarc.jar
6656
@echo
6757
@echo we will break and then fix this warc
6858
cp data/whirlwind.warc.gz data/testing.warc.gz
6959
rm -f data/testing.warc
7060
gzip -d data/testing.warc.gz # use "gzip -d" because gunzip does not work on Windows
7161
@echo
72-
@echo iterate over this uncompressed warc: works
73-
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc"
74-
@echo
7562
@echo compress it the wrong way
7663
gzip data/testing.warc
7764
@echo
78-
@echo iterating over this compressed warc fails
79-
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc.gz" || /usr/bin/true
65+
@echo showing the records in the compressed warc - note the offsets of request and response
66+
java -jar jwarc.jar ls data/testing.warc.gz
67+
@echo
68+
@echo access the request record - failing
69+
java -jar jwarc.jar extract data/testing.warc.gz 3734 || /usr/bin/true
70+
@echo
71+
@echo access the response record - failing
72+
java -jar jwarc.jar extract data/testing.warc.gz 3734 || /usr/bin/true
8073
@echo
8174
@echo "now let's do it the right way"
8275
gzip -d data/testing.warc.gz
8376
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.RecompressWARC -Dexec.args="data/testing.warc data/testing.warc.gz"
8477
@echo
85-
@echo and now iterating works
86-
mvn -q exec:java -Dexec.mainClass=org.commoncrawl.whirlwind.ReadWARC -Dexec.args="data/testing.warc.gz"
78+
@echo showing the records in the compressed warc - note the skewed offsets of request and response
79+
java -jar jwarc.jar ls data/testing.warc.gz
80+
@echo
81+
@echo access the request record - works
82+
java -jar jwarc.jar extract data/testing.warc.gz 518 | head
83+
@echo
84+
@echo access the response record - works
85+
java -jar jwarc.jar extract data/testing.warc.gz 1027 | head -n 20
8786
@echo

0 commit comments

Comments
 (0)