-
Notifications
You must be signed in to change notification settings - Fork 94
Expand file tree
/
Copy pathhtml_tag_count.py
More file actions
31 lines (22 loc) · 857 Bytes
/
html_tag_count.py
File metadata and controls
31 lines (22 loc) · 857 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import re
from collections import Counter
from sparkcc import CCSparkJob
class TagCountJob(CCSparkJob):
    """Count HTML tag names in Common Crawl WARC files.

    For each record that passes the response/HTML checks inherited from
    ``CCSparkJob``, the payload is scanned with ``html_tag_pattern`` and
    ``(tag_name, count)`` pairs are emitted, where ``tag_name`` is the
    lower-cased element name.

    The scan is a plain regular-expression match on the raw bytes, not an
    HTML parse: every ``<name`` sequence in the payload is counted.
    """

    name = "TagCount"

    # Match HTML tags (element names) on binary HTML data.
    # HTML tag names are case-insensitive, so ``<DIV>`` and ``<Div>`` must be
    # matched just like ``<div>``. On a bytes pattern IGNORECASE only folds
    # ASCII letters, which is all this character class contains.
    html_tag_pattern = re.compile(b'<([a-z0-9]+)', re.IGNORECASE)

    def process_record(self, record):
        """Yield ``(tag_name, count)`` pairs for a single WARC record.

        Args:
            record: the WARC record to inspect; it is passed unchanged to the
                ``is_response_record``, ``is_html`` and ``get_payload_stream``
                helpers of the base class.

        Yields:
            Tuples ``(tag_name, count)`` with ``tag_name`` a lower-case
            ``str`` and ``count`` the number of matches of that tag name in
            the record payload. Nothing is yielded for records that fail the
            response or HTML check.
        """
        if not self.is_response_record(record):
            # skip over WARC request or metadata records
            return
        if not self.is_html(record):
            # skip non-HTML or unknown content types
            return
        data = self.get_payload_stream(record).read()
        # Fold case before counting so that e.g. ``<P>`` and ``<p>`` end up
        # in the same bucket and each tag name is yielded once per record.
        counts = Counter(
            tag.lower()
            for tag in TagCountJob.html_tag_pattern.findall(data)
        )
        for tag, count in counts.items():
            # The pattern only captures ASCII letters and digits, so the
            # ASCII decode cannot fail.
            yield tag.decode('ascii'), count
if __name__ == '__main__':
    # Script entry point: build the tag-count job and hand control to it.
    TagCountJob().run()