Skip to content

Commit 31d1cdf

Browse files
committed
ETL: Filter headers and footers out of documents
These patterns are common to all of the Oral History Center interview PDFs that I was able to find. Processing 7 documents with these new filters added exactly 0.04s of runtime on my workstation, so there is not a large performance impact. Implements: AP-455
1 parent 6e4ea80 commit 31d1cdf

File tree

2 files changed

+22
-2
lines changed

2 files changed

+22
-2
lines changed

willa/etl/doc_proc.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import json
77
import logging
8+
import re
89
from contextlib import nullcontext
910
from functools import reduce
1011
from operator import add
@@ -27,6 +28,24 @@
2728
"""The logging object for this module."""
2829

2930

31+
FOOTER_RE = re.compile(r'Copyright © 20\d\d by The Regents of the University of California ?')
32+
"""The compiled regular expression for matching footer text."""
33+
34+
35+
def _filter_docs(docs: list[Document]) -> list[Document]:
36+
"""Run filters on a list of ``Document`` to remove header/footer and other undesired content.
37+
38+
:param list[Document]: The document(s) to filter.
39+
:returns list[Document]: The same document(s), filtered and sanitised.
40+
"""
41+
for doc in docs:
42+
content = doc.page_content
43+
content = content.replace('Oral History Center, The Bancroft Library, University of California, Berkeley ', '')
44+
doc.page_content = FOOTER_RE.sub('', content)
45+
46+
return docs
47+
48+
3049
def load_pdf(name: str, record: Record | None) -> list[Document]:
3150
"""Load a given single PDF from storage, including optional PyMARC record.
3251
@@ -45,7 +64,7 @@ def load_pdf(name: str, record: Record | None) -> list[Document]:
4564
for doc in docs:
4665
doc.metadata['tind_metadata'] = pymarc_to_metadata(record)
4766

48-
return docs
67+
return _filter_docs(docs)
4968

5069

5170
def load_pdfs() -> dict[str, list[Document]]:
@@ -91,7 +110,7 @@ def load_pdfs() -> dict[str, list[Document]]:
91110
if len(metadata) > 0:
92111
for doc in new_docs:
93112
doc.metadata['tind_metadata'] = metadata
94-
docs[tind_id] = new_docs
113+
docs[tind_id] = _filter_docs(new_docs)
95114
LOGGER.info("Loaded %d document(s) from %s.", len(new_docs), tind_id)
96115

97116
return docs

willa/lcvendor/pypdf.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -718,6 +718,7 @@ def lazy_load(
718718
blob = Blob.from_path(self.file_path)
719719
yield from self.parser.lazy_parse(blob)
720720

721+
721722
class PyPDFDirectoryLoader(BaseLoader):
722723
"""Load and parse a directory of PDF files using 'pypdf' library.
723724

0 commit comments

Comments
 (0)