Skip to content

Commit eef344b

Browse files
jason-raitz and awilfox authored
AP-369 Doc processing (#10)
* AP-369 - adds new etl utility doc-processing - tests for doc_proc - adds .DS_Store to .gitignore - adds langchain deps - updates readme and pyproject.toml Co-authored-by: A. Wilcox <AWilcox@Wilcox-Tech.com> * Update willa/etl/__init__.py * Update tests/etl/test_doc_proc.py * Update willa/etl/doc_proc.py * Update README.rst * Update .gitignore
1 parent 6f2bfda commit eef344b

8 files changed

Lines changed: 132 additions & 0 deletions

File tree

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,3 +205,7 @@ cython_debug/
205205
marimo/_static/
206206
marimo/_lsp/
207207
__marimo__/
208+
209+
# MacOS
210+
.DS_Store
211+

README.rst

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,17 @@ follow later in this document.
2828

2929

3030

31+
Linting & Testing locally
32+
==========================
33+
To run the tests, you can use the following command::
34+
35+
python -m unittest
36+
37+
To run linting::
38+
39+
python -m pylint willa
40+
41+
3142
Deployment
3243
==========
3344

@@ -52,3 +63,6 @@ The following keys are available for configuration in the ``.env`` file:
5263

5364
``DEFAULT_STORAGE_DIR``
5465
The default directory to store files retrieved from TIND.
66+
67+
``RUN_OLLAMA_TESTS``
68+
Set to ``true`` to run the Ollama tests. Should only be set if Ollama is running.

pyproject.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,10 @@ build-backend = "setuptools.build_meta"
66
name = "willa"
77
version = "0.0.1"
88
dependencies = [
9+
"langchain",
10+
"langchain_community",
911
"pymarc",
12+
"pypdf",
1013
"python-dotenv",
1114
"requests",
1215
]
@@ -22,6 +25,8 @@ license-files = ["LICENSE"]
2225

2326
[project.optional-dependencies]
2427
test = [
28+
"langchain_core",
29+
"langchain_ollama",
2530
"requests-mock",
2631
]
2732
lint = [

tests/etl/__init__.py

Whitespace-only changes.

tests/etl/parnell_kerby.pdf

95.8 KB
Binary file not shown.

tests/etl/test_doc_proc.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""
2+
Test suite for document processing utilities.
3+
"""
4+
5+
import os
6+
import shutil
7+
import tempfile
8+
import unittest
9+
10+
from langchain_core.vectorstores import InMemoryVectorStore
11+
from langchain_ollama import OllamaEmbeddings
12+
from willa.etl.doc_proc import (
13+
load_pdfs,
14+
split_doc,
15+
split_all_docs,
16+
embed_docs,
17+
)
18+
19+
class DocumentProcessingTest(unittest.TestCase):
    """Test suite for document processing utilities.

    ``setUp`` points ``DEFAULT_STORAGE_DIR`` at a fresh temporary directory
    containing a single fixture PDF; ``tearDown`` removes the directory and
    restores the previous environment so no state leaks between tests or
    test runs.  (The original had no tearDown: each run leaked one
    ``willatest*`` temp dir and permanently overwrote the env var.)
    """

    def setUp(self) -> None:
        # Remember the caller's DEFAULT_STORAGE_DIR (if any) so tearDown
        # can restore it instead of clobbering the environment.
        self._previous_storage_dir = os.environ.get('DEFAULT_STORAGE_DIR')
        os.environ['DEFAULT_STORAGE_DIR'] = tempfile.mkdtemp(prefix='willatest')
        shutil.copyfile(os.path.join(os.path.dirname(__file__), 'parnell_kerby.pdf'),
                        os.path.join(os.environ['DEFAULT_STORAGE_DIR'], 'parnell_kerby.pdf'))
        self.embedding_model = 'nomic-embed-text'

    def tearDown(self) -> None:
        # Remove the per-test temp dir and restore the prior env value.
        shutil.rmtree(os.environ['DEFAULT_STORAGE_DIR'], ignore_errors=True)
        if self._previous_storage_dir is None:
            del os.environ['DEFAULT_STORAGE_DIR']
        else:
            os.environ['DEFAULT_STORAGE_DIR'] = self._previous_storage_dir

    def test_load_pdfs(self) -> None:
        """Test loading PDF files."""
        docs = load_pdfs()
        self.assertGreater(len(docs), 0, "Should load at least one document.")

    def test_split_doc(self) -> None:
        """Test splitting a document into chunks."""
        docs = load_pdfs()
        # setUp always stages one PDF, so docs should be non-empty; assert
        # instead of silently skipping the body when loading fails.
        self.assertTrue(docs, "Fixture PDF should have been loaded.")
        chunks = split_doc(docs[0])
        self.assertGreater(len(chunks), 0, "Should create at least one chunk.")

    def test_split_all_docs(self) -> None:
        """Test splitting all documents into chunks."""
        docs = load_pdfs()
        self.assertTrue(docs, "Fixture PDF should have been loaded.")
        chunked_docs = split_all_docs(docs)
        self.assertGreater(len(chunked_docs), 0, "Should create chunks from all documents.")

    @unittest.skipUnless(os.getenv("RUN_OLLAMA_TESTS"), "requires running ollama")
    def test_embed_docs(self) -> None:
        """Test embedding documents (only when RUN_OLLAMA_TESTS is set)."""
        docs = load_pdfs()
        self.assertTrue(docs, "Fixture PDF should have been loaded.")
        chunked_docs = split_all_docs(docs)
        embeddings = OllamaEmbeddings(model=self.embedding_model)
        vector_store = InMemoryVectorStore(embeddings)
        embed_ids = embed_docs(chunked_docs, vector_store)
        self.assertGreater(len(embed_ids), 0, "Should return IDs for embedded documents.")

willa/etl/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
"""
2+
Provides ETL functionality for Willa.
3+
"""
4+
5+
__copyright__ = "© 2025 The Regents of the University of California. MIT license."

willa/etl/doc_proc.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
"""
2+
Utility functions to gather pdf files, use langchain pypdf loader to load them,
3+
and split them into chunks for vectorization.
4+
"""
5+
6+
import os
7+
8+
from langchain_community.document_loaders import PyPDFDirectoryLoader
9+
# from langchain_community.document_loaders import DirectoryLoader
10+
from langchain.text_splitter import RecursiveCharacterTextSplitter
11+
from langchain_core.vectorstores.base import VectorStore
12+
import willa.config # pylint: disable=W0611
13+
14+
15+
def load_pdfs() -> list:
    """Load PDF files from a specified directory using a langchain loader.

    The directory is taken from the ``DEFAULT_STORAGE_DIR`` environment
    variable, falling back to ``tmp/files/`` when unset.
    """
    storage_dir = os.getenv('DEFAULT_STORAGE_DIR', 'tmp/files/')
    # NOTE(review): mode="single" is forwarded to the loader — presumably
    # one Document per PDF rather than per page; confirm against the
    # langchain_community docs.
    pdf_loader = PyPDFDirectoryLoader(storage_dir, mode="single")

    documents = pdf_loader.load()
    if documents:
        print(f"Loaded {len(documents)} documents from {storage_dir}.")
    else:
        print("No documents found in the specified directory.")
    return documents
27+
28+
29+
def split_doc(doc: dict, chunk_size: int = 1000, chunk_overlap: int = 200) -> list:
    """Split a document into chunks for vectorization.

    NOTE(review): despite the ``dict`` annotation, *doc* is handed to
    ``RecursiveCharacterTextSplitter.split_documents`` and so appears to be
    a langchain ``Document`` — confirm and tighten the annotation.

    :param doc: The document to split.
    :param chunk_size: Maximum characters per chunk.
    :param chunk_overlap: Characters of overlap between adjacent chunks.
    :return: The list of chunks produced from *doc*.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                              chunk_overlap=chunk_overlap,
                                              add_start_index=True)
    return splitter.split_documents([doc])
37+
38+
39+
def split_all_docs(docs: list, chunk_size: int = 1000, chunk_overlap: int = 200) -> list:
    """Split every document into chunks and return one flat list of chunks.

    Bug fix: the previous implementation returned a list of lists (one
    sub-list per document, since ``split_doc`` returns a list).  That
    nested result was then passed by ``embed_docs`` straight to
    ``VectorStore.add_documents``, which expects a flat sequence of
    Documents — so embedding would receive lists instead of Documents.
    Flattening here restores the "all chunks from all docs" contract.

    :param docs: Documents to split (as returned by ``load_pdfs``).
    :param chunk_size: Maximum characters per chunk.
    :param chunk_overlap: Characters of overlap between adjacent chunks.
    :return: A single flat list containing every chunk of every document.
    """
    chunks: list = []
    for doc in docs:
        chunks.extend(split_doc(doc, chunk_size, chunk_overlap))
    return chunks
42+
43+
44+
def embed_docs(chunked_docs: list, vector_store: VectorStore) -> list:
    """Store *chunked_docs* in *vector_store* and return the new document IDs.

    :param chunked_docs: Document chunks to embed (see ``split_all_docs``).
    :param vector_store: The vector store that performs embedding/storage.
    :return: The IDs reported by ``vector_store.add_documents``.
    """
    return vector_store.add_documents(documents=chunked_docs)

0 commit comments

Comments
 (0)