Skip to content

Commit eef344b

Browse files
jason-raitz and awilfox authored
AP-369 Doc processing (#10)
* AP-369 - adds new etl utility doc-processing - tests for doc_proc - adds .DS_Store to .gitignore - adds langchain deps - updates readme and pyproject.toml Co-authored-by: A. Wilcox <AWilcox@Wilcox-Tech.com> * Update willa/etl/__init__.py * Update tests/etl/test_doc_proc.py * Update willa/etl/doc_proc.py * Update README.rst * Update .gitignore
1 parent 6f2bfda commit eef344b

8 files changed

Lines changed: 132 additions & 0 deletions

File tree

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,3 +205,7 @@ cython_debug/
205205
marimo/_static/
206206
marimo/_lsp/
207207
__marimo__/
208+
209+
# MacOS
210+
.DS_Store
211+

README.rst

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,17 @@ follow later in this document.
2828

2929

3030

31+
Linting & Testing locally
32+
==========================
33+
To run the tests, you can use the following command::
34+
35+
python -m unittest
36+
37+
To run linting::
38+
39+
python -m pylint willa
40+
41+
3142
Deployment
3243
==========
3344

@@ -52,3 +63,6 @@ The following keys are available for configuration in the ``.env`` file:
5263

5364
``DEFAULT_STORAGE_DIR``
5465
The default directory to store files retrieved from TIND.
66+
67+
``RUN_OLLAMA_TESTS``
68+
Set to ``true`` to run the Ollama tests. Should only be set if Ollama is running.

pyproject.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,10 @@ build-backend = "setuptools.build_meta"
66
name = "willa"
77
version = "0.0.1"
88
dependencies = [
9+
"langchain",
10+
"langchain_community",
911
"pymarc",
12+
"pypdf",
1013
"python-dotenv",
1114
"requests",
1215
]
@@ -22,6 +25,8 @@ license-files = ["LICENSE"]
2225

2326
[project.optional-dependencies]
2427
test = [
28+
"langchain_core",
29+
"langchain_ollama",
2530
"requests-mock",
2631
]
2732
lint = [

tests/etl/__init__.py

Whitespace-only changes.

tests/etl/parnell_kerby.pdf

95.8 KB
Binary file not shown.

tests/etl/test_doc_proc.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""
2+
Test suite for document processing utilities.
3+
"""
4+
5+
import os
6+
import shutil
7+
import tempfile
8+
import unittest
9+
10+
from langchain_core.vectorstores import InMemoryVectorStore
11+
from langchain_ollama import OllamaEmbeddings
12+
from willa.etl.doc_proc import (
13+
load_pdfs,
14+
split_doc,
15+
split_all_docs,
16+
embed_docs,
17+
)
18+
19+
class DocumentProcessingTest(unittest.TestCase):
    """Test suite for document processing utilities.

    ``setUp`` points ``DEFAULT_STORAGE_DIR`` at a fresh temporary directory
    containing a single fixture PDF; ``tearDown`` removes the directory and
    restores the previous environment so no state leaks between tests or
    test runs.  (The original had no tearDown: each run leaked one
    ``willatest*`` temp dir and permanently overwrote the env var.)
    """

    def setUp(self) -> None:
        # Remember the caller's DEFAULT_STORAGE_DIR (if any) so tearDown
        # can restore it instead of clobbering the environment.
        self._previous_storage_dir = os.environ.get('DEFAULT_STORAGE_DIR')
        os.environ['DEFAULT_STORAGE_DIR'] = tempfile.mkdtemp(prefix='willatest')
        shutil.copyfile(os.path.join(os.path.dirname(__file__), 'parnell_kerby.pdf'),
                        os.path.join(os.environ['DEFAULT_STORAGE_DIR'], 'parnell_kerby.pdf'))
        self.embedding_model = 'nomic-embed-text'

    def tearDown(self) -> None:
        # Remove the per-test temp dir and restore the prior env value.
        shutil.rmtree(os.environ['DEFAULT_STORAGE_DIR'], ignore_errors=True)
        if self._previous_storage_dir is None:
            del os.environ['DEFAULT_STORAGE_DIR']
        else:
            os.environ['DEFAULT_STORAGE_DIR'] = self._previous_storage_dir

    def test_load_pdfs(self) -> None:
        """Test loading PDF files."""
        docs = load_pdfs()
        self.assertGreater(len(docs), 0, "Should load at least one document.")

    def test_split_doc(self) -> None:
        """Test splitting a document into chunks."""
        docs = load_pdfs()
        # setUp always stages one PDF, so docs should be non-empty; assert
        # instead of silently skipping the body when loading fails.
        self.assertTrue(docs, "Fixture PDF should have been loaded.")
        chunks = split_doc(docs[0])
        self.assertGreater(len(chunks), 0, "Should create at least one chunk.")

    def test_split_all_docs(self) -> None:
        """Test splitting all documents into chunks."""
        docs = load_pdfs()
        self.assertTrue(docs, "Fixture PDF should have been loaded.")
        chunked_docs = split_all_docs(docs)
        self.assertGreater(len(chunked_docs), 0, "Should create chunks from all documents.")

    @unittest.skipUnless(os.getenv("RUN_OLLAMA_TESTS"), "requires running ollama")
    def test_embed_docs(self) -> None:
        """Test embedding documents (only when RUN_OLLAMA_TESTS is set)."""
        docs = load_pdfs()
        self.assertTrue(docs, "Fixture PDF should have been loaded.")
        chunked_docs = split_all_docs(docs)
        embeddings = OllamaEmbeddings(model=self.embedding_model)
        vector_store = InMemoryVectorStore(embeddings)
        embed_ids = embed_docs(chunked_docs, vector_store)
        self.assertGreater(len(embed_ids), 0, "Should return IDs for embedded documents.")

willa/etl/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
"""
2+
Provides ETL functionality for Willa.
3+
"""
4+
5+
__copyright__ = "© 2025 The Regents of the University of California. MIT license."

willa/etl/doc_proc.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
"""
2+
Utility functions to gather pdf files, use langchain pypdf loader to load them,
3+
and split them into chunks for vectorization.
4+
"""
5+
6+
import os
7+
8+
from langchain_community.document_loaders import PyPDFDirectoryLoader
9+
# from langchain_community.document_loaders import DirectoryLoader
10+
from langchain.text_splitter import RecursiveCharacterTextSplitter
11+
from langchain_core.vectorstores.base import VectorStore
12+
import willa.config # pylint: disable=W0611
13+
14+
15+
def load_pdfs() -> list:
    """Load PDF files from a specified directory using a langchain loader.

    The directory is taken from the ``DEFAULT_STORAGE_DIR`` environment
    variable, falling back to ``tmp/files/`` when unset.
    """
    storage_dir = os.getenv('DEFAULT_STORAGE_DIR', 'tmp/files/')
    # NOTE(review): mode="single" is forwarded to the loader — presumably
    # one Document per PDF rather than per page; confirm against the
    # langchain_community docs.
    pdf_loader = PyPDFDirectoryLoader(storage_dir, mode="single")

    documents = pdf_loader.load()
    if documents:
        print(f"Loaded {len(documents)} documents from {storage_dir}.")
    else:
        print("No documents found in the specified directory.")
    return documents
27+
28+
29+
def split_doc(doc: dict, chunk_size: int = 1000, chunk_overlap: int = 200) -> list:
    """Split a document into chunks for vectorization.

    NOTE(review): despite the ``dict`` annotation, *doc* is handed to
    ``RecursiveCharacterTextSplitter.split_documents`` and so appears to be
    a langchain ``Document`` — confirm and tighten the annotation.

    :param doc: The document to split.
    :param chunk_size: Maximum characters per chunk.
    :param chunk_overlap: Characters of overlap between adjacent chunks.
    :return: The list of chunks produced from *doc*.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                              chunk_overlap=chunk_overlap,
                                              add_start_index=True)
    return splitter.split_documents([doc])
37+
38+
39+
def split_all_docs(docs: list, chunk_size: int = 1000, chunk_overlap: int = 200) -> list:
    """Split every document into chunks and return one flat list of chunks.

    Bug fix: the previous implementation returned a list of lists (one
    sub-list per document, since ``split_doc`` returns a list).  That
    nested result was then passed by ``embed_docs`` straight to
    ``VectorStore.add_documents``, which expects a flat sequence of
    Documents — so embedding would receive lists instead of Documents.
    Flattening here restores the "all chunks from all docs" contract.

    :param docs: Documents to split (as returned by ``load_pdfs``).
    :param chunk_size: Maximum characters per chunk.
    :param chunk_overlap: Characters of overlap between adjacent chunks.
    :return: A single flat list containing every chunk of every document.
    """
    chunks: list = []
    for doc in docs:
        chunks.extend(split_doc(doc, chunk_size, chunk_overlap))
    return chunks
42+
43+
44+
def embed_docs(chunked_docs: list, vector_store: VectorStore) -> list:
    """Store *chunked_docs* in *vector_store* and return the new document IDs.

    :param chunked_docs: Document chunks to embed (see ``split_all_docs``).
    :param vector_store: The vector store that performs embedding/storage.
    :return: The IDs reported by ``vector_store.add_documents``.
    """
    return vector_store.add_documents(documents=chunked_docs)

0 commit comments

Comments
 (0)