|
| 1 | +""" |
| 2 | +Test suite for document processing utilities. |
| 3 | +""" |
| 4 | + |
| 5 | +import os |
| 6 | +import shutil |
| 7 | +import tempfile |
| 8 | +import unittest |
| 9 | + |
| 10 | +from langchain_core.vectorstores import InMemoryVectorStore |
| 11 | +from langchain_ollama import OllamaEmbeddings |
| 12 | +from willa.etl.doc_proc import ( |
| 13 | + load_pdfs, |
| 14 | + split_doc, |
| 15 | + split_all_docs, |
| 16 | + embed_docs, |
| 17 | +) |
| 18 | + |
class DocumentProcessingTest(unittest.TestCase):
    """Test suite for document processing utilities.

    Every test runs against a fresh temporary directory, exposed through the
    ``DEFAULT_STORAGE_DIR`` environment variable, that contains a copy of the
    ``parnell_kerby.pdf`` fixture stored next to this test module.
    """

    def setUp(self) -> None:
        """Create the temporary storage directory and copy the PDF fixture into it.

        Cleanups are registered so the directory is deleted and the previous
        value of ``DEFAULT_STORAGE_DIR`` is restored after each test.
        """
        previous_dir = os.environ.get('DEFAULT_STORAGE_DIR')
        storage_dir = tempfile.mkdtemp(prefix='willatest')

        # Register cleanups before anything else can fail: unittest runs them
        # even when setUp itself raises, so neither the directory nor the
        # environment override outlives the test.
        self.addCleanup(shutil.rmtree, storage_dir, ignore_errors=True)
        if previous_dir is None:
            self.addCleanup(os.environ.pop, 'DEFAULT_STORAGE_DIR', None)
        else:
            self.addCleanup(os.environ.__setitem__, 'DEFAULT_STORAGE_DIR', previous_dir)

        os.environ['DEFAULT_STORAGE_DIR'] = storage_dir
        shutil.copyfile(os.path.join(os.path.dirname(__file__), 'parnell_kerby.pdf'),
                        os.path.join(storage_dir, 'parnell_kerby.pdf'))
        self.embedding_model = 'nomic-embed-text'

    def test_load_pdfs(self) -> None:
        """Test loading PDF files."""
        docs = load_pdfs()
        self.assertGreater(len(docs), 0, "Should load at least one document.")

    def test_split_doc(self) -> None:
        """Test splitting a document into chunks."""
        docs = load_pdfs()
        # Fail loudly instead of silently passing when nothing was loaded.
        self.assertGreater(len(docs), 0, "Should load at least one document.")
        chunks = split_doc(docs[0])
        self.assertGreater(len(chunks), 0, "Should create at least one chunk.")

    def test_split_all_docs(self) -> None:
        """Test splitting all documents into chunks."""
        docs = load_pdfs()
        # Fail loudly instead of silently passing when nothing was loaded.
        self.assertGreater(len(docs), 0, "Should load at least one document.")
        chunked_docs = split_all_docs(docs)
        self.assertGreater(len(chunked_docs), 0, "Should create chunks from all documents.")

    @unittest.skipUnless(os.getenv("RUN_OLLAMA_TESTS"), "requires running ollama")
    def test_embed_docs(self) -> None:
        """Test embedding documents.

        Skipped unless the ``RUN_OLLAMA_TESTS`` environment variable is set.
        """
        docs = load_pdfs()
        # Fail loudly instead of silently passing when nothing was loaded.
        self.assertGreater(len(docs), 0, "Should load at least one document.")
        chunked_docs = split_all_docs(docs)
        embeddings = OllamaEmbeddings(model=self.embedding_model)
        vector_store = InMemoryVectorStore(embeddings)
        embed_ids = embed_docs(chunked_docs, vector_store)
        self.assertGreater(len(embed_ids), 0, "Should return IDs for embedded documents.")
0 commit comments