Add end-to-end testing of TIND and LanceDB

awilfox · awilfox · commit 2a8b4e8f7f12 · 2025-12-18T16:36:03.000-06:00
Implements: AP-517
diff --git a/README.rst b/README.rst
@@ -123,6 +123,28 @@ You can also pass the ``--profile`` argument::
 Configuration
 =============
 
+Test Configuration
+------------------
+
+The following keys are searched for by the test suite.
+They are ignored by the main app.
+
+``RUN_OLLAMA_TESTS``
+    Set to ``true`` to run the Ollama tests.  Should only be set if Ollama is running.
+
+``RUN_E2E_TESTS``
+    Set to any non-empty value (like ``true``) to run the end-to-end tests.
+    This requires network access, Ollama as the embeddings backend, and a functional TIND key.
+
+``KEEP_E2E_FILES``
+    Set to any non-empty value (like ``true``) when running E2E tests to additionally keep the files
+    downloaded and created for your own later debugging.  This includes the PDF and metadata
+    records from TIND, and the LanceDB test store.  You may remove them when you are done.
+
+
+App Configuration
+-----------------
+
 The following keys are available for configuration in the ``.env`` file:
 
 ``TIND_API_KEY``
@@ -134,9 +156,6 @@ The following keys are available for configuration in the ``.env`` file:
 ``DEFAULT_STORAGE_DIR``
     The default directory to store files retrieved from TIND.
 
-``RUN_OLLAMA_TESTS``
-    Set to ``true`` to run the Ollama tests.  Should only be set if Ollama is running.
-
 ``OLLAMA_URL``
     Set to the instance of Ollama to use for the Web interface.
     Defaults to ``http://localhost:11434``; you may want ``http://ollama:11434`` for Docker runs.
@@ -153,6 +172,9 @@ The following keys are available for configuration in the ``.env`` file:
 ``POSTGRES_DB``   
     The name of the database for the app.  Defaults to ``willa``.
 
+``POSTGRES_HOST``
+    The hostname of the Postgres server.  Likely ``db`` in a Docker Compose environment.
+
 ``POSTGRES_PORT``   
     The Postgres port.  Defaults to ``5432``.
 
@@ -236,7 +258,7 @@ The following keys are available for configuration in the ``.env`` file:
     Defaults to '500' if not set.
 
 ``K_VALUE``
-    Int. The k value used for retrieving context from the vector_store. The default is 4   
+    Int. The k value used for retrieving context from the vector_store. The default is 4.
 
 ``NULL_AUTH``
     Boolean.  Whether to allow anyone to login with any name and password.  Defaults to ``False``.
diff --git a/tests/e2e/__init__.py b/tests/e2e/__init__.py
diff --git a/tests/e2e/test_everything.py b/tests/e2e/test_everything.py
@@ -0,0 +1,92 @@
+"""
+Test suite for the entire ETL pipeline, including external actors.
+
+This differs from ``tests.etl.test_pipeline`` in the following ways:
+
+* We do not mock TIND; we connect to the real thing.
+* We use a LanceDB vector store instead of an in-memory vector store.
+* We embed documents using the actual configured embeddings provider.
+  The test insists on ollama to prevent cost issues with Bedrock.
+* We process multiple PDF transcripts into the vector store.
+* We ensure retrieval of all documents, to prevent AP-503 recurring.
+"""
+
+import os.path
+import shutil
+import tempfile
+import unittest
+
+from willa.config import CONFIG
+import willa.etl.pipeline
+
+
+class E2ETest(unittest.TestCase):
+    """Test the entire pipeline.
+
+    1. Extract - Fetch three records from TIND.
+    2. Transform/Load - Process them into a LanceDB vector store.
+    3. Perform a number of queries to ensure the process was successful.
+
+    The queries we run ensure results from every document are included.
+    """
+    def setUp(self) -> None:
+        """Initialise the environment for the end-to-end test.
+
+        Creates a scratch directory containing ``pdfs`` and ``lancedb``
+        subdirectories, and points ``CONFIG`` at them.
+        """
+        self.temp_dir = tempfile.mkdtemp(prefix='willatest')
+
+        overrides = {'DEFAULT_STORAGE_DIR': 'pdfs', 'LANCEDB_URI': 'lancedb'}
+        for key, subdir in overrides.items():
+            path = os.path.join(self.temp_dir, subdir)
+            os.mkdir(path)
+            # Put back any pre-existing value after the test, so that nothing else
+            # is left pointing at a directory that tearDown may have removed.
+            if key in CONFIG:
+                self.addCleanup(CONFIG.__setitem__, key, CONFIG[key])
+            CONFIG[key] = path
+
+    @unittest.skipUnless(os.getenv("RUN_E2E_TESTS"), "requires network, keys, ollama")
+    def test_e2e_pipeline(self) -> None:
+        """Fetch three TIND records, load them, and query for each interviewee."""
+        self.assertIn('TIND_API_KEY', CONFIG, 'You must configure TIND API access')
+        self.assertIn('TIND_API_URL', CONFIG, 'You must configure TIND API access')
+        self.assertIn('OLLAMA_URL', CONFIG, 'You must have ollama running')
+        self.assertEqual(CONFIG['EMBED_BACKEND'], 'ollama',
+                         'You must use ollama embeddings for the E2E test')
+
+        willa.etl.pipeline.fetch_one_from_tind('219376')  # Sierra Club
+        willa.etl.pipeline.fetch_one_from_tind('218207')  # Genentech
+        willa.etl.pipeline.fetch_one_from_tind('103806')  # One from outside our present collections.
+
+        store = willa.etl.pipeline.run_pipeline()
+
+        # The interviewee's name should only appear in their document.
+        expected = {'Perrault': '219376', 'Itakura': '218207', 'Parnell': '103806'}
+        # Read the configured k once: it drives both the retriever and the length check.
+        k_value = int(CONFIG['K_VALUE'])
+        # We can reuse the same retriever for each query to save time and memory.
+        retriever = store.as_retriever(search_kwargs={"k": k_value})
+        for name, tind_id in expected.items():
+            # A subtest per interviewee reports every failing document, not just the first.
+            with self.subTest(name=name, tind_id=tind_id):
+                results = retriever.invoke(name)
+                # We asked the retriever for k_value documents, so expect that many back.
+                self.assertEqual(len(results), k_value)
+                metadata = results[0].metadata
+                self.assertIn('tind_metadata', metadata, "TIND metadata missing!")
+                tind_md = metadata['tind_metadata']
+                self.assertIn('tind_id', tind_md, "TIND ID missing!")
+                # Format the whole value: indexing it here would raise before the
+                # assertion could report an empty or malformed ID list.
+                self.assertListEqual(tind_md['tind_id'], [tind_id],
+                                     f"TIND ID {tind_md['tind_id']!r} doesn't match {tind_id}")
+
+    def tearDown(self) -> None:
+        """Remove files, unless `KEEP_E2E_FILES` is set (non-empty) in the environment."""
+        if os.getenv('KEEP_E2E_FILES'):
+            print(f"Files in {self.temp_dir} remain for your inspection.")
+            return
+
+        shutil.rmtree(self.temp_dir)