"""
Test suite for the entire ETL pipeline, including external actors.

This differs from ``tests.etl.test_pipeline`` in the following ways:

* We do not mock TIND; we connect to the real thing.
* We use a LanceDB vector store instead of an in-memory vector store.
* We embed documents using the actual configured embeddings provider.
  This defaults to ollama to prevent cost issues with Bedrock.
* We process multiple PDF transcripts into the vector store.
* We ensure retrieval of all documents, to prevent AP-503 recurring.
"""

import os.path
import shutil
import tempfile
import unittest

from willa.config import CONFIG
import willa.etl.pipeline


class E2ETest(unittest.TestCase):
    """Test the entire pipeline.

    1. Extract - Fetch three records from TIND.
    2. Transform/Load - Process them into a LanceDB vector store.
    3. Perform a number of queries to ensure the process was successful.

    The queries we run ensure results from every document are included.
    """

    def setUp(self) -> None:
        """Initialise the environment for the end-to-end test.

        Creates a scratch directory containing ``pdfs`` and ``lancedb``
        subdirectories and points ``CONFIG['DEFAULT_STORAGE_DIR']`` and
        ``CONFIG['LANCEDB_URI']`` at them.  The previous ``CONFIG`` values
        are restored after the test so the overrides do not leak into other
        tests running in the same process.
        """
        self.temp_dir = tempfile.mkdtemp(prefix='willatest')

        overrides = {
            'DEFAULT_STORAGE_DIR': os.path.join(self.temp_dir, 'pdfs'),
            'LANCEDB_URI': os.path.join(self.temp_dir, 'lancedb'),
        }
        for key, path in overrides.items():
            os.mkdir(path)
            # Register the restore *before* overriding.  Cleanups run after
            # tearDown, even when the test fails or is skipped.
            # NOTE(review): assumes CONFIG is a dict-like mutable mapping
            # (supports ``pop``) -- confirm against willa.config.
            if key in CONFIG:
                self.addCleanup(CONFIG.__setitem__, key, CONFIG[key])
            else:
                self.addCleanup(CONFIG.pop, key, None)
            CONFIG[key] = path

    @unittest.skipUnless(os.getenv("RUN_E2E_TESTS"), "requires network, keys, ollama")
    def test_e2e_pipeline(self) -> None:
        """Test the pipeline.

        Fetches three TIND records, runs the pipeline over them, and then
        queries the resulting store once per interviewee, checking that the
        top hit for each name carries the matching TIND ID.
        """
        self.assertIn('TIND_API_KEY', CONFIG, 'You must configure TIND API access')
        self.assertIn('TIND_API_URL', CONFIG, 'You must configure TIND API access')
        self.assertIn('OLLAMA_URL', CONFIG, 'You must have ollama running')
        self.assertEqual(CONFIG['EMBED_BACKEND'], 'ollama',
                         'You must use ollama embeddings for the E2E test')

        willa.etl.pipeline.fetch_one_from_tind('219376')  # Sierra Club
        willa.etl.pipeline.fetch_one_from_tind('218207')  # Genentech
        willa.etl.pipeline.fetch_one_from_tind('103806')  # One from outside our present collections.

        store = willa.etl.pipeline.run_pipeline()

        # The interviewee's name should only appear in their document.
        expected = {'Perrault': '219376', 'Itakura': '218207', 'Parnell': '103806'}
        # The number of hits we expect is whatever we ask the retriever for,
        # so derive it from the same setting rather than hard-coding it.
        k_value = int(CONFIG['K_VALUE'])
        # We can reuse the same retriever for each query to save time and memory.
        retriever = store.as_retriever(search_kwargs={"k": k_value})
        for name, tind_id in expected.items():
            # One sub-test per document, so a failure for one interviewee
            # does not hide the outcome for the others.
            with self.subTest(name=name, tind_id=tind_id):
                results = retriever.invoke(name)
                self.assertEqual(len(results), k_value)
                metadata = results[0].metadata
                self.assertIn('tind_metadata', metadata, "TIND metadata missing!")
                tind_md = metadata['tind_metadata']
                self.assertIn('tind_id', tind_md, "TIND ID missing!")
                # The message formats the whole value instead of indexing
                # [0]; the message is built eagerly, so indexing an empty
                # list would raise IndexError before the assertion ran.
                self.assertListEqual(tind_md['tind_id'], [tind_id],
                                     f"TIND ID {tind_md['tind_id']} doesn't match {tind_id}")

    def tearDown(self) -> None:
        """Remove files, unless `KEEP_E2E_FILES` is present in the environment."""
        if os.getenv('KEEP_E2E_FILES'):
            print(f"Files in {self.temp_dir} remain for your inspection.")
            return

        shutil.rmtree(self.temp_dir)
0 commit comments