Skip to content

Commit 2a8b4e8

Browse files
committed
Add end-to-end testing of TIND and LanceDB
Implements: AP-517
1 parent 17aff8a commit 2a8b4e8

File tree

3 files changed

+105
-4
lines changed

3 files changed

+105
-4
lines changed

README.rst

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,28 @@ You can also pass the ``--profile`` argument::
123123
Configuration
124124
=============
125125

126+
Test Configuration
127+
------------------
128+
129+
The following keys are searched for by the test suite.
130+
They are ignored by the main app.
131+
132+
``RUN_OLLAMA_TESTS``
133+
Set to ``true`` to run the Ollama tests. Should only be set if Ollama is running.
134+
135+
``RUN_E2E_TESTS``
136+
Set to any non-empty value (like ``true``) to run the end-to-end tests.
137+
This requires a running Ollama instance, ``EMBED_BACKEND`` set to ``ollama``, and a functional TIND API key.
138+
139+
``KEEP_E2E_FILES``
140+
Set to any non-empty value (like ``true``) when running E2E tests to additionally keep the files
141+
downloaded and created for your own later debugging. This includes the PDF and metadata
142+
records from TIND, and the LanceDB test store. You may remove them when you are done.
143+
144+
145+
App Configuration
146+
-----------------
147+
126148
The following keys are available for configuration in the ``.env`` file:
127149

128150
``TIND_API_KEY``
@@ -134,9 +156,6 @@ The following keys are available for configuration in the ``.env`` file:
134156
``DEFAULT_STORAGE_DIR``
135157
The default directory to store files retrieved from TIND.
136158

137-
``RUN_OLLAMA_TESTS``
138-
Set to ``true`` to run the Ollama tests. Should only be set if Ollama is running.
139-
140159
``OLLAMA_URL``
141160
Set to the instance of Ollama to use for the Web interface.
142161
Defaults to ``http://localhost:11434``; you may want ``http://ollama:11434`` for Docker runs.
@@ -153,6 +172,9 @@ The following keys are available for configuration in the ``.env`` file:
153172
``POSTGRES_DB``
154173
The name of the database for the app. Defaults to ``willa``.
155174

175+
``POSTGRES_HOST``
176+
The hostname of the Postgres server. Likely ``db`` in a Docker Compose environment.
177+
156178
``POSTGRES_PORT``
157179
The Postgres port. Defaults to ``5432``.
158180

@@ -236,7 +258,7 @@ The following keys are available for configuration in the ``.env`` file:
236258
Defaults to '500' if not set.
237259

238260
``K_VALUE``
239-
Int. The k value used for retrieving context from the vector_store. The default is 4
261+
Int. The k value used for retrieving context from the vector_store. The default is 4.
240262

241263
``NULL_AUTH``
242264
Boolean. Whether to allow anyone to login with any name and password. Defaults to ``False``.

tests/e2e/__init__.py

Whitespace-only changes.

tests/e2e/test_everything.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
"""
2+
Test suite for the entire ETL pipeline, including external actors.
3+
4+
This differs from ``tests.etl.test_pipeline`` in the following ways:
5+
6+
* We do not mock TIND; we connect to the real thing.
7+
* We use a LanceDB vector store instead of an in-memory vector store.
8+
* We embed documents using the actual configured embeddings provider.
9+
This defaults to ollama to prevent cost issues with Bedrock.
10+
* We process multiple PDF transcripts into the vector store.
11+
* We ensure retrieval of all documents, to prevent AP-503 recurring.
12+
"""
13+
14+
import os.path
15+
import shutil
16+
import tempfile
17+
import unittest
18+
19+
from willa.config import CONFIG
20+
import willa.etl.pipeline
21+
22+
23+
class E2ETest(unittest.TestCase):
    """Test the entire pipeline.

    1. Extract - Fetch three records from TIND.
    2. Transform/Load - Process them into a LanceDB vector store.
    3. Perform a number of queries to ensure the process was successful.

    The queries we run ensure results from every document are included.
    """

    def setUp(self) -> None:
        """Initialise the environment for the end-to-end test.

        Creates a scratch directory holding a ``pdfs`` directory (used as
        ``DEFAULT_STORAGE_DIR``) and a ``lancedb`` directory (used as
        ``LANCEDB_URI``).  The two configuration overrides are undone once
        the test finishes so they cannot leak into other tests that share
        the global ``CONFIG``.
        """
        # Imported here because ``import unittest`` does not load the
        # ``unittest.mock`` submodule on its own.
        from unittest import mock

        self.temp_dir = tempfile.mkdtemp(prefix='willatest')

        storage_dir = os.path.join(self.temp_dir, 'pdfs')
        os.mkdir(storage_dir)

        data_dir = os.path.join(self.temp_dir, 'lancedb')
        os.mkdir(data_dir)

        # patch.dict snapshots CONFIG and restores it on stop(), including
        # removing keys that were not present before the test.
        patcher = mock.patch.dict(CONFIG, {
            'DEFAULT_STORAGE_DIR': storage_dir,
            'LANCEDB_URI': data_dir,
        })
        patcher.start()
        self.addCleanup(patcher.stop)

    @unittest.skipUnless(os.getenv("RUN_E2E_TESTS"), "requires network, keys, ollama")
    def test_e2e_pipeline(self) -> None:
        """Test the pipeline.

        Fetches three TIND records, runs the pipeline, then queries the
        resulting store once per interviewee name and checks that the top
        result carries the TIND ID of that interviewee's record.
        """
        self.assertIn('TIND_API_KEY', CONFIG, 'You must configure TIND API access')
        self.assertIn('TIND_API_URL', CONFIG, 'You must configure TIND API access')
        self.assertIn('OLLAMA_URL', CONFIG, 'You must have ollama running')
        self.assertEqual(CONFIG['EMBED_BACKEND'], 'ollama',
                         'You must use ollama embeddings for the E2E test')

        willa.etl.pipeline.fetch_one_from_tind('219376')  # Sierra Club
        willa.etl.pipeline.fetch_one_from_tind('218207')  # Genentech
        willa.etl.pipeline.fetch_one_from_tind('103806')  # One from outside our present collections.

        store = willa.etl.pipeline.run_pipeline()

        # The interviewee's name should only appear in their document.
        expected = {'Perrault': '219376', 'Itakura': '218207', 'Parnell': '103806'}

        # Use the same k for the request and for the expected result count,
        # so a non-default K_VALUE does not cause a spurious failure.
        k_value = int(CONFIG['K_VALUE'])

        # We can reuse the same retriever for each query to save time and memory.
        retriever = store.as_retriever(search_kwargs={"k": k_value})
        for name, tind_id in expected.items():
            # subTest reports every failing document instead of stopping at the first.
            with self.subTest(name=name, tind_id=tind_id):
                results = retriever.invoke(name)
                self.assertEqual(len(results), k_value)
                metadata = results[0].metadata
                self.assertIn('tind_metadata', metadata, "TIND metadata missing!")
                tind_md = metadata['tind_metadata']
                self.assertIn('tind_id', tind_md, "TIND ID missing!")
                actual = tind_md['tind_id']
                # Format the whole value: indexing [0] here would raise
                # IndexError on an empty list before the assertion could fail.
                self.assertListEqual(actual, [tind_id],
                                     f"TIND ID {actual!r} doesn't match {[tind_id]!r}")

    def tearDown(self) -> None:
        """Remove files, unless `KEEP_E2E_FILES` is present in the environment."""
        if os.getenv('KEEP_E2E_FILES'):
            print(f"Files in {self.temp_dir} remain for your inspection.")
            return

        shutil.rmtree(self.temp_dir)

0 commit comments

Comments
 (0)