
Commit e1e485e

Ingestion Pipeline: Test LLM credentials from env

Author: Kavyansh Chourasia (committed)
1 parent 356289e · commit e1e485e

File tree

8 files changed: +164 additions, -174 deletions


components/ingestion-pipeline/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -2,3 +2,4 @@ tests/files/
 .pytest_cache/
 **/__pycache__/
 .vscode/
+.env
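
With `.env` now ignored by git, the tests in this commit expect local Azure OpenAI credentials to live in that file. As a rough illustration (not part of the commit), the sketch below checks that the four variables the new test fixtures read are present; the example values in the comment are placeholders, with the API version and model name borrowed from the old hard-coded fixtures.

```python
# Minimal sketch (not part of the commit): reading the git-ignored .env file.
# The variable names come from the test fixtures in this commit; the values
# below are placeholders, not real credentials.
#
# Example .env contents (assumed layout):
#   AZURE_OPENAI_API_KEY=<your-key>
#   AZURE_OPENAI_ENDPOINT=https://<your-resource>.openai.azure.com/
#   AZURE_OPENAI_API_VERSION=2024-08-01-preview
#   AZURE_OPENAI_MODEL=gpt-4o
from dotenv import dotenv_values

config = dotenv_values(".env")  # parses KEY=value pairs without touching os.environ
missing = [
    name
    for name in (
        "AZURE_OPENAI_API_KEY",
        "AZURE_OPENAI_ENDPOINT",
        "AZURE_OPENAI_API_VERSION",
        "AZURE_OPENAI_MODEL",
    )
    if not config.get(name)
]
if missing:
    print(f"Missing credentials in .env: {missing}")
```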

components/ingestion-pipeline/poetry.lock

Lines changed: 24 additions & 106 deletions
Some generated files are not rendered by default.

components/ingestion-pipeline/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ packages = [
 
 # Where you declare runtime dependencies
 [tool.poetry.dependencies]
-python = "^3.10"
+python = "~3.11"
 pytesseract = "^0.3.9"
 PyPDF2 = "*"
 azure-ai-formrecognizer = "^3.3.3"

components/ingestion-pipeline/src/ingestion_pipeline/text_extractors/llm_json_extractor.py

Lines changed: 29 additions & 15 deletions
@@ -199,33 +199,43 @@ def _process_page_with_llm(
 
             # Extract the response content
             extracted_json_text = response["choices"][0]["message"]["content"].strip()
-
+
             # Parse the JSON response
             try:
                 page_sections = json.loads(extracted_json_text)
                 if not isinstance(page_sections, list):
                     raise ValueError("Response is not a JSON array")
-
+
                 # Validate the structure
                 for section in page_sections:
-                    if not isinstance(section, dict) or "section_title" not in section or "section_content" not in section:
+                    if (
+                        not isinstance(section, dict)
+                        or "section_title" not in section
+                        or "section_content" not in section
+                    ):
                         raise ValueError("Invalid section structure")
-
+
                 return page_sections
             except (json.JSONDecodeError, ValueError) as e:
-                self.logger.error(f"Error parsing JSON response for page {page_number}: {e}")
+                self.logger.error(
+                    f"Error parsing JSON response for page {page_number}: {e}"
+                )
                 # Return a fallback structure
-                return [{
-                    "section_title": f"Page {page_number} (Parse Error)",
-                    "section_content": f"*Error parsing JSON response: {str(e)}*\n\nRaw response:\n{extracted_json_text}"
-                }]
+                return [
+                    {
+                        "section_title": f"Page {page_number} (Parse Error)",
+                        "section_content": f"*Error parsing JSON response: {str(e)}*\n\nRaw response:\n{extracted_json_text}",
+                    }
+                ]
 
         except Exception as e:
             self.logger.error(f"Error processing page {page_number} with LLM: {e}")
-            return [{
-                "section_title": f"Page {page_number} (Processing Error)",
-                "section_content": f"*Error processing page {page_number}: {str(e)}*"
-            }]
+            return [
+                {
+                    "section_title": f"Page {page_number} (Processing Error)",
+                    "section_content": f"*Error processing page {page_number}: {str(e)}*",
+                }
+            ]
 
     def _process_pages_in_batches(
         self,
@@ -257,12 +267,16 @@ def _process_pages_in_batches
             # Determine context to use - limit to recent pages to prevent context overflow
             if i >= batch_size:
                 # Keep only the latest batch_size pages worth of sections
-                context_sections = accumulated_context_sections[-batch_size * 10:]  # Approximate limit
+                context_sections = accumulated_context_sections[
+                    -batch_size * 10 :
+                ]  # Approximate limit
             else:
                 context_sections = accumulated_context_sections
 
             # Convert context sections to JSON string
-            context_json = json.dumps(context_sections, indent=2) if context_sections else ""
+            context_json = (
+                json.dumps(context_sections, indent=2) if context_sections else ""
+            )
 
             # Process the current page with context
             page_sections = self._process_page_with_llm(
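
The extractor changes above are a formatter-style rewrap of the page-level parse-and-fallback logic. For readability, here is a condensed, standalone sketch of that pattern (the function name, signature, and logger setup are illustrative, not the module's actual method): the LLM response is parsed as JSON, validated as a list of `section_title`/`section_content` dicts, and on any parse or validation error a single fallback section carrying the raw response is returned.

```python
import json
import logging

logger = logging.getLogger(__name__)


def parse_sections_or_fallback(extracted_json_text: str, page_number: int) -> list[dict]:
    """Condensed sketch of the parse-with-fallback pattern shown in the diff above.

    Simplified, standalone rewrite for illustration; not the module's actual method.
    """
    try:
        page_sections = json.loads(extracted_json_text)
        if not isinstance(page_sections, list):
            raise ValueError("Response is not a JSON array")
        for section in page_sections:
            if (
                not isinstance(section, dict)
                or "section_title" not in section
                or "section_content" not in section
            ):
                raise ValueError("Invalid section structure")
        return page_sections
    except (json.JSONDecodeError, ValueError) as e:
        logger.error("Error parsing JSON response for page %s: %s", page_number, e)
        # Fall back to a single section that preserves the raw LLM output.
        return [
            {
                "section_title": f"Page {page_number} (Parse Error)",
                "section_content": f"*Error parsing JSON response: {e}*\n\nRaw response:\n{extracted_json_text}",
            }
        ]
```

The second hunk applies the same formatting pass to the batch loop, which caps the accumulated context at roughly `batch_size * 10` prior sections before serializing it with `json.dumps`.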
Lines changed: 23 additions & 10 deletions
@@ -1,36 +1,46 @@
+from dotenv import load_dotenv
 import pytest
 import json
 import logging
 import os
-from ingestion_pipeline.text_postprocessors.clean_markdown_post_processor import CleanMarkdownPostProcessor
+from ingestion_pipeline.text_postprocessors.clean_markdown_post_processor import (
+    CleanMarkdownPostProcessor,
+)
+
+load_dotenv(".env")
+
 
 @pytest.fixture
 def azure_openai_credentials():
     return {
-        "api_key": "**",
-        "api_base": "**",
-        "api_version": "2023-03-15-preview",
-        "deployment_name": "gpt-4o"
+        "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
+        "api_base": os.getenv("AZURE_OPENAI_ENDPOINT"),
+        "api_version": os.getenv("AZURE_OPENAI_API_VERSION"),
+        "deployment_name": os.getenv("AZURE_OPENAI_MODEL"),
     }
 
+
 def test_simple_metadata_extractor_mineru(azure_openai_credentials):
     folder_path = "tests/files/mineru/"
     output_folder = os.path.join(folder_path, "cleaned_v3")
     _run_cleaning_test(folder_path, output_folder, azure_openai_credentials)
 
+
 def test_simple_metadata_extractor_smoldocling(azure_openai_credentials):
     folder_path = "tests/files/smoldocling/"
     output_folder = os.path.join(folder_path, "cleaned_v3")
     _run_cleaning_test(folder_path, output_folder, azure_openai_credentials)
 
+
 def test_simple_metadata_extractor_olmocr(azure_openai_credentials):
     folder_path = "tests/files/olmocr/"
     output_folder = os.path.join(folder_path, "cleaned_v3")
     _run_cleaning_test(folder_path, output_folder, azure_openai_credentials)
 
+
 def _run_cleaning_test(folder_path, cleaned_output_folder, azure_openai_credentials):
     logger = logging.getLogger(__name__)
-
+
     # Get markdown files
     markdown_files = [
         os.path.join(folder_path, file)
@@ -44,14 +54,17 @@ def _run_cleaning_test(folder_path, cleaned_output_folder, azure_openai_credenti
     for markdown_file_path in markdown_files:
         with open(markdown_file_path, "r", encoding="utf-8") as file:
             markdown_content = file.read()
-        result = post_processor.post_process(markdown_content, **azure_openai_credentials)
+        result = post_processor.post_process(
+            markdown_content, **azure_openai_credentials
+        )
 
         # Save the result as a JSON file
         output_file_path = os.path.join(
-            cleaned_output_folder,
-            os.path.basename(markdown_file_path)
+            cleaned_output_folder, os.path.basename(markdown_file_path)
        )
         with open(output_file_path, "w", encoding="utf-8") as md_file:
             md_file.write(result)
 
-        logger.info("Cleaned markdown for %s saved to %s", markdown_file_path, output_file_path)
+        logger.info(
+            "Cleaned markdown for %s saved to %s", markdown_file_path, output_file_path
+        )
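
All three test modules in this commit switch their `azure_openai_credentials` fixture from hard-coded placeholders to `load_dotenv(".env")` plus `os.getenv`. One practical consequence is that the fixture silently yields `None` values when no `.env` file is present. A minimal sketch of a guarded variant (the `pytest.skip` call is an assumption, not part of the commit):

```python
import os

import pytest
from dotenv import load_dotenv

load_dotenv(".env")

REQUIRED_VARS = (
    "AZURE_OPENAI_API_KEY",
    "AZURE_OPENAI_ENDPOINT",
    "AZURE_OPENAI_API_VERSION",
    "AZURE_OPENAI_MODEL",
)


@pytest.fixture
def azure_openai_credentials():
    # Skip instead of failing with confusing auth errors when credentials are absent.
    missing = [name for name in REQUIRED_VARS if not os.getenv(name)]
    if missing:
        pytest.skip(f"Azure OpenAI credentials not configured: {', '.join(missing)}")
    return {
        "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
        "api_base": os.getenv("AZURE_OPENAI_ENDPOINT"),
        "api_version": os.getenv("AZURE_OPENAI_API_VERSION"),
        "deployment_name": os.getenv("AZURE_OPENAI_MODEL"),
    }
```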

components/ingestion-pipeline/tests/test_llm_json_extractor.py

Lines changed: 37 additions & 17 deletions
@@ -1,18 +1,23 @@
+from dotenv import load_dotenv
 import pytest
 import logging
 import os
 import json
 from ingestion_pipeline.text_extractors.llm_json_extractor import LLMJSONExtractor
 
+load_dotenv(".env")
+
+
 @pytest.fixture
 def azure_openai_credentials():
     return {
-        "api_key": "**",
-        "api_base": "**",
-        "api_version": "2024-08-01-preview",
-        "deployment_name": "gpt-4o"
+        "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
+        "api_base": os.getenv("AZURE_OPENAI_ENDPOINT"),
+        "api_version": os.getenv("AZURE_OPENAI_API_VERSION"),
+        "deployment_name": os.getenv("AZURE_OPENAI_MODEL"),
     }
 
+
 def test_llm_json_extractor_social(azure_openai_credentials):
     """
     Test LLM JSON extractor with english-one-column-social.pdf
@@ -21,10 +26,11 @@ def test_llm_json_extractor_social(azure_openai_credentials):
     pdf_file_path = "tests/files/english-one-column-social.pdf"
     _run_llm_json_extractor_test(pdf_file_path, azure_openai_credentials)
 
+
 def _run_llm_json_extractor_test(pdf_file_path, azure_openai_credentials):
     """
     Helper function to run LLM JSON extractor test
-
+
     Args:
         pdf_file_path: Path to the PDF file to analyze
         azure_openai_credentials: Azure OpenAI credentials
@@ -42,33 +48,47 @@ def _run_llm_json_extractor_test(pdf_file_path, azure_openai_credentials):
     logger.info(f"Extracting text from PDF file: {pdf_file_path}")
 
     # Extract text in JSON format
-    extracted_json_text = json_extractor.extract_text(pdf_file_path, **azure_openai_credentials)
+    extracted_json_text = json_extractor.extract_text(
+        pdf_file_path, **azure_openai_credentials
+    )
 
-    logger.info(f"Extraction completed. Result length: {len(extracted_json_text)} characters")
+    logger.info(
+        f"Extraction completed. Result length: {len(extracted_json_text)} characters"
+    )
 
     # Validate that the result is valid JSON
     try:
         parsed_json = json.loads(extracted_json_text)
         assert isinstance(parsed_json, list), "Result should be a JSON array"
-
+
         # Validate structure of each section
         for i, section in enumerate(parsed_json):
             assert isinstance(section, dict), f"Section {i} should be a dictionary"
-            assert "section_title" in section, f"Section {i} should have 'section_title'"
-            assert "section_content" in section, f"Section {i} should have 'section_content'"
+            assert (
+                "section_title" in section
+            ), f"Section {i} should have 'section_title'"
+            assert (
+                "section_content" in section
+            ), f"Section {i} should have 'section_content'"
             assert "page_number" in section, f"Section {i} should have 'page_number'"
-
+
             # Validate data types
-            assert isinstance(section["section_title"], str), f"Section {i} title should be string"
-            assert isinstance(section["section_content"], str), f"Section {i} content should be string"
-            assert isinstance(section["page_number"], int), f"Section {i} page_number should be integer"
-
+            assert isinstance(
+                section["section_title"], str
+            ), f"Section {i} title should be string"
+            assert isinstance(
+                section["section_content"], str
+            ), f"Section {i} content should be string"
+            assert isinstance(
+                section["page_number"], int
+            ), f"Section {i} page_number should be integer"
+
         logger.info(f"Successfully extracted {len(parsed_json)} sections from PDF")
-
+
         # Log first few sections for verification
         for i, section in enumerate(parsed_json[:3]):
             logger.info(f"Section {i+1}: {section['section_title'][:50]}...")
-
+
     except json.JSONDecodeError as e:
         pytest.fail(f"Failed to parse extracted text as JSON: {e}")
     except Exception as e:
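
The assertions above pin down the shape the extractor is expected to return: a JSON array of sections, each with a string `section_title`, a string `section_content`, and an integer `page_number`. A hand-written payload that would satisfy them (illustrative data, not taken from the test files):

```python
import json

# Illustrative payload; the titles and content are made up, only the shape matters.
valid_sections = [
    {
        "section_title": "Introduction",
        "section_content": "Opening paragraph of the document...",
        "page_number": 1,
    },
    {
        "section_title": "Key Findings",
        "section_content": "Summary of the main results...",
        "page_number": 2,
    },
]

extracted_json_text = json.dumps(valid_sections, indent=2)
parsed = json.loads(extracted_json_text)
assert isinstance(parsed, list)
assert all(
    isinstance(s["section_title"], str)
    and isinstance(s["section_content"], str)
    and isinstance(s["page_number"], int)
    for s in parsed
)
```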
Lines changed: 30 additions & 14 deletions
@@ -1,33 +1,43 @@
+from dotenv import load_dotenv
 import pytest
 import json
 import logging
 import os
-from ingestion_pipeline.metadata_extractors.simple_metadata_extractor import SimpleMetadataExtractor
+from ingestion_pipeline.metadata_extractors.simple_metadata_extractor import (
+    SimpleMetadataExtractor,
+)
+
+load_dotenv(".env")
+
 
 @pytest.fixture
 def azure_openai_credentials():
     return {
-        "api_key": "**",
-        "api_base": "**",
-        "api_version": "2023-03-15-preview",
-        "deployment_name": "gpt-4o"
+        "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
+        "api_base": os.getenv("AZURE_OPENAI_ENDPOINT"),
+        "api_version": os.getenv("AZURE_OPENAI_API_VERSION"),
+        "deployment_name": os.getenv("AZURE_OPENAI_MODEL"),
     }
 
+
 def test_simple_metadata_extractor_mineru(azure_openai_credentials):
     folder_path = "tests/files/mineru/cleaned_v3"
     metadata_output_folder = os.path.join(folder_path, "metadata")
     _run_extraction_test(folder_path, metadata_output_folder, azure_openai_credentials)
 
+
 def test_simple_metadata_extractor_smoldocling():
     folder_path = "tests/files/smoldocling/cleaned"
     metadata_output_folder = os.path.join(folder_path, "metadata")
     _run_extraction_test(folder_path, metadata_output_folder)
 
+
 def test_simple_metadata_extractor_olmocr(azure_openai_credentials):
     folder_path = "tests/files/olmocr/cleaned_v3"
     metadata_output_folder = os.path.join(folder_path, "metadata")
     _run_extraction_test(folder_path, metadata_output_folder, azure_openai_credentials)
 
+
 def _run_extraction_test(folder_path, metadata_output_folder, azure_openai_credentials):
     logging.basicConfig(level=logging.INFO)
     logger = logging.getLogger(__name__)
@@ -47,24 +57,30 @@ def _run_extraction_test(folder_path, metadata_output_folder, azure_openai_crede
             markdown_content = file.read()
 
         output_json_structure = {
-            "activity_names":{
-                "values": [ "" ],
-                "description": "List of activity names in the chapter. Activities in the text can be defined as guided, hands-on tasks or thought experiments designed to help students actively engage with and understand key concepts through observation, analysis, or exploration."
+            "activity_names": {
+                "values": [""],
+                "description": "List of activity names in the chapter. Activities in the text can be defined as guided, hands-on tasks or thought experiments designed to help students actively engage with and understand key concepts through observation, analysis, or exploration.",
             },
             "subtopic_names": {
-                "values": [ "" ],
-                "description": "List of subtopic names in the chapter. Subtopic names in the text can be defined as concise headings that organize and highlight specific themes or concepts within a broader chapter, guiding the flow of content and learning."
+                "values": [""],
+                "description": "List of subtopic names in the chapter. Subtopic names in the text can be defined as concise headings that organize and highlight specific themes or concepts within a broader chapter, guiding the flow of content and learning.",
             },
         }
-
-        result = extractor.extract(markdown_content, output_json_structure, **azure_openai_credentials)
+
+        result = extractor.extract(
+            markdown_content, output_json_structure, **azure_openai_credentials
+        )
 
         # Save the result as a JSON file
         output_file_path = os.path.join(
             metadata_output_folder,
-            os.path.basename(markdown_file_path).replace(".md", ".json")
+            os.path.basename(markdown_file_path).replace(".md", ".json"),
        )
         with open(output_file_path, "w", encoding="utf-8") as json_file:
             json.dump(result, json_file, indent=2, ensure_ascii=False)
 
-        logger.info("Extracted Metadata for %s saved to %s", markdown_file_path, output_file_path)
+        logger.info(
+            "Extracted Metadata for %s saved to %s",
+            markdown_file_path,
+            output_file_path,
+        )
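
The helper above writes one `<name>.json` per `<name>.md` into a `metadata/` subfolder. The visible hunks do not show that folder being created, so a setup step along these lines is presumably needed elsewhere; the sketch below only illustrates the path mapping the test relies on (the markdown file name is hypothetical).

```python
import os

folder_path = "tests/files/mineru/cleaned_v3"
metadata_output_folder = os.path.join(folder_path, "metadata")

# Ensure the output folder exists before JSON files are written into it
# (not shown in the visible hunks; exist_ok avoids failing on reruns).
os.makedirs(metadata_output_folder, exist_ok=True)

markdown_file_path = os.path.join(folder_path, "chapter_01.md")  # hypothetical file name
output_file_path = os.path.join(
    metadata_output_folder,
    os.path.basename(markdown_file_path).replace(".md", ".json"),
)
print(output_file_path)  # tests/files/mineru/cleaned_v3/metadata/chapter_01.json
```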
