
Commit e1e485e

Ingestion Pipeline: Test LLM credentials from env

Author: Kavyansh Chourasia (committed)
1 parent 356289e · commit e1e485e

File tree

8 files changed: +164 additions, -174 deletions


components/ingestion-pipeline/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -2,3 +2,4 @@ tests/files/
 .pytest_cache/
 **/__pycache__/
 .vscode/
+.env
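
With `.env` now ignored by git, the tests in this commit expect local Azure OpenAI credentials to live in that file. As a rough illustration (not part of the commit), the sketch below checks that the four variables the new test fixtures read are present; the example values in the comment are placeholders, with the API version and model name borrowed from the old hard-coded fixtures.

```python
# Minimal sketch (not part of the commit): reading the git-ignored .env file.
# The variable names come from the test fixtures in this commit; the values
# below are placeholders, not real credentials.
#
# Example .env contents (assumed layout):
#   AZURE_OPENAI_API_KEY=<your-key>
#   AZURE_OPENAI_ENDPOINT=https://<your-resource>.openai.azure.com/
#   AZURE_OPENAI_API_VERSION=2024-08-01-preview
#   AZURE_OPENAI_MODEL=gpt-4o
from dotenv import dotenv_values

config = dotenv_values(".env")  # parses KEY=value pairs without touching os.environ
missing = [
    name
    for name in (
        "AZURE_OPENAI_API_KEY",
        "AZURE_OPENAI_ENDPOINT",
        "AZURE_OPENAI_API_VERSION",
        "AZURE_OPENAI_MODEL",
    )
    if not config.get(name)
]
if missing:
    print(f"Missing credentials in .env: {missing}")
```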

components/ingestion-pipeline/poetry.lock

Lines changed: 24 additions & 106 deletions
Some generated files are not rendered by default.

components/ingestion-pipeline/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ packages = [
 
 # Where you declare runtime dependencies
 [tool.poetry.dependencies]
-python = "^3.10"
+python = "~3.11"
 pytesseract = "^0.3.9"
 PyPDF2 = "*"
 azure-ai-formrecognizer = "^3.3.3"

components/ingestion-pipeline/src/ingestion_pipeline/text_extractors/llm_json_extractor.py

Lines changed: 29 additions & 15 deletions
@@ -199,33 +199,43 @@ def _process_page_with_llm(
 
             # Extract the response content
             extracted_json_text = response["choices"][0]["message"]["content"].strip()
-
+
             # Parse the JSON response
             try:
                 page_sections = json.loads(extracted_json_text)
                 if not isinstance(page_sections, list):
                     raise ValueError("Response is not a JSON array")
-
+
                 # Validate the structure
                 for section in page_sections:
-                    if not isinstance(section, dict) or "section_title" not in section or "section_content" not in section:
+                    if (
+                        not isinstance(section, dict)
+                        or "section_title" not in section
+                        or "section_content" not in section
+                    ):
                         raise ValueError("Invalid section structure")
-
+
                 return page_sections
             except (json.JSONDecodeError, ValueError) as e:
-                self.logger.error(f"Error parsing JSON response for page {page_number}: {e}")
+                self.logger.error(
+                    f"Error parsing JSON response for page {page_number}: {e}"
+                )
                 # Return a fallback structure
-                return [{
-                    "section_title": f"Page {page_number} (Parse Error)",
-                    "section_content": f"*Error parsing JSON response: {str(e)}*\n\nRaw response:\n{extracted_json_text}"
-                }]
+                return [
+                    {
+                        "section_title": f"Page {page_number} (Parse Error)",
+                        "section_content": f"*Error parsing JSON response: {str(e)}*\n\nRaw response:\n{extracted_json_text}",
+                    }
+                ]
 
         except Exception as e:
             self.logger.error(f"Error processing page {page_number} with LLM: {e}")
-            return [{
-                "section_title": f"Page {page_number} (Processing Error)",
-                "section_content": f"*Error processing page {page_number}: {str(e)}*"
-            }]
+            return [
+                {
+                    "section_title": f"Page {page_number} (Processing Error)",
+                    "section_content": f"*Error processing page {page_number}: {str(e)}*",
+                }
+            ]
 
     def _process_pages_in_batches(
         self,
@@ -257,12 +267,16 @@ def _process_pages_in_batches
             # Determine context to use - limit to recent pages to prevent context overflow
             if i >= batch_size:
                 # Keep only the latest batch_size pages worth of sections
-                context_sections = accumulated_context_sections[-batch_size * 10:]  # Approximate limit
+                context_sections = accumulated_context_sections[
+                    -batch_size * 10 :
+                ]  # Approximate limit
             else:
                 context_sections = accumulated_context_sections
 
             # Convert context sections to JSON string
-            context_json = json.dumps(context_sections, indent=2) if context_sections else ""
+            context_json = (
+                json.dumps(context_sections, indent=2) if context_sections else ""
+            )
 
             # Process the current page with context
             page_sections = self._process_page_with_llm(
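
The extractor changes above are a formatter-style rewrap of the page-level parse-and-fallback logic. For readability, here is a condensed, standalone sketch of that pattern (the function name, signature, and logger setup are illustrative, not the module's actual method): the LLM response is parsed as JSON, validated as a list of `section_title`/`section_content` dicts, and on any parse or validation error a single fallback section carrying the raw response is returned.

```python
import json
import logging

logger = logging.getLogger(__name__)


def parse_sections_or_fallback(extracted_json_text: str, page_number: int) -> list[dict]:
    """Condensed sketch of the parse-with-fallback pattern shown in the diff above.

    Simplified, standalone rewrite for illustration; not the module's actual method.
    """
    try:
        page_sections = json.loads(extracted_json_text)
        if not isinstance(page_sections, list):
            raise ValueError("Response is not a JSON array")
        for section in page_sections:
            if (
                not isinstance(section, dict)
                or "section_title" not in section
                or "section_content" not in section
            ):
                raise ValueError("Invalid section structure")
        return page_sections
    except (json.JSONDecodeError, ValueError) as e:
        logger.error("Error parsing JSON response for page %s: %s", page_number, e)
        # Fall back to a single section that preserves the raw LLM output.
        return [
            {
                "section_title": f"Page {page_number} (Parse Error)",
                "section_content": f"*Error parsing JSON response: {e}*\n\nRaw response:\n{extracted_json_text}",
            }
        ]
```

The second hunk applies the same formatting pass to the batch loop, which caps the accumulated context at roughly `batch_size * 10` prior sections before serializing it with `json.dumps`.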
Lines changed: 23 additions & 10 deletions
@@ -1,36 +1,46 @@
+from dotenv import load_dotenv
 import pytest
 import json
 import logging
 import os
-from ingestion_pipeline.text_postprocessors.clean_markdown_post_processor import CleanMarkdownPostProcessor
+from ingestion_pipeline.text_postprocessors.clean_markdown_post_processor import (
+    CleanMarkdownPostProcessor,
+)
+
+load_dotenv(".env")
+
 
 @pytest.fixture
 def azure_openai_credentials():
     return {
-        "api_key": "**",
-        "api_base": "**",
-        "api_version": "2023-03-15-preview",
-        "deployment_name": "gpt-4o"
+        "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
+        "api_base": os.getenv("AZURE_OPENAI_ENDPOINT"),
+        "api_version": os.getenv("AZURE_OPENAI_API_VERSION"),
+        "deployment_name": os.getenv("AZURE_OPENAI_MODEL"),
     }
 
+
 def test_simple_metadata_extractor_mineru(azure_openai_credentials):
     folder_path = "tests/files/mineru/"
     output_folder = os.path.join(folder_path, "cleaned_v3")
     _run_cleaning_test(folder_path, output_folder, azure_openai_credentials)
 
+
 def test_simple_metadata_extractor_smoldocling(azure_openai_credentials):
     folder_path = "tests/files/smoldocling/"
     output_folder = os.path.join(folder_path, "cleaned_v3")
     _run_cleaning_test(folder_path, output_folder, azure_openai_credentials)
 
+
 def test_simple_metadata_extractor_olmocr(azure_openai_credentials):
     folder_path = "tests/files/olmocr/"
     output_folder = os.path.join(folder_path, "cleaned_v3")
     _run_cleaning_test(folder_path, output_folder, azure_openai_credentials)
 
+
 def _run_cleaning_test(folder_path, cleaned_output_folder, azure_openai_credentials):
     logger = logging.getLogger(__name__)
-
+
     # Get markdown files
     markdown_files = [
         os.path.join(folder_path, file)
@@ -44,14 +54,17 @@ def _run_cleaning_test(folder_path, cleaned_output_folder, azure_openai_credenti
     for markdown_file_path in markdown_files:
         with open(markdown_file_path, "r", encoding="utf-8") as file:
             markdown_content = file.read()
-        result = post_processor.post_process(markdown_content, **azure_openai_credentials)
+        result = post_processor.post_process(
+            markdown_content, **azure_openai_credentials
+        )
 
         # Save the result as a JSON file
         output_file_path = os.path.join(
-            cleaned_output_folder,
-            os.path.basename(markdown_file_path)
+            cleaned_output_folder, os.path.basename(markdown_file_path)
        )
         with open(output_file_path, "w", encoding="utf-8") as md_file:
             md_file.write(result)
 
-        logger.info("Cleaned markdown for %s saved to %s", markdown_file_path, output_file_path)
+        logger.info(
+            "Cleaned markdown for %s saved to %s", markdown_file_path, output_file_path
+        )
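
All three test modules in this commit switch their `azure_openai_credentials` fixture from hard-coded placeholders to `load_dotenv(".env")` plus `os.getenv`. One practical consequence is that the fixture silently yields `None` values when no `.env` file is present. A minimal sketch of a guarded variant (the `pytest.skip` call is an assumption, not part of the commit):

```python
import os

import pytest
from dotenv import load_dotenv

load_dotenv(".env")

REQUIRED_VARS = (
    "AZURE_OPENAI_API_KEY",
    "AZURE_OPENAI_ENDPOINT",
    "AZURE_OPENAI_API_VERSION",
    "AZURE_OPENAI_MODEL",
)


@pytest.fixture
def azure_openai_credentials():
    # Skip instead of failing with confusing auth errors when credentials are absent.
    missing = [name for name in REQUIRED_VARS if not os.getenv(name)]
    if missing:
        pytest.skip(f"Azure OpenAI credentials not configured: {', '.join(missing)}")
    return {
        "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
        "api_base": os.getenv("AZURE_OPENAI_ENDPOINT"),
        "api_version": os.getenv("AZURE_OPENAI_API_VERSION"),
        "deployment_name": os.getenv("AZURE_OPENAI_MODEL"),
    }
```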

components/ingestion-pipeline/tests/test_llm_json_extractor.py

Lines changed: 37 additions & 17 deletions
@@ -1,18 +1,23 @@
+from dotenv import load_dotenv
 import pytest
 import logging
 import os
 import json
 from ingestion_pipeline.text_extractors.llm_json_extractor import LLMJSONExtractor
 
+load_dotenv(".env")
+
+
 @pytest.fixture
 def azure_openai_credentials():
     return {
-        "api_key": "**",
-        "api_base": "**",
-        "api_version": "2024-08-01-preview",
-        "deployment_name": "gpt-4o"
+        "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
+        "api_base": os.getenv("AZURE_OPENAI_ENDPOINT"),
+        "api_version": os.getenv("AZURE_OPENAI_API_VERSION"),
+        "deployment_name": os.getenv("AZURE_OPENAI_MODEL"),
     }
 
+
 def test_llm_json_extractor_social(azure_openai_credentials):
     """
     Test LLM JSON extractor with english-one-column-social.pdf
@@ -21,10 +26,11 @@ def test_llm_json_extractor_social(azure_openai_credentials):
     pdf_file_path = "tests/files/english-one-column-social.pdf"
     _run_llm_json_extractor_test(pdf_file_path, azure_openai_credentials)
 
+
 def _run_llm_json_extractor_test(pdf_file_path, azure_openai_credentials):
     """
     Helper function to run LLM JSON extractor test
-
+
     Args:
         pdf_file_path: Path to the PDF file to analyze
         azure_openai_credentials: Azure OpenAI credentials
@@ -42,33 +48,47 @@ def _run_llm_json_extractor_test(pdf_file_path, azure_openai_credentials):
     logger.info(f"Extracting text from PDF file: {pdf_file_path}")
 
     # Extract text in JSON format
-    extracted_json_text = json_extractor.extract_text(pdf_file_path, **azure_openai_credentials)
+    extracted_json_text = json_extractor.extract_text(
+        pdf_file_path, **azure_openai_credentials
+    )
 
-    logger.info(f"Extraction completed. Result length: {len(extracted_json_text)} characters")
+    logger.info(
+        f"Extraction completed. Result length: {len(extracted_json_text)} characters"
+    )
 
     # Validate that the result is valid JSON
     try:
         parsed_json = json.loads(extracted_json_text)
         assert isinstance(parsed_json, list), "Result should be a JSON array"
-
+
         # Validate structure of each section
         for i, section in enumerate(parsed_json):
             assert isinstance(section, dict), f"Section {i} should be a dictionary"
-            assert "section_title" in section, f"Section {i} should have 'section_title'"
-            assert "section_content" in section, f"Section {i} should have 'section_content'"
+            assert (
+                "section_title" in section
+            ), f"Section {i} should have 'section_title'"
+            assert (
+                "section_content" in section
+            ), f"Section {i} should have 'section_content'"
             assert "page_number" in section, f"Section {i} should have 'page_number'"
-
+
             # Validate data types
-            assert isinstance(section["section_title"], str), f"Section {i} title should be string"
-            assert isinstance(section["section_content"], str), f"Section {i} content should be string"
-            assert isinstance(section["page_number"], int), f"Section {i} page_number should be integer"
-
+            assert isinstance(
+                section["section_title"], str
+            ), f"Section {i} title should be string"
+            assert isinstance(
+                section["section_content"], str
+            ), f"Section {i} content should be string"
+            assert isinstance(
+                section["page_number"], int
+            ), f"Section {i} page_number should be integer"
+
         logger.info(f"Successfully extracted {len(parsed_json)} sections from PDF")
-
+
         # Log first few sections for verification
         for i, section in enumerate(parsed_json[:3]):
             logger.info(f"Section {i+1}: {section['section_title'][:50]}...")
-
+
     except json.JSONDecodeError as e:
         pytest.fail(f"Failed to parse extracted text as JSON: {e}")
     except Exception as e:
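
The assertions above pin down the shape the extractor is expected to return: a JSON array of sections, each with a string `section_title`, a string `section_content`, and an integer `page_number`. A hand-written payload that would satisfy them (illustrative data, not taken from the test files):

```python
import json

# Illustrative payload; the titles and content are made up, only the shape matters.
valid_sections = [
    {
        "section_title": "Introduction",
        "section_content": "Opening paragraph of the document...",
        "page_number": 1,
    },
    {
        "section_title": "Key Findings",
        "section_content": "Summary of the main results...",
        "page_number": 2,
    },
]

extracted_json_text = json.dumps(valid_sections, indent=2)
parsed = json.loads(extracted_json_text)
assert isinstance(parsed, list)
assert all(
    isinstance(s["section_title"], str)
    and isinstance(s["section_content"], str)
    and isinstance(s["page_number"], int)
    for s in parsed
)
```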
Lines changed: 30 additions & 14 deletions
@@ -1,33 +1,43 @@
+from dotenv import load_dotenv
 import pytest
 import json
 import logging
 import os
-from ingestion_pipeline.metadata_extractors.simple_metadata_extractor import SimpleMetadataExtractor
+from ingestion_pipeline.metadata_extractors.simple_metadata_extractor import (
+    SimpleMetadataExtractor,
+)
+
+load_dotenv(".env")
+
 
 @pytest.fixture
 def azure_openai_credentials():
     return {
-        "api_key": "**",
-        "api_base": "**",
-        "api_version": "2023-03-15-preview",
-        "deployment_name": "gpt-4o"
+        "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
+        "api_base": os.getenv("AZURE_OPENAI_ENDPOINT"),
+        "api_version": os.getenv("AZURE_OPENAI_API_VERSION"),
+        "deployment_name": os.getenv("AZURE_OPENAI_MODEL"),
     }
 
+
 def test_simple_metadata_extractor_mineru(azure_openai_credentials):
     folder_path = "tests/files/mineru/cleaned_v3"
     metadata_output_folder = os.path.join(folder_path, "metadata")
     _run_extraction_test(folder_path, metadata_output_folder, azure_openai_credentials)
 
+
 def test_simple_metadata_extractor_smoldocling():
     folder_path = "tests/files/smoldocling/cleaned"
     metadata_output_folder = os.path.join(folder_path, "metadata")
     _run_extraction_test(folder_path, metadata_output_folder)
 
+
 def test_simple_metadata_extractor_olmocr(azure_openai_credentials):
     folder_path = "tests/files/olmocr/cleaned_v3"
     metadata_output_folder = os.path.join(folder_path, "metadata")
     _run_extraction_test(folder_path, metadata_output_folder, azure_openai_credentials)
 
+
 def _run_extraction_test(folder_path, metadata_output_folder, azure_openai_credentials):
     logging.basicConfig(level=logging.INFO)
     logger = logging.getLogger(__name__)
@@ -47,24 +57,30 @@ def _run_extraction_test(folder_path, metadata_output_folder, azure_openai_crede
             markdown_content = file.read()
 
         output_json_structure = {
-            "activity_names":{
-                "values": [ "" ],
-                "description": "List of activity names in the chapter. Activities in the text can be defined as guided, hands-on tasks or thought experiments designed to help students actively engage with and understand key concepts through observation, analysis, or exploration."
+            "activity_names": {
+                "values": [""],
+                "description": "List of activity names in the chapter. Activities in the text can be defined as guided, hands-on tasks or thought experiments designed to help students actively engage with and understand key concepts through observation, analysis, or exploration.",
             },
             "subtopic_names": {
-                "values": [ "" ],
-                "description": "List of subtopic names in the chapter. Subtopic names in the text can be defined as concise headings that organize and highlight specific themes or concepts within a broader chapter, guiding the flow of content and learning."
+                "values": [""],
+                "description": "List of subtopic names in the chapter. Subtopic names in the text can be defined as concise headings that organize and highlight specific themes or concepts within a broader chapter, guiding the flow of content and learning.",
             },
         }
-
-        result = extractor.extract(markdown_content, output_json_structure, **azure_openai_credentials)
+
+        result = extractor.extract(
+            markdown_content, output_json_structure, **azure_openai_credentials
+        )
 
         # Save the result as a JSON file
         output_file_path = os.path.join(
             metadata_output_folder,
-            os.path.basename(markdown_file_path).replace(".md", ".json")
+            os.path.basename(markdown_file_path).replace(".md", ".json"),
        )
         with open(output_file_path, "w", encoding="utf-8") as json_file:
             json.dump(result, json_file, indent=2, ensure_ascii=False)
 
-        logger.info("Extracted Metadata for %s saved to %s", markdown_file_path, output_file_path)
+        logger.info(
+            "Extracted Metadata for %s saved to %s",
+            markdown_file_path,
+            output_file_path,
+        )
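
The helper above writes one `<name>.json` per `<name>.md` into a `metadata/` subfolder. The visible hunks do not show that folder being created, so a setup step along these lines is presumably needed elsewhere; the sketch below only illustrates the path mapping the test relies on (the markdown file name is hypothetical).

```python
import os

folder_path = "tests/files/mineru/cleaned_v3"
metadata_output_folder = os.path.join(folder_path, "metadata")

# Ensure the output folder exists before JSON files are written into it
# (not shown in the visible hunks; exist_ok avoids failing on reruns).
os.makedirs(metadata_output_folder, exist_ok=True)

markdown_file_path = os.path.join(folder_path, "chapter_01.md")  # hypothetical file name
output_file_path = os.path.join(
    metadata_output_folder,
    os.path.basename(markdown_file_path).replace(".md", ".json"),
)
print(output_file_path)  # tests/files/mineru/cleaned_v3/metadata/chapter_01.json
```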
