
Commit afc0853

Author: Kavyansh Chourasia (committed)
Shiksha Ingestion: Added TOC Page finding step
1 parent 3908758 commit afc0853

3 files changed: 135 additions, 9 deletions
Lines changed: 121 additions & 0 deletions (new file)
@@ -0,0 +1,121 @@
import logging
import os
import json
from dataclasses import dataclass
from enum import Enum, auto
from typing import Dict, Any, Optional, Set, Type
from dotenv import load_dotenv
from ingestion_pipeline.utils.toc_page_finder import TOCPageFinder
from ingestion_pipeline.base.pipeline import BasePipelineStep, StepResult, StepStatus

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

load_dotenv(".env")


def azure_openai_credentials():
    """
    Returns credentials for connecting to Azure OpenAI service.

    Returns:
        Dict[str, str]: A dictionary with Azure OpenAI credentials including API key, base URL,
        API version, and deployment name.
    """
    return {
        "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
        "api_base": os.getenv("AZURE_OPENAI_ENDPOINT"),
        "api_version": os.getenv("AZURE_OPENAI_API_VERSION"),
        "deployment_name": os.getenv("AZURE_OPENAI_MODEL"),
    }


class TOCPageFindingStep(BasePipelineStep):
    """Find table of contents page range in a PDF file.

    This step identifies the table of contents section in a PDF and outputs:
    - toc_page_range: JSON file containing start/end pages and summary
    - page_offset: The offset for subsequent processing (end_page - 1)
    """

    name = "toc_page_finding"
    description = "Find table of contents page range in PDF"
    input_types = {"pdf"}
    output_types = {"toc_page_range", "page_offset"}

    def process(self, input_paths: Dict[str, str], output_dir: str) -> StepResult:
        """
        Process the step - find TOC page range in PDF.

        Args:
            input_paths: Dictionary with "pdf" key mapping to PDF file path
            output_dir: Directory where TOC page range JSON will be saved

        Returns:
            StepResult with status and output paths including:
            - toc_page_range: Path to JSON file with TOC page range information
            - page_offset: Calculated offset (end_page - 1) for subsequent pipeline steps
        """
        pdf_path = input_paths["pdf"]
        output_filename = os.path.basename(pdf_path).replace(
            ".pdf", "_toc_page_range.json"
        )
        output_path = os.path.join(output_dir, output_filename)

        try:
            logger.info(f"Finding TOC page range in {pdf_path}")

            # Get configuration with defaults
            max_pages_to_check = self.config.get("max_pages_to_check", 30)
            batch_size = self.config.get("batch_size", 3)

            # Get credentials for TOC page finding
            credentials = azure_openai_credentials()

            # Find TOC page range
            toc_finder = TOCPageFinder()
            start_page, end_page, range_summary = toc_finder.get_toc_page_range(
                pdf_path,
                max_pages_to_check=max_pages_to_check,
                batch_size=batch_size,
                **credentials,
            )

            # Create page range dictionary
            page_range_dict = {
                "start": start_page,
                "end": end_page,
                "summary": range_summary,
            }

            logger.info(f"Found TOC page range: {page_range_dict}")

            # Save page range
            os.makedirs(output_dir, exist_ok=True)
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(page_range_dict, f, indent=2)

            logger.info(f"Saved TOC page range to {output_path}")

            return StepResult(
                status=StepStatus.COMPLETED,
                output_paths={
                    "toc_page_range": output_path,
                    "page_offset": end_page - 1,
                },
                metadata={
                    "start_page": start_page,
                    "end_page": end_page,
                    "page_count": (
                        end_page - start_page + 1 if end_page >= start_page else 0
                    ),
                    "summary": range_summary,
                },
            )

        except Exception as e:
            logger.exception(f"Error finding TOC page range: {e}")
            return StepResult(status=StepStatus.FAILED, error=e)
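
For orientation, here is a minimal sketch of how a later consumer could read the *_toc_page_range.json file this step writes and recompute the page_offset it reports (end_page - 1). The file name and page values are illustrative only; just the JSON keys (start, end, summary) come from the code above.

import json

# Illustrative only: file name and values are made up; the JSON shape matches
# what TOCPageFindingStep writes.
with open("chapter_book_toc_page_range.json", encoding="utf-8") as f:
    page_range = json.load(f)  # e.g. {"start": 3, "end": 5, "summary": "..."}

# Same derivation the step uses for its "page_offset" output.
page_offset = page_range["end"] - 1
print(page_range["start"], page_range["end"], page_offset)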

shiksha-ingestion/pipeline_steps/step_1_toc_extraction.py

Lines changed: 12 additions & 5 deletions
@@ -38,29 +38,36 @@ class TOCExtractionStep(BasePipelineStep):
 
     name = "toc_extraction"
     description = "Extract table of contents from PDF"
-    input_types = {"pdf"}
+    input_types = {"pdf", "toc_page_range"}
     output_types = {"toc"}
 
     def process(self, input_paths: Dict[str, str], output_dir: str) -> StepResult:
         """
         Process the step - extract TOC from PDF.
 
         Args:
-            input_paths: Dictionary with "pdf" key mapping to PDF file path
+            input_paths: Dictionary with "pdf" and "toc_page_range" keys mapping to file paths
             output_dir: Directory where TOC JSON will be saved
 
         Returns:
             StepResult with status and output paths
         """
         pdf_path = input_paths["pdf"]
+        toc_page_range_path = input_paths["toc_page_range"]
         output_filename = os.path.basename(pdf_path).replace(".pdf", ".json")
         output_path = os.path.join(output_dir, output_filename)
 
         try:
             logger.info(f"Processing {pdf_path}")
 
-            # Get configuration
-            page_number = self.config.get("page_number", 0)
+            # Load TOC page range from step 0 output
+            with open(toc_page_range_path, "r", encoding="utf-8") as f:
+                page_range_data = json.load(f)
+
+            start_page = page_range_data["start"]
+            end_page = page_range_data["end"]
+
+            logger.info(f"Using TOC page range: {start_page}-{end_page}")
 
             # Get credentials for TOC extraction
             credentials = azure_openai_credentials()
@@ -84,7 +91,7 @@ def process(self, input_paths: Dict[str, str], output_dir: str) -> StepResult:
 
         toc = extractor.extract_table_of_contents(
             pdf_path,
-            page_range=(page_number, page_number + 1),
+            page_range=(start_page, end_page),
             document_specific_hint=document_hint,
             **credentials,
         )
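
The practical effect of this change: the extractor previously received a fixed one-page window taken from config, and now receives the range detected by the TOC page finding step, which can span several pages. A small illustrative comparison (the page numbers below are hypothetical, not from the commit):

# Old behaviour: a single configured page, always one page wide.
page_number = 4
old_range = (page_number, page_number + 1)   # (4, 5)

# New behaviour: the range detected by the TOC page finding step.
start_page, end_page = 3, 5
new_range = (start_page, end_page)           # (3, 5), can span several pages

print(old_range, new_range)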

shiksha-ingestion/pipeline_steps/step_2_pdf_splitting.py

Lines changed: 2 additions & 4 deletions
@@ -27,7 +27,7 @@ class PDFSplittingStep(BasePipelineStep):
 
     name = "pdf_splitting"
     description = "Split PDF documents into chapters based on table of contents"
-    input_types = {"pdf", "toc"}
+    input_types = {"pdf", "toc", "page_offset"}
     output_types = {"split_pdfs"}
 
     def process(self, input_paths: Dict[str, str], output_dir: str) -> StepResult:
@@ -43,13 +43,11 @@ def process(self, input_paths: Dict[str, str], output_dir: str) -> StepResult:
         """
         pdf_path = input_paths["pdf"]
         toc_path = input_paths["toc"]
+        page_offset = input_paths["page_offset"]
 
         try:
             logger.info(f"Processing PDF splitting for {pdf_path}")
 
-            # Get page offset from config or default to 0
-            page_offset = self.config.get("page_offset", 0)
-
             # Read the TOC from the JSON file
             toc = self._read_toc_from_json(toc_path)
