#!/usr/bin/env python3
# TAP_OCR/main_v3.py (repository: theapprenticeproject/TAP_OCR)
# LAST_EDIT: 07-08-2025 (19:30)  |  @Neiblaze


import os
import sys
import json
import argparse
import logging
from pathlib import Path
from typing import Dict, List, Any
import cv2
import numpy as np

import pandas as pd
import fitz
from PIL import Image, ImageEnhance, ImageOps
import google.generativeai as genai
from dotenv import load_dotenv

# Configure the root logger once at import time: INFO level, with a
# "timestamp - level - message" record format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by every function and method in this file.
logger = logging.getLogger(__name__)

class EnhancedDocumentProcessor:
    def __init__(self, api_key: str, model_name: str = 'gemini-2.5-flash'):
        """Configure the Gemini client and create the generative model.

        Args:
            api_key: API key passed to ``genai.configure``.
            model_name: Name of the model to instantiate. Defaults to
                ``'gemini-2.5-flash'`` (the value that used to be hard-coded),
                so existing ``EnhancedDocumentProcessor(api_key)`` callers are
                unaffected.
        """
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel(model_name)

    def preprocess_image(self, image: Image.Image) -> Image.Image:
        """Clean up a page image before it is sent for extraction.

        Steps, in order: force RGB mode, auto-rotate, convert to grayscale,
        3x3 median blur, CLAHE contrast equalisation, 3x3 sharpening
        convolution. The single-channel result is converted back to an RGB
        ``PIL.Image``.

        Preprocessing is best-effort: any exception is logged as a warning and
        the input image (RGB-converted, if that step had already run) is
        returned instead.
        """
        try:
            # Rebinding ``image`` here matters: the fallback in the ``except``
            # branch returns whatever ``image`` refers to at that point.
            image = image if image.mode == 'RGB' else image.convert('RGB')

            rotated = self.auto_rotate_image(np.array(image))
            grayscale = cv2.cvtColor(rotated, cv2.COLOR_RGB2GRAY)

            smoothed = cv2.medianBlur(grayscale, 3)

            equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
            contrasted = equalizer.apply(smoothed)

            # Sharpening kernel: centre weight 9, all eight neighbours -1.
            sharpen_kernel = np.array([
                [-1, -1, -1],
                [-1, 9, -1],
                [-1, -1, -1],
            ])
            crisp = cv2.filter2D(contrasted, -1, sharpen_kernel)

            return Image.fromarray(crisp).convert('RGB')

        except Exception as e:
            logger.warning(f"Image preprocessing failed, using original: {e}")
            return image

    def auto_rotate_image(self, img_array: np.ndarray) -> np.ndarray:
        """Rotate a page based on the median angle of detected line segments.

        Line segments are found with Canny edges + probabilistic Hough
        transform; the angle of each segment is measured in degrees and the
        median is used as the page angle:

        * ``85 <= |median| <= 95``  -> snap to a quarter turn (+90 / -90);
        * ``|median| > 45`` otherwise -> rotate by the median angle itself;
        * anything else -> return the input untouched.

        Unlike a plain ``warpAffine`` onto the original canvas, the output
        canvas is resized to fit the rotated page, so a quarter turn of a
        non-square image swaps width/height instead of cropping the page.

        Args:
            img_array: Image array in RGB channel order (it is passed to
                ``cv2.cvtColor`` with ``COLOR_RGB2GRAY``).

        Returns:
            The rotated array, or ``img_array`` itself when no rotation is
            applied or when any step raises (the failure is logged as a
            warning).

        NOTE(review): the heuristic presumes the dominant segments follow the
        text direction; a page dominated by vertical rulings could also yield
        a median near +/-90 — confirm against real scans.
        """
        try:
            gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)

            # HoughLinesP → detect line segments and derive the rotation from them
            edges = cv2.Canny(gray, 50, 150, apertureSize=3)
            lines = cv2.HoughLinesP(edges, 1, np.pi/180, threshold=100, minLineLength=100, maxLineGap=10)
            if lines is None:
                return img_array

            angles = []
            for line in lines:
                x1, y1, x2, y2 = line[0]
                angles.append(np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi)
            if not angles:
                return img_array

            median_angle = np.median(angles)
            magnitude = abs(median_angle)

            # Only angles beyond 45 degrees are treated as a wrong orientation.
            if magnitude <= 45:
                return img_array

            quarter_turn = 85 <= magnitude <= 95
            if quarter_turn:
                rotation_angle = 90 if median_angle > 0 else -90
            else:
                rotation_angle = median_angle

            logger.info(f"Auto-rotating image by {rotation_angle} degrees")

            if quarter_turn:
                # getRotationMatrix2D(center, -90) is a clockwise quarter turn,
                # so a positive rotation_angle maps to ROTATE_90_CLOCKWISE.
                # cv2.rotate is lossless and swaps the canvas dimensions.
                direction = (
                    cv2.ROTATE_90_CLOCKWISE
                    if rotation_angle > 0
                    else cv2.ROTATE_90_COUNTERCLOCKWISE
                )
                return cv2.rotate(img_array, direction)

            height, width = img_array.shape[:2]
            center = (width // 2, height // 2)
            rotation_matrix = cv2.getRotationMatrix2D(center, -rotation_angle, 1.0)

            # Grow the canvas to the rotated bounding box and shift the
            # transform so the page stays centred instead of being cropped.
            cos_a = abs(rotation_matrix[0, 0])
            sin_a = abs(rotation_matrix[0, 1])
            new_width = int(height * sin_a + width * cos_a)
            new_height = int(height * cos_a + width * sin_a)
            rotation_matrix[0, 2] += new_width / 2 - center[0]
            rotation_matrix[1, 2] += new_height / 2 - center[1]

            return cv2.warpAffine(img_array, rotation_matrix, (new_width, new_height))

        except Exception as e:
            logger.warning(f"Auto-rotation failed, using original orientation: {e}")
            return img_array

    def extract_text_from_pdf(self, pdf_path: str) -> tuple[str, bool]:
        """Read the embedded text layer of a PDF.

        Each page whose stripped text is non-empty contributes a
        ``"\\n--- Page N ---\\n<text>"`` chunk (N is 1-based); pages without
        text are skipped.

        Args:
            pdf_path: Path of the PDF to open.

        Returns:
            ``(text_content, is_image_based)`` where ``is_image_based`` is
            ``True`` only when no page produced any text.

        Raises:
            Exception: Whatever opening or reading the PDF raises; it is
                logged and re-raised unchanged.

        NOTE(review): a PDF mixing text pages and scanned pages is reported as
        text-based, so its scanned pages never reach the image path.
        """
        try:
            page_chunks = []

            # The context manager closes the document even if a page raises,
            # which the previous open()/close() pair did not guarantee.
            with fitz.open(pdf_path) as doc:
                for page_num in range(doc.page_count):
                    text = doc.load_page(page_num).get_text().strip()
                    if text:
                        page_chunks.append(f"\n--- Page {page_num + 1} ---\n{text}")

            # A chunk is only added for non-blank text, so "no chunks" is
            # exactly the image-based case.
            return "".join(page_chunks), not page_chunks

        except Exception as e:
            logger.error(f"Error extracting text from PDF: {e}")
            raise

    def convert_pdf_to_images(self, pdf_path: str) -> List[Image.Image]:
        """Render every page of a PDF to a preprocessed PIL image.

        Each page is rasterised with a 3x zoom on both axes, encoded as PNG in
        memory, reopened with PIL and passed through ``preprocess_image``.

        Args:
            pdf_path: Path of the PDF to open.

        Returns:
            One preprocessed image per page, in page order.

        Raises:
            Exception: Whatever opening or rendering the PDF raises; it is
                logged and re-raised unchanged.
        """
        from io import BytesIO

        try:
            # The zoom matrix is the same for every page; build it once.
            zoom = fitz.Matrix(3.0, 3.0)
            images = []

            # The context manager closes the document even if rendering a
            # page fails part-way through.
            with fitz.open(pdf_path) as doc:
                for page_num in range(doc.page_count):
                    pix = doc.load_page(page_num).get_pixmap(matrix=zoom)
                    img = Image.open(BytesIO(pix.tobytes("png")))
                    images.append(self.preprocess_image(img))

            return images

        except Exception as e:
            logger.error(f"Error converting PDF to images: {e}")
            raise

    def extract_data_from_text(self, text_content: str) -> Dict[str, Any]:
        prompt = f"""
                You are an expert data extraction specialist. Extract ONLY the essential student information we need from this document.

                SMART FIELD IDENTIFICATION:
                Look for these field patterns and extract intelligently:

                1. STUDENT NAME: Look for fields like:
                - "Candidate's Name", "Student Name", "Name", "नाम", "छात्र का नाम"
                - Names in ANY language (Hindi/हिंदी, Bengali/বাংলা, Punjabi/ਪੰਜਾਬੀ, Marathi/मराठी, Tamil/தமிழ், etc.)
                - Transliterate non-English names to English script

                2. PHONE: Look for fields like:
                - "Mobile No", "Phone", "Contact", "Mobile Number", "मोबाइल नंबर"
                - Extract 10-digit Indian numbers (ignore country codes)

                3. GENDER: Look for:
                - "Sex", "Gender", "लिंग"
                - Codes: 1/M/Male=Male, 2/F/Female=Female, 3/O/Other=Other
                - If missing, intelligently infer from name patterns

                4. SCHOOL NAME: Extract from:
                - Document header/title/letterhead
                - Any school identification text

                5. GRADE/CLASS: Extract from:
                - Header information like "CLASS 10", "Grade 12", "कक्षा 10"
                - Any class/standard references

                6. COURSE/SUBJECTS: Look for:
                - Subject combinations, stream information
                - Subject codes (901=Hindi, 917=English, etc.)
                - Compile into meaningful course description

                7. SL NO: Use:
                - Form numbers, Roll numbers, Serial numbers, Row numbers
                - Generate sequence if not available

                LANGUAGE HANDLING:
                - Handle names in Devanagari (Hindi/Marathi): मुस्कान, दिव्यांशी, etc.
                - Handle Bengali script: মুস্কান, দিব্যাংশী, etc.
                - Handle Punjabi Gurmukhi: ਮੁਸਕਾਨ, ਦਿਵਿਆਂਸ਼ੀ, etc.
                - Handle Tamil script: முஸ்கான், திவ்யாஷீ, etc.
                - Transliterate ALL names to English script accurately
                - Preserve original spelling and pronunciation

                IGNORE these fields completely:
                - Photos, signatures, administrative codes, addresses, dates of birth, caste, minority status, Aadhar numbers, parent names, etc.

                    Return ONLY a valid JSON object:
                    {{
                        "school_name": "Extracted from document header/title, otherwise 'N/A'",
                        "grade": "Extracted class/grade info, otherwise 'N/A'",
                        "students": [
                            {{
                                "sl_no": "Form/Roll/Serial number as string",
                                "student_name": "Name transliterated to English if needed",
                                "gender": "Male/Female/Other or intelligently inferred",
                                "course_name": "Subject combination or course info, otherwise 'N/A'",
                                "phone": "10-digit mobile number only",
                                "language": "Detected language of original name or document",
                                "confidence_score": "A value between 0.0 and 1.0 indicating confidence in extraction accuracy"
                            }}
                        ]
                    }}

                    CONFIDENCE SCORING:
                    - 0.9-1.0: All required fields clearly identified and extracted
                    - 0.7-0.89: Minor uncertainty in field mapping or transliteration
                    - 0.5-0.69: Some fields unclear or requiring inference
                    - 0.0-0.49: Major uncertainty in data extraction

                    Focus ONLY on the 6 essential fields we need. Ignore everything else.
                    Return ONLY the JSON, no explanations.

                Document text:
                {text_content}
        """

        try:
            response = self.model.generate_content(prompt)
            result_text = response.text.strip()

            if "```json" in result_text:
                result_text = result_text.split("```json")[1].split("```")[0].strip()
            elif "```" in result_text:
                result_text = result_text.split("```")[1].strip()

            return json.loads(result_text)

        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse JSON from response: {e}")
            logger.error(f"Raw response: {result_text}")
            raise
        except Exception as e:
            logger.error(f"Error in API call: {e}")
            raise

    def extract_data_from_images(self, images: List[Image.Image]) -> Dict[str, Any]:
        prompt = """
            You are an expert data extraction specialist with advanced visual processing capabilities. Extract ONLY the essential student information from this document image.

            CRITICAL VISUAL PROCESSING RULES:

            ## STRIKETHROUGH TEXT HANDLING:
            - If you see text with strikethrough/line-through formatting, COMPLETELY IGNORE the struck-through text
            - Only extract the corrected/replacement text that appears alongside or instead of the struck-through text
            - Examples:
              * If you see "Sohel Verma Shivani" where "Sohel Verma" has strikethrough, extract only "Shivani"
              * If you see "Old Name New Name" where "Old Name" is crossed out, extract only "New Name"
              * Look for handwritten corrections, overwriting, or replacement text

            ## TEXT CORRECTION PATTERNS:
            - Handwritten corrections over printed text
            - Names written in margins as corrections
            - Multiple names where one is clearly marked as incorrect
            - Overwritten or replaced information

            ## SMART FIELD IDENTIFICATION:
            Look for these field patterns and extract intelligently:

            1. STUDENT NAME: Look for fields like:
            - "Candidate's Name", "Student Name", "Name", "नाम", "छात्र का नाम"
            - Names in ANY Indian language (Hindi/हिंदी, Bengali/বাংলা, Punjabi/ਪੰਜਾਬੀ, Marathi/मराठी, Tamil/தமிழ், Gujarati/ગુજરાતી, etc.)
            - **IMPORTANT**: Ignore any names that are visually struck through or crossed out, obscured with a line or scribble
            - Transliterate non-English names to English script accurately

            2. PHONE: Look for fields like:
            - "Mobile No", "Phone", "Contact", "Mobile Number", "मोबाइल नंबर"
            - Extract 10-digit Indian numbers (ignore country codes)
            - **IMPORTANT**: Ignore any numbers that are struck through or corrected

            3. GENDER: Look for:
            - "Sex", "Gender", "लिंग"
            - Codes: 1/M/Male=Male, 2/F/Female=Female, 3/O/Other=Other
            - If missing, intelligently infer from the FINAL/CORRECT name (not struck-through names)

            4. SCHOOL NAME: Extract from:
            - Document header/title/letterhead
            - Any school identification text

            5. GRADE/CLASS: Extract from:
            - Header information like "CLASS 10", "Grade 12", "कक्षा 10"
            - Any class/standard references

            6. COURSE/SUBJECTS: Look for:
            - Subject combinations, stream information
            - Subject codes (901=Hindi, 917=English, etc.)
            - Compile into meaningful course description

            7. SL NO: Use:
            - Form numbers, Roll numbers, Serial numbers, Row numbers
            - Generate sequence if not available

            ## MULTI-LANGUAGE NAME HANDLING:
            - Hindi/Devanagari: मुस्कान → Muskan, दिव्यांशी → Divyanshi
            - Bengali: মুস্কান → Muskan, দিব্যাংশী → Divyanshi
            - Punjabi/Gurmukhi: ਮੁਸਕਾਨ → Muskan, ਦਿਵਿਆਂਸ਼ੀ → Divyanshi
            - Tamil: முஸ்கான் → Muskan, திவ்யாஷீ → Divyanshi
            - Marathi: मुस्कान → Muskan, दिव्यांशी → Divyanshi
            - Gujarati: મુસ્કાન → Muskan, દિવ્યાંશી → Divyanshi
            - Handle mixed scripts and transliterate accurately

            ## IGNORE COMPLETELY:
            - Photos, signatures, administrative codes, addresses, dates of birth, caste information, minority status, Aadhar numbers, parent names, guardian details, etc.
            - ANY text that is visually struck through, crossed out, obscured with a line or scribble, or marked as incorrect

            ## INTELLIGENT PROCESSING:
            - Focus ONLY on the 6 essential data fields we need
            - Skip irrelevant columns and administrative data
            - Handle complex table layouts smartly
            - Transliterate names preserving pronunciation
            - Clean phone numbers to 10-digit format
            - Always prioritize corrected/final information over struck-through text

            Return ONLY a valid JSON object:
            {
                "school_name": "Extracted from document header/title, otherwise 'N/A'",
                "grade": "Extracted class/grade info, otherwise 'N/A'",
                "students": [
                    {
                        "sl_no": "Form/Roll/Serial number as string",
                        "student_name": "CORRECTED name only (strictly ignore strikethrough), transliterated to English if needed",
                        "gender": "Male/Female/Other or intelligently inferred from FINAL name",
                        "course_name": "Subject combination or course info, otherwise 'N/A'",
                        "phone": "CORRECTED 10-digit mobile number only (ignore struck-through numbers)",
                        "language": "Detected original language of name/document",
                        "confidence_score": "A value between 0.0 and 1.0 indicating confidence in extraction accuracy",
                        "has_corrections": true/false
                    }
                ]
            }

            CONFIDENCE SCORING:
            - 0.9-1.0: All essential fields clearly identified, corrections properly handled
            - 0.7-0.89: Minor uncertainty in correction detection or field mapping
            - 0.5-0.69: Some fields unclear or correction patterns ambiguous
            - 0.0-0.49: Major uncertainty in essential data extraction or correction handling

            CORRECTION DETECTION:
            - Set "has_corrections": true if you detect any strikethrough, overwriting, or correction patterns
            - Set "has_corrections": false if the text appears clean without corrections

            Return ONLY the JSON, no explanations.
        """

        try:
            response = self.model.generate_content([prompt] + images)
            result_text = response.text.strip()

            if "```json" in result_text:
                result_text = result_text.split("```json")[1].split("```")[0].strip()
            elif "```" in result_text:
                result_text = result_text.split("```")[1].strip()

            return json.loads(result_text)

        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse JSON from response: {e}")
            logger.error(f"Raw response: {result_text}")
            raise
        except Exception as e:
            logger.error(f"Error in Gemini Vision API call: {e}")
            raise

    def renumber_students(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Overwrite every student's ``sl_no`` with its 1-based position.

        The numbers are stored as strings ("1", "2", ...). The records are
        modified in place and the same ``data`` object is returned. When the
        ``"students"`` entry is missing or empty nothing is changed.
        """
        roster = data.get("students", [])
        if roster:
            for position, record in enumerate(roster, start=1):
                record["sl_no"] = str(position)
        return data

    def create_csv_output(self, data: Dict[str, Any], output_path: str):
        """Write the main results CSV and return the number of students.

        Layout (seven columns, no header/index emitted by pandas): a blank
        row, a "School Name" row, a "Grade" row, another blank row, the column
        titles, then one row per student. ``has_corrections`` is rendered as
        "Yes"/"No". Missing student fields become empty strings; missing
        school/grade become "N/A".

        Any exception is logged and re-raised.
        """
        try:
            table = [
                [""] * 7,
                ["School Name", data.get("school_name", "N/A"), "", "", "", "", ""],
                ["Grade", data.get("grade", "N/A"), "", "", "", "", ""],
                [""] * 7,
                ["SL No", "Student Name", "Gender", "Course Name", "Phone", "Language", "Has Corrections"],
            ]

            roster = data.get("students", [])
            flagged = 0

            for record in roster:
                corrected = record.get("has_corrections", False)
                flagged += 1 if corrected else 0

                text_fields = [
                    record.get(key, "")
                    for key in ("sl_no", "student_name", "gender", "course_name", "phone", "language")
                ]
                table.append(text_fields + ["Yes" if corrected else "No"])

            pd.DataFrame(table).to_csv(output_path, index=False, header=False)

            logger.info(f"CSV output saved to: {output_path}")
            if flagged > 0:
                logger.info(f"Detected corrections in {flagged} student records")

            return len(roster)

        except Exception as e:
            logger.error(f"Error creating CSV output: {e}")
            raise

    def create_confidence_csv(self, data: Dict[str, Any], output_path: str):
        """Write the per-student confidence report CSV.

        Columns: SL No, Student Name, Phone, Confidence Score (two decimals),
        Has Corrections, Needs Review. Nothing is written (and ``None`` is
        returned) when ``data`` has no students.

        The score is coerced with ``float()`` because the model may return it
        as a JSON string (e.g. ``"0.85"``) or ``null``; a missing or
        unparseable score counts as ``0.0`` so the record is flagged for
        review instead of aborting the whole report.

        NOTE(review): any score below 0.9 also forces "Has Corrections" to
        "Yes" in this report, which in turn makes "Needs Review" "Yes". That
        behaviour is kept as-is — confirm it is intended.

        Any exception is logged and re-raised.
        """
        try:
            students = data.get("students", [])
            if not students:
                return

            confidence_rows = [
                ["SL No", "Student Name", "Phone", "Confidence Score", "Has Corrections", "Needs Review"]
            ]

            for student in students:
                try:
                    confidence_score = float(student.get("confidence_score", 0.0))
                except (TypeError, ValueError):
                    # null, a non-numeric string, or another unexpected type
                    confidence_score = 0.0

                has_corrections = student.get("has_corrections", False)

                if confidence_score < 0.9:
                    has_corrections = True

                needs_review = "Yes" if confidence_score < 0.8 or has_corrections else "No"

                confidence_rows.append([
                    student.get("sl_no", ""),
                    student.get("student_name", ""),
                    student.get("phone", ""),
                    f"{confidence_score:.2f}",
                    "Yes" if has_corrections else "No",
                    needs_review
                ])

            df_confidence = pd.DataFrame(confidence_rows)
            df_confidence.to_csv(output_path, index=False, header=False)

            logger.info(f"Confidence CSV saved to: {output_path}")

        except Exception as e:
            logger.error(f"Error creating confidence CSV: {e}")
            raise

    def process_document(self, file_path: str) -> tuple[str, str]:
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        file_ext = Path(file_path).suffix.lower()

        if file_ext == '.pdf':
            logger.info("Processing PDF file with enhanced preprocessing...")

            text_content, is_image_based = self.extract_text_from_pdf(file_path)

            if is_image_based or not text_content.strip():
                logger.info("PDF appears to be image-based, using enhanced OCR...")
                images = self.convert_pdf_to_images(file_path)
                extracted_data = self.extract_data_from_images(images)
            else:
                logger.info("PDF contains text, extracting directly...")
                extracted_data = self.extract_data_from_text(text_content)

        elif file_ext in ['.jpg', '.jpeg', '.png']:
            logger.info("Processing image file with enhanced preprocessing...")
            image = Image.open(file_path)
            processed_image = self.preprocess_image(image)
            extracted_data = self.extract_data_from_images([processed_image])

        else:
            raise ValueError(f"Unsupported file format: {file_ext}")

        extracted_data = self.renumber_students(extracted_data)

        input_path = Path(file_path)
        output_base_dir = Path("output") / input_path.stem
        output_base_dir.mkdir(parents=True, exist_ok=True)

        output_path = output_base_dir / f"{input_path.stem}_output.csv"
        confidence_path = output_base_dir / f"{input_path.stem}_confidence.csv"

        student_count = self.create_csv_output(extracted_data, str(output_path))
        self.create_confidence_csv(extracted_data, str(confidence_path))

        logger.info(f"Processing complete!")
        logger.info(f"School: {extracted_data.get('school_name', 'N/A')}")
        logger.info(f"Grade: {extracted_data.get('grade', 'N/A')}")
        logger.info(f"Students processed: {student_count}")

        students = extracted_data.get("students", [])
        if students:
            avg_confidence = sum(s.get("confidence_score", 0) for s in students) / len(students)
            logger.info(f"Average confidence score: {avg_confidence:.2f}")

            corrections_count = sum(1 for s in students if s.get("has_corrections", False))
            if corrections_count > 0:
                logger.info(f"Records with detected corrections: {corrections_count}")

            low_confidence = [s for s in students if s.get("confidence_score", 1.0) < 0.7]
            if low_confidence:
                logger.warning(f"Found {len(low_confidence)} records with low confidence (<0.7)")

        return str(output_path), str(confidence_path)


def main():
    """Command-line entry point.

    Loads environment variables via ``load_dotenv``, parses the arguments,
    reads the API key from ``GOOGLE_GEMINI_API_KEY`` and processes the given
    file. Exits with status 1 when the key is missing or processing fails.

    ``--verbose`` / ``-v`` raises the root logger to DEBUG and adds the
    traceback to the generic "Processing error" log entry.
    """
    load_dotenv()

    parser = argparse.ArgumentParser(
        description="TAP — Document Data Extractor (V3)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument('file_path', help='Path to the input PDF or image file')
    parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose logging')

    args = parser.parse_args()

    # The flag is advertised in --help, so it has to do something.
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    api_key = os.getenv('GOOGLE_GEMINI_API_KEY')
    if not api_key:
        logger.error("GOOGLE_GEMINI_API_KEY not found")
        sys.exit(1)

    try:
        processor = EnhancedDocumentProcessor(api_key)
        output_file, confidence_file = processor.process_document(args.file_path)

        print("\nProcessing completed successfully!")
        print(f"Main output: {output_file}")
        print(f"Confidence report: {confidence_file}")

    except FileNotFoundError as e:
        logger.error(f"File error: {e}")
        sys.exit(1)
    except ValueError as e:
        logger.error(f"Input error: {e}")
        sys.exit(1)
    except Exception as e:
        # Unexpected failure: include the traceback only in verbose mode.
        logger.error(f"Processing error: {e}", exc_info=args.verbose)
        sys.exit(1)


if __name__ == "__main__":
    main()