-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathreport_xml_parsing.py
More file actions
146 lines (115 loc) · 4.57 KB
/
report_xml_parsing.py
File metadata and controls
146 lines (115 loc) · 4.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import os
import json
import argparse
import xml.etree.ElementTree as ET
from typing import List, Dict, Optional
def get_abstract_text(root: ET.Element, label: str) -> str:
    """
    Extract the text of every <AbstractText Label="LABEL"> below root.

    The label comparison ignores case and surrounding whitespace on both
    the Label attribute and the requested label. The text of each matching
    element (including text nested in child elements) is stripped; blank
    results are dropped and the rest are joined with a newline.
    Returns an empty string if nothing matches.
    """
    # Normalise the requested label once, the same way the attribute is
    # normalised below, so that e.g. " findings " matches Label="FINDINGS".
    wanted = label.strip().upper()
    parts: List[str] = []
    for node in root.findall(".//AbstractText"):
        node_label = (node.attrib.get("Label") or "").strip().upper()
        if node_label != wanted:
            continue
        # itertext() also picks up text inside inline child elements.
        text = "".join(node.itertext()).strip()
        if text:
            parts.append(text)
    return "\n".join(parts)
def get_mesh_terms(root: ET.Element) -> Dict[str, List[str]]:
    """
    Collect MeSH terms from the first <MeSH> element found below root.

    Direct children are bucketed by their lower-cased tag name: <major>
    entries go to 'major', <automatic> entries go to 'automatic'. Text is
    stripped; blank entries and children with any other tag are ignored.
    Both lists are empty when there is no <MeSH> element.
    """
    terms: Dict[str, List[str]] = {"major": [], "automatic": []}
    container = root.find(".//MeSH")
    # Explicit None test: the "no match" case must not rely on Element truthiness.
    entries = container if container is not None else ()
    for entry in entries:
        bucket = terms.get(entry.tag.lower())
        value = (entry.text or "").strip()
        if bucket is not None and value:
            bucket.append(value)
    return terms
def map_url_to_local_path(url_text: str, images_root: Optional[str]) -> str:
    """
    Translate an image URL from a report into the path stored in a record.

    The URL is stripped first (None is treated as empty). An empty URL
    gives "". Without images_root the stripped URL is returned unchanged;
    with images_root the result is images_root joined with the URL's
    basename.
    """
    cleaned = (url_text or "").strip()
    if not cleaned or not images_root:
        # Nothing to map: either there is no URL or no local folder was given.
        return cleaned
    # e.g. /hadoop/.../CXR1_1_IM-0001-3001.jpg -> <images_root>/CXR1_1_IM-0001-3001.jpg
    file_name = os.path.basename(cleaned)
    return os.path.join(images_root, file_name)
def parse_single_xml(xml_path: str, images_root: Optional[str]) -> Optional[Dict]:
    """
    Build one record for the frontal image (figureId "F1") of an XML report.

    The FINDINGS and IMPRESSION abstract sections and the MeSH terms are
    read from the parsed file. A report without FINDINGS text yields None.
    Otherwise the first <parentImage> whose <figureId> text is "F1" and that
    has an id attribute or a <url> becomes the record, with the keys
    image_id, image_path, findings, impression and mesh. Returns None when
    no such <parentImage> exists.
    """
    root = ET.parse(xml_path).getroot()

    # Report text sections and MeSH terms
    findings = get_abstract_text(root, "FINDINGS")
    impression = get_abstract_text(root, "IMPRESSION")
    mesh = get_mesh_terms(root)

    # A report without findings text is not usable
    if not findings:
        return None

    for parent in root.findall(".//parentImage"):
        # Only the frontal image (figureId "F1") is of interest
        figure_id = (parent.findtext(".//figureId") or "").strip()
        if figure_id != "F1":
            continue

        image_id = (parent.attrib.get("id") or "").strip()
        # The <url> inside this parentImage, mapped to a local path if requested
        raw_url = (parent.findtext(".//url") or "").strip()
        image_path = map_url_to_local_path(raw_url, images_root)
        if not image_id and not image_path:
            continue

        # The reports reference .jpg but files are .png
        if image_path.endswith(".jpg"):
            image_path = image_path[: -len(".jpg")] + ".png"

        return {
            "image_id": image_id,
            "image_path": image_path,
            "findings": findings,
            "impression": impression,
            "mesh": mesh,
        }

    return None
def build_dataset(xml_dir: str, images_root: Optional[str]) -> List[Dict]:
    """
    Parse every *.xml file in xml_dir and collect the resulting records.

    Directory entries are visited in sorted order and the ".xml" suffix is
    matched case-insensitively. Files for which parse_single_xml returns
    None are skipped. A file whose parsing raises is reported with a
    [WARN] line on stdout and does not stop the run.
    """
    xml_names = [
        entry for entry in sorted(os.listdir(xml_dir))
        if entry.lower().endswith(".xml")
    ]

    collected: List[Dict] = []
    for file_name in xml_names:
        xml_path = os.path.join(xml_dir, file_name)
        try:
            parsed = parse_single_xml(xml_path, images_root)
        except Exception as e:
            # Keep going but report failures
            print(f"[WARN] Failed to parse {xml_path}: {e}")
        else:
            if parsed is not None:
                collected.append(parsed)
    return collected
def main():
    """
    Command-line entry point.

    Reads --xml_dir, --out_json and the optional --images_root, builds the
    dataset from the XML folder, creates the output file's parent directory
    if needed, writes the records as indented UTF-8 JSON and prints how
    many records were saved.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--xml_dir", required=True, help="Folder containing XML files")
    parser.add_argument("--out_json", required=True, help="Output JSON path")
    parser.add_argument(
        "--images_root",
        default=None,
        help="Optional: local images folder. If set, image_path becomes images_root/basename(url)",
    )
    args = parser.parse_args()

    data = build_dataset(args.xml_dir, args.images_root)

    # abspath first, so a bare file name resolves to the current directory
    # instead of producing an empty dirname.
    target_dir = os.path.dirname(os.path.abspath(args.out_json))
    os.makedirs(target_dir, exist_ok=True)

    with open(args.out_json, "w", encoding="utf-8") as sink:
        json.dump(data, sink, ensure_ascii=False, indent=2)

    print(f"Saved {len(data)} records to {args.out_json}")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()