"""
Evaluation baseline script for benchmarking extractors.
This script runs baseline evaluations on a dataset using Ray for distributed processing.
It supports resuming from previous results and can be configured for CPU or GPU execution.
"""
import argparse
import json
from pathlib import Path

from eval_baselines.base import (RayBatchProcessMaper, build_dataset,
                                 export_results, reduce_results)
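
# Illustrative invocations (not part of the original script): the benchmark
# path, task directory, and extractor/model names below are placeholders, so
# adjust them to your setup.
#
#   # CPU preset (batch_size=1, gpu_num=0, cpu_num=1 unless overridden):
#   python eval_baselines.py --bench data/bench.jsonl --task_dir output/run_cpu \
#       --extractor_name trafilatura
#
#   # GPU preset (batch_size=512, gpu_num=1, cpu_num=4 unless overridden):
#   python eval_baselines.py --bench data/bench.jsonl --task_dir output/run_gpu \
#       --default_config gpu --extractor_name MinerU_HTML \
#       --model_path /path/to/model --batch_size 256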


def main():
    """Main entry point for the evaluation baseline script."""
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Run baseline evaluation on a benchmark dataset'
    )
    parser.add_argument(
        '--bench', type=str, required=True, help='Benchmark dataset path to evaluate'
    )
    parser.add_argument(
        '--task_dir',
        type=str,
        required=True,
        help='Directory to store task results and intermediate files',
    )
    parser.add_argument(
        '--extractor_name',
        type=str,
        required=True,
        help='Name of the extractor to use for evaluation',
    )
    parser.add_argument(
        '--default_config',
        type=str,
        default='cpu',
        choices=['cpu', 'gpu'],
        help='Default configuration preset: "cpu" or "gpu". "gpu" is for MinerU_HTML and ReaderLM; "cpu" is for all other extractors',
    )
    parser.add_argument(
        '--model_path',
        type=str,
        default=None,
        help='Path to the model file (if required by the extractor)',
    )
    parser.add_argument(
        '--batch_size',
        type=int,
        default=None,
        help='Batch size for processing (overrides default_config if provided)',
    )
    parser.add_argument(
        '--gpu_num',
        type=int,
        default=None,
        help='Number of GPUs to use (overrides default_config if provided)',
    )
    parser.add_argument(
        '--cpu_num',
        type=int,
        default=None,
        help='Number of CPUs to use (overrides default_config if provided)',
    )
    parser.add_argument(
        '--key',
        type=str,
        default=None,
        help='Process only a specific case by its key (for debugging)',
    )
    parser.add_argument(
        '--force_update',
        action='store_true',
        help='Force re-evaluation of all cases, ignoring existing results',
    )
    parser.add_argument(
        '--dump-data', action='store_true', help='Dump the data to the target directory'
    )
    args = parser.parse_args()
    # Build the baseline dataset from the benchmark
    baseline_dataset = build_dataset(args.bench)
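
    # Note: the resume logic below assumes a per-case layout of roughly
    # task_dir/cases/<track_id>/rouge_score.json left behind by a previous
    # run; this is inferred from the scan itself rather than from documented
    # behaviour, so treat it as illustrative.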
    # Check for existing results and skip already completed cases
    finished_results = []
    if not args.force_update:
        cases_dir = Path(args.task_dir) / 'cases'
        cases_dir.mkdir(parents=True, exist_ok=True)
        # Scan for completed cases with existing score files
        for case_dir in cases_dir.iterdir():
            if case_dir.is_dir():
                score_file = case_dir / 'rouge_score.json'
                if score_file.exists():
                    # Load existing score and create result dictionary
                    score = json.loads(score_file.read_text(encoding='utf-8'))
                    result_dict = {
                        'track_id': case_dir.name,
                        **score,
                        'meta.level': baseline_dataset[case_dir.name].level,
                    }
                    finished_results.append(result_dict)
                    # Remove from dataset to skip re-processing
                    baseline_dataset.pop(case_dir.name)
        print(f'Found {len(finished_results)} finished results, skipping them')
    # Filter dataset to a specific key if requested (for debugging)
    if args.key is not None:
        baseline_dataset = {args.key: baseline_dataset[args.key]}
    # Apply default configuration presets if parameters are not explicitly set
    if args.default_config == 'cpu':
        args.batch_size = 1 if args.batch_size is None else args.batch_size
        args.gpu_num = 0 if args.gpu_num is None else args.gpu_num
        args.cpu_num = 1 if args.cpu_num is None else args.cpu_num
    elif args.default_config == 'gpu':
        args.batch_size = 512 if args.batch_size is None else args.batch_size
        args.gpu_num = 1 if args.gpu_num is None else args.gpu_num
        args.cpu_num = 4 if args.cpu_num is None else args.cpu_num
    # Initialize and run the Ray batch processor
    mapper = RayBatchProcessMaper(
        baseline_dataset,
        args.extractor_name,
        {'model_path': args.model_path},
        args.batch_size,
        args.gpu_num,
        args.cpu_num,
        args.task_dir if args.dump_data else None,
    )
    results = mapper.run()
    # Collect and validate results from the mapper
    result_benchmark_datas = []
    for result in results:
        if isinstance(result, (tuple, dict)):
            result_benchmark_datas.append(result)
        else:
            raise ValueError(f'Unknown result type: {type(result)}')
    # Merge with previously finished results
    result_benchmark_datas.extend(finished_results)
    # Export results to files and generate summary statistics
    flat_eval_df = export_results(result_benchmark_datas, args.task_dir)
    reduce_results(flat_eval_df, args.task_dir)


if __name__ == '__main__':
    main()