Skip to content

Commit 564e9b0

Browse files
Minor eval fixes (#471)
1 parent 4a60103 commit 564e9b0

File tree

6 files changed

+1646
-15
lines changed

6 files changed

+1646
-15
lines changed

experiments/eval/run.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ def main() -> None:
220220
"--simulated-user-type",
221221
type=str,
222222
default="none",
223-
help="Type of simulated user (co-planning, co-execution, co-planning-and-execution, none)",
223+
help="Type of simulated user (co-planning, co-execution, co-planning-and-execution, dummy, none)",
224224
)
225225
parser.add_argument(
226226
"--how-helpful-user-proxy",

experiments/eval/systems/magentic_one_system.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@ async def _runner() -> Tuple[str, List[str]]:
188188
# Convert list of logevent objects to list of dicts
189189
messages_json = [msg.model_dump() for msg in messages_so_far]
190190
await f.write(json.dumps(messages_json, indent=2))
191+
await f.flush() # Flush to disk immediately
191192
# how the final answer is formatted: "Final Answer: FINAL ANSWER: Actual final answer"
192193

193194
# get last message with source MagenticOneOrchestrator, might not be the last message
@@ -215,8 +216,8 @@ def get_usage(model_client: ChatCompletionClient) -> Dict[str, int]:
215216
usage_json = {
216217
"client": get_usage(model_client),
217218
}
218-
with open(f"{output_dir}/model_tokens_usage.json", "w") as f:
219-
json.dump(usage_json, f)
219+
async with aiofiles.open(f"{output_dir}/model_tokens_usage.json", "w") as f:
220+
await f.write(json.dumps(usage_json, indent=2))
220221

221222
# Step 5: Prepare the screenshots
222223
screenshots_paths = []

experiments/eval/systems/magentic_ui_sim_user_system.py

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
from magentic_ui.types import CheckpointEvent
2424
from magentic_ui.agents import WebSurfer, CoderAgent, FileSurfer
2525
from magentic_ui.teams import GroupChat
26-
from magentic_ui.agents.users import MetadataUserProxy
26+
from magentic_ui.agents.users import MetadataUserProxy, DummyUserProxy
2727
from magentic_ui.tools.playwright.browser import VncDockerPlaywrightBrowser
2828
from magentic_ui.tools.playwright.browser.utils import get_available_port
2929
from magentic_ui.approval_guard import (
@@ -106,7 +106,11 @@ def __init__(
106106
self,
107107
name: str = "MagenticUISimUserSystem",
108108
simulated_user_type: Literal[
109-
"co-planning", "co-execution", "co-planning-and-execution", "none"
109+
"co-planning",
110+
"co-execution",
111+
"co-planning-and-execution",
112+
"none",
113+
"dummy",
110114
] = "none",
111115
how_helpful_user_proxy: Literal["strict", "soft", "no_hints"] = "soft",
112116
web_surfer_only: bool = False,
@@ -217,7 +221,7 @@ async def _runner() -> Tuple[str, List[str]]:
217221
if self.simulated_user_type in ["co-execution", "none"]
218222
else True,
219223
autonomous_execution=True
220-
if self.simulated_user_type in ["co-planning", "none"]
224+
if self.simulated_user_type in ["co-planning", "none", "dummy"]
221225
else False,
222226
allow_follow_up_input=False,
223227
final_answer_prompt=FINAL_ANSWER_PROMPT,
@@ -312,6 +316,10 @@ def get_model_client(
312316

313317
if self.simulated_user_type == "none":
314318
user_proxy = None
319+
elif self.simulated_user_type == "dummy":
320+
user_proxy = DummyUserProxy(
321+
name="user_proxy",
322+
)
315323
else:
316324
user_proxy = MetadataUserProxy(
317325
name="user_proxy",
@@ -346,7 +354,7 @@ def get_model_client(
346354
from autogen_core import CancellationToken
347355
from autogen_core.models import UserMessage
348356

349-
prompt = f"""Rewrite the following helpful hints to help solve the task, but remove any information that directly reveals the answer. \nKeep the hints as close to the original as possible but remove any information that directly reveals the answer.\nHelpful hints: {task_metadata}\n\nAnswer: {getattr(task, 'ground_truth', '')}\n\nDo not include anything else in your response except the rewritten hints.\nRewritten helpful hints:"""
357+
prompt = f"""Rewrite the following helpful hints to help solve the task, but remove any information that directly reveals the answer. \nKeep the hints as close to the original as possible but remove any information that directly reveals the answer.\nHelpful hints: {task_metadata}\n\nAnswer: {getattr(task, "ground_truth", "")}\n\nDo not include anything else in your response except the rewritten hints.\nRewritten helpful hints:"""
350358
result = await model_client_orch.create(
351359
messages=[UserMessage(content=prompt, source="user")],
352360
cancellation_token=CancellationToken(),
@@ -410,16 +418,17 @@ def get_model_client(
410418
# Convert list of logevent objects to list of dicts
411419
messages_json = [msg.model_dump() for msg in messages_so_far]
412420
await f.write(json.dumps(messages_json, indent=2))
421+
await f.flush() # Flush to disk immediately
413422
# how the final answer is formatted: "Final Answer: FINAL ANSWER: Actual final answer"
414423

415424
if message_str.startswith("Final Answer:"):
416425
answer = message_str[len("Final Answer:") :].strip()
417426
# remove the "FINAL ANSWER:" part and get the string after it
418427
answer = answer.split("FINAL ANSWER:")[1].strip()
419428

420-
assert isinstance(
421-
answer, str
422-
), f"Expected answer to be a string, got {type(answer)}"
429+
assert isinstance(answer, str), (
430+
f"Expected answer to be a string, got {type(answer)}"
431+
)
423432

424433
# save the usage of each of the client in a usage json file
425434
def get_usage(model_client: ChatCompletionClient) -> Dict[str, int]:
@@ -447,8 +456,8 @@ def get_usage(model_client: ChatCompletionClient) -> Dict[str, int]:
447456
if key != "user_proxy"
448457
),
449458
}
450-
with open(f"{output_dir}/model_tokens_usage.json", "w") as f:
451-
json.dump(usage_json, f)
459+
async with aiofiles.open(f"{output_dir}/model_tokens_usage.json", "w") as f:
460+
await f.write(json.dumps(usage_json, indent=2))
452461

453462
await team.close()
454463
# Step 5: Prepare the screenshots

experiments/eval/systems/magentic_ui_system.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ async def _runner() -> Tuple[str, List[str]]:
265265
# Convert list of logevent objects to list of dicts
266266
messages_json = [msg.model_dump() for msg in messages_so_far]
267267
await f.write(json.dumps(messages_json, indent=2))
268+
await f.flush() # Flush to disk immediately
268269
# how the final answer is formatted: "Final Answer: FINAL ANSWER: Actual final answer"
269270

270271
if message_str.startswith("Final Answer:"):
@@ -301,8 +302,8 @@ def get_usage(model_client: ChatCompletionClient) -> Dict[str, int]:
301302
if key != "user_proxy"
302303
),
303304
}
304-
with open(f"{output_dir}/model_tokens_usage.json", "w") as f:
305-
json.dump(usage_json, f)
305+
async with aiofiles.open(f"{output_dir}/model_tokens_usage.json", "w") as f:
306+
await f.write(json.dumps(usage_json, indent=2))
306307

307308
await team.close()
308309
# Step 5: Prepare the screenshots

src/magentic_ui/eval/core.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,10 @@ def _run_single_task(
207207
)
208208

209209
logger.info(f"Completed task for task_id={task_id}")
210+
211+
# Evaluate immediately after task completion
212+
_evaluate_single_task(task_id, system, output_dir, benchmark, redo_eval=False)
213+
210214
return task_id, answer, end_time - start_time
211215
except Exception:
212216
# Log the error with traceback

0 commit comments

Comments (0)