Skip to content

Commit 564e9b0

Browse files
Minor eval fixes (#471)
1 parent 4a60103 commit 564e9b0

File tree

6 files changed

+1646
-15
lines changed

6 files changed

+1646
-15
lines changed

experiments/eval/run.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ def main() -> None:
220220
"--simulated-user-type",
221221
type=str,
222222
default="none",
223-
help="Type of simulated user (co-planning, co-execution, co-planning-and-execution, none)",
223+
help="Type of simulated user (co-planning, co-execution, co-planning-and-execution, dummy, none)",
224224
)
225225
parser.add_argument(
226226
"--how-helpful-user-proxy",

experiments/eval/systems/magentic_one_system.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@ async def _runner() -> Tuple[str, List[str]]:
188188
# Convert list of logevent objects to list of dicts
189189
messages_json = [msg.model_dump() for msg in messages_so_far]
190190
await f.write(json.dumps(messages_json, indent=2))
191+
await f.flush() # Flush to disk immediately
191192
# how the final answer is formatted: "Final Answer: FINAL ANSWER: Actual final answer"
192193

193194
# get last message with source MagenticOneOrchestrator, might not be the last message
@@ -215,8 +216,8 @@ def get_usage(model_client: ChatCompletionClient) -> Dict[str, int]:
215216
usage_json = {
216217
"client": get_usage(model_client),
217218
}
218-
with open(f"{output_dir}/model_tokens_usage.json", "w") as f:
219-
json.dump(usage_json, f)
219+
async with aiofiles.open(f"{output_dir}/model_tokens_usage.json", "w") as f:
220+
await f.write(json.dumps(usage_json, indent=2))
220221

221222
# Step 5: Prepare the screenshots
222223
screenshots_paths = []

experiments/eval/systems/magentic_ui_sim_user_system.py

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
from magentic_ui.types import CheckpointEvent
2424
from magentic_ui.agents import WebSurfer, CoderAgent, FileSurfer
2525
from magentic_ui.teams import GroupChat
26-
from magentic_ui.agents.users import MetadataUserProxy
26+
from magentic_ui.agents.users import MetadataUserProxy, DummyUserProxy
2727
from magentic_ui.tools.playwright.browser import VncDockerPlaywrightBrowser
2828
from magentic_ui.tools.playwright.browser.utils import get_available_port
2929
from magentic_ui.approval_guard import (
@@ -106,7 +106,11 @@ def __init__(
106106
self,
107107
name: str = "MagenticUISimUserSystem",
108108
simulated_user_type: Literal[
109-
"co-planning", "co-execution", "co-planning-and-execution", "none"
109+
"co-planning",
110+
"co-execution",
111+
"co-planning-and-execution",
112+
"none",
113+
"dummy",
110114
] = "none",
111115
how_helpful_user_proxy: Literal["strict", "soft", "no_hints"] = "soft",
112116
web_surfer_only: bool = False,
@@ -217,7 +221,7 @@ async def _runner() -> Tuple[str, List[str]]:
217221
if self.simulated_user_type in ["co-execution", "none"]
218222
else True,
219223
autonomous_execution=True
220-
if self.simulated_user_type in ["co-planning", "none"]
224+
if self.simulated_user_type in ["co-planning", "none", "dummy"]
221225
else False,
222226
allow_follow_up_input=False,
223227
final_answer_prompt=FINAL_ANSWER_PROMPT,
@@ -312,6 +316,10 @@ def get_model_client(
312316

313317
if self.simulated_user_type == "none":
314318
user_proxy = None
319+
elif self.simulated_user_type == "dummy":
320+
user_proxy = DummyUserProxy(
321+
name="user_proxy",
322+
)
315323
else:
316324
user_proxy = MetadataUserProxy(
317325
name="user_proxy",
@@ -346,7 +354,7 @@ def get_model_client(
346354
from autogen_core import CancellationToken
347355
from autogen_core.models import UserMessage
348356

349-
prompt = f"""Rewrite the following helpful hints to help solve the task, but remove any information that directly reveals the answer. \nKeep the hints as close to the original as possible but remove any information that directly reveals the answer.\nHelpful hints: {task_metadata}\n\nAnswer: {getattr(task, 'ground_truth', '')}\n\nDo not include anything else in your response except the rewritten hints.\nRewritten helpful hints:"""
357+
prompt = f"""Rewrite the following helpful hints to help solve the task, but remove any information that directly reveals the answer. \nKeep the hints as close to the original as possible but remove any information that directly reveals the answer.\nHelpful hints: {task_metadata}\n\nAnswer: {getattr(task, "ground_truth", "")}\n\nDo not include anything else in your response except the rewritten hints.\nRewritten helpful hints:"""
350358
result = await model_client_orch.create(
351359
messages=[UserMessage(content=prompt, source="user")],
352360
cancellation_token=CancellationToken(),
@@ -410,16 +418,17 @@ def get_model_client(
410418
# Convert list of logevent objects to list of dicts
411419
messages_json = [msg.model_dump() for msg in messages_so_far]
412420
await f.write(json.dumps(messages_json, indent=2))
421+
await f.flush() # Flush to disk immediately
413422
# how the final answer is formatted: "Final Answer: FINAL ANSWER: Actual final answer"
414423

415424
if message_str.startswith("Final Answer:"):
416425
answer = message_str[len("Final Answer:") :].strip()
417426
# remove the "FINAL ANSWER:" part and get the string after it
418427
answer = answer.split("FINAL ANSWER:")[1].strip()
419428

420-
assert isinstance(
421-
answer, str
422-
), f"Expected answer to be a string, got {type(answer)}"
429+
assert isinstance(answer, str), (
430+
f"Expected answer to be a string, got {type(answer)}"
431+
)
423432

424433
# save the usage of each of the client in a usage json file
425434
def get_usage(model_client: ChatCompletionClient) -> Dict[str, int]:
@@ -447,8 +456,8 @@ def get_usage(model_client: ChatCompletionClient) -> Dict[str, int]:
447456
if key != "user_proxy"
448457
),
449458
}
450-
with open(f"{output_dir}/model_tokens_usage.json", "w") as f:
451-
json.dump(usage_json, f)
459+
async with aiofiles.open(f"{output_dir}/model_tokens_usage.json", "w") as f:
460+
await f.write(json.dumps(usage_json, indent=2))
452461

453462
await team.close()
454463
# Step 5: Prepare the screenshots

experiments/eval/systems/magentic_ui_system.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ async def _runner() -> Tuple[str, List[str]]:
265265
# Convert list of logevent objects to list of dicts
266266
messages_json = [msg.model_dump() for msg in messages_so_far]
267267
await f.write(json.dumps(messages_json, indent=2))
268+
await f.flush() # Flush to disk immediately
268269
# how the final answer is formatted: "Final Answer: FINAL ANSWER: Actual final answer"
269270

270271
if message_str.startswith("Final Answer:"):
@@ -301,8 +302,8 @@ def get_usage(model_client: ChatCompletionClient) -> Dict[str, int]:
301302
if key != "user_proxy"
302303
),
303304
}
304-
with open(f"{output_dir}/model_tokens_usage.json", "w") as f:
305-
json.dump(usage_json, f)
305+
async with aiofiles.open(f"{output_dir}/model_tokens_usage.json", "w") as f:
306+
await f.write(json.dumps(usage_json, indent=2))
306307

307308
await team.close()
308309
# Step 5: Prepare the screenshots

src/magentic_ui/eval/core.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,10 @@ def _run_single_task(
207207
)
208208

209209
logger.info(f"Completed task for task_id={task_id}")
210+
211+
# Evaluate immediately after task completion
212+
_evaluate_single_task(task_id, system, output_dir, benchmark, redo_eval=False)
213+
210214
return task_id, answer, end_time - start_time
211215
except Exception:
212216
# Log the error with traceback

0 commit comments

Comments (0)