fix(openai): preserve multimodal tool output ordering

tpirc3 · tpirc3 · commit 524d69ffb72c · 2026-02-26T20:58:22.000+08:00
diff --git a/livekit-agents/livekit/agents/llm/_provider_format/openai.py b/livekit-agents/livekit/agents/llm/_provider_format/openai.py
@@ -141,10 +141,13 @@ def _to_chat_tool_output_content(output: Any) -> str | list[dict[str, Any]]:
     if not image_parts:
         return llm.utils.tool_output_to_text(output, include_image_placeholder=False)
 
-    parts: list[dict[str, Any]] = [{"type": "text", "text": text} for text in text_parts]
-    for image in image_parts:
+    parts: list[dict[str, Any]] = []
+    for part in llm.utils.tool_output_parts(output):
+        if isinstance(part, str):
+            parts.append({"type": "text", "text": part})
+            continue
         try:
-            parts.append(_to_image_content(image))
+            parts.append(_to_image_content(part))
         except ValueError as e:
             logger.warning(
                 "Failed to serialize tool output image for openai chat format", exc_info=e
diff --git a/tests/test_tool_output_multimodal.py b/tests/test_tool_output_multimodal.py
@@ -184,6 +184,36 @@ def test_openai_chat_supports_multimodal_tool_output_when_enabled() -> None:
     assert content[1]["type"] == "image_url"
 
 
+def test_openai_chat_preserves_multimodal_tool_output_order_when_enabled() -> None:
+    image_1 = ImageContent(image="data:image/png;base64,aW1nMQ==")  # payload is base64 of "img1"
+    image_2 = ImageContent(image="data:image/png;base64,aW1nMg==")  # payload is base64 of "img2"
+    chat_ctx = ChatContext(
+        items=[
+            _fnc_call(),
+            FunctionCallOutput(
+                name="capture",
+                call_id="call_1",
+                output=["before", image_1, "middle", image_2, "after"],  # text/images interleaved
+                is_error=False,
+            ),
+        ]
+    )
+
+    openai_messages, _ = chat_ctx.to_provider_format(
+        "openai",
+        inject_dummy_user_message=False,
+        supports_tool_image_output=True,  # the "when_enabled" path this test targets
+    )
+    content = openai_messages[1]["content"]  # presumably the tool-output message; [0] is the call
+    assert isinstance(content, list)  # multimodal output: a list of parts, not a plain string
+    assert [part["type"] for part in content] == ["text", "image_url", "text", "image_url", "text"]
+    assert content[0]["text"] == "before"  # each part must sit at its original output index
+    assert content[1]["image_url"]["url"].endswith("aW1nMQ==")  # image_1, not image_2
+    assert content[2]["text"] == "middle"
+    assert content[3]["image_url"]["url"].endswith("aW1nMg==")  # image_2 stays after "middle"
+    assert content[4]["text"] == "after"
+
+
 def test_openai_chat_keeps_text_tool_output_as_string_when_enabled() -> None:
     chat_ctx = ChatContext(
         items=[