Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion livekit-agents/livekit/agents/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1300,7 +1300,7 @@ def _print_run_event(
)
)
elif event.type == "function_call_output":
output = event.item.output
output = llm.utils.tool_output_to_text(event.item.output)
display_output = output
is_error = output.lower().startswith("error") or output.lower().startswith("exception")

Expand Down
5 changes: 3 additions & 2 deletions livekit-agents/livekit/agents/evals/judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,11 @@ def _format_items(items: list) -> str:
elif item.type == "function_call":
parts.append(f"[function call: {item.name}({item.arguments})]")
elif item.type == "function_call_output":
output_text = llm_utils.tool_output_to_text(item.output)
if item.is_error:
parts.append(f"[function error: {item.output}]")
parts.append(f"[function error: {output_text}]")
else:
parts.append(f"[function output: {item.output}]")
parts.append(f"[function output: {output_text}]")
elif item.type == "agent_handoff":
parts.append(f"[agent handoff: {item.old_agent_id} -> {item.new_agent_id}]")
elif item.type == "agent_config_update":
Expand Down
6 changes: 5 additions & 1 deletion livekit-agents/livekit/agents/inference/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@ def __init__(
conn_options: APIConnectOptions,
extra_kwargs: dict[str, Any],
provider_fmt: str = "openai", # used internally for chat_ctx format
provider_format_kwargs: dict[str, Any] | None = None,
) -> None:
super().__init__(llm_v, chat_ctx=chat_ctx, tools=tools, conn_options=conn_options)
self._model = model
Expand All @@ -303,6 +304,7 @@ def __init__(
self._client = client
self._llm = llm_v
self._extra_kwargs = drop_unsupported_params(model, extra_kwargs)
self._provider_format_kwargs = provider_format_kwargs or {}
self._tool_ctx = llm.ToolContext(tools)

async def _run(self) -> None:
Expand All @@ -317,7 +319,9 @@ async def _run(self) -> None:
retryable = True

try:
chat_ctx, _ = self._chat_ctx.to_provider_format(format=self._provider_fmt)
chat_ctx, _ = self._chat_ctx.to_provider_format(
format=self._provider_fmt, **self._provider_format_kwargs
)
tool_schemas = cast(
list[ChatCompletionToolParam],
self._tool_ctx.parse_function_tools("openai", strict=self._strict_tool_schema),
Expand Down
23 changes: 16 additions & 7 deletions livekit-agents/livekit/agents/llm/_provider_format/anthropic.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,22 @@ def to_chat_ctx(
}
)
elif msg.type == "function_call_output":
result_content: list[Any] | str = msg.output
try:
parsed = json.loads(msg.output)
if isinstance(parsed, list):
result_content = parsed
except (json.JSONDecodeError, TypeError):
pass
result_content: list[Any] | str
if isinstance(msg.output, str):
result_content = msg.output
try:
parsed = json.loads(msg.output)
if isinstance(parsed, list):
result_content = parsed
except (json.JSONDecodeError, TypeError):
pass
else:
result_content = []
for part in llm.utils.tool_output_parts(msg.output):
if isinstance(part, str):
result_content.append({"type": "text", "text": part})
else:
result_content.append(_to_image_content(part))
content.append(
{
"tool_use_id": msg.call_id,
Expand Down
41 changes: 34 additions & 7 deletions livekit-agents/livekit/agents/llm/_provider_format/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,16 +59,27 @@ def to_chat_ctx(
}
)
elif msg.type == "function_call_output":
tool_result_content: list[dict[str, Any]]
if msg.is_error:
tool_result_content = [{"text": llm.utils.tool_output_to_text(msg.output)}]
else:
tool_result_content = []
for part in llm.utils.tool_output_parts(msg.output):
if isinstance(part, str):
tool_result_content.append({"text": part})
else:
try:
tool_result_content.append(_build_image(part))
except ValueError:
tool_result_content.append(
{"text": llm.utils.TOOL_OUTPUT_IMAGE_PLACEHOLDER}
)
current_content.append(
{
"toolResult": {
"toolUseId": msg.call_id,
"content": [
{"json": msg.output}
if isinstance(msg.output, dict)
else {"text": msg.output}
],
"status": "success",
"content": tool_result_content,
"status": "error" if msg.is_error else "success",
}
}
)
Expand All @@ -95,12 +106,28 @@ def _build_image(image: llm.ImageContent) -> dict:

return {
"image": {
"format": "jpeg",
"format": _bedrock_image_format_from_mime_type(img.mime_type),
"source": {"bytes": img.data_bytes},
}
}


def _bedrock_image_format_from_mime_type(mime_type: str | None) -> str:
if not mime_type:
return "jpeg"

mime_base = mime_type.split(";", 1)[0].strip().lower()
if mime_base in {"image/jpeg", "image/jpg", "image/pjpeg"}:
return "jpeg"

if mime_base.startswith("image/"):
image_format = mime_base.split("/", 1)[1]
if image_format in {"png", "gif", "webp"}:
return image_format

raise ValueError(f"Unsupported mime_type {mime_type!r} for AWS Bedrock image format.")


def to_fnc_ctx(tool_ctx: llm.ToolContext) -> list[dict[str, Any]]:
return [_build_tool_spec(tool) for tool in tool_ctx.function_tools.values()]

Expand Down
33 changes: 25 additions & 8 deletions livekit-agents/livekit/agents/llm/_provider_format/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,16 +68,33 @@ def to_chat_ctx(
fc_part["thought_signature"] = sig
parts.append(fc_part)
elif msg.type == "function_call_output":
response = {"output": msg.output} if not msg.is_error else {"error": msg.output}
parts.append(
{
"function_response": {
"id": msg.call_id,
"name": msg.name,
"response": response,
if msg.is_error:
response = {"error": llm.utils.tool_output_to_text(msg.output)}
parts.append(
{
"function_response": {
"id": msg.call_id,
"name": msg.name,
"response": response,
}
}
)
else:
text_output = llm.utils.tool_output_to_text(
msg.output, include_image_placeholder=False
)
_, image_parts = llm.utils.split_tool_output_parts(msg.output)
response_payload: dict[str, Any] = {}
if text_output:
response_payload["output"] = text_output
function_response: dict[str, Any] = {
"id": msg.call_id,
"name": msg.name,
"response": response_payload,
}
)
if image_parts:
function_response["parts"] = [_to_image_part(image) for image in image_parts]
parts.append({"function_response": function_response})

if current_role is not None and parts:
turns.append({"role": current_role, "parts": parts})
Expand Down
77 changes: 67 additions & 10 deletions livekit-agents/livekit/agents/llm/_provider_format/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,16 @@
from typing import Any, Literal

from livekit.agents import llm
from livekit.agents.log import logger

from .utils import group_tool_calls


def to_chat_ctx(
chat_ctx: llm.ChatContext, *, inject_dummy_user_message: bool = True
chat_ctx: llm.ChatContext,
*,
inject_dummy_user_message: bool = True,
supports_tool_image_output: bool = False,
) -> tuple[list[dict], Literal[None]]:
item_groups = group_tool_calls(chat_ctx)
messages = []
Expand All @@ -36,7 +40,12 @@ def to_chat_ctx(

# append tool outputs following the tool calls
for tool_output in group.tool_outputs:
messages.append(_to_chat_item(tool_output))
messages.append(
_to_chat_tool_output_item(
tool_output,
supports_tool_image_output=supports_tool_image_output,
)
)

return messages, None

Expand Down Expand Up @@ -84,16 +93,28 @@ def _to_chat_item(msg: llm.ChatItem) -> dict[str, Any]:
"tool_calls": [tc],
}

elif msg.type == "function_call_output":
return {
"role": "tool",
"tool_call_id": msg.call_id,
"content": msg.output,
}

raise ValueError(f"unsupported message type: {msg.type}")


def _to_chat_tool_output_item(
    msg: llm.ChatItem, *, supports_tool_image_output: bool
) -> dict[str, Any]:
    """Build an OpenAI chat-completions "tool" message from a function_call_output item.

    When the provider supports image content in tool messages, the output is
    converted to structured content parts; otherwise it is flattened to text.
    Raises ``ValueError`` for any other item type.
    """
    if msg.type != "function_call_output":
        raise ValueError(f"unsupported message type: {msg.type}")

    if supports_tool_image_output:
        content: str | list[dict[str, Any]] = _to_chat_tool_output_content(msg.output)
    else:
        content = llm.utils.tool_output_to_text(msg.output)

    return {"role": "tool", "tool_call_id": msg.call_id, "content": content}


def _to_image_content(image: llm.ImageContent) -> dict[str, Any]:
img = llm.utils.serialize_image(image)
if img.external_url:
Expand All @@ -115,6 +136,27 @@ def _to_image_content(image: llm.ImageContent) -> dict[str, Any]:
}


def _to_chat_tool_output_content(output: Any) -> str | list[dict[str, Any]]:
    """Render a tool output as OpenAI chat-completions tool-message content.

    Text-only outputs collapse to a plain string (image placeholders omitted).
    Outputs containing images become a list of text/image content parts; an
    image that fails to serialize is logged and replaced by a text placeholder.
    """
    _, image_parts = llm.utils.split_tool_output_parts(output)
    if not image_parts:
        return llm.utils.tool_output_to_text(output, include_image_placeholder=False)

    content: list[dict[str, Any]] = []
    for part in llm.utils.tool_output_parts(output):
        if isinstance(part, str):
            content.append({"type": "text", "text": part})
        else:
            try:
                content.append(_to_image_content(part))
            except ValueError as exc:
                logger.warning(
                    "Failed to serialize tool output image for openai chat format", exc_info=exc
                )
                content.append({"type": "text", "text": llm.utils.TOOL_OUTPUT_IMAGE_PLACEHOLDER})

    return content


def _to_responses_image_content(image: llm.ImageContent) -> dict[str, Any]:
img = llm.utils.serialize_image(image)
if img.external_url:
Expand Down Expand Up @@ -184,7 +226,7 @@ def _to_responses_chat_item(msg: llm.ChatItem) -> dict[str, Any]:
return {
"type": "function_call_output",
"call_id": msg.call_id,
"output": msg.output,
"output": _to_responses_tool_output(msg.output),
}

raise ValueError(f"unsupported message type: {msg.type}")
Expand Down Expand Up @@ -212,6 +254,21 @@ def to_fnc_ctx(tool_ctx: llm.ToolContext, *, strict: bool = True) -> list[dict[s
return schemas


def _to_responses_tool_output(output: Any) -> str | list[dict[str, Any]]:
    """Convert a tool output into the OpenAI Responses-API output payload.

    String outputs pass through unchanged; mixed outputs become a list of
    ``input_text`` and image content parts. An empty parts list degrades to
    an empty string.
    """
    value = llm.utils.normalize_function_output_value(output)
    if isinstance(value, str):
        return value

    converted: list[dict[str, Any]] = [
        {"type": "input_text", "text": part}
        if isinstance(part, str)
        else _to_responses_image_content(part)
        for part in llm.utils.tool_output_parts(value)
    ]
    return converted or ""


def to_responses_fnc_ctx(tool_ctx: llm.ToolContext, *, strict: bool = True) -> list[dict[str, Any]]:
from livekit.plugins import openai

Expand Down
25 changes: 23 additions & 2 deletions livekit-agents/livekit/agents/llm/chat_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,8 @@ def text_content(self) -> str | None:


ChatContent: TypeAlias = ImageContent | AudioContent | str
ToolOutputContent: TypeAlias = str | ImageContent
FunctionCallOutputValue: TypeAlias = ToolOutputContent | list[ToolOutputContent]


class FunctionCall(BaseModel):
Expand All @@ -197,7 +199,7 @@ class FunctionCallOutput(BaseModel):
type: Literal["function_call_output"] = Field(default="function_call_output")
name: str = Field(default="")
call_id: str
output: str
output: FunctionCallOutputValue
is_error: bool
created_at: float = Field(default_factory=time.time)

Expand Down Expand Up @@ -459,6 +461,16 @@ def to_dict(
item.content = [c for c in item.content if not isinstance(c, ImageContent)]
if exclude_audio:
item.content = [c for c in item.content if not isinstance(c, AudioContent)]
elif item.type == "function_call_output" and exclude_image:
item = item.model_copy()
if isinstance(item.output, ImageContent):
item.output = ""
elif isinstance(item.output, list):
filtered: list[ToolOutputContent] = []
for content in item.output:
if isinstance(content, str):
filtered.append(content)
item.output = filtered if filtered else ""

items.append(item)

Expand All @@ -483,7 +495,16 @@ def to_dict(
@overload
def to_provider_format(
self,
format: Literal["openai", "openai.responses"],
format: Literal["openai"],
*,
inject_dummy_user_message: bool = True,
supports_tool_image_output: bool = False,
) -> tuple[list[dict], Literal[None]]: ...

@overload
def to_provider_format(
self,
format: Literal["openai.responses"],
*,
inject_dummy_user_message: bool = True,
) -> tuple[list[dict], Literal[None]]: ...
Expand Down
Loading